From d2cbe4267f30c64bc728a249ce354af03b380e51 Mon Sep 17 00:00:00 2001 From: drikster80 Date: Wed, 20 Nov 2024 14:00:01 +0000 Subject: [PATCH 01/25] Update Docker for aarch64 builds Signed-off-by: drikster80 --- Dockerfile | 82 ++++++++++++++++++++++++++++++++++--- requirements-build.txt | 2 +- requirements-cuda-arm64.txt | 3 ++ requirements-cuda.txt | 4 +- 4 files changed, 83 insertions(+), 8 deletions(-) create mode 100644 requirements-cuda-arm64.txt diff --git a/Dockerfile b/Dockerfile index 220dbe26712ec..b82e61c4f1cbb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,13 +11,14 @@ ARG CUDA_VERSION=12.4.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base ARG CUDA_VERSION=12.4.1 ARG PYTHON_VERSION=3.12 +ARG TARGETPLATFORM ENV DEBIAN_FRONTEND=noninteractive # Install Python and other dependencies RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl sudo \ + && apt-get install -y ccache software-properties-common git curl sudo kmod \ && add-apt-repository ppa:deadsnakes/ppa \ && apt-get update -y \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ @@ -46,9 +47,14 @@ WORKDIR /workspace # install build and runtime dependencies COPY requirements-common.txt requirements-common.txt COPY requirements-cuda.txt requirements-cuda.txt +COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-cuda.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + python3 -m pip install -r requirements-cuda-arm64.txt; \ + fi # cuda arch list used by torch # can be useful for both `dev` and `test` @@ -63,6 +69,7 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches} #################### WHEEL 
BUILD IMAGE #################### FROM base AS build +ARG TARGETPLATFORM # install build dependencies COPY requirements-build.txt requirements-build.txt @@ -70,6 +77,11 @@ COPY requirements-build.txt requirements-build.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-build.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + python3 -m pip install -r requirements-cuda-arm64.txt; \ + fi + COPY . . ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ @@ -113,6 +125,53 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,source=.git,target=.git \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + apt-get update && apt-get install zlib1g-dev && \ + python3 -m pip install packaging pybind11 && \ + git clone https://github.com/openai/triton && \ + cd triton/python && \ + git submodule update --init --recursive && \ + pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir . 
; \ + fi + +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,source=.git,target=.git \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation git+https://github.com/vllm-project/flash-attention.git ; \ + fi + +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,source=.git,target=.git \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=FALSE pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir git+https://github.com/Dao-AILab/causal-conv1d.git ; \ + fi + +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,source=.git,target=.git \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + MAMBA_FORCE_BUILD=TRUE pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir git+https://github.com/state-spaces/mamba.git ; \ + fi + +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,source=.git,target=.git \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + apt-get update && apt-get install -y cuda-toolkit-12-4 && \ + git clone -b v0.1.6 https://github.com/flashinfer-ai/flashinfer.git --recursive && \ + cd flashinfer/python && \ + pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir . ; \ + fi + +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,source=.git,target=.git \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + git clone -b 0.44.1 https://github.com/bitsandbytes-foundation/bitsandbytes.git && \ + cd bitsandbytes && \ + pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir . 
; \ + fi + + # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py # Default max size of the wheel is 250MB @@ -124,6 +183,7 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ else \ echo "Skipping wheel size check."; \ fi + #################### EXTENSION Build IMAGE #################### #################### DEV IMAGE #################### @@ -143,6 +203,9 @@ ARG CUDA_VERSION=12.4.1 ARG PYTHON_VERSION=3.12 WORKDIR /vllm-workspace ENV DEBIAN_FRONTEND=noninteractive +ARG TARGETPLATFORM + +COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment @@ -168,14 +231,23 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ # or future versions of triton. RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ -# install vllm wheel first, so that torch etc will be installed +# Install vllm wheel first, so that torch etc will be installed. +# On Arm64 platforms, all newly compiled wheels will also be installed (flashinfer, triton, mamba, etc.) RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install dist/*.whl --verbose RUN --mount=type=cache,target=/root/.cache/pip \ - . /etc/environment && \ - python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + pip uninstall -y torch && \ + python3 -m pip install -r requirements-cuda-arm64.txt; \ + fi + +RUN --mount=type=cache,target=/root/.cache/pip \ +. 
/etc/environment && \ +if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \ + python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \ +fi COPY examples examples #################### vLLM installation IMAGE #################### @@ -214,7 +286,7 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10 + pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' 'timm==0.9.10' ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/requirements-build.txt b/requirements-build.txt index fec01caaf25ef..388b193403e88 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -4,6 +4,6 @@ ninja packaging setuptools>=61 setuptools-scm>=8 -torch==2.5.1 +torch==2.5.1; platform_machine != 'aarch64' wheel jinja2 diff --git a/requirements-cuda-arm64.txt b/requirements-cuda-arm64.txt new file mode 100644 index 0000000000000..a8baf1dedb5a8 --- /dev/null +++ b/requirements-cuda-arm64.txt @@ -0,0 +1,3 @@ +--index-url https://download.pytorch.org/whl/nightly/cu124 +torchvision; platform_machine == 'aarch64' +torch; platform_machine == 'aarch64' diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 058ab7c1ee9df..5d4dee8c7129a 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -4,7 +4,7 @@ # Dependencies for NVIDIA GPUs ray >= 2.9 nvidia-ml-py >= 12.560.30 # for pynvml package -torch == 2.5.1 +torch == 2.5.1; platform_machine != 'aarch64' # These must be updated alongside torch -torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +torchvision == 0.20.1; platform_machine != 'aarch64' # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1 From 29ed3585d96e3501bebfc693247728bc792e1147 Mon Sep 17 00:00:00 2001 From: drikster80 Date: Wed, 20 Nov 2024 18:55:24 +0000 Subject: [PATCH 02/25] Update docs for arm64 docker builds & GH200 example Signed-off-by: drikster80 --- docs/source/serving/deploying_with_docker.rst | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index 14d94b09e9b9c..3118e19daf118 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -37,6 +37,32 @@ You can build and run vLLM from source via the provided `Dockerfile Date: Wed, 20 Nov 2024 21:48:37 +0000 Subject: [PATCH 03/25] Fix FLASHINFER not installing by default on x86_64 Signed-off-by: drikster80 --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index b82e61c4f1cbb..c87288b0605c2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -245,7 +245,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \ . /etc/environment && \ -if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \ +if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \ fi COPY examples examples From ccef455e5e57379ac09be1b106ef3a462ee9fb2e Mon Sep 17 00:00:00 2001 From: drikster80 Date: Wed, 4 Dec 2024 16:31:06 +0000 Subject: [PATCH 04/25] Removed build of flash-attn. 
Added libnccl2 --- Dockerfile | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index c87288b0605c2..764c6c0312979 100644 --- a/Dockerfile +++ b/Dockerfile @@ -136,12 +136,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \ pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir . ; \ fi -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,source=.git,target=.git \ - if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation git+https://github.com/vllm-project/flash-attention.git ; \ - fi - RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ @@ -214,7 +208,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \ + && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip libnccl2\ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ && add-apt-repository ppa:deadsnakes/ppa \ && apt-get update -y \ From 8b3c0b34608d00b80686388d010584b4c82308d8 Mon Sep 17 00:00:00 2001 From: cenzhiyao <2523403608@qq.com> Date: Sun, 15 Dec 2024 21:24:52 +0800 Subject: [PATCH 05/25] simplified Dockerfile build process for ARM64 systems --- Dockerfile | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index c1b6e1bbfe354..3291a023222b5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,6 +11,7 @@ ARG CUDA_VERSION=12.4.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base ARG CUDA_VERSION=12.4.1 ARG PYTHON_VERSION=3.12 +ARG 
TARGETPLATFORM ENV DEBIAN_FRONTEND=noninteractive # Install Python and other dependencies @@ -46,9 +47,14 @@ WORKDIR /workspace # install build and runtime dependencies COPY requirements-common.txt requirements-common.txt COPY requirements-cuda.txt requirements-cuda.txt +COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-cuda.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + python3 -m pip install -r requirements-cuda-arm64.txt; \ + fi # cuda arch list used by torch # can be useful for both `dev` and `test` @@ -63,6 +69,7 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches} #################### WHEEL BUILD IMAGE #################### FROM base AS build +ARG TARGETPLATFORM # install build dependencies COPY requirements-build.txt requirements-build.txt @@ -70,6 +77,11 @@ COPY requirements-build.txt requirements-build.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-build.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + python3 -m pip install -r requirements-cuda-arm64.txt; \ + fi + COPY . . 
ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ @@ -134,8 +146,8 @@ COPY requirements-test.txt requirements-test.txt COPY requirements-dev.txt requirements-dev.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-dev.txt - #################### DEV IMAGE #################### + #################### vLLM installation IMAGE #################### # image with vLLM installed FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base @@ -143,6 +155,9 @@ ARG CUDA_VERSION=12.4.1 ARG PYTHON_VERSION=3.12 WORKDIR /vllm-workspace ENV DEBIAN_FRONTEND=noninteractive +ARG TARGETPLATFORM + +COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment @@ -168,18 +183,25 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ # or future versions of triton. RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ -# install vllm wheel first, so that torch etc will be installed +# Install vllm wheel first, so that torch etc will be installed. RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install dist/*.whl --verbose RUN --mount=type=cache,target=/root/.cache/pip \ - . /etc/environment && \ - python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + pip uninstall -y torch && \ + python3 -m pip install -r requirements-cuda-arm64.txt; \ + fi + +RUN --mount=type=cache,target=/root/.cache/pip \ +. 
/etc/environment && \ +if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ + python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \ +fi COPY examples examples #################### vLLM installation IMAGE #################### - #################### TEST IMAGE #################### # image to run unit testing suite # note that this uses vllm installed by `pip` @@ -209,7 +231,6 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1 RUN mkdir test_docs RUN mv docs test_docs/ RUN mv vllm test_docs/ - #################### TEST IMAGE #################### #################### OPENAI API SERVER #################### @@ -218,8 +239,11 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' timm==0.9.10 - + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10'; \ + else \ + pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' 'timm==0.9.10'; \ + fi ENV VLLM_USAGE_SOURCE production-docker-image ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] From 69bb56707cadde4b78cf396e3c2779e5375ed2f9 Mon Sep 17 00:00:00 2001 From: cenzhiyao <2523403608@qq.com> Date: Sun, 15 Dec 2024 21:27:05 +0800 Subject: [PATCH 06/25] bitsandbytes>=0.45.0 --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 3291a023222b5..123703848749c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -242,7 +242,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10'; \ else \ - pip install 
accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' 'timm==0.9.10'; \ + pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \ fi ENV VLLM_USAGE_SOURCE production-docker-image From d5d2911a529db9475af10fd85ba50a90d1d5d878 Mon Sep 17 00:00:00 2001 From: cenzhiyao <2523403608@qq.com> Date: Mon, 16 Dec 2024 11:45:04 +0800 Subject: [PATCH 07/25] rm needless kmod/libnccl2 && rm needless build from source for triton/causal-conv1d/mamba/flashinfer/bitsandbytes --- Dockerfile | 46 ++-------------------------------------------- 1 file changed, 2 insertions(+), 44 deletions(-) diff --git a/Dockerfile b/Dockerfile index 489382b3d1eca..123703848749c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,7 +18,7 @@ ENV DEBIAN_FRONTEND=noninteractive RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl sudo kmod \ + && apt-get install -y ccache software-properties-common git curl sudo \ && add-apt-repository ppa:deadsnakes/ppa \ && apt-get update -y \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ @@ -125,47 +125,6 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,source=.git,target=.git \ - if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - apt-get update && apt-get install zlib1g-dev && \ - python3 -m pip install packaging pybind11 && \ - git clone https://github.com/openai/triton && \ - cd triton/python && \ - git submodule update --init --recursive && \ - pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir . 
; \ - fi - -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,source=.git,target=.git \ - if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=FALSE pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir git+https://github.com/Dao-AILab/causal-conv1d.git ; \ - fi - -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,source=.git,target=.git \ - if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - MAMBA_FORCE_BUILD=TRUE pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir git+https://github.com/state-spaces/mamba.git ; \ - fi - -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,source=.git,target=.git \ - if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - apt-get update && apt-get install -y cuda-toolkit-12-4 && \ - git clone -b v0.1.6 https://github.com/flashinfer-ai/flashinfer.git --recursive && \ - cd flashinfer/python && \ - pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir . ; \ - fi - -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,source=.git,target=.git \ - if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - git clone -b 0.44.1 https://github.com/bitsandbytes-foundation/bitsandbytes.git && \ - cd bitsandbytes && \ - pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir . 
; \ - fi - - # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py # Default max size of the wheel is 250MB @@ -177,7 +136,6 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ else \ echo "Skipping wheel size check."; \ fi - #################### EXTENSION Build IMAGE #################### #################### DEV IMAGE #################### @@ -208,7 +166,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip libnccl2\ + && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ && add-apt-repository ppa:deadsnakes/ppa \ && apt-get update -y \ From 075ab9db4e0ae8db4ea53c18a7fa729650b5b139 Mon Sep 17 00:00:00 2001 From: cenzhiyao <2523403608@qq.com> Date: Mon, 16 Dec 2024 15:27:47 +0800 Subject: [PATCH 08/25] update Memory usage && rename image && pinned version for torch torchvision --- docs/source/serving/deploying_with_docker.rst | 4 ++-- requirements-cuda-arm64.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index 3118e19daf118..11a9f12fd17cd 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -51,11 +51,11 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- .. code-block:: console - # Example of building on Nvidia GH200 server. (Memory usage: ~180GB, Build time: ~2387s / ~40 min) + # Example of building on Nvidia GH200 server. 
(Memory usage: ~12GB, Build time: ~1475s / ~25 min, Image size: 7.26GB) $ DOCKER_BUILDKIT=1 sudo docker build . \ --target vllm-openai \ -platform "linux/arm64" \ - -t drikster80/vllm-gh200-openai:v0.6.4.post1 \ + -t vllm/vllm-gh200-openai:latest \ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ --build-arg torch_cuda_arch_list="9.0+PTX" \ diff --git a/requirements-cuda-arm64.txt b/requirements-cuda-arm64.txt index a8baf1dedb5a8..bbcb5cb7012ce 100644 --- a/requirements-cuda-arm64.txt +++ b/requirements-cuda-arm64.txt @@ -1,3 +1,3 @@ --index-url https://download.pytorch.org/whl/nightly/cu124 -torchvision; platform_machine == 'aarch64' -torch; platform_machine == 'aarch64' +torchvision==0.22.0.dev20241215; platform_machine == 'aarch64' +torch==2.6.0.dev20241210+cu124; platform_machine == 'aarch64' From 34e714087d684fefb24e00ed385e78ddbe0437a6 Mon Sep 17 00:00:00 2001 From: cenzhiyao <2523403608@qq.com> Date: Wed, 18 Dec 2024 19:44:37 +0800 Subject: [PATCH 09/25] rm requirements-cuda-arm64.txt && rm redundant torch install/uninstall && rm arm64 platform condition in requirements && add python3 use_existing_torch.py before build in run-gh200-test.sh --- .buildkite/run-gh200-test.sh | 13 +++++++++++++ Dockerfile | 15 ++------------- docs/source/serving/deploying_with_docker.rst | 4 ++++ requirements-build.txt | 2 +- requirements-cuda-arm64.txt | 3 --- requirements-cuda.txt | 4 ++-- 6 files changed, 22 insertions(+), 19 deletions(-) delete mode 100644 requirements-cuda-arm64.txt diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index d06604f96f2b8..6cf071b02b23a 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -4,6 +4,19 @@ # It serves a sanity check for compilation and basic model usage. 
set -ex +# Download the python +PYTHON_VERSION=3.12 +apt-get update -y \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version + +python3 use_existing_torch.py + # Try building the docker image DOCKER_BUILDKIT=1 docker build . \ --target vllm-openai \ diff --git a/Dockerfile b/Dockerfile index 123703848749c..2a8f714f41515 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,13 +47,13 @@ WORKDIR /workspace # install build and runtime dependencies COPY requirements-common.txt requirements-common.txt COPY requirements-cuda.txt requirements-cuda.txt -COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-cuda.txt RUN --mount=type=cache,target=/root/.cache/pip \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - python3 -m pip install -r requirements-cuda-arm64.txt; \ + python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" \ + && python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torchvision==0.22.0.dev20241215"; \ fi # cuda arch list used by torch @@ -77,11 +77,6 @@ COPY requirements-build.txt requirements-build.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-build.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - python3 -m pip install -r requirements-cuda-arm64.txt; \ - fi - COPY . . 
ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ @@ -188,12 +183,6 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install dist/*.whl --verbose -RUN --mount=type=cache,target=/root/.cache/pip \ - if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - pip uninstall -y torch && \ - python3 -m pip install -r requirements-cuda-arm64.txt; \ - fi - RUN --mount=type=cache,target=/root/.cache/pip \ . /etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index 11a9f12fd17cd..ccc3b9c92ca78 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -52,6 +52,10 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- .. code-block:: console # Example of building on Nvidia GH200 server. (Memory usage: ~12GB, Build time: ~1475s / ~25 min, Image size: 7.26GB) + # Note: You should download the torch and torchvision wheels from the PyTorch nightly site, and run the use_existing_torch.py script to skip the download of other torch wheels during the build. + $ python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" + $ python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torchvision==0.22.0.dev20241215" + $ python3 use_existing_torch.py $ DOCKER_BUILDKIT=1 sudo docker build . 
\ --target vllm-openai \ -platform "linux/arm64" \ diff --git a/requirements-build.txt b/requirements-build.txt index 388b193403e88..fec01caaf25ef 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -4,6 +4,6 @@ ninja packaging setuptools>=61 setuptools-scm>=8 -torch==2.5.1; platform_machine != 'aarch64' +torch==2.5.1 wheel jinja2 diff --git a/requirements-cuda-arm64.txt b/requirements-cuda-arm64.txt deleted file mode 100644 index bbcb5cb7012ce..0000000000000 --- a/requirements-cuda-arm64.txt +++ /dev/null @@ -1,3 +0,0 @@ ---index-url https://download.pytorch.org/whl/nightly/cu124 -torchvision==0.22.0.dev20241215; platform_machine == 'aarch64' -torch==2.6.0.dev20241210+cu124; platform_machine == 'aarch64' diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 5d4dee8c7129a..058ab7c1ee9df 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -4,7 +4,7 @@ # Dependencies for NVIDIA GPUs ray >= 2.9 nvidia-ml-py >= 12.560.30 # for pynvml package -torch == 2.5.1; platform_machine != 'aarch64' +torch == 2.5.1 # These must be updated alongside torch -torchvision == 0.20.1; platform_machine != 'aarch64' # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +torchvision == 0.20.1 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1 From c65bf72e6e10d2b968ad9ce332256bfe4f073495 Mon Sep 17 00:00:00 2001 From: cenzhiyao <2523403608@qq.com> Date: Wed, 18 Dec 2024 19:51:24 +0800 Subject: [PATCH 10/25] fix docs --platform "linux/arm64" --- docs/source/serving/deploying_with_docker.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index ccc3b9c92ca78..ecbb0775f598e 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -58,7 +58,7 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- $ python3 use_existing_torch.py $ DOCKER_BUILDKIT=1 sudo docker build . \ --target vllm-openai \ - -platform "linux/arm64" \ + --platform "linux/arm64" \ -t vllm/vllm-gh200-openai:latest \ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ From 6bddcd4e58aa34afc2d61f5c2e7bb622e93fa5e6 Mon Sep 17 00:00:00 2001 From: cenzhiyao <2523403608@qq.com> Date: Wed, 18 Dec 2024 19:52:49 +0800 Subject: [PATCH 11/25] rm docs blanks --- docs/source/serving/deploying_with_docker.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index ecbb0775f598e..729cc07560049 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -65,9 +65,6 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- --build-arg torch_cuda_arch_list="9.0+PTX" \ --build-arg vllm_fa_cmake_gpu_arches="90-real" - - - To run vLLM: .. 
code-block:: console From 6b80175de547f40a201a583c1bdf62b3aa7e06ea Mon Sep 17 00:00:00 2001 From: cenzhiyao <2523403608@qq.com> Date: Wed, 18 Dec 2024 19:58:31 +0800 Subject: [PATCH 12/25] add torch nightly build in run-gh200-test.sh --- .buildkite/run-gh200-test.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index 6cf071b02b23a..ff8ae37742dfb 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -15,6 +15,11 @@ apt-get update -y \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && python3 --version && python3 -m pip --version +# Install the nightly version of torch and torchvision +python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" +python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torchvision==0.22.0.dev20241215" + +# Skip the new torch installation during build since we are using the specified version python3 use_existing_torch.py # Try building the docker image From 93b82d739f8ebb3abb1f3252bcd89dc305185b31 Mon Sep 17 00:00:00 2001 From: cenzhiyao <2523403608@qq.com> Date: Wed, 18 Dec 2024 20:07:09 +0800 Subject: [PATCH 13/25] edit doc note && rm build-arg max_jobs nvcc_threads --- docs/source/serving/deploying_with_docker.rst | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index 729cc07560049..03c79a18d7765 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -45,13 +45,11 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- .. note:: - Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=` - flags to speed up build process. 
However, ensure your 'max_jobs' is substantially larger than 'nvcc_threads' to get the most benefits. - Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). + Multiple modules must be compiled, so this process can take a while. .. code-block:: console - # Example of building on Nvidia GH200 server. (Memory usage: ~12GB, Build time: ~1475s / ~25 min, Image size: 7.26GB) + # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 7.26GB) # Note: You should download the torch and torchvision wheels from the PyTorch nightly site, and run the use_existing_torch.py script to skip the download of other torch wheels during the build. $ python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" $ python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torchvision==0.22.0.dev20241215" @@ -60,8 +58,6 @@ of PyTorch Nightly and should be considered **experimental**. 
Using the flag `-- --target vllm-openai \ --platform "linux/arm64" \ -t vllm/vllm-gh200-openai:latest \ - --build-arg max_jobs=66 \ - --build-arg nvcc_threads=2 \ --build-arg torch_cuda_arch_list="9.0+PTX" \ --build-arg vllm_fa_cmake_gpu_arches="90-real" From d0264892c3f3d4997827546a1cb8a5f8c3811a8a Mon Sep 17 00:00:00 2001 From: cenzhiyao <2523403608@qq.com> Date: Wed, 18 Dec 2024 21:30:00 +0800 Subject: [PATCH 14/25] add args max_jobs nvcc_threads to speed up && install pytorch first && outlines == 0.1.11 # Requires pytorch && rm torch install before docker build --- .buildkite/run-gh200-test.sh | 6 +----- Dockerfile | 10 +++++----- docs/source/serving/deploying_with_docker.rst | 9 +++++---- requirements-common.txt | 2 +- 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index ff8ae37742dfb..2525656ec9839 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -15,11 +15,7 @@ apt-get update -y \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && python3 --version && python3 -m pip --version -# Install the nightly version of torch and torchvision -python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" -python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torchvision==0.22.0.dev20241215" - -# Skip the new torch installation during build since we are using the specified version +# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile python3 use_existing_torch.py # Try building the docker image diff --git a/Dockerfile b/Dockerfile index 2a8f714f41515..ccd6e5064f8cb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,16 +45,16 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. 
-f1,2)/compat/ WORKDIR /workspace # install build and runtime dependencies -COPY requirements-common.txt requirements-common.txt -COPY requirements-cuda.txt requirements-cuda.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install -r requirements-cuda.txt - +# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten by the requirements RUN --mount=type=cache,target=/root/.cache/pip \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" \ && python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torchvision==0.22.0.dev20241215"; \ fi +COPY requirements-common.txt requirements-common.txt +COPY requirements-cuda.txt requirements-cuda.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install -r requirements-cuda.txt # cuda arch list used by torch # can be useful for both `dev` and `test` diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index 03c79a18d7765..574081b112a59 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -45,19 +45,20 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- .. note:: - Multiple modules must be compiled, so this process can take a while. + Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=` + flags to speed up build process. However, ensure your 'max_jobs' is substantially larger than 'nvcc_threads' to get the most benefits. + Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). .. code-block:: console # Example of building on Nvidia GH200 server. 
(Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 7.26GB) - # Note: You should download the torch and torchvision wheels from the PyTorch nightly site, and run the use_existing_torch.py script to skip the download of other torch wheels during the build. - $ python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" - $ python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torchvision==0.22.0.dev20241215" $ python3 use_existing_torch.py $ DOCKER_BUILDKIT=1 sudo docker build . \ --target vllm-openai \ --platform "linux/arm64" \ -t vllm/vllm-gh200-openai:latest \ + --build-arg max_jobs=66 \ + --build-arg nvcc_threads=2 \ --build-arg torch_cuda_arch_list="9.0+PTX" \ --build-arg vllm_fa_cmake_gpu_arches="90-real" diff --git a/requirements-common.txt b/requirements-common.txt index 1c935303c8d79..c2dea20005a5d 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -19,7 +19,7 @@ pillow # Required for image processing prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer >= 0.10.9, < 0.11 -outlines == 0.1.11 +outlines == 0.1.11 # Requires pytorch lark == 1.2.2 xgrammar >= 0.1.6; platform_machine == "x86_64" typing_extensions >= 4.10 From fb1759e3a3e1f77a41a0702602b70ff82ff282fd Mon Sep 17 00:00:00 2001 From: cenzhiyao <2523403608@qq.com> Date: Wed, 18 Dec 2024 22:24:41 +0800 Subject: [PATCH 15/25] update vllm-gh200-openai Image size: 3.23GB --- docs/source/serving/deploying_with_docker.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index 574081b112a59..7aceb49463449 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -51,7 +51,7 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- .. 
code-block:: console - # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 7.26GB) + # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 3.23GB) $ python3 use_existing_torch.py $ DOCKER_BUILDKIT=1 sudo docker build . \ --target vllm-openai \ From 62c8e29905ab4de046451514064beda6ad08197a Mon Sep 17 00:00:00 2001 From: cenzhiyao <2523403608@qq.com> Date: Thu, 19 Dec 2024 13:41:19 +0800 Subject: [PATCH 16/25] add debug "python3 -m pip list | grep torch || true" && compressed-tensors requires pytorch --- Dockerfile | 39 ++++++++++++++++++++++++++++++++++----- requirements-common.txt | 2 +- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index ccd6e5064f8cb..f348c19a1807e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,17 +45,26 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ WORKDIR /workspace # install build and runtime dependencies -# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten by the requirements +# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip list | grep torch || true + RUN --mount=type=cache,target=/root/.cache/pip \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" \ - && python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torchvision==0.22.0.dev20241215"; \ + python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ fi + +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip list | grep torch || true + COPY 
requirements-common.txt requirements-common.txt COPY requirements-cuda.txt requirements-cuda.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-cuda.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip list | grep torch || true + # cuda arch list used by torch # can be useful for both `dev` and `test` # explicitly set the list to avoid issues with torch 2.2 @@ -120,6 +129,9 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip list | grep torch || true + # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py # Default max size of the wheel is 250MB @@ -152,8 +164,6 @@ WORKDIR /vllm-workspace ENV DEBIAN_FRONTEND=noninteractive ARG TARGETPLATFORM -COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt - RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment @@ -178,11 +188,23 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ # or future versions of triton. RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ +# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten +RUN --mount=type=cache,target=/root/.cache/pip \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ + fi + +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip list | grep torch || true + # Install vllm wheel first, so that torch etc will be installed. 
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install dist/*.whl --verbose +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip list | grep torch || true + RUN --mount=type=cache,target=/root/.cache/pip \ . /etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ @@ -226,6 +248,9 @@ RUN mv vllm test_docs/ # openai api server alternative FROM vllm-base AS vllm-openai +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip list | grep torch || true + # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ @@ -233,6 +258,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \ else \ pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \ fi + +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip list | grep torch || true + ENV VLLM_USAGE_SOURCE production-docker-image ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/requirements-common.txt b/requirements-common.txt index c2dea20005a5d..e5f6c7e5dbd9d 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -34,5 +34,5 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. 
-compressed-tensors == 0.8.1 # required for compressed-tensors +compressed-tensors == 0.8.1 # required for compressed-tensors, requires pytorch depyf==0.18.0 # required for profiling and debugging torch.compile From dd483d974c750e48d3407f4bc42b96ca016a797e Mon Sep 17 00:00:00 2001 From: cenzhiyao <2523403608@qq.com> Date: Thu, 19 Dec 2024 14:36:38 +0800 Subject: [PATCH 17/25] rm "python3 -m pip list | grep torch || true" && add cloudpickle && update image size --- Dockerfile | 28 ++----------------- docs/source/serving/deploying_with_docker.rst | 2 +- requirements-common.txt | 1 + 3 files changed, 4 insertions(+), 27 deletions(-) diff --git a/Dockerfile b/Dockerfile index f348c19a1807e..ed2d51d90e215 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,26 +45,17 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ WORKDIR /workspace # install build and runtime dependencies -# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip list | grep torch || true - +# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten by the requirements RUN --mount=type=cache,target=/root/.cache/pip \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ fi -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip list | grep torch || true - COPY requirements-common.txt requirements-common.txt COPY requirements-cuda.txt requirements-cuda.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-cuda.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip list | grep torch || true - # cuda arch list used by torch # can be useful for both `dev` and `test` # explicitly set the list 
to avoid issues with torch 2.2 @@ -129,9 +120,6 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip list | grep torch || true - # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py # Default max size of the wheel is 250MB @@ -188,23 +176,17 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ # or future versions of triton. RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ -# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten +# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten during the wheel installation RUN --mount=type=cache,target=/root/.cache/pip \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ fi -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip list | grep torch || true - # Install vllm wheel first, so that torch etc will be installed. RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install dist/*.whl --verbose -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip list | grep torch || true - RUN --mount=type=cache,target=/root/.cache/pip \ . 
/etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ @@ -248,9 +230,6 @@ RUN mv vllm test_docs/ # openai api server alternative FROM vllm-base AS vllm-openai -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip list | grep torch || true - # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ @@ -259,9 +238,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \ pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \ fi -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip list | grep torch || true - ENV VLLM_USAGE_SOURCE production-docker-image ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index 7aceb49463449..93a7df39547e7 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -51,7 +51,7 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- .. code-block:: console - # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 3.23GB) + # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) $ python3 use_existing_torch.py $ DOCKER_BUILDKIT=1 sudo docker build . \ --target vllm-openai \ diff --git a/requirements-common.txt b/requirements-common.txt index e5f6c7e5dbd9d..6ec7bed463caf 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -36,3 +36,4 @@ setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we n einops # Required for Qwen2-VL. 
compressed-tensors == 0.8.1 # required for compressed-tensors, requires pytorch depyf==0.18.0 # required for profiling and debugging torch.compile +cloudpickle # allows pickling lambda functions in model_executor/models/registry.py \ No newline at end of file From 022eaf990797db0f2c76605bd03083ad9ff31ae5 Mon Sep 17 00:00:00 2001 From: cenzhiyao <2523403608@qq.com> Date: Thu, 19 Dec 2024 15:24:07 +0800 Subject: [PATCH 18/25] rm sudo in doc --- docs/source/serving/deploying_with_docker.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index 93a7df39547e7..93cf2eb51ee85 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -53,7 +53,7 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) $ python3 use_existing_torch.py - $ DOCKER_BUILDKIT=1 sudo docker build . \ + $ DOCKER_BUILDKIT=1 docker build . \ --target vllm-openai \ --platform "linux/arm64" \ -t vllm/vllm-gh200-openai:latest \ From c61c57967bfa04c0cf34b811ba3faf56f8a15de4 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 19 Dec 2024 16:30:25 -0800 Subject: [PATCH 19/25] avoid installing python Signed-off-by: youkaichao --- .buildkite/run-gh200-test.sh | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index 2525656ec9839..4fc6d089cc666 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -4,17 +4,6 @@ # It serves a sanity check for compilation and basic model usage. 
set -ex -# Download the python -PYTHON_VERSION=3.12 -apt-get update -y \ - && apt-get update -y \ - && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ - && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ - && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ - && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ - && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ - && python3 --version && python3 -m pip --version - # Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile python3 use_existing_torch.py From 5c056e6f8492484b250becaf822e2a816ba24ed4 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 19 Dec 2024 16:34:54 -0800 Subject: [PATCH 20/25] add comments Signed-off-by: youkaichao --- Dockerfile | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index ed2d51d90e215..0944050f7dfca 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,7 +45,11 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. 
-f1,2)/compat/ WORKDIR /workspace # install build and runtime dependencies -# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten by the requirements + +# arm64 (GH200) build follows the practice of "use existing pytorch" build, +# we need to install torch and torchvision from the nightly builds first, +# pytorch will not appear as a vLLM dependency in all of the following steps +# after this step RUN --mount=type=cache,target=/root/.cache/pip \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ @@ -176,7 +180,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ # or future versions of triton. RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ -# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten during the wheel installation +# arm64 (GH200) build follows the practice of "use existing pytorch" build, +# we need to install torch and torchvision from the nightly builds first, +# pytorch will not appear as a vLLM dependency in all of the following steps +# after this step RUN --mount=type=cache,target=/root/.cache/pip \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ From 383f799c695a201dfe17cbd73b51fc89dfc79f87 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 19 Dec 2024 16:37:19 -0800 Subject: [PATCH 21/25] fix comments Signed-off-by: youkaichao --- requirements-common.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index 6ec7bed463caf..3ac301f0a4a60 100644 --- a/requirements-common.txt +++ b/requirements-common.txt 
@@ -35,5 +35,5 @@ six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that need setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. compressed-tensors == 0.8.1 # required for compressed-tensors, requires pytorch -depyf==0.18.0 # required for profiling and debugging torch.compile -cloudpickle # allows pickling lambda functions in model_executor/models/registry.py \ No newline at end of file +depyf==0.18.0 # required for profiling and debugging with compilation config +cloudpickle # allows pickling lambda functions in model_executor/models/registry.py From fccb98521ab58939e9398f56361396bf6451e782 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 19 Dec 2024 16:59:55 -0800 Subject: [PATCH 22/25] doc change Signed-off-by: youkaichao --- docs/source/serving/deploying_with_docker.rst | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index 93cf2eb51ee85..b62b066bd6108 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -3,6 +3,9 @@ Deploying with Docker ============================ +Use vLLM's Official Docker Image +-------------------------------- + vLLM offers an official Docker image for deployment. The image can be used to run OpenAI compatible server and is available on Docker Hub as `vllm/vllm-openai `_. @@ -24,12 +27,15 @@ The image can be used to run OpenAI compatible server and is available on Docker memory to share data between processes under the hood, particularly for tensor parallel inference. +Building vLLM's Docker Image from Source +---------------------------------------- + You can build and run vLLM from source via the provided `Dockerfile `_. To build vLLM: .. 
code-block:: console - $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 - + $ # optionally specify: --build-arg max_jobs=8 --build-arg nvcc_threads=2 + $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai .. note:: @@ -62,7 +68,10 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- --build-arg torch_cuda_arch_list="9.0+PTX" \ --build-arg vllm_fa_cmake_gpu_arches="90-real" -To run vLLM: +Use the custom-built vLLM Docker image +-------------------------------------- + +To run vLLM with the custom-built Docker image: .. code-block:: console @@ -72,6 +81,8 @@ To run vLLM: --env "HUGGING_FACE_HUB_TOKEN=" \ vllm/vllm-openai +The argument ``vllm/vllm-openai`` specifies the image to run, and should be replaced with the name of the custom-built image (the ``-t`` tag from the build command). + .. note:: - **For `v0.4.1` and `v0.4.2` only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. ``/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable ``VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` . + **For ``v0.4.1``` and ``v0.4.2``` only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. ``/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` is required to be loaded during runtime.
If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable ``VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` . From 67129a77c9de3536223660dc9569e5fa052a103f Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 19 Dec 2024 17:05:13 -0800 Subject: [PATCH 23/25] fix docs Signed-off-by: youkaichao --- docs/source/serving/deploying_with_docker.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index b62b066bd6108..ad89fdb5a5139 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -47,7 +47,7 @@ Building for Arm64/aarch64 -------------------------- A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use -of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64. +of PyTorch Nightly and should be considered **experimental**. Using the flag ``--platform "linux/arm64"`` will attempt to build for arm64. .. note:: @@ -85,4 +85,4 @@ The argument ``vllm/vllm-openai`` specifies the image to run, and should be repl .. note:: - **For ``v0.4.1``` and ``v0.4.2``` only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. ``/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` is required to be loaded during runtime. 
If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable ``VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` . + **For versions 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. ``/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable ``VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` . From f7f54f6347326ab3f861d347025d55d8ec83aaf3 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 19 Dec 2024 17:10:08 -0800 Subject: [PATCH 24/25] fix docs Signed-off-by: youkaichao --- docs/source/serving/deploying_with_docker.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index ad89fdb5a5139..5d5ee4e3e02ce 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -51,7 +51,7 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag ``- .. note:: - Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=` + Multiple modules must be compiled, so this process can take a while. Recommend using ``--build-arg max_jobs=`` & ``--build-arg nvcc_threads=`` flags to speed up build process. However, ensure your 'max_jobs' is substantially larger than 'nvcc_threads' to get the most benefits.
Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). From 7dbe69e1e0f12df34d855c76ff6c170f8f29d5aa Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 19 Dec 2024 17:10:41 -0800 Subject: [PATCH 25/25] fix docs Signed-off-by: youkaichao --- docs/source/serving/deploying_with_docker.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index 5d5ee4e3e02ce..b64eef819cd2e 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -52,7 +52,7 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag ``- .. note:: Multiple modules must be compiled, so this process can take a while. Recommend using ``--build-arg max_jobs=`` & ``--build-arg nvcc_threads=`` - flags to speed up build process. However, ensure your 'max_jobs' is substantially larger than 'nvcc_threads' to get the most benefits. + flags to speed up the build process. However, ensure your ``max_jobs`` is substantially larger than ``nvcc_threads`` to get the most benefit. Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). .. code-block:: console