From cab47289b53746b1144ffa20b2056b6cd0f8f267 Mon Sep 17 00:00:00 2001
From: miloice
Date: Wed, 13 Dec 2023 06:53:08 +0000
Subject: [PATCH] upgrade xformers version dependency for ROCm; update documentations

---
 Dockerfile.rocm                               |  4 +-
 .../getting_started/amd-installation.rst      | 16 ++--
 ...7.rocm.sh => patch_xformers-0.0.23.rocm.sh |  8 +-
 requirements-rocm.txt                         |  1 -
 ...ch => commonpy_xformers-0.0.23.rocm.patch} |  0
 ...tch => flashpy_xformers-0.0.23.rocm.patch} | 96 +++++++++++--------
 6 files changed, 71 insertions(+), 54 deletions(-)
 rename patch_xformers-0.0.22.post7.rocm.sh => patch_xformers-0.0.23.rocm.sh (83%)
 rename rocm_patch/{commonpy_xformers-0.0.22.post7.rocm.patch => commonpy_xformers-0.0.23.rocm.patch} (100%)
 rename rocm_patch/{flashpy_xformers-0.0.22.post7.rocm.patch => flashpy_xformers-0.0.23.rocm.patch} (61%)

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 7af53e4472e33..e5c4e051fcab7 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -47,12 +47,12 @@ RUN mkdir libs \
 COPY ./ /app/vllm
 
 RUN python3 -m pip install --upgrade pip
-RUN pip install xformers==0.0.22.post7 --no-deps
+RUN pip install xformers==0.0.23 --no-deps
 
 RUN cd /app \
     && cd vllm \
     && pip install -U -r requirements-rocm.txt \
-    && bash patch_xformers-0.0.22.post7.rocm.sh \
+    && bash patch_xformers-0.0.23.rocm.sh \
     && python3 setup.py install \
     && cd ..
 
diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst
index 512aa051f16d9..14ece960cd233 100644
--- a/docs/source/getting_started/amd-installation.rst
+++ b/docs/source/getting_started/amd-installation.rst
@@ -3,7 +3,7 @@
 Installation with ROCm
 ======================
 
-vLLM 0.2.x onwards supports model inferencing and serving on AMD GPUs with ROCm.
+vLLM 0.2.4 onwards supports model inferencing and serving on AMD GPUs with ROCm.
 At the moment AWQ quantization is not supported in ROCm, but SqueezeLLM quantization has been ported.
 Data types currently supported in ROCm are FP16 and BF16.
 
@@ -29,7 +29,7 @@ Installation options:
 
 .. code-block:: console
 
-    $ docker pull embeddedllminfo/vllm-rocm:vllm-v0.2.3
+    $ docker pull embeddedllminfo/vllm-rocm:vllm-v0.2.4
     $ docker run -it \
        --network=host \
        --group-add=video \
@@ -70,12 +70,12 @@ You can build and install vLLM from source:
     - ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention.
     - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
 
-2. Setup `xformers==0.0.22.post7` without dependencies, and apply patches to adapt for ROCm flash attention
+2. Setup `xformers==0.0.23` without dependencies, and apply patches to adapt for ROCm flash attention
 
 .. code-block:: console
 
-   $ pip install xformers==0.0.22.post7 --no-deps
-   $ bash patch_xformers-0.0.22.post7.rocm.sh
+   $ pip install xformers==0.0.23 --no-deps
+   $ bash patch_xformers-0.0.23.rocm.sh
 
 3. Build vLLM.
 
@@ -127,12 +127,12 @@ Alternatively, if you plan to install vLLM-ROCm on a local machine or start from
     - ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention.
     - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
 
-2. Setup `xformers==0.0.22.post7` without dependencies, and apply patches to adapt for ROCm flash attention
+2. Setup `xformers==0.0.23` without dependencies, and apply patches to adapt for ROCm flash attention
 
 .. code-block:: console
 
-   $ pip install xformers==0.0.22.post7 --no-deps
-   $ bash patch_xformers-0.0.22.post7.rocm.sh
+   $ pip install xformers==0.0.23 --no-deps
+   $ bash patch_xformers-0.0.23.rocm.sh
 
 3. Build vLLM.
 
diff --git a/patch_xformers-0.0.22.post7.rocm.sh b/patch_xformers-0.0.23.rocm.sh
similarity index 83%
rename from patch_xformers-0.0.22.post7.rocm.sh
rename to patch_xformers-0.0.23.rocm.sh
index 8dcb6240f0a44..4ba3a00fdd719 100644
--- a/patch_xformers-0.0.22.post7.rocm.sh
+++ b/patch_xformers-0.0.23.rocm.sh
@@ -5,17 +5,17 @@ export XFORMERS_FMHA_COMMON_PATH=$(python -c 'from xformers import ops as xops;
 echo $XFORMERS_FMHA_FLASH_PATH
 echo $XFORMERS_FMHA_COMMON_PATH
 
-if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch"; then
+if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-0.0.23.rocm.patch"; then
     echo "Applying patch to ${XFORMERS_FMHA_FLASH_PATH}"
-    patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch"
+    patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-0.0.23.rocm.patch"
     echo "Successfully patch ${XFORMERS_FMHA_FLASH_PATH}"
 else
     echo "${XFORMERS_FMHA_FLASH_PATH} was patched before"
 fi
 
-if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch"; then
+if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-0.0.23.rocm.patch"; then
     echo "Applying patch to ${XFORMERS_FMHA_COMMON_PATH}"
-    patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch"
+    patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-0.0.23.rocm.patch"
     echo "Successfully patch ${XFORMERS_FMHA_COMMON_PATH}"
 else
     echo "${XFORMERS_FMHA_COMMON_PATH} was patched before"
diff --git a/requirements-rocm.txt b/requirements-rocm.txt
index 441f018976a27..81bc19580274c 100644
--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -8,7 +8,6 @@ pyarrow # Required for Ray data.
 sentencepiece # Required for LLaMA tokenizer.
 numpy
 tokenizers>=0.15.0
-huggingface_hub<0.18,>=0.16.4
 transformers >= 4.36.0 # Required for Mixtral.
fastapi uvicorn[standard] diff --git a/rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch b/rocm_patch/commonpy_xformers-0.0.23.rocm.patch similarity index 100% rename from rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch rename to rocm_patch/commonpy_xformers-0.0.23.rocm.patch diff --git a/rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch b/rocm_patch/flashpy_xformers-0.0.23.rocm.patch similarity index 61% rename from rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch rename to rocm_patch/flashpy_xformers-0.0.23.rocm.patch index 4798f1efd461c..ac846728a7a91 100644 --- a/rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch +++ b/rocm_patch/flashpy_xformers-0.0.23.rocm.patch @@ -1,6 +1,6 @@ ---- /opt/conda/envs/py_3.10/lib/python3.10/site-packages/xformers/ops/fmha/flash.py 2023-11-29 03:17:03.930103539 +0000 -+++ flash.py 2023-11-28 16:14:25.206128903 +0000 -@@ -31,39 +31,39 @@ +--- flash_ori.py 2023-12-13 05:43:31.530752623 +0000 ++++ flash_patch.py 2023-12-13 06:00:45.962403104 +0000 +@@ -36,44 +36,44 @@ FLASH_VERSION = "0.0.0" try: @@ -15,9 +15,12 @@ - from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention - - FLASH_VERSION = flash_attn.__version__ -- flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:2]) -- if flash_ver_parsed < (2, 3): -- raise ImportError("Requires 2.3 for sliding window support") +- flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3]) +- if ( +- flash_ver_parsed != (2, 3, 6) +- and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1" +- ): +- raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api") + #try: + # from ... import _C_flashattention # type: ignore[attr-defined] + # from ..._cpp_lib import _build_metadata @@ -29,35 +32,41 @@ + from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention + + FLASH_VERSION = flash_attn.__version__ -+ # flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:2]) -+ # if flash_ver_parsed < (2, 3): -+ # raise ImportError("Requires 2.3 for sliding window support") ++ # flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3]) ++ # if ( ++ # flash_ver_parsed != (2, 3, 6) ++ # and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1" ++ # ): ++ # raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api") # create library so that flash-attn goes through the PyTorch Dispatcher - _flash_lib = torch.library.Library("xformers_flash", "DEF") -+ #_flash_lib = torch.library.Library("xformers_flash", "DEF") - +- - _flash_lib.define( - "flash_fwd(Tensor query, Tensor key, Tensor value, " -- "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, " +- "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? 
seqused_k, " - "int max_seqlen_q, int max_seqlen_k, " - "float p, float softmax_scale, " -- "bool is_causal, int window_size, bool return_softmax) -> (Tensor, Tensor, Tensor)" +- "bool is_causal, int window_left, " +- "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)" - ) -- ++ #_flash_lib = torch.library.Library("xformers_flash", "DEF") + - _flash_lib.define( - "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, " - "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, " - "Tensor cu_seqlens_q, Tensor cu_seqlens_k, " - "int max_seqlen_q, int max_seqlen_k, " -- "float p, float softmax_scale, bool is_causal, int window_size, Tensor rng_state) -> (Tensor, Tensor, Tensor)" +- "float p, float softmax_scale, bool is_causal, " +- "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)" - ) + #_flash_lib.define( + # "flash_fwd(Tensor query, Tensor key, Tensor value, " -+ # "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, " ++ # "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? seqused_k, " + # "int max_seqlen_q, int max_seqlen_k, " + # "float p, float softmax_scale, " -+ # "bool is_causal, int window_size, bool return_softmax) -> (Tensor, Tensor, Tensor)" ++ # "bool is_causal, int window_left, " ++ # "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)" + #) + + #_flash_lib.define( @@ -65,52 +74,61 @@ + # "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, " + # "Tensor cu_seqlens_q, Tensor cu_seqlens_k, " + # "int max_seqlen_q, int max_seqlen_k, " -+ # "float p, float softmax_scale, bool is_causal, int window_size, Tensor rng_state) -> (Tensor, Tensor, Tensor)" ++ # "float p, float softmax_scale, bool is_causal, " ++ # "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)" + #) def _flash_fwd( query, -@@ -98,8 +98,8 @@ +@@ -111,8 +111,8 @@ p, softmax_scale, is_causal, -- window_size - 1, # window_size_left -- -1, # window_size_right -+ # window_size - 1, # window_size_left -+ # -1, # window_size_right +- window_left, # window_size_left +- window_right, # window_size_right ++ # window_left, # window_size_left ++ # window_right, # window_size_right return_softmax, None, # rng ) -@@ -127,8 +127,8 @@ +@@ -134,15 +134,15 @@ + out, + cu_seq_lens_q, + cu_seq_lens_k, +- seqused_k, ++ # seqused_k, + max_seq_len_q, + max_seq_len_k, + p, softmax_scale, False, is_causal, -- window_size - 1, # window_size_left -- -1, # window_size_right -+ # window_size - 1, # window_size_left -+ # -1, # window_size_right +- window_left, +- window_right, ++ # window_left, ++ # window_right, return_softmax, None, ) -@@ -169,8 +169,8 @@ +@@ -184,8 +184,8 @@ p, softmax_scale, is_causal, -- window_size - 1, # window_size_left -- -1, # window_size_right -+ # window_size - 1, # window_size_left -+ # -1, # window_size_right +- window_left, +- window_right, ++ # window_left, ++ # window_right, None, rng_state, ) -@@ -193,15 +193,15 @@ +@@ -208,15 +208,15 @@ softmax_scale, False, # zero_tensors is_causal, -- window_size - 1, # window_size_left -- -1, # window_size_right -+ # window_size - 1, # window_size_left -+ # -1, # window_size_right +- window_left, +- window_right, ++ # window_left, ++ # window_right, None, rng_state, ) @@ -123,7 +141,7 @@ except ImportError: pass -@@ -348,7 +348,7 @@ +@@ -400,7 +400,7 @@ implementation. """