From cab47289b53746b1144ffa20b2056b6cd0f8f267 Mon Sep 17 00:00:00 2001
From: miloice
Date: Wed, 13 Dec 2023 06:53:08 +0000
Subject: [PATCH] upgrade xformers version dependency for ROCm; update documentations

---
 Dockerfile.rocm                               |  4 +-
 .../getting_started/amd-installation.rst      | 16 ++--
 ...7.rocm.sh => patch_xformers-0.0.23.rocm.sh |  8 +-
 requirements-rocm.txt                         |  1 -
 ...ch => commonpy_xformers-0.0.23.rocm.patch} |  0
 ...tch => flashpy_xformers-0.0.23.rocm.patch} | 96 +++++++++++--------
 6 files changed, 71 insertions(+), 54 deletions(-)
 rename patch_xformers-0.0.22.post7.rocm.sh => patch_xformers-0.0.23.rocm.sh (83%)
 rename rocm_patch/{commonpy_xformers-0.0.22.post7.rocm.patch => commonpy_xformers-0.0.23.rocm.patch} (100%)
 rename rocm_patch/{flashpy_xformers-0.0.22.post7.rocm.patch => flashpy_xformers-0.0.23.rocm.patch} (61%)

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 7af53e4472e33..e5c4e051fcab7 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -47,12 +47,12 @@ RUN mkdir libs \
 COPY ./ /app/vllm
 
 RUN python3 -m pip install --upgrade pip
-RUN pip install xformers==0.0.22.post7 --no-deps
+RUN pip install xformers==0.0.23 --no-deps
 
 RUN cd /app \
     && cd vllm \
     && pip install -U -r requirements-rocm.txt \
-    && bash patch_xformers-0.0.22.post7.rocm.sh \
+    && bash patch_xformers-0.0.23.rocm.sh \
     && python3 setup.py install \
     && cd ..
 
diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst
index 512aa051f16d9..14ece960cd233 100644
--- a/docs/source/getting_started/amd-installation.rst
+++ b/docs/source/getting_started/amd-installation.rst
@@ -3,7 +3,7 @@
 Installation with ROCm
 ======================
 
-vLLM 0.2.x onwards supports model inferencing and serving on AMD GPUs with ROCm.
+vLLM 0.2.4 onwards supports model inferencing and serving on AMD GPUs with ROCm.
 At the moment AWQ quantization is not supported in ROCm, but SqueezeLLM quantization has been ported.
 Data types currently supported in ROCm are FP16 and BF16.
 
@@ -29,7 +29,7 @@ Installation options:
 
 .. code-block:: console
 
-    $ docker pull embeddedllminfo/vllm-rocm:vllm-v0.2.3
+    $ docker pull embeddedllminfo/vllm-rocm:vllm-v0.2.4
     $ docker run -it \
        --network=host \
        --group-add=video \
@@ -70,12 +70,12 @@ You can build and install vLLM from source:
     - ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention.
     - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
 
-2. Setup `xformers==0.0.22.post7` without dependencies, and apply patches to adapt for ROCm flash attention
+2. Setup `xformers==0.0.23` without dependencies, and apply patches to adapt for ROCm flash attention
 
 .. code-block:: console
 
-   $ pip install xformers==0.0.22.post7 --no-deps
-   $ bash patch_xformers-0.0.22.post7.rocm.sh
+   $ pip install xformers==0.0.23 --no-deps
+   $ bash patch_xformers-0.0.23.rocm.sh
 
 3. Build vLLM.
 
@@ -127,12 +127,12 @@ Alternatively, if you plan to install vLLM-ROCm on a local machine or start from
     - ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention.
     - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
 
-2. Setup `xformers==0.0.22.post7` without dependencies, and apply patches to adapt for ROCm flash attention
+2. Setup `xformers==0.0.23` without dependencies, and apply patches to adapt for ROCm flash attention
 
 .. code-block:: console
 
-   $ pip install xformers==0.0.22.post7 --no-deps
-   $ bash patch_xformers-0.0.22.post7.rocm.sh
+   $ pip install xformers==0.0.23 --no-deps
+   $ bash patch_xformers-0.0.23.rocm.sh
 
 3. Build vLLM.
 
diff --git a/patch_xformers-0.0.22.post7.rocm.sh b/patch_xformers-0.0.23.rocm.sh
similarity index 83%
rename from patch_xformers-0.0.22.post7.rocm.sh
rename to patch_xformers-0.0.23.rocm.sh
index 8dcb6240f0a44..4ba3a00fdd719 100644
--- a/patch_xformers-0.0.22.post7.rocm.sh
+++ b/patch_xformers-0.0.23.rocm.sh
@@ -5,17 +5,17 @@ export XFORMERS_FMHA_COMMON_PATH=$(python -c 'from xformers import ops as xops;
 echo $XFORMERS_FMHA_FLASH_PATH
 echo $XFORMERS_FMHA_COMMON_PATH
 
-if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch"; then
+if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-0.0.23.rocm.patch"; then
     echo "Applying patch to ${XFORMERS_FMHA_FLASH_PATH}"
-    patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch"
+    patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-0.0.23.rocm.patch"
     echo "Successfully patch ${XFORMERS_FMHA_FLASH_PATH}"
 else
     echo "${XFORMERS_FMHA_FLASH_PATH} was patched before"
 fi
 
-if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch"; then
+if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-0.0.23.rocm.patch"; then
     echo "Applying patch to ${XFORMERS_FMHA_COMMON_PATH}"
-    patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch"
+    patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-0.0.23.rocm.patch"
     echo "Successfully patch ${XFORMERS_FMHA_COMMON_PATH}"
 else
     echo "${XFORMERS_FMHA_COMMON_PATH} was patched before"
diff --git a/requirements-rocm.txt b/requirements-rocm.txt
index 441f018976a27..81bc19580274c 100644
--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -8,7 +8,6 @@ pyarrow # Required for Ray data.
 sentencepiece # Required for LLaMA tokenizer.
 numpy
 tokenizers>=0.15.0
-huggingface_hub<0.18,>=0.16.4
 transformers >= 4.36.0 # Required for Mixtral.
fastapi uvicorn[standard] diff --git a/rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch b/rocm_patch/commonpy_xformers-0.0.23.rocm.patch similarity index 100% rename from rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch rename to rocm_patch/commonpy_xformers-0.0.23.rocm.patch diff --git a/rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch b/rocm_patch/flashpy_xformers-0.0.23.rocm.patch similarity index 61% rename from rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch rename to rocm_patch/flashpy_xformers-0.0.23.rocm.patch index 4798f1efd461c..ac846728a7a91 100644 --- a/rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch +++ b/rocm_patch/flashpy_xformers-0.0.23.rocm.patch @@ -1,6 +1,6 @@ ---- /opt/conda/envs/py_3.10/lib/python3.10/site-packages/xformers/ops/fmha/flash.py 2023-11-29 03:17:03.930103539 +0000 -+++ flash.py 2023-11-28 16:14:25.206128903 +0000 -@@ -31,39 +31,39 @@ +--- flash_ori.py 2023-12-13 05:43:31.530752623 +0000 ++++ flash_patch.py 2023-12-13 06:00:45.962403104 +0000 +@@ -36,44 +36,44 @@ FLASH_VERSION = "0.0.0" try: @@ -15,9 +15,12 @@ - from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention - - FLASH_VERSION = flash_attn.__version__ -- flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:2]) -- if flash_ver_parsed < (2, 3): -- raise ImportError("Requires 2.3 for sliding window support") +- flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3]) +- if ( +- flash_ver_parsed != (2, 3, 6) +- and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1" +- ): +- raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api") + #try: + # from ... import _C_flashattention # type: ignore[attr-defined] + # from ..._cpp_lib import _build_metadata @@ -29,35 +32,41 @@ + from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention + + FLASH_VERSION = flash_attn.__version__ -+ # flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:2]) -+ # if flash_ver_parsed < (2, 3): -+ # raise ImportError("Requires 2.3 for sliding window support") ++ # flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3]) ++ # if ( ++ # flash_ver_parsed != (2, 3, 6) ++ # and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1" ++ # ): ++ # raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api") # create library so that flash-attn goes through the PyTorch Dispatcher - _flash_lib = torch.library.Library("xformers_flash", "DEF") -+ #_flash_lib = torch.library.Library("xformers_flash", "DEF") - +- - _flash_lib.define( - "flash_fwd(Tensor query, Tensor key, Tensor value, " -- "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, " +- "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? 
seqused_k, " - "int max_seqlen_q, int max_seqlen_k, " - "float p, float softmax_scale, " -- "bool is_causal, int window_size, bool return_softmax) -> (Tensor, Tensor, Tensor)" +- "bool is_causal, int window_left, " +- "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)" - ) -- ++ #_flash_lib = torch.library.Library("xformers_flash", "DEF") + - _flash_lib.define( - "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, " - "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, " - "Tensor cu_seqlens_q, Tensor cu_seqlens_k, " - "int max_seqlen_q, int max_seqlen_k, " -- "float p, float softmax_scale, bool is_causal, int window_size, Tensor rng_state) -> (Tensor, Tensor, Tensor)" +- "float p, float softmax_scale, bool is_causal, " +- "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)" - ) + #_flash_lib.define( + # "flash_fwd(Tensor query, Tensor key, Tensor value, " -+ # "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, " ++ # "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? seqused_k, " + # "int max_seqlen_q, int max_seqlen_k, " + # "float p, float softmax_scale, " -+ # "bool is_causal, int window_size, bool return_softmax) -> (Tensor, Tensor, Tensor)" ++ # "bool is_causal, int window_left, " ++ # "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)" + #) + + #_flash_lib.define( @@ -65,52 +74,61 @@ + # "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, " + # "Tensor cu_seqlens_q, Tensor cu_seqlens_k, " + # "int max_seqlen_q, int max_seqlen_k, " -+ # "float p, float softmax_scale, bool is_causal, int window_size, Tensor rng_state) -> (Tensor, Tensor, Tensor)" ++ # "float p, float softmax_scale, bool is_causal, " ++ # "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)" + #) def _flash_fwd( query, -@@ -98,8 +98,8 @@ +@@ -111,8 +111,8 @@ p, softmax_scale, is_causal, -- window_size - 1, # window_size_left -- -1, # window_size_right -+ # window_size - 1, # window_size_left -+ # -1, # window_size_right +- window_left, # window_size_left +- window_right, # window_size_right ++ # window_left, # window_size_left ++ # window_right, # window_size_right return_softmax, None, # rng ) -@@ -127,8 +127,8 @@ +@@ -134,15 +134,15 @@ + out, + cu_seq_lens_q, + cu_seq_lens_k, +- seqused_k, ++ # seqused_k, + max_seq_len_q, + max_seq_len_k, + p, softmax_scale, False, is_causal, -- window_size - 1, # window_size_left -- -1, # window_size_right -+ # window_size - 1, # window_size_left -+ # -1, # window_size_right +- window_left, +- window_right, ++ # window_left, ++ # window_right, return_softmax, None, ) -@@ -169,8 +169,8 @@ +@@ -184,8 +184,8 @@ p, softmax_scale, is_causal, -- window_size - 1, # window_size_left -- -1, # window_size_right -+ # window_size - 1, # window_size_left -+ # -1, # window_size_right +- window_left, +- window_right, ++ # window_left, ++ # window_right, None, rng_state, ) -@@ -193,15 +193,15 @@ +@@ -208,15 +208,15 @@ softmax_scale, False, # zero_tensors is_causal, -- window_size - 1, # window_size_left -- -1, # window_size_right -+ # window_size - 1, # window_size_left -+ # -1, # window_size_right +- window_left, +- window_right, ++ # window_left, ++ # window_right, None, rng_state, ) @@ -123,7 +141,7 @@ except ImportError: pass -@@ -348,7 +348,7 @@ +@@ -400,7 +400,7 @@ implementation. """