[ROCm] [Hardware][AMD] Remove xformer patches and ray issue fix #3558

Closed · wants to merge 5 commits
8 changes: 3 additions & 5 deletions Dockerfile.rocm
@@ -14,7 +14,7 @@ RUN echo "Base image is $BASE_IMAGE"
ARG FA_GFX_ARCHS="gfx90a;gfx942"
RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"

-ARG FA_BRANCH="3d2b6f5"
+ARG FA_BRANCH="ae7928c"
RUN echo "FA_BRANCH is $FA_BRANCH"

# whether to build flash-attention
@@ -98,18 +98,16 @@ RUN if [ "$BUILD_CUPY" = "1" ]; then \
COPY ./ /app/vllm

RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install xformers==0.0.23 --no-deps
+RUN python3 -m pip install xformers --no-deps

RUN cd /app \
&& cd vllm \
&& pip install -U -r requirements-rocm.txt \
-    && if [ "$BUILD_FA" = "1" ]; then \
-    bash patch_xformers.rocm.sh; fi \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
&& python3 setup.py install \
&& cd ..

RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir ray[all]
+RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3

CMD ["/bin/bash"]
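
Note on the hunk above: xformers is now installed unpinned (still with --no-deps), and ray[all] is pinned to 2.9.3. A quick sanity check that can be run with python3 inside the built image; only ray's 2.9.3 is guaranteed by this Dockerfile, while the xformers version is deliberately left floating:

    import importlib.metadata as md

    # ray should report exactly 2.9.3 per the pin above; xformers is unpinned,
    # so we only confirm that some version is installed at all.
    for pkg in ("xformers", "ray"):
        try:
            print(pkg, md.version(pkg))
        except md.PackageNotFoundError:
            print(pkg, "is not installed")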
33 changes: 0 additions & 33 deletions patch_xformers.rocm.sh

This file was deleted.

13 changes: 0 additions & 13 deletions rocm_patch/commonpy_xformers-0.0.23.rocm.patch

This file was deleted.

152 changes: 0 additions & 152 deletions rocm_patch/flashpy_xformers-0.0.23.rocm.patch

This file was deleted.

7 changes: 5 additions & 2 deletions vllm/model_executor/input_metadata.py
@@ -2,7 +2,6 @@
from typing import Optional, List, Any, Dict

import torch
-from xformers.ops.fmha.attn_bias import AttentionBias


@dataclass
@@ -82,7 +81,11 @@ def __post_init__(self):
# when alibi slopes is used. It is because of the limitation
# from xformer API.
# will not appear in the __repr__ and __init__
-self.attn_bias: Optional[List[AttentionBias]] = None
+try:
+    from xformers.ops.fmha.attn_bias import AttentionBias
+    self.attn_bias: Optional[List[AttentionBias]] = None
+except ImportError:
+    self.attn_bias = None

# Cuda graph is only used for decoding now.
if self.use_cuda_graph:
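The hunk above defers the xformers import so this module can still be imported on ROCm builds that ship without xformers. A minimal self-contained sketch of the same deferred-import pattern; the class and field names here are simplified placeholders, not vLLM's:

    from typing import Any, List, Optional

    class MetadataSketch:
        def __init__(self) -> None:
            # Import lazily so constructing the object works even when
            # xformers is absent (e.g. a ROCm image built without it).
            try:
                from xformers.ops.fmha.attn_bias import AttentionBias
                self.attn_bias: Optional[List[AttentionBias]] = None
            except ImportError:
                self.attn_bias: Optional[List[Any]] = None
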
4 changes: 0 additions & 4 deletions vllm/model_executor/layers/attention/attention.py
@@ -7,7 +7,6 @@

from vllm.logger import init_logger
from vllm.model_executor.input_metadata import InputMetadata
-from vllm.utils import is_hip

logger = init_logger(__name__)

@@ -67,9 +66,6 @@ def _use_flash_attn() -> bool:
logger.info("flash_attn is not found. Using xformers backend.")
return False

-if is_hip():
-    # AMD GPUs.
-    return False
if torch.cuda.get_device_capability()[0] < 8:
# Volta and Turing NVIDIA GPUs.
logger.info("flash_attn is not supported on Turing or older GPUs. "
41 changes: 28 additions & 13 deletions vllm/model_executor/layers/attention/backends/flash_attn.py
@@ -7,6 +7,7 @@
from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.attention.ops.paged_attn import (
PagedAttentionImpl)
+from vllm.utils import is_hip


class FlashAttentionBackend:
@@ -99,19 +100,33 @@ def forward(
# normal attention
# When block_tables are not filled, it means q and k are the
# prompt, and they have the same length.
-output = flash_attn_varlen_func(
-    q=query,
-    k=key,
-    v=value,
-    cu_seqlens_q=input_metadata.seq_start_loc,
-    cu_seqlens_k=input_metadata.seq_start_loc,
-    max_seqlen_q=input_metadata.max_seq_len,
-    max_seqlen_k=input_metadata.max_seq_len,
-    softmax_scale=self.scale,
-    causal=True,
-    window_size=self.sliding_window,
-    alibi_slopes=self.alibi_slopes,
-)
+if is_hip():
+    # window_size and alibi_slopes not supported
+    output = flash_attn_varlen_func(
+        q=query,
+        k=key,
+        v=value,
+        cu_seqlens_q=input_metadata.seq_start_loc,
+        cu_seqlens_k=input_metadata.seq_start_loc,
+        max_seqlen_q=input_metadata.max_seq_len,
+        max_seqlen_k=input_metadata.max_seq_len,
+        softmax_scale=self.scale,
+        causal=True,
+    )
+else:
+    output = flash_attn_varlen_func(
+        q=query,
+        k=key,
+        v=value,
+        cu_seqlens_q=input_metadata.seq_start_loc,
+        cu_seqlens_k=input_metadata.seq_start_loc,
+        max_seqlen_q=input_metadata.max_seq_len,
+        max_seqlen_k=input_metadata.max_seq_len,
+        softmax_scale=self.scale,
+        causal=True,
+        window_size=self.sliding_window,
+        alibi_slopes=self.alibi_slopes,
+    )
else:
# prefix-enabled attention
output = PagedAttentionImpl.forward_prefix(
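The added branch above calls flash_attn_varlen_func twice, and the two calls differ only in window_size and alibi_slopes, which the ROCm flash-attention build does not accept. An equivalent, more compact shape (a sketch only, not what this PR does; the helper name is hypothetical) would build the optional keyword arguments conditionally:

    from typing import Any, Dict

    def _extra_flash_attn_kwargs(on_rocm: bool, sliding_window: Any,
                                 alibi_slopes: Any) -> Dict[str, Any]:
        # Hypothetical helper: window_size and alibi_slopes are only passed
        # on CUDA, since the ROCm flash-attention build rejects them.
        if on_rocm:
            return {}
        return {"window_size": sliding_window, "alibi_slopes": alibi_slopes}

The forward pass could then make a single flash_attn_varlen_func call and splat in _extra_flash_attn_kwargs(is_hip(), self.sliding_window, self.alibi_slopes).
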
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/attention/backends/xformers.py
@@ -57,6 +57,8 @@ def __init__(
f"Supported head sizes are: {suppored_head_sizes}.")

self.use_ref_attention = _check_use_ref_attention()
+if self.use_ref_attention:
+    print("ref attention used.")

def forward(
self,
@@ -119,7 +121,6 @@ def forward(
value.shape[-1])

if self.use_ref_attention:
-print("ref attention used.")
output = torch.empty_like(query)
start = 0
for _, prompt_len in enumerate(input_metadata.prompt_lens):
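
The two hunks above move the "ref attention used." notice from the forward pass into __init__, so it is emitted once when the backend is constructed rather than on every call. A small sketch (not part of this diff) of routing that one-time notice through vLLM's init_logger, which this PR already touches in attention.py above, instead of a bare print; the helper function is assumed for illustration only:

    from vllm.logger import init_logger

    logger = init_logger(__name__)

    def announce_ref_attention(use_ref_attention: bool) -> None:
        # In the real class this would sit in __init__ right after
        # self.use_ref_attention is assigned.
        if use_ref_attention:
            logger.info("ref attention used.")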