@@ -25,7 +25,7 @@ class _Backend(enum.Enum):
 def get_attn_backend(dtype: torch.dtype) -> Type[AttentionBackend]:
     backend = _which_attn_to_use(dtype)
     if backend == _Backend.FLASH_ATTN:
-        logger.info("Using FlashAttention backend.")
+        logger.info("Using FlashAttention-2 backend.")
         from vllm.attention.backends.flash_attn import (  # noqa: F401
             FlashAttentionBackend)
         return FlashAttentionBackend
@@ -62,21 +62,21 @@ def _which_attn_to_use(dtype: torch.dtype) -> _Backend:
     # NVIDIA GPUs.
     if torch.cuda.get_device_capability()[0] < 8:
         # Volta and Turing NVIDIA GPUs.
-        logger.info("Cannot use FlashAttention backend for Volta and Turing "
+        logger.info("Cannot use FlashAttention-2 backend for Volta and Turing "
                     "GPUs.")
         return _Backend.XFORMERS
 
     if dtype not in (torch.float16, torch.bfloat16):
-        logger.info("Cannot use FlashAttention backend for dtype other than "
+        logger.info("Cannot use FlashAttention-2 backend for dtype other than "
                     "torch.float16 or torch.bfloat16.")
         return _Backend.XFORMERS
 
     try:
         import flash_attn  # noqa: F401
     except ImportError:
         logger.info(
-            "Cannot use FlashAttention backend because the flash_attn package "
-            "is not found. Please install it for better performance.")
+            "Cannot use FlashAttention-2 backend because the flash_attn "
+            "package is not found. Please install it for better performance.")
         return _Backend.XFORMERS
 
     backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
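A minimal usage sketch of the selector touched by this diff, for context only. It assumes `get_attn_backend` is importable from `vllm.attention.selector` and that the fallback logic is exactly what the hunks above show; neither assumption is confirmed by this commit.

# Hedged sketch: ask the selector which backend it would pick for fp16.
# Assumption: module path vllm.attention.selector is taken from context.
import torch
from vllm.attention.selector import get_attn_backend

# Per the diff above, the selector falls back to xFormers on pre-Ampere GPUs,
# for dtypes other than fp16/bf16, or when the flash_attn package is missing;
# otherwise it returns the FlashAttention-2 backend class.
backend_cls = get_attn_backend(torch.float16)
print(f"Selected attention backend: {backend_cls.__name__}")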