Commit 4aeda82

therealnaveenkamal authored and ProExpertProg committed

Separate MLAAttention class from Attention (vllm-project#25103)

Signed-off-by: Naveenraj Kamalakannan <therealnaveenkamal@gmail.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Signed-off-by: 0xrushi <6279035+0xrushi@users.noreply.github.com>
1 parent e60ed60 commit 4aeda82

File tree

10 files changed: +502 −163 lines changed


vllm/attention/backends/abstract.py

Lines changed: 26 additions & 0 deletions

@@ -6,6 +6,7 @@

 import torch

+from vllm.model_executor.layers.linear import ColumnParallelLinear
 from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey


@@ -184,6 +185,31 @@ def fused_output_quant_supported(self, quant_key: QuantKey):


 class MLAAttentionImpl(AttentionImpl[T], Generic[T]):
+    @abstractmethod
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: Optional[list[float]],
+        sliding_window: Optional[int],
+        kv_cache_dtype: str,
+        logits_soft_cap: Optional[float],
+        attn_type: str,
+        kv_sharing_target_layer_name: Optional[str],
+        # MLA Specific Arguments
+        q_lora_rank: Optional[int],
+        kv_lora_rank: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        qk_head_dim: int,
+        v_head_dim: int,
+        kv_b_proj: ColumnParallelLinear,
+        indexer: Optional[object] = None,
+    ) -> None:
+        raise NotImplementedError
+
     @abstractmethod
     def forward(
         self,
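For context, the sketch below illustrates the pattern this commit establishes: MLA-specific constructor arguments (low-rank and RoPE head dimensions) live on a separate abstract interface, so MLA backends declare them up front while generic attention backends are untouched. This is a standalone, simplified mirror of the structure, not vllm's actual code: the ToyMLABackend class, the trimmed argument list, and the example dimensions are hypothetical.

# Standalone sketch of the interface split this commit introduces: it mirrors
# the shape of vllm's AttentionImpl / MLAAttentionImpl split but is NOT vllm
# code, and ToyMLABackend plus the example dimensions are hypothetical.
from abc import ABC, abstractmethod
from typing import Generic, Optional, TypeVar

T = TypeVar("T")  # stands in for a backend-specific attention-metadata type


class AttentionImpl(ABC, Generic[T]):
    """Generic attention backend interface."""

    @abstractmethod
    def forward(self, query, key, value, kv_cache, attn_metadata: T):
        ...


class MLAAttentionImpl(AttentionImpl[T], Generic[T]):
    """MLA backends additionally declare their low-rank / RoPE dimensions
    up front, via an abstract __init__ (simplified argument list here)."""

    @abstractmethod
    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        *,
        q_lora_rank: Optional[int],
        kv_lora_rank: int,
        qk_nope_head_dim: int,
        qk_rope_head_dim: int,
        v_head_dim: int,
    ) -> None:
        raise NotImplementedError


class ToyMLABackend(MLAAttentionImpl[dict]):
    """Hypothetical concrete MLA backend that only records its shapes."""

    def __init__(self, num_heads, head_size, scale, *, q_lora_rank,
                 kv_lora_rank, qk_nope_head_dim, qk_rope_head_dim, v_head_dim):
        self.num_heads = num_heads
        self.kv_lora_rank = kv_lora_rank
        # Full query head dim is the "nope" part plus the RoPE part.
        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim

    def forward(self, query, key, value, kv_cache, attn_metadata):
        raise NotImplementedError("shape bookkeeping only in this sketch")


impl = ToyMLABackend(
    num_heads=16, head_size=576, scale=192 ** -0.5,
    q_lora_rank=None, kv_lora_rank=512,
    qk_nope_head_dim=128, qk_rope_head_dim=64, v_head_dim=128,
)
print(impl.qk_head_dim)  # 192

In the actual diff above, the abstract __init__ also carries the generic attention arguments (num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, and so on) plus kv_b_proj and an optional indexer, so concrete MLA backends in vllm must accept that full list.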
