
Commit cdc1fa1

Remove unused kwargs from model definitions (#13555)
1 parent f61528d commit cdc1fa1


104 files changed: +436, -1654 lines

docs/source/contributing/model/basic.md

Lines changed: 0 additions & 2 deletions

@@ -74,8 +74,6 @@ def forward(
     self,
     input_ids: torch.Tensor,
     positions: torch.Tensor,
-    kv_caches: List[torch.Tensor],
-    attn_metadata: AttentionMetadata,
 ) -> torch.Tensor:
     ...
 ```
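
To illustrate what the updated documentation now describes, here is a minimal sketch of the post-change `forward` signature; the class name and body are illustrative placeholders, not code from this commit:

```python
import torch
from torch import nn


class MyModelForCausalLM(nn.Module):  # hypothetical example class
    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
    ) -> torch.Tensor:
        # kv_caches and attn_metadata are no longer threaded through the model;
        # attention layers read them from vLLM's forward context instead.
        ...
```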

docs/source/contributing/model/multimodal.md

Lines changed: 0 additions & 2 deletions

@@ -16,8 +16,6 @@ Further update the model as follows:
     self,
     input_ids: torch.Tensor,
     positions: torch.Tensor,
-    kv_caches: List[torch.Tensor],
-    attn_metadata: AttentionMetadata,
 +   pixel_values: torch.Tensor,
 ) -> SamplerOutput:
 ```

tests/kernels/test_encoder_decoder_attn.py

Lines changed: 3 additions & 11 deletions

@@ -644,11 +644,7 @@ def _run_encoder_attention_test(
         # is shaped as [num_tokens, hidden_size] and we can skip the reshape.
         reshaped_query = packed_qkv.query.view(
             -1, test_pt.num_heads * test_pt.head_size)
-        return attn.forward(
-            reshaped_query, packed_qkv.key, packed_qkv.value,
-            torch.tensor([],
-                         dtype=torch.float32,
-                         device=packed_qkv.query.device), attn_metadata)
+        return attn.forward(reshaped_query, packed_qkv.key, packed_qkv.value)


 def _run_decoder_self_attention_test(
@@ -682,7 +678,6 @@ def _run_decoder_self_attention_test(
     & attn_metadata
     '''
     attn = test_rsrcs.attn
-    kv_cache = test_rsrcs.kv_cache
     packed_qkv = decoder_test_params.packed_qkvo.packed_qkv
     assert packed_qkv is not None
     with set_forward_context(attn_metadata, vllm_config):
@@ -695,8 +690,7 @@ def _run_decoder_self_attention_test(
         # is shaped as [num_tokens, hidden_size] and we can skip the reshape.
         reshaped_query = packed_qkv.query.view(
             -1, test_pt.num_heads * test_pt.head_size)
-        return attn.forward(reshaped_query, packed_qkv.key, packed_qkv.value,
-                            kv_cache, attn_metadata)
+        return attn.forward(reshaped_query, packed_qkv.key, packed_qkv.value)


 def _run_encoder_decoder_cross_attention_test(
@@ -744,7 +738,6 @@ def _run_encoder_decoder_cross_attention_test(
     assert decoder_test_params.packed_qkvo.packed_qkv is not None

     attn = test_rsrcs.attn
-    kv_cache = test_rsrcs.kv_cache
     if cross_test_params is None:
         key = None
         value = None
@@ -762,8 +755,7 @@ def _run_encoder_decoder_cross_attention_test(
         # is shaped as [num_tokens, hidden_size] and we can skip the reshape.
         reshaped_query = decoder_test_params.packed_qkvo.packed_qkv.query.view(
             -1, test_pt.num_heads * test_pt.head_size)
-        return attn.forward(reshaped_query, key, value, kv_cache,
-                            attn_metadata)
+        return attn.forward(reshaped_query, key, value)


 @pytest.fixture(autouse=True)
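
The test edits show the new calling convention: metadata is installed via `set_forward_context`, so `attn.forward` now takes only the query, key, and value tensors. A hedged fragment of that pattern, reusing names from the test above (the fixtures that build `attn`, `attn_metadata`, `vllm_config`, and the packed tensors are assumed):

```python
from vllm.forward_context import set_forward_context

# attn, attn_metadata, vllm_config, reshaped_query and packed_qkv are assumed
# to be constructed by the surrounding test fixtures, as in the diff above.
with set_forward_context(attn_metadata, vllm_config):
    output = attn.forward(reshaped_query, packed_qkv.key, packed_qkv.value)
```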

vllm/attention/layer.py

Lines changed: 7 additions & 12 deletions

@@ -7,7 +7,7 @@
 import torch.nn.functional as F

 import vllm.envs as envs
-from vllm.attention import AttentionMetadata, AttentionType
+from vllm.attention import AttentionType
 from vllm.attention.selector import backend_name_to_enum, get_attn_backend
 from vllm.config import CacheConfig, get_current_vllm_config
 from vllm.forward_context import ForwardContext, get_forward_context
@@ -153,15 +153,10 @@ def forward(
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
-        # NOTE: please avoid accessing `kv_cache` and `attn_metadata` arguments
-        # directly, use `self.kv_cache` and
-        # `get_forward_context().attn_metadata` instead.
         if self.calculate_kv_scales:
-            ctx_attn_metadata = get_forward_context().attn_metadata
-            if ctx_attn_metadata.enable_kv_scales_calculation:
+            attn_metadata = get_forward_context().attn_metadata
+            if attn_metadata.enable_kv_scales_calculation:
                 self.calc_kv_scales(key, value)
         if self.use_output:
             output = torch.empty_like(query)
@@ -177,14 +172,14 @@ def forward(
             value = value.view(-1, self.num_kv_heads, self.head_size)
             if self.use_direct_call:
                 forward_context: ForwardContext = get_forward_context()
-                ctx_attn_metadata = forward_context.attn_metadata
+                attn_metadata = forward_context.attn_metadata
                 self_kv_cache = self.kv_cache[forward_context.virtual_engine]
                 self.impl.forward(self,
                                   query,
                                   key,
                                   value,
                                   self_kv_cache,
-                                  ctx_attn_metadata,
+                                  attn_metadata,
                                   output=output)
             else:
                 torch.ops.vllm.unified_attention_with_output(
@@ -193,10 +188,10 @@
         else:
             if self.use_direct_call:
                 forward_context = get_forward_context()
-                ctx_attn_metadata = forward_context.attn_metadata
+                attn_metadata = forward_context.attn_metadata
                 self_kv_cache = self.kv_cache[forward_context.virtual_engine]
                 return self.impl.forward(self, query, key, value,
-                                         self_kv_cache, ctx_attn_metadata)
+                                         self_kv_cache, attn_metadata)
             else:
                 return torch.ops.vllm.unified_attention(
                     query, key, value, self.layer_name)
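
Put differently, the layer now resolves its KV cache and attention metadata internally rather than receiving them as arguments. A simplified sketch of that lookup, not the full `Attention.forward` implementation (the stand-in class name is illustrative):

```python
import torch

from vllm.forward_context import get_forward_context


class AttentionSketch:  # illustrative stand-in for vllm.attention.layer.Attention
    def forward(self, query: torch.Tensor, key: torch.Tensor,
                value: torch.Tensor) -> torch.Tensor:
        forward_context = get_forward_context()
        # Metadata comes from the forward context; the KV cache is owned by
        # the layer and selected per virtual engine.
        attn_metadata = forward_context.attn_metadata
        self_kv_cache = self.kv_cache[forward_context.virtual_engine]
        return self.impl.forward(self, query, key, value, self_kv_cache,
                                 attn_metadata)
```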

vllm/model_executor/layers/mamba/mamba_mixer.py

Lines changed: 3 additions & 2 deletions

@@ -7,6 +7,7 @@
 from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.distributed.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.forward_context import get_forward_context
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -130,14 +131,14 @@ def A_weight_loader(param: Parameter, loaded_weight: torch.Tensor):
         ) if use_rms_norm else None

     def forward_native(self, hidden_states: torch.Tensor,
-                       attn_metadata: AttentionMetadata,
                        conv_state: torch.Tensor, ssm_state: torch.Tensor):
         pass

     def forward_cuda(self, hidden_states: torch.Tensor,
-                     attn_metadata: AttentionMetadata,
                      mamba_cache_params: MambaCacheParams):

+        attn_metadata: AttentionMetadata = get_forward_context().attn_metadata
+
         # 1. Gated MLP's linear projection
         projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1)
         hidden_states, gate = projected_states.chunk(2, dim=-2)

vllm/model_executor/layers/mamba/mamba_mixer2.py

Lines changed: 2 additions & 2 deletions

@@ -14,6 +14,7 @@
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
     tensor_model_parallel_all_reduce)
+from vllm.forward_context import get_forward_context
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
@@ -376,17 +377,16 @@ def __init__(self,
                                     eps=rms_norm_eps)

     def forward_native(self, hidden_states: torch.Tensor,
-                       attn_metadata: AttentionMetadata,
                        conv_state: torch.Tensor, ssm_state: torch.Tensor):
         pass

     def forward_cuda(
         self,
         hidden_states: torch.Tensor,
-        attn_metadata: AttentionMetadata,
         mamba_cache_params: MambaCacheParams,
         sequence_idx: Optional[torch.Tensor] = None,
     ):
+        attn_metadata: AttentionMetadata = get_forward_context().attn_metadata

         seq_len, _ = hidden_states.shape
         groups_time_state_size = self.n_groups * self.ssm_state_size

vllm/model_executor/models/adapters.py

Lines changed: 1 addition & 5 deletions

@@ -160,7 +160,6 @@ def as_classification_model(cls: _T) -> _T:
         return cls

     # Lazy import
-    from vllm.attention import AttentionMetadata
     from vllm.config import VllmConfig
     from vllm.model_executor.layers.linear import RowParallelLinear
     from vllm.model_executor.layers.pooler import PoolingType
@@ -201,13 +200,10 @@ def forward(
             self,
             input_ids: torch.Tensor,
             positions: torch.Tensor,
-            kv_caches: list[torch.Tensor],
-            attn_metadata: AttentionMetadata,
             intermediate_tensors: Optional[IntermediateTensors] = None,
             inputs_embeds: Optional[torch.Tensor] = None,
         ) -> torch.Tensor:
-            hidden_states = super().forward(input_ids, positions, kv_caches,
-                                            attn_metadata,
+            hidden_states = super().forward(input_ids, positions,
                                             intermediate_tensors,
                                             inputs_embeds)
             logits, _ = self.score(hidden_states)

vllm/model_executor/models/arctic.py

Lines changed: 5 additions & 19 deletions

@@ -5,7 +5,7 @@
 import torch
 from torch import nn

-from vllm.attention import Attention, AttentionMetadata
+from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
@@ -283,13 +283,11 @@ def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
-        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
+        attn_output = self.attn(q, k, v)
         output, _ = self.o_proj(attn_output)
         return output

@@ -336,16 +334,12 @@ def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
         residual_input = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
-            kv_cache=kv_cache,
-            attn_metadata=attn_metadata,
         )
         hidden_states = residual_input + hidden_states

@@ -400,8 +394,6 @@ def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
         intermediate_tensors: Optional[IntermediateTensors],
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, IntermediateTensors]:
@@ -413,11 +405,8 @@ def forward(
         else:
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
-        for i in range(self.start_layer, self.end_layer):
-            layer = self.layers[i]
-            hidden_states = layer(positions, hidden_states,
-                                  kv_caches[i - self.start_layer],
-                                  attn_metadata)
+        for layer in self.layers[self.start_layer:self.end_layer]:
+            hidden_states = layer(positions, hidden_states)
         if not get_pp_group().is_last_rank:
             return IntermediateTensors({"hidden_states": hidden_states})
         hidden_states = self.norm(hidden_states)
@@ -458,13 +447,10 @@ def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
         intermediate_tensors: Optional[IntermediateTensors] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, kv_caches,
-                                   attn_metadata, intermediate_tensors,
+        hidden_states = self.model(input_ids, positions, intermediate_tensors,
                                    inputs_embeds)
         return hidden_states
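
The same mechanical edit repeats across the remaining model files: attention calls drop `kv_cache`/`attn_metadata`, and layer loops iterate over the layers directly instead of indexing into `kv_caches`. A hedged sketch of the resulting per-model pattern (class and attribute names are illustrative):

```python
import torch
from torch import nn


class DecoderModelSketch(nn.Module):  # illustrative, not an actual vLLM class
    def forward(self, positions: torch.Tensor,
                hidden_states: torch.Tensor) -> torch.Tensor:
        # Only positions and hidden states are threaded through; each layer's
        # attention module pulls kv_cache/attn_metadata from the forward
        # context on its own.
        for layer in self.layers[self.start_layer:self.end_layer]:
            hidden_states = layer(positions, hidden_states)
        return self.norm(hidden_states)
```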

vllm/model_executor/models/aria.py

Lines changed: 0 additions & 5 deletions

@@ -9,7 +9,6 @@
 from transformers.models.aria.modeling_aria import AriaCrossAttention
 from transformers.models.aria.processing_aria import AriaProcessor

-from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, QuantizationConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.model_executor.layers.activation import get_act_fn
@@ -626,8 +625,6 @@ def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
         intermediate_tensors: Optional[IntermediateTensors] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
         **kwargs: object,
@@ -643,8 +640,6 @@ def forward(
         hidden_states = self.language_model(
             input_ids,
             positions,
-            kv_caches,
-            attn_metadata,
             intermediate_tensors,
             inputs_embeds=inputs_embeds,
         )

vllm/model_executor/models/baichuan.py

Lines changed: 5 additions & 19 deletions

@@ -20,13 +20,13 @@
 # limitations under the License.
 """Inference-only BaiChuan model compatible with HuggingFace weights."""
 import math
-from typing import Iterable, List, Optional, Set, Tuple, Union
+from typing import Iterable, Optional, Set, Tuple, Union

 import torch
 from torch import nn
 from transformers import PretrainedConfig

-from vllm.attention import Attention, AttentionMetadata
+from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
@@ -182,14 +182,12 @@ def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
         qkv, _ = self.W_pack(hidden_states)
         q, k, v = qkv.chunk(chunks=3, dim=-1)
         if self.postion_embedding != "ALIBI":
             q, k = self.rotary_emb(positions, q, k)
-        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
+        attn_output = self.attn(q, k, v)
         output, _ = self.o_proj(attn_output)
         return output

@@ -232,8 +230,6 @@ def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata,
         residual: Optional[torch.Tensor],
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
@@ -246,8 +242,6 @@ def forward(
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
-            kv_cache=kv_cache,
-            attn_metadata=attn_metadata,
         )

         # Fully Connected
@@ -301,8 +295,6 @@ def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
         intermediate_tensors: Optional[IntermediateTensors],
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, IntermediateTensors]:
@@ -316,13 +308,10 @@ def forward(
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
-        for i in range(self.start_layer, self.end_layer):
-            layer = self.layers[i]
+        for layer in self.layers[self.start_layer:self.end_layer]:
             hidden_states, residual = layer(
                 positions,
                 hidden_states,
-                kv_caches[i - self.start_layer],
-                attn_metadata,
                 residual,
             )
         if not get_pp_group().is_last_rank:
@@ -379,13 +368,10 @@ def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
         intermediate_tensors: Optional[IntermediateTensors] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, kv_caches,
-                                   attn_metadata, intermediate_tensors,
+        hidden_states = self.model(input_ids, positions, intermediate_tensors,
                                    inputs_embeds)
         return hidden_states
