[WIP][1/N] Chunked Prefill #3106

Closed
wants to merge 38 commits into from
Changes from 1 commit
38 commits
06fe872
[1/n] Support efficient reshape caching.
rkooo567 Feb 28, 2024
9a0b6be
[2/n] support flash attention kernel
rkooo567 Feb 28, 2024
6947167
oss flash attention works
rkooo567 Feb 28, 2024
4769a26
in progress
rkooo567 Feb 28, 2024
963db44
flash attn enabled.
rkooo567 Feb 29, 2024
2b9c36b
ip
rkooo567 Feb 29, 2024
2c1bb6c
support every model
rkooo567 Feb 29, 2024
2bb5e62
Fixed broken tests.
rkooo567 Feb 29, 2024
78bb887
ip
rkooo567 Feb 29, 2024
74ac900
seems to work.
rkooo567 Mar 1, 2024
71bdada
.
rkooo567 Mar 1, 2024
d4c3b5d
ip?
rkooo567 Mar 1, 2024
baef7c6
block tables updated correctly
rkooo567 Mar 1, 2024
a12ec68
hopefully tests pass
rkooo567 Mar 1, 2024
0d8785f
Merge branch 'main' into chunked-prefill-3
rkooo567 Mar 3, 2024
08c8541
.
rkooo567 Mar 3, 2024
3bac9af
ip
rkooo567 Mar 3, 2024
31aa920
ip
rkooo567 Mar 4, 2024
2049b35
.
rkooo567 Mar 4, 2024
ef679d7
.
rkooo567 Mar 4, 2024
71bda97
.
rkooo567 Mar 4, 2024
4e00e7f
done?
rkooo567 Mar 4, 2024
7fd70f2
Merge branch 'main' into chunked-prefill-3
rkooo567 Mar 5, 2024
9177d54
Merge branch 'main' into chunked-prefill-3
rkooo567 Mar 6, 2024
c0384a4
Refactor 2d query to 1d query
rkooo567 Mar 6, 2024
6032edf
.,
rkooo567 Mar 6, 2024
c1ab0b0
done
rkooo567 Mar 6, 2024
f48dc72
Addressed code review.
rkooo567 Mar 7, 2024
769b2b4
working
rkooo567 Mar 7, 2024
4a20f4a
Merge branch 'main' into 1dquery
rkooo567 Mar 7, 2024
f7347b8
working
rkooo567 Mar 7, 2024
d931725
Merge branch 'main' into 1dquery
rkooo567 Mar 7, 2024
f91d73e
fix lora
rkooo567 Mar 8, 2024
f7d79da
fixed
rkooo567 Mar 8, 2024
851c018
Merge branch 'main' into 1dquery
rkooo567 Mar 8, 2024
406f1d4
fix
rkooo567 Mar 8, 2024
9442e8f
Merge branch 'main' into chunked-prefill-3
rkooo567 Mar 8, 2024
3da31eb
Merge branch '1dquery' into chunked-prefill-3
rkooo567 Mar 8, 2024
Viewing changes from commit 769b2b491939f9e461486fecd2ac97e80e15eb0c ("working")
rkooo567 committed Mar 7, 2024
9 changes: 9 additions & 0 deletions vllm/model_executor/input_metadata.py
@@ -45,6 +45,13 @@ def __init__(
self.start_loc = start_loc
self.max_context_len = max_context_len
self.slot_mapping = slot_mapping
# Index: The batched sequence's index.
# Value: The length of the attention context.
# NOTE(sang): The definition differs between prefill
# and decoding. For prefill, it is the length of the
# KVs that are already cached, excluding the new KVs.
# For decoding, it includes the new KV.
self.context_lens = context_lens
self.block_tables = block_tables
self.use_cuda_graph = use_cuda_graph
@@ -53,6 +60,8 @@ def __init__(
# Set during the execution of the first attention op.
# FIXME(woosuk): This is a hack.
self.attn_bias = None
# Number of valid tokens (this count includes padding).
# See attention.py for the precise definition.
self.num_valid_tokens = slot_mapping.shape[0]

def __repr__(self) -> str:
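To make the context_lens convention above concrete, here is a small illustrative sketch (not part of the diff; the batch composition and values are made up):

import torch

# Hypothetical batch of three sequences:
#   seq 0: prefill chunk with 16 KVs already cached (new KVs excluded)
#   seq 1: first prefill chunk, nothing cached yet
#   seq 2: decoding step with 40 cached KVs plus the 1 new KV
context_lens = torch.tensor([16, 0, 41], dtype=torch.int32)

# The tensor is indexed by the batched sequence's position,
# so context_lens[2] == 41 refers to the decoding sequence.
assert int(context_lens[2]) == 41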
9 changes: 5 additions & 4 deletions vllm/model_executor/layers/attention.py
@@ -242,7 +242,6 @@ def forward(
else:
# Decoding run.
output = _paged_attention(
output,
query,
key_cache,
value_cache,
@@ -289,15 +288,17 @@ def _make_alibi_bias(


def _paged_attention(
output: torch.Tensor, # [num_tokens, num_heads, head_size]
query: torch.Tensor, # [num_tokens, num_heads, head_size]
key_cache: torch.Tensor,
value_cache: torch.Tensor,
key_cache: torch.Tensor,  # [num_total_blocks, block_size, num_heads, head_size]
value_cache: torch.Tensor,  # [num_total_blocks, block_size, num_heads, head_size]
input_metadata: InputMetadata,
num_kv_heads: int,
scale: float,
alibi_slopes: Optional[torch.Tensor],
) -> torch.Tensor:
output = torch.empty_like(query)
block_size = value_cache.shape[3]
num_seqs, num_heads, head_size = query.shape
max_num_partitions = (
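For reference, a minimal sketch of the new _paged_attention convention, where the output buffer is allocated inside the helper with torch.empty_like(query) instead of being passed in by the caller. The kernel call is elided and the shapes simply follow the comments in the diff; this is not the actual vLLM implementation:

import torch

def _paged_attention_sketch(
    query: torch.Tensor,        # [num_tokens, num_heads, head_size]
    key_cache: torch.Tensor,    # [num_total_blocks, block_size, num_heads, head_size]
    value_cache: torch.Tensor,  # [num_total_blocks, block_size, num_heads, head_size]
) -> torch.Tensor:
    # Allocate the result here rather than threading an output
    # tensor through every call site.
    output = torch.empty_like(query)
    # ... the paged attention kernel would write into `output` ...
    return output

query = torch.randn(8, 32, 128)           # 8 tokens, 32 heads, head_size 128
key_cache = torch.randn(64, 16, 32, 128)  # 64 blocks of block_size 16
value_cache = torch.randn(64, 16, 32, 128)
out = _paged_attention_sketch(query, key_cache, value_cache)
assert out.shape == query.shape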
14 changes: 6 additions & 8 deletions vllm/worker/model_runner.py
@@ -31,8 +31,6 @@
_BATCH_SIZE_ALIGNMENT = 8
# Capture graphs for token size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256.
# NOTE: _get_graph_batch_size needs to be updated if this list is changed.
# Note that cuda graph is only used for decoding because it speeds up
# the performance when num_tokens < 200. Batch here means a single token.
_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [
_BATCH_SIZE_ALIGNMENT * i for i in range(1, 33)
]
@@ -215,6 +213,9 @@ def _prepare_prompt(

max_prompt_len = max(subquery_lens)
num_prompt_tokens = len(input_tokens)

# Pad tokens to better utilize tensor cores even
# when CUDA graphs are not enabled.
input_tokens = _make_tensor_with_pad_for_alignment(input_tokens,
pad=0,
dtype=torch.long,
@@ -347,7 +348,8 @@ def _prepare_decode(
block_tables.append([])
batch_size = graph_batch_size

# Q: should we not pad when cuda graph is disabled?
# Pad tokens to better utilize tensor cores even
# when CUDA graphs are not enabled.
input_tokens = _make_tensor_with_pad_for_alignment(input_tokens,
pad=0,
dtype=torch.long,
@@ -599,9 +601,6 @@ def execute_model(

# Execute the model.
if input_metadata.use_cuda_graph:
# NOTE: We use cuda graph only when there are only
# decoding requests, which means the number of batch
# size is equivalent to number of input tokens.
graph_batch_size = input_tokens.shape[0]
model_executable = self.graph_runners[graph_batch_size]
else:
@@ -719,7 +718,7 @@ def capture_model(self, kv_caches: List[KVCache]) -> None:
# deleted before the CUDA graphs.
self.cupy_nccl_backend = cupy_utils.get_nccl_backend()

# assert not self.model_config.enforce_eager
assert not self.model_config.enforce_eager
logger.info("Capturing the model for CUDA graphs. This may lead to "
"unexpected consequences if the model is not static. To "
"run the model in eager mode, set 'enforce_eager=True' or "
@@ -915,7 +914,6 @@ def _make_tensor_with_pad_for_alignment(
"""Create a tensor of a given list x with padding.
It adds paddings to align with graph batch size. See
_get_graph_batch_size for more details.
# NOTE: This API is only for decoding requests.
"""
batch_size = len(x)
batch_size = _get_graph_batch_size(batch_size)
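The padding helpers in this file round the batch size up to one of the captured sizes. Below is a hedged sketch of that rounding, reconstructed from the _BATCH_SIZE_ALIGNMENT and _BATCH_SIZES_TO_CAPTURE constants shown above; the helper body approximates _get_graph_batch_size and is not the exact implementation:

_BATCH_SIZE_ALIGNMENT = 8
# Captured sizes: 1, 2, 4, 8, 16, 24, ..., 256.
_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [
    _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33)
]

def _get_graph_batch_size_sketch(batch_size: int) -> int:
    """Round a batch size up to the nearest captured size (approximation)."""
    if batch_size <= 2:
        return batch_size
    if batch_size <= 4:
        return 4
    # Round up to the next multiple of _BATCH_SIZE_ALIGNMENT (8).
    return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1)
            // _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT)

# Inputs are then padded to this size, which keeps shapes aligned for
# tensor cores even when CUDA graphs are not used.
for bs in (1, 3, 5, 9, 17, 250):
    assert _get_graph_batch_size_sketch(bs) in _BATCH_SIZES_TO_CAPTURE
print([_get_graph_batch_size_sketch(b) for b in (1, 3, 5, 9, 17, 250)])
# -> [1, 4, 8, 16, 24, 256]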