@@ -154,7 +154,7 @@ def __init__(
         # cache in_wsl result
         self.mamba_cache = None
         self.mamba_cache4gc = None
-        self.request_id2index = {}
+        self.request_id2index: Dict[str, Dict[int, int]] = {}
         self.in_wsl = in_wsl()
         self.kv_cache_dtype = kv_cache_dtype
 
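The annotation change above is the heart of the PR's bookkeeping: one cache slot per request becomes a nested map of request id to {sequence id: slot}, so a request that samples several sequences owns several slots. A minimal standalone sketch of the structure (toy ids and slots, not from the PR):

from typing import Dict

# request_id -> {seq_id -> mamba cache slot}; mirrors the annotation in the diff.
request_id2index: Dict[str, Dict[int, int]] = {}

# A request with n=2 sequences (seq ids 7 and 8) pinned to cache slots 0 and 3.
request_id2index["req-abc"] = {7: 0, 8: 3}

# Flattening the occupied slots, as the new helper below does:
occupied = [slot for seq_ids in request_id2index.values() for slot in seq_ids.values()]
print(occupied)  # [0, 3]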
@@ -441,7 +441,7 @@ def _prepare_prompt(
         requests_info = [
             RequestInfo(
                 request_id=req.request_id,
-                n=req.sampling_params.n
+                seqs_id=list(req.seq_data.keys())
             )
             for req in seq_group_metadata_list
         ]
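RequestInfo now carries the concrete sequence ids of the group instead of only the count `n`, which is what lets the runner key cache slots per sequence. The dataclass itself is defined elsewhere in the PR; a plausible shape, stated as an assumption, is:

from dataclasses import dataclass, field
from typing import List

@dataclass
class RequestInfo:
    request_id: str
    # Concrete sequence ids of the group; replaces the former `n` count so the
    # runner can assign (and later free) one mamba cache slot per sequence.
    seqs_id: List[int] = field(default_factory=list)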
@@ -579,10 +579,9 @@ def _prepare_decode(
         requests_info = [
             RequestInfo(
                 request_id=req.request_id,
-                n=req.sampling_params.n
+                seqs_id=list(req.seq_data.keys())
             )
             for req in seq_group_metadata_list]
-
         )
         return PrepareDecodeMetadata(
             input_tokens=input_tokens,
@@ -790,13 +789,7 @@ def prepare_input_tensors(
             "slot_mapping": slot_mapping,
             "num_prefills": num_prefills,
             "batch_type": batch_type,
-            "requests_info": [
-                RequestInfo(
-                    request_id=req.request_id,
-                    n=req.sampling_params.n
-                )
-                for req in seq_group_metadata_list
-            ]
+            "requests_info": input_metadata.requests_info
         }
         if prefill_attn_metadata is not None:
             metadata_dict.update(prefill_attn_metadata.asdict_zerocopy())
@@ -922,22 +915,29 @@ def execute_model(
         if not sampling_metadata.perform_sampling:
             return None
 
-        if self.mamba_cache is None:
-            self.prepare_contiguous_mamba_cache(self.model_config.dtype)
-
-        conv_state, ssm_state, indecies = self._prepare_request_mamba_cache(input_metadata, input_tokens.shape[0])
-
-        hidden_states = model_executable(
-            input_ids=input_tokens,
-            positions=input_positions,
-            kv_caches=kv_caches,
-            input_metadata=input_metadata,
-            conv_state=conv_state,
-            ssm_state=ssm_state
-        )
-        for i, offset in enumerate(indecies):
-            self.mamba_cache[0][:, offset] = conv_state[:, i]
-            self.mamba_cache[1][:, offset] = ssm_state[:, i]
+        is_mamba = self.model_config.hf_config.model_type == "jamba"
+        indices = []
+        conv_state = None
+        model_inputs = {
+            "input_ids": input_tokens,
+            "positions": input_positions,
+            "kv_caches": kv_caches,
+            "input_metadata": input_metadata,
+        }
+        if is_mamba:
+            if self.mamba_cache is None:
+                self.prepare_contiguous_mamba_cache(self.model_config.dtype)
+            conv_state, ssm_state, indices = self._prepare_request_mamba_cache(input_metadata, input_tokens.shape[0])
+            model_inputs = {
+                **model_inputs,
+                "conv_state": conv_state,
+                "ssm_state": ssm_state,
+            }
+        hidden_states = model_executable(**model_inputs)
+        if is_mamba:
+            for i, offset in enumerate(indices):
+                self.mamba_cache[0][:, offset] = conv_state[:, i]
+                self.mamba_cache[1][:, offset] = ssm_state[:, i]
 
         # Sample the next token.
         output = self.model.sample(
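The rewritten execute_model gathers per-sequence mamba states into batch order, runs the model (which updates them in place), and scatters them back to their owners' slots. A reduced, runnable sketch of that gather/run/scatter round trip, with toy tensors standing in for the model (shapes are illustrative assumptions):

import torch

# Contiguous caches, shaped (layers, max_batch_slots, state_dim); toy sizes.
mamba_cache = (torch.zeros(2, 8, 4), torch.zeros(2, 8, 4))
indices = [5, 2]  # cache slots owned by the two sequences in this batch

# Gather per-sequence state into batch order
# (what _prepare_request_mamba_cache returns).
conv_state = mamba_cache[0][:, indices]
ssm_state = mamba_cache[1][:, indices]

# Stand-in for the model call, which updates the states in place.
conv_state += 1
ssm_state += 2

# Scatter the updated states back to each sequence's slot,
# as the loop after model_executable does.
for i, offset in enumerate(indices):
    mamba_cache[0][:, offset] = conv_state[:, i]
    mamba_cache[1][:, offset] = ssm_state[:, i]

assert mamba_cache[0][:, 5].eq(1).all() and mamba_cache[1][:, 2].eq(2).all()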
@@ -946,6 +946,13 @@ def execute_model(
         )
         return output
 
+    def _get_first_free_mamba_cache_index(self):
+        max_possible_bs = self.mamba_cache[0].shape[1]
+        occupied = [id for seq_ids in self.request_id2index.values() for id in seq_ids.values()]
+        first_free_index = [i not in occupied for i in range(max_possible_bs)].index(True)
+        return first_free_index
+
+
     def _prepare_request_mamba_cache(
         self,
         input_metadata: InputMetadata,
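The new _get_first_free_mamba_cache_index helper flattens the nested map and scans for the first unclaimed slot. An equivalent set-based version of the same lookup (a sketch, not part of the PR) makes the membership test O(1) and the out-of-slots failure explicit:

from typing import Dict

def first_free_index(request_id2index: Dict[str, Dict[int, int]],
                     max_possible_bs: int) -> int:
    # Same answer as the diff's boolean-list scan, with an O(1) membership test.
    occupied = {slot for seq_ids in request_id2index.values()
                for slot in seq_ids.values()}
    for i in range(max_possible_bs):
        if i not in occupied:
            return i
    raise RuntimeError("no free mamba cache slot")

print(first_free_index({"a": {0: 0, 1: 2}}, 4))  # 1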
@@ -955,13 +962,26 @@ def _prepare_request_mamba_cache(
         max_possible_bs = self.mamba_cache[0].shape[1]
         for request_info in input_metadata.requests_info:
             if request_info.request_id not in self.request_id2index:
-                first_free_index = [i not in self.request_id2index.values() for i in range(max_possible_bs)].index(True)
-                self.request_id2index[request_info.request_id] = first_free_index
-            indices.append(self.request_id2index[request_info.request_id])
+                self.request_id2index[request_info.request_id] = {}
+                for seq_id in request_info.seqs_id:
+                    first_free_index = self._get_first_free_mamba_cache_index()
+                    self.request_id2index[request_info.request_id][seq_id] = first_free_index
+                    indices.append(first_free_index)
+            else:
+                for seq_id in request_info.seqs_id:
+                    if seq_id not in self.request_id2index[request_info.request_id]:
+                        first_free_index = self._get_first_free_mamba_cache_index()
+                        ## case of decoding with n > 1: seed the new slot from a sibling
+                        if len(self.request_id2index[request_info.request_id].keys()) > 0:
+                            self.mamba_cache[0][:, first_free_index].copy_(self.mamba_cache[0][:, list(self.request_id2index[request_info.request_id].values())[0]])
+                            self.mamba_cache[1][:, first_free_index].copy_(self.mamba_cache[1][:, list(self.request_id2index[request_info.request_id].values())[0]])
+                        self.request_id2index[request_info.request_id][seq_id] = first_free_index
+                    indices.append(self.request_id2index[request_info.request_id][seq_id])
         ## Pad the batch in case of running a batch that was not captured via CG
         padded_indices = indices
         for _ in range(batch_size - len(indices)):
-            padded_indices += [[i not in set(self.request_id2index.values()).union(padded_indices) for i in range(max_possible_bs)].index(True)]
+            occupied = [id for seq_ids in self.request_id2index.values() for id in seq_ids.values()]
+            padded_indices += [[i not in set(occupied).union(padded_indices) for i in range(max_possible_bs)].index(True)]
 
         conv_state = self.mamba_cache[0][:, padded_indices]
         ssm_state = self.mamba_cache[1][:, padded_indices]
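The n > 1 branch above handles a known request arriving with a sequence id it has not registered yet: best-of/n sampling forks sibling sequences at decode time, so the newly claimed slot is seeded with a copy of an existing sibling's state. A toy reproduction of that fork (slot layout and sizes are made up):

import torch

mamba_cache = (torch.zeros(1, 4, 3), torch.zeros(1, 4, 3))
request_id2index = {"req-0": {10: 0}}  # seq 10 already owns slot 0
mamba_cache[0][:, 0] = 7.0             # pretend seq 10 accumulated some state

# A sibling sequence (id 11) appears for the same request: claim a free slot
# and copy the parent's state so both branches continue from the same prefix.
parent_slot = list(request_id2index["req-0"].values())[0]
new_slot = 1
mamba_cache[0][:, new_slot].copy_(mamba_cache[0][:, parent_slot])
mamba_cache[1][:, new_slot].copy_(mamba_cache[1][:, parent_slot])
request_id2index["req-0"][11] = new_slot

assert mamba_cache[0][:, 1].eq(7.0).all()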
@@ -1140,23 +1160,26 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None:
             kv_cache_dtype=self.kv_cache_dtype,
         )
 
+        is_mamba = self.model_config.hf_config.model_type == "jamba"
         if self.lora_config:
             lora_mapping = LoRAMapping(
                 [0] * batch_size,
                 [0] * batch_size,
             )
             self.set_active_loras(set(), lora_mapping)
 
-        graph_runner = CUDAGraphRunner(self.model)
-        graph_runner.capture(
-            input_tokens[:batch_size],
-            input_positions[:batch_size],
-            kv_caches,
-            attn_metadata,
-            memory_pool=self.graph_memory_pool,
-            conv_state=self.mamba_cache4gc[0][:, :batch_size],
-            ssm_state=self.mamba_cache4gc[1][:, :batch_size]
-        )
+        graph_runner = CUDAGraphRunner(self.model, is_mamba)
+        capture_inputs = {
+            "input_ids": input_tokens[:batch_size],
+            "positions": input_positions[:batch_size],
+            "kv_caches": kv_caches,
+            "attn_metadata": attn_metadata,
+            "memory_pool": self.graph_memory_pool,
+        }
+        if is_mamba:
+            capture_inputs["conv_state"] = self.mamba_cache4gc[0][:, :batch_size]
+            capture_inputs["ssm_state"] = self.mamba_cache4gc[1][:, :batch_size]
+        graph_runner.capture(**capture_inputs)
         self.graph_memory_pool = graph_runner.graph.pool()
         self.graph_runners[batch_size] = graph_runner
 
@@ -1182,11 +1205,12 @@ def vocab_size(self) -> int:
 
 class CUDAGraphRunner:
 
-    def __init__(self, model: nn.Module):
+    def __init__(self, model: nn.Module, is_mamba: bool):
         self.model = model
         self.graph = None
         self.input_buffers: Dict[str, torch.Tensor] = {}
         self.output_buffers: Dict[str, torch.Tensor] = {}
+        self.is_mamba = is_mamba
 
     def capture(
         self,
@@ -1197,40 +1221,38 @@ def capture(
-        conv_state: torch.Tensor,
-        ssm_state: torch.Tensor,
         memory_pool,
+        conv_state: Optional[torch.Tensor] = None,
+        ssm_state: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> None:
         assert self.graph is None
         # Run the model once without capturing the graph.
         # This is to make sure that the captured graph does not include the
         # kernel launches for initial benchmarking (e.g., Triton autotune).
-        with _maybe_pynccl():
-            self.model(
-                input_ids,
-                positions,
-                kv_caches,
-                attn_metadata,
-                conv_state,
-                ssm_state,
-                **kwargs,
-            )
+        model_inputs = {
+            "input_ids": input_ids,
+            "positions": positions,
+            "kv_caches": kv_caches,
+            "attn_metadata": attn_metadata,
+        }
+        if self.is_mamba:
+            model_inputs = {
+                **model_inputs,
+                "conv_state": conv_state,
+                "ssm_state": ssm_state,
+            }
+
+        with _maybe_cupy_nccl():
+            self.model(**model_inputs)
         torch.cuda.synchronize()
 
         # Capture the graph.
         # NOTE(woosuk): Python 3.8 does not support multi-line with statements.
         # https://stackoverflow.com/questions/31039022/python-multi-line-with-statement
         self.graph = torch.cuda.CUDAGraph()
         with torch.cuda.graph(self.graph, pool=memory_pool):  # noqa: SIM117
-            with _maybe_pynccl():
-                hidden_states = self.model(
-                    input_ids,
-                    positions,
-                    kv_caches,
-                    attn_metadata,
-                    input_metadata,
-                    conv_state,
-                    ssm_state,
-                    **kwargs,
-                )
+            with _maybe_cupy_nccl():
+                hidden_states = self.model(**model_inputs)
         torch.cuda.synchronize()
 
         # Save the input and output buffers.
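capture keeps the usual two-phase CUDA-graph discipline: one eager warm-up call so autotuning kernel launches don't get recorded, then a replayable capture into a shared memory pool. A self-contained miniature of that flow using the public torch.cuda.graph API (needs a CUDA device; the model here is a stand-in, and the runner additionally passes pool=memory_pool):

import torch

assert torch.cuda.is_available()
static_x = torch.zeros(8, device="cuda")

def model(x: torch.Tensor) -> torch.Tensor:
    return x * 2 + 1

# Warm-up run outside the graph, then synchronize (as the diff does).
model(static_x)
torch.cuda.synchronize()

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):          # the runner also passes pool=memory_pool
    static_out = model(static_x)   # records kernels against fixed buffers
torch.cuda.synchronize()

# Replays reuse the same input/output storage: refill the input, replay, read.
static_x.fill_(3.0)
g.replay()
torch.cuda.synchronize()
print(static_out)  # tensor of 7s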
@@ -1244,6 +1266,13 @@ def capture(
-            "conv_state": conv_state,
-            "ssm_state": ssm_state
         }
+        if self.is_mamba:
+            self.input_buffers = {
+                **self.input_buffers,
+                "conv_state": conv_state,
+                "ssm_state": ssm_state,
+            }
+
         self.output_buffers = {"hidden_states": hidden_states}
         return
 
@@ -1253,8 +1282,8 @@ def forward(
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
-        conv_state: torch.Tensor,
-        ssm_state: torch.Tensor,
+        conv_state: Optional[torch.Tensor] = None,
+        ssm_state: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> torch.Tensor:
         # KV caches are fixed tensors, so we don't need to copy them.
@@ -1269,16 +1298,19 @@ def forward(
             attn_metadata.decode_metadata.context_lens, non_blocking=True)
         self.input_buffers["block_tables"].copy_(
             attn_metadata.decode_metadata.block_tables, non_blocking=True)
-        self.input_buffers["conv_state"].copy_(conv_state,
-                                               non_blocking=True)
-        self.input_buffers["ssm_state"].copy_(ssm_state,
-                                              non_blocking=True)
+        if self.is_mamba:
+            self.input_buffers["conv_state"].copy_(conv_state,
+                                                   non_blocking=True)
+            self.input_buffers["ssm_state"].copy_(ssm_state,
+                                                  non_blocking=True)
+
         # Run the graph.
         self.graph.replay()
 
         # in-place edit of the mamba cache states as in the KV cache
-        ssm_state.copy_(self.input_buffers["ssm_state"])
-        conv_state.copy_(self.input_buffers["conv_state"])
+        if self.is_mamba:
+            ssm_state.copy_(self.input_buffers["ssm_state"])
+            conv_state.copy_(self.input_buffers["conv_state"])
 
         # Return the output tensor.
         return self.output_buffers["hidden_states"]
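At replay time, forward never re-enters Python model code: it copies fresh inputs into the tensors the graph was captured on, replays, and, on the Jamba path, mirrors the mutated states back to the caller's tensors. A schematic of that contract with hypothetical buffer names (x, hidden_states), not the runner's real keys:

import torch
from typing import Dict, Optional

def graph_forward(graph: torch.cuda.CUDAGraph,
                  input_buffers: Dict[str, torch.Tensor],
                  output_buffers: Dict[str, torch.Tensor],
                  x: torch.Tensor,
                  conv_state: Optional[torch.Tensor] = None,
                  is_mamba: bool = False) -> torch.Tensor:
    # 1. Copy volatile inputs into the static tensors the graph was captured on.
    input_buffers["x"].copy_(x, non_blocking=True)
    if is_mamba and conv_state is not None:
        input_buffers["conv_state"].copy_(conv_state, non_blocking=True)
    # 2. Replay the recorded kernels; no Python-side model code runs.
    graph.replay()
    # 3. Mirror in-place state updates back to the caller's tensor, as the
    #    runner does for conv_state/ssm_state after replay.
    if is_mamba and conv_state is not None:
        conv_state.copy_(input_buffers["conv_state"])
    # 4. Results live in the captured output buffers.
    return output_buffers["hidden_states"]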