optimize shapes of tensors used as physical cache

yuanheng-zhao · yuanheng-zhao · commit c1474e698773 · 2023-12-07T10:43:38.000+08:00
diff --git a/colossalai/inference/kv_cache/kvcache_manager.py b/colossalai/inference/kv_cache/kvcache_manager.py
@@ -44,7 +44,7 @@ class KVCacheManager:
         config(InferenceConfig): The All-in-one inference configuration.
     """
 
-    def __init__(self, config: InferenceConfig) -> None:
+    def __init__(self, config: InferenceConfig, verbose: bool = False) -> None:
         self.logger = get_dist_logger(__name__)
         self.device = get_current_device()
 
@@ -73,9 +73,9 @@ def __init__(self, config: InferenceConfig) -> None:
         self.num_blocks = self.max_blocks_per_sequence * self.max_batch_size * self.beam_width
 
         # Physical cache allocation
-        alloc_shape = (self.num_blocks, self.block_size, self.head_num, self.head_size)
-        self.logger.info(f"Allocating KV cache with shape: {alloc_shape} consisting of {self.num_blocks} blocks.")
-        # self._kv_caches = self._init_device_caches(alloc_shape)
+        if verbose:
+            alloc_shape = (self.num_blocks, self.head_num, self.head_size, self.block_size)
+            self.logger.info(f"Allocating KV cache with shape: {alloc_shape} consisting of {self.num_blocks} blocks.")
         self._kv_caches = self._init_device_caches()
         self.total_physical_cache_size_in_bytes = (
             self.elem_size_in_bytes
@@ -177,7 +177,7 @@ def free_cache_blocks(self, block_table: torch.Tensor) -> None:
         for i in range(block_table.numel()):
             global_block_id = block_table[i].item()
             block: CacheBlock = self._cache_blocks[global_block_id]
-            block.remove_ref()  # not going to clear the block thoroughly
+            block.remove_ref()
             if not block.has_ref():
                 block.allocated_size = 0
                 self._free_blocks.append(block)
@@ -236,11 +236,11 @@ def _init_device_caches(self) -> Tuple[torch.Tensor, torch.Tensor]:
         """Initialize the physical cache on the device.
 
         For each layer of the model, we allocate two tensors for key and value respectively,
-        with shape of [num_blocks, block_size, num_head, head_size]
+        with shape of [num_blocks, num_kv_heads, head_size, block_size]
         """
-        alloc_shape = (self.num_blocks, self.block_size, self.head_num, self.head_size)
+        alloc_shape = (self.num_blocks, self.head_num, self.head_size, self.block_size)
         # TODO: Explore the performance when using difference shapes with kernel-related optimizations
-        #       e.g. [num_blocks, block_size, num_head // x, head_size, x]
+        #       e.g. [num_blocks, num_kv_heads // x, head_size, block_size, x]
         k_cache: List[torch.Tensor] = []
         v_cache: List[torch.Tensor] = []
         for _ in range(self.num_layers):
diff --git a/tests/test_infer/test_kvcache_manager.py b/tests/test_infer/test_kvcache_manager.py
@@ -19,6 +19,7 @@ class SampleConfig:
     max_output_length: int
     beam_width: int
     dtype: torch.dtype
+    tp_size: int
 
 
 @parameterize(
@@ -60,6 +61,7 @@ def test_logical_blocks(test_config):
             "max_output_length": 32,
             "dtype": torch.float32,
             "beam_width": 1,
+            "tp_size": 1,
         },
         {
             "num_attention_heads": 4,
@@ -71,6 +73,7 @@ def test_logical_blocks(test_config):
             "max_output_length": 32,
             "dtype": torch.float16,
             "beam_width": 3,
+            "tp_size": 1,
         },
     ],
 )
@@ -92,7 +95,7 @@ def test_cache_manager(test_config):
     assert len(cache_manager._allocated_blocks) == 0
     key_caches = cache_manager._kv_caches[0]  # key caches for all the blocks in all the layers
     assert len(key_caches) == test_config["num_layers"]
-    expected_kv_shape = (num_blocks, block_size, num_heads, head_size)
+    expected_kv_shape = (num_blocks, num_heads, head_size, block_size)
     assert key_caches[0].shape == expected_kv_shape
     k_cache_block0, v_cache_block0 = cache_manager.get_physical_cache(0, 0)
     expected_kv_block_shape = expected_kv_shape[1:]