Refactor block copy-on-write so that only free() is used (remove free_block_id())

alexm-redhat · alexm-redhat · commit e08d643d2ec2 · 2024-06-20T21:34:17.000Z
diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
@@ -56,6 +56,22 @@ def ids(self) -> List[int]:
         return self._block_ids
 
 
+def append_token_ids_and_update_allocator(
+        block: Block, token_ids: List[int],
+        allocator: DeviceAwareBlockAllocator) -> Block:
+    new_block = allocator.cow_block_if_not_appendable(block)
+    if new_block:
+        block = new_block
+
+    block.append_token_ids(token_ids)
+
+    immutable_block = allocator.promote_to_immutable_block(block)
+    if immutable_block:
+        block = immutable_block
+
+    return block
+
+
 class BlockTable:
     """A class to manage blocks for a specific sequence.
 
@@ -193,11 +209,14 @@ def append_token_ids(self,
                                     num_lookahead_slots)
 
         # Update the blocks with the new tokens
-        blocks = self.blocks[self._num_full_slots // self._block_size:]
+        first_block_idx = self._num_full_slots // self._block_size
         token_blocks = self._chunk_token_blocks_for_append(token_ids)
 
-        for block, token_block in zip(blocks, token_blocks):
-            block.append_token_ids(token_block)
+        for i, token_block in enumerate(token_blocks):
+            cur_block_idx = first_block_idx + i
+            self._blocks[
+                cur_block_idx] = append_token_ids_and_update_allocator(
+                    self._blocks[cur_block_idx], token_block, self._allocator)
 
         self._num_full_slots += len(token_ids)
 
@@ -328,7 +347,11 @@ def _allocate_blocks_for_token_ids(self, prev_block: Optional[Block],
 
             block = self._allocator.allocate_mutable_block(
                 prev_block=prev_block, device=device)
+
+            # Note that no copy-on-write or immutable promotion can happen
+            # here since this block is fresh and not full
             block.append_token_ids(cur_token_ids)
+
             blocks.append(block)
 
         return blocks
diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py
@@ -114,15 +114,22 @@ def __init__(
         self._refcounter = refcounter
         self._allocator = allocator
 
-    def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
+    def is_appendable(self, block: Block) -> bool:
+        block_id = block.block_id
+        if block_id is None:
+            return True
+
+        refcount = self._refcounter.get(block_id)
+        return refcount <= 1
+
+    def cow_block_if_not_appendable(self, block: Block) -> Optional[Block]:
         """Performs a copy-on-write operation on the given block if it is not
         appendable.
 
         This method checks the reference count of the given block. If the
         reference count is greater than 1, indicating that the block is shared,
         a copy-on-write operation is performed. The original block is freed,
-        and a new block is allocated with the same content. The new block index
-        is returned.
+        and a new block is allocated with the same content.
 
         Args:
             block (Block): The block to check for copy-on-write.
@@ -132,30 +139,31 @@ def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
                 -write operation was performed, or the original block index if
                 no copy-on-write was necessary.
         """
-        block_id = block.block_id
-        if block_id is None:
-            return block_id
+        if self.is_appendable(block):
+            return None
 
-        refcount = self._refcounter.get(block_id)
-        assert refcount != 0
-        if refcount > 1:
-            src_block_id = block_id
+        # Get data from old block
+        prev_block = block.prev_block
+        token_ids = block.token_ids
+        old_block_id = block.block_id
+
+        # Free the old block (drops its refcount); caller uses the returned one
+        self._allocator.free(block)
 
-            # Decrement refcount of the old physical block. Note that
-            # we do not free the actual block object here since it is
-            # going to reused by the caller.
-            self._allocator.free_block_id(block)
+        # Allocate a new block
+        new_block = self._allocator.allocate_mutable_block(
+            prev_block=prev_block)
+        # Copy the tokens to the new block
+        new_block.append_token_ids(token_ids)
 
-            # Allocate a fresh new block.
-            block_id = self._allocator.allocate_mutable_block(
-                prev_block=block.prev_block).block_id
+        new_block_id = new_block.block_id
 
-            # Track src/dst copy.
-            assert src_block_id is not None
-            assert block_id is not None
-            self._copy_on_writes.append((src_block_id, block_id))
+        # Track src/dst copy.
+        assert old_block_id is not None
+        assert new_block_id is not None
+        self._copy_on_writes.append((old_block_id, new_block_id))
 
-        return block_id
+        return new_block
 
     def clear_cows(self) -> List[Tuple[BlockId, BlockId]]:
         """Clears the copy-on-write tracking information and returns the current
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -170,17 +170,6 @@ def allocate_immutable_block(self, prev_block: Optional[Block],
         return self._allocators[device].allocate_immutable_block(
             prev_block, token_ids)
 
-    def free_block_id(self, block: Block) -> None:
-        """Frees the underlying physical block_id of the given block object
-
-        Args:
-            block (Block): The block for which to free the physical block id
-        """
-        block_id = block.block_id
-        assert block_id is not None
-        allocator = self._block_ids_to_allocator[block_id]
-        allocator.free_block_id(block)
-
     def free(self, block: Block) -> None:
         """Frees the memory occupied by the given block.
 
@@ -333,11 +322,13 @@ def get_common_computed_block_ids(
     def all_block_ids(self) -> FrozenSet[int]:
         return frozenset(self._block_ids_to_allocator.keys())
 
-    def promote_to_immutable_block(self, block: Block) -> BlockId:
-        raise NotImplementedError
+    def promote_to_immutable_block(self, block: Block) -> Optional[Block]:
+        device = Device.GPU
+        return self._allocators[device].promote_to_immutable_block(block)
 
-    def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
-        raise NotImplementedError
+    def cow_block_if_not_appendable(self, block: Block) -> Optional[Block]:
+        device = Device.GPU
+        return self._allocators[device].cow_block_if_not_appendable(block)
 
     def get_and_reset_swaps(self) -> List[Tuple[int, int]]:
         """Returns and clears the mapping of source to destination block IDs.
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py
@@ -9,7 +9,8 @@
 class Block(ABC):
 
     @abstractmethod
-    def append_token_ids(self, token_ids: Optional[List[int]]) -> None:
+    def append_token_ids(self,
+                         token_ids: Optional[List[int]]) -> Optional["Block"]:
         pass
 
     @property
@@ -116,10 +117,6 @@ def allocate_immutable_blocks(
             block_token_ids: List[List[int]]) -> List[Block]:
         pass
 
-    @abstractmethod
-    def free_block_id(self, block: Block) -> None:
-        pass
-
     @abstractmethod
     def free(self, block: Block) -> None:
         pass
@@ -177,13 +174,16 @@ def get_common_computed_block_ids(
         pass
 
     @abstractmethod
-    def cow_block_if_not_appendable(self, block: Block) -> Optional["BlockId"]:
+    def is_appendable(self, block: Block) -> bool:
         """NOTE: This should not be used besides Block"""
         pass
 
     @abstractmethod
-    def promote_to_immutable_block(self, block: Block) -> BlockId:
-        """NOTE: This should not be used besides Block"""
+    def cow_block_if_not_appendable(self, block: Block) -> Optional[Block]:
+        pass
+
+    @abstractmethod
+    def promote_to_immutable_block(self, block: Block) -> Optional[Block]:
         pass
 
     @abstractmethod
@@ -283,3 +283,11 @@ def allocate_or_get_null_block(self) -> Block:
         There is at most one null block per allocator.
         """
         pass
+
+    @abstractmethod
+    def cow_block_if_not_appendable(self, block: Block) -> Optional[Block]:
+        pass
+
+    @abstractmethod
+    def promote_to_immutable_block(self, block: Block) -> Optional[Block]:
+        pass
diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py