@@ -104,12 +104,12 @@ def allocate_xpu_cache(self) -> List[KVCache]:
          value_block_shape = self.get_value_block_shape()
          for _ in range(self.num_layers):
              key_blocks = torch.empty(
-                 size=(self.num_gpu_blocks, *key_block_shape),
+                 size=(self.num_xpu_blocks, *key_block_shape),
                  dtype=self.dtype,
                  device="xpu",
              )
              value_blocks = torch.empty(
-                 size=(self.num_gpu_blocks, *value_block_shape),
+                 size=(self.num_xpu_blocks, *value_block_shape),
                  dtype=self.dtype,
                  device="xpu",
              )
@@ -147,6 +147,7 @@ def _swap(
          dst: List[KVCache],
          src_to_dst: Dict[int, int],
      ) -> None:
+         print("swapping cache")
          with torch.cuda.stream(self.cache_stream):
              for i in range(self.num_layers):
                  src_key_cache, src_value_cache = src[i]
@@ -166,6 +167,7 @@ def swap_out(self, src_to_dst: Dict[int, int]) -> None:
          self._swap(self.gpu_cache, self.cpu_cache, src_to_dst)
 
      def copy(self, src_to_dsts: Dict[int, List[int]]) -> None:
+         print("copy cache")
          key_caches = [key_cache for key_cache, _ in self.xpu_cache]
          value_caches = [value_cache for _, value_cache in self.xpu_cache]
          # NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU.