@@ -66,8 +66,9 @@ def __init__(
             pin_memory=False,
         )
         self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
-        self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
+        self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32)
         self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
+        self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)

         # Attention-related.
         self.block_table = torch.zeros(
@@ -189,6 +190,7 @@ def add_request(
         end_idx = start_idx + len(request.output_token_ids)
         self.token_ids_cpu[req_index,
                            start_idx:end_idx] = request.output_token_ids
+        self.num_tokens[req_index] = request.num_tokens

         self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens
         num_blocks = len(request.block_ids)
@@ -290,14 +292,15 @@ def condense(self, empty_req_indices: List[int]) -> None:
             self.req_ids[last_req_index] = None
             self.req_id_to_index[req_id] = empty_index

-            # TODO(woosuk): Optimize the copy of token_ids_cpu and
-            # block_table_cpu.
-            self.token_ids_cpu[empty_index] = self.token_ids_cpu[
-                last_req_index]
+            num_tokens = self.num_tokens[last_req_index]
+            self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[
+                last_req_index, :num_tokens]
+            self.num_tokens[empty_index] = num_tokens
             self.num_prompt_tokens[empty_index] = \
                 self.num_prompt_tokens[last_req_index]
             self.num_computed_tokens_cpu[
                 empty_index] = self.num_computed_tokens_cpu[last_req_index]
+            # TODO(woosuk): Optimize the copy of block_table_cpu.
             self.block_table_cpu[empty_index] = self.block_table_cpu[
                 last_req_index]
             self.temperature_cpu[empty_index] = self.temperature_cpu[
0 commit comments