Skip to content

Commit b55ed6e

Browse files
Authored by Woosuk Kwon
[V1][Minor] Optimize token_ids_cpu copy (#11692)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
1 parent 2f38518 commit b55ed6e

File tree

2 files changed: +9 additions, -5 deletions

vllm/v1/worker/gpu_input_batch.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,9 @@ def __init__(
6666
pin_memory=False,
6767
)
6868
self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
69-
self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
69+
self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32)
7070
self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
71+
self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
7172

7273
# Attention-related.
7374
self.block_table = torch.zeros(
@@ -189,6 +190,7 @@ def add_request(
189190
end_idx = start_idx + len(request.output_token_ids)
190191
self.token_ids_cpu[req_index,
191192
start_idx:end_idx] = request.output_token_ids
193+
self.num_tokens[req_index] = request.num_tokens
192194

193195
self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens
194196
num_blocks = len(request.block_ids)
@@ -290,14 +292,15 @@ def condense(self, empty_req_indices: List[int]) -> None:
290292
self.req_ids[last_req_index] = None
291293
self.req_id_to_index[req_id] = empty_index
292294

293-
# TODO(woosuk): Optimize the copy of token_ids_cpu and
294-
# block_table_cpu.
295-
self.token_ids_cpu[empty_index] = self.token_ids_cpu[
296-
last_req_index]
295+
num_tokens = self.num_tokens[last_req_index]
296+
self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[
297+
last_req_index, :num_tokens]
298+
self.num_tokens[empty_index] = num_tokens
297299
self.num_prompt_tokens[empty_index] = \
298300
self.num_prompt_tokens[last_req_index]
299301
self.num_computed_tokens_cpu[
300302
empty_index] = self.num_computed_tokens_cpu[last_req_index]
303+
# TODO(woosuk): Optimize the copy of block_table_cpu.
301304
self.block_table_cpu[empty_index] = self.block_table_cpu[
302305
last_req_index]
303306
self.temperature_cpu[empty_index] = self.temperature_cpu[

vllm/v1/worker/gpu_model_runner.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -644,6 +644,7 @@ def execute_model(
644644
# Append the sampled token to the output token ids.
645645
token_id = sampled_token_ids[i]
646646
self.input_batch.token_ids_cpu[i, seq_len] = token_id
647+
self.input_batch.num_tokens[i] += 1
647648
req_state.output_token_ids.append(token_id)
648649
else:
649650
# Ignore the sampled token from the partial request.

0 commit comments

Comments (0)