Skip to content

Commit f3a397f

Browse files
Merge pull request vllm-project#15 from luo-cheng2021/luocheng/openvino-model-executor-opt
[CPU] Avoid copy result and force allocation
2 parents 05b9161 + 0f83539 commit f3a397f

File tree

2 files changed

+6
-2
lines changed

2 files changed

+6
-2
lines changed

vllm/executor/openvino_executor.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,9 @@ def allocate_cpu_cache(self) -> List[OpenVINOKVCache]:
119119
for _ in range(self.num_layers):
120120
key_blocks = ov.Tensor(self.cache_dtype, key_block_shape)
121121
value_blocks = ov.Tensor(self.cache_dtype, value_block_shape)
122+
# force allocation
123+
key_blocks.data[:] = 0
124+
value_blocks.data[:] = 0
122125
cpu_cache.append((key_blocks, value_blocks))
123126
return cpu_cache
124127

vllm/model_executor/openvino_model_loader.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,9 @@ def ov_wrapper(self, *args, **kwargs) -> torch.Tensor:
5151
else:
5252
inputs.append(np.array(0, dtype=np.int32)) # for optimum-based models this parameter can be used even on the first iteration
5353

54-
outputs = self._ov_request.infer(inputs, share_inputs=True, share_outputs=False)
55-
return torch.from_numpy(outputs[0])
54+
self._ov_request.start_async(inputs, share_inputs=True)
55+
self._ov_request.wait()
56+
return torch.from_numpy(self._ov_request.get_tensor("logits").data)
5657

5758

5859
def patch_stateful_model(

0 commit comments

Comments (0)