Skip to content

Commit e913d6b

Browse files
Merge pull request vllm-project#2 from slyalin/optimum_models_after_reorg
Re-enable optimum-intel path
2 parents a920809 + b98f5ba commit e913d6b

File tree

3 files changed

+30
-7
lines changed

3 files changed

+30
-7
lines changed

vllm/executor/openvino_executor.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,21 @@ def _init_distributed_environment(self) -> None:
419419
ensure_model_parallel_initialized(self.parallel_config.tensor_parallel_size,
420420
self.parallel_config.pipeline_parallel_size)
421421

422+
def __del__(self):
    """Release the OpenVINO objects attached to an optimum-intel model.

    The finalizer is best-effort: it returns without doing anything when
    the model runner, its model, or the OpenVINO-specific attributes are
    missing, so a partially constructed object can still be collected
    without raising from ``__del__``.
    """
    # TODO: It would be better to put this code in a wrapper around the
    # optimum-based model inside the OpenVINO model loader, but that needs
    # more work because the wrapper has to be a fully functional
    # substitute for torch.nn.Module.
    # The current placement is not robust enough: self.model_runner is not
    # an instance of our own class, and it can be changed so that the model
    # is no longer kept as the self.model_runner.model attribute.

    # __del__ also runs when __init__ failed part-way, so the attribute
    # chain may be incomplete; getattr on None simply yields the default.
    model_runner = getattr(self, 'model_runner', None)
    pt_model = getattr(model_runner, 'model', None)
    if pt_model is None or not hasattr(pt_model, 'model'):
        return
    if not hasattr(pt_model, 'ov_node_factory'):
        # No node factory attached, so there is nothing to release here.
        return

    # The infer request is attached separately by the model loader and is
    # absent when loading failed before that step.
    if hasattr(pt_model, '_ov_request'):
        del pt_model._ov_request
    del pt_model.model
    if gc:  # when app is being destroyed the module may not be available
        gc.collect()
    # NOTE(review): the factory is dropped last, after the collection pass;
    # presumably it has to outlive the objects created through it - confirm.
    del pt_model.ov_node_factory
436+
422437

423438
class OpenVINOExecutor(ExecutorBase):
424439

vllm/model_executor/layers/sampler.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ def forward(
6464
if self.logits_as_hidden_states:
6565
logits = hidden_states
6666
if is_openvino_optimum_intel():
67+
# TODO: Fuse this step into the model inference
6768
logits = _prune_hidden_states(logits, sampling_metadata)
6869
else:
6970
hidden_states = _prune_hidden_states(hidden_states,

vllm/model_executor/openvino_model_loader.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Utilities for selecting and loading models."""
2+
from functools import partial
23
from typing import Optional
3-
44
import math
55
import torch
66
import numpy as np
@@ -11,6 +11,8 @@
1111
from vllm.sequence import SamplerOutput
1212
from vllm.utils import is_openvino_optimum_intel
1313

14+
import openvino as ov
15+
1416

1517
def _flattenize_inputs(inputs):
1618
"""
@@ -53,7 +55,7 @@ def ov_wrapper(self, *args, **kwargs) -> torch.Tensor:
5355

5456

5557
def patch_stateful_model(
56-
model: torch.nn.Module,
58+
model: ov.Model,
5759
factory):
5860
print('TRANSFORMING OPTIMUM-INTEL MODEL TO vLLM COMPATIBLE FORM')
5961
from openvino.runtime.passes import Manager, MatcherPass, WrapType, Matcher, AnyInput, Or
@@ -194,7 +196,14 @@ def __init__(self):
194196
seq = WrapType("opset13.Gather", [kv_shape, AnyInput(), AnyInput()])
195197

196198
def callback(m: Matcher) -> bool:
197-
replace_node(m.get_match_root(), max_context_len)
199+
gather = m.get_match_root()
200+
target_type = gather.get_output_element_type(0)
201+
if max_context_len.get_output_element_type(0) != target_type:
202+
print(f'Converting {max_context_len.get_output_element_type(0)} of max_context_len to {target_type}')
203+
replacement = opset13.convert(max_context_len, target_type)
204+
else:
205+
replacement = max_context_len
206+
replace_node(gather, replacement)
198207
print("DETECTED PATTERN FOR max_sequence_length, CONNECTED TO A DEDICATED PARAMETER")
199208
return True
200209

@@ -270,7 +279,6 @@ def _patch_model_with_openvino(
270279
from vllm.model_executor.layers.attention.attention import Attention
271280
from openvino.frontend.pytorch import ModuleExtension
272281
from openvino import Core, convert_model, Type, PartialShape
273-
from functools import partial
274282

275283
# Avoid usage of vllm._C.ops
276284

@@ -426,7 +434,7 @@ def get_model(model_config: ModelConfig,
426434

427435
pt_model = None
428436

429-
if is_openvino_optimum_intel() and False:
437+
if is_openvino_optimum_intel():
430438
import openvino as ov
431439
from optimum.intel import OVModelForCausalLM
432440
pt_model = OVModelForCausalLM.from_pretrained(model_config.model, export=True, compile=False, load_in_8bit=False, trust_remote_code=True) # need stateful because it also enables SDPA
@@ -438,9 +446,8 @@ def get_model(model_config: ModelConfig,
438446
patch_stateful_model(pt_model.model, pt_model.ov_node_factory)
439447
core = ov.Core()
440448
ov_compiled = core.compile_model(pt_model.model, "CPU")
441-
pt_model.ov_request = ov_compiled.create_infer_request()
449+
pt_model._ov_request = ov_compiled.create_infer_request()
442450

443-
from functools import partial
444451
pt_model._openvino_patch_orig_forward = pt_model.forward
445452
pt_model.forward = partial(ov_wrapper, pt_model)
446453

0 commit comments

Comments
 (0)