Skip to content

Commit bf20bd8

Browse files
shepark and libinta committed
Embedding fix: warmup failure in embedding model (#1510)
Fix the failures at warmup stage in pooling mode -- due to: [rank0]: File "/wm/vllm-fork/vllm/worker/hpu_model_runner.py", line 2904, in warmup_model [rank0]: self.warmup_graphs( [rank0]: File "/wm/vllm-fork/vllm/worker/hpu_model_runner.py", line 2714, in warmup_graphs [rank0]: self.warmup_scenario(batch_size, [rank0]: File "/wm/vllm-fork/vllm/worker/hpu_model_runner.py", line 2561, in warmup_scenario [rank0]: inputs = self.prepare_model_input_align_worker( [rank0]: File "/wm/vllm-fork/vllm/worker/model_runner_base.py", line 233, in prepare_model_input_align_worker [rank0]: raise NotImplementedError [rank0]: NotImplementedError Co-authored-by: Libin Tang <litang@habana.ai>
1 parent 7b69f70 commit bf20bd8

File tree

3 files changed

+64
-33
lines changed

3 files changed

+64
-33
lines changed

vllm/model_executor/layers/pooler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor],
239239
pooling_metadata: PoolingMetadata):
240240

241241
dimensions_list = [
242-
pooling_param.dimensions
242+
pooling_param.dimensions if pooling_param is not None else None
243243
for _, pooling_param in pooling_metadata.seq_groups
244244
]
245245
if any(d is not None for d in dimensions_list):

vllm/worker/hpu_model_runner.py

Lines changed: 48 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2227,6 +2227,54 @@ def prepare_input_tensors(
22272227
lora_ids=lora_ids), \
22282228
sampling_metadata
22292229

2230+
@torch.inference_mode()
2231+
def prepare_model_input_align_worker(
2232+
self,
2233+
seq_group_metadata_list: List[SequenceGroupMetadata],
2234+
virtual_engine: int = 0,
2235+
finished_requests_ids: Optional[List[str]] = None,
2236+
align_worker: bool = False,
2237+
) -> ModelInputForHPUWithSamplingMetadata:
2238+
"""Prepare the model input based on a given sequence group, including
2239+
metadata for the sampling step.
2240+
The API assumes seq_group_metadata_list is sorted by prefill -> decode.
2241+
The result tensors and data structure also batches input in prefill
2242+
-> decode order. For example,
2243+
- input_tokens[:num_prefill_tokens] contains prefill tokens.
2244+
- input_tokens[num_prefill_tokens:] contains decode tokens.
2245+
If cuda graph is required, this API automatically pads inputs.
2246+
"""
2247+
with self.profiler.record_event('internal', 'prepare_input_tensors'):
2248+
assert seq_group_metadata_list is not None
2249+
if self.profiler.enabled:
2250+
self.profiler_counter_helper.capture_seq_group_metadata_stats(
2251+
seq_group_metadata_list=seq_group_metadata_list)
2252+
model_input, sampling_metadata = self.prepare_input_tensors(
2253+
seq_group_metadata_list, finished_requests_ids, align_worker)
2254+
assert model_input.attn_metadata is not None
2255+
is_prompt = model_input.attn_metadata.is_prompt
2256+
2257+
return ModelInputForHPUWithSamplingMetadata(
2258+
input_tokens=model_input.input_tokens,
2259+
input_positions=model_input.input_positions,
2260+
seq_lens=model_input.seq_lens,
2261+
query_lens=model_input.query_lens,
2262+
lora_mapping=model_input.lora_mapping,
2263+
lora_requests=model_input.lora_requests,
2264+
attn_metadata=model_input.attn_metadata,
2265+
multi_modal_kwargs=model_input.multi_modal_kwargs,
2266+
real_batch_size=model_input.real_batch_size,
2267+
batch_size_padded=model_input.batch_size_padded,
2268+
virtual_engine=virtual_engine,
2269+
lora_ids=model_input.lora_ids,
2270+
async_callback=model_input.async_callback,
2271+
is_first_multi_step=model_input.is_first_multi_step,
2272+
is_last_step=model_input.is_last_step,
2273+
previous_hidden_states=model_input.previous_hidden_states,
2274+
sampling_metadata=sampling_metadata,
2275+
is_prompt=is_prompt,
2276+
)
2277+
22302278
def create_lora_mask(self, input_tokens: torch.Tensor, lora_ids: List[int],
22312279
is_prompt: bool):
22322280
'''
@@ -3160,38 +3208,6 @@ def prepare_model_input(
31603208
finished_requests_ids,
31613209
False)
31623210

3163-
@torch.inference_mode()
3164-
def prepare_model_input_align_worker(
3165-
self,
3166-
seq_group_metadata_list: List[SequenceGroupMetadata],
3167-
virtual_engine: int = 0,
3168-
finished_requests_ids: Optional[List[str]] = None,
3169-
align_worker: bool = False,
3170-
) -> ModelInputForHPUWithSamplingMetadata:
3171-
"""Prepare the model input based on a given sequence group, including
3172-
metadata for the sampling step.
3173-
The API assumes seq_group_metadata_list is sorted by prefill -> decode.
3174-
The result tensors and data structure also batches input in prefill
3175-
-> decode order. For example,
3176-
- input_tokens[:num_prefill_tokens] contains prefill tokens.
3177-
- input_tokens[num_prefill_tokens:] contains decode tokens.
3178-
If cuda graph is required, this API automatically pads inputs.
3179-
"""
3180-
with self.profiler.record_event('internal', 'prepare_input_tensors'):
3181-
assert seq_group_metadata_list is not None
3182-
if self.profiler.enabled:
3183-
self.profiler_counter_helper.capture_seq_group_metadata_stats(
3184-
seq_group_metadata_list=seq_group_metadata_list)
3185-
model_input, sampling_metadata = self.prepare_input_tensors(
3186-
seq_group_metadata_list, finished_requests_ids, align_worker)
3187-
assert model_input.attn_metadata is not None
3188-
is_prompt = model_input.attn_metadata.is_prompt
3189-
3190-
return dataclasses.replace(model_input,
3191-
sampling_metadata=sampling_metadata,
3192-
is_prompt=is_prompt,
3193-
virtual_engine=virtual_engine)
3194-
31953211
def finish_measurements(self):
31963212
from neural_compressor.torch.quantization import finalize_calibration
31973213
finalize_calibration(self.model.model)

vllm/worker/hpu_pooling_model_runner.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def execute_model(
3939
intermediate_tensors: Optional[IntermediateTensors] = None,
4040
num_steps: int = 1,
4141
warmup_mode=False,
42+
**kwargs,
4243
) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]:
4344
if num_steps > 1:
4445
raise ValueError(
@@ -189,6 +190,20 @@ def prepare_model_input(
189190
virtual_engine=virtual_engine,
190191
pooling_metadata=pooling_metadata)
191192

193+
@torch.inference_mode()
194+
def prepare_model_input_align_worker(
195+
self,
196+
seq_group_metadata_list: List[SequenceGroupMetadata],
197+
virtual_engine: int = 0,
198+
finished_requests_ids: Optional[List[str]] = None,
199+
align_worker: bool = False,
200+
) -> ModelInputForHPUWithPoolingMetadata:
201+
return self.prepare_model_input(
202+
seq_group_metadata_list,
203+
virtual_engine,
204+
finished_requests_ids,
205+
)
206+
192207
def _prepare_pooling(
193208
self,
194209
seq_group_metadata_list: List[SequenceGroupMetadata],

0 commit comments

Comments (0)