@@ -488,6 +488,8 @@ class HabanaModelRunnerBase(ModelRunnerBase[TModelInputForHPU]):
     Helper class for shared methods between GPU model runners.
     """
     _model_input_cls: Type[TModelInputForHPU]
+    dummy_prompt_list: List[SequenceGroupMetadata]
+    dummy_decode_list: List[SequenceGroupMetadata]

     def __init__(
         self,
@@ -1087,9 +1089,16 @@ def prepare_input_tensors(
         batch_size_padded = find_bucket(real_batch_size, bucket_cfg)
         batch_size_padding = batch_size_padded - real_batch_size
         seq_group_metadata_list = seq_group_metadata_list.copy()
-        seq_group_metadata_list.extend(
-            self.create_dummy_seq_group_metadata(0, 0, is_prompt)
-            for _ in range(batch_size_padding))
+
+        if is_prompt:
+            seq_group_metadata_list.extend(
+                self.dummy_prompt_list
+                for _ in range(batch_size_padding))
+        else:
+            seq_group_metadata_list.extend(
+                self.dummy_decode_list
+                for _ in range(batch_size_padding))
+

         prefill_reqs = []
         decode_reqs = []
@@ -1292,6 +1301,9 @@ def profile_run(self) -> None:
             self.max_num_batched_tokens // max_batch_size)

         self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches)
+        self.dummy_prompt_list = self.create_dummy_seq_group_metadata(0, 0, 1)
+        self.dummy_decode_list = self.create_dummy_seq_group_metadata(0, 0, 0)
+
         return

     def warmup_scenario(self,
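
For reference, a minimal sketch of the caching pattern this diff applies: the dummy padding metadata is built once (the diff does this at the end of `profile_run`) and reused on every `prepare_input_tensors` call, instead of invoking `create_dummy_seq_group_metadata` per padding slot. `DummyMeta`, `RunnerSketch`, and `pad_batch` are hypothetical stand-ins, not vLLM APIs, and since this hunk does not show what `create_dummy_seq_group_metadata` returns, the sketch simply caches single objects.

```python
from dataclasses import dataclass
from typing import List


# Stand-in for vLLM's SequenceGroupMetadata (illustrative only).
@dataclass
class DummyMeta:
    is_prompt: bool


class RunnerSketch:
    """Builds dummy padding metadata once, then reuses it per batch."""

    def profile_run(self) -> None:
        # Create the dummy entries a single time, mirroring where the
        # diff initializes dummy_prompt_list / dummy_decode_list.
        self.dummy_prompt = DummyMeta(is_prompt=True)
        self.dummy_decode = DummyMeta(is_prompt=False)

    def pad_batch(self, batch: List[DummyMeta], padded_size: int,
                  is_prompt: bool) -> List[DummyMeta]:
        padding = padded_size - len(batch)
        batch = batch.copy()
        # Reuse the cached object rather than constructing a fresh dummy
        # for every padding slot on the hot path.
        filler = self.dummy_prompt if is_prompt else self.dummy_decode
        batch.extend(filler for _ in range(padding))
        return batch


runner = RunnerSketch()
runner.profile_run()
padded = runner.pad_batch([DummyMeta(is_prompt=True)], padded_size=4,
                          is_prompt=True)
assert len(padded) == 4
```

Building the dummies once moves their construction cost out of the per-step padding path; the trade-off is that every padding slot then shares the same cached object, which is safe only as long as downstream code treats padding entries as read-only.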