
Commit 22135c9

Offline inference metrics test fixed (vllm-project#140)
1 parent e97134d

File tree

1 file changed: +16 −1 lines changed

vllm/engine/llm_engine.py

Lines changed: 16 additions & 1 deletion
@@ -44,6 +44,7 @@
 from vllm.multimodal.processing import EncDecMultiModalProcessor
 from vllm.outputs import (PoolingRequestOutput, RequestOutput,
                           RequestOutputFactory)
+from vllm.platforms import current_platform
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import RequestOutputKind, SamplingParams
@@ -1814,7 +1815,21 @@ def _get_stats(self,
                 # TPOTs.
                 latency = seq_group.get_last_token_latency()
                 # last_token_time is set only for the last step so take avg
-                num_outputs = scheduler_outputs.num_lookahead_slots + 1
+                if current_platform.is_tt():
+                    # for the current tt model runner, the number of steps
+                    # executed is not always the same as the number of
+                    # lookahead slots but rather the number of balance
+                    # tokens left to be generated.
+                    assert len(
+                        seq_group.seqs
+                    ) == 1, "Only one seq per group is allowed for TT"
+                    total_tokens = seq_group.seqs[0].get_output_len() - 1
+                    max_steps = scheduler_outputs.num_lookahead_slots + 1
+                    num_outputs = (total_tokens %
+                                   max_steps if total_tokens %
+                                   max_steps != 0 else max_steps)
+                else:
+                    num_outputs = scheduler_outputs.num_lookahead_slots + 1
                 latency /= num_outputs
                 time_per_output_tokens_iter.append(latency)
                 if seq_group.state.current_step == 0:
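For context, a minimal sketch of the arithmetic the TT branch performs. The last-step latency is averaged over however many tokens that step actually produced: steps emit up to max_steps tokens each (num_lookahead_slots + 1 in the diff), so the final step emits the remainder of total_tokens, or a full max_steps batch when total_tokens divides evenly. The helper name num_outputs_last_step and the values below are illustrative, not part of vLLM:

# Hedged sketch of the num_outputs computation in the TT branch above.
# num_outputs_last_step is a hypothetical helper; total_tokens and
# max_steps mirror the variables in the diff, the numbers are made up.

def num_outputs_last_step(total_tokens: int, max_steps: int) -> int:
    # The final step emits the remainder of total_tokens, or a full
    # batch of max_steps tokens when there is no remainder.
    remainder = total_tokens % max_steps
    return remainder if remainder != 0 else max_steps

# 10 tokens generated in steps of up to 4: 4 + 4 + 2, last step emits 2.
assert num_outputs_last_step(10, 4) == 2
# 8 tokens in steps of 4: 4 + 4, last step emits a full batch of 4.
assert num_outputs_last_step(8, 4) == 4

Dividing the measured last-step latency by this count, rather than by the fixed num_lookahead_slots + 1, keeps the reported time-per-output-token (TPOT) from being understated when the final TT step generates fewer tokens than a full lookahead batch.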
