Commit 017d9f1
Add metrics to RequestOutput (#2876)
1 parent 181b27d commit 017d9f1

File tree

6 files changed (+61, -9 lines changed)

tests/async_engine/test_request_tracker.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -64,7 +64,7 @@ def test_request_tracker():
     stream_5 = tracker.add_request("5")
     assert tracker.new_requests_event.flag
     tracker.process_request_output(
-        RequestOutput("2", "output", [], [], [], finished=True))
+        RequestOutput("2", "output", [], [], [], bool(finished)))
     new, finished = tracker.get_new_and_finished_requests()
     assert not tracker.new_requests_event.flag
     assert len(finished) == 1
```
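
For context, the constructor's positional order after this change is (request_id, prompt, prompt_token_ids, prompt_logprobs, outputs, finished), with the new metrics and the existing lora_request as optional trailing parameters. A minimal sketch, with illustrative values:

```python
from vllm.outputs import RequestOutput

# `finished` is the sixth positional argument; the new `metrics`
# parameter defaults to None, so existing call sites keep working.
out = RequestOutput("req-0", "some prompt", [], [], [], True)
assert out.metrics is None
```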

vllm/core/policy.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -33,7 +33,7 @@ def get_priority(
         now: float,
         seq_group: SequenceGroup,
     ) -> float:
-        return now - seq_group.arrival_time
+        return now - seq_group.metrics.arrival_time
 
 
 class PolicyFactory:
```
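
Only the attribute path changes here; FCFS priority is still the request's waiting time. A sketch of the ordering this produces, using hypothetical stand-in classes (FakeMetrics, FakeGroup) rather than a real SequenceGroup:

```python
import time
from dataclasses import dataclass

# Hypothetical stand-ins exposing only what get_priority reads.
@dataclass
class FakeMetrics:
    arrival_time: float

@dataclass
class FakeGroup:
    metrics: FakeMetrics

now = time.time()
older = FakeGroup(FakeMetrics(arrival_time=now - 5.0))
newer = FakeGroup(FakeMetrics(arrival_time=now - 1.0))

# FCFS priority is waiting time, so sorting by descending priority
# puts the request that arrived first at the front of the queue.
ranked = sorted([newer, older],
                key=lambda g: now - g.metrics.arrival_time,
                reverse=True)
assert ranked[0] is older
```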

vllm/core/scheduler.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -365,10 +365,13 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
         # This function call changes the internal states of the scheduler
         # such as self.running, self.swapped, and self.waiting.
         scheduler_outputs = self._schedule()
+        now = time.time()
 
         # Create input data structures.
         seq_group_metadata_list: List[SequenceGroupMetadata] = []
         for seq_group in scheduler_outputs.scheduled_seq_groups:
+            seq_group.maybe_set_first_scheduled_time(now)
+
             seq_data: Dict[int, SequenceData] = {}
             block_tables: Dict[int, List[int]] = {}
             for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
```
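
schedule() runs every engine iteration, so the "maybe set" guard keeps the update idempotent: first_scheduled_time and time_in_queue are fixed the first time the group is scheduled and never overwritten. A sketch of that pattern with illustrative timestamps:

```python
# Illustrative timestamps: arrival, then two scheduler iterations.
t_arrival, t_iter1, t_iter2 = 100.0, 102.0, 103.0

first_scheduled_time = None
time_in_queue = None
for now in (t_iter1, t_iter2):        # schedule() runs each iteration
    if first_scheduled_time is None:  # only the first pass takes effect
        first_scheduled_time = now
        time_in_queue = now - t_arrival

assert first_scheduled_time == t_iter1
assert time_in_queue == 2.0           # fixed at the first scheduling
```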

vllm/engine/llm_engine.py

Lines changed: 5 additions & 2 deletions
```diff
@@ -728,6 +728,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
     def _process_model_outputs(
             self, output: SamplerOutput,
             scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]:
+        now = time.time()
         # Update the scheduled sequence groups with the model outputs.
         scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
         for seq_group, outputs in zip(scheduled_seq_groups, output):
@@ -739,6 +740,7 @@ def _process_model_outputs(
         # Create the outputs.
         request_outputs: List[RequestOutput] = []
         for seq_group in scheduled_seq_groups:
+            seq_group.maybe_set_first_token_time(now)
             request_output = RequestOutput.from_seq_group(seq_group)
             request_outputs.append(request_output)
         for seq_group in scheduler_outputs.ignored_seq_groups:
@@ -876,11 +878,12 @@ def _get_stats(self,
         # Latency Timings.
         time_last_iters = []
         for seq_group in scheduler_outputs.scheduled_seq_groups:
-            # Time since last token. (n.b. updates seq_group.last_token_time)
+            # Time since last token. (n.b. updates seq_group.metrics.last_token_time)
             time_last_iters.append(seq_group.get_last_latency(now))
             # Time since arrival for all finished requests.
             if seq_group.is_finished():
-                time_e2e_requests.append(now - seq_group.arrival_time)
+                time_e2e_requests.append(now -
+                                         seq_group.metrics.arrival_time)
 
         time_to_first_tokens = time_last_iters if prompt_run else []
         time_per_output_tokens = [] if prompt_run else time_last_iters
```
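
A single now = time.time() is captured per step and shared across the batch, so first-token timestamps are mutually consistent. Downstream code might derive latency figures from the recorded fields along these lines (hypothetical helper functions; the None checks guard requests that have not reached a given stage):

```python
from typing import Optional

from vllm.sequence import RequestMetrics

def time_to_first_token(m: RequestMetrics) -> Optional[float]:
    """Arrival to first generated token, if one has been produced."""
    if m.first_token_time is None:
        return None
    return m.first_token_time - m.arrival_time

def e2e_latency(m: RequestMetrics) -> Optional[float]:
    """Arrival to completion, if the request has finished."""
    if m.finished_time is None:
        return None
    return m.finished_time - m.arrival_time
```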

vllm/outputs.py

Lines changed: 9 additions & 1 deletion
```diff
@@ -1,7 +1,8 @@
 from typing import List, Optional
+import time
 
 from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup,
-                           SequenceStatus)
+                           SequenceStatus, RequestMetrics)
 from vllm.lora.request import LoRARequest
 
 
@@ -60,6 +61,7 @@ class RequestOutput:
         prompt_logprobs: The log probabilities to return per prompt token.
         outputs: The output sequences of the request.
         finished: Whether the whole request is finished.
+        metrics: Metrics associated with the request.
         lora_request: The LoRA request that was used to generate the output.
     """
 
@@ -71,6 +73,7 @@ def __init__(
         prompt_logprobs: Optional[PromptLogprobs],
         outputs: List[CompletionOutput],
         finished: bool,
+        metrics: Optional[RequestMetrics] = None,
         lora_request: Optional[LoRARequest] = None,
     ) -> None:
         self.request_id = request_id
@@ -79,6 +82,7 @@ def __init__(
         self.prompt_logprobs = prompt_logprobs
         self.outputs = outputs
         self.finished = finished
+        self.metrics = metrics
         self.lora_request = lora_request
 
     @classmethod
@@ -115,12 +119,15 @@ def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput":
         prompt_token_ids = seq_group.prompt_token_ids
         prompt_logprobs = seq_group.prompt_logprobs
         finished = seq_group.is_finished()
+        finished_time = time.time() if finished else None
+        seq_group.set_finished_time(finished_time)
         return cls(seq_group.request_id,
                    prompt,
                    prompt_token_ids,
                    prompt_logprobs,
                    outputs,
                    finished,
+                   seq_group.metrics,
                    lora_request=seq_group.lora_request)
 
     def __repr__(self) -> str:
@@ -130,4 +137,5 @@ def __repr__(self) -> str:
                 f"prompt_logprobs={self.prompt_logprobs}, "
                 f"outputs={self.outputs}, "
                 f"finished={self.finished}, "
+                f"metrics={self.metrics}, "
                 f"lora_request={self.lora_request})")
```

vllm/sequence.py

Lines changed: 42 additions & 4 deletions
```diff
@@ -1,6 +1,7 @@
 """Sequence and its related classes."""
 import copy
 import enum
+from dataclasses import dataclass
 from typing import Dict, List, Optional, Union
 
 from vllm.block import LogicalTokenBlock
@@ -49,6 +50,25 @@ def get_finished_reason(status: "SequenceStatus") -> Union[str, None]:
         return finish_reason
 
 
+@dataclass
+class RequestMetrics:
+    """Metrics associated with a request.
+
+    Args:
+        arrival_time: The time when the request arrived.
+        first_scheduled_time: The time when the request was first scheduled.
+        first_token_time: The time when the first token was generated.
+        time_in_queue: The time the request spent in the queue.
+        finished_time: The time when the request was finished.
+    """
+    arrival_time: float
+    last_token_time: float
+    first_scheduled_time: Optional[float]
+    first_token_time: Optional[float]
+    time_in_queue: Optional[float]
+    finished_time: Optional[float] = None
+
+
 class SequenceData:
     """Data associated with a sequence.
 
@@ -252,8 +272,11 @@ def __init__(
         self.request_id = request_id
         self.seqs_dict = {seq.seq_id: seq for seq in seqs}
         self.sampling_params = sampling_params
-        self.arrival_time = arrival_time
-        self.last_token_time = arrival_time
+        self.metrics = RequestMetrics(arrival_time=arrival_time,
+                                      last_token_time=arrival_time,
+                                      first_scheduled_time=None,
+                                      first_token_time=None,
+                                      time_in_queue=None)
         self.lora_request = lora_request
         self.prefix: Optional[Prefix] = prefix
         self.prompt_logprobs: Optional[PromptLogprobs] = None
@@ -276,10 +299,25 @@ def lora_int_id(self) -> int:
 
     def get_last_latency(self, now: float) -> float:
         """Gets last token latency for Request level timings."""
-        latency = now - self.last_token_time
-        self.last_token_time = now
+        latency = now - self.metrics.last_token_time
+        self.metrics.last_token_time = now
         return latency
 
+    def maybe_set_first_token_time(self, time: float) -> None:
+        """Sets the first token time for Request level timings."""
+        if self.metrics.first_token_time is None:
+            self.metrics.first_token_time = time
+
+    def maybe_set_first_scheduled_time(self, time: float) -> None:
+        """Sets the first scheduled time and time in queue for Request level timings."""
+        if self.metrics.first_scheduled_time is None:
+            self.metrics.first_scheduled_time = time
+            self.metrics.time_in_queue = time - self.metrics.arrival_time
+
+    def set_finished_time(self, time: Optional[float]) -> None:
+        """Sets the finished time for Request level timings."""
+        self.metrics.finished_time = time
+
     def get_max_num_running_seqs(self) -> int:
         """The maximum number of sequences running in parallel in the remaining
         lifetime of the request."""
```
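
Taken together, the engine populates RequestMetrics in lifecycle order: arrival, first schedule, first token, finish. A hand-traced sketch of that sequence (timestamps taken back-to-back purely for illustration):

```python
import time

from vllm.sequence import RequestMetrics

t0 = time.time()
metrics = RequestMetrics(arrival_time=t0,
                         last_token_time=t0,
                         first_scheduled_time=None,
                         first_token_time=None,
                         time_in_queue=None)

# What maybe_set_first_scheduled_time does on the first schedule:
t1 = time.time()
if metrics.first_scheduled_time is None:
    metrics.first_scheduled_time = t1
    metrics.time_in_queue = t1 - metrics.arrival_time

# What maybe_set_first_token_time does when the first token lands:
if metrics.first_token_time is None:
    metrics.first_token_time = time.time()

# What set_finished_time does when the request completes:
metrics.finished_time = time.time()
```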
