1
1
"""Sequence and its related classes."""
2
2
import copy
3
3
import enum
4
+ from dataclasses import dataclass
4
5
from typing import Dict , List , Optional , Union
5
6
6
7
from vllm .block import LogicalTokenBlock
@@ -49,6 +50,25 @@ def get_finished_reason(status: "SequenceStatus") -> Union[str, None]:
49
50
return finish_reason
50
51
51
52
53
@dataclass
class RequestMetrics:
    """Metrics associated with a request.

    Args:
        arrival_time: The time when the request arrived.
        last_token_time: The time when the most recent token was generated
            (initialized to the arrival time before any token is produced).
        first_scheduled_time: The time when the request was first scheduled.
        first_token_time: The time when the first token was generated.
        time_in_queue: The time the request spent in the queue.
        finished_time: The time when the request was finished.
    """
    arrival_time: float
    last_token_time: float
    first_scheduled_time: Optional[float]
    first_token_time: Optional[float]
    time_in_queue: Optional[float]
    # Only field with a default: a request starts out unfinished.
    finished_time: Optional[float] = None
71
+
52
72
class SequenceData :
53
73
"""Data associated with a sequence.
54
74
@@ -252,8 +272,11 @@ def __init__(
252
272
self .request_id = request_id
253
273
self .seqs_dict = {seq .seq_id : seq for seq in seqs }
254
274
self .sampling_params = sampling_params
255
- self .arrival_time = arrival_time
256
- self .last_token_time = arrival_time
275
+ self .metrics = RequestMetrics (arrival_time = arrival_time ,
276
+ last_token_time = arrival_time ,
277
+ first_scheduled_time = None ,
278
+ first_token_time = None ,
279
+ time_in_queue = None )
257
280
self .lora_request = lora_request
258
281
self .prefix : Optional [Prefix ] = prefix
259
282
self .prompt_logprobs : Optional [PromptLogprobs ] = None
@@ -276,10 +299,25 @@ def lora_int_id(self) -> int:
276
299
277
300
def get_last_latency(self, now: float) -> float:
    """Gets last token latency for Request level timings."""
    # Elapsed time since the previous token, then roll the marker forward.
    elapsed = now - self.metrics.last_token_time
    self.metrics.last_token_time = now
    return elapsed
282
305
306
def maybe_set_first_token_time(self, time: float) -> None:
    """Sets the first token time for Request level timings."""
    # Only the very first call records a value; later calls are no-ops.
    if self.metrics.first_token_time is not None:
        return
    self.metrics.first_token_time = time
310
+
311
def maybe_set_first_scheduled_time(self, time: float) -> None:
    """Sets the first scheduled time and time in queue for Request level timings."""
    # Subsequent schedulings of the same request must not overwrite
    # the first recorded values.
    if self.metrics.first_scheduled_time is not None:
        return
    self.metrics.first_scheduled_time = time
    self.metrics.time_in_queue = time - self.metrics.arrival_time
316
+
317
def set_finished_time(self, time: Optional[float]) -> None:
    """Sets the finished time for Request level timings."""
    # Unconditional write: finishing stamps (or clears, if None) the time.
    self.metrics.finished_time = time
320
+
283
321
def get_max_num_running_seqs (self ) -> int :
284
322
"""The maximum number of sequences running in parallel in the remaining
285
323
lifetime of the request."""
0 commit comments