@@ -87,16 +87,17 @@ def __init__(self, sliding_window_size: float):
8787 """
8888 self .sliding_window_size = sliding_window_size
8989
90- # Monitors for calculating QPS and TTFT
90+ # Finished requests for each serving engine
91+ # The elements in the deque should be sorted by 'complete' time
9192 self .qps_monitors : Dict [str , MovingAverageMonitor ] = {}
9293 self .ttft_monitors : Dict [str , MovingAverageMonitor ] = {}
9394
94- # Record initial request start time: (engine_url, request_id) -> timestamp
95+ # Arrival time of each request: (engine_url, request_id) -> timestamp
9596 self .request_start_time : Dict [Tuple [str , str ], float ] = {}
9697 # Record time when first token is received: (engine_url, request_id) -> timestamp
9798 self .first_token_time : Dict [Tuple [str , str ], float ] = {}
9899
99- # Counters for requests in different stages
100+ # Number of requests in different stages (from the start of the router)
100101 self .in_prefill_requests : Dict [str , int ] = {}
101102 self .in_decoding_requests : Dict [str , int ] = {}
102103 self .finished_requests : Dict [str , int ] = {}
@@ -195,13 +196,16 @@ def on_request_swapped(self, engine_url: str, request_id: str, timestamp: float)
195196
196197 def get_request_stats (self , current_time : float ) -> Dict [str , RequestStats ]:
197198 """
198- Get the request statistics from the monitor.
199+ Get the request statistics for each serving engine
199200
200201 Args:
201- current_time: The current timestamp
202+ current_time: The current timestamp in seconds
202203
203204 Returns:
204- A dictionary mapping engine URLs to RequestStats objects
205+ A dictionary where the key is the serving engine URL and the value
206+ is the request statistics for that engine.
207+ The TTFT and inter-token latency will be -1 if there are no requests
208+ finished in the sliding window.
205209 """
206210 ret = {}
207211 urls = set (self .in_prefill_requests .keys ()).union (
0 commit comments