@@ -27,8 +27,8 @@ class RequestFuncInput:
2727class RequestFuncOutput :
2828 generated_text : str = ""
2929 success : bool = False
30- latency : float = 0
31- ttft : float = 0 # Time to first token
30+ latency : float = 0.0
31+ ttft : float = 0.0 # Time to first token
3232 itl : List [float ] = field (
3333 default_factory = list ) # List of inter-token latencies
3434 prompt_len : int = 0
@@ -58,23 +58,24 @@ async def async_request_tgi(
5858 output = RequestFuncOutput ()
5959 output .prompt_len = request_func_input .prompt_len
6060
61- ttft = 0
61+ ttft = 0.0
6262 st = time .perf_counter ()
6363 most_recent_timestamp = st
6464 try :
6565 async with session .post (url = api_url , json = payload ) as response :
6666 if response .status == 200 :
67- async for chunk in response .content :
68- chunk = chunk .strip ()
69- if not chunk :
67+ async for chunk_bytes in response .content :
68+ chunk_bytes = chunk_bytes .strip ()
69+ if not chunk_bytes :
7070 continue
7171
72- chunk = remove_prefix (chunk .decode ("utf-8" ), "data:" )
72+ chunk = remove_prefix (chunk_bytes .decode ("utf-8" ),
73+ "data:" )
7374
7475 data = json .loads (chunk )
7576 timestamp = time .perf_counter ()
7677 # First token
77- if ttft == 0 :
78+ if ttft == 0.0 :
7879 ttft = time .perf_counter () - st
7980 output .ttft = ttft
8081
@@ -119,23 +120,24 @@ async def async_request_trt_llm(
119120 output = RequestFuncOutput ()
120121 output .prompt_len = request_func_input .prompt_len
121122
122- ttft = 0
123+ ttft = 0.0
123124 st = time .perf_counter ()
124125 most_recent_timestamp = st
125126 try :
126127 async with session .post (url = api_url , json = payload ) as response :
127128 if response .status == 200 :
128- async for chunk in response .content :
129- chunk = chunk .strip ()
130- if not chunk :
129+ async for chunk_bytes in response .content :
130+ chunk_bytes = chunk_bytes .strip ()
131+ if not chunk_bytes :
131132 continue
132133
133- chunk = remove_prefix (chunk .decode ("utf-8" ), "data:" )
134+ chunk = remove_prefix (chunk_bytes .decode ("utf-8" ),
135+ "data:" )
134136
135137 data = json .loads (chunk )
136138 timestamp = time .perf_counter ()
137139 # First token
138- if ttft == 0 :
140+ if ttft == 0.0 :
139141 ttft = time .perf_counter () - st
140142 output .ttft = ttft
141143
@@ -151,7 +153,7 @@ async def async_request_trt_llm(
151153 output .success = True
152154
153155 else :
154- output .error = response .reason
156+ output .error = response .reason or ""
155157 output .success = False
156158 except Exception :
157159 output .success = False
@@ -195,7 +197,7 @@ async def async_request_deepspeed_mii(
195197 output .generated_text = parsed_resp ["text" ][0 ]
196198 output .success = True
197199 else :
198- output .error = response .reason
200+ output .error = response .reason or ""
199201 output .success = False
200202 except Exception :
201203 output .success = False
@@ -234,19 +236,20 @@ async def async_request_openai_completions(
234236 output .prompt_len = request_func_input .prompt_len
235237
236238 generated_text = ""
237- ttft = 0
239+ ttft = 0.0
238240 st = time .perf_counter ()
239241 most_recent_timestamp = st
240242 try :
241243 async with session .post (url = api_url , json = payload ,
242244 headers = headers ) as response :
243245 if response .status == 200 :
244- async for chunk in response .content :
245- chunk = chunk .strip ()
246- if not chunk :
246+ async for chunk_bytes in response .content :
247+ chunk_bytes = chunk_bytes .strip ()
248+ if not chunk_bytes :
247249 continue
248250
249- chunk = remove_prefix (chunk .decode ("utf-8" ), "data: " )
251+ chunk = remove_prefix (chunk_bytes .decode ("utf-8" ),
252+ "data: " )
250253 if chunk == "[DONE]" :
251254 latency = time .perf_counter () - st
252255 else :
@@ -255,7 +258,7 @@ async def async_request_openai_completions(
255258 if data ["choices" ][0 ]["text" ]:
256259 timestamp = time .perf_counter ()
257260 # First token
258- if ttft == 0 :
261+ if ttft == 0.0 :
259262 ttft = time .perf_counter () - st
260263 output .ttft = ttft
261264
@@ -315,19 +318,20 @@ async def async_request_openai_chat_completions(
315318 output .prompt_len = request_func_input .prompt_len
316319
317320 generated_text = ""
318- ttft = 0
321+ ttft = 0.0
319322 st = time .perf_counter ()
320323 most_recent_timestamp = st
321324 try :
322325 async with session .post (url = api_url , json = payload ,
323326 headers = headers ) as response :
324327 if response .status == 200 :
325- async for chunk in response .content :
326- chunk = chunk .strip ()
327- if not chunk :
328+ async for chunk_bytes in response .content :
329+ chunk_bytes = chunk_bytes .strip ()
330+ if not chunk_bytes :
328331 continue
329332
330- chunk = remove_prefix (chunk .decode ("utf-8" ), "data: " )
333+ chunk = remove_prefix (chunk_bytes .decode ("utf-8" ),
334+ "data: " )
331335 if chunk == "[DONE]" :
332336 latency = time .perf_counter () - st
333337 else :
@@ -337,7 +341,7 @@ async def async_request_openai_chat_completions(
337341 delta = data ["choices" ][0 ]["delta" ]
338342 if delta .get ("content" , None ):
339343 # First token
340- if ttft == 0 :
344+ if ttft == 0.0 :
341345 ttft = time .perf_counter () - st
342346 output .ttft = ttft
343347
@@ -354,7 +358,7 @@ async def async_request_openai_chat_completions(
354358 output .success = True
355359 output .latency = latency
356360 else :
357- output .error = response .reason
361+ output .error = response .reason or ""
358362 output .success = False
359363 except Exception :
360364 output .success = False
0 commit comments