
Commit b0d7008

yecohn and LeiWang1999 authored and committed
[Frontend] Add Usage data in each chunk for chat_serving. vllm-project#6540 (vllm-project#6652)
Signed-off-by: LeiWang1999 <leiwang1999@outlook.com>
1 parent e69c8cf commit b0d7008

File tree: 2 files changed, +78 and -12 lines
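
In practice, a client opts into the new behaviour by passing stream_options with both "include_usage" and the added "continuous_usage_stats" flag, exactly as exercised by the test below. A minimal client-side sketch, assuming an OpenAI-compatible vLLM server at http://localhost:8000/v1; the base_url, api_key, and model name are placeholder assumptions, not part of this commit:

# Sketch: stream a chat completion with per-chunk usage reporting enabled.
# base_url, api_key, and the model name are placeholder assumptions.
import asyncio

import openai


async def main() -> None:
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")
    stream = await client.chat.completions.create(
        model="my-model",  # placeholder; use the model actually served
        messages=[{"role": "user", "content": "Hello!"}],
        max_tokens=10,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,  # usage attached to every chunk
        },
    )
    async for chunk in stream:
        # With continuous_usage_stats enabled, chunks carry a usage block.
        if chunk.usage is not None:
            print(chunk.usage.prompt_tokens, chunk.usage.completion_tokens,
                  chunk.usage.total_tokens)


asyncio.run(main())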


tests/entrypoints/openai/test_chat.py

Lines changed: 32 additions & 8 deletions
@@ -295,14 +295,19 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
     async for chunk in stream:
         assert chunk.usage is None

-    # Test stream=True, stream_options={"include_usage": True}
-    stream = await client.chat.completions.create(
-        model=model_name,
-        messages=messages,
-        max_tokens=10,
-        temperature=0.0,
-        stream=True,
-        stream_options={"include_usage": True})
+    # Test stream=True, stream_options={"include_usage": True,
+    #                                    "continuous_usage_stats": False}}
+    stream = await client.chat.completions.create(model=model_name,
+                                                  messages=messages,
+                                                  max_tokens=10,
+                                                  temperature=0.0,
+                                                  stream=True,
+                                                  stream_options={
+                                                      "include_usage":
+                                                      True,
+                                                      "continuous_usage_stats":
+                                                      False
+                                                  })

     async for chunk in stream:
         if chunk.choices[0].finish_reason is None:

@@ -338,6 +343,25 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
             stream=False,
             stream_options={"include_usage": True})

+    # Test stream=True, stream_options={"include_usage": True,
+    #                                    "continuous_usage_stats": True}
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+        stream=True,
+        stream_options={
+            "include_usage": True,
+            "continuous_usage_stats": True
+        },
+    )
+    async for chunk in stream:
+        assert chunk.usage.prompt_tokens >= 0
+        assert chunk.usage.completion_tokens >= 0
+        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
+                                            chunk.usage.completion_tokens)
+

     # NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
     # (i.e. using the same ordering as in the Completions API tests), the test

vllm/entrypoints/openai/serving_chat.py

Lines changed: 46 additions & 4 deletions
@@ -247,7 +247,15 @@ async def chat_completion_stream_generator(
                         model=model_name)
                     if (request.stream_options
                             and request.stream_options.include_usage):
-                        chunk.usage = None
+                        if (request.stream_options.continuous_usage_stats):
+                            prompt_tokens = len(res.prompt_token_ids)
+                            usage = UsageInfo(prompt_tokens=prompt_tokens,
+                                              completion_tokens=0,
+                                              total_tokens=prompt_tokens)
+                            chunk.usage = usage
+                        else:
+                            chunk.usage = None
+
                     data = chunk.model_dump_json(exclude_unset=True)
                     yield f"data: {data}\n\n"


@@ -277,7 +285,18 @@ async def chat_completion_stream_generator(
                                 model=model_name)
                             if (request.stream_options and
                                     request.stream_options.include_usage):
-                                chunk.usage = None
+                                if (request.stream_options.
+                                        continuous_usage_stats):
+                                    prompt_tokens = len(
+                                        res.prompt_token_ids)
+                                    usage = UsageInfo(
+                                        prompt_tokens=prompt_tokens,
+                                        completion_tokens=0,
+                                        total_tokens=prompt_tokens)
+                                    chunk.usage = usage
+                                else:
+                                    chunk.usage = None
+
                             data = chunk.model_dump_json(
                                 exclude_unset=True)
                             yield f"data: {data}\n\n"

@@ -336,7 +355,19 @@ async def chat_completion_stream_generator(
                         model=model_name)
                     if (request.stream_options
                             and request.stream_options.include_usage):
-                        chunk.usage = None
+                        if (request.stream_options.continuous_usage_stats):
+                            prompt_tokens = len(res.prompt_token_ids)
+                            completion_tokens = len(output.token_ids)
+                            usage = UsageInfo(
+                                prompt_tokens=prompt_tokens,
+                                completion_tokens=completion_tokens,
+                                total_tokens=prompt_tokens +
+                                completion_tokens,
+                            )
+                            chunk.usage = usage
+                        else:
+                            chunk.usage = None
+
                     data = chunk.model_dump_json(exclude_unset=True)
                     yield f"data: {data}\n\n"
                 else:

@@ -356,7 +387,18 @@ async def chat_completion_stream_generator(
                         model=model_name)
                     if (request.stream_options
                             and request.stream_options.include_usage):
-                        chunk.usage = None
+                        if (request.stream_options.continuous_usage_stats):
+                            prompt_tokens = len(res.prompt_token_ids)
+                            completion_tokens = len(output.token_ids)
+                            usage = UsageInfo(
+                                prompt_tokens=prompt_tokens,
+                                completion_tokens=completion_tokens,
+                                total_tokens=prompt_tokens +
+                                completion_tokens,
+                            )
+                            chunk.usage = usage
+                        else:
+                            chunk.usage = None
                     data = chunk.model_dump_json(exclude_unset=True)
                     yield f"data: {data}\n\n"
                     finish_reason_sent[i] = True
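
All four hunks above apply the same rule at each point a chunk is emitted: when include_usage and continuous_usage_stats are both set, attach a UsageInfo built from the prompt token ids (plus the output token ids generated so far, for delta and finish chunks); otherwise leave chunk.usage as None so usage is only reported in the final dedicated chunk. A condensed standalone restatement of that rule, for illustration only; the helper name and the plain-dict return value are assumptions, not the actual vLLM code path:

from typing import Optional, Sequence


def per_chunk_usage(include_usage: bool,
                    continuous_usage_stats: bool,
                    prompt_token_ids: Sequence[int],
                    output_token_ids: Sequence[int] = ()) -> Optional[dict]:
    # Illustrative helper mirroring the logic added in this commit.
    # Returns a usage dict for the current chunk, or None when no usage
    # block should be attached to intermediate chunks.
    if not (include_usage and continuous_usage_stats):
        return None
    prompt_tokens = len(prompt_token_ids)
    completion_tokens = len(output_token_ids)
    return {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": prompt_tokens + completion_tokens,
    }


# Example: a delta chunk emitted after 3 generated tokens for a 12-token
# prompt carries prompt_tokens=12, completion_tokens=3, total_tokens=15.
print(per_chunk_usage(True, True, list(range(12)), list(range(3))))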
