[Frontend] Allow return_tokens_as_token_ids to be passed as a request param #14066

Merged
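
This PR lets clients opt into token-ID-style tokens per request instead of only via the server-wide --return-tokens-as-token-ids flag. A minimal client-side sketch, assuming the OpenAI Python client and a vLLM server on localhost:8000 (the model name is a placeholder), passing the new field through extra_body just as the updated test does:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="my-model",  # placeholder; use whatever model the server is serving
    prompt="Hello, world",
    echo=True,
    temperature=0,
    max_tokens=10,
    logprobs=1,
    # New in this PR: request-level override of --return-tokens-as-token-ids.
    extra_body={"return_tokens_as_token_ids": True},
)

# With the option active, each logprob token is a string like "token_id:1234",
# so tokens that are not JSON-encodable can still be identified.
print(completion.choices[0].logprobs.tokens)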
44 changes: 30 additions & 14 deletions tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -17,18 +17,28 @@


@pytest.fixture(scope="module")
def server_with_return_tokens_as_token_ids_flag(
default_server_args): # noqa: F811
args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server:
yield remote_server
def server_fixture(request, default_server_args): # noqa: F811
use_server_flag = request.param
if use_server_flag:
args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server:
yield (remote_server, True)
else:
with RemoteOpenAIServer(MODEL_NAME,
default_server_args) as remote_server:
yield (remote_server, False)


@pytest.mark.asyncio
@pytest.mark.parametrize("server_fixture", [True, False], indirect=True)
async def test_completion_return_tokens_as_token_ids_completion(
server_with_return_tokens_as_token_ids_flag):
async with server_with_return_tokens_as_token_ids_flag.get_async_client(
) as client:
server_fixture):
server, use_server_flag = server_fixture
request_args = {}
if not use_server_flag:
request_args["return_tokens_as_token_ids"] = True

async with server.get_async_client() as client:

completion = await client.completions.create(
model=MODEL_NAME,
@@ -39,7 +49,8 @@ async def test_completion_return_tokens_as_token_ids_completion(
echo=True,
temperature=0,
max_tokens=10,
logprobs=1)
logprobs=1,
extra_body=request_args)

text = completion.choices[0].text
token_strs = completion.choices[0].logprobs.tokens
@@ -60,10 +71,14 @@ async def test_completion_return_tokens_as_token_ids_completion(


@pytest.mark.asyncio
async def test_chat_return_tokens_as_token_ids_completion(
server_with_return_tokens_as_token_ids_flag):
async with server_with_return_tokens_as_token_ids_flag.get_async_client(
) as client:
@pytest.mark.parametrize("server_fixture", [True, False], indirect=True)
async def test_chat_return_tokens_as_token_ids_completion(server_fixture):
server, use_server_flag = server_fixture
request_args = {}
if not use_server_flag:
request_args["return_tokens_as_token_ids"] = True

async with server.get_async_client() as client:
response = await client.chat.completions.create(
model=MODEL_NAME,
# Include Unicode characters to test for dividing a single
Expand All @@ -78,7 +93,8 @@ async def test_chat_return_tokens_as_token_ids_completion(
}],
temperature=0,
max_tokens=8,
logprobs=True)
logprobs=True,
extra_body=request_args)

text = response.choices[0].message.content
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
12 changes: 12 additions & 0 deletions vllm/entrypoints/openai/protocol.py
@@ -369,6 +369,12 @@ class ChatCompletionRequest(OpenAIBaseModel):
"arguments. For example: {'qualname': "
"'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
"{'param': 'value'}}."))
return_tokens_as_token_ids: Optional[bool] = Field(
default=None,
description=(
"If specified with 'logprobs', tokens are represented "
" as strings of the form 'token_id:{token_id}' so that tokens "
"that are not JSON-encodable can be identified."))

# doc: end-chat-completion-extra-params

@@ -739,6 +745,12 @@ class CompletionRequest(OpenAIBaseModel):
"arguments. For example: {'qualname': "
"'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
"{'param': 'value'}}."))
return_tokens_as_token_ids: Optional[bool] = Field(
default=None,
description=(
"If specified with 'logprobs', tokens are represented "
" as strings of the form 'token_id:{token_id}' so that tokens "
"that are not JSON-encodable can be identified."))

# doc: end-completion-extra-params

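The request-level field documented above produces tokens of the form 'token_id:{token_id}'. A small illustrative helper, not part of the PR, for recovering the integer IDs on the client side:

def parse_token_ids(tokens: list[str]) -> list[int]:
    # Each entry looks like "token_id:1234" when return_tokens_as_token_ids
    # is in effect; strip the prefix and convert the remainder to int.
    prefix = "token_id:"
    return [int(t.removeprefix(prefix)) for t in tokens if t.startswith(prefix)]

print(parse_token_ids(["token_id:101", "token_id:2023"]))  # [101, 2023]
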
21 changes: 13 additions & 8 deletions vllm/entrypoints/openai/serving_chat.py
@@ -451,6 +451,8 @@ async def chat_completion_stream_generator(
top_logprobs=output.logprobs,
tokenizer=tokenizer,
num_output_top_logprobs=request.top_logprobs,
return_as_token_id=request.
return_tokens_as_token_ids,
)
else:
logprobs = None
@@ -706,6 +708,7 @@ async def chat_completion_full_generator(
top_logprobs=out_logprobs,
num_output_top_logprobs=request.top_logprobs,
tokenizer=tokenizer,
return_as_token_id=request.return_tokens_as_token_ids,
)
else:
logprobs = None
@@ -853,13 +856,14 @@ async def chat_completion_full_generator(

def _get_top_logprobs(
self, logprobs: dict[int, Logprob], top_logprobs: Optional[int],
tokenizer: AnyTokenizer) -> list[ChatCompletionLogProb]:
tokenizer: AnyTokenizer,
should_return_as_token_id: bool) -> list[ChatCompletionLogProb]:
return [
ChatCompletionLogProb(token=(token := self._get_decoded_token(
p[1],
p[0],
tokenizer,
return_as_token_id=self.return_tokens_as_token_ids)),
return_as_token_id=should_return_as_token_id)),
logprob=max(p[1].logprob, -9999.0),
bytes=list(
token.encode("utf-8", errors="replace")))
@@ -873,15 +877,18 @@ def _create_chat_logprobs(
top_logprobs: GenericSequence[Optional[dict[int, Logprob]]],
tokenizer: AnyTokenizer,
num_output_top_logprobs: Optional[int] = None,
return_as_token_id: Optional[bool] = None,
) -> ChatCompletionLogProbs:
"""Create OpenAI-style logprobs."""
logprobs_content: list[ChatCompletionLogProbsContent] = []

should_return_as_token_id = return_as_token_id if \
return_as_token_id is not None else self.return_tokens_as_token_ids
for i, token_id in enumerate(token_ids):
step_top_logprobs = top_logprobs[i]
if step_top_logprobs is None:
token = tokenizer.decode(token_id)
if self.return_tokens_as_token_ids:
if should_return_as_token_id:
token = f"token_id:{token_id}"

logprobs_content.append(
@@ -899,16 +906,14 @@ def _create_chat_logprobs(
step_token,
token_id,
tokenizer,
self.return_tokens_as_token_ids,
should_return_as_token_id,
),
logprob=max(step_token.logprob, -9999.0),
bytes=None if step_decoded is None else list(
step_decoded.encode("utf-8", errors="replace")),
top_logprobs=self._get_top_logprobs(
step_top_logprobs,
num_output_top_logprobs,
tokenizer,
),
step_top_logprobs, num_output_top_logprobs,
tokenizer, should_return_as_token_id),
))

return ChatCompletionLogProbs(content=logprobs_content)
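The behavioral core of the serving_chat.py change is the fallback: the per-request value wins when it is set, and the server-wide flag only applies when the request leaves the field as None. A standalone sketch of that rule (the helper name is hypothetical, for illustration only):

from typing import Optional

def resolve_return_as_token_id(request_value: Optional[bool],
                               server_flag: bool) -> bool:
    # Mirrors the fallback in _create_chat_logprobs/_create_completion_logprobs:
    # prefer the per-request setting when given, otherwise use the value of
    # the --return-tokens-as-token-ids server flag.
    return request_value if request_value is not None else server_flag

assert resolve_return_as_token_id(None, True) is True    # server flag applies
assert resolve_return_as_token_id(False, True) is False  # request disables it
assert resolve_return_as_token_id(True, False) is True   # request enables it
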
12 changes: 9 additions & 3 deletions vllm/entrypoints/openai/serving_completion.py
@@ -317,6 +317,8 @@ async def completion_stream_generator(
num_output_top_logprobs=request.logprobs,
tokenizer=tokenizer,
initial_text_offset=previous_text_lens[i],
return_as_token_id=request.
return_tokens_as_token_ids,
)
else:
logprobs = None
@@ -443,6 +445,7 @@ def request_output_to_completion_response(
top_logprobs=out_logprobs,
tokenizer=tokenizer,
num_output_top_logprobs=request.logprobs,
return_as_token_id=request.return_tokens_as_token_ids,
)
else:
logprobs = None
@@ -484,6 +487,7 @@ def _create_completion_logprobs(
num_output_top_logprobs: int,
tokenizer: AnyTokenizer,
initial_text_offset: int = 0,
return_as_token_id: Optional[bool] = None,
) -> CompletionLogProbs:
"""Create logprobs for OpenAI Completion API."""
out_text_offset: list[int] = []
@@ -493,11 +497,13 @@

last_token_len = 0

should_return_as_token_id = return_as_token_id if \
return_as_token_id is not None else self.return_tokens_as_token_ids
for i, token_id in enumerate(token_ids):
step_top_logprobs = top_logprobs[i]
if step_top_logprobs is None:
token = tokenizer.decode(token_id)
if self.return_tokens_as_token_ids:
if should_return_as_token_id:
token = f"token_id:{token_id}"

out_tokens.append(token)
@@ -510,7 +516,7 @@
step_token,
token_id,
tokenizer,
return_as_token_id=self.return_tokens_as_token_ids,
return_as_token_id=should_return_as_token_id,
)
token_logprob = max(step_token.logprob, -9999.0)

@@ -527,7 +533,7 @@
self._get_decoded_token(top_lp[1],
top_lp[0],
tokenizer,
return_as_token_id=self.return_tokens_as_token_ids):
return_as_token_id=should_return_as_token_id):
max(top_lp[1].logprob, -9999.0)
for i, top_lp in enumerate(step_top_logprobs.items())
if num_output_top_logprobs >= i
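The same request-level override works for the chat endpoint, as exercised by the chat test above. A minimal sketch under the same assumptions (local server, placeholder model name):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="my-model",  # placeholder; use whatever model the server is serving
    messages=[{"role": "user", "content": "Say hello in Japanese"}],
    temperature=0,
    max_tokens=8,
    logprobs=True,
    extra_body={"return_tokens_as_token_ids": True},
)

# Each logprob entry's token is rendered as "token_id:<id>" rather than the
# decoded string, keeping non-JSON-encodable tokens identifiable.
for entry in response.choices[0].logprobs.content:
    print(entry.token, entry.logprob)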