 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               CompletionRequest,
+                                              DetokenizeRequest,
                                               EmbeddingRequest, ErrorResponse,
                                               ModelCard, ModelList,
-                                              ModelPermission)
+                                              ModelPermission, TokenizeRequest)
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import Logprob
@@ -99,8 +100,9 @@ def create_streaming_error_response(
         return json_str

     async def _check_model(
-        self, request: Union[CompletionRequest, ChatCompletionRequest,
-                             EmbeddingRequest]
+        self, request: Union[ChatCompletionRequest, CompletionRequest,
+                             DetokenizeRequest, EmbeddingRequest,
+                             TokenizeRequest]
     ) -> Optional[ErrorResponse]:
         if request.model in self.served_model_names:
             return None
@@ -126,7 +128,8 @@ def _maybe_get_lora(
     def _validate_prompt_and_tokenize(
         self,
         request: Union[ChatCompletionRequest, CompletionRequest,
-                       EmbeddingRequest],
+                       DetokenizeRequest, EmbeddingRequest,
+                       TokenizeRequest],
         prompt: Optional[str] = None,
         prompt_ids: Optional[List[int]] = None,
         truncate_prompt_tokens: Optional[Annotated[int,
@@ -174,6 +177,11 @@ def _validate_prompt_and_tokenize(
                 f"generation. Please reduce the length of the input.", )
             return input_ids, input_text

+        # Note: TokenizeRequest and DetokenizeRequest do not have max_tokens
+        # and do not require model context length validation
+        if isinstance(request, (TokenizeRequest, DetokenizeRequest)):
+            return input_ids, input_text
+
         if request.max_tokens is None:
             if token_num >= self.max_model_len:
                 raise ValueError(
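
For illustration only, a minimal, self-contained sketch of the validation path this last hunk introduces: tokenize/detokenize requests carry no `max_tokens`, so they bypass the context-length check. The stand-in dataclasses and the simplified `validate_prompt_and_tokenize` helper below are assumptions for demonstration, not the real vLLM protocol types or method signature; only the `TokenizeRequest`/`DetokenizeRequest` names and the early-return behaviour mirror the diff above.

```python
# Hypothetical, simplified stand-ins -- not the real vLLM protocol classes.
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union


@dataclass
class CompletionRequest:
    model: str
    prompt: str
    max_tokens: Optional[int] = None


@dataclass
class TokenizeRequest:
    model: str
    prompt: str


@dataclass
class DetokenizeRequest:
    model: str
    tokens: List[int]


def validate_prompt_and_tokenize(
    request: Union[CompletionRequest, TokenizeRequest, DetokenizeRequest],
    input_ids: List[int],
    input_text: str,
    max_model_len: int = 4096,
) -> Tuple[List[int], str]:
    """Mimics the branching added in the diff: tokenize/detokenize requests
    have no max_tokens field, so they skip the context-length validation."""
    if isinstance(request, (TokenizeRequest, DetokenizeRequest)):
        return input_ids, input_text

    token_num = len(input_ids)
    if request.max_tokens is None and token_num >= max_model_len:
        raise ValueError("This model's maximum context length is exceeded; "
                         "please reduce the length of the input.")
    return input_ids, input_text


# A tokenize request passes straight through, regardless of prompt length.
ids, text = validate_prompt_and_tokenize(
    TokenizeRequest(model="demo-model", prompt="hello world"),
    input_ids=[1, 2, 3],
    input_text="hello world",
)
print(ids, text)
```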