 from vllm import (AsyncLLMEngine, CompletionOutput, RequestOutput,
                   SamplingParams)
 from vllm.config import ModelConfig
-from vllm.entrypoints.grpc.pb import generation_pb2_grpc
+from vllm.entrypoints.grpc.pb import generation_pb2_grpc  # type: ignore
 # yapf: disable
 from vllm.entrypoints.grpc.pb.generation_pb2 import (BatchedGenerationRequest,
                                                      BatchedGenerationResponse,
@@ -54,15 +54,15 @@ async def _handle_exception(e: Exception, func, *args, **kwargs):
     if not isinstance(e, AbortError):
         if type(e).__name__ == "torch.cuda.OutOfMemoryError":  #TODO check
             context = kwargs.get("context", None) or args[-1]
-            logger.exception(f"{func.__name__} caused GPU OOM error")
+            logger.exception("%s caused GPU OOM error", func.__name__)
             service_metrics.count_request_failure(FailureReasonLabel.OOM)
             await context.abort(StatusCode.RESOURCE_EXHAUSTED, str(e))
         else:
             if "generate" in func.__name__.lower():
                 service_metrics.count_request_failure(FailureReasonLabel.GENERATE)
             else:
                 service_metrics.count_request_failure(FailureReasonLabel.UNKNOWN)
             logger.exception("%s failed", func.__name__)
     raise e

@@ -298,7 +298,7 @@ def _convert_output(self,
             text=output.text[text_start_offset:],
             generated_token_count=len(output.token_ids),
             stop_reason=stop_reason,
-            stop_sequence=stop_sequence,
+            stop_sequence=stop_sequence if stop_sequence else '',
         )

         if resp_options.generated_tokens:
@@ -416,7 +416,8 @@ async def _validate_and_convert_params(

     @staticmethod
     def _convert_reason(output: CompletionOutput, max_is_token_limit: bool,
-                        time_limit_reached: bool) -> Tuple['StopReason', str]:
+                        time_limit_reached: bool
+                        ) -> Tuple[StopReason.ValueType, Optional[str]]:
         finish_reason = output.finish_reason
         stop_sequence = None
         if finish_reason is None:
@@ -436,20 +437,20 @@ def _convert_reason(output: CompletionOutput, max_is_token_limit: bool,
                 stop_sequence = stop_str_or_tok
             else:
                 logger.warning(
-                    f"Unexpected stop_reason type: {type(stop_str_or_tok)}"
+                    "Unexpected stop_reason type: %s", type(stop_str_or_tok)
                 )
         elif finish_reason == "abort":
             stop_reason = StopReason.CANCELLED
         else:
-            logger.warning(f"Unrecognized finish_reason: {finish_reason}")
+            logger.warning("Unrecognized finish_reason: %s", finish_reason)
             stop_reason = StopReason.CANCELLED

         return stop_reason, stop_sequence

     def _convert_tokens(
         self,
-        token_ids: list[int],
-        logprobs_list: Optional[list[Dict[int, Logprob]]],
+        token_ids: List[int],
+        logprobs_list: Optional[List[Dict[int, Logprob]]],
         include_logprobs: bool,
         include_ranks: bool,
         top_n_tokens: int,
@@ -502,7 +503,7 @@ async def _validate_prompt_and_tokenize(
         #                    "max_length": truncate_input_tokens} \
         #     if truncate_input_tokens is not None else {
         #         "truncation": True, "max_length": max_model_len + 1}
-        tokenize_kwargs = {}
+        tokenize_kwargs: Dict[str, Any] = {}

         input_ids = await self.tokenizer_group.encode_async(
             prompt, **tokenize_kwargs)
@@ -664,6 +665,6 @@ async def start_grpc_server(engine: AsyncLLMEngine,
     server.add_insecure_port(listen_on)

     await server.start()
-    logger.info(f"gRPC Server started at {listen_on}")
+    logger.info("gRPC Server started at %s", listen_on)

     return server
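
Most of the hunks above replace f-string interpolation inside logging calls with printf-style arguments (for example logger.exception("%s failed", func.__name__)), which defers message formatting to the logging framework instead of building the string eagerly at the call site. A minimal standalone sketch of the pattern; the lookup function and its key argument are illustrative only and not part of the code above:

import logging

logger = logging.getLogger(__name__)


def lookup(key: str) -> None:
    """Illustrative helper that fails, to show exception logging."""
    try:
        raise KeyError(key)
    except KeyError:
        # Eager: the f-string is built even if no handler ever emits the record.
        # logger.exception(f"{lookup.__name__} failed for {key}")
        # Lazy: the logging framework formats the message only when the record
        # is handled, and the raw template remains visible to handlers.
        logger.exception("%s failed for %s", lookup.__name__, key)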