@@ -507,6 +507,12 @@ async def connection_warmup():
507
507
return JSONResponse ({'SUCCESS' : True })
508
508
509
509
510
@app.post('/distserve/gc')
async def cache_block_gc_to_be_migrated():
    """Placeholder handler for ``POST /distserve/gc``.

    Per the TODO below, this route is meant to garbage-collect requests
    that are still waiting to be migrated. No such logic exists yet, so
    every call fails.

    Raises:
        NotImplementedError: Always, until the endpoint is implemented.
    """
    # TODO (JimyMa): add garbage collection of to-be-migrated requests
    raise NotImplementedError()
514
+
515
+
510
516
@app .post ('/v1/chat/completions' , dependencies = [Depends (check_api_key )])
511
517
async def chat_completions_v1 (request : ChatCompletionRequest , raw_request : Request = None ):
512
518
"""Completion API similar to OpenAI's API.
@@ -625,17 +631,21 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
625
631
).model_dump (mode = 'json' )
626
632
627
633
start = node_manager .pre_call (d_url )
634
+ node_manager .pd_connection_pool .shelf_prefill_session ((p_url , d_url ), prefill_info ['id' ])
628
635
if request .stream is True :
629
636
response = node_manager .stream_generate (request_dict , d_url , '/v1/chat/completions' )
630
637
background_task = node_manager .create_background_tasks (d_url , start )
638
+ node_manager .pd_connection_pool .unshelf_prefill_session ((p_url , d_url ), prefill_info ['id' ])
631
639
return StreamingResponse (response , background = background_task )
632
640
else :
633
641
try :
634
642
response = await node_manager .generate (request_dict , d_url , '/v1/chat/completions' )
635
643
node_manager .post_call (d_url , start )
636
644
resp = JSONResponse (json .loads (response ))
637
645
finally :
646
+ node_manager .pd_connection_pool .unshelf_prefill_session ((p_url , d_url ), prefill_info ['id' ])
638
647
return resp
648
+
639
649
else :
640
650
raise ValueError (f'No serving strategy named { node_manager .serving_strategy } ' )
641
651
@@ -737,15 +747,18 @@ async def completions_v1(request: CompletionRequest, raw_request: Request = None
737
747
remote_block_ids = prefill_info ['cache_block_ids' ],
738
748
remote_token_id = prefill_info ['remote_token_ids' ][- 1 ],
739
749
).model_dump (mode = 'json' )
750
+ node_manager .pd_connection_pool .shelf_prefill_session ((p_url , d_url ), prefill_info ['id' ])
740
751
741
752
start = node_manager .pre_call (d_url )
742
753
if request .stream is True :
743
754
response = node_manager .stream_generate (request_dict , d_url , '/v1/completions' )
744
755
background_task = node_manager .create_background_tasks (d_url , start )
756
+ node_manager .pd_connection_pool .unshelf_prefill_session ((p_url , d_url ), prefill_info ['id' ])
745
757
return StreamingResponse (response , background = background_task )
746
758
else :
747
759
response = await node_manager .generate (request_dict , d_url , '/v1/completions' )
748
760
node_manager .post_call (d_url , start )
761
+ node_manager .pd_connection_pool .unshelf_prefill_session ((p_url , d_url ), prefill_info ['id' ])
749
762
return JSONResponse (json .loads (response ))
750
763
else :
751
764
raise ValueError (f'No serving strategy named { node_manager .serving_strategy } ' )
0 commit comments