@@ -1860,7 +1860,7 @@ def test_schedule_skip_tokenizer_init_structured_output_request():
     assert len(scheduler.waiting) == 1
 
 
-def test_priority_scheduling_preemption_when_out_of_kv():
+def test_priority_scheduling_preemption_and_resumption_when_out_of_kv():
     """Test that priority scheduling preempts lower priority requests
     when out of KV cache space."""
     # Create scheduler with very limited memory to force preemption
@@ -1869,6 +1869,7 @@ def test_priority_scheduling_preemption_when_out_of_kv():
         max_num_batched_tokens=200,
         num_blocks=5,  # Can hold 64 tokens (first block is null)
         block_size=16,  # Standard block size
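+        # Attach a KV connector so the resumption checks below cover the connector path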
+        use_kv_connector=True,
     )
 
     # Create a request and schedule it
@@ -1880,12 +1881,13 @@ def test_priority_scheduling_preemption_when_out_of_kv():
         starting_idx=0,
     )[0]
     scheduler.add_request(request_low)
+    # 1st schedule
     output = scheduler.schedule()
     assert len(output.scheduled_new_reqs) == 1
     assert len(scheduler.waiting) == 0
     assert len(scheduler.running) == 1
 
-    # Simulate model execution
+    # Simulate model execution - 1st decode
     model_output = ModelRunnerOutput(
         req_ids=[request_low.request_id],
         req_id_to_index={request_low.request_id: 0},
@@ -1906,6 +1908,7 @@ def test_priority_scheduling_preemption_when_out_of_kv():
         starting_idx=1,
     )[0]
     scheduler.add_request(request_high)
+    # 2nd schedule
     output = scheduler.schedule()
     # KV cache should be full at this point
     assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == 0
@@ -1914,7 +1917,7 @@ def test_priority_scheduling_preemption_when_out_of_kv():
     assert len(scheduler.waiting) == 0
     assert len(scheduler.running) == 2
 
-    # Simulate model execution
+    # Simulate model execution - 2nd decode
     requests = [request_low, request_high]
     model_output = ModelRunnerOutput(
         req_ids=[req.request_id for req in requests],
@@ -1927,7 +1930,7 @@ def test_priority_scheduling_preemption_when_out_of_kv():
     )
     scheduler.update_from_output(output, model_output)
 
-    # Schedule again - this should trigger preemption
+    # 3rd schedule - this should trigger preemption
     # req_low needs 32 tokens = 2 blocks
     # req_high needs 33 tokens = 3 blocks
     # so doesn't fit in 4 blocks.
@@ -1937,9 +1940,44 @@ def test_priority_scheduling_preemption_when_out_of_kv():
     assert len(output.scheduled_new_reqs) == 0
     assert output.scheduled_cached_reqs.num_reqs == 1
     assert output.scheduled_cached_reqs.req_ids[0] == request_high.request_id
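+    # Preemption moves the low-priority request back to the waiting queue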
+    assert scheduler.requests[request_low.request_id].status == RequestStatus.PREEMPTED
     assert len(scheduler.waiting) == 1
     assert len(scheduler.running) == 1
 
+    # Simulate model execution - 3rd decode
+    model_output = ModelRunnerOutput(
+        req_ids=[req.request_id for req in requests],
+        req_id_to_index={req.request_id: i for i, req in enumerate(requests)},
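+        # req_low was preempted and produces no token; req_high samples one more (100)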
+        sampled_token_ids=[[], [100]],
+        # spec_token_ids=None,
+        logprobs=None,
+        prompt_logprobs_dict={},
+        pooler_output=[],
+    )
+    # Finish the high-priority request to make room for the preempted request to resume
+    scheduler.update_from_output(output, model_output)
+    scheduler.finish_requests(request_high.request_id, RequestStatus.FINISHED_STOPPED)
+
+    # 4th schedule - this should trigger the resumption
+    output = scheduler.schedule()
+    scheduled_cached_reqs = output.scheduled_cached_reqs
+    resumed_from_preemption = scheduled_cached_reqs.resumed_from_preemption
+
+    assert len(output.scheduled_new_reqs) == 0
+    assert scheduled_cached_reqs.num_reqs == 1
+    assert len(scheduler.waiting) == 0
+    assert len(scheduler.running) == 1
+
+    # Preempted request resumed in scheduled_cached_reqs
+    assert len(resumed_from_preemption) == 1
+    assert len(scheduled_cached_reqs.resumed_req_token_ids) == 1
+    assert resumed_from_preemption[0]
+    assert scheduled_cached_reqs.req_ids[0] == request_low.request_id
+    assert scheduled_cached_reqs.resumed_req_token_ids[0] is not None
+    # Resumed tokens include 30 prompt tokens and 2 decoded tokens
+    assert len(scheduled_cached_reqs.resumed_req_token_ids[0]) == 32
+    assert scheduled_cached_reqs.resumed_req_token_ids[0][31] == 100
+
 
 @pytest.mark.parametrize(
     ("enable_chunked_prefill", "is_encoder_decoder", "expect_enabled"),