@@ -482,3 +482,70 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
         assert expected_token_ids == actual_token_ids
 
     assert baseline_token_ids == test_token_ids
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        "model": "facebook/opt-125m",
+
+        # Skip cuda graph creation for fast test.
+        "enforce_eager": True,
+
+        # Keep the blocks small so that eviction is hit quickly.
+        "max_model_len": 48,
+        "block_size": 16,
+        "num_gpu_blocks_override": 3,
+
+        # Test APC with the v2 block manager.
+        "use_v2_block_manager": True,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{
+    "enable_prefix_caching": False
+}])
+@pytest.mark.parametrize("test_llm_kwargs", [{
+    "enable_prefix_caching": True,
+}])
+@pytest.mark.parametrize("seed", [1])
+def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator,
+                                                   test_llm_generator):
514+ """Verify block manager v2 with auto prefix caching could works normal
515+ even when eviction started.
516+ With APC enabled, all blocks are held by native block at the beginning.
517+ Then blocks are managed by evictor instead. If cache hit at the evitor's
518+ block, then it could be reused, or we need to recompute its kv cache.
519+ """
+    output_len = 10
+    temperature = 0.0
+
+    prompts = [
+        "You are a helpful assistant. Please answer truthfully and write "
+        "out your thinking step by step to be sure you get the right answer. "
+        "If you make a mistake, attempt to correct it. who are you?",
+        "You are a helpful assistant. Please answer truthfully and write out "
+        "your thinking step by step to be sure you get the right answer. You "
+        "are helpful and harmless and you follow ethical guidelines. "
+        "who are you?"
+    ]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+    )
+
+    print('Getting token ids with APC disabled')
+    baseline_token_ids = get_token_ids_from_llm_generator(
+        baseline_llm_generator, prompts, sampling_params)
+
+    print('Getting token ids with APC enabled')
+    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
+                                                      prompts, sampling_params)
+
+    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
+                                                    test_token_ids):
+        assert expected_token_ids == actual_token_ids
+
+    assert baseline_token_ids == test_token_ids
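
The docstring above describes the mechanism under test. The toy model below is a minimal, self-contained sketch of that behaviour, not vLLM's actual BlockAllocator/Evictor API; the class and method names are illustrative only. It assumes a cache keyed by block content hash with an LRU policy and a capacity of 3 blocks of 16 tokens, mirroring num_gpu_blocks_override=3 and block_size=16 in the test config (3 x 16 = 48 KV-cache token slots, matching max_model_len=48): a prefix block whose hash is still held can be reused, while a miss means its KV cache must be recomputed.

from collections import OrderedDict


class ToyPrefixCache:
    """Toy prefix cache with LRU eviction (illustrative only, not vLLM's API)."""

    def __init__(self, num_blocks: int, block_size: int):
        self.num_blocks = num_blocks   # e.g. num_gpu_blocks_override = 3
        self.block_size = block_size   # e.g. block_size = 16
        # Maps block content hash -> cached KV block, kept in LRU order.
        self._cached = OrderedDict()

    def lookup_or_compute(self, block_tokens: tuple) -> str:
        """Return 'hit' if the block's KV cache is reused, 'recompute' otherwise."""
        content_hash = hash(block_tokens)
        if content_hash in self._cached:
            # Cache hit on a block still held by the (toy) evictor: reuse it.
            self._cached.move_to_end(content_hash)
            return "hit"
        # Miss: the block was never cached or has since been evicted, so its
        # KV cache has to be (re)computed before it can be cached again.
        if len(self._cached) >= self.num_blocks:
            self._cached.popitem(last=False)   # evict the least recently used block
        self._cached[content_hash] = "kv"
        return "recompute"


cache = ToyPrefixCache(num_blocks=3, block_size=16)   # 3 * 16 = 48 token slots
shared_block = tuple(range(16))   # a full 16-token block shared by two prompts
assert cache.lookup_or_compute(shared_block) == "recompute"   # first time: compute
assert cache.lookup_or_compute(shared_block) == "hit"         # still cached: reuse

Whichever path a block takes, the test asserts that the generated token ids match the baseline run with prefix caching disabled.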