@@ -84,7 +84,8 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
                    tensor_parallel_size=tp_size,
                    gpu_memory_utilization=0.2,  #avoid OOM
                    quantization=model.quantization,
-                   trust_remote_code=True)
+                   trust_remote_code=True,
+                   enable_chunked_prefill=True)
 
     if model.quantization is None:
         expected_no_lora_output = [
@@ -176,7 +177,8 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
                        tensor_parallel_size=1,
                        gpu_memory_utilization=0.2,  #avoid OOM
                        quantization=model.quantization,
-                       trust_remote_code=True)
+                       trust_remote_code=True,
+                       enable_chunked_prefill=True)
     output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)
 
     del llm_tp1
@@ -189,7 +191,8 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
                        max_loras=4,
                        tensor_parallel_size=2,
                        gpu_memory_utilization=0.2,  #avoid OOM
-                       quantization=model.quantization)
+                       quantization=model.quantization,
+                       enable_chunked_prefill=True)
     output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)
 
     del llm_tp2
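
The hunks above thread enable_chunked_prefill=True through each vllm.LLM construction in the quantized-LoRA tests. As a minimal standalone sketch (not part of this diff), an equivalent engine setup would look like the snippet below; the model name and LoRA limits are illustrative assumptions, while the keyword arguments mirror those used in the tests.

# Minimal sketch: vLLM engine with chunked prefill enabled, mirroring the
# test configuration above. The model path is a hypothetical example.
import vllm

llm = vllm.LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # illustrative
               enable_lora=True,
               max_num_seqs=16,
               max_loras=4,
               tensor_parallel_size=1,
               gpu_memory_utilization=0.2,  #avoid OOM
               trust_remote_code=True,
               enable_chunked_prefill=True)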