diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py
index a392c905c6cae..67b755a155966 100644
--- a/examples/offline_inference_with_prefix.py
+++ b/examples/offline_inference_with_prefix.py
@@ -52,8 +52,7 @@
 del regular_llm
 cleanup_dist_env_and_memory()
 
-# The second LLM needs to request a higher gpu_memory_utilization because
-# the first LLM has already allocated a full 30% of the gpu memory.
+# Create an LLM with prefix caching enabled.
 prefix_cached_llm = LLM(model="facebook/opt-125m",
                         enable_prefix_caching=True,
                         gpu_memory_utilization=0.4)
diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py
index 010969ad4750d..cbfb0cc32c1ce 100644
--- a/tests/entrypoints/llm/test_lazy_outlines.py
+++ b/tests/entrypoints/llm/test_lazy_outlines.py
@@ -1,6 +1,7 @@
 import sys
 
 from vllm import LLM, SamplingParams
+from vllm.distributed import cleanup_dist_env_and_memory
 
 
 def test_lazy_outlines(sample_regex):
@@ -14,6 +15,7 @@ def test_lazy_outlines(sample_regex):
     ]
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
+    # Create an LLM without guided decoding as a baseline.
     llm = LLM(model="facebook/opt-125m",
               enforce_eager=True,
               gpu_memory_utilization=0.3)
@@ -26,8 +28,11 @@ def test_lazy_outlines(sample_regex):
    # make sure outlines is not imported
    assert 'outlines' not in sys.modules
 
-    # The second LLM needs to request a higher gpu_memory_utilization because
-    # the first LLM has already allocated a full 30% of the gpu memory.
+    # Destroy the LLM object and free up the GPU memory.
+    del llm
+    cleanup_dist_env_and_memory()
+
+    # Create an LLM with guided decoding enabled.
     llm = LLM(model="facebook/opt-125m",
               enforce_eager=True,
               guided_decoding_backend="lm-format-enforcer",
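
For reference, the pattern both files now follow is: delete the first `LLM`, call `cleanup_dist_env_and_memory()`, then construct the second `LLM` with an independent memory budget instead of padding `gpu_memory_utilization` to cover the first instance. A minimal standalone sketch of that teardown pattern, using the `cleanup_dist_env_and_memory` import added in this diff (the prompt and memory fractions are illustrative, not from the patched files):

```python
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory

# First instance reserves its fraction of GPU memory.
llm = LLM(model="facebook/opt-125m",
          enforce_eager=True,
          gpu_memory_utilization=0.3)
outputs = llm.generate("Hello, my name is")

# Drop the reference and tear down the distributed environment so the
# engine's allocated GPU memory is actually released.
del llm
cleanup_dist_env_and_memory()

# A second instance can now be created with its own budget, without
# accounting for memory still held by the first one.
llm = LLM(model="facebook/opt-125m",
          enforce_eager=True,
          gpu_memory_utilization=0.3)
```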