comaniac committed Oct 18, 2024
1 parent: 1b9d363 · commit: 1d4a009
Showing 2 changed files with 8 additions and 4 deletions.
examples/offline_inference_with_prefix.py (3 changes: 1 addition & 2 deletions)
@@ -52,8 +52,7 @@
 del regular_llm
 cleanup_dist_env_and_memory()
 
-# The second LLM needs to request a higher gpu_memory_utilization because
-# the first LLM has already allocated a full 30% of the gpu memory.
+# Create an LLM with prefix caching enabled.
 prefix_cached_llm = LLM(model="facebook/opt-125m",
                         enable_prefix_caching=True,
                         gpu_memory_utilization=0.4)
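For context on why the old comment could be dropped: the example already destroys the first engine with del regular_llm and cleanup_dist_env_and_memory() before building the prefix-cached one, so the second LLM no longer has to budget around GPU memory still held by the first. A minimal sketch of that pattern, using only calls visible in this diff; the prompts and sampling settings below are placeholders, not taken from the example file:

from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory

# Placeholder prompts; the real example derives its prompts from a shared prefix.
prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.0)

# First engine: no prefix caching, capped at 30% of GPU memory.
regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.3)
regular_outputs = regular_llm.generate(prompts, sampling_params)

# Free the first engine's GPU memory before starting the second one.
del regular_llm
cleanup_dist_env_and_memory()

# Second engine: prefix caching enabled; after the cleanup above it can
# claim its own memory budget instead of stacking on top of the first.
prefix_cached_llm = LLM(model="facebook/opt-125m",
                        enable_prefix_caching=True,
                        gpu_memory_utilization=0.4)
cached_outputs = prefix_cached_llm.generate(prompts, sampling_params)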
tests/entrypoints/llm/test_lazy_outlines.py (9 changes: 7 additions & 2 deletions)
@@ -1,6 +1,7 @@
 import sys
 
 from vllm import LLM, SamplingParams
+from vllm.distributed import cleanup_dist_env_and_memory
 
 
 def test_lazy_outlines(sample_regex):
@@ -14,6 +15,7 @@ def test_lazy_outlines(sample_regex):
     ]
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
+    # Create an LLM without guided decoding as a baseline.
     llm = LLM(model="facebook/opt-125m",
               enforce_eager=True,
               gpu_memory_utilization=0.3)
@@ -26,8 +28,11 @@ def test_lazy_outlines(sample_regex):
     # make sure outlines is not imported
     assert 'outlines' not in sys.modules
 
-    # The second LLM needs to request a higher gpu_memory_utilization because
-    # the first LLM has already allocated a full 30% of the gpu memory.
+    # Destroy the LLM object and free up the GPU memory.
+    del llm
+    cleanup_dist_env_and_memory()
+
+    # Create an LLM with guided decoding enabled.
     llm = LLM(model="facebook/opt-125m",
               enforce_eager=True,
               guided_decoding_backend="lm-format-enforcer",
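The point of this test is that plain generation should not pull in the outlines package; it should only be imported once guided decoding is actually requested. A condensed sketch of the test's flow with the new cleanup step, assuming a placeholder prompt; the second LLM's remaining constructor arguments are cut off in the diff above, so they are left out here:

import sys

from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory

prompts = ["Hello, my name is"]  # placeholder prompt
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Baseline engine: no guided decoding requested.
llm = LLM(model="facebook/opt-125m",
          enforce_eager=True,
          gpu_memory_utilization=0.3)
llm.generate(prompts, sampling_params)

# Guided decoding was never used, so outlines must not have been imported.
assert 'outlines' not in sys.modules

# Free the GPU memory held by the first engine before creating the second,
# which is what this commit adds to the test.
del llm
cleanup_dist_env_and_memory()

# Second engine with a guided-decoding backend; further arguments are
# elided in the diff above and therefore omitted from this sketch.
llm = LLM(model="facebook/opt-125m",
          enforce_eager=True,
          guided_decoding_backend="lm-format-enforcer")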
