
Deprecate best_of Sampling Parameter in anticipation for vLLM V1 #13997


Merged: 14 commits, Mar 5, 2025
5 changes: 0 additions & 5 deletions benchmarks/backend_request_func.py
@@ -27,7 +27,6 @@ class RequestFuncInput:
output_len: int
model: str
model_name: Optional[str] = None
best_of: int = 1
logprobs: Optional[int] = None
extra_body: Optional[dict] = None
multi_modal_content: Optional[dict] = None
@@ -58,7 +57,6 @@ async def async_request_tgi(
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
params = {
"best_of": request_func_input.best_of,
"max_new_tokens": request_func_input.output_len,
"do_sample": True,
"temperature": 0.01, # TGI does not accept 0.0 temperature.
@@ -130,7 +128,6 @@ async def async_request_trt_llm(

async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
assert request_func_input.best_of == 1
payload = {
"accumulate_tokens": True,
"text_input": request_func_input.prompt,
@@ -195,7 +192,6 @@ async def async_request_deepspeed_mii(
) -> RequestFuncOutput:
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
assert request_func_input.best_of == 1

payload = {
"prompt": request_func_input.prompt,
@@ -249,7 +245,6 @@ async def async_request_openai_completions(
if request_func_input.model_name else request_func_input.model,
"prompt": request_func_input.prompt,
"temperature": 0.0,
"best_of": request_func_input.best_of,
"max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True,
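For context, a self-contained sketch (not part of this PR) of the RequestFuncInput shape after the change; only the fields visible in the hunks above are reproduced, and the class name here is a stand-in for illustration:

from dataclasses import dataclass
from typing import Optional

@dataclass
class RequestFuncInputSketch:
    # Mirrors the dataclass after this PR: the `best_of: int = 1` field is
    # gone, so backend request builders neither send it nor assert on it.
    prompt: str
    output_len: int
    model: str
    model_name: Optional[str] = None
    logprobs: Optional[int] = None
    extra_body: Optional[dict] = None
    multi_modal_content: Optional[dict] = None

# Example construction with the remaining fields.
req = RequestFuncInputSketch(prompt="Hello, my name is",
                             output_len=128,
                             model="facebook/opt-125m")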
14 changes: 0 additions & 14 deletions benchmarks/benchmark_serving.py
@@ -560,7 +560,6 @@ async def benchmark(
tokenizer: PreTrainedTokenizerBase,
input_requests: list[tuple[str, int, int]],
logprobs: Optional[int],
best_of: int,
request_rate: float,
burstiness: float,
disable_tqdm: bool,
@@ -592,7 +591,6 @@
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
best_of=best_of,
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos,
)
@@ -619,7 +617,6 @@
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
best_of=best_of,
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos)
profile_output = await request_func(request_func_input=profile_input)
@@ -668,7 +665,6 @@ async def limited_request_func(request_func_input, pbar):
prompt_len=prompt_len,
output_len=output_len,
logprobs=logprobs,
best_of=best_of,
multi_modal_content=mm_content,
ignore_eos=ignore_eos)
tasks.append(
@@ -686,7 +682,6 @@ async def limited_request_func(request_func_input, pbar):
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
best_of=best_of,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
@@ -958,7 +953,6 @@ def main(args: argparse.Namespace):
tokenizer=tokenizer,
input_requests=input_requests,
logprobs=args.logprobs,
best_of=args.best_of,
request_rate=args.request_rate,
burstiness=args.burstiness,
disable_tqdm=args.disable_tqdm,
@@ -983,7 +977,6 @@ def main(args: argparse.Namespace):
result_json["backend"] = backend
result_json["model_id"] = model_id
result_json["tokenizer_id"] = tokenizer_id
result_json["best_of"] = args.best_of
result_json["num_prompts"] = args.num_prompts

# Metadata
@@ -1081,13 +1074,6 @@ def main(args: argparse.Namespace):
help=
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
)
parser.add_argument(
"--best-of",
type=int,
default=1,
help="Generates `best_of` sequences per prompt and "
"returns the best one.",
)
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument(
"--num-prompts",
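Because the --best-of flag is deleted outright, existing benchmark invocations that still pass it will now fail at argument parsing. A toy reproduction of that argparse behaviour (this is not the benchmark's real parser):

import argparse

# Minimal parser mirroring the change: --best-of is gone, --use-beam-search stays.
parser = argparse.ArgumentParser()
parser.add_argument("--use-beam-search", action="store_true")

try:
    parser.parse_args(["--best-of", "2"])
except SystemExit:
    # argparse reports "unrecognized arguments: --best-of 2" and exits.
    print("--best-of is no longer a recognized flag")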
1 change: 0 additions & 1 deletion examples/offline_inference/llm_engine_example.py
@@ -15,7 +15,6 @@ def create_test_prompts() -> list[tuple[str, SamplingParams]]:
SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)),
("What is the meaning of life?",
SamplingParams(n=2,
best_of=5,
temperature=0.8,
top_p=0.95,
frequency_penalty=0.1)),
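The migration for this example is mechanical. A minimal sketch of the updated prompt entry, using the public vllm.SamplingParams API and the same values as the hunk above:

from vllm import SamplingParams

# After this PR the example simply requests n parallel samples; there is no
# separate best_of candidate pool any more, so `best_of=5` is dropped rather
# than replaced.
params = SamplingParams(n=2,
                        temperature=0.8,
                        top_p=0.95,
                        frequency_penalty=0.1)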
1 change: 0 additions & 1 deletion examples/online_serving/opentelemetry/dummy_client.py
@@ -28,7 +28,6 @@
"model": "facebook/opt-125m",
"prompt": prompt,
"max_tokens": 10,
"best_of": 20,
"n": 3,
"use_beam_search": "true",
"temperature": 0.0,
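For reference, a sketch of the resulting request body in the OpenTelemetry dummy client; it is identical to the original except that "best_of" is no longer sent (the prompt string here is a placeholder):

prompt = "San Francisco is a"  # placeholder, not taken from this PR

payload = {
    "model": "facebook/opt-125m",
    "prompt": prompt,
    "max_tokens": 10,
    "n": 3,
    "use_beam_search": "true",
    "temperature": 0.0,
}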
4 changes: 0 additions & 4 deletions tests/core/test_scheduler.py
@@ -617,7 +617,6 @@ def test_schedule_decode_blocks_to_copy_update():
num_gpu_blocks=16)
_, seq_group = create_dummy_prompt("1",
prompt_length=60,
best_of=2,
block_size=block_size)
curr_loras = None
scheduler._allocate_and_set_running(seq_group)
@@ -686,7 +685,6 @@ def test_schedule_swapped_cannot_swap_in():
for i in range(2):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
best_of=2,
block_size=block_size)
scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
@@ -717,7 +715,6 @@ def test_infeasible_swap():
for i in range(2):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
best_of=2,
block_size=block_size)
scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
@@ -747,7 +744,6 @@ def test_schedule_swapped_blocks_to_copy():
curr_loras = None
_, seq_group = create_dummy_prompt("1",
prompt_length=60,
best_of=2,
block_size=block_size)
scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
27 changes: 13 additions & 14 deletions tests/core/utils.py
@@ -18,7 +18,6 @@ def create_dummy_prompt(
prompt_length: int = -1,
block_size: Optional[int] = None,
lora_request: Optional[LoRARequest] = None,
best_of: int = 1,
prompt_tokens: Optional[list[int]] = None,
min_tokens: int = 0,
max_tokens: int = 16,
@@ -32,17 +31,19 @@
prompt_tokens = list(range(prompt_length))

prompt_str = " ".join([str(t) for t in prompt_tokens])
prompt = Sequence(int(request_id),
inputs=token_inputs(prompt_tokens, prompt=prompt_str),
block_size=block_size)
seq_group = SequenceGroup(request_id=request_id,
seqs=[prompt],
arrival_time=time.time(),
sampling_params=SamplingParams(
best_of=best_of,
max_tokens=max_tokens,
min_tokens=min_tokens),
lora_request=lora_request)
prompt = Sequence(
int(request_id),
inputs=token_inputs(prompt_tokens, prompt=prompt_str),
block_size=block_size,
)
seq_group = SequenceGroup(
request_id=request_id,
seqs=[prompt],
arrival_time=time.time(),
sampling_params=SamplingParams(max_tokens=max_tokens,
min_tokens=min_tokens),
lora_request=lora_request,
)

return prompt, seq_group

@@ -72,7 +73,6 @@ def create_dummy_prompt_encoder_decoder(
encoder_prompt_length: int,
block_size: Optional[int] = None,
lora_request: Optional[LoRARequest] = None,
best_of: int = 1,
) -> tuple[Sequence, Sequence, SequenceGroup]:
if not block_size:
block_size = decoder_prompt_length
@@ -102,7 +102,6 @@ def create_dummy_prompt_encoder_decoder(

seq_group = SequenceGroup(request_id=request_id,
seqs=[decoder_prompt],
sampling_params=SamplingParams(best_of=best_of),
arrival_time=time.time(),
lora_request=lora_request,
encoder_seq=encoder_prompt)
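Call sites in tests/core/test_scheduler.py now look like the sketch below; the arguments are taken from the hunks above, the block_size value is illustrative, and the import path is assumed:

from tests.core.utils import create_dummy_prompt  # assumed import path

# Scheduler tests build dummy sequence groups without any best_of argument;
# the SamplingParams constructed inside the helper now only carries
# min_tokens/max_tokens.
prompt, seq_group = create_dummy_prompt("1",
                                        prompt_length=60,
                                        block_size=16)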
8 changes: 0 additions & 8 deletions tests/v1/sample/test_sampling_params_e2e.py
@@ -25,14 +25,6 @@ def test_n_gt_1(model):
assert len(outputs[0].outputs) == 3


def test_best_of(model):
"""Raise a ValueError since best_of is deprecated."""

params = SamplingParams(n=2, best_of=3)
with pytest.raises(ValueError):
_ = model.generate(PROMPT, params)


def test_penalties(model):
"""Check that we do not get errors if applied."""

5 changes: 1 addition & 4 deletions vllm/entrypoints/llm.py
@@ -97,10 +97,7 @@ class LLM:
throughput. However, if the value is too high, it may cause out-of-
memory (OOM) errors.
swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
Review comment (Member): Is swap space used for anything now that best_of has been removed?

Reply (Contributor Author): It is not, AFAIK.

This can be used for temporarily storing the states of the requests
when their `best_of` sampling parameters are larger than 1. If all
requests will have `best_of=1`, you can safely set this to 0.
Otherwise, too small values may cause out-of-memory (OOM) errors.
Too small values may cause out-of-memory (OOM) errors.
cpu_offload_gb: The size (GiB) of CPU memory to use for offloading
the model weights. This virtually increases the GPU memory space
you can use to hold the model weights, at the cost of CPU-GPU data
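Given the review exchange above (swap space is believed to be unused once best_of is gone), here is a hedged usage sketch, not a recommendation from this PR, of constructing the LLM entrypoint with swap space turned down:

from vllm import LLM

# swap_space is still an accepted constructor argument; with best_of removed
# there are no extra candidate sequences to park in CPU memory, so memory-
# constrained hosts can set it low or to zero.
llm = LLM(model="facebook/opt-125m", swap_space=0)

outputs = llm.generate(["The capital of France is"])
print(outputs[0].outputs[0].text)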
4 changes: 0 additions & 4 deletions vllm/entrypoints/openai/protocol.py
@@ -242,7 +242,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
user: Optional[str] = None

# doc: begin-chat-completion-sampling-params
best_of: Optional[int] = None
use_beam_search: bool = False
top_k: Optional[int] = None
min_p: Optional[float] = None
@@ -479,7 +478,6 @@ def to_sampling_params(

return SamplingParams.from_optional(
n=self.n,
best_of=self.best_of,
presence_penalty=self.presence_penalty,
frequency_penalty=self.frequency_penalty,
repetition_penalty=repetition_penalty,
@@ -650,7 +648,6 @@ class CompletionRequest(OpenAIBaseModel):
# https://platform.openai.com/docs/api-reference/completions/create
model: Optional[str] = None
prompt: Union[list[int], list[list[int]], str, list[str]]
best_of: Optional[int] = None
echo: Optional[bool] = False
frequency_penalty: Optional[float] = 0.0
logit_bias: Optional[dict[str, float]] = None
@@ -848,7 +845,6 @@ def to_sampling_params(

return SamplingParams.from_optional(
n=self.n,
best_of=self.best_of,
presence_penalty=self.presence_penalty,
frequency_penalty=self.frequency_penalty,
repetition_penalty=repetition_penalty,
8 changes: 2 additions & 6 deletions vllm/entrypoints/openai/serving_completion.py
@@ -168,12 +168,8 @@ async def create_completion(
model_name = self._get_model_name(request.model, lora_request)
num_prompts = len(engine_prompts)

# Similar to the OpenAI API, when n != best_of, we do not stream the
# results. In addition, we do not stream the results when use
# beam search.
stream = (request.stream
and (request.best_of is None or request.n == request.best_of)
and not request.use_beam_search)
# We do not stream the results when use beam search.
stream = (request.stream and not request.use_beam_search)

# Streaming response
if stream:
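The practical effect of the simplified condition is that any non-beam-search completion request can stream, regardless of n. A client-side sketch against a locally running OpenAI-compatible vLLM server (the URL and model name are assumptions):

import requests

# Stream a /v1/completions request with n > 1. Before this PR, streaming was
# silently disabled whenever n != best_of; now only use_beam_search blocks it.
resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "facebook/opt-125m",
        "prompt": "Hello, my name is",
        "max_tokens": 16,
        "n": 2,
        "stream": True,
    },
    stream=True,
)
for line in resp.iter_lines():
    if line:
        print(line.decode("utf-8"))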
24 changes: 0 additions & 24 deletions vllm/sampling_params.py
@@ -116,10 +116,6 @@ class SamplingParams(

Args:
n: Number of output sequences to return for the given prompt.
best_of: Number of output sequences that are generated from the prompt.
From these `best_of` sequences, the top `n` sequences are returned.
`best_of` must be greater than or equal to `n`. By default,
`best_of` is set to `n`.
presence_penalty: Float that penalizes new tokens based on whether they
appear in the generated text so far. Values > 0 encourage the model
to use new tokens, while values < 0 encourage the model to repeat
@@ -187,7 +183,6 @@ class SamplingParams(
"""

n: int = 1
best_of: Optional[int] = None
_real_n: Optional[int] = None
presence_penalty: float = 0.0
frequency_penalty: float = 0.0
@@ -231,7 +226,6 @@ class SamplingParams(
@staticmethod
def from_optional(
n: Optional[int] = 1,
best_of: Optional[int] = None,
presence_penalty: Optional[float] = 0.0,
frequency_penalty: Optional[float] = 0.0,
repetition_penalty: Optional[float] = 1.0,
@@ -270,7 +264,6 @@ def from_optional(

return SamplingParams(
n=1 if n is None else n,
best_of=best_of,
presence_penalty=0.0
if presence_penalty is None else presence_penalty,
frequency_penalty=0.0
@@ -303,20 +296,6 @@ def from_optional(
)

def __post_init__(self) -> None:
# how we deal with `best_of``:
# if `best_of`` is not set, we default to `n`;
# if `best_of`` is set, we set `n`` to `best_of`,
# and set `_real_n`` to the original `n`.
# when we return the result, we will check
# if we need to return `n` or `_real_n` results
if self.best_of:
if self.best_of < self.n:
raise ValueError(
f"best_of must be greater than or equal to n, "
f"got n={self.n} and best_of={self.best_of}.")
if not self._real_n:
self._real_n = self.n
self.n = self.best_of

if 0 < self.temperature < _MAX_TEMP:
logger.warning(
@@ -423,9 +402,6 @@ def _verify_args(self) -> None:
raise ValueError(
"stop strings are only supported when detokenize is True. "
"Set detokenize=True to use stop.")
if self.best_of != self._real_n and self.output_kind == (
RequestOutputKind.DELTA):
raise ValueError("best_of must equal n to use output_kind=DELTA")

def _verify_greedy_sampling(self) -> None:
if self.n > 1:
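With the n/best_of/_real_n remapping deleted, callers who relied on best_of can emulate it on the client side. A hedged migration sketch using the offline LLM entrypoint (logprobs=0 is requested so that cumulative_logprob is populated even on the V1 engine):

from vllm import LLM, SamplingParams

# Emulate the old best_of=3, n=1 behaviour: sample three sequences and keep
# the highest-scoring one client-side.
llm = LLM(model="facebook/opt-125m")
params = SamplingParams(n=3, temperature=0.8, logprobs=0)
outputs = llm.generate(["The capital of France is"], params)

best = max(outputs[0].outputs, key=lambda o: o.cumulative_logprob)
print(best.text)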
3 changes: 0 additions & 3 deletions vllm/v1/engine/processor.py
@@ -93,9 +93,6 @@ def _validate_supported_sampling_params(
self,
params: SamplingParams,
) -> None:
# Best of not yet supported.
if params.best_of:
raise ValueError("VLLM V1 does not yet support best_of.")
# Bad words not yet supported.
if params.bad_words:
raise ValueError("VLLM V1 does not yet support bad_words.")