
Commit 7fd46eb

Merge remote-tracking branch 'upstream/main' into v5.5_upstream_merge_rc
2 parents: 05e67ab + 2188a60


185 files changed: +6571 additions, -1622 deletions


.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 0 deletions
@@ -90,6 +90,7 @@ steps:
   - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/openai
+  - pytest -v -s entrypoints/test_chat_utils.py

 - label: Distributed Tests (4 GPUs) # 10min
   working_dir: "/vllm-workspace/tests"

.github/workflows/add_label_ready_comment.yml

Lines changed: 0 additions & 23 deletions
This file was deleted.

.github/workflows/reminder_comment.yml

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ jobs:
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
-             body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which consists a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of default ones by unblocking the steps in your `fast-check` build on Buildkite UI. \n\nOnce the PR is approved and ready to go, please make sure to run full CI as it is required to merge (or just use auto-merge).\n\n To run full CI, you can do one of these:\n- Comment `/ready` on the PR\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
+             body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
            })
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/remove_label_not_ready_comment.yml

Lines changed: 0 additions & 23 deletions
This file was deleted.

Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -162,7 +162,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist

 RUN --mount=type=cache,target=/root/.cache/pip \
     . /etc/environment && \
-    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.4/flashinfer-0.1.4+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
+    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
 #################### vLLM installation IMAGE ####################


benchmarks/benchmark_serving.py

Lines changed: 94 additions & 38 deletions
@@ -61,15 +61,22 @@ class BenchmarkMetrics:
     mean_ttft_ms: float
     median_ttft_ms: float
     std_ttft_ms: float
-    p99_ttft_ms: float
+    percentiles_ttft_ms: List[Tuple[float, float]]
     mean_tpot_ms: float
     median_tpot_ms: float
     std_tpot_ms: float
-    p99_tpot_ms: float
+    percentiles_tpot_ms: List[Tuple[float, float]]
     mean_itl_ms: float
     median_itl_ms: float
     std_itl_ms: float
-    p99_itl_ms: float
+    percentiles_itl_ms: List[Tuple[float, float]]
+    # E2EL stands for end-to-end latency per request.
+    # It is the time taken on the client side from sending
+    # a request to receiving a complete response.
+    mean_e2el_ms: float
+    median_e2el_ms: float
+    std_e2el_ms: float
+    percentiles_e2el_ms: List[Tuple[float, float]]


 def sample_sharegpt_requests(
@@ -235,13 +242,16 @@ def calculate_metrics(
     outputs: List[RequestFuncOutput],
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
+    selected_percentile_metrics: List[str],
+    selected_percentiles: List[float],
 ) -> Tuple[BenchmarkMetrics, List[int]]:
     actual_output_lens: List[int] = []
     total_input = 0
     completed = 0
     itls: List[float] = []
     tpots: List[float] = []
     ttfts: List[float] = []
+    e2els: List[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
             # We use the tokenizer to count the number of output tokens for all
@@ -258,6 +268,7 @@ def calculate_metrics(
                 (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
             itls += outputs[i].itl
             ttfts.append(outputs[i].ttft)
+            e2els.append(outputs[i].latency)
             completed += 1
         else:
             actual_output_lens.append(0)
@@ -276,17 +287,25 @@ def calculate_metrics(
         output_throughput=sum(actual_output_lens) / dur_s,
         mean_ttft_ms=np.mean(ttfts or 0) *
         1000,  # ttfts is empty if streaming is not supported by backend
-        median_ttft_ms=np.median(ttfts or 0) * 1000,
         std_ttft_ms=np.std(ttfts or 0) * 1000,
-        p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
+        median_ttft_ms=np.median(ttfts or 0) * 1000,
+        percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
+                             for p in selected_percentiles],
         mean_tpot_ms=np.mean(tpots or 0) * 1000,
-        median_tpot_ms=np.median(tpots or 0) * 1000,
         std_tpot_ms=np.std(tpots or 0) * 1000,
-        p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
+        median_tpot_ms=np.median(tpots or 0) * 1000,
+        percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
+                             for p in selected_percentiles],
         mean_itl_ms=np.mean(itls or 0) * 1000,
-        median_itl_ms=np.median(itls or 0) * 1000,
         std_itl_ms=np.std(itls or 0) * 1000,
-        p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
+        median_itl_ms=np.median(itls or 0) * 1000,
+        percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
+                            for p in selected_percentiles],
+        mean_e2el_ms=np.mean(e2els or 0) * 1000,
+        std_e2el_ms=np.std(e2els or 0) * 1000,
+        median_e2el_ms=np.median(e2els or 0) * 1000,
+        percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
+                             for p in selected_percentiles],
     )

     return metrics, actual_output_lens
@@ -304,6 +323,8 @@ async def benchmark(
     request_rate: float,
     disable_tqdm: bool,
     profile: bool,
+    selected_percentile_metrics: List[str],
+    selected_percentiles: List[float],
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -392,6 +413,8 @@ async def benchmark(
         outputs=outputs,
         dur_s=benchmark_duration,
         tokenizer=tokenizer,
+        selected_percentile_metrics=selected_percentile_metrics,
+        selected_percentiles=selected_percentiles,
     )

     print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
@@ -407,23 +430,6 @@ async def benchmark(
                                     metrics.input_throughput))
     print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
                                     metrics.output_throughput))
-    print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-'))
-    print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
-    print("{:<40} {:<10.2f}".format("Median TTFT (ms):",
-                                    metrics.median_ttft_ms))
-    print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
-    print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 1st token)',
-                               n=50,
-                               c='-'))
-    print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
-    print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
-                                    metrics.median_tpot_ms))
-    print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
-    print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-'))
-    print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
-    print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
-    print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
-    print("=" * 50)

     result = {
         "duration": benchmark_duration,
@@ -433,25 +439,54 @@ async def benchmark(
         "request_throughput": metrics.request_throughput,
         "input_throughput": metrics.input_throughput,
         "output_throughput": metrics.output_throughput,
-        "mean_ttft_ms": metrics.mean_ttft_ms,
-        "median_ttft_ms": metrics.median_ttft_ms,
-        "std_ttft_ms": metrics.std_ttft_ms,
-        "p99_ttft_ms": metrics.p99_ttft_ms,
-        "mean_tpot_ms": metrics.mean_tpot_ms,
-        "median_tpot_ms": metrics.median_tpot_ms,
-        "std_tpot_ms": metrics.std_tpot_ms,
-        "p99_tpot_ms": metrics.p99_tpot_ms,
-        "mean_itl_ms": metrics.mean_itl_ms,
-        "median_itl_ms": metrics.median_itl_ms,
-        "std_itl_ms": metrics.std_itl_ms,
-        "p99_itl_ms": metrics.p99_itl_ms,
         "input_lens": [output.prompt_len for output in outputs],
         "output_lens": actual_output_lens,
         "ttfts": [output.ttft for output in outputs],
         "itls": [output.itl for output in outputs],
         "generated_texts": [output.generated_text for output in outputs],
         "errors": [output.error for output in outputs],
     }
+
+    def process_one_metric(
+        # E.g., "ttft"
+        metric_attribute_name: str,
+        # E.g., "TTFT"
+        metric_name: str,
+        # E.g., "Time to First Token"
+        metric_header: str,
+    ):
+        # This function prints and adds statistics of the specified
+        # metric.
+        if metric_attribute_name not in selected_percentile_metrics:
+            return
+        print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
+        print("{:<40} {:<10.2f}".format(
+            f"Mean {metric_name} (ms):",
+            getattr(metrics, f"mean_{metric_attribute_name}_ms")))
+        print("{:<40} {:<10.2f}".format(
+            f"Median {metric_name} (ms):",
+            getattr(metrics, f"median_{metric_attribute_name}_ms")))
+        result[f"mean_{metric_attribute_name}_ms"] = getattr(
+            metrics, f"mean_{metric_attribute_name}_ms")
+        result[f"median_{metric_attribute_name}_ms"] = getattr(
+            metrics, f"median_{metric_attribute_name}_ms")
+        result[f"std_{metric_attribute_name}_ms"] = getattr(
+            metrics, f"std_{metric_attribute_name}_ms")
+        for p, value in getattr(metrics,
+                                f"percentiles_{metric_attribute_name}_ms"):
+            p_word = str(int(p)) if int(p) == p else str(p)
+            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
+                                            value))
+            result[f"p{p_word}_{metric_attribute_name}_ms"] = value
+
+    process_one_metric("ttft", "TTFT", "Time to First Token")
+    process_one_metric("tpot", "TPOT",
+                       "Time per Output Token (excl. 1st token)")
+    process_one_metric("itl", "ITL", "Inter-token Latency")
+    process_one_metric("e2el", "E2EL", "End-to-end Latency")
+
+    print("=" * 50)
+
     return result

@@ -550,6 +585,10 @@ def main(args: argparse.Namespace):
             request_rate=args.request_rate,
             disable_tqdm=args.disable_tqdm,
             profile=args.profile,
+            selected_percentile_metrics=args.percentile_metrics.split(","),
+            selected_percentiles=[
+                float(p) for p in args.metric_percentiles.split(",")
+            ],
         ))

     # Save config and results to json
@@ -765,6 +804,23 @@ def main(args: argparse.Namespace):
         "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
         " format.",
     )
+    parser.add_argument(
+        "--percentile-metrics",
+        type=str,
+        default="ttft,tpot,itl",
+        help="Comma-separated list of selected metrics to report "
+        "percentiles for. "
+        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
+        "Default value is \"ttft,tpot,itl\".")
+    parser.add_argument(
+        "--metric-percentiles",
+        type=str,
+        default="99",
+        help="Comma-separated list of percentiles for selected metrics. "
+        "To report 25th, 50th, and 75th percentiles, use \"25,50,75\". "
+        "Default value is \"99\". "
+        "Use \"--percentile-metrics\" to select metrics.",
+    )

     args = parser.parse_args()
     main(args)
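
Taken together, the new flags and the `process_one_metric` helper reduce to one pattern: parse the comma-separated flag values, then turn each per-request latency list into mean/median/std plus the selected (percentile, value) pairs via `np.percentile`. Below is a minimal, self-contained sketch of that pattern; the toy latency numbers and the `report` helper are illustrative and not part of the commit.

import numpy as np

# Stand-ins for the parsed CLI flags, e.g.
#   --percentile-metrics "ttft,e2el" --metric-percentiles "50,90,99"
selected_percentile_metrics = "ttft,e2el".split(",")
selected_percentiles = [float(p) for p in "50,90,99".split(",")]

# Toy per-request latencies in seconds (illustrative values only).
latencies = {
    "ttft": [0.031, 0.044, 0.052, 0.120],
    "e2el": [0.90, 1.10, 1.45, 2.30],
}


def report(metric_attribute_name, values):
    # Mirrors the diff: mean/median/std plus the selected percentiles, in ms.
    if metric_attribute_name not in selected_percentile_metrics:
        return {}
    stats = {
        f"mean_{metric_attribute_name}_ms": np.mean(values) * 1000,
        f"median_{metric_attribute_name}_ms": np.median(values) * 1000,
        f"std_{metric_attribute_name}_ms": np.std(values) * 1000,
    }
    for p in selected_percentiles:
        # 99.0 prints as "p99", 99.9 stays "p99.9", matching p_word above.
        p_word = str(int(p)) if int(p) == p else str(p)
        stats[f"p{p_word}_{metric_attribute_name}_ms"] = (
            np.percentile(values, p) * 1000)
    return stats


for name, values in latencies.items():
    print(report(name, values))

With the defaults (`--percentile-metrics ttft,tpot,itl` and `--metric-percentiles 99`), this reproduces the P99-only summary that the deleted hard-coded print block used to emit.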

docs/requirements-docs.txt

Lines changed: 2 additions & 1 deletion
@@ -11,5 +11,6 @@ pydantic >= 2.8
 torch
 py-cpuinfo
 transformers
-mistral_common >= 1.3.4
 openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
+mistral_common >= 1.3.4
+openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args

docs/source/models/supported_models.rst

Lines changed: 8 additions & 0 deletions
@@ -51,6 +51,10 @@ Decoder-only Language Models
     - DeciLM
     - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc.
     -
+  * - :code:`ExaoneForCausalLM`
+    - EXAONE-3
+    - :code:`LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc.
+    - ✅︎
   * - :code:`FalconForCausalLM`
     - Falcon
     - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc.
@@ -143,6 +147,10 @@ Decoder-only Language Models
     - Phi-3-Small
     - :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc.
     -
+  * - :code:`PhiMoEForCausalLM`
+    - Phi-3.5-MoE
+    - :code:`microsoft/Phi-3.5-MoE-instruct`, etc.
+    -
   * - :code:`PersimmonForCausalLM`
     - Persimmon
     - :code:`adept/persimmon-8b-base`, :code:`adept/persimmon-8b-chat`, etc.
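
For reference, both newly listed architectures load through the same offline `LLM` API as the other models in this table. A minimal sketch, assuming the EXAONE-3 checkpoint above fits on the available GPU; the `trust_remote_code` and `max_model_len` settings here are illustrative, not documented requirements:

from vllm import LLM, SamplingParams

# EXAONE-3 entry from the table above; the PhiMoE entry loads the same way.
llm = LLM(
    model="LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",
    trust_remote_code=True,  # illustrative: the HF repo ships a custom config
    max_model_len=4096,      # illustrative cap to bound KV-cache memory
)

outputs = llm.generate(
    ["Summarize what a KV cache is in one sentence."],
    SamplingParams(temperature=0.7, max_tokens=64),
)
print(outputs[0].outputs[0].text)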

docs/source/serving/openai_compatible_server.md

Lines changed: 26 additions & 0 deletions
@@ -111,6 +111,32 @@ directory [here](https://github.com/vllm-project/vllm/tree/main/examples/)
 :prog: vllm serve
 ```

+### Config file
+
+The `serve` module can also accept arguments from a config file in
+`yaml` format. The arguments in the yaml must be specified using the
+long form of the argument outlined [here](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server):
+
+For example:
+
+```yaml
+# config.yaml
+
+host: "127.0.0.1"
+port: 6379
+uvicorn-log-level: "info"
+```
+
+```bash
+$ vllm serve SOME_MODEL --config config.yaml
+```
+---
+**NOTE**
+In case an argument is supplied using both the command line and the config file, the value from the command line will take precedence.
+The order of priorities is `command line > config file values > defaults`.
+
+---
+
 ## Tool calling in the chat completion API
 vLLM supports only named function calling in the chat completion API. The `tool_choice` options `auto` and `required` are **not yet supported** but on the roadmap.

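
One way to picture the precedence rule above: each YAML key is the long-form flag name, so the config file effectively expands into command-line arguments placed before any explicitly typed ones, and the later (explicit) value wins. The sketch below illustrates that merging idea under those assumptions; it is not vLLM's actual `--config` implementation, and `yaml` refers to PyYAML.

import yaml  # PyYAML


def config_to_args(path):
    # Each YAML key is the long-form flag name, per the section above.
    with open(path) as f:
        data = yaml.safe_load(f) or {}
    args = []
    for key, value in data.items():
        args += [f"--{key}", str(value)]
    return args


# Hypothetical merge: config-derived args first, explicit CLI args last.
# argparse keeps the last occurrence of a repeated flag, so the command
# line overrides the config file, which overrides argparse defaults.
explicit_cli = ["--port", "8000"]
merged = config_to_args("config.yaml") + explicit_cli
print(merged)
# e.g. ['--host', '127.0.0.1', '--port', '6379', '--uvicorn-log-level', 'info',
#       '--port', '8000']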

examples/offline_inference_neuron.py

Lines changed: 9 additions & 2 deletions
@@ -1,5 +1,12 @@
+import os
+
 from vllm import LLM, SamplingParams

+# creates XLA hlo graphs for all the context length buckets.
+os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048"
+# creates XLA hlo graphs for all the token gen buckets.
+os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048"
+
 # Sample prompts.
 prompts = [
     "Hello, my name is",
@@ -19,8 +26,8 @@
     # Currently, this is a known limitation in continuous batching support
     # in transformers-neuronx.
     # TODO(liangfu): Support paged-attention in transformers-neuronx.
-    max_model_len=128,
-    block_size=128,
+    max_model_len=2048,
+    block_size=2048,
     # The device can be automatically detected when AWS Neuron SDK is installed.
     # The device argument can be either unspecified for automated detection,
     # or explicitly assigned.