Commit a1cb99a

DarkLight1337 authored and yangw-dev committed
[Misc] Update Mistral-3.1 example (vllm-project#16147)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Yang Wang <elainewy@meta.com>
1 parent 20b71df commit a1cb99a

File tree

1 file changed: +22 -8 lines changed


examples/offline_inference/mistral-small.py

Lines changed: 22 additions & 8 deletions
```diff
@@ -13,9 +13,14 @@
 # - Server:
 #
 # ```bash
+# # Mistral format
 # vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
 # --tokenizer-mode mistral --config-format mistral --load-format mistral \
 # --limit-mm-per-prompt 'image=4' --max-model-len 16384
+#
+# # HF format
+# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
+# --limit-mm-per-prompt 'image=4' --max-model-len 16384
 # ```
 #
 # - Client:
```
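The client command referenced by the docstring is not part of this hunk. As a rough sketch only (not code from this commit), a request against vLLM's OpenAI-compatible endpoint might look like the following, assuming the default port 8000 and a placeholder image URL:

```bash
# Sketch only: assumes the server was started with one of the commands above,
# the default port 8000, and a placeholder image URL.
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        "messages": [{
          "role": "user",
          "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}
          ]
        }]
      }'
```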
```diff
@@ -44,19 +49,22 @@
 # python demo.py simple
 # python demo.py advanced
 
+# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
+# These scripts have been tested on 2x L40 GPUs
+
 
 def run_simple_demo(args: argparse.Namespace):
     model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
     sampling_params = SamplingParams(max_tokens=8192)
 
-    # Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
     llm = LLM(
         model=model_name,
-        tokenizer_mode="mistral",
-        config_format="mistral",
-        load_format="mistral",
+        tokenizer_mode="mistral" if args.format == "mistral" else "auto",
+        config_format="mistral" if args.format == "mistral" else "auto",
+        load_format="mistral" if args.format == "mistral" else "auto",
         max_model_len=4096,
         max_num_seqs=2,
+        tensor_parallel_size=2,
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
```
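run_simple_demo now keys the Mistral-specific loading options off args.format and adds tensor_parallel_size=2, matching the "tested on 2x L40 GPUs" note above. A usage sketch for the simple demo (the docstring still calls the script demo.py; the path below is the file in this commit):

```bash
# Sketch only: run the simple demo with either checkpoint format.
python examples/offline_inference/mistral-small.py simple --format mistral
python examples/offline_inference/mistral-small.py simple --format hf
```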

```diff
@@ -88,17 +96,18 @@ def run_simple_demo(args: argparse.Namespace):
 
 def run_advanced_demo(args: argparse.Namespace):
     model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
-    max_img_per_msg = 5
+    max_img_per_msg = 3
     max_tokens_per_img = 4096
 
     sampling_params = SamplingParams(max_tokens=8192, temperature=0.7)
     llm = LLM(
         model=model_name,
-        tokenizer_mode="mistral",
-        config_format="mistral",
-        load_format="mistral",
+        tokenizer_mode="mistral" if args.format == "mistral" else "auto",
+        config_format="mistral" if args.format == "mistral" else "auto",
+        load_format="mistral" if args.format == "mistral" else "auto",
         limit_mm_per_prompt={"image": max_img_per_msg},
         max_model_len=max_img_per_msg * max_tokens_per_img,
+        tensor_parallel_size=2,
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
```
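With max_img_per_msg lowered from 5 to 3, the advanced demo's context length becomes max_img_per_msg * max_tokens_per_img = 3 * 4096 = 12288 tokens, down from 20480 before this change.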

```diff
@@ -166,6 +175,11 @@ def main():
         help="Specify the demo mode: 'simple' or 'advanced'",
     )
 
+    parser.add_argument('--format',
+                        choices=["mistral", "hf"],
+                        default="mistral",
+                        help='Specify the format of the model to load.')
+
     parser.add_argument(
         '--disable-mm-preprocessor-cache',
         action='store_true',
```
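The new --format argument defaults to "mistral", so existing invocations keep their behavior; passing --format hf switches all three loading options to "auto". A usage sketch combining the flags defined here (the cache flag already existed):

```bash
# Sketch only: advanced demo with HF-format weights and the multimodal
# preprocessor cache disabled.
python examples/offline_inference/mistral-small.py advanced --format hf \
  --disable-mm-preprocessor-cache
```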
