 13 |  13 | # - Server:
 14 |  14 | #
 15 |  15 | # ```bash
    |  16 | +# # Mistral format
 16 |  17 | # vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
 17 |  18 | #     --tokenizer-mode mistral --config-format mistral --load-format mistral \
 18 |  19 | #     --limit-mm-per-prompt 'image=4' --max-model-len 16384
    |  20 | +#
    |  21 | +# # HF format
    |  22 | +# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
    |  23 | +#     --limit-mm-per-prompt 'image=4' --max-model-len 16384
 19 |  24 | # ```
 20 |  25 | #
 21 |  26 | # - Client:

 44 |  49 | # python demo.py simple
 45 |  50 | # python demo.py advanced
 46 |  51 |
    |  52 | +# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
    |  53 | +# These scripts have been tested on 2x L40 GPUs
    |  54 | +
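The new comment only names the knobs to turn; as a rough sketch of what that tuning could look like on a single low-VRAM GPU (the values below are illustrative only, not part of this change):

```python
from vllm import LLM

# Illustrative low-VRAM settings; tune to the actual GPU.
llm = LLM(
    model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    max_model_len=2048,      # smaller context than the demo's 4096 shrinks the KV cache
    max_num_seqs=1,          # fewer concurrent sequences further reduces memory
    tensor_parallel_size=1,  # single GPU instead of the 2x L40 setup used for testing
)
```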
 47 |  55 |
 48 |  56 | def run_simple_demo(args: argparse.Namespace):
 49 |  57 |     model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
 50 |  58 |     sampling_params = SamplingParams(max_tokens=8192)
 51 |  59 |
 52 |     | -    # Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
 53 |  60 |     llm = LLM(
 54 |  61 |         model=model_name,
 55 |     | -        tokenizer_mode="mistral",
 56 |     | -        config_format="mistral",
 57 |     | -        load_format="mistral",
    |  62 | +        tokenizer_mode="mistral" if args.format == "mistral" else "auto",
    |  63 | +        config_format="mistral" if args.format == "mistral" else "auto",
    |  64 | +        load_format="mistral" if args.format == "mistral" else "auto",
 58 |  65 |         max_model_len=4096,
 59 |  66 |         max_num_seqs=2,
    |  67 | +        tensor_parallel_size=2,
 60 |  68 |         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
 61 |  69 |     )
 62 |  70 |
@@ -88,17 +96,18 @@ def run_simple_demo(args: argparse.Namespace):

 88 |  96 |
 89 |  97 | def run_advanced_demo(args: argparse.Namespace):
 90 |  98 |     model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
 91 |     | -    max_img_per_msg = 5
    |  99 | +    max_img_per_msg = 3
 92 | 100 |     max_tokens_per_img = 4096
 93 | 101 |
 94 | 102 |     sampling_params = SamplingParams(max_tokens=8192, temperature=0.7)
 95 | 103 |     llm = LLM(
 96 | 104 |         model=model_name,
 97 |     | -        tokenizer_mode="mistral",
 98 |     | -        config_format="mistral",
 99 |     | -        load_format="mistral",
    | 105 | +        tokenizer_mode="mistral" if args.format == "mistral" else "auto",
    | 106 | +        config_format="mistral" if args.format == "mistral" else "auto",
    | 107 | +        load_format="mistral" if args.format == "mistral" else "auto",
100 | 108 |         limit_mm_per_prompt={"image": max_img_per_msg},
101 | 109 |         max_model_len=max_img_per_msg * max_tokens_per_img,
    | 110 | +        tensor_parallel_size=2,
102 | 111 |         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
103 | 112 |     )
104 | 113 |
@@ -166,6 +175,11 @@ def main():

166 | 175 |         help="Specify the demo mode: 'simple' or 'advanced'",
167 | 176 |     )
168 | 177 |
    | 178 | +    parser.add_argument('--format',
    | 179 | +                        choices=["mistral", "hf"],
    | 180 | +                        default="mistral",
    | 181 | +                        help='Specify the format of the model to load.')
    | 182 | +
169 | 183 |     parser.add_argument(
170 | 184 |         '--disable-mm-preprocessor-cache',
171 | 185 |         action='store_true',
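Combining the docstring's run commands with the new `--format` flag, an invocation sketch (the script name `demo.py` is the one the docstring already uses; `mistral` remains the default):

```bash
# Mistral-native checkpoint format (default)
python demo.py simple

# Hugging Face checkpoint format
python demo.py advanced --format hf
```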