
Commit eaa1732

ywang96, JJJYmmm, wulipc, and Isotr0py authored and committed
[Model] Support Qwen3-VL Model Series (vllm-project#24727)
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Huang Jie <92386084+JJJYmmm@users.noreply.github.com>
Co-authored-by: 松灵 <26085463+wulipc@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: charlifu <charlifu@amd.com>
1 parent ec77bf3 · commit eaa1732

File tree
13 files changed: +2084 lines, -17 lines

docs/models/supported_models.md

Lines changed: 2 additions & 0 deletions
@@ -661,6 +661,8 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ | ✅︎ |
+| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `RForConditionalGeneration` | R-VL-4B | T + I<sup>E+</sup> | `YannQi/R-4B` | | ✅︎ | ✅︎ |
 | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
 | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
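For context, a minimal offline-inference sketch against the newly listed Qwen3-VL entry (the image path, question, and sampling settings are illustrative; the chat-template string mirrors the example added in `examples/offline_inference/vision_language.py` below):

```python
# A minimal sketch, assuming the Qwen/Qwen3-VL-4B-Instruct checkpoint and a
# local image file; adjust paths and limits for your environment.
from PIL import Image

from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen3-VL-4B-Instruct",
    max_model_len=4096,
    limit_mm_per_prompt={"image": 1},
)

prompt = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
    "What is shown in this image?<|im_end|>\n"
    "<|im_start|>assistant\n"
)

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": Image.open("example.jpg")}},
    SamplingParams(max_tokens=128),
)
print(outputs[0].outputs[0].text)
```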

examples/offline_inference/vision_language.py

Lines changed: 78 additions & 0 deletions
@@ -1437,6 +1437,80 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
     )


+# Qwen3-VL-Dense
+def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData:
+    model_name = "Qwen/Qwen3-VL-4B-Instruct"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=5,
+        mm_processor_kwargs={
+            "min_pixels": 28 * 28,
+            "max_pixels": 1280 * 28 * 28,
+            "fps": 1,
+        },
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    if modality == "image":
+        placeholder = "<|image_pad|>"
+    elif modality == "video":
+        placeholder = "<|video_pad|>"
+
+    prompts = [
+        (
+            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"{question}<|im_end|>\n"
+            "<|im_start|>assistant\n"
+        )
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
+# Qwen3-VL-MOE
+def run_qwen3_vl_moe(questions: list[str], modality: str) -> ModelRequestData:
+    model_name = "Qwen/Qwen3-VL-30B-A3B-Instruct"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=5,
+        mm_processor_kwargs={
+            "min_pixels": 28 * 28,
+            "max_pixels": 1280 * 28 * 28,
+            "fps": 1,
+        },
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    if modality == "image":
+        placeholder = "<|image_pad|>"
+    elif modality == "video":
+        placeholder = "<|video_pad|>"
+
+    prompts = [
+        (
+            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"{question}<|im_end|>\n"
+            "<|im_start|>assistant\n"
+        )
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # R-4B
 def run_r_vl(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -1645,6 +1719,8 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
     "qwen2_vl": run_qwen2_vl,
     "qwen2_5_vl": run_qwen2_5_vl,
     "qwen2_5_omni": run_qwen2_5_omni,
+    "qwen3_vl": run_qwen3_vl,
+    "qwen3_vl_moe": run_qwen3_vl_moe,
     "rvl": run_r_vl,
     "skywork_chat": run_skyworkr1v,
     "smolvlm": run_smolvlm,
@@ -1658,6 +1734,8 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
     "glm4_1v",
     "glm4_5v",
     "glm4_5v_fp8",
+    "qwen3_vl",
+    "qwen3_vl_moe",
 ]
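As a usage note, these helpers are selected through the script's model map, so they are normally exercised via the example runner (e.g. its `--model-type qwen3_vl` / `--modality video` options, assuming the script's existing CLI). Calling a helper directly is also a quick way to inspect the prompt it builds:

```python
# Illustrative check of the new helper (the question text is made up); it
# returns the EngineArgs plus chat-formatted prompts used by the runner.
req = run_qwen3_vl(["Describe the video."], modality="video")
print(req.engine_args.model)  # Qwen/Qwen3-VL-4B-Instruct
print(req.prompts[0])         # contains <|vision_start|><|video_pad|><|vision_end|>
```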

tests/models/multimodal/processing/test_common.py

Lines changed: 34 additions & 1 deletion
@@ -31,6 +31,7 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
     """
     # Ensure video metadata is included
     if "video" in mm_data:
+        # GLM4.1V doesn't support multiple videos
         video = mm_data["video"]
         num_frames = len(video)
         mm_data["video"] = (video, {
@@ -44,6 +45,34 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
     return mm_data


+def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
+    """
+    Patch the multimodal data for Qwen3-VL model.
+    """
+
+    def create_metadata(frames: np.ndarray):
+        num_frames = len(frames)
+        return {
+            "total_num_frames": num_frames,
+            "fps": 2.0,
+            "duration": num_frames / 2.0,
+            "video_backend": "opencv",
+            "frames_indices": list(range(num_frames)),
+            "do_sample_frames": True,
+        }
+
+    # Ensure video metadata is included
+    if "video" in mm_data:
+        video = mm_data["video"]
+        if isinstance(video, list):
+            # multiple videos
+            mm_data["video"] = [(vid, create_metadata(vid)) for vid in video]
+        else:
+            # single video
+            mm_data["video"] = (video, create_metadata(video))
+    return mm_data
+
+
 def _test_processing_correctness(
     model_id_or_arch: str,
     hit_rate: float,
@@ -182,8 +211,10 @@ def _test_processing_correctness(
 }

 MM_DATA_PATCHES = {
-    # GLM4.1V requires video metadata to be included in the input
+    # GLM4.1V and Qwen3-VL requires video metadata to be included in the input
     "glm4v": glm4_1v_patch_mm_data,
+    "qwen3_vl": qwen3_vl_patch_mm_data,
+    "qwen3_vl_moe": qwen3_vl_patch_mm_data,
 }


@@ -326,6 +357,8 @@ def _test_processing_correctness_one(
     "Qwen/Qwen2.5-VL-3B-Instruct",
     "Qwen/Qwen2-Audio-7B-Instruct",
     "Qwen/Qwen2.5-Omni-3B",
+    "Qwen/Qwen3-VL-4B-Instruct",
+    "Qwen/Qwen3-VL-30B-A3B-Instruct",
     "YannQi/R-4B",
     "Skywork/Skywork-R1V-38B",
     "HuggingFaceTB/SmolVLM2-2.2B-Instruct",

tests/models/registry.py

Lines changed: 6 additions & 0 deletions
@@ -557,6 +557,12 @@ def check_available_online(
                                              max_model_len=4096),
     "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"),
     "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"),  # noqa: E501
+    "Qwen3VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen3-VL-4B-Instruct",  # noqa: E501
+                                                       max_model_len=4096,
+                                                       min_transformers_version="4.57"),  # noqa: E501
+    "Qwen3VLMoeForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen3-VL-30B-A3B-Instruct",  # noqa: E501
+                                                          max_model_len=4096,
+                                                          min_transformers_version="4.57"),
     "RForConditionalGeneration": _HfExamplesInfo("YannQi/R-4B",
                                                  trust_remote_code=True),
     "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B",

vllm/model_executor/layers/rotary_embedding/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -103,6 +103,8 @@ def get_rope(
                 is_neox_style,
                 dtype,
                 mrope_section=rope_scaling["mrope_section"],
+                mrope_interleaved=rope_scaling.get("mrope_interleaved",
+                                                   False),
             )
         else:
             rotary_emb = RotaryEmbedding(
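For orientation, a sketch of the kind of `rope_scaling` block that now flows through `get_rope` (field names follow this diff; the section sizes are placeholders rather than the real Qwen3-VL values):

```python
# Hypothetical rope_scaling config; only "mrope_section" and the new
# "mrope_interleaved" key matter for this code path. The flag defaults to
# False when absent, so existing Qwen2-VL/Qwen2.5-VL configs are unchanged.
rope_scaling = {
    "mrope_section": [24, 20, 20],  # placeholder split of rotary_dim // 2
    "mrope_interleaved": True,
}
assert sum(rope_scaling["mrope_section"]) == 64
```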

vllm/model_executor/layers/rotary_embedding/mrope.py

Lines changed: 133 additions & 11 deletions
@@ -177,6 +177,18 @@ def triton_mrope(
     return q, k


+def apply_interleaved_rope(x: torch.Tensor,
+                           mrope_section: list[int]) -> torch.Tensor:
+    """Apply interleaved MRoPE to 3D rotary embeddings.
+    Reorganizes frequency layout from chunked [TTT...HHH...WWW] to
+    interleaved [THTHWHTHW...TT], preserving frequency continuity.
+    """
+    x_t = x[0].clone()
+    x_t[..., 1:mrope_section[1] * 3:3] = x[1, ..., 1:mrope_section[1] * 3:3]
+    x_t[..., 2:mrope_section[2] * 3:3] = x[2, ..., 2:mrope_section[2] * 3:3]
+    return x_t
+
+
 class MRotaryEmbedding(RotaryEmbedding):
     """Rotary Embedding with Multimodal Sections."""
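To make the interleaving concrete, here is a small self-contained sketch that reproduces the same slicing pattern on a toy tensor (the section sizes and constant values are made up; only the indexing mirrors `apply_interleaved_rope`):

```python
import torch

# Toy "cos" table with one leading axis per MRoPE stream:
# x[0] = temporal (T), x[1] = height (H), x[2] = width (W) frequencies.
mrope_section = [4, 2, 2]          # made-up sizes summing to rotary_dim // 2
rotary_half = sum(mrope_section)   # 8
x = torch.stack([
    torch.full((1, rotary_half), 0.0),  # T
    torch.full((1, rotary_half), 1.0),  # H
    torch.full((1, rotary_half), 2.0),  # W
])

# Same pattern as apply_interleaved_rope: start from the T stream, then
# overwrite every third slot (offset 1 for H, offset 2 for W) up to each
# section's extent; the tail keeps the temporal frequencies.
x_t = x[0].clone()
x_t[..., 1:mrope_section[1] * 3:3] = x[1, ..., 1:mrope_section[1] * 3:3]
x_t[..., 2:mrope_section[2] * 3:3] = x[2, ..., 2:mrope_section[2] * 3:3]

print(x_t)  # tensor([[0., 1., 2., 0., 1., 2., 0., 0.]]) -> T H W T H W T T
```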

@@ -189,6 +201,7 @@ def __init__(
         is_neox_style: bool,
         dtype: torch.dtype,
         mrope_section: Optional[list[int]] = None,
+        mrope_interleaved: Optional[bool] = False,
     ) -> None:
         # In Qwen2.5-VL, the maximum index value is related to the duration of
         # the input video. We enlarge max_position_embeddings to 4 times to get
@@ -198,6 +211,7 @@ def __init__(
                          base, is_neox_style, dtype)

         self.mrope_section = mrope_section
+        self.mrope_interleaved = mrope_interleaved
         if self.mrope_section:
             assert sum(self.mrope_section) == rotary_dim // 2

@@ -225,17 +239,20 @@ def forward_native(
         cos, sin = cos_sin.chunk(2, dim=-1)
         if positions.ndim == 2:
             assert self.mrope_section
-
-            cos = torch.cat([
-                m[i]
-                for i, m in enumerate(cos.split(self.mrope_section, dim=-1))
-            ],
-                            dim=-1)
-            sin = torch.cat([
-                m[i]
-                for i, m in enumerate(sin.split(self.mrope_section, dim=-1))
-            ],
-                            dim=-1)
+            if self.mrope_interleaved:
+                cos = apply_interleaved_rope(cos, self.mrope_section)
+                sin = apply_interleaved_rope(sin, self.mrope_section)
+            else:
+                cos = torch.cat([
+                    m[i] for i, m in enumerate(
+                        cos.split(self.mrope_section, dim=-1))
+                ],
+                                dim=-1)
+                sin = torch.cat([
+                    m[i] for i, m in enumerate(
+                        sin.split(self.mrope_section, dim=-1))
+                ],
+                                dim=-1)

         query_shape = query.shape
         query = query.view(num_tokens, -1, self.head_size)
@@ -265,6 +282,10 @@ def forward_cuda(
         assert positions.ndim == 1 or positions.ndim == 2
         assert key is not None

+        if self.mrope_interleaved:
+            # TODO: add triton implementation to support mrope-interleaved
+            return self.forward_native(positions, query, key)
+
         num_tokens = positions.shape[-1]
         cos_sin = self.cos_sin_cache[positions]
         cos, sin = cos_sin.chunk(2, dim=-1)
@@ -388,6 +409,15 @@ def get_input_positions_tensor(
                 context_len=context_len,
                 seq_len=seq_len,
             )
+        elif hf_config.model_type in ["qwen3_vl", "qwen3_vl_moe"]:
+            return cls._qwen3vl_get_input_positions_tensor(
+                input_tokens=input_tokens,
+                hf_config=hf_config,
+                image_grid_thw=image_grid_thw,
+                video_grid_thw=video_grid_thw,
+                context_len=context_len,
+                seq_len=seq_len,
+            )
         elif hf_config.model_type in ["ernie4_5_moe_vl", "ernie4_5_vl"]:
             return cls._ernie_get_input_positions_tensor(
                 input_tokens=input_tokens,
@@ -526,6 +556,98 @@ def _glm4v_get_input_positions_tensor(
                                 len(input_tokens)).item()
         return llm_positions, mrope_position_delta

+    @classmethod
+    def _qwen3vl_get_input_positions_tensor(
+        cls,
+        input_tokens: list[int],
+        hf_config: PretrainedConfig,
+        image_grid_thw: Union[list[list[int]], torch.Tensor],
+        video_grid_thw: Union[list[list[int]], torch.Tensor],
+        context_len: int = 0,
+        seq_len: Optional[int] = None,
+    ) -> tuple[torch.Tensor, int]:
+        """Get mrope input positions and delta value."""
+
+        video_grid_thw = [[1, h, w] for t, h, w in video_grid_thw
+                          for _ in range(t)]
+
+        image_token_id = hf_config.image_token_id
+        video_token_id = hf_config.video_token_id
+        vision_start_token_id = hf_config.vision_start_token_id
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+
+        input_tokens_tensor = torch.tensor(input_tokens)
+        vision_start_indices = torch.argwhere(
+            input_tokens_tensor == vision_start_token_id).squeeze(1)
+        vision_tokens = input_tokens_tensor[vision_start_indices + 1]
+        image_nums = (vision_tokens == image_token_id).sum()
+        video_nums = (vision_tokens == video_token_id).sum()
+        llm_pos_ids_list: list = []
+
+        st = 0
+        remain_images, remain_videos = image_nums, video_nums
+
+        image_index, video_index = 0, 0
+        for _ in range(image_nums + video_nums):
+            if image_token_id in input_tokens and remain_images > 0:
+                ed_image = input_tokens.index(image_token_id, st)
+            else:
+                ed_image = len(input_tokens) + 1
+            if video_token_id in input_tokens and remain_videos > 0:
+                ed_video = input_tokens.index(video_token_id, st)
+            else:
+                ed_video = len(input_tokens) + 1
+            if ed_image < ed_video:
+                t, h, w = (
+                    image_grid_thw[image_index][0],
+                    image_grid_thw[image_index][1],
+                    image_grid_thw[image_index][2],
+                )
+                image_index += 1
+                remain_images -= 1
+                ed = ed_image
+            else:
+                t, h, w = (
+                    video_grid_thw[video_index][0],
+                    video_grid_thw[video_index][1],
+                    video_grid_thw[video_index][2],
+                )
+                video_index += 1
+                remain_videos -= 1
+                ed = ed_video
+
+            llm_grid_t, llm_grid_h, llm_grid_w = \
+                t, h // spatial_merge_size, w // spatial_merge_size
+            text_len = ed - st
+
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
+                llm_pos_ids_list) > 0 else 0
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+            t_index = torch.arange(llm_grid_t).view(-1, 1).expand(
+                -1, llm_grid_h * llm_grid_w).flatten()
+            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
+                llm_grid_t, -1, llm_grid_w).flatten()
+            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
+                llm_grid_t, llm_grid_h, -1).flatten()
+            llm_pos_ids_list.append(
+                torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
+            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
+                llm_pos_ids_list) > 0 else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        mrope_position_delta = (llm_positions.max() + 1 -
+                                len(input_tokens)).item()
+        llm_positions = llm_positions[:, context_len:seq_len]
+        return llm_positions, mrope_position_delta
+
     @classmethod
     def _ernie_get_input_positions_tensor(
         cls,
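A small walk-through of the position layout this method builds (the grid sizes are invented; only the flattening and indexing mirror the code above):

```python
import torch

# Each temporal patch of a video becomes its own single-frame grid entry,
# so every frame gets a distinct time index in the 3D positions.
video_grid_thw = [[2, 4, 4]]  # made-up (t, h, w) grid
flattened = [[1, h, w] for t, h, w in video_grid_thw for _ in range(t)]
print(flattened)  # [[1, 4, 4], [1, 4, 4]]

# For one merged 1x2x2 grid, the (t, h, w) position streams look like:
t_idx = torch.arange(1).view(-1, 1).expand(-1, 2 * 2).flatten()
h_idx = torch.arange(2).view(1, -1, 1).expand(1, -1, 2).flatten()
w_idx = torch.arange(2).view(1, 1, -1).expand(1, 2, -1).flatten()
print(torch.stack([t_idx, h_idx, w_idx]))
# tensor([[0, 0, 0, 0],
#         [0, 0, 1, 1],
#         [0, 1, 0, 1]])
```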

vllm/model_executor/models/qwen2.py

Lines changed: 1 addition & 1 deletion
@@ -285,7 +285,7 @@ def __init__(self,
                  decoder_layer_type: type[nn.Module] = Qwen2DecoderLayer):
         super().__init__()

-        config = vllm_config.model_config.hf_config
+        config = vllm_config.model_config.hf_config.get_text_config()
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
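This lets the Qwen2 backbone be built from a composite multimodal config (such as Qwen3-VL's) by reusing its nested text config, while text-only configs are expected to pass through unchanged. A hedged sketch of that behaviour (the model ID is illustrative and fetching it needs network access):

```python
# Illustrative check of PretrainedConfig.get_text_config(): for a text-only
# config it should return the config itself; for composite multimodal configs
# it should return the nested text config used to build the language model.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("Qwen/Qwen2-7B-Instruct")
print(cfg.get_text_config() is cfg)  # expected: True
```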
