
Commit 341077d

DarkLight1337 authored and Isotr0py committed
[Bugfix] Comprehensively test and fix LLaVA-NeXT feature size calculation (vllm-project#11800)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
1 parent 3fc5707 commit 341077d

File tree

6 files changed (+253, −89)

requirements-test.in

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ einops # required for MPT, qwen-vl and Mamba
 httpx
 librosa # required for audio tests
 peft
+pqdm
 ray[adag]==2.40.0
 sentence-transformers # required for embedding tests
 soundfile # required for audio tests

requirements-test.txt

Lines changed: 4 additions & 0 deletions
@@ -48,6 +48,8 @@ botocore==1.35.57
     #   awscli
     #   boto3
     #   s3transfer
+bounded-pool-executor==0.0.3
+    # via pqdm
 buildkite-test-collector==0.1.9
     # via -r requirements-test.in
 certifi==2024.8.30
@@ -342,6 +344,8 @@ pooch==1.8.2
     # via librosa
 portalocker==2.10.1
     # via sacrebleu
+pqdm==0.2.0
+    # via -r requirements-test.in
 propcache==0.2.0
     # via yarl
 protobuf==5.28.3
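
pqdm (together with its bounded-pool-executor dependency) is pulled in so the tests can validate many image sizes in parallel while showing a progress bar. A minimal sketch of the usage pattern the tests below rely on — the worker function and data here are made up for illustration:

import itertools
from functools import partial

from pqdm.threads import pqdm


def check(results: list[int], x: int) -> None:
    # Worker appends into a shared accumulator; plain list appends are
    # safe in a thread pool under the GIL.
    results.append(x * x)


results: list[int] = []
# Bind the shared list with functools.partial, then fan the remaining
# argument out over 8 worker threads with a tqdm progress bar.
pqdm(range(10), partial(check, results), n_jobs=8, desc="Squaring")
print(sorted(results))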
test_llava_next.py

Lines changed: 108 additions & 21 deletions
@@ -1,8 +1,13 @@
+import itertools
+from functools import partial
+
 import pytest
 from PIL import Image
+from pqdm.threads import pqdm
 from transformers import AutoTokenizer

 from vllm.inputs import InputProcessingContext
+from vllm.multimodal.parse import ImageSize

 from ....utils import build_model_context

@@ -15,20 +20,69 @@ def processor_for_llava_next():
     return LlavaNextMultiModalProcessor


+def _validate_image_prompt_replacements_one(
+    processor,
+    num_imgs: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    prompt = "<image>" * num_imgs
+    image = Image.new("RGB", size=image_size)
+    mm_data = {"image": [image] * num_imgs}
+
+    try:
+        # The processor will throw an error if there is a mismatch
+        # in the prompt replacements
+        processed_inputs = processor.apply(prompt, mm_data, {})
+
+        image_placeholders = processed_inputs["mm_placeholders"]["image"]
+        assert len(image_placeholders) == num_imgs
+
+        first_placeholder = image_placeholders[0]
+
+        # NOTE: There is a BOS token
+        assert first_placeholder["offset"] == 1
+        assert first_placeholder["length"] == (
+            len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
+
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+
+
+def _test_image_prompt_replacements(
+    processor,
+    *,
+    num_imgs: int,
+    image_sizes: list[ImageSize],
+) -> None:
+    """
+    Ensure LlavaNextMultiModalProcessor
+    handles prompt replacement properly for input images.
+    """
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+
+    validate_one = partial(
+        _validate_image_prompt_replacements_one,
+        processor,
+        num_imgs,
+        failed_size_excs,
+    )
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+
+    if failed_size_excs:
+        msg = "Found failing image sizes:" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+
+
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
-@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488),
-                                        (488, 183), (198, 176), (176, 198),
-                                        (161, 184), (184, 161)])
 @pytest.mark.parametrize("num_imgs", [1, 2])
-def test_processor_prompt_replacements(
+def test_processor_prompt_replacements_regression(
     processor_for_llava_next,
     model_id: str,
-    image_size: tuple[int, int],
     num_imgs: int,
 ):
-    """
-    Ensure LlavaNextMultiModalProcessor handles prompt replacement properly.
-    """
     ctx = build_model_context(
         model_name=model_id,
         tokenizer_name=model_id,
@@ -37,22 +91,55 @@ def test_processor_prompt_replacements(
     )
     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
     ctx = InputProcessingContext(ctx.model_config, tokenizer)
+    processor = processor_for_llava_next(ctx)
+
+    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
+                    (488, 183), (2560, 1669)]
+    image_sizes = [
+        size for w, h in image_ratios
+        for size in [ImageSize(w, h), ImageSize(h, w)]
+    ]
+
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )

-    # Build the image str / prompt based on the number of images we pass
-    prompt = "<image>" * num_imgs
-    mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs}

-    # The processor will throw an error if there is a mismatch
-    # in the prompt replacements
+@pytest.mark.skip("This test takes around 2 hours to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
+@pytest.mark.parametrize("num_imgs", [1])
+def test_processor_prompt_replacements_all(
+    processor_for_llava_next,
+    model_id: str,
+    num_imgs: int,
+):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    ctx = InputProcessingContext(ctx.model_config, tokenizer)
     processor = processor_for_llava_next(ctx)
-    processed_inputs = processor.apply(prompt, mm_data, {})

-    image_placeholders = processed_inputs["mm_placeholders"]["image"]
-    assert len(image_placeholders) == num_imgs
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()

-    first_placeholder = image_placeholders[0]
+    # The aspect ratio of the grid layout is between 1 and 2
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(64, 1024), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)

-    # NOTE: There is a BOS token
-    assert first_placeholder["offset"] == 1
-    assert first_placeholder["length"] == (
-        len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )
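
Only the test side of the change is captured above; the feature-size fix itself lands in the model files. For orientation, here is a hedged sketch of the kind of calculation under test, loosely following the public HF LLaVA-NeXT reference (its unpad_image logic); the function name and structure below are illustrative, not taken from this commit:

# Illustrative sketch (not this commit's code): per-image feature count
# for a LLaVA-NeXT-style anyres layout. The padded patch grid is cropped
# back to the original aspect ratio, and one "newline" feature is kept
# per remaining row.
def num_unpadded_features(orig_h: int, orig_w: int, npatches: int,
                          num_patch_h: int,
                          num_patch_w: int) -> tuple[int, int]:
    current_h = npatches * num_patch_h
    current_w = npatches * num_patch_w

    aspect_ratio = orig_w / orig_h
    current_aspect_ratio = current_w / current_h

    if aspect_ratio > current_aspect_ratio:
        # Image is wider than the grid: crop rows top and bottom.
        new_h = (orig_h * current_w) // orig_w
        padding = (current_h - new_h) // 2
        current_h -= 2 * padding
    else:
        # Image is taller than the grid: crop columns left and right.
        new_w = (orig_w * current_h) // orig_h
        padding = (current_w - new_w) // 2
        current_w -= 2 * padding

    # The exact rounding above must match the HF processor; a one-row
    # mismatch is precisely what the per-size tests above would flag
    # as a prompt-replacement/feature-count disagreement.
    unpadded_features = current_h * current_w
    newline_features = current_h
    return unpadded_features, newline_features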
test_llava_onevision.py

Lines changed: 107 additions & 22 deletions
@@ -1,8 +1,13 @@
+import itertools
+from functools import partial
+
 import pytest
 from PIL import Image
+from pqdm.threads import pqdm
 from transformers import AutoTokenizer

 from vllm.inputs import InputProcessingContext
+from vllm.multimodal.parse import ImageSize

 from ....utils import build_model_context

@@ -15,22 +20,68 @@ def processor_for_llava_onevision():
     return LlavaOnevisionMultiModalProcessor


+def _validate_image_prompt_replacements_one(
+    processor,
+    num_imgs: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    prompt = "<image>" * num_imgs
+    image = Image.new("RGB", size=image_size)
+    mm_data = {"image": [image] * num_imgs}
+
+    try:
+        # The processor will throw an error if there is a mismatch
+        # in the prompt replacements
+        processed_inputs = processor.apply(prompt, mm_data, {})
+
+        image_placeholders = processed_inputs["mm_placeholders"]["image"]
+        assert len(image_placeholders) == num_imgs
+
+        first_placeholder = image_placeholders[0]
+
+        assert first_placeholder["offset"] == 0
+        assert first_placeholder["length"] == len(
+            processed_inputs["prompt_token_ids"]) // num_imgs
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+
+
+def _test_image_prompt_replacements(
+    processor,
+    *,
+    num_imgs: int,
+    image_sizes: list[ImageSize],
+) -> None:
+    """
+    Ensure LlavaOnevisionMultiModalProcessor
+    handles prompt replacement properly for input images.
+    """
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+
+    validate_one = partial(
+        _validate_image_prompt_replacements_one,
+        processor,
+        num_imgs,
+        failed_size_excs,
+    )
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+
+    if failed_size_excs:
+        msg = "Found failing image sizes:" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+
+
 @pytest.mark.parametrize("model_id",
                          ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
-@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488),
-                                        (488, 183), (198, 176), (176, 198),
-                                        (161, 184), (184, 161)])
 @pytest.mark.parametrize("num_imgs", [1, 2])
-def test_processor_prompt_replacements(
+def test_processor_prompt_replacements_regression(
     processor_for_llava_onevision,
     model_id: str,
-    image_size: tuple[int, int],
     num_imgs: int,
 ):
-    """
-    Ensure LlavaOnevisionMultiModalProcessor handles prompt replacement
-    properly.
-    """
     ctx = build_model_context(
         model_name=model_id,
         tokenizer_name=model_id,
@@ -39,22 +90,56 @@ def test_processor_prompt_replacements(
     )
     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    ctx = InputProcessingContext(ctx.model_config, tokenizer)
+    processor = processor_for_llava_onevision(ctx)

-    # Build the image str / prompt based on the number of images we pass
-    prompt = "<image>" * num_imgs
-    mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs}
+    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
+                    (488, 183), (2560, 1669)]
+    image_sizes = [
+        size for w, h in image_ratios
+        for size in [ImageSize(w, h), ImageSize(h, w)]
+    ]
+
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )

-    # The processor will throw an error if there is a mismatch
-    # in the prompt replacements
+
+@pytest.mark.skip("This test takes around 2 hours to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id",
+                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+@pytest.mark.parametrize("num_imgs", [1])
+def test_processor_prompt_replacements_all(
+    processor_for_llava_onevision,
+    model_id: str,
+    num_imgs: int,
+):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    ctx = InputProcessingContext(ctx.model_config, tokenizer)
     processor = processor_for_llava_onevision(ctx)
-    processed_inputs = processor.apply(prompt, mm_data, {})

-    image_placeholders = processed_inputs["mm_placeholders"]["image"]
-    assert len(image_placeholders) == num_imgs
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()

-    first_placeholder = image_placeholders[0]
+    # The aspect ratio of the grid layout is between 1 and 6
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(64, 1024), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)

-    # NOTE: There is a BOS token
-    assert first_placeholder["offset"] == 0
-    assert first_placeholder["length"] == len(
-        processed_inputs["prompt_token_ids"]) // num_imgs
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )
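
Both `_all` variants are skipped by default: each sweeps every distinct aspect ratio in its range over widths and heights from 64 to 1023, which is what drives the roughly two-hour runtime noted in the skip reason. A standalone sketch that mirrors the dedup loop, if you want to count how many sizes each sweep produces:

import itertools

# Count the unique aspect ratios swept by the exhaustive tests above.
def count_unique_ratios(lo_ratio: float, hi_ratio: float) -> int:
    seen: set[float] = set()
    for w, h in itertools.product(range(64, 1024), repeat=2):
        ratio = w / h
        if lo_ratio <= ratio <= hi_ratio:
            seen.add(ratio)
    return len(seen)

print(count_unique_ratios(1, 2))  # LLaVA-NeXT sweep
print(count_unique_ratios(1, 6))  # LLaVA-OneVision sweep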
