@@ -1,8 +1,13 @@
+import itertools
+from functools import partial
+
 import pytest
 from PIL import Image
+from pqdm.threads import pqdm
 from transformers import AutoTokenizer
 
 from vllm.inputs import InputProcessingContext
+from vllm.multimodal.parse import ImageSize
 
 from ....utils import build_model_context
 
@@ -15,22 +20,68 @@ def processor_for_llava_onevision():
     return LlavaOnevisionMultiModalProcessor
 
 
+def _validate_image_prompt_replacements_one(
+    processor,
+    num_imgs: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    prompt = "<image>" * num_imgs
+    image = Image.new("RGB", size=image_size)
+    mm_data = {"image": [image] * num_imgs}
+
+    try:
+        # The processor will throw an error if there is a mismatch
+        # in the prompt replacements
+        processed_inputs = processor.apply(prompt, mm_data, {})
+
+        image_placeholders = processed_inputs["mm_placeholders"]["image"]
+        assert len(image_placeholders) == num_imgs
+
+        first_placeholder = image_placeholders[0]
+
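+        # With a prompt made up of image tokens only, the first replacement
+        # starts at offset 0 and each image claims an equal share of the
+        # output token ids.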
+        assert first_placeholder["offset"] == 0
+        assert first_placeholder["length"] == len(
+            processed_inputs["prompt_token_ids"]) // num_imgs
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+
+
+def _test_image_prompt_replacements(
+    processor,
+    *,
+    num_imgs: int,
+    image_sizes: list[ImageSize],
+) -> None:
+    """
+    Ensure LlavaOnevisionMultiModalProcessor
+    handles prompt replacement properly for input images.
+    """
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+
+    validate_one = partial(
+        _validate_image_prompt_replacements_one,
+        processor,
+        num_imgs,
+        failed_size_excs,
+    )
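+    # Fan the per-size checks out over 8 worker threads; validate_one
+    # records failures in failed_size_excs instead of raising, so one
+    # bad size does not abort the rest of the sweep.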
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+
+    if failed_size_excs:
+        msg = "Found failing image sizes:" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+
+
 @pytest.mark.parametrize("model_id",
                          ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
-@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488),
-                                        (488, 183), (198, 176), (176, 198),
-                                        (161, 184), (184, 161)])
 @pytest.mark.parametrize("num_imgs", [1, 2])
-def test_processor_prompt_replacements(
+def test_processor_prompt_replacements_regression(
     processor_for_llava_onevision,
     model_id: str,
-    image_size: tuple[int, int],
     num_imgs: int,
 ):
-    """
-    Ensure LlavaOnevisionMultiModalProcessor handles prompt replacement
-    properly.
-    """
     ctx = build_model_context(
         model_name=model_id,
         tokenizer_name=model_id,
@@ -39,22 +90,56 @@ def test_processor_prompt_replacements(
     )
     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
     ctx = InputProcessingContext(ctx.model_config, tokenizer)
+    processor = processor_for_llava_onevision(ctx)
 
-    # Build the image str / prompt based on the number of images we pass
-    prompt = "<image>" * num_imgs
-    mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs}
+    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
+                    (488, 183), (2560, 1669)]
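+    # Cover both landscape and portrait orientations of each ratio.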
+    image_sizes = [
+        size for w, h in image_ratios
+        for size in [ImageSize(w, h), ImageSize(h, w)]
+    ]
+
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )
 
-    # The processor will throw an error if there is a mismatch
-    # in the prompt replacements
+
+@pytest.mark.skip("This test takes around 2 hours to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id",
+                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+@pytest.mark.parametrize("num_imgs", [1])
+def test_processor_prompt_replacements_all(
+    processor_for_llava_onevision,
+    model_id: str,
+    num_imgs: int,
+):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    ctx = InputProcessingContext(ctx.model_config, tokenizer)
     processor = processor_for_llava_onevision(ctx)
-    processed_inputs = processor.apply(prompt, mm_data, {})
 
-    image_placeholders = processed_inputs["mm_placeholders"]["image"]
-    assert len(image_placeholders) == num_imgs
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()
 
-    first_placeholder = image_placeholders[0]
+    # The aspect ratio of the grid layout is between 1 and 6
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(64, 1024), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)
 
-    # NOTE: There is a BOS token
-    assert first_placeholder["offset"] == 0
-    assert first_placeholder["length"] == len(
-        processed_inputs["prompt_token_ids"]) // num_imgs
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )