 
 pytestmark = pytest.mark.vlm
 
+
+class NestedInputs(UserDict):
+
+    def __init__(self, model_inputs: BatchFeature):
+        super().__init__({"model_inputs": model_inputs})
+
+        self.model_inputs = model_inputs
+
+    def to(self, device: torch.types.Device):
+        return NestedInputs(self.model_inputs.to(device))
+
+
 # The image token is placed before "user" on purpose so that the test can pass
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
@@ -23,7 +35,7 @@
     "cherry_blossom":
         "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
         "(<image>./</image>)\nWhat is the season?<|eot_id|>" \
-        "<|start_header_id|>assistant<|end_header_id|>\n\n"
+        "<|start_header_id|>assistant<|end_header_id|>\n\n",
 })
 
 models = ["openbmb/MiniCPM-Llama3-V-2_5"]
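
For reference, the `NestedInputs` wrapper introduced above does nothing more than re-key a processor's `BatchFeature` under `"model_inputs"` while keeping `.to(device)` chainable. A minimal standalone sketch of that behaviour (it assumes `torch` and `transformers` are installed, and repeats the class verbatim so the snippet runs on its own):

```python
from collections import UserDict

import torch
from transformers import BatchFeature


class NestedInputs(UserDict):
    """Copy of the wrapper added in the diff above."""

    def __init__(self, model_inputs: BatchFeature):
        super().__init__({"model_inputs": model_inputs})
        self.model_inputs = model_inputs

    def to(self, device: torch.types.Device):
        return NestedInputs(self.model_inputs.to(device))


features = BatchFeature({"input_ids": torch.tensor([[1, 2, 3]])})
wrapped = NestedInputs(features)

# Dict access yields the processor output nested under "model_inputs"...
assert wrapped["model_inputs"] is features
# ...and device moves go through the wrapper, returning another NestedInputs.
assert isinstance(wrapped.to("cpu"), NestedInputs)
```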
@@ -94,22 +106,10 @@ def run_test(
         ]
 
     with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad():
-
-        class NestedInputs(UserDict):
-
-            def __init__(self, model_inputs: BatchFeature):
-                super().__init__({"model_inputs": model_inputs})
-
-                self.model_inputs = model_inputs
-
-            def to(self, device: torch.types.Device):
-                return NestedInputs(self.model_inputs.to(device))
-
         hf_processor = hf_model.processor
         hf_model.processor = lambda **kw: NestedInputs(
             hf_processor(**kw)  # type: ignore
         )
-
         hf_outputs_per_image = [
             hf_model.generate_greedy_logprobs_limit(prompts,
                                                     max_tokens,
@@ -161,3 +161,123 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
         num_logprobs=num_logprobs,
         tensor_parallel_size=1,
     )
+
+
+HF_MULTIIMAGE_IMAGE_PROMPT = \
+    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
+    "(<image>./</image>)\n(<image>./</image>)\n" \
+    "Describe these images.<|eot_id|>" \
+    "<|start_header_id|>assistant<|end_header_id|>\n\n"
+
+
+def run_multi_image_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    image_assets: _ImageAssets,
+    model: str,
+    *,
+    size_factors: List[float],
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    """Inference result should be the same between hf and vllm.
+
+    All the image fixtures for the test are under tests/images.
+    For the huggingface runner, we provide the PIL images as input.
+    For the vllm runner, we provide MultiModalDataDict objects
+    and the corresponding vision language config as input.
+    Note that the text input is also adjusted to abide by the vllm contract.
+    The text output is sanitized so that it can be compared with hf.
+    """
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_case = [
+        ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
+         [[rescale_image_size(image, factor) for image in images]
+          for factor in size_factors])
+    ]
+
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+
+    # max_model_len should be greater than image_feature_size
+    with vllm_runner(model,
+                     max_model_len=4096,
+                     max_num_seqs=1,
+                     dtype=dtype,
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend,
+                     enforce_eager=True) as vllm_model:
+        tokenizer = vllm_model.model.get_tokenizer()
+        stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
+        vllm_outputs_per_case = [
+            vllm_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                images=images,
+                                                stop_token_ids=stop_token_ids)
+            for prompts, images in inputs_per_case
+        ]
+
+    with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad():
+        hf_processor = hf_model.processor
+        hf_model.processor = lambda **kw: NestedInputs(
+            hf_processor(**kw)  # type: ignore
+        )
+        hf_outputs_per_case = [
+            hf_model.generate_greedy_logprobs_limit(prompts,
+                                                    max_tokens,
+                                                    num_logprobs=num_logprobs,
+                                                    images=images,
+                                                    tokenizer=tokenizer)
+            for prompts, images in inputs_per_case
+        ]
+
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
+                                        vllm_outputs_per_case):
+        check_logprobs_close(
+            outputs_0_lst=[
+                trunc_hf_output(hf_output) for hf_output in hf_outputs
+            ],
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No image
+        [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
+                             size_factors, dtype: str, max_tokens: int,
+                             num_logprobs: int) -> None:
+    run_multi_image_test(
+        hf_runner,
+        vllm_runner,
+        image_assets,
+        model,
+        size_factors=size_factors,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=1,
+    )
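
For reference, each multi-image test case pairs one copy of the two-image prompt per size factor with both assets rescaled by that factor. A rough standalone sketch of the structure built at the top of `run_multi_image_test` (plain PIL images and a local resize helper stand in for the test's image assets and `rescale_image_size`; the names below are illustrative only):

```python
from PIL import Image


def rescale(image: Image.Image, factor: float) -> Image.Image:
    # Stand-in for the rescale helper used by the test: scale both dimensions.
    return image.resize((int(image.width * factor), int(image.height * factor)))


prompt = ("(<image>./</image>)\n(<image>./</image>)\n"
          "Describe these images.")  # two image placeholders per prompt
images = [Image.new("RGB", (640, 480)), Image.new("RGB", (512, 512))]
size_factors = [0.25, 0.5, 1.0]

# One case: the same prompt repeated per factor, paired with both images
# rescaled by that factor (mirrors inputs_per_case in run_multi_image_test).
inputs_per_case = [
    ([prompt for _ in size_factors],
     [[rescale(image, factor) for image in images]
      for factor in size_factors]),
]

prompts, image_lists = inputs_per_case[0]
assert len(prompts) == len(size_factors)                      # one prompt per factor
assert all(len(pair) == len(images) for pair in image_lists)  # both images in each pair
```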