from typing import List, Tuple

import pytest
from transformers import AutoTokenizer

from vllm.config import VisionLanguageConfig

from ..conftest import IMAGE_FILES

pytestmark = pytest.mark.llava

_PREFACE = (
    "A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the human's "
    "questions.")

# The image token is deliberately placed before "USER" so that the test can pass.
HF_IMAGE_PROMPTS = [
    f"{_PREFACE} <image>\nUSER: What's the content of the image? ASSISTANT:",
    f"{_PREFACE} <image>\nUSER: What is the season? ASSISTANT:",
]

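# Prompts and image fixtures are paired by index.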
assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES)


def iter_llava_next_configs(model_name: str):
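    """Yield (model name, vision-language config) pairs for parametrization.

    The expected image feature size varies with the input resolution, since
    LLaVA-NeXT tiles larger and non-square images into multiple patches; the
    sizes below are the values assumed for the tested resolutions.
    """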
    image_hw_to_feature_size = {
        (336, 336): 1176,
        (672, 672): 2928,
        (1344, 336): 1944,
        (336, 1344): 1890,
    }

    for (h, w), f in image_hw_to_feature_size.items():
        for input_type, input_shape in [
            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
        ]:
            yield (model_name,
                   VisionLanguageConfig(image_input_type=input_type,
                                        image_feature_size=f,
                                        image_token_id=32000,
                                        image_input_shape=input_shape,
                                        image_processor=model_name,
                                        image_processor_revision=None))


model_and_vl_config = [
    *iter_llava_next_configs("llava-hf/llava-v1.6-vicuna-7b-hf"),
]


def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
                      vlm_config: VisionLanguageConfig, model_id: str):
    """Sanitize vLLM output to be comparable with HF output.

    The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
    x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
    It also reduces `output_str` from "<image><image>bla" to "bla".
    """
    input_ids, output_str = vllm_output
    image_token_id = vlm_config.image_token_id

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    image_token_str = tokenizer.decode(image_token_id)

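    # Keep only the first token of each consecutive run of image tokens, so
    # the expanded vLLM prompt lines up with HF's single <image> placeholder.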
    hf_input_ids = [
        input_id for idx, input_id in enumerate(input_ids)
        if input_id != image_token_id or input_ids[idx - 1] != image_token_id
    ]
    hf_output_str = output_str \
        .replace(image_token_str * vlm_config.image_feature_size, " ")

    return hf_input_ids, hf_output_str


@pytest.mark.xfail(
    reason="Inconsistent image processor being used due to lack "
    "of support for dynamic image token replacement")
@pytest.mark.parametrize("model_and_config", model_and_vl_config)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
                model_and_config, dtype: str, max_tokens: int) -> None:
    """Inference result should be the same between HF and vLLM.

    All the image fixtures for the test are under tests/images.
    For the HF runner, we provide the PIL images as input.
    For the vLLM runner, we provide MultiModalData objects and the
    corresponding vision-language config as input.
    Note that the text input is also adjusted to conform to the vLLM contract.
    The text output is sanitized so that it can be compared with the HF output.
    """
    model_id, vlm_config = model_and_config

    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
                                              max_tokens,
                                              images=hf_images)

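    # The vLLM contract used here expects the "<image>" placeholder to be
    # repeated once per image feature so the prompt reserves space for the
    # image embeddings; HF only needs the single placeholder.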
    vllm_image_prompts = [
        p.replace("<image>", "<image>" * vlm_config.image_feature_size)
        for p in HF_IMAGE_PROMPTS
    ]

    with vllm_runner(
            model_id,
            dtype=dtype,
            # must be large enough to fit the expanded image tokens
            # (image_feature_size) plus the text prompt
            max_model_len=4096,
            enforce_eager=True,
            **vlm_config.as_cli_args_dict(),
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                                  max_tokens,
                                                  images=vllm_images)

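    # Compare the HF output against the sanitized vLLM output, prompt by
    # prompt.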
    for i in range(len(HF_IMAGE_PROMPTS)):
        hf_output_ids, hf_output_str = hf_outputs[i]
        vllm_output_ids, vllm_output_str = vllm_to_hf_output(
            vllm_outputs[i], vlm_config, model_id)
        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")