Fix phi-3.5-vision-instruct

vllm-project · DarkLight1337 · Aug 22, 2024 · Aug 20, 2024 · Aug 20, 2024 · Aug 21, 2024
commit d35272fe168adb02bc814f5fdff711003ffb957f
@@ -21,7 +21,7 @@
     "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",
 })
 
-models = ["microsoft/Phi-3-vision-128k-instruct"]
+models = ["microsoft/Phi-3.5-vision-instruct"]
 
 
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,

diff --git a/vllm/config.py b/vllm/config.py
@@ -13,7 +13,9 @@
 from vllm.model_executor.models import ModelRegistry
 from vllm.platforms import current_platform
 from vllm.tracing import is_otel_available, otel_import_error_traceback
-from vllm.transformers_utils.config import get_config, get_hf_text_config
+from vllm.transformers_utils.config import (get_config,
+                                            get_hf_image_processor_config,
+                                            get_hf_text_config)
 from vllm.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH, GiB_bytes,
                         cuda_device_count_stateless, get_cpu_memory, is_cpu,
                         is_hip, is_neuron, is_openvino, is_xpu,
@@ -166,6 +168,8 @@ def __init__(
         self.hf_config = get_config(self.model, trust_remote_code, revision,
                                     code_revision, rope_scaling, rope_theta)
         self.hf_text_config = get_hf_text_config(self.hf_config)
+        self.hf_image_processor_config = get_hf_image_processor_config(
+            self.model, trust_remote_code, revision)
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
 
         # Choose a default enforce_eager value if the user did not specify

diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
@@ -55,6 +55,18 @@ def get_hf_config(self, hf_config_type: Type[C] = PretrainedConfig) -> C:
 
         return hf_config
 
+    def get_hf_image_processor_config(self) -> Dict:
+        """
+        Get the HuggingFace image processor configuration of the model.
+
+        Returns:
+            The dictionary stored in
+            ``self.model_config.hf_image_processor_config``, returned as-is
+            without any type checking.
+        """
+
+        return self.model_config.hf_image_processor_config
+
 
 N = TypeVar("N", bound=Type[nn.Module])
 

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
@@ -329,7 +329,7 @@ def get_phi3v_image_feature_size(
     input_height: int,
     input_width: int,
 ) -> int:
-    num_crops = getattr(hf_config, "num_crops", 16)
+    num_crops = hf_config.get("num_crops", 4)
     new_width, new_height = _calc_hd_transform_size(width=input_width,
                                                     height=input_height,
                                                     hd_num=num_crops)
@@ -341,7 +341,7 @@ def get_phi3v_image_feature_size(
 def get_max_phi3v_image_tokens(ctx: InputContext):
 
     return get_phi3v_image_feature_size(
-        ctx.get_hf_config(),
+        ctx.get_hf_image_processor_config(),
         input_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
         input_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
     )
@@ -395,7 +395,7 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
         return llm_inputs
 
     model_config = ctx.model_config
-    hf_config = ctx.get_hf_config()
+    hf_config = ctx.get_hf_image_processor_config()
 
     image_data = multi_modal_data["image"]
     if isinstance(image_data, Image.Image):

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
@@ -3,6 +3,8 @@
 from typing import Dict, Optional, Type, Union
 
 from transformers import GenerationConfig, PretrainedConfig
+from transformers.models.auto.image_processing_auto import (
+    get_image_processor_config)
 from transformers.models.auto.modeling_auto import (
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
 
@@ -97,6 +99,33 @@ def get_config(
     return config
 
 
+def get_hf_image_processor_config(
+    model: Union[str, Path],
+    trust_remote_code: bool,
+    revision: Optional[str] = None,
+    **kwargs,
+) -> Dict:
+    """Load the image processor config of ``model`` via transformers.
+
+    When ``trust_remote_code`` is false, a ``ValueError`` asking to execute
+    the configuration file is re-raised as ``RuntimeError`` with a hint.
+    """
+    try:
+        return get_image_processor_config(model,
+                                          trust_remote_code=trust_remote_code,
+                                          revision=revision,
+                                          **kwargs)
+    except ValueError as e:
+        remote_code_hint = "requires you to execute the configuration file"
+        if trust_remote_code or remote_code_hint not in str(e):
+            raise e
+        raise RuntimeError(
+            "Failed to load the model config. If the model is a custom "
+            "model not yet available in the HuggingFace transformers "
+            "library, consider setting `trust_remote_code=True` in LLM "
+            "or using the `--trust-remote-code` flag in the CLI.") from e
+
+
 def get_hf_text_config(config: PretrainedConfig):
     """Get the "sub" config relevant to llm for multi modal models.
         No op for pure text models.