Skip to content

[Frontend] Support configurable mm placeholder strings & flexible video sampling policies via CLI flags. #20105

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
Open
6 changes: 4 additions & 2 deletions tests/async_engine/test_async_llm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
import uuid
from asyncio import CancelledError
from copy import copy
from dataclasses import dataclass
from typing import Optional
from dataclasses import dataclass, field
from typing import Any, Optional

import pytest
import pytest_asyncio
Expand All @@ -32,6 +32,8 @@ class RequestOutput:
@dataclass
class MockModelConfig:
    """Minimal model-config stand-in exposing only the attributes below."""

    # Unannotated, so the dataclass machinery treats this as a plain class
    # attribute rather than an init field.
    use_async_output_proc = True
    # Keyed by modality; each instance gets its own empty dict by default.
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
    # Keyed by modality; each instance gets its own empty dict by default.
    mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)


class MockEngine:
Expand Down
52 changes: 52 additions & 0 deletions tests/engine/test_arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,58 @@ def test_limit_mm_per_prompt_parser(arg, expected):
assert args.limit_mm_per_prompt == expected


@pytest.mark.parametrize(("arg", "expected"), [
    (None, {}),
    (
        '{"video": {"num_frames": 123} }',
        {"video": {"num_frames": 123}},
    ),
    (
        '{"video": {"num_frames": 123, "fps": 1.0, "foo": "bar"}, "image": {"foo": "bar"} }',  # noqa
        {
            "video": {"num_frames": 123, "fps": 1.0, "foo": "bar"},
            "image": {"foo": "bar"},
        },
    ),
])
def test_media_io_kwargs_parser(arg, expected):
    """Check that ``--media-io-kwargs`` JSON parses into a nested dict.

    ``arg`` is the raw CLI value, or ``None`` to omit the flag entirely, in
    which case the parsed attribute is expected to be an empty dict.
    """
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    # Build argv up front so both cases share a single parse call.
    argv = [] if arg is None else ["--media-io-kwargs", arg]
    parsed = parser.parse_args(argv)

    assert parsed.media_io_kwargs == expected


@pytest.mark.parametrize(("arg", "expected"), [
    (None, {}),
    (
        '{"video":"<|video_placeholder|>"}',
        {"video": "<|video_placeholder|>"},
    ),
    (
        '{"video":"<|video_placeholder|>", "image": "<|image_placeholder|>"}',
        {
            "video": "<|video_placeholder|>",
            "image": "<|image_placeholder|>",
        },
    ),
])
def test_mm_placeholder_str_override_parser(arg, expected):
    """Check that ``--mm-placeholder-str-override`` JSON parses into a dict.

    ``arg`` is the raw CLI value, or ``None`` to omit the flag entirely, in
    which case the parsed attribute is expected to be an empty dict.
    """
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    # Build argv up front so both cases share a single parse call.
    argv = [] if arg is None else ["--mm-placeholder-str-override", arg]
    parsed = parser.parse_args(argv)

    assert parsed.mm_placeholder_str_override == expected


def test_compilation_config():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())

Expand Down
6 changes: 4 additions & 2 deletions tests/entrypoints/openai/test_serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

import asyncio
from contextlib import suppress
from dataclasses import dataclass
from typing import Optional
from dataclasses import dataclass, field
from typing import Any, Optional
from unittest.mock import MagicMock

from vllm.config import MultiModalConfig
Expand Down Expand Up @@ -40,6 +40,8 @@ class MockModelConfig:
allowed_local_media_path: str = ""
encoder_config = None
generation_config: str = "auto"
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)

def get_diff_sampling_param(self):
return self.diff_sampling_param or {}
Expand Down
5 changes: 2 additions & 3 deletions tests/multimodal/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,9 +169,8 @@ async def test_fetch_image_error_conversion():
async def test_fetch_video_http(video_url: str, num_frames: int):
connector = MediaConnector()

video_sync = connector.fetch_video(video_url, num_frames=num_frames)
video_async = await connector.fetch_video_async(video_url,
num_frames=num_frames)
video_sync = connector.fetch_video(video_url)
video_async = await connector.fetch_video_async(video_url)
assert np.array_equal(video_sync, video_async)


Expand Down
48 changes: 47 additions & 1 deletion tests/multimodal/test_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
import numpy.typing as npt
import pytest

from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader
from vllm import envs
from vllm.multimodal.image import ImageMediaIO
from vllm.multimodal.video import (VIDEO_LOADER_REGISTRY, VideoLoader,
VideoMediaIO)

NUM_FRAMES = 10
FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
Expand Down Expand Up @@ -40,3 +43,46 @@ def test_video_loader_registry():
def test_video_loader_type_doesnt_exist():
with pytest.raises(AssertionError):
VIDEO_LOADER_REGISTRY.load("non_existing_video_loader")


@VIDEO_LOADER_REGISTRY.register("assert_10_frames_1_fps")
class Assert10Frames1FPSVideoLoader(VideoLoader):
    """Test-only loader that asserts on the sampling arguments it is given.

    Registered in ``VIDEO_LOADER_REGISTRY`` as ``"assert_10_frames_1_fps"``.
    """

    @classmethod
    def load_bytes(cls,
                   data: bytes,
                   num_frames: int = -1,
                   fps: float = -1.0,
                   **kwargs) -> npt.NDArray:
        """Return ``FAKE_OUTPUT_2`` when ``num_frames == 10`` and ``fps == 1.0``.

        ``data`` and any extra ``kwargs`` are accepted but not inspected.

        Raises:
            AssertionError: "bad num_frames" or "bad fps" when the
                corresponding argument does not have the expected value.
        """
        # num_frames is checked first, so calling with the defaults (-1, -1.0)
        # fails with "bad num_frames" rather than "bad fps".
        assert num_frames == 10, "bad num_frames"
        assert fps == 1.0, "bad fps"
        return FAKE_OUTPUT_2


def test_video_media_io_kwargs():
    """Verify that ``VideoMediaIO`` keyword arguments reach the video loader.

    The ``assert_10_frames_1_fps`` backend asserts on the ``num_frames`` and
    ``fps`` it receives, so a successful ``load_bytes`` call means the expected
    values arrived, and an ``AssertionError`` names the argument that did not.
    """
    # Swap in the asserting backend for this test only. The setting lives on
    # the shared ``envs`` module, so it is put back in ``finally``; otherwise
    # every later test in the same process would keep using the asserting
    # loader.
    previous_backend = envs.VLLM_VIDEO_LOADER_BACKEND
    envs.VLLM_VIDEO_LOADER_BACKEND = "assert_10_frames_1_fps"
    try:
        imageio = ImageMediaIO()

        # Verify that different args pass/fail assertions as expected.
        videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 1.0})
        _ = videoio.load_bytes(b"test")

        # Extra keys are accepted alongside the expected ones.
        videoio = VideoMediaIO(
            imageio, **{
                "num_frames": 10,
                "fps": 1.0,
                "not_used": "not_used"
            })
        _ = videoio.load_bytes(b"test")

        # No kwargs at all: the loader's num_frames check fails first.
        with pytest.raises(AssertionError, match="bad num_frames"):
            videoio = VideoMediaIO(imageio, **{})
            _ = videoio.load_bytes(b"test")

        with pytest.raises(AssertionError, match="bad num_frames"):
            videoio = VideoMediaIO(imageio, **{"num_frames": 9, "fps": 1.0})
            _ = videoio.load_bytes(b"test")

        with pytest.raises(AssertionError, match="bad fps"):
            videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 2.0})
            _ = videoio.load_bytes(b"test")
    finally:
        envs.VLLM_VIDEO_LOADER_BACKEND = previous_backend
16 changes: 16 additions & 0 deletions vllm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,12 @@ class ModelConfig:
limit_mm_per_prompt: dict[str, int] = field(default_factory=dict)
"""Maximum number of data items per modality per prompt. Only applicable
for multimodal models."""
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'` """
mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)
"""Optionally override placeholder string for given modalities."""
use_async_output_proc: bool = True
"""Whether to use async output processor."""
config_format: Union[str, ConfigFormat] = ConfigFormat.AUTO.value
Expand Down Expand Up @@ -690,6 +696,8 @@ def _init_multimodal_config(self) -> Optional["MultiModalConfig"]:
if self.registry.is_multimodal_model(self.architectures):
return MultiModalConfig(
limit_per_prompt=self.limit_mm_per_prompt,
media_io_kwargs=self.media_io_kwargs,
mm_placeholder_str_override=self.mm_placeholder_str_override,
mm_processor_kwargs=self.mm_processor_kwargs,
disable_mm_preprocessor_cache=self.
disable_mm_preprocessor_cache)
Expand Down Expand Up @@ -3000,6 +3008,14 @@ class MultiModalConfig:
`{"images": 16, "videos": 2}`
"""

media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'` """

mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)
"""Optionally override placeholder string for given modalities."""

mm_processor_kwargs: Optional[dict[str, object]] = None
"""
Overrides for the multi-modal processor obtained from
Expand Down
12 changes: 12 additions & 0 deletions vllm/engine/arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,11 @@ class EngineArgs:
get_field(TokenizerPoolConfig, "extra_config")
limit_mm_per_prompt: dict[str, int] = \
get_field(MultiModalConfig, "limit_per_prompt")
media_io_kwargs: dict[str, dict[str,
Any]] = get_field(MultiModalConfig,
"media_io_kwargs")
mm_placeholder_str_override: dict[str, str] = \
get_field(MultiModalConfig, "mm_placeholder_str_override")
mm_processor_kwargs: Optional[Dict[str, Any]] = \
MultiModalConfig.mm_processor_kwargs
disable_mm_preprocessor_cache: bool = \
Expand Down Expand Up @@ -714,6 +719,11 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
)
multimodal_group.add_argument("--limit-mm-per-prompt",
**multimodal_kwargs["limit_per_prompt"])
multimodal_group.add_argument("--media-io-kwargs",
**multimodal_kwargs["media_io_kwargs"])
multimodal_group.add_argument(
"--mm-placeholder-str-override",
**multimodal_kwargs["mm_placeholder_str_override"])
multimodal_group.add_argument(
"--mm-processor-kwargs",
**multimodal_kwargs["mm_processor_kwargs"])
Expand Down Expand Up @@ -938,6 +948,8 @@ def create_model_config(self) -> ModelConfig:
enable_prompt_embeds=self.enable_prompt_embeds,
served_model_name=self.served_model_name,
limit_mm_per_prompt=self.limit_mm_per_prompt,
media_io_kwargs=self.media_io_kwargs,
mm_placeholder_str_override=self.mm_placeholder_str_override,
use_async_output_proc=not self.disable_async_output_proc,
config_format=self.config_format,
mm_processor_kwargs=self.mm_processor_kwargs,
Expand Down
11 changes: 8 additions & 3 deletions vllm/entrypoints/chat_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,9 @@ def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str:

def _placeholder_str(self, modality: ModalityStr,
current_count: int) -> Optional[str]:
if modality in self._model_config.mm_placeholder_str_override:
return self._model_config.mm_placeholder_str_override[modality]

# TODO: Let user specify how to insert image tokens into prompt
# (similar to chat template)
hf_config = self._model_config.hf_config
Expand Down Expand Up @@ -720,6 +723,7 @@ def __init__(self, tracker: MultiModalItemTracker) -> None:
self._tracker = tracker

self._connector = MediaConnector(
media_io_kwargs=self._tracker._model_config.media_io_kwargs,
allowed_local_media_path=tracker.allowed_local_media_path,
)

Expand Down Expand Up @@ -758,7 +762,7 @@ def parse_input_audio(self, input_audio: InputAudio) -> None:
return self.parse_audio(audio_url)

def parse_video(self, video_url: str) -> None:
video = self._connector.fetch_video(video_url)
video = self._connector.fetch_video(video_url=video_url)

placeholder = self._tracker.add("video", video)
self._add_placeholder(placeholder)
Expand All @@ -771,7 +775,8 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None:

self._tracker = tracker
self._connector = MediaConnector(
allowed_local_media_path=tracker.allowed_local_media_path,
media_io_kwargs=self._tracker._model_config.media_io_kwargs,
allowed_local_media_path=tracker.allowed_local_media_path
)

def parse_image(self, image_url: str) -> None:
Expand Down Expand Up @@ -813,7 +818,7 @@ def parse_input_audio(self, input_audio: InputAudio) -> None:
return self.parse_audio(audio_url)

def parse_video(self, video_url: str) -> None:
video = self._connector.fetch_video_async(video_url)
video = self._connector.fetch_video_async(video_url=video_url)

placeholder = self._tracker.add("video", video)
self._add_placeholder(placeholder)
Expand Down
10 changes: 10 additions & 0 deletions vllm/multimodal/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,16 @@ def resample(

class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):

def __init__(self, **kwargs) -> None:
    """Store arbitrary keyword arguments on the instance as ``self.kwargs``."""
    super().__init__()

    # `kwargs` contains custom arguments from
    # --media-io-kwargs for this modality.
    # They can be passed to the underlying
    # media loaders (e.g. custom implementations)
    # for flexible control.
    self.kwargs = kwargs

def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
    """Decode in-memory audio bytes with ``librosa.load``.

    Returns whatever ``librosa.load`` returns, annotated here as an
    ``(array, float)`` pair.
    """
    # sr=None is forwarded to librosa.load; per the librosa docs this keeps
    # the source's native sampling rate instead of resampling.
    return librosa.load(BytesIO(data), sr=None)

Expand Down
8 changes: 7 additions & 1 deletion vllm/multimodal/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,16 @@ def convert_image_mode(image: Image.Image, to_mode: str):

class ImageMediaIO(MediaIO[Image.Image]):

def __init__(self, *, image_mode: str = "RGB") -> None:
def __init__(self, image_mode: str = "RGB", **kwargs) -> None:
super().__init__()

self.image_mode = image_mode
# `kwargs` contains custom arguments from
# --media-io-kwargs for this modality.
# They can be passed to the underlying
# media loaders (e.g. custom implementations)
# for flexible control.
self.kwargs = kwargs

def load_bytes(self, data: bytes) -> Image.Image:
image = Image.open(BytesIO(data))
Expand Down
Loading