[Misc] Clean up Kimi-VL #16833

Merged · 1 commit · Apr 18, 2025
examples/offline_inference/vision_language.py: 4 changes (2 additions, 2 deletions)
@@ -376,9 +376,9 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:

    engine_args = EngineArgs(
        model="moonshotai/Kimi-VL-A3B-Instruct",
        max_model_len=4096,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
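For anyone who wants to poke at the reordered engine arguments outside the example script, here is a minimal offline sketch (not part of this PR). The image path, the question, and the use of AutoProcessor.apply_chat_template with this message layout are assumptions for illustration, not code from the example.

```python
from PIL import Image
from transformers import AutoProcessor
from vllm import LLM, SamplingParams

MODEL = "moonshotai/Kimi-VL-A3B-Instruct"

# Mirror the EngineArgs used by run_kimi_vl above.
llm = LLM(
    model=MODEL,
    trust_remote_code=True,
    max_model_len=4096,
    limit_mm_per_prompt={"image": 1},
)

# Build the prompt from the model's own chat template instead of
# hard-coding the image placeholder token (assumed message layout).
processor = AutoProcessor.from_pretrained(MODEL, trust_remote_code=True)
messages = [{
    "role": "user",
    "content": [{"type": "image"},
                {"type": "text", "text": "What is in this image?"}],
}]
prompt = processor.apply_chat_template(messages,
                                       tokenize=False,
                                       add_generation_prompt=True)

image = Image.open("image.jpg").convert("RGB")  # placeholder path
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```

limit_mm_per_prompt={"image": 1} caps the number of images the profiler reserves memory for per request, which is why the single-image example pins it to 1.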
examples/offline_inference/vision_language_multi_image.py: 3 changes (1 addition, 2 deletions)
@@ -331,11 +331,10 @@ def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=4,
        tensor_parallel_size=1,
        limit_mm_per_prompt={"image": len(image_urls)},
        trust_remote_code=True,
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
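A rough sketch of how the placeholders built at the end of load_kimi_vl turn into a prompt. The URLs and question are made up, and the assumption that the HF processor's chat template accepts this message layout is mine, not something the PR changes.

```python
from transformers import AutoProcessor

model_name = "moonshotai/Kimi-VL-A3B-Instruct"
image_urls = [
    "https://example.com/a.jpg",  # placeholder URLs
    "https://example.com/b.jpg",
]
question = "What are the differences between these images?"

processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

# One placeholder per image, followed by the question; limit_mm_per_prompt
# above must allow at least len(image_urls) images per request.
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [{
    "role": "user",
    "content": [*placeholders, {"type": "text", "text": question}],
}]
prompt = processor.apply_chat_template(messages,
                                       tokenize=False,
                                       add_generation_prompt=True)
print(prompt)
```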
vllm/model_executor/models/kimi_vl.py: 57 changes (17 additions, 40 deletions)
@@ -56,7 +56,6 @@
from vllm.config import VllmConfig
from vllm.distributed import (get_tensor_model_parallel_rank,
                              get_tensor_model_parallel_world_size)
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
@@ -70,22 +69,20 @@
from vllm.model_executor.models.utils import merge_multimodal_embeddings
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                    NestedTensors)
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                    MultiModalKwargs, NestedTensors)
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                   MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                        BaseProcessingInfo, PromptReplacement,
                                        PromptUpdate)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config

from .utils import is_pp_missing_parameter, maybe_prefix

logger = init_logger(__name__)


# For dummy input only
@dataclass
@@ -143,6 +140,9 @@ class KimiVLProcessingInfo(BaseProcessingInfo):
    def get_hf_config(self):
        return self.ctx.get_hf_config(KimiVLConfig)

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

    def get_num_image_tokens(
        self,
        *,
@@ -180,58 +180,35 @@ def get_num_image_tokens(
        token_width = (width + pad_width) // (kernel_size[1] * patch_size)
        return int(token_height * token_width)

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        # None means unlimited
        return {"image": None}

    def get_mm_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> Mapping[str, int]:
        return {
            "image":
            self.get_num_image_tokens(
                image_width=MaxImageTokenMeta.width,
                image_height=MaxImageTokenMeta.height,
            ),
        }

    @property
    def image_token_id(self) -> int:
        return self.get_hf_config().media_placeholder_token_id


class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]):

    def __init__(self, info: KimiVLProcessingInfo) -> None:
        super().__init__(info)
    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        num_images = mm_counts.get("image", 0)

        processor = self.info.get_hf_processor()
        image_token = processor.image_token

        self.image_token_id = self.info.image_token_id
        self.image_token = self.info.get_tokenizer().decode(
            self.image_token_id)
        return image_token * num_images

    def get_dummy_processor_inputs(
    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> ProcessorInputs:
    ) -> MultiModalDataDict:
        num_images = mm_counts.get("image", 0)

        width = MaxImageTokenMeta.width
        height = MaxImageTokenMeta.height
        mm_data = {
        return {
            "image":
            self._get_dummy_images(width=width,
                                   height=height,
            self._get_dummy_images(width=MaxImageTokenMeta.width,
                                   height=MaxImageTokenMeta.height,
                                   num_images=num_images)
        }

        return ProcessorInputs(
            prompt_text=self.image_token * num_images,
            mm_data=mm_data,
        )


class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):

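The refactor leaves the token-count arithmetic in get_num_image_tokens intact: pad each image dimension, divide by kernel_size * patch_size along each axis, and multiply the two counts. A standalone sketch of that math follows; patch_size=14, kernel_size=(2, 2), and the round-up-to-a-full-stride padding are assumptions for illustration, not values read from the MoonViT config.

```python
def num_image_tokens(width: int,
                     height: int,
                     patch_size: int = 14,
                     kernel_size: tuple[int, int] = (2, 2)) -> int:
    """Hypothetical re-derivation of get_num_image_tokens.

    patch_size=14 and kernel_size=(2, 2) are assumed defaults, not values
    taken from the real MoonViT config.
    """
    stride_h = kernel_size[0] * patch_size
    stride_w = kernel_size[1] * patch_size
    # Assume pad_height/pad_width round each dimension up to a full stride.
    pad_height = (-height) % stride_h
    pad_width = (-width) % stride_w
    token_height = (height + pad_height) // stride_h
    token_width = (width + pad_width) // stride_w
    return token_height * token_width


# A 448x448 image -> (448 // 28) ** 2 = 256 tokens under these assumptions.
print(num_image_tokens(448, 448))
```

This is the quantity the removed get_mm_max_tokens_per_item used to report at the MaxImageTokenMeta bounds.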