@@ -1,15 +1,15 @@
 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py
 """Inference-only Deepseek-VL2 model compatible with HuggingFace weights."""
 import math
-from functools import cached_property, partial
+from functools import cached_property
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
                     TypedDict, Union)

 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, repeat
-from transformers import AutoProcessor, BatchFeature, ProcessorMixin
+from transformers import BatchFeature

 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
@@ -31,6 +31,8 @@
 from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
                                                           MlpProjectorConfig,
                                                           VisionEncoderConfig)
+from vllm.transformers_utils.processors.deepseek_vl2 import (
+    DeepseekVLV2Processor)
 from vllm.utils import is_list_of

 from .interfaces import SupportsMultiModal, SupportsPP
@@ -129,25 +131,8 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
     def get_hf_config(self):
         return self.ctx.get_hf_config(DeepseekVLV2Config)

-    def get_hf_processor(self) -> ProcessorMixin:
-        # TODO(Isotr0py): we should get rid of dependency on deepseek_vl2
-        # in the future, because it's flasky and lack of maintenance.
-        try:
-            from deepseek_vl2.models.processing_deepseek_vl_v2 import (
-                DeepseekVLV2Processor, select_best_resolution)
-            AutoProcessor.register("DeepseekVLV2Processor",
-                                   DeepseekVLV2Processor)
-        except ModuleNotFoundError as exc:
-            raise ModuleNotFoundError(
-                "You need to `pip install "
-                "git+https://github.com/deepseek-ai/DeepSeek-VL2.git` "
-                "to use this model") from exc
-
-        processor = self.ctx.get_hf_processor(DeepseekVLV2Processor)
-        processor.select_best_resolution = partial(
-            select_best_resolution,
-            candidate_resolutions=processor.candidate_resolutions)
-        return processor
+    def get_hf_processor(self) -> DeepseekVLV2Processor:
+        return self.ctx.get_hf_processor(DeepseekVLV2Processor)

     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}

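Note on the removed monkey-patching: the deleted `get_hf_processor` pulled `select_best_resolution` from the external `deepseek_vl2` package and bound it to the processor's `candidate_resolutions`; with the in-tree `DeepseekVLV2Processor`, both the `pip install git+https://github.com/deepseek-ai/DeepSeek-VL2.git` step and that patching become unnecessary. For readers unfamiliar with the helper, the sketch below shows a generic best-fit resolution picker of the kind such tiling processors use; `select_best_resolution_sketch` is a hypothetical stand-in for illustration, not DeepSeek-VL2's exact algorithm.

# Generic sketch, not DeepSeek-VL2's exact rule: pick the candidate
# (width, height) that preserves the most image area after an
# aspect-ratio-preserving resize, breaking ties by least wasted area.
from typing import List, Tuple

def select_best_resolution_sketch(
    image_size: Tuple[int, int],
    candidate_resolutions: List[Tuple[int, int]],
) -> Tuple[int, int]:
    orig_w, orig_h = image_size
    best = None
    best_effective, best_waste = -1, float("inf")
    for cand_w, cand_h in candidate_resolutions:
        scale = min(cand_w / orig_w, cand_h / orig_h)
        scaled_w, scaled_h = int(orig_w * scale), int(orig_h * scale)
        effective = min(scaled_w * scaled_h, orig_w * orig_h)
        waste = cand_w * cand_h - effective
        if (effective > best_effective
                or (effective == best_effective and waste < best_waste)):
            best = (cand_w, cand_h)
            best_effective, best_waste = effective, waste
    return best

# A 1280x720 image fits a wide 768x384 grid better than a square 384x384 one.
print(select_best_resolution_sketch((1280, 720),
                                    [(384, 384), (768, 384), (1152, 384)]))
# -> (768, 384)
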
@@ -224,31 +209,21 @@ def _call_hf_processor(
         mm_kwargs: Mapping[str, object],
     ) -> BatchFeature:
         if mm_data:
-            outputs = self.info.ctx.call_hf_processor(
+            processed_outputs = self.info.ctx.call_hf_processor(
                 self.info.get_hf_processor(**mm_kwargs),
                 dict(prompt=prompt, **mm_data),
                 mm_kwargs,
             )
-
-            # Deepseek-vl2 processor don't return BatchFeature,
-            # we need to manually create it
-            processed_outputs = dict(input_ids=outputs["input_ids"])
-            processed_outputs = BatchFeature(data=dict(processed_outputs),
-                                             tensor_type="pt")
-
-            # Remove batch dimension from processor outputs,
-            # because we will try batch to create NestedTensors
             target_dtype = self.info.ctx.model_config.dtype
-            pixel_values = outputs["images"].to(target_dtype).squeeze(0)
-            images_spatial_crop = outputs["images_spatial_crop"].squeeze(0)
+            pixel_values = processed_outputs.pop("pixel_values").to(
+                target_dtype)
+            # split pixel values into patches corresponding to each image
+            images_spatial_crop = processed_outputs["images_spatial_crop"]
             patches_per_image = [
                 x.prod().item() + 1 for x in images_spatial_crop
             ]
-
-            # Rename `images` -> `pixel_values` to avoid confusion
-            processed_outputs["pixel_values"] = list(
-                pixel_values.split(patches_per_image))
-            processed_outputs["images_spatial_crop"] = images_spatial_crop
+            pixel_values = pixel_values.split(patches_per_image)
+            processed_outputs["pixel_values"] = pixel_values
         else:
             tokenizer = self.info.get_tokenizer()
             processed_outputs = tokenizer(prompt,
