@@ -28,12 +28,16 @@
 
 import torch
 from torch import nn
-from transformers import BatchFeature
+from transformers import BatchFeature, PretrainedConfig
 from transformers.modeling_outputs import BaseModelOutputWithPast
 from transformers.models.whisper.modeling_whisper import (
     ACT2FN, WHISPER_ATTENTION_CLASSES, WhisperConfig, WhisperEncoder)
 
 from vllm.config import VllmConfig
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.quantization.gptq import GPTQConfig
+from vllm.model_executor.layers.quantization.gptq_marlin import (
+    GPTQMarlinConfig)
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     NestedTensors)
@@ -512,6 +516,36 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         self.audio_token_id = None
 
+    def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
+        # GPTQ configs do not have a list of ignored modules; however,
+        # AutoGPTQ seems to avoid vision encoder sections for some models.
+        # See: https://huggingface.co/openbmb/MiniCPM-o-2_6-int4
+        if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)):
+            return None
+        return quant_config
+
+    def init_vision_module(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        # The MiniCPM-o GPTQ model leaves vpm unquantized.
+        quant_config = self._maybe_ignore_quant_config(quant_config)
+        return super().init_vision_module(config, quant_config, prefix)
+
+    def init_resampler(
+        self,
+        embed_dim: int,
+        vision_dim: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        # The MiniCPM-o GPTQ model leaves the resampler unquantized.
+        quant_config = self._maybe_ignore_quant_config(quant_config)
+        return super().init_resampler(embed_dim, vision_dim, quant_config,
+                                      prefix)
+
     def init_audio_module(self, *, vllm_config: VllmConfig, prefix: str = ""):
         # Do not use parameters temporarily
         audio_config = self.config.audio_config
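
The pattern above drops the quantization config before delegating to the parent class, so the affected submodules are constructed with ordinary unquantized layers. A minimal self-contained sketch of that pattern follows; the FakeGPTQConfig class, maybe_ignore_quant_config helper, and make_linear factory are stand-ins invented for illustration, not vLLM APIs.

# Sketch of the "ignore quant config" pattern, under the assumption that a
# layer factory falls back to plain nn.Linear when quant_config is None.
from typing import Optional

from torch import nn


class FakeGPTQConfig:  # hypothetical stand-in for GPTQConfig/GPTQMarlinConfig
    pass


def maybe_ignore_quant_config(
        quant_config: Optional[FakeGPTQConfig]) -> Optional[FakeGPTQConfig]:
    # GPTQ-style configs carry no per-module ignore list, so the caller must
    # drop them for submodules the checkpoint left unquantized.
    if isinstance(quant_config, FakeGPTQConfig):
        return None
    return quant_config


def make_linear(in_dim: int, out_dim: int,
                quant_config: Optional[FakeGPTQConfig]) -> nn.Module:
    if quant_config is None:
        # No quant config: build a plain, unquantized linear layer.
        return nn.Linear(in_dim, out_dim)
    raise NotImplementedError("quantized path elided in this sketch")


# The vision projection is built unquantized even though a GPTQ config exists.
vision_proj = make_linear(16, 32,
                          maybe_ignore_quant_config(FakeGPTQConfig()))
print(type(vision_proj))  # <class 'torch.nn.modules.linear.Linear'>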