
[Model] Support Pixtral models in the HF Transformers format #9036

Merged
merged 15 commits on Oct 18, 2024
Merge branch 'main' into support-pixtral-hf-format
mgoin committed Oct 16, 2024
commit 9cc49d4fccc9c9a313b52ba5a6a08dce8d2e75f0
6 changes: 3 additions & 3 deletions examples/offline_inference_vision_language.py
@@ -278,7 +278,7 @@ def run_qwen2_vl(question: str, modality: str):


# Pixtral
-def run_pixtral(question, modality):
+def run_pixtral(question: str, modality: str):
assert modality == "image"

model_name = "mistral-community/pixtral-12b"
@@ -295,8 +295,8 @@ def run_pixtral(question, modality):
return llm, prompt, stop_token_ids


-# LLama
-def run_mllama(question, modality):
+# LLama 3.2
+def run_mllama(question: str, modality: str):
assert modality == "image"

model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
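For context, the example script drives runner functions like run_pixtral above through a small generate loop. A hedged usage sketch follows; the question, image path, and sampling settings are illustrative assumptions, not part of this diff.

from PIL import Image
from vllm import SamplingParams

# Hypothetical driver code: run_pixtral builds the LLM and prompt as shown above.
llm, prompt, stop_token_ids = run_pixtral("What is in this image?", "image")

image = Image.open("example.jpg")  # any local image file (assumed to exist)
sampling_params = SamplingParams(temperature=0.2,
                                 max_tokens=64,
                                 stop_token_ids=stop_token_ids)

# vLLM accepts a prompt dict carrying the multimodal input alongside the text.
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)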
4 changes: 1 addition & 3 deletions vllm/model_executor/models/pixtral.py
@@ -17,8 +17,7 @@

from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, ModelConfig, MultiModalConfig
-from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
-from vllm.model_executor.layers.activation import get_act_fn
+from vllm.inputs import INPUT_REGISTRY, DecoderOnlyInputs, InputContext, LLMInputs
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
@@ -823,7 +822,6 @@ def __init__(self, config: PixtralVisionConfig):
self.down_proj = nn.Linear(config.intermediate_size,
config.hidden_size,
bias=False)
-self.act_fn = get_act_fn(config.hidden_act)

def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
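For reference, the block touched here is a gated (SwiGLU-style) MLP whose forward pass applies F.silu inline, which is why the get_act_fn import and the unused self.act_fn attribute are removed. A minimal standalone sketch of the same computation, with an illustrative class name and sizes rather than vLLM's actual layer abstractions:

import torch
import torch.nn as nn
import torch.nn.functional as F


class GatedMLP(nn.Module):
    """Illustrative SwiGLU-style feed-forward block."""

    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # SiLU is applied inline, so no separate act_fn attribute is needed.
        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))


# Quick shape check: the output matches the input hidden size.
mlp = GatedMLP(hidden_size=1024, intermediate_size=4096)
print(mlp(torch.randn(2, 1024)).shape)  # torch.Size([2, 1024])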
4 changes: 1 addition & 3 deletions vllm/model_executor/models/qwen2_vl.py
@@ -62,10 +62,8 @@
from vllm.multimodal.base import MultiModalData
from vllm.multimodal.image import cached_get_image_processor
from vllm.sequence import IntermediateTensors, SequenceData
-from vllm.transformers_utils.configs.qwen2vl import (Qwen2VLConfig,
-                                                      Qwen2VLVisionConfig)
from vllm.transformers_utils.processor import cached_get_processor
-from vllm.utils import is_cpu
+from vllm.transformers_utils.config import uses_mrope

from .interfaces import SupportsMultiModal, SupportsPP
from .utils import (PPMissingLayer, get_vit_attn_backend,
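The import change drops the vendored Qwen2-VL config classes and the is_cpu helper in favor of a shared uses_mrope check. A plausible sketch of such a helper, under the assumption that it only inspects the HF config's rope_scaling dict (the real vLLM implementation may differ):

from transformers import PretrainedConfig


def uses_mrope(config: PretrainedConfig) -> bool:
    """Hypothetical check for multimodal rotary position embeddings, assumed
    to be signalled by an "mrope_section" entry in rope_scaling."""
    rope_scaling = getattr(config, "rope_scaling", None)
    if rope_scaling is None:
        return False
    return "mrope_section" in rope_scaling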
You are viewing a condensed version of this merge commit.