add dummy config

huggingface · Jul 4, 2024 · 55e8811 · 55e8811
1 parent bfa2eb8
commit 55e8811
Showing 1 changed file with 133 additions and 0 deletions.
diff --git a/src/transformers/models/mllama/configuration_mllama.py b/src/transformers/models/mllama/configuration_mllama.py
@@ -0,0 +1,133 @@
+# coding=utf-8
+# Copyright 2024 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Mllama model configuration"""
+
+import warnings
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING
+
+
+logger = logging.get_logger(__name__)
+
+
+class MllamaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MllamaForConditionalGeneration`]. It is used to instantiate an
+    Mllama model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Mllama-9B.
+
+    e.g. [mllama-hf/mllama-9b](https://huggingface.co/mllama-hf/mllama-9b)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `CLIPVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
+            The config object or dictionary of the text backbone.
+        ignore_index (`int`, *optional*, defaults to -100):
+            The ignore index for the loss function.
+        image_token_index (`int`, *optional*, defaults to 32000):
+            The image token index to encode the image prompt.
+        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The activation function used by the multimodal projector.
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Can be one of `"default"` or `"full"`.
+        vision_feature_layer (`int`, *optional*, defaults to -2):
+            The index of the layer to select the vision feature.
+
+    Example:
+
+    ```python
+    >>> from transformers import MllamaForConditionalGeneration, MllamaConfig, CLIPVisionConfig, LlamaConfig
+
+    >>> # Initializing a CLIP-vision config
+    >>> vision_config = CLIPVisionConfig()
+
+    >>> # Initializing a Llama config
+    >>> text_config = LlamaConfig()
+
+    >>> # Initializing a Mllama mllama-1.5-7b style configuration
+    >>> configuration = MllamaConfig(vision_config, text_config)
+
+    >>> # Initializing a model from the mllama-1.5-7b style configuration
+    >>> model = MllamaForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "mllama"
+    is_composition = False
+
+    def __init__(
+        self,
+        vision_config=None,
+        global_vision_config=None,
+        text_config=None,
+        **kwargs,
+    ):
+
+        if isinstance(vision_config, dict):
+            vision_config["model_type"] = (
+                vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
+            )
+            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+        elif vision_config is None:
+            vision_config = CONFIG_MAPPING["clip_vision_model"](
+                intermediate_size=4096,
+                hidden_size=1024,
+                patch_size=14,
+                image_size=336,
+                num_hidden_layers=32,
+                num_attention_heads=16,
+                vocab_size=32000,
+                projection_dim=768,
+            )
+
+        self.vision_config = vision_config
+
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["llama"](
+
+            )
+
+        self.text_config = text_config
+        self._vocab_size = self.text_config.vocab_size
+
+        super().__init__(**kwargs)
+
+    @property
+    def vocab_size(self):
+        warnings.warn(
+            "The `vocab_size` attribute is deprecated and will be removed in v4.42, Please use `text_config.vocab_size` instead.",
+            FutureWarning,
+        )
+        return self._vocab_size
+
+    @vocab_size.setter
+    def vocab_size(self, value):
+        self._vocab_size = value
+
+    def to_dict(self):
+        output = super().to_dict()
+        output.pop("_vocab_size", None)
+        return output