mindspore-lab · lvyufeng · Mar 12, 2024 · Mar 12, 2024
diff --git a/mindnlp/transformers/modeling_attn_mask_utils.py b/mindnlp/transformers/modeling_attn_mask_utils.py
@@ -110,10 +110,16 @@ def to_4d(
 
         # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
         expanded_attn_mask = self._expand_mask(attention_mask_2d, dtype, tgt_len=input_shape[-1])
-        expanded_4d_mask = expanded_attn_mask if causal_4d_mask is None else expanded_attn_mask + causal_4d_mask
+
+        if causal_4d_mask is not None:
+            expanded_attn_mask = causal_4d_mask.masked_fill(expanded_attn_mask.bool(), np.finfo(mindspore.dtype_to_nptype(dtype)).min)
+
+        # adding expanded_attn_mask + causal_4d_mask can overflow, so the masks are combined with masked_fill above
+        expanded_4d_mask = expanded_attn_mask
 
         return expanded_4d_mask
 
+
     @staticmethod
     def _make_causal_mask(
         input_ids_shape,
@@ -170,11 +176,11 @@ def _prepare_4d_causal_attention_mask(
     `(batch_size, key_value_length)`
 
     Args:
-        attention_mask (`mindspore.Tensor` or `None`):
+        attention_mask (`mindspore.Tensor` or `None`):
             A 2D attention mask of shape `(batch_size, key_value_length)`
         input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
             The input shape should be a tuple that defines `(batch_size, query_length)`.
-        inputs_embeds (`mindspore.Tensor`):
+        inputs_embeds (`mindspore.Tensor`):
             The embedded inputs as a torch Tensor.
         past_key_values_length (`int`):
             The length of the key value cache.
@@ -186,14 +192,24 @@ def _prepare_4d_causal_attention_mask(
     key_value_length = input_shape[-1] + past_key_values_length
 
     # 4d mask is passed through the layers
-    if attention_mask is not None:
+    if attention_mask is not None and len(attention_mask.shape) == 2:
         attention_mask = attn_mask_converter.to_4d(
-            attention_mask, input_shape[-1], key_value_length, dtype=inputs_embeds.dtype
+            attention_mask, input_shape[-1], key_value_length=key_value_length, dtype=inputs_embeds.dtype
+        )
+    elif attention_mask is not None and len(attention_mask.shape) == 4:
+        expected_shape = (input_shape[0], 1, input_shape[1], key_value_length)
+        if tuple(attention_mask.shape) != expected_shape:
+            raise ValueError(
+                f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}."
+            )
+        # if the 4D mask has the correct shape, invert it and fill masked positions with the dtype's minimum value
+        inverted_mask = 1.0 - attention_mask
+        attention_mask = inverted_mask.masked_fill(
+            inverted_mask.to(mindspore.bool_), np.finfo(mindspore.dtype_to_nptype(inputs_embeds.dtype)).min
         )
     else:
         attention_mask = attn_mask_converter.to_causal_4d(
-            input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype
-        )
+            input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype)
 
     return attention_mask
 

diff --git a/mindnlp/transformers/modeling_outputs.py b/mindnlp/transformers/modeling_outputs.py
@@ -1651,3 +1651,94 @@ class MaskedImageModelingOutput(ModelOutput):
     reconstruction: mindspore.Tensor = None
     hidden_states: Optional[Tuple[mindspore.Tensor]] = None
     attentions: Optional[Tuple[mindspore.Tensor]] = None
+
+
+@dataclass
+class MoeModelOutputWithPast(ModelOutput):
+    """
+    Base class for MoE model outputs, with optional past key values, hidden states, attentions and router logits.
+
+    Args:
+        last_hidden_state (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        past_key_values (`tuple(tuple(mindspore.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(mindspore.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally if
+            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
+            encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+            input) to speed up sequential decoding.
+        hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `mindspore.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `mindspore.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        router_logits (`tuple(mindspore.Tensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
+            Tuple of `mindspore.Tensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
+
+            Raw router logits (post-softmax) that are computed by MoE routers, these terms are used to compute the auxiliary
+            loss for Mixture of Experts models.
+    """
+
+    last_hidden_state: mindspore.Tensor = None
+    past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None
+    hidden_states: Optional[Tuple[mindspore.Tensor, ...]] = None
+    attentions: Optional[Tuple[mindspore.Tensor, ...]] = None
+    router_logits: Optional[Tuple[mindspore.Tensor]] = None
+
+
+@dataclass
+class MoeCausalLMOutputWithPast(ModelOutput):
+    """
+    Base class for causal language model (or autoregressive) with mixture of experts outputs.
+
+    Args:
+        loss (`mindspore.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss (for next-token prediction).
+
+        logits (`mindspore.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+
+        aux_loss (`mindspore.Tensor`, *optional*, returned when `labels` is provided):
+            aux_loss for the sparse modules.
+
+        router_logits (`tuple(mindspore.Tensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
+            Tuple of `mindspore.Tensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
+
+            Raw router logits (post-softmax) that are computed by MoE routers, these terms are used to compute the auxiliary
+            loss for Mixture of Experts models.
+
+        past_key_values (`tuple(tuple(mindspore.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(mindspore.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+            `past_key_values` input) to speed up sequential decoding.
+        hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `mindspore.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `mindspore.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    loss: Optional[mindspore.Tensor] = None
+    aux_loss: Optional[mindspore.Tensor] = None
+    logits: mindspore.Tensor = None
+    past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None
+    hidden_states: Optional[Tuple[mindspore.Tensor, ...]] = None
+    attentions: Optional[Tuple[mindspore.Tensor, ...]] = None
+    router_logits: Optional[Tuple[mindspore.Tensor]] = None
diff --git a/mindnlp/transformers/models/__init__.py b/mindnlp/transformers/models/__init__.py
@@ -53,6 +53,7 @@
     megatron_bert,
     minicpm,
     mistral,
+    mixtral,
     mobilebert,
     nezha,
     opt,
@@ -109,6 +110,7 @@
 from .megatron_bert import *
 from .minicpm import *
 from .mistral import *
+from .mixtral import *
 from .mobilebert import *
 from .nezha import *
 from .opt import *
@@ -165,6 +167,7 @@
 __all__.extend(megatron_bert.__all__)
 __all__.extend(minicpm.__all__)
 __all__.extend(mistral.__all__)
+__all__.extend(mixtral.__all__)
 __all__.extend(mobilebert.__all__)
 __all__.extend(nezha.__all__)
 __all__.extend(opt.__all__)

diff --git a/mindnlp/transformers/models/auto/configuration_auto.py b/mindnlp/transformers/models/auto/configuration_auto.py
@@ -55,6 +55,8 @@
         ('hubert', 'HubertConfig'),
         ("mbart","MBartConfig"),
         ('minicpm', 'MiniCPMConfig'),
+        ("mistral", "MistralConfig"),
+        ("mixtral", "MixtralConfig"),
         ('mt5', 'MT5Config'),
         ("phi", "PhiConfig"),
         ("qwen2", "Qwen2Config"),
@@ -177,6 +179,7 @@
         ("megatron-bert", "MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("mgp-str", "MGP_STR_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("mistral", "MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+        ("mixtral", "MIXTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("mobilenet_v1", "MOBILENET_V1_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("mobilenet_v2", "MOBILENET_V2_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("mobilevit", "MOBILEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -409,6 +412,7 @@
         ("mgp-str", "MGP-STR"),
         ("minicpm", "MiniCPM"),
         ("mistral", "Mistral"),
+        ("mixtral", "Mixtral"),
         ("mluke", "mLUKE"),
         ("mms", "MMS"),
         ("mobilebert", "MobileBERT"),

diff --git a/mindnlp/transformers/models/auto/modeling_auto.py b/mindnlp/transformers/models/auto/modeling_auto.py
@@ -55,8 +55,9 @@
         ("gpt_pangu", "GPTPanguModel"),
         ("longformer", "LongformerModel"),
         ('mbart','MBartModel'),
-        ("mistral", "MistralModel"),
         ('minicpm', 'MiniCPMModel'),
+        ("mistral", "MistralModel"),
+        ("mixtral", "MixtralModel"),
         ("phi", "PhiModel"),
         ("qwen2", "Qwen2Model"),
         ("roberta", "RobertaModel"),
@@ -115,6 +116,8 @@
         ("falcon", "FalconForCausalLM"),
         ("gpt_bigcode", "GPTBigCodeForCausalLM"),
         ('minicpm', 'MiniCPMForCausalLM'),
+        ("mistral", "MistralForCausalLM"),
+        ("mixtral", "MixtralForCausalLM"),
         ("phi", "PhiForCausalLM"),
         ("qwen2", "Qwen2ForCausalLM"),
         ("roberta", "RobertaLMHeadModel"),
@@ -313,6 +316,8 @@
         ("esm", "EsmForSequenceClassification"),
         ("falcon", "FalconForSequenceClassification"),
         ('minicpm', 'MiniCPMForSequenceClassification'),
+        ("mistral", "MistralForSequenceClassification"),
+        ("mixtral", "MixtralForSequenceClassification"),
         ("phi", "PhiForSequenceClassification"),
         ("qwen2", "Qwen2ForSequenceClassification"),
         ("xlm-roberta", "XLMRobertaForSequenceClassification"),

diff --git a/mindnlp/transformers/models/auto/tokenization_auto.py b/mindnlp/transformers/models/auto/tokenization_auto.py
@@ -244,6 +244,13 @@
                 "LlamaTokenizerFast" if is_tokenizers_available() else None,
             ),
         ),
+        (
+            "mixtral",
+            (
+                "LlamaTokenizer" if is_sentencepiece_available() else None,
+                "LlamaTokenizerFast" if is_tokenizers_available() else None,
+            ),
+        ),
         ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
         ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
         ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),

diff --git a/mindnlp/transformers/models/mixtral/__init__.py b/mindnlp/transformers/models/mixtral/__init__.py
@@ -0,0 +1,26 @@
+# coding=utf-8
+# Copyright 2021 The Eleuther AI and HuggingFace Inc. team. All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+Mixtral Model init
+"""
+from .import configuration_mixtral, modeling_mixtral
+from .modeling_mixtral import *
+from .configuration_mixtral import *
+
+__all__ = []
+__all__.extend(modeling_mixtral.__all__)
+__all__.extend(configuration_mixtral.__all__)