huggingface · patrickvonplaten · Nov 12, 2020 · Nov 12, 2020 · Nov 12, 2020 · shenfe
diff --git a/check_t5_against_hf.py b/check_t5_against_hf.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+import os
+
+
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"  # or any {'0', '1', '2'}
+
+import t5  # noqa: E402
+from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary  # noqa: E402
+from transformers import T5Tokenizer  # noqa: E402
+from transformers.convert_t5_v1_1_original_tf_checkpoint_to_pytorch import (  # noqa: E402
+    convert_tf_checkpoint_to_pytorch,
+)
+from transformers.modeling_t5v2 import T5Config, T5v2ForConditionalGeneration  # noqa: E402
+
+
+# Directory containing the original Mesh TensorFlow checkpoint. The tokenizer files, the config and the
+# converted PyTorch weights are all written back into this same directory.
+path_to_tf_checkpoint = "/home/patrick/hugging_face/mt5/mt5_mesh_tf"
+vocab_model_path = path_to_tf_checkpoint + "/sentencepiece.model"
+
+# Both frameworks must score the very same (input, target) pair, otherwise the final comparison is meaningless.
+input_text = "Hello there. Let's put more words in more languages than I originally thought."
+target_text = "Hi I am"
+
+
+tok = T5Tokenizer.from_pretrained(vocab_model_path)
+tok.save_pretrained(path_to_tf_checkpoint)
+
+# Start from the `t5-small` config and override the fields that differ for the checkpoint under test.
+config = T5Config.from_pretrained("t5-small")
+config.d_ff = 1024
+config.num_decoder_layers = 8
+config.num_layers = 8
+config.num_heads = 6
+# comment this line out if only checkpoints for T5v1.1 should be checked
+config.vocab_size = 250112
+
+config.save_pretrained(path_to_tf_checkpoint)
+
+# Convert the TF checkpoint using the config that was just saved; the PyTorch weights land in the same directory.
+convert_tf_checkpoint_to_pytorch(path_to_tf_checkpoint, path_to_tf_checkpoint + "/config.json", path_to_tf_checkpoint)
+
+t5_model = t5.models.MtfModel(
+    model_dir=path_to_tf_checkpoint,
+    batch_size=1,
+    tpu=None,
+    sequence_length={"inputs": 64, "targets": 64},
+)
+
+# for T5v1.1 one should set `extra_ids=100`.
+vocab = SentencePieceVocabulary(vocab_model_path, extra_ids=0)
+
+score = t5_model.score(
+    inputs=[input_text],
+    targets=[target_text],
+    vocabulary=vocab,
+)
+# NOTE(review): `score[0][0]` is taken to be the score of the first (and only) example - confirm against the
+# return value of `MtfModel.score`.
+mesh_tf_score = score[0][0]
+
+model = T5v2ForConditionalGeneration.from_pretrained(path_to_tf_checkpoint, return_dict=True)
+
+input_ids = tok(input_text, return_tensors="pt").input_ids
+labels = tok(target_text, return_tensors="pt").input_ids
+
+loss = model(input_ids, labels=labels).loss
+# Scale the loss by the number of label tokens and negate it so that it is comparable to the Mesh TF score.
+# NOTE(review): this assumes `loss` is the mean per-token cross-entropy - confirm.
+hf_score = -(labels.shape[-1] * loss.item())
+
+# Compare the absolute difference: a signed difference would also "pass" for any large negative deviation.
+if abs(hf_score - mesh_tf_score) < 1e-4:
+    print("Success!")
+else:
+    print(f"Fail. HF {hf_score} vs. Mesh TF {mesh_tf_score}")
diff --git a/src/transformers/configuration_t5v2.py b/src/transformers/configuration_t5v2.py
@@ -0,0 +1,132 @@
+# coding=utf-8
+# Copyright 2010, The T5v2 Authors and HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" T5v2 model configuration """
+
+from .configuration_utils import PretrainedConfig
+from .utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+# Maps each pretrained checkpoint name to the URL of its `config.json` on huggingface.co.
+# The entries are keyed by the `t5-*` checkpoint names, in this order.
+T5v2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    checkpoint_name: f"https://huggingface.co/{checkpoint_name}/resolve/main/config.json"
+    for checkpoint_name in ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b")
+}
+
+
+class T5v2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:`~transformers.T5v2Model` or a
+    :class:`~transformers.TFT5v2Model`. It is used to instantiate a T5v2 model according to the specified arguments,
+    defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
+    to that of the T5v2 `t5-small <https://huggingface.co/t5-small>`__ architecture.
+
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+
+    Arguments:
+        vocab_size (:obj:`int`, `optional`, defaults to 32128):
+            Vocabulary size of the T5v2 model. Defines the number of different tokens that can be represented by the
+            :obj:`inputs_ids` passed when calling :class:`~transformers.T5v2Model` or
+            :class:`~transformers.TFT5v2Model`.
+        n_positions (:obj:`int`, `optional`, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        d_model (:obj:`int`, `optional`, defaults to 512):
+            Size of the encoder layers and the pooler layer.
+        d_kv (:obj:`int`, `optional`, defaults to 64):
+            Size of the key, query, value projections per attention head. :obj:`d_kv` has to be equal to :obj:`d_model
+            // num_heads`.
+        d_ff (:obj:`int`, `optional`, defaults to 2048):
+            Size of the intermediate feed forward layer in each :obj:`T5v2Block`.
+        num_layers (:obj:`int`, `optional`, defaults to 6):
+            Number of hidden layers in the Transformer encoder.
+        num_decoder_layers (:obj:`int`, `optional`):
+            Number of hidden layers in the Transformer decoder. Will use the same value as :obj:`num_layers` if not
+            set.
+        num_heads (:obj:`int`, `optional`, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        relative_attention_num_buckets (:obj:`int`, `optional`, defaults to 32):
+            The number of buckets to use for each attention layer.
+        dropout_rate (:obj:`float`, `optional`, defaults to 0.1):
+            The ratio for all dropout layers.
+        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-6):
+            The epsilon used by the layer normalization layers.
+        initializer_factor (:obj:`float`, `optional`, defaults to 1):
+            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+            testing).
+        is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Forwarded unchanged to :class:`~transformers.PretrainedConfig`.
+        pad_token_id (:obj:`int`, `optional`, defaults to 0):
+            Forwarded unchanged to :class:`~transformers.PretrainedConfig`.
+        eos_token_id (:obj:`int`, `optional`, defaults to 1):
+            Forwarded unchanged to :class:`~transformers.PretrainedConfig`.
+        tie_word_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Forwarded unchanged to :class:`~transformers.PretrainedConfig`.
+        kwargs:
+            Any additional keyword arguments are forwarded to :class:`~transformers.PretrainedConfig`.
+    """
+    model_type = "t5"
+
+    def __init__(
+        self,
+        vocab_size=32128,
+        n_positions=512,
+        d_model=512,
+        d_kv=64,
+        d_ff=2048,
+        num_layers=6,
+        num_decoder_layers=None,
+        num_heads=8,
+        relative_attention_num_buckets=32,
+        dropout_rate=0.1,
+        layer_norm_epsilon=1e-6,
+        initializer_factor=1.0,
+        is_encoder_decoder=True,
+        pad_token_id=0,
+        eos_token_id=1,
+        tie_word_embeddings=False,
+        **kwargs
+    ):
+        # Token ids and the generic flags are handled by the base class; only T5-specific fields are stored here.
+        super().__init__(
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.d_model = d_model
+        self.d_kv = d_kv
+        self.d_ff = d_ff
+        self.num_layers = num_layers
+        self.num_decoder_layers = (
+            num_decoder_layers if num_decoder_layers is not None else self.num_layers
+        )  # default = symmetry
+        self.num_heads = num_heads
+        self.relative_attention_num_buckets = relative_attention_num_buckets
+        self.dropout_rate = dropout_rate
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_factor = initializer_factor
+
+    @property
+    def max_position_embeddings(self):
+        """Read-only alias for :obj:`n_positions`."""
+        return self.n_positions
+
+    @property
+    def hidden_size(self):
+        """Read-only alias for :obj:`d_model`."""
+        return self.d_model
+
+    @property
+    def num_attention_heads(self):
+        """Read-only alias for :obj:`num_heads`."""
+        return self.num_heads
+
+    @property
+    def num_hidden_layers(self):
+        """Read-only alias for :obj:`num_layers`."""
+        return self.num_layers
diff --git a/src/transformers/convert_t5_v1_1_original_tf_checkpoint_to_pytorch.py b/src/transformers/convert_t5_v1_1_original_tf_checkpoint_to_pytorch.py
@@ -0,0 +1,59 @@
+# coding=utf-8
+# Copyright 2018 The T5 authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert T5 checkpoint."""
+
+
+import argparse
+
+from transformers.modeling_t5v2 import T5Config, T5v2ForConditionalGeneration, load_tf_weights_in_t5
+from transformers.utils import logging
+
+
+logging.set_verbosity_info()
+
+
+def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
+    """
+    Build a :obj:`T5v2ForConditionalGeneration` from a JSON config, load TF weights into it and save it.
+
+    Args:
+        tf_checkpoint_path: Passed to :obj:`load_tf_weights_in_t5` as the location of the TensorFlow checkpoint.
+        config_file: JSON file read with :obj:`T5Config.from_json_file` to define the model architecture.
+        pytorch_dump_path: Destination handed to :obj:`save_pretrained` for the converted model.
+    """
+    # Instantiate the PyTorch model from the JSON configuration.
+    t5_config = T5Config.from_json_file(config_file)
+    print(f"Building PyTorch model from configuration: {t5_config!s}")
+    pt_model = T5v2ForConditionalGeneration(t5_config)
+
+    # Load the TensorFlow checkpoint weights into the freshly built model.
+    load_tf_weights_in_t5(pt_model, t5_config, tf_checkpoint_path)
+
+    # Write the converted model to its destination.
+    print(f"Save PyTorch model to {pytorch_dump_path}")
+    pt_model.save_pretrained(pytorch_dump_path)
+
+
+if __name__ == "__main__":
+    cli = argparse.ArgumentParser()
+    # All three options are mandatory string arguments, so they share the same argparse settings.
+    required_str = {"default": None, "type": str, "required": True}
+    cli.add_argument("--tf_checkpoint_path", help="Path to the TensorFlow checkpoint path.", **required_str)
+    cli.add_argument(
+        "--config_file",
+        help="The config json file corresponding to the pre-trained T5 model. \n"
+        "This specifies the model architecture.",
+        **required_str,
+    )
+    cli.add_argument("--pytorch_dump_path", help="Path to the output PyTorch model.", **required_str)
+
+    cli_args = cli.parse_args()
+    convert_tf_checkpoint_to_pytorch(
+        cli_args.tf_checkpoint_path,
+        cli_args.config_file,
+        cli_args.pytorch_dump_path,
+    )