Add the SEW and SEW-D speech models #13962

Merged (19 commits) on Oct 15, 2021
Changes from 1 commit
SEW-D and tests
anton-l committed Oct 11, 2021
commit a6ec41caef29e60567fc18d3f9218a0aea306d39
4 changes: 4 additions & 0 deletions docs/source/index.rst
@@ -443,6 +443,10 @@ Flax), PyTorch, and/or TensorFlow.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| RoFormer | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| SEW | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| SEWD | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Speech2Text | ✅ | ❌ | ✅ | ❌ | ❌ |
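For reference, a minimal sketch of what the new table rows mean in practice: SEW and SEW-D ship PyTorch implementations only, with no tokenizer, TensorFlow, or Flax counterparts. The class names come from this PR; the module paths in the comments are assumptions about the final layout.

    from transformers import SEWModel, SEWDModel  # PyTorch-only additions

    print(SEWModel.__module__)   # expected: transformers.models.sew.modeling_sew
    print(SEWDModel.__module__)  # expected: transformers.models.sew_d.modeling_sew_d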
18 changes: 12 additions & 6 deletions src/transformers/__init__.py
@@ -251,6 +251,7 @@
"models.roberta": ["ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaConfig", "RobertaTokenizer"],
"models.roformer": ["ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoFormerConfig", "RoFormerTokenizer"],
"models.sew": ["SEW_PRETRAINED_CONFIG_ARCHIVE_MAP", "SEWConfig"],
"models.sew_d": ["SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP", "SEWDConfig"],
"models.speech_encoder_decoder": ["SpeechEncoderDecoderConfig"],
"models.speech_to_text": [
"SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -1114,6 +1115,14 @@
"SEWPreTrainedModel",
]
)
_import_structure["models.sew_d"].extend(
[
"SEW_D_PRETRAINED_MODEL_ARCHIVE_LIST",
"SEWDForCTC",
"SEWDModel",
"SEWDPreTrainedModel",
]
)
_import_structure["models.speech_encoder_decoder"].extend(["SpeechEncoderDecoderModel"])
_import_structure["models.speech_to_text"].extend(
[
@@ -2079,6 +2088,7 @@
from .models.roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig, RobertaTokenizer
from .models.roformer import ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, RoFormerConfig, RoFormerTokenizer
from .models.sew import SEW_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWConfig
from .models.sew_d import SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWDConfig
from .models.speech_encoder_decoder import SpeechEncoderDecoderConfig
from .models.speech_to_text import SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Speech2TextConfig
from .models.speech_to_text_2 import (
@@ -2800,12 +2810,8 @@
RoFormerPreTrainedModel,
load_tf_weights_in_roformer,
)
from .models.sew import (
SEW_PRETRAINED_MODEL_ARCHIVE_LIST,
SEWForCTC,
SEWModel,
SEWPreTrainedModel,
)
from .models.sew import SEW_PRETRAINED_MODEL_ARCHIVE_LIST, SEWForCTC, SEWModel, SEWPreTrainedModel
from .models.sew_d import SEW_D_PRETRAINED_MODEL_ARCHIVE_LIST, SEWDForCTC, SEWDModel, SEWDPreTrainedModel
from .models.speech_encoder_decoder import SpeechEncoderDecoderModel
from .models.speech_to_text import (
SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
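A hedged sketch of the new top-level imports this hunk exposes; whether the default SEWDConfig tracks asapp/sew-d-tiny-100k is an assumption, by analogy with the SEW config docstring further down.

    from transformers import SEWDConfig, SEWDForCTC, SEWDModel, SEWDPreTrainedModel

    config = SEWDConfig()            # default architecture (assumed to track asapp/sew-d-tiny-100k)
    encoder = SEWDModel(config)      # randomly initialized SEW-D encoder
    ctc_model = SEWDForCTC(config)   # same encoder with a CTC head on top
    print(isinstance(encoder, SEWDPreTrainedModel))  # True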
5 changes: 5 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
@@ -94,6 +94,8 @@
("rag", "RagConfig"),
("tapas", "TapasConfig"),
("splinter", "SplinterConfig"),
("sew", "SEWConfig"),
("sew-d", "SEWDConfig"),
]
)

@@ -160,6 +162,7 @@
("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("splinter", "SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("sew", "SEW_PRETRAINED_CONFIG_ARCHIVE_MAP"),
]
)

@@ -241,6 +244,8 @@
("byt5", "ByT5"),
("mbart50", "mBART-50"),
("splinter", "Splinter"),
("sew", "SEW"),
("sew-d", "SEWD"),
]
)

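With the "sew" and "sew-d" entries registered above, AutoConfig can resolve the new model types by name. A small sketch; the Hub checkpoint id is an assumption, not part of this diff.

    from transformers import AutoConfig, SEWConfig, SEWDConfig

    cfg = AutoConfig.for_model("sew-d")                       # -> SEWDConfig with default values
    cfg2 = AutoConfig.from_pretrained("asapp/sew-tiny-100k")  # assumed checkpoint id -> SEWConfig
    assert isinstance(cfg, SEWDConfig) and isinstance(cfg2, SEWConfig)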
1 change: 1 addition & 0 deletions src/transformers/models/auto/feature_extraction_auto.py
@@ -37,6 +37,7 @@
("detr", "DetrFeatureExtractor"),
("deit", "DeiTFeatureExtractor"),
("hubert", "Wav2Vec2FeatureExtractor"),
("sew", "Wav2Vec2FeatureExtractor"),
("speech_to_text", "Speech2TextFeatureExtractor"),
("vit", "ViTFeatureExtractor"),
("wav2vec2", "Wav2Vec2FeatureExtractor"),
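Because SEW reuses the Wav2Vec2 feature extractor, AutoFeatureExtractor now dispatches to Wav2Vec2FeatureExtractor for SEW checkpoints. A sketch, assuming the checkpoint below exists and ships a preprocessor config:

    from transformers import AutoFeatureExtractor, Wav2Vec2FeatureExtractor

    extractor = AutoFeatureExtractor.from_pretrained("asapp/sew-tiny-100k")  # assumed checkpoint id
    assert isinstance(extractor, Wav2Vec2FeatureExtractor)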
4 changes: 4 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -92,6 +92,8 @@
("tapas", "TapasModel"),
("ibert", "IBertModel"),
("splinter", "SplinterModel"),
("sew", "SEWModel"),
("sew-d", "SEWDModel"),
]
)

@@ -474,6 +476,8 @@
# Model for Connectionist temporal classification (CTC) mapping
("wav2vec2", "Wav2Vec2ForCTC"),
("hubert", "HubertForCTC"),
("sew", "SEWForCTC"),
("sew-d", "SEWDForCTC"),
]
)

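The two mappings above let the generic auto classes dispatch to the new models, for example AutoModelForCTC. A sketch with a dummy one-second waveform; the fine-tuned checkpoint id is an assumption.

    import torch
    from transformers import AutoModelForCTC

    model = AutoModelForCTC.from_pretrained("asapp/sew-d-tiny-100k-ft-ls100h")  # assumed checkpoint id
    with torch.no_grad():
        logits = model(torch.zeros(1, 16000)).logits  # (batch, frames, vocab_size)
    print(logits.shape)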
2 changes: 1 addition & 1 deletion src/transformers/models/hubert/modeling_hubert.py
@@ -120,7 +120,7 @@ def _compute_mask_indices(
class HubertNoLayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
Reviewer comment (Contributor): Thanks! cc @mfuntowicz

self.out_conv_dim = config.conv_dim[layer_id]

self.conv = nn.Conv1d(
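The fix above chains the feature-encoder convolutions correctly: each layer's input channels now equal the previous layer's output channels, while layer 0 still takes the single-channel raw waveform. An illustrative sketch with made-up conv_dim values (not the real defaults):

    conv_dim = [512, 512, 256]  # made-up example values

    for layer_id in range(len(conv_dim)):
        in_conv_dim = conv_dim[layer_id - 1] if layer_id > 0 else 1  # corrected indexing
        out_conv_dim = conv_dim[layer_id]
        print(layer_id, in_conv_dim, out_conv_dim)
    # 0 1 512
    # 1 512 512
    # 2 512 256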
2 changes: 1 addition & 1 deletion src/transformers/models/sew/configuration_sew.py
@@ -28,7 +28,7 @@
class SEWConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.SEWModel`. It is used to
instantiate an SEW model according to the specified arguments, defining the model architecture. Instantiating a
instantiate a SEW model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the SEW `asapp/sew-tiny-100k
<https://huggingface.co/asapp/sew-tiny-100k>`__ architecture.

@@ -22,8 +22,8 @@
import fairseq
import torch
from fairseq.data import Dictionary
from sew_asapp import tasks

from sew_asapp import tasks
from transformers import (
SEWConfig,
SEWForCTC,
@@ -39,7 +39,7 @@
logger = logging.get_logger(__name__)

MAPPING = {
"post_extract_proj": "feature_projection.projection",
"post_extract_proj": "feature_projection",
"encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
"self_attn.k_proj": "encoder.layers.*.attention.k_proj",
"self_attn.v_proj": "encoder.layers.*.attention.v_proj",
@@ -49,9 +49,9 @@
"fc1": "encoder.layers.*.feed_forward.intermediate_dense",
"fc2": "encoder.layers.*.feed_forward.output_dense",
"final_layer_norm": "encoder.layers.*.final_layer_norm",
"encoder.layer_norm": "encoder.layer_norm",
"encoder.upsample.0": "encoder.upsample.projection",
"sew_model.layer_norm": "layer_norm",
"encoder.layer_norm": "encoder.layer_norm",
"w2v_encoder.layer_norm": "layer_norm",
"w2v_encoder.proj": "lm_head",
"mask_emb": "masked_spec_embed",
}
@@ -105,7 +105,7 @@ def recursively_load_weights(fairseq_model, hf_model, is_finetuned):
for key, mapped_key in MAPPING.items():
mapped_key = "sew." + mapped_key if (is_finetuned and mapped_key != "lm_head") else mapped_key

if key in name or key.split("sew_model.")[-1] == name.split(".")[0]:
if key in name or key.split("w2v_encoder.")[-1] == name.split(".")[0]:
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
@@ -228,10 +228,6 @@ def convert_sew_checkpoint(

recursively_load_weights(model, hf_model, is_finetuned)

hf_model = hf_model.eval().cuda()
outputs = hf_model(torch.ones((1, 16000)).cuda())
print(outputs)

hf_model.save_pretrained(pytorch_dump_folder_path)


@@ -242,9 +238,9 @@
parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
"--is_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
)
args = parser.parse_args()
convert_sew_checkpoint(
args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned
args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, args.is_finetuned
)
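A hedged sketch of invoking the converter directly with the new flag semantics; all paths are placeholders, the call assumes convert_sew_checkpoint from this script is in scope, and fairseq plus the sew_asapp task package must be installed.

    convert_sew_checkpoint(
        "/path/to/sew_checkpoint.pt",  # placeholder fairseq checkpoint
        "./sew-hf",                    # placeholder output folder for save_pretrained
        None,                          # config_path: derive the HF config from the checkpoint
        "/path/to/dict.ltr.txt",       # placeholder fairseq dictionary (fine-tuned models only)
        True,                          # is_finetuned: on the CLI this is now opt-in via --is_finetuned
    )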