Add the SEW and SEW-D speech models #13962

Merged (19 commits) on Oct 15, 2021
Changes from 1 commit
SEW-D and tests
anton-l committed Oct 11, 2021
commit a6ec41caef29e60567fc18d3f9218a0aea306d39
4 changes: 4 additions & 0 deletions docs/source/index.rst
@@ -443,6 +443,10 @@ Flax), PyTorch, and/or TensorFlow.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| RoFormer | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| SEW | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| SEWD | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Speech2Text | ✅ | ❌ | ✅ | ❌ | ❌ |
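For reference, a minimal sketch of what the new table rows mean in practice: SEW and SEW-D ship PyTorch implementations only, with no tokenizer, TensorFlow, or Flax counterparts. The class names come from this PR; the module paths in the comments are assumptions about the final layout.

    from transformers import SEWModel, SEWDModel  # PyTorch-only additions

    print(SEWModel.__module__)   # expected: transformers.models.sew.modeling_sew
    print(SEWDModel.__module__)  # expected: transformers.models.sew_d.modeling_sew_d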
18 changes: 12 additions & 6 deletions src/transformers/__init__.py
@@ -251,6 +251,7 @@
"models.roberta": ["ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaConfig", "RobertaTokenizer"],
"models.roformer": ["ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoFormerConfig", "RoFormerTokenizer"],
"models.sew": ["SEW_PRETRAINED_CONFIG_ARCHIVE_MAP", "SEWConfig"],
"models.sew_d": ["SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP", "SEWDConfig"],
"models.speech_encoder_decoder": ["SpeechEncoderDecoderConfig"],
"models.speech_to_text": [
"SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -1114,6 +1115,14 @@
"SEWPreTrainedModel",
]
)
_import_structure["models.sew_d"].extend(
[
"SEW_D_PRETRAINED_MODEL_ARCHIVE_LIST",
"SEWDForCTC",
"SEWDModel",
"SEWDPreTrainedModel",
]
)
_import_structure["models.speech_encoder_decoder"].extend(["SpeechEncoderDecoderModel"])
_import_structure["models.speech_to_text"].extend(
[
@@ -2079,6 +2088,7 @@
from .models.roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig, RobertaTokenizer
from .models.roformer import ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, RoFormerConfig, RoFormerTokenizer
from .models.sew import SEW_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWConfig
from .models.sew_d import SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWDConfig
from .models.speech_encoder_decoder import SpeechEncoderDecoderConfig
from .models.speech_to_text import SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Speech2TextConfig
from .models.speech_to_text_2 import (
@@ -2800,12 +2810,8 @@
RoFormerPreTrainedModel,
load_tf_weights_in_roformer,
)
from .models.sew import (
SEW_PRETRAINED_MODEL_ARCHIVE_LIST,
SEWForCTC,
SEWModel,
SEWPreTrainedModel,
)
from .models.sew import SEW_PRETRAINED_MODEL_ARCHIVE_LIST, SEWForCTC, SEWModel, SEWPreTrainedModel
from .models.sew_d import SEW_D_PRETRAINED_MODEL_ARCHIVE_LIST, SEWDForCTC, SEWDModel, SEWDPreTrainedModel
from .models.speech_encoder_decoder import SpeechEncoderDecoderModel
from .models.speech_to_text import (
SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
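A hedged sketch of the new top-level imports this hunk exposes; whether the default SEWDConfig tracks asapp/sew-d-tiny-100k is an assumption, by analogy with the SEW config docstring further down.

    from transformers import SEWDConfig, SEWDForCTC, SEWDModel, SEWDPreTrainedModel

    config = SEWDConfig()            # default architecture (assumed to track asapp/sew-d-tiny-100k)
    encoder = SEWDModel(config)      # randomly initialized SEW-D encoder
    ctc_model = SEWDForCTC(config)   # same encoder with a CTC head on top
    print(isinstance(encoder, SEWDPreTrainedModel))  # True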
5 changes: 5 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
@@ -94,6 +94,8 @@
("rag", "RagConfig"),
("tapas", "TapasConfig"),
("splinter", "SplinterConfig"),
("sew", "SEWConfig"),
("sew-d", "SEWDConfig"),
]
)

@@ -160,6 +162,7 @@
("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("splinter", "SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("sew", "SEW_PRETRAINED_CONFIG_ARCHIVE_MAP"),
]
)

@@ -241,6 +244,8 @@
("byt5", "ByT5"),
("mbart50", "mBART-50"),
("splinter", "Splinter"),
("sew", "SEW"),
("sew-d", "SEWD"),
]
)

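With the "sew" and "sew-d" entries registered above, AutoConfig can resolve the new model types by name. A small sketch; the Hub checkpoint id is an assumption, not part of this diff.

    from transformers import AutoConfig, SEWConfig, SEWDConfig

    cfg = AutoConfig.for_model("sew-d")                       # -> SEWDConfig with default values
    cfg2 = AutoConfig.from_pretrained("asapp/sew-tiny-100k")  # assumed checkpoint id -> SEWConfig
    assert isinstance(cfg, SEWDConfig) and isinstance(cfg2, SEWConfig)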
1 change: 1 addition & 0 deletions src/transformers/models/auto/feature_extraction_auto.py
@@ -37,6 +37,7 @@
("detr", "DetrFeatureExtractor"),
("deit", "DeiTFeatureExtractor"),
("hubert", "Wav2Vec2FeatureExtractor"),
("sew", "Wav2Vec2FeatureExtractor"),
("speech_to_text", "Speech2TextFeatureExtractor"),
("vit", "ViTFeatureExtractor"),
("wav2vec2", "Wav2Vec2FeatureExtractor"),
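Because SEW reuses the Wav2Vec2 feature extractor, AutoFeatureExtractor now dispatches to Wav2Vec2FeatureExtractor for SEW checkpoints. A sketch, assuming the checkpoint below exists and ships a preprocessor config:

    from transformers import AutoFeatureExtractor, Wav2Vec2FeatureExtractor

    extractor = AutoFeatureExtractor.from_pretrained("asapp/sew-tiny-100k")  # assumed checkpoint id
    assert isinstance(extractor, Wav2Vec2FeatureExtractor)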
4 changes: 4 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -92,6 +92,8 @@
("tapas", "TapasModel"),
("ibert", "IBertModel"),
("splinter", "SplinterModel"),
("sew", "SEWModel"),
("sew-d", "SEWDModel"),
]
)

@@ -474,6 +476,8 @@
# Model for Connectionist temporal classification (CTC) mapping
("wav2vec2", "Wav2Vec2ForCTC"),
("hubert", "HubertForCTC"),
("sew", "SEWForCTC"),
("sew-d", "SEWDForCTC"),
]
)

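The two mappings above let the generic auto classes dispatch to the new models, for example AutoModelForCTC. A sketch with a dummy one-second waveform; the fine-tuned checkpoint id is an assumption.

    import torch
    from transformers import AutoModelForCTC

    model = AutoModelForCTC.from_pretrained("asapp/sew-d-tiny-100k-ft-ls100h")  # assumed checkpoint id
    with torch.no_grad():
        logits = model(torch.zeros(1, 16000)).logits  # (batch, frames, vocab_size)
    print(logits.shape)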
2 changes: 1 addition & 1 deletion src/transformers/models/hubert/modeling_hubert.py
@@ -120,7 +120,7 @@ def _compute_mask_indices(
class HubertNoLayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
Reviewer comment (Contributor): Thanks! cc @mfuntowicz

self.out_conv_dim = config.conv_dim[layer_id]

self.conv = nn.Conv1d(
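The fix above chains the feature-encoder convolutions correctly: each layer's input channels now equal the previous layer's output channels, while layer 0 still takes the single-channel raw waveform. An illustrative sketch with made-up conv_dim values (not the real defaults):

    conv_dim = [512, 512, 256]  # made-up example values

    for layer_id in range(len(conv_dim)):
        in_conv_dim = conv_dim[layer_id - 1] if layer_id > 0 else 1  # corrected indexing
        out_conv_dim = conv_dim[layer_id]
        print(layer_id, in_conv_dim, out_conv_dim)
    # 0 1 512
    # 1 512 512
    # 2 512 256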
2 changes: 1 addition & 1 deletion src/transformers/models/sew/configuration_sew.py
@@ -28,7 +28,7 @@
class SEWConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.SEWModel`. It is used to
instantiate an SEW model according to the specified arguments, defining the model architecture. Instantiating a
instantiate a SEW model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the SEW `asapp/sew-tiny-100k
<https://huggingface.co/asapp/sew-tiny-100k>`__ architecture.

@@ -22,8 +22,8 @@
import fairseq
import torch
from fairseq.data import Dictionary
from sew_asapp import tasks

from sew_asapp import tasks
from transformers import (
SEWConfig,
SEWForCTC,
@@ -39,7 +39,7 @@
logger = logging.get_logger(__name__)

MAPPING = {
"post_extract_proj": "feature_projection.projection",
"post_extract_proj": "feature_projection",
"encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
"self_attn.k_proj": "encoder.layers.*.attention.k_proj",
"self_attn.v_proj": "encoder.layers.*.attention.v_proj",
@@ -49,9 +49,9 @@
"fc1": "encoder.layers.*.feed_forward.intermediate_dense",
"fc2": "encoder.layers.*.feed_forward.output_dense",
"final_layer_norm": "encoder.layers.*.final_layer_norm",
"encoder.layer_norm": "encoder.layer_norm",
"encoder.upsample.0": "encoder.upsample.projection",
"sew_model.layer_norm": "layer_norm",
"encoder.layer_norm": "encoder.layer_norm",
"w2v_encoder.layer_norm": "layer_norm",
"w2v_encoder.proj": "lm_head",
"mask_emb": "masked_spec_embed",
}
@@ -105,7 +105,7 @@ def recursively_load_weights(fairseq_model, hf_model, is_finetuned):
for key, mapped_key in MAPPING.items():
mapped_key = "sew." + mapped_key if (is_finetuned and mapped_key != "lm_head") else mapped_key

if key in name or key.split("sew_model.")[-1] == name.split(".")[0]:
if key in name or key.split("w2v_encoder.")[-1] == name.split(".")[0]:
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
@@ -228,10 +228,6 @@ def convert_sew_checkpoint(

recursively_load_weights(model, hf_model, is_finetuned)

hf_model = hf_model.eval().cuda()
outputs = hf_model(torch.ones((1, 16000)).cuda())
print(outputs)

hf_model.save_pretrained(pytorch_dump_folder_path)


@@ -242,9 +238,9 @@
parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
"--is_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
)
args = parser.parse_args()
convert_sew_checkpoint(
args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned
args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, args.is_finetuned
)
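A hedged sketch of invoking the converter directly with the new flag semantics; all paths are placeholders, the call assumes convert_sew_checkpoint from this script is in scope, and fairseq plus the sew_asapp task package must be installed.

    convert_sew_checkpoint(
        "/path/to/sew_checkpoint.pt",  # placeholder fairseq checkpoint
        "./sew-hf",                    # placeholder output folder for save_pretrained
        None,                          # config_path: derive the HF config from the checkpoint
        "/path/to/dict.ltr.txt",       # placeholder fairseq dictionary (fine-tuned models only)
        True,                          # is_finetuned: on the CLI this is now opt-in via --is_finetuned
    )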