Add the SEW and SEW-D speech models #13962

Merged: 19 commits, Oct 15, 2021
Changes from 3 commits
4 changes: 4 additions & 0 deletions docs/source/index.rst
@@ -443,6 +443,10 @@ Flax), PyTorch, and/or TensorFlow.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| RoFormer | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| SEW | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| SEWD | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Speech2Text | ✅ | ❌ | ✅ | ❌ | ❌ |
22 changes: 22 additions & 0 deletions src/transformers/__init__.py
@@ -250,6 +250,8 @@
"models.retribert": ["RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RetriBertConfig", "RetriBertTokenizer"],
"models.roberta": ["ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaConfig", "RobertaTokenizer"],
"models.roformer": ["ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoFormerConfig", "RoFormerTokenizer"],
"models.sew": ["SEW_PRETRAINED_CONFIG_ARCHIVE_MAP", "SEWConfig"],
"models.sew_d": ["SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP", "SEWDConfig"],
"models.speech_encoder_decoder": ["SpeechEncoderDecoderConfig"],
"models.speech_to_text": [
"SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -1105,6 +1107,22 @@
"load_tf_weights_in_roformer",
]
)
_import_structure["models.sew"].extend(
[
"SEW_PRETRAINED_MODEL_ARCHIVE_LIST",
"SEWForCTC",
"SEWModel",
"SEWPreTrainedModel",
]
)
_import_structure["models.sew_d"].extend(
[
"SEW_D_PRETRAINED_MODEL_ARCHIVE_LIST",
"SEWDForCTC",
"SEWDModel",
"SEWDPreTrainedModel",
]
)
_import_structure["models.speech_encoder_decoder"].extend(["SpeechEncoderDecoderModel"])
_import_structure["models.speech_to_text"].extend(
[
@@ -2069,6 +2087,8 @@
    from .models.retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig, RetriBertTokenizer
    from .models.roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig, RobertaTokenizer
    from .models.roformer import ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, RoFormerConfig, RoFormerTokenizer
    from .models.sew import SEW_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWConfig
    from .models.sew_d import SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWDConfig
    from .models.speech_encoder_decoder import SpeechEncoderDecoderConfig
    from .models.speech_to_text import SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Speech2TextConfig
    from .models.speech_to_text_2 import (
@@ -2790,6 +2810,8 @@
        RoFormerPreTrainedModel,
        load_tf_weights_in_roformer,
    )
    from .models.sew import SEW_PRETRAINED_MODEL_ARCHIVE_LIST, SEWForCTC, SEWModel, SEWPreTrainedModel
    from .models.sew_d import SEW_D_PRETRAINED_MODEL_ARCHIVE_LIST, SEWDForCTC, SEWDModel, SEWDPreTrainedModel
    from .models.speech_encoder_decoder import SpeechEncoderDecoderModel
    from .models.speech_to_text import (
        SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
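With these lazy-import entries in place, the new classes are exposed from the package root. A minimal sanity check of the public API (illustrative usage, not part of the diff; requires torch):

from transformers import SEWConfig, SEWForCTC

config = SEWConfig()         # default configuration, no pretrained weights
model = SEWForCTC(config)    # randomly initialized SEW encoder with a CTC head
print(type(model).__name__)  # "SEWForCTC"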
1 change: 1 addition & 0 deletions src/transformers/models/__init__.py
@@ -80,6 +80,7 @@
    retribert,
    roberta,
    roformer,
    sew,
    speech_to_text,
    splinter,
    squeezebert,
5 changes: 5 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
@@ -94,6 +94,8 @@
("rag", "RagConfig"),
("tapas", "TapasConfig"),
("splinter", "SplinterConfig"),
("sew", "SEWConfig"),
("sew-d", "SEWDConfig"),
]
)

@@ -160,6 +162,7 @@
("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("splinter", "SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("sew", "SEW_PRETRAINED_CONFIG_ARCHIVE_MAP"),
]
)

@@ -241,6 +244,8 @@
("byt5", "ByT5"),
("mbart50", "mBART-50"),
("splinter", "Splinter"),
("sew", "SEW"),
anton-l marked this conversation as resolved.
Show resolved Hide resolved
("sew-d", "SEWD"),
anton-l marked this conversation as resolved.
Show resolved Hide resolved
]
)

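These mapping entries are what let the Auto classes dispatch on the model_type key ("sew" or "sew-d") stored in a checkpoint's config.json. A small sketch of the effect, using the existing AutoConfig.for_model helper:

from transformers import AutoConfig

config = AutoConfig.for_model("sew")  # resolves via the mapping added above
print(type(config).__name__)          # "SEWConfig"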
1 change: 1 addition & 0 deletions src/transformers/models/auto/feature_extraction_auto.py
@@ -37,6 +37,7 @@
("detr", "DetrFeatureExtractor"),
("deit", "DeiTFeatureExtractor"),
("hubert", "Wav2Vec2FeatureExtractor"),
("sew", "Wav2Vec2FeatureExtractor"),
anton-l marked this conversation as resolved.
Show resolved Hide resolved
("speech_to_text", "Speech2TextFeatureExtractor"),
("vit", "ViTFeatureExtractor"),
("wav2vec2", "Wav2Vec2FeatureExtractor"),
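SEW ships no feature extractor of its own, so the entry above routes it to Wav2Vec2's. Illustrative usage; the checkpoint name is an example, and any SEW checkpoint on the Hub would do:

from transformers import AutoFeatureExtractor

extractor = AutoFeatureExtractor.from_pretrained("asapp/sew-tiny-100k")
print(type(extractor).__name__)  # "Wav2Vec2FeatureExtractor"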
4 changes: 4 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -92,6 +92,8 @@
("tapas", "TapasModel"),
("ibert", "IBertModel"),
("splinter", "SplinterModel"),
("sew", "SEWModel"),
("sew-d", "SEWDModel"),
]
)

@@ -474,6 +476,8 @@
        # Model for Connectionist temporal classification (CTC) mapping
        ("wav2vec2", "Wav2Vec2ForCTC"),
        ("hubert", "HubertForCTC"),
        ("sew", "SEWForCTC"),
        ("sew-d", "SEWDForCTC"),
    ]
)

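With the CTC mapping in place, AutoModelForCTC can resolve SEW checkpoints directly. A minimal transcription sketch, assuming a CTC-fine-tuned SEW checkpoint that ships a Wav2Vec2-style processor (the checkpoint name is illustrative):

import torch
from transformers import AutoModelForCTC, Wav2Vec2Processor

ckpt = "asapp/sew-tiny-100k-ft-ls100h"  # example fine-tuned checkpoint
processor = Wav2Vec2Processor.from_pretrained(ckpt)
model = AutoModelForCTC.from_pretrained(ckpt)  # dispatches to SEWForCTC

audio = torch.randn(16000).numpy()  # stand-in for 1 s of 16 kHz speech
inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(processor.batch_decode(torch.argmax(logits, dim=-1)))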
6 changes: 3 additions & 3 deletions src/transformers/models/hubert/modeling_hubert.py
@@ -120,7 +120,7 @@ def _compute_mask_indices(
class HubertNoLayerNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
-       self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1
+       self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
Contributor: Thanks! cc @mfuntowicz
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
@@ -142,7 +142,7 @@ def forward(self, hidden_states):
class HubertLayerNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
-       self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1
+       self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
@@ -170,7 +170,7 @@ def forward(self, hidden_states):
class HubertGroupNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
-       self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1
+       self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
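The fix is easiest to see with concrete numbers: a conv layer's input channel count must equal the previous layer's output channel count, so indexing conv_dim with layer_id instead of layer_id - 1 only goes unnoticed when all entries of conv_dim are equal (as in the default Wav2Vec2-style configs, which use 512 everywhere). A standalone sketch with made-up dimensions:

conv_dim = [64, 128, 256]  # illustrative per-layer output channel counts

for layer_id in range(len(conv_dim)):
    buggy = conv_dim[layer_id] if layer_id > 0 else 1      # old indexing
    fixed = conv_dim[layer_id - 1] if layer_id > 0 else 1  # new indexing
    print(layer_id, buggy, fixed, conv_dim[layer_id])

# Layer 1: the old indexing picks 128 input channels, but layer 0 emitted 64.
# The fixed indexing chains the stack correctly: 1 -> 64 -> 128 -> 256.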
47 changes: 47 additions & 0 deletions src/transformers/models/sew/__init__.py
@@ -0,0 +1,47 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...file_utils import _LazyModule, is_torch_available


_import_structure = {
    ".wav2vec2.feature_extraction_wav2vec2": ["Wav2Vec2FeatureExtractor"],
Contributor: Do we need that import here?

Member (author): Probably not right now. It was used to enable AutoFeatureExtractor for audio classification pipelines with Hubert in #13366.

Contributor: Ah, I hadn't reviewed #13366 carefully enough. We shouldn't have done this, sorry! Have we already done a release since merging that PR? AutoFeatureExtractor works out of the box by adding this line to the config:

"feature_extractor_type": "Wav2Vec2FeatureExtractor"

We just need to add this to the configs, and I would be in favor of also deprecating the HuBERT key in AutoFeatureExtractor and instead updating all HuBERT configs. Also cc @sgugger, what do you think?

Collaborator: The vision and speech APIs are both still experimental, so I'm fine with this small breaking change.

Contributor: @anton-l, we can change HuBERT in a follow-up PR with a deprecation. Let's try not to continue this design here.
"configuration_sew": ["SEW_PRETRAINED_CONFIG_ARCHIVE_MAP", "SEWConfig"],
}

if is_torch_available():
_import_structure["modeling_sew"] = [
"SEW_PRETRAINED_MODEL_ARCHIVE_LIST",
"SEWForCTC",
"SEWModel",
"SEWPreTrainedModel",
]

if TYPE_CHECKING:
from ..wav2vec2.feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
anton-l marked this conversation as resolved.
Show resolved Hide resolved
from .configuration_sew import SEW_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWConfig

if is_torch_available():
from .modeling_sew import SEW_PRETRAINED_MODEL_ARCHIVE_LIST, SEWForCTC, SEWModel, SEWPreTrainedModel


else:
import sys

sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
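For context, the TYPE_CHECKING split is the library's standard lazy-import pattern: static type checkers see the real imports, while at runtime sys.modules[__name__] is replaced by a _LazyModule that imports a submodule only when one of its attributes is first accessed. A sketch of the effect:

import transformers.models.sew as sew  # cheap: only the lazy proxy is created
model_cls = sew.SEWModel               # attribute access triggers the import of modeling_sew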