Skip to content

Commit

Permalink
[doc] update wav2vec2 demos README.md, test=doc (#2674)
Browse files Browse the repository at this point in the history
* fix wav2vec2 demos, test=doc

* fix wav2vec2 demos, test=doc

* fix enc_dropout and nor.py, test=asr
  • Loading branch information
Zth9730 authored Nov 22, 2022
1 parent b71f142 commit fc02cd0
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 8 deletions.
2 changes: 1 addition & 1 deletion demos/speech_ssl/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
Output:
```bash
ASR Result:
我认为跑步最重要的就是给我带来了身体健康
i knocked at the door on the ancient side of the building

Representation:
Tensor(shape=[1, 164, 1024], dtype=float32, place=Place(gpu:0), stop_gradient=True,
Expand Down
8 changes: 4 additions & 4 deletions demos/speech_ssl/README_cn.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
```
参数:
- `input`(必须输入):用于识别的音频文件。
- `model`:ASR 任务的模型,默认值:`conformer_wenetspeech`
- `model`:ASR 任务的模型,默认值:`wav2vec2ASR_librispeech`
- `task`:输出类别,默认值:`asr`
- `lang`:模型语言,默认值:`zh`
- `lang`:模型语言,默认值:`en`
- `sample_rate`:音频采样率,默认值:`16000`
- `config`:ASR 任务的参数文件,若不设置则使用预训练模型中的默认配置,默认值:`None`
- `ckpt_path`:模型参数文件,若不设置则下载预训练模型使用,默认值:`None`
Expand Down Expand Up @@ -83,8 +83,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
输出:
```bash
ASR Result:
我认为跑步最重要的就是给我带来了身体健康

i knocked at the door on the ancient side of the building
Representation:
Tensor(shape=[1, 164, 1024], dtype=float32, place=Place(gpu:0), stop_gradient=True,
[[[ 0.02351918, -0.12980647, 0.17868176, ..., 0.10118122,
Expand Down
5 changes: 2 additions & 3 deletions paddlespeech/s2t/models/wav2vec2/modules/VanillaNN.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def __init__(self,
dnn_neurons=512,
activation=True,
normalization=False,
dropout_rate=0.0):
dropout_rate=0.5):
super().__init__(input_shape=[None, None, input_shape])

if not isinstance(dropout_rate, list):
Expand All @@ -68,6 +68,5 @@ def __init__(self,
if activation:
self.append(paddle.nn.LeakyReLU(), layer_name="act")
self.append(
paddle.nn.Dropout(),
p=dropout_rate[block_index],
paddle.nn.Dropout(p=dropout_rate[block_index]),
layer_name='dropout')
97 changes: 97 additions & 0 deletions paddlespeech/s2t/models/wav2vec2/modules/normalization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Authors
# * Mirco Ravanelli 2020
# * Guillermo Cámbara 2021
# * Sarthak Yadav 2022
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from speechbrain(https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/nnet/normalization.py)
import paddle.nn as nn

from paddlespeech.s2t.modules.align import BatchNorm1D


class BatchNorm1d(nn.Layer):
    """Applies 1d batch normalization to the input tensor.

    Arguments
    ---------
    input_shape : tuple
        The expected shape of the input. Alternatively, use ``input_size``.
    input_size : int
        The expected size of the input. Alternatively, use ``input_shape``.
    eps : float
        This value is added to std deviation estimation to improve the numerical
        stability.
    momentum : float
        It is a value used for the running_mean and running_var computation.
        It is forwarded unchanged to the underlying ``BatchNorm1D`` layer.
    combine_batch_time : bool
        When true, it combines batch and time axis.
    skip_transpose : bool
        When true, the input is handed to the normalization layer as-is
        (no axis swap), and the number of features is read from
        ``input_shape[1]`` instead of ``input_shape[-1]``.

    Example
    -------
    >>> input = paddle.randn([100, 10])
    >>> norm = BatchNorm1d(input_shape=input.shape)
    >>> output = norm(input)
    >>> output.shape
    [100, 10]
    """

    def __init__(
            self,
            input_shape=None,
            input_size=None,
            eps=1e-05,
            momentum=0.9,
            combine_batch_time=False,
            skip_transpose=False, ):
        super().__init__()
        self.combine_batch_time = combine_batch_time
        self.skip_transpose = skip_transpose

        if input_size is None:
            if input_shape is None:
                # Fail early with a clear message instead of an opaque
                # "'NoneType' object is not subscriptable".
                raise ValueError(
                    "Expected either input_shape or input_size to be given.")
            # With skip_transpose the caller already provides channel-first
            # data, so the feature axis is axis 1; otherwise it is the last.
            input_size = input_shape[1] if skip_transpose else input_shape[-1]

        self.norm = BatchNorm1D(input_size, momentum=momentum, epsilon=eps)

    @staticmethod
    def _swap_feature_axis(x):
        """Swap axis 1 with the last axis; a no-op for tensors of rank <= 2."""
        if x.ndim <= 2:
            return x
        perm = list(range(x.ndim))
        perm[1], perm[-1] = perm[-1], perm[1]
        return x.transpose(perm)

    def forward(self, x):
        """Returns the normalized input tensor.

        Arguments
        ---------
        x : paddle.Tensor (batch, time, [channels])
            input to normalize. 2d or 3d tensors are expected in input
            4d tensors can be used when combine_batch_time=True.
        """
        shape_or = x.shape
        if self.combine_batch_time:
            # Tensor.reshape takes the target shape as ONE list argument;
            # passing the dimensions as separate positional arguments would
            # bind the second one to the ``name`` parameter.
            if x.ndim == 3:
                x = x.reshape([shape_or[0] * shape_or[1], shape_or[2]])
            else:
                x = x.reshape(
                    [shape_or[0] * shape_or[1], shape_or[3], shape_or[2]])
        elif not self.skip_transpose:
            # Move the feature axis next to the batch axis for the norm layer.
            # For 3d input this is exactly transpose([0, 2, 1]); 2d input is
            # already (batch, features) and is left untouched.
            x = self._swap_feature_axis(x)

        x_n = self.norm(x)

        if self.combine_batch_time:
            x_n = x_n.reshape(shape_or)
        elif not self.skip_transpose:
            # Undo the axis swap so the output layout matches the input.
            x_n = self._swap_feature_axis(x_n)

        return x_n

0 comments on commit fc02cd0

Please sign in to comment.