-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2502 from zh794390558/u2pp_export
[s2t] streaming conformer u2 and u2pp jit export
- Loading branch information
Showing
32 changed files
with
1,142 additions
and
217 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
############################################ | ||
# Network Architecture # | ||
############################################ | ||
cmvn_file: | ||
cmvn_file_type: "json" | ||
# encoder related | ||
encoder: conformer | ||
encoder_conf: | ||
output_size: 512 # dimension of attention | ||
attention_heads: 8 | ||
linear_units: 2048 # the number of units of position-wise feed forward | ||
num_blocks: 12 # the number of encoder blocks | ||
dropout_rate: 0.1 | ||
positional_dropout_rate: 0.1 | ||
attention_dropout_rate: 0.0 | ||
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8 | ||
normalize_before: True | ||
use_cnn_module: True | ||
cnn_module_kernel: 15 | ||
activation_type: swish | ||
pos_enc_layer_type: rel_pos | ||
selfattention_layer_type: rel_selfattn | ||
causal: true | ||
use_dynamic_chunk: true | ||
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster | ||
use_dynamic_left_chunk: false | ||
# decoder related | ||
decoder: transformer | ||
decoder_conf: | ||
attention_heads: 8 | ||
linear_units: 2048 | ||
num_blocks: 6 | ||
dropout_rate: 0.1 | ||
positional_dropout_rate: 0.1 | ||
self_attention_dropout_rate: 0.0 | ||
src_attention_dropout_rate: 0.0 | ||
|
||
# hybrid CTC/attention | ||
model_conf: | ||
ctc_weight: 0.3 | ||
lsm_weight: 0.1 # label smoothing option | ||
reverse_weight: 0.0 # unidecoder | ||
length_normalized_loss: false | ||
init_type: 'kaiming_uniform' | ||
|
||
# https://yaml.org/type/float.html | ||
########################################### | ||
# Data # | ||
########################################### | ||
train_manifest: data/train_l/data.list | ||
dev_manifest: data/dev/data.list | ||
test_manifest: data/test_meeting/data.list | ||
|
||
########################################### | ||
# Dataloader # | ||
########################################### | ||
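use_streaming_data: True # NOTE(review): chunk_conformer_u2pp.yaml in this commit spells this key `use_stream_data` - confirm which name the dataloader reads | ||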
unit_type: 'char' | ||
vocab_filepath: data/lang_char/vocab.txt | ||
preprocess_config: conf/preprocess.yaml | ||
spm_model_prefix: '' | ||
feat_dim: 80 | ||
stride_ms: 10.0 | ||
window_ms: 25.0 | ||
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs | ||
batch_size: 32 | ||
do_filter: True | ||
maxlen_in: 1200 # if do_filter == False && input length > maxlen-in, batchsize is automatically reduced | ||
maxlen_out: 100 # if do_filter == False && output length > maxlen-out, batchsize is automatically reduced | ||
minlen_in: 10 | ||
minlen_out: 0 | ||
minibatches: 0 # for debug | ||
batch_count: auto | ||
batch_bins: 0 | ||
batch_frames_in: 0 | ||
batch_frames_out: 0 | ||
batch_frames_inout: 0 | ||
num_workers: 0 | ||
subsampling_factor: 1 | ||
num_encs: 1 | ||
|
||
|
||
########################################### | ||
# Training # | ||
########################################### | ||
n_epoch: 26 | ||
accum_grad: 32 | ||
global_grad_clip: 5.0 | ||
dist_sampler: True | ||
log_interval: 1 | ||
checkpoint: | ||
kbest_n: 50 | ||
latest_n: 5 | ||
optim: adam | ||
optim_conf: | ||
lr: 0.001 | ||
weight_decay: 1.0e-6 | ||
scheduler: warmuplr | ||
scheduler_conf: | ||
warmup_steps: 5000 | ||
lr_decay: 1.0 |
100 changes: 100 additions & 0 deletions
100
examples/wenetspeech/asr1/conf/chunk_conformer_u2pp.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
############################################ | ||
# Network Architecture # | ||
############################################ | ||
cmvn_file: | ||
cmvn_file_type: "json" | ||
# encoder related | ||
encoder: conformer | ||
encoder_conf: | ||
output_size: 512 # dimension of attention | ||
attention_heads: 8 | ||
linear_units: 2048 # the number of units of position-wise feed forward | ||
num_blocks: 12 # the number of encoder blocks | ||
dropout_rate: 0.1 | ||
positional_dropout_rate: 0.1 | ||
attention_dropout_rate: 0.1 | ||
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8 | ||
normalize_before: True | ||
use_cnn_module: True | ||
cnn_module_kernel: 15 | ||
activation_type: swish | ||
pos_enc_layer_type: rel_pos | ||
selfattention_layer_type: rel_selfattn | ||
causal: true | ||
use_dynamic_chunk: true | ||
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster | ||
use_dynamic_left_chunk: false | ||
# decoder related | ||
decoder: bitransformer | ||
decoder_conf: | ||
attention_heads: 8 | ||
linear_units: 2048 | ||
num_blocks: 3 # the number of decoder blocks | ||
r_num_blocks: 3 #only for bitransformer | ||
dropout_rate: 0.1 | ||
positional_dropout_rate: 0.1 | ||
self_attention_dropout_rate: 0.1 | ||
src_attention_dropout_rate: 0.1 | ||
|
||
# hybrid CTC/attention | ||
model_conf: | ||
ctc_weight: 0.3 | ||
lsm_weight: 0.1 # label smoothing option | ||
length_normalized_loss: false | ||
reverse_weight: 0.3 # only for bitransformer decoder | ||
init_type: 'kaiming_uniform' # !Warning: needed for convergence | ||
|
||
########################################### | ||
# Data # | ||
########################################### | ||
train_manifest: data/train_l/data.list | ||
dev_manifest: data/dev/data.list | ||
test_manifest: data/test_meeting/data.list | ||
|
||
########################################### | ||
# Dataloader # | ||
########################################### | ||
use_stream_data: True | ||
vocab_filepath: data/lang_char/vocab.txt | ||
unit_type: 'char' | ||
preprocess_config: conf/preprocess.yaml | ||
spm_model_prefix: '' | ||
feat_dim: 80 | ||
stride_ms: 10.0 | ||
window_ms: 25.0 | ||
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs | ||
batch_size: 32 | ||
do_filter: True | ||
maxlen_in: 1200 # if do_filter == False && input length > maxlen-in, batchsize is automatically reduced | ||
maxlen_out: 100 # if do_filter == False && output length > maxlen-out, batchsize is automatically reduced | ||
minlen_in: 10 | ||
minlen_out: 0 | ||
minibatches: 0 # for debug | ||
batch_count: auto | ||
batch_bins: 0 | ||
batch_frames_in: 0 | ||
batch_frames_out: 0 | ||
batch_frames_inout: 0 | ||
num_workers: 0 | ||
subsampling_factor: 1 | ||
num_encs: 1 | ||
|
||
########################################### | ||
# Training # | ||
########################################### | ||
n_epoch: 150 | ||
accum_grad: 8 | ||
global_grad_clip: 5.0 | ||
dist_sampler: False | ||
optim: adam | ||
optim_conf: | ||
lr: 0.002 | ||
weight_decay: 1.0e-6 | ||
scheduler: warmuplr | ||
scheduler_conf: | ||
warmup_steps: 25000 | ||
lr_decay: 1.0 | ||
log_interval: 100 | ||
checkpoint: | ||
kbest_n: 50 | ||
latest_n: 5 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
beam_size: 10 | ||
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' | ||
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. | ||
reverse_weight: 0.3 # reverse weight for attention rescoring decode mode. | ||
decoding_chunk_size: 16 # decoding chunk size. Defaults to -1. | ||
# <0: for decoding, use full chunk. | ||
# >0: for decoding, use fixed chunk size as set. | ||
# 0: used for training, it's prohibited here. | ||
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. | ||
simulate_streaming: True # simulate streaming inference. Defaults to False. | ||
decode_batch_size: 128 | ||
error_rate_type: cer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,12 @@ | ||
decode_batch_size: 128 | ||
error_rate_type: cer | ||
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' | ||
beam_size: 10 | ||
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' | ||
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. | ||
reverse_weight: 0.3 # reverse weight for attention rescoring decode mode. | ||
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. | ||
# <0: for decoding, use full chunk. | ||
# >0: for decoding, use fixed chunk size as set. | ||
# 0: used for training, it's prohibited here. | ||
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. | ||
simulate_streaming: False # simulate streaming inference. Defaults to False. | ||
simulate_streaming: False # simulate streaming inference. Defaults to False. | ||
decode_batch_size: 128 | ||
error_rate_type: cer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
#!/bin/bash
#
# Decode a single audio file with ${BIN_DIR}/quant.py using the
# attention_rescoring decoding method and a decode batch size of 1.
#
# Usage:
#   <this script> config_path decode_config_path ckpt_path_prefix audio_file
#
# Environment:
#   BIN_DIR               directory that contains quant.py (required).
#   CUDA_VISIBLE_DEVICES  comma-separated device list; the number of entries
#                         is forwarded to quant.py as --ngpu (0 when empty or
#                         unset).
#
# Output:
#   ${ckpt_path_prefix}/<decoding method>.rsl, passed to quant.py as
#   --result_file. The directory ${ckpt_path_prefix} is created if missing.
#
# Exit status:
#   0 on success; 1 on bad usage, missing BIN_DIR, download failure,
#   missing audio file, or a failed decoding run.

if [ "$#" -ne 4 ]; then
    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" >&2
    exit 1
fi

# Without BIN_DIR the command below would silently resolve to "/quant.py".
if [ -z "${BIN_DIR:-}" ]; then
    echo "BIN_DIR is not set; it must point to the directory containing quant.py" >&2
    exit 1
fi

# Count the comma-separated entries; awk prints 0 for an empty line.
ngpu=$(echo "${CUDA_VISIBLE_DEVICES:-}" | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
decode_config_path=$2
ckpt_prefix=$3
audio_file=$4

# Fetch the demo recording into data/; -nc skips the download when the file
# is already there.
mkdir -p data
if ! wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/; then
    echo "Failed to download the demo audio file!" >&2
    exit 1
fi

if [ ! -f "${audio_file}" ]; then
    echo "Please input the right audio_file path" >&2
    exit 1
fi

# download language model
#bash local/download_lm_ch.sh
#if [ $? -ne 0 ]; then
#    exit 1
#fi

for type in attention_rescoring; do
    echo "decoding ${type}"
    batch_size=1
    # Results are written into a directory named after the checkpoint prefix.
    output_dir="${ckpt_prefix}"
    mkdir -p "${output_dir}"
    if ! python3 -u "${BIN_DIR}/quant.py" \
        --ngpu "${ngpu}" \
        --config "${config_path}" \
        --decode_cfg "${decode_config_path}" \
        --result_file "${output_dir}/${type}.rsl" \
        --checkpoint_path "${ckpt_prefix}" \
        --opts decode.decoding_method "${type}" \
        --opts decode.decode_batch_size "${batch_size}" \
        --audio_file "${audio_file}"; then
        echo "Failed in evaluation!" >&2
        exit 1
    fi
done
exit 0
Oops, something went wrong.