From aea75207ddae724ee74ab60e3ee2d54165da1706 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Fri, 24 Jan 2025 11:07:26 +0800 Subject: [PATCH] fix cache bug --- cosyvoice/transformer/upsample_encoder.py | 3 +- .../libritts/cosyvoice2/conf/cosyvoice2.yaml | 39 +++++++++---------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/cosyvoice/transformer/upsample_encoder.py b/cosyvoice/transformer/upsample_encoder.py index 6032cac4..92267a88 100644 --- a/cosyvoice/transformer/upsample_encoder.py +++ b/cosyvoice/transformer/upsample_encoder.py @@ -396,6 +396,7 @@ def forward_chunk( encoders_kv_cache_list = [] for index, layer in enumerate(self.encoders): xs, chunk_masks, encoders_kv_cache_new, _ = layer(xs, chunk_masks, pos_emb, mask_pad, encoders_kv_cache[index]) + encoders_kv_cache_list.append(encoders_kv_cache_new) encoders_kv_cache = torch.stack(encoders_kv_cache_list, dim=0) # upsample @@ -426,4 +427,4 @@ def forward_chunk( # Here we assume the mask is not changed in encoder layers, so just # return the masks before encoder layers, and the masks will be used # for cross attention with decoder later - return xs, masks, (offset, pre_lookahead_layer_conv2_cache, encoders_kv_cache_new, upsample_offset, upsample_conv_cache, upsample_kv_cache_new) + return xs, masks, (offset, pre_lookahead_layer_conv2_cache, encoders_kv_cache, upsample_offset, upsample_conv_cache, upsample_kv_cache) diff --git a/examples/libritts/cosyvoice2/conf/cosyvoice2.yaml b/examples/libritts/cosyvoice2/conf/cosyvoice2.yaml index f989e3e4..3e9defea 100644 --- a/examples/libritts/cosyvoice2/conf/cosyvoice2.yaml +++ b/examples/libritts/cosyvoice2/conf/cosyvoice2.yaml @@ -56,7 +56,7 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec input_size: 512 use_cnn_module: False macaron_style: False - use_dynamic_chunk: True + static_chunk_size: !ref # try making UpsampleConformerEncoder static as well decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM in_channels: 240 n_spks: 1 @@ -154,12 
+154,9 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram center: False compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank feat_extractor: !ref -# pitch_extractor: !name:torchaudio.functional.compute_kaldi_pitch # TODO need to replace it -# sample_rate: !ref -# frame_length: 46.4 # match feat_extractor win_size/sampling_rate -# frame_shift: 11.6 # match feat_extractor hop_size/sampling_rate -# compute_f0: !name:cosyvoice.dataset.processor.compute_f0 -# pitch_extractor: !ref +compute_f0: !name:cosyvoice.dataset.processor.compute_f0 + sample_rate: !ref + hop_size: 480 parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding normalize: True shuffle: !name:cosyvoice.dataset.processor.shuffle @@ -186,20 +183,20 @@ data_pipeline: [ !ref , !ref , ] -# data_pipeline_gan: [ -# !ref , -# !ref , -# !ref , -# !ref , -# !ref , -# !ref , -# !ref , -# !ref , -# !ref , -# !ref , -# !ref , -# !ref , -# ] +data_pipeline_gan: [ + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , +] # llm flow train conf train_conf: