Merge pull request #33 from prophesier/no_fs2_1128

No fs2 12.4
prophesier · Dec 4, 2022 · 760ed03 · 760ed03
2 parents 18bc38d + d251677
commit 760ed03
Show file tree

Hide file tree

Showing 9 changed files with 105 additions and 116 deletions.
diff --git a/.gitignore b/.gitignore
@@ -15,4 +15,6 @@ WPy64-38100
 Winpython64-3.8.10.0dot.exe
 *.pkf
 *.wav
-*.json
+*.json
+*.flac
+*.xmp
diff --git a/README.md b/README.md
@@ -2,7 +2,9 @@
 Singing Voice Conversion via diffusion model
 
 ## updates:
->2022.11.23 修复了一个重大bug，曾导致可能将用于推理的原始gt音频转变采样率为22.05kHz,对于由此造成的影响我们表示十分抱歉，请务必检查自己的测试音频，并使用更新后的代码\
+>2022.12.4 44.1kHz声码器开放申请，正式提供对44.1kHz的支持\
+2022.11.28 增加了默认打开的no_fs2选项，可优化部分网络，提升训练速度、缩减模型体积，对于未来新训练的模型有效\
+2022.11.23 修复了一个重大bug，曾导致可能将用于推理的原始gt音频转变采样率为22.05kHz,对于由此造成的影响我们表示十分抱歉，请务必检查自己的测试音频，并使用更新后的代码\
 2022.11.22 修复了很多bug，其中有几个影响推理效果重大的bug\
 2022.11.20 增加对推理时多数格式的输入和保存，无需手动借助其他软件转换\
 2022.11.13 修正中断后读取模型的epoch/steps显示问题，添加f0处理的磁盘缓存，添加实时变声推理的支持文件\
@@ -46,6 +48,8 @@ CUDA_VISIBLE_DEVICES=0 python run.py --config training/config.yaml --exp_name [y
 >目前本项目已在众多数据集进行过训练和测试。部分ckpt文件、demo音频和推理训练所需的其他文件请在下方QQ频道内下载\
 使用QQ扫描此二维码(如不能加入，请尝试一个合适的网络环境):
 <img src="./ckpt.jpg" width=256/>
+For English support, you can join our Discord server:
+https://discord.gg/jvA5c2xzSE
 
 ## Acknowledgements
 >项目基于[diffsinger](https://github.com/MoonInTheRiver/DiffSinger)、[diffsinger(openvpi维护版)](https://github.com/openvpi/DiffSinger)、[soft-vc](https://github.com/bshall/soft-vc)开发.\

diff --git a/doc/train_and_inference.markdown b/doc/train_and_inference.markdown
@@ -12,7 +12,7 @@ pip install -r requirements_short.txt
 >3. 根目录下有一份@三千整理的依赖列表requirements.png，是在某品牌云服务器上跑通的，不过此torch版本已不兼容目前版本代码,但是其他部分版本可以参考，十分感谢
 
 ## 1.推理
->使用根目录下的inference.ipynb进行推理或使用经过po主适配的@小狼的infer.py\
+>使用根目录下的inference.ipynb进行推理或使用经过作者适配的@小狼的infer.py\
 在第一个block中修改如下参数：
 ```
 config_path='checkpoints压缩包中config.yaml的位置'
@@ -48,9 +48,10 @@ key=0
 #变调参数，默认为0(不是1!!)，将源音频的音高升高key个半音后合成，如男声转女声，可填入8或者12等(12就是升高一整个8度)
 
 use_pe=True
-#梅尔谱合成音频时使用的F0提取算法，如果改成False将使用源音频的F0
-这里填True和False合成会略有差异，通常是True会好些，但也不尽然，对合成速度几乎无影响
-(无论key填什么 这里都是可以自由选择的，不影响)
+#梅尔谱合成音频时使用的F0提取算法，如果改成False将使用源音频的F0\
+这里填True和False合成会略有差异，通常是True会好些，但也不尽然，对合成速度几乎无影响\
+(无论key填什么 这里都是可以自由选择的，不影响)\
+44.1kHz下不支持此功能，会自动关闭，开着也不报错就是了
 
 use_gt_mel=False
 #这个选项类似于AI画图的图生图功能，如果打开，产生的音频将是输入声音与目标说话人声音的混合，混合比例由下一个参数确定
@@ -59,7 +60,7 @@ use_gt_mel=False
 add_noise_step=500
 #与上个参数有关，控制两种声音的比例，填入1是完全的源声线，填入1000是完全的目标声线，能听出来是两者均等混合的数值大约在300附近(并不是线性的，另外这个参数如果调的很小，可以把pndm加速倍率调低，增加合成质量)
 
-wav_gen='yyy.wav'#输出音频的路径，默认在项目根目录中
+wav_gen='yyy.wav'#输出音频的路径，默认在项目根目录中，可通过改变扩展名更改保存文件类型
 ```
 如果使用infer.py，修改方式类似，需要修改__name__=='__main__'中的部分，然后在根目录中执行\
 python infer.py\
@@ -68,10 +69,10 @@ python infer.py\
 ### 2.1 准备数据
 >目前支持wav格式和ogg格式的音频数据，采样率最好高于24kHz，程序会自动处理采样率和声道问题。采样率不可低于16kHz（一般不会的）\
 音频需要切片为5-15s为宜的短音频，长度没有具体要求，但不宜过长过短。音频需要为纯目标人干声，不可以有背景音乐和其他人声音，最好也不要有过重的混响等。若经过去伴奏等处理，请尽量保证处理后的音频质量。\
-目前仅支持单人训练，总时长尽量保证在5h或以上，不需要额外任何标注，将音频文件放在下述raw_data_dir下即可，这个目录下的结构可以自由定义，程序会自主找到所需文件。
+目前仅支持单人训练，总时长尽量保证在3h或以上，不需要额外任何标注，将音频文件放在下述raw_data_dir下即可，这个目录下的结构可以自由定义，程序会自主找到所需文件。
 
 ### 2.2 修改超参数配置
->首先请备份一份config.yaml，然后修改它：\
+>首先请备份一份config.yaml(此文件对应24kHz声码器, 44.1kHz声码器请使用config_nsf.yaml)，然后修改它\
 可能会用到的参数如下(以工程名为nyaru为例):
 ```
 K_step: 1000
@@ -122,19 +123,29 @@ pe_ckpt: checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
 raw_data_dir: data/raw/nyaru
 #存放预处理前原始数据的位置，请将原始wav数据放在这个目录下，内部文件结构无所谓，会自动解构
 
+residual_channels: 384
+residual_layers: 20
+#控制核心网络规模的一组参数，越大参数越多炼的越慢，但效果不一定会变好，大一点的数据集可以把第一个改成512。这个可以自行实验效果，不过不了解的话尽量不动。
+
 speaker_id: nyaru
-训练的说话人名字，目前只支持单说话人，请在这里填写
+#训练的说话人名字，目前只支持单说话人，请在这里填写（只是观赏作用，没有实际意义的参数）
 
 use_crepe: true
 #在数据预处理中使用crepe提取F0,追求效果请打开，追求速度可以关闭
 
 val_check_interval: 2000
-每2000steps推理测试集并保存ckpt
+#每2000steps推理测试集并保存ckpt
+
+vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
+#24kHz下为对应声码器的目录, 44.1kHz下为对应声码器的文件名, 注意不要填错
 
 work_dir: checkpoints/nyaru
-#修改后缀为工程名
+#修改后缀为工程名(也可以删掉或完全留空自动生成，但别乱填)
+no_fs2: true
+#对网络encoder的精简，能缩减模型体积，加快训练，且并未发现有对网络表现损害的直接证据。默认打开
+
 ```
->其他的参数如果你不知道它是做什么的，请不要修改
+>其他的参数如果你不知道它是做什么的，请不要修改，即使你看着名称可能以为你知道它是做什么的。
 
 ### 2.3 数据预处理
 在diff-svc的目录下执行以下命令：\
@@ -196,4 +207,4 @@ torch版本过低，请更换高版本torch
 检查是否在配置中开启了use_crepe，将其关闭可显著提升速度。\
 检查配置中hubert_gpu是否开启。
 
-如有其他问题，请扫描github仓库界面下方的二维码询问。
+如有其他问题，请加入QQ频道或discord频道询问。
diff --git a/infer.py b/infer.py
@@ -11,12 +11,12 @@
 from infer_tools.infer_tool import Svc
 from utils.hparams import hparams
 
-chunks_dict = infer_tool.read_temp("./infer_tools/chunks_temp.json")
+chunks_dict = infer_tool.read_temp("./infer_tools/new_chunks_temp.json")
 
 
 def run_clip(svc_model, key, acc, use_pe, use_crepe, thre, use_gt_mel, add_noise_step, project_name='', f_name=None,
-             file_path=None, out_path=None,**kwargs):
-    print(f'code version:2022-11-23 v2')
+             file_path=None, out_path=None, slice_db=-40,**kwargs):
+    print(f'code version:2022-11-30 v2')
     use_pe = use_pe if hparams['audio_sample_rate'] == 24000 else False
     if file_path is None:
         raw_audio_path = f"./raw/{f_name}"
@@ -33,25 +33,24 @@ def run_clip(svc_model, key, acc, use_pe, use_crepe, thre, use_gt_mel, add_noise
         print("load chunks from temp")
         chunks = chunks_dict[wav_hash]["chunks"]
     else:
-        chunks = slicer.cut(wav_path)
+        chunks = slicer.cut(wav_path, db_thresh=slice_db)
     chunks_dict[wav_hash] = {"chunks": chunks, "time": int(time.time())}
-    infer_tool.write_temp("./infer_tools/chunks_temp.json", chunks_dict)
+    infer_tool.write_temp("./infer_tools/new_chunks_temp.json", chunks_dict)
     audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
 
     count = 0
     f0_tst = []
     f0_pred = []
     audio = []
-    epsilon = 0.0002
-    for data in audio_data:
+    for (slice_tag, data) in audio_data:
         print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
         length = int(np.ceil(len(data) / audio_sr * hparams['audio_sample_rate']))
         raw_path = io.BytesIO()
         soundfile.write(raw_path, data, audio_sr, format="wav")
         if hparams['debug']:
             print(np.mean(data), np.var(data))
         raw_path.seek(0)
-        if np.var(data) < epsilon:
+        if slice_tag:
             print('jump empty segment')
             _f0_tst, _f0_pred, _audio = (
                 np.zeros(int(np.ceil(length / hparams['hop_size']))), np.zeros(int(np.ceil(length / hparams['hop_size']))),

diff --git a/infer_tools/infer_tool.py b/infer_tools/infer_tool.py
@@ -3,14 +3,14 @@
 import os
 import time
 from io import BytesIO
+from pathlib import Path
 
 import librosa
 import numpy as np
 import soundfile
 import torch
 
 import utils
-from pathlib import Path
 from modules.fastspeech.pe import PitchExtractor
 from network.diff.candidate_decoder import FFT
 from network.diff.diffusion import GaussianDiffusion
@@ -21,22 +21,30 @@
 from utils.hparams import hparams, set_hparams
 from utils.pitch_utils import denorm_f0, norm_interp_f0
 
+if os.path.exists("chunks_temp.json"):
+    os.remove("chunks_temp.json")
+
 
 def read_temp(file_name):
     if not os.path.exists(file_name):
         with open(file_name, "w") as f:
             f.write(json.dumps({"info": "temp_dict"}))
         return {}
     else:
-        with open(file_name, "r") as f:
-            data = f.read()
-        data_dict = json.loads(data)
-        if os.path.getsize(file_name) > 50 * 1024 * 1024:
-            f_name = file_name.split("/")[-1]
-            print(f"clean {f_name}")
-            for wav_hash in list(data_dict.keys()):
-                if int(time.time()) - int(data_dict[wav_hash]["time"]) > 14 * 24 * 3600:
-                    del data_dict[wav_hash]
+        try:
+            with open(file_name, "r") as f:
+                data = f.read()
+            data_dict = json.loads(data)
+            if os.path.getsize(file_name) > 50 * 1024 * 1024:
+                f_name = file_name.split("/")[-1]
+                print(f"clean {f_name}")
+                for wav_hash in list(data_dict.keys()):
+                    if int(time.time()) - int(data_dict[wav_hash]["time"]) > 14 * 24 * 3600:
+                        del data_dict[wav_hash]
+        except Exception as e:
+            print(e)
+            print(f"{file_name} error,auto rebuild file")
+            data_dict = {"info": "temp_dict"}
         return data_dict
 
 
@@ -135,6 +143,7 @@ def infer(self, in_path, key, acc, use_pe=True, use_crepe=True, thre=0.05, singe
         spk_embed = batch.get('spk_embed') if not hparams['use_spk_id'] else batch.get('spk_ids')
         hubert = batch['hubert']
         ref_mels = batch["mels"]
+        energy=batch['energy']
         mel2ph = batch['mel2ph']
         batch['f0'] = batch['f0'] + (key / 12)
         batch['f0'][batch['f0']>np.log2(hparams['f0_max'])]=0
@@ -143,7 +152,7 @@ def infer(self, in_path, key, acc, use_pe=True, use_crepe=True, thre=0.05, singe
         @timeit
         def diff_infer():
             outputs = self.model(
-                hubert.cuda(), spk_embed=spk_embed, mel2ph=mel2ph.cuda(), f0=f0.cuda(), uv=uv.cuda(),
+                hubert.cuda(), spk_embed=spk_embed, mel2ph=mel2ph.cuda(), f0=f0.cuda(), uv=uv.cuda(),energy=energy.cuda(),
                 ref_mels=ref_mels.cuda(),
                 infer=True, **kwargs)
             return outputs

diff --git a/infer_tools/slicer.py b/infer_tools/slicer.py
@@ -1,9 +1,6 @@
-import os.path
 import time
-from argparse import ArgumentParser
 
 import numpy as np
-import soundfile
 import torch
 import torchaudio
 from scipy.ndimage import maximum_filter1d, uniform_filter1d
@@ -107,59 +104,25 @@ def slice(self, audio):
             split_loc_l = split_win_l + np.argmin(abs_amp[split_win_l: split_win_l + self.win_sn])
             sil_tags.append((split_loc_l, samples.shape[0]))
         if len(sil_tags) == 0:
-            return [len(audio)]
+            return {"0": {"slice": False, "split_time": f"0,{len(audio)}"}}
         else:
             chunks = []
+            # 第一段静音并非从头开始，补上有声片段
+            if sil_tags[0][0]:
+                chunks.append({"slice": False, "split_time": f"0,{sil_tags[0][0]}"})
             for i in range(0, len(sil_tags)):
-                chunks.append(int((sil_tags[i][0] + sil_tags[i][1]) / 2))
-            return chunks
-
-
-def main():
-    parser = ArgumentParser()
-    parser.add_argument('audio', type=str, help='The audio to be sliced')
-    parser.add_argument('--out_name', type=str, help='Output directory of the sliced audio clips')
-    parser.add_argument('--out', type=str, help='Output directory of the sliced audio clips')
-    parser.add_argument('--db_thresh', type=float, required=False, default=-40,
-                        help='The dB threshold for silence detection')
-    parser.add_argument('--min_len', type=int, required=False, default=5000,
-                        help='The minimum milliseconds required for each sliced audio clip')
-    parser.add_argument('--win_l', type=int, required=False, default=300,
-                        help='Size of the large sliding window, presented in milliseconds')
-    parser.add_argument('--win_s', type=int, required=False, default=20,
-                        help='Size of the small sliding window, presented in milliseconds')
-    parser.add_argument('--max_sil_kept', type=int, required=False, default=500,
-                        help='The maximum silence length kept around the sliced audio, presented in milliseconds')
-    args = parser.parse_args()
-    out = args.out
-    if out is None:
-        out = os.path.dirname(os.path.abspath(args.audio))
-    audio, sr = torchaudio.load(args.audio)
-    if len(audio.shape) == 2 and audio.shape[1] >= 2:
-        audio = torch.mean(audio, dim=0).unsqueeze(0)
-    audio = audio.cpu().numpy()[0]
-
-    slicer = Slicer(
-        sr=sr,
-        db_threshold=args.db_thresh,
-        min_length=args.min_len,
-        win_l=args.win_l,
-        win_s=args.win_s,
-        max_silence_kept=args.max_sil_kept
-    )
-    chunks = slicer.slice(audio)
-    if not os.path.exists(args.out):
-        os.makedirs(args.out)
-    start = 0
-    end_id = 0
-    for i, chunk in enumerate(chunks):
-        end = chunk
-        soundfile.write(os.path.join(out, f'%s-%s.wav' % (args.out_name, str(i).zfill(2))), audio[start:end], sr)
-        start = end
-        end_id = i + 1
-    if start != len(audio):
-        soundfile.write(os.path.join(out, f'%s-%s.wav' % (args.out_name, str(end_id).zfill(2))),
-                        audio[start:len(audio)], sr)
+                # 标识有声片段（跳过第一段）
+                if i:
+                    chunks.append({"slice": False, "split_time": f"{sil_tags[i - 1][1]},{sil_tags[i][0]}"})
+                # 标识所有静音片段
+                chunks.append({"slice": True, "split_time": f"{sil_tags[i][0]},{sil_tags[i][1]}"})
+            # 最后一段静音并非结尾，补上结尾片段
+            if sil_tags[-1][1] != len(audio):
+                chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1]},{len(audio)}"})
+            chunk_dict = {}
+            for i in range(len(chunks)):
+                chunk_dict[str(i)] = chunks[i]
+            return chunk_dict
 
 
 def cut(audio_path, db_thresh=-30, min_len=5000, win_l=300, win_s=20, max_sil_kept=500):
@@ -181,20 +144,15 @@ def cut(audio_path, db_thresh=-30, min_len=5000, win_l=300, win_s=20, max_sil_ke
 
 
 def chunks2audio(audio_path, chunks):
+    chunks = dict(chunks)
     audio, sr = torchaudio.load(audio_path)
     if len(audio.shape) == 2 and audio.shape[1] >= 2:
         audio = torch.mean(audio, dim=0).unsqueeze(0)
     audio = audio.cpu().numpy()[0]
-    start = 0
     result = []
-    for i, chunk in enumerate(chunks):
-        end = chunk
-        result.append(audio[start:end])
-        start = end
-    if start != len(audio):
-        result.append(audio[start:len(audio)])
+    for k, v in chunks.items():
+        tag = v["split_time"].split(",")
+        result.append((v["slice"], audio[int(tag[0]):int(tag[1])]))
     return result, sr
 
 
-if __name__ == '__main__':
-    main()