
Commit

Merge pull request #30 from IceKyrin/main
fix cut_wav/load_json bug
prophesier authored Nov 30, 2022
2 parents 88c73d8 + e4149f8 commit 3edb72e
Showing 3 changed files with 44 additions and 77 deletions.
13 changes: 6 additions & 7 deletions infer.py
@@ -11,11 +11,11 @@
 from infer_tools.infer_tool import Svc
 from utils.hparams import hparams
 
-chunks_dict = infer_tool.read_temp("./infer_tools/chunks_temp.json")
+chunks_dict = infer_tool.read_temp("./infer_tools/new_chunks_temp.json")
 
 
 def run_clip(svc_model, key, acc, use_pe, use_crepe, thre, use_gt_mel, add_noise_step, project_name='', f_name=None,
-             file_path=None, out_path=None,**kwargs):
+             file_path=None, out_path=None, slice_db=-40,**kwargs):
     print(f'code version:2022-11-23 v2')
     use_pe = use_pe if hparams['audio_sample_rate'] == 24000 else False
     if file_path is None:
@@ -33,25 +33,24 @@ def run_clip(svc_model, key, acc, use_pe, use_crepe, thre, use_gt_mel, add_noise
         print("load chunks from temp")
         chunks = chunks_dict[wav_hash]["chunks"]
     else:
-        chunks = slicer.cut(wav_path)
+        chunks = slicer.cut(wav_path, db_thresh=slice_db)
         chunks_dict[wav_hash] = {"chunks": chunks, "time": int(time.time())}
-        infer_tool.write_temp("./infer_tools/chunks_temp.json", chunks_dict)
+        infer_tool.write_temp("./infer_tools/new_chunks_temp.json", chunks_dict)
     audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
 
     count = 0
     f0_tst = []
     f0_pred = []
     audio = []
-    epsilon = 0.00002
-    for data in audio_data:
+    for (slice_tag, data) in audio_data:
         print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
         length = int(np.ceil(len(data) / audio_sr * hparams['audio_sample_rate']))
         raw_path = io.BytesIO()
         soundfile.write(raw_path, data, audio_sr, format="wav")
         if hparams['debug']:
             print(np.mean(data), np.var(data))
         raw_path.seek(0)
-        if np.var(data) < epsilon:
+        if slice_tag:
             print('jump empty segment')
             _f0_tst, _f0_pred, _audio = (
                 np.zeros(int(np.ceil(length / hparams['hop_size']))), np.zeros(int(np.ceil(length / hparams['hop_size']))),
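
With these changes, run_clip no longer guesses at silence via a variance threshold: the slicer tags each segment itself, and the new slice_db argument is forwarded to it as db_thresh. A minimal sketch of the new consumption pattern (the file path and threshold value are illustrative, not taken from the diff):

from infer_tools import slicer

wav_path = "raw/test_input.wav"  # hypothetical input file
# slice_db now reaches the slicer as db_thresh instead of being ignored
chunks = slicer.cut(wav_path, db_thresh=-40)
audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)

for slice_tag, data in audio_data:
    if slice_tag:  # True marks a segment the slicer classified as silence
        print("jump empty segment")
        continue
    print(f"voiced segment: {len(data) / audio_sr:.3f}s")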
28 changes: 18 additions & 10 deletions infer_tools/infer_tool.py
@@ -3,14 +3,14 @@
 import os
 import time
 from io import BytesIO
+from pathlib import Path
 
 import librosa
 import numpy as np
 import soundfile
 import torch
 
 import utils
-from pathlib import Path
 from modules.fastspeech.pe import PitchExtractor
 from network.diff.candidate_decoder import FFT
 from network.diff.diffusion import GaussianDiffusion
@@ -21,22 +21,30 @@
 from utils.hparams import hparams, set_hparams
 from utils.pitch_utils import denorm_f0, norm_interp_f0
 
+if os.path.exists("chunks_temp.json"):
+    os.remove("chunks_temp.json")
+
 
 def read_temp(file_name):
     if not os.path.exists(file_name):
         with open(file_name, "w") as f:
             f.write(json.dumps({"info": "temp_dict"}))
         return {}
     else:
-        with open(file_name, "r") as f:
-            data = f.read()
-            data_dict = json.loads(data)
-        if os.path.getsize(file_name) > 50 * 1024 * 1024:
-            f_name = file_name.split("/")[-1]
-            print(f"clean {f_name}")
-            for wav_hash in list(data_dict.keys()):
-                if int(time.time()) - int(data_dict[wav_hash]["time"]) > 14 * 24 * 3600:
-                    del data_dict[wav_hash]
+        try:
+            with open(file_name, "r") as f:
+                data = f.read()
+                data_dict = json.loads(data)
+            if os.path.getsize(file_name) > 50 * 1024 * 1024:
+                f_name = file_name.split("/")[-1]
+                print(f"clean {f_name}")
+                for wav_hash in list(data_dict.keys()):
+                    if int(time.time()) - int(data_dict[wav_hash]["time"]) > 14 * 24 * 3600:
+                        del data_dict[wav_hash]
+        except Exception as e:
+            print(e)
+            print(f"{file_name} error,auto rebuild file")
+            data_dict = {"info": "temp_dict"}
         return data_dict
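
The try/except is the load_json fix named in the commit title: a truncated or corrupted temp file used to raise in json.loads and abort inference, whereas it is now reported and the cache rebuilt. A minimal sketch of that recovery path, assuming infer_tools.infer_tool is importable (the local file name is illustrative):

from infer_tools.infer_tool import read_temp

# Simulate a half-written cache file
with open("new_chunks_temp.json", "w") as f:
    f.write('{"info": "temp_dict"')  # invalid JSON: unclosed brace

# Prints the JSONDecodeError and the "auto rebuild file" notice,
# then returns a fresh {"info": "temp_dict"} instead of crashing.
data_dict = read_temp("new_chunks_temp.json")
print(data_dict)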
80 changes: 20 additions & 60 deletions infer_tools/slicer.py
@@ -1,9 +1,6 @@
-import os.path
 import time
-from argparse import ArgumentParser
 
 import numpy as np
-import soundfile
 import torch
 import torchaudio
 from scipy.ndimage import maximum_filter1d, uniform_filter1d
@@ -107,59 +104,25 @@ def slice(self, audio):
             split_loc_l = split_win_l + np.argmin(abs_amp[split_win_l: split_win_l + self.win_sn])
             sil_tags.append((split_loc_l, samples.shape[0]))
         if len(sil_tags) == 0:
-            return [len(audio)]
+            return {0: {"slice": False, "split_time": (0, len(audio))}}
         else:
             chunks = []
+            # The first silence does not start at the beginning, so prepend the leading voiced segment
+            if sil_tags[0][0]:
+                chunks.append({"slice": False, "split_time": f"0,{sil_tags[0][0]}"})
             for i in range(0, len(sil_tags)):
-                chunks.append(int((sil_tags[i][0] + sil_tags[i][1]) / 2))
-            return chunks
-
-
-def main():
-    parser = ArgumentParser()
-    parser.add_argument('audio', type=str, help='The audio to be sliced')
-    parser.add_argument('--out_name', type=str, help='Output directory of the sliced audio clips')
-    parser.add_argument('--out', type=str, help='Output directory of the sliced audio clips')
-    parser.add_argument('--db_thresh', type=float, required=False, default=-40,
-                        help='The dB threshold for silence detection')
-    parser.add_argument('--min_len', type=int, required=False, default=5000,
-                        help='The minimum milliseconds required for each sliced audio clip')
-    parser.add_argument('--win_l', type=int, required=False, default=300,
-                        help='Size of the large sliding window, presented in milliseconds')
-    parser.add_argument('--win_s', type=int, required=False, default=20,
-                        help='Size of the small sliding window, presented in milliseconds')
-    parser.add_argument('--max_sil_kept', type=int, required=False, default=500,
-                        help='The maximum silence length kept around the sliced audio, presented in milliseconds')
-    args = parser.parse_args()
-    out = args.out
-    if out is None:
-        out = os.path.dirname(os.path.abspath(args.audio))
-    audio, sr = torchaudio.load(args.audio)
-    if len(audio.shape) == 2 and audio.shape[1] >= 2:
-        audio = torch.mean(audio, dim=0).unsqueeze(0)
-    audio = audio.cpu().numpy()[0]
-
-    slicer = Slicer(
-        sr=sr,
-        db_threshold=args.db_thresh,
-        min_length=args.min_len,
-        win_l=args.win_l,
-        win_s=args.win_s,
-        max_silence_kept=args.max_sil_kept
-    )
-    chunks = slicer.slice(audio)
-    if not os.path.exists(args.out):
-        os.makedirs(args.out)
-    start = 0
-    end_id = 0
-    for i, chunk in enumerate(chunks):
-        end = chunk
-        soundfile.write(os.path.join(out, f'%s-%s.wav' % (args.out_name, str(i).zfill(2))), audio[start:end], sr)
-        start = end
-        end_id = i + 1
-    if start != len(audio):
-        soundfile.write(os.path.join(out, f'%s-%s.wav' % (args.out_name, str(end_id).zfill(2))),
-                        audio[start:len(audio)], sr)
+                # Mark the voiced segments (skipped for the first silence)
+                if i:
+                    chunks.append({"slice": False, "split_time": f"{sil_tags[i - 1][1]},{sil_tags[i][0]}"})
+                # Mark every silent segment
+                chunks.append({"slice": True, "split_time": f"{sil_tags[i][0]},{sil_tags[i][1]}"})
+            # The last silence does not reach the end, so append the trailing voiced segment
+            if sil_tags[-1][1] != len(audio):
+                chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1]},{len(audio)}"})
+            chunk_dict = {}
+            for i in range(len(chunks)):
+                chunk_dict[str(i)] = chunks[i]
+            return chunk_dict
 
 
 def cut(audio_path, db_thresh=-30, min_len=5000, win_l=300, win_s=20, max_sil_kept=500):
@@ -181,18 +144,15 @@ def cut(audio_path, db_thresh=-30, min_len=5000, win_l=300, win_s=20, max_sil_ke
 
 
 def chunks2audio(audio_path, chunks):
+    chunks = dict(chunks)
     audio, sr = torchaudio.load(audio_path)
     if len(audio.shape) == 2 and audio.shape[1] >= 2:
         audio = torch.mean(audio, dim=0).unsqueeze(0)
     audio = audio.cpu().numpy()[0]
-    start = 0
     result = []
-    for i, chunk in enumerate(chunks):
-        end = chunk
-        result.append(audio[start:end])
-        start = end
-    if start != len(audio):
-        result.append(audio[start:len(audio)])
+    for k, v in chunks.items():
+        tag = v["split_time"].split(",")
+        result.append((v["slice"], audio[int(tag[0]):int(tag[1])]))
     return result, sr
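
Taken together, slice()/cut() now describe the audio as tagged spans instead of a flat list of split points, and chunks2audio() cuts directly on those spans. A minimal sketch of the new data shape (the sample indices are invented for illustration):

import numpy as np

# Hypothetical cut() result for a clip with one internal silence
chunks = {
    "0": {"slice": False, "split_time": "0,241664"},       # leading voiced span
    "1": {"slice": True, "split_time": "241664,263168"},   # silent span
    "2": {"slice": False, "split_time": "263168,529200"},  # trailing voiced span
}

audio = np.zeros(529200, dtype=np.float32)  # stand-in for the loaded waveform

# The same loop chunks2audio() now runs internally:
result = []
for k, v in chunks.items():
    start, end = (int(x) for x in v["split_time"].split(","))
    result.append((v["slice"], audio[start:end]))

for slice_tag, data in result:
    print("silence" if slice_tag else "voiced", len(data), "samples")

One caveat visible in the diff: the no-silence early return in slice() stores split_time as a tuple (0, len(audio)) rather than a "start,end" string, so the string-splitting loop above would need a separate branch for that case.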
