diff --git a/.ruff.toml b/.ruff.toml
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/.ruff.toml
@@ -0,0 +1 @@
+
diff --git a/.vscode/extensions.json b/.vscode/extensions.json
new file mode 100644
index 00000000..37c0a820
--- /dev/null
+++ b/.vscode/extensions.json
@@ -0,0 +1,6 @@
+{
+    "recommendations": [
+        "charliermarsh.ruff",
+        "ms-python.python"
+    ]
+}
\ No newline at end of file
diff --git a/cluster/__init__.py b/cluster/__init__.py
index f1b9bde0..68758d0a 100644
--- a/cluster/__init__.py
+++ b/cluster/__init__.py
@@ -1,4 +1,3 @@
-import numpy as np
 import torch
 from sklearn.cluster import KMeans
diff --git a/cluster/kmeans.py b/cluster/kmeans.py
index 8e133ab1..0b78ed6c 100644
--- a/cluster/kmeans.py
+++ b/cluster/kmeans.py
@@ -1,4 +1,3 @@
-import math,pdb
 import torch,pynvml
 from torch.nn.functional import normalize
 from time import time
diff --git a/cluster/train_cluster.py b/cluster/train_cluster.py
index 86445663..4858192c 100644
--- a/cluster/train_cluster.py
+++ b/cluster/train_cluster.py
@@ -1,6 +1,5 @@
-import time,pdb
+import time
 import tqdm
-from time import time as ttime
 import os
 from pathlib import Path
 import logging
@@ -12,8 +11,7 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-from time import time as ttime
-import pynvml,torch
+import torch
 def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False,use_gpu=False):  # gpu_minibatch is really poor; the library supports it, but it is not worth considering
     logger.info(f"Loading features from {in_dir}")
@@ -29,7 +27,7 @@ def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False,use_gpu=
     features = features.astype(np.float32)
     logger.info(f"Clustering features of shape: {features.shape}")
     t = time.time()
-    if(use_gpu==False):
+    if(use_gpu is False):
         if use_minibatch:
             kmeans = MiniBatchKMeans(n_clusters=n_clusters,verbose=verbose, batch_size=4096, max_iter=80).fit(features)
         else:
@@ -37,14 +35,14 @@
     else:
         kmeans = KMeansGPU(n_clusters=n_clusters, mode='euclidean', verbose=2 if verbose else 0,max_iter=500,tol=1e-2)#
         features=torch.from_numpy(features)#.to(device)
-        labels = kmeans.fit_predict(features)#
+        kmeans.fit_predict(features)#
     print(time.time()-t, "s")
     x = {
-        "n_features_in_": kmeans.n_features_in_ if use_gpu==False else features.shape[1],
-        "_n_threads": kmeans._n_threads if use_gpu==False else 4,
-        "cluster_centers_": kmeans.cluster_centers_ if use_gpu==False else kmeans.centroids.cpu().numpy(),
+        "n_features_in_": kmeans.n_features_in_ if use_gpu is False else features.shape[1],
+        "_n_threads": kmeans._n_threads if use_gpu is False else 4,
+        "cluster_centers_": kmeans.cluster_centers_ if use_gpu is False else kmeans.centroids.cpu().numpy(),
     }
     print("end")
diff --git a/data_utils.py b/data_utils.py
index 2539519f..0b3c3857 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -1,13 +1,11 @@
-import time
 import os
 import random
 import numpy as np
 import torch
 import torch.utils.data
-import modules.commons as commons
 import utils
-from modules.mel_processing import spectrogram_torch, spec_to_mel_torch, spectrogram_torch
+from modules.mel_processing import spectrogram_torch, spectrogram_torch
 from utils import load_wav_to_torch, load_filepaths_and_text
 # import h5py
@@ -87,7 +85,7 @@ def get_audio(self, filename):
         assert abs(audio_norm.shape[1]-lmin * self.hop_length) < 3 * self.hop_length
         spec, c, f0, uv = spec[:, :lmin], c[:, :lmin], f0[:lmin], uv[:lmin]
         audio_norm = audio_norm[:, :lmin * self.hop_length]
-        if volume!= None:
+        if volume is not None:
            volume = volume[:lmin]
        return c, f0, spec, audio_norm, spk, uv, volume
@@ -96,7 +94,7 @@ def random_slice(self, c, f0, spec, audio_norm, spk, uv, volume):
        # print("skip too short audio:", filename)
        # return None
-        if random.choice([True, False]) and self.vol_aug and volume!=None:
+        if random.choice([True, False]) and self.vol_aug and volume is not None:
            max_amp = float(torch.max(torch.abs(audio_norm))) + 1e-5
            max_shift = min(1, np.log10(1/max_amp))
            log10_vol_shift = random.uniform(-1, max_shift)
@@ -114,7 +112,7 @@ def random_slice(self, c, f0, spec, audio_norm, spk, uv, volume):
            end = start + 790
        spec, c, f0, uv = spec[:, start:end], c[:, start:end], f0[start:end], uv[start:end]
        audio_norm = audio_norm[:, start * self.hop_length : end * self.hop_length]
-        if volume !=None:
+        if volume is not None:
            volume = volume[start:end]
        return c, f0, spec, audio_norm, spk, uv,volume
@@ -178,7 +176,7 @@ def __call__(self, batch):
            uv = row[5]
            uv_padded[i, :uv.size(0)] = uv
            volume = row[6]
-            if volume != None:
+            if volume is not None:
                volume_padded[i, :volume.size(0)] = volume
            else :
                volume_padded = None
diff --git a/diffusion/data_loaders.py b/diffusion/data_loaders.py
index 87d7865a..ea802efb 100644
--- a/diffusion/data_loaders.py
+++ b/diffusion/data_loaders.py
@@ -1,6 +1,5 @@
 import os
 import random
-import re
 import numpy as np
 import librosa
 import torch
@@ -130,7 +129,7 @@ def __init__(
        with open(filelists,"r") as f:
            self.paths = f.read().splitlines()
        for name_ext in tqdm(self.paths, total=len(self.paths)):
-            name = os.path.splitext(name_ext)[0]
+            os.path.splitext(name_ext)[0]
            path_audio = name_ext
            duration = librosa.get_duration(filename = path_audio, sr = self.sample_rate)
diff --git a/diffusion/diffusion.py b/diffusion/diffusion.py
index e7d7b726..b95d305b 100644
--- a/diffusion/diffusion.py
+++ b/diffusion/diffusion.py
@@ -2,7 +2,6 @@
 from functools import partial
 from inspect import isfunction
 import torch.nn.functional as F
-import librosa.sequence
 import numpy as np
 import torch
 from torch import nn
@@ -26,8 +25,10 @@ def extract(a, t, x_shape):
 def noise_like(shape, device, repeat=False):
-    repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
-    noise = lambda: torch.randn(shape, device=device)
+    def repeat_noise():
+        return torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
+    def noise():
+        return torch.randn(shape, device=device)
     return repeat_noise() if repeat else noise()
diff --git a/diffusion/diffusion_onnx.py b/diffusion/diffusion_onnx.py
index 1c1e8032..1d60edf5 100644
--- a/diffusion/diffusion_onnx.py
+++ b/diffusion/diffusion_onnx.py
@@ -2,7 +2,6 @@
 from functools import partial
 from inspect import isfunction
 import torch.nn.functional as F
-import librosa.sequence
 import numpy as np
 from torch.nn import Conv1d
 from torch.nn import Mish
@@ -27,8 +26,10 @@ def extract(a, t):
 def noise_like(shape, device, repeat=False):
-    repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
-    noise = lambda: torch.randn(shape, device=device)
+    def repeat_noise():
+        return torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
+    def noise():
+        return torch.randn(shape, device=device)
     return repeat_noise() if repeat else noise()
@@ -577,7 +578,7 @@ def forward(self, condition=None, init_noise=None, pndms=None, k_step=None):
                noise_list = torch.zeros((0, 1, 1, self.mel_bins, n_frames), device=device)
                ot = step_range[0]
-                ot_1 = torch.full((1,), ot, device=device, dtype=torch.long)
+                torch.full((1,), ot, device=device, dtype=torch.long)
            for t in step_range:
                t_1 = torch.full((1,), t, device=device, dtype=torch.long)
diff --git a/diffusion/dpm_solver_pytorch.py b/diffusion/dpm_solver_pytorch.py
index 23e4d3c0..037da373 100644
--- a/diffusion/dpm_solver_pytorch.py
+++ b/diffusion/dpm_solver_pytorch.py
@@ -1,6 +1,4 @@
 import torch
-import torch.nn.functional as F
-import math
 class NoiseScheduleVP:
@@ -559,7 +557,7 @@ def dpm_solver_first_update(self, x, s, t, model_s=None, return_intermediate=Fal
            x_t: A pytorch tensor. The approximated solution at time `t`.
        """
        ns = self.noise_schedule
-        dims = x.dim()
+        x.dim()
        lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
        h = lambda_t - lambda_s
        log_alpha_s, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(t)
@@ -984,12 +982,16 @@ def dpm_solver_adaptive(self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol
        nfe = 0
        if order == 2:
            r1 = 0.5
-            lower_update = lambda x, s, t: self.dpm_solver_first_update(x, s, t, return_intermediate=True)
-            higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, solver_type=solver_type, **kwargs)
+            def lower_update(x, s, t):
+                return self.dpm_solver_first_update(x, s, t, return_intermediate=True)
+            def higher_update(x, s, t, **kwargs):
+                return self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, solver_type=solver_type, **kwargs)
        elif order == 3:
            r1, r2 = 1. / 3., 2. / 3.
-            lower_update = lambda x, s, t: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, return_intermediate=True, solver_type=solver_type)
-            higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs)
+            def lower_update(x, s, t):
+                return self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, return_intermediate=True, solver_type=solver_type)
+            def higher_update(x, s, t, **kwargs):
+                return self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs)
        else:
            raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order))
        while torch.abs((s - t_0)).mean() > t_err:
@@ -997,7 +999,8 @@
            x_lower, lower_noise_kwargs = lower_update(x, s, t)
            x_higher = higher_update(x, s, t, **lower_noise_kwargs)
            delta = torch.max(torch.ones_like(x).to(x) * atol, rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev)))
-            norm_fn = lambda v: torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))
+            def norm_fn(v):
+                return torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))
            E = norm_fn((x_higher - x_lower) / delta).max()
            if torch.all(E <= 1.):
                x = x_higher
diff --git a/diffusion/infer_gt_mel.py b/diffusion/infer_gt_mel.py
index 033b821a..da5f36c4 100644
--- a/diffusion/infer_gt_mel.py
+++ b/diffusion/infer_gt_mel.py
@@ -1,4 +1,3 @@
-import numpy as np
 import torch
 import torch.nn.functional as F
 from diffusion.unit2mel import load_model_vocoder
diff --git a/diffusion/logger/saver.py b/diffusion/logger/saver.py
index 5fdeaac6..4233613b 100644
--- a/diffusion/logger/saver.py
+++ b/diffusion/logger/saver.py
@@ -3,13 +3,11 @@
 '''
 import os
-import json
 import time
 import yaml
 import datetime
 import torch
 import matplotlib.pyplot as plt
-from . import utils
 from torch.utils.tensorboard import SummaryWriter
 class Saver(object):
diff --git a/diffusion/logger/utils.py b/diffusion/logger/utils.py
index 485681ce..1420076e 100644
--- a/diffusion/logger/utils.py
+++ b/diffusion/logger/utils.py
@@ -1,7 +1,6 @@
 import os
 import yaml
 import json
-import pickle
 import torch
 def traverse_dir(
@@ -121,6 +120,6 @@ def load_model(
    ckpt = torch.load(path_pt, map_location=torch.device(device))
    global_step = ckpt['global_step']
    model.load_state_dict(ckpt['model'], strict=False)
-    if ckpt.get('optimizer') != None:
+    if ckpt.get("optimizer") is not None:
        optimizer.load_state_dict(ckpt['optimizer'])
    return global_step, model, optimizer
diff --git a/diffusion/onnx_export.py b/diffusion/onnx_export.py
index a63ab065..3663cfd4 100644
--- a/diffusion/onnx_export.py
+++ b/diffusion/onnx_export.py
@@ -4,9 +4,7 @@
 import torch
 import torch.nn as nn
 import numpy as np
-from wavenet import WaveNet
 import torch.nn.functional as F
-import diffusion
 class DotDict(dict):
    def __getattr__(*args):
@@ -147,8 +145,8 @@ def OnnxExport(self, project_name=None, init_noise=None, export_encoder=True, ex
            spks.update({i:1.0/float(self.n_spk)})
        spk_mix = torch.tensor(spk_mix)
        spk_mix = spk_mix.repeat(n_frames, 1)
-        orgouttt = self.init_spkembed(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
-        outtt = self.forward(hubert, mel2ph, f0, volume, spk_mix)
+        self.init_spkembed(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
+        self.forward(hubert, mel2ph, f0, volume, spk_mix)
        if export_encoder:
            torch.onnx.export(
                self,
@@ -182,8 +180,8 @@ def ExportOnnx(self, project_name=None):
            spk_mix.append(1.0/float(self.n_spk))
            spks.update({i:1.0/float(self.n_spk)})
        spk_mix = torch.tensor(spk_mix)
-        orgouttt = self.orgforward(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
-        outtt = self.forward(hubert, mel2ph, f0, volume, spk_mix)
+        self.orgforward(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
+        self.forward(hubert, mel2ph, f0, volume, spk_mix)
        torch.onnx.export(
            self,
diff --git a/diffusion/solver.py b/diffusion/solver.py
index 2a8bf380..8b389003 100644
--- a/diffusion/solver.py
+++ b/diffusion/solver.py
@@ -1,4 +1,3 @@
-import os
 import time
 import numpy as np
 import torch
diff --git a/diffusion/uni_pc.py b/diffusion/uni_pc.py
index 4226570c..c920f92e 100644
--- a/diffusion/uni_pc.py
+++ b/diffusion/uni_pc.py
@@ -1,5 +1,4 @@
 import torch
-import torch.nn.functional as F
 import math
@@ -109,7 +108,8 @@ def marginal_log_mean_coeff(self, t):
        elif self.schedule == 'linear':
            return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
        elif self.schedule == 'cosine':
-            log_alpha_fn = lambda s: torch.log(torch.cos((s + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.))
+            def log_alpha_fn(s):
+                return torch.log(torch.cos((s + self.cosine_s) / (1.0 + self.cosine_s) * math.pi / 2.0))
            log_alpha_t = log_alpha_fn(t) - self.cosine_log_alpha_0
            return log_alpha_t
@@ -147,7 +147,8 @@ def inverse_lambda(self, lamb):
            return t.reshape((-1,))
        else:
            log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
-            t_fn = lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s
+            def t_fn(log_alpha_t):
+                return torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2.0 * (1.0 + self.cosine_s) / math.pi - self.cosine_s
            t = t_fn(log_alpha)
            return t
diff --git a/diffusion/unit2mel.py b/diffusion/unit2mel.py
index 53ed7521..0f40c0e8 100644
--- a/diffusion/unit2mel.py
+++ b/diffusion/unit2mel.py
@@ -116,13 +116,13 @@ def init_spkmix(self, n_spk):
        hubert_hidden_size = self.input_channel
        n_frames = 10
        hubert = torch.randn((1, n_frames, hubert_hidden_size))
-        mel2ph = torch.arange(end=n_frames).unsqueeze(0).long()
+        torch.arange(end=n_frames).unsqueeze(0).long()
        f0 = torch.randn((1, n_frames))
        volume = torch.randn((1, n_frames))
        spks = {}
        for i in range(n_spk):
            spks.update({i:1.0/float(self.n_spk)})
-        orgouttt = self.init_spkembed(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
+        self.init_spkembed(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
    def forward(self, units, f0, volume, spk_id = None, spk_mix_dict = None, aug_shift = None, gt_spec=None, infer=True, infer_speedup=10, method='dpm-solver', k_step=300, use_tqdm=True):
diff --git a/inference/infer_tool.py b/inference/infer_tool.py
index 442342c9..5dddb417 100644
--- a/inference/infer_tool.py
+++ b/inference/infer_tool.py
@@ -21,7 +21,6 @@
 import pickle
 from diffusion.unit2mel import load_model_vocoder
-import yaml
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
@@ -153,7 +152,7 @@ def __init__(self, net_g_path, config_path,
            self.hop_size = self.diffusion_args.data.block_size
            self.spk2id = self.diffusion_args.spk
            self.speech_encoder = self.diffusion_args.data.encoder
-            self.unit_interpolate_mode = self.diffusion_args.data.unit_interpolate_mode if self.diffusion_args.data.unit_interpolate_mode!=None else 'left'
+            self.unit_interpolate_mode = self.diffusion_args.data.unit_interpolate_mode if self.diffusion_args.data.unit_interpolate_mode is not None else 'left'
            if spk_mix_enable:
                self.diffusion_model.init_spkmix(len(self.spk2id))
        else:
@@ -290,7 +289,7 @@ def infer(self, speaker, tran, raw_path,
        audio = torch.FloatTensor(wav).to(self.dev)
        audio_mel = None
        if self.only_diffusion or self.shallow_diffusion:
-            vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol==None else vol[:,:,None]
+            vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol is None else vol[:,:,None]
            if self.shallow_diffusion and second_encoding:
                audio16k = librosa.resample(audio.detach().cpu().numpy(), orig_sr=self.target_sample, target_sr=16000)
                audio16k = torch.from_numpy(audio16k).to(self.dev)
diff --git a/inference/infer_tool_grad.py b/inference/infer_tool_grad.py
index 561c22c5..0b3e72cc 100644
--- a/inference/infer_tool_grad.py
+++ b/inference/infer_tool_grad.py
@@ -1,12 +1,7 @@
-import hashlib
-import json
 import logging
 import os
-import time
-from pathlib import Path
 import io
 import librosa
-import maad
 import numpy as np
 from inference import slicer
 import parselmouth
@@ -14,7 +9,6 @@
 import torch
 import torchaudio
-from hubert import hubert_model
 import utils
 from models import SynthesizerTrn
 logging.getLogger('numba').setLevel(logging.WARNING)
@@ -93,7 +87,7 @@ def __init__(self):
    def set_device(self, device):
        self.device = torch.device(device)
        self.hubert_soft.to(self.device)
-        if self.SVCVITS != None:
+        if self.SVCVITS is not None:
            self.SVCVITS.to(self.device)
    def loadCheckpoint(self, path):
diff --git a/inference_main.py b/inference_main.py
index 17803427..37051723 100644
--- a/inference_main.py
+++ b/inference_main.py
@@ -1,14 +1,7 @@
-import io
 import logging
-import time
-from pathlib import Path
 from spkmix import spk_mix_map
-import librosa
-import matplotlib.pyplot as plt
-import numpy as np
 import soundfile
 from inference import infer_tool
-from inference import slicer
 from inference.infer_tool import Svc
 logging.getLogger('numba').setLevel(logging.WARNING)
diff --git a/models.py b/models.py
index 1f67b29c..a125f7a4 100644
--- a/models.py
+++ b/models.py
@@ -1,5 +1,3 @@
-import copy
-import math
 import torch
 from torch import nn
 from torch.nn import functional as F
@@ -8,11 +6,11 @@
 import modules.commons as commons
 import modules.modules as modules
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
-from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from torch.nn import Conv1d, Conv2d
+from torch.nn.utils import weight_norm, spectral_norm
 import utils
-from modules.commons import init_weights, get_padding
+from modules.commons import get_padding
 from utils import f0_to_coarse
 class ResidualCouplingBlock(nn.Module):
@@ -125,7 +123,7 @@ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
@@ -160,7 +158,7 @@ def forward(self, x):
 class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv1d(1, 16, 15, 1, padding=7)),
            norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
@@ -407,7 +405,7 @@ def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None, vo
        g = self.emb_g(g).transpose(1,2)
        # vol proj
-        vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0
+        vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol is not None and self.vol_embedding else 0
        # ssl prenet
        x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
@@ -452,7 +450,7 @@ def infer(self, c, f0, uv, g=None, noice_scale=0.35, seed=52468, predict_f0=Fals
        x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
        # vol proj
-        vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0
+        vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol is not None and self.vol_embedding else 0
        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2) + vol
diff --git a/modules/F0Predictor/crepe.py b/modules/F0Predictor/crepe.py
index fa56e719..4b004190 100644
--- a/modules/F0Predictor/crepe.py
+++ b/modules/F0Predictor/crepe.py
@@ -1,14 +1,13 @@
 from typing import Optional,Union
 try:
     from typing import Literal
-except Exception as e:
+except Exception:
     from typing_extensions import Literal
 import numpy as np
 import torch
 import torchcrepe
 from torch import nn
 from torch.nn import functional as F
-import scipy
 #from:https://github.com/fishaudio/fish-diffusion
@@ -334,7 +333,7 @@ def __call__(self, x, sampling_rate=44100, pad_to=None):
        f0 = torch.where(torch.isnan(f0), torch.full_like(f0, 0), f0)[0]
        if torch.all(f0 == 0):
-            rtn = f0.cpu().numpy() if pad_to==None else np.zeros(pad_to)
+            rtn = f0.cpu().numpy() if pad_to is None else np.zeros(pad_to)
            return rtn,rtn
        return self.post_process(x, sampling_rate, f0, pad_to)
diff --git a/modules/attentions.py b/modules/attentions.py
index f9c11ca4..bb591e95 100644
--- a/modules/attentions.py
+++ b/modules/attentions.py
@@ -1,12 +1,9 @@
-import copy
 import math
-import numpy as np
 import torch
 from torch import nn
 from torch.nn import functional as F
 import modules.commons as commons
-import modules.modules as modules
 from modules.modules import LayerNorm
@@ -243,7 +240,7 @@ def _matmul_with_relative_keys(self, x, y):
        return ret
    def _get_relative_embeddings(self, relative_embeddings, length):
-        max_relative_position = 2 * self.window_size + 1
+        2 * self.window_size + 1
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
diff --git a/modules/commons.py b/modules/commons.py
index 07488800..abb20ac9 100644
--- a/modules/commons.py
+++ b/modules/commons.py
@@ -1,7 +1,5 @@
 import math
-import numpy as np
 import torch
-from torch import nn
 from torch.nn import functional as F
 def slice_pitch_segments(x, ids_str, segment_size=4):
@@ -157,7 +155,6 @@ def generate_path(duration, mask):
    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]
    """
-    device = duration.device
    b, _, t_y, t_x = mask.shape
    cum_duration = torch.cumsum(duration, -1)
diff --git a/modules/losses.py b/modules/losses.py
index cd21799e..4a489cdb 100644
--- a/modules/losses.py
+++ b/modules/losses.py
@@ -1,7 +1,5 @@
 import torch
-from torch.nn import functional as F
-import modules.commons as commons
 def feature_loss(fmap_r, fmap_g):
diff --git a/modules/mel_processing.py b/modules/mel_processing.py
index a9936a20..0795b053 100644
--- a/modules/mel_processing.py
+++ b/modules/mel_processing.py
@@ -1,16 +1,5 @@
-import math
-import os
-import random
 import torch
-from torch import nn
-import torch.nn.functional as F
 import torch.utils.data
-import numpy as np
-import librosa
-import librosa.util as librosa_util
-from librosa.util import normalize, pad_center, tiny
-from scipy.signal import get_window
-from scipy.io.wavfile import read
 from librosa.filters import mel as librosa_mel_fn
 MAX_WAV_VALUE = 32768.0
diff --git a/modules/modules.py b/modules/modules.py
index 54290fd2..3f11cc02 100644
--- a/modules/modules.py
+++ b/modules/modules.py
@@ -1,12 +1,8 @@
-import copy
-import math
-import numpy as np
-import scipy
 import torch
 from torch import nn
 from torch.nn import functional as F
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn import Conv1d
 from torch.nn.utils import weight_norm, remove_weight_norm
 import modules.commons as commons
diff --git a/onnx_export_speaker_mix.py b/onnx_export_speaker_mix.py
index cb807359..742ca39b 100644
--- a/onnx_export_speaker_mix.py
+++ b/onnx_export_speaker_mix.py
@@ -127,7 +127,7 @@ def main():
        "Characters": spklist
    }
-    MoeVSConfJson = json.dumps(MoeVSConf)
+    json.dumps(MoeVSConf)
    with open(f"checkpoints/{path}.json", 'w') as MoeVsConfFile:
        json.dump(MoeVSConf, MoeVsConfFile, indent = 4)
diff --git a/onnxexport/model_onnx.py b/onnxexport/model_onnx.py
index e28bae95..09e69aee 100644
--- a/onnxexport/model_onnx.py
+++ b/onnxexport/model_onnx.py
@@ -6,11 +6,11 @@
 import modules.commons as commons
 import modules.modules as modules
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
-from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from torch.nn import Conv1d, Conv2d
+from torch.nn.utils import weight_norm, spectral_norm
 import utils
-from modules.commons import init_weights, get_padding
+from modules.commons import get_padding
 from vdecoder.hifigan.models import Generator
 from utils import f0_to_coarse
@@ -124,7 +124,7 @@ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
@@ -159,7 +159,7 @@ def forward(self, x):
 class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv1d(1, 16, 15, 1, padding=7)),
            norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
diff --git a/onnxexport/model_onnx_speaker_mix.py b/onnxexport/model_onnx_speaker_mix.py
index f8193637..b1889986 100644
--- a/onnxexport/model_onnx_speaker_mix.py
+++ b/onnxexport/model_onnx_speaker_mix.py
@@ -1,18 +1,11 @@
-import copy
-import math
 import torch
 from torch import nn
 from torch.nn import functional as F
 import modules.attentions as attentions
-import modules.commons as commons
 import modules.modules as modules
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
-from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
-import utils
-from modules.commons import init_weights, get_padding
 from utils import f0_to_coarse
 class ResidualCouplingBlock(nn.Module):
@@ -259,7 +252,7 @@ def forward(self, c, f0, mel2ph, uv, noise=None, g=None, vol = None):
        x_mask = torch.unsqueeze(torch.ones_like(f0), 1).to(c.dtype)
        # vol proj
-        vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0
+        vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol is not None and self.vol_embedding else 0
        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2) + vol
diff --git a/resample.py b/resample.py
index 275265e5..a99c509c 100644
--- a/resample.py
+++ b/resample.py
@@ -3,8 +3,8 @@
 import librosa
 import numpy as np
 import concurrent.futures
-from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
-from multiprocessing import Pool, cpu_count
+from concurrent.futures import ProcessPoolExecutor
+from multiprocessing import cpu_count
 from scipy.io import wavfile
 from tqdm import tqdm
diff --git a/train.py b/train.py
index dba77bbb..0139ffae 100644
--- a/train.py
+++ b/train.py
@@ -6,12 +6,7 @@
 logging.getLogger('numba').setLevel(logging.WARNING)
 import os
-import json
-import argparse
-import itertools
-import math
 import torch
-from torch import nn, optim
 from torch.nn import functional as F
 from torch.utils.data import DataLoader
 from torch.utils.tensorboard import SummaryWriter
@@ -287,7 +282,7 @@ def evaluate(hps, generator, eval_loader, writer_eval):
        c = c[:1].cuda(0)
        f0 = f0[:1].cuda(0)
        uv= uv[:1].cuda(0)
-        if volume!=None:
+        if volume is not None:
            volume = volume[:1].cuda(0)
        mel = spec_to_mel_torch(
            spec,
@@ -314,7 +309,7 @@ def evaluate(hps, generator, eval_loader, writer_eval):
            f"gt/audio_{batch_idx}": y[0]
        })
        image_dict.update({
-            f"gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()),
+            "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()),
            "gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())
        })
    utils.summarize(
diff --git a/train_diff.py b/train_diff.py
index d5f99e66..8bfd9070 100644
--- a/train_diff.py
+++ b/train_diff.py
@@ -1,4 +1,3 @@
-import os
 import argparse
 import torch
 from torch.optim import lr_scheduler
diff --git a/utils.py b/utils.py
index 45f86f86..691cd0bb 100644
--- a/utils.py
+++ b/utils.py
@@ -6,17 +6,12 @@
 import logging
 import json
 import subprocess
-import warnings
-import random
-import functools
 import librosa
 import numpy as np
 from scipy.io.wavfile import read
 import torch
 from torch.nn import functional as F
-from modules.commons import sequence_mask
 import faiss
-import tqdm
 MATPLOTLIB_FLAG = False
@@ -201,15 +196,20 @@ def clean_checkpoints(path_to_models='logs/44k/', n_ckpts_to_keep=2, sort_by_tim
    False -> lexicographically delete ckpts
    """
    ckpts_files = [f for f in os.listdir(path_to_models) if os.path.isfile(os.path.join(path_to_models, f))]
-    name_key = (lambda _f: int(re.compile('._(\d+)\.pth').match(_f).group(1)))
-    time_key = (lambda _f: os.path.getmtime(os.path.join(path_to_models, _f)))
+    def name_key(_f):
+        return int(re.compile("._(\\d+)\\.pth").match(_f).group(1))
+    def time_key(_f):
+        return os.path.getmtime(os.path.join(path_to_models, _f))
    sort_key = time_key if sort_by_time else name_key
-    x_sorted = lambda _x: sorted([f for f in ckpts_files if f.startswith(_x) and not f.endswith('_0.pth')], key=sort_key)
+    def x_sorted(_x):
+        return sorted([f for f in ckpts_files if f.startswith(_x) and not f.endswith("_0.pth")], key=sort_key)
    to_del = [os.path.join(path_to_models, fn) for fn in (x_sorted('G')[:-n_ckpts_to_keep] + x_sorted('D')[:-n_ckpts_to_keep])]
-    del_info = lambda fn: logger.info(f".. Free up space by deleting ckpt {fn}")
-    del_routine = lambda x: [os.remove(x), del_info(x)]
-    rs = [del_routine(fn) for fn in to_del]
+    def del_info(fn):
+        return logger.info(f".. Free up space by deleting ckpt {fn}")
+    def del_routine(x):
+        return [os.remove(x), del_info(x)]
+    [del_routine(fn) for fn in to_del]
 def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
    for k, v in scalars.items():
diff --git a/vdecoder/hifigan/models.py b/vdecoder/hifigan/models.py
index 2c868f3d..c94a367a 100644
--- a/vdecoder/hifigan/models.py
+++ b/vdecoder/hifigan/models.py
@@ -199,7 +199,7 @@ def forward(self, f0):
            output uv: tensor(batchsize=1, length, 1)
        """
        with torch.no_grad():
-            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
+            torch.zeros(f0.shape[0], f0.shape[1], self.dim,
                          device=f0.device)
            # fundamental component
            fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
@@ -353,7 +353,7 @@ class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
@@ -412,7 +412,7 @@ def forward(self, y, y_hat):
 class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv1d(1, 128, 15, 1, padding=7)),
            norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
diff --git a/vdecoder/hifigan/nvSTFT.py b/vdecoder/hifigan/nvSTFT.py
index 88597d62..87d15119 100644
--- a/vdecoder/hifigan/nvSTFT.py
+++ b/vdecoder/hifigan/nvSTFT.py
@@ -1,14 +1,10 @@
-import math
 import os
 os.environ["LRU_CACHE_CAPACITY"] = "3"
-import random
 import torch
 import torch.utils.data
 import numpy as np
 import librosa
-from librosa.util import normalize
 from librosa.filters import mel as librosa_mel_fn
-from scipy.io.wavfile import read
 import soundfile as sf
 def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
diff --git a/vdecoder/hifigan/utils.py b/vdecoder/hifigan/utils.py
index 9c93c996..89ccf5f5 100644
--- a/vdecoder/hifigan/utils.py
+++ b/vdecoder/hifigan/utils.py
@@ -1,6 +1,5 @@
 import glob
 import os
-import matplotlib
 import torch
 from torch.nn.utils import weight_norm
 # matplotlib.use("Agg")
diff --git a/vdecoder/hifiganwithsnake/models.py b/vdecoder/hifiganwithsnake/models.py
index 4d9ae7a0..1d3a0c00 100644
--- a/vdecoder/hifiganwithsnake/models.py
+++ b/vdecoder/hifiganwithsnake/models.py
@@ -211,7 +211,7 @@ def forward(self, f0):
            output uv: tensor(batchsize=1, length, 1)
        """
        with torch.no_grad():
-            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
+            torch.zeros(f0.shape[0], f0.shape[1], self.dim,
                          device=f0.device)
            # fundamental component
            fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
@@ -370,7 +370,7 @@ class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
@@ -429,7 +429,7 @@ def forward(self, y, y_hat):
 class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv1d(1, 128, 15, 1, padding=7)),
            norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
diff --git a/vdecoder/hifiganwithsnake/nvSTFT.py b/vdecoder/hifiganwithsnake/nvSTFT.py
index 88597d62..87d15119 100644
--- a/vdecoder/hifiganwithsnake/nvSTFT.py
+++ b/vdecoder/hifiganwithsnake/nvSTFT.py
@@ -1,14 +1,10 @@
-import math
 import os
 os.environ["LRU_CACHE_CAPACITY"] = "3"
-import random
 import torch
 import torch.utils.data
 import numpy as np
 import librosa
-from librosa.util import normalize
 from librosa.filters import mel as librosa_mel_fn
-from scipy.io.wavfile import read
 import soundfile as sf
 def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
diff --git a/vdecoder/hifiganwithsnake/utils.py b/vdecoder/hifiganwithsnake/utils.py
index 9c93c996..89ccf5f5 100644
--- a/vdecoder/hifiganwithsnake/utils.py
+++ b/vdecoder/hifiganwithsnake/utils.py
@@ -1,6 +1,5 @@
 import glob
 import os
-import matplotlib
 import torch
 from torch.nn.utils import weight_norm
 # matplotlib.use("Agg")
diff --git a/vdecoder/nsf_hifigan/models.py b/vdecoder/nsf_hifigan/models.py
index c2c889ec..4fa33a1e 100644
--- a/vdecoder/nsf_hifigan/models.py
+++ b/vdecoder/nsf_hifigan/models.py
@@ -289,7 +289,7 @@ class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
@@ -348,7 +348,7 @@ def forward(self, y, y_hat):
 class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv1d(1, 128, 15, 1, padding=7)),
            norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
diff --git a/vdecoder/nsf_hifigan/nvSTFT.py b/vdecoder/nsf_hifigan/nvSTFT.py
index 62bd5a00..369c7be8 100644
--- a/vdecoder/nsf_hifigan/nvSTFT.py
+++ b/vdecoder/nsf_hifigan/nvSTFT.py
@@ -1,14 +1,10 @@
-import math
 import os
 os.environ["LRU_CACHE_CAPACITY"] = "3"
-import random
 import torch
 import torch.utils.data
 import numpy as np
 import librosa
-from librosa.util import normalize
 from librosa.filters import mel as librosa_mel_fn
-from scipy.io.wavfile import read
 import soundfile as sf
 import torch.nn.functional as F
diff --git a/vencoder/dphubert/components.py b/vencoder/dphubert/components.py
index 0cc82a35..1f8ae273 100644
--- a/vencoder/dphubert/components.py
+++ b/vencoder/dphubert/components.py
@@ -11,7 +11,7 @@
 import torch
 from torch import nn, Tensor
-from torch.nn import Module, Parameter
+from torch.nn import Module
 from .hardconcrete import HardConcrete
 from .pruning_utils import (
diff --git a/vencoder/wavlm/WavLM.py b/vencoder/wavlm/WavLM.py
index 777befb7..656e5041 100644
--- a/vencoder/wavlm/WavLM.py
+++ b/vencoder/wavlm/WavLM.py
@@ -402,9 +402,7 @@ def make_conv():
            nn.init.kaiming_normal_(conv.weight)
            return conv
-        assert (
-            is_layer_norm and is_group_norm
-        ) == False, "layer norm and group norm are exclusive"
+        assert (is_layer_norm and is_group_norm) is False, "layer norm and group norm are exclusive"
        if is_layer_norm:
            return nn.Sequential(
diff --git a/vencoder/whisper/audio.py b/vencoder/whisper/audio.py
index 3bdb70ba..7b3b796c 100644
--- a/vencoder/whisper/audio.py
+++ b/vencoder/whisper/audio.py
@@ -1,4 +1,3 @@
-import os
 from functools import lru_cache
 from typing import Union
diff --git a/vencoder/whisper/decoding.py b/vencoder/whisper/decoding.py
index 603546d4..133c2e77 100644
--- a/vencoder/whisper/decoding.py
+++ b/vencoder/whisper/decoding.py
@@ -32,7 +32,7 @@ def detect_language(model: "Whisper", mel: Tensor, tokenizer: Tokenizer = None)
    if tokenizer is None:
        tokenizer = get_tokenizer(model.is_multilingual)
    if tokenizer.language is None or tokenizer.language_token not in tokenizer.sot_sequence:
-        raise ValueError(f"This model doesn't have language tokens so it can't perform lang id")
+        raise ValueError("This model doesn't have language tokens so it can't perform lang id")
    single = mel.ndim == 2
    if single:
diff --git a/vencoder/whisper/tokenizer.py b/vencoder/whisper/tokenizer.py
index a27cb359..b15645dc 100644
--- a/vencoder/whisper/tokenizer.py
+++ b/vencoder/whisper/tokenizer.py
@@ -196,7 +196,7 @@ def timestamp_begin(self) -> int:
    def language_token(self) -> int:
        """Returns the token id corresponding to the value of the `language` field"""
        if self.language is None:
-            raise ValueError(f"This tokenizer does not have language token configured")
+            raise ValueError("This tokenizer does not have language token configured")
        additional_tokens = dict(
            zip(
diff --git a/webUI.py b/webUI.py
index d848e179..b6a4f016 100644
--- a/webUI.py
+++ b/webUI.py
@@ -1,4 +1,3 @@
-import io
 import os
 # os.system("wget -P cvec/ https://huggingface.co/spaces/innnky/nanami/resolve/main/checkpoint_best_legacy_500.pt")
@@ -13,8 +12,6 @@
 import json
 import subprocess
-import edge_tts
-import asyncio
 from scipy.io import wavfile
 import librosa
 import torch
@@ -42,7 +39,7 @@ def upload_mix_append_file(files,sfiles):
    try:
-        if(sfiles == None):
+        if(sfiles is None):
            file_paths = [file.name for file in files]
        else:
            file_paths = [file.name for file in chain(files,sfiles)]
@@ -68,7 +65,7 @@ def mix_submit_click(js,mode):
 def updata_mix_info(files):
    try:
-        if files == None : return mix_model_output1.update(value="")
+        if files is None : return mix_model_output1.update(value="")
        p = {file.name:100 for file in files}
        return mix_model_output1.update(value=json.dumps(p,indent=2))
    except Exception as e: