Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New variance parameter: tension #169

Merged
merged 11 commits into from
Feb 19, 2024
4 changes: 3 additions & 1 deletion augmentation/spec_stretch.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None)
).cpu().numpy()

f0, _ = self.pe.get_pitch(
wav, aug_item['length'], hparams, speed=speed, interp_uv=hparams['interp_uv']
wav, samplerate=hparams['audio_sample_rate'], length=aug_item['length'],
hop_size=hparams['hop_size'], f0_min=hparams['f0_min'], f0_max=hparams['f0_max'],
speed=speed, interp_uv=hparams['interp_uv']
)
aug_item['f0'] = f0.astype(np.float32)

Expand Down
6 changes: 5 additions & 1 deletion basics/base_pe.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
class BasePE:
def get_pitch(self, waveform, length, hparams, interp_uv=False, speed=1):
def get_pitch(
self, waveform, samplerate, length,
*, hop_size, f0_min=65, f0_max=1100,
speed=1, interp_uv=False
):
raise NotImplementedError()
2 changes: 2 additions & 0 deletions configs/acoustic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,13 @@ mel_vmax: 1.5
interp_uv: true
energy_smooth_width: 0.12
breathiness_smooth_width: 0.12
tension_smooth_width: 0.12

use_spk_id: false
f0_embed_type: continuous
use_energy_embed: false
use_breathiness_embed: false
use_tension_embed: false
use_key_shift_embed: false
use_speed_embed: false

Expand Down
4 changes: 4 additions & 0 deletions configs/variance.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ predict_dur: true
predict_pitch: true
predict_energy: false
predict_breathiness: false
predict_tension: false

dur_prediction_args:
arch: fs2
Expand Down Expand Up @@ -77,6 +78,9 @@ energy_smooth_width: 0.12
breathiness_db_min: -96.0
breathiness_db_max: -20.0
breathiness_smooth_width: 0.12
tension_logit_min: -10.0
tension_logit_max: 10.0
tension_smooth_width: 0.12

variances_prediction_args:
total_repeat_bins: 48
Expand Down
2 changes: 2 additions & 0 deletions inference/ds_acoustic.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def __init__(self, device=None, load_model=True, load_vocoder=True, ckpt_steps=N
self.variances_to_embed.add('energy')
if hparams.get('use_breathiness_embed', False):
self.variances_to_embed.add('breathiness')
if hparams.get('use_tension_embed', False):
self.variances_to_embed.add('tension')

self.ph_encoder = TokenTextEncoder(vocab_list=build_phoneme_list())
if hparams['use_spk_id']:
Expand Down
5 changes: 4 additions & 1 deletion inference/val_nsf_hifigan.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,10 @@ def get_pitch(wav_data, mel, hparams, threshold=0.3):
if not filename.endswith('.wav'):
continue
wav, mel = vocoder.wav2spec(os.path.join(in_path, filename))
f0, _ = get_pitch_parselmouth(wav, len(mel), hparams)
f0, _ = get_pitch_parselmouth(
wav, samplerate=hparams['audio_sample_rate'], length=len(mel),
hop_size=hparams['hop_size']
)

wav_out = vocoder.spec2wav(mel, f0=f0)
save_wav(wav_out, os.path.join(out_path, filename), hparams['audio_sample_rate'])
3 changes: 3 additions & 0 deletions modules/fastspeech/acoustic_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,13 @@ def __init__(self, vocab_size):
self.variance_embed_list = []
self.use_energy_embed = hparams.get('use_energy_embed', False)
self.use_breathiness_embed = hparams.get('use_breathiness_embed', False)
self.use_tension_embed = hparams.get('use_tension_embed', False)
if self.use_energy_embed:
self.variance_embed_list.append('energy')
if self.use_breathiness_embed:
self.variance_embed_list.append('breathiness')
if self.use_tension_embed:
self.variance_embed_list.append('tension')

self.use_variance_embeds = len(self.variance_embed_list) > 0
if self.use_variance_embeds:
Expand Down
15 changes: 14 additions & 1 deletion modules/fastspeech/param_adaptor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from modules.diffusion.ddpm import MultiVarianceDiffusion
from utils.hparams import hparams

VARIANCE_CHECKLIST = ['energy', 'breathiness']
VARIANCE_CHECKLIST = ['energy', 'breathiness', 'tension']


class ParameterAdaptorModule(torch.nn.Module):
Expand All @@ -14,10 +14,13 @@ def __init__(self):
self.variance_prediction_list = []
self.predict_energy = hparams.get('predict_energy', False)
self.predict_breathiness = hparams.get('predict_breathiness', False)
self.predict_tension = hparams.get('predict_tension', False)
if self.predict_energy:
self.variance_prediction_list.append('energy')
if self.predict_breathiness:
self.variance_prediction_list.append('breathiness')
if self.predict_tension:
self.variance_prediction_list.append('tension')
self.predict_variances = len(self.variance_prediction_list) > 0

def build_adaptor(self, cls=MultiVarianceDiffusion):
Expand All @@ -38,6 +41,16 @@ def build_adaptor(self, cls=MultiVarianceDiffusion):
))
clamps.append((hparams['breathiness_db_min'], 0.))

if self.predict_tension:
ranges.append((
hparams['tension_logit_min'],
hparams['tension_logit_max']
))
clamps.append((
hparams['tension_logit_min'],
hparams['tension_logit_max']
))

variances_hparams = hparams['variances_prediction_args']
total_repeat_bins = variances_hparams['total_repeat_bins']
assert total_repeat_bins % len(self.variance_prediction_list) == 0, \
Expand Down
11 changes: 9 additions & 2 deletions modules/pe/pm.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,12 @@


class ParselmouthPE(BasePE):
def get_pitch(self, waveform, length, hparams, interp_uv=False, speed=1):
return get_pitch_parselmouth(waveform, length, hparams, speed=speed, interp_uv=interp_uv)
def get_pitch(
        self, waveform, samplerate, length,
        *, hop_size, f0_min=65, f0_max=1100,
        speed=1, interp_uv=False
):
    """
    Extract pitch by delegating to ``get_pitch_parselmouth``.

    :param waveform: audio samples, forwarded unchanged
    :param samplerate: sample rate of ``waveform``
    :param length: forwarded as ``length`` (presumably the target number of frames; confirm)
    :param hop_size: hop size, keyword-only and required
    :param f0_min: lower pitch bound, forwarded to the extractor
    :param f0_max: upper pitch bound, forwarded to the extractor
    :param speed: forwarded as ``speed``
    :param interp_uv: forwarded as ``interp_uv``
    :return: the result of ``get_pitch_parselmouth``; callers unpack it as a 2-tuple
    """
    # Forward the pitch range too: callers pass hparams['f0_min'] / hparams['f0_max'],
    # and dropping them here would silently fall back to the helper's own defaults.
    # NOTE(review): assumes get_pitch_parselmouth accepts f0_min / f0_max keywords - confirm.
    return get_pitch_parselmouth(
        waveform, samplerate=samplerate, length=length,
        hop_size=hop_size, f0_min=f0_min, f0_max=f0_max,
        speed=speed, interp_uv=interp_uv
    )
20 changes: 12 additions & 8 deletions modules/pe/pw.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,20 @@
import pyworld as pw
from utils.pitch_utils import interp_f0

class HarvestPE(BasePE):
def get_pitch(self, waveform, length, hparams, interp_uv=False, speed=1):
hop_size = int(np.round(hparams['hop_size'] * speed))

time_step = 1000 * hop_size / hparams['audio_sample_rate']
f0_floor = hparams['f0_min']
f0_ceil = hparams['f0_max']
class HarvestPE(BasePE):
def get_pitch(
self, waveform, samplerate, length,
*, hop_size, f0_min=65, f0_max=1100,
speed=1, interp_uv=False
):
hop_size = int(np.round(hop_size * speed))
time_step = 1000 * hop_size / samplerate

f0, _ = pw.harvest(waveform.astype(np.float64), hparams['audio_sample_rate'], f0_floor=f0_floor, f0_ceil=f0_ceil, frame_period=time_step)
f0, _ = pw.harvest(
waveform.astype(np.float64), samplerate,
f0_floor=f0_min, f0_ceil=f0_max, frame_period=time_step
)
f0 = f0.astype(np.float32)

if f0.size < length:
Expand All @@ -22,4 +27,3 @@ def get_pitch(self, waveform, length, hparams, interp_uv=False, speed=1):
if interp_uv:
f0, uv = interp_f0(f0, uv)
return f0, uv

12 changes: 8 additions & 4 deletions modules/pe/rmvpe/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,17 @@ def infer_from_audio(self, audio, sample_rate=16000, thred=0.03, use_viterbi=Fal
f0 = self.decode(hidden, thred=thred, use_viterbi=use_viterbi)
return f0

def get_pitch(self, waveform, length, hparams, interp_uv=False, speed=1):
f0 = self.infer_from_audio(waveform, sample_rate=hparams['audio_sample_rate'])
def get_pitch(
self, waveform, samplerate, length,
*, hop_size, f0_min=65, f0_max=1100,
speed=1, interp_uv=False
):
f0 = self.infer_from_audio(waveform, sample_rate=samplerate)
uv = f0 == 0
f0, uv = interp_f0(f0, uv)

hop_size = int(np.round(hparams['hop_size'] * speed))
time_step = hop_size / hparams['audio_sample_rate']
hop_size = int(np.round(hop_size * speed))
time_step = hop_size / samplerate
f0_res = resample_align_curve(f0, 0.01, time_step, length)
uv_res = resample_align_curve(uv.astype(np.float32), 0.01, time_step, length) > 0.5
if not interp_uv:
Expand Down
50 changes: 43 additions & 7 deletions preprocessing/acoustic_binarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@
from modules.pe import initialize_pe
from modules.vocoders.registry import VOCODERS
from utils.binarizer_utils import (
DecomposedWaveform,
SinusoidalSmoothingConv1d,
get_mel2ph_torch,
get_energy_librosa,
get_breathiness_pyworld
get_breathiness_pyworld,
get_tension_base_harmonic,
)
from utils.hparams import hparams

Expand All @@ -37,21 +39,24 @@
'f0',
'energy',
'breathiness',
'tension',
'key_shift',
'speed'
'speed',
]

pitch_extractor: BasePE = None
energy_smooth: SinusoidalSmoothingConv1d = None
breathiness_smooth: SinusoidalSmoothingConv1d = None
tension_smooth: SinusoidalSmoothingConv1d = None


class AcousticBinarizer(BaseBinarizer):
def __init__(self):
super().__init__(data_attrs=ACOUSTIC_ITEM_ATTRIBUTES)
self.lr = LengthRegulator()
self.need_energy = hparams.get('use_energy_embed', False)
self.need_breathiness = hparams.get('use_breathiness_embed', False)
self.need_energy = hparams['use_energy_embed']
self.need_breathiness = hparams['use_breathiness_embed']
self.need_tension = hparams['use_tension_embed']

def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk_id):
meta_data_dict = {}
Expand Down Expand Up @@ -108,7 +113,9 @@ def process_item(self, item_name, meta_data, binarization_args):
if pitch_extractor is None:
pitch_extractor = initialize_pe()
gt_f0, uv = pitch_extractor.get_pitch(
wav, length, hparams, interp_uv=hparams['interp_uv']
wav, samplerate=hparams['audio_sample_rate'], length=length,
hop_size=hparams['hop_size'], f0_min=hparams['f0_min'], f0_max=hparams['f0_max'],
interp_uv=hparams['interp_uv']
)
if uv.all(): # All unvoiced
print(f'Skipped \'{item_name}\': empty gt f0')
Expand All @@ -117,7 +124,9 @@ def process_item(self, item_name, meta_data, binarization_args):

if self.need_energy:
# get ground truth energy
energy = get_energy_librosa(wav, length, hparams).astype(np.float32)
energy = get_energy_librosa(
wav, length, hop_size=hparams['hop_size'], win_size=hparams['win_size']
).astype(np.float32)

global energy_smooth
if energy_smooth is None:
Expand All @@ -128,9 +137,17 @@ def process_item(self, item_name, meta_data, binarization_args):

processed_input['energy'] = energy.cpu().numpy()

# create a DecomposedWaveform object for further feature extraction
dec_waveform = DecomposedWaveform(
wav, samplerate=hparams['audio_sample_rate'], f0=gt_f0 * ~uv,
hop_size=hparams['hop_size'], fft_size=hparams['fft_size'], win_size=hparams['win_size']
)

if self.need_breathiness:
# get ground truth breathiness
breathiness = get_breathiness_pyworld(wav, gt_f0 * ~uv, length, hparams).astype(np.float32)
breathiness = get_breathiness_pyworld(
dec_waveform, None, None, length=length
)

global breathiness_smooth
if breathiness_smooth is None:
Expand All @@ -141,6 +158,25 @@ def process_item(self, item_name, meta_data, binarization_args):

processed_input['breathiness'] = breathiness.cpu().numpy()

if self.need_tension:
# get ground truth tension
tension = get_tension_base_harmonic(
dec_waveform, None, None, length=length, domain='logit'
)

global tension_smooth
if tension_smooth is None:
tension_smooth = SinusoidalSmoothingConv1d(
round(hparams['tension_smooth_width'] / self.timestep)
).eval().to(self.device)
tension = tension_smooth(torch.from_numpy(tension).to(self.device)[None])[0]
if tension.isnan().any():
print('Error:', item_name)
print(tension)
return None

processed_input['tension'] = tension.cpu().numpy()

if hparams.get('use_key_shift_embed', False):
processed_input['key_shift'] = 0.

Expand Down
Loading