diff --git a/dnn/torch/osce/models/lavoce.py b/dnn/torch/osce/models/lavoce.py index e34db9efa..101a06b80 100644 --- a/dnn/torch/osce/models/lavoce.py +++ b/dnn/torch/osce/models/lavoce.py @@ -74,10 +74,13 @@ def __init__(self, norm_p=2, avg_pool_k=4, pulses=False, + power_pulses=False, + pulse_power=9, innovate1=True, innovate2=False, innovate3=False, - ftrans_k=2): + ftrans_k=2, + shape_bias=True): super().__init__() @@ -90,6 +93,8 @@ def __init__(self, self.preemph = preemph self.pulses = pulses self.ftrans_k = ftrans_k + self.power_pulses = power_pulses + self.pulse_power = pulse_power assert self.FEATURE_FRAME_SIZE % self.FRAME_SIZE == 0 self.upsamp_factor = self.FEATURE_FRAME_SIZE // self.FRAME_SIZE @@ -101,7 +106,7 @@ def __init__(self, self.feature_net = LPCNetFeatureNet(num_features + pitch_embedding_dim, cond_dim, self.upsamp_factor) # noise shaper - self.noise_shaper = NoiseShaper(cond_dim, self.FRAME_SIZE) + self.noise_shaper = NoiseShaper(cond_dim, self.FRAME_SIZE, bias=shape_bias) # comb filters left_pad = self.kernel_size // 2 @@ -117,9 +122,9 @@ def __init__(self, self.af1 = LimitedAdaptiveConv1d(1, 2, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p) # non-linear transforms - self.tdshape1 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, innovate=innovate1) - self.tdshape2 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, innovate=innovate2) - self.tdshape3 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, innovate=innovate3) + self.tdshape1 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, innovate=innovate1, bias=shape_bias) + self.tdshape2 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, innovate=innovate2, bias=shape_bias) + self.tdshape3 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, innovate=innovate3, bias=shape_bias) # combinators self.af2 = LimitedAdaptiveConv1d(2, 2, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p) @@ -151,6 +156,11 @@ def create_phase_signals(self, periods): pulse_a = torch.relu(chunk_sin - alpha) / (1 - alpha) pulse_b = torch.relu(-chunk_sin - alpha) / (1 - alpha) + chunk = torch.cat((pulse_a, pulse_b), dim = 1) + elif self.power_pulses: + chunk_sin = torch.sin(f * progression + phase0).view(batch_size, 1, self.FRAME_SIZE) + pulse_a = torch.relu(chunk_sin) ** self.pulse_power + pulse_b = torch.relu(-chunk_sin) ** self.pulse_power chunk = torch.cat((pulse_a, pulse_b), dim = 1) else: chunk_sin = torch.sin(f * progression + phase0).view(batch_size, 1, self.FRAME_SIZE) @@ -176,7 +186,7 @@ def flop_count(self, rate=16000, verbose=False): af_flops = self.af1.flop_count(rate) + self.af2.flop_count(rate) + self.af3.flop_count(rate) + self.af4.flop_count(rate) + self.af_prescale.flop_count(rate) + self.af_mix.flop_count(rate) feature_flops = (_conv1d_flop_count(self.post_cf1, frame_rate) + _conv1d_flop_count(self.post_cf2, frame_rate) + _conv1d_flop_count(self.post_af1, frame_rate) + _conv1d_flop_count(self.post_af2, frame_rate) + _conv1d_flop_count(self.post_af3, frame_rate)) - shape_flops = self.tdshape1.flop_count(rate) + self.tdshape2.flop_count(rate) + self.tdshape3.flop_count(rate) + shape_flops = self.tdshape1.flop_count(rate) + self.tdshape2.flop_count(rate) + self.tdshape3.flop_count(rate) + self.noise_shaper.flop_count(rate) if verbose: print(f"feature net: {feature_net_flops / 1e6} MFLOPS") diff --git a/dnn/torch/osce/utils/layers/noise_shaper.py b/dnn/torch/osce/utils/layers/noise_shaper.py index ba8a3af37..d7c03a997 100644 --- a/dnn/torch/osce/utils/layers/noise_shaper.py +++ b/dnn/torch/osce/utils/layers/noise_shaper.py @@ -37,7 +37,8 @@ class NoiseShaper(nn.Module): def __init__(self, feature_dim, - frame_size=160 + frame_size=160, + bias=True ): """ @@ -58,8 +59,8 @@ def __init__(self, self.frame_size = frame_size # feature transform - self.feature_alpha1 = nn.Conv1d(self.feature_dim, frame_size, 2) - self.feature_alpha2 = nn.Conv1d(frame_size, frame_size, 2) + self.feature_alpha1 = nn.Conv1d(self.feature_dim, frame_size, 2, bias=bias) + self.feature_alpha2 = nn.Conv1d(frame_size, frame_size, 2, bias=bias) def flop_count(self, rate): diff --git a/dnn/torch/osce/utils/layers/td_shaper.py b/dnn/torch/osce/utils/layers/td_shaper.py index 788dd9f27..d8d258193 100644 --- a/dnn/torch/osce/utils/layers/td_shaper.py +++ b/dnn/torch/osce/utils/layers/td_shaper.py @@ -15,7 +15,8 @@ def __init__(self, innovate=False, pool_after=False, softquant=False, - apply_weight_norm=False + apply_weight_norm=False, + bias=True ): """ @@ -51,21 +52,21 @@ def __init__(self, norm = torch.nn.utils.weight_norm if apply_weight_norm else lambda x, name=None: x # feature transform - self.feature_alpha1_f = norm(nn.Conv1d(self.feature_dim, frame_size, 2)) - self.feature_alpha1_t = norm(nn.Conv1d(self.env_dim, frame_size, 2)) - self.feature_alpha2 = norm(nn.Conv1d(frame_size, frame_size, 2)) + self.feature_alpha1_f = norm(nn.Conv1d(self.feature_dim, frame_size, 2, bias=bias)) + self.feature_alpha1_t = norm(nn.Conv1d(self.env_dim, frame_size, 2, bias=bias)) + self.feature_alpha2 = norm(nn.Conv1d(frame_size, frame_size, 2, bias=bias)) if softquant: self.feature_alpha1_f = soft_quant(self.feature_alpha1_f) if self.innovate: - self.feature_alpha1b_f = norm(nn.Conv1d(self.feature_dim, frame_size, 2)) - self.feature_alpha1b_t = norm(nn.Conv1d(self.env_dim, frame_size, 2)) - self.feature_alpha1c_f = norm(nn.Conv1d(self.feature_dim, frame_size, 2)) - self.feature_alpha1c_t = norm(nn.Conv1d(self.env_dim, frame_size, 2)) + self.feature_alpha1b_f = norm(nn.Conv1d(self.feature_dim, frame_size, 2, bias=bias)) + self.feature_alpha1b_t = norm(nn.Conv1d(self.env_dim, frame_size, 2, bias=bias)) + self.feature_alpha1c_f = norm(nn.Conv1d(self.feature_dim, frame_size, 2, bias=bias)) + self.feature_alpha1c_t = norm(nn.Conv1d(self.env_dim, frame_size, 2, bias=bias)) - self.feature_alpha2b = norm(nn.Conv1d(frame_size, frame_size, 2)) - self.feature_alpha2c = norm(nn.Conv1d(frame_size, frame_size, 2)) + self.feature_alpha2b = norm(nn.Conv1d(frame_size, frame_size, 2, bias=bias)) + self.feature_alpha2c = norm(nn.Conv1d(frame_size, frame_size, 2, bias=bias)) if softquant: self.feature_alpha1b_f = soft_quant(self.feature_alpha1b_f) diff --git a/dnn/torch/osce/utils/templates.py b/dnn/torch/osce/utils/templates.py index 5fc84ef18..37f1ea419 100644 --- a/dnn/torch/osce/utils/templates.py +++ b/dnn/torch/osce/utils/templates.py @@ -316,8 +316,8 @@ 'name': 'lavoce' }, 'training': { - 'batch_size': 64, - 'epochs': 50, + 'batch_size': 128, + 'epochs': 100, 'gen_lr_reduction': 1, 'lambda_feat': 1.0, 'lambda_reg': 0.6,