forked from dhgrs/chainer-WaveGlow
Commit 4439a45 (0 parents)
Showing 8 changed files with 624 additions and 0 deletions.
README.md
@@ -0,0 +1,44 @@
# chainer-WaveGlow

A Chainer implementation of WaveGlow (https://nv-adlr.github.io/WaveGlow).

# Results
I'll upload samples after the training finishes. I'm getting audible results now. Please wait!

# Requirements
I trained and generated with

- python (3.5.2)
- chainer (5.0.0)
- librosa (0.6.2)
- matplotlib (3.0.1)

# Usage
## download dataset
You can easily download the VCTK Corpus (English, multi-speaker) or LJ-Speech (English, single-speaker) via [my repository](https://github.com/dhgrs/download_dataset).

## set parameters
I'll write details later.

## training
You can use the same command in each directory.
```
(without GPU)
python train.py
(with GPU #n)
python train.py -g n
```

You can resume from a snapshot and restart training like below.
```
python train.py -r snapshot_iter_100000
```
The other arguments `-f` and `-p` control multiprocessing during preprocessing: `-f` sets the number of batches to prefetch and `-p` sets the number of worker processes. An example combining these flags is shown below.
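For example, to resume training on GPU 0 while preprocessing with 4 worker processes (the snapshot name and flag values here are illustrative):
```
python train.py -g 0 -r snapshot_iter_100000 -f 2 -p 4
```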
## generating
```
python generate.py -i <input file> -o <output file> -m <trained model>
```

If you don't set `-o`, the default file name `Result.wav` is used. If you don't set `-s`, the speaker inferred from the input file path is used.
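For example, to synthesize from one utterance with a trained snapshot (both file names are illustrative):
```
python generate.py -i p225_001.wav -o sample.wav -m snapshot_iter_500000
```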
WaveGlow/__init__.py
@@ -0,0 +1,3 @@
from .model import Glow
from .modules import Invertible1x1Convolution
from .modules import AffineCouplingLayer
WaveGlow/model.py
@@ -0,0 +1,109 @@
import chainer
import numpy

from .modules import Flow

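# _squeeze folds every `squeeze_factor` consecutive time steps into the
# channel axis, (batch, channel, length) -> (batch, channel * factor,
# length // factor); _unsqueeze is its exact inverse.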
def _squeeze(x, squeeze_factor):
    batchsize, channel, length = x.shape
    x = x.reshape(
        (batchsize, channel, length // squeeze_factor, squeeze_factor))
    x = x.transpose((0, 1, 3, 2))
    x = x.reshape(
        (batchsize, channel * squeeze_factor, length // squeeze_factor))
    return x


def _unsqueeze(x, squeeze_factor):
    batchsize, channel, length = x.shape
    x = x.reshape(
        (batchsize, channel // squeeze_factor, squeeze_factor, length))
    x = x.transpose((0, 1, 3, 2))
    x = x.reshape(
        (batchsize, channel // squeeze_factor, length * squeeze_factor))
    return x

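# Glow stacks `n_flows` flow steps. After every `early_every`-th step,
# `early_size` channels are split off as "early outputs", as in WaveGlow.
# The encoder is a transposed convolution that upsamples the mel
# spectrogram by a factor of `hop_length` so it aligns with the samples.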
class Glow(chainer.Chain):
    def __init__(
            self, hop_length=256, n_mels=80, input_channel=1,
            squeeze_factor=8, n_flows=12, n_layers=8,
            wn_channel=512, early_every=4, early_size=2, var=0.5):
        super(Glow, self).__init__()
        self.input_channel = input_channel
        self.squeeze_factor = squeeze_factor
        self.n_flows = n_flows
        self.early_every = early_every
        self.early_size = early_size
        self.ln_var = float(numpy.log(var))
        flows = chainer.ChainList()
        for i in range(n_flows):
            flows.add_link(Flow(
                input_channel * squeeze_factor -
                early_size * (i // early_every),
                n_mels * squeeze_factor, n_layers, wn_channel))
        with self.init_scope():
            self.encoder = chainer.links.Deconvolution1D(
                n_mels, n_mels, hop_length * 4, hop_length,
                pad=hop_length * 3 // 2)
            self.flows = flows

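    # Training objective: negative log-likelihood of the latent z under
    # N(0, var), minus the flow log-determinants (log_s from the affine
    # couplings, log_det_W from the 1x1 convolutions). log(2 ** 16) is a
    # constant offset, presumably accounting for 16-bit quantization.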
    def __call__(self, x, condition):
        _, gaussian_nll, sum_log_s, sum_log_det_W = self._forward(x, condition)
        loss = gaussian_nll - sum_log_s - sum_log_det_W
        loss += float(numpy.log(2 ** 16))
        chainer.reporter.report(
            {
                'gaussian_nll': gaussian_nll, 'log_s': sum_log_s,
                'log_det_W': sum_log_det_W, 'loss': loss}, self)
        return loss

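    # Forward pass: upsample the condition, squeeze both tensors, then run
    # the flows in order, peeling off `early_size` channels after every
    # `early_every`-th flow. All likelihood terms are normalized per element.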
    def _forward(self, x, condition):
        condition = self.encoder(condition)
        x = _squeeze(x, self.squeeze_factor)
        condition = _squeeze(condition, self.squeeze_factor)
        sum_log_s = 0
        sum_log_det_W = 0
        outputs = []
        for i, flow in enumerate(self.flows.children()):
            x, log_s, log_det_W = flow(x, condition)
            if (i + 1) % self.early_every == 0:
                output, x = x[:, :self.early_size], x[:, self.early_size:]
                outputs.append(output)
            sum_log_s += log_s
            sum_log_det_W += log_det_W
        outputs.append(x)
        z = chainer.functions.concat(outputs, axis=1)
        gaussian_nll = chainer.functions.gaussian_nll(
            z,
            mean=self.xp.zeros_like(z, dtype=self.xp.float32),
            ln_var=self.ln_var * self.xp.ones_like(z, dtype=self.xp.float32)
        )
        gaussian_nll /= numpy.prod(z.shape)
        sum_log_s /= numpy.prod(z.shape)
        sum_log_det_W /= numpy.prod(z.shape)
        return z, gaussian_nll, sum_log_s, sum_log_det_W

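    # Inference: sample z from N(0, var) (or accept a given z), then run
    # the flows in reverse, re-inserting the early-output channels just
    # before the flow steps that emitted them in the forward pass.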
    def _reverse(self, z, condition, var=0):
        condition = self.encoder(condition)
        condition = _squeeze(condition, self.squeeze_factor)
        batchsize, _, length = condition.shape
        if z is None:
            # random.normal takes a standard deviation, not a variance
            z = self.xp.random.normal(
                0, var ** 0.5,
                (batchsize, self.input_channel * self.squeeze_factor, length))
            z = z.astype(self.xp.float32)
        _, channel, _ = z.shape
        start_channel = channel - \
            self.early_size * (self.n_flows // self.early_every)
        x, z = z[:, -start_channel:], z[:, :-start_channel]
        for i, flow in enumerate(reversed(list(self.flows.children()))):
            if (self.n_flows - i) % self.early_every == 0:
                x, z = chainer.functions.concat((
                    z[:, -self.early_size:], x)), z[:, :-self.early_size]
            x = flow.reverse(x, condition)
        x = _unsqueeze(x, self.squeeze_factor)
        return x

    def generate(self, condition, var=0.6 ** 2):
        return self._reverse(None, condition, var)
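
# Usage sketch (illustrative): `x` is a (batch, 1, samples) float32
# waveform and `mel` a (batch, n_mels, frames) float32 mel spectrogram,
# e.g. as produced by utils.Preprocess:
#     glow = Glow()
#     loss = glow(x, mel)        # training objective
#     wave = glow.generate(mel)  # inference: mel -> waveform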
WaveGlow/modules.py
@@ -0,0 +1,152 @@
import chainer
import chainer.functions as F
import chainer.links as L

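# Weight normalization: reparameterize each weight as W = g * v and
# recompute W in a LinkHook before every forward pass. Note that this
# variant uses a single scalar g for the whole tensor rather than one
# scalar per output channel.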
def _normalize(W):
    xp = chainer.cuda.get_array_module(W)
    g = xp.sqrt(xp.sum(W ** 2)).reshape((1,))
    v = W / g
    return g, v


def weight_norm(link):
    assert hasattr(link, 'W')

    def _W(self):
        return self.v * self.g

    def _remove(self):
        W = _W(self)
        del self.g
        del self.v
        del self.W
        with self.init_scope():
            self.W = chainer.Parameter(W)

    def _replace(args):
        W = _W(args.link)
        g, v = _normalize(_W(args.link).array)
        args.link.g.array[...] = g
        args.link.v.array[...] = v
        args.link.W = W

    g, v = _normalize(link.W.array)
    del link.W
    with link.init_scope():
        link.g = chainer.Parameter(g)
        link.v = chainer.Parameter(v)

    link.remove = _remove

    hook = chainer.LinkHook()
    hook.forward_preprocess = _replace
    link.add_hook(hook)
    return link


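# Invertible 1x1 convolution: W is initialized to a random orthogonal
# matrix (via QR decomposition), so log|det W| starts at zero. __call__
# returns the convolved signal and the log-determinant term scaled by
# batch size and length.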
class Invertible1x1Convolution(chainer.link.Link):
    def __init__(self, channel):
        super(Invertible1x1Convolution, self).__init__()
        xp = self.xp

        W = xp.linalg.qr(xp.random.normal(
            0, 1, (channel, channel)))[0].astype(xp.float32)
        W = W.reshape(W.shape + (1,))

        with self.init_scope():
            self.W = chainer.Parameter(W)

    @property
    def invW(self):
        return F.expand_dims(F.inv(self.W[..., 0]), axis=2)

    def __call__(self, x):
        return F.convolution_1d(x, self.W), \
            x.shape[0] * x.shape[-1] * F.log(F.absolute(F.det(self.W[..., 0])))

    def reverse(self, x):
        return F.convolution_1d(x, self.invW)


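# Non-causal WaveNet-style conditioner: a stack of dilated convolutions
# with gated activations maps half of the coupling input, plus the
# upsampled mel condition, to log_s and t. The zero-initialized output
# conv makes each coupling start as the identity transform.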
class WaveNet(chainer.Chain):
    def __init__(self, out_channel, n_condition, n_layers, n_channel):
        super(WaveNet, self).__init__()
        dilated_convs = chainer.ChainList()
        residual_convs = chainer.ChainList()
        skip_convs = chainer.ChainList()
        condition_convs = chainer.ChainList()
        for i in range(n_layers):
            dilated_convs.add_link(weight_norm(
                L.Convolution1D(
                    n_channel, 2 * n_channel, 3, pad=2 ** i, dilate=2 ** i)))
            residual_convs.add_link(weight_norm(
                L.Convolution1D(n_channel, n_channel, 1)))
            skip_convs.add_link(weight_norm(
                L.Convolution1D(n_channel, n_channel, 1)))
            condition_convs.add_link(weight_norm(
                L.Convolution1D(n_condition, 2 * n_channel, 1)))
        with self.init_scope():
            self.input_conv = weight_norm(
                L.Convolution1D(out_channel // 2, n_channel, 1))
            self.dilated_convs = dilated_convs
            self.residual_convs = residual_convs
            self.skip_convs = skip_convs
            self.condition_convs = condition_convs
            self.output_conv = L.Convolution1D(
                n_channel, out_channel, 1,
                initialW=chainer.initializers.Zero())

    def __call__(self, x, condition):
        x = self.input_conv(x)
        skip_connection = 0
        for dilated, residual, skip, condition_conv in zip(
                self.dilated_convs, self.residual_convs, self.skip_convs,
                self.condition_convs):
            z = dilated(x) + condition_conv(condition)
            z_tanh, z_sigmoid = F.split_axis(z, 2, axis=1)
            z = F.tanh(z_tanh) * F.sigmoid(z_sigmoid)
            x = residual(z)
            skip_connection += skip(z)
        y = self.output_conv(skip_connection)
        log_s, t = F.split_axis(y, 2, axis=1)
        return log_s, t


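# Affine coupling: x_a passes through unchanged and parameterizes an
# affine transform of x_b. The Jacobian log-determinant is sum(log_s),
# and reverse() inverts the transform in closed form.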
class AffineCouplingLayer(chainer.Chain):
    def __init__(self, *args, **kwargs):
        super(AffineCouplingLayer, self).__init__()
        with self.init_scope():
            self.encoder = WaveNet(*args, **kwargs)

    def __call__(self, x, condition):
        x_a, x_b = F.split_axis(x, 2, axis=1)
        log_s, t = self.encoder(x_a, condition)
        x_b = F.exp(log_s) * (x_b + t)
        return F.concat((x_a, x_b), axis=1), F.sum(log_s)

    def reverse(self, z, condition):
        x_a, x_b = F.split_axis(z, 2, axis=1)
        log_s, t = self.encoder(x_a, condition)
        x_b = x_b * F.exp(-log_s) - t
        return F.concat((x_a, x_b), axis=1)


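# One flow step = invertible 1x1 convolution followed by an affine
# coupling layer; reverse() applies the inverses in the opposite order.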
class Flow(chainer.Chain):
    def __init__(self, channel, n_condition, n_layers, wn_channel):
        super(Flow, self).__init__()
        with self.init_scope():
            self.invertible1x1convolution = Invertible1x1Convolution(
                channel)
            self.affinecouplinglayer = AffineCouplingLayer(
                channel, n_condition, n_layers, wn_channel)

    def __call__(self, x, condition):
        x, log_det_W = self.invertible1x1convolution(x)
        z, log_s = self.affinecouplinglayer(x, condition)
        return z, log_s, log_det_W

    def reverse(self, z, condition):
        z = self.affinecouplinglayer.reverse(z, condition)
        x = self.invertible1x1convolution.reverse(z)
        return x
generate.py
@@ -0,0 +1,63 @@
import argparse

import numpy
import librosa
import chainer

from WaveGlow import Glow
from utils import Preprocess
import params

parser = argparse.ArgumentParser()
parser.add_argument('--input', '-i', help='Input file')
parser.add_argument('--output', '-o', default='Result.wav', help='Output file')
parser.add_argument('--model', '-m', help='Snapshot of trained model')
parser.add_argument('--var', '-v', type=float, default=0.6 ** 2,
                    help='Variance of Gaussian distribution')
parser.add_argument('--gpu', '-g', type=int, default=-1,
                    help='GPU ID (negative value indicates CPU)')
args = parser.parse_args()
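# Enable cuDNN autotune and a larger convolution workspace when running
# on a GPU; both speed up the large dilated convolutions.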
if args.gpu >= 0:
    chainer.cuda.set_max_workspace_size(2 * 512 * 1024 * 1024)
    chainer.global_config.autotune = True

# set data
path = args.input

# preprocess
n = 1  # batch size; currently only 1 is supported
inputs = Preprocess(
    params.sr, params.n_fft, params.hop_length, params.n_mels, params.fmin,
    params.fmax, params.top_db, None)(path)

_, condition = inputs
condition = numpy.expand_dims(condition, axis=0)

# make model
glow = Glow(
    params.hop_length, params.n_mels, 1,
    params.squeeze_factor, params.n_flows, params.n_layers,
    params.wn_channel, params.early_every, params.early_size,
    params.var)

# load trained parameter
chainer.serializers.load_npz(args.model, glow, 'updater/model:main/')

if args.gpu >= 0:
    use_gpu = True
    chainer.cuda.get_device_from_id(args.gpu).use()
else:
    use_gpu = False

# forward
if use_gpu:
    condition = chainer.cuda.to_gpu(condition, device=args.gpu)
    glow.to_gpu(device=args.gpu)
condition = chainer.Variable(condition)

with chainer.using_config('enable_backprop', False):
    output = glow.generate(condition, var=args.var)

output = chainer.cuda.to_cpu(output.array)
output = numpy.squeeze(output)
librosa.output.write_wav(args.output, output, params.sr)