Skip to content

Fix non-deterministic incremental inference #40

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 6, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions deepvoice3_pytorch/deepvoice3.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,8 +487,17 @@ def incremental_forward(self, encoder_out, text_positions, speaker_embed=None,
return outputs, alignments, dones, decoder_states

def start_fresh_sequence(self):
    """Clear buffered layer state before a new sequence is decoded.

    Passes ``self.preattention`` and ``self.convolutions`` to
    ``_clear_modules`` and then calls ``clear_buffer()`` on
    ``self.last_conv`` directly.
    """
    # No separate ``for conv in self.convolutions`` loop: ``_clear_modules``
    # already visits every entry and skips layers without ``clear_buffer``.
    _clear_modules(self.preattention)
    _clear_modules(self.convolutions)
    self.last_conv.clear_buffer()


def _clear_modules(modules):
    """Call ``clear_buffer()`` on every module that provides it.

    Modules without a ``clear_buffer`` attribute are skipped, so the
    iterable may mix layers that keep a buffer with layers that do not.

    Args:
        modules: Iterable of objects; those exposing ``clear_buffer`` get
            it called once, in iteration order.
    """
    for m in modules:
        # Look the method up first rather than wrapping the call in
        # ``try/except AttributeError``: that form would also swallow an
        # AttributeError raised *inside* ``clear_buffer`` and hide real bugs.
        clear = getattr(m, "clear_buffer", None)
        if clear is not None:
            clear()


class Converter(nn.Module):
Expand Down
1 change: 1 addition & 0 deletions deepvoice3_pytorch/nyanko.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ def incremental_forward(self, encoder_out, text_positions,
def start_fresh_sequence(self):
    """Clear buffered layer state before a new sequence is decoded.

    Hands the audio encoder and audio decoder module collections to
    ``_clear_modules`` (in that order), then calls ``clear_buffer()`` on
    ``self.last_conv``.
    """
    # Attributes are resolved one at a time so the order of attribute
    # access and clearing stays encoder first, decoder second.
    for group_name in ("audio_encoder_modules", "audio_decoder_modules"):
        _clear_modules(getattr(self, group_name))
    self.last_conv.clear_buffer()


def _clear_modules(modules):
Expand Down
Binary file added tests/data/ljspeech-mel-00001.npy
Binary file not shown.
37 changes: 35 additions & 2 deletions tests/test_deepvoice3.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@


# `... and False` always evaluates to False, so CUDA is never enabled
# through this flag regardless of what torch reports.
use_cuda = torch.cuda.is_available() and False
# Restrict cuDNN to deterministic algorithms (see the PyTorch
# reproducibility notes).
torch.backends.cudnn.deterministic = True
num_mels = 80
num_freq = 513
outputs_per_step = 4
Expand Down Expand Up @@ -145,13 +146,45 @@ def test_multi_speaker_deepvoice3():
print("Done:", done.size())


@attr("local_only")
@attr("issue38")
def test_incremental_path_multiple_times():
    """Check that calling the same model twice yields the same mel output.

    The model is called with text inputs only (no target frames), which is
    presumably what routes it through the incremental decoding path; verify
    against the model's ``forward``. Both a single-speaker and a
    multi-speaker model are exercised, each in ``eval()`` mode.
    """
    texts = ["they discarded this for a more completely Roman and far less beautiful letter."]
    seqs = np.array([text_to_sequence(t) for t in texts])
    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    sequence = Variable(torch.LongTensor(seqs))
    text_positions = Variable(torch.LongTensor(text_positions))

    for model, speaker_ids in [
            (_get_model(force_monotonic_attention=False), None),
            (_get_model(force_monotonic_attention=False, n_speakers=32, speaker_embed_dim=16),
             Variable(torch.LongTensor([1])))]:
        model.eval()

        # first call
        mel_outputs, linear_outputs, alignments, done = model(
            sequence, text_positions=text_positions, speaker_ids=speaker_ids)

        # second call
        mel_outputs2, linear_outputs2, alignments2, done2 = model(
            sequence, text_positions=text_positions, speaker_ids=speaker_ids)

        # Should get same result; print the gap to ease debugging on failure.
        c = (mel_outputs - mel_outputs2).abs()
        print(c.mean(), c.max())

        assert np.allclose(mel_outputs.cpu().data.numpy(),
                           mel_outputs2.cpu().data.numpy(), atol=1e-5)


def test_incremental_correctness():
texts = ["they discarded this for a more completely Roman and far less beautiful letter."]
seqs = np.array([text_to_sequence(t) for t in texts])
text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

mel = np.load("/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy")
mel_path = join(dirname(__file__), "data", "ljspeech-mel-00001.npy")
mel = np.load(mel_path)
max_target_len = mel.shape[0]
r = 4
mel_dim = 80
Expand Down
38 changes: 35 additions & 3 deletions tests/test_nyanko.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from deepvoice3_pytorch.builder import nyanko
from deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq

# `... and False` always evaluates to False, so CUDA is never enabled
# through this flag regardless of what torch reports. The earlier plain
# `torch.cuda.is_available()` assignment was overwritten immediately and
# has been dropped.
use_cuda = torch.cuda.is_available() and False
num_mels = 80
num_freq = 513
outputs_per_step = 4
Expand Down Expand Up @@ -57,13 +57,45 @@ def test_nyanko_basics():
mel_outputs, linear_outputs, alignments, done = model(x, y)


@attr("local_only")
@attr("issue38")
def test_incremental_path_multiple_times():
    """Check that two identical calls to a nyanko model give the same mels.

    The model is built with ``r=1`` and ``mel_dim=80``, put in ``eval()``
    mode, and called twice with text inputs only (presumably the
    incremental decoding path; verify against the model's ``forward``).
    The mel outputs must agree within ``atol=1e-5``.
    """
    sentence = "they discarded this for a more completely Roman and far less beautiful letter."
    encoded = np.array([text_to_sequence(sentence)])
    n_chars = len(encoded[0])
    positions = np.arange(1, n_chars + 1).reshape(1, n_chars)

    sequence = Variable(torch.LongTensor(encoded))
    text_positions = Variable(torch.LongTensor(positions))

    model = nyanko(n_vocab, mel_dim=80, linear_dim=513, downsample_step=4,
                   r=1, force_monotonic_attention=False)
    model.eval()

    def run_once():
        # Same inputs every time; only the model's internal state may differ.
        return model(sequence, text_positions=text_positions, speaker_ids=None)

    mel_first, _, _, _ = run_once()
    mel_second, _, _, _ = run_once()

    # Report the mean / max absolute gap before asserting.
    gap = (mel_first - mel_second).abs()
    print(gap.mean(), gap.max())

    first_np = mel_first.cpu().data.numpy()
    second_np = mel_second.cpu().data.numpy()
    assert np.allclose(first_np, second_np, atol=1e-5)


def test_incremental_correctness():
texts = ["they discarded this for a more completely Roman and far less beautiful letter."]
seqs = np.array([text_to_sequence(t) for t in texts])
text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

mel = np.load("/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy")
mel_path = join(dirname(__file__), "data", "ljspeech-mel-00001.npy")
mel = np.load(mel_path)[::4]
max_target_len = mel.shape[0]
r = 1
mel_dim = 80
Expand Down