From e7278437ee24b0962f59bb5c8512c29e56eb71f2 Mon Sep 17 00:00:00 2001 From: Eren Date: Thu, 6 Sep 2018 15:27:15 +0200 Subject: [PATCH 1/5] apply sigmoid to the last linear layer output --- models/tacotron.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/models/tacotron.py b/models/tacotron.py index cd023a9..8a215b9 100644 --- a/models/tacotron.py +++ b/models/tacotron.py @@ -23,7 +23,9 @@ class Tacotron(nn.Module): self.encoder = Encoder(embedding_dim) self.decoder = Decoder(256, mel_dim, r) self.postnet = PostCBHG(mel_dim) - self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim) + self.last_linear = nn.Sequential( + nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim), + nn.Sigmoid()) def forward(self, characters, mel_specs=None, mask=None): B = characters.size(0) From f60e4497a6d586fa4bd49cd4cd6c46ea3346c60e Mon Sep 17 00:00:00 2001 From: Eren Date: Sat, 15 Sep 2018 18:05:34 +0200 Subject: [PATCH 2/5] apply sigmoid to outputs --- layers/tacotron.py | 1 + 1 file changed, 1 insertion(+) diff --git a/layers/tacotron.py b/layers/tacotron.py index 6ab06d7..835abde 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -374,6 +374,7 @@ class Decoder(nn.Module): decoder_output = decoder_input # predict mel vectors from decoder vectors output = self.proj_to_mel(decoder_output) + output = torch.sigmoid(output) stop_input = output # predict stop token stop_token, stopnet_rnn_hidden = self.stopnet( From 6cfdf2d4688af3941529808af27632195f4ac593 Mon Sep 17 00:00:00 2001 From: Eren Date: Wed, 19 Sep 2018 15:22:43 +0200 Subject: [PATCH 3/5] config change --- config.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config.json b/config.json index 23320fe..4bd7da8 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ { - "model_name": "TTS-master", - "model_description": "'Same' padding as in TF", + "model_name": "TTS-sigmoid", + "model_description": "Net outputting Sigmoid 
unit", "audio_processor": "audio", "num_mels": 80, "num_freq": 1025, From 8fe733c166c6171f1fb1626d7bada6b5ab9c54d9 Mon Sep 17 00:00:00 2001 From: Eren Date: Fri, 21 Sep 2018 17:27:02 +0200 Subject: [PATCH 4/5] Make audio folder and save audio with scipy --- train.py | 2 +- utils/audio.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/train.py b/train.py index e1df221..34c5726 100644 --- a/train.py +++ b/train.py @@ -472,7 +472,7 @@ if __name__ == '__main__': OUT_PATH = create_experiment_folder(OUT_PATH, c.model_name, args.debug) CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints') AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') - os.mkdir(AUDIO_PATH) + os.makedirs(AUDIO_PATH, exist_ok=True) shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json')) # setup tensorboard diff --git a/utils/audio.py b/utils/audio.py index 4ea5bfe..9849cc9 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -3,6 +3,7 @@ import librosa import pickle import copy import numpy as np +import scipy from scipy import signal _mel_basis = None @@ -38,7 +39,8 @@ class AudioProcessor(object): def save_wav(self, wav, path): wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) - librosa.output.write_wav(path, wav_norm.astype(np.int16), self.sample_rate) + # librosa.output.write_wav(path, wav_norm.astype(np.int16), self.sample_rate) + scipy.io.wavfile.write(path, self.sample_rate, wav.astype(np.int16)) def _linear_to_mel(self, spectrogram): global _mel_basis From 95eb3367bded5d455bb54806022bedb6ca62c337 Mon Sep 17 00:00:00 2001 From: Eren Date: Fri, 21 Sep 2018 21:51:38 +0200 Subject: [PATCH 5/5] bug fix --- utils/audio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/audio.py b/utils/audio.py index 9849cc9..061fefc 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -40,7 +40,7 @@ class AudioProcessor(object): def save_wav(self, wav, path): wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) # 
librosa.output.write_wav(path, wav_norm.astype(np.int16), self.sample_rate) - scipy.io.wavfile.write(path, self.sample_rate, wav.astype(np.int16)) + scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16)) def _linear_to_mel(self, spectrogram): global _mel_basis