Bug fix in the attention module and a new notebook to experiment with spectrogram reconstruction

This commit is contained in:
Eren Golge 2018-01-31 07:21:22 -08:00
Parent 9a22f5d085
Commit 1320d5344a
17 changed files with 928 additions and 51 deletions

PlayGround.ipynb Normal file (285 additions)

File diff suppressed because one or more lines are too long


@@ -1,8 +1,8 @@
 {
     "num_mels": 80,
-    "num_freq": 1024,
+    "num_freq": 1025,
     "sample_rate": 20000,
-    "frame_length_ms": 50.0,
+    "frame_length_ms": 50,
     "frame_shift_ms": 12.5,
     "preemphasis": 0.97,
     "min_level_db": -100,
@@ -12,11 +12,11 @@
     "text_cleaner": "english_cleaners",
     "epochs": 2000,
-    "lr": 0.001,
-    "lr_patience": 2,
+    "lr": 0.003,
+    "lr_patience": 5,
     "lr_decay": 0.5,
     "batch_size": 256,
-    "griffinf_lim_iters": 60,
+    "griffin_lim_iters": 60,
     "power": 1.5,
     "r": 5,

Binary data
core Normal file

Binary file not shown.

Binary data
datasets/.LJSpeech.py.swp

Binary file not shown.


@@ -20,6 +20,8 @@
     "power": 1.5,
     "r": 5,
     "num_loader_workers": 16,
+    "save_step": 1,
+    "data_path": "/data/shared/KeithIto/LJSpeech-1.0",
     "output_path": "result",

Binary data
layers/.attention.py.swp

Binary file not shown.

Binary data
layers/.tacotron.py.swp

Binary file not shown.


@@ -73,7 +73,8 @@ class AttentionWrapper(nn.Module):
             alignment.data.masked_fill_(mask, self.score_mask_value)
 
         # Normalize attention weight
-        alignment = F.softmax(alignment, dim=0)
+        alignment = F.softmax(alignment, dim=-1)  ## TODO: might be buggy
+        print(alignment.size())
 
         # Attention context vector
         # (batch, 1, dim)
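
The softmax axis matters because attention weights must be normalized per decoder step: with alignment shaped (batch, encoder_steps), as the surrounding shape comments suggest, dim=0 normalizes across the batch while dim=-1 normalizes across encoder steps. A minimal sketch, assuming that shape:

# Minimal repro of the softmax-axis bug, assuming alignment is
# (batch, encoder_steps).
import torch
import torch.nn.functional as F

alignment = torch.randn(2, 5)          # (batch, encoder_steps)

wrong = F.softmax(alignment, dim=0)    # normalizes across the batch
fixed = F.softmax(alignment, dim=-1)   # normalizes across encoder steps

print(wrong.sum(dim=-1))  # rows generally do not sum to 1
print(fixed.sum(dim=-1))  # tensor([1., 1.]): valid attention weights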

Binary data
models/.tacotron.py.swp

Binary file not shown.


@@ -2,7 +2,7 @@
 import torch
 from torch.autograd import Variable
 from torch import nn
-from utils.text.symbols import symbols
+from TTS.utils.text.symbols import symbols
 from TTS.layers.tacotron import Prenet, Encoder, Decoder, CBHG
 
 class Tacotron(nn.Module):

notebooks/PlayGround.ipynb Normal file (354 additions)

File diff suppressed because one or more lines are too long


notebooks/utils.py Normal file (51 additions)

@@ -0,0 +1,51 @@
+import io
+import librosa
+import librosa.display  # needed: specshow is not exposed by "import librosa" alone
+import torch
+import numpy as np
+from TTS.utils.text import text_to_sequence
+from matplotlib import pylab as plt
+
+hop_length = 250
+
+
+def create_speech(m, s, CONFIG, use_cuda, ap):
+    text_cleaner = [CONFIG.text_cleaner]
+    seq = np.array(text_to_sequence(s, text_cleaner))
+    # mel = np.zeros([seq.shape[0], CONFIG.num_mels, 1], dtype=np.float32)
+
+    if use_cuda:
+        chars_var = torch.autograd.Variable(torch.from_numpy(seq), volatile=True).unsqueeze(0).cuda()
+        # mel_var = torch.autograd.Variable(torch.from_numpy(mel).type(torch.cuda.FloatTensor), volatile=True).cuda()
+    else:
+        chars_var = torch.autograd.Variable(torch.from_numpy(seq), volatile=True).unsqueeze(0)
+        # mel_var = torch.autograd.Variable(torch.from_numpy(mel).type(torch.FloatTensor), volatile=True)
+
+    mel_out, linear_out, alignments = m.forward(chars_var)
+    linear_out = linear_out[0].data.cpu().numpy()
+    alignment = alignments[0].cpu().data.numpy()
+    spec = ap._denormalize(linear_out)
+    wav = ap.inv_spectrogram(linear_out.T)
+    wav = wav[:ap.find_endpoint(wav)]
+    out = io.BytesIO()
+    ap.save_wav(wav, out)
+    return wav, alignment, spec
+
+
+def visualize(alignment, spectrogram, CONFIG):
+    label_fontsize = 16
+    plt.figure(figsize=(16, 16))
+
+    plt.subplot(2, 1, 1)
+    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
+    plt.xlabel("Decoder timestep", fontsize=label_fontsize)
+    plt.ylabel("Encoder timestep", fontsize=label_fontsize)
+    plt.colorbar()
+
+    plt.subplot(2, 1, 2)
+    librosa.display.specshow(spectrogram.T, sr=CONFIG.sample_rate,
+                             hop_length=hop_length, x_axis="time", y_axis="linear")
+    plt.xlabel("Time", fontsize=label_fontsize)
+    plt.ylabel("Hz", fontsize=label_fontsize)
+    plt.tight_layout()
+    plt.colorbar()
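
A quick way to exercise visualize is with synthetic inputs; the sketch below is a hypothetical smoke test, run with the helpers above in scope, and the shapes are assumptions chosen to match the .T transposes in visualize:

# Hypothetical smoke test for visualize(); shapes are assumptions.
import numpy as np
from types import SimpleNamespace

CONFIG = SimpleNamespace(sample_rate=20000)
alignment = np.random.rand(80, 45)       # (decoder_steps, encoder_steps)
spectrogram = np.random.rand(200, 1025)  # (frames, num_freq)
visualize(alignment, spectrogram, CONFIG)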


@@ -38,17 +38,11 @@ def main(args):
     # Sentences for generation
     sentences = [
         "And it is worth mention in passing that, as an example of fine typography,",
-        # From July 8, 2017 New York Times:
-        'Scientists at the CERN laboratory say they have discovered a new particle.',
-        'There\'s a way to measure the acute emotional intelligence that has never gone out of style.',
-        'President Trump met with other leaders at the Group of 20 conference.',
-        'The Senate\'s bill to repeal and replace the Affordable Care Act is now imperiled.',
-        # From Google's Tacotron example page:
-        'Generative adversarial network or variational auto-encoder.',
-        'The buses aren\'t the problem, they actually provide a solution.',
-        'Does the quick brown fox jump over the lazy dog?',
-        'Talib Kweli confirmed to AllHipHop that he will be releasing an album in the next year.',
+        "I try my best to translate text to speech. But I know I need more work.",
+        "The new Firefox, Fast for good.",
+        "Technology is continually providing us with new ways to create and publish stories.",
+        "For these stories to achieve their full impact, it requires tools.",
+        "I am alien and I am here to destroy your world."
     ]
 
     # Synthesis and save to wav files


@@ -111,6 +111,8 @@ def main(args):
         progbar = Progbar(len(dataset) / c.batch_size)
 
         for i, data in enumerate(dataloader):
+            start_time = time.time()
+
             text_input = data[0]
             magnitude_input = data[1]
             mel_input = data[2]
@@ -128,42 +130,40 @@
             if use_cuda:
                 text_input_var = Variable(torch.from_numpy(text_input).type(
-                    torch.cuda.LongTensor), requires_grad=False).cuda()
+                    torch.cuda.LongTensor)).cuda()
                 mel_input_var = Variable(torch.from_numpy(mel_input).type(
-                    torch.cuda.FloatTensor), requires_grad=False).cuda()
+                    torch.cuda.FloatTensor)).cuda()
                 mel_spec_var = Variable(torch.from_numpy(mel_input).type(
-                    torch.cuda.FloatTensor), requires_grad=False).cuda()
+                    torch.cuda.FloatTensor)).cuda()
                 linear_spec_var = Variable(torch.from_numpy(magnitude_input)
-                                           .type(torch.cuda.FloatTensor), requires_grad=False).cuda()
+                                           .type(torch.cuda.FloatTensor)).cuda()
             else:
                 text_input_var = Variable(torch.from_numpy(text_input).type(
-                    torch.LongTensor), requires_grad=False)
+                    torch.LongTensor))
                 mel_input_var = Variable(torch.from_numpy(mel_input).type(
-                    torch.FloatTensor), requires_grad=False)
+                    torch.FloatTensor))
                 mel_spec_var = Variable(torch.from_numpy(
-                    mel_input).type(torch.FloatTensor), requires_grad=False)
+                    mel_input).type(torch.FloatTensor))
                 linear_spec_var = Variable(torch.from_numpy(
-                    magnitude_input).type(torch.FloatTensor),
-                    requires_grad=False)
+                    magnitude_input).type(torch.FloatTensor))
 
             mel_output, linear_output, alignments =\
                 model.forward(text_input_var, mel_input_var)
 
             mel_loss = criterion(mel_output, mel_spec_var)
-            linear_loss = torch.abs(linear_output - linear_spec_var)
-            linear_loss = 0.5 * \
-                torch.mean(linear_loss) + 0.5 * \
-                torch.mean(linear_loss[:, :n_priority_freq, :])
+            # linear_loss = torch.abs(linear_output - linear_spec_var)
+            # linear_loss = 0.5 * \
+            #     torch.mean(linear_loss) + 0.5 * \
+            #     torch.mean(linear_loss[:, :n_priority_freq, :])
+            linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
+                + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
+                                  linear_spec_var[:, :, :n_priority_freq])
             loss = mel_loss + linear_loss
-            loss = loss.cuda()
-
-            start_time = time.time()
+            # loss = loss.cuda()
             loss.backward()
-            nn.utils.clip_grad_norm(model.parameters(), 1.)
+            grad_norm = nn.utils.clip_grad_norm(model.parameters(), 1.)
             optimizer.step()
 
             step_time = time.time() - start_time
@@ -171,7 +171,8 @@ def main(args):
             progbar.update(i+1, values=[('total_loss', loss.data[0]),
                                         ('linear_loss', linear_loss.data[0]),
-                                        ('mel_loss', mel_loss.data[0])])
+                                        ('mel_loss', mel_loss.data[0]),
+                                        ('grad_norm', grad_norm)])
 
             tb.add_scalar('Train/TotalLoss', loss.data[0], current_step)
             tb.add_scalar('Train/LinearLoss', linear_loss.data[0],
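
The reworked linear loss doubles the weight on low-frequency bins: it averages a full-band L1 term with an L1 term over the first n_priority_freq bins, and it now slices the frequency axis (the last dimension) instead of the time axis sliced by the old code. A standalone sketch, assuming criterion is nn.L1Loss(), outputs shaped (batch, frames, num_freq), and keithito's derivation of n_priority_freq (bins below 3 kHz):

# Standalone sketch of the priority-frequency loss; shapes and the
# n_priority_freq derivation are assumptions based on keithito's Tacotron.
import torch
from torch import nn

criterion = nn.L1Loss()
batch, frames, num_freq = 2, 100, 1025
n_priority_freq = int(3000 / (20000 * 0.5) * num_freq)  # 307 bins, < 3 kHz

linear_output = torch.rand(batch, frames, num_freq)
linear_spec_var = torch.rand(batch, frames, num_freq)

linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
    + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                      linear_spec_var[:, :, :n_priority_freq])
print(linear_loss.item())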


@@ -81,10 +81,10 @@ class AudioProcessor(object):
     def inv_spectrogram(self, spectrogram):
         '''Converts spectrogram to waveform using librosa'''
-        S = _denormalize(spectrogram)
-        S = _db_to_amp(S + self.ref_level_db)  # Convert back to linear
+        S = self._denormalize(spectrogram)
+        S = self._db_to_amp(S + self.ref_level_db)  # Convert back to linear
         # Reconstruct phase
-        return inv_preemphasis(_griffin_lim(S ** self.power))
+        return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
 
     def _griffin_lim(self, S):
@@ -93,18 +93,13 @@
         '''
         angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
         S_complex = np.abs(S).astype(np.complex)
-        y = _istft(S_complex * angles)
+        y = self._istft(S_complex * angles)
         for i in range(self.griffin_lim_iters):
-            angles = np.exp(1j * np.angle(_stft(y)))
-            y = _istft(S_complex * angles)
+            angles = np.exp(1j * np.angle(self._stft(y)))
+            y = self._istft(S_complex * angles)
         return y
 
-    def _istft(self, y):
-        _, hop_length, win_length = _stft_parameters()
-        return librosa.istft(y, hop_length=hop_length, win_length=win_length)
-
     def melspectrogram(self, y):
         D = self._stft(self.apply_preemphasis(y))
         S = self._amp_to_db(self._linear_to_mel(np.abs(D)))
@@ -115,11 +110,15 @@
         n_fft, hop_length, win_length = self._stft_parameters()
         return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
 
+    def _istft(self, y):
+        _, hop_length, win_length = self._stft_parameters()
+        return librosa.istft(y, hop_length=hop_length, win_length=win_length)
+
     def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
         window_length = int(self.sample_rate * min_silence_sec)
         hop_length = int(window_length / 4)
-        threshold = _db_to_amp(threshold_db)
+        threshold = self._db_to_amp(threshold_db)
         for x in range(hop_length, len(wav) - window_length, hop_length):
             if np.max(wav[x:x + window_length]) < threshold:
                 return x + hop_length
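
These changes route the helper calls through self: _denormalize, _db_to_amp, _stft, _istft, and the inverse preemphasis were module-level functions in the original keithito implementation, so calling them bare inside this class raises NameError. For reference, a self-contained sketch of the Griffin-Lim loop that _griffin_lim implements, using plain librosa calls; the STFT parameters are assumptions derived from config.json:

# Self-contained Griffin-Lim sketch mirroring _griffin_lim above; the
# parameters (n_fft=2048, hop_length=250, win_length=1000) are assumptions
# derived from the config values.
import numpy as np
import librosa

n_fft, hop_length, win_length = 2048, 250, 1000

def griffin_lim(S, n_iters=60):
    """Iteratively estimate phase for a magnitude spectrogram S of
    shape (1 + n_fft // 2, frames)."""
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
    for _ in range(n_iters):
        y = librosa.istft(S * angles, hop_length=hop_length,
                          win_length=win_length)
        angles = np.exp(1j * np.angle(
            librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                         win_length=win_length)))
    return librosa.istft(S * angles, hop_length=hop_length,
                         win_length=win_length)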


@@ -3,7 +3,9 @@ import numpy as np
 def pad_data(x, length):
     _pad = 0
-    return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)
+    return np.pad(x, (0, length - x.shape[0]),
+                  mode='constant',
+                  constant_values=_pad)
 
 def prepare_data(inputs):
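
pad_data right-pads an array with zeros up to the target length; a quick check with the function above in scope (assuming a 1-D input, as the single (before, after) width pair implies):

# Quick check of pad_data with a 1-D input.
import numpy as np
print(pad_data(np.array([1, 2, 3]), 5))  # [1 2 3 0 0]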