зеркало из https://github.com/mozilla/TTS.git
Fix a bug in the attention module and add a new notebook for experimenting with spectrogram reconstruction
This commit is contained in:
Родитель
9a22f5d085
Коммит
1320d5344a
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
10
config.json
10
config.json
|
@ -1,8 +1,8 @@
|
|||
{
|
||||
"num_mels": 80,
|
||||
"num_freq": 1024,
|
||||
"num_freq": 1025,
|
||||
"sample_rate": 20000,
|
||||
"frame_length_ms": 50.0,
|
||||
"frame_length_ms": 50,
|
||||
"frame_shift_ms": 12.5,
|
||||
"preemphasis": 0.97,
|
||||
"min_level_db": -100,
|
||||
|
@ -12,11 +12,11 @@
|
|||
"text_cleaner": "english_cleaners",
|
||||
|
||||
"epochs": 2000,
|
||||
"lr": 0.001,
|
||||
"lr_patience": 2,
|
||||
"lr": 0.003,
|
||||
"lr_patience": 5,
|
||||
"lr_decay": 0.5,
|
||||
"batch_size": 256,
|
||||
"griffinf_lim_iters": 60,
|
||||
"griffin_lim_iters": 60,
|
||||
"power": 1.5,
|
||||
"r": 5,
|
||||
|
||||
|
|
Двоичный файл не отображается.
Двоичные данные
datasets/.LJSpeech.py.swp
Двоичные данные
datasets/.LJSpeech.py.swp
Двоичный файл не отображается.
|
@ -20,6 +20,8 @@
|
|||
"power": 1.5,
|
||||
"r": 5,
|
||||
|
||||
"num_loader_workers": 16,
|
||||
|
||||
"save_step": 1,
|
||||
"data_path": "/data/shared/KeithIto/LJSpeech-1.0",
|
||||
"output_path": "result",
|
||||
|
|
Двоичные данные
layers/.attention.py.swp
Двоичные данные
layers/.attention.py.swp
Двоичный файл не отображается.
Двоичные данные
layers/.tacotron.py.swp
Двоичные данные
layers/.tacotron.py.swp
Двоичный файл не отображается.
|
@ -73,7 +73,8 @@ class AttentionWrapper(nn.Module):
|
|||
alignment.data.masked_fill_(mask, self.score_mask_value)
|
||||
|
||||
# Normalize attention weight
|
||||
alignment = F.softmax(alignment, dim=0)
|
||||
alignment = F.softmax(alignment, dim=-1) ## TODO: might be buggy
|
||||
print(alignment.size())
|
||||
|
||||
# Attention context vector
|
||||
# (batch, 1, dim)
|
||||
|
|
Двоичные данные
models/.tacotron.py.swp
Двоичные данные
models/.tacotron.py.swp
Двоичный файл не отображается.
|
@ -2,7 +2,7 @@
|
|||
import torch
|
||||
from torch.autograd import Variable
|
||||
from torch import nn
|
||||
from utils.text.symbols import symbols
|
||||
from TTS.utils.text.symbols import symbols
|
||||
from TTS.layers.tacotron import Prenet, Encoder, Decoder, CBHG
|
||||
|
||||
class Tacotron(nn.Module):
|
||||
|
|
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
|
@ -0,0 +1,51 @@
|
|||
import io
|
||||
import librosa
|
||||
import torch
|
||||
import numpy as np
|
||||
from TTS.utils.text import text_to_sequence
|
||||
from matplotlib import pylab as plt
|
||||
|
||||
# Hop size handed to librosa.display.specshow() in visualize() so the time
# axis of the plotted spectrogram is scaled correctly.
# NOTE(review): 250 presumably equals frame_shift_ms (12.5) * sample_rate
# (20000) / 1000 from config.json - confirm and keep in sync with the config.
hop_length = 250
|
||||
|
||||
def create_speech(m, s, CONFIG, use_cuda, ap):
    """Run the model on one sentence and reconstruct a waveform from it.

    Args:
        m: model object; its ``forward`` is called with the character
            sequence and must return ``(mel_out, linear_out, alignments)``.
        s: input text to synthesize.
        CONFIG: configuration object; only ``CONFIG.text_cleaner`` is read.
        use_cuda: when truthy, the input variable is moved to the GPU.
        ap: audio processor providing ``_denormalize``, ``inv_spectrogram``,
            ``find_endpoint`` and ``save_wav``.

    Returns:
        Tuple ``(wav, alignment, spec)``: the waveform trimmed at the
        endpoint reported by ``ap.find_endpoint``, the first alignment of
        the batch as a numpy array, and the denormalized linear output.
    """
    cleaner_names = [CONFIG.text_cleaner]
    char_ids = np.array(text_to_sequence(s, cleaner_names))

    # Build the input once; unsqueeze(0) adds a leading dimension
    # (presumably the batch axis expected by the model - TODO confirm).
    chars_var = torch.autograd.Variable(
        torch.from_numpy(char_ids), volatile=True).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()

    _mel_out, linear_batch, alignment_batch = m.forward(chars_var)

    # Keep only the first item of each output and bring it back to numpy.
    linear_np = linear_batch[0].data.cpu().numpy()
    alignment = alignment_batch[0].cpu().data.numpy()

    spec = ap._denormalize(linear_np)
    wav = ap.inv_spectrogram(linear_np.T)
    endpoint = ap.find_endpoint(wav)
    wav = wav[:endpoint]

    # The waveform is also written to an in-memory buffer, which is not
    # returned to the caller.
    out = io.BytesIO()
    ap.save_wav(wav, out)
    return wav, alignment, spec
|
||||
|
||||
|
||||
def visualize(alignment, spectrogram, CONFIG):
    """Plot an attention alignment and a spectrogram in one figure.

    The figure has two stacked panels: the transposed alignment matrix on
    top and the transposed spectrogram (linear frequency axis) below, each
    with its own colorbar.

    Args:
        alignment: 2-D array; plotted transposed, with the x axis labelled
            as decoder timestep and the y axis as encoder timestep.
        spectrogram: 2-D array; plotted transposed with
            ``librosa.display.specshow``.
        CONFIG: configuration object; only ``CONFIG.sample_rate`` is read.
    """
    # `import librosa` alone does not reliably expose the `display`
    # submodule; librosa's documentation asks for an explicit import, so do
    # it here to avoid an AttributeError on `librosa.display`.
    import librosa.display

    label_fontsize = 16
    plt.figure(figsize=(16, 16))

    # Top panel: attention alignment.
    plt.subplot(2, 1, 1)
    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
    plt.colorbar()

    # Bottom panel: spectrogram; the module-level hop_length sets the scale
    # of the time axis.
    plt.subplot(2, 1, 2)
    librosa.display.specshow(spectrogram.T, sr=CONFIG.sample_rate,
                             hop_length=hop_length, x_axis="time",
                             y_axis="linear")
    plt.xlabel("Time", fontsize=label_fontsize)
    plt.ylabel("Hz", fontsize=label_fontsize)
    plt.tight_layout()
    plt.colorbar()
|
||||
|
16
synthesis.py
16
synthesis.py
|
@ -38,17 +38,11 @@ def main(args):
|
|||
|
||||
# Sentences for generation
|
||||
sentences = [
|
||||
"And it is worth mention in passing that, as an example of fine typography,",
|
||||
# From July 8, 2017 New York Times:
|
||||
'Scientists at the CERN laboratory say they have discovered a new particle.',
|
||||
'There’s a way to measure the acute emotional intelligence that has never gone out of style.',
|
||||
'President Trump met with other leaders at the Group of 20 conference.',
|
||||
'The Senate\'s bill to repeal and replace the Affordable Care Act is now imperiled.',
|
||||
# From Google's Tacotron example page:
|
||||
'Generative adversarial network or variational auto-encoder.',
|
||||
'The buses aren\'t the problem, they actually provide a solution.',
|
||||
'Does the quick brown fox jump over the lazy dog?',
|
||||
'Talib Kweli confirmed to AllHipHop that he will be releasing an album in the next year.',
|
||||
"I try my best to translate text to speech. But I know I need more work",
|
||||
"The new Firefox, Fast for good.",
|
||||
"Technology is continually providing us with new ways to create and publish stories.",
|
||||
"For these stories to achieve their full impact, it requires tool.",
|
||||
"I am allien and I am here to destron your world."
|
||||
]
|
||||
|
||||
# Synthesis and save to wav files
|
||||
|
|
41
train.py
41
train.py
|
@ -111,6 +111,8 @@ def main(args):
|
|||
progbar = Progbar(len(dataset) / c.batch_size)
|
||||
|
||||
for i, data in enumerate(dataloader):
|
||||
start_time = time.time()
|
||||
|
||||
text_input = data[0]
|
||||
magnitude_input = data[1]
|
||||
mel_input = data[2]
|
||||
|
@ -128,42 +130,40 @@ def main(args):
|
|||
|
||||
if use_cuda:
|
||||
text_input_var = Variable(torch.from_numpy(text_input).type(
|
||||
torch.cuda.LongTensor), requires_grad=False).cuda()
|
||||
torch.cuda.LongTensor)).cuda()
|
||||
mel_input_var = Variable(torch.from_numpy(mel_input).type(
|
||||
torch.cuda.FloatTensor), requires_grad=False).cuda()
|
||||
torch.cuda.FloatTensor)).cuda()
|
||||
mel_spec_var = Variable(torch.from_numpy(mel_input).type(
|
||||
torch.cuda.FloatTensor), requires_grad=False).cuda()
|
||||
torch.cuda.FloatTensor)).cuda()
|
||||
linear_spec_var = Variable(torch.from_numpy(magnitude_input)
|
||||
.type(torch.cuda.FloatTensor), requires_grad=False).cuda()
|
||||
.type(torch.cuda.FloatTensor)).cuda()
|
||||
|
||||
else:
|
||||
text_input_var = Variable(torch.from_numpy(text_input).type(
|
||||
torch.LongTensor), requires_grad=False)
|
||||
torch.LongTensor),)
|
||||
mel_input_var = Variable(torch.from_numpy(mel_input).type(
|
||||
torch.FloatTensor), requires_grad=False)
|
||||
torch.FloatTensor))
|
||||
mel_spec_var = Variable(torch.from_numpy(
|
||||
mel_input).type(torch.FloatTensor), requires_grad=False)
|
||||
mel_input).type(torch.FloatTensor))
|
||||
linear_spec_var = Variable(torch.from_numpy(
|
||||
magnitude_input).type(torch.FloatTensor),
|
||||
requires_grad=False)
|
||||
magnitude_input).type(torch.FloatTensor))
|
||||
|
||||
mel_output, linear_output, alignments =\
|
||||
model.forward(text_input_var, mel_input_var)
|
||||
|
||||
mel_loss = criterion(mel_output, mel_spec_var)
|
||||
linear_loss = torch.abs(linear_output - linear_spec_var)
|
||||
linear_loss = 0.5 * \
|
||||
torch.mean(linear_loss) + 0.5 * \
|
||||
torch.mean(linear_loss[:, :n_priority_freq, :])
|
||||
#linear_loss = torch.abs(linear_output - linear_spec_var)
|
||||
#linear_loss = 0.5 * \
|
||||
#torch.mean(linear_loss) + 0.5 * \
|
||||
#torch.mean(linear_loss[:, :n_priority_freq, :])
|
||||
linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
|
||||
+ 0.5 * criterion(linear_output[:, :, :n_priority_freq],
|
||||
linear_spec_var[: ,: ,:n_priority_freq])
|
||||
loss = mel_loss + linear_loss
|
||||
loss = loss.cuda()
|
||||
|
||||
start_time = time.time()
|
||||
# loss = loss.cuda()
|
||||
|
||||
loss.backward()
|
||||
|
||||
nn.utils.clip_grad_norm(model.parameters(), 1.)
|
||||
|
||||
grad_norm = nn.utils.clip_grad_norm(model.parameters(), 1.)
|
||||
optimizer.step()
|
||||
|
||||
step_time = time.time() - start_time
|
||||
|
@ -171,7 +171,8 @@ def main(args):
|
|||
|
||||
progbar.update(i+1, values=[('total_loss', loss.data[0]),
|
||||
('linear_loss', linear_loss.data[0]),
|
||||
('mel_loss', mel_loss.data[0])])
|
||||
('mel_loss', mel_loss.data[0]),
|
||||
('grad_norm', grad_norm)])
|
||||
|
||||
tb.add_scalar('Train/TotalLoss', loss.data[0], current_step)
|
||||
tb.add_scalar('Train/LinearLoss', linear_loss.data[0],
|
||||
|
|
|
@ -81,10 +81,10 @@ class AudioProcessor(object):
|
|||
|
||||
def inv_spectrogram(self, spectrogram):
|
||||
'''Converts spectrogram to waveform using librosa'''
|
||||
S = _denormalize(spectrogram)
|
||||
S = _db_to_amp(S + self.ref_level_db) # Convert back to linear
|
||||
S = self._denormalize(spectrogram)
|
||||
S = self._db_to_amp(S + self.ref_level_db) # Convert back to linear
|
||||
# Reconstruct phase
|
||||
return inv_preemphasis(_griffin_lim(S ** self.power))
|
||||
return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
|
||||
|
||||
|
||||
def _griffin_lim(self, S):
|
||||
|
@ -93,18 +93,13 @@ class AudioProcessor(object):
|
|||
'''
|
||||
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
|
||||
S_complex = np.abs(S).astype(np.complex)
|
||||
y = _istft(S_complex * angles)
|
||||
y = self._istft(S_complex * angles)
|
||||
for i in range(self.griffin_lim_iters):
|
||||
angles = np.exp(1j * np.angle(_stft(y)))
|
||||
y = _istft(S_complex * angles)
|
||||
angles = np.exp(1j * np.angle(self._stft(y)))
|
||||
y = self._istft(S_complex * angles)
|
||||
return y
|
||||
|
||||
|
||||
def _istft(self, y):
|
||||
_, hop_length, win_length = _stft_parameters()
|
||||
return librosa.istft(y, hop_length=hop_length, win_length=win_length)
|
||||
|
||||
|
||||
def melspectrogram(self, y):
|
||||
D = self._stft(self.apply_preemphasis(y))
|
||||
S = self._amp_to_db(self._linear_to_mel(np.abs(D)))
|
||||
|
@ -115,11 +110,15 @@ class AudioProcessor(object):
|
|||
n_fft, hop_length, win_length = self._stft_parameters()
|
||||
return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
|
||||
|
||||
def _istft(self, y):
|
||||
_, hop_length, win_length = self._stft_parameters()
|
||||
return librosa.istft(y, hop_length=hop_length, win_length=win_length)
|
||||
|
||||
|
||||
def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
|
||||
window_length = int(self.sample_rate * min_silence_sec)
|
||||
hop_length = int(window_length / 4)
|
||||
threshold = _db_to_amp(threshold_db)
|
||||
threshold = self._db_to_amp(threshold_db)
|
||||
for x in range(hop_length, len(wav) - window_length, hop_length):
|
||||
if np.max(wav[x:x + window_length]) < threshold:
|
||||
return x + hop_length
|
||||
|
|
|
@ -3,7 +3,9 @@ import numpy as np
|
|||
|
||||
def pad_data(x, length):
|
||||
_pad = 0
|
||||
return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)
|
||||
return np.pad(x, (0, length - x.shape[0]),
|
||||
mode='constant',
|
||||
constant_values=_pad)
|
||||
|
||||
|
||||
def prepare_data(inputs):
|
||||
|
|
Загрузка…
Ссылка в новой задаче