This commit is contained in:
erogol 2020-07-16 18:54:41 +02:00
Parent 82dd465365
Commit 9033070b8f
117 changed files with 13109 additions and 8 deletions

18
.github/PR_TEMPLATE.md vendored Normal file

@@ -0,0 +1,18 @@
---
name: 'Contribution Guideline'
about: Refer to the Contribution Guideline
title: ''
labels: ''
assignees: ''
---
### Contribution Guideline
Please send your PRs to the `dev` branch unless they are directly related to a specific branch.
Before making a Pull Request, check your changes for basic mistakes and style problems by using a linter.
We have a cardboardlinter setup in this repository, so, for example, if you've made some changes and would like to run the linter on just the changed code, you can use the following command:
```bash
pip install pylint cardboardlint
cardboardlinter --refspec master
```


@@ -6,6 +6,8 @@ git:
before_install:
- sudo apt-get update
- sudo apt-get -y install espeak
- python -m pip install --upgrade pip
- pip install six==1.12.0
matrix:
include:
@@ -15,11 +17,13 @@ matrix:
env: TEST_SUITE="lint"
- name: "Unit tests"
python: "3.6"
install: pip install --quiet -r requirements_tests.txt
install:
- python setup.py install
env: TEST_SUITE="unittest"
- name: "Unit tests"
python: "3.6"
install: pip install --quiet -r requirements_tests.txt
install:
- python setup.py install
env: TEST_SUITE="testscripts"
script: ./.travis/script


@@ -11,12 +11,7 @@ fi
if [[ "$TEST_SUITE" == "unittest" ]]; then
# Run tests on all pushes
pushd tts_namespace
nosetests TTS.speaker_encoder.tests --nocapture
nosetests TTS.vocoder.tests --nocapture
nosetests TTS.tts.tests --nocapture
nosetests TTS.tts.tf.tests --nocapture
popd
nosetests tests --nocapture
fi
if [[ "$TEST_SUITE" == "testscripts" ]]; then

0
TTS/__init__.py Normal file

85
TTS/bin/compute_statistics.py Executable file

@@ -0,0 +1,85 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import argparse
import numpy as np
from tqdm import tqdm
from TTS.tts.datasets.preprocess import load_meta_data
from TTS.tts.utils.io import load_config
from TTS.tts.utils.audio import AudioProcessor
def main():
"""Run preprocessing process."""
parser = argparse.ArgumentParser(
description="Compute mean and variance of spectrogtram features.")
parser.add_argument("--config_path", type=str, required=True,
help="TTS config file path to define audio processin parameters.")
parser.add_argument("--out_path", default=None, type=str,
help="directory to save the output file.")
args = parser.parse_args()
# load config
CONFIG = load_config(args.config_path)
CONFIG.audio['signal_norm'] = False # do not apply earlier normalization
CONFIG.audio['stats_path'] = None # discard pre-defined stats
# load audio processor
ap = AudioProcessor(**CONFIG.audio)
# load the meta data of target dataset
dataset_items = load_meta_data(CONFIG.datasets)[0] # take only train data
print(f" > There are {len(dataset_items)} files.")
mel_sum = 0
mel_square_sum = 0
linear_sum = 0
linear_square_sum = 0
N = 0
for item in tqdm(dataset_items):
# compute features
wav = ap.load_wav(item[1])
linear = ap.spectrogram(wav)
mel = ap.melspectrogram(wav)
# compute stats
N += mel.shape[1]
mel_sum += mel.sum(1)
linear_sum += linear.sum(1)
mel_square_sum += (mel ** 2).sum(axis=1)
linear_square_sum += (linear ** 2).sum(axis=1)
mel_mean = mel_sum / N
mel_scale = np.sqrt(mel_square_sum / N - mel_mean ** 2)
linear_mean = linear_sum / N
linear_scale = np.sqrt(linear_square_sum / N - linear_mean ** 2)
output_file_path = os.path.join(args.out_path, "scale_stats.npy")
stats = {}
stats['mel_mean'] = mel_mean
stats['mel_std'] = mel_scale
stats['linear_mean'] = linear_mean
stats['linear_std'] = linear_scale
print(f' > Avg mel spec mean: {mel_mean.mean()}')
print(f' > Avg mel spec scale: {mel_scale.mean()}')
print(f' > Avg linear spec mean: {linear_mean.mean()}')
print(f' > Avg linear spec scale: {linear_scale.mean()}')
# set default config values for mean-var scaling
CONFIG.audio['stats_path'] = output_file_path
CONFIG.audio['signal_norm'] = True
# remove redundant values
del CONFIG.audio['max_norm']
del CONFIG.audio['min_level_db']
del CONFIG.audio['symmetric_norm']
del CONFIG.audio['clip_norm']
stats['audio_config'] = CONFIG.audio
np.save(output_file_path, stats, allow_pickle=True)
print(f' > scale_stats.npy is saved to {output_file_path}')
if __name__ == "__main__":
main()
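The statistics above are accumulated as running sums, so `scale_stats.npy` ends up holding a plain Python dict saved via `np.save(..., allow_pickle=True)`. A minimal sketch of reading it back; the file name comes from the script, while its directory is whatever was passed as `--out_path`:

```python
# Minimal sketch: load the stats dict written by compute_statistics.py above.
# "scale_stats.npy" is the file name used by the script; its location depends on --out_path.
import numpy as np

stats = np.load("scale_stats.npy", allow_pickle=True).item()
print(stats["mel_mean"].shape, stats["mel_std"].shape)
print(stats["audio_config"]["stats_path"])  # the audio config snapshot stored alongside the stats
```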


@@ -0,0 +1,33 @@
# Convert a TensorFlow MelGAN vocoder model to a TF-Lite binary
import argparse
from TTS.tts.utils.io import load_config
from TTS.vocoder.tf.utils.generic_utils import setup_generator
from TTS.vocoder.tf.utils.io import load_checkpoint
from TTS.vocoder.tf.utils.tflite import convert_melgan_to_tflite
parser = argparse.ArgumentParser()
parser.add_argument('--tf_model',
type=str,
help='Path to the TensorFlow model checkpoint to be converted to TF-Lite.')
parser.add_argument('--config_path',
type=str,
help='Path to config file of torch model.')
parser.add_argument('--output_path',
type=str,
help='path to tflite output binary.')
args = parser.parse_args()
# Set constants
CONFIG = load_config(args.config_path)
# load the model
model = setup_generator(CONFIG)
model.build_inference()
model = load_checkpoint(model, args.tf_model)
# create tflite model
tflite_model = convert_melgan_to_tflite(model, output_path=args.output_path)
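A hedged sketch of inspecting the exported binary with the standard TF-Lite interpreter; the file name below is hypothetical and stands in for whatever was passed as `--output_path`:

```python
# Hedged sketch: open the exported TF-Lite binary with the stock interpreter.
# "melgan.tflite" is a placeholder for the --output_path given above.
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path="melgan.tflite")
interpreter.allocate_tensors()
print(interpreter.get_input_details())
print(interpreter.get_output_details())
```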


@@ -0,0 +1,117 @@
import argparse
import os
import numpy as np
import tensorflow as tf
import torch
from fuzzywuzzy import fuzz
from TTS.tts.utils.io import load_config
from TTS.vocoder.tf.utils.convert_torch_to_tf_utils import (
compare_torch_tf, convert_tf_name, transfer_weights_torch_to_tf)
from TTS.vocoder.tf.utils.generic_utils import \
setup_generator as setup_tf_generator
from TTS.vocoder.tf.utils.io import save_checkpoint
from TTS.vocoder.utils.generic_utils import setup_generator
# prevent GPU use
os.environ['CUDA_VISIBLE_DEVICES'] = ''
# define args
parser = argparse.ArgumentParser()
parser.add_argument('--torch_model_path',
type=str,
help='Path to target torch model to be converted to TF.')
parser.add_argument('--config_path',
type=str,
help='Path to config file of torch model.')
parser.add_argument(
'--output_path',
type=str,
help='path to output file including file name to save TF model.')
args = parser.parse_args()
# load model config
config_path = args.config_path
c = load_config(config_path)
num_speakers = 0
# init torch model
model = setup_generator(c)
checkpoint = torch.load(args.torch_model_path,
map_location=torch.device('cpu'))
state_dict = checkpoint['model']
model.load_state_dict(state_dict)
model.remove_weight_norm()
state_dict = model.state_dict()
# init tf model
model_tf = setup_tf_generator(c)
common_sufix = '/.ATTRIBUTES/VARIABLE_VALUE'
# get tf_model graph by passing an input
# B x D x T
dummy_input = tf.random.uniform((7, 80, 64), dtype=tf.float32)
mel_pred = model_tf(dummy_input, training=False)
# get tf variables
tf_vars = model_tf.weights
# match variable names with fuzzy logic
torch_var_names = list(state_dict.keys())
tf_var_names = [we.name for we in model_tf.weights]
var_map = []
for tf_name in tf_var_names:
# skip re-mapped layer names
if tf_name in [name[0] for name in var_map]:
continue
tf_name_edited = convert_tf_name(tf_name)
ratios = [
fuzz.ratio(torch_name, tf_name_edited)
for torch_name in torch_var_names
]
max_idx = np.argmax(ratios)
matching_name = torch_var_names[max_idx]
del torch_var_names[max_idx]
var_map.append((tf_name, matching_name))
# pass weights
tf_vars = transfer_weights_torch_to_tf(tf_vars, dict(var_map), state_dict)
# Compare TF and TORCH models
# check embedding outputs
model.eval()
dummy_input_torch = torch.ones((1, 80, 10))
dummy_input_tf = tf.convert_to_tensor(dummy_input_torch.numpy())
dummy_input_tf = tf.transpose(dummy_input_tf, perm=[0, 2, 1])
dummy_input_tf = tf.expand_dims(dummy_input_tf, 2)
out_torch = model.layers[0](dummy_input_torch)
out_tf = model_tf.model_layers[0](dummy_input_tf)
out_tf_ = tf.transpose(out_tf, perm=[0, 3, 2, 1])[:, :, 0, :]
assert compare_torch_tf(out_torch, out_tf_) < 1e-5
for i in range(1, len(model.layers)):
print(f"{i} -> {model.layers[i]} vs {model_tf.model_layers[i]}")
out_torch = model.layers[i](out_torch)
out_tf = model_tf.model_layers[i](out_tf)
out_tf_ = tf.transpose(out_tf, perm=[0, 3, 2, 1])[:, :, 0, :]
diff = compare_torch_tf(out_torch, out_tf_)
assert diff < 1e-5, diff
torch.manual_seed(0)
dummy_input_torch = torch.rand((1, 80, 100))
dummy_input_tf = tf.convert_to_tensor(dummy_input_torch.numpy())
model.inference_padding = 0
model_tf.inference_padding = 0
output_torch = model.inference(dummy_input_torch)
output_tf = model_tf(dummy_input_tf, training=False)
assert compare_torch_tf(output_torch, output_tf) < 1e-5, compare_torch_tf(
output_torch, output_tf)
# save tf model
save_checkpoint(model_tf, checkpoint['step'], checkpoint['epoch'],
args.output_path)
print(' > Model conversion completed successfully :).')
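The weight transfer above pairs PyTorch and TensorFlow variables by string similarity with `fuzz.ratio`. A toy illustration of that heuristic; the variable names and the simple normalisation below are invented, the script itself relies on its `convert_tf_name` helper and removes each match from the pool greedily:

```python
# Toy illustration of the fuzzy name matching used above. Names and the normalisation
# are made up; the converter uses convert_tf_name() and deletes matched names greedily.
import numpy as np
from fuzzywuzzy import fuzz

torch_names = ["layers.0.conv1.weight", "layers.2.conv3.bias"]
tf_name = "layers_0/conv1/kernel:0"
normalised = tf_name.replace("/", ".").replace("_", ".").replace(":0", "")
ratios = [fuzz.ratio(torch_name, normalised) for torch_name in torch_names]
print(torch_names[int(np.argmax(ratios))])  # -> layers.0.conv1.weight
```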


@@ -0,0 +1,37 @@
# Convert Tensorflow Tacotron2 model to TF-Lite binary
import argparse
from TTS.tts.utils.io import load_config
from TTS.tts.utils.text.symbols import symbols, phonemes
from TTS.tf.utils.generic_utils import setup_model
from TTS.tf.utils.io import load_checkpoint
from TTS.tf.utils.tflite import convert_tacotron2_to_tflite
parser = argparse.ArgumentParser()
parser.add_argument('--tf_model',
type=str,
help='Path to the TensorFlow model checkpoint to be converted to TF-Lite.')
parser.add_argument('--config_path',
type=str,
help='Path to config file of torch model.')
parser.add_argument('--output_path',
type=str,
help='path to tflite output binary.')
args = parser.parse_args()
# Set constants
CONFIG = load_config(args.config_path)
# load the model
c = CONFIG
num_speakers = 0
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
model = setup_model(num_chars, num_speakers, c, enable_tflite=True)
model.build_inference()
model = load_checkpoint(model, args.tf_model)
model.decoder.set_max_decoder_steps(1000)
# create tflite model
tflite_model = convert_tacotron2_to_tflite(model, output_path=args.output_path)


@@ -0,0 +1,210 @@
# %%
import sys
sys.path.append('/home/erogol/Projects')
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''
# %%
import argparse
import numpy as np
import torch
import tensorflow as tf
from fuzzywuzzy import fuzz
from TTS.tts.utils.text.symbols import phonemes, symbols
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.io import load_config
from TTS.tf.models.tacotron2 import Tacotron2
from TTS.tf.utils.convert_torch_to_tf_utils import compare_torch_tf, tf_create_dummy_inputs, transfer_weights_torch_to_tf, convert_tf_name
from TTS.tf.utils.generic_utils import save_checkpoint
parser = argparse.ArgumentParser()
parser.add_argument('--torch_model_path',
type=str,
help='Path to target torch model to be converted to TF.')
parser.add_argument('--config_path',
type=str,
help='Path to config file of torch model.')
parser.add_argument('--output_path',
type=str,
help='path to output file including file name to save TF model.')
args = parser.parse_args()
# load model config
config_path = args.config_path
c = load_config(config_path)
num_speakers = 0
# init torch model
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
model = setup_model(num_chars, num_speakers, c)
checkpoint = torch.load(args.torch_model_path,
map_location=torch.device('cpu'))
state_dict = checkpoint['model']
model.load_state_dict(state_dict)
# init tf model
model_tf = Tacotron2(num_chars=num_chars,
num_speakers=num_speakers,
r=model.decoder.r,
postnet_output_dim=c.audio['num_mels'],
decoder_output_dim=c.audio['num_mels'],
attn_type=c.attention_type,
attn_win=c.windowing,
attn_norm=c.attention_norm,
prenet_type=c.prenet_type,
prenet_dropout=c.prenet_dropout,
forward_attn=c.use_forward_attn,
trans_agent=c.transition_agent,
forward_attn_mask=c.forward_attn_mask,
location_attn=c.location_attn,
attn_K=c.attention_heads,
separate_stopnet=c.separate_stopnet,
bidirectional_decoder=c.bidirectional_decoder)
# set initial layer mapping - these are not captured by the below heuristic approach
# TODO: set layer names so that we can remove this manual matching
common_sufix = '/.ATTRIBUTES/VARIABLE_VALUE'
var_map = [
('embedding/embeddings:0', 'embedding.weight'),
('encoder/lstm/forward_lstm/lstm_cell_1/kernel:0',
'encoder.lstm.weight_ih_l0'),
('encoder/lstm/forward_lstm/lstm_cell_1/recurrent_kernel:0',
'encoder.lstm.weight_hh_l0'),
('encoder/lstm/backward_lstm/lstm_cell_2/kernel:0',
'encoder.lstm.weight_ih_l0_reverse'),
('encoder/lstm/backward_lstm/lstm_cell_2/recurrent_kernel:0',
'encoder.lstm.weight_hh_l0_reverse'),
('encoder/lstm/forward_lstm/lstm_cell_1/bias:0',
('encoder.lstm.bias_ih_l0', 'encoder.lstm.bias_hh_l0')),
('encoder/lstm/backward_lstm/lstm_cell_2/bias:0',
('encoder.lstm.bias_ih_l0_reverse', 'encoder.lstm.bias_hh_l0_reverse')),
('attention/v/kernel:0', 'decoder.attention.v.linear_layer.weight'),
('decoder/linear_projection/kernel:0',
'decoder.linear_projection.linear_layer.weight'),
('decoder/stopnet/kernel:0', 'decoder.stopnet.1.linear_layer.weight')
]
# %%
# get tf_model graph
mel_pred = model_tf.build_inference()
# get tf variables
tf_vars = model_tf.weights
# match variable names with fuzzy logic
torch_var_names = list(state_dict.keys())
tf_var_names = [we.name for we in model_tf.weights]
for tf_name in tf_var_names:
# skip re-mapped layer names
if tf_name in [name[0] for name in var_map]:
continue
tf_name_edited = convert_tf_name(tf_name)
ratios = [
fuzz.ratio(torch_name, tf_name_edited)
for torch_name in torch_var_names
]
max_idx = np.argmax(ratios)
matching_name = torch_var_names[max_idx]
del torch_var_names[max_idx]
var_map.append((tf_name, matching_name))
# %%
# print variable match
from pprint import pprint
pprint(var_map)
pprint(torch_var_names)
# pass weights
tf_vars = transfer_weights_torch_to_tf(tf_vars, dict(var_map), state_dict)
# Compare TF and TORCH models
# %%
# check embedding outputs
model.eval()
input_ids = torch.randint(0, 24, (1, 128)).long()
o_t = model.embedding(input_ids)
o_tf = model_tf.embedding(input_ids.detach().numpy())
assert abs(o_t.detach().numpy() -
o_tf.numpy()).sum() < 1e-5, abs(o_t.detach().numpy() -
o_tf.numpy()).sum()
# compare encoder outputs
oo_en = model.encoder.inference(o_t.transpose(1, 2))
ooo_en = model_tf.encoder(o_t.detach().numpy(), training=False)
assert compare_torch_tf(oo_en, ooo_en) < 1e-5
#pylint: disable=redefined-builtin
# compare decoder.attention_rnn
inp = torch.rand([1, 768])
inp_tf = inp.numpy()
model.decoder._init_states(oo_en, mask=None) #pylint: disable=protected-access
output, cell_state = model.decoder.attention_rnn(inp)
states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)
output_tf, memory_state = model_tf.decoder.attention_rnn(inp_tf,
states[2],
training=False)
assert compare_torch_tf(output, output_tf).mean() < 1e-5
query = output
inputs = torch.rand([1, 128, 512])
query_tf = query.detach().numpy()
inputs_tf = inputs.numpy()
# compare decoder.attention
model.decoder.attention.init_states(inputs)
processes_inputs = model.decoder.attention.preprocess_inputs(inputs)
loc_attn, proc_query = model.decoder.attention.get_location_attention(
query, processes_inputs)
context = model.decoder.attention(query, inputs, processes_inputs, None)
attention_states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)[-1]
model_tf.decoder.attention.process_values(tf.convert_to_tensor(inputs_tf))
loc_attn_tf, proc_query_tf = model_tf.decoder.attention.get_loc_attn(query_tf, attention_states)
context_tf, attention, attention_states = model_tf.decoder.attention(query_tf, attention_states, training=False)
assert compare_torch_tf(loc_attn, loc_attn_tf).mean() < 1e-5
assert compare_torch_tf(proc_query, proc_query_tf).mean() < 1e-5
assert compare_torch_tf(context, context_tf) < 1e-5
# compare decoder.decoder_rnn
input = torch.rand([1, 1536])
input_tf = input.numpy()
model.decoder._init_states(oo_en, mask=None) #pylint: disable=protected-access
output, cell_state = model.decoder.decoder_rnn(
input, [model.decoder.decoder_hidden, model.decoder.decoder_cell])
states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)
output_tf, memory_state = model_tf.decoder.decoder_rnn(input_tf,
states[3],
training=False)
assert abs(input - input_tf).mean() < 1e-5
assert compare_torch_tf(output, output_tf).mean() < 1e-5
# compare decoder.linear_projection
input = torch.rand([1, 1536])
input_tf = input.numpy()
output = model.decoder.linear_projection(input)
output_tf = model_tf.decoder.linear_projection(input_tf, training=False)
assert compare_torch_tf(output, output_tf) < 1e-5
# compare decoder outputs
model.decoder.max_decoder_steps = 100
model_tf.decoder.set_max_decoder_steps(100)
output, align, stop = model.decoder.inference(oo_en)
states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)
output_tf, align_tf, stop_tf = model_tf.decoder(ooo_en, states, training=False)
assert compare_torch_tf(output.transpose(1, 2), output_tf) < 1e-4
# compare the whole model output
outputs_torch = model.inference(input_ids)
outputs_tf = model_tf(tf.convert_to_tensor(input_ids.numpy()))
print(abs(outputs_torch[0].numpy()[:, 0] - outputs_tf[0].numpy()[:, 0]).mean())
assert compare_torch_tf(outputs_torch[2][:, 50, :],
outputs_tf[2][:, 50, :]) < 1e-5
assert compare_torch_tf(outputs_torch[0], outputs_tf[0]) < 1e-4
# %%
# save tf model
save_checkpoint(model_tf, None, checkpoint['step'], checkpoint['epoch'],
checkpoint['r'], args.output_path)
print(' > Model conversion completed successfully :).')
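Both converter scripts lean on the repo's `compare_torch_tf` helper for the layer-by-layer checks. A minimal stand-in in the same spirit, written here as an assumption rather than the repo's actual implementation:

```python
# Hedged stand-in for compare_torch_tf(): maximum absolute element-wise difference
# between a torch tensor and a TF tensor. The real helper may differ in detail.
import numpy as np
import tensorflow as tf
import torch

def compare_torch_tf(torch_out, tf_out):
    return np.abs(torch_out.detach().numpy() - tf_out.numpy()).max()

a = torch.ones(2, 3)
b = tf.ones((2, 3))
assert compare_torch_tf(a, b) < 1e-5
```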

59
TTS/bin/distribute_tts.py Normal file

@@ -0,0 +1,59 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import os
import subprocess
import sys
import time
import torch
def main():
"""
Call train.py as a new process and pass command arguments
"""
parser = argparse.ArgumentParser()
parser.add_argument(
'--continue_path',
type=str,
help='Training output folder to continue training. Use it to continue a previous training run. If it is used, "config_path" is ignored.',
default='',
required='--config_path' not in sys.argv)
parser.add_argument(
'--restore_path',
type=str,
help='Model file to be restored. Use to finetune a model.',
default='')
parser.add_argument(
'--config_path',
type=str,
help='Path to config file for training.',
required='--continue_path' not in sys.argv
)
args = parser.parse_args()
# OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.run_name,
# True)
# stdout_path = os.path.join(OUT_PATH, "process_stdout/")
num_gpus = torch.cuda.device_count()
group_id = time.strftime("%Y_%m_%d-%H%M%S")
# set arguments for train.py
command = ['train.py']
command.append('--continue_path={}'.format(args.continue_path))
command.append('--restore_path={}'.format(args.restore_path))
command.append('--config_path={}'.format(args.config_path))
command.append('--group_id=group_{}'.format(group_id))
command.append('')
# run processes
processes = []
for i in range(num_gpus):
my_env = os.environ.copy()
my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
command[-1] = '--rank={}'.format(i)
stdout = None if i == 0 else open(os.devnull, 'w')
p = subprocess.Popen(['python3'] + command, stdout=stdout, env=my_env)
processes.append(p)
print(command)
for p in processes:
p.wait()
if __name__ == '__main__':
main()
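The launcher appends an empty placeholder to `command` and overwrites it with `--rank=<i>` before each spawn, so every GPU gets the same arguments plus its own rank. A tiny illustration with made-up values:

```python
# Illustration of the per-GPU command assembly above; the trailing empty slot is
# replaced by the rank before each subprocess is launched. All values are made up.
command = ['train.py', '--continue_path=', '--restore_path=',
           '--config_path=config.json', '--group_id=group_2020_07_16-000000', '']
for rank in range(2):  # pretend num_gpus == 2
    command[-1] = '--rank={}'.format(rank)
    print(['python3'] + command)
```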

109
TTS/bin/preprocess_tts.py Normal file

@@ -0,0 +1,109 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import librosa
import yaml
import shutil
import argparse
import matplotlib.pyplot as plt
import math
import pickle
import os
import glob
import numpy as np
from tqdm import tqdm
from TTS.tts.utils.audio import AudioProcessor
from TTS.tts.utils.generic_utils import load_config
from multiprocessing import Pool
os.environ["OMP_NUM_THREADS"] = "1"
def get_files(path, extension=".wav"):
filenames = []
for filename in glob.iglob(f"{path}/**/*{extension}", recursive=True):
filenames += [filename]
return filenames
def _process_file(path):
wav = ap.load_wav(path)
mel = ap.melspectrogram(wav)
wav = wav.astype(np.float32)
# check
assert len(wav.shape) == 1, \
f"{path} seems to be multi-channel signal."
assert np.abs(wav).max() <= 1.0, \
f"{path} seems to be different from 16 bit PCM."
# gap when wav is not multiple of hop_length
gap = wav.shape[0] % ap.hop_length
assert mel.shape[1] * ap.hop_length == wav.shape[0] + ap.hop_length - gap, f'{mel.shape[1] * ap.hop_length} vs {wav.shape[0] + ap.hop_length - gap}'
return mel.astype(np.float32), wav
def extract_feats(wav_path):
idx = wav_path.split("/")[-1][:-4]
m, wav = _process_file(wav_path)
mel_path = f"{MEL_PATH}{idx}.npy"
np.save(mel_path, m.astype(np.float32), allow_pickle=False)
return wav_path, mel_path
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--config_path", type=str, help="path to config file for feature extraction."
)
parser.add_argument(
"--num_procs", type=int, default=4, help="number of parallel processes."
)
parser.add_argument(
"--data_path", type=str, default='', help="path to audio files."
)
parser.add_argument(
"--out_path", type=str, default='', help="destination to write files."
)
parser.add_argument(
"--ignore_errors", type=bool, default=False, help="ignore bad files."
)
args = parser.parse_args()
# load config
config = load_config(args.config_path)
config.update(vars(args))
config.audio['do_trim_silence'] = False
# config['audio']['signal_norm'] = False # do not apply earlier normalization
ap = AudioProcessor(**config['audio'])
SEG_PATH = config['data_path']
OUT_PATH = args.out_path
MEL_PATH = os.path.join(OUT_PATH, "mel/")
os.makedirs(OUT_PATH, exist_ok=True)
os.makedirs(MEL_PATH, exist_ok=True)
# TODO: use TTS data processors
wav_files = get_files(SEG_PATH)
print(" > Number of audio files : {}".format(len(wav_files)))
wav_file = wav_files[0]
m, wav = _process_file(wav_file)
# sanity check
print(' > Sample Spec Stats...')
print(' | > spectrogram max:', m.max())
print(' | > spectrogram min: ', m.min())
print(' | > spectrogram shape:', m.shape)
print(' | > wav shape:', wav.shape)
print(' | > wav max - min:', wav.max(), ' - ', wav.min())
# This will take a while depending on size of dataset
#with Pool(args.num_procs) as p:
# dataset_ids = list(tqdm(p.imap(extract_feats, wav_files), total=len(wav_files)))
dataset_ids = []
for wav_file in tqdm(wav_files):
item_id = extract_feats(wav_file)
dataset_ids.append(item_id)
# save metadata
with open(os.path.join(OUT_PATH, "metadata.txt"), "w") as f:
for data in dataset_ids:
f.write(f"{data[0]}|{data[1]}\n")

185
TTS/bin/synthesize.py Normal file

@@ -0,0 +1,185 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# pylint: disable=redefined-outer-name, unused-argument
import os
import time
import argparse
import torch
import json
import string
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.io import load_config
from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes
from TTS.tts.utils.audio import AudioProcessor
def tts(model,
vocoder_model,
C,
VC,
text,
ap,
ap_vocoder,
use_cuda,
batched_vocoder,
speaker_id=None,
figures=False):
t_1 = time.time()
use_vocoder_model = vocoder_model is not None
waveform, alignment, _, postnet_output, stop_tokens, _ = synthesis(
model, text, C, use_cuda, ap, speaker_id, style_wav=False,
truncated=False, enable_eos_bos_chars=C.enable_eos_bos_chars,
use_griffin_lim=(not use_vocoder_model), do_trim_silence=True)
if C.model == "Tacotron" and use_vocoder_model:
postnet_output = ap.out_linear_to_mel(postnet_output.T).T
# correct if there is a scale difference b/w two models
if use_vocoder_model:
postnet_output = ap._denormalize(postnet_output)
postnet_output = ap_vocoder._normalize(postnet_output)
vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
waveform = vocoder_model.generate(
vocoder_input.cuda() if use_cuda else vocoder_input,
batched=batched_vocoder,
target=8000,
overlap=400)
print(" > Run-time: {}".format(time.time() - t_1))
return alignment, postnet_output, stop_tokens, waveform
if __name__ == "__main__":
global symbols, phonemes
parser = argparse.ArgumentParser()
parser.add_argument('text', type=str, help='Text to generate speech.')
parser.add_argument('config_path',
type=str,
help='Path to model config file.')
parser.add_argument(
'model_path',
type=str,
help='Path to model file.',
)
parser.add_argument(
'out_path',
type=str,
help='Path to save the final wav file. The wav file will be named after the given text.',
)
parser.add_argument('--use_cuda',
type=bool,
help='Run model on CUDA.',
default=False)
parser.add_argument(
'--vocoder_path',
type=str,
help=
'Path to the vocoder model file. If it is not defined, the model uses Griffin-Lim (GL) as the vocoder. Please make sure you have installed the vocoder library (WaveRNN) beforehand.',
default="",
)
parser.add_argument('--vocoder_config_path',
type=str,
help='Path to vocoder model config file.',
default="")
parser.add_argument(
'--batched_vocoder',
type=bool,
help="If True, vocoder model uses faster batch processing.",
default=True)
parser.add_argument('--speakers_json',
type=str,
help="JSON file for multi-speaker model.",
default="")
parser.add_argument(
'--speaker_id',
type=int,
help="target speaker_id if the model is multi-speaker.",
default=None)
args = parser.parse_args()
if args.vocoder_path != "":
assert args.use_cuda, " [!] Enable cuda for vocoder."
from WaveRNN.models.wavernn import Model as VocoderModel
# load the config
C = load_config(args.config_path)
C.forward_attn_mask = True
# load the audio processor
ap = AudioProcessor(**C.audio)
# if the vocabulary was passed, replace the default
if 'characters' in C.keys():
symbols, phonemes = make_symbols(**C.characters)
# load speakers
if args.speakers_json != '':
speakers = json.load(open(args.speakers_json, 'r'))
num_speakers = len(speakers)
else:
num_speakers = 0
# load the model
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
model = setup_model(num_chars, num_speakers, C)
cp = torch.load(args.model_path)
model.load_state_dict(cp['model'])
model.eval()
if args.use_cuda:
model.cuda()
model.decoder.set_r(cp['r'])
# load vocoder model
if args.vocoder_path != "":
VC = load_config(args.vocoder_config_path)
ap_vocoder = AudioProcessor(**VC.audio)
bits = 10
vocoder_model = VocoderModel(rnn_dims=512,
fc_dims=512,
mode=VC.mode,
mulaw=VC.mulaw,
pad=VC.pad,
upsample_factors=VC.upsample_factors,
feat_dims=VC.audio["num_mels"],
compute_dims=128,
res_out_dims=128,
res_blocks=10,
hop_length=ap.hop_length,
sample_rate=ap.sample_rate,
use_aux_net=True,
use_upsample_net=True)
check = torch.load(args.vocoder_path)
vocoder_model.load_state_dict(check['model'])
vocoder_model.eval()
if args.use_cuda:
vocoder_model.cuda()
else:
vocoder_model = None
VC = None
ap_vocoder = None
# synthesize voice
print(" > Text: {}".format(args.text))
_, _, _, wav = tts(model,
vocoder_model,
C,
VC,
args.text,
ap,
ap_vocoder,
args.use_cuda,
args.batched_vocoder,
speaker_id=args.speaker_id,
figures=False)
# save the results
file_name = args.text.replace(" ", "_")
file_name = file_name.translate(
str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
out_path = os.path.join(args.out_path, file_name)
print(" > Saving output to {}".format(out_path))
ap.save_wav(wav, out_path)
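The output file name is derived from the input text by replacing spaces with underscores and stripping all other punctuation. A small illustration of that logic:

```python
# Illustration of the output file naming used above.
import string

text = "I'm sorry Dave, I'm afraid I can't do that."
file_name = text.replace(" ", "_")
file_name = file_name.translate(
    str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
print(file_name)  # Im_sorry_Dave_Im_afraid_I_cant_do_that.wav
```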

255
TTS/bin/train_encoder.py Normal file

@@ -0,0 +1,255 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import os
import sys
import time
import traceback
import torch
from torch.utils.data import DataLoader
from TTS.tts.datasets.preprocess import load_meta_data
from TTS.speaker_encoder.dataset import MyDataset
from TTS.speaker_encoder.loss import GE2ELoss
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.speaker_encoder.visual import plot_embeddings
from TTS.speaker_encoder.generic_utils import save_best_model
from TTS.tts.utils.audio import AudioProcessor
from TTS.tts.utils.generic_utils import (create_experiment_folder, get_git_branch,
remove_experiment_folder, set_init_dict)
from TTS.tts.utils.io import load_config, copy_config_file
from TTS.tts.utils.training import check_update, NoamLR
from TTS.tts.utils.tensorboard_logger import TensorboardLogger
from TTS.tts.utils.radam import RAdam
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True
torch.manual_seed(54321)
use_cuda = torch.cuda.is_available()
num_gpus = torch.cuda.device_count()
print(" > Using CUDA: ", use_cuda)
print(" > Number of GPUs: ", num_gpus)
def setup_loader(ap, is_val=False, verbose=False):
if is_val:
loader = None
else:
dataset = MyDataset(ap,
meta_data_eval if is_val else meta_data_train,
voice_len=1.6,
num_utter_per_speaker=10,
skip_speakers=False,
verbose=verbose)
# sampler = DistributedSampler(dataset) if num_gpus > 1 else None
loader = DataLoader(dataset,
batch_size=c.num_speakers_in_batch,
shuffle=False,
num_workers=c.num_loader_workers,
collate_fn=dataset.collate_fn)
return loader
def train(model, criterion, optimizer, scheduler, ap, global_step):
data_loader = setup_loader(ap, is_val=False, verbose=True)
model.train()
epoch_time = 0
best_loss = float('inf')
avg_loss = 0
end_time = time.time()
for _, data in enumerate(data_loader):
start_time = time.time()
# setup input data
inputs = data[0]
loader_time = time.time() - end_time
global_step += 1
# setup lr
if c.lr_decay:
scheduler.step()
optimizer.zero_grad()
# dispatch data to GPU
if use_cuda:
inputs = inputs.cuda(non_blocking=True)
# labels = labels.cuda(non_blocking=True)
# forward pass model
outputs = model(inputs)
# loss computation
loss = criterion(
outputs.view(c.num_speakers_in_batch,
outputs.shape[0] // c.num_speakers_in_batch, -1))
loss.backward()
grad_norm, _ = check_update(model, c.grad_clip)
optimizer.step()
step_time = time.time() - start_time
epoch_time += step_time
avg_loss = 0.01 * loss.item(
) + 0.99 * avg_loss if avg_loss != 0 else loss.item()
current_lr = optimizer.param_groups[0]['lr']
if global_step % c.steps_plot_stats == 0:
# Plot Training Epoch Stats
train_stats = {
"GE2Eloss": avg_loss,
"lr": current_lr,
"grad_norm": grad_norm,
"step_time": step_time
}
tb_logger.tb_train_epoch_stats(global_step, train_stats)
figures = {
# FIXME: not constant
"UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(),
10),
}
tb_logger.tb_train_figures(global_step, figures)
if global_step % c.print_step == 0:
print(
" | > Step:{} Loss:{:.5f} AvgLoss:{:.5f} GradNorm:{:.5f} "
"StepTime:{:.2f} LoaderTime:{:.2f} LR:{:.6f}".format(
global_step, loss.item(), avg_loss, grad_norm, step_time,
loader_time, current_lr),
flush=True)
# save best model
best_loss = save_best_model(model, optimizer, avg_loss, best_loss,
OUT_PATH, global_step)
end_time = time.time()
return avg_loss, global_step
def main(args): # pylint: disable=redefined-outer-name
# pylint: disable=global-variable-undefined
global meta_data_train
global meta_data_eval
ap = AudioProcessor(**c.audio)
model = SpeakerEncoder(input_dim=40,
proj_dim=128,
lstm_dim=384,
num_lstm_layers=3)
optimizer = RAdam(model.parameters(), lr=c.lr)
criterion = GE2ELoss(loss_method='softmax')
if args.restore_path:
checkpoint = torch.load(args.restore_path)
try:
# TODO: fix optimizer init, model.cuda() needs to be called before
# optimizer restore
# optimizer.load_state_dict(checkpoint['optimizer'])
if c.reinit_layers:
raise RuntimeError
model.load_state_dict(checkpoint['model'])
except KeyError:
print(" > Partial model initialization.")
model_dict = model.state_dict()
model_dict = set_init_dict(model_dict, checkpoint, c)
model.load_state_dict(model_dict)
del model_dict
for group in optimizer.param_groups:
group['lr'] = c.lr
print(" > Model restored from step %d" % checkpoint['step'],
flush=True)
args.restore_step = checkpoint['step']
else:
args.restore_step = 0
if use_cuda:
model = model.cuda()
criterion.cuda()
if c.lr_decay:
scheduler = NoamLR(optimizer,
warmup_steps=c.warmup_steps,
last_epoch=args.restore_step - 1)
else:
scheduler = None
num_params = count_parameters(model)
print("\n > Model has {} parameters".format(num_params), flush=True)
# pylint: disable=redefined-outer-name
meta_data_train, meta_data_eval = load_meta_data(c.datasets)
global_step = args.restore_step
train_loss, global_step = train(model, criterion, optimizer, scheduler, ap,
global_step)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--restore_path',
type=str,
help='Path to model outputs (checkpoint, tensorboard etc.).',
default=0)
parser.add_argument(
'--config_path',
type=str,
help='Path to config file for training.',
)
parser.add_argument('--debug',
type=bool,
default=True,
help='Do not verify commit integrity to run training.')
parser.add_argument(
'--data_path',
type=str,
default='',
help='Defines the data path. It overwrites config.json.')
parser.add_argument('--output_path',
type=str,
help='path for training outputs.',
default='')
parser.add_argument('--output_folder',
type=str,
default='',
help='folder name for training outputs.')
args = parser.parse_args()
# setup output paths and read configs
c = load_config(args.config_path)
_ = os.path.dirname(os.path.realpath(__file__))
if args.data_path != '':
c.data_path = args.data_path
if args.output_path == '':
OUT_PATH = os.path.join(_, c.output_path)
else:
OUT_PATH = args.output_path
if args.output_folder == '':
OUT_PATH = create_experiment_folder(OUT_PATH, c.run_name, args.debug)
else:
OUT_PATH = os.path.join(OUT_PATH, args.output_folder)
new_fields = {}
if args.restore_path:
new_fields["restore_path"] = args.restore_path
new_fields["github_branch"] = get_git_branch()
copy_config_file(args.config_path, os.path.join(OUT_PATH, 'config.json'),
new_fields)
LOG_DIR = OUT_PATH
tb_logger = TensorboardLogger(LOG_DIR)
try:
main(args)
except KeyboardInterrupt:
remove_experiment_folder(OUT_PATH)
try:
sys.exit(0)
except SystemExit:
os._exit(0) # pylint: disable=protected-access
except Exception: # pylint: disable=broad-except
remove_experiment_folder(OUT_PATH)
traceback.print_exc()
sys.exit(1)
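`GE2ELoss` above is fed the flat batch of embeddings regrouped by speaker. A small sketch of that reshape with example sizes; the projection size of 128 matches the `SpeakerEncoder` arguments used in this script:

```python
# Sketch of the embedding reshape passed to GE2ELoss in train() above:
# (num_speakers * utterances_per_speaker, proj_dim) -> (num_speakers, utterances, proj_dim).
import torch

num_speakers_in_batch, num_utter_per_speaker, proj_dim = 4, 10, 128  # example sizes
outputs = torch.randn(num_speakers_in_batch * num_utter_per_speaker, proj_dim)
grouped = outputs.view(num_speakers_in_batch,
                       outputs.shape[0] // num_speakers_in_batch, -1)
print(grouped.shape)  # torch.Size([4, 10, 128])
```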

651
TTS/bin/train_tts.py Normal file

@@ -0,0 +1,651 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import glob
import os
import sys
import time
import traceback
import numpy as np
import torch
from torch.utils.data import DataLoader
from TTS.tts.datasets.preprocess import load_meta_data
from TTS.tts.datasets.TTSDataset import MyDataset
from TTS.tts.layers.losses import TacotronLoss
from TTS.tts.utils.console_logger import ConsoleLogger
from TTS.tts.utils.distribute import (DistributedSampler,
apply_gradient_allreduce,
init_distributed, reduce_tensor)
from TTS.tts.utils.generic_utils import check_config, setup_model
from TTS.tts.utils.io import save_best_model, save_checkpoint
from TTS.tts.utils.measures import alignment_diagonal_score
from TTS.tts.utils.speakers import (get_speakers, load_speaker_mapping,
save_speaker_mapping)
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
create_experiment_folder, get_git_branch,
remove_experiment_folder, set_init_dict)
from TTS.utils.io import copy_config_file, load_config
from TTS.utils.radam import RAdam
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.training import (NoamLR, adam_weight_decay, check_update,
gradual_training_scheduler, set_weight_decay,
setup_torch_training_env)
use_cuda, num_gpus = setup_torch_training_env(True, False)
def setup_loader(ap, r, is_val=False, verbose=False):
if is_val and not c.run_eval:
loader = None
else:
dataset = MyDataset(
r,
c.text_cleaner,
compute_linear_spec=True if c.model.lower() == 'tacotron' else False,
meta_data=meta_data_eval if is_val else meta_data_train,
ap=ap,
tp=c.characters if 'characters' in c.keys() else None,
batch_group_size=0 if is_val else c.batch_group_size *
c.batch_size,
min_seq_len=c.min_seq_len,
max_seq_len=c.max_seq_len,
phoneme_cache_path=c.phoneme_cache_path,
use_phonemes=c.use_phonemes,
phoneme_language=c.phoneme_language,
enable_eos_bos=c.enable_eos_bos_chars,
verbose=verbose)
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
loader = DataLoader(
dataset,
batch_size=c.eval_batch_size if is_val else c.batch_size,
shuffle=False,
collate_fn=dataset.collate_fn,
drop_last=False,
sampler=sampler,
num_workers=c.num_val_loader_workers
if is_val else c.num_loader_workers,
pin_memory=False)
return loader
def format_data(data):
if c.use_speaker_embedding:
speaker_mapping = load_speaker_mapping(OUT_PATH)
# setup input data
text_input = data[0]
text_lengths = data[1]
speaker_names = data[2]
linear_input = data[3] if c.model in ["Tacotron"] else None
mel_input = data[4]
mel_lengths = data[5]
stop_targets = data[6]
avg_text_length = torch.mean(text_lengths.float())
avg_spec_length = torch.mean(mel_lengths.float())
if c.use_speaker_embedding:
speaker_ids = [
speaker_mapping[speaker_name] for speaker_name in speaker_names
]
speaker_ids = torch.LongTensor(speaker_ids)
else:
speaker_ids = None
# set stop targets view, we predict a single stop token per iteration.
stop_targets = stop_targets.view(text_input.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) >
0.0).unsqueeze(2).float().squeeze(2)
# dispatch data to GPU
if use_cuda:
text_input = text_input.cuda(non_blocking=True)
text_lengths = text_lengths.cuda(non_blocking=True)
mel_input = mel_input.cuda(non_blocking=True)
mel_lengths = mel_lengths.cuda(non_blocking=True)
linear_input = linear_input.cuda(non_blocking=True) if c.model in ["Tacotron"] else None
stop_targets = stop_targets.cuda(non_blocking=True)
if speaker_ids is not None:
speaker_ids = speaker_ids.cuda(non_blocking=True)
return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length
def train(model, criterion, optimizer, optimizer_st, scheduler,
ap, global_step, epoch):
data_loader = setup_loader(ap, model.decoder.r, is_val=False,
verbose=(epoch == 0))
model.train()
epoch_time = 0
keep_avg = KeepAverage()
if use_cuda:
batch_n_iter = int(
len(data_loader.dataset) / (c.batch_size * num_gpus))
else:
batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
end_time = time.time()
c_logger.print_train_start()
for num_iter, data in enumerate(data_loader):
start_time = time.time()
# format data
text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length = format_data(data)
loader_time = time.time() - end_time
global_step += 1
# setup lr
if c.noam_schedule:
scheduler.step()
optimizer.zero_grad()
if optimizer_st:
optimizer_st.zero_grad()
# forward pass model
if c.bidirectional_decoder or c.double_decoder_consistency:
decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids)
else:
decoder_output, postnet_output, alignments, stop_tokens = model(
text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids)
decoder_backward_output = None
alignments_backward = None
# set the alignment lengths wrt reduction factor for guided attention
if mel_lengths.max() % model.decoder.r != 0:
alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r
else:
alignment_lengths = mel_lengths // model.decoder.r
# compute loss
loss_dict = criterion(postnet_output, decoder_output, mel_input,
linear_input, stop_tokens, stop_targets,
mel_lengths, decoder_backward_output,
alignments, alignment_lengths, alignments_backward,
text_lengths)
# backward pass
loss_dict['loss'].backward()
optimizer, current_lr = adam_weight_decay(optimizer)
grad_norm, _ = check_update(model, c.grad_clip, ignore_stopnet=True)
optimizer.step()
# compute alignment error (the lower the better )
align_error = 1 - alignment_diagonal_score(alignments)
loss_dict['align_error'] = align_error
# backpass and check the grad norm for stop loss
if c.separate_stopnet:
loss_dict['stopnet_loss'].backward()
optimizer_st, _ = adam_weight_decay(optimizer_st)
grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0)
optimizer_st.step()
else:
grad_norm_st = 0
step_time = time.time() - start_time
epoch_time += step_time
# aggregate losses from processes
if num_gpus > 1:
loss_dict['postnet_loss'] = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus)
loss_dict['decoder_loss'] = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus)
loss_dict['loss'] = reduce_tensor(loss_dict['loss'] .data, num_gpus)
loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus) if c.stopnet else loss_dict['stopnet_loss']
# detach loss values
loss_dict_new = dict()
for key, value in loss_dict.items():
if isinstance(value, (int, float)):
loss_dict_new[key] = value
else:
loss_dict_new[key] = value.item()
loss_dict = loss_dict_new
# update avg stats
update_train_values = dict()
for key, value in loss_dict.items():
update_train_values['avg_' + key] = value
update_train_values['avg_loader_time'] = loader_time
update_train_values['avg_step_time'] = step_time
keep_avg.update_values(update_train_values)
# print training progress
if global_step % c.print_step == 0:
c_logger.print_train_step(batch_n_iter, num_iter, global_step,
avg_spec_length, avg_text_length,
step_time, loader_time, current_lr,
loss_dict, keep_avg.avg_values)
if args.rank == 0:
# Plot Training Iter Stats
# reduce TB load
if global_step % c.tb_plot_step == 0:
iter_stats = {
"lr": current_lr,
"grad_norm": grad_norm,
"grad_norm_st": grad_norm_st,
"step_time": step_time
}
iter_stats.update(loss_dict)
tb_logger.tb_train_iter_stats(global_step, iter_stats)
if global_step % c.save_step == 0:
if c.checkpoint:
# save model
save_checkpoint(model, optimizer, global_step, epoch, model.decoder.r, OUT_PATH,
optimizer_st=optimizer_st,
model_loss=loss_dict['postnet_loss'])
# Diagnostic visualizations
const_spec = postnet_output[0].data.cpu().numpy()
gt_spec = linear_input[0].data.cpu().numpy() if c.model in [
"Tacotron", "TacotronGST"
] else mel_input[0].data.cpu().numpy()
align_img = alignments[0].data.cpu().numpy()
figures = {
"prediction": plot_spectrogram(const_spec, ap),
"ground_truth": plot_spectrogram(gt_spec, ap),
"alignment": plot_alignment(align_img),
}
if c.bidirectional_decoder or c.double_decoder_consistency:
figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy())
tb_logger.tb_train_figures(global_step, figures)
# Sample audio
if c.model in ["Tacotron", "TacotronGST"]:
train_audio = ap.inv_spectrogram(const_spec.T)
else:
train_audio = ap.inv_melspectrogram(const_spec.T)
tb_logger.tb_train_audios(global_step,
{'TrainAudio': train_audio},
c.audio["sample_rate"])
end_time = time.time()
# print epoch stats
c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)
# Plot Epoch Stats
if args.rank == 0:
epoch_stats = {"epoch_time": epoch_time}
epoch_stats.update(keep_avg.avg_values)
tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
if c.tb_model_param_stats:
tb_logger.tb_model_weights(model, global_step)
return keep_avg.avg_values, global_step
@torch.no_grad()
def evaluate(model, criterion, ap, global_step, epoch):
data_loader = setup_loader(ap, model.decoder.r, is_val=True)
model.eval()
epoch_time = 0
keep_avg = KeepAverage()
c_logger.print_eval_start()
if data_loader is not None:
for num_iter, data in enumerate(data_loader):
start_time = time.time()
# format data
text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data(data)
assert mel_input.shape[1] % model.decoder.r == 0
# forward pass model
if c.bidirectional_decoder or c.double_decoder_consistency:
decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
else:
decoder_output, postnet_output, alignments, stop_tokens = model(
text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
decoder_backward_output = None
alignments_backward = None
# set the alignment lengths wrt reduction factor for guided attention
if mel_lengths.max() % model.decoder.r != 0:
alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r
else:
alignment_lengths = mel_lengths // model.decoder.r
# compute loss
loss_dict = criterion(postnet_output, decoder_output, mel_input,
linear_input, stop_tokens, stop_targets,
mel_lengths, decoder_backward_output,
alignments, alignment_lengths, alignments_backward,
text_lengths)
# step time
step_time = time.time() - start_time
epoch_time += step_time
# compute alignment score
align_error = 1 - alignment_diagonal_score(alignments)
loss_dict['align_error'] = align_error
# aggregate losses from processes
if num_gpus > 1:
loss_dict['postnet_loss'] = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus)
loss_dict['decoder_loss'] = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus)
if c.stopnet:
loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus)
# detach loss values
loss_dict_new = dict()
for key, value in loss_dict.items():
if isinstance(value, (int, float)):
loss_dict_new[key] = value
else:
loss_dict_new[key] = value.item()
loss_dict = loss_dict_new
# update avg stats
update_train_values = dict()
for key, value in loss_dict.items():
update_train_values['avg_' + key] = value
keep_avg.update_values(update_train_values)
if c.print_eval:
c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
if args.rank == 0:
# Diagnostic visualizations
idx = np.random.randint(mel_input.shape[0])
const_spec = postnet_output[idx].data.cpu().numpy()
gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [
"Tacotron", "TacotronGST"
] else mel_input[idx].data.cpu().numpy()
align_img = alignments[idx].data.cpu().numpy()
eval_figures = {
"prediction": plot_spectrogram(const_spec, ap),
"ground_truth": plot_spectrogram(gt_spec, ap),
"alignment": plot_alignment(align_img)
}
# Sample audio
if c.model in ["Tacotron", "TacotronGST"]:
eval_audio = ap.inv_spectrogram(const_spec.T)
else:
eval_audio = ap.inv_melspectrogram(const_spec.T)
tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
c.audio["sample_rate"])
# Plot Validation Stats
if c.bidirectional_decoder or c.double_decoder_consistency:
align_b_img = alignments_backward[idx].data.cpu().numpy()
eval_figures['alignment2'] = plot_alignment(align_b_img)
tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
tb_logger.tb_eval_figures(global_step, eval_figures)
if args.rank == 0 and epoch > c.test_delay_epochs:
if c.test_sentences_file is None:
test_sentences = [
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
"Be a voice, not an echo.",
"I'm sorry Dave. I'm afraid I can't do that.",
"This cake is great. It's so delicious and moist.",
"Prior to November 22, 1963."
]
else:
with open(c.test_sentences_file, "r") as f:
test_sentences = [s.strip() for s in f.readlines()]
# test sentences
test_audios = {}
test_figures = {}
print(" | > Synthesizing test sentences")
speaker_id = 0 if c.use_speaker_embedding else None
style_wav = c.get("style_wav_for_test")
for idx, test_sentence in enumerate(test_sentences):
try:
wav, alignment, decoder_output, postnet_output, stop_tokens, inputs = synthesis(
model,
test_sentence,
c,
use_cuda,
ap,
speaker_id=speaker_id,
style_wav=style_wav,
truncated=False,
enable_eos_bos_chars=c.enable_eos_bos_chars, #pylint: disable=unused-argument
use_griffin_lim=True,
do_trim_silence=False)
file_path = os.path.join(AUDIO_PATH, str(global_step))
os.makedirs(file_path, exist_ok=True)
file_path = os.path.join(file_path,
"TestSentence_{}.wav".format(idx))
ap.save_wav(wav, file_path)
test_audios['{}-audio'.format(idx)] = wav
test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
postnet_output, ap)
test_figures['{}-alignment'.format(idx)] = plot_alignment(
alignment)
except:
print(" !! Error creating Test Sentence -", idx)
traceback.print_exc()
tb_logger.tb_test_audios(global_step, test_audios,
c.audio['sample_rate'])
tb_logger.tb_test_figures(global_step, test_figures)
return keep_avg.avg_values
# FIXME: move args definition/parsing inside of main?
def main(args): # pylint: disable=redefined-outer-name
# pylint: disable=global-variable-undefined
global meta_data_train, meta_data_eval, symbols, phonemes
# Audio processor
ap = AudioProcessor(**c.audio)
if 'characters' in c.keys():
symbols, phonemes = make_symbols(**c.characters)
# DISTRIBUTED
if num_gpus > 1:
init_distributed(args.rank, num_gpus, args.group_id,
c.distributed["backend"], c.distributed["url"])
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
# load data instances
meta_data_train, meta_data_eval = load_meta_data(c.datasets)
# set the portion of the data used for training
if 'train_portion' in c.keys():
meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
if 'eval_portion' in c.keys():
meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]
# parse speakers
if c.use_speaker_embedding:
speakers = get_speakers(meta_data_train)
if args.restore_path:
prev_out_path = os.path.dirname(args.restore_path)
speaker_mapping = load_speaker_mapping(prev_out_path)
assert all([speaker in speaker_mapping
for speaker in speakers]), "As of now, you cannot " \
"introduce new speakers to " \
"a previously trained model."
else:
speaker_mapping = {name: i for i, name in enumerate(speakers)}
save_speaker_mapping(OUT_PATH, speaker_mapping)
num_speakers = len(speaker_mapping)
print("Training with {} speakers: {}".format(num_speakers,
", ".join(speakers)))
else:
num_speakers = 0
model = setup_model(num_chars, num_speakers, c)
params = set_weight_decay(model, c.wd)
optimizer = RAdam(params, lr=c.lr, weight_decay=0)
if c.stopnet and c.separate_stopnet:
optimizer_st = RAdam(model.decoder.stopnet.parameters(),
lr=c.lr,
weight_decay=0)
else:
optimizer_st = None
# setup criterion
criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)
if args.restore_path:
checkpoint = torch.load(args.restore_path, map_location='cpu')
try:
# TODO: fix optimizer init, model.cuda() needs to be called before
# optimizer restore
# optimizer.load_state_dict(checkpoint['optimizer'])
if c.reinit_layers:
raise RuntimeError
model.load_state_dict(checkpoint['model'])
except:
print(" > Partial model initialization.")
model_dict = model.state_dict()
model_dict = set_init_dict(model_dict, checkpoint['model'], c)
model.load_state_dict(model_dict)
del model_dict
for group in optimizer.param_groups:
group['lr'] = c.lr
print(" > Model restored from step %d" % checkpoint['step'],
flush=True)
args.restore_step = checkpoint['step']
else:
args.restore_step = 0
if use_cuda:
model.cuda()
criterion.cuda()
# DISTRIBUTED
if num_gpus > 1:
model = apply_gradient_allreduce(model)
if c.noam_schedule:
scheduler = NoamLR(optimizer,
warmup_steps=c.warmup_steps,
last_epoch=args.restore_step - 1)
else:
scheduler = None
num_params = count_parameters(model)
print("\n > Model has {} parameters".format(num_params), flush=True)
if 'best_loss' not in locals():
best_loss = float('inf')
global_step = args.restore_step
for epoch in range(0, c.epochs):
c_logger.print_epoch_start(epoch, c.epochs)
# set gradual training
if c.gradual_training is not None:
r, c.batch_size = gradual_training_scheduler(global_step, c)
c.r = r
model.decoder.set_r(r)
if c.bidirectional_decoder:
model.decoder_backward.set_r(r)
print("\n > Number of output frames:", model.decoder.r)
train_avg_loss_dict, global_step = train(model, criterion, optimizer,
optimizer_st, scheduler, ap,
global_step, epoch)
eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch)
c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
target_loss = train_avg_loss_dict['avg_postnet_loss']
if c.run_eval:
target_loss = eval_avg_loss_dict['avg_postnet_loss']
best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r,
OUT_PATH)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--continue_path',
type=str,
help='Training output folder to continue training. Use it to continue a previous training run. If it is used, "config_path" is ignored.',
default='',
required='--config_path' not in sys.argv)
parser.add_argument(
'--restore_path',
type=str,
help='Model file to be restored. Use to finetune a model.',
default='')
parser.add_argument(
'--config_path',
type=str,
help='Path to config file for training.',
required='--continue_path' not in sys.argv
)
parser.add_argument('--debug',
type=bool,
default=False,
help='Do not verify commit integrity to run training.')
# DISTRIBUTED
parser.add_argument(
'--rank',
type=int,
default=0,
help='DISTRIBUTED: process rank for distributed training.')
parser.add_argument('--group_id',
type=str,
default="",
help='DISTRIBUTED: process group id.')
args = parser.parse_args()
if args.continue_path != '':
args.output_path = args.continue_path
args.config_path = os.path.join(args.continue_path, 'config.json')
list_of_files = glob.glob(args.continue_path + "/*.pth.tar")  # '*' matches every checkpoint file in the folder
latest_model_file = max(list_of_files, key=os.path.getctime)
args.restore_path = latest_model_file
print(f" > Training continues for {args.restore_path}")
# setup output paths and read configs
c = load_config(args.config_path)
check_config(c)
_ = os.path.dirname(os.path.realpath(__file__))
OUT_PATH = args.continue_path
if args.continue_path == '':
OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug)
AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
c_logger = ConsoleLogger()
if args.rank == 0:
os.makedirs(AUDIO_PATH, exist_ok=True)
new_fields = {}
if args.restore_path:
new_fields["restore_path"] = args.restore_path
new_fields["github_branch"] = get_git_branch()
copy_config_file(args.config_path,
os.path.join(OUT_PATH, 'config.json'), new_fields)
os.chmod(AUDIO_PATH, 0o775)
os.chmod(OUT_PATH, 0o775)
LOG_DIR = OUT_PATH
tb_logger = TensorboardLogger(LOG_DIR, model_name='TTS')
# write model desc to tensorboard
tb_logger.tb_add_text('model-description', c['run_description'], 0)
try:
main(args)
except KeyboardInterrupt:
remove_experiment_folder(OUT_PATH)
try:
sys.exit(0)
except SystemExit:
os._exit(0) # pylint: disable=protected-access
except Exception: # pylint: disable=broad-except
remove_experiment_folder(OUT_PATH)
traceback.print_exc()
sys.exit(1)
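`format_data` above folds every `r` consecutive spectrogram frames into a single stop-token target, since the decoder emits `r` frames per step. A standalone sketch of that reshaping with toy sizes:

```python
# Sketch of the stop-target reshaping done in format_data() above: one stop label
# per decoder step, where each step produces r spectrogram frames. Toy sizes.
import torch

batch, frames, r = 2, 12, 3
stop_targets = torch.zeros(batch, frames)
stop_targets[:, -4:] = 1.0  # flag the trailing frames as "stop"
stop_targets = stop_targets.view(batch, stop_targets.size(1) // r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2)
print(stop_targets.shape)  # torch.Size([2, 4]); the last two steps are 1.0
```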

661
TTS/bin/train_vocoder.py Normal file

@@ -0,0 +1,661 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import glob
import os
import sys
import time
import traceback
from inspect import signature
import torch
from torch.utils.data import DataLoader
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
create_experiment_folder,
get_git_branch,
remove_experiment_folder,
set_init_dict)
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import copy_config_file, load_config
from TTS.utils.radam import RAdam
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.training import setup_torch_training_env
from TTS.vocoder.datasets.gan_dataset import GANDataset
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
# from distribute import (DistributedSampler, apply_gradient_allreduce,
# init_distributed, reduce_tensor)
from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss
from TTS.vocoder.utils.console_logger import ConsoleLogger
from TTS.vocoder.utils.generic_utils import (check_config, plot_results,
setup_discriminator,
setup_generator)
from TTS.vocoder.utils.io import save_best_model, save_checkpoint
use_cuda, num_gpus = setup_torch_training_env(True, True)
def setup_loader(ap, is_val=False, verbose=False):
if is_val and not c.run_eval:
loader = None
else:
dataset = GANDataset(ap=ap,
items=eval_data if is_val else train_data,
seq_len=c.seq_len,
hop_len=ap.hop_length,
pad_short=c.pad_short,
conv_pad=c.conv_pad,
is_training=not is_val,
return_segments=not is_val,
use_noise_augment=c.use_noise_augment,
use_cache=c.use_cache,
verbose=verbose)
dataset.shuffle_mapping()
# sampler = DistributedSampler(dataset) if num_gpus > 1 else None
loader = DataLoader(dataset,
batch_size=1 if is_val else c.batch_size,
shuffle=True,
drop_last=False,
sampler=None,
num_workers=c.num_val_loader_workers
if is_val else c.num_loader_workers,
pin_memory=False)
return loader
def format_data(data):
if isinstance(data[0], list):
# setup input data
c_G, x_G = data[0]
c_D, x_D = data[1]
# dispatch data to GPU
if use_cuda:
c_G = c_G.cuda(non_blocking=True)
x_G = x_G.cuda(non_blocking=True)
c_D = c_D.cuda(non_blocking=True)
x_D = x_D.cuda(non_blocking=True)
return c_G, x_G, c_D, x_D
# return a whole audio segment
co, x = data
if use_cuda:
co = co.cuda(non_blocking=True)
x = x.cuda(non_blocking=True)
return co, x, None, None
def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
scheduler_G, scheduler_D, ap, global_step, epoch):
data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0))
model_G.train()
model_D.train()
epoch_time = 0
keep_avg = KeepAverage()
if use_cuda:
batch_n_iter = int(
len(data_loader.dataset) / (c.batch_size * num_gpus))
else:
batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
end_time = time.time()
c_logger.print_train_start()
for num_iter, data in enumerate(data_loader):
start_time = time.time()
# format data
c_G, y_G, c_D, y_D = format_data(data)
loader_time = time.time() - end_time
global_step += 1
##############################
# GENERATOR
##############################
# generator pass
y_hat = model_G(c_G)
y_hat_sub = None
y_G_sub = None
y_hat_vis = y_hat # for visualization
# PQMF formatting
if y_hat.shape[1] > 1:
y_hat_sub = y_hat
y_hat = model_G.pqmf_synthesis(y_hat)
y_hat_vis = y_hat
y_G_sub = model_G.pqmf_analysis(y_G)
if global_step > c.steps_to_start_discriminator:
# run D with or without cond. features
if len(signature(model_D.forward).parameters) == 2:
D_out_fake = model_D(y_hat, c_G)
else:
D_out_fake = model_D(y_hat)
D_out_real = None
if c.use_feat_match_loss:
with torch.no_grad():
D_out_real = model_D(y_G)
# format D outputs
if isinstance(D_out_fake, tuple):
scores_fake, feats_fake = D_out_fake
if D_out_real is None:
feats_real = None
else:
_, feats_real = D_out_real
else:
scores_fake = D_out_fake
else:
scores_fake, feats_fake, feats_real = None, None, None
# compute losses
loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake,
feats_real, y_hat_sub, y_G_sub)
loss_G = loss_G_dict['G_loss']
# optimizer generator
optimizer_G.zero_grad()
loss_G.backward()
if c.gen_clip_grad > 0:
torch.nn.utils.clip_grad_norm_(model_G.parameters(),
c.gen_clip_grad)
optimizer_G.step()
if scheduler_G is not None:
scheduler_G.step()
loss_dict = dict()
for key, value in loss_G_dict.items():
if isinstance(value, int):
loss_dict[key] = value
else:
loss_dict[key] = value.item()
##############################
# DISCRIMINATOR
##############################
if global_step >= c.steps_to_start_discriminator:
# discriminator pass
with torch.no_grad():
y_hat = model_G(c_D)
# PQMF formatting
if y_hat.shape[1] > 1:
y_hat = model_G.pqmf_synthesis(y_hat)
# run D with or without cond. features
if len(signature(model_D.forward).parameters) == 2:
D_out_fake = model_D(y_hat.detach(), c_D)
D_out_real = model_D(y_D, c_D)
else:
D_out_fake = model_D(y_hat.detach())
D_out_real = model_D(y_D)
# format D outputs
if isinstance(D_out_fake, tuple):
scores_fake, feats_fake = D_out_fake
if D_out_real is None:
scores_real, feats_real = None, None
else:
scores_real, feats_real = D_out_real
else:
scores_fake = D_out_fake
scores_real = D_out_real
# compute losses
loss_D_dict = criterion_D(scores_fake, scores_real)
loss_D = loss_D_dict['D_loss']
# optimizer discriminator
optimizer_D.zero_grad()
loss_D.backward()
if c.disc_clip_grad > 0:
torch.nn.utils.clip_grad_norm_(model_D.parameters(),
c.disc_clip_grad)
optimizer_D.step()
if scheduler_D is not None:
scheduler_D.step()
for key, value in loss_D_dict.items():
if isinstance(value, (int, float)):
loss_dict[key] = value
else:
loss_dict[key] = value.item()
step_time = time.time() - start_time
epoch_time += step_time
# get current learning rates
current_lr_G = list(optimizer_G.param_groups)[0]['lr']
current_lr_D = list(optimizer_D.param_groups)[0]['lr']
# update avg stats
update_train_values = dict()
for key, value in loss_dict.items():
update_train_values['avg_' + key] = value
update_train_values['avg_loader_time'] = loader_time
update_train_values['avg_step_time'] = step_time
keep_avg.update_values(update_train_values)
# print training stats
if global_step % c.print_step == 0:
c_logger.print_train_step(batch_n_iter, num_iter, global_step,
step_time, loader_time, current_lr_G,
current_lr_D, loss_dict,
keep_avg.avg_values)
# plot step stats
if global_step % 10 == 0:
iter_stats = {
"lr_G": current_lr_G,
"lr_D": current_lr_D,
"step_time": step_time
}
iter_stats.update(loss_dict)
tb_logger.tb_train_iter_stats(global_step, iter_stats)
# save checkpoint
if global_step % c.save_step == 0:
if c.checkpoint:
# save model
save_checkpoint(model_G,
optimizer_G,
scheduler_G,
model_D,
optimizer_D,
scheduler_D,
global_step,
epoch,
OUT_PATH,
model_losses=loss_dict)
# compute spectrograms
figures = plot_results(y_hat_vis, y_G, ap, global_step,
'train')
tb_logger.tb_train_figures(global_step, figures)
# Sample audio
sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy()
tb_logger.tb_train_audios(global_step,
{'train/audio': sample_voice},
c.audio["sample_rate"])
end_time = time.time()
# print epoch stats
c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)
# Plot Training Epoch Stats
epoch_stats = {"epoch_time": epoch_time}
epoch_stats.update(keep_avg.avg_values)
tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
# TODO: plot model stats
# if c.tb_model_param_stats:
# tb_logger.tb_model_weights(model, global_step)
return keep_avg.avg_values, global_step
@torch.no_grad()
def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch):
data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0))
model_G.eval()
model_D.eval()
epoch_time = 0
keep_avg = KeepAverage()
end_time = time.time()
c_logger.print_eval_start()
for num_iter, data in enumerate(data_loader):
start_time = time.time()
# format data
c_G, y_G, _, _ = format_data(data)
loader_time = time.time() - end_time
global_step += 1
##############################
# GENERATOR
##############################
# generator pass
y_hat = model_G(c_G)
y_hat_sub = None
y_G_sub = None
# PQMF formatting
if y_hat.shape[1] > 1:
y_hat_sub = y_hat
y_hat = model_G.pqmf_synthesis(y_hat)
y_G_sub = model_G.pqmf_analysis(y_G)
if global_step > c.steps_to_start_discriminator:
if len(signature(model_D.forward).parameters) == 2:
D_out_fake = model_D(y_hat, c_G)
else:
D_out_fake = model_D(y_hat)
D_out_real = None
if c.use_feat_match_loss:
with torch.no_grad():
D_out_real = model_D(y_G)
# format D outputs
if isinstance(D_out_fake, tuple):
scores_fake, feats_fake = D_out_fake
if D_out_real is None:
feats_real = None
else:
_, feats_real = D_out_real
else:
scores_fake = D_out_fake
else:
scores_fake, feats_fake, feats_real = None, None, None
# compute losses
loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake,
feats_real, y_hat_sub, y_G_sub)
loss_dict = dict()
for key, value in loss_G_dict.items():
if isinstance(value, (int, float)):
loss_dict[key] = value
else:
loss_dict[key] = value.item()
##############################
# DISCRIMINATOR
##############################
if global_step >= c.steps_to_start_discriminator:
# discriminator pass
with torch.no_grad():
y_hat = model_G(c_G)
# PQMF formatting
if y_hat.shape[1] > 1:
y_hat = model_G.pqmf_synthesis(y_hat)
# run D with or without cond. features
if len(signature(model_D.forward).parameters) == 2:
D_out_fake = model_D(y_hat.detach(), c_G)
D_out_real = model_D(y_G, c_G)
else:
D_out_fake = model_D(y_hat.detach())
D_out_real = model_D(y_G)
# format D outputs
if isinstance(D_out_fake, tuple):
scores_fake, feats_fake = D_out_fake
if D_out_real is None:
scores_real, feats_real = None, None
else:
scores_real, feats_real = D_out_real
else:
scores_fake = D_out_fake
scores_real = D_out_real
# compute losses
loss_D_dict = criterion_D(scores_fake, scores_real)
for key, value in loss_D_dict.items():
if isinstance(value, (int, float)):
loss_dict[key] = value
else:
loss_dict[key] = value.item()
step_time = time.time() - start_time
epoch_time += step_time
# update avg stats
update_eval_values = dict()
for key, value in loss_dict.items():
update_eval_values['avg_' + key] = value
update_eval_values['avg_loader_time'] = loader_time
update_eval_values['avg_step_time'] = step_time
keep_avg.update_values(update_eval_values)
# print eval stats
if c.print_eval:
c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
# compute spectrograms
figures = plot_results(y_hat, y_G, ap, global_step, 'eval')
tb_logger.tb_eval_figures(global_step, figures)
# Sample audio
sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy()
tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice},
c.audio["sample_rate"])
# synthesize a full voice
data_loader.return_segments = False
tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
return keep_avg.avg_values
# FIXME: move args definition/parsing inside of main?
def main(args): # pylint: disable=redefined-outer-name
# pylint: disable=global-variable-undefined
global train_data, eval_data
print(f" > Loading wavs from: {c.data_path}")
if c.feature_path is not None:
print(f" > Loading features from: {c.feature_path}")
eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size)
else:
eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size)
# setup audio processor
ap = AudioProcessor(**c.audio)
# DISTRIBUTED
# if num_gpus > 1:
# init_distributed(args.rank, num_gpus, args.group_id,
# c.distributed["backend"], c.distributed["url"])
# setup models
model_gen = setup_generator(c)
model_disc = setup_discriminator(c)
# setup optimizers
optimizer_gen = RAdam(model_gen.parameters(), lr=c.lr_gen, weight_decay=0)
optimizer_disc = RAdam(model_disc.parameters(),
lr=c.lr_disc,
weight_decay=0)
# schedulers
scheduler_gen = None
scheduler_disc = None
if 'lr_scheduler_gen' in c:
scheduler_gen = getattr(torch.optim.lr_scheduler, c.lr_scheduler_gen)
scheduler_gen = scheduler_gen(optimizer_gen, **c.lr_scheduler_gen_params)
if 'lr_scheduler_disc' in c:
scheduler_disc = getattr(torch.optim.lr_scheduler, c.lr_scheduler_disc)
scheduler_disc = scheduler_disc(optimizer_disc, **c.lr_scheduler_disc_params)
# setup criterion
criterion_gen = GeneratorLoss(c)
criterion_disc = DiscriminatorLoss(c)
if args.restore_path:
checkpoint = torch.load(args.restore_path, map_location='cpu')
try:
print(" > Restoring Generator Model...")
model_gen.load_state_dict(checkpoint['model'])
print(" > Restoring Generator Optimizer...")
optimizer_gen.load_state_dict(checkpoint['optimizer'])
print(" > Restoring Discriminator Model...")
model_disc.load_state_dict(checkpoint['model_disc'])
print(" > Restoring Discriminator Optimizer...")
optimizer_disc.load_state_dict(checkpoint['optimizer_disc'])
if 'scheduler' in checkpoint:
print(" > Restoring Generator LR Scheduler...")
scheduler_gen.load_state_dict(checkpoint['scheduler'])
# NOTE: Not sure if necessary
scheduler_gen.optimizer = optimizer_gen
if 'scheduler_disc' in checkpoint:
print(" > Restoring Discriminator LR Scheduler...")
scheduler_disc.load_state_dict(checkpoint['scheduler_disc'])
scheduler_disc.optimizer = optimizer_disc
except RuntimeError:
# restore only matching layers.
print(" > Partial model initialization...")
model_dict = model_gen.state_dict()
model_dict = set_init_dict(model_dict, checkpoint['model'], c)
model_gen.load_state_dict(model_dict)
model_dict = model_disc.state_dict()
model_dict = set_init_dict(model_dict, checkpoint['model_disc'], c)
model_disc.load_state_dict(model_dict)
del model_dict
# reset lr if not continuing training.
for group in optimizer_gen.param_groups:
group['lr'] = c.lr_gen
for group in optimizer_disc.param_groups:
group['lr'] = c.lr_disc
print(" > Model restored from step %d" % checkpoint['step'],
flush=True)
args.restore_step = checkpoint['step']
else:
args.restore_step = 0
if use_cuda:
model_gen.cuda()
criterion_gen.cuda()
model_disc.cuda()
criterion_disc.cuda()
# DISTRIBUTED
# if num_gpus > 1:
# model = apply_gradient_allreduce(model)
num_params = count_parameters(model_gen)
print(" > Generator has {} parameters".format(num_params), flush=True)
num_params = count_parameters(model_disc)
print(" > Discriminator has {} parameters".format(num_params), flush=True)
if 'best_loss' not in locals():
best_loss = float('inf')
global_step = args.restore_step
for epoch in range(0, c.epochs):
c_logger.print_epoch_start(epoch, c.epochs)
_, global_step = train(model_gen, criterion_gen, optimizer_gen,
model_disc, criterion_disc, optimizer_disc,
scheduler_gen, scheduler_disc, ap, global_step,
epoch)
eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, criterion_disc, ap,
global_step, epoch)
c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
target_loss = eval_avg_loss_dict[c.target_loss]
best_loss = save_best_model(target_loss,
best_loss,
model_gen,
optimizer_gen,
scheduler_gen,
model_disc,
optimizer_disc,
scheduler_disc,
global_step,
epoch,
OUT_PATH,
model_losses=eval_avg_loss_dict)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--continue_path',
type=str,
help=
'Training output folder to continue a previous training run. If it is used, "config_path" is ignored.',
default='',
required='--config_path' not in sys.argv)
parser.add_argument(
'--restore_path',
type=str,
help='Model file to be restored. Use to finetune a model.',
default='')
parser.add_argument('--config_path',
type=str,
help='Path to config file for training.',
required='--continue_path' not in sys.argv)
parser.add_argument('--debug',
type=bool,
default=False,
help='Do not verify commit integrity to run training.')
# DISTRIBUTED
parser.add_argument(
'--rank',
type=int,
default=0,
help='DISTRIBUTED: process rank for distributed training.')
parser.add_argument('--group_id',
type=str,
default="",
help='DISTRIBUTED: process group id.')
args = parser.parse_args()
if args.continue_path != '':
args.output_path = args.continue_path
args.config_path = os.path.join(args.continue_path, 'config.json')
list_of_files = glob.glob(
args.continue_path +
"/*.pth.tar") # * means all if need specific format then *.csv
latest_model_file = max(list_of_files, key=os.path.getctime)
args.restore_path = latest_model_file
print(f" > Training continues for {args.restore_path}")
# setup output paths and read configs
c = load_config(args.config_path)
check_config(c)
_ = os.path.dirname(os.path.realpath(__file__))
OUT_PATH = args.continue_path
if args.continue_path == '':
OUT_PATH = create_experiment_folder(c.output_path, c.run_name,
args.debug)
AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
c_logger = ConsoleLogger()
if args.rank == 0:
os.makedirs(AUDIO_PATH, exist_ok=True)
new_fields = {}
if args.restore_path:
new_fields["restore_path"] = args.restore_path
new_fields["github_branch"] = get_git_branch()
copy_config_file(args.config_path,
os.path.join(OUT_PATH, 'config.json'), new_fields)
os.chmod(AUDIO_PATH, 0o775)
os.chmod(OUT_PATH, 0o775)
LOG_DIR = OUT_PATH
tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER')
# write model desc to tensorboard
tb_logger.tb_add_text('model-description', c['run_description'], 0)
try:
main(args)
except KeyboardInterrupt:
remove_experiment_folder(OUT_PATH)
try:
sys.exit(0)
except SystemExit:
os._exit(0) # pylint: disable=protected-access
except Exception: # pylint: disable=broad-except
remove_experiment_folder(OUT_PATH)
traceback.print_exc()
sys.exit(1)

47
TTS/server/README.md Normal file

@ -0,0 +1,47 @@
## TTS example web-server
You'll need a model package (Zip file, includes TTS Python wheel, model files, server configuration, and optional nginx/uwsgi configs). Publicly available models are listed [here](https://github.com/mozilla/TTS/wiki/Released-Models#simple-packaging---self-contained-package-that-runs-an-http-api-for-a-pre-trained-tts-model).
Instructions below are based on an Ubuntu 18.04 machine, but it should be simple to adapt the package names to other distros if needed. Python 3.6 is recommended, as some of the dependencies' versions predate Python 3.7 and will force building from source, which requires extra dependencies and is not guaranteed to work.
#### Development server:
##### Using server.py
If you already have the TTS environment set up, you can call ```server.py``` directly.
##### Using .whl
1. apt-get install -y espeak libsndfile1 python3-venv
2. python3 -m venv /tmp/venv
3. source /tmp/venv/bin/activate
4. pip install -U pip setuptools wheel
5. pip install -U https://example.com/url/to/python/package.whl
6. python -m TTS.server.server
You can now open http://localhost:5002 in a browser, or query the HTTP API directly as in the example below.
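The example below is only an illustration using the Python standard library; it assumes the server is running locally on the default port (5002), and the output file name is arbitrary:

```python
# Query the /api/tts endpoint and save the returned WAV file.
import urllib.parse
import urllib.request

text = "Hello from the TTS server."
url = "http://localhost:5002/api/tts?text=" + urllib.parse.quote(text)

with urllib.request.urlopen(url) as response:
    wav_bytes = response.read()  # the server responds with audio/wav bytes

with open("tts_output.wav", "wb") as f:
    f.write(wav_bytes)
```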
#### Running with nginx/uwsgi:
1. apt-get install -y uwsgi uwsgi-plugin-python3 nginx espeak libsndfile1 python3-venv
2. python3 -m venv /tmp/venv
3. source /tmp/venv/bin/activate
4. pip install -U pip setuptools wheel
5. pip install -U https://example.com/url/to/python/package.whl
6. curl -LO https://github.com/reuben/TTS/releases/download/t2-ljspeech-mold/t2-ljspeech-mold-nginx-uwsgi.zip
7. unzip *-nginx-uwsgi.zip
8. cp tts_site_nginx /etc/nginx/sites-enabled/default
9. service nginx restart
10. uwsgi --ini uwsgi.ini
You can now open http://localhost:80 in a browser (edit the port in /etc/nginx/sites-enabled/tts_site_nginx).
Configure number of workers (number of requests that will be processed in parallel) by editing the `uwsgi.ini` file, specifically the `processes` setting.
#### Creating a server package with an embedded model
[setup.py](../setup.py) was extended with two new parameters when running the `bdist_wheel` command:
- `--checkpoint <path to checkpoint file>` - path to model checkpoint file you want to embed in the package
- `--model_config <path to config.json file>` - path to corresponding config.json file for the checkpoint
To create a package, run `python setup.py bdist_wheel --checkpoint /path/to/checkpoint --model_config /path/to/config.json`.
A Python `.whl` file will be created in the `dist/` folder with the checkpoint and config embedded in it.

0
TTS/server/__init__.py Normal file

16
TTS/server/conf.json Normal file

@ -0,0 +1,16 @@
{
"tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder
"tts_file":"best_model.pth.tar", // tts checkpoint file
"tts_config":"config.json", // tts config.json file
"tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
"vocoder_config":null,
"vocoder_file": null,
"wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis.
"wavernn_path":null, // wavernn model root path
"wavernn_file":null, // wavernn checkpoint file name
"wavernn_config": null, // wavernn config file
"is_wavernn_batched":true,
"port": 5002,
"use_cuda": true,
"debug": true
}

86
TTS/server/server.py Normal file

@ -0,0 +1,86 @@
#!flask/bin/python
import argparse
import os
from flask import Flask, request, render_template, send_file
from TTS.server.synthesizer import Synthesizer
def create_argparser():
def convert_boolean(x):
return x.lower() in ['true', '1', 'yes']
parser = argparse.ArgumentParser()
parser.add_argument('--tts_checkpoint', type=str, help='path to TTS checkpoint file')
parser.add_argument('--tts_config', type=str, help='path to TTS config.json file')
parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model')
parser.add_argument('--wavernn_lib_path', type=str, default=None, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.')
parser.add_argument('--wavernn_checkpoint', type=str, default=None, help='path to WaveRNN checkpoint file.')
parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.')
parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.')
parser.add_argument('--vocoder_config', type=str, default=None, help='path to TTS.vocoder config file.')
parser.add_argument('--vocoder_checkpoint', type=str, default=None, help='path to TTS.vocoder checkpoint file.')
parser.add_argument('--port', type=int, default=5002, help='port to listen on.')
parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.')
parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.')
return parser
synthesizer = None
embedded_models_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model')
embedded_tts_folder = os.path.join(embedded_models_folder, 'tts')
tts_checkpoint_file = os.path.join(embedded_tts_folder, 'checkpoint.pth.tar')
tts_config_file = os.path.join(embedded_tts_folder, 'config.json')
embedded_vocoder_folder = os.path.join(embedded_models_folder, 'vocoder')
vocoder_checkpoint_file = os.path.join(embedded_vocoder_folder, 'checkpoint.pth.tar')
vocoder_config_file = os.path.join(embedded_vocoder_folder, 'config.json')
# These models are soon to be deprecated
embedded_wavernn_folder = os.path.join(embedded_models_folder, 'wavernn')
wavernn_checkpoint_file = os.path.join(embedded_wavernn_folder, 'checkpoint.pth.tar')
wavernn_config_file = os.path.join(embedded_wavernn_folder, 'config.json')
args = create_argparser().parse_args()
# If these were not specified in the CLI args, use default values with embedded model files
if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file):
args.tts_checkpoint = tts_checkpoint_file
if not args.tts_config and os.path.isfile(tts_config_file):
args.tts_config = tts_config_file
if not args.vocoder_checkpoint and os.path.isfile(vocoder_checkpoint_file):
args.vocoder_checkpoint = vocoder_checkpoint_file
if not args.vocoder_config and os.path.isfile(vocoder_config_file):
args.vocoder_config = vocoder_config_file
if not args.wavernn_checkpoint and os.path.isfile(wavernn_checkpoint_file):
args.wavernn_checkpoint = wavernn_checkpoint_file
if not args.wavernn_config and os.path.isfile(wavernn_config_file):
args.wavernn_config = wavernn_config_file
synthesizer = Synthesizer(args)
app = Flask(__name__)
@app.route('/')
def index():
return render_template('index.html')
@app.route('/api/tts', methods=['GET'])
def tts():
text = request.args.get('text')
print(" > Model input: {}".format(text))
data = synthesizer.tts(text)
return send_file(data, mimetype='audio/wav')
def main():
app.run(debug=args.debug, host='0.0.0.0', port=args.port)
if __name__ == '__main__':
main()

194
TTS/server/synthesizer.py Normal file

@ -0,0 +1,194 @@
import io
import sys
import time
import numpy as np
import torch
import yaml
import pysbd
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.speakers import load_speaker_mapping
from TTS.vocoder.utils.generic_utils import setup_generator
# pylint: disable=unused-wildcard-import
# pylint: disable=wildcard-import
from TTS.tts.utils.synthesis import *
from TTS.tts.utils.text import make_symbols, phonemes, symbols
class Synthesizer(object):
def __init__(self, config):
self.wavernn = None
self.vocoder_model = None
self.config = config
print(config)
self.seg = self.get_segmenter("en")
self.use_cuda = self.config.use_cuda
if self.use_cuda:
assert torch.cuda.is_available(), "CUDA is not available on this machine."
self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
self.config.use_cuda)
if self.config.vocoder_checkpoint:
self.load_vocoder(self.config.vocoder_checkpoint, self.config.vocoder_config, self.config.use_cuda)
if self.config.wavernn_lib_path:
self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_checkpoint,
self.config.wavernn_config, self.config.use_cuda)
@staticmethod
def get_segmenter(lang):
return pysbd.Segmenter(language=lang, clean=True)
def load_tts(self, tts_checkpoint, tts_config, use_cuda):
# pylint: disable=global-statement
global symbols, phonemes
print(" > Loading TTS model ...")
print(" | > model config: ", tts_config)
print(" | > checkpoint file: ", tts_checkpoint)
self.tts_config = load_config(tts_config)
self.use_phonemes = self.tts_config.use_phonemes
self.ap = AudioProcessor(**self.tts_config.audio)
if 'characters' in self.tts_config.keys():
symbols, phonemes = make_symbols(**self.tts_config.characters)
if self.use_phonemes:
self.input_size = len(phonemes)
else:
self.input_size = len(symbols)
# TODO: fix this for multi-speaker model - load speakers
if self.config.tts_speakers is not None:
self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
num_speakers = len(self.tts_speakers)
else:
num_speakers = 0
self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config)
# load model state
cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
# load the model
self.tts_model.load_state_dict(cp['model'])
if use_cuda:
self.tts_model.cuda()
self.tts_model.eval()
self.tts_model.decoder.max_decoder_steps = 3000
if 'r' in cp:
self.tts_model.decoder.set_r(cp['r'])
print(f" > model reduction factor: {cp['r']}")
def load_vocoder(self, model_file, model_config, use_cuda):
self.vocoder_config = load_config(model_config)
self.vocoder_model = setup_generator(self.vocoder_config)
self.vocoder_model.load_state_dict(torch.load(model_file, map_location="cpu")["model"])
self.vocoder_model.remove_weight_norm()
self.vocoder_model.inference_padding = 0
self.vocoder_config = load_config(model_config)
if use_cuda:
self.vocoder_model.cuda()
self.vocoder_model.eval()
def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
# TODO: set a function in wavernn code base for model setup and call it here.
sys.path.append(lib_path) # set this if WaveRNN is not installed globally
#pylint: disable=import-outside-toplevel
from WaveRNN.models.wavernn import Model
print(" > Loading WaveRNN model ...")
print(" | > model config: ", model_config)
print(" | > model file: ", model_file)
self.wavernn_config = load_config(model_config)
# This is the default architecture we use for our models.
# You might need to update it
self.wavernn = Model(
rnn_dims=512,
fc_dims=512,
mode=self.wavernn_config.mode,
mulaw=self.wavernn_config.mulaw,
pad=self.wavernn_config.pad,
use_aux_net=self.wavernn_config.use_aux_net,
use_upsample_net=self.wavernn_config.use_upsample_net,
upsample_factors=self.wavernn_config.upsample_factors,
feat_dims=80,
compute_dims=128,
res_out_dims=128,
res_blocks=10,
hop_length=self.ap.hop_length,
sample_rate=self.ap.sample_rate,
).cuda()
check = torch.load(model_file, map_location="cpu")
self.wavernn.load_state_dict(check['model'])
if use_cuda:
self.wavernn.cuda()
self.wavernn.eval()
def save_wav(self, wav, path):
# wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
wav = np.array(wav)
self.ap.save_wav(wav, path)
def split_into_sentences(self, text):
return self.seg.segment(text)
def tts(self, text, speaker_id=None):
start_time = time.time()
wavs = []
sens = self.split_into_sentences(text)
print(sens)
speaker_id = id_to_torch(speaker_id)
if speaker_id is not None and self.use_cuda:
speaker_id = speaker_id.cuda()
for sen in sens:
# preprocess the given text
inputs = text_to_seqvec(sen, self.tts_config)
inputs = numpy_to_torch(inputs, torch.long, cuda=self.use_cuda)
inputs = inputs.unsqueeze(0)
# synthesize voice
_, postnet_output, _, _ = run_model_torch(self.tts_model, inputs, self.tts_config, False, speaker_id, None)
if self.vocoder_model:
# use native vocoder model
vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
wav = self.vocoder_model.inference(vocoder_input)
if self.use_cuda:
wav = wav.cpu().numpy()
else:
wav = wav.numpy()
wav = wav.flatten()
elif self.wavernn:
# use 3rd party WaveRNN
vocoder_input = None
if self.tts_config.model == "Tacotron":
vocoder_input = torch.FloatTensor(self.ap.out_linear_to_mel(linear_spec=postnet_output.T).T).T.unsqueeze(0)
else:
vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
if self.use_cuda:
vocoder_input = vocoder_input.cuda()
wav = self.wavernn.generate(vocoder_input, batched=self.config.is_wavernn_batched, target=11000, overlap=550)
else:
# use GL
if self.use_cuda:
postnet_output = postnet_output[0].cpu()
else:
postnet_output = postnet_output[0]
postnet_output = postnet_output.numpy()
wav = inv_spectrogram(postnet_output, self.ap, self.tts_config)
# trim silence
wav = trim_silence(wav, self.ap)
wavs += list(wav)
wavs += [0] * 10000
out = io.BytesIO()
self.save_wav(wavs, out)
# compute stats
process_time = time.time() - start_time
audio_time = len(wavs) / self.tts_config.audio['sample_rate']
print(f" > Processing time: {process_time}")
print(f" > Real-time factor: {process_time / audio_time}")
return out

TTS/server/templates/index.html Normal file

@ -0,0 +1,111 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<meta name="description" content="">
<meta name="author" content="">
<title>Mozilla - Text2Speech engine</title>
<!-- Bootstrap core CSS -->
<link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous" rel="stylesheet">
<!-- Custom styles for this template -->
<style>
body {
padding-top: 54px;
}
@media (min-width: 992px) {
body {
padding-top: 56px;
}
}
</style>
</head>
<body>
<a href="https://github.com/mozilla/TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;" src="https://s3.amazonaws.com/github/ribbons/forkme_left_darkblue_121621.png" alt="Fork me on GitHub"></a>
<!-- Navigation -->
<!--
<nav class="navbar navbar-expand-lg navbar-dark bg-dark fixed-top">
<div class="container">
<a class="navbar-brand" href="#">Mozilla TTS</a>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarResponsive" aria-controls="navbarResponsive" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarResponsive">
<ul class="navbar-nav ml-auto">
<li class="nav-item active">
<a class="nav-link" href="#">Home
<span class="sr-only">(current)</span>
</a>
</li>
</ul>
</div>
</div>
</nav>
-->
<!-- Page Content -->
<div class="container">
<div class="row">
<div class="col-lg-12 text-center">
<img class="mt-5" src="https://user-images.githubusercontent.com/1402048/52643646-c2102980-2edd-11e9-8c37-b72f3c89a640.png" alt=></img>
<h1 class="mt-5">Mozilla TTS</h1>
<ul class="list-unstyled">
</ul>
<input id="text" placeholder="Type here..." size=45 type="text" name="text">
<button id="speak-button" name="speak">Speak</button><br/><br/>
<audio id="audio" controls autoplay hidden></audio>
<p id="message"></p>
</div>
</div>
</div>
<!-- Bootstrap core JavaScript -->
<script>
function q(selector) {return document.querySelector(selector)}
q('#text').focus()
function do_tts(e) {
text = q('#text').value
if (text) {
q('#message').textContent = 'Synthesizing...'
q('#speak-button').disabled = true
q('#audio').hidden = true
synthesize(text)
}
e.preventDefault()
return false
}
q('#speak-button').addEventListener('click', do_tts)
q('#text').addEventListener('keyup', function(e) {
if (e.keyCode == 13) { // enter
do_tts(e)
}
})
function synthesize(text) {
fetch('/api/tts?text=' + encodeURIComponent(text), {cache: 'no-cache'})
.then(function(res) {
if (!res.ok) throw Error(res.statusText)
return res.blob()
}).then(function(blob) {
q('#message').textContent = ''
q('#speak-button').disabled = false
q('#audio').src = URL.createObjectURL(blob)
q('#audio').hidden = false
}).catch(function(err) {
q('#message').textContent = 'Error: ' + err.message
q('#speak-button').disabled = false
})
}
</script>
</body>
</html>

TTS/speaker_encoder/README.md Normal file

@ -0,0 +1,18 @@
### Speaker Encoder
This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
![](umap.png)
Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
To run the code, you need to follow the same flow as in TTS.
- Define 'config.json' for your needs. Note that the audio parameters should match those of your TTS model.
- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth.tar model/config/path/config.json dataset/path/ output_path```. This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files (see the minimal sketch after this list for embedding a single file).
- Watch training on Tensorboard as in TTS
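A single utterance can also be embedded programmatically. The following is a minimal sketch, not part of the training flow; the paths are placeholders, and the imports and config keys mirror `compute_embeddings.py` and `config.json` in this folder:

```python
# Compute one d-vector with a pretrained speaker encoder (illustrative sketch).
import torch
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.tts.utils.audio import AudioProcessor
from TTS.tts.utils.generic_utils import load_config

c = load_config("speaker_encoder/config.json")   # placeholder config path
ap = AudioProcessor(**c["audio"])

model = SpeakerEncoder(**c.model)
model.load_state_dict(torch.load("best_model.pth.tar", map_location="cpu")["model"])
model.eval()

# mel spectrogram shaped 1 x T x num_mels, as expected by compute_embedding()
mel = ap.melspectrogram(ap.load_wav("sample.wav")).T   # placeholder wav path
mel = torch.FloatTensor(mel[None, :, :])
embedding = model.compute_embedding(mel).detach().cpu().numpy()
print(embedding.shape)   # (1, proj_dim), e.g. (1, 128) with the provided config
```

With the default arguments, `compute_embedding` averages the d-vectors of sliding windows of 160 frames with 50% overlap, so utterances of different lengths all map to a single fixed-size vector.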

TTS/speaker_encoder/compute_embeddings.py Normal file

@ -0,0 +1,88 @@
import argparse
import glob
import os
import numpy as np
from tqdm import tqdm
import torch
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.tts.utils.audio import AudioProcessor
from TTS.tts.utils.generic_utils import load_config
parser = argparse.ArgumentParser(
description='Compute embedding vectors for each wav file in a dataset. ')
parser.add_argument(
'model_path',
type=str,
help='Path to model outputs (checkpoint, tensorboard etc.).')
parser.add_argument(
'config_path',
type=str,
help='Path to config file for training.',
)
parser.add_argument(
'data_path',
type=str,
help='Data path for wav files - directory or CSV file')
parser.add_argument(
'output_path',
type=str,
help='path for training outputs.')
parser.add_argument(
'--use_cuda', type=bool, help='flag to set cuda.', default=False
)
parser.add_argument(
'--separator', type=str, help='Separator used in file if CSV is passed for data_path', default='|'
)
args = parser.parse_args()
c = load_config(args.config_path)
ap = AudioProcessor(**c['audio'])
data_path = args.data_path
split_ext = os.path.splitext(data_path)
sep = args.separator
if len(split_ext) > 0 and split_ext[1].lower() == '.csv':
# Parse CSV
print(f'CSV file: {data_path}')
with open(data_path) as f:
wav_path = os.path.join(os.path.dirname(data_path), 'wavs')
wav_files = []
print(f'Separator is: {sep}')
for line in f:
components = line.split(sep)
if len(components) != 2:
print("Invalid line")
continue
wav_file = os.path.join(wav_path, components[0] + '.wav')
#print(f'wav_file: {wav_file}')
if os.path.exists(wav_file):
wav_files.append(wav_file)
print(f'Count of wavs imported: {len(wav_files)}')
else:
# Parse all wav files in data_path
wav_path = data_path
wav_files = glob.glob(data_path + '/**/*.wav', recursive=True)
output_files = [wav_file.replace(wav_path, args.output_path).replace(
'.wav', '.npy') for wav_file in wav_files]
for output_file in output_files:
os.makedirs(os.path.dirname(output_file), exist_ok=True)
model = SpeakerEncoder(**c.model)
model.load_state_dict(torch.load(args.model_path)['model'])
model.eval()
if args.use_cuda:
model.cuda()
for idx, wav_file in enumerate(tqdm(wav_files)):
mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T
mel_spec = torch.FloatTensor(mel_spec[None, :, :])
if args.use_cuda:
mel_spec = mel_spec.cuda()
embedd = model.compute_embedding(mel_spec)
np.save(output_files[idx], embedd.detach().cpu().numpy())

TTS/speaker_encoder/config.json Normal file

@ -0,0 +1,59 @@
{
"run_name": "libritts_360-half",
"run_description": "train speaker encoder for libritts 360",
"audio": {
// Audio processing parameters
"num_mels": 40, // size of the mel spec frame.
"num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"frame_length_ms": 50, // stft window length in ms.
"frame_shift_ms": 12.5, // stft window hop-lengh in ms.
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"min_level_db": -100, // normalization range
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
// Normalization parameters
"signal_norm": true, // normalize the spec values in range [0, 1]
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"do_trim_silence": false // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
},
"reinit_layers": [],
"grad_clip": 3.0, // upper limit for gradients for clipping.
"epochs": 1000, // total number of epochs to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
"steps_plot_stats": 10, // number of steps to plot embeddings.
"num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"wd": 0.000001, // Weight decay weight.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
"print_step": 1, // Number of steps to log traning on console.
"output_path": "/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
"num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"model": {
"input_dim": 40,
"proj_dim": 128,
"lstm_dim": 384,
"num_lstm_layers": 3
},
"datasets":
[
{
"name": "libri_tts",
"path": "/home/erogol/Data/Libri-TTS/train-clean-360/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "libri_tts",
"path": "/home/erogol/Data/Libri-TTS/train-clean-100/",
"meta_file_train": null,
"meta_file_val": null
}
]
}


@ -0,0 +1,123 @@
import numpy as np
import torch
import random
from torch.utils.data import Dataset
class MyDataset(Dataset):
def __init__(self, ap, meta_data, voice_len=1.6, num_speakers_in_batch=64,
num_utter_per_speaker=10, skip_speakers=False, verbose=False):
"""
Args:
ap (TTS.tts.utils.AudioProcessor): audio processor object.
meta_data (list): list of dataset instances.
voice_len (float): voice segment length in seconds.
verbose (bool): print diagnostic information.
"""
self.items = meta_data
self.sample_rate = ap.sample_rate
self.voice_len = voice_len
self.seq_len = int(voice_len * self.sample_rate)
self.num_speakers_in_batch = num_speakers_in_batch
self.num_utter_per_speaker = num_utter_per_speaker
self.skip_speakers = skip_speakers
self.ap = ap
self.verbose = verbose
self.__parse_items()
if self.verbose:
print("\n > DataLoader initialization")
print(f" | > Number of instances : {len(self.items)}")
print(f" | > Sequence length: {self.seq_len}")
print(f" | > Num speakers: {len(self.speakers)}")
def load_wav(self, filename):
audio = self.ap.load_wav(filename)
return audio
def load_data(self, idx):
text, wav_file, speaker_name = self.items[idx]
wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
mel = self.ap.melspectrogram(wav).astype("float32")
# sample seq_len
assert text.size > 0, self.items[idx][1]
assert wav.size > 0, self.items[idx][1]
sample = {
"mel": mel,
"item_idx": self.items[idx][1],
"speaker_name": speaker_name,
}
return sample
def __parse_items(self):
"""
Find unique speaker ids and create a dict mapping utterances from speaker id
"""
speakers = list({item[-1] for item in self.items})
self.speaker_to_utters = {}
self.speakers = []
for speaker in speakers:
speaker_utters = [item[1] for item in self.items if item[2] == speaker]
if len(speaker_utters) < self.num_utter_per_speaker and self.skip_speakers:
print(
f" [!] Skipped speaker {speaker}. Not enough utterances {self.num_utter_per_speaker} vs {len(speaker_utters)}."
)
else:
self.speakers.append(speaker)
self.speaker_to_utters[speaker] = speaker_utters
def __len__(self):
return int(1e10)
def __sample_speaker(self):
speaker = random.sample(self.speakers, 1)[0]
if self.num_utter_per_speaker > len(self.speaker_to_utters[speaker]):
utters = random.choices(
self.speaker_to_utters[speaker], k=self.num_utter_per_speaker
)
else:
utters = random.sample(
self.speaker_to_utters[speaker], self.num_utter_per_speaker
)
return speaker, utters
def __sample_speaker_utterances(self, speaker):
"""
Sample all M utterances for the given speaker.
"""
feats = []
labels = []
for _ in range(self.num_utter_per_speaker):
# TODO:dummy but works
while True:
if len(self.speaker_to_utters[speaker]) > 0:
utter = random.sample(self.speaker_to_utters[speaker], 1)[0]
else:
self.speakers.remove(speaker)
speaker, _ = self.__sample_speaker()
continue
wav = self.load_wav(utter)
if wav.shape[0] - self.seq_len > 0:
break
self.speaker_to_utters[speaker].remove(utter)
offset = random.randint(0, wav.shape[0] - self.seq_len)
mel = self.ap.melspectrogram(wav[offset : offset + self.seq_len])
feats.append(torch.FloatTensor(mel))
labels.append(speaker)
return feats, labels
def __getitem__(self, idx):
speaker, _ = self.__sample_speaker()
return speaker
def collate_fn(self, batch):
labels = []
feats = []
for speaker in batch:
feats_, labels_ = self.__sample_speaker_utterances(speaker)
labels.append(labels_)
feats.extend(feats_)
feats = torch.stack(feats)
return feats.transpose(1, 2), labels


@ -0,0 +1,41 @@
import os
import datetime
import torch
def save_checkpoint(model, optimizer, model_loss, out_path,
current_step, epoch):
checkpoint_path = 'checkpoint_{}.pth.tar'.format(current_step)
checkpoint_path = os.path.join(out_path, checkpoint_path)
print(" | | > Checkpoint saving : {}".format(checkpoint_path))
new_state_dict = model.state_dict()
state = {
'model': new_state_dict,
'optimizer': optimizer.state_dict() if optimizer is not None else None,
'step': current_step,
'epoch': epoch,
'GE2Eloss': model_loss,
'date': datetime.date.today().strftime("%B %d, %Y"),
}
torch.save(state, checkpoint_path)
def save_best_model(model, optimizer, model_loss, best_loss, out_path,
current_step):
if model_loss < best_loss:
new_state_dict = model.state_dict()
state = {
'model': new_state_dict,
'optimizer': optimizer.state_dict(),
'step': current_step,
'GE2Eloss': model_loss,
'date': datetime.date.today().strftime("%B %d, %Y"),
}
best_loss = model_loss
bestmodel_path = 'best_model.pth.tar'
bestmodel_path = os.path.join(out_path, bestmodel_path)
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(
model_loss, bestmodel_path))
torch.save(state, bestmodel_path)
return best_loss

121
TTS/speaker_encoder/loss.py Normal file

@ -0,0 +1,121 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
# adapted from https://github.com/cvqluu/GE2E-Loss
class GE2ELoss(nn.Module):
def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
"""
Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
Accepts an input of size (N, M, D)
where N is the number of speakers in the batch,
M is the number of utterances per speaker,
and D is the dimensionality of the embedding vector (e.g. d-vector)
Args:
- init_w (float): defines the initial value of w in Equation (5) of [1]
- init_b (float): defines the initial value of b in Equation (5) of [1]
"""
super(GE2ELoss, self).__init__()
# pylint: disable=E1102
self.w = nn.Parameter(torch.tensor(init_w))
# pylint: disable=E1102
self.b = nn.Parameter(torch.tensor(init_b))
self.loss_method = loss_method
assert self.loss_method in ["softmax", "contrast"]
if self.loss_method == "softmax":
self.embed_loss = self.embed_loss_softmax
if self.loss_method == "contrast":
self.embed_loss = self.embed_loss_contrast
# pylint: disable=R0201
def calc_new_centroids(self, dvecs, centroids, spkr, utt):
"""
Calculates the new centroids excluding the reference utterance
"""
excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :]))
excl = torch.mean(excl, 0)
new_centroids = []
for i, centroid in enumerate(centroids):
if i == spkr:
new_centroids.append(excl)
else:
new_centroids.append(centroid)
return torch.stack(new_centroids)
def calc_cosine_sim(self, dvecs, centroids):
"""
Make the cosine similarity matrix with dims (N,M,N)
"""
cos_sim_matrix = []
for spkr_idx, speaker in enumerate(dvecs):
cs_row = []
for utt_idx, utterance in enumerate(speaker):
new_centroids = self.calc_new_centroids(
dvecs, centroids, spkr_idx, utt_idx
)
# vector based cosine similarity for speed
cs_row.append(
torch.clamp(
torch.mm(
utterance.unsqueeze(1).transpose(0, 1),
new_centroids.transpose(0, 1),
)
/ (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
1e-6,
)
)
cs_row = torch.cat(cs_row, dim=0)
cos_sim_matrix.append(cs_row)
return torch.stack(cos_sim_matrix)
# pylint: disable=R0201
def embed_loss_softmax(self, dvecs, cos_sim_matrix):
"""
Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
"""
N, M, _ = dvecs.shape
L = []
for j in range(N):
L_row = []
for i in range(M):
L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j])
L_row = torch.stack(L_row)
L.append(L_row)
return torch.stack(L)
# pylint: disable=R0201
def embed_loss_contrast(self, dvecs, cos_sim_matrix):
"""
Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
"""
N, M, _ = dvecs.shape
L = []
for j in range(N):
L_row = []
for i in range(M):
centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
excl_centroids_sigmoids = torch.cat(
(centroids_sigmoids[:j], centroids_sigmoids[j + 1 :])
)
L_row.append(
1.0
- torch.sigmoid(cos_sim_matrix[j, i, j])
+ torch.max(excl_centroids_sigmoids)
)
L_row = torch.stack(L_row)
L.append(L_row)
return torch.stack(L)
def forward(self, dvecs):
"""
Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
"""
centroids = torch.mean(dvecs, 1)
cos_sim_matrix = self.calc_cosine_sim(dvecs, centroids)
torch.clamp(self.w, 1e-6)
cos_sim_matrix = self.w * cos_sim_matrix + self.b
L = self.embed_loss(dvecs, cos_sim_matrix)
return L.mean()

TTS/speaker_encoder/model.py Normal file

@ -0,0 +1,88 @@
import torch
from torch import nn
class LSTMWithProjection(nn.Module):
def __init__(self, input_size, hidden_size, proj_size):
super().__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.proj_size = proj_size
self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
self.linear = nn.Linear(hidden_size, proj_size, bias=False)
def forward(self, x):
self.lstm.flatten_parameters()
o, (_, _) = self.lstm(x)
return self.linear(o)
class SpeakerEncoder(nn.Module):
def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3):
super().__init__()
layers = []
layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
for _ in range(num_lstm_layers - 1):
layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
self.layers = nn.Sequential(*layers)
self._init_layers()
def _init_layers(self):
for name, param in self.layers.named_parameters():
if "bias" in name:
nn.init.constant_(param, 0.0)
elif "weight" in name:
nn.init.xavier_normal_(param)
def forward(self, x):
# TODO: implement state passing for lstms
d = self.layers(x)
d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
return d
def inference(self, x):
d = self.layers.forward(x)
d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
return d
def compute_embedding(self, x, num_frames=160, overlap=0.5):
"""
Generate embeddings for a batch of utterances
x: 1xTxD
"""
num_overlap = int(num_frames * overlap)
max_len = x.shape[1]
embed = None
cur_iter = 0
for offset in range(0, max_len, num_frames - num_overlap):
cur_iter += 1
end_offset = min(x.shape[1], offset + num_frames)
frames = x[:, offset:end_offset]
if embed is None:
embed = self.inference(frames)
else:
embed += self.inference(frames)
return embed / cur_iter
def batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5):
"""
Generate embeddings for a batch of utterances
x: BxTxD
"""
num_overlap = int(num_frames * overlap)
max_len = x.shape[1]
embed = None
num_iters = seq_lens / (num_frames - num_overlap)
cur_iter = 0
for offset in range(0, max_len, num_frames - num_overlap):
cur_iter += 1
end_offset = min(x.shape[1], offset + num_frames)
frames = x[:, offset:end_offset]
if embed is None:
embed = self.inference(frames)
else:
embed[cur_iter <= num_iters, :] += self.inference(
frames[cur_iter <= num_iters, :, :]
)
return embed / num_iters


@ -0,0 +1,2 @@
umap-learn
numpy>=1.17.0

Binary data
TTS/speaker_encoder/umap.png Normal file

Binary file not shown. Size: 24 KiB


@ -0,0 +1,46 @@
import umap
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
matplotlib.use("Agg")
colormap = (
np.array(
[
[76, 255, 0],
[0, 127, 70],
[255, 0, 0],
[255, 217, 38],
[0, 135, 255],
[165, 0, 165],
[255, 167, 255],
[0, 255, 255],
[255, 96, 38],
[142, 76, 0],
[33, 0, 127],
[0, 0, 0],
[183, 183, 183],
],
dtype=np.float,
)
/ 255
)
def plot_embeddings(embeddings, num_utter_per_speaker):
embeddings = embeddings[: 10 * num_utter_per_speaker]
model = umap.UMAP()
projection = model.fit_transform(embeddings)
num_speakers = embeddings.shape[0] // num_utter_per_speaker
ground_truth = np.repeat(np.arange(num_speakers), num_utter_per_speaker)
colors = [colormap[i] for i in ground_truth]
fig, ax = plt.subplots(figsize=(16, 10))
_ = ax.scatter(projection[:, 0], projection[:, 1], c=colors)
plt.gca().set_aspect("equal", "datalim")
plt.title("UMAP projection")
plt.tight_layout()
plt.savefig("umap")
return fig

0
TTS/tts/__init__.py Normal file

149
TTS/tts/configs/config.json Normal file

@ -0,0 +1,149 @@
{
"model": "Tacotron2",
"run_name": "ljspeech-ddc-bn",
"run_description": "tacotron2 with ddc and batch-normalization",
// AUDIO PARAMETERS
"audio":{
// stft parameters
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"win_length": 1024, // stft window length in ms.
"hop_length": 256, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
// Audio processing parameters
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
// Silence trimming
"do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
// Griffin-Lim
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// MelSpectrogram parameters
"num_mels": 80, // size of the mel spec frame.
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 20.0,
// Normalization parameters
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db": -100, // lower bound for normalization
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
// VOCABULARY PARAMETERS
// if custom character set is not defined,
// default set in symbols.py is used
// "characters":{
// "pad": "_",
// "eos": "~",
// "bos": "^",
// "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
// "punctuations":"!'(),-.:;? ",
// "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
// },
// DISTRIBUTED TRAINING
"distributed":{
"backend": "nccl",
"url": "tcp:\/\/localhost:54321"
},
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// TRAINING
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"eval_batch_size":16,
"r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed.
"loss_masking": true, // enable / disable loss masking against the sequence padding.
"ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled.
// VALIDATION
"run_eval": true,
"test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
// OPTIMIZER
"noam_schedule": false, // use noam warmup and lr schedule.
"grad_clip": 1.0, // upper limit for gradients for clipping.
"epochs": 1000, // total number of epochs to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"wd": 0.000001, // Weight decay weight.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
// TACOTRON PRENET
"memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
"prenet_type": "bn", // "original" or "bn".
"prenet_dropout": false, // enable/disable dropout at prenet.
// TACOTRON ATTENTION
"attention_type": "original", // 'original' or 'graves'
"attention_heads": 4, // number of attention heads (only for 'graves')
"attention_norm": "sigmoid", // softmax or sigmoid.
"windowing": false, // Enables attention windowing. Used only in eval mode.
"use_forward_attn": false, // if it uses forward attention. In general, it aligns faster.
"forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode.
"transition_agent": false, // enable/disable transition agent of forward attention.
"location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
"bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
"double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
"ddc_r": 7, // reduction rate for coarse decoder.
// STOPNET
"stopnet": true, // Train stopnet predicting the end of synthesis.
"separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
// TENSORBOARD and LOGGING
"print_step": 25, // Number of steps to log training on console.
"tb_plot_step": 100, // Number of steps to plot TB training figures.
"print_eval": false, // If True, it prints intermediate loss values in evalulation.
"save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
// DATA LOADING
"text_cleaner": "phoneme_cleaners",
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 4, // number of evaluation data loader processes.
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 153, // DATASET-RELATED: maximum text length
// PATHS
"output_path": "/home/erogol/Models/LJSpeech/",
// PHONEMES
"phoneme_cache_path": "/media/erogol/data_ssd2/mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
// MULTI-SPEAKER and GST
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
"style_wav_for_test": null, // path to style wav file to be used in TacotronGST inference.
"use_gst": false, // TACOTRON ONLY: use global style tokens
// DATASETS
"datasets": // List of datasets. They all merged and they get different speaker_ids.
[
{
"name": "ljspeech",
"path": "/home/erogol/Data/LJSpeech-1.1/",
"meta_file_train": "metadata.csv",
"meta_file_val": null
}
]
}
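The `gradual_training` entries above are `[first_step, r, batch_size]` triplets: once training passes `first_step`, the decoder reduction factor `r` and the batch size switch to the listed values. A minimal lookup sketch (hypothetical helper, not the trainer's actual code):

```python
def lookup_gradual_training(step, schedule):
    """Return (r, batch_size) for the given global step.

    `schedule` is a list of [first_step, r, batch_size] entries, e.g.
    [[0, 7, 64], [50000, 3, 32], [130000, 2, 32]].
    """
    r, batch_size = schedule[0][1], schedule[0][2]
    for first_step, new_r, new_bs in schedule:
        if step >= first_step:
            r, batch_size = new_r, new_bs
    return r, batch_size


print(lookup_gradual_training(60000, [[0, 7, 64], [50000, 3, 32], [130000, 2, 32]]))
# -> (3, 32)
```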


@ -0,0 +1,240 @@
import os
import numpy as np
import collections
import torch
import random
from torch.utils.data import Dataset
from TTS.tts.utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos
from TTS.tts.utils.data import prepare_data, prepare_tensor, prepare_stop_target
class MyDataset(Dataset):
def __init__(self,
outputs_per_step,
text_cleaner,
compute_linear_spec,
ap,
meta_data,
tp=None,
batch_group_size=0,
min_seq_len=0,
max_seq_len=float("inf"),
use_phonemes=True,
phoneme_cache_path=None,
phoneme_language="en-us",
enable_eos_bos=False,
verbose=False):
"""
Args:
outputs_per_step (int): number of time frames predicted per step.
text_cleaner (str): text cleaner used for the dataset.
compute_linear_spec (bool): compute linear spectrogram if True.
ap (TTS.tts.utils.AudioProcessor): audio processor object.
meta_data (list): list of dataset instances.
batch_group_size (int): (0) range of batch randomization after sorting
sequences by length.
min_seq_len (int): (0) minimum sequence length to be processed
by the loader.
max_seq_len (int): (float("inf")) maximum sequence length.
use_phonemes (bool): (True) if True, text is converted to phonemes.
phoneme_cache_path (str): path to cache phoneme features.
phoneme_language (str): one of the languages from
https://github.com/bootphon/phonemizer#languages
enable_eos_bos (bool): enable beginning-of-sentence and end-of-sentence characters.
verbose (bool): print diagnostic information.
"""
self.batch_group_size = batch_group_size
self.items = meta_data
self.outputs_per_step = outputs_per_step
self.sample_rate = ap.sample_rate
self.cleaners = text_cleaner
self.compute_linear_spec = compute_linear_spec
self.min_seq_len = min_seq_len
self.max_seq_len = max_seq_len
self.ap = ap
self.tp = tp
self.use_phonemes = use_phonemes
self.phoneme_cache_path = phoneme_cache_path
self.phoneme_language = phoneme_language
self.enable_eos_bos = enable_eos_bos
self.verbose = verbose
if use_phonemes and not os.path.isdir(phoneme_cache_path):
os.makedirs(phoneme_cache_path, exist_ok=True)
if self.verbose:
print("\n > DataLoader initialization")
print(" | > Use phonemes: {}".format(self.use_phonemes))
if use_phonemes:
print(" | > phoneme language: {}".format(phoneme_language))
print(" | > Number of instances : {}".format(len(self.items)))
self.sort_items()
def load_wav(self, filename):
audio = self.ap.load_wav(filename)
return audio
@staticmethod
def load_np(filename):
data = np.load(filename).astype('float32')
return data
def _generate_and_cache_phoneme_sequence(self, text, cache_path):
"""generate a phoneme sequence from text.
since the usage is for subsequent caching, we never add bos and
eos chars here. Instead we add those dynamically later; based on the
config option."""
phonemes = phoneme_to_sequence(text, [self.cleaners],
language=self.phoneme_language,
enable_eos_bos=False,
tp=self.tp)
phonemes = np.asarray(phonemes, dtype=np.int32)
np.save(cache_path, phonemes)
return phonemes
def _load_or_generate_phoneme_sequence(self, wav_file, text):
file_name = os.path.splitext(os.path.basename(wav_file))[0]
cache_path = os.path.join(self.phoneme_cache_path,
file_name + '_phoneme.npy')
try:
phonemes = np.load(cache_path)
except FileNotFoundError:
phonemes = self._generate_and_cache_phoneme_sequence(text,
cache_path)
except (ValueError, IOError):
print(" > ERROR: failed loading phonemes for {}. "
"Recomputing.".format(wav_file))
phonemes = self._generate_and_cache_phoneme_sequence(text,
cache_path)
if self.enable_eos_bos:
phonemes = pad_with_eos_bos(phonemes, tp=self.tp)
phonemes = np.asarray(phonemes, dtype=np.int32)
return phonemes
def load_data(self, idx):
text, wav_file, speaker_name = self.items[idx]
wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
if self.use_phonemes:
text = self._load_or_generate_phoneme_sequence(wav_file, text)
else:
text = np.asarray(
text_to_sequence(text, [self.cleaners], tp=self.tp), dtype=np.int32)
assert text.size > 0, self.items[idx][1]
assert wav.size > 0, self.items[idx][1]
sample = {
'text': text,
'wav': wav,
'item_idx': self.items[idx][1],
'speaker_name': speaker_name
}
return sample
def sort_items(self):
r"""Sort instances based on text length in ascending order"""
lengths = np.array([len(ins[0]) for ins in self.items])
idxs = np.argsort(lengths)
new_items = []
ignored = []
for i, idx in enumerate(idxs):
length = lengths[idx]
if length < self.min_seq_len or length > self.max_seq_len:
ignored.append(idx)
else:
new_items.append(self.items[idx])
# shuffle batch groups
if self.batch_group_size > 0:
for i in range(len(new_items) // self.batch_group_size):
offset = i * self.batch_group_size
end_offset = offset + self.batch_group_size
temp_items = new_items[offset:end_offset]
random.shuffle(temp_items)
new_items[offset:end_offset] = temp_items
self.items = new_items
if self.verbose:
print(" | > Max length sequence: {}".format(np.max(lengths)))
print(" | > Min length sequence: {}".format(np.min(lengths)))
print(" | > Avg length sequence: {}".format(np.mean(lengths)))
print(" | > Num. instances discarded by max-min (max={}, min={}) seq limits: {}".format(
self.max_seq_len, self.min_seq_len, len(ignored)))
print(" | > Batch group size: {}.".format(self.batch_group_size))
def __len__(self):
return len(self.items)
def __getitem__(self, idx):
return self.load_data(idx)
def collate_fn(self, batch):
r"""
Perform preprocessing and create a final data batch:
1. Sort batch instances by text length.
2. Convert audio signals to spectrograms.
3. Pad sequences with respect to the reduction factor r.
4. Convert everything to PyTorch tensors.
"""
# Puts each data field into a tensor with outer dimension batch size
if isinstance(batch[0], collections.Mapping):
text_lenghts = np.array([len(d["text"]) for d in batch])
# sort items with text input length for RNN efficiency
text_lenghts, ids_sorted_decreasing = torch.sort(
torch.LongTensor(text_lenghts), dim=0, descending=True)
wav = [batch[idx]['wav'] for idx in ids_sorted_decreasing]
item_idxs = [
batch[idx]['item_idx'] for idx in ids_sorted_decreasing
]
text = [batch[idx]['text'] for idx in ids_sorted_decreasing]
speaker_name = [batch[idx]['speaker_name']
for idx in ids_sorted_decreasing]
# compute features
mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
mel_lengths = [m.shape[1] for m in mel]
# compute 'stop token' targets
stop_targets = [
np.array([0.] * (mel_len - 1) + [1.]) for mel_len in mel_lengths
]
# PAD stop targets
stop_targets = prepare_stop_target(stop_targets,
self.outputs_per_step)
# PAD sequences with longest instance in the batch
text = prepare_data(text).astype(np.int32)
# PAD features with longest instance
mel = prepare_tensor(mel, self.outputs_per_step)
# B x D x T --> B x T x D
mel = mel.transpose(0, 2, 1)
# convert things to pytorch
text_lenghts = torch.LongTensor(text_lenghts)
text = torch.LongTensor(text)
mel = torch.FloatTensor(mel).contiguous()
mel_lengths = torch.LongTensor(mel_lengths)
stop_targets = torch.FloatTensor(stop_targets)
# compute linear spectrogram
if self.compute_linear_spec:
linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
linear = prepare_tensor(linear, self.outputs_per_step)
linear = linear.transpose(0, 2, 1)
assert mel.shape[1] == linear.shape[1]
linear = torch.FloatTensor(linear).contiguous()
else:
linear = None
return text, text_lenghts, speaker_name, linear, mel, mel_lengths, \
stop_targets, item_idxs
raise TypeError(("batch must contain tensors, numbers, dicts or lists;\
found {}".format(type(batch[0]))))


@ -0,0 +1,207 @@
import os
from glob import glob
import re
import sys
from TTS.tts.utils.generic_utils import split_dataset
def load_meta_data(datasets):
meta_data_train_all = []
meta_data_eval_all = []
for dataset in datasets:
name = dataset['name']
root_path = dataset['path']
meta_file_train = dataset['meta_file_train']
meta_file_val = dataset['meta_file_val']
preprocessor = get_preprocessor_by_name(name)
meta_data_train = preprocessor(root_path, meta_file_train)
if meta_file_val is None:
meta_data_eval, meta_data_train = split_dataset(meta_data_train)
else:
meta_data_eval = preprocessor(root_path, meta_file_val)
meta_data_train_all += meta_data_train
meta_data_eval_all += meta_data_eval
return meta_data_train_all, meta_data_eval_all
def get_preprocessor_by_name(name):
"""Returns the respective preprocessing function."""
thismodule = sys.modules[__name__]
return getattr(thismodule, name.lower())
def tweb(root_path, meta_file):
"""Normalize TWEB dataset.
https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset
"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "tweb"
with open(txt_file, 'r') as ttf:
for line in ttf:
cols = line.split('\t')
wav_file = os.path.join(root_path, cols[0] + '.wav')
text = cols[1]
items.append([text, wav_file, speaker_name])
return items
# def kusal(root_path, meta_file):
# txt_file = os.path.join(root_path, meta_file)
# texts = []
# wavs = []
# with open(txt_file, "r", encoding="utf8") as f:
# frames = [
# line.split('\t') for line in f
# if line.split('\t')[0] in self.wav_files_dict.keys()
# ]
# # TODO: code the rest
# return {'text': texts, 'wavs': wavs}
def mozilla(root_path, meta_file):
"""Normalizes Mozilla meta data files to TTS format"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "mozilla"
with open(txt_file, 'r') as ttf:
for line in ttf:
cols = line.split('|')
wav_file = cols[1].strip()
text = cols[0].strip()
wav_file = os.path.join(root_path, "wavs", wav_file)
items.append([text, wav_file, speaker_name])
return items
def mozilla_de(root_path, meta_file):
"""Normalizes Mozilla meta data files to TTS format"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "mozilla"
with open(txt_file, 'r', encoding="ISO 8859-1") as ttf:
for line in ttf:
cols = line.strip().split('|')
wav_file = cols[0].strip()
text = cols[1].strip()
folder_name = f"BATCH_{wav_file.split('_')[0]}_FINAL"
wav_file = os.path.join(root_path, folder_name, wav_file)
items.append([text, wav_file, speaker_name])
return items
def mailabs(root_path, meta_files=None):
"""Normalizes M-AI-Labs meta data files to TTS format"""
speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/")
if meta_files is None:
csv_files = glob(root_path+"/**/metadata.csv", recursive=True)
else:
csv_files = meta_files
# meta_files = [f.strip() for f in meta_files.split(",")]
items = []
for csv_file in csv_files:
txt_file = os.path.join(root_path, csv_file)
folder = os.path.dirname(txt_file)
# determine speaker based on folder structure...
speaker_name_match = speaker_regex.search(txt_file)
if speaker_name_match is None:
continue
speaker_name = speaker_name_match.group("speaker_name")
print(" | > {}".format(csv_file))
with open(txt_file, 'r') as ttf:
for line in ttf:
cols = line.split('|')
if meta_files is None:
wav_file = os.path.join(folder, 'wavs', cols[0] + '.wav')
else:
wav_file = os.path.join(root_path, folder.replace("metadata.csv", ""), 'wavs', cols[0] + '.wav')
if os.path.isfile(wav_file):
text = cols[1].strip()
items.append([text, wav_file, speaker_name])
else:
raise RuntimeError("> File %s does not exist!"%(wav_file))
return items
def ljspeech(root_path, meta_file):
"""Normalizes the Nancy meta data file to TTS format"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "ljspeech"
with open(txt_file, 'r') as ttf:
for line in ttf:
cols = line.split('|')
wav_file = os.path.join(root_path, 'wavs', cols[0] + '.wav')
text = cols[1]
items.append([text, wav_file, speaker_name])
return items
def nancy(root_path, meta_file):
"""Normalizes the Nancy meta data file to TTS format"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "nancy"
with open(txt_file, 'r') as ttf:
for line in ttf:
utt_id = line.split()[1]
text = line[line.find('"') + 1:line.rfind('"') - 1]
wav_file = os.path.join(root_path, "wavn", utt_id + ".wav")
items.append([text, wav_file, speaker_name])
return items
def common_voice(root_path, meta_file):
"""Normalize the common voice meta data file to TTS format."""
txt_file = os.path.join(root_path, meta_file)
items = []
with open(txt_file, 'r') as ttf:
for line in ttf:
if line.startswith("client_id"):
continue
cols = line.split("\t")
text = cols[2]
speaker_name = cols[0]
wav_file = os.path.join(root_path, "clips", cols[1] + ".wav")
items.append([text, wav_file, speaker_name])
return items
def libri_tts(root_path, meta_files=None):
"""https://ai.google/tools/datasets/libri-tts/"""
items = []
if meta_files is None:
meta_files = glob(f"{root_path}/**/*trans.tsv", recursive=True)
for meta_file in meta_files:
_meta_file = os.path.basename(meta_file).split('.')[0]
speaker_name = _meta_file.split('_')[0]
chapter_id = _meta_file.split('_')[1]
_root_path = os.path.join(root_path, f"{speaker_name}/{chapter_id}")
with open(meta_file, 'r') as ttf:
for line in ttf:
cols = line.split('\t')
wav_file = os.path.join(_root_path, cols[0] + '.wav')
text = cols[1]
items.append([text, wav_file, speaker_name])
for item in items:
assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}"
return items
def custom_turkish(root_path, meta_file):
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "turkish-female"
skipped_files = []
with open(txt_file, 'r', encoding='utf-8') as ttf:
for line in ttf:
cols = line.split('|')
wav_file = os.path.join(root_path, 'wavs', cols[0].strip() + '.wav')
if not os.path.exists(wav_file):
skipped_files.append(wav_file)
continue
text = cols[1].strip()
items.append([text, wav_file, speaker_name])
print(f" [!] {len(skipped_files)} files skipped. They don't exist...")
return items
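Since `get_preprocessor_by_name` resolves formatters by function name in this module, supporting a new dataset only takes a function that returns `[text, wav_file, speaker_name]` items, matching the formatters above. A sketch with a hypothetical dataset name, column layout, and folder structure:

```python
def my_dataset(root_path, meta_file):
    """Normalizes a hypothetical 'wav_name|transcript' metadata file to TTS format."""
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "my_dataset"
    with open(txt_file, 'r', encoding='utf-8') as ttf:
        for line in ttf:
            cols = line.strip().split('|')
            wav_file = os.path.join(root_path, 'wavs', cols[0] + '.wav')
            text = cols[1]
            items.append([text, wav_file, speaker_name])
    return items
```

With this formatter in place, a config entry whose `"name"` is `"my_dataset"` would be routed to it by `load_meta_data`.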


@ -0,0 +1,389 @@
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F
class Linear(nn.Module):
def __init__(self,
in_features,
out_features,
bias=True,
init_gain='linear'):
super(Linear, self).__init__()
self.linear_layer = torch.nn.Linear(
in_features, out_features, bias=bias)
self._init_w(init_gain)
def _init_w(self, init_gain):
torch.nn.init.xavier_uniform_(
self.linear_layer.weight,
gain=torch.nn.init.calculate_gain(init_gain))
def forward(self, x):
return self.linear_layer(x)
class LinearBN(nn.Module):
def __init__(self,
in_features,
out_features,
bias=True,
init_gain='linear'):
super(LinearBN, self).__init__()
self.linear_layer = torch.nn.Linear(
in_features, out_features, bias=bias)
self.batch_normalization = nn.BatchNorm1d(out_features, momentum=0.1, eps=1e-5)
self._init_w(init_gain)
def _init_w(self, init_gain):
torch.nn.init.xavier_uniform_(
self.linear_layer.weight,
gain=torch.nn.init.calculate_gain(init_gain))
def forward(self, x):
out = self.linear_layer(x)
if len(out.shape) == 3:
out = out.permute(1, 2, 0)
out = self.batch_normalization(out)
if len(out.shape) == 3:
out = out.permute(2, 0, 1)
return out
class Prenet(nn.Module):
def __init__(self,
in_features,
prenet_type="original",
prenet_dropout=True,
out_features=[256, 256],
bias=True):
super(Prenet, self).__init__()
self.prenet_type = prenet_type
self.prenet_dropout = prenet_dropout
in_features = [in_features] + out_features[:-1]
if prenet_type == "bn":
self.linear_layers = nn.ModuleList([
LinearBN(in_size, out_size, bias=bias)
for (in_size, out_size) in zip(in_features, out_features)
])
elif prenet_type == "original":
self.linear_layers = nn.ModuleList([
Linear(in_size, out_size, bias=bias)
for (in_size, out_size) in zip(in_features, out_features)
])
def forward(self, x):
for linear in self.linear_layers:
if self.prenet_dropout:
x = F.dropout(F.relu(linear(x)), p=0.5, training=self.training)
else:
x = F.relu(linear(x))
return x
####################
# ATTENTION MODULES
####################
class LocationLayer(nn.Module):
def __init__(self,
attention_dim,
attention_n_filters=32,
attention_kernel_size=31):
super(LocationLayer, self).__init__()
self.location_conv1d = nn.Conv1d(
in_channels=2,
out_channels=attention_n_filters,
kernel_size=attention_kernel_size,
stride=1,
padding=(attention_kernel_size - 1) // 2,
bias=False)
self.location_dense = Linear(
attention_n_filters, attention_dim, bias=False, init_gain='tanh')
def forward(self, attention_cat):
processed_attention = self.location_conv1d(attention_cat)
processed_attention = self.location_dense(
processed_attention.transpose(1, 2))
return processed_attention
class GravesAttention(nn.Module):
""" Discretized Graves attention:
- https://arxiv.org/abs/1910.10288
- https://arxiv.org/pdf/1906.01083.pdf
"""
COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi))
def __init__(self, query_dim, K):
super(GravesAttention, self).__init__()
self._mask_value = 1e-8
self.K = K
# self.attention_alignment = 0.05
self.eps = 1e-5
self.J = None
self.N_a = nn.Sequential(
nn.Linear(query_dim, query_dim, bias=True),
nn.ReLU(),
nn.Linear(query_dim, 3*K, bias=True))
self.attention_weights = None
self.mu_prev = None
self.init_layers()
def init_layers(self):
torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.) # bias mean
torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) # bias std
def init_states(self, inputs):
if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]:
self.J = torch.arange(0, inputs.shape[1]+2.0).to(inputs.device) + 0.5
self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device)
self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device)
# pylint: disable=R0201
# pylint: disable=unused-argument
def preprocess_inputs(self, inputs):
return None
def forward(self, query, inputs, processed_inputs, mask):
"""
shapes:
query: B x D_attention_rnn
inputs: B x T_in x D_encoder
processed_inputs: place_holder
mask: B x T_in
"""
gbk_t = self.N_a(query)
gbk_t = gbk_t.view(gbk_t.size(0), -1, self.K)
# attention model parameters
# each B x K
g_t = gbk_t[:, 0, :]
b_t = gbk_t[:, 1, :]
k_t = gbk_t[:, 2, :]
# dropout to decorrelate attention heads
g_t = torch.nn.functional.dropout(g_t, p=0.5, training=self.training)
# attention GMM parameters
sig_t = torch.nn.functional.softplus(b_t) + self.eps
mu_t = self.mu_prev + torch.nn.functional.softplus(k_t)
g_t = torch.softmax(g_t, dim=-1) + self.eps
j = self.J[:inputs.size(1)+1]
# attention weights
phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.sigmoid((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1))))
# discretize attention weights
alpha_t = torch.sum(phi_t, 1)
alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1]
alpha_t[alpha_t == 0] = 1e-8
# apply masking
if mask is not None:
alpha_t.data.masked_fill_(~mask, self._mask_value)
context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1)
self.attention_weights = alpha_t
self.mu_prev = mu_t
return context
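To make the discretization above concrete: for each mixture component the soft CDF `1 / (1 + sigmoid((mu - j) / sig))` is evaluated on the half-integer grid `self.J`, and the weight of encoder position `n` is the difference between adjacent grid points. Because `mu_t = mu_prev + softplus(k_t)`, the attention peak can only move forward. A minimal numerical sketch for a single component (values are illustrative):

```python
import torch

# One component (B=1, K=1): weights are adjacent differences of the soft CDF.
T_in = 6
g, mu, sig = torch.tensor(1.0), torch.tensor(2.0), torch.tensor(0.7)
j = torch.arange(0, T_in + 1.0) + 0.5                # same grid as self.J
phi = g * (1 / (1 + torch.sigmoid((mu - j) / sig)))  # soft CDF per grid point
alpha = phi[1:] - phi[:-1]                           # one weight per position
print(alpha)                                         # largest weights sit around mu
```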
class OriginalAttention(nn.Module):
"""Following the methods proposed here:
- https://arxiv.org/abs/1712.05884
- https://arxiv.org/abs/1807.06736 + state masking at inference
- Using sigmoid instead of softmax normalization
- Attention windowing at inference time
"""
# Pylint gets confused by PyTorch conventions here
#pylint: disable=attribute-defined-outside-init
def __init__(self, query_dim, embedding_dim, attention_dim,
location_attention, attention_location_n_filters,
attention_location_kernel_size, windowing, norm, forward_attn,
trans_agent, forward_attn_mask):
super(OriginalAttention, self).__init__()
self.query_layer = Linear(
query_dim, attention_dim, bias=False, init_gain='tanh')
self.inputs_layer = Linear(
embedding_dim, attention_dim, bias=False, init_gain='tanh')
self.v = Linear(attention_dim, 1, bias=True)
if trans_agent:
self.ta = nn.Linear(
query_dim + embedding_dim, 1, bias=True)
if location_attention:
self.location_layer = LocationLayer(
attention_dim,
attention_location_n_filters,
attention_location_kernel_size,
)
self._mask_value = -float("inf")
self.windowing = windowing
self.win_idx = None
self.norm = norm
self.forward_attn = forward_attn
self.trans_agent = trans_agent
self.forward_attn_mask = forward_attn_mask
self.location_attention = location_attention
def init_win_idx(self):
self.win_idx = -1
self.win_back = 2
self.win_front = 6
def init_forward_attn(self, inputs):
B = inputs.shape[0]
T = inputs.shape[1]
self.alpha = torch.cat(
[torch.ones([B, 1]),
torch.zeros([B, T])[:, :-1] + 1e-7], dim=1).to(inputs.device)
self.u = (0.5 * torch.ones([B, 1])).to(inputs.device)
def init_location_attention(self, inputs):
B = inputs.size(0)
T = inputs.size(1)
self.attention_weights_cum = torch.zeros([B, T], device=inputs.device)
def init_states(self, inputs):
B = inputs.size(0)
T = inputs.size(1)
self.attention_weights = torch.zeros([B, T], device=inputs.device)
if self.location_attention:
self.init_location_attention(inputs)
if self.forward_attn:
self.init_forward_attn(inputs)
if self.windowing:
self.init_win_idx()
def preprocess_inputs(self, inputs):
return self.inputs_layer(inputs)
def update_location_attention(self, alignments):
self.attention_weights_cum += alignments
def get_location_attention(self, query, processed_inputs):
attention_cat = torch.cat((self.attention_weights.unsqueeze(1),
self.attention_weights_cum.unsqueeze(1)),
dim=1)
processed_query = self.query_layer(query.unsqueeze(1))
processed_attention_weights = self.location_layer(attention_cat)
energies = self.v(
torch.tanh(processed_query + processed_attention_weights +
processed_inputs))
energies = energies.squeeze(-1)
return energies, processed_query
def get_attention(self, query, processed_inputs):
processed_query = self.query_layer(query.unsqueeze(1))
energies = self.v(torch.tanh(processed_query + processed_inputs))
energies = energies.squeeze(-1)
return energies, processed_query
def apply_windowing(self, attention, inputs):
back_win = self.win_idx - self.win_back
front_win = self.win_idx + self.win_front
if back_win > 0:
attention[:, :back_win] = -float("inf")
if front_win < inputs.shape[1]:
attention[:, front_win:] = -float("inf")
# At the very first step (win_idx == -1), make the first position the
# maximum so the window starts at the beginning; it does not hurt otherwise.
if self.win_idx == -1:
attention[:, 0] = attention.max()
# Update the window
self.win_idx = torch.argmax(attention, 1).long()[0].item()
return attention
def apply_forward_attention(self, alignment):
# forward attention
fwd_shifted_alpha = F.pad(self.alpha[:, :-1].clone().to(alignment.device),
(1, 0, 0, 0))
# compute transition potentials
alpha = ((1 - self.u) * self.alpha
+ self.u * fwd_shifted_alpha
+ 1e-8) * alignment
# force incremental alignment
if not self.training and self.forward_attn_mask:
_, n = fwd_shifted_alpha.max(1)
val, n2 = alpha.max(1)
for b in range(alignment.shape[0]):
alpha[b, n[b] + 3:] = 0
alpha[b, :(
n[b] - 1
)] = 0 # ignore all previous states to prevent repetition.
alpha[b,
(n[b] - 2
)] = 0.01 * val[b] # smoothing factor for the prev step
# renormalize attention weights
alpha = alpha / alpha.sum(dim=1, keepdim=True)
return alpha
def forward(self, query, inputs, processed_inputs, mask):
"""
shapes:
query: B x D_attn_rnn
inputs: B x T_en x D_en
processed_inputs: B x T_en x D_attn
mask: B x T_en
"""
if self.location_attention:
attention, _ = self.get_location_attention(
query, processed_inputs)
else:
attention, _ = self.get_attention(
query, processed_inputs)
# apply masking
if mask is not None:
attention.data.masked_fill_(~mask, self._mask_value)
# apply windowing - only in eval mode
if not self.training and self.windowing:
attention = self.apply_windowing(attention, inputs)
# normalize attention values
if self.norm == "softmax":
alignment = torch.softmax(attention, dim=-1)
elif self.norm == "sigmoid":
alignment = torch.sigmoid(attention) / torch.sigmoid(
attention).sum(
dim=1, keepdim=True)
else:
raise ValueError("Unknown value for attention norm type")
if self.location_attention:
self.update_location_attention(alignment)
# apply forward attention if enabled
if self.forward_attn:
alignment = self.apply_forward_attention(alignment)
self.alpha = alignment
context = torch.bmm(alignment.unsqueeze(1), inputs)
context = context.squeeze(1)
self.attention_weights = alignment
# compute transition agent
if self.forward_attn and self.trans_agent:
ta_input = torch.cat([context, query.squeeze(1)], dim=-1)
self.u = torch.sigmoid(self.ta(ta_input))
return context
def init_attn(attn_type, query_dim, embedding_dim, attention_dim,
location_attention, attention_location_n_filters,
attention_location_kernel_size, windowing, norm, forward_attn,
trans_agent, forward_attn_mask, attn_K):
if attn_type == "original":
return OriginalAttention(query_dim, embedding_dim, attention_dim,
location_attention,
attention_location_n_filters,
attention_location_kernel_size, windowing,
norm, forward_attn, trans_agent,
forward_attn_mask)
if attn_type == "graves":
return GravesAttention(query_dim, attn_K)
raise RuntimeError(
f" [!] Given attention type '{attn_type}' does not exist.")


@ -0,0 +1,169 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
class GST(nn.Module):
"""Global Style Token Module for factorizing prosody in speech.
See https://arxiv.org/pdf/1803.09017"""
def __init__(self, num_mel, num_heads, num_style_tokens, embedding_dim):
super().__init__()
self.encoder = ReferenceEncoder(num_mel, embedding_dim)
self.style_token_layer = StyleTokenLayer(num_heads, num_style_tokens,
embedding_dim)
def forward(self, inputs):
enc_out = self.encoder(inputs)
style_embed = self.style_token_layer(enc_out)
return style_embed
class ReferenceEncoder(nn.Module):
"""NN module creating a fixed size prosody embedding from a spectrogram.
inputs: mel spectrograms [batch_size, num_spec_frames, num_mel]
outputs: [batch_size, embedding_dim]
"""
def __init__(self, num_mel, embedding_dim):
super().__init__()
self.num_mel = num_mel
filters = [1] + [32, 32, 64, 64, 128, 128]
num_layers = len(filters) - 1
convs = [
nn.Conv2d(
in_channels=filters[i],
out_channels=filters[i + 1],
kernel_size=(3, 3),
stride=(2, 2),
padding=(1, 1)) for i in range(num_layers)
]
self.convs = nn.ModuleList(convs)
self.bns = nn.ModuleList([
nn.BatchNorm2d(num_features=filter_size)
for filter_size in filters[1:]
])
post_conv_height = self.calculate_post_conv_height(
num_mel, 3, 2, 1, num_layers)
self.recurrence = nn.GRU(
input_size=filters[-1] * post_conv_height,
hidden_size=embedding_dim // 2,
batch_first=True)
def forward(self, inputs):
batch_size = inputs.size(0)
x = inputs.view(batch_size, 1, -1, self.num_mel)
# x: 4D tensor [batch_size, num_channels==1, num_frames, num_mel]
for conv, bn in zip(self.convs, self.bns):
x = conv(x)
x = bn(x)
x = F.relu(x)
x = x.transpose(1, 2)
# x: 4D tensor [batch_size, post_conv_width,
# num_channels==128, post_conv_height]
post_conv_width = x.size(1)
x = x.contiguous().view(batch_size, post_conv_width, -1)
# x: 3D tensor [batch_size, post_conv_width,
# num_channels*post_conv_height]
self.recurrence.flatten_parameters()
memory, out = self.recurrence(x)
# out: 3D tensor [seq_len==1, batch_size, encoding_size=128]
return out.squeeze(0)
@staticmethod
def calculate_post_conv_height(height, kernel_size, stride, pad,
n_convs):
"""Height of spec after n convolutions with fixed kernel/stride/pad."""
for _ in range(n_convs):
height = (height - kernel_size + 2 * pad) // stride + 1
return height
class StyleTokenLayer(nn.Module):
"""NN Module attending to style tokens based on prosody encodings."""
def __init__(self, num_heads, num_style_tokens,
embedding_dim):
super().__init__()
self.query_dim = embedding_dim // 2
self.key_dim = embedding_dim // num_heads
self.style_tokens = nn.Parameter(
torch.FloatTensor(num_style_tokens, self.key_dim))
nn.init.orthogonal_(self.style_tokens)
self.attention = MultiHeadAttention(
query_dim=self.query_dim,
key_dim=self.key_dim,
num_units=embedding_dim,
num_heads=num_heads)
def forward(self, inputs):
batch_size = inputs.size(0)
prosody_encoding = inputs.unsqueeze(1)
# prosody_encoding: 3D tensor [batch_size, 1, encoding_size==128]
tokens = torch.tanh(self.style_tokens) \
.unsqueeze(0) \
.expand(batch_size, -1, -1)
# tokens: 3D tensor [batch_size, num tokens, token embedding size]
style_embed = self.attention(prosody_encoding, tokens)
return style_embed
class MultiHeadAttention(nn.Module):
'''
input:
query --- [N, T_q, query_dim]
key --- [N, T_k, key_dim]
output:
out --- [N, T_q, num_units]
'''
def __init__(self, query_dim, key_dim, num_units, num_heads):
super().__init__()
self.num_units = num_units
self.num_heads = num_heads
self.key_dim = key_dim
self.W_query = nn.Linear(
in_features=query_dim, out_features=num_units, bias=False)
self.W_key = nn.Linear(
in_features=key_dim, out_features=num_units, bias=False)
self.W_value = nn.Linear(
in_features=key_dim, out_features=num_units, bias=False)
def forward(self, query, key):
queries = self.W_query(query) # [N, T_q, num_units]
keys = self.W_key(key) # [N, T_k, num_units]
values = self.W_value(key)
split_size = self.num_units // self.num_heads
queries = torch.stack(
torch.split(queries, split_size, dim=2),
dim=0) # [h, N, T_q, num_units/h]
keys = torch.stack(
torch.split(keys, split_size, dim=2),
dim=0) # [h, N, T_k, num_units/h]
values = torch.stack(
torch.split(values, split_size, dim=2),
dim=0) # [h, N, T_k, num_units/h]
# score = softmax(QK^T / (d_k ** 0.5))
scores = torch.matmul(queries, keys.transpose(2, 3)) # [h, N, T_q, T_k]
scores = scores / (self.key_dim**0.5)
scores = F.softmax(scores, dim=3)
# out = score * V
out = torch.matmul(scores, values) # [h, N, T_q, num_units/h]
out = torch.cat(
torch.split(out, 1, dim=0),
dim=3).squeeze(0) # [N, T_q, num_units]
return out
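A quick shape check of the GST stack above (sizes are illustrative). With 80 mel bins, the six stride-2 convolutions in `ReferenceEncoder` reduce the frequency axis 80 -> 40 -> 20 -> 10 -> 5 -> 3 -> 2, matching `calculate_post_conv_height(80, 3, 2, 1, 6) == 2`, and the final style embedding has shape `B x 1 x embedding_dim`.

```python
import torch

gst = GST(num_mel=80, num_heads=4, num_style_tokens=10, embedding_dim=256)
mels = torch.rand(2, 120, 80)   # B x num_spec_frames x num_mel
style = gst(mels)
print(style.shape)              # torch.Size([2, 1, 256])
```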

246
TTS/tts/layers/losses.py Normal file

@ -0,0 +1,246 @@
import numpy as np
import torch
from torch import nn
from torch.nn import functional
from TTS.tts.utils.generic_utils import sequence_mask
class L1LossMasked(nn.Module):
def __init__(self, seq_len_norm):
super(L1LossMasked, self).__init__()
self.seq_len_norm = seq_len_norm
def forward(self, x, target, length):
"""
Args:
x: A FloatTensor of size
(batch, max_len, dim) which contains the predicted
values for each time step.
target: A FloatTensor of size
(batch, max_len, dim) which contains the target values
for each corresponding step.
length: A LongTensor of size (batch,)
which contains the valid length of each sequence in the batch.
Returns:
loss: the L1 loss averaged over the unmasked positions.
"""
# mask: (batch, max_len, 1)
target.requires_grad = False
mask = sequence_mask(
sequence_length=length, max_len=target.size(1)).unsqueeze(2).float()
if self.seq_len_norm:
norm_w = mask / mask.sum(dim=1, keepdim=True)
out_weights = norm_w.div(target.shape[0] * target.shape[2])
mask = mask.expand_as(x)
loss = functional.l1_loss(
x * mask, target * mask, reduction='none')
loss = loss.mul(out_weights.to(loss.device)).sum()
else:
mask = mask.expand_as(x)
loss = functional.l1_loss(
x * mask, target * mask, reduction='sum')
loss = loss / mask.sum()
return loss
class MSELossMasked(nn.Module):
def __init__(self, seq_len_norm):
super(MSELossMasked, self).__init__()
self.seq_len_norm = seq_len_norm
def forward(self, x, target, length):
"""
Args:
x: A FloatTensor of size
(batch, max_len, dim) which contains the predicted
values for each time step.
target: A FloatTensor of size
(batch, max_len, dim) which contains the target values
for each corresponding step.
length: A LongTensor of size (batch,)
which contains the valid length of each sequence in the batch.
Returns:
loss: the MSE loss averaged over the unmasked positions.
"""
# mask: (batch, max_len, 1)
target.requires_grad = False
mask = sequence_mask(
sequence_length=length, max_len=target.size(1)).unsqueeze(2).float()
if self.seq_len_norm:
norm_w = mask / mask.sum(dim=1, keepdim=True)
out_weights = norm_w.div(target.shape[0] * target.shape[2])
mask = mask.expand_as(x)
loss = functional.mse_loss(
x * mask, target * mask, reduction='none')
loss = loss.mul(out_weights.to(loss.device)).sum()
else:
mask = mask.expand_as(x)
loss = functional.mse_loss(
x * mask, target * mask, reduction='sum')
loss = loss / mask.sum()
return loss
class AttentionEntropyLoss(nn.Module):
# pylint: disable=R0201
def forward(self, align):
"""
Forces attention to be more decisive by penalizing
soft attention weights
TODO: arguments
TODO: unit_test
"""
entropy = torch.distributions.Categorical(probs=align).entropy()
loss = (entropy / np.log(align.shape[1])).mean()
return loss
class BCELossMasked(nn.Module):
def __init__(self, pos_weight):
super(BCELossMasked, self).__init__()
self.pos_weight = pos_weight
def forward(self, x, target, length):
"""
Args:
x: A FloatTensor of size
(batch, max_len) which contains the unnormalized (logit)
stop-token predictions for each time step.
target: A FloatTensor of size
(batch, max_len) which contains the binary stop targets
for each corresponding step.
length: A LongTensor of size (batch,)
which contains the valid length of each sequence in the batch.
Returns:
loss: the binary cross-entropy loss averaged over the unmasked positions.
"""
# mask: (batch, max_len, 1)
target.requires_grad = False
mask = sequence_mask(sequence_length=length, max_len=target.size(1)).float()
loss = functional.binary_cross_entropy_with_logits(
x * mask, target * mask, pos_weight=self.pos_weight, reduction='sum')
loss = loss / mask.sum()
return loss
class GuidedAttentionLoss(torch.nn.Module):
def __init__(self, sigma=0.4):
super(GuidedAttentionLoss, self).__init__()
self.sigma = sigma
def _make_ga_masks(self, ilens, olens):
B = len(ilens)
max_ilen = max(ilens)
max_olen = max(olens)
ga_masks = torch.zeros((B, max_olen, max_ilen))
for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
ga_masks[idx, :olen, :ilen] = self._make_ga_mask(ilen, olen, self.sigma)
return ga_masks
def forward(self, att_ws, ilens, olens):
ga_masks = self._make_ga_masks(ilens, olens).to(att_ws.device)
seq_masks = self._make_masks(ilens, olens).to(att_ws.device)
losses = ga_masks * att_ws
loss = torch.mean(losses.masked_select(seq_masks))
return loss
@staticmethod
def _make_ga_mask(ilen, olen, sigma):
grid_x, grid_y = torch.meshgrid(torch.arange(olen), torch.arange(ilen))
grid_x, grid_y = grid_x.float(), grid_y.float()
return 1.0 - torch.exp(-(grid_y / ilen - grid_x / olen) ** 2 / (2 * (sigma ** 2)))
@staticmethod
def _make_masks(ilens, olens):
in_masks = sequence_mask(ilens)
out_masks = sequence_mask(olens)
return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2)
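The per-pair mask built by `_make_ga_mask` is `W[t, n] = 1 - exp(-(n/N - t/T)^2 / (2 * sigma^2))`: close to zero along the diagonal `n/N == t/T` and growing away from it, with `sigma` controlling the width of the allowed band. A tiny sketch that prints one such mask:

```python
import torch

ilen, olen, sigma = 4, 6, 0.4
grid_x, grid_y = torch.meshgrid(torch.arange(olen), torch.arange(ilen))
ga_mask = 1.0 - torch.exp(-(grid_y.float() / ilen - grid_x.float() / olen) ** 2
                          / (2 * sigma ** 2))
print(ga_mask)   # small near the diagonal, larger off-diagonal
```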
class TacotronLoss(torch.nn.Module):
def __init__(self, c, stopnet_pos_weight=10, ga_sigma=0.4):
super(TacotronLoss, self).__init__()
self.stopnet_pos_weight = stopnet_pos_weight
self.ga_alpha = c.ga_alpha
self.config = c
# postnet decoder loss
if c.loss_masking:
self.criterion = L1LossMasked(c.seq_len_norm) if c.model in [
"Tacotron"
] else MSELossMasked(c.seq_len_norm)
else:
self.criterion = nn.L1Loss() if c.model in ["Tacotron"
] else nn.MSELoss()
# guided attention loss
if c.ga_alpha > 0:
self.criterion_ga = GuidedAttentionLoss(sigma=ga_sigma)
# stopnet loss
# pylint: disable=not-callable
self.criterion_st = BCELossMasked(pos_weight=torch.tensor(stopnet_pos_weight)) if c.stopnet else None
def forward(self, postnet_output, decoder_output, mel_input, linear_input,
stopnet_output, stopnet_target, output_lens, decoder_b_output,
alignments, alignment_lens, alignments_backwards, input_lens):
return_dict = {}
# decoder and postnet losses
if self.config.loss_masking:
decoder_loss = self.criterion(decoder_output, mel_input,
output_lens)
if self.config.model in ["Tacotron", "TacotronGST"]:
postnet_loss = self.criterion(postnet_output, linear_input,
output_lens)
else:
postnet_loss = self.criterion(postnet_output, mel_input,
output_lens)
else:
decoder_loss = self.criterion(decoder_output, mel_input)
if self.config.model in ["Tacotron", "TacotronGST"]:
postnet_loss = self.criterion(postnet_output, linear_input)
else:
postnet_loss = self.criterion(postnet_output, mel_input)
loss = decoder_loss + postnet_loss
return_dict['decoder_loss'] = decoder_loss
return_dict['postnet_loss'] = postnet_loss
# stopnet loss
stop_loss = self.criterion_st(
stopnet_output, stopnet_target,
output_lens) if self.config.stopnet else torch.zeros(1)
if not self.config.separate_stopnet and self.config.stopnet:
loss += stop_loss
return_dict['stopnet_loss'] = stop_loss
# backward decoder loss (if enabled)
if self.config.bidirectional_decoder:
if self.config.loss_masking:
decoder_b_loss = self.criterion(torch.flip(decoder_b_output, dims=(1, )), mel_input, output_lens)
else:
decoder_b_loss = self.criterion(torch.flip(decoder_b_output, dims=(1, )), mel_input)
decoder_c_loss = torch.nn.functional.l1_loss(torch.flip(decoder_b_output, dims=(1, )), decoder_output)
loss += decoder_b_loss + decoder_c_loss
return_dict['decoder_b_loss'] = decoder_b_loss
return_dict['decoder_c_loss'] = decoder_c_loss
# double decoder consistency loss (if enabled)
if self.config.double_decoder_consistency:
decoder_b_loss = self.criterion(decoder_b_output, mel_input, output_lens)
# decoder_c_loss = torch.nn.functional.l1_loss(decoder_b_output, decoder_output)
attention_c_loss = torch.nn.functional.l1_loss(alignments, alignments_backwards)
loss += decoder_b_loss + attention_c_loss
return_dict['decoder_coarse_loss'] = decoder_b_loss
return_dict['decoder_ddc_loss'] = attention_c_loss
# guided attention loss (if enabled)
if self.config.ga_alpha > 0:
ga_loss = self.criterion_ga(alignments, input_lens, alignment_lens)
loss += ga_loss * self.ga_alpha
return_dict['ga_loss'] = ga_loss * self.ga_alpha
return_dict['loss'] = loss
return return_dict

496
TTS/tts/layers/tacotron.py Normal file

@ -0,0 +1,496 @@
# coding: utf-8
import torch
from torch import nn
from .common_layers import Prenet, init_attn, Linear
class BatchNormConv1d(nn.Module):
r"""A wrapper for Conv1d with BatchNorm. It sets the activation
function between Conv and BatchNorm layers. BatchNorm layer
is initialized with the TF default values for momentum and eps.
Args:
in_channels: size of each input sample
out_channels: size of each output samples
kernel_size: kernel size of conv filters
stride: stride of conv filters
padding: padding of conv filters
activation: activation function applied after BatchNorm (None for identity)
Shapes:
- input: B x D_in x T
- output: B x D_out x T
"""
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
activation=None):
super(BatchNormConv1d, self).__init__()
self.padding = padding
self.padder = nn.ConstantPad1d(padding, 0)
self.conv1d = nn.Conv1d(
in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=0,
bias=False)
# Following tensorflow's default parameters
self.bn = nn.BatchNorm1d(out_channels, momentum=0.99, eps=1e-3)
self.activation = activation
# self.init_layers()
def init_layers(self):
if type(self.activation) == torch.nn.ReLU:
w_gain = 'relu'
elif type(self.activation) == torch.nn.Tanh:
w_gain = 'tanh'
elif self.activation is None:
w_gain = 'linear'
else:
raise RuntimeError('Unknown activation function')
torch.nn.init.xavier_uniform_(
self.conv1d.weight, gain=torch.nn.init.calculate_gain(w_gain))
def forward(self, x):
x = self.padder(x)
x = self.conv1d(x)
x = self.bn(x)
if self.activation is not None:
x = self.activation(x)
return x
class Highway(nn.Module):
# TODO: Try GLU layer
def __init__(self, in_size, out_size):
super(Highway, self).__init__()
self.H = nn.Linear(in_size, out_size)
self.H.bias.data.zero_()
self.T = nn.Linear(in_size, out_size)
self.T.bias.data.fill_(-1)
self.relu = nn.ReLU()
self.sigmoid = nn.Sigmoid()
# self.init_layers()
def init_layers(self):
torch.nn.init.xavier_uniform_(
self.H.weight, gain=torch.nn.init.calculate_gain('relu'))
torch.nn.init.xavier_uniform_(
self.T.weight, gain=torch.nn.init.calculate_gain('sigmoid'))
def forward(self, inputs):
H = self.relu(self.H(inputs))
T = self.sigmoid(self.T(inputs))
return H * T + inputs * (1.0 - T)
class CBHG(nn.Module):
"""CBHG module: a recurrent neural network composed of:
- 1-d convolution banks
- Highway networks + residual connections
- Bidirectional gated recurrent units
Args:
in_features (int): sample size
K (int): max filter size in conv bank
projections (list): conv channel sizes for conv projections
num_highways (int): number of highways layers
Shapes:
- input: B x D x T_in
- output: B x T_in x D*2
"""
def __init__(self,
in_features,
K=16,
conv_bank_features=128,
conv_projections=[128, 128],
highway_features=128,
gru_features=128,
num_highways=4):
super(CBHG, self).__init__()
self.in_features = in_features
self.conv_bank_features = conv_bank_features
self.highway_features = highway_features
self.gru_features = gru_features
self.conv_projections = conv_projections
self.relu = nn.ReLU()
# list of conv1d bank with filter size k=1...K
# TODO: try dilational layers instead
self.conv1d_banks = nn.ModuleList([
BatchNormConv1d(in_features,
conv_bank_features,
kernel_size=k,
stride=1,
padding=[(k - 1) // 2, k // 2],
activation=self.relu) for k in range(1, K + 1)
])
# max pooling of conv bank, with padding
# TODO: try average pooling OR larger kernel size
out_features = [K * conv_bank_features] + conv_projections[:-1]
activations = [self.relu] * (len(conv_projections) - 1)
activations += [None]
# setup conv1d projection layers
layer_set = []
for (in_size, out_size, ac) in zip(out_features, conv_projections,
activations):
layer = BatchNormConv1d(in_size,
out_size,
kernel_size=3,
stride=1,
padding=[1, 1],
activation=ac)
layer_set.append(layer)
self.conv1d_projections = nn.ModuleList(layer_set)
# setup Highway layers
if self.highway_features != conv_projections[-1]:
self.pre_highway = nn.Linear(conv_projections[-1],
highway_features,
bias=False)
self.highways = nn.ModuleList([
Highway(highway_features, highway_features)
for _ in range(num_highways)
])
# bi-directional GRU layer
self.gru = nn.GRU(gru_features,
gru_features,
1,
batch_first=True,
bidirectional=True)
def forward(self, inputs):
# (B, in_features, T_in)
x = inputs
# (B, hid_features*K, T_in)
# Concat conv1d bank outputs
outs = []
for conv1d in self.conv1d_banks:
out = conv1d(x)
outs.append(out)
x = torch.cat(outs, dim=1)
assert x.size(1) == self.conv_bank_features * len(self.conv1d_banks)
for conv1d in self.conv1d_projections:
x = conv1d(x)
x += inputs
x = x.transpose(1, 2)
if self.highway_features != self.conv_projections[-1]:
x = self.pre_highway(x)
# Residual connection
# TODO: try residual scaling as in Deep Voice 3
# TODO: try plain residual layers
for highway in self.highways:
x = highway(x)
# (B, T_in, hid_features*2)
# TODO: replace GRU with convolution as in Deep Voice 3
self.gru.flatten_parameters()
outputs, _ = self.gru(x)
return outputs
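A quick shape check of `CBHG` in the `EncoderCBHG` configuration below, assuming it runs in this module's namespace; batch and time sizes are illustrative.

```python
import torch

cbhg = CBHG(128, K=16, conv_bank_features=128, conv_projections=[128, 128],
            highway_features=128, gru_features=128, num_highways=4)
x = torch.rand(2, 128, 33)   # B x in_features x T_in
print(cbhg(x).shape)         # torch.Size([2, 33, 256])  (B x T_in x gru_features*2)
```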
class EncoderCBHG(nn.Module):
def __init__(self):
super(EncoderCBHG, self).__init__()
self.cbhg = CBHG(
128,
K=16,
conv_bank_features=128,
conv_projections=[128, 128],
highway_features=128,
gru_features=128,
num_highways=4)
def forward(self, x):
return self.cbhg(x)
class Encoder(nn.Module):
r"""Encapsulate Prenet and CBHG modules for encoder"""
def __init__(self, in_features):
super(Encoder, self).__init__()
self.prenet = Prenet(in_features, out_features=[256, 128])
self.cbhg = EncoderCBHG()
def forward(self, inputs):
r"""
Args:
inputs (FloatTensor): embedding features
Shapes:
- inputs: batch x time x in_features
- outputs: batch x time x 128*2
"""
# B x T x prenet_dim
outputs = self.prenet(inputs)
outputs = self.cbhg(outputs.transpose(1, 2))
return outputs
class PostCBHG(nn.Module):
def __init__(self, mel_dim):
super(PostCBHG, self).__init__()
self.cbhg = CBHG(
mel_dim,
K=8,
conv_bank_features=128,
conv_projections=[256, mel_dim],
highway_features=128,
gru_features=128,
num_highways=4)
def forward(self, x):
return self.cbhg(x)
class Decoder(nn.Module):
"""Decoder module.
Args:
in_features (int): input vector (encoder output) sample size.
memory_dim (int): memory vector (prev. time-step output) sample size.
r (int): number of outputs per time step.
memory_size (int): size of the past window. if <= 0 memory_size = r
TODO: arguments
"""
# Pylint gets confused by PyTorch conventions here
#pylint: disable=attribute-defined-outside-init
def __init__(self, in_features, memory_dim, r, memory_size, attn_type, attn_windowing,
attn_norm, prenet_type, prenet_dropout, forward_attn,
trans_agent, forward_attn_mask, location_attn, attn_K,
separate_stopnet, speaker_embedding_dim):
super(Decoder, self).__init__()
self.r_init = r
self.r = r
self.in_features = in_features
self.max_decoder_steps = 500
self.use_memory_queue = memory_size > 0
self.memory_size = memory_size if memory_size > 0 else r
self.memory_dim = memory_dim
self.separate_stopnet = separate_stopnet
self.query_dim = 256
# memory -> |Prenet| -> processed_memory
prenet_dim = memory_dim * self.memory_size + speaker_embedding_dim if self.use_memory_queue else memory_dim + speaker_embedding_dim
self.prenet = Prenet(
prenet_dim,
prenet_type,
prenet_dropout,
out_features=[256, 128])
# processed_inputs, processed_memory -> |Attention| -> Attention, attention, RNN_State
# attention_rnn generates queries for the attention mechanism
self.attention_rnn = nn.GRUCell(in_features + 128, self.query_dim)
self.attention = init_attn(attn_type=attn_type,
query_dim=self.query_dim,
embedding_dim=in_features,
attention_dim=128,
location_attention=location_attn,
attention_location_n_filters=32,
attention_location_kernel_size=31,
windowing=attn_windowing,
norm=attn_norm,
forward_attn=forward_attn,
trans_agent=trans_agent,
forward_attn_mask=forward_attn_mask,
attn_K=attn_K)
# (processed_memory | attention context) -> |Linear| -> decoder_RNN_input
self.project_to_decoder_in = nn.Linear(256 + in_features, 256)
# decoder_RNN_input -> |RNN| -> RNN_state
self.decoder_rnns = nn.ModuleList(
[nn.GRUCell(256, 256) for _ in range(2)])
# RNN_state -> |Linear| -> mel_spec
self.proj_to_mel = nn.Linear(256, memory_dim * self.r_init)
# learn init values instead of zero init.
self.stopnet = StopNet(256 + memory_dim * self.r_init)
def set_r(self, new_r):
self.r = new_r
def _reshape_memory(self, memory):
"""
Reshape the spectrograms for given 'r'
"""
# Grouping multiple frames if necessary
if memory.size(-1) == self.memory_dim:
memory = memory.view(memory.shape[0], memory.size(1) // self.r, -1)
# Time first (T_decoder, B, memory_dim)
memory = memory.transpose(0, 1)
return memory
def _init_states(self, inputs):
"""
Initialization of decoder states
"""
B = inputs.size(0)
T = inputs.size(1)
# go frame as zeros matrix
if self.use_memory_queue:
self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim * self.memory_size)
else:
self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim)
# decoder states
self.attention_rnn_hidden = torch.zeros(1, device=inputs.device).repeat(B, 256)
self.decoder_rnn_hiddens = [
torch.zeros(1, device=inputs.device).repeat(B, 256)
for idx in range(len(self.decoder_rnns))
]
self.context_vec = inputs.data.new(B, self.in_features).zero_()
# cache attention inputs
self.processed_inputs = self.attention.preprocess_inputs(inputs)
def _parse_outputs(self, outputs, attentions, stop_tokens):
# Back to batch first
attentions = torch.stack(attentions).transpose(0, 1)
stop_tokens = torch.stack(stop_tokens).transpose(0, 1)
outputs = torch.stack(outputs).transpose(0, 1).contiguous()
outputs = outputs.view(
outputs.size(0), -1, self.memory_dim)
outputs = outputs.transpose(1, 2)
return outputs, attentions, stop_tokens
def decode(self, inputs, mask=None):
# Prenet
processed_memory = self.prenet(self.memory_input)
# Attention RNN
self.attention_rnn_hidden = self.attention_rnn(
torch.cat((processed_memory, self.context_vec), -1),
self.attention_rnn_hidden)
self.context_vec = self.attention(
self.attention_rnn_hidden, inputs, self.processed_inputs, mask)
# Concat RNN output and attention context vector
decoder_input = self.project_to_decoder_in(
torch.cat((self.attention_rnn_hidden, self.context_vec), -1))
# Pass through the decoder RNNs
for idx in range(len(self.decoder_rnns)):
self.decoder_rnn_hiddens[idx] = self.decoder_rnns[idx](
decoder_input, self.decoder_rnn_hiddens[idx])
# Residual connection
decoder_input = self.decoder_rnn_hiddens[idx] + decoder_input
decoder_output = decoder_input
# predict mel vectors from decoder vectors
output = self.proj_to_mel(decoder_output)
# output = torch.sigmoid(output)
# predict stop token
stopnet_input = torch.cat([decoder_output, output], -1)
if self.separate_stopnet:
stop_token = self.stopnet(stopnet_input.detach())
else:
stop_token = self.stopnet(stopnet_input)
output = output[:, : self.r * self.memory_dim]
return output, stop_token, self.attention.attention_weights
def _update_memory_input(self, new_memory):
if self.use_memory_queue:
if self.memory_size > self.r:
# memory queue size is larger than number of frames per decoder iter
self.memory_input = torch.cat([
new_memory, self.memory_input[:, :(
self.memory_size - self.r) * self.memory_dim].clone()
], dim=-1)
else:
# memory queue size smaller than number of frames per decoder iter
self.memory_input = new_memory[:, :self.memory_size * self.memory_dim]
else:
# use only the last frame prediction
# assert new_memory.shape[-1] == self.r * self.memory_dim
self.memory_input = new_memory[:, self.memory_dim * (self.r - 1):]
def forward(self, inputs, memory, mask, speaker_embeddings=None):
"""
Args:
inputs: Encoder outputs.
memory: decoder memory (teacher-forcing targets). If None (at eval time),
the decoder's own previous output is used as its next input.
mask: Attention mask for sequence padding.
Shapes:
- inputs: batch x time x encoder_out_dim
- memory: batch x #mel_specs x mel_spec_dim
"""
# teacher forcing: ground-truth frames in 'memory' are fed back as decoder inputs
memory = self._reshape_memory(memory)
outputs = []
attentions = []
stop_tokens = []
t = 0
self._init_states(inputs)
self.attention.init_states(inputs)
while len(outputs) < memory.size(0):
if t > 0:
new_memory = memory[t - 1]
self._update_memory_input(new_memory)
if speaker_embeddings is not None:
self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1)
output, stop_token, attention = self.decode(inputs, mask)
outputs += [output]
attentions += [attention]
stop_tokens += [stop_token.squeeze(1)]
t += 1
return self._parse_outputs(outputs, attentions, stop_tokens)
def inference(self, inputs, speaker_embeddings=None):
"""
Args:
inputs: encoder outputs.
speaker_embeddings: speaker vectors.
Shapes:
- inputs: batch x time x encoder_out_dim
- speaker_embeddings: batch x embed_dim
"""
outputs = []
attentions = []
stop_tokens = []
t = 0
self._init_states(inputs)
self.attention.init_win_idx()
self.attention.init_states(inputs)
while True:
if t > 0:
new_memory = outputs[-1]
self._update_memory_input(new_memory)
if speaker_embeddings is not None:
self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1)
output, stop_token, attention = self.decode(inputs, None)
stop_token = torch.sigmoid(stop_token.data)
outputs += [output]
attentions += [attention]
stop_tokens += [stop_token]
t += 1
if t > inputs.shape[1] / 4 and (stop_token > 0.6
or attention[:, -1].item() > 0.6):
break
elif t > self.max_decoder_steps:
print(" | > Decoder stopped with 'max_decoder_steps")
break
return self._parse_outputs(outputs, attentions, stop_tokens)
class StopNet(nn.Module):
r"""
Args:
in_features (int): feature dimension of input.
"""
def __init__(self, in_features):
super(StopNet, self).__init__()
self.dropout = nn.Dropout(0.1)
self.linear = nn.Linear(in_features, 1)
torch.nn.init.xavier_uniform_(
self.linear.weight, gain=torch.nn.init.calculate_gain('linear'))
def forward(self, inputs):
outputs = self.dropout(inputs)
outputs = self.linear(outputs)
return outputs
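For reference, an illustrative sketch of the rolling memory queue updated in `_update_memory_input` when `use_memory_queue` is enabled: the newest `r` frames are pushed to the front and the oldest `r` frames fall off the end. The sizes below are arbitrary.

```python
import torch

memory_dim, memory_size, r = 3, 4, 2
memory_input = torch.zeros(1, memory_dim * memory_size)           # B=1 queue
new_memory = torch.arange(1., 1. + memory_dim * r).unsqueeze(0)   # last r frames
memory_input = torch.cat(
    [new_memory, memory_input[:, :(memory_size - r) * memory_dim]], dim=-1)
print(memory_input)
# tensor([[1., 2., 3., 4., 5., 6., 0., 0., 0., 0., 0., 0.]])
```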

353
TTS/tts/layers/tacotron2.py Normal file

@ -0,0 +1,353 @@
import torch
from torch.autograd import Variable
from torch import nn
from torch.nn import functional as F
from .common_layers import init_attn, Prenet, Linear
class ConvBNBlock(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, activation=None):
super(ConvBNBlock, self).__init__()
assert (kernel_size - 1) % 2 == 0
padding = (kernel_size - 1) // 2
self.convolution1d = nn.Conv1d(in_channels,
out_channels,
kernel_size,
padding=padding)
self.batch_normalization = nn.BatchNorm1d(out_channels, momentum=0.1, eps=1e-5)
self.dropout = nn.Dropout(p=0.5)
if activation == 'relu':
self.activation = nn.ReLU()
elif activation == 'tanh':
self.activation = nn.Tanh()
else:
self.activation = nn.Identity()
def forward(self, x):
o = self.convolution1d(x)
o = self.batch_normalization(o)
o = self.activation(o)
o = self.dropout(o)
return o
class Postnet(nn.Module):
def __init__(self, output_dim, num_convs=5):
super(Postnet, self).__init__()
self.convolutions = nn.ModuleList()
self.convolutions.append(
ConvBNBlock(output_dim, 512, kernel_size=5, activation='tanh'))
for _ in range(1, num_convs - 1):
self.convolutions.append(
ConvBNBlock(512, 512, kernel_size=5, activation='tanh'))
self.convolutions.append(
ConvBNBlock(512, output_dim, kernel_size=5, activation=None))
def forward(self, x):
o = x
for layer in self.convolutions:
o = layer(o)
return o
class Encoder(nn.Module):
def __init__(self, output_input_dim=512):
super(Encoder, self).__init__()
self.convolutions = nn.ModuleList()
for _ in range(3):
self.convolutions.append(
ConvBNBlock(output_input_dim, output_input_dim, 5, 'relu'))
self.lstm = nn.LSTM(output_input_dim,
int(output_input_dim / 2),
num_layers=1,
batch_first=True,
bias=True,
bidirectional=True)
self.rnn_state = None
def forward(self, x, input_lengths):
o = x
for layer in self.convolutions:
o = layer(o)
o = o.transpose(1, 2)
o = nn.utils.rnn.pack_padded_sequence(o,
input_lengths,
batch_first=True)
self.lstm.flatten_parameters()
o, _ = self.lstm(o)
o, _ = nn.utils.rnn.pad_packed_sequence(o, batch_first=True)
return o
def inference(self, x):
o = x
for layer in self.convolutions:
o = layer(o)
o = o.transpose(1, 2)
# self.lstm.flatten_parameters()
o, _ = self.lstm(o)
return o
# adapted from https://github.com/NVIDIA/tacotron2/
class Decoder(nn.Module):
# Pylint gets confused by PyTorch conventions here
#pylint: disable=attribute-defined-outside-init
def __init__(self, input_dim, frame_dim, r, attn_type, attn_win, attn_norm,
prenet_type, prenet_dropout, forward_attn, trans_agent,
forward_attn_mask, location_attn, attn_K, separate_stopnet,
speaker_embedding_dim):
super(Decoder, self).__init__()
self.frame_dim = frame_dim
self.r_init = r
self.r = r
self.encoder_embedding_dim = input_dim
self.separate_stopnet = separate_stopnet
self.max_decoder_steps = 1000
self.stop_threshold = 0.5
# model dimensions
self.query_dim = 1024
self.decoder_rnn_dim = 1024
self.prenet_dim = 256
self.attn_dim = 128
self.p_attention_dropout = 0.1
self.p_decoder_dropout = 0.1
# memory -> |Prenet| -> processed_memory
prenet_dim = self.frame_dim
self.prenet = Prenet(prenet_dim,
prenet_type,
prenet_dropout,
out_features=[self.prenet_dim, self.prenet_dim],
bias=False)
self.attention_rnn = nn.LSTMCell(self.prenet_dim + input_dim,
self.query_dim,
bias=True)
self.attention = init_attn(attn_type=attn_type,
query_dim=self.query_dim,
embedding_dim=input_dim,
attention_dim=128,
location_attention=location_attn,
attention_location_n_filters=32,
attention_location_kernel_size=31,
windowing=attn_win,
norm=attn_norm,
forward_attn=forward_attn,
trans_agent=trans_agent,
forward_attn_mask=forward_attn_mask,
attn_K=attn_K)
self.decoder_rnn = nn.LSTMCell(self.query_dim + input_dim,
self.decoder_rnn_dim,
bias=True)
self.linear_projection = Linear(self.decoder_rnn_dim + input_dim,
self.frame_dim * self.r_init)
self.stopnet = nn.Sequential(
nn.Dropout(0.1),
Linear(self.decoder_rnn_dim + self.frame_dim * self.r_init,
1,
bias=True,
init_gain='sigmoid'))
self.memory_truncated = None
def set_r(self, new_r):
self.r = new_r
def get_go_frame(self, inputs):
B = inputs.size(0)
memory = torch.zeros(1, device=inputs.device).repeat(B,
self.frame_dim * self.r)
return memory
def _init_states(self, inputs, mask, keep_states=False):
B = inputs.size(0)
# T = inputs.size(1)
if not keep_states:
self.query = torch.zeros(1, device=inputs.device).repeat(
B, self.query_dim)
self.attention_rnn_cell_state = torch.zeros(
1, device=inputs.device).repeat(B, self.query_dim)
self.decoder_hidden = torch.zeros(1, device=inputs.device).repeat(
B, self.decoder_rnn_dim)
self.decoder_cell = torch.zeros(1, device=inputs.device).repeat(
B, self.decoder_rnn_dim)
self.context = torch.zeros(1, device=inputs.device).repeat(
B, self.encoder_embedding_dim)
self.inputs = inputs
self.processed_inputs = self.attention.preprocess_inputs(inputs)
self.mask = mask
def _reshape_memory(self, memory):
"""
Reshape the spectrograms for given 'r'
"""
# Grouping multiple frames if necessary
if memory.size(-1) == self.frame_dim:
memory = memory.view(memory.shape[0], memory.size(1) // self.r, -1)
# Time first (T_decoder, B, frame_dim)
memory = memory.transpose(0, 1)
return memory
def _parse_outputs(self, outputs, stop_tokens, alignments):
alignments = torch.stack(alignments).transpose(0, 1)
stop_tokens = torch.stack(stop_tokens).transpose(0, 1)
outputs = torch.stack(outputs).transpose(0, 1).contiguous()
outputs = outputs.view(outputs.size(0), -1, self.frame_dim)
outputs = outputs.transpose(1, 2)
return outputs, stop_tokens, alignments
def _update_memory(self, memory):
if len(memory.shape) == 2:
return memory[:, self.frame_dim * (self.r - 1):]
return memory[:, :, self.frame_dim * (self.r - 1):]
def decode(self, memory):
'''
shapes:
- memory: B x r * self.frame_dim
'''
# self.context: B x D_en
# query_input: B x D_en + (r * self.frame_dim)
query_input = torch.cat((memory, self.context), -1)
# self.query and self.attention_rnn_cell_state : B x D_attn_rnn
self.query, self.attention_rnn_cell_state = self.attention_rnn(
query_input, (self.query, self.attention_rnn_cell_state))
self.query = F.dropout(self.query, self.p_attention_dropout,
self.training)
self.attention_rnn_cell_state = F.dropout(
self.attention_rnn_cell_state, self.p_attention_dropout,
self.training)
# B x D_en
self.context = self.attention(self.query, self.inputs,
self.processed_inputs, self.mask)
# B x (D_en + D_attn_rnn)
decoder_rnn_input = torch.cat((self.query, self.context), -1)
# self.decoder_hidden and self.decoder_cell: B x D_decoder_rnn
self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
decoder_rnn_input, (self.decoder_hidden, self.decoder_cell))
self.decoder_hidden = F.dropout(self.decoder_hidden,
self.p_decoder_dropout, self.training)
# B x (D_decoder_rnn + D_en)
decoder_hidden_context = torch.cat((self.decoder_hidden, self.context),
dim=1)
# B x (self.r * self.frame_dim)
decoder_output = self.linear_projection(decoder_hidden_context)
# B x (D_decoder_rnn + (self.r * self.frame_dim))
stopnet_input = torch.cat((self.decoder_hidden, decoder_output), dim=1)
if self.separate_stopnet:
stop_token = self.stopnet(stopnet_input.detach())
else:
stop_token = self.stopnet(stopnet_input)
# select outputs for the reduction rate self.r
decoder_output = decoder_output[:, :self.r * self.frame_dim]
return decoder_output, self.attention.attention_weights, stop_token
def forward(self, inputs, memories, mask, speaker_embeddings=None):
memory = self.get_go_frame(inputs).unsqueeze(0)
memories = self._reshape_memory(memories)
memories = torch.cat((memory, memories), dim=0)
memories = self._update_memory(memories)
if speaker_embeddings is not None:
memories = torch.cat([memories, speaker_embeddings], dim=-1)
memories = self.prenet(memories)
self._init_states(inputs, mask=mask)
self.attention.init_states(inputs)
outputs, stop_tokens, alignments = [], [], []
while len(outputs) < memories.size(0) - 1:
memory = memories[len(outputs)]
decoder_output, attention_weights, stop_token = self.decode(memory)
outputs += [decoder_output.squeeze(1)]
stop_tokens += [stop_token.squeeze(1)]
alignments += [attention_weights]
outputs, stop_tokens, alignments = self._parse_outputs(
outputs, stop_tokens, alignments)
return outputs, alignments, stop_tokens
def inference(self, inputs, speaker_embeddings=None):
memory = self.get_go_frame(inputs)
memory = self._update_memory(memory)
self._init_states(inputs, mask=None)
self.attention.init_states(inputs)
outputs, stop_tokens, alignments, t = [], [], [], 0
while True:
memory = self.prenet(memory)
if speaker_embeddings is not None:
memory = torch.cat([memory, speaker_embeddings], dim=-1)
decoder_output, alignment, stop_token = self.decode(memory)
stop_token = torch.sigmoid(stop_token.data)
outputs += [decoder_output.squeeze(1)]
stop_tokens += [stop_token]
alignments += [alignment]
if stop_token > self.stop_threshold and t > inputs.shape[0] // 2:
break
if len(outputs) == self.max_decoder_steps:
print(" | > Decoder stopped with 'max_decoder_steps")
break
memory = self._update_memory(decoder_output)
t += 1
outputs, stop_tokens, alignments = self._parse_outputs(
outputs, stop_tokens, alignments)
return outputs, alignments, stop_tokens
def inference_truncated(self, inputs):
"""
Preserve decoder states for continuous inference
"""
if self.memory_truncated is None:
self.memory_truncated = self.get_go_frame(inputs)
self._init_states(inputs, mask=None, keep_states=False)
else:
self._init_states(inputs, mask=None, keep_states=True)
self.attention.init_win_idx()
self.attention.init_states(inputs)
outputs, stop_tokens, alignments, t = [], [], [], 0
stop_flags = [True, False, False]
while True:
memory = self.prenet(self.memory_truncated)
decoder_output, alignment, stop_token = self.decode(memory)
stop_token = torch.sigmoid(stop_token.data)
outputs += [decoder_output.squeeze(1)]
stop_tokens += [stop_token]
alignments += [alignment]
if stop_token > 0.7:
break
if len(outputs) == self.max_decoder_steps:
print(" | > Decoder stopped with 'max_decoder_steps")
break
self.memory_truncated = decoder_output
t += 1
outputs, stop_tokens, alignments = self._parse_outputs(
outputs, stop_tokens, alignments)
return outputs, alignments, stop_tokens
def inference_step(self, inputs, t, memory=None):
"""
For debug purposes
"""
if t == 0:
memory = self.get_go_frame(inputs)
self._init_states(inputs, mask=None)
memory = self.prenet(memory)
decoder_output, alignment, stop_token = self.decode(memory)
stop_token = torch.sigmoid(stop_token.data)
memory = decoder_output
return decoder_output, stop_token, alignment

160
TTS/tts/models/tacotron.py Normal file
@ -0,0 +1,160 @@
# coding: utf-8
import torch
from torch import nn
from TTS.tts.layers.gst_layers import GST
from TTS.tts.layers.tacotron import Decoder, Encoder, PostCBHG
from TTS.tts.models.tacotron_abstract import TacotronAbstract
class Tacotron(TacotronAbstract):
def __init__(self,
num_chars,
num_speakers,
r=5,
postnet_output_dim=1025,
decoder_output_dim=80,
attn_type='original',
attn_win=False,
attn_norm="sigmoid",
prenet_type="original",
prenet_dropout=True,
forward_attn=False,
trans_agent=False,
forward_attn_mask=False,
location_attn=True,
attn_K=5,
separate_stopnet=True,
bidirectional_decoder=False,
double_decoder_consistency=False,
ddc_r=None,
gst=False,
memory_size=5):
super(Tacotron,
self).__init__(num_chars, num_speakers, r, postnet_output_dim,
decoder_output_dim, attn_type, attn_win,
attn_norm, prenet_type, prenet_dropout,
forward_attn, trans_agent, forward_attn_mask,
location_attn, attn_K, separate_stopnet,
bidirectional_decoder, double_decoder_consistency,
ddc_r, gst)
decoder_in_features = 512 if num_speakers > 1 else 256
encoder_in_features = 512 if num_speakers > 1 else 256
speaker_embedding_dim = 256
proj_speaker_dim = 80 if num_speakers > 1 else 0
# base model layers
self.embedding = nn.Embedding(num_chars, 256, padding_idx=0)
self.embedding.weight.data.normal_(0, 0.3)
self.encoder = Encoder(encoder_in_features)
self.decoder = Decoder(decoder_in_features, decoder_output_dim, r,
memory_size, attn_type, attn_win, attn_norm,
prenet_type, prenet_dropout, forward_attn,
trans_agent, forward_attn_mask, location_attn,
attn_K, separate_stopnet, proj_speaker_dim)
self.postnet = PostCBHG(decoder_output_dim)
self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2,
postnet_output_dim)
# speaker embedding layers
if num_speakers > 1:
self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim)
self.speaker_embedding.weight.data.normal_(0, 0.3)
self.speaker_project_mel = nn.Sequential(
nn.Linear(speaker_embedding_dim, proj_speaker_dim), nn.Tanh())
self.speaker_embeddings = None
self.speaker_embeddings_projected = None
# global style token layers
if self.gst:
gst_embedding_dim = 256
self.gst_layer = GST(num_mel=80,
num_heads=4,
num_style_tokens=10,
embedding_dim=gst_embedding_dim)
# backward pass decoder
if self.bidirectional_decoder:
self._init_backward_decoder()
# setup DDC
if self.double_decoder_consistency:
self.coarse_decoder = Decoder(
decoder_in_features, decoder_output_dim, ddc_r, memory_size,
attn_type, attn_win, attn_norm, prenet_type, prenet_dropout,
forward_attn, trans_agent, forward_attn_mask, location_attn,
attn_K, separate_stopnet, proj_speaker_dim)
def forward(self, characters, text_lengths, mel_specs, mel_lengths=None, speaker_ids=None):
"""
Shapes:
- characters: B x T_in
- text_lengths: B
- mel_specs: B x T_out x D
- speaker_ids: B x 1
"""
self._init_states()
input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
# B x T_in x embed_dim
inputs = self.embedding(characters)
# B x speaker_embed_dim
if speaker_ids is not None:
self.compute_speaker_embedding(speaker_ids)
if self.num_speakers > 1:
# B x T_in x embed_dim + speaker_embed_dim
inputs = self._concat_speaker_embedding(inputs,
self.speaker_embeddings)
# B x T_in x encoder_in_features
encoder_outputs = self.encoder(inputs)
# sequence masking
encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
# global style token
if self.gst:
# B x gst_dim
encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
if self.num_speakers > 1:
encoder_outputs = self._concat_speaker_embedding(
encoder_outputs, self.speaker_embeddings)
# decoder_outputs: B x decoder_in_features x T_out
# alignments: B x T_in x encoder_in_features
# stop_tokens: B x T_in
decoder_outputs, alignments, stop_tokens = self.decoder(
encoder_outputs, mel_specs, input_mask,
self.speaker_embeddings_projected)
# sequence masking
if output_mask is not None:
decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs)
# B x T_out x decoder_in_features
postnet_outputs = self.postnet(decoder_outputs)
# sequence masking
if output_mask is not None:
postnet_outputs = postnet_outputs * output_mask.unsqueeze(2).expand_as(postnet_outputs)
# B x T_out x postnet_dim
postnet_outputs = self.last_linear(postnet_outputs)
# B x T_out x decoder_in_features
decoder_outputs = decoder_outputs.transpose(1, 2).contiguous()
if self.bidirectional_decoder:
decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask)
return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
if self.double_decoder_consistency:
decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass(mel_specs, encoder_outputs, alignments, input_mask)
return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
return decoder_outputs, postnet_outputs, alignments, stop_tokens
@torch.no_grad()
def inference(self, characters, speaker_ids=None, style_mel=None):
inputs = self.embedding(characters)
self._init_states()
if speaker_ids is not None:
self.compute_speaker_embedding(speaker_ids)
if self.num_speakers > 1:
inputs = self._concat_speaker_embedding(inputs,
self.speaker_embeddings)
encoder_outputs = self.encoder(inputs)
if self.gst and style_mel is not None:
encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
if self.num_speakers > 1:
encoder_outputs = self._concat_speaker_embedding(
encoder_outputs, self.speaker_embeddings)
decoder_outputs, alignments, stop_tokens = self.decoder.inference(
encoder_outputs, self.speaker_embeddings_projected)
postnet_outputs = self.postnet(decoder_outputs)
postnet_outputs = self.last_linear(postnet_outputs)
decoder_outputs = decoder_outputs.transpose(1, 2)
return decoder_outputs, postnet_outputs, alignments, stop_tokens

169
TTS/tts/models/tacotron2.py Normal file
@ -0,0 +1,169 @@
import torch
from torch import nn
from TTS.tts.layers.gst_layers import GST
from TTS.tts.layers.tacotron2 import Decoder, Encoder, Postnet
from TTS.tts.models.tacotron_abstract import TacotronAbstract
# TODO: match function arguments with tacotron
class Tacotron2(TacotronAbstract):
def __init__(self,
num_chars,
num_speakers,
r,
postnet_output_dim=80,
decoder_output_dim=80,
attn_type='original',
attn_win=False,
attn_norm="softmax",
prenet_type="original",
prenet_dropout=True,
forward_attn=False,
trans_agent=False,
forward_attn_mask=False,
location_attn=True,
attn_K=5,
separate_stopnet=True,
bidirectional_decoder=False,
double_decoder_consistency=False,
ddc_r=None,
gst=False):
super(Tacotron2,
self).__init__(num_chars, num_speakers, r, postnet_output_dim,
decoder_output_dim, attn_type, attn_win,
attn_norm, prenet_type, prenet_dropout,
forward_attn, trans_agent, forward_attn_mask,
location_attn, attn_K, separate_stopnet,
bidirectional_decoder, double_decoder_consistency,
ddc_r, gst)
decoder_in_features = 512 if num_speakers > 1 else 512
encoder_in_features = 512 if num_speakers > 1 else 512
proj_speaker_dim = 80 if num_speakers > 1 else 0
# base layers
self.embedding = nn.Embedding(num_chars, 512, padding_idx=0)
if num_speakers > 1:
self.speaker_embedding = nn.Embedding(num_speakers, 512)
self.speaker_embedding.weight.data.normal_(0, 0.3)
self.encoder = Encoder(encoder_in_features)
self.decoder = Decoder(decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win,
attn_norm, prenet_type, prenet_dropout,
forward_attn, trans_agent, forward_attn_mask,
location_attn, attn_K, separate_stopnet, proj_speaker_dim)
self.postnet = Postnet(self.postnet_output_dim)
# global style token layers
if self.gst:
gst_embedding_dim = encoder_in_features
self.gst_layer = GST(num_mel=80,
num_heads=4,
num_style_tokens=10,
embedding_dim=gst_embedding_dim)
# backward pass decoder
if self.bidirectional_decoder:
self._init_backward_decoder()
# setup DDC
if self.double_decoder_consistency:
self.coarse_decoder = Decoder(
decoder_in_features, self.decoder_output_dim, ddc_r, attn_type,
attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn,
trans_agent, forward_attn_mask, location_attn, attn_K,
separate_stopnet, proj_speaker_dim)
@staticmethod
def shape_outputs(mel_outputs, mel_outputs_postnet, alignments):
mel_outputs = mel_outputs.transpose(1, 2)
mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
return mel_outputs, mel_outputs_postnet, alignments
def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None):
self._init_states()
# compute mask for padding
# B x T_in_max (boolean)
input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
# B x D_embed x T_in_max
embedded_inputs = self.embedding(text).transpose(1, 2)
# B x T_in_max x D_en
encoder_outputs = self.encoder(embedded_inputs, text_lengths)
# adding speaker embedding to encoder output
# TODO: multi-speaker
# B x speaker_embed_dim
if speaker_ids is not None:
self.compute_speaker_embedding(speaker_ids)
if self.num_speakers > 1:
# B x T_in x embed_dim + speaker_embed_dim
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
self.speaker_embeddings)
encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
# global style token
if self.gst:
# B x gst_dim
encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
# B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r
decoder_outputs, alignments, stop_tokens = self.decoder(
encoder_outputs, mel_specs, input_mask)
# sequence masking
if mel_lengths is not None:
decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs)
# B x mel_dim x T_out
postnet_outputs = self.postnet(decoder_outputs)
postnet_outputs = decoder_outputs + postnet_outputs
# sequence masking
if output_mask is not None:
postnet_outputs = postnet_outputs * output_mask.unsqueeze(1).expand_as(postnet_outputs)
# B x T_out x mel_dim -- B x T_out x mel_dim -- B x T_out//r x T_in
decoder_outputs, postnet_outputs, alignments = self.shape_outputs(
decoder_outputs, postnet_outputs, alignments)
if self.bidirectional_decoder:
decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask)
return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
if self.double_decoder_consistency:
decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass(mel_specs, encoder_outputs, alignments, input_mask)
return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
return decoder_outputs, postnet_outputs, alignments, stop_tokens
@torch.no_grad()
def inference(self, text, speaker_ids=None):
embedded_inputs = self.embedding(text).transpose(1, 2)
encoder_outputs = self.encoder.inference(embedded_inputs)
if speaker_ids is not None:
self.compute_speaker_embedding(speaker_ids)
if self.num_speakers > 1:
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
self.speaker_embeddings)
decoder_outputs, alignments, stop_tokens = self.decoder.inference(
encoder_outputs)
postnet_outputs = self.postnet(decoder_outputs)
postnet_outputs = decoder_outputs + postnet_outputs
decoder_outputs, postnet_outputs, alignments = self.shape_outputs(
decoder_outputs, postnet_outputs, alignments)
return decoder_outputs, postnet_outputs, alignments, stop_tokens
def inference_truncated(self, text, speaker_ids=None):
"""
Preserve model states for continuous inference
"""
embedded_inputs = self.embedding(text).transpose(1, 2)
encoder_outputs = self.encoder.inference_truncated(embedded_inputs)
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
speaker_ids)
mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(
encoder_outputs)
mel_outputs_postnet = self.postnet(mel_outputs)
mel_outputs_postnet = mel_outputs + mel_outputs_postnet
mel_outputs, mel_outputs_postnet, alignments = self.shape_outputs(
mel_outputs, mel_outputs_postnet, alignments)
return mel_outputs, mel_outputs_postnet, alignments, stop_tokens
def _speaker_embedding_pass(self, encoder_outputs, speaker_ids):
# TODO: multi-speaker
# if hasattr(self, "speaker_embedding") and speaker_ids is None:
# raise RuntimeError(" [!] Model has speaker embedding layer but speaker_id is not provided")
# if hasattr(self, "speaker_embedding") and speaker_ids is not None:
# speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
# encoder_outputs.size(1),
# -1)
# encoder_outputs = encoder_outputs + speaker_embeddings
# return encoder_outputs
pass

180
TTS/tts/models/tacotron_abstract.py Normal file
@ -0,0 +1,180 @@
import copy
from abc import ABC, abstractmethod
import torch
from torch import nn
from TTS.tts.utils.generic_utils import sequence_mask
class TacotronAbstract(ABC, nn.Module):
def __init__(self,
num_chars,
num_speakers,
r,
postnet_output_dim=80,
decoder_output_dim=80,
attn_type='original',
attn_win=False,
attn_norm="softmax",
prenet_type="original",
prenet_dropout=True,
forward_attn=False,
trans_agent=False,
forward_attn_mask=False,
location_attn=True,
attn_K=5,
separate_stopnet=True,
bidirectional_decoder=False,
double_decoder_consistency=False,
ddc_r=None,
gst=False):
""" Abstract Tacotron class """
super().__init__()
self.num_chars = num_chars
self.r = r
self.decoder_output_dim = decoder_output_dim
self.postnet_output_dim = postnet_output_dim
self.gst = gst
self.num_speakers = num_speakers
self.bidirectional_decoder = bidirectional_decoder
self.double_decoder_consistency = double_decoder_consistency
self.ddc_r = ddc_r
self.attn_type = attn_type
self.attn_win = attn_win
self.attn_norm = attn_norm
self.prenet_type = prenet_type
self.prenet_dropout = prenet_dropout
self.forward_attn = forward_attn
self.trans_agent = trans_agent
self.forward_attn_mask = forward_attn_mask
self.location_attn = location_attn
self.attn_K = attn_K
self.separate_stopnet = separate_stopnet
# layers
self.embedding = None
self.encoder = None
self.decoder = None
self.postnet = None
# global style token
if self.gst:
self.gst_layer = None
# model states
self.speaker_embeddings = None
self.speaker_embeddings_projected = None
# additional layers
self.decoder_backward = None
self.coarse_decoder = None
#############################
# INIT FUNCTIONS
#############################
def _init_states(self):
self.speaker_embeddings = None
self.speaker_embeddings_projected = None
def _init_backward_decoder(self):
self.decoder_backward = copy.deepcopy(self.decoder)
def _init_coarse_decoder(self):
self.coarse_decoder = copy.deepcopy(self.decoder)
self.coarse_decoder.r_init = self.ddc_r
self.coarse_decoder.set_r(self.ddc_r)
#############################
# CORE FUNCTIONS
#############################
@abstractmethod
def forward(self):
pass
@abstractmethod
def inference(self):
pass
#############################
# COMMON COMPUTE FUNCTIONS
#############################
def compute_masks(self, text_lengths, mel_lengths):
"""Compute masks against sequence paddings."""
# B x T_in_max (boolean)
device = text_lengths.device
input_mask = sequence_mask(text_lengths).to(device)
output_mask = None
if mel_lengths is not None:
max_len = mel_lengths.max()
r = self.decoder.r
max_len = max_len + (r - (max_len % r)) if max_len % r > 0 else max_len
output_mask = sequence_mask(mel_lengths, max_len=max_len).to(device)
return input_mask, output_mask
def _backward_pass(self, mel_specs, encoder_outputs, mask):
""" Run backwards decoder """
decoder_outputs_b, alignments_b, _ = self.decoder_backward(
encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask,
self.speaker_embeddings_projected)
decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous()
return decoder_outputs_b, alignments_b
def _coarse_decoder_pass(self, mel_specs, encoder_outputs, alignments,
input_mask):
""" Double Decoder Consistency """
T = mel_specs.shape[1]
if T % self.coarse_decoder.r > 0:
padding_size = self.coarse_decoder.r - (T % self.coarse_decoder.r)
mel_specs = torch.nn.functional.pad(mel_specs,
(0, 0, 0, padding_size, 0, 0))
decoder_outputs_backward, alignments_backward, _ = self.coarse_decoder(
encoder_outputs.detach(), mel_specs, input_mask)
# scale_factor = self.decoder.r_init / self.decoder.r
alignments_backward = torch.nn.functional.interpolate(
alignments_backward.transpose(1, 2),
size=alignments.shape[1],
mode='nearest').transpose(1, 2)
decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2)
decoder_outputs_backward = decoder_outputs_backward[:, :T, :]
return decoder_outputs_backward, alignments_backward
#############################
# EMBEDDING FUNCTIONS
#############################
def compute_speaker_embedding(self, speaker_ids):
""" Compute speaker embedding vectors """
if hasattr(self, "speaker_embedding") and speaker_ids is None:
raise RuntimeError(
" [!] Model has speaker embedding layer but speaker_id is not provided"
)
if hasattr(self, "speaker_embedding") and speaker_ids is not None:
self.speaker_embeddings = self.speaker_embedding(speaker_ids).unsqueeze(1)
if hasattr(self, "speaker_project_mel") and speaker_ids is not None:
self.speaker_embeddings_projected = self.speaker_project_mel(
self.speaker_embeddings).squeeze(1)
def compute_gst(self, inputs, mel_specs):
""" Compute global style token """
# pylint: disable=not-callable
gst_outputs = self.gst_layer(mel_specs)
inputs = self._add_speaker_embedding(inputs, gst_outputs)
return inputs
@staticmethod
def _add_speaker_embedding(outputs, speaker_embeddings):
speaker_embeddings_ = speaker_embeddings.expand(
outputs.size(0), outputs.size(1), -1)
outputs = outputs + speaker_embeddings_
return outputs
@staticmethod
def _concat_speaker_embedding(outputs, speaker_embeddings):
speaker_embeddings_ = speaker_embeddings.expand(
outputs.size(0), outputs.size(1), -1)
outputs = torch.cat([outputs, speaker_embeddings_], dim=-1)
return outputs

20
TTS/tts/tf/README.md Normal file
@ -0,0 +1,20 @@
## Utilities to Convert Models to TensorFlow 2
Here are experimental utilities to convert trained Torch models to TensorFlow (>=2.2).
Converting Torch models to TF makes the whole TF toolkit available for deployment and device-specific optimizations.
Note that we do not plan to share training scripts for TensorFlow in the near future, but any contribution in that direction would be more than welcome.
To see how to run a TF model at inference, check the notebook (a minimal sketch is also given at the end of this README).
This is an experimental release. If you encounter an error, please open an issue or, even better, send a PR; but you are mostly on your own.
### Converting a Model
- Run ```convert_tacotron2_torch_to_tf.py --torch_model_path /path/to/torch/model.pth.tar --config_path /path/to/model/config.json --output_path /path/to/output/tf/model``` with the right arguments.
### Known issues and limitations
- We use a custom model load/save mechanism which lets us store model-related information together with the model weights (similar to Torch). However, it is prone to random errors.
- The current TF model implementation is slightly slower than the Torch model. Hopefully, it will get better with improving TF support for eager mode and ```tf.function```.
- The TF implementation of Tacotron2 only supports the regular Tacotron2 as in the paper.
- You can only convert models trained after the TF model implementation was added, since the model layers have been updated in the Torch model.
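### Loading a converted model (illustrative sketch)
The notebook remains the reference; the snippet below is only a minimal sketch of how the utilities shipped in this package could be wired together at inference time (assuming `load_config` from `TTS.tts.utils.io` for reading the training config). The file names (`config.json`, `tf_model.pkl`), the character count and the toy input ids are placeholders, not values from this repository.

```python
import tensorflow as tf

from TTS.tts.utils.io import load_config
from TTS.tts.tf.utils.generic_utils import setup_model
from TTS.tts.tf.utils.io import load_checkpoint

c = load_config("config.json")                          # training config of the converted model (assumed path)
model = setup_model(num_chars=61, num_speakers=0, c=c)  # num_chars must match your character set
model.build_inference()                                 # dummy forward pass to build the weights
model = load_checkpoint(model, "tf_model.pkl")          # checkpoint written by the converter (assumed path)

char_ids = tf.convert_to_tensor([[3, 7, 12, 5, 9]], dtype=tf.int32)  # toy batch of character ids
decoder_frames, postnet_frames, attentions, stop_tokens = model(char_ids, training=False)
```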

0
TTS/tts/tf/__init__.py Normal file
285
TTS/tts/tf/layers/common_layers.py Normal file
@ -0,0 +1,285 @@
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.ops import math_ops
# from tensorflow_addons.seq2seq import BahdanauAttention
class Linear(keras.layers.Layer):
def __init__(self, units, use_bias, **kwargs):
super(Linear, self).__init__(**kwargs)
self.linear_layer = keras.layers.Dense(units, use_bias=use_bias, name='linear_layer')
self.activation = keras.layers.ReLU()
def call(self, x):
"""
shapes:
x: B x T x C
"""
return self.activation(self.linear_layer(x))
class LinearBN(keras.layers.Layer):
def __init__(self, units, use_bias, **kwargs):
super(LinearBN, self).__init__(**kwargs)
self.linear_layer = keras.layers.Dense(units, use_bias=use_bias, name='linear_layer')
self.batch_normalization = keras.layers.BatchNormalization(axis=-1, momentum=0.90, epsilon=1e-5, name='batch_normalization')
self.activation = keras.layers.ReLU()
def call(self, x, training=None):
"""
shapes:
x: B x T x C
"""
out = self.linear_layer(x)
out = self.batch_normalization(out, training=training)
return self.activation(out)
class Prenet(keras.layers.Layer):
def __init__(self,
prenet_type,
prenet_dropout,
units,
bias,
**kwargs):
super(Prenet, self).__init__(**kwargs)
self.prenet_type = prenet_type
self.prenet_dropout = prenet_dropout
self.linear_layers = []
if prenet_type == "bn":
self.linear_layers += [LinearBN(unit, use_bias=bias, name=f'linear_layer_{idx}') for idx, unit in enumerate(units)]
elif prenet_type == "original":
self.linear_layers += [Linear(unit, use_bias=bias, name=f'linear_layer_{idx}') for idx, unit in enumerate(units)]
else:
raise RuntimeError(' [!] Unknown prenet type.')
if prenet_dropout:
self.dropout = keras.layers.Dropout(rate=0.5)
def call(self, x, training=None):
"""
shapes:
x: B x T x C
"""
for linear in self.linear_layers:
if self.prenet_dropout:
x = self.dropout(linear(x), training=training)
else:
x = linear(x)
return x
def _sigmoid_norm(score):
attn_weights = tf.nn.sigmoid(score)
attn_weights = attn_weights / tf.reduce_sum(attn_weights, axis=1, keepdims=True)
return attn_weights
class Attention(keras.layers.Layer):
"""TODO: implement forward_attention
TODO: location sensitive attention
TODO: implement attention windowing """
def __init__(self, attn_dim, use_loc_attn, loc_attn_n_filters,
loc_attn_kernel_size, use_windowing, norm, use_forward_attn,
use_trans_agent, use_forward_attn_mask, **kwargs):
super(Attention, self).__init__(**kwargs)
self.use_loc_attn = use_loc_attn
self.loc_attn_n_filters = loc_attn_n_filters
self.loc_attn_kernel_size = loc_attn_kernel_size
self.use_windowing = use_windowing
self.norm = norm
self.use_forward_attn = use_forward_attn
self.use_trans_agent = use_trans_agent
self.use_forward_attn_mask = use_forward_attn_mask
self.query_layer = tf.keras.layers.Dense(attn_dim, use_bias=False, name='query_layer/linear_layer')
self.inputs_layer = tf.keras.layers.Dense(attn_dim, use_bias=False, name=f'{self.name}/inputs_layer/linear_layer')
self.v = tf.keras.layers.Dense(1, use_bias=True, name='v/linear_layer')
if use_loc_attn:
self.location_conv1d = keras.layers.Conv1D(
filters=loc_attn_n_filters,
kernel_size=loc_attn_kernel_size,
padding='same',
use_bias=False,
name='location_layer/location_conv1d')
self.location_dense = keras.layers.Dense(attn_dim, use_bias=False, name='location_layer/location_dense')
if norm == 'softmax':
self.norm_func = tf.nn.softmax
elif norm == 'sigmoid':
self.norm_func = _sigmoid_norm
else:
raise ValueError("Unknown value for attention norm type")
def init_states(self, batch_size, value_length):
states = []
if self.use_loc_attn:
attention_cum = tf.zeros([batch_size, value_length])
attention_old = tf.zeros([batch_size, value_length])
states = [attention_cum, attention_old]
if self.use_forward_attn:
alpha = tf.concat([
tf.ones([batch_size, 1]),
tf.zeros([batch_size, value_length])[:, :-1] + 1e-7
], 1)
states.append(alpha)
return tuple(states)
def process_values(self, values):
""" cache values for decoder iterations """
#pylint: disable=attribute-defined-outside-init
self.processed_values = self.inputs_layer(values)
self.values = values
def get_loc_attn(self, query, states):
""" compute location attention, query layer and
unnorm. attention weights"""
attention_cum, attention_old = states[:2]
attn_cat = tf.stack([attention_old, attention_cum], axis=2)
processed_query = self.query_layer(tf.expand_dims(query, 1))
processed_attn = self.location_dense(self.location_conv1d(attn_cat))
score = self.v(
tf.nn.tanh(self.processed_values + processed_query +
processed_attn))
score = tf.squeeze(score, axis=2)
return score, processed_query
def get_attn(self, query):
""" compute query layer and unnormalized attention weights """
processed_query = self.query_layer(tf.expand_dims(query, 1))
score = self.v(tf.nn.tanh(self.processed_values + processed_query))
score = tf.squeeze(score, axis=2)
return score, processed_query
def apply_score_masking(self, score, mask): #pylint: disable=no-self-use
""" ignore sequence paddings """
padding_mask = tf.expand_dims(math_ops.logical_not(mask), 2)
# Bias so padding positions do not contribute to attention distribution.
score -= 1.e9 * math_ops.cast(padding_mask, dtype=tf.float32)
return score
def apply_forward_attention(self, alignment, alpha): #pylint: disable=no-self-use
# forward attention
fwd_shifted_alpha = tf.pad(alpha[:, :-1], ((0, 0), (1, 0)), constant_values=0.0)
# compute transition potentials
new_alpha = ((1 - 0.5) * alpha + 0.5 * fwd_shifted_alpha + 1e-8) * alignment
# renormalize attention weights
new_alpha = new_alpha / tf.reduce_sum(new_alpha, axis=1, keepdims=True)
return new_alpha
def update_states(self, old_states, scores_norm, attn_weights, new_alpha=None):
states = []
if self.use_loc_attn:
states = [old_states[0] + scores_norm, attn_weights]
if self.use_forward_attn:
states.append(new_alpha)
return tuple(states)
def call(self, query, states):
"""
shapes:
query: B x D
"""
if self.use_loc_attn:
score, _ = self.get_loc_attn(query, states)
else:
score, _ = self.get_attn(query)
# TODO: masking
# if mask is not None:
# self.apply_score_masking(score, mask)
# attn_weights shape == (batch_size, max_length, 1)
# normalize attention scores
scores_norm = self.norm_func(score)
attn_weights = scores_norm
# apply forward attention
new_alpha = None
if self.use_forward_attn:
new_alpha = self.apply_forward_attention(attn_weights, states[-1])
attn_weights = new_alpha
# update states tuple
# states = (cum_attn_weights, attn_weights, new_alpha)
states = self.update_states(states, scores_norm, attn_weights, new_alpha)
# context_vector shape after sum == (batch_size, hidden_size)
context_vector = tf.matmul(tf.expand_dims(attn_weights, axis=2), self.values, transpose_a=True, transpose_b=False)
context_vector = tf.squeeze(context_vector, axis=1)
return context_vector, attn_weights, states
# def _location_sensitive_score(processed_query, keys, processed_loc, attention_v, attention_b):
# dtype = processed_query.dtype
# num_units = keys.shape[-1].value or array_ops.shape(keys)[-1]
# return tf.reduce_sum(attention_v * tf.tanh(keys + processed_query + processed_loc + attention_b), [2])
# class LocationSensitiveAttention(BahdanauAttention):
# def __init__(self,
# units,
# memory=None,
# memory_sequence_length=None,
# normalize=False,
# probability_fn="softmax",
# kernel_initializer="glorot_uniform",
# dtype=None,
# name="LocationSensitiveAttention",
# location_attention_filters=32,
# location_attention_kernel_size=31):
# super(LocationSensitiveAttention,
# self).__init__(units=units,
# memory=memory,
# memory_sequence_length=memory_sequence_length,
# normalize=normalize,
# probability_fn='softmax', ## parent module default
# kernel_initializer=kernel_initializer,
# dtype=dtype,
# name=name)
# if probability_fn == 'sigmoid':
# self.probability_fn = lambda score, _: self._sigmoid_normalization(score)
# self.location_conv = keras.layers.Conv1D(filters=location_attention_filters, kernel_size=location_attention_kernel_size, padding='same', use_bias=False)
# self.location_dense = keras.layers.Dense(units, use_bias=False)
# # self.v = keras.layers.Dense(1, use_bias=True)
# def _location_sensitive_score(self, processed_query, keys, processed_loc):
# processed_query = tf.expand_dims(processed_query, 1)
# return tf.reduce_sum(self.attention_v * tf.tanh(keys + processed_query + processed_loc), [2])
# def _location_sensitive(self, alignment_cum, alignment_old):
# alignment_cat = tf.stack([alignment_cum, alignment_old], axis=2)
# return self.location_dense(self.location_conv(alignment_cat))
# def _sigmoid_normalization(self, score):
# return tf.nn.sigmoid(score) / tf.reduce_sum(tf.nn.sigmoid(score), axis=-1, keepdims=True)
# # def _apply_masking(self, score, mask):
# # padding_mask = tf.expand_dims(math_ops.logical_not(mask), 2)
# # # Bias so padding positions do not contribute to attention distribution.
# # score -= 1.e9 * math_ops.cast(padding_mask, dtype=tf.float32)
# # return score
# def _calculate_attention(self, query, state):
# alignment_cum, alignment_old = state[:2]
# processed_query = self.query_layer(
# query) if self.query_layer else query
# processed_loc = self._location_sensitive(alignment_cum, alignment_old)
# score = self._location_sensitive_score(
# processed_query,
# self.keys,
# processed_loc)
# alignment = self.probability_fn(score, state)
# alignment_cum = alignment_cum + alignment
# state[0] = alignment_cum
# state[1] = alignment
# return alignment, state
# def compute_context(self, alignments):
# expanded_alignments = tf.expand_dims(alignments, 1)
# context = tf.matmul(expanded_alignments, self.values)
# context = tf.squeeze(context, [1])
# return context
# # def call(self, query, state):
# # alignment, next_state = self._calculate_attention(query, state)
# # return alignment, next_state

300
TTS/tts/tf/layers/tacotron2.py Normal file
@ -0,0 +1,300 @@
import tensorflow as tf
from tensorflow import keras
from TTS.tts.tf.utils.tf_utils import shape_list
from TTS.tts.tf.layers.common_layers import Prenet, Attention
# from tensorflow_addons.seq2seq import AttentionWrapper
class ConvBNBlock(keras.layers.Layer):
def __init__(self, filters, kernel_size, activation, **kwargs):
super(ConvBNBlock, self).__init__(**kwargs)
self.convolution1d = keras.layers.Conv1D(filters, kernel_size, padding='same', name='convolution1d')
self.batch_normalization = keras.layers.BatchNormalization(axis=2, momentum=0.90, epsilon=1e-5, name='batch_normalization')
self.dropout = keras.layers.Dropout(rate=0.5, name='dropout')
self.activation = keras.layers.Activation(activation, name='activation')
def call(self, x, training=None):
o = self.convolution1d(x)
o = self.batch_normalization(o, training=training)
o = self.activation(o)
o = self.dropout(o, training=training)
return o
class Postnet(keras.layers.Layer):
def __init__(self, output_filters, num_convs, **kwargs):
super(Postnet, self).__init__(**kwargs)
self.convolutions = []
self.convolutions.append(ConvBNBlock(512, 5, 'tanh', name='convolutions_0'))
for idx in range(1, num_convs - 1):
self.convolutions.append(ConvBNBlock(512, 5, 'tanh', name=f'convolutions_{idx}'))
self.convolutions.append(ConvBNBlock(output_filters, 5, 'linear', name=f'convolutions_{idx+1}'))
def call(self, x, training=None):
o = x
for layer in self.convolutions:
o = layer(o, training=training)
return o
class Encoder(keras.layers.Layer):
def __init__(self, output_input_dim, **kwargs):
super(Encoder, self).__init__(**kwargs)
self.convolutions = []
for idx in range(3):
self.convolutions.append(ConvBNBlock(output_input_dim, 5, 'relu', name=f'convolutions_{idx}'))
self.lstm = keras.layers.Bidirectional(keras.layers.LSTM(output_input_dim // 2, return_sequences=True, use_bias=True), name='lstm')
def call(self, x, training=None):
o = x
for layer in self.convolutions:
o = layer(o, training=training)
o = self.lstm(o)
return o
class Decoder(keras.layers.Layer):
#pylint: disable=unused-argument
def __init__(self, frame_dim, r, attn_type, use_attn_win, attn_norm, prenet_type,
prenet_dropout, use_forward_attn, use_trans_agent, use_forward_attn_mask,
use_location_attn, attn_K, separate_stopnet, speaker_emb_dim, enable_tflite, **kwargs):
super(Decoder, self).__init__(**kwargs)
self.frame_dim = frame_dim
self.r_init = tf.constant(r, dtype=tf.int32)
self.r = tf.constant(r, dtype=tf.int32)
self.output_dim = r * self.frame_dim
self.separate_stopnet = separate_stopnet
self.enable_tflite = enable_tflite
# layer constants
self.max_decoder_steps = tf.constant(1000, dtype=tf.int32)
self.stop_thresh = tf.constant(0.5, dtype=tf.float32)
# model dimensions
self.query_dim = 1024
self.decoder_rnn_dim = 1024
self.prenet_dim = 256
self.attn_dim = 128
self.p_attention_dropout = 0.1
self.p_decoder_dropout = 0.1
self.prenet = Prenet(prenet_type,
prenet_dropout,
[self.prenet_dim, self.prenet_dim],
bias=False,
name='prenet')
self.attention_rnn = keras.layers.LSTMCell(self.query_dim, use_bias=True, name='attention_rnn', )
self.attention_rnn_dropout = keras.layers.Dropout(0.5)
# TODO: implement other attn options
self.attention = Attention(attn_dim=self.attn_dim,
use_loc_attn=True,
loc_attn_n_filters=32,
loc_attn_kernel_size=31,
use_windowing=False,
norm=attn_norm,
use_forward_attn=use_forward_attn,
use_trans_agent=use_trans_agent,
use_forward_attn_mask=use_forward_attn_mask,
name='attention')
self.decoder_rnn = keras.layers.LSTMCell(self.decoder_rnn_dim, use_bias=True, name='decoder_rnn')
self.decoder_rnn_dropout = keras.layers.Dropout(0.5)
self.linear_projection = keras.layers.Dense(self.frame_dim * r, name='linear_projection/linear_layer')
self.stopnet = keras.layers.Dense(1, name='stopnet/linear_layer')
def set_max_decoder_steps(self, new_max_steps):
self.max_decoder_steps = tf.constant(new_max_steps, dtype=tf.int32)
def set_r(self, new_r):
self.r = tf.constant(new_r, dtype=tf.int32)
self.output_dim = self.frame_dim * new_r
def build_decoder_initial_states(self, batch_size, memory_dim, memory_length):
zero_frame = tf.zeros([batch_size, self.frame_dim])
zero_context = tf.zeros([batch_size, memory_dim])
attention_rnn_state = self.attention_rnn.get_initial_state(batch_size=batch_size, dtype=tf.float32)
decoder_rnn_state = self.decoder_rnn.get_initial_state(batch_size=batch_size, dtype=tf.float32)
attention_states = self.attention.init_states(batch_size, memory_length)
return zero_frame, zero_context, attention_rnn_state, decoder_rnn_state, attention_states
def step(self, prenet_next, states,
memory_seq_length=None, training=None):
_, context_next, attention_rnn_state, decoder_rnn_state, attention_states = states
attention_rnn_input = tf.concat([prenet_next, context_next], -1)
attention_rnn_output, attention_rnn_state = \
self.attention_rnn(attention_rnn_input,
attention_rnn_state, training=training)
attention_rnn_output = self.attention_rnn_dropout(attention_rnn_output, training=training)
context, attention, attention_states = self.attention(attention_rnn_output, attention_states, training=training)
decoder_rnn_input = tf.concat([attention_rnn_output, context], -1)
decoder_rnn_output, decoder_rnn_state = \
self.decoder_rnn(decoder_rnn_input, decoder_rnn_state, training=training)
decoder_rnn_output = self.decoder_rnn_dropout(decoder_rnn_output, training=training)
linear_projection_input = tf.concat([decoder_rnn_output, context], -1)
output_frame = self.linear_projection(linear_projection_input, training=training)
stopnet_input = tf.concat([decoder_rnn_output, output_frame], -1)
stopnet_output = self.stopnet(stopnet_input, training=training)
output_frame = output_frame[:, :self.r * self.frame_dim]
states = (output_frame[:, self.frame_dim * (self.r - 1):], context, attention_rnn_state, decoder_rnn_state, attention_states)
return output_frame, stopnet_output, states, attention
def decode(self, memory, states, frames, memory_seq_length=None):
B, _, _ = shape_list(memory)
num_iter = shape_list(frames)[1] // self.r
# init states
frame_zero = tf.expand_dims(states[0], 1)
frames = tf.concat([frame_zero, frames], axis=1)
outputs = tf.TensorArray(dtype=tf.float32, size=num_iter)
attentions = tf.TensorArray(dtype=tf.float32, size=num_iter)
stop_tokens = tf.TensorArray(dtype=tf.float32, size=num_iter)
# pre-computes
self.attention.process_values(memory)
prenet_output = self.prenet(frames, training=True)
step_count = tf.constant(0, dtype=tf.int32)
def _body(step, memory, prenet_output, states, outputs, stop_tokens, attentions):
prenet_next = prenet_output[:, step]
output, stop_token, states, attention = self.step(prenet_next,
states,
memory_seq_length)
outputs = outputs.write(step, output)
attentions = attentions.write(step, attention)
stop_tokens = stop_tokens.write(step, stop_token)
return step + 1, memory, prenet_output, states, outputs, stop_tokens, attentions
_, memory, _, states, outputs, stop_tokens, attentions = \
tf.while_loop(lambda *arg: True,
_body,
loop_vars=(step_count, memory, prenet_output,
states, outputs, stop_tokens, attentions),
parallel_iterations=32,
swap_memory=True,
maximum_iterations=num_iter)
outputs = outputs.stack()
attentions = attentions.stack()
stop_tokens = stop_tokens.stack()
outputs = tf.transpose(outputs, [1, 0, 2])
attentions = tf.transpose(attentions, [1, 0, 2])
stop_tokens = tf.transpose(stop_tokens, [1, 0, 2])
stop_tokens = tf.squeeze(stop_tokens, axis=2)
outputs = tf.reshape(outputs, [B, -1, self.frame_dim])
return outputs, stop_tokens, attentions
def decode_inference(self, memory, states):
B, _, _ = shape_list(memory)
# init states
outputs = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True)
attentions = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True)
stop_tokens = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True)
# pre-computes
self.attention.process_values(memory)
# iter vars
stop_flag = tf.constant(False, dtype=tf.bool)
step_count = tf.constant(0, dtype=tf.int32)
def _body(step, memory, states, outputs, stop_tokens, attentions, stop_flag):
frame_next = states[0]
prenet_next = self.prenet(frame_next, training=False)
output, stop_token, states, attention = self.step(prenet_next,
states,
None,
training=False)
stop_token = tf.math.sigmoid(stop_token)
outputs = outputs.write(step, output)
attentions = attentions.write(step, attention)
stop_tokens = stop_tokens.write(step, stop_token)
stop_flag = tf.greater(stop_token, self.stop_thresh)
stop_flag = tf.reduce_all(stop_flag)
return step + 1, memory, states, outputs, stop_tokens, attentions, stop_flag
cond = lambda step, m, s, o, st, a, stop_flag: tf.equal(stop_flag, tf.constant(False, dtype=tf.bool))
_, memory, states, outputs, stop_tokens, attentions, stop_flag = \
tf.while_loop(cond,
_body,
loop_vars=(step_count, memory, states, outputs,
stop_tokens, attentions, stop_flag),
parallel_iterations=32,
swap_memory=True,
maximum_iterations=self.max_decoder_steps)
outputs = outputs.stack()
attentions = attentions.stack()
stop_tokens = stop_tokens.stack()
outputs = tf.transpose(outputs, [1, 0, 2])
attentions = tf.transpose(attentions, [1, 0, 2])
stop_tokens = tf.transpose(stop_tokens, [1, 0, 2])
stop_tokens = tf.squeeze(stop_tokens, axis=2)
outputs = tf.reshape(outputs, [B, -1, self.frame_dim])
return outputs, stop_tokens, attentions
def decode_inference_tflite(self, memory, states):
"""Inference with TF-Lite compatibility. It assumes
batch_size is 1"""
# init states
# dynamic_shape is not supported in TFLite
outputs = tf.TensorArray(dtype=tf.float32,
size=self.max_decoder_steps,
element_shape=tf.TensorShape(
[self.output_dim]),
clear_after_read=False,
dynamic_size=False)
# stop_flags = tf.TensorArray(dtype=tf.bool,
# size=self.max_decoder_steps,
# element_shape=tf.TensorShape(
# []),
# clear_after_read=False,
# dynamic_size=False)
attentions = ()
stop_tokens = ()
# pre-computes
self.attention.process_values(memory)
# iter vars
stop_flag = tf.constant(False, dtype=tf.bool)
step_count = tf.constant(0, dtype=tf.int32)
def _body(step, memory, states, outputs, stop_flag):
frame_next = states[0]
prenet_next = self.prenet(frame_next, training=False)
output, stop_token, states, _ = self.step(prenet_next,
states,
None,
training=False)
stop_token = tf.math.sigmoid(stop_token)
stop_flag = tf.greater(stop_token, self.stop_thresh)
stop_flag = tf.reduce_all(stop_flag)
# stop_flags = stop_flags.write(step, tf.logical_not(stop_flag))
outputs = outputs.write(step, tf.reshape(output, [-1]))
return step + 1, memory, states, outputs, stop_flag
cond = lambda step, m, s, o, stop_flag: tf.equal(stop_flag, tf.constant(False, dtype=tf.bool))
step_count, memory, states, outputs, stop_flag = \
tf.while_loop(cond,
_body,
loop_vars=(step_count, memory, states, outputs,
stop_flag),
parallel_iterations=32,
swap_memory=True,
maximum_iterations=self.max_decoder_steps)
outputs = outputs.stack()
outputs = tf.gather(outputs, tf.range(step_count)) # pylint: disable=no-value-for-parameter
outputs = tf.expand_dims(outputs, axis=[0])
outputs = tf.transpose(outputs, [1, 0, 2])
outputs = tf.reshape(outputs, [1, -1, self.frame_dim])
return outputs, stop_tokens, attentions
def call(self, memory, states, frames=None, memory_seq_length=None, training=False):
if training:
return self.decode(memory, states, frames, memory_seq_length)
if self.enable_tflite:
return self.decode_inference_tflite(memory, states)
return self.decode_inference(memory, states)

108
TTS/tts/tf/models/tacotron2.py Normal file
@ -0,0 +1,108 @@
import tensorflow as tf
from tensorflow import keras
from TTS.tts.tf.layers.tacotron2 import Encoder, Decoder, Postnet
from TTS.tts.tf.utils.tf_utils import shape_list
#pylint: disable=too-many-ancestors
class Tacotron2(keras.models.Model):
def __init__(self,
num_chars,
num_speakers,
r,
postnet_output_dim=80,
decoder_output_dim=80,
attn_type='original',
attn_win=False,
attn_norm="softmax",
attn_K=4,
prenet_type="original",
prenet_dropout=True,
forward_attn=False,
trans_agent=False,
forward_attn_mask=False,
location_attn=True,
separate_stopnet=True,
bidirectional_decoder=False,
enable_tflite=False):
super(Tacotron2, self).__init__()
self.r = r
self.decoder_output_dim = decoder_output_dim
self.postnet_output_dim = postnet_output_dim
self.bidirectional_decoder = bidirectional_decoder
self.num_speakers = num_speakers
self.speaker_embed_dim = 256
self.enable_tflite = enable_tflite
self.embedding = keras.layers.Embedding(num_chars, 512, name='embedding')
self.encoder = Encoder(512, name='encoder')
# TODO: most of the decoder args have no use at the moment
self.decoder = Decoder(decoder_output_dim,
r,
attn_type=attn_type,
use_attn_win=attn_win,
attn_norm=attn_norm,
prenet_type=prenet_type,
prenet_dropout=prenet_dropout,
use_forward_attn=forward_attn,
use_trans_agent=trans_agent,
use_forward_attn_mask=forward_attn_mask,
use_location_attn=location_attn,
attn_K=attn_K,
separate_stopnet=separate_stopnet,
speaker_emb_dim=self.speaker_embed_dim,
name='decoder',
enable_tflite=enable_tflite)
self.postnet = Postnet(postnet_output_dim, 5, name='postnet')
@tf.function(experimental_relax_shapes=True)
def call(self, characters, text_lengths=None, frames=None, training=None):
if training:
return self.training(characters, text_lengths, frames)
if not training:
return self.inference(characters)
raise RuntimeError(' [!] Set model training mode True or False')
def training(self, characters, text_lengths, frames):
B, T = shape_list(characters)
embedding_vectors = self.embedding(characters, training=True)
encoder_output = self.encoder(embedding_vectors, training=True)
decoder_states = self.decoder.build_decoder_initial_states(B, 512, T)
decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, frames, text_lengths, training=True)
postnet_frames = self.postnet(decoder_frames, training=True)
output_frames = decoder_frames + postnet_frames
return decoder_frames, output_frames, attentions, stop_tokens
def inference(self, characters):
B, T = shape_list(characters)
embedding_vectors = self.embedding(characters, training=False)
encoder_output = self.encoder(embedding_vectors, training=False)
decoder_states = self.decoder.build_decoder_initial_states(B, 512, T)
decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, training=False)
postnet_frames = self.postnet(decoder_frames, training=False)
output_frames = decoder_frames + postnet_frames
print(output_frames.shape)
return decoder_frames, output_frames, attentions, stop_tokens
@tf.function(
experimental_relax_shapes=True,
input_signature=[
tf.TensorSpec([1, None], dtype=tf.int32),
],)
def inference_tflite(self, characters):
B, T = shape_list(characters)
embedding_vectors = self.embedding(characters, training=False)
encoder_output = self.encoder(embedding_vectors, training=False)
decoder_states = self.decoder.build_decoder_initial_states(B, 512, T)
decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, training=False)
postnet_frames = self.postnet(decoder_frames, training=False)
output_frames = decoder_frames + postnet_frames
print(output_frames.shape)
return decoder_frames, output_frames, attentions, stop_tokens
def build_inference(self, ):
# TODO: issue https://github.com/PyCQA/pylint/issues/3613
input_ids = tf.random.uniform(shape=[1, 4], maxval=10, dtype=tf.int32) #pylint: disable=unexpected-keyword-arg
self(input_ids)

@ -0,0 +1,81 @@
import numpy as np
import tensorflow as tf
def tf_create_dummy_inputs():
""" Create dummy inputs for TF Tacotron2 model """
batch_size = 4
max_input_length = 32
max_mel_length = 128
pad = 1
n_chars = 24
input_ids = tf.random.uniform([batch_size, max_input_length + pad], maxval=n_chars, dtype=tf.int32)
input_lengths = np.random.randint(0, high=max_input_length+1 + pad, size=[batch_size])
input_lengths[-1] = max_input_length
input_lengths = tf.convert_to_tensor(input_lengths, dtype=tf.int32)
mel_outputs = tf.random.uniform(shape=[batch_size, max_mel_length + pad, 80])
mel_lengths = np.random.randint(0, high=max_mel_length+1 + pad, size=[batch_size])
mel_lengths[-1] = max_mel_length
mel_lengths = tf.convert_to_tensor(mel_lengths, dtype=tf.int32)
return input_ids, input_lengths, mel_outputs, mel_lengths
def compare_torch_tf(torch_tensor, tf_tensor):
""" Compute the average absolute difference b/w torch and tf tensors """
return abs(torch_tensor.detach().numpy() - tf_tensor.numpy()).mean()
def convert_tf_name(tf_name):
""" Convert certain patterns in TF layer names to Torch patterns """
tf_name_tmp = tf_name
tf_name_tmp = tf_name_tmp.replace(':0', '')
tf_name_tmp = tf_name_tmp.replace('/forward_lstm/lstm_cell_1/recurrent_kernel', '/weight_hh_l0')
tf_name_tmp = tf_name_tmp.replace('/forward_lstm/lstm_cell_2/kernel', '/weight_ih_l1')
tf_name_tmp = tf_name_tmp.replace('/recurrent_kernel', '/weight_hh')
tf_name_tmp = tf_name_tmp.replace('/kernel', '/weight')
tf_name_tmp = tf_name_tmp.replace('/gamma', '/weight')
tf_name_tmp = tf_name_tmp.replace('/beta', '/bias')
tf_name_tmp = tf_name_tmp.replace('/', '.')
return tf_name_tmp
def transfer_weights_torch_to_tf(tf_vars, var_map_dict, state_dict):
""" Transfer weigths from torch state_dict to TF variables """
print(" > Passing weights from Torch to TF ...")
for tf_var in tf_vars:
torch_var_name = var_map_dict[tf_var.name]
print(f' | > {tf_var.name} <-- {torch_var_name}')
# if tuple, it is a bias variable
if not isinstance(torch_var_name, tuple):
torch_layer_name = '.'.join(torch_var_name.split('.')[-2:])
torch_weight = state_dict[torch_var_name]
if 'convolution1d/kernel' in tf_var.name or 'conv1d/kernel' in tf_var.name:
# out_dim, in_dim, filter -> filter, in_dim, out_dim
numpy_weight = torch_weight.permute([2, 1, 0]).detach().cpu().numpy()
elif 'lstm_cell' in tf_var.name and 'kernel' in tf_var.name:
numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy()
# if variable is for bidirectional lstm and it is a bias vector there
# needs to be pre-defined two matching torch bias vectors
elif '_lstm/lstm_cell_' in tf_var.name and 'bias' in tf_var.name:
bias_vectors = [value for key, value in state_dict.items() if key in torch_var_name]
assert len(bias_vectors) == 2
numpy_weight = bias_vectors[0] + bias_vectors[1]
elif 'rnn' in tf_var.name and 'kernel' in tf_var.name:
numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy()
elif 'rnn' in tf_var.name and 'bias' in tf_var.name:
bias_vectors = [value for key, value in state_dict.items() if torch_var_name[:-2] in key]
assert len(bias_vectors) == 2
numpy_weight = bias_vectors[0] + bias_vectors[1]
elif 'linear_layer' in torch_layer_name and 'weight' in torch_var_name:
numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy()
else:
numpy_weight = torch_weight.detach().cpu().numpy()
assert np.all(tf_var.shape == numpy_weight.shape), f" [!] weight shapes do not match: {tf_var.name} vs {torch_var_name} --> {tf_var.shape} vs {numpy_weight.shape}"
tf.keras.backend.set_value(tf_var, numpy_weight)
return tf_vars
def load_tf_vars(model_tf, tf_vars):
for tf_var in tf_vars:
model_tf.get_layer(tf_var.name).set_weights(tf_var)
return model_tf
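# Illustrative sketch of the calling convention of the helpers above; it is not
# the conversion script (convert_tacotron2_torch_to_tf.py). The 'model' key of
# the Torch checkpoint, the already-built TF model and a name map built purely
# from convert_tf_name() are assumptions: the real mapping needs extra manual
# entries for layers whose names do not translate one-to-one.
def _example_weight_transfer(model_tf, torch_checkpoint_path):
    """Example only: transfer Torch weights into a built TF Tacotron2 `model_tf`."""
    import torch  # only needed for this example
    state_dict = torch.load(torch_checkpoint_path, map_location="cpu")["model"]
    # map every TF variable name to a candidate Torch parameter name
    var_map = {tf_var.name: convert_tf_name(tf_var.name) for tf_var in model_tf.weights}
    tf_vars = transfer_weights_torch_to_tf(model_tf.weights, var_map, state_dict)
    # spot-check one tensor pair after the transfer
    print(compare_torch_tf(state_dict["embedding.weight"], model_tf.embedding.weights[0]))
    return tf_vars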

@ -0,0 +1,104 @@
import os
import datetime
import importlib
import pickle
import numpy as np
import tensorflow as tf
def save_checkpoint(model, optimizer, current_step, epoch, r, output_path, **kwargs):
state = {
'model': model.weights,
'optimizer': optimizer,
'step': current_step,
'epoch': epoch,
'date': datetime.date.today().strftime("%B %d, %Y"),
'r': r
}
state.update(kwargs)
pickle.dump(state, open(output_path, 'wb'))
def load_checkpoint(model, checkpoint_path):
checkpoint = pickle.load(open(checkpoint_path, 'rb'))
chkp_var_dict = {var.name: var.numpy() for var in checkpoint['model']}
tf_vars = model.weights
for tf_var in tf_vars:
layer_name = tf_var.name
try:
chkp_var_value = chkp_var_dict[layer_name]
except KeyError:
class_name = list(chkp_var_dict.keys())[0].split("/")[0]
layer_name = f"{class_name}/{layer_name}"
chkp_var_value = chkp_var_dict[layer_name]
tf.keras.backend.set_value(tf_var, chkp_var_value)
if 'r' in checkpoint.keys():
model.decoder.set_r(checkpoint['r'])
return model
def sequence_mask(sequence_length, max_len=None):
"""TF equivalent of the Torch sequence_mask helper."""
if max_len is None:
max_len = tf.reduce_max(sequence_length)
# B x T_max boolean mask, True at valid (non-padded) positions
return tf.sequence_mask(sequence_length, maxlen=max_len, dtype=tf.bool)
# @tf.custom_gradient
def check_gradient(x, grad_clip):
x_normed = tf.clip_by_norm(x, grad_clip)
grad_norm = tf.norm(x)  # norm of the incoming gradient, not of the clip constant
return x_normed, grad_norm
def count_parameters(model, c):
try:
return model.count_params()
except RuntimeError:
input_dummy = tf.convert_to_tensor(np.random.rand(8, 128).astype('int32'))
input_lengths = np.random.randint(100, 129, (8, ))
input_lengths[-1] = 128
input_lengths = tf.convert_to_tensor(input_lengths.astype('int32'))
mel_spec = np.random.rand(8, 2 * c.r,
c.audio['num_mels']).astype('float32')
mel_spec = tf.convert_to_tensor(mel_spec)
speaker_ids = np.random.randint(
0, 5, (8, )) if c.use_speaker_embedding else None
_ = model(input_dummy, input_lengths, mel_spec, speaker_ids=speaker_ids)
return model.count_params()
def setup_model(num_chars, num_speakers, c, enable_tflite=False):
print(" > Using model: {}".format(c.model))
MyModel = importlib.import_module('TTS.tts.tf.models.' + c.model.lower())
MyModel = getattr(MyModel, c.model)
if c.model.lower() == "tacotron":
raise NotImplementedError(' [!] Tacotron model is not ready.')
# tacotron2
model = MyModel(num_chars=num_chars,
num_speakers=num_speakers,
r=c.r,
postnet_output_dim=c.audio['num_mels'],
decoder_output_dim=c.audio['num_mels'],
attn_type=c.attention_type,
attn_win=c.windowing,
attn_norm=c.attention_norm,
prenet_type=c.prenet_type,
prenet_dropout=c.prenet_dropout,
forward_attn=c.use_forward_attn,
trans_agent=c.transition_agent,
forward_attn_mask=c.forward_attn_mask,
location_attn=c.location_attn,
attn_K=c.attention_heads,
separate_stopnet=c.separate_stopnet,
bidirectional_decoder=c.bidirectional_decoder,
enable_tflite=enable_tflite)
return model

42
TTS/tts/tf/utils/io.py Normal file
Просмотреть файл

@ -0,0 +1,42 @@
import pickle
import datetime
import tensorflow as tf
def save_checkpoint(model, optimizer, current_step, epoch, r, output_path, **kwargs):
state = {
'model': model.weights,
'optimizer': optimizer,
'step': current_step,
'epoch': epoch,
'date': datetime.date.today().strftime("%B %d, %Y"),
'r': r
}
state.update(kwargs)
pickle.dump(state, open(output_path, 'wb'))
def load_checkpoint(model, checkpoint_path):
checkpoint = pickle.load(open(checkpoint_path, 'rb'))
chkp_var_dict = {var.name: var.numpy() for var in checkpoint['model']}
tf_vars = model.weights
for tf_var in tf_vars:
layer_name = tf_var.name
try:
chkp_var_value = chkp_var_dict[layer_name]
except KeyError:
class_name = list(chkp_var_dict.keys())[0].split("/")[0]
layer_name = f"{class_name}/{layer_name}"
chkp_var_value = chkp_var_dict[layer_name]
tf.keras.backend.set_value(tf_var, chkp_var_value)
if 'r' in checkpoint.keys():
model.decoder.set_r(checkpoint['r'])
return model
def load_tflite_model(tflite_path):
tflite_model = tf.lite.Interpreter(model_path=tflite_path)
tflite_model.allocate_tensors()
return tflite_model

Просмотреть файл

@ -0,0 +1,8 @@
import tensorflow as tf
def shape_list(x):
"""Deal with dynamic shape in tensorflow cleanly."""
static = x.shape.as_list()
dynamic = tf.shape(x)
return [dynamic[i] if s is None else s for i, s in enumerate(static)]
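# Illustrative usage sketch (not part of the original file): with a concrete tensor every
# dimension is static, so shape_list returns plain Python ints; inside a traced
# @tf.function, dimensions whose static size is None come back as entries of tf.shape(x).
_example = tf.zeros([4, 80, 100])
assert shape_list(_example) == [4, 80, 100]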

Просмотреть файл

@ -0,0 +1,31 @@
import tensorflow as tf
def convert_tacotron2_to_tflite(model,
output_path=None,
experimental_converter=True):
"""Convert Tensorflow Tacotron2 model to TFLite. Save a binary file if output_path is
provided, else return TFLite model."""
concrete_function = model.inference_tflite.get_concrete_function()
converter = tf.lite.TFLiteConverter.from_concrete_functions(
[concrete_function])
converter.experimental_new_converter = experimental_converter
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = [
tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS
]
tflite_model = converter.convert()
print(f'Tflite Model size is {len(tflite_model) / (1024.0 * 1024.0)} MBs.')
if output_path is not None:
# save the model binary if output_path is provided
with open(output_path, 'wb') as f:
f.write(tflite_model)
return None
return tflite_model
def load_tflite_model(tflite_path):
tflite_model = tf.lite.Interpreter(model_path=tflite_path)
tflite_model.allocate_tensors()
return tflite_model
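# Illustrative usage sketch (not part of the original file): export a TF Tacotron2 built
# with `enable_tflite=True` and load it back with the TFLite interpreter. The output
# path is an assumption.
def example_export_tflite(model_tf, out_path="tacotron2.tflite"):
    convert_tacotron2_to_tflite(model_tf, output_path=out_path)
    return load_tflite_model(out_path)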

Просмотреть файл

Просмотреть файл

@ -0,0 +1,95 @@
import datetime
from TTS.utils.io import AttrDict
tcolors = AttrDict({
'OKBLUE': '\033[94m',
'HEADER': '\033[95m',
'OKGREEN': '\033[92m',
'WARNING': '\033[93m',
'FAIL': '\033[91m',
'ENDC': '\033[0m',
'BOLD': '\033[1m',
'UNDERLINE': '\033[4m'
})
class ConsoleLogger():
def __init__(self):
# TODO: color code for value changes
# use these to compare values between iterations
self.old_train_loss_dict = None
self.old_epoch_loss_dict = None
self.old_eval_loss_dict = None
# pylint: disable=no-self-use
def get_time(self):
now = datetime.datetime.now()
return now.strftime("%Y-%m-%d %H:%M:%S")
def print_epoch_start(self, epoch, max_epoch):
print("\n{}{} > EPOCH: {}/{}{}".format(tcolors.UNDERLINE, tcolors.BOLD,
epoch, max_epoch, tcolors.ENDC),
flush=True)
def print_train_start(self):
print(f"\n{tcolors.BOLD} > TRAINING ({self.get_time()}) {tcolors.ENDC}")
def print_train_step(self, batch_steps, step, global_step, avg_spec_length,
avg_text_length, step_time, loader_time, lr,
loss_dict, avg_loss_dict):
indent = " | > "
print()
log_text = "{} --> STEP: {}/{} -- GLOBAL_STEP: {}{}\n".format(
tcolors.BOLD, step, batch_steps, global_step, tcolors.ENDC)
for key, value in loss_dict.items():
# print the avg value if given
if f'avg_{key}' in avg_loss_dict.keys():
log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}'])
else:
log_text += "{}{}: {:.5f} \n".format(indent, key, value)
log_text += f"{indent}avg_spec_len: {avg_spec_length}\n{indent}avg_text_len: {avg_text_length}\n{indent}"\
f"step_time: {step_time:.2f}\n{indent}loader_time: {loader_time:.2f}\n{indent}lr: {lr:.5f}"
print(log_text, flush=True)
# pylint: disable=unused-argument
def print_train_epoch_end(self, global_step, epoch, epoch_time,
print_dict):
indent = " | > "
log_text = f"\n{tcolors.BOLD} --> TRAIN PERFORMANCE -- EPOCH TIME: {epoch_time:.2f} sec -- GLOBAL_STEP: {global_step}{tcolors.ENDC}\n"
for key, value in print_dict.items():
log_text += "{}{}: {:.5f}\n".format(indent, key, value)
print(log_text, flush=True)
def print_eval_start(self):
print(f"{tcolors.BOLD} > EVALUATION {tcolors.ENDC}\n")
def print_eval_step(self, step, loss_dict, avg_loss_dict):
indent = " | > "
print()
log_text = f"{tcolors.BOLD} --> STEP: {step}{tcolors.ENDC}\n"
for key, value in loss_dict.items():
# print the avg value if given
if f'avg_{key}' in avg_loss_dict.keys():
log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}'])
else:
log_text += "{}{}: {:.5f} \n".format(indent, key, value)
print(log_text, flush=True)
def print_epoch_end(self, epoch, avg_loss_dict):
indent = " | > "
log_text = " {}--> EVAL PERFORMANCE{}\n".format(
tcolors.BOLD, tcolors.ENDC)
for key, value in avg_loss_dict.items():
# print the avg value if given
color = tcolors.FAIL
sign = '+'
diff = 0
if self.old_eval_loss_dict is not None:
diff = value - self.old_eval_loss_dict[key]
if diff <= 0:
color = tcolors.OKGREEN
sign = ''
log_text += "{}{}:{} {:.5f} {}({}{:.5f})\n".format(indent, key, color, value, tcolors.ENDC, sign, diff)
self.old_eval_loss_dict = avg_loss_dict
print(log_text, flush=True)

77
TTS/tts/utils/data.py Normal file
Просмотреть файл

@ -0,0 +1,77 @@
import numpy as np
def _pad_data(x, length):
_pad = 0
assert x.ndim == 1
return np.pad(
x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)
def prepare_data(inputs):
max_len = max((len(x) for x in inputs))
return np.stack([_pad_data(x, max_len) for x in inputs])
def _pad_tensor(x, length):
_pad = 0.
assert x.ndim == 2
x = np.pad(
x, [[0, 0], [0, length - x.shape[1]]],
mode='constant',
constant_values=_pad)
return x
def prepare_tensor(inputs, out_steps):
max_len = max((x.shape[1] for x in inputs))
remainder = max_len % out_steps
pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
return np.stack([_pad_tensor(x, pad_len) for x in inputs])
def _pad_stop_target(x, length):
_pad = 1.  # stop targets are padded with 1 so frames past the end count as "stop"
assert x.ndim == 1
return np.pad(
x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)
def prepare_stop_target(inputs, out_steps):
""" Pad row vectors with 1. """
max_len = max((x.shape[0] for x in inputs))
remainder = max_len % out_steps
pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
return np.stack([_pad_stop_target(x, pad_len) for x in inputs])
def pad_per_step(inputs, pad_len):
return np.pad(
inputs, [[0, 0], [0, 0], [0, pad_len]],
mode='constant',
constant_values=0.0)
# pylint: disable=attribute-defined-outside-init
class StandardScaler():
def set_stats(self, mean, scale):
self.mean_ = mean
self.scale_ = scale
def reset_stats(self):
delattr(self, 'mean_')
delattr(self, 'scale_')
def transform(self, X):
X = np.asarray(X)
X -= self.mean_
X /= self.scale_
return X
def inverse_transform(self, X):
X = np.asarray(X)
X *= self.scale_
X += self.mean_
return X
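# Illustrative usage sketch (not part of the original file); numpy is already imported
# above as np. Pads a toy batch and normalizes mel frames with batch statistics.
_tokens = [np.array([1, 2, 3]), np.array([4, 5])]
_padded_tokens = prepare_data(_tokens)                     # shape (2, 3)
_mels = [np.random.rand(80, 13), np.random.rand(80, 17)]
_padded_mels = prepare_tensor(_mels, out_steps=7)          # time axis padded 17 -> 21
_scaler = StandardScaler()
_scaler.set_stats(_padded_mels.mean(axis=(0, 2)), _padded_mels.std(axis=(0, 2)) + 1e-8)
_normed = _scaler.transform(_padded_mels[0].T)             # frames x num_mels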

120
TTS/tts/utils/distribute.py Normal file
Просмотреть файл

@ -0,0 +1,120 @@
# edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py
import os, sys
import math
import time
import subprocess
import argparse
import torch
import torch.distributed as dist
from torch.utils.data.sampler import Sampler
from torch.autograd import Variable
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from TTS.utils.generic_utils import create_experiment_folder
class DistributedSampler(Sampler):
"""
Non shuffling Distributed Sampler
"""
def __init__(self, dataset, num_replicas=None, rank=None):
super(DistributedSampler, self).__init__(dataset)
if num_replicas is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
num_replicas = dist.get_world_size()
if rank is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
rank = dist.get_rank()
self.dataset = dataset
self.num_replicas = num_replicas
self.rank = rank
self.epoch = 0
self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
self.total_size = self.num_samples * self.num_replicas
def __iter__(self):
indices = torch.arange(len(self.dataset)).tolist()
# add extra samples to make it evenly divisible
indices += indices[:(self.total_size - len(indices))]
assert len(indices) == self.total_size
# subsample
indices = indices[self.rank:self.total_size:self.num_replicas]
assert len(indices) == self.num_samples
return iter(indices)
def __len__(self):
return self.num_samples
def set_epoch(self, epoch):
self.epoch = epoch
def reduce_tensor(tensor, num_gpus):
rt = tensor.clone()
dist.all_reduce(rt, op=dist.reduce_op.SUM)
rt /= num_gpus
return rt
def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url):
assert torch.cuda.is_available(), "Distributed mode requires CUDA."
# Set cuda device so everything is done on the right GPU.
torch.cuda.set_device(rank % torch.cuda.device_count())
# Initialize distributed communication
dist.init_process_group(
dist_backend,
init_method=dist_url,
world_size=num_gpus,
rank=rank,
group_name=group_name)
def apply_gradient_allreduce(module):
# sync model parameters
for p in module.state_dict().values():
if not torch.is_tensor(p):
continue
dist.broadcast(p, 0)
def allreduce_params():
if module.needs_reduction:
module.needs_reduction = False
# bucketing params based on value types
buckets = {}
for param in module.parameters():
if param.requires_grad and param.grad is not None:
tp = type(param.data)
if tp not in buckets:
buckets[tp] = []
buckets[tp].append(param)
for tp in buckets:
bucket = buckets[tp]
grads = [param.grad.data for param in bucket]
coalesced = _flatten_dense_tensors(grads)
dist.all_reduce(coalesced, op=dist.reduce_op.SUM)
coalesced /= dist.get_world_size()
for buf, synced in zip(
grads, _unflatten_dense_tensors(coalesced, grads)):
buf.copy_(synced)
for param in list(module.parameters()):
def allreduce_hook(*_):
Variable._execution_engine.queue_callback(allreduce_params)
if param.requires_grad:
param.register_hook(allreduce_hook)
def set_needs_reduction(self, *_):
self.needs_reduction = True
module.register_forward_hook(set_needs_reduction)
return module
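# Illustrative usage sketch (not part of the original file): wiring the helpers above
# into a single-node multi-GPU worker. The backend, URL and batch size are assumptions.
def example_worker(rank, num_gpus, dataset, model):
    from torch.utils.data import DataLoader
    init_distributed(rank, num_gpus, group_name="tts",
                     dist_backend="nccl", dist_url="tcp://localhost:54321")
    sampler = DistributedSampler(dataset, num_replicas=num_gpus, rank=rank)
    loader = DataLoader(dataset, batch_size=32, sampler=sampler)
    model = apply_gradient_allreduce(model.cuda())
    return loader, model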

Просмотреть файл

@ -0,0 +1,34 @@
import torch
import numpy as np
from torch.nn.utils.rnn import pad_sequence
class DurationCalculator():
def calculate_durations(self, att_ws, ilens, olens):
"""calculate duration from given alignment matrices"""
durations = [self._calculate_duration(att_w, ilen, olen) for att_w, ilen, olen in zip(att_ws, ilens, olens)]
return pad_sequence(durations, batch_first=True)
@staticmethod
def _calculate_duration(att_w, ilen, olen):
'''
att_w : decoder_steps x encoder_steps (a single alignment map)
'''
durations = torch.stack([att_w[:olen, :ilen].argmax(-1).eq(i).sum() for i in range(ilen)])
return durations
def calculate_scores(self, att_ws, ilens, olens, k):
    """calculate attention scores per duration step"""
    scores = [self._calculate_scores(att_w, ilen, olen, k) for att_w, ilen, olen in zip(att_ws, ilens, olens)]
    return pad_sequence(scores, batch_first=True)
@staticmethod
def _calculate_scores(att_w, ilen, olen, k):
# which input is attended for each output
scores = [None] * ilen
values, idxs = att_w[:olen, :ilen].max(-1)
for i in range(ilen):
vals = values[torch.where(idxs == i)]
scores[i] = vals
scores = [torch.nn.functional.pad(score, (0, k - score.shape[0])) for score in scores]
return torch.stack(scores)
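# Illustrative usage sketch (not part of the original file): extract per-character
# durations from a batch of attention maps; the shapes below are arbitrary examples.
_att_ws = torch.rand(2, 50, 12)        # batch x decoder_steps x encoder_steps
_ilens, _olens = [12, 9], [50, 41]     # valid character / frame counts per sample
_durations = DurationCalculator().calculate_durations(_att_ws, _ilens, _olens)
# _durations has shape (2, 12); each row sums to the matching entry of _olens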

Просмотреть файл

@ -0,0 +1,257 @@
import os
import glob
import torch
import shutil
import datetime
import subprocess
import importlib
import numpy as np
from collections import Counter
from TTS.utils.generic_utils import check_argument
def split_dataset(items):
is_multi_speaker = False
speakers = [item[-1] for item in items]
is_multi_speaker = len(set(speakers)) > 1
eval_split_size = 500 if len(items) * 0.01 > 500 else int(
len(items) * 0.01)
assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples."
np.random.seed(0)
np.random.shuffle(items)
if is_multi_speaker:
items_eval = []
# most stupid code ever -- Fix it !
while len(items_eval) < eval_split_size:
speakers = [item[-1] for item in items]
speaker_counter = Counter(speakers)
item_idx = np.random.randint(0, len(items))
if speaker_counter[items[item_idx][-1]] > 1:
items_eval.append(items[item_idx])
del items[item_idx]
return items_eval, items
return items[:eval_split_size], items[eval_split_size:]
# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
def sequence_mask(sequence_length, max_len=None):
if max_len is None:
max_len = sequence_length.data.max()
batch_size = sequence_length.size(0)
seq_range = torch.arange(0, max_len).long()
seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
if sequence_length.is_cuda:
seq_range_expand = seq_range_expand.to(sequence_length.device)
seq_length_expand = (
sequence_length.unsqueeze(1).expand_as(seq_range_expand))
# B x T_max
return seq_range_expand < seq_length_expand
def setup_model(num_chars, num_speakers, c):
print(" > Using model: {}".format(c.model))
MyModel = importlib.import_module('TTS.tts.models.' + c.model.lower())
MyModel = getattr(MyModel, c.model)
if c.model.lower() == "tacotron":
model = MyModel(num_chars=num_chars,
num_speakers=num_speakers,
r=c.r,
postnet_output_dim=int(c.audio['fft_size'] / 2 + 1),
decoder_output_dim=c.audio['num_mels'],
gst=c.use_gst,
memory_size=c.memory_size,
attn_type=c.attention_type,
attn_win=c.windowing,
attn_norm=c.attention_norm,
prenet_type=c.prenet_type,
prenet_dropout=c.prenet_dropout,
forward_attn=c.use_forward_attn,
trans_agent=c.transition_agent,
forward_attn_mask=c.forward_attn_mask,
location_attn=c.location_attn,
attn_K=c.attention_heads,
separate_stopnet=c.separate_stopnet,
bidirectional_decoder=c.bidirectional_decoder,
double_decoder_consistency=c.double_decoder_consistency,
ddc_r=c.ddc_r)
elif c.model.lower() == "tacotron2":
model = MyModel(num_chars=num_chars,
num_speakers=num_speakers,
r=c.r,
postnet_output_dim=c.audio['num_mels'],
decoder_output_dim=c.audio['num_mels'],
gst=c.use_gst,
attn_type=c.attention_type,
attn_win=c.windowing,
attn_norm=c.attention_norm,
prenet_type=c.prenet_type,
prenet_dropout=c.prenet_dropout,
forward_attn=c.use_forward_attn,
trans_agent=c.transition_agent,
forward_attn_mask=c.forward_attn_mask,
location_attn=c.location_attn,
attn_K=c.attention_heads,
separate_stopnet=c.separate_stopnet,
bidirectional_decoder=c.bidirectional_decoder,
double_decoder_consistency=c.double_decoder_consistency,
ddc_r=c.ddc_r)
return model
class KeepAverage():
def __init__(self):
self.avg_values = {}
self.iters = {}
def __getitem__(self, key):
return self.avg_values[key]
def items(self):
return self.avg_values.items()
def add_value(self, name, init_val=0, init_iter=0):
self.avg_values[name] = init_val
self.iters[name] = init_iter
def update_value(self, name, value, weighted_avg=False):
if name not in self.avg_values:
# add value if not exist before
self.add_value(name, init_val=value)
else:
# else update existing value
if weighted_avg:
self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value
self.iters[name] += 1
else:
self.avg_values[name] = self.avg_values[name] * \
self.iters[name] + value
self.iters[name] += 1
self.avg_values[name] /= self.iters[name]
def add_values(self, name_dict):
for key, value in name_dict.items():
self.add_value(key, init_val=value)
def update_values(self, value_dict):
for key, value in value_dict.items():
self.update_value(key, value)
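# Illustrative usage sketch (not part of the original file): running averages over steps.
_stats = KeepAverage()
_stats.add_values({'avg_loss': 0.0, 'avg_step_time': 0.0})
for _loss, _step_time in [(1.0, 0.5), (0.8, 0.4), (0.6, 0.3)]:
    _stats.update_values({'avg_loss': _loss, 'avg_step_time': _step_time})
assert abs(_stats['avg_loss'] - 0.8) < 1e-8   # plain mean of the three losses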
def check_config(c):
check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str)
check_argument('run_name', c, restricted=True, val_type=str)
check_argument('run_description', c, val_type=str)
# AUDIO
check_argument('audio', c, restricted=True, val_type=dict)
# audio processing parameters
check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056)
check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058)
check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000)
check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length')
check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length')
check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1)
check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10)
check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000)
check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5)
check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000)
# vocabulary parameters
check_argument('characters', c, restricted=False, val_type=dict)
check_argument('pad', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
check_argument('eos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
check_argument('bos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
check_argument('characters', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
check_argument('phonemes', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
check_argument('punctuations', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
# normalization parameters
check_argument('signal_norm', c['audio'], restricted=True, val_type=bool)
check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool)
check_argument('max_norm', c['audio'], restricted=True, val_type=float, min_val=0.1, max_val=1000)
check_argument('clip_norm', c['audio'], restricted=True, val_type=bool)
check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000)
check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0)
check_argument('spec_gain', c['audio'], restricted=True, val_type=float, min_val=1, max_val=100)
check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool)
check_argument('trim_db', c['audio'], restricted=True, val_type=int)
# training parameters
check_argument('batch_size', c, restricted=True, val_type=int, min_val=1)
check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1)
check_argument('r', c, restricted=True, val_type=int, min_val=1)
check_argument('gradual_training', c, restricted=False, val_type=list)
check_argument('loss_masking', c, restricted=True, val_type=bool)
# check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100)
# validation parameters
check_argument('run_eval', c, restricted=True, val_type=bool)
check_argument('test_delay_epochs', c, restricted=True, val_type=int, min_val=0)
check_argument('test_sentences_file', c, restricted=False, val_type=str)
# optimizer
check_argument('noam_schedule', c, restricted=False, val_type=bool)
check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0)
check_argument('epochs', c, restricted=True, val_type=int, min_val=1)
check_argument('lr', c, restricted=True, val_type=float, min_val=0)
check_argument('wd', c, restricted=True, val_type=float, min_val=0)
check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0)
check_argument('seq_len_norm', c, restricted=True, val_type=bool)
# tacotron prenet
check_argument('memory_size', c, restricted=True, val_type=int, min_val=-1)
check_argument('prenet_type', c, restricted=True, val_type=str, enum_list=['original', 'bn'])
check_argument('prenet_dropout', c, restricted=True, val_type=bool)
# attention
check_argument('attention_type', c, restricted=True, val_type=str, enum_list=['graves', 'original'])
check_argument('attention_heads', c, restricted=True, val_type=int)
check_argument('attention_norm', c, restricted=True, val_type=str, enum_list=['sigmoid', 'softmax'])
check_argument('windowing', c, restricted=True, val_type=bool)
check_argument('use_forward_attn', c, restricted=True, val_type=bool)
check_argument('forward_attn_mask', c, restricted=True, val_type=bool)
check_argument('transition_agent', c, restricted=True, val_type=bool)
check_argument('location_attn', c, restricted=True, val_type=bool)
check_argument('bidirectional_decoder', c, restricted=True, val_type=bool)
check_argument('double_decoder_consistency', c, restricted=True, val_type=bool)
check_argument('ddc_r', c, restricted='double_decoder_consistency' in c.keys(), min_val=1, max_val=7, val_type=int)
# stopnet
check_argument('stopnet', c, restricted=True, val_type=bool)
check_argument('separate_stopnet', c, restricted=True, val_type=bool)
# tensorboard
check_argument('print_step', c, restricted=True, val_type=int, min_val=1)
check_argument('tb_plot_step', c, restricted=True, val_type=int, min_val=1)
check_argument('save_step', c, restricted=True, val_type=int, min_val=1)
check_argument('checkpoint', c, restricted=True, val_type=bool)
check_argument('tb_model_param_stats', c, restricted=True, val_type=bool)
# dataloading
# pylint: disable=import-outside-toplevel
from TTS.tts.utils.text import cleaners
check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=dir(cleaners))
check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool)
check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0)
check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0)
check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0)
check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0)
check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10)
# paths
check_argument('output_path', c, restricted=True, val_type=str)
# multi-speaker gst
check_argument('use_speaker_embedding', c, restricted=True, val_type=bool)
check_argument('style_wav_for_test', c, restricted=True, val_type=str)
check_argument('use_gst', c, restricted=True, val_type=bool)
# datasets - checking only the first entry
check_argument('datasets', c, restricted=True, val_type=list)
for dataset_entry in c['datasets']:
check_argument('name', dataset_entry, restricted=True, val_type=str)
check_argument('path', dataset_entry, restricted=True, val_type=str)
check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str)
check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str)

Просмотреть файл

@ -0,0 +1,177 @@
import torch
import librosa
import soundfile as sf
import numpy as np
import scipy.io
import scipy.signal
from TTS.tts.utils.stft_torch import STFT
class AudioProcessor(object):
def __init__(self,
sample_rate=None,
num_mels=None,
frame_shift_ms=None,
frame_length_ms=None,
hop_length=None,
win_length=None,
num_freq=None,
power=None,
mel_fmin=None,
mel_fmax=None,
griffin_lim_iters=None,
do_trim_silence=False,
trim_db=60,
sound_norm=False,
use_cuda=False,
**_):
print(" > Setting up Torch based Audio Processor...")
# setup class attributes
self.sample_rate = sample_rate
self.num_mels = num_mels
self.frame_shift_ms = frame_shift_ms
self.frame_length_ms = frame_length_ms
self.num_freq = num_freq
self.power = power
self.griffin_lim_iters = griffin_lim_iters
self.mel_fmin = mel_fmin or 0
self.mel_fmax = mel_fmax
self.do_trim_silence = do_trim_silence
self.trim_db = trim_db
self.sound_norm = sound_norm
# setup stft parameters
if hop_length is None:
self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
else:
self.hop_length = hop_length
self.win_length = win_length
self.n_fft = (self.num_freq - 1) * 2
members = vars(self)
# print class attributes
for key, value in members.items():
print(" | > {}:{}".format(key, value))
# create spectrogram utils
self.mel_basis = torch.from_numpy(self._build_mel_basis()).float()
self.inv_mel_basis = torch.from_numpy(np.linalg.pinv(self._build_mel_basis())).float()
self.stft = STFT(filter_length=self.n_fft, hop_length=self.hop_length, win_length=self.win_length,
window='hann', padding_mode='constant', use_cuda=use_cuda)
### setting up the parameters ###
def _build_mel_basis(self):
if self.mel_fmax is not None:
assert self.mel_fmax <= self.sample_rate // 2
return librosa.filters.mel(
self.sample_rate,
self.n_fft,
n_mels=self.num_mels,
fmin=self.mel_fmin,
fmax=self.mel_fmax)
def _stft_parameters(self, ):
"""Compute necessary stft parameters with given time values"""
n_fft = (self.num_freq - 1) * 2
factor = self.frame_length_ms / self.frame_shift_ms
assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
win_length = int(hop_length * factor)
return n_fft, hop_length, win_length
### DB and AMP conversion ###
def amp_to_db(self, x):
return torch.log10(torch.clamp(x, min=1e-5))
def db_to_amp(self, x):
return torch.pow(10.0, x)
### SPECTROGRAM ###
def linear_to_mel(self, spectrogram):
return torch.matmul(self.mel_basis, spectrogram)
def mel_to_linear(self, mel_spec):
    return torch.clamp(torch.matmul(self.inv_mel_basis, mel_spec), min=1e-10)
def spectrogram(self, y):
''' Compute spectrograms
Args:
y (Tensor): audio signal. (B x T)
'''
M, P = self.stft.transform(y)
return self.amp_to_db(M)
def melspectrogram(self, y):
''' Compute mel-spectrograms
Args:
y (Tensor): audio signal. (B x T)
'''
M, P = self.stft.transform(y)
return self.amp_to_db(self.linear_to_mel(M))
### INV SPECTROGRAM ###
def inv_spectrogram(self, S):
    """Convert a dB linear spectrogram back to a waveform with Griffin-Lim"""
S = self.db_to_amp(S)
return self.griffin_lim(S**self.power)
def inv_melspectrogram(self, S):
    '''Convert a dB mel spectrogram back to a waveform with Griffin-Lim'''
S = self.db_to_amp(S)
S = self.mel_to_linear(S) # Convert back to linear
return self.griffin_lim(S**self.power)
def out_linear_to_mel(self, linear_spec):
    # this processor applies no spectrogram normalization, so only dB/amp and mel conversions are needed
    S = self.db_to_amp(linear_spec)
    S = self.linear_to_mel(S)
    S = self.amp_to_db(S)
    return S
def griffin_lim(self, S):
"""
PARAMS
------
magnitudes: spectrogram magnitudes
"""
angles = np.angle(np.exp(2j * np.pi * np.random.rand(*S.size())))
angles = angles.astype(np.float32)
angles = torch.from_numpy(angles)
signal = self.stft.inverse(S, angles).squeeze(1)
for _ in range(self.griffin_lim_iters):
_, angles = self.stft.transform(signal)
signal = self.stft.inverse(S, angles).squeeze(1)
return signal
### Audio processing ###
def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
window_length = int(self.sample_rate * min_silence_sec)
hop_length = int(window_length / 4)
threshold = 10 ** (threshold_db / 20)  # dB threshold to linear amplitude
for x in range(hop_length, len(wav) - window_length, hop_length):
if np.max(wav[x:x + window_length]) < threshold:
return x + hop_length
return len(wav)
def trim_silence(self, wav):
""" Trim silent parts with a threshold and 0.01 sec margin """
margin = int(self.sample_rate * 0.01)
wav = wav[margin:-margin]
return librosa.effects.trim(
wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[0]
def sound_norm(self, x):
return x / abs(x).max() * 0.9
### SAVE and LOAD ###
def load_wav(self, filename, sr=None):
if sr is None:
x, sr = sf.read(filename)
else:
x, sr = librosa.load(filename, sr=sr)
return x
def save_wav(self, wav, path):
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16))
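# Illustrative usage sketch (not part of the original file): a round trip through the
# Torch processor; the STFT settings and the wav path are assumptions for the example.
def example_griffin_lim(wav_path="sample.wav"):
    ap = AudioProcessor(sample_rate=22050, num_mels=80, num_freq=513,
                        hop_length=256, win_length=1024,
                        power=1.5, griffin_lim_iters=30)
    wav = torch.from_numpy(ap.load_wav(wav_path)).float().unsqueeze(0)  # B x T
    spec = ap.spectrogram(wav)           # B x 513 x frames, log10 amplitude
    wav_hat = ap.inv_spectrogram(spec)   # Griffin-Lim reconstruction
    ap.save_wav(wav_hat[0].numpy(), "sample_gl.wav")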

45
TTS/tts/utils/io.py Normal file
Просмотреть файл

@ -0,0 +1,45 @@
import os
import torch
import datetime
def load_checkpoint(model, checkpoint_path, use_cuda=False):
state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(state['model'])
if use_cuda:
model.cuda()
# set model stepsize
if 'r' in state.keys():
model.decoder.set_r(state['r'])
return model, state
def save_model(model, optimizer, current_step, epoch, r, output_path, **kwargs):
new_state_dict = model.state_dict()
state = {
'model': new_state_dict,
'optimizer': optimizer.state_dict() if optimizer is not None else None,
'step': current_step,
'epoch': epoch,
'date': datetime.date.today().strftime("%B %d, %Y"),
'r': r
}
state.update(kwargs)
torch.save(state, output_path)
def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **kwargs):
file_name = 'checkpoint_{}.pth.tar'.format(current_step)
checkpoint_path = os.path.join(output_folder, file_name)
print(" > CHECKPOINT : {}".format(checkpoint_path))
save_model(model, optimizer, current_step, epoch, r, checkpoint_path, **kwargs)
def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoch, r, output_folder, **kwargs):
if target_loss < best_loss:
file_name = 'best_model.pth.tar'
checkpoint_path = os.path.join(output_folder, file_name)
print(" > BEST MODEL : {}".format(checkpoint_path))
save_model(model, optimizer, current_step, epoch, r, checkpoint_path, model_loss=target_loss, **kwargs)
best_loss = target_loss
return best_loss
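# Illustrative usage sketch (not part of the original file): typical checkpoint
# bookkeeping in a training loop; the paths and the loss value are assumptions.
def example_checkpointing(model, optimizer, output_folder="output/"):
    model, state = load_checkpoint(model, output_folder + "checkpoint_10000.pth.tar")
    best = save_best_model(0.42, float("inf"), model, optimizer,
                           state['step'], state['epoch'], state['r'], output_folder)
    return model, best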

18
TTS/tts/utils/measures.py Normal file
Просмотреть файл

@ -0,0 +1,18 @@
import torch
def alignment_diagonal_score(alignments, binary=False):
"""
Compute how diagonal alignment predictions are. It is useful
to measure the alignment consistency of a model
Args:
alignments (torch.Tensor): batch of alignments.
binary (bool): if True, ignore scores and consider attention
as a binary mask.
Shape:
alignments : batch x decoder_steps x encoder_steps
"""
maxs = alignments.max(dim=1)[0]
if binary:
maxs[maxs > 0] = 1
return maxs.mean(dim=1).mean(dim=0).item()
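# Illustrative usage sketch (not part of the original file): a perfectly diagonal
# (one-hot) alignment scores 1.0, a uniform one scores 1 / encoder_steps.
_diag = torch.eye(10).unsqueeze(0)            # 1 x decoder_steps x encoder_steps
assert alignment_diagonal_score(_diag) == 1.0
_uniform = torch.full((1, 10, 10), 0.1)
assert abs(alignment_diagonal_score(_uniform) - 0.1) < 1e-6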

31
TTS/tts/utils/speakers.py Normal file
Просмотреть файл

@ -0,0 +1,31 @@
import os
import json
from TTS.tts.datasets.preprocess import get_preprocessor_by_name
def make_speakers_json_path(out_path):
"""Returns conventional speakers.json location."""
return os.path.join(out_path, "speakers.json")
def load_speaker_mapping(out_path):
"""Loads speaker mapping if already present."""
try:
with open(make_speakers_json_path(out_path)) as f:
return json.load(f)
except FileNotFoundError:
return {}
def save_speaker_mapping(out_path, speaker_mapping):
"""Saves speaker mapping if not yet present."""
speakers_json_path = make_speakers_json_path(out_path)
with open(speakers_json_path, "w") as f:
json.dump(speaker_mapping, f, indent=4)
def get_speakers(items):
"""Returns a sorted, unique list of speakers in a given dataset."""
speakers = {e[2] for e in items}
return sorted(speakers)

247
TTS/tts/utils/stft_torch.py Normal file
Просмотреть файл

@ -0,0 +1,247 @@
"""
BSD 3-Clause License
Copyright (c) 2017, Prem Seetharaman
All rights reserved.
* Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import torch
import numpy as np
import torch.nn.functional as F
from torch.autograd import Variable
from scipy.signal import get_window
from librosa.util import pad_center, tiny, normalize
from librosa.filters import mel as librosa_mel_fn
def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
n_fft=800, dtype=np.float32, norm=None):
"""
# from librosa 0.6
Compute the sum-square envelope of a window function at a given hop length.
This is used to estimate modulation effects induced by windowing
observations in short-time fourier transforms.
Parameters
----------
window : string, tuple, number, callable, or list-like
Window specification, as in `get_window`
n_frames : int > 0
The number of analysis frames
hop_length : int > 0
The number of samples to advance between frames
win_length : [optional]
The length of the window function. By default, this matches `n_fft`.
n_fft : int > 0
The length of each analysis frame.
dtype : np.dtype
The data type of the output
Returns
-------
wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
The sum-squared envelope of the window function
"""
if win_length is None:
win_length = n_fft
n = n_fft + hop_length * (n_frames - 1)
x = np.zeros(n, dtype=dtype)
# Compute the squared window at the desired length
win_sq = get_window(window, win_length, fftbins=True)
win_sq = normalize(win_sq, norm=norm)**2
win_sq = pad_center(win_sq, n_fft)
# Fill the envelope
for i in range(n_frames):
sample = i * hop_length
x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
return x
def amp_to_db(x):
o = 20 * torch.log10(torch.clamp(x, min=1e-5))
return o
def db_to_amp(x):
o = torch.pow(10.0, x * 0.05)  # inverse of 20 * log10(x)
return o
class STFT(torch.nn.Module):
"""adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
def __init__(self, filter_length=800, hop_length=200, win_length=800,
window='hann', padding_mode='reflect', use_cuda=False):
super(STFT, self).__init__()
self.filter_length = filter_length
self.hop_length = hop_length
self.win_length = win_length
self.window = window
self.padding_mode = padding_mode
self.use_cuda = use_cuda
self.forward_transform = None
scale = self.filter_length / self.hop_length
fourier_basis = np.fft.fft(np.eye(self.filter_length))
cutoff = int((self.filter_length / 2 + 1))
fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
np.imag(fourier_basis[:cutoff, :])])
forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
inverse_basis = torch.FloatTensor(
np.linalg.pinv(scale * fourier_basis).T[:, None, :])
if window is not None:
assert(filter_length >= win_length)
# get window and zero center pad it to filter_length
fft_window = get_window(window, win_length, fftbins=True)
fft_window = pad_center(fft_window, filter_length)
fft_window = torch.from_numpy(fft_window).float()
# window the bases
forward_basis *= fft_window
inverse_basis *= fft_window
self.register_buffer('forward_basis', forward_basis.float())
self.register_buffer('inverse_basis', inverse_basis.float())
def transform(self, input_data):
num_batches = input_data.size(0)
num_samples = input_data.size(1)
self.num_samples = num_samples
# similar to librosa, reflect-pad the input
input_data = input_data.view(num_batches, 1, num_samples)
input_data = F.pad(
input_data.unsqueeze(1),
(int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
mode=self.padding_mode)
input_data = input_data.squeeze(1)
# https://github.com/NVIDIA/tacotron2/issues/125
if self.use_cuda:
forward_transform = F.conv1d(
input_data.cuda(),
Variable(self.forward_basis, requires_grad=False).cuda(),
stride=self.hop_length,
padding=0).cpu()
else:
forward_transform = F.conv1d(
input_data,
Variable(self.forward_basis, requires_grad=False),
stride=self.hop_length,
padding=0)
cutoff = int((self.filter_length / 2) + 1)
real_part = forward_transform[:, :cutoff, :]
imag_part = forward_transform[:, cutoff:, :]
magnitude = torch.sqrt(real_part**2 + imag_part**2)
phase = torch.autograd.Variable(
torch.atan2(imag_part.data, real_part.data))
return magnitude, phase
def inverse(self, magnitude, phase):
recombine_magnitude_phase = torch.cat(
[magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)
inverse_transform = F.conv_transpose1d(
recombine_magnitude_phase,
Variable(self.inverse_basis, requires_grad=False),
stride=self.hop_length,
padding=0)
if self.window is not None:
window_sum = window_sumsquare(
self.window, magnitude.size(-1), hop_length=self.hop_length,
win_length=self.win_length, n_fft=self.filter_length,
dtype=np.float32)
# remove modulation effects
approx_nonzero_indices = torch.from_numpy(
np.where(window_sum > tiny(window_sum))[0])
window_sum = torch.autograd.Variable(
torch.from_numpy(window_sum), requires_grad=False)
window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum
inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
# scale by hop ratio
inverse_transform *= float(self.filter_length) / self.hop_length
inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2)]
return inverse_transform
def forward(self, input_data):
self.magnitude, self.phase = self.transform(input_data)
reconstruction = self.inverse(self.magnitude, self.phase)
return reconstruction
class TacotronSTFT(torch.nn.Module):
def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
mel_fmax=None, padding_mode='constant'):
super(TacotronSTFT, self).__init__()
self.n_mel_channels = n_mel_channels
self.sampling_rate = sampling_rate
self.stft_fn = STFT(filter_length, hop_length, win_length, padding_mode=padding_mode)
mel_basis = librosa_mel_fn(
sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
mel_basis = torch.from_numpy(mel_basis).float()
self.register_buffer('mel_basis', mel_basis)
def spectral_normalize(self, magnitudes):
output = amp_to_db(magnitudes)
return output
def spectral_de_normalize(self, magnitudes):
output = db_to_amp(magnitudes)
return output
def mel_spectrogram(self, y):
"""Computes mel-spectrograms from a batch of waves
PARAMS
------
y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
RETURNS
-------
mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
"""
assert(torch.min(y.data) >= -1)
assert(torch.max(y.data) <= 1)
magnitudes, phases = self.stft_fn.transform(y)
magnitudes = magnitudes.data
mel_output = torch.matmul(self.mel_basis, magnitudes)
mel_output = self.spectral_normalize(mel_output)
return mel_output
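# Illustrative usage sketch (not part of the original file): extract mel spectrograms
# from a batch of waveforms in [-1, 1]; the STFT settings are common Tacotron2 values.
_taco_stft = TacotronSTFT(filter_length=1024, hop_length=256, win_length=1024,
                          n_mel_channels=80, sampling_rate=22050)
_wav = torch.rand(2, 22050) * 2 - 1           # two 1-second clips in [-1, 1]
_mel = _taco_stft.mel_spectrogram(_wav)       # -> (2, 80, frames)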

231
TTS/tts/utils/synthesis.py Normal file
Просмотреть файл

@ -0,0 +1,231 @@
import pkg_resources
installed = {pkg.key for pkg in pkg_resources.working_set} #pylint: disable=not-an-iterable
if 'tensorflow' in installed or 'tensorflow-gpu' in installed:
import tensorflow as tf
import torch
import numpy as np
from .text import text_to_sequence, phoneme_to_sequence
def text_to_seqvec(text, CONFIG):
text_cleaner = [CONFIG.text_cleaner]
# text or phonemes to sequence vector
if CONFIG.use_phonemes:
seq = np.asarray(
phoneme_to_sequence(text, text_cleaner, CONFIG.phoneme_language,
CONFIG.enable_eos_bos_chars,
tp=CONFIG.characters if 'characters' in CONFIG.keys() else None),
dtype=np.int32)
else:
seq = np.asarray(text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None), dtype=np.int32)
return seq
def numpy_to_torch(np_array, dtype, cuda=False):
if np_array is None:
return None
tensor = torch.as_tensor(np_array, dtype=dtype)
if cuda:
return tensor.cuda()
return tensor
def numpy_to_tf(np_array, dtype):
if np_array is None:
return None
tensor = tf.convert_to_tensor(np_array, dtype=dtype)
return tensor
def compute_style_mel(style_wav, ap):
    style_mel = np.expand_dims(ap.melspectrogram(ap.load_wav(style_wav)), 0)
    return style_mel
def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
if CONFIG.use_gst:
decoder_output, postnet_output, alignments, stop_tokens = model.inference(
inputs, style_mel=style_mel, speaker_ids=speaker_id)
else:
if truncated:
decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated(
inputs, speaker_ids=speaker_id)
else:
decoder_output, postnet_output, alignments, stop_tokens = model.inference(
inputs, speaker_ids=speaker_id)
return decoder_output, postnet_output, alignments, stop_tokens
def run_model_tf(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
if CONFIG.use_gst and style_mel is not None:
raise NotImplementedError(' [!] GST inference not implemented for TF')
if truncated:
raise NotImplementedError(' [!] Truncated inference not implemented for TF')
if speaker_id is not None:
raise NotImplementedError(' [!] Multi-Speaker not implemented for TF')
# TODO: handle multispeaker case
decoder_output, postnet_output, alignments, stop_tokens = model(
inputs, training=False)
return decoder_output, postnet_output, alignments, stop_tokens
def run_model_tflite(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
if CONFIG.use_gst and style_mel is not None:
raise NotImplementedError(' [!] GST inference not implemented for TfLite')
if truncated:
raise NotImplementedError(' [!] Truncated inference not implemented for TfLite')
if speaker_id is not None:
raise NotImplementedError(' [!] Multi-Speaker not implemented for TfLite')
# get input and output details
input_details = model.get_input_details()
output_details = model.get_output_details()
# reshape input tensor for the new input shape
model.resize_tensor_input(input_details[0]['index'], inputs.shape)
model.allocate_tensors()
detail = input_details[0]
# input_shape = detail['shape']
model.set_tensor(detail['index'], inputs)
# run the model
model.invoke()
# collect outputs
decoder_output = model.get_tensor(output_details[0]['index'])
postnet_output = model.get_tensor(output_details[1]['index'])
# tflite model only returns feature frames
return decoder_output, postnet_output, None, None
def parse_outputs_torch(postnet_output, decoder_output, alignments, stop_tokens):
postnet_output = postnet_output[0].data.cpu().numpy()
decoder_output = decoder_output[0].data.cpu().numpy()
alignment = alignments[0].cpu().data.numpy()
stop_tokens = stop_tokens[0].cpu().numpy()
return postnet_output, decoder_output, alignment, stop_tokens
def parse_outputs_tf(postnet_output, decoder_output, alignments, stop_tokens):
postnet_output = postnet_output[0].numpy()
decoder_output = decoder_output[0].numpy()
alignment = alignments[0].numpy()
stop_tokens = stop_tokens[0].numpy()
return postnet_output, decoder_output, alignment, stop_tokens
def parse_outputs_tflite(postnet_output, decoder_output):
postnet_output = postnet_output[0]
decoder_output = decoder_output[0]
return postnet_output, decoder_output
def trim_silence(wav, ap):
return wav[:ap.find_endpoint(wav)]
def inv_spectrogram(postnet_output, ap, CONFIG):
if CONFIG.model.lower() in ["tacotron"]:
wav = ap.inv_spectrogram(postnet_output.T)
else:
wav = ap.inv_melspectrogram(postnet_output.T)
return wav
def id_to_torch(speaker_id):
if speaker_id is not None:
speaker_id = np.asarray(speaker_id)
speaker_id = torch.from_numpy(speaker_id).unsqueeze(0)
return speaker_id
# TODO: perform GL with pytorch for batching
def apply_griffin_lim(inputs, input_lens, CONFIG, ap):
'''Apply Griffin-Lim to each sample, iterating through the first dimension.
Args:
inputs (Tensor or np.Array): Features to be converted by GL. First dimension is the batch size.
input_lens (Tensor or np.Array): 1D array of sample lengths.
CONFIG (Dict): TTS config.
ap (AudioProcessor): TTS audio processor.
'''
wavs = []
for idx, spec in enumerate(inputs):
wav_len = (input_lens[idx] * ap.hop_length) - ap.hop_length # inverse librosa padding
wav = inv_spectrogram(spec, ap, CONFIG)
# assert len(wav) == wav_len, f" [!] wav lenght: {len(wav)} vs expected: {wav_len}"
wavs.append(wav[:wav_len])
return wavs
def synthesis(model,
text,
CONFIG,
use_cuda,
ap,
speaker_id=None,
style_wav=None,
truncated=False,
enable_eos_bos_chars=False, #pylint: disable=unused-argument
use_griffin_lim=False,
do_trim_silence=False,
backend='torch'):
"""Synthesize voice for the given text.
Args:
model (TTS.tts.models): model to synthesize.
text (str): target text
CONFIG (dict): config dictionary to be loaded from config.json.
use_cuda (bool): enable cuda.
ap (TTS.tts.utils.audio.AudioProcessor): audio processor to process
model outputs.
speaker_id (int): id of speaker
style_wav (str): used as the reference audio for the GST style embedding.
truncated (bool): keep model states after inference. It can be used
for continuous inference at long texts.
enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence.
do_trim_silence (bool): trim silence after synthesis.
backend (str): tf or torch
"""
# GST processing
style_mel = None
if CONFIG.model == "TacotronGST" and style_wav is not None:
style_mel = compute_style_mel(style_wav, ap)
# preprocess the given text
inputs = text_to_seqvec(text, CONFIG)
# pass tensors to backend
if backend == 'torch':
speaker_id = id_to_torch(speaker_id)
style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda)
inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda)
inputs = inputs.unsqueeze(0)
elif backend == 'tf':
# TODO: handle speaker id for tf model
style_mel = numpy_to_tf(style_mel, tf.float32)
inputs = numpy_to_tf(inputs, tf.int32)
inputs = tf.expand_dims(inputs, 0)
elif backend == 'tflite':
style_mel = numpy_to_tf(style_mel, tf.float32)
inputs = numpy_to_tf(inputs, tf.int32)
inputs = tf.expand_dims(inputs, 0)
# synthesize voice
if backend == 'torch':
decoder_output, postnet_output, alignments, stop_tokens = run_model_torch(
model, inputs, CONFIG, truncated, speaker_id, style_mel)
postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_torch(
postnet_output, decoder_output, alignments, stop_tokens)
elif backend == 'tf':
decoder_output, postnet_output, alignments, stop_tokens = run_model_tf(
model, inputs, CONFIG, truncated, speaker_id, style_mel)
postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_tf(
postnet_output, decoder_output, alignments, stop_tokens)
elif backend == 'tflite':
decoder_output, postnet_output, alignment, stop_tokens = run_model_tflite(
model, inputs, CONFIG, truncated, speaker_id, style_mel)
postnet_output, decoder_output = parse_outputs_tflite(
postnet_output, decoder_output)
# convert outputs to numpy
# plot results
wav = None
if use_griffin_lim:
wav = inv_spectrogram(postnet_output, ap, CONFIG)
# trim silence
if do_trim_silence:
wav = trim_silence(wav, ap)
return wav, alignment, decoder_output, postnet_output, stop_tokens, inputs
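# Illustrative usage sketch (not part of the original file): synthesize one sentence
# with a trained Torch model; `model`, `CONFIG` and `ap` are assumed to be loaded
# elsewhere, and the sentence and output path are assumptions.
def example_tts(model, CONFIG, ap, sentence="Hello world.", out_path="hello.wav"):
    wav, alignment, decoder_out, postnet_out, stop_tokens, _ = synthesis(
        model, sentence, CONFIG, use_cuda=False, ap=ap,
        use_griffin_lim=True, do_trim_silence=True, backend='torch')
    ap.save_wav(wav, out_path)
    return wav, alignment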

32
TTS/tts/utils/tests.py Normal file
Просмотреть файл

@ -0,0 +1,32 @@
import os
import subprocess
import tempfile
import nbformat
def _notebook_run(path):
"""Execute a notebook via nbconvert and collect output.
:returns (parsed nb object, execution errors)
"""
dirname, filename = os.path.split(path)
os.chdir(dirname)
with tempfile.NamedTemporaryFile(suffix=".ipynb") as fout:
args = ["jupyter", "nbconvert", "--to", "notebook", "--execute",
"--ExecutePreprocessor.timeout=60",
"--output", fout.name, filename]
subprocess.check_call(args)
fout.seek(0)
nb = nbformat.read(fout, nbformat.current_nbformat)
errors = [output for cell in nb.cells if "outputs" in cell
for output in cell["outputs"]\
if output.output_type == "error"]
return nb, errors
def test_ipynb(path):
nb, errors = _notebook_run(path)
assert errors == []

Просмотреть файл

@ -0,0 +1,187 @@
# -*- coding: utf-8 -*-
import re
from packaging import version
import phonemizer
from phonemizer.phonemize import phonemize
from TTS.tts.utils.text import cleaners
from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_punctuations, _bos, \
_eos
# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
_phonemes_to_id = {s: i for i, s in enumerate(phonemes)}
_id_to_phonemes = {i: s for i, s in enumerate(phonemes)}
# Regular expression matching text enclosed in curly braces:
_CURLY_RE = re.compile(r'(.*?)\{(.+?)\}(.*)')
# Regular expression matching punctuations, ignoring empty space
PHONEME_PUNCTUATION_PATTERN = r'['+_phoneme_punctuations+']+'
def text2phone(text, language):
'''
Convert graphemes to phonemes.
'''
seperator = phonemizer.separator.Separator(' |', '', '|')
#try:
punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text)
if version.parse(phonemizer.__version__) < version.parse('2.1'):
ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language)
ph = ph[:-1].strip() # skip the last empty character
# phonemizer does not tackle punctuations. Here we do.
# Replace \n with matching punctuations.
if punctuations:
# if text ends with a punctuation.
if text[-1] == punctuations[-1]:
for punct in punctuations[:-1]:
ph = ph.replace('| |\n', '|'+punct+'| |', 1)
ph = ph + punctuations[-1]
else:
for punct in punctuations:
ph = ph.replace('| |\n', '|'+punct+'| |', 1)
elif version.parse(phonemizer.__version__) >= version.parse('2.1'):
ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language, preserve_punctuation=True)
# this is a simple fix for phonemizer.
# https://github.com/bootphon/phonemizer/issues/32
if punctuations:
for punctuation in punctuations:
ph = ph.replace(f"| |{punctuation} ", f"|{punctuation}| |").replace(f"| |{punctuation}", f"|{punctuation}| |")
ph = ph[:-3]
else:
raise RuntimeError(" [!] Use 'phonemizer' version 2.1 or older.")
return ph
def pad_with_eos_bos(phoneme_sequence, tp=None):
# pylint: disable=global-statement
global _phonemes_to_id, _bos, _eos
if tp:
_bos = tp['bos']
_eos = tp['eos']
_, _phonemes = make_symbols(**tp)
_phonemes_to_id = {s: i for i, s in enumerate(_phonemes)}
return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]]
def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None):
# pylint: disable=global-statement
global _phonemes_to_id
if tp:
_, _phonemes = make_symbols(**tp)
_phonemes_to_id = {s: i for i, s in enumerate(_phonemes)}
sequence = []
clean_text = _clean_text(text, cleaner_names)
to_phonemes = text2phone(clean_text, language)
if to_phonemes is None:
print("!! After phoneme conversion the result is None. -- {} ".format(clean_text))
# iterate by skipping empty strings - NOTE: might be useful to keep it to have a better intonation.
for phoneme in filter(None, to_phonemes.split('|')):
sequence += _phoneme_to_sequence(phoneme)
# Append EOS char
if enable_eos_bos:
sequence = pad_with_eos_bos(sequence, tp=tp)
return sequence
def sequence_to_phoneme(sequence, tp=None):
# pylint: disable=global-statement
'''Converts a sequence of IDs back to a string'''
global _id_to_phonemes
result = ''
if tp:
_, _phonemes = make_symbols(**tp)
_id_to_phonemes = {i: s for i, s in enumerate(_phonemes)}
for symbol_id in sequence:
if symbol_id in _id_to_phonemes:
s = _id_to_phonemes[symbol_id]
result += s
return result.replace('}{', ' ')
def text_to_sequence(text, cleaner_names, tp=None):
'''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
The text can optionally have ARPAbet sequences enclosed in curly braces embedded
in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
Args:
text: string to convert to a sequence
cleaner_names: names of the cleaner functions to run the text through
Returns:
List of integers corresponding to the symbols in the text
'''
# pylint: disable=global-statement
global _symbol_to_id
if tp:
_symbols, _ = make_symbols(**tp)
_symbol_to_id = {s: i for i, s in enumerate(_symbols)}
sequence = []
# Check for curly braces and treat their contents as ARPAbet:
while text:
m = _CURLY_RE.match(text)
if not m:
sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
break
sequence += _symbols_to_sequence(
_clean_text(m.group(1), cleaner_names))
sequence += _arpabet_to_sequence(m.group(2))
text = m.group(3)
return sequence
def sequence_to_text(sequence, tp=None):
'''Converts a sequence of IDs back to a string'''
# pylint: disable=global-statement
global _id_to_symbol
if tp:
_symbols, _ = make_symbols(**tp)
_id_to_symbol = {i: s for i, s in enumerate(_symbols)}
result = ''
for symbol_id in sequence:
if symbol_id in _id_to_symbol:
s = _id_to_symbol[symbol_id]
# Enclose ARPAbet back in curly braces:
if len(s) > 1 and s[0] == '@':
s = '{%s}' % s[1:]
result += s
return result.replace('}{', ' ')
def _clean_text(text, cleaner_names):
for name in cleaner_names:
cleaner = getattr(cleaners, name)
if not cleaner:
raise Exception('Unknown cleaner: %s' % name)
text = cleaner(text)
return text
def _symbols_to_sequence(syms):
return [_symbol_to_id[s] for s in syms if _should_keep_symbol(s)]
def _phoneme_to_sequence(phons):
return [_phonemes_to_id[s] for s in list(phons) if _should_keep_phoneme(s)]
def _arpabet_to_sequence(text):
return _symbols_to_sequence(['@' + s for s in text.split()])
def _should_keep_symbol(s):
return s in _symbol_to_id and s not in ['~', '^', '_']
def _should_keep_phoneme(p):
return p in _phonemes_to_id and p not in ['~', '^', '_']
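# Illustrative usage sketch (not part of the original file): character ids round-trip
# through the mappings above; the phoneme path additionally needs espeak installed.
_seq = text_to_sequence("Hello, world!", ["english_cleaners"])
print(sequence_to_text(_seq))    # expected: "hello, world!" after cleaning
# _phone_seq = phoneme_to_sequence("Hello, world!", ["phoneme_cleaners"], "en-us")
# print(sequence_to_phoneme(_phone_seq))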

Просмотреть файл

@ -0,0 +1,123 @@
'''
Cleaners are transformations that run over the input text at both training and eval time.
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
1. "english_cleaners" for English text
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
the symbols in symbols.py to match your data).
'''
import re
from unidecode import unidecode
from .number_norm import normalize_numbers
# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')
# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
for x in [
('mrs', 'misess'),
('mr', 'mister'),
('dr', 'doctor'),
('st', 'saint'),
('co', 'company'),
('jr', 'junior'),
('maj', 'major'),
('gen', 'general'),
('drs', 'doctors'),
('rev', 'reverend'),
('lt', 'lieutenant'),
('hon', 'honorable'),
('sgt', 'sergeant'),
('capt', 'captain'),
('esq', 'esquire'),
('ltd', 'limited'),
('col', 'colonel'),
('ft', 'fort'),
]]
def expand_abbreviations(text):
for regex, replacement in _abbreviations:
text = re.sub(regex, replacement, text)
return text
def expand_numbers(text):
return normalize_numbers(text)
def lowercase(text):
return text.lower()
def collapse_whitespace(text):
return re.sub(_whitespace_re, ' ', text).strip()
def convert_to_ascii(text):
return unidecode(text)
def remove_aux_symbols(text):
text = re.sub(r'[\<\>\(\)\[\]\"]+', '', text)
return text
def replace_symbols(text):
text = text.replace(';', ',')
text = text.replace('-', ' ')
text = text.replace(':', ',')
text = text.replace('&', 'and')
return text
def basic_cleaners(text):
'''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
text = lowercase(text)
text = collapse_whitespace(text)
return text
def transliteration_cleaners(text):
'''Pipeline for non-English text that transliterates to ASCII.'''
text = convert_to_ascii(text)
text = lowercase(text)
text = collapse_whitespace(text)
return text
# TODO: elaborate it
def basic_turkish_cleaners(text):
'''Pipeline for Turkish text'''
text = text.replace("I", "ı")
text = lowercase(text)
text = collapse_whitespace(text)
return text
def english_cleaners(text):
'''Pipeline for English text, including number and abbreviation expansion.'''
text = convert_to_ascii(text)
text = lowercase(text)
text = expand_numbers(text)
text = expand_abbreviations(text)
text = replace_symbols(text)
text = remove_aux_symbols(text)
text = collapse_whitespace(text)
return text
def phoneme_cleaners(text):
'''Pipeline for phonemes mode, including number and abbreviation expansion.'''
text = convert_to_ascii(text)
text = expand_numbers(text)
text = expand_abbreviations(text)
text = replace_symbols(text)
text = remove_aux_symbols(text)
text = collapse_whitespace(text)
return text
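
if __name__ == '__main__':
    # Illustrative sketch (not part of the original module): the English pipeline
    # lowercases, expands money/numbers/abbreviations and strips auxiliary symbols.
    print(english_cleaners('Dr. Smith paid $15 for the "test".'))
    # expected: 'doctor smith paid fifteen dollars for the test.'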


@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-
import re
VALID_SYMBOLS = [
'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
'Y', 'Z', 'ZH'
]
class CMUDict:
'''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''
def __init__(self, file_or_path, keep_ambiguous=True):
if isinstance(file_or_path, str):
with open(file_or_path, encoding='latin-1') as f:
entries = _parse_cmudict(f)
else:
entries = _parse_cmudict(file_or_path)
if not keep_ambiguous:
entries = {
word: pron
for word, pron in entries.items() if len(pron) == 1
}
self._entries = entries
def __len__(self):
return len(self._entries)
def lookup(self, word):
'''Returns list of ARPAbet pronunciations of the given word.'''
return self._entries.get(word.upper())
@staticmethod
def get_arpabet(word, cmudict, punctuation_symbols):
first_symbol, last_symbol = '', ''
if word and word[0] in punctuation_symbols:
first_symbol = word[0]
word = word[1:]
if word and word[-1] in punctuation_symbols:
last_symbol = word[-1]
word = word[:-1]
arpabet = cmudict.lookup(word)
if arpabet is not None:
return first_symbol + '{%s}' % arpabet[0] + last_symbol
return first_symbol + word + last_symbol
_alt_re = re.compile(r'\([0-9]+\)')
def _parse_cmudict(file):
cmudict = {}
for line in file:
if line and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
parts = line.split('  ')  # CMUdict separates the word and its pronunciation with two spaces
word = re.sub(_alt_re, '', parts[0])
pronunciation = _get_pronunciation(parts[1])
if pronunciation:
if word in cmudict:
cmudict[word].append(pronunciation)
else:
cmudict[word] = [pronunciation]
return cmudict
def _get_pronunciation(s):
parts = s.strip().split(' ')
for part in parts:
if part not in VALID_SYMBOLS:
return None
return ' '.join(parts)
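
if __name__ == '__main__':
    # Illustrative sketch (not part of the original module): parse a two-entry dictionary
    # from memory; real usage passes the path of a cmudict file (e.g. cmudict-0.7b) instead.
    import io
    _cmu = CMUDict(io.StringIO('HELLO  HH AH0 L OW1\nWORLD  W ER1 L D\n'))
    print(len(_cmu), _cmu.lookup('hello'))
    print(CMUDict.get_arpabet('hello,', _cmu, ',.!?'))  # -> '{HH AH0 L OW1},'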


@ -0,0 +1,71 @@
""" from https://github.com/keithito/tacotron """
import inflect
import re
_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
_number_re = re.compile(r'[0-9]+')
def _remove_commas(m):
return m.group(1).replace(',', '')
def _expand_decimal_point(m):
return m.group(1).replace('.', ' point ')
def _expand_dollars(m):
match = m.group(1)
parts = match.split('.')
if len(parts) > 2:
return match + ' dollars' # Unexpected format
dollars = int(parts[0]) if parts[0] else 0
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
if dollars and cents:
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
cent_unit = 'cent' if cents == 1 else 'cents'
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
elif dollars:
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
return '%s %s' % (dollars, dollar_unit)
elif cents:
cent_unit = 'cent' if cents == 1 else 'cents'
return '%s %s' % (cents, cent_unit)
else:
return 'zero dollars'
def _expand_ordinal(m):
return _inflect.number_to_words(m.group(0))
def _expand_number(m):
num = int(m.group(0))
if 1000 < num < 3000:
if num == 2000:
return 'two thousand'
if 2000 < num < 2010:
return 'two thousand ' + _inflect.number_to_words(num % 100)
if num % 100 == 0:
return _inflect.number_to_words(num // 100) + ' hundred'
return _inflect.number_to_words(num,
andword='',
zero='oh',
group=2).replace(', ', ' ')
return _inflect.number_to_words(num, andword='')
def normalize_numbers(text):
text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_pounds_re, r'\1 pounds', text)
text = re.sub(_dollars_re, _expand_dollars, text)
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
text = re.sub(_ordinal_re, _expand_ordinal, text)
text = re.sub(_number_re, _expand_number, text)
return text
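
if __name__ == '__main__':
    # Illustrative sketch (not part of the original module): numbers, currency and
    # ordinals are spelled out before the text reaches the symbol tables.
    print(normalize_numbers('I paid $3.50 on the 2nd of May 1999.'))
    # expected output along the lines of:
    # 'I paid three dollars, fifty cents on the second of May nineteen ninety-nine.'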


@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
'''
Defines the set of symbols used in text input to the model.
The default is a set of ASCII characters that works well for English or text that has been run
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
'''
def make_symbols(characters, phonemes, punctuations='!\'(),-.:;? ', pad='_', eos='~', bos='^'):# pylint: disable=redefined-outer-name
''' Function to create symbols and phonemes '''
_phonemes_sorted = sorted(list(phonemes))
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
_arpabet = ['@' + s for s in _phonemes_sorted]
# Export all symbols:
_symbols = [pad, eos, bos] + list(characters) + _arpabet
_phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations)
return _symbols, _phonemes
_pad = '_'
_eos = '~'
_bos = '^'
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
_punctuations = '!\'(),-.:;? '
_phoneme_punctuations = '.!;:,?'
# Phonemes definition
_vowels = 'iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ'
_non_pulmonic_consonants = 'ʘɓǀɗǃʄǂɠǁʛ'
_pulmonic_consonants = 'pbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ'
_suprasegmentals = 'ˈˌːˑ'
_other_symbols = 'ʍwɥʜʢʡɕʑɺɧ'
_diacrilics = 'ɚ˞ɫ'
_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
symbols, phonemes = make_symbols(_characters, _phonemes, _punctuations, _pad, _eos, _bos)
# Generate ALIEN language
# from random import shuffle
# shuffle(phonemes)
if __name__ == '__main__':
print(" > TTS symbols {}".format(len(symbols)))
print(symbols)
print(" > TTS phonemes {}".format(len(phonemes)))
print(phonemes)
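    # Illustrative sketch (not part of the original module): a config's "characters"
    # section is unpacked into make_symbols(**tp) to build a custom symbol set.
    _custom = {
        'pad': '_', 'eos': '~', 'bos': '^',
        'characters': 'abcdefghijklmnopqrstuvwxyz ',
        'punctuations': "!'(),-.:;? ",
        'phonemes': _phonemes,
    }
    _custom_symbols, _custom_phonemes = make_symbols(**_custom)
    print(" > custom symbols {}".format(len(_custom_symbols)))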

93
TTS/tts/utils/visual.py Normal file

@ -0,0 +1,93 @@
import torch
import librosa
import librosa.display  # specshow lives in a submodule that 'import librosa' alone does not load
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme
def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None):
if isinstance(alignment, torch.Tensor):
alignment_ = alignment.detach().cpu().numpy().squeeze()
else:
alignment_ = alignment
fig, ax = plt.subplots(figsize=fig_size)
im = ax.imshow(
alignment_.T, aspect='auto', origin='lower', interpolation='none')
fig.colorbar(im, ax=ax)
xlabel = 'Decoder timestep'
if info is not None:
xlabel += '\n\n' + info
plt.xlabel(xlabel)
plt.ylabel('Encoder timestep')
# plt.yticks(range(len(text)), list(text))
plt.tight_layout()
if title is not None:
plt.title(title)
return fig
def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10)):
if isinstance(spectrogram, torch.Tensor):
spectrogram_ = spectrogram.detach().cpu().numpy().squeeze().T
else:
spectrogram_ = spectrogram.T
if ap is not None:
spectrogram_ = ap._denormalize(spectrogram_) # pylint: disable=protected-access
fig = plt.figure(figsize=fig_size)
plt.imshow(spectrogram_, aspect="auto", origin="lower")
plt.colorbar()
plt.tight_layout()
return fig
def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, decoder_output=None, output_path=None, figsize=(8, 24)):
if decoder_output is not None:
num_plot = 4
else:
num_plot = 3
label_fontsize = 16
fig = plt.figure(figsize=figsize)
plt.subplot(num_plot, 1, 1)
plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
# compute phoneme representation and back
if CONFIG.use_phonemes:
seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None)
text = sequence_to_phoneme(seq, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None)
print(text)
plt.yticks(range(len(text)), list(text))
plt.colorbar()
# plot stopnet predictions
plt.subplot(num_plot, 1, 2)
plt.plot(range(len(stop_tokens)), list(stop_tokens))
# plot postnet spectrogram
plt.subplot(num_plot, 1, 3)
librosa.display.specshow(postnet_output.T, sr=CONFIG.audio['sample_rate'],
hop_length=hop_length, x_axis="time", y_axis="linear",
fmin=CONFIG.audio['mel_fmin'],
fmax=CONFIG.audio['mel_fmax'])
plt.xlabel("Time", fontsize=label_fontsize)
plt.ylabel("Hz", fontsize=label_fontsize)
plt.tight_layout()
plt.colorbar()
if decoder_output is not None:
plt.subplot(num_plot, 1, 4)
librosa.display.specshow(decoder_output.T, sr=CONFIG.audio['sample_rate'],
hop_length=hop_length, x_axis="time", y_axis="linear",
fmin=CONFIG.audio['mel_fmin'],
fmax=CONFIG.audio['mel_fmax'])
plt.xlabel("Time", fontsize=label_fontsize)
plt.ylabel("Hz", fontsize=label_fontsize)
plt.tight_layout()
plt.colorbar()
if output_path:
print(output_path)
fig.savefig(output_path)
plt.close()
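
if __name__ == '__main__':
    # Illustrative sketch (not part of the original module): the plotting helpers accept
    # plain numpy arrays as well as torch tensors; values below are random placeholders.
    import numpy as np
    fig_align = plot_alignment(np.random.rand(120, 60))    # (decoder steps, encoder steps)
    fig_spec = plot_spectrogram(np.random.rand(400, 80))   # (frames, num_mels)
    fig_align.savefig('/tmp/alignment.png')
    fig_spec.savefig('/tmp/spectrogram.png')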

0
TTS/utils/__init__.py Normal file

356
TTS/utils/audio.py Normal file

@ -0,0 +1,356 @@
import librosa
import soundfile as sf
import numpy as np
import scipy.io
import scipy.signal
from TTS.tts.utils.data import StandardScaler
class AudioProcessor(object):
def __init__(self,
sample_rate=None,
num_mels=None,
min_level_db=None,
frame_shift_ms=None,
frame_length_ms=None,
hop_length=None,
win_length=None,
ref_level_db=None,
fft_size=1024,
power=None,
preemphasis=0.0,
signal_norm=None,
symmetric_norm=None,
max_norm=None,
mel_fmin=None,
mel_fmax=None,
spec_gain=20,
stft_pad_mode='reflect',
clip_norm=True,
griffin_lim_iters=None,
do_trim_silence=False,
trim_db=60,
do_sound_norm=False,
stats_path=None,
**_):
print(" > Setting up Audio Processor...")
# setup class attributes
self.sample_rate = sample_rate
self.num_mels = num_mels
self.min_level_db = min_level_db or 0
self.frame_shift_ms = frame_shift_ms
self.frame_length_ms = frame_length_ms
self.ref_level_db = ref_level_db
self.fft_size = fft_size
self.power = power
self.preemphasis = preemphasis
self.griffin_lim_iters = griffin_lim_iters
self.signal_norm = signal_norm
self.symmetric_norm = symmetric_norm
self.mel_fmin = mel_fmin or 0
self.mel_fmax = mel_fmax
self.spec_gain = float(spec_gain)
self.stft_pad_mode = stft_pad_mode
self.max_norm = 1.0 if max_norm is None else float(max_norm)
self.clip_norm = clip_norm
self.do_trim_silence = do_trim_silence
self.trim_db = trim_db
self.do_sound_norm = do_sound_norm
self.stats_path = stats_path
# setup stft parameters
if hop_length is None:
# compute stft parameters from given time values
self.hop_length, self.win_length = self._stft_parameters()
else:
# use stft parameters from config file
self.hop_length = hop_length
self.win_length = win_length
assert min_level_db != 0.0, " [!] min_level_db is 0"
assert self.win_length <= self.fft_size, " [!] win_length cannot be larger than fft_size"
members = vars(self)
for key, value in members.items():
print(" | > {}:{}".format(key, value))
# create spectrogram utils
self.mel_basis = self._build_mel_basis()
self.inv_mel_basis = np.linalg.pinv(self._build_mel_basis())
# setup scaler
if stats_path:
mel_mean, mel_std, linear_mean, linear_std, _ = self.load_stats(stats_path)
self.setup_scaler(mel_mean, mel_std, linear_mean, linear_std)
self.signal_norm = True
self.max_norm = None
self.clip_norm = None
self.symmetric_norm = None
### setting up the parameters ###
def _build_mel_basis(self, ):
if self.mel_fmax is not None:
assert self.mel_fmax <= self.sample_rate // 2
return librosa.filters.mel(
self.sample_rate,
self.fft_size,
n_mels=self.num_mels,
fmin=self.mel_fmin,
fmax=self.mel_fmax)
def _stft_parameters(self, ):
"""Compute necessary stft parameters with given time values"""
factor = self.frame_length_ms / self.frame_shift_ms
assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
win_length = int(hop_length * factor)
return hop_length, win_length
### normalization ###
def _normalize(self, S):
"""Put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]"""
#pylint: disable=no-else-return
S = S.copy()
if self.signal_norm:
# mean-var scaling
if hasattr(self, 'mel_scaler'):
if S.shape[0] == self.num_mels:
return self.mel_scaler.transform(S.T).T
elif S.shape[0] == self.fft_size // 2 + 1:  # linear spectrograms have fft_size // 2 + 1 bins
return self.linear_scaler.transform(S.T).T
else:
raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.')
# range normalization
S -= self.ref_level_db # discard certain range of DB assuming it is air noise
S_norm = ((S - self.min_level_db) / (-self.min_level_db))
if self.symmetric_norm:
S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
if self.clip_norm:
S_norm = np.clip(S_norm, -self.max_norm, self.max_norm)
return S_norm
else:
S_norm = self.max_norm * S_norm
if self.clip_norm:
S_norm = np.clip(S_norm, 0, self.max_norm)
return S_norm
else:
return S
def _denormalize(self, S):
"""denormalize values"""
#pylint: disable=no-else-return
S_denorm = S.copy()
if self.signal_norm:
# mean-var scaling
if hasattr(self, 'mel_scaler'):
if S_denorm.shape[0] == self.num_mels:
return self.mel_scaler.inverse_transform(S_denorm.T).T
elif S_denorm.shape[0] == self.fft_size // 2 + 1:  # linear spectrograms have fft_size // 2 + 1 bins
return self.linear_scaler.inverse_transform(S_denorm.T).T
else:
raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.')
if self.symmetric_norm:
if self.clip_norm:
S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db
return S_denorm + self.ref_level_db
else:
if self.clip_norm:
S_denorm = np.clip(S_denorm, 0, self.max_norm)
S_denorm = (S_denorm * -self.min_level_db /
self.max_norm) + self.min_level_db
return S_denorm + self.ref_level_db
else:
return S_denorm
### Mean-STD scaling ###
def load_stats(self, stats_path):
stats = np.load(stats_path, allow_pickle=True).item() #pylint: disable=unexpected-keyword-arg
mel_mean = stats['mel_mean']
mel_std = stats['mel_std']
linear_mean = stats['linear_mean']
linear_std = stats['linear_std']
stats_config = stats['audio_config']
# check all audio parameters used for computing stats
skip_parameters = ['griffin_lim_iters', 'stats_path', 'do_trim_silence', 'ref_level_db', 'power']
for key in stats_config.keys():
if key in skip_parameters:
continue
assert stats_config[key] == self.__dict__[key],\
f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}"
return mel_mean, mel_std, linear_mean, linear_std, stats_config
# pylint: disable=attribute-defined-outside-init
def setup_scaler(self, mel_mean, mel_std, linear_mean, linear_std):
self.mel_scaler = StandardScaler()
self.mel_scaler.set_stats(mel_mean, mel_std)
self.linear_scaler = StandardScaler()
self.linear_scaler.set_stats(linear_mean, linear_std)
### DB and AMP conversion ###
# pylint: disable=no-self-use
def _amp_to_db(self, x):
return self.spec_gain * np.log10(np.maximum(1e-5, x))
# pylint: disable=no-self-use
def _db_to_amp(self, x):
return np.power(10.0, x / self.spec_gain)
### Preemphasis ###
def apply_preemphasis(self, x):
if self.preemphasis == 0:
raise RuntimeError(" [!] Preemphasis is set 0.0.")
return scipy.signal.lfilter([1, -self.preemphasis], [1], x)
def apply_inv_preemphasis(self, x):
if self.preemphasis == 0:
raise RuntimeError(" [!] Preemphasis is set 0.0.")
return scipy.signal.lfilter([1], [1, -self.preemphasis], x)
### SPECTROGRAMs ###
def _linear_to_mel(self, spectrogram):
return np.dot(self.mel_basis, spectrogram)
def _mel_to_linear(self, mel_spec):
return np.maximum(1e-10, np.dot(self.inv_mel_basis, mel_spec))
def spectrogram(self, y):
if self.preemphasis != 0:
D = self._stft(self.apply_preemphasis(y))
else:
D = self._stft(y)
S = self._amp_to_db(np.abs(D))
return self._normalize(S)
def melspectrogram(self, y):
if self.preemphasis != 0:
D = self._stft(self.apply_preemphasis(y))
else:
D = self._stft(y)
S = self._amp_to_db(self._linear_to_mel(np.abs(D)))
return self._normalize(S)
def inv_spectrogram(self, spectrogram):
"""Converts spectrogram to waveform using librosa"""
S = self._denormalize(spectrogram)
S = self._db_to_amp(S)
# Reconstruct phase
if self.preemphasis != 0:
return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
return self._griffin_lim(S**self.power)
def inv_melspectrogram(self, mel_spectrogram):
'''Converts melspectrogram to waveform using librosa'''
D = self._denormalize(mel_spectrogram)
S = self._db_to_amp(D)
S = self._mel_to_linear(S) # Convert back to linear
if self.preemphasis != 0:
return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
return self._griffin_lim(S**self.power)
def out_linear_to_mel(self, linear_spec):
S = self._denormalize(linear_spec)
S = self._db_to_amp(S)
S = self._linear_to_mel(np.abs(S))
S = self._amp_to_db(S)
mel = self._normalize(S)
return mel
### STFT and ISTFT ###
def _stft(self, y):
return librosa.stft(
y=y,
n_fft=self.fft_size,
hop_length=self.hop_length,
win_length=self.win_length,
pad_mode=self.stft_pad_mode,
)
def _istft(self, y):
return librosa.istft(
y, hop_length=self.hop_length, win_length=self.win_length)
def _griffin_lim(self, S):
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(np.complex)
y = self._istft(S_complex * angles)
for _ in range(self.griffin_lim_iters):
angles = np.exp(1j * np.angle(self._stft(y)))
y = self._istft(S_complex * angles)
return y
def compute_stft_paddings(self, x, pad_sides=1):
'''compute right padding (final frame) or both sides padding (first and final frames)
'''
assert pad_sides in (1, 2)
pad = (x.shape[0] // self.hop_length + 1) * self.hop_length - x.shape[0]
if pad_sides == 1:
return 0, pad
return pad // 2, pad // 2 + pad % 2
### Audio Processing ###
def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
window_length = int(self.sample_rate * min_silence_sec)
hop_length = int(window_length / 4)
threshold = self._db_to_amp(threshold_db)
for x in range(hop_length, len(wav) - window_length, hop_length):
if np.max(wav[x:x + window_length]) < threshold:
return x + hop_length
return len(wav)
def trim_silence(self, wav):
""" Trim silent parts with a threshold and 0.01 sec margin """
margin = int(self.sample_rate * 0.01)
wav = wav[margin:-margin]
return librosa.effects.trim(
wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[0]
@staticmethod
def sound_norm(x):
return x / abs(x).max() * 0.9
### save and load ###
def load_wav(self, filename, sr=None):
if sr is None:
x, sr = sf.read(filename)
else:
x, sr = librosa.load(filename, sr=sr)
if self.do_trim_silence:
try:
x = self.trim_silence(x)
except ValueError:
print(f' [!] File cannot be trimmed for silence - {filename}')
assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr)
if self.do_sound_norm:
x = self.sound_norm(x)
return x
def save_wav(self, wav, path):
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16))
@staticmethod
def mulaw_encode(wav, qc):
mu = 2 ** qc - 1
# wav_abs = np.minimum(np.abs(wav), 1.0)
signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu)
# Quantize signal to the specified number of levels.
signal = (signal + 1) / 2 * mu + 0.5
return np.floor(signal,)
@staticmethod
def mulaw_decode(wav, qc):
"""Recovers waveform from quantized values."""
mu = 2 ** qc - 1
x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1)
return x
@staticmethod
def encode_16bits(x):
return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16)
@staticmethod
def quantize(x, bits):
return (x + 1.) * (2**bits - 1) / 2
@staticmethod
def dequantize(x, bits):
return 2 * x / (2**bits - 1) - 1
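
if __name__ == '__main__':
    # Illustrative sketch (not part of the original module): analysis / Griffin-Lim
    # resynthesis round trip. Parameter values and the wav path are placeholders.
    ap = AudioProcessor(sample_rate=22050, num_mels=80, min_level_db=-100,
                        ref_level_db=20, fft_size=1024, hop_length=256, win_length=1024,
                        power=1.5, signal_norm=True, symmetric_norm=True, max_norm=4.0,
                        clip_norm=True, mel_fmin=0.0, mel_fmax=8000.0,
                        griffin_lim_iters=30)
    wav = ap.load_wav('/path/to/sample.wav')
    mel = ap.melspectrogram(wav)           # (num_mels, T) normalized mel spectrogram
    wav_hat = ap.inv_melspectrogram(mel)   # Griffin-Lim reconstruction
    ap.save_wav(wav_hat, '/tmp/resynthesis.wav')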

154
TTS/utils/generic_utils.py Normal file

@ -0,0 +1,154 @@
import os
import glob
import torch
import shutil
import datetime
import subprocess
import importlib
import numpy as np
from collections import Counter
def get_git_branch():
try:
out = subprocess.check_output(["git", "branch"]).decode("utf8")
current = next(line for line in out.split("\n")
if line.startswith("*"))
current = current.replace("* ", "")
except subprocess.CalledProcessError:
current = "inside_docker"
return current
def get_commit_hash():
"""https://stackoverflow.com/questions/14989858/get-the-current-git-hash-in-a-python-script"""
# try:
# subprocess.check_output(['git', 'diff-index', '--quiet',
# 'HEAD']) # Verify client is clean
# except:
# raise RuntimeError(
# " !! Commit before training to get the commit hash.")
try:
commit = subprocess.check_output(
['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
# Not copying .git folder into docker container
except subprocess.CalledProcessError:
commit = "0000000"
print(' > Git Hash: {}'.format(commit))
return commit
def create_experiment_folder(root_path, model_name, debug):
""" Create a folder with the current date and time """
date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p")
if debug:
commit_hash = 'debug'
else:
commit_hash = get_commit_hash()
output_folder = os.path.join(
root_path, model_name + '-' + date_str + '-' + commit_hash)
os.makedirs(output_folder, exist_ok=True)
print(" > Experiment folder: {}".format(output_folder))
return output_folder
def remove_experiment_folder(experiment_path):
"""Check folder if there is a checkpoint, otherwise remove the folder"""
checkpoint_files = glob.glob(experiment_path + "/*.pth.tar")
if not checkpoint_files:
if os.path.exists(experiment_path):
shutil.rmtree(experiment_path, ignore_errors=True)
print(" ! Run is removed from {}".format(experiment_path))
else:
print(" ! Run is kept in {}".format(experiment_path))
def count_parameters(model):
r"""Count number of trainable parameters in a network"""
return sum(p.numel() for p in model.parameters() if p.requires_grad)
def set_init_dict(model_dict, checkpoint_state, c):
# Partial initialization: if there is a mismatch with new and old layer, it is skipped.
for k, v in checkpoint_state.items():
if k not in model_dict:
print(" | > Layer missing in the model definition: {}".format(k))
# 1. filter out unnecessary keys
pretrained_dict = {
k: v
for k, v in checkpoint_state.items() if k in model_dict
}
# 2. filter out different size layers
pretrained_dict = {
k: v
for k, v in pretrained_dict.items()
if v.numel() == model_dict[k].numel()
}
# 3. skip reinit layers
if c.reinit_layers is not None:
for reinit_layer_name in c.reinit_layers:
pretrained_dict = {
k: v
for k, v in pretrained_dict.items()
if reinit_layer_name not in k
}
# 4. overwrite entries in the existing state dict
model_dict.update(pretrained_dict)
print(" | > {} / {} layers are restored.".format(len(pretrained_dict),
len(model_dict)))
return model_dict
class KeepAverage():
def __init__(self):
self.avg_values = {}
self.iters = {}
def __getitem__(self, key):
return self.avg_values[key]
def items(self):
return self.avg_values.items()
def add_value(self, name, init_val=0, init_iter=0):
self.avg_values[name] = init_val
self.iters[name] = init_iter
def update_value(self, name, value, weighted_avg=False):
if name not in self.avg_values:
# add the value if it does not exist yet
self.add_value(name, init_val=value)
else:
# else update existing value
if weighted_avg:
self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value
self.iters[name] += 1
else:
self.avg_values[name] = self.avg_values[name] * \
self.iters[name] + value
self.iters[name] += 1
self.avg_values[name] /= self.iters[name]
def add_values(self, name_dict):
for key, value in name_dict.items():
self.add_value(key, init_val=value)
def update_values(self, value_dict):
for key, value in value_dict.items():
self.update_value(key, value)
def check_argument(name, c, enum_list=None, max_val=None, min_val=None, restricted=False, val_type=None, alternative=None):
if alternative in c.keys() and c[alternative] is not None:
return
if restricted:
assert name in c.keys(), f' [!] {name} not defined in config.json'
if name in c.keys():
if max_val:
assert c[name] <= max_val, f' [!] {name} is larger than max value {max_val}'
if min_val:
assert c[name] >= min_val, f' [!] {name} is smaller than min value {min_val}'
if enum_list:
assert c[name].lower() in enum_list, f' [!] {name} is not a valid value'
if val_type:
assert isinstance(c[name], val_type) or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}'
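
if __name__ == '__main__':
    # Illustrative sketch (not part of the original module): config validation and
    # running-average bookkeeping on a toy dictionary.
    _c = {'batch_size': 32, 'run_name': 'demo'}
    check_argument('batch_size', _c, restricted=True, min_val=1, max_val=1024, val_type=int)
    check_argument('run_name', _c, restricted=True, val_type=str)
    _avg = KeepAverage()
    for _loss in [0.9, 0.7, 0.5]:
        _avg.update_value('avg_loss', _loss)
    print(_avg['avg_loss'])  # running average maintained across updates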

32
TTS/utils/io.py Normal file

@ -0,0 +1,32 @@
import re
import json
class AttrDict(dict):
def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
self.__dict__ = self
def load_config(config_path):
config = AttrDict()
with open(config_path, "r") as f:
input_str = f.read()
input_str = re.sub(r'\\\n', '', input_str)
input_str = re.sub(r'//.*\n', '\n', input_str)
data = json.loads(input_str)
config.update(data)
return config
def copy_config_file(config_file, out_path, new_fields):
config_lines = open(config_file, "r").readlines()
# add extra information fields
for key, value in new_fields.items():
if isinstance(value, str):
new_line = '"{}":"{}",\n'.format(key, value)
else:
new_line = '"{}":{},\n'.format(key, value)
config_lines.insert(1, new_line)
config_out_file = open(out_path, "w")
config_out_file.writelines(config_lines)
config_out_file.close()
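
if __name__ == '__main__':
    # Illustrative sketch (not part of the original module): load_config() strips the
    # '//' comments used in the repo's config.json files before parsing the JSON.
    import tempfile, os
    _tmp = tempfile.NamedTemporaryFile('w', suffix='.json', delete=False)
    _tmp.write('{\n  // toy config\n  "run_name": "demo",\n  "batch_size": 32\n}\n')
    _tmp.close()
    _c = load_config(_tmp.name)
    print(_c.run_name, _c['batch_size'])  # AttrDict allows attribute and key access
    os.remove(_tmp.name)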

97
TTS/utils/radam.py Normal file

@ -0,0 +1,97 @@
# from https://github.com/LiyuanLucasLiu/RAdam
import math
import torch
from torch.optim.optimizer import Optimizer, required
class RAdam(Optimizer):
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True):
if lr < 0.0:
raise ValueError("Invalid learning rate: {}".format(lr))
if eps < 0.0:
raise ValueError("Invalid epsilon value: {}".format(eps))
if not 0.0 <= betas[0] < 1.0:
raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
if not 0.0 <= betas[1] < 1.0:
raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
self.degenerated_to_sgd = degenerated_to_sgd
if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict):
for param in params:
if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]):
param['buffer'] = [[None, None, None] for _ in range(10)]
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)])
super(RAdam, self).__init__(params, defaults)
def __setstate__(self, state):
super(RAdam, self).__setstate__(state)
def step(self, closure=None):
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data.float()
if grad.is_sparse:
raise RuntimeError('RAdam does not support sparse gradients')
p_data_fp32 = p.data.float()
state = self.state[p]
if len(state) == 0:
state['step'] = 0
state['exp_avg'] = torch.zeros_like(p_data_fp32)
state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
else:
state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
state['step'] += 1
buffered = group['buffer'][int(state['step'] % 10)]
if state['step'] == buffered[0]:
N_sma, step_size = buffered[1], buffered[2]
else:
buffered[0] = state['step']
beta2_t = beta2 ** state['step']
N_sma_max = 2 / (1 - beta2) - 1
N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
buffered[1] = N_sma
# more conservative since it's an approximated value
if N_sma >= 5:
step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
elif self.degenerated_to_sgd:
step_size = 1.0 / (1 - beta1 ** state['step'])
else:
step_size = -1
buffered[2] = step_size
# more conservative since it's an approximated value
if N_sma >= 5:
if group['weight_decay'] != 0:
p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
denom = exp_avg_sq.sqrt().add_(group['eps'])
p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * group['lr'])
p.data.copy_(p_data_fp32)
elif step_size > 0:
if group['weight_decay'] != 0:
p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr'])
p.data.copy_(p_data_fp32)
return loss
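
if __name__ == '__main__':
    # Illustrative sketch (not part of the original module): a single RAdam update
    # on a toy linear regression problem.
    torch.manual_seed(0)
    _model = torch.nn.Linear(4, 1)
    _opt = RAdam(_model.parameters(), lr=1e-3, weight_decay=1e-6)
    _x, _y = torch.randn(8, 4), torch.randn(8, 1)
    _loss = torch.nn.functional.mse_loss(_model(_x), _y)
    _loss.backward()
    _opt.step()
    print(_loss.item())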

231
TTS/utils/synthesis.py Normal file

@ -0,0 +1,231 @@
import pkg_resources
installed = {pkg.key for pkg in pkg_resources.working_set} #pylint: disable=not-an-iterable
if 'tensorflow' in installed or 'tensorflow-gpu' in installed:
import tensorflow as tf
import torch
import numpy as np
from .text import text_to_sequence, phoneme_to_sequence
def text_to_seqvec(text, CONFIG):
text_cleaner = [CONFIG.text_cleaner]
# text or phonemes to sequence vector
if CONFIG.use_phonemes:
seq = np.asarray(
phoneme_to_sequence(text, text_cleaner, CONFIG.phoneme_language,
CONFIG.enable_eos_bos_chars,
tp=CONFIG.characters if 'characters' in CONFIG.keys() else None),
dtype=np.int32)
else:
seq = np.asarray(text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None), dtype=np.int32)
return seq
def numpy_to_torch(np_array, dtype, cuda=False):
if np_array is None:
return None
tensor = torch.as_tensor(np_array, dtype=dtype)
if cuda:
return tensor.cuda()
return tensor
def numpy_to_tf(np_array, dtype):
if np_array is None:
return None
tensor = tf.convert_to_tensor(np_array, dtype=dtype)
return tensor
def compute_style_mel(style_wav, ap):
# add a batch dimension; conversion to a backend tensor happens later in synthesis()
style_mel = np.expand_dims(ap.melspectrogram(ap.load_wav(style_wav)), 0)
return style_mel
def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
if CONFIG.use_gst:
decoder_output, postnet_output, alignments, stop_tokens = model.inference(
inputs, style_mel=style_mel, speaker_ids=speaker_id)
else:
if truncated:
decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated(
inputs, speaker_ids=speaker_id)
else:
decoder_output, postnet_output, alignments, stop_tokens = model.inference(
inputs, speaker_ids=speaker_id)
return decoder_output, postnet_output, alignments, stop_tokens
def run_model_tf(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
if CONFIG.use_gst and style_mel is not None:
raise NotImplementedError(' [!] GST inference not implemented for TF')
if truncated:
raise NotImplementedError(' [!] Truncated inference not implemented for TF')
if speaker_id is not None:
raise NotImplementedError(' [!] Multi-Speaker not implemented for TF')
# TODO: handle multispeaker case
decoder_output, postnet_output, alignments, stop_tokens = model(
inputs, training=False)
return decoder_output, postnet_output, alignments, stop_tokens
def run_model_tflite(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
if CONFIG.use_gst and style_mel is not None:
raise NotImplementedError(' [!] GST inference not implemented for TfLite')
if truncated:
raise NotImplementedError(' [!] Truncated inference not implemented for TfLite')
if speaker_id is not None:
raise NotImplementedError(' [!] Multi-Speaker not implemented for TfLite')
# get input and output details
input_details = model.get_input_details()
output_details = model.get_output_details()
# reshape input tensor for the new input shape
model.resize_tensor_input(input_details[0]['index'], inputs.shape)
model.allocate_tensors()
detail = input_details[0]
# input_shape = detail['shape']
model.set_tensor(detail['index'], inputs)
# run the model
model.invoke()
# collect outputs
decoder_output = model.get_tensor(output_details[0]['index'])
postnet_output = model.get_tensor(output_details[1]['index'])
# tflite model only returns feature frames
return decoder_output, postnet_output, None, None
def parse_outputs_torch(postnet_output, decoder_output, alignments, stop_tokens):
postnet_output = postnet_output[0].data.cpu().numpy()
decoder_output = decoder_output[0].data.cpu().numpy()
alignment = alignments[0].cpu().data.numpy()
stop_tokens = stop_tokens[0].cpu().numpy()
return postnet_output, decoder_output, alignment, stop_tokens
def parse_outputs_tf(postnet_output, decoder_output, alignments, stop_tokens):
postnet_output = postnet_output[0].numpy()
decoder_output = decoder_output[0].numpy()
alignment = alignments[0].numpy()
stop_tokens = stop_tokens[0].numpy()
return postnet_output, decoder_output, alignment, stop_tokens
def parse_outputs_tflite(postnet_output, decoder_output):
postnet_output = postnet_output[0]
decoder_output = decoder_output[0]
return postnet_output, decoder_output
def trim_silence(wav, ap):
return wav[:ap.find_endpoint(wav)]
def inv_spectrogram(postnet_output, ap, CONFIG):
if CONFIG.model.lower() in ["tacotron"]:
wav = ap.inv_spectrogram(postnet_output.T)
else:
wav = ap.inv_melspectrogram(postnet_output.T)
return wav
def id_to_torch(speaker_id):
if speaker_id is not None:
speaker_id = np.asarray(speaker_id)
speaker_id = torch.from_numpy(speaker_id).unsqueeze(0)
return speaker_id
# TODO: perform GL with pytorch for batching
def apply_griffin_lim(inputs, input_lens, CONFIG, ap):
'''Apply Griffin-Lim to each sample, iterating through the first dimension.
Args:
inputs (Tensor or np.Array): Features to be converted by GL. First dimension is the batch size.
input_lens (Tensor or np.Array): 1D array of sample lengths.
CONFIG (Dict): TTS config.
ap (AudioProcessor): TTS audio processor.
'''
wavs = []
for idx, spec in enumerate(inputs):
wav_len = (input_lens[idx] * ap.hop_length) - ap.hop_length # inverse librosa padding
wav = inv_spectrogram(spec, ap, CONFIG)
# assert len(wav) == wav_len, f" [!] wav lenght: {len(wav)} vs expected: {wav_len}"
wavs.append(wav[:wav_len])
return wavs
def synthesis(model,
text,
CONFIG,
use_cuda,
ap,
speaker_id=None,
style_wav=None,
truncated=False,
enable_eos_bos_chars=False, #pylint: disable=unused-argument
use_griffin_lim=False,
do_trim_silence=False,
backend='torch'):
"""Synthesize voice for the given text.
Args:
model (TTS.tts.models): model to synthesize.
text (str): target text
CONFIG (dict): config dictionary to be loaded from config.json.
use_cuda (bool): enable cuda.
ap (TTS.tts.utils.audio.AudioProcessor): audio processor to process
model outputs.
speaker_id (int): id of speaker
style_wav (str): path to a reference wav file used for the GST style embedding.
truncated (bool): keep model states after inference. It can be used
for continuous inference of long texts.
enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence.
do_trim_silence (bool): trim silence after synthesis.
backend (str): tf or torch
"""
# GST processing
style_mel = None
if CONFIG.model == "TacotronGST" and style_wav is not None:
style_mel = compute_style_mel(style_wav, ap)
# preprocess the given text
inputs = text_to_seqvec(text, CONFIG)
# pass tensors to backend
if backend == 'torch':
speaker_id = id_to_torch(speaker_id)
style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda)
inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda)
inputs = inputs.unsqueeze(0)
elif backend == 'tf':
# TODO: handle speaker id for tf model
style_mel = numpy_to_tf(style_mel, tf.float32)
inputs = numpy_to_tf(inputs, tf.int32)
inputs = tf.expand_dims(inputs, 0)
elif backend == 'tflite':
style_mel = numpy_to_tf(style_mel, tf.float32)
inputs = numpy_to_tf(inputs, tf.int32)
inputs = tf.expand_dims(inputs, 0)
# synthesize voice
if backend == 'torch':
decoder_output, postnet_output, alignments, stop_tokens = run_model_torch(
model, inputs, CONFIG, truncated, speaker_id, style_mel)
postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_torch(
postnet_output, decoder_output, alignments, stop_tokens)
elif backend == 'tf':
decoder_output, postnet_output, alignments, stop_tokens = run_model_tf(
model, inputs, CONFIG, truncated, speaker_id, style_mel)
postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_tf(
postnet_output, decoder_output, alignments, stop_tokens)
elif backend == 'tflite':
decoder_output, postnet_output, alignment, stop_tokens = run_model_tflite(
model, inputs, CONFIG, truncated, speaker_id, style_mel)
postnet_output, decoder_output = parse_outputs_tflite(
postnet_output, decoder_output)
# convert outputs to numpy
# plot results
wav = None
if use_griffin_lim:
wav = inv_spectrogram(postnet_output, ap, CONFIG)
# trim silence
if do_trim_silence:
wav = trim_silence(wav, ap)
return wav, alignment, decoder_output, postnet_output, stop_tokens, inputs
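
if __name__ == '__main__':
    # Illustrative sketch (not part of the original module): text preprocessing only.
    # A full synthesis() call additionally needs a trained model and an AudioProcessor.
    from TTS.utils.io import AttrDict
    _cfg = AttrDict(text_cleaner='english_cleaners', use_phonemes=False)
    _seq = text_to_seqvec('Hello world.', _cfg)
    _inputs = numpy_to_torch(_seq, torch.long).unsqueeze(0)  # add a batch dimension
    print(_inputs.shape)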


@ -0,0 +1,81 @@
import traceback
from tensorboardX import SummaryWriter
class TensorboardLogger(object):
def __init__(self, log_dir, model_name):
self.model_name = model_name
self.writer = SummaryWriter(log_dir)
self.train_stats = {}
self.eval_stats = {}
def tb_model_weights(self, model, step):
layer_num = 1
for name, param in model.named_parameters():
if param.numel() == 1:
self.writer.add_scalar(
"layer{}-{}/value".format(layer_num, name),
param.max(), step)
else:
self.writer.add_scalar(
"layer{}-{}/max".format(layer_num, name),
param.max(), step)
self.writer.add_scalar(
"layer{}-{}/min".format(layer_num, name),
param.min(), step)
self.writer.add_scalar(
"layer{}-{}/mean".format(layer_num, name),
param.mean(), step)
self.writer.add_scalar(
"layer{}-{}/std".format(layer_num, name),
param.std(), step)
self.writer.add_histogram(
"layer{}-{}/param".format(layer_num, name), param, step)
self.writer.add_histogram(
"layer{}-{}/grad".format(layer_num, name), param.grad, step)
layer_num += 1
def dict_to_tb_scalar(self, scope_name, stats, step):
for key, value in stats.items():
self.writer.add_scalar('{}/{}'.format(scope_name, key), value, step)
def dict_to_tb_figure(self, scope_name, figures, step):
for key, value in figures.items():
self.writer.add_figure('{}/{}'.format(scope_name, key), value, step)
def dict_to_tb_audios(self, scope_name, audios, step, sample_rate):
for key, value in audios.items():
try:
self.writer.add_audio('{}/{}'.format(scope_name, key), value, step, sample_rate=sample_rate)
except Exception:  # pylint: disable=broad-except
traceback.print_exc()
def tb_train_iter_stats(self, step, stats):
self.dict_to_tb_scalar(f"{self.model_name}_TrainIterStats", stats, step)
def tb_train_epoch_stats(self, step, stats):
self.dict_to_tb_scalar(f"{self.model_name}_TrainEpochStats", stats, step)
def tb_train_figures(self, step, figures):
self.dict_to_tb_figure(f"{self.model_name}_TrainFigures", figures, step)
def tb_train_audios(self, step, audios, sample_rate):
self.dict_to_tb_audios(f"{self.model_name}_TrainAudios", audios, step, sample_rate)
def tb_eval_stats(self, step, stats):
self.dict_to_tb_scalar(f"{self.model_name}_EvalStats", stats, step)
def tb_eval_figures(self, step, figures):
self.dict_to_tb_figure(f"{self.model_name}_EvalFigures", figures, step)
def tb_eval_audios(self, step, audios, sample_rate):
self.dict_to_tb_audios(f"{self.model_name}_EvalAudios", audios, step, sample_rate)
def tb_test_audios(self, step, audios, sample_rate):
self.dict_to_tb_audios(f"{self.model_name}_TestAudios", audios, step, sample_rate)
def tb_test_figures(self, step, figures):
self.dict_to_tb_figure(f"{self.model_name}_TestFigures", figures, step)
def tb_add_text(self, title, text, step):
self.writer.add_text(title, text, step)
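
if __name__ == '__main__':
    # Illustrative sketch (not part of the original module): write a couple of scalars
    # into a temporary run directory; they can be inspected with tensorboard later.
    import tempfile
    _logger = TensorboardLogger(tempfile.mkdtemp(), model_name='demo')
    _logger.tb_train_iter_stats(1, {'loss': 0.9, 'lr': 1e-4})
    _logger.tb_add_text('notes', 'first logged step', 1)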

108
TTS/utils/training.py Normal file

@ -0,0 +1,108 @@
import torch
import numpy as np
def setup_torch_training_env(cudnn_enable, cudnn_benchmark):
torch.backends.cudnn.enabled = cudnn_enable
torch.backends.cudnn.benchmark = cudnn_benchmark
torch.manual_seed(54321)
use_cuda = torch.cuda.is_available()
num_gpus = torch.cuda.device_count()
print(" > Using CUDA: ", use_cuda)
print(" > Number of GPUs: ", num_gpus)
return use_cuda, num_gpus
def check_update(model, grad_clip, ignore_stopnet=False):
r'''Check model gradient against unexpected jumps and failures'''
skip_flag = False
if ignore_stopnet:
grad_norm = torch.nn.utils.clip_grad_norm_([param for name, param in model.named_parameters() if 'stopnet' not in name], grad_clip)
else:
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
# compatibility with different torch versions
if isinstance(grad_norm, float):
if np.isinf(grad_norm):
print(" | > Gradient is INF !!")
skip_flag = True
else:
if torch.isinf(grad_norm):
print(" | > Gradient is INF !!")
skip_flag = True
return grad_norm, skip_flag
def lr_decay(init_lr, global_step, warmup_steps):
r'''from https://github.com/r9y9/tacotron_pytorch/blob/master/train.py'''
warmup_steps = float(warmup_steps)
step = global_step + 1.
lr = init_lr * warmup_steps**0.5 * np.minimum(step * warmup_steps**-1.5,
step**-0.5)
return lr
def adam_weight_decay(optimizer):
"""
Custom weight decay operation, not affecting grad values.
"""
for group in optimizer.param_groups:
for param in group['params']:
current_lr = group['lr']
weight_decay = group['weight_decay']
factor = -weight_decay * group['lr']
param.data = param.data.add(param.data,
alpha=factor)
return optimizer, current_lr
# pylint: disable=dangerous-default-value
def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn", "lstm", "gru", "embedding"}):
"""
Skip biases, BatchNorm parameters, RNNs,
and the attention projection layer v.
"""
decay = []
no_decay = []
for name, param in model.named_parameters():
if not param.requires_grad:
continue
if len(param.shape) == 1 or any([skip_name in name for skip_name in skip_list]):
no_decay.append(param)
else:
decay.append(param)
return [{
'params': no_decay,
'weight_decay': 0.
}, {
'params': decay,
'weight_decay': weight_decay
}]
# pylint: disable=protected-access
class NoamLR(torch.optim.lr_scheduler._LRScheduler):
def __init__(self, optimizer, warmup_steps=0.1, last_epoch=-1):
self.warmup_steps = float(warmup_steps)
super(NoamLR, self).__init__(optimizer, last_epoch)
def get_lr(self):
step = max(self.last_epoch, 1)
return [
base_lr * self.warmup_steps**0.5 *
min(step * self.warmup_steps**-1.5, step**-0.5)
for base_lr in self.base_lrs
]
def gradual_training_scheduler(global_step, config):
"""Setup the gradual training schedule wrt number
of active GPUs"""
num_gpus = torch.cuda.device_count()
if num_gpus == 0:
num_gpus = 1
new_values = None
# we set the scheduling wrt num_gpus
for values in config.gradual_training:
if global_step * num_gpus >= values[0]:
new_values = values
return new_values[1], new_values[2]
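
if __name__ == '__main__':
    # Illustrative sketch (not part of the original module): one clipped update with
    # Noam learning-rate warmup on a toy model.
    _model = torch.nn.Linear(4, 1)
    _opt = torch.optim.Adam(_model.parameters(), lr=1e-3)
    _scheduler = NoamLR(_opt, warmup_steps=4000)
    _loss = _model(torch.randn(8, 4)).pow(2).mean()
    _loss.backward()
    _grad_norm, _skip = check_update(_model, grad_clip=1.0)
    if not _skip:
        _opt.step()
    _scheduler.step()
    print(_grad_norm, _scheduler.get_lr())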

38
TTS/vocoder/README.md Normal file

@ -0,0 +1,38 @@
# Mozilla TTS Vocoders (Experimental)
We provide here different vocoder implementations which can be combined with our TTS models to enable a "FASTER THAN REAL-TIME" end-to-end TTS stack.
Currently, there are implementations of the following models.
- Melgan
- MultiBand-Melgan
- GAN-TTS (Discriminator Only)
It is also easy to adapt other vocoder models, as we provide a flexible and modular (but not too modular) framework.
## Training a model
An example [Colab Notebook]() training MelGAN on the LJSpeech dataset is coming soon.
In order to train a new model, you need to collect all of your wav files under a common parent folder and set this path in the `data_path` field of `config.json`.
You need to define the other relevant parameters in your `config.json` and then start training with the following command from the Mozilla TTS root path, where '0' is the ID of the GPU you wish to use.
```CUDA_VISIBLE_DEVICES='0' python vocoder/train.py --config_path path/to/config.json```
Example config files can be found under the `vocoder/configs/` folder.
You can continue a previous training run with the following command.
```CUDA_VISIBLE_DEVICES='0' python vocoder/train.py --continue_path path/to/your/model/folder```
You can fine-tune a pre-trained model with the following command.
```CUDA_VISIBLE_DEVICES='0' python vocoder/train.py --restore_path path/to/your/model.pth.tar```
Restoring a model starts a new training run in a different output folder; it only restores the model weights from the given checkpoint file. Continuing a training run, in contrast, resumes from the same conditions where the previous run left off.
You can also follow your training runs on Tensorboard as you do with our TTS models.
## Acknowledgement
Thanks to @kan-bayashi for his [repository](https://github.com/kan-bayashi/ParallelWaveGAN) being the starting point of our work.

0
TTS/vocoder/__init__.py Normal file

@ -0,0 +1,151 @@
{
"run_name": "multiband-melgan-rwd",
"run_description": "multiband melgan with random window discriminator from https://arxiv.org/pdf/1909.11646.pdf",
// AUDIO PARAMETERS
"audio":{
// stft parameters
"num_freq": 513, // number of stft frequency levels. Size of the linear spectogram frame.
"win_length": 1024, // stft window length in ms.
"hop_length": 256, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
// Audio processing parameters
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
// Silence trimming
"do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
// Griffin-Lim
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// MelSpectrogram parameters
"num_mels": 80, // size of the mel spec frame.
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
// Normalization parameters
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db": -100, // lower bound for normalization
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
// DISTRIBUTED TRAINING
// "distributed":{
// "backend": "nccl",
// "url": "tcp:\/\/localhost:54321"
// },
// MODEL PARAMETERS
"use_pqmf": true,
// LOSS PARAMETERS
"use_stft_loss": true,
"use_subband_stft_loss": true,
"use_mse_gan_loss": true,
"use_hinge_gan_loss": false,
"use_feat_match_loss": false, // use only with melgan discriminators
// loss weights
"stft_loss_weight": 0.5,
"subband_stft_loss_weight": 0.5,
"mse_G_loss_weight": 2.5,
"hinge_G_loss_weight": 2.5,
"feat_match_loss_weight": 25,
// multiscale stft loss parameters
"stft_loss_params": {
"n_ffts": [1024, 2048, 512],
"hop_lengths": [120, 240, 50],
"win_lengths": [600, 1200, 240]
},
// subband multiscale stft loss parameters
"subband_stft_loss_params":{
"n_ffts": [384, 683, 171],
"hop_lengths": [30, 60, 10],
"win_lengths": [150, 300, 60]
},
"target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch
// DISCRIMINATOR
"discriminator_model": "random_window_discriminator",
"discriminator_model_params":{
"uncond_disc_donwsample_factors": [8, 4],
"cond_disc_downsample_factors": [[8, 4, 2, 2, 2], [8, 4, 2, 2], [8, 4, 2], [8, 4], [4, 2, 2]],
"cond_disc_out_channels": [[128, 128, 256, 256], [128, 256, 256], [128, 256], [256], [128, 256]],
"window_sizes": [512, 1024, 2048, 4096, 8192]
},
"steps_to_start_discriminator": 200000, // steps required to start GAN trainining.1
// GENERATOR
"generator_model": "multiband_melgan_generator",
"generator_model_params": {
"upsample_factors":[8, 4, 2],
"num_res_blocks": 4
},
// DATASET
"data_path": "/home/erogol/Data/LJSpeech-1.1/wavs/",
"seq_len": 16384,
"pad_short": 2000,
"conv_pad": 0,
"use_noise_augment": false,
"use_cache": true,
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// TRAINING
"batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
// VALIDATION
"run_eval": true,
"test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
// OPTIMIZER
"noam_schedule": false, // use noam warmup and lr schedule.
"warmup_steps_gen": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"warmup_steps_disc": 4000,
"epochs": 10000, // total number of epochs to train.
"wd": 0.0, // Weight decay weight.
"gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0
"disc_clip_grad": -1, // Discriminator gradient clipping threshold.
"lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
"lr_scheduler_gen_params": {
"gamma": 0.5,
"milestones": [100000, 200000, 300000, 400000, 500000, 600000]
},
"lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
"lr_scheduler_disc_params": {
"gamma": 0.5,
"milestones": [100000, 200000, 300000, 400000, 500000, 600000]
},
"lr_gen": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate.
"lr_disc": 1e-4,
// TENSORBOARD and LOGGING
"print_step": 25, // Number of steps to log traning on console.
"print_eval": false, // If True, it prints loss values for each step in eval run.
"save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
// DATA LOADING
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 4, // number of evaluation data loader processes.
"eval_split_size": 10,
// PATHS
"output_path": "/home/erogol/Models/LJSpeech/"
}


@ -0,0 +1,144 @@
{
"run_name": "multiband-melgan",
"run_description": "multiband melgan mean-var scaling",
// AUDIO PARAMETERS
"audio":{
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"win_length": 1024, // stft window length in ms.
"hop_length": 256, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
// Audio processing parameters
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"ref_level_db": 0, // reference level db, theoretically 20db is the sound of air.
// Silence trimming
"do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
// MelSpectrogram parameters
"num_mels": 80, // size of the mel spec frame.
"mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 1.0, // scaler value appplied after log transform of spectrogram.
// Normalization parameters
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db": -100, // lower bound for normalization
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
// DISTRIBUTED TRAINING
// "distributed":{
// "backend": "nccl",
// "url": "tcp:\/\/localhost:54321"
// },
// MODEL PARAMETERS
"use_pqmf": true,
// LOSS PARAMETERS
"use_stft_loss": true,
"use_subband_stft_loss": true,
"use_mse_gan_loss": true,
"use_hinge_gan_loss": false,
"use_feat_match_loss": false, // use only with melgan discriminators
// loss weights
"stft_loss_weight": 0.5,
"subband_stft_loss_weight": 0.5,
"mse_G_loss_weight": 2.5,
"hinge_G_loss_weight": 2.5,
"feat_match_loss_weight": 25,
// multiscale stft loss parameters
"stft_loss_params": {
"n_ffts": [1024, 2048, 512],
"hop_lengths": [120, 240, 50],
"win_lengths": [600, 1200, 240]
},
// subband multiscale stft loss parameters
"subband_stft_loss_params":{
"n_ffts": [384, 683, 171],
"hop_lengths": [30, 60, 10],
"win_lengths": [150, 300, 60]
},
"target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch
// DISCRIMINATOR
"discriminator_model": "melgan_multiscale_discriminator",
"discriminator_model_params":{
"base_channels": 16,
"max_channels":512,
"downsample_factors":[4, 4, 4]
},
"steps_to_start_discriminator": 200000, // steps required to start GAN trainining.1
// GENERATOR
"generator_model": "multiband_melgan_generator",
"generator_model_params": {
"upsample_factors":[8, 4, 2],
"num_res_blocks": 4
},
// DATASET
"data_path": "/home/erogol/Data/LJSpeech-1.1/wavs/",
"feature_path": null,
"seq_len": 16384,
"pad_short": 2000,
"conv_pad": 0,
"use_noise_augment": false,
"use_cache": true,
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// TRAINING
"batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
// VALIDATION
"run_eval": true,
"test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
// OPTIMIZER
"epochs": 10000, // total number of epochs to train.
"wd": 0.0, // Weight decay weight.
"gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0
"disc_clip_grad": -1, // Discriminator gradient clipping threshold.
"lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
"lr_scheduler_gen_params": {
"gamma": 0.5,
"milestones": [100000, 200000, 300000, 400000, 500000, 600000]
},
"lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
"lr_scheduler_disc_params": {
"gamma": 0.5,
"milestones": [100000, 200000, 300000, 400000, 500000, 600000]
},
"lr_gen": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate.
"lr_disc": 1e-4,
// TENSORBOARD and LOGGING
"print_step": 25, // Number of steps to log traning on console.
"print_eval": false, // If True, it prints loss values for each step in eval run.
"save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
// DATA LOADING
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 4, // number of evaluation data loader processes.
"eval_split_size": 10,
// PATHS
"output_path": "/home/erogol/Models/LJSpeech/"
}
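
A minimal sanity-check sketch for a config like the one above, assuming the `load_config` and `AudioProcessor` helpers elsewhere in this repo keep their current import paths and that `AudioProcessor` exposes a `hop_length` attribute; the file paths are placeholders.

```python
from TTS.tts.utils.io import load_config
from TTS.tts.utils.audio import AudioProcessor

# placeholder path; point it at the JSON above
CONFIG = load_config("configs/multiband_melgan_config.json")

# build the audio front-end from the "audio" section
ap = AudioProcessor(**CONFIG.audio)

# GANDataset (further down in this diff) requires seq_len to be a multiple of hop_length
assert CONFIG.seq_len % ap.hop_length == 0, " [!] seq_len must be a multiple of hop_length."

wav = ap.load_wav("/path/to/sample.wav")   # placeholder wav file
mel = ap.melspectrogram(wav)
print(mel.shape)                           # (num_mels, num_frames)
```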

@ -0,0 +1,144 @@
{
"run_name": "multiband-melgan",
"run_description": "multiband melgan mean-var scaling",
// AUDIO PARAMETERS
"audio":{
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"win_length": 1024, // stft window length in ms.
"hop_length": 256, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
// Audio processing parameters
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"ref_level_db": 0, // reference level db, theoretically 20db is the sound of air.
// Silence trimming
"do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
// MelSpectrogram parameters
"num_mels": 80, // size of the mel spec frame.
"mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 1.0, // scaler value appplied after log transform of spectrogram.
// Normalization parameters
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db": -100, // lower bound for normalization
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"stats_path": "/home/erogol/Data/MozillaMerged22050/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
// DISTRIBUTED TRAINING
// "distributed":{
// "backend": "nccl",
// "url": "tcp:\/\/localhost:54321"
// },
// MODEL PARAMETERS
"use_pqmf": true,
// LOSS PARAMETERS
"use_stft_loss": true,
"use_subband_stft_loss": true,
"use_mse_gan_loss": true,
"use_hinge_gan_loss": false,
"use_feat_match_loss": false, // use only with melgan discriminators
// loss weights
"stft_loss_weight": 0.5,
"subband_stft_loss_weight": 0.5,
"mse_G_loss_weight": 2.5,
"hinge_G_loss_weight": 2.5,
"feat_match_loss_weight": 25,
// multiscale stft loss parameters
"stft_loss_params": {
"n_ffts": [1024, 2048, 512],
"hop_lengths": [120, 240, 50],
"win_lengths": [600, 1200, 240]
},
// subband multiscale stft loss parameters
"subband_stft_loss_params":{
"n_ffts": [384, 683, 171],
"hop_lengths": [30, 60, 10],
"win_lengths": [150, 300, 60]
},
"target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch
// DISCRIMINATOR
"discriminator_model": "melgan_multiscale_discriminator",
"discriminator_model_params":{
"base_channels": 16,
"max_channels":512,
"downsample_factors":[4, 4, 4]
},
"steps_to_start_discriminator": 200000, // steps required to start GAN trainining.1
// GENERATOR
"generator_model": "multiband_melgan_generator",
"generator_model_params": {
"upsample_factors":[8, 4, 2],
"num_res_blocks": 4
},
// DATASET
"data_path": "/home/erogol/Data/MozillaMerged22050/wavs/",
"feature_path": null,
"seq_len": 16384,
"pad_short": 2000,
"conv_pad": 0,
"use_noise_augment": false,
"use_cache": true,
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// TRAINING
"batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
// VALIDATION
"run_eval": true,
"test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
// OPTIMIZER
"epochs": 10000, // total number of epochs to train.
"wd": 0.0, // Weight decay weight.
"gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0
"disc_clip_grad": -1, // Discriminator gradient clipping threshold.
"lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
"lr_scheduler_gen_params": {
"gamma": 0.5,
"milestones": [100000, 200000, 300000, 400000, 500000, 600000]
},
"lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
"lr_scheduler_disc_params": {
"gamma": 0.5,
"milestones": [100000, 200000, 300000, 400000, 500000, 600000]
},
"lr_gen": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate.
"lr_disc": 1e-4,
// TENSORBOARD and LOGGING
"print_step": 25, // Number of steps to log traning on console.
"print_eval": false, // If True, it prints loss values for each step in eval run.
"save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
// DATA LOADING
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 4, // number of evaluation data loader processes.
"eval_split_size": 10,
// PATHS
"output_path": "/home/erogol/Models/Mozilla/"
}

@ -0,0 +1,127 @@
import os
import glob
import torch
import random
import numpy as np
from torch.utils.data import Dataset
from multiprocessing import Manager
class GANDataset(Dataset):
"""
    GAN Dataset searches for all the wav files under the root path
and converts them to acoustic features on the fly and returns
random segments of (audio, feature) couples.
"""
def __init__(self,
ap,
items,
seq_len,
hop_len,
pad_short,
conv_pad=2,
is_training=True,
return_segments=True,
use_noise_augment=False,
use_cache=False,
verbose=False):
self.ap = ap
self.item_list = items
self.compute_feat = not isinstance(items[0], (tuple, list))
self.seq_len = seq_len
self.hop_len = hop_len
self.pad_short = pad_short
self.conv_pad = conv_pad
self.is_training = is_training
self.return_segments = return_segments
self.use_cache = use_cache
self.use_noise_augment = use_noise_augment
self.verbose = verbose
assert seq_len % hop_len == 0, " [!] seq_len has to be a multiple of hop_len."
self.feat_frame_len = seq_len // hop_len + (2 * conv_pad)
# map G and D instances
self.G_to_D_mappings = list(range(len(self.item_list)))
self.shuffle_mapping()
# cache acoustic features
if use_cache:
self.create_feature_cache()
def create_feature_cache(self):
self.manager = Manager()
self.cache = self.manager.list()
self.cache += [None for _ in range(len(self.item_list))]
@staticmethod
def find_wav_files(path):
return glob.glob(os.path.join(path, '**', '*.wav'), recursive=True)
def __len__(self):
return len(self.item_list)
def __getitem__(self, idx):
""" Return different items for Generator and Discriminator and
cache acoustic features """
if self.return_segments:
idx2 = self.G_to_D_mappings[idx]
item1 = self.load_item(idx)
item2 = self.load_item(idx2)
return item1, item2
item1 = self.load_item(idx)
return item1
def shuffle_mapping(self):
random.shuffle(self.G_to_D_mappings)
def load_item(self, idx):
""" load (audio, feat) couple """
if self.compute_feat:
# compute features from wav
wavpath = self.item_list[idx]
# print(wavpath)
if self.use_cache and self.cache[idx] is not None:
audio, mel = self.cache[idx]
else:
audio = self.ap.load_wav(wavpath)
if len(audio) < self.seq_len + self.pad_short:
audio = np.pad(audio, (0, self.seq_len + self.pad_short - len(audio)), \
mode='constant', constant_values=0.0)
mel = self.ap.melspectrogram(audio)
else:
# load precomputed features
wavpath, feat_path = self.item_list[idx]
if self.use_cache and self.cache[idx] is not None:
audio, mel = self.cache[idx]
else:
audio = self.ap.load_wav(wavpath)
mel = np.load(feat_path)
# correct the audio length wrt padding applied in stft
audio = np.pad(audio, (0, self.hop_len), mode="edge")
audio = audio[:mel.shape[-1] * self.hop_len]
assert mel.shape[-1] * self.hop_len == audio.shape[-1], f' [!] {mel.shape[-1] * self.hop_len} vs {audio.shape[-1]}'
audio = torch.from_numpy(audio).float().unsqueeze(0)
mel = torch.from_numpy(mel).float().squeeze(0)
if self.return_segments:
max_mel_start = mel.shape[1] - self.feat_frame_len
mel_start = random.randint(0, max_mel_start)
mel_end = mel_start + self.feat_frame_len
mel = mel[:, mel_start:mel_end]
audio_start = mel_start * self.hop_len
audio = audio[:, audio_start:audio_start +
self.seq_len]
if self.use_noise_augment and self.is_training and self.return_segments:
audio = audio + (1 / 32768) * torch.randn_like(audio)
return (mel, audio)
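
A hedged usage sketch for `GANDataset` above: `ap` is an `AudioProcessor` built from one of the configs earlier in this diff, `wav_files` is a plain list of wav paths (see the preprocess helpers further down), and the numeric values are placeholders taken from the config defaults.

```python
from torch.utils.data import DataLoader

# ap: AudioProcessor; wav_files: list of wav paths (e.g. from load_wav_data below)
dataset = GANDataset(ap=ap,
                     items=wav_files,
                     seq_len=16384,
                     hop_len=256,
                     pad_short=2000,
                     conv_pad=0,
                     is_training=True,
                     return_segments=True)

loader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=4, drop_last=True)

# with return_segments=True each batch is a pair of (mel, audio) items:
# one segment for the generator and an independently shuffled one for the discriminator
(mel_g, audio_g), (mel_d, audio_d) = next(iter(loader))
print(mel_g.shape, audio_g.shape)   # (B, num_mels, seq_len // hop_len), (B, 1, seq_len)
```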

@ -0,0 +1,37 @@
import glob
import os
from pathlib import Path
import numpy as np
def find_wav_files(data_path):
wav_paths = glob.glob(os.path.join(data_path, '**', '*.wav'), recursive=True)
return wav_paths
def find_feat_files(data_path):
feat_paths = glob.glob(os.path.join(data_path, '**', '*.npy'), recursive=True)
return feat_paths
def load_wav_data(data_path, eval_split_size):
wav_paths = find_wav_files(data_path)
np.random.seed(0)
np.random.shuffle(wav_paths)
return wav_paths[:eval_split_size], wav_paths[eval_split_size:]
def load_wav_feat_data(data_path, feat_path, eval_split_size):
wav_paths = sorted(find_wav_files(data_path))
feat_paths = sorted(find_feat_files(feat_path))
assert len(wav_paths) == len(feat_paths)
for wav, feat in zip(wav_paths, feat_paths):
wav_name = Path(wav).stem
feat_name = Path(feat).stem
assert wav_name == feat_name
items = list(zip(wav_paths, feat_paths))
np.random.seed(0)
np.random.shuffle(items)
return items[:eval_split_size], items[eval_split_size:]
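
A short hedged example of the split helpers above; the paths are placeholders.

```python
# random train/eval split over raw wavs; returns (eval_items, train_items)
eval_items, train_items = load_wav_data("/data/LJSpeech-1.1/wavs/", eval_split_size=10)
print(len(eval_items), len(train_items))

# with precomputed features, wav and .npy files are matched by file stem
# eval_items, train_items = load_wav_feat_data("/data/wavs/", "/data/feats/", eval_split_size=10)
```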

@ -0,0 +1,309 @@
import torch
from torch import nn
from torch.nn import functional as F
class TorchSTFT():
def __init__(self, n_fft, hop_length, win_length, window='hann_window'):
""" Torch based STFT operation """
self.n_fft = n_fft
self.hop_length = hop_length
self.win_length = win_length
self.window = getattr(torch, window)(win_length)
def __call__(self, x):
# B x D x T x 2
o = torch.stft(x,
self.n_fft,
self.hop_length,
self.win_length,
                       self.window.to(x.device),
center=True,
pad_mode="reflect", # compatible with audio.py
normalized=False,
onesided=True)
M = o[:, :, :, 0]
P = o[:, :, :, 1]
return torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8))
#################################
# GENERATOR LOSSES
#################################
class STFTLoss(nn.Module):
""" Single scale STFT Loss """
def __init__(self, n_fft, hop_length, win_length):
super(STFTLoss, self).__init__()
self.n_fft = n_fft
self.hop_length = hop_length
self.win_length = win_length
self.stft = TorchSTFT(n_fft, hop_length, win_length)
def forward(self, y_hat, y):
y_hat_M = self.stft(y_hat)
y_M = self.stft(y)
# magnitude loss
loss_mag = F.l1_loss(torch.log(y_M), torch.log(y_hat_M))
# spectral convergence loss
loss_sc = torch.norm(y_M - y_hat_M, p="fro") / torch.norm(y_M, p="fro")
return loss_mag, loss_sc
class MultiScaleSTFTLoss(torch.nn.Module):
""" Multi scale STFT loss """
def __init__(self,
n_ffts=(1024, 2048, 512),
hop_lengths=(120, 240, 50),
win_lengths=(600, 1200, 240)):
super(MultiScaleSTFTLoss, self).__init__()
self.loss_funcs = torch.nn.ModuleList()
for n_fft, hop_length, win_length in zip(n_ffts, hop_lengths, win_lengths):
self.loss_funcs.append(STFTLoss(n_fft, hop_length, win_length))
def forward(self, y_hat, y):
N = len(self.loss_funcs)
loss_sc = 0
loss_mag = 0
for f in self.loss_funcs:
lm, lsc = f(y_hat, y)
loss_mag += lm
loss_sc += lsc
loss_sc /= N
loss_mag /= N
return loss_mag, loss_sc
class MultiScaleSubbandSTFTLoss(MultiScaleSTFTLoss):
""" Multiscale STFT loss for multi band model outputs """
# pylint: disable=no-self-use
def forward(self, y_hat, y):
y_hat = y_hat.view(-1, 1, y_hat.shape[2])
y = y.view(-1, 1, y.shape[2])
return super().forward(y_hat.squeeze(1), y.squeeze(1))
class MSEGLoss(nn.Module):
""" Mean Squared Generator Loss """
# pylint: disable=no-self-use
    def forward(self, score_fake):
        # the generator is trained to make the discriminator score fake samples as real (ones)
        loss_fake = F.mse_loss(score_fake, score_fake.new_ones(score_fake.shape))
        return loss_fake
class HingeGLoss(nn.Module):
""" Hinge Discriminator Loss """
# pylint: disable=no-self-use
def forward(self, score_real):
# TODO: this might be wrong
loss_fake = torch.mean(F.relu(1. - score_real))
return loss_fake
##################################
# DISCRIMINATOR LOSSES
##################################
class MSEDLoss(nn.Module):
""" Mean Squared Discriminator Loss """
def __init__(self,):
super(MSEDLoss, self).__init__()
self.loss_func = nn.MSELoss()
# pylint: disable=no-self-use
def forward(self, score_fake, score_real):
loss_real = self.loss_func(score_real, score_real.new_ones(score_real.shape))
loss_fake = self.loss_func(score_fake, score_fake.new_zeros(score_fake.shape))
loss_d = loss_real + loss_fake
return loss_d, loss_real, loss_fake
class HingeDLoss(nn.Module):
""" Hinge Discriminator Loss """
# pylint: disable=no-self-use
def forward(self, score_fake, score_real):
loss_real = torch.mean(F.relu(1. - score_real))
loss_fake = torch.mean(F.relu(1. + score_fake))
loss_d = loss_real + loss_fake
return loss_d, loss_real, loss_fake
class MelganFeatureLoss(nn.Module):
def __init__(self,):
super(MelganFeatureLoss, self).__init__()
self.loss_func = nn.L1Loss()
# pylint: disable=no-self-use
def forward(self, fake_feats, real_feats):
loss_feats = 0
for fake_feat, real_feat in zip(fake_feats, real_feats):
loss_feats += self.loss_func(fake_feat, real_feat)
loss_feats /= len(fake_feats) + len(real_feats)
return loss_feats
#####################################
# LOSS WRAPPERS
#####################################
def _apply_G_adv_loss(scores_fake, loss_func):
""" Compute G adversarial loss function
and normalize values """
adv_loss = 0
if isinstance(scores_fake, list):
for score_fake in scores_fake:
fake_loss = loss_func(score_fake)
adv_loss += fake_loss
adv_loss /= len(scores_fake)
else:
fake_loss = loss_func(scores_fake)
adv_loss = fake_loss
return adv_loss
def _apply_D_loss(scores_fake, scores_real, loss_func):
""" Compute D loss func and normalize loss values """
loss = 0
real_loss = 0
fake_loss = 0
if isinstance(scores_fake, list):
# multi-scale loss
for score_fake, score_real in zip(scores_fake, scores_real):
            total_loss, total_real_loss, total_fake_loss = loss_func(score_fake=score_fake, score_real=score_real)
            loss += total_loss
            real_loss += total_real_loss
            fake_loss += total_fake_loss
# normalize loss values with number of scales
loss /= len(scores_fake)
real_loss /= len(scores_real)
fake_loss /= len(scores_fake)
else:
# single scale loss
total_loss, real_loss, fake_loss = loss_func(scores_fake, scores_real)
loss = total_loss
return loss, real_loss, fake_loss
##################################
# MODEL LOSSES
##################################
class GeneratorLoss(nn.Module):
def __init__(self, C):
""" Compute Generator Loss values depending on training
configuration """
super(GeneratorLoss, self).__init__()
assert not(C.use_mse_gan_loss and C.use_hinge_gan_loss),\
" [!] Cannot use HingeGANLoss and MSEGANLoss together."
self.use_stft_loss = C.use_stft_loss
self.use_subband_stft_loss = C.use_subband_stft_loss
self.use_mse_gan_loss = C.use_mse_gan_loss
self.use_hinge_gan_loss = C.use_hinge_gan_loss
self.use_feat_match_loss = C.use_feat_match_loss
self.stft_loss_weight = C.stft_loss_weight
self.subband_stft_loss_weight = C.subband_stft_loss_weight
self.mse_gan_loss_weight = C.mse_G_loss_weight
self.hinge_gan_loss_weight = C.hinge_G_loss_weight
self.feat_match_loss_weight = C.feat_match_loss_weight
if C.use_stft_loss:
self.stft_loss = MultiScaleSTFTLoss(**C.stft_loss_params)
if C.use_subband_stft_loss:
self.subband_stft_loss = MultiScaleSubbandSTFTLoss(**C.subband_stft_loss_params)
if C.use_mse_gan_loss:
self.mse_loss = MSEGLoss()
if C.use_hinge_gan_loss:
self.hinge_loss = HingeGLoss()
if C.use_feat_match_loss:
self.feat_match_loss = MelganFeatureLoss()
def forward(self, y_hat=None, y=None, scores_fake=None, feats_fake=None, feats_real=None, y_hat_sub=None, y_sub=None):
gen_loss = 0
adv_loss = 0
return_dict = {}
# STFT Loss
if self.use_stft_loss:
stft_loss_mg, stft_loss_sc = self.stft_loss(y_hat.squeeze(1), y.squeeze(1))
return_dict['G_stft_loss_mg'] = stft_loss_mg
return_dict['G_stft_loss_sc'] = stft_loss_sc
gen_loss += self.stft_loss_weight * (stft_loss_mg + stft_loss_sc)
# subband STFT Loss
if self.use_subband_stft_loss:
subband_stft_loss_mg, subband_stft_loss_sc = self.subband_stft_loss(y_hat_sub, y_sub)
return_dict['G_subband_stft_loss_mg'] = subband_stft_loss_mg
return_dict['G_subband_stft_loss_sc'] = subband_stft_loss_sc
gen_loss += self.subband_stft_loss_weight * (subband_stft_loss_mg + subband_stft_loss_sc)
# multiscale MSE adversarial loss
if self.use_mse_gan_loss and scores_fake is not None:
mse_fake_loss = _apply_G_adv_loss(scores_fake, self.mse_loss)
return_dict['G_mse_fake_loss'] = mse_fake_loss
adv_loss += self.mse_gan_loss_weight * mse_fake_loss
# multiscale Hinge adversarial loss
        if self.use_hinge_gan_loss and scores_fake is not None:
hinge_fake_loss = _apply_G_adv_loss(scores_fake, self.hinge_loss)
return_dict['G_hinge_fake_loss'] = hinge_fake_loss
adv_loss += self.hinge_gan_loss_weight * hinge_fake_loss
# Feature Matching Loss
        if self.use_feat_match_loss and feats_fake is not None:
feat_match_loss = self.feat_match_loss(feats_fake, feats_real)
return_dict['G_feat_match_loss'] = feat_match_loss
adv_loss += self.feat_match_loss_weight * feat_match_loss
return_dict['G_loss'] = gen_loss + adv_loss
return_dict['G_gen_loss'] = gen_loss
return_dict['G_adv_loss'] = adv_loss
return return_dict
class DiscriminatorLoss(nn.Module):
""" Compute Discriminator Loss values depending on training
configuration """
def __init__(self, C):
super(DiscriminatorLoss, self).__init__()
assert not(C.use_mse_gan_loss and C.use_hinge_gan_loss),\
" [!] Cannot use HingeGANLoss and MSEGANLoss together."
self.use_mse_gan_loss = C.use_mse_gan_loss
self.use_hinge_gan_loss = C.use_hinge_gan_loss
if C.use_mse_gan_loss:
self.mse_loss = MSEDLoss()
if C.use_hinge_gan_loss:
self.hinge_loss = HingeDLoss()
def forward(self, scores_fake, scores_real):
loss = 0
return_dict = {}
if self.use_mse_gan_loss:
mse_D_loss, mse_D_real_loss, mse_D_fake_loss = _apply_D_loss(
scores_fake=scores_fake,
scores_real=scores_real,
loss_func=self.mse_loss)
return_dict['D_mse_gan_loss'] = mse_D_loss
return_dict['D_mse_gan_real_loss'] = mse_D_real_loss
return_dict['D_mse_gan_fake_loss'] = mse_D_fake_loss
loss += mse_D_loss
if self.use_hinge_gan_loss:
hinge_D_loss, hinge_D_real_loss, hinge_D_fake_loss = _apply_D_loss(
scores_fake=scores_fake,
scores_real=scores_real,
loss_func=self.hinge_loss)
return_dict['D_hinge_gan_loss'] = hinge_D_loss
return_dict['D_hinge_gan_real_loss'] = hinge_D_real_loss
return_dict['D_hinge_gan_fake_loss'] = hinge_D_fake_loss
loss += hinge_D_loss
return_dict['D_loss'] = loss
return return_dict
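
A minimal sketch of driving the two loss wrappers above from a config-like object. The `SimpleNamespace` stands in for the loaded JSON config, the tensors are random placeholders rather than the repo's training loop, and it assumes the classes above are in scope and a PyTorch version where `torch.stft` still returns a real tensor with a trailing real/imag dimension (as this code expects).

```python
from types import SimpleNamespace
import torch

C = SimpleNamespace(
    use_stft_loss=True, use_subband_stft_loss=False,
    use_mse_gan_loss=True, use_hinge_gan_loss=False, use_feat_match_loss=False,
    stft_loss_weight=0.5, subband_stft_loss_weight=0.5,
    mse_G_loss_weight=2.5, hinge_G_loss_weight=2.5, feat_match_loss_weight=25,
    stft_loss_params={"n_ffts": [1024, 2048, 512],
                      "hop_lengths": [120, 240, 50],
                      "win_lengths": [600, 1200, 240]})

criterion_G = GeneratorLoss(C)
criterion_D = DiscriminatorLoss(C)

y = torch.randn(2, 1, 16384)            # real audio segments
y_hat = torch.randn(2, 1, 16384)        # generator output (placeholder)
scores_fake = [torch.randn(2, 1, 64)]   # one discriminator scale
scores_real = [torch.randn(2, 1, 64)]

g_losses = criterion_G(y_hat=y_hat, y=y, scores_fake=scores_fake)
d_losses = criterion_D(scores_fake=scores_fake, scores_real=scores_real)
print(g_losses["G_loss"].item(), d_losses["D_loss"].item())
```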

@ -0,0 +1,45 @@
from torch import nn
from torch.nn.utils import weight_norm
class ResidualStack(nn.Module):
def __init__(self, channels, num_res_blocks, kernel_size):
super(ResidualStack, self).__init__()
assert (kernel_size - 1) % 2 == 0, " [!] kernel_size has to be odd."
base_padding = (kernel_size - 1) // 2
self.blocks = nn.ModuleList()
for idx in range(num_res_blocks):
layer_kernel_size = kernel_size
layer_dilation = layer_kernel_size**idx
layer_padding = base_padding * layer_dilation
self.blocks += [nn.Sequential(
nn.LeakyReLU(0.2),
nn.ReflectionPad1d(layer_padding),
weight_norm(
nn.Conv1d(channels,
channels,
kernel_size=kernel_size,
dilation=layer_dilation,
bias=True)),
nn.LeakyReLU(0.2),
weight_norm(
nn.Conv1d(channels, channels, kernel_size=1, bias=True)),
)]
self.shortcuts = nn.ModuleList([
weight_norm(nn.Conv1d(channels, channels, kernel_size=1,
bias=True)) for i in range(num_res_blocks)
])
def forward(self, x):
for block, shortcut in zip(self.blocks, self.shortcuts):
x = shortcut(x) + block(x)
return x
def remove_weight_norm(self):
for block, shortcut in zip(self.blocks, self.shortcuts):
nn.utils.remove_weight_norm(block[2])
nn.utils.remove_weight_norm(block[4])
nn.utils.remove_weight_norm(shortcut)
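
A quick hedged check of the block above: the residual stack is shape-preserving over (batch, channels, time), so it can be dropped in after each upsampling layer of the generator.

```python
import torch

stack = ResidualStack(channels=64, num_res_blocks=3, kernel_size=3)
x = torch.randn(4, 64, 100)
print(stack(x).shape)        # torch.Size([4, 64, 100]) -- dilated convs are padded to keep length
stack.remove_weight_norm()   # fold weight norm away for inference
```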

@ -0,0 +1,56 @@
import numpy as np
import torch
import torch.nn.functional as F
from scipy import signal as sig
# adapted from
# https://github.com/kan-bayashi/ParallelWaveGAN/tree/master/parallel_wavegan
class PQMF(torch.nn.Module):
def __init__(self, N=4, taps=62, cutoff=0.15, beta=9.0):
super(PQMF, self).__init__()
self.N = N
self.taps = taps
self.cutoff = cutoff
self.beta = beta
QMF = sig.firwin(taps + 1, cutoff, window=('kaiser', beta))
H = np.zeros((N, len(QMF)))
G = np.zeros((N, len(QMF)))
for k in range(N):
constant_factor = (2 * k + 1) * (np.pi /
(2 * N)) * (np.arange(taps + 1) -
((taps - 1) / 2))
phase = (-1)**k * np.pi / 4
H[k] = 2 * QMF * np.cos(constant_factor + phase)
G[k] = 2 * QMF * np.cos(constant_factor - phase)
H = torch.from_numpy(H[:, None, :]).float()
G = torch.from_numpy(G[None, :, :]).float()
self.register_buffer("H", H)
self.register_buffer("G", G)
updown_filter = torch.zeros((N, N, N)).float()
for k in range(N):
updown_filter[k, k, 0] = 1.0
self.register_buffer("updown_filter", updown_filter)
self.N = N
self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0)
def forward(self, x):
return self.analysis(x)
def analysis(self, x):
return F.conv1d(x, self.H, padding=self.taps // 2, stride=self.N)
def synthesis(self, x):
x = F.conv_transpose1d(x,
self.updown_filter * self.N,
stride=self.N)
x = F.conv1d(x, self.G, padding=self.taps // 2)
return x
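
A hedged round-trip sketch for the PQMF bank above: analysis splits the waveform into N critically sampled sub-bands, and synthesis recombines them (near-perfect reconstruction up to the prototype filter's delay). The audio tensor is random.

```python
import torch

pqmf = PQMF(N=4)
audio = torch.randn(1, 1, 16384)      # (B, 1, T)
subbands = pqmf.analysis(audio)       # (B, N, T // N)
recon = pqmf.synthesis(subbands)      # (B, 1, T)
print(subbands.shape, recon.shape)
```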

640
TTS/vocoder/layers/qmf.dat Normal file
@ -0,0 +1,640 @@
0.0000000e+000
-5.5252865e-004
-5.6176926e-004
-4.9475181e-004
-4.8752280e-004
-4.8937912e-004
-5.0407143e-004
-5.2265643e-004
-5.4665656e-004
-5.6778026e-004
-5.8709305e-004
-6.1327474e-004
-6.3124935e-004
-6.5403334e-004
-6.7776908e-004
-6.9416146e-004
-7.1577365e-004
-7.2550431e-004
-7.4409419e-004
-7.4905981e-004
-7.6813719e-004
-7.7248486e-004
-7.8343323e-004
-7.7798695e-004
-7.8036647e-004
-7.8014496e-004
-7.7579773e-004
-7.6307936e-004
-7.5300014e-004
-7.3193572e-004
-7.2153920e-004
-6.9179375e-004
-6.6504151e-004
-6.3415949e-004
-5.9461189e-004
-5.5645764e-004
-5.1455722e-004
-4.6063255e-004
-4.0951215e-004
-3.5011759e-004
-2.8969812e-004
-2.0983373e-004
-1.4463809e-004
-6.1733441e-005
1.3494974e-005
1.0943831e-004
2.0430171e-004
2.9495311e-004
4.0265402e-004
5.1073885e-004
6.2393761e-004
7.4580259e-004
8.6084433e-004
9.8859883e-004
1.1250155e-003
1.2577885e-003
1.3902495e-003
1.5443220e-003
1.6868083e-003
1.8348265e-003
1.9841141e-003
2.1461584e-003
2.3017255e-003
2.4625617e-003
2.6201759e-003
2.7870464e-003
2.9469448e-003
3.1125421e-003
3.2739613e-003
3.4418874e-003
3.6008268e-003
3.7603923e-003
3.9207432e-003
4.0819753e-003
4.2264269e-003
4.3730720e-003
4.5209853e-003
4.6606461e-003
4.7932561e-003
4.9137604e-003
5.0393023e-003
5.1407354e-003
5.2461166e-003
5.3471681e-003
5.4196776e-003
5.4876040e-003
5.5475715e-003
5.5938023e-003
5.6220643e-003
5.6455197e-003
5.6389200e-003
5.6266114e-003
5.5917129e-003
5.5404364e-003
5.4753783e-003
5.3838976e-003
5.2715759e-003
5.1382275e-003
4.9839688e-003
4.8109469e-003
4.6039530e-003
4.3801862e-003
4.1251642e-003
3.8456408e-003
3.5401247e-003
3.2091886e-003
2.8446758e-003
2.4508540e-003
2.0274176e-003
1.5784683e-003
1.0902329e-003
5.8322642e-004
2.7604519e-005
-5.4642809e-004
-1.1568136e-003
-1.8039473e-003
-2.4826724e-003
-3.1933778e-003
-3.9401124e-003
-4.7222596e-003
-5.5337211e-003
-6.3792293e-003
-7.2615817e-003
-8.1798233e-003
-9.1325330e-003
-1.0115022e-002
-1.1131555e-002
-1.2185000e-002
-1.3271822e-002
-1.4390467e-002
-1.5540555e-002
-1.6732471e-002
-1.7943338e-002
-1.9187243e-002
-2.0453179e-002
-2.1746755e-002
-2.3068017e-002
-2.4416099e-002
-2.5787585e-002
-2.7185943e-002
-2.8607217e-002
-3.0050266e-002
-3.1501761e-002
-3.2975408e-002
-3.4462095e-002
-3.5969756e-002
-3.7481285e-002
-3.9005368e-002
-4.0534917e-002
-4.2064909e-002
-4.3609754e-002
-4.5148841e-002
-4.6684303e-002
-4.8216572e-002
-4.9738576e-002
-5.1255616e-002
-5.2763075e-002
-5.4245277e-002
-5.5717365e-002
-5.7161645e-002
-5.8591568e-002
-5.9983748e-002
-6.1345517e-002
-6.2685781e-002
-6.3971590e-002
-6.5224711e-002
-6.6436751e-002
-6.7607599e-002
-6.8704383e-002
-6.9763024e-002
-7.0762871e-002
-7.1700267e-002
-7.2568258e-002
-7.3362026e-002
-7.4100364e-002
-7.4745256e-002
-7.5313734e-002
-7.5800836e-002
-7.6199248e-002
-7.6499217e-002
-7.6709349e-002
-7.6817398e-002
-7.6823001e-002
-7.6720492e-002
-7.6505072e-002
-7.6174832e-002
-7.5730576e-002
-7.5157626e-002
-7.4466439e-002
-7.3640601e-002
-7.2677464e-002
-7.1582636e-002
-7.0353307e-002
-6.8966401e-002
-6.7452502e-002
-6.5769067e-002
-6.3944481e-002
-6.1960278e-002
-5.9816657e-002
-5.7515269e-002
-5.5046003e-002
-5.2409382e-002
-4.9597868e-002
-4.6630331e-002
-4.3476878e-002
-4.0145828e-002
-3.6641812e-002
-3.2958393e-002
-2.9082401e-002
-2.5030756e-002
-2.0799707e-002
-1.6370126e-002
-1.1762383e-002
-6.9636862e-003
-1.9765601e-003
3.2086897e-003
8.5711749e-003
1.4128883e-002
1.9883413e-002
2.5822729e-002
3.1953127e-002
3.8277657e-002
4.4780682e-002
5.1480418e-002
5.8370533e-002
6.5440985e-002
7.2694330e-002
8.0137293e-002
8.7754754e-002
9.5553335e-002
1.0353295e-001
1.1168269e-001
1.2000780e-001
1.2850029e-001
1.3715518e-001
1.4597665e-001
1.5496071e-001
1.6409589e-001
1.7338082e-001
1.8281725e-001
1.9239667e-001
2.0212502e-001
2.1197359e-001
2.2196527e-001
2.3206909e-001
2.4230169e-001
2.5264803e-001
2.6310533e-001
2.7366340e-001
2.8432142e-001
2.9507167e-001
3.0590986e-001
3.1682789e-001
3.2781137e-001
3.3887227e-001
3.4999141e-001
3.6115899e-001
3.7237955e-001
3.8363500e-001
3.9492118e-001
4.0623177e-001
4.1756969e-001
4.2891199e-001
4.4025538e-001
4.5159965e-001
4.6293081e-001
4.7424532e-001
4.8552531e-001
4.9677083e-001
5.0798175e-001
5.1912350e-001
5.3022409e-001
5.4125534e-001
5.5220513e-001
5.6307891e-001
5.7385241e-001
5.8454032e-001
5.9511231e-001
6.0557835e-001
6.1591099e-001
6.2612427e-001
6.3619801e-001
6.4612697e-001
6.5590163e-001
6.6551399e-001
6.7496632e-001
6.8423533e-001
6.9332824e-001
7.0223887e-001
7.1094104e-001
7.1944626e-001
7.2774489e-001
7.3582118e-001
7.4368279e-001
7.5131375e-001
7.5870808e-001
7.6586749e-001
7.7277809e-001
7.7942875e-001
7.8583531e-001
7.9197358e-001
7.9784664e-001
8.0344858e-001
8.0876950e-001
8.1381913e-001
8.1857760e-001
8.2304199e-001
8.2722753e-001
8.3110385e-001
8.3469374e-001
8.3797173e-001
8.4095414e-001
8.4362383e-001
8.4598185e-001
8.4803158e-001
8.4978052e-001
8.5119715e-001
8.5230470e-001
8.5310209e-001
8.5357206e-001
8.5373856e-001
8.5357206e-001
8.5310209e-001
8.5230470e-001
8.5119715e-001
8.4978052e-001
8.4803158e-001
8.4598185e-001
8.4362383e-001
8.4095414e-001
8.3797173e-001
8.3469374e-001
8.3110385e-001
8.2722753e-001
8.2304199e-001
8.1857760e-001
8.1381913e-001
8.0876950e-001
8.0344858e-001
7.9784664e-001
7.9197358e-001
7.8583531e-001
7.7942875e-001
7.7277809e-001
7.6586749e-001
7.5870808e-001
7.5131375e-001
7.4368279e-001
7.3582118e-001
7.2774489e-001
7.1944626e-001
7.1094104e-001
7.0223887e-001
6.9332824e-001
6.8423533e-001
6.7496632e-001
6.6551399e-001
6.5590163e-001
6.4612697e-001
6.3619801e-001
6.2612427e-001
6.1591099e-001
6.0557835e-001
5.9511231e-001
5.8454032e-001
5.7385241e-001
5.6307891e-001
5.5220513e-001
5.4125534e-001
5.3022409e-001
5.1912350e-001
5.0798175e-001
4.9677083e-001
4.8552531e-001
4.7424532e-001
4.6293081e-001
4.5159965e-001
4.4025538e-001
4.2891199e-001
4.1756969e-001
4.0623177e-001
3.9492118e-001
3.8363500e-001
3.7237955e-001
3.6115899e-001
3.4999141e-001
3.3887227e-001
3.2781137e-001
3.1682789e-001
3.0590986e-001
2.9507167e-001
2.8432142e-001
2.7366340e-001
2.6310533e-001
2.5264803e-001
2.4230169e-001
2.3206909e-001
2.2196527e-001
2.1197359e-001
2.0212502e-001
1.9239667e-001
1.8281725e-001
1.7338082e-001
1.6409589e-001
1.5496071e-001
1.4597665e-001
1.3715518e-001
1.2850029e-001
1.2000780e-001
1.1168269e-001
1.0353295e-001
9.5553335e-002
8.7754754e-002
8.0137293e-002
7.2694330e-002
6.5440985e-002
5.8370533e-002
5.1480418e-002
4.4780682e-002
3.8277657e-002
3.1953127e-002
2.5822729e-002
1.9883413e-002
1.4128883e-002
8.5711749e-003
3.2086897e-003
-1.9765601e-003
-6.9636862e-003
-1.1762383e-002
-1.6370126e-002
-2.0799707e-002
-2.5030756e-002
-2.9082401e-002
-3.2958393e-002
-3.6641812e-002
-4.0145828e-002
-4.3476878e-002
-4.6630331e-002
-4.9597868e-002
-5.2409382e-002
-5.5046003e-002
-5.7515269e-002
-5.9816657e-002
-6.1960278e-002
-6.3944481e-002
-6.5769067e-002
-6.7452502e-002
-6.8966401e-002
-7.0353307e-002
-7.1582636e-002
-7.2677464e-002
-7.3640601e-002
-7.4466439e-002
-7.5157626e-002
-7.5730576e-002
-7.6174832e-002
-7.6505072e-002
-7.6720492e-002
-7.6823001e-002
-7.6817398e-002
-7.6709349e-002
-7.6499217e-002
-7.6199248e-002
-7.5800836e-002
-7.5313734e-002
-7.4745256e-002
-7.4100364e-002
-7.3362026e-002
-7.2568258e-002
-7.1700267e-002
-7.0762871e-002
-6.9763024e-002
-6.8704383e-002
-6.7607599e-002
-6.6436751e-002
-6.5224711e-002
-6.3971590e-002
-6.2685781e-002
-6.1345517e-002
-5.9983748e-002
-5.8591568e-002
-5.7161645e-002
-5.5717365e-002
-5.4245277e-002
-5.2763075e-002
-5.1255616e-002
-4.9738576e-002
-4.8216572e-002
-4.6684303e-002
-4.5148841e-002
-4.3609754e-002
-4.2064909e-002
-4.0534917e-002
-3.9005368e-002
-3.7481285e-002
-3.5969756e-002
-3.4462095e-002
-3.2975408e-002
-3.1501761e-002
-3.0050266e-002
-2.8607217e-002
-2.7185943e-002
-2.5787585e-002
-2.4416099e-002
-2.3068017e-002
-2.1746755e-002
-2.0453179e-002
-1.9187243e-002
-1.7943338e-002
-1.6732471e-002
-1.5540555e-002
-1.4390467e-002
-1.3271822e-002
-1.2185000e-002
-1.1131555e-002
-1.0115022e-002
-9.1325330e-003
-8.1798233e-003
-7.2615817e-003
-6.3792293e-003
-5.5337211e-003
-4.7222596e-003
-3.9401124e-003
-3.1933778e-003
-2.4826724e-003
-1.8039473e-003
-1.1568136e-003
-5.4642809e-004
2.7604519e-005
5.8322642e-004
1.0902329e-003
1.5784683e-003
2.0274176e-003
2.4508540e-003
2.8446758e-003
3.2091886e-003
3.5401247e-003
3.8456408e-003
4.1251642e-003
4.3801862e-003
4.6039530e-003
4.8109469e-003
4.9839688e-003
5.1382275e-003
5.2715759e-003
5.3838976e-003
5.4753783e-003
5.5404364e-003
5.5917129e-003
5.6266114e-003
5.6389200e-003
5.6455197e-003
5.6220643e-003
5.5938023e-003
5.5475715e-003
5.4876040e-003
5.4196776e-003
5.3471681e-003
5.2461166e-003
5.1407354e-003
5.0393023e-003
4.9137604e-003
4.7932561e-003
4.6606461e-003
4.5209853e-003
4.3730720e-003
4.2264269e-003
4.0819753e-003
3.9207432e-003
3.7603923e-003
3.6008268e-003
3.4418874e-003
3.2739613e-003
3.1125421e-003
2.9469448e-003
2.7870464e-003
2.6201759e-003
2.4625617e-003
2.3017255e-003
2.1461584e-003
1.9841141e-003
1.8348265e-003
1.6868083e-003
1.5443220e-003
1.3902495e-003
1.2577885e-003
1.1250155e-003
9.8859883e-004
8.6084433e-004
7.4580259e-004
6.2393761e-004
5.1073885e-004
4.0265402e-004
2.9495311e-004
2.0430171e-004
1.0943831e-004
1.3494974e-005
-6.1733441e-005
-1.4463809e-004
-2.0983373e-004
-2.8969812e-004
-3.5011759e-004
-4.0951215e-004
-4.6063255e-004
-5.1455722e-004
-5.5645764e-004
-5.9461189e-004
-6.3415949e-004
-6.6504151e-004
-6.9179375e-004
-7.2153920e-004
-7.3193572e-004
-7.5300014e-004
-7.6307936e-004
-7.7579773e-004
-7.8014496e-004
-7.8036647e-004
-7.7798695e-004
-7.8343323e-004
-7.7248486e-004
-7.6813719e-004
-7.4905981e-004
-7.4409419e-004
-7.2550431e-004
-7.1577365e-004
-6.9416146e-004
-6.7776908e-004
-6.5403334e-004
-6.3124935e-004
-6.1327474e-004
-5.8709305e-004
-5.6778026e-004
-5.4665656e-004
-5.2265643e-004
-5.0407143e-004
-4.8937912e-004
-4.8752280e-004
-4.9475181e-004
-5.6176926e-004
-5.5252865e-004
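
The file above is a plain-text column of 640 prototype filter taps. A hedged way to inspect it, assuming it stays one value per line:

```python
import numpy as np

taps = np.loadtxt("TTS/vocoder/layers/qmf.dat")
print(taps.shape)                              # (640,)
print(np.allclose(taps[1:], taps[1:][::-1]))   # True -- after the leading zero the taps are symmetric (linear phase)
```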

@ -0,0 +1,78 @@
import numpy as np
from torch import nn
from torch.nn.utils import weight_norm
class MelganDiscriminator(nn.Module):
def __init__(self,
in_channels=1,
out_channels=1,
kernel_sizes=(5, 3),
base_channels=16,
max_channels=1024,
downsample_factors=(4, 4, 4, 4)):
super(MelganDiscriminator, self).__init__()
self.layers = nn.ModuleList()
layer_kernel_size = np.prod(kernel_sizes)
layer_padding = (layer_kernel_size - 1) // 2
# initial layer
self.layers += [
nn.Sequential(
nn.ReflectionPad1d(layer_padding),
weight_norm(
nn.Conv1d(in_channels,
base_channels,
layer_kernel_size,
stride=1)), nn.LeakyReLU(0.2, inplace=True))
]
# downsampling layers
layer_in_channels = base_channels
for downsample_factor in downsample_factors:
layer_out_channels = min(layer_in_channels * downsample_factor,
max_channels)
layer_kernel_size = downsample_factor * 10 + 1
layer_padding = (layer_kernel_size - 1) // 2
layer_groups = layer_in_channels // 4
self.layers += [
nn.Sequential(
weight_norm(
nn.Conv1d(layer_in_channels,
layer_out_channels,
kernel_size=layer_kernel_size,
stride=downsample_factor,
padding=layer_padding,
groups=layer_groups)),
nn.LeakyReLU(0.2, inplace=True))
]
layer_in_channels = layer_out_channels
# last 2 layers
layer_padding1 = (kernel_sizes[0] - 1) // 2
layer_padding2 = (kernel_sizes[1] - 1) // 2
self.layers += [
nn.Sequential(
weight_norm(
nn.Conv1d(layer_out_channels,
layer_out_channels,
kernel_size=kernel_sizes[0],
stride=1,
padding=layer_padding1)),
nn.LeakyReLU(0.2, inplace=True),
),
weight_norm(
nn.Conv1d(layer_out_channels,
out_channels,
kernel_size=kernel_sizes[1],
stride=1,
padding=layer_padding2)),
]
def forward(self, x):
feats = []
for layer in self.layers:
x = layer(x)
feats.append(x)
return x, feats
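
A hedged shape check for the discriminator above, using the `discriminator_model_params` from the configs earlier in this diff; the audio input is random.

```python
import torch

disc = MelganDiscriminator(base_channels=16,
                           max_channels=512,
                           downsample_factors=(4, 4, 4))
audio = torch.randn(2, 1, 16384)
score, feats = disc(audio)
# final score map plus one feature map per layer (used by the feature-matching loss)
print(score.shape, len(feats))
```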

@ -0,0 +1,98 @@
import torch
from torch import nn
from torch.nn.utils import weight_norm
from TTS.vocoder.layers.melgan import ResidualStack
class MelganGenerator(nn.Module):
def __init__(self,
in_channels=80,
out_channels=1,
proj_kernel=7,
base_channels=512,
upsample_factors=(8, 8, 2, 2),
res_kernel=3,
num_res_blocks=3):
super(MelganGenerator, self).__init__()
# assert model parameters
assert (proj_kernel -
1) % 2 == 0, " [!] proj_kernel should be an odd number."
# setup additional model parameters
base_padding = (proj_kernel - 1) // 2
act_slope = 0.2
self.inference_padding = 2
# initial layer
layers = []
layers += [
nn.ReflectionPad1d(base_padding),
weight_norm(
nn.Conv1d(in_channels,
base_channels,
kernel_size=proj_kernel,
stride=1,
bias=True))
]
# upsampling layers and residual stacks
for idx, upsample_factor in enumerate(upsample_factors):
layer_in_channels = base_channels // (2**idx)
layer_out_channels = base_channels // (2**(idx + 1))
layer_filter_size = upsample_factor * 2
layer_stride = upsample_factor
layer_output_padding = upsample_factor % 2
layer_padding = upsample_factor // 2 + layer_output_padding
layers += [
nn.LeakyReLU(act_slope),
weight_norm(
nn.ConvTranspose1d(layer_in_channels,
layer_out_channels,
layer_filter_size,
stride=layer_stride,
padding=layer_padding,
output_padding=layer_output_padding,
bias=True)),
ResidualStack(
channels=layer_out_channels,
num_res_blocks=num_res_blocks,
kernel_size=res_kernel
)
]
layers += [nn.LeakyReLU(act_slope)]
# final layer
layers += [
nn.ReflectionPad1d(base_padding),
weight_norm(
nn.Conv1d(layer_out_channels,
out_channels,
proj_kernel,
stride=1,
bias=True)),
nn.Tanh()
]
self.layers = nn.Sequential(*layers)
def forward(self, c):
return self.layers(c)
def inference(self, c):
c = c.to(self.layers[1].weight.device)
c = torch.nn.functional.pad(
c,
(self.inference_padding, self.inference_padding),
'replicate')
return self.layers(c)
def remove_weight_norm(self):
for _, layer in enumerate(self.layers):
if len(layer.state_dict()) != 0:
try:
nn.utils.remove_weight_norm(layer)
except ValueError:
layer.remove_weight_norm()
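
A hedged inference sketch for the single-band generator above; the mel input is random and the default upsampling factors (8, 8, 2, 2) give a 256x hop.

```python
import torch

gen = MelganGenerator(in_channels=80, out_channels=1,
                      upsample_factors=(8, 8, 2, 2), num_res_blocks=3)
mel = torch.randn(1, 80, 50)        # (B, num_mels, frames)
with torch.no_grad():
    wav = gen.inference(mel)        # replication padding is added internally
print(wav.shape)                    # (1, 1, (50 + 2 * inference_padding) * 256)
```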

Some files were not shown because too many files changed in this diff.