зеркало из https://github.com/mozilla/TTS.git
312 строки
9.6 KiB
Plaintext
312 строки
9.6 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"This is a notebook to generate mel-spectrograms from a TTS model to be used for WaveRNN training."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"TTS_PATH = \"/home/erogol/projects/\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"%load_ext autoreload\n",
|
|
"%autoreload 2\n",
|
|
"import os\n",
|
|
"import sys\n",
|
|
"sys.path.append(TTS_PATH)\n",
|
|
"import torch\n",
|
|
"import importlib\n",
|
|
"import numpy as np\n",
|
|
"from tqdm import tqdm as tqdm\n",
|
|
"from torch.utils.data import DataLoader\n",
|
|
"from TTS.models.tacotron2 import Tacotron2\n",
|
|
"from TTS.datasets.TTSDataset import MyDataset\n",
|
|
"from TTS.utils.audio import AudioProcessor\n",
|
|
"from TTS.utils.visual import plot_spectrogram\n",
|
|
"from TTS.utils.generic_utils import load_config, setup_model\n",
|
|
"from TTS.datasets.preprocess import ljspeech\n",
|
|
"%matplotlib inline\n",
|
|
"\n",
|
|
"import os\n",
|
|
"os.environ['CUDA_VISIBLE_DEVICES']='1'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def set_filename(wav_path, out_path):\n",
|
|
" wav_file = os.path.basename(wav_path)\n",
|
|
" file_name = wav_file.split('.')[0]\n",
|
|
" os.makedirs(os.path.join(out_path, \"quant\"), exist_ok=True)\n",
|
|
" os.makedirs(os.path.join(out_path, \"mel\"), exist_ok=True)\n",
|
|
" os.makedirs(os.path.join(out_path, \"wav_gl\"), exist_ok=True)\n",
|
|
" wavq_path = os.path.join(out_path, \"quant\", file_name)\n",
|
|
" mel_path = os.path.join(out_path, \"mel\", file_name)\n",
|
|
" wav_path = os.path.join(out_path, \"wav_gl\", file_name)\n",
|
|
" return file_name, wavq_path, mel_path, wav_path"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"OUT_PATH = \"/home/erogol/Data/Mozilla/wavernn/4841/\"\n",
|
|
"DATA_PATH = \"/home/erogol/Data/Mozilla/\"\n",
|
|
"DATASET = \"mozilla\"\n",
|
|
"METADATA_FILE = \"metadata.txt\"\n",
|
|
"CONFIG_PATH = \"/media/erogol/data_ssd/Data/models/mozilla_models/4841/config.json\"\n",
|
|
"MODEL_FILE = \"/media/erogol/data_ssd/Data/models/mozilla_models/4841/best_model.pth.tar\"\n",
|
|
"DRY_RUN = False # if False, does not generate output files, only computes loss and visuals.\n",
|
|
"BATCH_SIZE = 32\n",
|
|
"\n",
|
|
"use_cuda = torch.cuda.is_available()\n",
|
|
"print(\" > CUDA enabled: \", use_cuda)\n",
|
|
"\n",
|
|
"C = load_config(CONFIG_PATH)\n",
|
|
"ap = AudioProcessor(bits=9, **C.audio)\n",
|
|
"C.prenet_dropout = False\n",
|
|
"C.separate_stopnet = True"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"preprocessor = importlib.import_module('datasets.preprocess')\n",
|
|
"preprocessor = getattr(preprocessor, DATASET.lower())\n",
|
|
"\n",
|
|
"dataset = MyDataset(DATA_PATH, METADATA_FILE, C.r, C.text_cleaner, ap, preprocessor, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path)\n",
|
|
"loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from TTS.utils.text.symbols import symbols, phonemes\n",
|
|
"from TTS.utils.generic_utils import sequence_mask\n",
|
|
"from TTS.layers.losses import L1LossMasked\n",
|
|
"from TTS.utils.text.symbols import symbols, phonemes\n",
|
|
"\n",
|
|
"# load the model\n",
|
|
"num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n",
|
|
"model = setup_model(num_chars, C)\n",
|
|
"checkpoint = torch.load(MODEL_FILE)\n",
|
|
"model.load_state_dict(checkpoint['model'])\n",
|
|
"print(checkpoint['step'])\n",
|
|
"model.eval()\n",
|
|
"if use_cuda:\n",
|
|
" model = model.cuda()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Generate model outputs "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pickle\n",
|
|
"\n",
|
|
"file_idxs = []\n",
|
|
"losses = []\n",
|
|
"postnet_losses = []\n",
|
|
"criterion = L1LossMasked()\n",
|
|
"for data in tqdm(loader):\n",
|
|
" # setup input data\n",
|
|
" text_input = data[0]\n",
|
|
" text_lengths = data[1]\n",
|
|
" linear_input = data[2]\n",
|
|
" mel_input = data[3]\n",
|
|
" mel_lengths = data[4]\n",
|
|
" stop_targets = data[5]\n",
|
|
" item_idx = data[6]\n",
|
|
" \n",
|
|
" # dispatch data to GPU\n",
|
|
" if use_cuda:\n",
|
|
" text_input = text_input.cuda()\n",
|
|
" text_lengths = text_lengths.cuda()\n",
|
|
" mel_input = mel_input.cuda()\n",
|
|
" mel_lengths = mel_lengths.cuda()\n",
|
|
"# linear_input = linear_input.cuda()\n",
|
|
" stop_targets = stop_targets.cuda()\n",
|
|
" \n",
|
|
" mask = sequence_mask(text_lengths)\n",
|
|
" mel_outputs, postnet_outputs, alignments, stop_tokens = model.forward(text_input, text_lengths, mel_input)\n",
|
|
" \n",
|
|
" # compute mel specs from linear spec if model is Tacotron\n",
|
|
" mel_specs = []\n",
|
|
" if C.model == \"Tacotron\":\n",
|
|
" postnet_outputs = postnet_outputs.data.cpu().numpy()\n",
|
|
" for b in range(postnet_outputs.shape[0]):\n",
|
|
" postnet_output = postnet_outputs[b]\n",
|
|
" mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T).cuda())\n",
|
|
" postnet_outputs = torch.stack(mel_specs)\n",
|
|
" \n",
|
|
" loss = criterion(mel_outputs, mel_input, mel_lengths)\n",
|
|
" loss_postnet = criterion(postnet_outputs, mel_input, mel_lengths)\n",
|
|
" losses.append(loss.item())\n",
|
|
" postnet_losses.append(loss_postnet.item())\n",
|
|
" if not DRY_RUN:\n",
|
|
" for idx in range(text_input.shape[0]):\n",
|
|
" wav_file_path = item_idx[idx]\n",
|
|
" wav = ap.load_wav(wav_file_path)\n",
|
|
" file_name, wavq_path, mel_path, wav_path = set_filename(wav_file_path, OUT_PATH)\n",
|
|
" file_idxs.append(file_name)\n",
|
|
"\n",
|
|
"# # quantize and save wav\n",
|
|
"# wavq = ap.quantize(wav)\n",
|
|
"# np.save(wavq_path, wavq)\n",
|
|
"\n",
|
|
" # save TTS mel\n",
|
|
" mel = postnet_outputs[idx]\n",
|
|
" mel = mel.data.cpu().numpy()\n",
|
|
" mel_length = mel_lengths[idx]\n",
|
|
" mel = mel[:mel_length, :].T\n",
|
|
" np.save(mel_path, mel)\n",
|
|
"\n",
|
|
" # save GL voice\n",
|
|
" # wav_gen = ap.inv_mel_spectrogram(mel.T) # mel to wav\n",
|
|
" # wav_gen = ap.quantize(wav_gen)\n",
|
|
" # np.save(wav_path, wav_gen)\n",
|
|
"\n",
|
|
"if not DRY_RUN:\n",
|
|
" pickle.dump(file_idxs, open(OUT_PATH+\"/dataset_ids.pkl\", \"wb\")) \n",
|
|
" \n",
|
|
"\n",
|
|
"print(np.mean(losses))\n",
|
|
"print(np.mean(postnet_losses))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Check model performance"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"idx = 1\n",
|
|
"mel_example = postnet_outputs[idx].data.cpu().numpy()\n",
|
|
"plot_spectrogram(mel_example[:mel_lengths[idx], :], ap);\n",
|
|
"print(mel_example[:mel_lengths[1], :].shape)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"mel_example = mel_outputs[idx].data.cpu().numpy()\n",
|
|
"plot_spectrogram(mel_example[:mel_lengths[idx], :], ap);\n",
|
|
"print(mel_example[:mel_lengths[1], :].shape)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"wav = ap.load_wav(item_idx[idx])\n",
|
|
"melt = ap.melspectrogram(wav)\n",
|
|
"print(melt.shape)\n",
|
|
"plot_spectrogram(melt.T, ap);"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# postnet, decoder diff\n",
|
|
"from matplotlib import pylab as plt\n",
|
|
"mel_diff = mel_outputs[idx] - postnet_outputs[idx]\n",
|
|
"plt.figure(figsize=(16, 10))\n",
|
|
"plt.imshow(abs(mel_diff.detach().cpu().numpy()[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n",
|
|
"plt.colorbar()\n",
|
|
"plt.tight_layout()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from matplotlib import pylab as plt\n",
|
|
"# mel = mel_poutputs[idx].detach().cpu().numpy()\n",
|
|
"mel = postnet_outputs[idx].detach().cpu().numpy()\n",
|
|
"mel_diff2 = melt.T - mel[:melt.shape[1]]\n",
|
|
"plt.figure(figsize=(16, 10))\n",
|
|
"plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
|
|
"plt.colorbar()\n",
|
|
"plt.tight_layout()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.2"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|