Mirror of https://github.com/mozilla/TTS.git
Merge branch 'dev'
Commit df5899daf4
@@ -26,25 +26,21 @@ jobs:
- run: |
sudo apt update
sudo apt install espeak git
# so we can take advantage of pyproject.toml build-dependency support
- run: python3 -m pip install --upgrade pip
- run: python3 -m pip install numpy Cython
- run: sudo pip install --upgrade pip
- run: sudo pip install -e .
- run: |
python3 setup.py egg_info
python3 -m pip install -e .
- run: |
python3 -m pip install --quiet --upgrade cardboardlint pylint
sudo pip install --quiet --upgrade cardboardlint pylint
cardboardlinter --refspec ${CIRCLE_BRANCH} -n auto
- run: nosetests tests --nocapture
- run: |
./tests/test_server_package.sh
./tests/test_glow-tts_train.sh
./tests/test_server_package.sh
./tests/test_tacotron_train.sh
./tests/test_vocoder_gan_train.sh
./tests/test_vocoder_wavegrad_train.sh
./tests/test_vocoder_wavernn_train.sh
./tests/test_speedy_speech_train.sh
sudo ./tests/test_server_package.sh
sudo ./tests/test_glow-tts_train.sh
sudo ./tests/test_server_package.sh
sudo ./tests/test_tacotron_train.sh
sudo ./tests/test_vocoder_gan_train.sh
sudo ./tests/test_vocoder_wavegrad_train.sh
sudo ./tests/test_vocoder_wavernn_train.sh
sudo ./tests/test_speedy_speech_train.sh

test-3.7:
<<: *test-template
@@ -6,9 +6,11 @@ labels: ''
assignees: ''

---
<b>Questions</b> will not be answered here!!

Please consider posting on [TTS Discourse](https://discourse.mozilla.org/c/tts) page if your issue is not directly related to TTS development (Bugs, code updates etc.).
Help is much more valuable if it's shared publicly, so that more people can benefit from it.

Please consider posting on [TTS Discourse](https://discourse.mozilla.org/c/tts) page or matrix [chat room](https://matrix.to/#/!KTePhNahjgiVumkqca:matrix.org?via=matrix.org) if your issue is not directly related to TTS development (Bugs, code updates etc.).

You can also check https://github.com/mozilla/TTS/wiki/FAQ for common questions and answers.

20 Dockerfile
@@ -1,20 +0,0 @@
FROM pytorch/pytorch:1.0.1-cuda10.0-cudnn7-runtime

WORKDIR /srv/app

RUN apt-get update && \
    apt-get install -y libsndfile1 espeak && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Copy Source later to enable dependency caching
COPY requirements.txt /srv/app/
RUN pip install -r requirements.txt

COPY . /srv/app

# http://bugs.python.org/issue19846
# > At the moment, setting "LANG=C" on a Linux system *fundamentally breaks Python 3*, and that's not OK.
ENV LANG C.UTF-8

CMD python3.6 server/server.py -c server/conf.json
@@ -0,0 +1,11 @@
include README.md
include LICENSE.txt
include requirements.txt
recursive-include TTS *.json
recursive-include TTS *.html
recursive-include TTS *.png
recursive-include TTS *.md
recursive-include TTS *.py
recursive-include TTS *.pyx
recursive-include images *.png
38 README.md
@@ -36,9 +36,12 @@ Please use our dedicated channels for questions and discussion. Help is much mor
## 🔗 Links and Resources
| Type                            | Links                                    |
| ------------------------------- | --------------------------------------- |
| 💾 **Installation**             | [TTS/README.md](https://github.com/mozilla/TTS/tree/dev#install-tts)|
| 👩🏾‍🏫 **Tutorials and Examples** | [TTS/Wiki](https://github.com/mozilla/TTS/wiki/TTS-Notebooks-and-Tutorials) |
| 🤖 **Released Models**          | [TTS/Wiki](https://github.com/mozilla/TTS/wiki/Released-Models)|
| 🚀 **Released Models**          | [TTS/Wiki](https://github.com/mozilla/TTS/wiki/Released-Models)|
| 💻 **Docker Image**             | [Repository by @synesthesiam](https://github.com/synesthesiam/docker-mozillatts)|
| 🖥️ **Demo Server**              | [TTS/server](https://github.com/mozilla/TTS/tree/master/TTS/server)|
| 🤖 **Running TTS on Terminal**  | [TTS/README.md](https://github.com/mozilla/TTS#example-synthesizing-speech-on-terminal-using-the-released-models)|

## 🥇 TTS Performance
<p align="center"><img src="https://discourse-prod-uploads-81679984178418.s3.dualstack.us-west-2.amazonaws.com/optimized/3X/6/4/6428f980e9ec751c248e591460895f7881aec0c6_2_1035x591.png" width="800" /></p>
@@ -90,9 +93,20 @@ Please use our dedicated channels for questions and discussion. Help is much mor
You can also help us implement more models. Some TTS related work can be found [here](https://github.com/erogol/TTS-papers).

## Install TTS
TTS supports **python >= 3.6**.
TTS supports **python >= 3.6, <3.9**.

```python setup.py install``` or ```python setup.py develop``` to keep your installation in your working directory.
If you are only interested in [synthesizing speech](https://github.com/mozilla/TTS/tree/dev#example-synthesizing-speech-on-terminal-using-the-released-models) with the released TTS models, installing from PyPI is the easiest option.

```
pip install TTS
```

If you plan to code or train models, clone TTS and install it locally.

```
git clone https://github.com/mozilla/TTS
pip install -e .
```

## Directory Structure
```
@@ -138,6 +152,24 @@ Some of the public datasets that we successfully applied TTS:
- [LibriTTS](https://openslr.org/60/)
- [Spanish](https://drive.google.com/file/d/1Sm_zyBo67XHkiFhcRSQ4YaHPYM0slO_e/view?usp=sharing) - thx! @carlfm01

## Example: Synthesizing Speech on Terminal Using the Released Models.

After the installation, TTS provides a CLI interface for synthesizing speech using pre-trained models. You can either use your own model or the released models under the TTS project.

Listing released TTS models.
```tts --list_models```

Run a tts and a vocoder model from the released model list. (Simply copy and paste the full model names from the list as arguments for the command below.)
```tts --text "Text for TTS" --model_name "<type>/<language>/<dataset>/<model_name>" --vocoder_name "<type>/<language>/<dataset>/<model_name>" --output_path```

Run your own TTS model (Using Griffin-Lim Vocoder)
```tts --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav```

Run your own TTS and Vocoder models
```tts --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json```

**Note:** You can use ```./TTS/bin/synthesize.py``` if you prefer running ```tts``` from the TTS project folder.

## Example: Training and Fine-tuning LJ-Speech Dataset
Here you can find a [CoLab](https://gist.github.com/erogol/97516ad65b44dbddb8cd694953187c5b) notebook for a hands-on example, training LJSpeech. Or you can manually follow the guideline below.
@@ -0,0 +1,77 @@
{
    "tts_models":{
        "en":{
            "ljspeech":{
                "glow-tts":{
                    "description": "",
                    "model_file": "1NFsfhH8W8AgcfJ-BsL8CYAwQfZ5k4T-n",
                    "config_file": "1IAROF3yy9qTK43vG_-R67y3Py9yYbD6t",
                    "stats_file": null,
                    "commit": ""
                },
                "tacotron2-DCA": {
                    "description": "",
                    "model_file": "1CFoPDQBnhfBFu2Gc0TBSJn8o-TuNKQn7",
                    "config_file": "1lWSscNfKet1zZSJCNirOn7v9bigUZ8C1",
                    "stats_file": "1qevpGRVHPmzfiRBNuugLMX62x1k7B5vK",
                    "commit": ""
                },
                "speedy-speech-wn":{
                    "description": "Speedy Speech model with wavenet decoder.",
                    "model_file": "1VXAwiq6N-Viq3rsSXlf43bdoi0jSvMAJ",
                    "config_file": "1KvZilhsNP3EumVggDcD46yd834eO5hR3",
                    "stats_file": "1Ju7apZ5JlgsVECcETL-GEx3DRoNzWfkR",
                    "commit": "77b6145"
                }
            }
        },
        "es":{
            "mai":{
                "tacotron2-DDC":{
                    "model_file": "1jZ4HvYcAXI5ZClke2iGA7qFQQJBXIovw",
                    "config_file": "1s7g4n-B73ChCB48AQ88_DV_8oyLth8r0",
                    "stats_file": "13st0CZ743v6Br5R5Qw_lH1OPQOr3M-Jv",
                    "commit": ""
                }
            }
        },
        "fr":{
            "mai":{
                "tacotron2-DDC":{
                    "model_file": "1qyxrrCyoXUvBG2lqVd0KqAlHj-2nZCgS",
                    "config_file": "1yECKeP2LI7tNv4E8yVNx1yLmCfTCpkqG",
                    "stats_file": "13st0CZ743v6Br5R5Qw_lH1OPQOr3M-Jv",
                    "commit": ""
                }
            }
        }
    },
    "vocoder_models":{
        "universal":{
            "libri-tts":{
                "wavegrad":{
                    "model_file": "1r2g90JaZsfCj9dJkI9ioIU6JCFMPRqi6",
                    "config_file": "1POrrLf5YEpZyjvWyMccj1nGCVc94mR6s",
                    "stats_file": "1Vwbv4t-N1i3jXqI0bgKAhShAEO097sK0",
                    "commit": "ea976b0"
                },
                "fullband-melgan":{
                    "model_file": "1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K",
                    "config_file": "1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu",
                    "stats_file": "11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU",
                    "commit": "4132240"
                }
            }
        },
        "en": {
            "ljspeech":{
                "mulitband-melgan":{
                    "model_file": "1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K",
                    "config_file": "1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu",
                    "stats_file": "11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU",
                    "commit": "ea976b0"
                }
            }
        }
    }
}
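Note: the `model_file`, `config_file`, and `stats_file` values above are Google Drive file IDs, not URLs; the `ModelManager` added later in this commit resolves them with `gdown`. A sketch of the mapping (ID copied from the glow-tts entry above):

```
# How ModelManager (TTS/utils/manage.py, below) builds the download link.
url_prefix = "https://drive.google.com/uc?id="
model_file = "1NFsfhH8W8AgcfJ-BsL8CYAwQfZ5k4T-n"  # tts_models/en/ljspeech/glow-tts
print(url_prefix + model_file)
# https://drive.google.com/uc?id=1NFsfhH8W8AgcfJ-BsL8CYAwQfZ5k4T-n
```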
@@ -2,114 +2,138 @@
# -*- coding: utf-8 -*-

import argparse
import json
# pylint: disable=redefined-outer-name, unused-argument
import os
import sys
import string
import time
from argparse import RawTextHelpFormatter
# pylint: disable=redefined-outer-name, unused-argument
from pathlib import Path

import torch
import numpy as np

from TTS.tts.utils.generic_utils import setup_model, is_tacotron
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from TTS.vocoder.utils.generic_utils import setup_generator
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer


def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None, gst_style=None):
    t_1 = time.time()
    waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, gst_style, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)

    # grab spectrogram (thx to the nice guys at mozilla discourse for the code snippet)
    if args.save_spectogram:
        spec_file_name = args.text.replace(" ", "_")[0:10]
        spec_file_name = spec_file_name.translate(
            str.maketrans('', '', string.punctuation.replace('_', ''))) + '.npy'
        spec_file_name = os.path.join(args.out_path, spec_file_name)
        spectrogram = torch.FloatTensor(mel_postnet_spec.T)
        spectrogram = spectrogram.unsqueeze(0)
        np.save(spec_file_name, spectrogram)
        print(" > Saving raw spectrogram to " + spec_file_name)

    if CONFIG.model == "Tacotron" and not use_gl:
        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
    if not use_gl:
        # Use if not computed noise schedule with tune_wavegrad
        beta = np.linspace(1e-6, 0.01, 50)
        vocoder_model.compute_noise_level(beta)

        # Use alternative when using output npy file from tune_wavegrad
        # beta = np.load("output-tune-wavegrad.npy", allow_pickle=True).item()
        # vocoder_model.compute_noise_level(beta['beta'])

        device_type = "cuda" if use_cuda else "cpu"
        waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).to(device_type).unsqueeze(0))
    if use_cuda and not use_gl:
        waveform = waveform.cpu()
    if not use_gl:
        waveform = waveform.numpy()
    waveform = waveform.squeeze()
    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)
    tps = (time.time() - t_1) / len(waveform)
    print(" > Run-time: {}".format(time.time() - t_1))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    return waveform

def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')


if __name__ == "__main__":
def main():
    # pylint: disable=bad-continuation
    parser = argparse.ArgumentParser(description='''Synthesize speech on command line.\n\n'''

        '''You can either use your trained model or choose a model from the provided list.\n'''\

        '''
Example runs:

# list provided models
./TTS/bin/synthesize.py --list_models

# run a model from the list
./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path

# run your own TTS model (Using Griffin-Lim Vocoder)
./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav

# run your own TTS and Vocoder models
./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
    --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json

''',
        formatter_class=RawTextHelpFormatter)

    parser = argparse.ArgumentParser()
    parser.add_argument('text', type=str, help='Text to generate speech.')
    parser.add_argument('config_path',
                        type=str,
                        help='Path to model config file.')
    parser.add_argument(
        'model_path',
        '--list_models',
        type=str2bool,
        nargs='?',
        const=True,
        default=False,
        help='list available pre-trained tts and vocoder models.'
    )
    parser.add_argument(
        '--text',
        type=str,
        default=None,
        help='Text to generate speech.'
    )

    # Args for running pre-trained TTS models.
    parser.add_argument(
        '--model_name',
        type=str,
        default=None,
        help=
        'Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>'
    )
    parser.add_argument(
        '--vocoder_name',
        type=str,
        default=None,
        help=
        'Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>'
    )

    # Args for running custom models
    parser.add_argument(
        '--config_path',
        default=None,
        type=str,
        help='Path to model config file.'
    )
    parser.add_argument(
        '--model_path',
        type=str,
        default=None,
        help='Path to model file.',
    )
    parser.add_argument(
        'out_path',
        '--out_path',
        type=str,
        help='Path to save final wav file. Wav file will be names as the text given.',
        default=Path(__file__).resolve().parent,
        help='Path to save final wav file. Wav file will be named as the given text.',
    )
    parser.add_argument('--use_cuda',
                        type=bool,
                        help='Run model on CUDA.',
                        default=False)
    parser.add_argument(
        '--use_cuda',
        type=bool,
        help='Run model on CUDA.',
        default=False
    )
    parser.add_argument(
        '--vocoder_path',
        type=str,
        help=
        'Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).',
        default="",
        default=None,
    )
    parser.add_argument('--vocoder_config_path',
                        type=str,
                        help='Path to vocoder model config file.',
                        default="")
    parser.add_argument(
        '--batched_vocoder',
        type=bool,
        help="If True, vocoder model uses faster batch processing.",
        default=True)
    parser.add_argument('--speakers_json',
                        type=str,
                        help="JSON file for multi-speaker model.",
                        default="")
    parser.add_argument(
        '--speaker_fileid',
        '--vocoder_config_path',
        type=str,
        help="if CONFIG.use_external_speaker_embedding_file is true, name of speaker embedding reference file present in speakers.json, else target speaker_fileid if the model is multi-speaker.",
        help='Path to vocoder model config file.',
        default=None)

    # args for multi-speaker synthesis
    parser.add_argument(
        '--speakers_json',
        type=str,
        help="JSON file for multi-speaker model.",
        default=None)
    parser.add_argument(
        '--speaker_idx',
        type=str,
        help="if the tts model is trained with x-vectors, then speaker_idx is a file present in speakers.json else speaker_idx is the speaker id corresponding to a speaker in the speaker embedding layer.",
        default=None)
    parser.add_argument(
        '--gst_style',
        help="Wav path file for GST style reference.",
        default=None)

    # aux args
    parser.add_argument(
        '--save_spectogram',
        type=bool,
@@ -118,88 +142,77 @@ if __name__ == "__main__":

    args = parser.parse_args()

    # load the config
    C = load_config(args.config_path)
    C.forward_attn_mask = True
    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    # load the audio processor
    ap = AudioProcessor(**C.audio)
    model_path = None
    config_path = None
    vocoder_path = None
    vocoder_config_path = None

    # if the vocabulary was passed, replace the default
    if 'characters' in C.keys():
        symbols, phonemes = make_symbols(**C.characters)
    # CASE1: list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    speaker_embedding = None
    speaker_embedding_dim = None
    num_speakers = 0
    # CASE2: load pre-trained models
    if args.model_name is not None:
        model_path, config_path = manager.download_model(args.model_name)

    # load speakers
    if args.speakers_json != '':
        speaker_mapping = json.load(open(args.speakers_json, 'r'))
        num_speakers = len(speaker_mapping)
        if C.use_external_speaker_embedding_file:
            if args.speaker_fileid is not None:
                speaker_embedding = speaker_mapping[args.speaker_fileid]['embedding']
            else:  # if speaker_fileid is not specified use the first sample in speakers.json
                speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']
            speaker_embedding_dim = len(speaker_embedding)
    if args.vocoder_name is not None:
        vocoder_path, vocoder_config_path = manager.download_model(args.vocoder_name)

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)
    cp = torch.load(args.model_path, map_location=torch.device('cpu'))
    model.load_state_dict(cp['model'])
    model.eval()
    if args.use_cuda:
        model.cuda()
    if is_tacotron(C):
        model.decoder.set_r(cp['r'])
    # CASE3: load custom models
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path

    # load vocoder model
    if args.vocoder_path != "":
        VC = load_config(args.vocoder_config_path)
        vocoder_model = setup_generator(VC)
        vocoder_model.load_state_dict(torch.load(args.vocoder_path, map_location="cpu")["model"])
        vocoder_model.remove_weight_norm()
        if args.use_cuda:
            vocoder_model.cuda()
        vocoder_model.eval()
    else:
        vocoder_model = None
        VC = None
    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    # synthesize voice
    use_griffin_lim = args.vocoder_path == ""
    # RUN THE SYNTHESIS
    # load models
    synthesizer = Synthesizer(model_path, config_path, vocoder_path, vocoder_config_path, args.use_cuda)

    use_griffin_lim = vocoder_path is None
    print(" > Text: {}".format(args.text))

    if not C.use_external_speaker_embedding_file:
        if args.speaker_fileid.isdigit():
            args.speaker_fileid = int(args.speaker_fileid)
        else:
            args.speaker_fileid = None
    else:
        args.speaker_fileid = None
    # # handle multi-speaker setting
    # if not model_config.use_external_speaker_embedding_file and args.speaker_idx is not None:
    #     if args.speaker_idx.isdigit():
    #         args.speaker_idx = int(args.speaker_idx)
    #     else:
    #         args.speaker_idx = None
    # else:
    #     args.speaker_idx = None

    if args.gst_style is None:
        if is_tacotron(C):
            gst_style = C.gst['gst_style_input']
        else:
            gst_style = None
    else:
        # check if gst_style string is a dict, if is dict convert else use string
        try:
            gst_style = json.loads(args.gst_style)
            if max(map(int, gst_style.keys())) >= C.gst['gst_style_tokens']:
                raise RuntimeError("The highest value of the gst_style dictionary key must be less than the number of GST Tokens, \n Highest dictionary key value: {} \n Number of GST tokens: {}".format(max(map(int, gst_style.keys())), C.gst['gst_style_tokens']))
        except ValueError:
            gst_style = args.gst_style
    # if args.gst_style is None:
    #     if 'gst' in model_config.keys() and model_config.gst['gst_style_input'] is not None:
    #         gst_style = model_config.gst['gst_style_input']
    #     else:
    #         gst_style = None
    # else:
    #     # check if gst_style string is a dict, if is dict convert else use string
    #     try:
    #         gst_style = json.loads(args.gst_style)
    #         if max(map(int, gst_style.keys())) >= model_config.gst['gst_style_tokens']:
    #             raise RuntimeError("The highest value of the gst_style dictionary key must be less than the number of GST Tokens, \n Highest dictionary key value: {} \n Number of GST tokens: {}".format(max(map(int, gst_style.keys())), model_config.gst['gst_style_tokens']))
    #     except ValueError:
    #         gst_style = args.gst_style

    wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_fileid, speaker_embedding=speaker_embedding, gst_style=gst_style)
    # kick it
    wav = synthesizer.tts(args.text)

    # save the results
    file_name = args.text.replace(" ", "_")[0:10]
    file_name = args.text.replace(" ", "_")[0:20]
    file_name = file_name.translate(
        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
    out_path = os.path.join(args.out_path, file_name)
    print(" > Saving output to {}".format(out_path))
    ap.save_wav(wav, out_path)
    synthesizer.save_wav(wav, out_path)


if __name__ == "__main__":
    main()
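The refactored flow above delegates loading and synthesis to the new `TTS.utils.synthesizer.Synthesizer`. A minimal programmatic sketch of the same calls `main()` makes (file paths are hypothetical placeholders; the CLI treats a missing vocoder path as a Griffin-Lim fallback):

```
from TTS.utils.synthesizer import Synthesizer

# Arguments are positional, matching the call in main() above.
synthesizer = Synthesizer(
    "path/to/model.pth.tar",        # TTS checkpoint (hypothetical path)
    "path/to/config.json",          # TTS config
    "path/to/vocoder.pth.tar",      # vocoder checkpoint, or None for Griffin-Lim
    "path/to/vocoder_config.json",  # vocoder config, or None
    False,                          # use_cuda
)
wav = synthesizer.tts("Text for TTS")
synthesizer.save_wav(wav, "speech.wav")
```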
@@ -549,7 +549,7 @@ def main(args):  # pylint: disable=redefined-outer-name
            scaler.load_state_dict(checkpoint["scaler"])
        if c.reinit_layers:
            raise RuntimeError  # deliberately falls through to the partial-init path below
    except KeyError:
    except (KeyError, RuntimeError):
        print(" > Partial model initialization.")
        model_dict = model.state_dict()
        model_dict = set_init_dict(model_dict, checkpoint['model'], c)
@@ -9,6 +9,22 @@ Instructions below are based on a Ubuntu 18.04 machine, but it should be simple
##### Using server.py
If you have the environment set already for TTS, then you can directly call ```server.py```.

**Note:** After installing TTS as a package you can use ```tts-server``` to run the commands below.

Example runs:

List officially released models.
```python TTS/server/server.py --list_models ```

Run the server with the official models.
```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/mulitband-melgan```

Run the server with the official models on a GPU.
```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/mulitband-melgan --use_cuda True```

Run the server with custom models.
```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth.tar --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth.tar --vocoder_config /path/to/vocoder/config.json```

##### Using .whl
1. apt-get install -y espeak libsndfile1 python3-venv
2. python3 -m venv /tmp/venv

@@ -21,6 +37,8 @@ You can now open http://localhost:5002 in a browser

#### Running with nginx/uwsgi:

**Note:** This method uses an old TTS model, so quality might be low.

1. apt-get install -y uwsgi uwsgi-plugin-python3 nginx espeak libsndfile1 python3-venv
2. python3 -m venv /tmp/venv
3. source /tmp/venv/bin/activate
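Once one of the servers above is running, synthesis is a single GET request against the `/api/tts` route defined in `server.py` later in this diff. A minimal client sketch (assumes a local server on the default port 5002):

```
import urllib.parse
import urllib.request

# /api/tts takes the input text as a "text" query parameter and returns audio/wav.
params = urllib.parse.urlencode({"text": "Hello world"})
with urllib.request.urlopen(f"http://localhost:5002/api/tts?{params}") as resp:
    with open("hello.wav", "wb") as f:
        f.write(resp.read())
```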
@@ -5,10 +5,6 @@
    "tts_speakers": null,        // json file listing speaker ids. null if no speaker embedding.
    "vocoder_config": null,
    "vocoder_file": null,
    "wavernn_lib_path": null,    // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis.
    "wavernn_path": null,        // wavernn model root path
    "wavernn_file": null,        // wavernn checkpoint file name
    "wavernn_config": null,      // wavernn config file
    "is_wavernn_batched": true,
    "port": 5002,
    "use_cuda": true,
@@ -1,9 +1,14 @@
#!flask/bin/python
import argparse
import os
import sys
import io
from pathlib import Path

from flask import Flask, request, render_template, send_file
from TTS.server.synthesizer import Synthesizer
from flask import Flask, render_template, request, send_file
from TTS.utils.synthesizer import Synthesizer
from TTS.utils.manage import ModelManager
from TTS.utils.io import load_config


def create_argparser():
@@ -11,21 +16,20 @@ def create_argparser():
        return x.lower() in ['true', '1', 'yes']

    parser = argparse.ArgumentParser()
    parser.add_argument('--tts_checkpoint', type=str, help='path to TTS checkpoint file')
    parser.add_argument('--tts_config', type=str, help='path to TTS config.json file')
    parser.add_argument('--list_models', type=convert_boolean, nargs='?', const=True, default=False, help='list available pre-trained tts and vocoder models.')
    parser.add_argument('--model_name', type=str, help='name of one of the released tts models.')
    parser.add_argument('--vocoder_name', type=str, help='name of one of the released vocoder models.')
    parser.add_argument('--tts_checkpoint', type=str, help='path to custom tts checkpoint file')
    parser.add_argument('--tts_config', type=str, help='path to custom tts config.json file')
    parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model')
    parser.add_argument('--wavernn_lib_path', type=str, default=None, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.')
    parser.add_argument('--wavernn_checkpoint', type=str, default=None, help='path to WaveRNN checkpoint file.')
    parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.')
    parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.')
    parser.add_argument('--vocoder_config', type=str, default=None, help='path to TTS.vocoder config file.')
    parser.add_argument('--vocoder_checkpoint', type=str, default=None, help='path to TTS.vocoder checkpoint file.')
    parser.add_argument('--vocoder_config', type=str, default=None, help='path to vocoder config file.')
    parser.add_argument('--vocoder_checkpoint', type=str, default=None, help='path to vocoder checkpoint file.')
    parser.add_argument('--port', type=int, default=5002, help='port to listen on.')
    parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.')
    parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.')
    parser.add_argument('--show_details', type=convert_boolean, default=False, help='Generate model detail page.')
    return parser


synthesizer = None

embedded_models_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model')
@@ -45,6 +49,20 @@ wavernn_config_file = os.path.join(embedded_wavernn_folder, 'config.json')

args = create_argparser().parse_args()

path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path)

if args.list_models:
    manager.list_models()
    sys.exit()

# set models by the released models
if args.model_name is not None:
    tts_checkpoint_file, tts_config_file = manager.download_model(args.model_name)

if args.vocoder_name is not None:
    vocoder_checkpoint_file, vocoder_config_file = manager.download_model(args.vocoder_name)

# If these were not specified in the CLI args, use default values with embedded model files
if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file):
    args.tts_checkpoint = tts_checkpoint_file
@@ -56,26 +74,38 @@ if not args.vocoder_checkpoint and os.path.isfile(vocoder_checkpoint_file):
if not args.vocoder_config and os.path.isfile(vocoder_config_file):
    args.vocoder_config = vocoder_config_file

if not args.wavernn_checkpoint and os.path.isfile(wavernn_checkpoint_file):
    args.wavernn_checkpoint = wavernn_checkpoint_file
if not args.wavernn_config and os.path.isfile(wavernn_config_file):
    args.wavernn_config = wavernn_config_file

synthesizer = Synthesizer(args)
synthesizer = Synthesizer(args.tts_checkpoint, args.tts_config, args.vocoder_checkpoint, args.vocoder_config, args.use_cuda)

app = Flask(__name__)


@app.route('/')
def index():
    return render_template('index.html')
    return render_template('index.html', show_details=args.show_details)

@app.route('/details')
def details():
    model_config = load_config(args.tts_config)
    if args.vocoder_config is not None and os.path.isfile(args.vocoder_config):
        vocoder_config = load_config(args.vocoder_config)
    else:
        vocoder_config = None

    return render_template('details.html',
                           show_details=args.show_details,
                           model_config=model_config,
                           vocoder_config=vocoder_config,
                           args=args.__dict__)

@app.route('/api/tts', methods=['GET'])
def tts():
    text = request.args.get('text')
    print(" > Model input: {}".format(text))
    data = synthesizer.tts(text)
    return send_file(data, mimetype='audio/wav')
    wavs = synthesizer.tts(text)
    out = io.BytesIO()
    synthesizer.save_wav(wavs, out)
    return send_file(out, mimetype='audio/wav')


def main():
Binary file not shown.
After  Width: | Height: | Size: 25 KiB
@@ -1,193 +0,0 @@
import io
import sys
import time

import numpy as np
import torch
import pysbd

from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.speakers import load_speaker_mapping
from TTS.vocoder.utils.generic_utils import setup_generator
# pylint: disable=unused-wildcard-import
# pylint: disable=wildcard-import
from TTS.tts.utils.synthesis import *

from TTS.tts.utils.text import make_symbols, phonemes, symbols


class Synthesizer(object):
    def __init__(self, config):
        self.wavernn = None
        self.vocoder_model = None
        self.config = config
        print(config)
        self.seg = self.get_segmenter("en")
        self.use_cuda = self.config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."
        self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
                      self.config.use_cuda)
        if self.config.vocoder_checkpoint:
            self.load_vocoder(self.config.vocoder_checkpoint, self.config.vocoder_config, self.config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_checkpoint,
                              self.config.wavernn_config, self.config.use_cuda)

    @staticmethod
    def get_segmenter(lang):
        return pysbd.Segmenter(language=lang, clean=True)

    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        # pylint: disable=global-statement
        global symbols, phonemes

        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > checkpoint file: ", tts_checkpoint)

        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)

        if 'characters' in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)

        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # TODO: fix this for multi-speaker model - load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config)
        # load model state
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp:
            self.tts_model.decoder.set_r(cp['r'])
            print(f" > model reduction factor: {cp['r']}")

    def load_vocoder(self, model_file, model_config, use_cuda):
        self.vocoder_config = load_config(model_config)
        self.vocoder_model = setup_generator(self.vocoder_config)
        self.vocoder_model.load_state_dict(torch.load(model_file, map_location="cpu")["model"])
        self.vocoder_model.remove_weight_norm()
        self.vocoder_model.inference_padding = 0
        self.vocoder_config = load_config(model_config)

        if use_cuda:
            self.vocoder_model.cuda()
        self.vocoder_model.eval()

    def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(lib_path)  # set this if WaveRNN is not installed globally
        #pylint: disable=import-outside-toplevel
        from WaveRNN.models.wavernn import Model
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(model_config)
        # This is the default architecture we use for our models.
        # You might need to update it
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        ).cuda()

        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        return self.seg.segment(text)

    def tts(self, text, speaker_id=None):
        start_time = time.time()
        wavs = []
        sens = self.split_into_sentences(text)
        print(sens)
        speaker_id = id_to_torch(speaker_id)
        if speaker_id is not None and self.use_cuda:
            speaker_id = speaker_id.cuda()

        for sen in sens:
            # preprocess the given text
            inputs = text_to_seqvec(sen, self.tts_config)
            inputs = numpy_to_torch(inputs, torch.long, cuda=self.use_cuda)
            inputs = inputs.unsqueeze(0)
            # synthesize voice
            _, postnet_output, _, _ = run_model_torch(self.tts_model, inputs, self.tts_config, False, speaker_id, None)
            if self.vocoder_model:
                # use native vocoder model
                vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
                wav = self.vocoder_model.inference(vocoder_input)
                if self.use_cuda:
                    wav = wav.cpu().numpy()
                else:
                    wav = wav.numpy()
                wav = wav.flatten()
            elif self.wavernn:
                # use 3rd party wavernn
                vocoder_input = None
                if self.tts_config.model == "Tacotron":
                    vocoder_input = torch.FloatTensor(self.ap.out_linear_to_mel(linear_spec=postnet_output.T).T).T.unsqueeze(0)
                else:
                    vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
                if self.use_cuda:
                    vocoder_input.cuda()
                wav = self.wavernn.generate(vocoder_input, batched=self.config.is_wavernn_batched, target=11000, overlap=550)
            else:
                # use GL
                if self.use_cuda:
                    postnet_output = postnet_output[0].cpu()
                else:
                    postnet_output = postnet_output[0]
                postnet_output = postnet_output.numpy()
                wav = inv_spectrogram(postnet_output, self.ap, self.tts_config)

            # trim silence
            wav = trim_silence(wav, self.ap)

            wavs += list(wav)
            wavs += [0] * 10000

        out = io.BytesIO()
        self.save_wav(wavs, out)

        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio['sample_rate']
        print(f" > Processing time: {process_time}")
        print(f" > Real-time factor: {process_time / audio_time}")
        return out
@@ -0,0 +1,131 @@
<!DOCTYPE html>
<html lang="en">

<head>

    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
    <meta name="description" content="">
    <meta name="author" content="">

    <title>TTS engine</title>

    <!-- Bootstrap core CSS -->
    <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
        integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous"
        rel="stylesheet">

    <!-- Custom styles for this template -->
    <style>
        body {
            padding-top: 54px;
        }

        @media (min-width: 992px) {
            body {
                padding-top: 56px;
            }
        }
    </style>
</head>

<body>
    <a href="https://github.com/mozilla/TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;"
            src="https://s3.amazonaws.com/github/ribbons/forkme_left_darkblue_121621.png" alt="Fork me on GitHub"></a>

    {% if show_details == true %}

    <div class="container">
        <b>Model details</b>
    </div>

    <div class="container">
        <details>
            <summary>CLI arguments:</summary>
            <table border="1" align="center" width="75%">
                <tr>
                    <td> CLI key </td>
                    <td> Value </td>
                </tr>

                {% for key, value in args.items() %}

                <tr>
                    <td>{{ key }}</td>
                    <td>{{ value }}</td>
                </tr>

                {% endfor %}
            </table>
        </details>
    </div></br>

    <div class="container">

        {% if model_config != None %}

        <details>
            <summary>Model config:</summary>

            <table border="1" align="center" width="75%">
                <tr>
                    <td> Key </td>
                    <td> Value </td>
                </tr>

                {% for key, value in model_config.items() %}

                <tr>
                    <td>{{ key }}</td>
                    <td>{{ value }}</td>
                </tr>

                {% endfor %}

            </table>
        </details>

        {% endif %}

    </div></br>

    <div class="container">
        {% if vocoder_config != None %}
        <details>
            <summary>Vocoder model config:</summary>

            <table border="1" align="center" width="75%">
                <tr>
                    <td> Key </td>
                    <td> Value </td>
                </tr>

                {% for key, value in vocoder_config.items() %}

                <tr>
                    <td>{{ key }}</td>
                    <td>{{ value }}</td>
                </tr>

                {% endfor %}

            </table>
        </details>
        {% endif %}
    </div></br>

    {% else %}
    <div class="container">
        <b>Please start server with --show_details=true to see details.</b>
    </div>

    {% endif %}

</body>

</html>
@@ -56,11 +56,15 @@
    <div class="container">
        <div class="row">
            <div class="col-lg-12 text-center">
                <img class="mt-5" src="https://user-images.githubusercontent.com/1402048/52643646-c2102980-2edd-11e9-8c37-b72f3c89a640.png" alt=></img>
                <img class="mt-5" src="{{url_for('static', filename='TTS_circle.png')}}" align="middle" />

                <ul class="list-unstyled">
                </ul>
                <input id="text" placeholder="Type here..." size=45 type="text" name="text">
                <button id="speak-button" name="speak">Speak</button><br/><br/>
                {%if show_details%}
                <button id="details-button" onclick="location.href = 'details'" name="model-details">Model Details</button><br/><br/>
                {%endif%}
                <audio id="audio" controls autoplay hidden></audio>
                <p id="message"></p>
            </div>
@@ -128,8 +128,9 @@ class InvConvNear(nn.Module):
        return z, logdet

    def store_inverse(self):
        self.weight_inv = torch.inverse(
        weight_inv = torch.inverse(
            self.weight.float()).to(dtype=self.weight.dtype)
        self.weight_inv = nn.Parameter(weight_inv, requires_grad=False)


class CouplingBlock(nn.Module):
@@ -2,7 +2,13 @@ import numpy as np
import torch
from torch.nn import functional as F
from TTS.tts.utils.generic_utils import sequence_mask
from TTS.tts.layers.glow_tts.monotonic_align.core import maximum_path_c

try:
    # TODO: fix pypi cython installation problem.
    from TTS.tts.layers.glow_tts.monotonic_align.core import maximum_path_c
    CYTHON = True
except ModuleNotFoundError:
    CYTHON = False


def convert_pad_shape(pad_shape):
@@ -32,6 +38,12 @@ def generate_path(duration, mask):


def maximum_path(value, mask):
    if CYTHON:
        return maximum_path_cython(value, mask)
    return maximum_path_numpy(value, mask)


def maximum_path_cython(value, mask):
    """ Cython optimised version.
    value: [b, t_x, t_y]
    mask: [b, t_x, t_y]
@@ -47,3 +59,45 @@ def maximum_path(value, mask):
    t_y_max = mask.sum(2)[:, 0].astype(np.int32)
    maximum_path_c(path, value, t_x_max, t_y_max)
    return torch.from_numpy(path).to(device=device, dtype=dtype)


def maximum_path_numpy(value, mask, max_neg_val=None):
    """
    Monotonic alignment search algorithm
    Numpy-friendly version. It's about 4 times faster than torch version.
    value: [b, t_x, t_y]
    mask: [b, t_x, t_y]
    """
    if max_neg_val is None:
        max_neg_val = -np.inf  # Patch for Sphinx complaint
    value = value * mask

    device = value.device
    dtype = value.dtype
    value = value.cpu().detach().numpy()
    mask = mask.cpu().detach().numpy().astype(np.bool)

    b, t_x, t_y = value.shape
    direction = np.zeros(value.shape, dtype=np.int64)
    v = np.zeros((b, t_x), dtype=np.float32)
    x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1)
    for j in range(t_y):
        v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[:, :-1]
        v1 = v
        max_mask = v1 >= v0
        v_max = np.where(max_mask, v1, v0)
        direction[:, :, j] = max_mask

        index_mask = x_range <= j
        v = np.where(index_mask, v_max + value[:, :, j], max_neg_val)
    direction = np.where(mask, direction, 1)

    path = np.zeros(value.shape, dtype=np.float32)
    index = mask[:, :, 0].sum(1).astype(np.int64) - 1
    index_range = np.arange(b)
    for j in reversed(range(t_y)):
        path[index_range, index, j] = 1
        index = index + direction[index_range, index, j] - 1
    path = path * mask.astype(np.float32)
    path = torch.from_numpy(path).to(device=device, dtype=dtype)
    return path
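Both implementations return the same hard monotonic path over `[b, t_x, t_y]`; a quick sanity-check sketch (shapes illustrative; the import path is assumed from the `core` import above):

```
import torch
from TTS.tts.layers.glow_tts.monotonic_align import maximum_path  # assumed module path

value = torch.randn(2, 5, 8)      # batch 2, 5 text tokens (t_x), 8 decoder frames (t_y)
mask = torch.ones(2, 5, 8)        # every position valid
path = maximum_path(value, mask)  # dispatches to Cython if built, else the NumPy fallback
print(path.shape)                 # torch.Size([2, 5, 8]); one token selected per frame
```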
@@ -1,7 +1,7 @@
from distutils.core import setup
from Cython.Build import cythonize
import numpy
# from distutils.core import setup
# from Cython.Build import cythonize
# import numpy

setup(name='monotonic_align',
      ext_modules=cythonize("core.pyx"),
      include_dirs=[numpy.get_include()])
# setup(name='monotonic_align',
#       ext_modules=cythonize("core.pyx"),
#       include_dirs=[numpy.get_include()])
@@ -223,3 +223,11 @@ class GlowTts(nn.Module):

    def store_inverse(self):
        self.decoder.store_inverse()

    def load_checkpoint(self, config, checkpoint_path, eval=False):  # pylint: disable=unused-argument, redefined-builtin
        state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
        self.load_state_dict(state['model'])
        if eval:
            self.eval()
            self.store_inverse()
            assert not self.training
@@ -188,5 +188,12 @@ class SpeedySpeech(nn.Module):
        o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
        o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1)
        y_lengths = o_dr.sum(1)
        o_de, attn= self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g)
        o_de, attn = self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g)
        return o_de, attn

    def load_checkpoint(self, config, checkpoint_path, eval=False):  # pylint: disable=unused-argument, redefined-builtin
        state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
        self.load_state_dict(state['model'])
        if eval:
            self.eval()
            assert not self.training
@@ -121,6 +121,14 @@ class TacotronAbstract(ABC, nn.Module):
    def inference(self):
        pass

    def load_checkpoint(self, config, checkpoint_path, eval=False):  # pylint: disable=unused-argument, redefined-builtin
        state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
        self.load_state_dict(state['model'])
        self.decoder.set_r(state['r'])
        if eval:
            self.eval()
            assert not self.training

    #############################
    # COMMON COMPUTE FUNCTIONS
    #############################
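Glow-TTS, SpeedySpeech, and the Tacotron family now share the same `load_checkpoint(config, checkpoint_path, eval=False)` hook. A hedged sketch of a caller, built the way the scripts above build models (paths hypothetical):

```
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.text.symbols import symbols
from TTS.utils.io import load_config

C = load_config("path/to/config.json")                  # hypothetical config path
model = setup_model(len(symbols), num_speakers=0, c=C)  # as in the old synthesizer.py
# eval=True switches off training mode; Glow-TTS additionally pre-computes
# inverse flow weights via store_inverse().
model.load_checkpoint(C, "path/to/model.pth.tar", eval=True)
```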
@@ -7,7 +7,7 @@ from TTS.utils.io import RenamingUnpickler


def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False):
def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False, eval=False):
    """Load ```TTS.tts.models``` checkpoints.

    Args:

@@ -33,6 +33,8 @@ def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False):
    if hasattr(model.decoder, 'r'):
        model.decoder.set_r(state['r'])
        print(" > Model r: ", state['r'])
    if eval:
        model.eval()
    return model, state
@@ -50,7 +50,7 @@ def plot_spectrogram(spectrogram,
    spectrogram_ = spectrogram_.astype(
        np.float32) if spectrogram_.dtype == np.float16 else spectrogram_
    if ap is not None:
        spectrogram_ = ap._denormalize(spectrogram_)  # pylint: disable=protected-access
        spectrogram_ = ap.denormalize(spectrogram_)  # pylint: disable=protected-access
    fig = plt.figure(figsize=fig_size)
    plt.imshow(spectrogram_, aspect="auto", origin="lower")
    plt.colorbar()
@@ -35,9 +35,9 @@ class AudioProcessor(object):
                 trim_db=60,
                 do_sound_norm=False,
                 stats_path=None,
                 verbose=True,
                 **_):

        print(" > Setting up Audio Processor...")
        # setup class attributes
        self.sample_rate = sample_rate
        self.resample = resample

@@ -73,8 +73,10 @@ class AudioProcessor(object):
        assert min_level_db != 0.0, " [!] min_level_db is 0"
        assert self.win_length <= self.fft_size, " [!] win_length cannot be larger than fft_size"
        members = vars(self)
        for key, value in members.items():
            print(" | > {}:{}".format(key, value))
        if verbose:
            print(" > Setting up Audio Processor...")
            for key, value in members.items():
                print(" | > {}:{}".format(key, value))
        # create spectrogram utils
        self.mel_basis = self._build_mel_basis()
        self.inv_mel_basis = np.linalg.pinv(self._build_mel_basis())

@@ -107,7 +109,7 @@ class AudioProcessor(object):
        return hop_length, win_length

    ### normalization ###
    def _normalize(self, S):
    def normalize(self, S):
        """Put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]"""
        #pylint: disable=no-else-return
        S = S.copy()

@@ -136,7 +138,7 @@ class AudioProcessor(object):
        else:
            return S

    def _denormalize(self, S):
    def denormalize(self, S):
        """denormalize values"""
        #pylint: disable=no-else-return
        S_denorm = S.copy()

@@ -221,7 +223,7 @@ class AudioProcessor(object):
        else:
            D = self._stft(y)
        S = self._amp_to_db(np.abs(D))
        return self._normalize(S)
        return self.normalize(S)

    def melspectrogram(self, y):
        if self.preemphasis != 0:

@@ -229,11 +231,11 @@ class AudioProcessor(object):
        else:
            D = self._stft(y)
        S = self._amp_to_db(self._linear_to_mel(np.abs(D)))
        return self._normalize(S)
        return self.normalize(S)

    def inv_spectrogram(self, spectrogram):
        """Converts spectrogram to waveform using librosa"""
        S = self._denormalize(spectrogram)
        S = self.denormalize(spectrogram)
        S = self._db_to_amp(S)
        # Reconstruct phase
        if self.preemphasis != 0:

@@ -242,7 +244,7 @@ class AudioProcessor(object):

    def inv_melspectrogram(self, mel_spectrogram):
        '''Converts melspectrogram to waveform using librosa'''
        D = self._denormalize(mel_spectrogram)
        D = self.denormalize(mel_spectrogram)
        S = self._db_to_amp(D)
        S = self._mel_to_linear(S)  # Convert back to linear
        if self.preemphasis != 0:

@@ -250,11 +252,11 @@ class AudioProcessor(object):
        return self._griffin_lim(S**self.power)

    def out_linear_to_mel(self, linear_spec):
        S = self._denormalize(linear_spec)
        S = self.denormalize(linear_spec)
        S = self._db_to_amp(S)
        S = self._linear_to_mel(np.abs(S))
        S = self._amp_to_db(S)
        mel = self._normalize(S)
        mel = self.normalize(S)
        return mel

    ### STFT and ISTFT ###
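With `normalize`/`denormalize` now public, the processor's round trip reads naturally. A sketch assuming a model config and that `AudioProcessor.load_wav` exists as in the rest of the codebase (paths hypothetical):

```
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config

C = load_config("path/to/config.json")  # hypothetical model config
ap = AudioProcessor(**C.audio)          # same construction as synthesize.py above
wav = ap.load_wav("speech.wav")         # assumed helper; any float waveform works
mel = ap.melspectrogram(wav)            # ends with ap.normalize(S)
wav_hat = ap.inv_melspectrogram(mel)    # starts with ap.denormalize(mel), then Griffin-Lim
```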
@@ -3,6 +3,8 @@ import glob
import os
import shutil
import subprocess
import sys
from pathlib import Path

import torch

@@ -67,6 +69,22 @@ def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def get_user_data_dir(appname):
    if sys.platform == "win32":
        import winreg  # pylint: disable=import-outside-toplevel
        key = winreg.OpenKey(
            winreg.HKEY_CURRENT_USER,
            r"Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders"
        )
        dir_, _ = winreg.QueryValueEx(key, "Local AppData")
        ans = Path(dir_).resolve(strict=False)
    elif sys.platform == 'darwin':
        ans = Path('~/Library/Application Support/').expanduser()
    else:
        ans = Path.home().joinpath('.local/share')
    return ans.joinpath(appname)
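`get_user_data_dir('tts')` is where the new `ModelManager` (below) stores downloads. What it resolves to per platform, following the branches above (username hypothetical):

```
from TTS.utils.generic_utils import get_user_data_dir

print(get_user_data_dir("tts"))
# Linux:   /home/<user>/.local/share/tts
# macOS:   /Users/<user>/Library/Application Support/tts
# Windows: C:\Users\<user>\AppData\Local\tts
```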

def set_init_dict(model_dict, checkpoint_state, c):
    # Partial initialization: if there is a mismatch with new and old layer, it is skipped.
    for k, v in checkpoint_state.items():

@@ -97,6 +115,7 @@ def set_init_dict(model_dict, checkpoint_state, c):
                  len(model_dict)))
    return model_dict


class KeepAverage():
    def __init__(self):
        self.avg_values = {}
@@ -20,6 +20,16 @@ class AttrDict(dict):
        self.__dict__ = self


def read_json_with_comments(json_path):
    # fallback to json
    with open(json_path, "r") as f:
        input_str = f.read()
    # handle comments
    input_str = re.sub(r'\\\n', '', input_str)
    input_str = re.sub(r'//.*\n', '\n', input_str)
    data = json.loads(input_str)
    return data

def load_config(config_path: str) -> AttrDict:
    """Load config files and discard comments


@@ -33,14 +43,7 @@ def load_config(config_path: str) -> AttrDict:
    with open(config_path, "r") as f:
        data = yaml.safe_load(f)
    else:
        # fallback to json
        with open(config_path, "r") as f:
            input_str = f.read()
        # handle comments
        input_str = re.sub(r'\\\n', '', input_str)
        input_str = re.sub(r'//.*\n', '\n', input_str)
        data = json.loads(input_str)

        data = read_json_with_comments(config_path)
    config.update(data)
    return config
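The extracted helper keeps `load_config`'s JSON-with-comments behavior, which is what lets `server/conf.json` above carry `//` comments. A small sketch (temp path hypothetical):

```
from TTS.utils.io import read_json_with_comments

# Hypothetical commented config in the style of server/conf.json above.
with open("/tmp/conf.json", "w") as f:
    f.write('{\n  "port": 5002,  // port to listen on\n  "use_cuda": false // CPU by default\n}\n')

print(read_json_with_comments("/tmp/conf.json"))  # {'port': 5002, 'use_cuda': False}
```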
@@ -0,0 +1,103 @@
import json
import gdown
from pathlib import Path
import os

from TTS.utils.io import load_config
from TTS.utils.generic_utils import get_user_data_dir

class ModelManager(object):
    """Manage TTS models defined in .models.json.
    It provides an interface to list and download
    models defined in '.models.json'

    Models are downloaded into the per-user data directory
    returned by `get_user_data_dir('tts')`.

    Args:
        models_file (str): path to .models.json
    """
    def __init__(self, models_file):
        super().__init__()
        self.output_prefix = get_user_data_dir('tts')
        self.url_prefix = "https://drive.google.com/uc?id="
        self.models_dict = None
        self.read_models_file(models_file)

    def read_models_file(self, file_path):
        """Read .models.json as a dict

        Args:
            file_path (str): path to .models.json.
        """
        with open(file_path) as json_file:
            self.models_dict = json.load(json_file)

    def list_langs(self):
        print(" Name format: type/language")
        for model_type in self.models_dict:
            for lang in self.models_dict[model_type]:
                print(f" >: {model_type}/{lang} ")

    def list_datasets(self):
        print(" Name format: type/language/dataset")
        for model_type in self.models_dict:
            for lang in self.models_dict[model_type]:
                for dataset in self.models_dict[model_type][lang]:
                    print(f" >: {model_type}/{lang}/{dataset}")

    def list_models(self):
        print(" Name format: type/language/dataset/model")
        for model_type in self.models_dict:
            for lang in self.models_dict[model_type]:
                for dataset in self.models_dict[model_type][lang]:
                    for model in self.models_dict[model_type][lang][dataset]:
                        print(f" >: {model_type}/{lang}/{dataset}/{model} ")

    def download_model(self, model_name):
        """Download model files given the full model name.
        Model name is in the format
            'type/language/dataset/model'
            e.g. 'tts_model/en/ljspeech/tacotron'

        Args:
            model_name (str): model name as explained above.

        TODO: support multi-speaker models
        """
        # fetch model info from the dict
        model_type, lang, dataset, model = model_name.split("/")
        model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
        model_item = self.models_dict[model_type][lang][dataset][model]
        # set the model specific output path
        output_path = os.path.join(self.output_prefix, model_full_name)
        output_model_path = os.path.join(output_path, "model_file.pth.tar")
        output_config_path = os.path.join(output_path, "config.json")
        if os.path.exists(output_path):
            print(f" > {model_name} is already downloaded.")
        else:
            os.makedirs(output_path, exist_ok=True)
            print(f" > Downloading model to {output_path}")
            output_stats_path = None
            # download files to the output path
            self._download_file(model_item['model_file'], output_model_path)
            self._download_file(model_item['config_file'], output_config_path)
            if model_item['stats_file'] is not None and len(model_item['stats_file']) > 1:
                output_stats_path = os.path.join(output_path, 'scale_stats.npy')
                self._download_file(model_item['stats_file'], output_stats_path)
                # set scale stats path in config.json
                config_path = output_config_path
                config = load_config(config_path)
                config["audio"]['stats_path'] = output_stats_path
                with open(config_path, "w") as jf:
                    json.dump(config, jf)
        return output_model_path, output_config_path

    def _download_file(self, idx, output):
        gdown.download(f"{self.url_prefix}{idx}", output=output)
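A minimal sketch of driving `ModelManager` from Python (the module path is assumed from this diff; the `.models.json` path is hypothetical, and the model name follows the `type/language/dataset/model` format described in the docstring):

```python
from TTS.utils.manage import ModelManager  # module path assumed from this diff

manager = ModelManager("path/to/.models.json")   # hypothetical path
manager.list_models()                            # prints type/language/dataset/model entries
model_path, config_path = manager.download_model("tts_model/en/ljspeech/tacotron")
# The returned paths feed straight into the Synthesizer added below.
```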
@@ -0,0 +1,169 @@
import time

import numpy as np
import torch
import pysbd

from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.speakers import load_speaker_mapping
from TTS.vocoder.utils.generic_utils import setup_generator, interpolate_vocoder_input
# pylint: disable=unused-wildcard-import
# pylint: disable=wildcard-import
from TTS.tts.utils.synthesis import *

from TTS.tts.utils.text import make_symbols, phonemes, symbols


class Synthesizer(object):
    def __init__(self, tts_checkpoint, tts_config, vocoder_checkpoint=None, vocoder_config=None, use_cuda=False):
        """Encapsulation of tts and vocoder models for inference.

        TODO: handle multi-speaker and GST inference.

        Args:
            tts_checkpoint (str): path to the tts model file.
            tts_config (str): path to the tts config file.
            vocoder_checkpoint (str, optional): path to the vocoder model file. Defaults to None.
            vocoder_config (str, optional): path to the vocoder config file. Defaults to None.
            use_cuda (bool, optional): enable/disable cuda. Defaults to False.
        """
        self.tts_checkpoint = tts_checkpoint
        self.tts_config = tts_config
        self.vocoder_checkpoint = vocoder_checkpoint
        self.vocoder_config = vocoder_config
        self.use_cuda = use_cuda
        self.wavernn = None
        self.vocoder_model = None
        self.num_speakers = 0
        self.tts_speakers = None
        self.speaker_embedding_dim = None
        self.seg = self.get_segmenter("en")
        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."
        self.load_tts(tts_checkpoint, tts_config,
                      use_cuda)
        if vocoder_checkpoint:
            self.load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda)

    @staticmethod
    def get_segmenter(lang):
        return pysbd.Segmenter(language=lang, clean=True)

    def load_speakers(self):
        # load speakers
        if self.model_config.use_speaker_embedding is not None:
            self.tts_speakers = load_speaker_mapping(self.tts_config.tts_speakers_json)
            self.num_speakers = len(self.tts_speakers)
        else:
            self.num_speakers = 0
        # set external speaker embedding
        if self.tts_config.use_external_speaker_embedding_file:
            speaker_embedding = self.tts_speakers[list(self.tts_speakers.keys())[0]]['embedding']
            self.speaker_embedding_dim = len(speaker_embedding)

    def init_speaker(self, speaker_idx):
        # load speakers
        speaker_embedding = None
        if hasattr(self, 'tts_speakers') and speaker_idx is not None:
            assert speaker_idx < len(self.tts_speakers), f" [!] speaker_idx is out of the range. {speaker_idx} vs {len(self.tts_speakers)}"
            if self.tts_config.use_external_speaker_embedding_file:
                speaker_embedding = self.tts_speakers[speaker_idx]['embedding']
        return speaker_embedding

    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        # pylint: disable=global-statement
        global symbols, phonemes

        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)

        if 'characters' in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)

        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)

        self.tts_model = setup_model(self.input_size, num_speakers=self.num_speakers, c=self.tts_config)
        self.tts_model.load_checkpoint(tts_config, tts_checkpoint, eval=True)
        if use_cuda:
            self.tts_model.cuda()

    def load_vocoder(self, model_file, model_config, use_cuda):
        self.vocoder_config = load_config(model_config)
        self.vocoder_ap = AudioProcessor(**self.vocoder_config['audio'])
        self.vocoder_model = setup_generator(self.vocoder_config)
        self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True)
        if use_cuda:
            self.vocoder_model.cuda()

    def save_wav(self, wav, path):
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        return self.seg.segment(text)

    def tts(self, text, speaker_idx=None):
        start_time = time.time()
        wavs = []
        sens = self.split_into_sentences(text)
        print(" > Text split into sentences.")
        print(sens)

        speaker_embedding = self.init_speaker(speaker_idx)
        use_gl = self.vocoder_model is None

        for sen in sens:
            # synthesize voice
            waveform, _, _, mel_postnet_spec, _, _ = synthesis(
                self.tts_model,
                sen,
                self.tts_config,
                self.use_cuda,
                self.ap,
                speaker_idx,
                None,
                False,
                self.tts_config.enable_eos_bos_chars,
                use_gl,
                speaker_embedding=speaker_embedding)
            if not use_gl:
                # denormalize tts output based on tts audio config
                mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T
                device_type = "cuda" if self.use_cuda else "cpu"
                # renormalize spectrogram based on vocoder config
                vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
                # compute scale factor for possible sample rate mismatch
                scale_factor = [1, self.vocoder_config['audio']['sample_rate'] / self.ap.sample_rate]
                if scale_factor[1] != 1:
                    print(" > interpolating tts model output.")
                    vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
                else:
                    vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
                # run vocoder model
                # [1, T, C]
                waveform = self.vocoder_model.inference(vocoder_input.to(device_type))
            if self.use_cuda and not use_gl:
                waveform = waveform.cpu()
            if not use_gl:
                waveform = waveform.numpy()
            waveform = waveform.squeeze()

            # trim silence
            waveform = trim_silence(waveform, self.ap)

            wavs += list(waveform)
            wavs += [0] * 10000

        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio['sample_rate']
        print(f" > Processing time: {process_time}")
        print(f" > Real-time factor: {process_time / audio_time}")
        return wavs
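A minimal end-to-end sketch of the new `Synthesizer` (checkpoint and config paths are placeholders, e.g. the paths returned by `ModelManager.download_model` above; with no vocoder it falls back to Griffin-Lim):

```python
from TTS.utils.synthesizer import Synthesizer

synthesizer = Synthesizer(
    tts_checkpoint="model_file.pth.tar",   # hypothetical paths
    tts_config="config.json",
    vocoder_checkpoint=None,               # None -> Griffin-Lim fallback
    vocoder_config=None,
    use_cuda=False)
wav = synthesizer.tts("Hello world. This is a second sentence.")
synthesizer.save_wav(wav, "out.wav")
```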
@@ -4,13 +4,15 @@ from torch import nn
 from torch.nn import functional as F


-class TorchSTFT():
+class TorchSTFT(nn.Module):
     def __init__(self, n_fft, hop_length, win_length, window='hann_window'):
         """ Torch based STFT operation """
+        super(TorchSTFT, self).__init__()
         self.n_fft = n_fft
         self.hop_length = hop_length
         self.win_length = win_length
-        self.window = getattr(torch, window)(win_length)
+        self.window = nn.Parameter(getattr(torch, window)(win_length),
+                                   requires_grad=False)

     def __call__(self, x):
         # B x D x T x 2

@@ -22,7 +24,8 @@ class TorchSTFT():
             center=True,
             pad_mode="reflect",  # compatible with audio.py
             normalized=False,
-            onesided=True)
+            onesided=True,
+            return_complex=False)
         M = o[:, :, :, 0]
         P = o[:, :, :, 1]
         return torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8))
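Making `TorchSTFT` an `nn.Module` and registering the window as a non-trainable `nn.Parameter` means the window tensor now follows the module across devices, so callers no longer need to move it by hand. A tiny standalone sketch of that principle (not the class itself):

```python
import torch
from torch import nn

class WindowedOp(nn.Module):
    """Minimal analogue of the TorchSTFT change: a frozen window
    registered as a Parameter follows the module's device."""
    def __init__(self, win_length=1024):
        super().__init__()
        self.window = nn.Parameter(torch.hann_window(win_length),
                                   requires_grad=False)

op = WindowedOp()
print(op.window.device)        # cpu
if torch.cuda.is_available():
    op = op.cuda()
    print(op.window.device)    # cuda:0 -- moved automatically
```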
@@ -95,3 +95,11 @@ class MelganGenerator(nn.Module):
                 nn.utils.remove_weight_norm(layer)
             except ValueError:
                 layer.remove_weight_norm()
+
+    def load_checkpoint(self, config, checkpoint_path, eval=False):  # pylint: disable=unused-argument, redefined-builtin
+        state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
+        self.load_state_dict(state['model'])
+        if eval:
+            self.eval()
+            assert not self.training
+            self.remove_weight_norm()
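Each vocoder generator in this commit gains a `load_checkpoint(config, path, eval=...)` method with the same shape: load the state dict onto CPU, and when `eval=True`, switch to inference mode and strip weight norm (WaveGrad below additionally precomputes its test noise schedule). Sketch of the calling convention (paths hypothetical):

```python
from TTS.utils.io import load_config
from TTS.vocoder.utils.generic_utils import setup_generator

config = load_config("vocoder_config.json")        # hypothetical path
model = setup_generator(config)
model.load_checkpoint(config, "checkpoint.pth.tar", eval=True)
assert not model.training                          # ready for inference
```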
@@ -39,6 +39,7 @@ class ParallelWaveganGenerator(torch.nn.Module):
         self.upsample_factors = upsample_factors
         self.upsample_scale = np.prod(upsample_factors)
         self.inference_padding = inference_padding
+        self.use_weight_norm = use_weight_norm

         # check the number of layers and stacks
         assert num_res_blocks % stacks == 0

@@ -156,3 +157,12 @@ class ParallelWaveganGenerator(torch.nn.Module):
     def receptive_field_size(self):
         return self._get_receptive_field_size(self.layers, self.stacks,
                                               self.kernel_size)
+
+    def load_checkpoint(self, config, checkpoint_path, eval=False):  # pylint: disable=unused-argument, redefined-builtin
+        state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
+        self.load_state_dict(state['model'])
+        if eval:
+            self.eval()
+            assert not self.training
+            if self.use_weight_norm:
+                self.remove_weight_norm()
@@ -175,3 +175,22 @@ class Wavegrad(nn.Module):
         self.x_conv = weight_norm(self.x_conv)
         self.out_conv = weight_norm(self.out_conv)
         self.y_conv = weight_norm(self.y_conv)
+
+
+    def load_checkpoint(self, config, checkpoint_path, eval=False):  # pylint: disable=unused-argument, redefined-builtin
+        state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
+        self.load_state_dict(state['model'])
+        if eval:
+            self.eval()
+            assert not self.training
+            if self.use_weight_norm:
+                self.remove_weight_norm()
+            betas = np.linspace(config['test_noise_schedule']['min_val'],
+                                config['test_noise_schedule']['max_val'],
+                                config['test_noise_schedule']['num_steps'])
+            self.compute_noise_level(betas)
+        else:
+            betas = np.linspace(config['train_noise_schedule']['min_val'],
+                                config['train_noise_schedule']['max_val'],
+                                config['train_noise_schedule']['num_steps'])
+            self.compute_noise_level(betas)
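WaveGrad's loader also rebuilds the diffusion noise schedule from config: a linear `betas` ramp whose step count differs between training and inference (fewer steps at test time for speed). A hedged sketch of the derived quantities, with illustrative values standing in for the config entries and standard DDPM bookkeeping standing in for `compute_noise_level` (whose exact buffers may differ):

```python
import numpy as np

# Illustrative values; real ones come from config['test_noise_schedule'].
betas = np.linspace(1e-6, 1e-2, 50)    # min_val, max_val, num_steps
alphas = 1.0 - betas
alpha_hat = np.cumprod(alphas)         # cumulative signal level per step
noise_level = np.sqrt(alpha_hat)       # ~1.0 at step 0, decaying as noise grows
print(noise_level[0], noise_level[-1])
```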
@@ -499,3 +499,10 @@ class WaveRNN(nn.Module):
             unfolded[start:end] += y[i]

         return unfolded
+
+    def load_checkpoint(self, config, checkpoint_path, eval=False):  # pylint: disable=unused-argument, redefined-builtin
+        state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
+        self.load_state_dict(state['model'])
+        if eval:
+            self.eval()
+            assert not self.training
@@ -1,4 +1,5 @@
 import re
+import torch
 import importlib
 import numpy as np
 from matplotlib import pyplot as plt

@@ -6,6 +7,29 @@ from matplotlib import pyplot as plt
 from TTS.tts.utils.visual import plot_spectrogram


+def interpolate_vocoder_input(scale_factor, spec):
+    """Interpolate spectrogram by the scale factor.
+    It is mainly used to match the sampling rates of
+    the tts and vocoder models.
+
+    Args:
+        scale_factor (float): scale factor to interpolate the spectrogram
+        spec (np.array): spectrogram to be interpolated
+
+    Returns:
+        torch.tensor: interpolated spectrogram.
+    """
+    print(" > before interpolation :", spec.shape)
+    spec = torch.tensor(spec).unsqueeze(0).unsqueeze(0)  # pylint: disable=not-callable
+    spec = torch.nn.functional.interpolate(spec,
+                                           scale_factor=scale_factor,
+                                           recompute_scale_factor=True,
+                                           mode='bilinear',
+                                           align_corners=False).squeeze(0)
+    print(" > after interpolation :", spec.shape)
+    return spec
+
+
 def plot_results(y_hat, y, ap, global_step, name_prefix):
     """ Plot vocoder model results """
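`interpolate_vocoder_input` bridges a sample-rate mismatch between the TTS model and the vocoder by stretching the mel along time with bilinear interpolation; the `[1, time_scale]` factor leaves the mel-band axis untouched. A quick numeric sketch with a dummy spectrogram:

```python
import numpy as np

from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input

mel = np.random.randn(80, 500).astype(np.float32)  # [C, T]: 80 bands, 500 frames
scale_factor = [1, 24000 / 22050]                  # e.g. 22050 Hz TTS -> 24000 Hz vocoder
out = interpolate_vocoder_input(scale_factor, mel)
print(out.shape)                                   # torch.Size([1, 80, 544]) -- time stretched
```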
@@ -6,7 +6,7 @@ import pickle as pickle_tts
 from TTS.utils.io import RenamingUnpickler


-def load_checkpoint(model, checkpoint_path, use_cuda=False):
+def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False):
     try:
         state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
     except ModuleNotFoundError:

@@ -15,6 +15,8 @@ def load_checkpoint(model, checkpoint_path, use_cuda=False):
     model.load_state_dict(state['model'])
     if use_cuda:
         model.cuda()
+    if eval:
+        model.eval()
     return model, state
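The standalone `load_checkpoint` helper gains the same `eval` switch, so call sites can drop the manual `model.eval()` step. A sketch of the calling convention, with a trivial stand-in checkpoint so it runs end to end (module path assumed from this diff):

```python
import torch
from torch import nn
from TTS.tts.utils.io import load_checkpoint  # module path assumed from this diff

# Build a throwaway checkpoint so the call is demonstrable.
net = nn.Linear(4, 4)
torch.save({'model': net.state_dict()}, "/tmp/demo_ckpt.pth.tar")

net, state = load_checkpoint(nn.Linear(4, 4), "/tmp/demo_ckpt.pth.tar", eval=True)
assert not net.training  # eval=True flipped inference mode on
```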
@@ -112,7 +112,7 @@
     " t_1 = time.time()\n",
     " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
     " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
-    " # mel_postnet_spec = ap._denormalize(mel_postnet_spec.T)\n",
+    " # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n",
     " if not use_gl:\n",
     " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
     " waveform = waveform.flatten()\n",

@@ -112,7 +112,7 @@
     " t_1 = time.time()\n",
     " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
     " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
-    " # mel_postnet_spec = ap._denormalize(mel_postnet_spec.T)\n",
+    " # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n",
     " if not use_gl:\n",
     " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
     " waveform = waveform.flatten()\n",
@@ -230,8 +230,8 @@
     "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
     "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
     "\u001b[0;32m<ipython-input-18-91e8914b5c6a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mspec\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAP\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwav\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Max:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Min:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Mean:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mplot_spectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mAP\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m;\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-    "\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36mspectrogram\u001b[0;34m(self, y)\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0mD\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_stft\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_amp_to_db\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mabs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mD\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 220\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_normalize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 221\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmelspectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-    "\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36m_normalize\u001b[0;34m(self, S)\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinear_scaler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' [!] Mean-Var stats does not match the given feature dimensions.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 120\u001b[0m \u001b[0;31m# range normalization\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mref_level_db\u001b[0m \u001b[0;31m# discard certain range of DB assuming it is air noise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+    "\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36mspectrogram\u001b[0;34m(self, y)\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0mD\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_stft\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_amp_to_db\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mabs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mD\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 220\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnormalize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 221\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmelspectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+    "\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36mnormalize\u001b[0;34m(self, S)\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinear_scaler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' [!] Mean-Var stats does not match the given feature dimensions.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 120\u001b[0m \u001b[0;31m# range normalization\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mref_level_db\u001b[0m \u001b[0;31m# discard certain range of DB assuming it is air noise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
     "\u001b[0;31mRuntimeError\u001b[0m: [!] Mean-Var stats does not match the given feature dimensions."
    ]
   }
@@ -314,7 +314,7 @@
     " exec(set_val_cmd)\n",
     " wav = AP.load_wav(file)\n",
     " spec = AP.spectrogram(wav)\n",
-    " spec_norm = AP._denormalize(spec.T)\n",
+    " spec_norm = AP.denormalize(spec.T)\n",
     " plt.subplot(len(values), 2, 2*idx + 1)\n",
     " plt.imshow(spec_norm.T, aspect=\"auto\", origin=\"lower\")\n",
     " # plt.colorbar()\n",
@@ -0,0 +1,2 @@
[build-system]
requires = ["setuptools", "wheel", "Cython", "numpy>=1.16.0"]
setup.py
@@ -5,22 +5,16 @@ import os
 import shutil
 import subprocess
 import sys

 import numpy

-from setuptools import setup, find_packages, Extension
-import setuptools.command.develop
+import setuptools.command.build_py
+import setuptools.command.develop

-# handle import if cython is not already installed.
-try:
-    from Cython.Build import cythonize
-except ImportError:
-    # create closure for deferred import
-    def cythonize(*args, **kwargs): #pylint: disable=redefined-outer-name
-        from Cython.Build import cythonize #pylint: disable=redefined-outer-name, import-outside-toplevel
-        return cythonize(*args, **kwargs)
-
+from setuptools import find_packages, setup
+from distutils.extension import Extension
+from Cython.Build import cythonize

 # parameters for wheeling server.
 parser = argparse.ArgumentParser(add_help=False, allow_abbrev=False)
 parser.add_argument('--checkpoint',
                     type=str,

@@ -33,38 +27,25 @@ args, unknown_args = parser.parse_known_args()
 # Remove our arguments from argv so that setuptools doesn't see them
 sys.argv = [sys.argv[0]] + unknown_args

-version = '0.0.8'
-
-# Adapted from https://github.com/pytorch/pytorch
+version = '0.0.9'
 cwd = os.path.dirname(os.path.abspath(__file__))
 if os.getenv('TTS_PYTORCH_BUILD_VERSION'):
     version = os.getenv('TTS_PYTORCH_BUILD_VERSION')
 else:
     try:
         sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'],
                                       cwd=cwd).decode('ascii').strip()
         version += '+' + sha[:7]
     except subprocess.CalledProcessError:
         pass
     except IOError:  # FileNotFoundError for python 3
         pass


-# Handle Cython code
-def find_pyx(path='.'):
-    pyx_files = []
-    for root, _, filenames in os.walk(path):
-        for fname in filenames:
-            if fname.endswith('.pyx'):
-                pyx_files.append(os.path.join(root, fname))
-    return pyx_files
+# def find_pyx(path='.'):
+#     pyx_files = []
+#     for root, _, filenames in os.walk(path):
+#         for fname in filenames:
+#             if fname.endswith('.pyx'):
+#                 pyx_files.append(os.path.join(root, fname))
+#     return pyx_files


-def find_cython_extensions(path="."):
-    exts = cythonize(find_pyx(path), language_level=3)
-    for ext in exts:
-        ext.include_dirs = [numpy.get_include()]
-    return exts
+# def find_cython_extensions(path="."):
+#     exts = cythonize(find_pyx(path), language_level=3)
+#     for ext in exts:
+#         ext.include_dirs = [numpy.get_include()]
+#
+#     return exts


 class build_py(setuptools.command.build_py.build_py):  # pylint: disable=too-many-ancestors

@@ -105,12 +86,12 @@ def pip_install(package_name):
     subprocess.call([sys.executable, '-m', 'pip', 'install', package_name])


-reqs_from_file = open('requirements.txt').readlines()
-reqs_without_tf = [r for r in reqs_from_file if not r.startswith('tensorflow')]
-tf_req = [r for r in reqs_from_file if r.startswith('tensorflow')]
-
-requirements = {'install_requires': reqs_without_tf, 'pip_install': tf_req}
+requirements = open(os.path.join(cwd, 'requirements.txt'), 'r').readlines()
 with open('README.md', "r", encoding="utf-8") as readme_file:
     README = readme_file.read()

+exts = [Extension(name='TTS.tts.layers.glow_tts.monotonic_align.core',
+                  sources=["TTS/tts/layers/glow_tts/monotonic_align/core.pyx"])]
 setup(
     name='TTS',
     version=version,

@@ -118,9 +99,15 @@ setup(
     author='Eren Gölge',
     author_email='egolge@mozilla.com',
     description='Text to Speech with Deep Learning',
+    long_description=README,
+    long_description_content_type="text/markdown",
     license='MPL-2.0',
-    entry_points={'console_scripts': ['tts-server = TTS.server.server:main']},
-    ext_modules=find_cython_extensions(),
+    # cython
+    include_dirs=numpy.get_include(),
+    ext_modules=cythonize(exts, language_level=3),
+    # ext_modules=find_cython_extensions(),
+    # package
+    include_package_data=True,
+    packages=find_packages(include=['TTS*']),
     project_urls={
         'Documentation': 'https://github.com/mozilla/TTS/wiki',

@@ -131,9 +118,16 @@ setup(
     cmdclass={
         'build_py': build_py,
         'develop': develop,
+        # 'build_ext': build_ext
     },
-    install_requires=requirements['install_requires'],
-    python_requires='>=3.6.0',
+    install_requires=requirements,
+    python_requires='>=3.6.0, <3.9',
+    entry_points={
+        'console_scripts': [
+            'tts=TTS.bin.synthesize:main',
+            'tts-server = TTS.server.server:main'
+        ]
+    },
     classifiers=[
         "Programming Language :: Python",
         "Programming Language :: Python :: 3",

@@ -141,14 +135,16 @@ setup(
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         'Development Status :: 3 - Alpha',
-        "Intended Audience :: Science/Research :: Developers",
+        "Intended Audience :: Science/Research",
+        "Intended Audience :: Developers",
         "Operating System :: POSIX :: Linux",
         'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)',
-        "Topic :: Software Development :: Libraries :: Python Modules :: Speech :: Sound/Audio :: Multimedia :: Artificial Intelligence",
-    ])
-
-# for some reason having tensorflow in 'install_requires'
-# breaks some of the dependencies.
-if 'bdist_wheel' not in unknown_args:
-    for module in requirements['pip_install']:
-        pip_install(module)
+        "Topic :: Software Development",
+        "Topic :: Software Development :: Libraries :: Python Modules",
+        "Topic :: Multimedia :: Sound/Audio :: Speech",
+        "Topic :: Multimedia :: Sound/Audio",
+        "Topic :: Multimedia",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence"
+    ],
+    zip_safe=False
+)
@@ -67,21 +67,21 @@ class TestAudio(unittest.TestCase):
         self.ap.symmetric_norm = False
         self.ap.clip_norm = False
         self.ap.max_norm = 4.0
-        x_norm = self.ap._normalize(x)
+        x_norm = self.ap.normalize(x)
         print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
         assert (x_old - x).sum() == 0
         # check value range
         assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
         assert x_norm.min() >= 0 - 1, x_norm.min()
         # check denorm.
-        x_ = self.ap._denormalize(x_norm)
+        x_ = self.ap.denormalize(x_norm)
         assert (x - x_).sum() < 1e-3, (x - x_).mean()

         self.ap.signal_norm = True
         self.ap.symmetric_norm = False
         self.ap.clip_norm = True
         self.ap.max_norm = 4.0
-        x_norm = self.ap._normalize(x)
+        x_norm = self.ap.normalize(x)
         print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")

@@ -90,14 +90,14 @@ class TestAudio(unittest.TestCase):
         assert x_norm.max() <= self.ap.max_norm, x_norm.max()
         assert x_norm.min() >= 0, x_norm.min()
         # check denorm.
-        x_ = self.ap._denormalize(x_norm)
+        x_ = self.ap.denormalize(x_norm)
         assert (x - x_).sum() < 1e-3, (x - x_).mean()

         self.ap.signal_norm = True
         self.ap.symmetric_norm = True
         self.ap.clip_norm = False
         self.ap.max_norm = 4.0
-        x_norm = self.ap._normalize(x)
+        x_norm = self.ap.normalize(x)
         print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")

@@ -107,14 +107,14 @@ class TestAudio(unittest.TestCase):
         assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min()  #pylint: disable=invalid-unary-operand-type
         assert x_norm.min() <= 0, x_norm.min()
         # check denorm.
-        x_ = self.ap._denormalize(x_norm)
+        x_ = self.ap.denormalize(x_norm)
         assert (x - x_).sum() < 1e-3, (x - x_).mean()

         self.ap.signal_norm = True
         self.ap.symmetric_norm = True
         self.ap.clip_norm = True
         self.ap.max_norm = 4.0
-        x_norm = self.ap._normalize(x)
+        x_norm = self.ap.normalize(x)
         print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")

@@ -124,26 +124,26 @@ class TestAudio(unittest.TestCase):
         assert x_norm.min() >= -self.ap.max_norm, x_norm.min()  #pylint: disable=invalid-unary-operand-type
         assert x_norm.min() <= 0, x_norm.min()
         # check denorm.
-        x_ = self.ap._denormalize(x_norm)
+        x_ = self.ap.denormalize(x_norm)
         assert (x - x_).sum() < 1e-3, (x - x_).mean()

         self.ap.signal_norm = True
         self.ap.symmetric_norm = False
         self.ap.max_norm = 1.0
-        x_norm = self.ap._normalize(x)
+        x_norm = self.ap.normalize(x)
         print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")

         assert (x_old - x).sum() == 0
         assert x_norm.max() <= self.ap.max_norm, x_norm.max()
         assert x_norm.min() >= 0, x_norm.min()
-        x_ = self.ap._denormalize(x_norm)
+        x_ = self.ap.denormalize(x_norm)
         assert (x - x_).sum() < 1e-3

         self.ap.signal_norm = True
         self.ap.symmetric_norm = True
         self.ap.max_norm = 1.0
-        x_norm = self.ap._normalize(x)
+        x_norm = self.ap.normalize(x)
         print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")

@@ -151,7 +151,7 @@ class TestAudio(unittest.TestCase):
         assert x_norm.max() <= self.ap.max_norm, x_norm.max()
         assert x_norm.min() >= -self.ap.max_norm, x_norm.min()  #pylint: disable=invalid-unary-operand-type
         assert x_norm.min() < 0, x_norm.min()
-        x_ = self.ap._denormalize(x_norm)
+        x_ = self.ap.denormalize(x_norm)
         assert (x - x_).sum() < 1e-3

     def test_scaler(self):

@@ -172,5 +172,5 @@ class TestAudio(unittest.TestCase):
         wav = self.ap.load_wav(WAV_FILE)
         mel_reference = self.ap.melspectrogram(wav)
         mel_norm = ap.melspectrogram(wav)
-        mel_denorm = ap._denormalize(mel_norm)
+        mel_denorm = ap.denormalize(mel_norm)
        assert abs(mel_reference - mel_denorm).max() < 1e-4
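The test matrix above exercises the combinations of `symmetric_norm` and `clip_norm` with `max_norm` of 4.0 and 1.0: asymmetric output lands in roughly `[0, max_norm]`, symmetric in `[-max_norm, max_norm]`, and clipping makes the bounds exact. A tiny standalone sketch of the arithmetic these assertions encode (simplified; the real `normalize` also subtracts `ref_level_db` and handles mean-var scalers):

```python
import numpy as np

def toy_normalize(S_db, max_norm=4.0, symmetric=True, clip=True, min_level_db=-100):
    # Simplified version of the range normalization checked in test_audio.py.
    S = (S_db - min_level_db) / -min_level_db           # map dB -> [0, 1]
    S = (2 * S - 1) * max_norm if symmetric else S * max_norm
    if clip:
        lo = -max_norm if symmetric else 0
        S = np.clip(S, lo, max_norm)
    return S

S_db = np.linspace(-120, 10, 5)
print(toy_normalize(S_db))                    # bounded in [-4, 4]
print(toy_normalize(S_db, symmetric=False))   # bounded in [0, 4]
```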
@@ -2,7 +2,7 @@ import os
 import unittest

 from tests import get_tests_input_path, get_tests_output_path
-from TTS.server.synthesizer import Synthesizer
+from TTS.utils.synthesizer import Synthesizer
 from TTS.tts.utils.generic_utils import setup_model
 from TTS.tts.utils.io import save_checkpoint
 from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols

@@ -29,7 +29,7 @@ class DemoServerTest(unittest.TestCase):
         tts_root_path = get_tests_output_path()
         config['tts_checkpoint'] = os.path.join(tts_root_path, config['tts_checkpoint'])
         config['tts_config'] = os.path.join(tts_root_path, config['tts_config'])
-        synthesizer = Synthesizer(config)
+        synthesizer = Synthesizer(config['tts_checkpoint'], config['tts_config'], None, None)
         synthesizer.tts("Better this test works!!")

     def test_split_into_sentences(self):