Mirror of https://github.com/mozilla/TTS.git
Merge branch 'dev'
Commit df5899daf4
@@ -26,25 +26,21 @@ jobs:
- run: |
sudo apt update
sudo apt install espeak git
# so we can take advantage of pyproject.toml build-dependency support
- run: python3 -m pip install --upgrade pip
- run: python3 -m pip install numpy Cython
- run: sudo pip install --upgrade pip
- run: sudo pip install -e .
- run: |
python3 setup.py egg_info
python3 -m pip install -e .
- run: |
python3 -m pip install --quiet --upgrade cardboardlint pylint
sudo pip install --quiet --upgrade cardboardlint pylint
cardboardlinter --refspec ${CIRCLE_BRANCH} -n auto
- run: nosetests tests --nocapture
- run: |
./tests/test_server_package.sh
./tests/test_glow-tts_train.sh
./tests/test_server_package.sh
./tests/test_tacotron_train.sh
./tests/test_vocoder_gan_train.sh
./tests/test_vocoder_wavegrad_train.sh
./tests/test_vocoder_wavernn_train.sh
./tests/test_speedy_speech_train.sh
sudo ./tests/test_server_package.sh
sudo ./tests/test_glow-tts_train.sh
sudo ./tests/test_server_package.sh
sudo ./tests/test_tacotron_train.sh
sudo ./tests/test_vocoder_gan_train.sh
sudo ./tests/test_vocoder_wavegrad_train.sh
sudo ./tests/test_vocoder_wavernn_train.sh
sudo ./tests/test_speedy_speech_train.sh

test-3.7:
<<: *test-template
@@ -6,9 +6,11 @@ labels: ''
assignees: ''

---
<b>Questions</b> will not be answered here!!

Please consider posting on [TTS Discourse](https://discourse.mozilla.org/c/tts) page if your issue is not directly related to TTS development (Bugs, code updates etc.).
Help is much more valuable if it's shared publicly, so that more people can benefit from it.

Please consider posting on [TTS Discourse](https://discourse.mozilla.org/c/tts) page or matrix [chat room](https://matrix.to/#/!KTePhNahjgiVumkqca:matrix.org?via=matrix.org) if your issue is not directly related to TTS development (Bugs, code updates etc.).

You can also check https://github.com/mozilla/TTS/wiki/FAQ for common questions and answers.

20 Dockerfile
@@ -1,20 +0,0 @@
FROM pytorch/pytorch:1.0.1-cuda10.0-cudnn7-runtime

WORKDIR /srv/app

RUN apt-get update && \
    apt-get install -y libsndfile1 espeak && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Copy Source later to enable dependency caching
COPY requirements.txt /srv/app/
RUN pip install -r requirements.txt

COPY . /srv/app

# http://bugs.python.org/issue19846
# > At the moment, setting "LANG=C" on a Linux system *fundamentally breaks Python 3*, and that's not OK.
ENV LANG C.UTF-8

CMD python3.6 server/server.py -c server/conf.json
@@ -0,0 +1,11 @@
include README.md
include LICENSE.txt
include requirements.txt
recursive-include TTS *.json
recursive-include TTS *.html
recursive-include TTS *.png
recursive-include TTS *.md
recursive-include TTS *.py
recursive-include TTS *.pyx
recursive-include images *.png
38 README.md
@@ -36,9 +36,12 @@ Please use our dedicated channels for questions and discussion. Help is much mor
## 🔗 Links and Resources
| Type                            | Links                                    |
| ------------------------------- | --------------------------------------- |
| 💾 **Installation**             | [TTS/README.md](https://github.com/mozilla/TTS/tree/dev#install-tts)|
| 👩🏾‍🏫 **Tutorials and Examples** | [TTS/Wiki](https://github.com/mozilla/TTS/wiki/TTS-Notebooks-and-Tutorials) |
| 🤖 **Released Models**          | [TTS/Wiki](https://github.com/mozilla/TTS/wiki/Released-Models)|
| 🚀 **Released Models**          | [TTS/Wiki](https://github.com/mozilla/TTS/wiki/Released-Models)|
| 💻 **Docker Image**             | [Repository by @synesthesiam](https://github.com/synesthesiam/docker-mozillatts)|
| 🖥️ **Demo Server**              | [TTS/server](https://github.com/mozilla/TTS/tree/master/TTS/server)|
| 🤖 **Running TTS on Terminal**  | [TTS/README.md](https://github.com/mozilla/TTS#example-synthesizing-speech-on-terminal-using-the-released-models)|

## 🥇 TTS Performance
<p align="center"><img src="https://discourse-prod-uploads-81679984178418.s3.dualstack.us-west-2.amazonaws.com/optimized/3X/6/4/6428f980e9ec751c248e591460895f7881aec0c6_2_1035x591.png" width="800" /></p>
@@ -90,9 +93,20 @@ Please use our dedicated channels for questions and discussion. Help is much mor
You can also help us implement more models. Some TTS related work can be found [here](https://github.com/erogol/TTS-papers).

## Install TTS
TTS supports **python >= 3.6**.
TTS supports **python >= 3.6, <3.9**.

```python setup.py install``` or ```python setup.py develop``` to keep your installation in your working directory.
If you are only interested in [synthesizing speech](https://github.com/mozilla/TTS/tree/dev#example-synthesizing-speech-on-terminal-using-the-released-models) with the released TTS models, installing from PyPI is the easiest option.

```
pip install TTS
```

If you plan to code or train models, clone TTS and install it locally.

```
git clone https://github.com/mozilla/TTS
pip install -e .
```

## Directory Structure
```
@@ -138,6 +152,24 @@ Some of the public datasets that we successfully applied TTS:
- [LibriTTS](https://openslr.org/60/)
- [Spanish](https://drive.google.com/file/d/1Sm_zyBo67XHkiFhcRSQ4YaHPYM0slO_e/view?usp=sharing) - thx! @carlfm01

## Example: Synthesizing Speech on Terminal Using the Released Models.

After the installation, TTS provides a CLI interface for synthesizing speech using pre-trained models. You can either use your own model or the released models under the TTS project.

Listing released TTS models.
```tts --list_models```

Run a tts and a vocoder model from the released model list. (Simply copy and paste the full model names from the list as arguments for the command below.)
```tts --text "Text for TTS" --model_name "<type>/<language>/<dataset>/<model_name>" --vocoder_name "<type>/<language>/<dataset>/<model_name>" --output_path```

Run your own TTS model (Using Griffin-Lim Vocoder)
```tts --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav```

Run your own TTS and Vocoder models
```tts --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json```

**Note:** You can use ```./TTS/bin/synthesize.py``` if you prefer running ```tts``` from the TTS project folder.

## Example: Training and Fine-tuning LJ-Speech Dataset
Here you can find a [CoLab](https://gist.github.com/erogol/97516ad65b44dbddb8cd694953187c5b) notebook for a hands-on example, training LJSpeech. Or you can manually follow the guideline below.
@@ -0,0 +1,77 @@
{
    "tts_models":{
        "en":{
            "ljspeech":{
                "glow-tts":{
                    "description": "",
                    "model_file": "1NFsfhH8W8AgcfJ-BsL8CYAwQfZ5k4T-n",
                    "config_file": "1IAROF3yy9qTK43vG_-R67y3Py9yYbD6t",
                    "stats_file": null,
                    "commit": ""
                },
                "tacotron2-DCA": {
                    "description": "",
                    "model_file": "1CFoPDQBnhfBFu2Gc0TBSJn8o-TuNKQn7",
                    "config_file": "1lWSscNfKet1zZSJCNirOn7v9bigUZ8C1",
                    "stats_file": "1qevpGRVHPmzfiRBNuugLMX62x1k7B5vK",
                    "commit": ""
                },
                "speedy-speech-wn":{
                    "description": "Speedy Speech model with wavenet decoder.",
                    "model_file": "1VXAwiq6N-Viq3rsSXlf43bdoi0jSvMAJ",
                    "config_file": "1KvZilhsNP3EumVggDcD46yd834eO5hR3",
                    "stats_file": "1Ju7apZ5JlgsVECcETL-GEx3DRoNzWfkR",
                    "commit": "77b6145"
                }
            }
        },
        "es":{
            "mai":{
                "tacotron2-DDC":{
                    "model_file": "1jZ4HvYcAXI5ZClke2iGA7qFQQJBXIovw",
                    "config_file": "1s7g4n-B73ChCB48AQ88_DV_8oyLth8r0",
                    "stats_file": "13st0CZ743v6Br5R5Qw_lH1OPQOr3M-Jv",
                    "commit": ""
                }
            }
        },
        "fr":{
            "mai":{
                "tacotron2-DDC":{
                    "model_file": "1qyxrrCyoXUvBG2lqVd0KqAlHj-2nZCgS",
                    "config_file": "1yECKeP2LI7tNv4E8yVNx1yLmCfTCpkqG",
                    "stats_file": "13st0CZ743v6Br5R5Qw_lH1OPQOr3M-Jv",
                    "commit": ""
                }
            }
        }
    },
    "vocoder_models":{
        "universal":{
            "libri-tts":{
                "wavegrad":{
                    "model_file": "1r2g90JaZsfCj9dJkI9ioIU6JCFMPRqi6",
                    "config_file": "1POrrLf5YEpZyjvWyMccj1nGCVc94mR6s",
                    "stats_file": "1Vwbv4t-N1i3jXqI0bgKAhShAEO097sK0",
                    "commit": "ea976b0"
                },
                "fullband-melgan":{
                    "model_file": "1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K",
                    "config_file": "1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu",
                    "stats_file": "11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU",
                    "commit": "4132240"
                }
            }
        },
        "en": {
            "ljspeech":{
                "mulitband-melgan":{
                    "model_file": "1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K",
                    "config_file": "1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu",
                    "stats_file": "11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU",
                    "commit": "ea976b0"
                }
            }
        }
    }
}
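Note: the `model_file`, `config_file`, and `stats_file` values above are Google Drive file IDs, not URLs; the `ModelManager` added later in this commit resolves them with `gdown`. A sketch of the mapping (ID copied from the glow-tts entry above):

```
# How ModelManager (TTS/utils/manage.py, below) builds the download link.
url_prefix = "https://drive.google.com/uc?id="
model_file = "1NFsfhH8W8AgcfJ-BsL8CYAwQfZ5k4T-n"  # tts_models/en/ljspeech/glow-tts
print(url_prefix + model_file)
# https://drive.google.com/uc?id=1NFsfhH8W8AgcfJ-BsL8CYAwQfZ5k4T-n
```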
@@ -2,114 +2,138 @@
# -*- coding: utf-8 -*-

import argparse
import json
# pylint: disable=redefined-outer-name, unused-argument
import os
import sys
import string
import time
from argparse import RawTextHelpFormatter
# pylint: disable=redefined-outer-name, unused-argument
from pathlib import Path

import torch
import numpy as np

from TTS.tts.utils.generic_utils import setup_model, is_tacotron
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from TTS.vocoder.utils.generic_utils import setup_generator
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer


def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None, gst_style=None):
    t_1 = time.time()
    waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, gst_style, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)

    # grab spectrogram (thx to the nice guys at mozilla discourse for the code snippet)
    if args.save_spectogram:
        spec_file_name = args.text.replace(" ", "_")[0:10]
        spec_file_name = spec_file_name.translate(
            str.maketrans('', '', string.punctuation.replace('_', ''))) + '.npy'
        spec_file_name = os.path.join(args.out_path, spec_file_name)
        spectrogram = torch.FloatTensor(mel_postnet_spec.T)
        spectrogram = spectrogram.unsqueeze(0)
        np.save(spec_file_name, spectrogram)
        print(" > Saving raw spectrogram to " + spec_file_name)

    if CONFIG.model == "Tacotron" and not use_gl:
        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
    if not use_gl:
        # Use if not computed noise schedule with tune_wavegrad
        beta = np.linspace(1e-6, 0.01, 50)
        vocoder_model.compute_noise_level(beta)

        # Use alternative when using output npy file from tune_wavegrad
        # beta = np.load("output-tune-wavegrad.npy", allow_pickle=True).item()
        # vocoder_model.compute_noise_level(beta['beta'])

        device_type = "cuda" if use_cuda else "cpu"
        waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).to(device_type).unsqueeze(0))
    if use_cuda and not use_gl:
        waveform = waveform.cpu()
    if not use_gl:
        waveform = waveform.numpy()
    waveform = waveform.squeeze()
    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)
    tps = (time.time() - t_1) / len(waveform)
    print(" > Run-time: {}".format(time.time() - t_1))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    return waveform

def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')


if __name__ == "__main__":
def main():
    # pylint: disable=bad-continuation
    parser = argparse.ArgumentParser(description='''Synthesize speech on command line.\n\n'''

        '''You can either use your trained model or choose a model from the provided list.\n'''\

        '''
Example runs:

# list provided models
./TTS/bin/synthesize.py --list_models

# run a model from the list
./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path

# run your own TTS model (Using Griffin-Lim Vocoder)
./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav

# run your own TTS and Vocoder models
./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
    --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json

''',
        formatter_class=RawTextHelpFormatter)

    parser = argparse.ArgumentParser()
    parser.add_argument('text', type=str, help='Text to generate speech.')
    parser.add_argument('config_path',
                        type=str,
                        help='Path to model config file.')
    parser.add_argument(
        'model_path',
        '--list_models',
        type=str2bool,
        nargs='?',
        const=True,
        default=False,
        help='list available pre-trained tts and vocoder models.'
    )
    parser.add_argument(
        '--text',
        type=str,
        default=None,
        help='Text to generate speech.'
    )

    # Args for running pre-trained TTS models.
    parser.add_argument(
        '--model_name',
        type=str,
        default=None,
        help=
        'Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>'
    )
    parser.add_argument(
        '--vocoder_name',
        type=str,
        default=None,
        help=
        'Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>'
    )

    # Args for running custom models
    parser.add_argument(
        '--config_path',
        default=None,
        type=str,
        help='Path to model config file.'
    )
    parser.add_argument(
        '--model_path',
        type=str,
        default=None,
        help='Path to model file.',
    )
    parser.add_argument(
        'out_path',
        '--out_path',
        type=str,
        help='Path to save final wav file. Wav file will be names as the text given.',
        default=Path(__file__).resolve().parent,
        help='Path to save final wav file. Wav file will be named as the given text.',
    )
    parser.add_argument('--use_cuda',
                        type=bool,
                        help='Run model on CUDA.',
                        default=False)
    parser.add_argument(
        '--use_cuda',
        type=bool,
        help='Run model on CUDA.',
        default=False
    )
    parser.add_argument(
        '--vocoder_path',
        type=str,
        help=
        'Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).',
        default="",
        default=None,
    )
    parser.add_argument('--vocoder_config_path',
                        type=str,
                        help='Path to vocoder model config file.',
                        default="")
    parser.add_argument(
        '--batched_vocoder',
        type=bool,
        help="If True, vocoder model uses faster batch processing.",
        default=True)
    parser.add_argument('--speakers_json',
                        type=str,
                        help="JSON file for multi-speaker model.",
                        default="")
    parser.add_argument(
        '--speaker_fileid',
        '--vocoder_config_path',
        type=str,
        help="if CONFIG.use_external_speaker_embedding_file is true, name of speaker embedding reference file present in speakers.json, else target speaker_fileid if the model is multi-speaker.",
        help='Path to vocoder model config file.',
        default=None)

    # args for multi-speaker synthesis
    parser.add_argument(
        '--speakers_json',
        type=str,
        help="JSON file for multi-speaker model.",
        default=None)
    parser.add_argument(
        '--speaker_idx',
        type=str,
        help="if the tts model is trained with x-vectors, then speaker_idx is a file present in speakers.json else speaker_idx is the speaker id corresponding to a speaker in the speaker embedding layer.",
        default=None)
    parser.add_argument(
        '--gst_style',
        help="Wav path file for GST style reference.",
        default=None)

    # aux args
    parser.add_argument(
        '--save_spectogram',
        type=bool,
@@ -118,88 +142,77 @@ if __name__ == "__main__":

    args = parser.parse_args()

    # load the config
    C = load_config(args.config_path)
    C.forward_attn_mask = True
    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    # load the audio processor
    ap = AudioProcessor(**C.audio)
    model_path = None
    config_path = None
    vocoder_path = None
    vocoder_config_path = None

    # if the vocabulary was passed, replace the default
    if 'characters' in C.keys():
        symbols, phonemes = make_symbols(**C.characters)
    # CASE1: list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    speaker_embedding = None
    speaker_embedding_dim = None
    num_speakers = 0
    # CASE2: load pre-trained models
    if args.model_name is not None:
        model_path, config_path = manager.download_model(args.model_name)

    # load speakers
    if args.speakers_json != '':
        speaker_mapping = json.load(open(args.speakers_json, 'r'))
        num_speakers = len(speaker_mapping)
        if C.use_external_speaker_embedding_file:
            if args.speaker_fileid is not None:
                speaker_embedding = speaker_mapping[args.speaker_fileid]['embedding']
            else:  # if speaker_fileid is not specified use the first sample in speakers.json
                speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']
            speaker_embedding_dim = len(speaker_embedding)
    if args.vocoder_name is not None:
        vocoder_path, vocoder_config_path = manager.download_model(args.vocoder_name)

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)
    cp = torch.load(args.model_path, map_location=torch.device('cpu'))
    model.load_state_dict(cp['model'])
    model.eval()
    if args.use_cuda:
        model.cuda()
    if is_tacotron(C):
        model.decoder.set_r(cp['r'])
    # CASE3: load custom models
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path

    # load vocoder model
    if args.vocoder_path != "":
        VC = load_config(args.vocoder_config_path)
        vocoder_model = setup_generator(VC)
        vocoder_model.load_state_dict(torch.load(args.vocoder_path, map_location="cpu")["model"])
        vocoder_model.remove_weight_norm()
        if args.use_cuda:
            vocoder_model.cuda()
        vocoder_model.eval()
    else:
        vocoder_model = None
        VC = None
    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    # synthesize voice
    use_griffin_lim = args.vocoder_path == ""
    # RUN THE SYNTHESIS
    # load models
    synthesizer = Synthesizer(model_path, config_path, vocoder_path, vocoder_config_path, args.use_cuda)

    use_griffin_lim = vocoder_path is None
    print(" > Text: {}".format(args.text))

    if not C.use_external_speaker_embedding_file:
        if args.speaker_fileid.isdigit():
            args.speaker_fileid = int(args.speaker_fileid)
        else:
            args.speaker_fileid = None
    else:
        args.speaker_fileid = None
    # # handle multi-speaker setting
    # if not model_config.use_external_speaker_embedding_file and args.speaker_idx is not None:
    #     if args.speaker_idx.isdigit():
    #         args.speaker_idx = int(args.speaker_idx)
    #     else:
    #         args.speaker_idx = None
    # else:
    #     args.speaker_idx = None

    if args.gst_style is None:
        if is_tacotron(C):
            gst_style = C.gst['gst_style_input']
        else:
            gst_style = None
    else:
        # check if gst_style string is a dict, if is dict convert else use string
        try:
            gst_style = json.loads(args.gst_style)
            if max(map(int, gst_style.keys())) >= C.gst['gst_style_tokens']:
                raise RuntimeError("The highest value of the gst_style dictionary key must be less than the number of GST Tokens, \n Highest dictionary key value: {} \n Number of GST tokens: {}".format(max(map(int, gst_style.keys())), C.gst['gst_style_tokens']))
        except ValueError:
            gst_style = args.gst_style
    # if args.gst_style is None:
    #     if 'gst' in model_config.keys() and model_config.gst['gst_style_input'] is not None:
    #         gst_style = model_config.gst['gst_style_input']
    #     else:
    #         gst_style = None
    # else:
    #     # check if gst_style string is a dict, if is dict convert else use string
    #     try:
    #         gst_style = json.loads(args.gst_style)
    #         if max(map(int, gst_style.keys())) >= model_config.gst['gst_style_tokens']:
    #             raise RuntimeError("The highest value of the gst_style dictionary key must be less than the number of GST Tokens, \n Highest dictionary key value: {} \n Number of GST tokens: {}".format(max(map(int, gst_style.keys())), model_config.gst['gst_style_tokens']))
    #     except ValueError:
    #         gst_style = args.gst_style

    wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_fileid, speaker_embedding=speaker_embedding, gst_style=gst_style)
    # kick it
    wav = synthesizer.tts(args.text)

    # save the results
    file_name = args.text.replace(" ", "_")[0:10]
    file_name = args.text.replace(" ", "_")[0:20]
    file_name = file_name.translate(
        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
    out_path = os.path.join(args.out_path, file_name)
    print(" > Saving output to {}".format(out_path))
    ap.save_wav(wav, out_path)
    synthesizer.save_wav(wav, out_path)


if __name__ == "__main__":
    main()
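The refactored flow above delegates loading and synthesis to the new `TTS.utils.synthesizer.Synthesizer`. A minimal programmatic sketch of the same calls `main()` makes (file paths are hypothetical placeholders; the CLI treats a missing vocoder path as a Griffin-Lim fallback):

```
from TTS.utils.synthesizer import Synthesizer

# Arguments are positional, matching the call in main() above.
synthesizer = Synthesizer(
    "path/to/model.pth.tar",        # TTS checkpoint (hypothetical path)
    "path/to/config.json",          # TTS config
    "path/to/vocoder.pth.tar",      # vocoder checkpoint, or None for Griffin-Lim
    "path/to/vocoder_config.json",  # vocoder config, or None
    False,                          # use_cuda
)
wav = synthesizer.tts("Text for TTS")
synthesizer.save_wav(wav, "speech.wav")
```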
@@ -549,7 +549,7 @@ def main(args):  # pylint: disable=redefined-outer-name
            scaler.load_state_dict(checkpoint["scaler"])
        if c.reinit_layers:
            raise RuntimeError  # deliberately falls through to the partial-init path below
    except KeyError:
    except (KeyError, RuntimeError):
        print(" > Partial model initialization.")
        model_dict = model.state_dict()
        model_dict = set_init_dict(model_dict, checkpoint['model'], c)
@@ -9,6 +9,22 @@ Instructions below are based on a Ubuntu 18.04 machine, but it should be simple
##### Using server.py
If you have the environment set already for TTS, then you can directly call ```server.py```.

**Note:** After installing TTS as a package you can use ```tts-server``` to run the commands below.

Example runs:

List officially released models.
```python TTS/server/server.py --list_models ```

Run the server with the official models.
```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/mulitband-melgan```

Run the server with the official models on a GPU.
```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/mulitband-melgan --use_cuda True```

Run the server with custom models.
```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth.tar --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth.tar --vocoder_config /path/to/vocoder/config.json```

##### Using .whl
1. apt-get install -y espeak libsndfile1 python3-venv
2. python3 -m venv /tmp/venv

@@ -21,6 +37,8 @@ You can now open http://localhost:5002 in a browser

#### Running with nginx/uwsgi:

**Note:** This method uses an old TTS model, so quality might be low.

1. apt-get install -y uwsgi uwsgi-plugin-python3 nginx espeak libsndfile1 python3-venv
2. python3 -m venv /tmp/venv
3. source /tmp/venv/bin/activate
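Once one of the servers above is running, synthesis is a single GET request against the `/api/tts` route defined in `server.py` later in this diff. A minimal client sketch (assumes a local server on the default port 5002):

```
import urllib.parse
import urllib.request

# /api/tts takes the input text as a "text" query parameter and returns audio/wav.
params = urllib.parse.urlencode({"text": "Hello world"})
with urllib.request.urlopen(f"http://localhost:5002/api/tts?{params}") as resp:
    with open("hello.wav", "wb") as f:
        f.write(resp.read())
```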
@@ -5,10 +5,6 @@
    "tts_speakers": null,        // json file listing speaker ids. null if no speaker embedding.
    "vocoder_config": null,
    "vocoder_file": null,
    "wavernn_lib_path": null,    // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis.
    "wavernn_path": null,        // wavernn model root path
    "wavernn_file": null,        // wavernn checkpoint file name
    "wavernn_config": null,      // wavernn config file
    "is_wavernn_batched": true,
    "port": 5002,
    "use_cuda": true,
@@ -1,9 +1,14 @@
#!flask/bin/python
import argparse
import os
import sys
import io
from pathlib import Path

from flask import Flask, request, render_template, send_file
from TTS.server.synthesizer import Synthesizer
from flask import Flask, render_template, request, send_file
from TTS.utils.synthesizer import Synthesizer
from TTS.utils.manage import ModelManager
from TTS.utils.io import load_config


def create_argparser():
@@ -11,21 +16,20 @@ def create_argparser():
        return x.lower() in ['true', '1', 'yes']

    parser = argparse.ArgumentParser()
    parser.add_argument('--tts_checkpoint', type=str, help='path to TTS checkpoint file')
    parser.add_argument('--tts_config', type=str, help='path to TTS config.json file')
    parser.add_argument('--list_models', type=convert_boolean, nargs='?', const=True, default=False, help='list available pre-trained tts and vocoder models.')
    parser.add_argument('--model_name', type=str, help='name of one of the released tts models.')
    parser.add_argument('--vocoder_name', type=str, help='name of one of the released vocoder models.')
    parser.add_argument('--tts_checkpoint', type=str, help='path to custom tts checkpoint file')
    parser.add_argument('--tts_config', type=str, help='path to custom tts config.json file')
    parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model')
    parser.add_argument('--wavernn_lib_path', type=str, default=None, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.')
    parser.add_argument('--wavernn_checkpoint', type=str, default=None, help='path to WaveRNN checkpoint file.')
    parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.')
    parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.')
    parser.add_argument('--vocoder_config', type=str, default=None, help='path to TTS.vocoder config file.')
    parser.add_argument('--vocoder_checkpoint', type=str, default=None, help='path to TTS.vocoder checkpoint file.')
    parser.add_argument('--vocoder_config', type=str, default=None, help='path to vocoder config file.')
    parser.add_argument('--vocoder_checkpoint', type=str, default=None, help='path to vocoder checkpoint file.')
    parser.add_argument('--port', type=int, default=5002, help='port to listen on.')
    parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.')
    parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.')
    parser.add_argument('--show_details', type=convert_boolean, default=False, help='Generate model detail page.')
    return parser


synthesizer = None

embedded_models_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model')
@@ -45,6 +49,20 @@ wavernn_config_file = os.path.join(embedded_wavernn_folder, 'config.json')

args = create_argparser().parse_args()

path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path)

if args.list_models:
    manager.list_models()
    sys.exit()

# set models by the released models
if args.model_name is not None:
    tts_checkpoint_file, tts_config_file = manager.download_model(args.model_name)

if args.vocoder_name is not None:
    vocoder_checkpoint_file, vocoder_config_file = manager.download_model(args.vocoder_name)

# If these were not specified in the CLI args, use default values with embedded model files
if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file):
    args.tts_checkpoint = tts_checkpoint_file
@@ -56,26 +74,38 @@ if not args.vocoder_checkpoint and os.path.isfile(vocoder_checkpoint_file):
if not args.vocoder_config and os.path.isfile(vocoder_config_file):
    args.vocoder_config = vocoder_config_file

if not args.wavernn_checkpoint and os.path.isfile(wavernn_checkpoint_file):
    args.wavernn_checkpoint = wavernn_checkpoint_file
if not args.wavernn_config and os.path.isfile(wavernn_config_file):
    args.wavernn_config = wavernn_config_file

synthesizer = Synthesizer(args)
synthesizer = Synthesizer(args.tts_checkpoint, args.tts_config, args.vocoder_checkpoint, args.vocoder_config, args.use_cuda)

app = Flask(__name__)


@app.route('/')
def index():
    return render_template('index.html')
    return render_template('index.html', show_details=args.show_details)

@app.route('/details')
def details():
    model_config = load_config(args.tts_config)
    if args.vocoder_config is not None and os.path.isfile(args.vocoder_config):
        vocoder_config = load_config(args.vocoder_config)
    else:
        vocoder_config = None

    return render_template('details.html',
                           show_details=args.show_details,
                           model_config=model_config,
                           vocoder_config=vocoder_config,
                           args=args.__dict__)

@app.route('/api/tts', methods=['GET'])
def tts():
    text = request.args.get('text')
    print(" > Model input: {}".format(text))
    data = synthesizer.tts(text)
    return send_file(data, mimetype='audio/wav')
    wavs = synthesizer.tts(text)
    out = io.BytesIO()
    synthesizer.save_wav(wavs, out)
    return send_file(out, mimetype='audio/wav')


def main():
Binary file not shown.
After  Width: | Height: | Size: 25 KiB
@@ -1,193 +0,0 @@
import io
import sys
import time

import numpy as np
import torch
import pysbd

from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.speakers import load_speaker_mapping
from TTS.vocoder.utils.generic_utils import setup_generator
# pylint: disable=unused-wildcard-import
# pylint: disable=wildcard-import
from TTS.tts.utils.synthesis import *

from TTS.tts.utils.text import make_symbols, phonemes, symbols


class Synthesizer(object):
    def __init__(self, config):
        self.wavernn = None
        self.vocoder_model = None
        self.config = config
        print(config)
        self.seg = self.get_segmenter("en")
        self.use_cuda = self.config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."
        self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
                      self.config.use_cuda)
        if self.config.vocoder_checkpoint:
            self.load_vocoder(self.config.vocoder_checkpoint, self.config.vocoder_config, self.config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_checkpoint,
                              self.config.wavernn_config, self.config.use_cuda)

    @staticmethod
    def get_segmenter(lang):
        return pysbd.Segmenter(language=lang, clean=True)

    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        # pylint: disable=global-statement
        global symbols, phonemes

        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > checkpoint file: ", tts_checkpoint)

        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)

        if 'characters' in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)

        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # TODO: fix this for multi-speaker model - load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config)
        # load model state
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp:
            self.tts_model.decoder.set_r(cp['r'])
            print(f" > model reduction factor: {cp['r']}")

    def load_vocoder(self, model_file, model_config, use_cuda):
        self.vocoder_config = load_config(model_config)
        self.vocoder_model = setup_generator(self.vocoder_config)
        self.vocoder_model.load_state_dict(torch.load(model_file, map_location="cpu")["model"])
        self.vocoder_model.remove_weight_norm()
        self.vocoder_model.inference_padding = 0
        self.vocoder_config = load_config(model_config)

        if use_cuda:
            self.vocoder_model.cuda()
        self.vocoder_model.eval()

    def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(lib_path)  # set this if WaveRNN is not installed globally
        #pylint: disable=import-outside-toplevel
        from WaveRNN.models.wavernn import Model
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(model_config)
        # This is the default architecture we use for our models.
        # You might need to update it
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        ).cuda()

        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        return self.seg.segment(text)

    def tts(self, text, speaker_id=None):
        start_time = time.time()
        wavs = []
        sens = self.split_into_sentences(text)
        print(sens)
        speaker_id = id_to_torch(speaker_id)
        if speaker_id is not None and self.use_cuda:
            speaker_id = speaker_id.cuda()

        for sen in sens:
            # preprocess the given text
            inputs = text_to_seqvec(sen, self.tts_config)
            inputs = numpy_to_torch(inputs, torch.long, cuda=self.use_cuda)
            inputs = inputs.unsqueeze(0)
            # synthesize voice
            _, postnet_output, _, _ = run_model_torch(self.tts_model, inputs, self.tts_config, False, speaker_id, None)
            if self.vocoder_model:
                # use native vocoder model
                vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
                wav = self.vocoder_model.inference(vocoder_input)
                if self.use_cuda:
                    wav = wav.cpu().numpy()
                else:
                    wav = wav.numpy()
                wav = wav.flatten()
            elif self.wavernn:
                # use 3rd party wavernn
                vocoder_input = None
                if self.tts_config.model == "Tacotron":
                    vocoder_input = torch.FloatTensor(self.ap.out_linear_to_mel(linear_spec=postnet_output.T).T).T.unsqueeze(0)
                else:
                    vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
                if self.use_cuda:
                    vocoder_input.cuda()
                wav = self.wavernn.generate(vocoder_input, batched=self.config.is_wavernn_batched, target=11000, overlap=550)
            else:
                # use GL
                if self.use_cuda:
                    postnet_output = postnet_output[0].cpu()
                else:
                    postnet_output = postnet_output[0]
                postnet_output = postnet_output.numpy()
                wav = inv_spectrogram(postnet_output, self.ap, self.tts_config)

            # trim silence
            wav = trim_silence(wav, self.ap)

            wavs += list(wav)
            wavs += [0] * 10000

        out = io.BytesIO()
        self.save_wav(wavs, out)

        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio['sample_rate']
        print(f" > Processing time: {process_time}")
        print(f" > Real-time factor: {process_time / audio_time}")
        return out
@@ -0,0 +1,131 @@
<!DOCTYPE html>
<html lang="en">

<head>

    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
    <meta name="description" content="">
    <meta name="author" content="">

    <title>TTS engine</title>

    <!-- Bootstrap core CSS -->
    <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
        integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous"
        rel="stylesheet">

    <!-- Custom styles for this template -->
    <style>
        body {
            padding-top: 54px;
        }

        @media (min-width: 992px) {
            body {
                padding-top: 56px;
            }
        }
    </style>
</head>

<body>
    <a href="https://github.com/mozilla/TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;"
            src="https://s3.amazonaws.com/github/ribbons/forkme_left_darkblue_121621.png" alt="Fork me on GitHub"></a>

    {% if show_details == true %}

    <div class="container">
        <b>Model details</b>
    </div>

    <div class="container">
        <details>
            <summary>CLI arguments:</summary>
            <table border="1" align="center" width="75%">
                <tr>
                    <td> CLI key </td>
                    <td> Value </td>
                </tr>

                {% for key, value in args.items() %}

                <tr>
                    <td>{{ key }}</td>
                    <td>{{ value }}</td>
                </tr>

                {% endfor %}
            </table>
        </details>
    </div></br>

    <div class="container">

        {% if model_config != None %}

        <details>
            <summary>Model config:</summary>

            <table border="1" align="center" width="75%">
                <tr>
                    <td> Key </td>
                    <td> Value </td>
                </tr>

                {% for key, value in model_config.items() %}

                <tr>
                    <td>{{ key }}</td>
                    <td>{{ value }}</td>
                </tr>

                {% endfor %}

            </table>
        </details>

        {% endif %}

    </div></br>

    <div class="container">
        {% if vocoder_config != None %}
        <details>
            <summary>Vocoder model config:</summary>

            <table border="1" align="center" width="75%">
                <tr>
                    <td> Key </td>
                    <td> Value </td>
                </tr>

                {% for key, value in vocoder_config.items() %}

                <tr>
                    <td>{{ key }}</td>
                    <td>{{ value }}</td>
                </tr>

                {% endfor %}

            </table>
        </details>
        {% endif %}
    </div></br>

    {% else %}
    <div class="container">
        <b>Please start server with --show_details=true to see details.</b>
    </div>

    {% endif %}

</body>

</html>
@@ -56,11 +56,15 @@
    <div class="container">
        <div class="row">
            <div class="col-lg-12 text-center">
                <img class="mt-5" src="https://user-images.githubusercontent.com/1402048/52643646-c2102980-2edd-11e9-8c37-b72f3c89a640.png" alt=></img>
                <img class="mt-5" src="{{url_for('static', filename='TTS_circle.png')}}" align="middle" />

                <ul class="list-unstyled">
                </ul>
                <input id="text" placeholder="Type here..." size=45 type="text" name="text">
                <button id="speak-button" name="speak">Speak</button><br/><br/>
                {%if show_details%}
                <button id="details-button" onclick="location.href = 'details'" name="model-details">Model Details</button><br/><br/>
                {%endif%}
                <audio id="audio" controls autoplay hidden></audio>
                <p id="message"></p>
            </div>
@@ -128,8 +128,9 @@ class InvConvNear(nn.Module):
        return z, logdet

    def store_inverse(self):
        self.weight_inv = torch.inverse(
        weight_inv = torch.inverse(
            self.weight.float()).to(dtype=self.weight.dtype)
        self.weight_inv = nn.Parameter(weight_inv, requires_grad=False)


class CouplingBlock(nn.Module):
@@ -2,7 +2,13 @@ import numpy as np
import torch
from torch.nn import functional as F
from TTS.tts.utils.generic_utils import sequence_mask
from TTS.tts.layers.glow_tts.monotonic_align.core import maximum_path_c

try:
    # TODO: fix pypi cython installation problem.
    from TTS.tts.layers.glow_tts.monotonic_align.core import maximum_path_c
    CYTHON = True
except ModuleNotFoundError:
    CYTHON = False


def convert_pad_shape(pad_shape):
@@ -32,6 +38,12 @@ def generate_path(duration, mask):


def maximum_path(value, mask):
    if CYTHON:
        return maximum_path_cython(value, mask)
    return maximum_path_numpy(value, mask)


def maximum_path_cython(value, mask):
    """ Cython optimised version.
    value: [b, t_x, t_y]
    mask: [b, t_x, t_y]
@@ -47,3 +59,45 @@ def maximum_path(value, mask):
    t_y_max = mask.sum(2)[:, 0].astype(np.int32)
    maximum_path_c(path, value, t_x_max, t_y_max)
    return torch.from_numpy(path).to(device=device, dtype=dtype)


def maximum_path_numpy(value, mask, max_neg_val=None):
    """
    Monotonic alignment search algorithm
    Numpy-friendly version. It's about 4 times faster than torch version.
    value: [b, t_x, t_y]
    mask: [b, t_x, t_y]
    """
    if max_neg_val is None:
        max_neg_val = -np.inf  # Patch for Sphinx complaint
    value = value * mask

    device = value.device
    dtype = value.dtype
    value = value.cpu().detach().numpy()
    mask = mask.cpu().detach().numpy().astype(np.bool)

    b, t_x, t_y = value.shape
    direction = np.zeros(value.shape, dtype=np.int64)
    v = np.zeros((b, t_x), dtype=np.float32)
    x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1)
    for j in range(t_y):
        v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[:, :-1]
        v1 = v
        max_mask = v1 >= v0
        v_max = np.where(max_mask, v1, v0)
        direction[:, :, j] = max_mask

        index_mask = x_range <= j
        v = np.where(index_mask, v_max + value[:, :, j], max_neg_val)
    direction = np.where(mask, direction, 1)

    path = np.zeros(value.shape, dtype=np.float32)
    index = mask[:, :, 0].sum(1).astype(np.int64) - 1
    index_range = np.arange(b)
    for j in reversed(range(t_y)):
        path[index_range, index, j] = 1
        index = index + direction[index_range, index, j] - 1
    path = path * mask.astype(np.float32)
    path = torch.from_numpy(path).to(device=device, dtype=dtype)
    return path
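Both implementations return the same hard monotonic path over `[b, t_x, t_y]`; a quick sanity-check sketch (shapes illustrative; the import path is assumed from the `core` import above):

```
import torch
from TTS.tts.layers.glow_tts.monotonic_align import maximum_path  # assumed module path

value = torch.randn(2, 5, 8)      # batch 2, 5 text tokens (t_x), 8 decoder frames (t_y)
mask = torch.ones(2, 5, 8)        # every position valid
path = maximum_path(value, mask)  # dispatches to Cython if built, else the NumPy fallback
print(path.shape)                 # torch.Size([2, 5, 8]); one token selected per frame
```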
@@ -1,7 +1,7 @@
from distutils.core import setup
from Cython.Build import cythonize
import numpy
# from distutils.core import setup
# from Cython.Build import cythonize
# import numpy

setup(name='monotonic_align',
      ext_modules=cythonize("core.pyx"),
      include_dirs=[numpy.get_include()])
# setup(name='monotonic_align',
#       ext_modules=cythonize("core.pyx"),
#       include_dirs=[numpy.get_include()])
@@ -223,3 +223,11 @@ class GlowTts(nn.Module):

    def store_inverse(self):
        self.decoder.store_inverse()

    def load_checkpoint(self, config, checkpoint_path, eval=False):  # pylint: disable=unused-argument, redefined-builtin
        state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
        self.load_state_dict(state['model'])
        if eval:
            self.eval()
            self.store_inverse()
            assert not self.training
@@ -188,5 +188,12 @@ class SpeedySpeech(nn.Module):
        o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
        o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1)
        y_lengths = o_dr.sum(1)
        o_de, attn= self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g)
        o_de, attn = self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g)
        return o_de, attn

    def load_checkpoint(self, config, checkpoint_path, eval=False):  # pylint: disable=unused-argument, redefined-builtin
        state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
        self.load_state_dict(state['model'])
        if eval:
            self.eval()
            assert not self.training
@@ -121,6 +121,14 @@ class TacotronAbstract(ABC, nn.Module):
    def inference(self):
        pass

    def load_checkpoint(self, config, checkpoint_path, eval=False):  # pylint: disable=unused-argument, redefined-builtin
        state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
        self.load_state_dict(state['model'])
        self.decoder.set_r(state['r'])
        if eval:
            self.eval()
            assert not self.training

    #############################
    # COMMON COMPUTE FUNCTIONS
    #############################
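Glow-TTS, SpeedySpeech, and the Tacotron family now share the same `load_checkpoint(config, checkpoint_path, eval=False)` hook. A hedged sketch of a caller, built the way the scripts above build models (paths hypothetical):

```
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.text.symbols import symbols
from TTS.utils.io import load_config

C = load_config("path/to/config.json")                  # hypothetical config path
model = setup_model(len(symbols), num_speakers=0, c=C)  # as in the old synthesizer.py
# eval=True switches off training mode; Glow-TTS additionally pre-computes
# inverse flow weights via store_inverse().
model.load_checkpoint(C, "path/to/model.pth.tar", eval=True)
```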
@@ -7,7 +7,7 @@ from TTS.utils.io import RenamingUnpickler


def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False):
def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False, eval=False):
    """Load ```TTS.tts.models``` checkpoints.

    Args:

@@ -33,6 +33,8 @@ def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False):
    if hasattr(model.decoder, 'r'):
        model.decoder.set_r(state['r'])
        print(" > Model r: ", state['r'])
    if eval:
        model.eval()
    return model, state
@@ -50,7 +50,7 @@ def plot_spectrogram(spectrogram,
    spectrogram_ = spectrogram_.astype(
        np.float32) if spectrogram_.dtype == np.float16 else spectrogram_
    if ap is not None:
        spectrogram_ = ap._denormalize(spectrogram_)  # pylint: disable=protected-access
        spectrogram_ = ap.denormalize(spectrogram_)  # pylint: disable=protected-access
    fig = plt.figure(figsize=fig_size)
    plt.imshow(spectrogram_, aspect="auto", origin="lower")
    plt.colorbar()
@@ -35,9 +35,9 @@ class AudioProcessor(object):
                 trim_db=60,
                 do_sound_norm=False,
                 stats_path=None,
                 verbose=True,
                 **_):

        print(" > Setting up Audio Processor...")
        # setup class attributes
        self.sample_rate = sample_rate
        self.resample = resample

@@ -73,8 +73,10 @@ class AudioProcessor(object):
        assert min_level_db != 0.0, " [!] min_level_db is 0"
        assert self.win_length <= self.fft_size, " [!] win_length cannot be larger than fft_size"
        members = vars(self)
        for key, value in members.items():
            print(" | > {}:{}".format(key, value))
        if verbose:
            print(" > Setting up Audio Processor...")
            for key, value in members.items():
                print(" | > {}:{}".format(key, value))
        # create spectrogram utils
        self.mel_basis = self._build_mel_basis()
        self.inv_mel_basis = np.linalg.pinv(self._build_mel_basis())

@@ -107,7 +109,7 @@ class AudioProcessor(object):
        return hop_length, win_length

    ### normalization ###
    def _normalize(self, S):
    def normalize(self, S):
        """Put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]"""
        #pylint: disable=no-else-return
        S = S.copy()

@@ -136,7 +138,7 @@ class AudioProcessor(object):
        else:
            return S

    def _denormalize(self, S):
    def denormalize(self, S):
        """denormalize values"""
        #pylint: disable=no-else-return
        S_denorm = S.copy()

@@ -221,7 +223,7 @@ class AudioProcessor(object):
        else:
            D = self._stft(y)
        S = self._amp_to_db(np.abs(D))
        return self._normalize(S)
        return self.normalize(S)

    def melspectrogram(self, y):
        if self.preemphasis != 0:

@@ -229,11 +231,11 @@ class AudioProcessor(object):
        else:
            D = self._stft(y)
        S = self._amp_to_db(self._linear_to_mel(np.abs(D)))
        return self._normalize(S)
        return self.normalize(S)

    def inv_spectrogram(self, spectrogram):
        """Converts spectrogram to waveform using librosa"""
        S = self._denormalize(spectrogram)
        S = self.denormalize(spectrogram)
        S = self._db_to_amp(S)
        # Reconstruct phase
        if self.preemphasis != 0:

@@ -242,7 +244,7 @@ class AudioProcessor(object):

    def inv_melspectrogram(self, mel_spectrogram):
        '''Converts melspectrogram to waveform using librosa'''
        D = self._denormalize(mel_spectrogram)
        D = self.denormalize(mel_spectrogram)
        S = self._db_to_amp(D)
        S = self._mel_to_linear(S)  # Convert back to linear
        if self.preemphasis != 0:

@@ -250,11 +252,11 @@ class AudioProcessor(object):
        return self._griffin_lim(S**self.power)

    def out_linear_to_mel(self, linear_spec):
        S = self._denormalize(linear_spec)
        S = self.denormalize(linear_spec)
        S = self._db_to_amp(S)
        S = self._linear_to_mel(np.abs(S))
        S = self._amp_to_db(S)
        mel = self._normalize(S)
        mel = self.normalize(S)
        return mel

    ### STFT and ISTFT ###
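With `normalize`/`denormalize` now public, the processor's round trip reads naturally. A sketch assuming a model config and that `AudioProcessor.load_wav` exists as in the rest of the codebase (paths hypothetical):

```
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config

C = load_config("path/to/config.json")  # hypothetical model config
ap = AudioProcessor(**C.audio)          # same construction as synthesize.py above
wav = ap.load_wav("speech.wav")         # assumed helper; any float waveform works
mel = ap.melspectrogram(wav)            # ends with ap.normalize(S)
wav_hat = ap.inv_melspectrogram(mel)    # starts with ap.denormalize(mel), then Griffin-Lim
```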
@@ -3,6 +3,8 @@ import glob
import os
import shutil
import subprocess
import sys
from pathlib import Path

import torch

@@ -67,6 +69,22 @@ def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def get_user_data_dir(appname):
    if sys.platform == "win32":
        import winreg  # pylint: disable=import-outside-toplevel
        key = winreg.OpenKey(
            winreg.HKEY_CURRENT_USER,
            r"Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders"
        )
        dir_, _ = winreg.QueryValueEx(key, "Local AppData")
        ans = Path(dir_).resolve(strict=False)
    elif sys.platform == 'darwin':
        ans = Path('~/Library/Application Support/').expanduser()
    else:
        ans = Path.home().joinpath('.local/share')
    return ans.joinpath(appname)
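`get_user_data_dir('tts')` is where the new `ModelManager` (below) stores downloads. What it resolves to per platform, following the branches above (username hypothetical):

```
from TTS.utils.generic_utils import get_user_data_dir

print(get_user_data_dir("tts"))
# Linux:   /home/<user>/.local/share/tts
# macOS:   /Users/<user>/Library/Application Support/tts
# Windows: C:\Users\<user>\AppData\Local\tts
```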

def set_init_dict(model_dict, checkpoint_state, c):
    # Partial initialization: if there is a mismatch with new and old layer, it is skipped.
    for k, v in checkpoint_state.items():

@@ -97,6 +115,7 @@ def set_init_dict(model_dict, checkpoint_state, c):
                  len(model_dict)))
    return model_dict


class KeepAverage():
    def __init__(self):
        self.avg_values = {}
@@ -20,6 +20,16 @@ class AttrDict(dict):
        self.__dict__ = self


def read_json_with_comments(json_path):
    # fallback to json
    with open(json_path, "r") as f:
        input_str = f.read()
    # handle comments
    input_str = re.sub(r'\\\n', '', input_str)
    input_str = re.sub(r'//.*\n', '\n', input_str)
    data = json.loads(input_str)
    return data

def load_config(config_path: str) -> AttrDict:
    """Load config files and discard comments


@@ -33,14 +43,7 @@ def load_config(config_path: str) -> AttrDict:
    with open(config_path, "r") as f:
        data = yaml.safe_load(f)
    else:
        # fallback to json
        with open(config_path, "r") as f:
            input_str = f.read()
        # handle comments
        input_str = re.sub(r'\\\n', '', input_str)
        input_str = re.sub(r'//.*\n', '\n', input_str)
        data = json.loads(input_str)

        data = read_json_with_comments(config_path)
    config.update(data)
    return config
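The extracted helper keeps `load_config`'s JSON-with-comments behavior, which is what lets `server/conf.json` above carry `//` comments. A small sketch (temp path hypothetical):

```
from TTS.utils.io import read_json_with_comments

# Hypothetical commented config in the style of server/conf.json above.
with open("/tmp/conf.json", "w") as f:
    f.write('{\n  "port": 5002,  // port to listen on\n  "use_cuda": false // CPU by default\n}\n')

print(read_json_with_comments("/tmp/conf.json"))  # {'port': 5002, 'use_cuda': False}
```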
@@ -0,0 +1,103 @@
import json
import gdown
from pathlib import Path
import os

from TTS.utils.io import load_config
from TTS.utils.generic_utils import get_user_data_dir

class ModelManager(object):
    """Manage TTS models defined in .models.json.
    It provides an interface to list and download
    models defined in '.models.json'

    Models are downloaded into the per-user data directory
    returned by `get_user_data_dir('tts')`.

    Args:
        models_file (str): path to .models.json
    """
    def __init__(self, models_file):
        super().__init__()
        self.output_prefix = get_user_data_dir('tts')
        self.url_prefix = "https://drive.google.com/uc?id="
        self.models_dict = None
        self.read_models_file(models_file)

    def read_models_file(self, file_path):
        """Read .models.json as a dict

        Args:
            file_path (str): path to .models.json.
        """
        with open(file_path) as json_file:
            self.models_dict = json.load(json_file)

    def list_langs(self):
        print(" Name format: type/language")
        for model_type in self.models_dict:
            for lang in self.models_dict[model_type]:
                print(f" >: {model_type}/{lang} ")

    def list_datasets(self):
        print(" Name format: type/language/dataset")
        for model_type in self.models_dict:
            for lang in self.models_dict[model_type]:
                for dataset in self.models_dict[model_type][lang]:
                    print(f" >: {model_type}/{lang}/{dataset}")

    def list_models(self):
        print(" Name format: type/language/dataset/model")
        for model_type in self.models_dict:
            for lang in self.models_dict[model_type]:
                for dataset in self.models_dict[model_type][lang]:
                    for model in self.models_dict[model_type][lang][dataset]:
                        print(f" >: {model_type}/{lang}/{dataset}/{model} ")

    def download_model(self, model_name):
        """Download model files given the full model name.
        Model name is in the format
            'type/language/dataset/model'
            e.g. 'tts_model/en/ljspeech/tacotron'

        Args:
            model_name (str): model name as explained above.

        TODO: support multi-speaker models
        """
        # fetch model info from the dict
        model_type, lang, dataset, model = model_name.split("/")
        model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
        model_item = self.models_dict[model_type][lang][dataset][model]
        # set the model specific output path
        output_path = os.path.join(self.output_prefix, model_full_name)
        output_model_path = os.path.join(output_path, "model_file.pth.tar")
        output_config_path = os.path.join(output_path, "config.json")
        if os.path.exists(output_path):
            print(f" > {model_name} is already downloaded.")
        else:
            os.makedirs(output_path, exist_ok=True)
            print(f" > Downloading model to {output_path}")
            output_stats_path = None
            # download files to the output path
            self._download_file(model_item['model_file'], output_model_path)
            self._download_file(model_item['config_file'], output_config_path)
            if model_item['stats_file'] is not None and len(model_item['stats_file']) > 1:
                output_stats_path = os.path.join(output_path, 'scale_stats.npy')
                self._download_file(model_item['stats_file'], output_stats_path)
                # set scale stats path in config.json
                config_path = output_config_path
                config = load_config(config_path)
                config["audio"]['stats_path'] = output_stats_path
                with open(config_path, "w") as jf:
                    json.dump(config, jf)
        return output_model_path, output_config_path

    def _download_file(self, idx, output):
        gdown.download(f"{self.url_prefix}{idx}", output=output)
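A minimal sketch of driving `ModelManager` from Python (the module path is assumed from this diff; the `.models.json` path is hypothetical, and the model name follows the `type/language/dataset/model` format described in the docstring):

```python
from TTS.utils.manage import ModelManager  # module path assumed from this diff

manager = ModelManager("path/to/.models.json")   # hypothetical path
manager.list_models()                            # prints type/language/dataset/model entries
model_path, config_path = manager.download_model("tts_model/en/ljspeech/tacotron")
# The returned paths feed straight into the Synthesizer added below.
```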
@@ -0,0 +1,169 @@
import time

import numpy as np
import torch
import pysbd

from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.speakers import load_speaker_mapping
from TTS.vocoder.utils.generic_utils import setup_generator, interpolate_vocoder_input
# pylint: disable=unused-wildcard-import
# pylint: disable=wildcard-import
from TTS.tts.utils.synthesis import *

from TTS.tts.utils.text import make_symbols, phonemes, symbols


class Synthesizer(object):
    def __init__(self, tts_checkpoint, tts_config, vocoder_checkpoint=None, vocoder_config=None, use_cuda=False):
        """Encapsulation of tts and vocoder models for inference.

        TODO: handle multi-speaker and GST inference.

        Args:
            tts_checkpoint (str): path to the tts model file.
            tts_config (str): path to the tts config file.
            vocoder_checkpoint (str, optional): path to the vocoder model file. Defaults to None.
            vocoder_config (str, optional): path to the vocoder config file. Defaults to None.
            use_cuda (bool, optional): enable/disable cuda. Defaults to False.
        """
        self.tts_checkpoint = tts_checkpoint
        self.tts_config = tts_config
        self.vocoder_checkpoint = vocoder_checkpoint
        self.vocoder_config = vocoder_config
        self.use_cuda = use_cuda
        self.wavernn = None
        self.vocoder_model = None
        self.num_speakers = 0
        self.tts_speakers = None
        self.speaker_embedding_dim = None
        self.seg = self.get_segmenter("en")
        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."
        self.load_tts(tts_checkpoint, tts_config,
                      use_cuda)
        if vocoder_checkpoint:
            self.load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda)

    @staticmethod
    def get_segmenter(lang):
        return pysbd.Segmenter(language=lang, clean=True)

    def load_speakers(self):
        # load speakers
        if self.model_config.use_speaker_embedding is not None:
            self.tts_speakers = load_speaker_mapping(self.tts_config.tts_speakers_json)
            self.num_speakers = len(self.tts_speakers)
        else:
            self.num_speakers = 0
        # set external speaker embedding
        if self.tts_config.use_external_speaker_embedding_file:
            speaker_embedding = self.tts_speakers[list(self.tts_speakers.keys())[0]]['embedding']
            self.speaker_embedding_dim = len(speaker_embedding)

    def init_speaker(self, speaker_idx):
        # load speakers
        speaker_embedding = None
        if hasattr(self, 'tts_speakers') and speaker_idx is not None:
            assert speaker_idx < len(self.tts_speakers), f" [!] speaker_idx is out of the range. {speaker_idx} vs {len(self.tts_speakers)}"
            if self.tts_config.use_external_speaker_embedding_file:
                speaker_embedding = self.tts_speakers[speaker_idx]['embedding']
        return speaker_embedding

    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        # pylint: disable=global-statement
        global symbols, phonemes

        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)

        if 'characters' in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)

        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)

        self.tts_model = setup_model(self.input_size, num_speakers=self.num_speakers, c=self.tts_config)
        self.tts_model.load_checkpoint(tts_config, tts_checkpoint, eval=True)
        if use_cuda:
            self.tts_model.cuda()

    def load_vocoder(self, model_file, model_config, use_cuda):
        self.vocoder_config = load_config(model_config)
        self.vocoder_ap = AudioProcessor(**self.vocoder_config['audio'])
        self.vocoder_model = setup_generator(self.vocoder_config)
        self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True)
        if use_cuda:
            self.vocoder_model.cuda()

    def save_wav(self, wav, path):
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        return self.seg.segment(text)

    def tts(self, text, speaker_idx=None):
        start_time = time.time()
        wavs = []
        sens = self.split_into_sentences(text)
        print(" > Text split into sentences.")
        print(sens)

        speaker_embedding = self.init_speaker(speaker_idx)
        use_gl = self.vocoder_model is None

        for sen in sens:
            # synthesize voice
            waveform, _, _, mel_postnet_spec, _, _ = synthesis(
                self.tts_model,
                sen,
                self.tts_config,
                self.use_cuda,
                self.ap,
                speaker_idx,
                None,
                False,
                self.tts_config.enable_eos_bos_chars,
                use_gl,
                speaker_embedding=speaker_embedding)
            if not use_gl:
                # denormalize tts output based on tts audio config
                mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T
                device_type = "cuda" if self.use_cuda else "cpu"
                # renormalize spectrogram based on vocoder config
                vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
                # compute scale factor for possible sample rate mismatch
                scale_factor = [1, self.vocoder_config['audio']['sample_rate'] / self.ap.sample_rate]
                if scale_factor[1] != 1:
                    print(" > interpolating tts model output.")
                    vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
                else:
                    vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
                # run vocoder model
                # [1, T, C]
                waveform = self.vocoder_model.inference(vocoder_input.to(device_type))
            if self.use_cuda and not use_gl:
                waveform = waveform.cpu()
            if not use_gl:
                waveform = waveform.numpy()
            waveform = waveform.squeeze()

            # trim silence
            waveform = trim_silence(waveform, self.ap)

            wavs += list(waveform)
            wavs += [0] * 10000

        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio['sample_rate']
        print(f" > Processing time: {process_time}")
        print(f" > Real-time factor: {process_time / audio_time}")
        return wavs
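A minimal end-to-end sketch of the new `Synthesizer` (checkpoint and config paths are placeholders, e.g. the paths returned by `ModelManager.download_model` above; with no vocoder it falls back to Griffin-Lim):

```python
from TTS.utils.synthesizer import Synthesizer

synthesizer = Synthesizer(
    tts_checkpoint="model_file.pth.tar",   # hypothetical paths
    tts_config="config.json",
    vocoder_checkpoint=None,               # None -> Griffin-Lim fallback
    vocoder_config=None,
    use_cuda=False)
wav = synthesizer.tts("Hello world. This is a second sentence.")
synthesizer.save_wav(wav, "out.wav")
```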
@@ -4,13 +4,15 @@ from torch import nn
 from torch.nn import functional as F


-class TorchSTFT():
+class TorchSTFT(nn.Module):
     def __init__(self, n_fft, hop_length, win_length, window='hann_window'):
         """ Torch based STFT operation """
+        super(TorchSTFT, self).__init__()
         self.n_fft = n_fft
         self.hop_length = hop_length
         self.win_length = win_length
-        self.window = getattr(torch, window)(win_length)
+        self.window = nn.Parameter(getattr(torch, window)(win_length),
+                                   requires_grad=False)

     def __call__(self, x):
         # B x D x T x 2

@@ -22,7 +24,8 @@ class TorchSTFT():
             center=True,
             pad_mode="reflect",  # compatible with audio.py
             normalized=False,
-            onesided=True)
+            onesided=True,
+            return_complex=False)
         M = o[:, :, :, 0]
         P = o[:, :, :, 1]
         return torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8))
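Making `TorchSTFT` an `nn.Module` and registering the window as a non-trainable `nn.Parameter` means the window tensor now follows the module across devices, so callers no longer need to move it by hand. A tiny standalone sketch of that principle (not the class itself):

```python
import torch
from torch import nn

class WindowedOp(nn.Module):
    """Minimal analogue of the TorchSTFT change: a frozen window
    registered as a Parameter follows the module's device."""
    def __init__(self, win_length=1024):
        super().__init__()
        self.window = nn.Parameter(torch.hann_window(win_length),
                                   requires_grad=False)

op = WindowedOp()
print(op.window.device)        # cpu
if torch.cuda.is_available():
    op = op.cuda()
    print(op.window.device)    # cuda:0 -- moved automatically
```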
@@ -95,3 +95,11 @@ class MelganGenerator(nn.Module):
                 nn.utils.remove_weight_norm(layer)
             except ValueError:
                 layer.remove_weight_norm()
+
+    def load_checkpoint(self, config, checkpoint_path, eval=False):  # pylint: disable=unused-argument, redefined-builtin
+        state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
+        self.load_state_dict(state['model'])
+        if eval:
+            self.eval()
+            assert not self.training
+            self.remove_weight_norm()
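Each vocoder generator in this commit gains a `load_checkpoint(config, path, eval=...)` method with the same shape: load the state dict onto CPU, and when `eval=True`, switch to inference mode and strip weight norm (WaveGrad below additionally precomputes its test noise schedule). Sketch of the calling convention (paths hypothetical):

```python
from TTS.utils.io import load_config
from TTS.vocoder.utils.generic_utils import setup_generator

config = load_config("vocoder_config.json")        # hypothetical path
model = setup_generator(config)
model.load_checkpoint(config, "checkpoint.pth.tar", eval=True)
assert not model.training                          # ready for inference
```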
@@ -39,6 +39,7 @@ class ParallelWaveganGenerator(torch.nn.Module):
         self.upsample_factors = upsample_factors
         self.upsample_scale = np.prod(upsample_factors)
         self.inference_padding = inference_padding
+        self.use_weight_norm = use_weight_norm

         # check the number of layers and stacks
         assert num_res_blocks % stacks == 0

@@ -156,3 +157,12 @@ class ParallelWaveganGenerator(torch.nn.Module):
     def receptive_field_size(self):
         return self._get_receptive_field_size(self.layers, self.stacks,
                                               self.kernel_size)
+
+    def load_checkpoint(self, config, checkpoint_path, eval=False):  # pylint: disable=unused-argument, redefined-builtin
+        state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
+        self.load_state_dict(state['model'])
+        if eval:
+            self.eval()
+            assert not self.training
+            if self.use_weight_norm:
+                self.remove_weight_norm()
@@ -175,3 +175,22 @@ class Wavegrad(nn.Module):
         self.x_conv = weight_norm(self.x_conv)
         self.out_conv = weight_norm(self.out_conv)
         self.y_conv = weight_norm(self.y_conv)
+
+
+    def load_checkpoint(self, config, checkpoint_path, eval=False):  # pylint: disable=unused-argument, redefined-builtin
+        state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
+        self.load_state_dict(state['model'])
+        if eval:
+            self.eval()
+            assert not self.training
+            if self.use_weight_norm:
+                self.remove_weight_norm()
+            betas = np.linspace(config['test_noise_schedule']['min_val'],
+                                config['test_noise_schedule']['max_val'],
+                                config['test_noise_schedule']['num_steps'])
+            self.compute_noise_level(betas)
+        else:
+            betas = np.linspace(config['train_noise_schedule']['min_val'],
+                                config['train_noise_schedule']['max_val'],
+                                config['train_noise_schedule']['num_steps'])
+            self.compute_noise_level(betas)
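WaveGrad's loader also rebuilds the diffusion noise schedule from config: a linear `betas` ramp whose step count differs between training and inference (fewer steps at test time for speed). A hedged sketch of the derived quantities, with illustrative values standing in for the config entries and standard DDPM bookkeeping standing in for `compute_noise_level` (whose exact buffers may differ):

```python
import numpy as np

# Illustrative values; real ones come from config['test_noise_schedule'].
betas = np.linspace(1e-6, 1e-2, 50)    # min_val, max_val, num_steps
alphas = 1.0 - betas
alpha_hat = np.cumprod(alphas)         # cumulative signal level per step
noise_level = np.sqrt(alpha_hat)       # ~1.0 at step 0, decaying as noise grows
print(noise_level[0], noise_level[-1])
```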
@@ -499,3 +499,10 @@ class WaveRNN(nn.Module):
             unfolded[start:end] += y[i]

         return unfolded
+
+    def load_checkpoint(self, config, checkpoint_path, eval=False):  # pylint: disable=unused-argument, redefined-builtin
+        state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
+        self.load_state_dict(state['model'])
+        if eval:
+            self.eval()
+            assert not self.training
@@ -1,4 +1,5 @@
 import re
+import torch
 import importlib
 import numpy as np
 from matplotlib import pyplot as plt

@@ -6,6 +7,29 @@ from matplotlib import pyplot as plt
 from TTS.tts.utils.visual import plot_spectrogram


+def interpolate_vocoder_input(scale_factor, spec):
+    """Interpolate spectrogram by the scale factor.
+    It is mainly used to match the sampling rates of
+    the tts and vocoder models.
+
+    Args:
+        scale_factor (float): scale factor to interpolate the spectrogram
+        spec (np.array): spectrogram to be interpolated
+
+    Returns:
+        torch.tensor: interpolated spectrogram.
+    """
+    print(" > before interpolation :", spec.shape)
+    spec = torch.tensor(spec).unsqueeze(0).unsqueeze(0)  # pylint: disable=not-callable
+    spec = torch.nn.functional.interpolate(spec,
+                                           scale_factor=scale_factor,
+                                           recompute_scale_factor=True,
+                                           mode='bilinear',
+                                           align_corners=False).squeeze(0)
+    print(" > after interpolation :", spec.shape)
+    return spec
+
+
 def plot_results(y_hat, y, ap, global_step, name_prefix):
     """ Plot vocoder model results """
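`interpolate_vocoder_input` bridges a sample-rate mismatch between the TTS model and the vocoder by stretching the mel along time with bilinear interpolation; the `[1, time_scale]` factor leaves the mel-band axis untouched. A quick numeric sketch with a dummy spectrogram:

```python
import numpy as np

from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input

mel = np.random.randn(80, 500).astype(np.float32)  # [C, T]: 80 bands, 500 frames
scale_factor = [1, 24000 / 22050]                  # e.g. 22050 Hz TTS -> 24000 Hz vocoder
out = interpolate_vocoder_input(scale_factor, mel)
print(out.shape)                                   # torch.Size([1, 80, 544]) -- time stretched
```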
@@ -6,7 +6,7 @@ import pickle as pickle_tts
 from TTS.utils.io import RenamingUnpickler


-def load_checkpoint(model, checkpoint_path, use_cuda=False):
+def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False):
     try:
         state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
     except ModuleNotFoundError:

@@ -15,6 +15,8 @@ def load_checkpoint(model, checkpoint_path, use_cuda=False):
     model.load_state_dict(state['model'])
     if use_cuda:
         model.cuda()
+    if eval:
+        model.eval()
     return model, state
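The standalone `load_checkpoint` helper gains the same `eval` switch, so call sites can drop the manual `model.eval()` step. A sketch of the calling convention, with a trivial stand-in checkpoint so it runs end to end (module path assumed from this diff):

```python
import torch
from torch import nn
from TTS.tts.utils.io import load_checkpoint  # module path assumed from this diff

# Build a throwaway checkpoint so the call is demonstrable.
net = nn.Linear(4, 4)
torch.save({'model': net.state_dict()}, "/tmp/demo_ckpt.pth.tar")

net, state = load_checkpoint(nn.Linear(4, 4), "/tmp/demo_ckpt.pth.tar", eval=True)
assert not net.training  # eval=True flipped inference mode on
```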
@@ -112,7 +112,7 @@
     " t_1 = time.time()\n",
     " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
     " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
-    " # mel_postnet_spec = ap._denormalize(mel_postnet_spec.T)\n",
+    " # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n",
     " if not use_gl:\n",
     " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
     " waveform = waveform.flatten()\n",

@@ -112,7 +112,7 @@
     " t_1 = time.time()\n",
     " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
     " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
-    " # mel_postnet_spec = ap._denormalize(mel_postnet_spec.T)\n",
+    " # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n",
     " if not use_gl:\n",
     " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
     " waveform = waveform.flatten()\n",
@@ -230,8 +230,8 @@
     "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
     "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
     "\u001b[0;32m<ipython-input-18-91e8914b5c6a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mspec\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAP\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwav\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Max:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Min:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Mean:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mplot_spectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mAP\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m;\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-    "\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36mspectrogram\u001b[0;34m(self, y)\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0mD\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_stft\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_amp_to_db\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mabs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mD\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 220\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_normalize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 221\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmelspectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-    "\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36m_normalize\u001b[0;34m(self, S)\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinear_scaler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' [!] Mean-Var stats does not match the given feature dimensions.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 120\u001b[0m \u001b[0;31m# range normalization\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mref_level_db\u001b[0m \u001b[0;31m# discard certain range of DB assuming it is air noise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+    "\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36mspectrogram\u001b[0;34m(self, y)\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0mD\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_stft\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_amp_to_db\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mabs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mD\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 220\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnormalize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 221\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmelspectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+    "\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36mnormalize\u001b[0;34m(self, S)\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinear_scaler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' [!] Mean-Var stats does not match the given feature dimensions.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 120\u001b[0m \u001b[0;31m# range normalization\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mref_level_db\u001b[0m \u001b[0;31m# discard certain range of DB assuming it is air noise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
     "\u001b[0;31mRuntimeError\u001b[0m: [!] Mean-Var stats does not match the given feature dimensions."
    ]
   }
@@ -314,7 +314,7 @@
     " exec(set_val_cmd)\n",
     " wav = AP.load_wav(file)\n",
     " spec = AP.spectrogram(wav)\n",
-    " spec_norm = AP._denormalize(spec.T)\n",
+    " spec_norm = AP.denormalize(spec.T)\n",
     " plt.subplot(len(values), 2, 2*idx + 1)\n",
     " plt.imshow(spec_norm.T, aspect=\"auto\", origin=\"lower\")\n",
     " # plt.colorbar()\n",
@@ -0,0 +1,2 @@
[build-system]
requires = ["setuptools", "wheel", "Cython", "numpy>=1.16.0"]
setup.py
@@ -5,22 +5,16 @@ import os
 import shutil
 import subprocess
 import sys

 import numpy

-from setuptools import setup, find_packages, Extension
-import setuptools.command.develop
+import setuptools.command.build_py
+import setuptools.command.develop

-# handle import if cython is not already installed.
-try:
-    from Cython.Build import cythonize
-except ImportError:
-    # create closure for deferred import
-    def cythonize(*args, **kwargs): #pylint: disable=redefined-outer-name
-        from Cython.Build import cythonize #pylint: disable=redefined-outer-name, import-outside-toplevel
-        return cythonize(*args, **kwargs)
-
+from setuptools import find_packages, setup
+from distutils.extension import Extension
+from Cython.Build import cythonize

 # parameters for wheeling server.
 parser = argparse.ArgumentParser(add_help=False, allow_abbrev=False)
 parser.add_argument('--checkpoint',
                     type=str,

@@ -33,38 +27,25 @@ args, unknown_args = parser.parse_known_args()
 # Remove our arguments from argv so that setuptools doesn't see them
 sys.argv = [sys.argv[0]] + unknown_args

-version = '0.0.8'
-
-# Adapted from https://github.com/pytorch/pytorch
+version = '0.0.9'
 cwd = os.path.dirname(os.path.abspath(__file__))
 if os.getenv('TTS_PYTORCH_BUILD_VERSION'):
     version = os.getenv('TTS_PYTORCH_BUILD_VERSION')
 else:
     try:
         sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'],
                                       cwd=cwd).decode('ascii').strip()
         version += '+' + sha[:7]
     except subprocess.CalledProcessError:
         pass
     except IOError:  # FileNotFoundError for python 3
         pass


-# Handle Cython code
-def find_pyx(path='.'):
-    pyx_files = []
-    for root, _, filenames in os.walk(path):
-        for fname in filenames:
-            if fname.endswith('.pyx'):
-                pyx_files.append(os.path.join(root, fname))
-    return pyx_files
+# def find_pyx(path='.'):
+#     pyx_files = []
+#     for root, _, filenames in os.walk(path):
+#         for fname in filenames:
+#             if fname.endswith('.pyx'):
+#                 pyx_files.append(os.path.join(root, fname))
+#     return pyx_files


-def find_cython_extensions(path="."):
-    exts = cythonize(find_pyx(path), language_level=3)
-    for ext in exts:
-        ext.include_dirs = [numpy.get_include()]
-    return exts
+# def find_cython_extensions(path="."):
+#     exts = cythonize(find_pyx(path), language_level=3)
+#     for ext in exts:
+#         ext.include_dirs = [numpy.get_include()]
+#
+#     return exts


 class build_py(setuptools.command.build_py.build_py):  # pylint: disable=too-many-ancestors

@@ -105,12 +86,12 @@ def pip_install(package_name):
     subprocess.call([sys.executable, '-m', 'pip', 'install', package_name])


-reqs_from_file = open('requirements.txt').readlines()
-reqs_without_tf = [r for r in reqs_from_file if not r.startswith('tensorflow')]
-tf_req = [r for r in reqs_from_file if r.startswith('tensorflow')]
-
-requirements = {'install_requires': reqs_without_tf, 'pip_install': tf_req}
+requirements = open(os.path.join(cwd, 'requirements.txt'), 'r').readlines()
 with open('README.md', "r", encoding="utf-8") as readme_file:
     README = readme_file.read()

+exts = [Extension(name='TTS.tts.layers.glow_tts.monotonic_align.core',
+                  sources=["TTS/tts/layers/glow_tts/monotonic_align/core.pyx"])]
 setup(
     name='TTS',
     version=version,

@@ -118,9 +99,15 @@ setup(
     author='Eren Gölge',
     author_email='egolge@mozilla.com',
     description='Text to Speech with Deep Learning',
+    long_description=README,
+    long_description_content_type="text/markdown",
     license='MPL-2.0',
-    entry_points={'console_scripts': ['tts-server = TTS.server.server:main']},
-    ext_modules=find_cython_extensions(),
+    # cython
+    include_dirs=numpy.get_include(),
+    ext_modules=cythonize(exts, language_level=3),
+    # ext_modules=find_cython_extensions(),
+    # package
+    include_package_data=True,
+    packages=find_packages(include=['TTS*']),
     project_urls={
         'Documentation': 'https://github.com/mozilla/TTS/wiki',

@@ -131,9 +118,16 @@ setup(
     cmdclass={
         'build_py': build_py,
         'develop': develop,
+        # 'build_ext': build_ext
     },
-    install_requires=requirements['install_requires'],
-    python_requires='>=3.6.0',
+    install_requires=requirements,
+    python_requires='>=3.6.0, <3.9',
+    entry_points={
+        'console_scripts': [
+            'tts=TTS.bin.synthesize:main',
+            'tts-server = TTS.server.server:main'
+        ]
+    },
     classifiers=[
         "Programming Language :: Python",
         "Programming Language :: Python :: 3",

@@ -141,14 +135,16 @@ setup(
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         'Development Status :: 3 - Alpha',
-        "Intended Audience :: Science/Research :: Developers",
+        "Intended Audience :: Science/Research",
+        "Intended Audience :: Developers",
         "Operating System :: POSIX :: Linux",
         'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)',
-        "Topic :: Software Development :: Libraries :: Python Modules :: Speech :: Sound/Audio :: Multimedia :: Artificial Intelligence",
-    ])
-
-# for some reason having tensorflow in 'install_requires'
-# breaks some of the dependencies.
-if 'bdist_wheel' not in unknown_args:
-    for module in requirements['pip_install']:
-        pip_install(module)
+        "Topic :: Software Development",
+        "Topic :: Software Development :: Libraries :: Python Modules",
+        "Topic :: Multimedia :: Sound/Audio :: Speech",
+        "Topic :: Multimedia :: Sound/Audio",
+        "Topic :: Multimedia",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence"
+    ],
+    zip_safe=False
+)
@@ -67,21 +67,21 @@ class TestAudio(unittest.TestCase):
         self.ap.symmetric_norm = False
         self.ap.clip_norm = False
         self.ap.max_norm = 4.0
-        x_norm = self.ap._normalize(x)
+        x_norm = self.ap.normalize(x)
         print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
         assert (x_old - x).sum() == 0
         # check value range
         assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
         assert x_norm.min() >= 0 - 1, x_norm.min()
         # check denorm.
-        x_ = self.ap._denormalize(x_norm)
+        x_ = self.ap.denormalize(x_norm)
         assert (x - x_).sum() < 1e-3, (x - x_).mean()

         self.ap.signal_norm = True
         self.ap.symmetric_norm = False
         self.ap.clip_norm = True
         self.ap.max_norm = 4.0
-        x_norm = self.ap._normalize(x)
+        x_norm = self.ap.normalize(x)
         print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")

@@ -90,14 +90,14 @@ class TestAudio(unittest.TestCase):
         assert x_norm.max() <= self.ap.max_norm, x_norm.max()
         assert x_norm.min() >= 0, x_norm.min()
         # check denorm.
-        x_ = self.ap._denormalize(x_norm)
+        x_ = self.ap.denormalize(x_norm)
         assert (x - x_).sum() < 1e-3, (x - x_).mean()

         self.ap.signal_norm = True
         self.ap.symmetric_norm = True
         self.ap.clip_norm = False
         self.ap.max_norm = 4.0
-        x_norm = self.ap._normalize(x)
+        x_norm = self.ap.normalize(x)
         print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")

@@ -107,14 +107,14 @@ class TestAudio(unittest.TestCase):
         assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min()  #pylint: disable=invalid-unary-operand-type
         assert x_norm.min() <= 0, x_norm.min()
         # check denorm.
-        x_ = self.ap._denormalize(x_norm)
+        x_ = self.ap.denormalize(x_norm)
         assert (x - x_).sum() < 1e-3, (x - x_).mean()

         self.ap.signal_norm = True
         self.ap.symmetric_norm = True
         self.ap.clip_norm = True
         self.ap.max_norm = 4.0
-        x_norm = self.ap._normalize(x)
+        x_norm = self.ap.normalize(x)
         print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")

@@ -124,26 +124,26 @@ class TestAudio(unittest.TestCase):
         assert x_norm.min() >= -self.ap.max_norm, x_norm.min()  #pylint: disable=invalid-unary-operand-type
         assert x_norm.min() <= 0, x_norm.min()
         # check denorm.
-        x_ = self.ap._denormalize(x_norm)
+        x_ = self.ap.denormalize(x_norm)
         assert (x - x_).sum() < 1e-3, (x - x_).mean()

         self.ap.signal_norm = True
         self.ap.symmetric_norm = False
         self.ap.max_norm = 1.0
-        x_norm = self.ap._normalize(x)
+        x_norm = self.ap.normalize(x)
         print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")

         assert (x_old - x).sum() == 0
         assert x_norm.max() <= self.ap.max_norm, x_norm.max()
         assert x_norm.min() >= 0, x_norm.min()
-        x_ = self.ap._denormalize(x_norm)
+        x_ = self.ap.denormalize(x_norm)
         assert (x - x_).sum() < 1e-3

         self.ap.signal_norm = True
         self.ap.symmetric_norm = True
         self.ap.max_norm = 1.0
-        x_norm = self.ap._normalize(x)
+        x_norm = self.ap.normalize(x)
         print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")

@@ -151,7 +151,7 @@ class TestAudio(unittest.TestCase):
         assert x_norm.max() <= self.ap.max_norm, x_norm.max()
         assert x_norm.min() >= -self.ap.max_norm, x_norm.min()  #pylint: disable=invalid-unary-operand-type
         assert x_norm.min() < 0, x_norm.min()
-        x_ = self.ap._denormalize(x_norm)
+        x_ = self.ap.denormalize(x_norm)
         assert (x - x_).sum() < 1e-3

     def test_scaler(self):

@@ -172,5 +172,5 @@ class TestAudio(unittest.TestCase):
         wav = self.ap.load_wav(WAV_FILE)
         mel_reference = self.ap.melspectrogram(wav)
         mel_norm = ap.melspectrogram(wav)
-        mel_denorm = ap._denormalize(mel_norm)
+        mel_denorm = ap.denormalize(mel_norm)
        assert abs(mel_reference - mel_denorm).max() < 1e-4
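The test matrix above exercises the combinations of `symmetric_norm` and `clip_norm` with `max_norm` of 4.0 and 1.0: asymmetric output lands in roughly `[0, max_norm]`, symmetric in `[-max_norm, max_norm]`, and clipping makes the bounds exact. A tiny standalone sketch of the arithmetic these assertions encode (simplified; the real `normalize` also subtracts `ref_level_db` and handles mean-var scalers):

```python
import numpy as np

def toy_normalize(S_db, max_norm=4.0, symmetric=True, clip=True, min_level_db=-100):
    # Simplified version of the range normalization checked in test_audio.py.
    S = (S_db - min_level_db) / -min_level_db           # map dB -> [0, 1]
    S = (2 * S - 1) * max_norm if symmetric else S * max_norm
    if clip:
        lo = -max_norm if symmetric else 0
        S = np.clip(S, lo, max_norm)
    return S

S_db = np.linspace(-120, 10, 5)
print(toy_normalize(S_db))                    # bounded in [-4, 4]
print(toy_normalize(S_db, symmetric=False))   # bounded in [0, 4]
```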
@@ -2,7 +2,7 @@ import os
 import unittest

 from tests import get_tests_input_path, get_tests_output_path
-from TTS.server.synthesizer import Synthesizer
+from TTS.utils.synthesizer import Synthesizer
 from TTS.tts.utils.generic_utils import setup_model
 from TTS.tts.utils.io import save_checkpoint
 from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols

@@ -29,7 +29,7 @@ class DemoServerTest(unittest.TestCase):
         tts_root_path = get_tests_output_path()
         config['tts_checkpoint'] = os.path.join(tts_root_path, config['tts_checkpoint'])
         config['tts_config'] = os.path.join(tts_root_path, config['tts_config'])
-        synthesizer = Synthesizer(config)
+        synthesizer = Synthesizer(config['tts_checkpoint'], config['tts_config'], None, None)
         synthesizer.tts("Better this test works!!")

     def test_split_into_sentences(self):