# TTS/synthesis.py

#-*- coding: utf-8 -*-

from network import *
from data import inv_spectrogram, find_endpoint, save_wav, spectrogram
import numpy as np
import argparse
import os
import sys
import io
from text import text_to_sequence

# Module-level flag: True when a CUDA device is available to PyTorch.
# NOTE(review): `torch` is not imported explicitly in this file; it presumably
# comes in through `from network import *` -- confirm.
use_cuda = torch.cuda.is_available()


def main(args):
    """Restore a Tacotron checkpoint and synthesize a fixed set of sentences.

    Each sentence is passed to ``generate`` and the bytes it returns are
    written to ``result_<restore_step>_<index>.wav`` inside ``hp.output_path``
    (the index is 1-based).

    Args:
        args: Parsed command-line namespace. Only ``args.restore_step`` is
            read here; it selects ``checkpoint_<step>.pth.tar`` under
            ``hp.checkpoint_path`` and is embedded in the output file names.

    Raises:
        RuntimeError: If CUDA is not available.
        FileNotFoundError: If the checkpoint file for ``args.restore_step``
            does not exist.
    """
    # The model is only ever built on the GPU. Without this guard `model`
    # would be undefined further down and the failure would be reported as a
    # missing checkpoint, which hides the real cause.
    if not use_cuda:
        raise RuntimeError(
            "CUDA is required for synthesis but no CUDA device is available")

    # Make model
    model = nn.DataParallel(Tacotron().cuda())

    # Load checkpoint. Only a missing file is translated into the
    # "Model not exists" error; any other problem (corrupt file, mismatching
    # state dict, ...) propagates with its original message.
    checkpoint_file = os.path.join(
        hp.checkpoint_path, 'checkpoint_%d.pth.tar' % args.restore_step)
    try:
        checkpoint = torch.load(checkpoint_file)
    except FileNotFoundError as err:
        raise FileNotFoundError(
            "\n------------Model not exists------------\n") from err

    model.load_state_dict(checkpoint['model'])
    print("\n--------model restored at step %d--------\n" %
          args.restore_step)

    # Evaluation
    model = model.eval()

    # Make result folder if not exists (also creates missing parents and
    # tolerates the directory appearing between check and creation)
    os.makedirs(hp.output_path, exist_ok=True)

    # Sentences for generation
    sentences = [
        "And it is worth mention in passing that, as an example of fine typography,",
        # From July 8, 2017 New York Times:
        'Scientists at the CERN laboratory say they have discovered a new particle.',
        'There’s a way to measure the acute emotional intelligence that has never gone out of style.',
        'President Trump met with other leaders at the Group of 20 conference.',
        'The Senate\'s bill to repeal and replace the Affordable Care Act is now imperiled.',
        # From Google's Tacotron example page:
        'Generative adversarial network or variational auto-encoder.',
        'The buses aren\'t the problem, they actually provide a solution.',
        'Does the quick brown fox jump over the lazy dog?',
        'Talib Kweli confirmed to AllHipHop that he will be releasing an album in the next year.',
    ]

    # Synthesis and save to wav files
    for index, text in enumerate(sentences, start=1):
        wav = generate(model, text)
        path = os.path.join(hp.output_path, 'result_%d_%d.wav' %
                            (args.restore_step, index))
        # The context manager closes the file; no explicit close() is needed.
        with open(path, 'wb') as f:
            f.write(wav)

        print("save wav file at step %d ..." % index)


def generate(model, text):
    """Synthesize ``text`` with ``model`` and return the encoded audio.

    Args:
        model: Network invoked as ``model.forward(characters, mel_input)``;
            the second element of its result is used as the linear output.
        text: Sentence to synthesize.

    Returns:
        The contents of the in-memory buffer that ``save_wav`` wrote the
        trimmed waveform into.
    """
    # Text to index sequence, with a leading batch axis of size 1
    cleaners = [name.strip() for name in hp.cleaners.split(',')]
    token_ids = np.asarray(text_to_sequence(text, cleaners), dtype=np.int32)
    seq = np.expand_dims(token_ids, axis=0)

    # Provide [GO] Frame: an all-zero frame, one per batch entry
    batch = seq.shape[0]
    go_frame = np.zeros([batch, hp.num_mels, 1], dtype=np.float32)

    # Variables (CUDA tensors, volatile)
    char_tensor = torch.from_numpy(seq).type(torch.cuda.LongTensor)
    characters = Variable(char_tensor, volatile=True).cuda()
    mel_tensor = torch.from_numpy(go_frame).type(torch.cuda.FloatTensor)
    mel_input = Variable(mel_tensor, volatile=True).cuda()

    # Spectrogram to wav
    outputs = model.forward(characters, mel_input)
    linear_output = outputs[1]
    spectrum = linear_output[0].data.cpu().numpy()
    wav = inv_spectrogram(spectrum)
    # Keep only the samples before the index reported by find_endpoint
    end = find_endpoint(wav)
    wav = wav[:end]

    # Encode into an in-memory buffer instead of a file on disk
    buffer = io.BytesIO()
    save_wav(wav, buffer)
    return buffer.getvalue()


if __name__ == '__main__':
    # Script entry point: parse the command-line options and run synthesis.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--restore_step',
        default=0,
        type=int,
        help='Global step to restore checkpoint',
    )
    cli.add_argument(
        '--batch_size',
        default=1,
        type=int,
        help='Batch size',
    )
    main(cli.parse_args())