config, benchmark notebook, synthesis fixed

2019-06-26 13:31:16 +02:00 · 2019-06-26 13:31:16 +02:00 · 05ff8801d1
--- a/config.json
+++ b/config.json
@@ -75,6 +75,7 @@
    "phoneme_cache_path": "mozilla_us_phonemes",  // phoneme computation is slow, therefore, it caches results in the given folder.
    "use_phonemes": true,           // use phonemes instead of raw characters. It is suggested for better pronunciation.
    "phoneme_language": "en-us",     // depending on your target language, pick one from  https://github.com/bootphon/phonemizer#languages
-    "text_cleaner": "phoneme_cleaners"
+    "text_cleaner": "phoneme_cleaners",
+    "num_speakers": 10 // should just be bigger than the actual number of speakers
 }

--- a/config_tacotron.json
+++ b/config_tacotron.json
@@ -76,6 +76,7 @@
        "phoneme_cache_path": "mozilla_us_phonemes",  // phoneme computation is slow, therefore, it caches results in the given folder.
        "use_phonemes": true,           // use phonemes instead of raw characters. It is suggested for better pronunciation.
        "phoneme_language": "en-us",     // depending on your target language, pick one from  https://github.com/bootphon/phonemizer#languages
-        "text_cleaner": "phoneme_cleaners"
+        "text_cleaner": "phoneme_cleaners",
+        "num_speakers": 10 // should just be bigger than the actual number of speakers
    }
    
--- a/config_tacotron2.json
+++ b/config_tacotron2.json
@@ -78,6 +78,7 @@
        "phoneme_cache_path": "mozilla_us_phonemes",  // phoneme computation is slow, therefore, it caches results in the given folder.
        "use_phonemes": true,           // use phonemes instead of raw characters. It is suggested for better pronunciation.
        "phoneme_language": "en-us",     // depending on your target language, pick one from  https://github.com/bootphon/phonemizer#languages
-        "text_cleaner": "phoneme_cleaners"
+        "text_cleaner": "phoneme_cleaners",
+        "num_speakers": 10 // should just be bigger than the actual number of speakers
    }
    
--- a/config_tacotron_de.json
+++ b/config_tacotron_de.json
@@ -76,6 +76,7 @@
                "phoneme_cache_path": "phoneme_cache",  // phoneme computation is slow, therefore, it caches results in the given folder.
                "use_phonemes": false,           // use phonemes instead of raw characters. It is suggested for better pronunciation.
                "phoneme_language": "de",     // depending on your target language, pick one from  https://github.com/bootphon/phonemizer#languages
-                "text_cleaner": "phoneme_cleaners"
+                "text_cleaner": "phoneme_cleaners",
+                "num_speakers": 10 // should just be bigger than the actual number of speakers
            }
            
--- a/config_tacotron_gst.json
+++ b/config_tacotron_gst.json
@@ -76,6 +76,7 @@
        "phoneme_cache_path": "mozilla_us_phonemes",  // phoneme computation is slow, therefore, it caches results in the given folder.
        "use_phonemes": true,           // use phonemes instead of raw characters. It is suggested for better pronunciation.
        "phoneme_language": "en-us",     // depending on your target language, pick one from  https://github.com/bootphon/phonemizer#languages
-        "text_cleaner": "phoneme_cleaners"
+        "text_cleaner": "phoneme_cleaners",
+        "num_speakers": 10 // should just be bigger than the actual number of speakers
    }
    
--- a/notebooks/Benchmark.ipynb
+++ b/notebooks/Benchmark.ipynb
--- a/utils/synthesis.py
+++ b/utils/synthesis.py
@@ -35,17 +35,17 @@ def compute_style_mel(style_wav, ap, use_cuda):
        return style_mel


-def run_model(model, inputs, CONFIG, truncated, style_mel=None):
+def run_model(model, inputs, speaker_id, CONFIG, truncated, style_mel=None):
    if CONFIG.model == "TacotronGST" and style_mel is not None:
        decoder_output, postnet_output, alignments, stop_tokens = model.inference(
-            inputs, style_mel)
+            inputs, style_mel, speaker_id)
    else:
        if truncated:
            decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated(
-                inputs)
+                inputs, speaker_id)
        else:
            decoder_output, postnet_output, alignments, stop_tokens = model.inference(
-                inputs)
+                inputs, speaker_id)
    return decoder_output, postnet_output, alignments, stop_tokens


@@ -100,12 +100,13 @@ def synthesis(model,
        style_mel = compute_style_mel(style_wav, ap, use_cuda)
    # preprocess the given text
    inputs = text_to_seqvec(text, CONFIG, use_cuda)
-    speaker_id = speaker_id_var = torch.from_numpy(speaker_id).unsqueeze(0)
+    speaker_id = np.asarray(speaker_id)
+    speaker_id = torch.from_numpy(speaker_id).unsqueeze(0)
    if use_cuda:
        speaker_id.cuda()
    # synthesize voice
    decoder_output, postnet_output, alignments, stop_tokens = run_model(
-        model, inputs, CONFIG, truncated, style_mel)
+        model, inputs, speaker_id, CONFIG, truncated, style_mel)
    # convert outputs to numpy
    postnet_output, decoder_output, alignment = parse_outputs(
        postnet_output, decoder_output, alignments)