Merge branch 'dev' of github.com:mozilla/TTS into stylemel_in_testing

2019-07-25 13:23:41 +02:00 · 2019-07-25 13:23:41 +02:00 · 11f9edd849
--- a/config.json
+++ b/config.json
@ -40,12 +40,12 @@
    "windowing": false,            // Enables attention windowing. Used only in eval mode.
    "memory_size": 5,              // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. 
    "attention_norm": "sigmoid",   // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
-    "prenet_type": "original",     // ONLY TACOTRON2 - "original" or "bn".
-    "prenet_dropout": true,        // ONLY TACOTRON2 - enable/disable dropout at prenet. 
-    "use_forward_attn": true,      // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
-    "forward_attn_mask": false, 
-    "transition_agent": false,     // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
-    "location_attn": false,        // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
+    "prenet_type": "original",     // "original" or "bn".
+    "prenet_dropout": true,        // enable/disable dropout at prenet.
+    "use_forward_attn": true,      // enable/disable forward attention. In general, it aligns faster.
+    "forward_attn_mask": false,    // Apply forward attention mask at inference to prevent bad modes. Try it if your model does not align well.
+    "transition_agent": false,     // enable/disable transition agent of forward attention.
+    "location_attn": false,        // enable/disable location sensitive attention.
    "loss_masking": true,         // enable / disable loss masking against the sequence padding.
    "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
    "stopnet": true,               // Train stopnet predicting the end of synthesis. 
--- a/config_libritts.json
+++ b/config_libritts.json
@ -39,13 +39,13 @@
    "warmup_steps": 4000,          // Noam decay steps to increase the learning rate from 0 to "lr"
    "memory_size": 5,              // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. 
    "attention_norm": "sigmoid",   // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
-    "prenet_type": "original",     // ONLY TACOTRON2 - "original" or "bn".
-    "prenet_dropout": true,        // ONLY TACOTRON2 - enable/disable dropout at prenet. 
+    "prenet_type": "original",     // "original" or "bn".
+    "prenet_dropout": true,        // enable/disable dropout at prenet.
    "windowing": false,            // Enables attention windowing. Used only in eval mode.
-    "use_forward_attn": false,      // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
+    "use_forward_attn": false,      // enable/disable forward attention. In general, it aligns faster.
    "forward_attn_mask": false, 
-    "transition_agent": false,     // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
-    "location_attn": true,        // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
+    "transition_agent": false,     // enable/disable transition agent of forward attention.
+    "location_attn": true,        // enable/disable location sensitive attention.
    "loss_masking": true,         // enable / disable loss masking against the sequence padding.
    "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
    "stopnet": true,               // Train stopnet predicting the end of synthesis. 
--- a/config_tacotron.json
+++ b/config_tacotron.json
@ -42,10 +42,10 @@
        "attention_norm": "sigmoid",   // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
        "prenet_type": "original",     // "original" or "bn".
        "prenet_dropout": true,        // enable/disable dropout at prenet. 
-        "use_forward_attn": true,      // if it uses forward attention. In general, it aligns faster.
+        "use_forward_attn": true,      // enable/disable forward attention. In general, it aligns faster.
        "forward_attn_mask": false,    // Apply forward attention mask af inference to prevent bad modes. Try it if your model does not align well.
        "transition_agent": true,     // enable/disable transition agent of forward attention.
-        "location_attn": false,        // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
+        "location_attn": false,        // enable/disable location sensitive attention.
        "loss_masking": true,         // enable / disable loss masking against the sequence padding.
        "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
        "stopnet": true,               // Train stopnet predicting the end of synthesis. 
--- a/config_tacotron2.json
+++ b/config_tacotron2.json
@ -39,12 +39,12 @@
        "warmup_steps": 4000,          // Noam decay steps to increase the learning rate from 0 to "lr"
        "memory_size": 5,              // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. 
        "attention_norm": "sigmoid",   // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
-        "prenet_type": "original",     // ONLY TACOTRON2 - "original" or "bn".
-        "prenet_dropout": true,        // ONLY TACOTRON2 - enable/disable dropout at prenet. 
-        "use_forward_attn": true,      // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
+        "prenet_type": "original",     // "original" or "bn".
+        "prenet_dropout": true,        // enable/disable dropout at prenet.
+        "use_forward_attn": true,      // enable/disable forward attention. In general, it aligns faster.
        "forward_attn_mask": false,    // Apply forward attention mask af inference to prevent bad modes. Try it if your model does not align well.
-        "transition_agent": false,     // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
-        "location_attn": false,        // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
+        "transition_agent": false,     // enable/disable transition agent of forward attention.
+        "location_attn": false,        // enable/disable location sensitive attention. It is enabled for TACOTRON by default.
        "loss_masking": true,          // enable / disable loss masking against the sequence padding.
        "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
        "stopnet": true,               // Train stopnet predicting the end of synthesis. 
--- a/config_tacotron_de.json
+++ b/config_tacotron_de.json
@ -40,12 +40,12 @@
                "windowing": false,            // Enables attention windowing. Used only in eval mode.
                "memory_size": 5,              // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. 
                "attention_norm": "sigmoid",   // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
-                "prenet_type": "original",     // ONLY TACOTRON2 - "original" or "bn".
-                "prenet_dropout": true,        // ONLY TACOTRON2 - enable/disable dropout at prenet. 
-                "use_forward_attn": false,      // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
-                "transition_agent": false,     // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
-                "forward_attn_mask": false,      
-                "location_attn": true,        // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
+                "prenet_type": "original",     // "original" or "bn".
+                "prenet_dropout": true,        // enable/disable dropout at prenet.
+                "use_forward_attn": false,      // enable/disable forward attention. In general, it aligns faster.
+                "transition_agent": false,     // enable/disable transition agent of forward attention.
+                "forward_attn_mask": false,    // Apply forward attention mask at inference to prevent bad modes. Try it if your model does not align well.
+                "location_attn": true,        // enable/disable location sensitive attention. It is enabled for TACOTRON by default.
                "loss_masking": true,         // enable / disable loss masking against the sequence padding.
                "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
                "stopnet": true,               // Train stopnet predicting the end of synthesis. 
--- a/config_tacotron_gst.json
+++ b/config_tacotron_gst.json
@ -42,8 +42,8 @@
        "attention_norm": "sigmoid",   // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
        "prenet_type": "original",           // "original" or "bn".
        "prenet_dropout": true,        // enable/disable dropout at prenet. 
-        "use_forward_attn": true,      // if it uses forward attention. In general, it aligns faster.
-        "forward_attn_mask": false,    // Apply forward attention mask af inference to prevent bad modes. Try it if your model does not align well.
+        "use_forward_attn": true,      // enable/disable forward attention. In general, it aligns faster.
+        "forward_attn_mask": false,    // Apply forward attention mask at inference to prevent bad modes. Try it if your model does not align well.
        "transition_agent": false,     // enable/disable transition agent of forward attention.
        "location_attn": false,        // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
        "loss_masking": true,         // enable / disable loss masking against the sequence padding.
--- a/datasets/preprocess.py
+++ b/datasets/preprocess.py
@ -75,21 +75,19 @@ def mailabs(root_path, meta_files=None):
    speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/")
    if meta_files is None:
        csv_files = glob(root_path+"/**/metadata.csv", recursive=True)
-        folders = [os.path.dirname(f) for f in csv_files]
    else:
        csv_files = meta_files
-        folders = [f.strip().split("by_book")[1][1:] for f in csv_files]
    # meta_files = [f.strip() for f in meta_files.split(",")]
    items = []
-    for idx, csv_file in enumerate(csv_files):
+    for csv_file in csv_files:
+        txt_file = os.path.join(root_path, csv_file)
+        folder = os.path.dirname(txt_file)
        # determine speaker based on folder structure...
-        speaker_name_match = speaker_regex.search(csv_file)
+        speaker_name_match = speaker_regex.search(txt_file)
        if speaker_name_match is None:
            continue
        speaker_name = speaker_name_match.group("speaker_name")
        print(" | > {}".format(csv_file))
-        folder = folders[idx]
-        txt_file = os.path.join(root_path, csv_file)
        with open(txt_file, 'r') as ttf:
            for line in ttf:
                cols = line.split('|')