Turkish cleaner and data preprocessor

2020-03-17 14:47:59 +01:00 · 2020-03-17 14:47:59 +01:00 · fa795347a9
--- a/datasets/preprocess.py
+++ b/datasets/preprocess.py
@ -187,3 +187,21 @@ def libri_tts(root_path, meta_files=None):
    for item in items:
        assert os.path.exists(item[1]), f" [!] wav file is not exist - {item[1]}"
    return items
+
+
+def custom_turkish(root_path, meta_file):
+    """Load a single-speaker Turkish dataset from a pipe-separated metadata file.
+
+    Every line of the metadata file is split on '|': the first column is the
+    wav file name without its '.wav' extension (looked up under
+    ``<root_path>/wavs``) and the second column is the transcript.
+
+    Args:
+        root_path: dataset root directory containing the metadata file and
+            the ``wavs`` folder.
+        meta_file: metadata file name, relative to ``root_path``.
+
+    Returns:
+        A list of ``[text, wav_file, speaker_name]`` entries, one per line
+        whose wav file exists on disk. Lines pointing at a missing wav file
+        are left out and only their count is printed.
+    """
+    speaker_name = "turkish-female"
+    wav_dir = os.path.join(root_path, 'wavs')
+    items = []
+    missing = []
+    with open(os.path.join(root_path, meta_file), 'r', encoding='utf-8') as meta:
+        for row in meta:
+            fields = row.split('|')
+            wav_path = os.path.join(wav_dir, fields[0].strip() + '.wav')
+            if os.path.exists(wav_path):
+                # The transcript column is only read for entries that are kept.
+                items.append([fields[1].strip(), wav_path, speaker_name])
+            else:
+                missing.append(wav_path)
+    print(f" [!] {len(missing)} files skipped. They are not exist...")
+    return items
--- a/utils/text/cleaners.py
+++ b/utils/text/cleaners.py
@ -91,6 +91,14 @@ def transliteration_cleaners(text):
    return text


+# TODO: elaborate it
+def basic_turkish_cleaners(text):
+    '''Pipeline for Turkish text.
+
+    Rewrites the two Turkish capital "i" letters to their correct lowercase
+    forms, then passes the text through this module's `lowercase` and
+    `collapse_whitespace` helpers.
+
+    Args:
+        text: input string.
+
+    Returns:
+        The cleaned string.
+    '''
+    # In Turkish the lowercase of dotless "I" is "ı", not "i", so it has to be
+    # rewritten before any generic lowercasing is applied.
+    text = text.replace("I", "ı")
+    # Python's str.lower() maps dotted "İ" to "i" followed by a combining dot
+    # (U+0307); map it explicitly to a plain "i" instead.
+    text = text.replace("İ", "i")
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    # Hand the result back to the caller, like the other cleaners in this module.
+    return text
+
+
 def english_cleaners(text):
    '''Pipeline for English text, including number and abbreviation expansion.'''
    text = convert_to_ascii(text)