Turkish cleaner and data preprocessor

This commit is contained in:
erogol 2020-03-17 14:47:59 +01:00
Родитель fd4e6d0245
Коммит fa795347a9
2 изменённых файлов: 26 добавлений и 0 удалений

Просмотреть файл

@ -187,3 +187,21 @@ def libri_tts(root_path, meta_files=None):
for item in items:
assert os.path.exists(item[1]), f" [!] wav file is not exist - {item[1]}"
return items
def custom_turkish(root_path, meta_file):
    """Load (text, wav path, speaker) items from a '|'-separated metadata file.

    Each non-blank line of the metadata file is split on '|': the first
    column is the wav file name without extension and the second column is
    the transcript. Audio is looked up at ``<root_path>/wavs/<name>.wav``.

    Args:
        root_path: dataset root directory containing the metadata file and
            the ``wavs`` sub-directory.
        meta_file: metadata file name, relative to ``root_path``.

    Returns:
        A list of ``[text, wav_file, speaker_name]`` entries, one for every
        metadata line whose wav file exists on disk.

    Raises:
        IndexError: if a line whose wav file exists has no second
            '|'-separated column.
    """
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "turkish-female"
    skipped_files = []
    with open(txt_file, 'r', encoding='utf-8') as ttf:
        for line in ttf:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # describe no sample; ignore them so they are not reported as
            # missing wav files.
            if not line.strip():
                continue
            cols = line.split('|')
            wav_file = os.path.join(root_path, 'wavs', cols[0].strip() + '.wav')
            if not os.path.exists(wav_file):
                # Keep going: entries without audio are only counted.
                skipped_files.append(wav_file)
                continue
            text = cols[1].strip()
            items.append([text, wav_file, speaker_name])
    print(f" [!] {len(skipped_files)} files skipped. They are not exist...")
    return items

Просмотреть файл

@ -91,6 +91,14 @@ def transliteration_cleaners(text):
return text
# TODO: elaborate this pipeline
def basic_turkish_cleaners(text):
    '''Pipeline for Turkish text.

    Replaces the Turkish capital letters "İ" and "I" with their Turkish
    lowercase forms "i" and "ı", then passes the text through `lowercase`
    and `collapse_whitespace`, and returns the result.
    '''
    # Turkish casing differs from the default mapping: dotless "I" lowercases
    # to "ı", and dotted "İ" lowercases to a plain "i". Python's str.lower
    # would turn "I" into "i" and "İ" into "i" + U+0307 (combining dot), so
    # both are handled explicitly before the generic lowercasing step.
    text = text.replace("İ", "i").replace("I", "ı")
    text = lowercase(text)
    text = collapse_whitespace(text)
    # The cleaned text must be returned; without this the cleaner yields None.
    return text
def english_cleaners(text):
'''Pipeline for English text, including number and abbreviation expansion.'''
text = convert_to_ascii(text)