2017-04-01 20:35:46 +03:00
|
|
|
#!/usr/bin/env python
|
2017-04-01 19:57:58 +03:00
|
|
|
from __future__ import absolute_import, division, print_function
|
|
|
|
|
2017-04-01 20:35:46 +03:00
|
|
|
# Make sure we can import stuff from util/
|
|
|
|
# This script needs to be run from the root of the DeepSpeech repository
|
|
|
|
import sys
|
|
|
|
import os
|
2017-04-09 22:49:51 +03:00
|
|
|
sys.path.insert(1, os.path.join(sys.path[0], '..'))
|
2017-04-01 20:35:46 +03:00
|
|
|
|
2017-03-21 20:34:34 +03:00
|
|
|
import codecs
|
2016-10-13 22:15:39 +03:00
|
|
|
import fnmatch
|
2017-04-01 19:57:58 +03:00
|
|
|
import pandas
|
|
|
|
import progressbar
|
2016-10-13 22:15:39 +03:00
|
|
|
import subprocess
|
|
|
|
import tarfile
|
2016-11-08 22:07:46 +03:00
|
|
|
import unicodedata
|
2017-04-01 19:57:58 +03:00
|
|
|
|
2016-10-13 22:15:39 +03:00
|
|
|
from sox import Transformer
|
2018-10-02 11:24:26 +03:00
|
|
|
from util.downloader import maybe_download
|
2016-10-13 22:15:39 +03:00
|
|
|
from tensorflow.python.platform import gfile
|
2017-04-01 19:57:58 +03:00
|
|
|
|
2019-06-23 21:47:12 +03:00
|
|
|
SAMPLE_RATE = 16000
|
|
|
|
|
2017-04-01 19:57:58 +03:00
|
|
|
def _download_and_preprocess_data(data_dir):
    """Fetch, unpack and convert every LibriSpeech split under ``data_dir``.

    The work happens in four phases, each covering all splits in order:

    1. download ``<split>.tar.gz`` into ``data_dir`` via ``maybe_download``;
    2. extract each archive into ``data_dir`` (they all share the
       ``LibriSpeech`` root directory, so existence is tested per split
       below that root);
    3. convert FLAC audio to WAV and split the transcriptions, from
       ``data_dir/LibriSpeech/<split>/1/2/1-2-3.flac`` and
       ``data_dir/LibriSpeech/<split>/1/2/1-2.trans.txt`` into
       ``data_dir/LibriSpeech/<split>-wav/``;
    4. write one ``librivox-<split>.csv`` per split into ``data_dir``.
    """
    splits = (
        "train-clean-100", "train-clean-360", "train-other-500",
        "dev-clean", "dev-other",
        "test-clean", "test-other",
    )
    base_url = "http://www.openslr.org/resources/12/"
    librivox_root = "LibriSpeech"

    # NOTE(review): `widget=` is forwarded exactly as before; progressbar2
    # documents a `widgets=` list instead -- confirm which one was intended.

    # Phase 1: conditionally download each archive to data_dir
    print("Downloading Librivox data set (55GB) into {} if not already present...".format(data_dir))
    archives = []
    with progressbar.ProgressBar(max_value=len(splits), widget=progressbar.AdaptiveETA) as bar:
        for step, split in enumerate(splits):
            url = base_url + split + ".tar.gz"
            # The local file name is the last path component of the URL
            archives.append(maybe_download(os.path.split(url)[1], data_dir, url))
            bar.update(step)

    # Phase 2: conditionally extract each archive into data_dir
    print("Extracting librivox data if not already extracted...")
    with progressbar.ProgressBar(max_value=len(splits), widget=progressbar.AdaptiveETA) as bar:
        work_dir = os.path.join(data_dir, librivox_root)
        for step, (split, archive) in enumerate(zip(splits, archives)):
            _maybe_extract(data_dir, os.path.join(librivox_root, split), archive)
            bar.update(step)

    # Phase 3: FLAC -> WAV conversion and transcript splitting
    print("Converting FLAC to WAV and splitting transcriptions...")
    frames = []
    with progressbar.ProgressBar(max_value=len(splits), widget=progressbar.AdaptiveETA) as bar:
        for step, split in enumerate(splits):
            frames.append(_convert_audio_and_split_sentences(work_dir, split, split + "-wav"))
            bar.update(step)

    # Phase 4: write sets to disk as CSV files
    for split, frame in zip(splits, frames):
        frame.to_csv(os.path.join(data_dir, "librivox-" + split + ".csv"), index=False)
|
2017-03-21 20:34:34 +03:00
|
|
|
|
2016-10-13 22:15:39 +03:00
|
|
|
def _maybe_extract(data_dir, extracted_data, archive):
    """Extract ``archive`` into ``data_dir`` unless it was already extracted.

    Args:
        data_dir: directory the archive is unpacked into.
        extracted_data: path, relative to ``data_dir``, whose existence
            marks the archive as already extracted.
        archive: path of the tar archive to unpack.
    """
    # If data_dir/extracted_data does not exist, extract archive in data_dir
    if not gfile.Exists(os.path.join(data_dir, extracted_data)):
        # The context manager closes the archive even when extraction
        # fails part-way (e.g. disk full), instead of leaking the handle.
        # NOTE: extractall() trusts the member paths stored in the archive,
        # so this must only be used on archives from a trusted source.
        with tarfile.open(archive) as tar:
            tar.extractall(data_dir)
|
2017-03-21 20:34:34 +03:00
|
|
|
|
2017-04-01 19:57:58 +03:00
|
|
|
def _convert_audio_and_split_sentences(extracted_dir, data_set, dest_dir):
|
2016-10-13 22:15:39 +03:00
|
|
|
source_dir = os.path.join(extracted_dir, data_set)
|
|
|
|
target_dir = os.path.join(extracted_dir, dest_dir)
|
2016-11-24 20:07:50 +03:00
|
|
|
|
2017-04-01 19:57:58 +03:00
|
|
|
if not os.path.exists(target_dir):
|
|
|
|
os.makedirs(target_dir)
|
|
|
|
|
2016-10-13 22:15:39 +03:00
|
|
|
# Loop over transcription files and split each one
|
|
|
|
#
|
|
|
|
# The format for each file 1-2.trans.txt is:
|
|
|
|
# 1-2-0 transcription of 1-2-0.flac
|
|
|
|
# 1-2-1 transcription of 1-2-1.flac
|
|
|
|
# ...
|
|
|
|
#
|
|
|
|
# Each file is then split into several files:
|
|
|
|
# 1-2-0.txt (contains transcription of 1-2-0.flac)
|
|
|
|
# 1-2-1.txt (contains transcription of 1-2-1.flac)
|
|
|
|
# ...
|
2017-04-01 19:57:58 +03:00
|
|
|
#
|
|
|
|
# We also convert the corresponding FLACs to WAV in the same pass
|
|
|
|
files = []
|
2016-10-13 22:15:39 +03:00
|
|
|
for root, dirnames, filenames in os.walk(source_dir):
|
|
|
|
for filename in fnmatch.filter(filenames, '*.trans.txt'):
|
|
|
|
trans_filename = os.path.join(root, filename)
|
2017-04-01 19:57:58 +03:00
|
|
|
with codecs.open(trans_filename, "r", "utf-8") as fin:
|
2016-10-13 22:15:39 +03:00
|
|
|
for line in fin:
|
2017-04-01 19:57:58 +03:00
|
|
|
# Parse each segment line
|
2016-10-13 22:15:39 +03:00
|
|
|
first_space = line.find(" ")
|
2017-04-01 19:57:58 +03:00
|
|
|
seqid, transcript = line[:first_space], line[first_space+1:]
|
|
|
|
|
2017-04-09 22:27:40 +03:00
|
|
|
# We need to do the encode-decode dance here because encode
|
|
|
|
# returns a bytes() object on Python 3, and text_to_char_array
|
|
|
|
# expects a string.
|
2017-04-01 19:57:58 +03:00
|
|
|
transcript = unicodedata.normalize("NFKD", transcript) \
|
|
|
|
.encode("ascii", "ignore") \
|
|
|
|
.decode("ascii", "ignore")
|
2016-10-13 22:15:39 +03:00
|
|
|
|
2017-04-01 19:57:58 +03:00
|
|
|
transcript = transcript.lower().strip()
|
2016-11-24 20:07:50 +03:00
|
|
|
|
2017-04-01 19:57:58 +03:00
|
|
|
# Convert corresponding FLAC to a WAV
|
|
|
|
flac_file = os.path.join(root, seqid + ".flac")
|
|
|
|
wav_file = os.path.join(target_dir, seqid + ".wav")
|
|
|
|
if not os.path.exists(wav_file):
|
2019-06-23 21:47:12 +03:00
|
|
|
tfm = Transformer()
|
|
|
|
tfm.set_output_format(rate=SAMPLE_RATE)
|
|
|
|
tfm.build(flac_file, wav_file)
|
2017-04-01 19:57:58 +03:00
|
|
|
wav_filesize = os.path.getsize(wav_file)
|
|
|
|
|
2017-04-26 05:58:13 +03:00
|
|
|
files.append((os.path.abspath(wav_file), wav_filesize, transcript))
|
2017-04-01 19:57:58 +03:00
|
|
|
|
|
|
|
return pandas.DataFrame(data=files, columns=["wav_filename", "wav_filesize", "transcript"])
|
|
|
|
|
2017-04-01 20:35:46 +03:00
|
|
|
if __name__ == "__main__":
    # The first command-line argument is the directory that receives the
    # downloaded archives, the extracted data and the generated CSV files.
    target_data_dir = sys.argv[1]
    _download_and_preprocess_data(target_data_dir)
|