Exporter to DeepSpeech CSV files

This commit is contained in:
Tilman Kamp 2019-09-16 18:12:54 +02:00
Parent a08b1e85d9
Commit cb695226fc
6 changed files with 385 additions and 1 deletion

3
.gitignore vendored
View file

@ -1,6 +1,7 @@
models
data
dependencies
data/test*
data/export
.idea

362
align/export.py Normal file
View file

@ -0,0 +1,362 @@
import os
import sys
import csv
import math
import json
import random
import logging
import argparse
import statistics
import os.path as path
from tqdm import tqdm
from pydub import AudioSegment
from collections import Counter
from multiprocessing import Pool
# Placeholder group key used when a sample lacks the requested meta field.
unknown = '<unknown>'


def fail(message, code=1):
    """Log a fatal message and terminate the process.

    :param message: text logged at FATAL level before exiting
    :param code: process exit status (default 1)
    """
    logging.fatal(message)
    # sys.exit instead of the site-injected exit() builtin, which is not
    # guaranteed to exist when the interpreter runs without the site module.
    sys.exit(code)
def engroup(lst, get_key):
    """Group the items of *lst* into a dict keyed by ``get_key(item)``.

    Keys appear in first-seen order; items within each group keep the
    order they had in *lst*.
    """
    grouped = {}
    for item in lst:
        grouped.setdefault(get_key(item), []).append(item)
    return grouped
def get_set_sizes(population_size):
    """Compute (train_size, sample_size) for a train/dev/test split.

    ``sample_size`` is a statistically representative sample size for the
    candidate training population (99% confidence, 1% margin of error,
    worst-case 50% proportion). Dev and test each receive ``sample_size``
    entries; the remainder is the training set.
    """
    margin_of_error = 0.01
    fraction_picking = 0.50
    z_score = 2.58  # Corresponds to confidence level 99%
    variance_term = z_score ** 2 * fraction_picking * (1 - fraction_picking)
    numerator = variance_term / (margin_of_error ** 2)
    sample_size = 0
    # Shrink the candidate training size until two samples of the required
    # size plus the training set fit into the population.
    for train_size in range(population_size, 0, -1):
        denominator = 1 + variance_term / (margin_of_error ** 2 * train_size)
        sample_size = int(numerator / denominator)
        if 2 * sample_size + train_size <= population_size:
            break
    return population_size - 2 * sample_size, sample_size
def load_segment(audio_path):
    """Load the audio file at *audio_path*; returns (path, AudioSegment)."""
    segment = AudioSegment.from_file(audio_path)
    return audio_path, segment
def load_segment_dry(audio_path):
    """Dry-run stand-in for load_segment.

    Verifies that *audio_path* exists (aborting the process otherwise) and
    returns (path, empty AudioSegment) without reading any audio data.
    """
    if not path.isfile(audio_path):
        fail('File not found: "{}"'.format(audio_path))
    logging.debug('Would load file "{}"'.format(audio_path))
    return audio_path, AudioSegment.empty()
def main(args):
    """Export aligned speech fragments as WAV samples plus set lists.

    Reads alignment results (either one --audio/--aligned pair or a
    --catalog of such pairs), optionally filters, rates, debiases and
    partitions the fragments, slices the audio into per-sample WAV files
    and writes CSV or JSON sample lists into --target-dir.

    :param args: argument list (e.g. sys.argv[1:]) passed to argparse
    """
    parser = argparse.ArgumentParser(description='Export aligned speech samples.')
    parser.add_argument('--audio', type=str,
                        help='Take audio file as input (requires "--aligned <file>")')
    parser.add_argument('--aligned', type=str,
                        help='Take alignment file ("<...>.aligned") as input (requires "--audio <file>")')
    parser.add_argument('--catalog', type=str,
                        help='Take alignment and audio file references of provided catalog ("<...>.catalog") as input')
    parser.add_argument('--ignore-missing', action="store_true",
                        help='Ignores catalog entries with missing files')
    parser.add_argument('--target-dir', type=str, required=True,
                        help='Existing target directory for storing generated sets (files and directories)')
    parser.add_argument('--filter', type=str,
                        help='Python expression that computes a boolean value from sample data fields. '
                             'If the result is True, the sample will be dropped.')
    parser.add_argument('--criteria', type=str, default='100',
                        help='Python expression that computes a number as quality indicator from sample data fields.')
    parser.add_argument('--partition', type=str, action='append',
                        help='Expression of the form "<number>:<partition>" where all samples with a quality indicator '
                             '(--criteria) above or equal the given number and below the next bigger one are assigned '
                             'to the specified partition. Samples below the lowest partition criteria are assigned to '
                             'partition "other".')
    parser.add_argument('--split', action="store_true",
                        help='Split each partition except "other" into train/dev/test sub-sets.')
    parser.add_argument('--split-field', type=str,
                        help='Sample meta field that should be used for splitting (e.g. "speaker")')
    parser.add_argument('--split-seed', type=int,
                        help='Random seed for set splitting')
    parser.add_argument('--debias', type=str, action='append',
                        help='Sample meta field to group samples for debiasing (e.g. "speaker"). '
                             'Group sizes will be capped according to --debias-sigma-factor')
    parser.add_argument('--debias-sigma-factor', type=float, default=3.0,
                        help='Standard deviation (sigma) factor after which the sample number of a group gets capped')
    parser.add_argument('--loglevel', type=int, default=20,
                        help='Log level (between 0 and 50) - default: 20')
    parser.add_argument('--no-progress', action="store_true",
                        help='Prevents showing progress bars')
    parser.add_argument('--format', type=str, default='csv',
                        help='Sample list format - one of (json|csv)')
    parser.add_argument('--rate', type=int,
                        help='Export wav-files with this sample rate')
    parser.add_argument('--channels', type=int,
                        help='Export wav-files with this number of channels')
    parser.add_argument('--force', action="store_true",
                        help='Overwrite existing files')
    parser.add_argument('--dry-run', action="store_true",
                        help='Simulates export without writing or creating any file or directory')
    parser.add_argument('--dry-run-fast', action="store_true",
                        help='Simulates export without writing or creating any file or directory. '
                             'In contrast to --dry-run this faster simulation will not load samples.')
    parser.add_argument('--workers', type=int, default=None,
                        help='Number of workers for loading and re-sampling audio files. Default: Number of CPUs')
    parser.add_argument('--pretty', action="store_true",
                        help='Writes indented JSON output')
    # Fix: parse the argument list handed to main() instead of implicitly
    # re-reading sys.argv (argparse falls back to sys.argv[1:] when the
    # list is None, so behavior for the __main__ caller is unchanged).
    args = parser.parse_args(args)
    logging.basicConfig(stream=sys.stderr, level=args.loglevel if args.loglevel else 20)

    def progress(iterable, desc='Processing', total=None):
        # Wraps an iterable in a tqdm progress bar unless --no-progress is set.
        # Parameter renamed from "iter" to avoid shadowing the builtin.
        desc = desc.rjust(24)
        return iterable if args.no_progress else tqdm(iterable, desc=desc, total=total, ncols=120)

    logging.debug("Start")

    pairs = []  # (audio file, alignment file) tuples to export from

    def check_path(target_path, fs_type='file'):
        # Aborts the process if the path does not exist; returns its absolute form.
        if not (path.isfile(target_path) if fs_type == 'file' else path.isdir(target_path)):
            logging.fatal('{} not existing: "{}"'.format(fs_type[0].upper() + fs_type[1:], target_path))
            sys.exit(1)
        return path.abspath(target_path)

    def make_absolute(base_path, spec_path):
        # Resolves spec_path relative to base_path; None if the file is missing.
        if not path.isabs(spec_path):
            spec_path = path.join(base_path, spec_path)
        spec_path = path.abspath(spec_path)
        return spec_path if path.isfile(spec_path) else None

    target_dir = check_path(args.target_dir, fs_type='directory')

    if args.audio:
        if args.aligned:
            pairs.append((check_path(args.audio), check_path(args.aligned)))
        else:
            fail('If you specify "--audio", you also have to specify "--aligned"')
    elif args.aligned:
        fail('If you specify "--aligned", you also have to specify "--audio"')
    elif args.catalog:
        catalog = check_path(args.catalog)
        catalog_dir = path.dirname(catalog)
        with open(catalog, 'r') as catalog_file:
            catalog_entries = json.load(catalog_file)
        for entry in progress(catalog_entries, desc='Reading catalog'):
            audio = make_absolute(catalog_dir, entry['audio'])
            aligned = make_absolute(catalog_dir, entry['aligned'])
            if audio is None or aligned is None:
                if args.ignore_missing:
                    continue
                if audio is None:
                    fail('Problem loading catalog "{}": Missing referenced audio file "{}"'
                         .format(args.catalog, entry['audio']))
                if aligned is None:
                    fail('Problem loading catalog "{}": Missing referenced alignment file "{}"'
                         .format(args.catalog, entry['aligned']))
            pairs.append((audio, aligned))
    else:
        fail('You have to either specify "--audio" and "--aligned" or "--catalog"')

    dry_run = args.dry_run or args.dry_run_fast
    load_samples = not args.dry_run_fast

    partition_specs = []
    if args.partition is not None:
        for partition_expr in args.partition:
            parts = partition_expr.split(':')
            if len(parts) != 2:
                fail('Wrong partition specification: "{}"'.format(partition_expr))
            partition_specs.append((float(parts[0]), str(parts[1])))
    # Highest threshold first, so the first matching spec wins in get_partition.
    partition_specs.sort(key=lambda p: p[0], reverse=True)

    fragments = []
    for audio_path, aligned_path in progress(pairs, desc='Loading alignments'):
        with open(aligned_path, 'r') as aligned_file:
            aligned_fragments = json.load(aligned_file)
        for fragment in aligned_fragments:
            fragment['audio_path'] = audio_path
            fragments.append(fragment)

    if args.filter is not None:
        kept_fragments = []
        for fragment in progress(fragments, desc='Filtering'):
            # NOTE: eval of a user-supplied expression - acceptable for a local
            # CLI tool operating on its own data, but never expose this to
            # untrusted input.
            if not eval(args.filter, {'math': math}, fragment):
                kept_fragments.append(fragment)
        if len(kept_fragments) < len(fragments):
            logging.info('Filtered out {} samples'.format(len(fragments) - len(kept_fragments)))
        fragments = kept_fragments
        if len(fragments) == 0:
            fail('Filter left no samples')  # fixed duplicated word in message

    for fragment in progress(fragments, desc='Computing qualities'):
        fragment['quality'] = eval(args.criteria, {'math': math}, fragment)

    def get_meta(fragment, meta_field):
        # Returns the first value of the given meta field, or the
        # '<unknown>' placeholder when the field is absent.
        if 'meta' in fragment:
            meta = fragment['meta']
            if meta_field in meta:
                for value in meta[meta_field]:
                    return value
        return unknown

    if args.debias is not None:
        for debias in args.debias:
            grouped = engroup(fragments, lambda f: get_meta(f, debias))
            if unknown in grouped:
                # Samples without the meta field are kept unconditionally.
                fragments = grouped[unknown]
                del grouped[unknown]
            else:
                fragments = []
            counts = list(map(lambda f: len(f), grouped.values()))
            mean = statistics.mean(counts)
            sigma = statistics.pstdev(counts, mu=mean)
            cap = int(mean + args.debias_sigma_factor * sigma)
            counter = Counter()
            for group, values in progress(grouped.items(), desc='Debiasing "{}"'.format(debias)):
                if len(values) > cap:
                    # Keep only the "cap" highest-quality samples of the group.
                    values.sort(key=lambda g: g['quality'])
                    counter[group] += len(values) - cap
                    values = values[-cap:]
                fragments.extend(values)
            logging.info('Dropped for debiasing "{}":'.format(debias))
            for group, count in counter.most_common():
                logging.info(' - "{}": {}'.format(group, count))

    def get_partition(f):
        # Maps a fragment to the first partition whose threshold it reaches.
        quality = f['quality']
        for minimum, partition_name in partition_specs:
            if quality >= minimum:
                return partition_name
        return 'other'

    lists = {}

    def ensure_list(name):
        # Registers a sample list, refusing to clobber existing output
        # unless --force was given.
        lists[name] = []
        if not args.force:
            for p in [name, name + '.' + args.format]:
                if path.exists(path.join(target_dir, p)):
                    fail('"{}" already existing - use --force to ignore'.format(p))

    if args.split_seed is not None:
        random.seed(args.split_seed)

    partitions = engroup(fragments, get_partition)
    for partition, partition_fragments in partitions.items():
        logging.info('Partition "{}":'.format(partition))
        if not args.split or partition == 'other':
            ensure_list(partition)
            for fragment in partition_fragments:
                fragment['list-name'] = partition
            logging.info(' - samples: {}'.format(len(partition_fragments)))
        else:
            train_size, sample_size = get_set_sizes(len(partition_fragments))
            if args.split_field:
                # Fix: dict.values() is a view in Python 3 and has no .sort() -
                # materialize and sort the portions smallest-first instead.
                portions = sorted(engroup(partition_fragments,
                                          lambda f: get_meta(f, args.split_field)).values(),
                                  key=len)
                train_set, dev_set, test_set = [], [], []
                # Distribute whole portions (e.g. whole speakers) to dev/test
                # until they are big enough; everything else goes to train.
                for offset, sample_set in [(0, dev_set), (1, test_set)]:
                    for portion in portions[offset::2]:
                        if len(sample_set) < sample_size:
                            sample_set.extend(portion)
                        else:
                            train_set.extend(portion)
            else:
                random.shuffle(partition_fragments)
                test_set = partition_fragments[:sample_size]
                partition_fragments = partition_fragments[sample_size:]
                dev_set = partition_fragments[:sample_size]
                train_set = partition_fragments[sample_size:]
            for set_name, set_fragments in [('train', train_set), ('dev', dev_set), ('test', test_set)]:
                list_name = partition + '-' + set_name
                ensure_list(list_name)
                for fragment in set_fragments:
                    fragment['list-name'] = list_name
                logging.info(' - sub-set "{}" - samples: {}'.format(set_name, len(set_fragments)))

    for list_name in lists.keys():
        dir_name = path.join(target_dir, list_name)
        if not path.isdir(dir_name):
            if dry_run:
                logging.debug('Would create directory "{}"'.format(dir_name))
            else:
                os.mkdir(dir_name)

    def list_fragments():
        # Yields (audio segment, fragment) pairs, loading each audio file
        # exactly once and slicing it per fragment.
        audio_files = engroup(fragments, lambda f: f['audio_path'])
        ls = load_segment if load_samples else load_segment_dry
        # Fix: close the worker pool deterministically via context manager
        # instead of leaking it.
        with Pool(args.workers) as pool:
            for audio_path, audio in pool.imap_unordered(ls, audio_files.keys()):
                file_fragments = audio_files[audio_path]
                if args.channels is not None:
                    audio = audio.set_channels(args.channels)
                if args.rate is not None:
                    audio = audio.set_frame_rate(args.rate)
                file_fragments.sort(key=lambda f: f['start'])
                for fragment in file_fragments:
                    if load_samples:
                        yield audio[fragment['start']:fragment['end']], fragment
                    else:
                        yield audio, fragment

    for audio_segment, fragment in progress(list_fragments(), desc='Exporting samples', total=len(fragments)):
        list_name = fragment['list-name']
        group_list = lists[list_name]
        sample_name = 'sample-{:010d}.wav'.format(len(group_list))
        rel_path = path.join(list_name, sample_name)
        abs_path = path.join(target_dir, rel_path)
        if dry_run:
            logging.debug('Would write file "{}"'.format(abs_path))
            file_size = 0  # fix: file_size was unbound (NameError) on dry runs
        else:
            with open(abs_path, "wb") as wav_file:
                audio_segment.export(wav_file, format="wav")
                file_size = wav_file.tell()
        group_list.append((rel_path, file_size, fragment))

    for list_name, group_list in progress(lists.items(), desc='Writing lists'):
        if args.format == 'json':
            json_path = path.join(target_dir, list_name + '.json')
            if dry_run:
                logging.debug('Would write file "{}"'.format(json_path))
            else:
                entries = []
                for rel_path, file_size, fragment in group_list:
                    entry = {
                        'audio': rel_path,
                        'size': file_size,
                        'transcript': fragment['aligned'],
                        'duration': fragment['end'] - fragment['start']
                    }
                    if 'aligned-raw' in fragment:
                        entry['transcript-raw'] = fragment['aligned-raw']
                    entries.append(entry)
                with open(json_path, 'w') as json_file:
                    # Fix: honor the previously-unused --pretty flag.
                    json.dump(entries, json_file, indent=4 if args.pretty else None)
        else:
            csv_path = path.join(target_dir, list_name + '.csv')
            if dry_run:
                logging.debug('Would write file "{}"'.format(csv_path))
            else:
                # newline='' is required by the csv module to avoid blank rows
                # on platforms with \r\n line endings.
                with open(csv_path, 'w', newline='') as csv_file:
                    writer = csv.writer(csv_file)
                    # DeepSpeech importers expect this exact header row.
                    writer.writerow(['wav_filename', 'wav_filesize', 'transcript'])
                    for rel_path, file_size, fragment in group_list:
                        writer.writerow([rel_path, file_size, fragment['aligned']])
if __name__ == '__main__':
    main(sys.argv[1:])
    # Reset the terminal afterwards - tqdm progress bars can leave it in a
    # broken state (NOTE(review): only works on POSIX systems with stty).
    os.system('stty sane')

4
bin/export.sh Executable file
View file

@ -0,0 +1,4 @@
#!/usr/bin/env bash
# Resolve the repository root: two levels above this script's canonical path.
approot="$(dirname "$(dirname "$(readlink -fm "$0")")")"
# Activate the project's virtualenv, then forward all arguments to the exporter.
source "$approot/venv/bin/activate"
python "$approot/align/export.py" "$@"

4
bin/statistics.sh Executable file
View file

@ -0,0 +1,4 @@
#!/usr/bin/env bash
# Resolve the repository root: two levels above this script's canonical path.
approot="$(dirname "$(dirname "$(readlink -fm "$0")")")"
# Activate the project's virtualenv, then forward all arguments to the statistics tool.
source "$approot/venv/bin/activate"
python "$approot/align/statistics.py" "$@"

12
data/all.catalog Normal file
View file

@ -0,0 +1,12 @@
[
{
"audio": "test1/joined.mp3",
"text": "test1/transcript.txt",
"aligned": "test1/aligned.json"
},
{
"audio": "test2/joined.mp3",
"text": "test2/transcript.script",
"aligned": "test2/aligned.json"
}
]

View file

@ -3,3 +3,4 @@ deepspeech
webrtcvad
tqdm
textdistance
pydub