338 строки
11 KiB
Python
338 строки
11 KiB
Python
# Copyright (c) Microsoft Corporation.
|
|
# Licensed under the MIT license.
|
|
|
|
import numpy as np
|
|
import os
|
|
import re
|
|
import bisect
|
|
import glob
|
|
from scipy.io import wavfile
|
|
import sys
|
|
import multiprocessing
|
|
import itertools
|
|
import math, random
|
|
import functools
|
|
|
|
from tensorpack.dataflow import DataFlow, PrefetchDataZMQ, \
|
|
PrefetchData, MapDataComponent, MapData, BatchData
|
|
from tensorpack.dataflow.serialize import LMDBSerializer
|
|
dump_dataflow_to_lmdb = LMDBSerializer.save
|
|
|
|
def _load_list_in_file(fn):
    """Read the whitespace-separated entries of a text file.

    Args:
        fn (str) : path of the text file to read.

    Returns:
        list of str : every whitespace-delimited token in the file,
            sorted in ascending order.
    """
    with open(fn, 'rt') as list_file:
        # str.split() without arguments already ignores leading and
        # trailing whitespace.
        entries = list_file.read().split()
    entries.sort()
    return entries
|
|
|
|
def _is_fn_in_ll_fn(fn, ll_fn):
    """Check whether a file name occurs in any of several sorted lists.

    Args:
        fn (str) : the file name to look up.
        ll_fn (list of list of str) : lists of file names. Each inner list
            must be sorted in ascending order because it is binary-searched.

    Returns:
        bool : True if `fn` is an element of at least one inner list.
    """
    for l_fn in ll_fn:
        idx = bisect.bisect_left(l_fn, fn)
        # bisect_left returns len(l_fn) when fn sorts after every entry (or
        # when the list is empty), so the index must be checked before use.
        if idx < len(l_fn) and l_fn[idx] == fn:
            return True
    return False
|
|
|
|
def _full_fn_to_record_fn(full_fn):
    """Reduce a path to its last directory name joined with its file name.

    Args:
        full_fn (str) : path of a file.

    Returns:
        str : e.g. tree/c24d96eb_nohash_0.wav
    """
    parent_dir, file_name = os.path.split(full_fn)
    word_dir = os.path.basename(parent_dir)
    return os.path.join(word_dir, file_name)
|
|
|
|
|
|
# Words whose directories are scanned for recordings, in alphabetical order.
ALL_WORDS = sorted([
    'backward', 'bed', 'bird', 'cat', 'dog', 'down', 'eight',
    'five', 'follow', 'forward', 'four', 'go', 'happy', 'house',
    'learn', 'left', 'marvin', 'nine', 'no', 'off', 'on',
    'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three',
    'tree', 'two', 'up', 'visual', 'wow', 'yes', 'zero',
])

# Words that receive their own class label when no word list is supplied.
DEFAULT_TRAIN_WORDS = [
    "yes", "no", "up", "down", "left",
    "right", "on", "off", "stop", "go",
]
MAX_NUM_WAVS_PER_CLASS = (1 << 27) - 1  # ~134M

# Label names and indices of the two special classes.
SILENCE_LABEL = '_silence_'
SILENCE_INDEX = 0
UNKNOWN_WORD_LABEL = '_unknown_'
UNKNOWN_WORD_INDEX = 1

# Directory (and archive base name) that holds the background noise clips.
BACKGROUND_NOISE_DIR_NAME = '_background_noise_'

# Seed used when shuffling the pool of unknown-word clips.
RANDOM_SEED = 59185

# Frame rate the wav files are checked against, and the number of samples
# every clip is padded or clipped to.
SAMPLE_RATE = 16000
DESIRED_SAMPLES = 16000
|
|
|
|
|
|
class RawSpeechCommandsData(object):
    """Reader for the raw wav files of one split of the speech commands data.

    The constructor only collects file names. `load` reads the audio into
    memory and `save_to_pickles` writes it out as `.npz` archives.
    """

    def __init__(self, data_dir, split, train_words=None):
        """
        Args:
            data_dir (str) : dataset root. It must contain
                'testing_list.txt' and 'validation_list.txt'; the word
                directories and the background noise directory are globbed
                below it.
            split (str) : one of 'train'/'training', 'val'/'validation',
                'test'/'testing'.
            train_words (list of str or None) : accepted for interface
                compatibility; it does not influence which files are
                selected.

        Raises:
            ValueError : if `split` is not a known split name.
        """
        test_list = _load_list_in_file(os.path.join(data_dir, 'testing_list.txt'))
        val_list = _load_list_in_file(os.path.join(data_dir, 'validation_list.txt'))

        # Sorted so that the order (and hence the batches written by
        # save_to_pickles) does not depend on the file system.
        noise_pattern = os.path.join(data_dir, BACKGROUND_NOISE_DIR_NAME, '*.wav')
        l_background_noises = [
            _full_fn_to_record_fn(fn) for fn in sorted(glob.glob(noise_pattern))]

        if split in ['validation', 'val']:
            self.split = 'val'
            l_fn = val_list
        elif split in ['testing', 'test']:
            self.split = 'test'
            l_fn = test_list
        elif split in ['train', 'training']:
            self.split = 'train'
            # Everything that is listed for validation or testing must stay
            # out of the training split, otherwise the splits overlap.
            held_out = set(test_list)
            held_out.update(val_list)
            l_fn = []
            for word in ALL_WORDS:
                word_pattern = os.path.join(data_dir, word, '*.wav')
                for full_fn in sorted(glob.glob(word_pattern)):
                    record_fn = _full_fn_to_record_fn(full_fn)
                    # The list files use '/' as separator on every platform.
                    if record_fn.replace(os.sep, '/') not in held_out:
                        l_fn.append(record_fn)
        else:
            raise ValueError("Unknown split name {}".format(split))

        self.l_fn = l_fn
        self.l_background_noises = l_background_noises
        self._size = len(self.l_fn)
        self.data_dir = data_dir
        self.data = None

    def size(self):
        """Return the number of recordings in this split."""
        return self._size

    def _read_wav(self, fn):
        """Read one wav file given relative to `data_dir`; return its samples."""
        fps, x = wavfile.read(os.path.join(self.data_dir, fn))
        if fps != SAMPLE_RATE:
            print("MEOW!!!!! different frame rate")
        return x

    @staticmethod
    def _as_object_array(items):
        """Pack a list of arrays into a 1-d object array, one item per slot.

        Clips differ in length, and NumPy versions that refuse to build
        ragged arrays implicitly raise when `np.savez` is handed such a
        list directly.
        """
        packed = np.empty(len(items), dtype=object)
        for i, item in enumerate(items):
            packed[i] = item
        return packed

    def load(self):
        """Read all recordings (and, for 'train', the noise clips) into memory.

        Fills `self.data` with the sample arrays and `self.labels` with the
        name of the directory each file lives in. For the 'train' split
        `self.noises` is filled as well. Does nothing when the data has
        already been loaded.
        """
        if self.data is not None:
            return

        # Collect into locals first so that a failed read does not leave a
        # partially filled self.data behind that a retry would skip over.
        data = []
        labels = []
        for fn in self.l_fn:
            data.append(self._read_wav(fn))
            labels.append(os.path.basename(os.path.dirname(fn)))

        noises = None
        if self.split == 'train':
            noises = [self._read_wav(fn) for fn in self.l_background_noises]

        self.labels = labels
        if noises is not None:
            self.noises = noises
        self.data = data

    def save_to_pickles(self, dest_dir, n_batch=1):
        """Write the split to `.npz` archives inside `dest_dir`.

        The recordings go to '<split>_data.npz', or to
        '<split>_data_batch_<i>.npz' (i = 1..n_batch) when n_batch > 1;
        the last batch takes the remainder. For the 'train' split the noise
        clips are written to a separate archive. `data` and `noises` are
        stored as object arrays, so reading them back needs
        `np.load(..., allow_pickle=True)`.

        Args:
            dest_dir (str) : existing directory to write into.
            n_batch (int) : number of archives to spread the data over.

        Raises:
            ValueError : if `n_batch` is smaller than 1.
        """
        if n_batch < 1:
            raise ValueError("n_batch must be at least 1, got {}".format(n_batch))

        self.load()
        if self.split == 'train':
            np.savez(os.path.join(dest_dir, BACKGROUND_NOISE_DIR_NAME),
                     noises=self._as_object_array(self.noises))

        N = len(self.data)
        step = N // n_batch
        for si in range(n_batch):
            start = si * step
            # The last batch also takes the N % n_batch leftover items.
            end = N if si + 1 == n_batch else start + step
            postfix = "_batch_{}".format(si + 1) if n_batch > 1 else ""
            batch_name = self.split + '_data' + postfix
            np.savez(os.path.join(dest_dir, batch_name),
                     data=self._as_object_array(self.data[start:end]),
                     labels=self.labels[start:end])
|
|
|
|
|
|
def convert_to_npz():
    """Command-line entry point: convert the raw wav files to `.npz` archives.

    Reads `--data_dir` (dataset root) and `--dest_dir` (output directory)
    from the command line and saves the 'train', 'val' and 'test' splits.
    The 'train' split is spread over 4 archives, the other splits use one
    archive each.
    """
    import argparse
    parser = argparse.ArgumentParser()
    # Both paths are joined with file names later on; requiring them gives a
    # clear usage error instead of a TypeError from os.path.join(None, ...).
    parser.add_argument('--data_dir', type=str, required=True)
    parser.add_argument('--dest_dir', type=str, required=True)
    args = parser.parse_args()

    # np.savez does not create missing directories.
    if not os.path.isdir(args.dest_dir):
        os.makedirs(args.dest_dir)

    for split in ['train', 'val', 'test']:
        data = RawSpeechCommandsData(split=split, data_dir=args.data_dir)
        n_batch = 4 if split == 'train' else 1
        data.save_to_pickles(dest_dir=args.dest_dir, n_batch=n_batch)
|
|
|
|
|
|
class SpeechCommandsDataFlow(DataFlow):
    """DataFlow over the `.npz` archives of one split.

    Every data point is `[x, y]`, where `x` is the sample array of one clip
    as stored in the archive and `y` is its integer label.
    """

    def __init__(self, data_dir, split, shuffle=True, train_words=None,
                 silence_percentage=10., unknown_percentage=10.0):
        """
        Args:
            data_dir (str) : directory holding the '<split>_data*.npz'
                archives (and the background noise archive for 'train').
            split (str) : split name used as archive prefix. The noise clips
                are only loaded when it equals 'train'.
            shuffle (bool) : whether `get_data` visits the data points in
                random order.
            train_words (list of str or None) : words that receive their own
                label (2, 3, ... in list order). Every other word of
                ALL_WORDS is mapped to the unknown label. Defaults to
                DEFAULT_TRAIN_WORDS.
            silence_percentage (float) : number of all-zero clips to add, as
                a percentage of the number of train-word clips.
            unknown_percentage (float) : number of unknown-word clips to keep,
                as a percentage of the number of train-word clips.
        """
        self.shuffle = shuffle
        train_words = DEFAULT_TRAIN_WORDS if train_words is None else train_words
        self.train_words = train_words
        self.split = split

        words_to_label = dict()
        words_to_label[SILENCE_LABEL] = SILENCE_INDEX
        words_to_label[UNKNOWN_WORD_LABEL] = UNKNOWN_WORD_INDEX
        offset = 2
        for wi, word in enumerate(train_words):
            words_to_label[word] = wi + offset
        for word in ALL_WORDS:
            if word not in train_words:
                words_to_label[word] = UNKNOWN_WORD_INDEX

        # Sorted so that the seeded selection below picks the same clips on
        # every file system.
        l_npz = sorted(glob.glob(os.path.join(data_dir, '{}_data*.npz'.format(split))))

        # Separate train-word clips from the pool of background clips.
        self.xs = []
        self.ys = []
        unknown_xs = []
        for npz in l_npz:
            # NOTE: allow_pickle=True is needed to read object arrays, and
            # unpickling can execute code; only load trusted archives.
            archive = np.load(npz, encoding='bytes', allow_pickle=True)
            xs = list(archive['data'])
            ys = [words_to_label[self._label_to_str(label)]
                  for label in archive['labels']]
            for x, y in zip(xs, ys):
                if y == UNKNOWN_WORD_INDEX or y == SILENCE_INDEX:
                    unknown_xs.append(x)
                else:
                    self.xs.append(x)
                    self.ys.append(y)

        # compute unknown and silence sizes
        set_size = len(self.xs)
        unknown_size = int(math.ceil(set_size * unknown_percentage / 100.0))
        silence_size = int(math.ceil(set_size * silence_percentage / 100.0))

        # select backgrounds based on percentage
        random.seed(RANDOM_SEED)
        random.shuffle(unknown_xs)
        selected_unknown = unknown_xs[:unknown_size]
        self.xs.extend(selected_unknown)
        # The pool may hold fewer than unknown_size clips; add exactly one
        # label per selected clip so that xs and ys stay aligned.
        self.ys.extend([UNKNOWN_WORD_INDEX] * len(selected_unknown))

        # add silence based on percentage
        self.xs.extend(np.zeros([SAMPLE_RATE], dtype=np.int16)
                       for _ in range(silence_size))
        self.ys.extend([SILENCE_INDEX] * silence_size)

        self._size = len(self.xs)
        self.dps = [self.xs, self.ys]

        if split == 'train':
            noise_fn = os.path.join(data_dir, BACKGROUND_NOISE_DIR_NAME + '.npz')
            self.noises = list(
                np.load(noise_fn, encoding='bytes', allow_pickle=True)['noises'])

    @staticmethod
    def _label_to_str(label):
        """Return a stored label as text, decoding it if it is a byte string."""
        if isinstance(label, bytes):
            # str() on a bytes label would yield "b'yes'" under Python 3.
            return label.decode('utf-8')
        return str(label)

    def size(self):
        """Return the number of data points."""
        return self._size

    def get_data(self):
        """Yield every `[x, y]` data point once, shuffled if `self.shuffle`."""
        indices = list(range(self._size))
        if self.shuffle:
            np.random.shuffle(indices)

        for idx in indices:
            yield [dp[idx] for dp in self.dps]
|
|
|
|
|
|
def _to_float(x):
    """Rescale 16-bit integer samples to 32-bit floats.

    Args:
        x (np.ndarray) : array of dtype int16.

    Returns:
        np.ndarray : float32 array holding `x` divided by the largest
            int16 value.
    """
    assert x.dtype == np.int16
    int16_max = np.iinfo(np.int16).max
    as_float = x.astype(np.float32)
    return as_float / int16_max
|
|
|
|
|
|
def _pad_or_clip_to_desired_sample(x):
    """Bring a clip to exactly DESIRED_SAMPLES samples.

    A clip that is too long keeps its central DESIRED_SAMPLES samples; a
    clip that is too short is zero-padded on both sides, with the odd extra
    sample going to the right.

    Args:
        x (np.ndarray) : the samples of one clip.

    Returns:
        np.ndarray : `x` itself if it already has the desired length,
            otherwise the clipped or padded samples.
    """
    surplus = len(x) - DESIRED_SAMPLES
    if surplus == 0:
        return x

    if surplus > 0:
        # Drop surplus // 2 samples in front and the rest at the back.
        first = surplus // 2
        return x[first:first + DESIRED_SAMPLES]

    missing = -surplus
    left = missing // 2
    right = missing - left
    return np.pad(x, [left, right], mode='constant')
|
|
|
|
|
|
def _time_shift(x, time_shift_ms=100., sample_rate=None):
    """Shift a clip by a random offset in time, filling the gap with zeros.

    The offset is drawn uniformly from [-max_shift, max_shift) samples,
    where max_shift is `time_shift_ms` converted to samples. A positive
    offset delays the signal (zeros in front), a negative one advances it
    (zeros at the end). The length of the clip does not change.

    Args:
        x (np.ndarray) : 1-d float32 array of samples.
        time_shift_ms (float) : largest shift, in milliseconds.
        sample_rate (int or None) : samples per second of `x`. Defaults to
            SAMPLE_RATE.

    Returns:
        np.ndarray : the shifted samples, or `x` itself when the offset is 0.
    """
    assert x.dtype == np.float32
    if sample_rate is None:
        sample_rate = SAMPLE_RATE

    # milliseconds -> samples. The shift can never exceed the clip length.
    time_shift = int(time_shift_ms * sample_rate / 1000)
    time_shift = min(time_shift, len(x))

    if time_shift > 0:
        time_shift_amount = np.random.randint(-time_shift, time_shift)
    else:
        time_shift_amount = 0

    if time_shift_amount > 0:
        x = np.pad(x[:-time_shift_amount], [time_shift_amount, 0], mode='constant')
    elif time_shift_amount < 0:
        x = np.pad(x[-time_shift_amount:], [0, -time_shift_amount], mode='constant')
    return x
|
|
|
|
|
|
def _add_noise(d, noises):
    """Mix a randomly chosen stretch of background noise into one data point.

    A silence data point always receives noise, at a volume drawn from
    [0, 1). Any other data point receives noise with probability
    `background_frequency`, at a volume drawn from
    [0, background_volume_range). The mix is clipped to [-1, 1].

    Args:
        d (list) : `[x, y]`, where x is a 1-d float32 array of
            DESIRED_SAMPLES samples and y (int) is the label index.
        noises (list of np.ndarray) : background noise clips, each longer
            than DESIRED_SAMPLES samples. Integer clips are rescaled to
            [-1, 1] before mixing. May be empty, in which case nothing is
            mixed in.

    Returns:
        list : `[x, y]` with the float32 mixed signal and the unchanged label.

    Raises:
        ValueError : if the selected noise clip is not longer than
            DESIRED_SAMPLES samples.
    """
    x, y = d
    assert x.dtype == np.float32
    desired_samples = DESIRED_SAMPLES
    background_frequency = 0.8
    background_volume_range = 0.1

    # random sample noise
    if len(noises) > 0:
        background_index = np.random.randint(len(noises))
        background_samples = noises[background_index]
        if len(background_samples) <= desired_samples:
            raise ValueError(
                'Background sample is too short! Need more than %d'
                ' samples but only %d were found' %
                (desired_samples, len(background_samples)))
        background_offset = np.random.randint(
            0, len(background_samples) - desired_samples)
        background_clipped = background_samples[background_offset:(
            background_offset + desired_samples)]
        if np.issubdtype(background_clipped.dtype, np.integer):
            # Raw PCM noise must be brought to the same [-1, 1] scale as x,
            # otherwise even a tiny volume saturates the clip below.
            full_scale = np.iinfo(background_clipped.dtype).max
            background_clipped = background_clipped.astype(np.float32) / full_scale
        background_reshaped = background_clipped.reshape([desired_samples])

        if y == SILENCE_INDEX:
            background_volume = np.random.uniform(0, 1)
        elif np.random.uniform(0, 1) < background_frequency:
            background_volume = np.random.uniform(0, background_volume_range)
        else:
            background_volume = 0
    else:
        background_reshaped = np.zeros([desired_samples], dtype=np.float32)
        background_volume = 0

    # Merge noise with data. The cast keeps the float32 dtype of the input
    # even when the noise clip has a wider float type.
    x = np.clip(x + background_volume * background_reshaped, -1.0, 1.0)
    return [x.astype(np.float32), y]
|
|
|
|
|
|
def get_augmented_speech_commands_data(subset, options,
        do_multiprocess=True, shuffle=True):
    """Build the batched dataflow of one subset of the speech commands data.

    Every clip is padded or clipped to the desired length and converted to
    float. Random time shifting and background noise are added only when
    `subset` is 'train' and `do_multiprocess` is set.

    Args:
        subset (str) : split name passed on to SpeechCommandsDataFlow.
        options : object with the attributes `data_dir`, `batch_size` and
            `nr_gpu`. The archives are read from
            `<options.data_dir>/speech_commands_v0.02`.
        do_multiprocess (bool) : whether to wrap the result in PrefetchData.
        shuffle (bool or None) : whether to shuffle. None means "shuffle
            exactly when augmenting".

    Returns:
        DataFlow : batches of `options.batch_size // options.nr_gpu` data
            points. The incomplete last batch is kept unless augmenting.
    """
    is_train = subset == 'train' and do_multiprocess
    if shuffle is None:
        shuffle = is_train

    data_root = os.path.join(options.data_dir, 'speech_commands_v0.02')
    flow = SpeechCommandsDataFlow(data_root, subset, shuffle, None)
    # Only the raw flow exposes the noise clips, so grab them before it is
    # wrapped by the mapping stages.
    noise_clips = flow.noises if is_train else None

    flow = MapDataComponent(flow, _pad_or_clip_to_desired_sample, index=0)
    flow = MapDataComponent(flow, _to_float, index=0)
    if is_train:
        flow = MapDataComponent(flow, _time_shift, index=0)
        flow = MapData(flow, functools.partial(_add_noise, noises=noise_clips))

    per_gpu_batch = options.batch_size // options.nr_gpu
    flow = BatchData(flow, per_gpu_batch, remainder=not is_train)
    if do_multiprocess:
        flow = PrefetchData(flow, 4, 4)
    return flow
|
|
|
|
|
|
def _test_get_augmented_data():
    """Build the augmented, shuffled training dataflow from the command line.

    Reads `--data_dir` from the command line and uses a batch size of 100
    on a single GPU.

    Returns:
        DataFlow : result of get_augmented_speech_commands_data.
    """
    import argparse
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--data_dir', type=str, default=None)
    options = arg_parser.parse_args()
    # Settings the builder reads from the options object.
    options.batch_size = 100
    options.nr_gpu = 1
    return get_augmented_speech_commands_data('train', options, True, True)
|
|
|
|
if __name__ == '__main__':
    # Call convert_to_npz() here instead to build the .npz archives from
    # the raw wav files.
    ret = _test_get_augmented_data()
|