From 0ef3c0ac3f80eb4970c946a87b27b0170bbea110 Mon Sep 17 00:00:00 2001 From: Eren G Date: Fri, 13 Jul 2018 14:56:05 +0200 Subject: [PATCH] Remove preemphasis from audio processing --- datasets/LJSpeech.py | 5 +++-- train.py | 6 +++++- utils/audio.py | 12 ++++++------ utils/generic_utils.py | 31 +++++++++++++++---------------- utils/model.py | 9 --------- 5 files changed, 29 insertions(+), 34 deletions(-) delete mode 100644 utils/model.py diff --git a/datasets/LJSpeech.py b/datasets/LJSpeech.py index 0c90ab7..b313d8a 100644 --- a/datasets/LJSpeech.py +++ b/datasets/LJSpeech.py @@ -16,7 +16,7 @@ class LJSpeechDataset(Dataset): def __init__(self, csv_file, root_dir, outputs_per_step, sample_rate, text_cleaner, num_mels, min_level_db, frame_shift_ms, frame_length_ms, preemphasis, ref_level_db, num_freq, power, - min_seq_len=0): + min_mel_freq, max_mel_freq, min_seq_len=0): with open(csv_file, "r", encoding="utf8") as f: self.frames = [line.split('|') for line in f] @@ -26,7 +26,8 @@ class LJSpeechDataset(Dataset): self.cleaners = text_cleaner self.min_seq_len = min_seq_len self.ap = AudioProcessor(sample_rate, num_mels, min_level_db, frame_shift_ms, - frame_length_ms, preemphasis, ref_level_db, num_freq, power) + frame_length_ms, preemphasis, ref_level_db, num_freq, power, + min_mel_freq, max_mel_freq) print(" > Reading LJSpeech from - {}".format(root_dir)) print(" | > Number of instances : {}".format(len(self.frames))) self._sort_frames() diff --git a/train.py b/train.py index 113346b..ec33927 100644 --- a/train.py +++ b/train.py @@ -352,6 +352,8 @@ def main(args): c.ref_level_db, c.num_freq, c.power, + c.min_mel_freq, + c.max_mel_freq, min_seq_len=c.min_seq_len ) @@ -372,7 +374,9 @@ def main(args): c.preemphasis, c.ref_level_db, c.num_freq, - c.power + c.power, + c.min_mel_freq, + c.max_mel_freq ) val_loader = DataLoader(val_dataset, batch_size=c.eval_batch_size, diff --git a/utils/audio.py b/utils/audio.py index 6595b41..a47fb7f 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -59,11 +59,11 @@ class AudioProcessor(object): def _db_to_amp(self, x): return np.power(10.0, x * 0.05) - def apply_preemphasis(self, x): - return signal.lfilter([1, -self.preemphasis], [1], x) - - def apply_inv_preemphasis(self, x): - return signal.lfilter([1], [1, -self.preemphasis], x) + # def apply_preemphasis(self, x): + # return signal.lfilter([1, -self.preemphasis], [1], x) + # + # def apply_inv_preemphasis(self, x): + # return signal.lfilter([1], [1, -self.preemphasis], x) def spectrogram(self, y): # D = self._stft(self.apply_preemphasis(y)) @@ -105,7 +105,7 @@ class AudioProcessor(object): return y def melspectrogram(self, y): - D = self._stft(self.apply_preemphasis(y)) + D = self._stft(y) S = self._amp_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db return self._normalize(S) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index c1ef418..9490581 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -40,7 +40,7 @@ def create_experiment_folder(root_path, model_name, debug): date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I:%M%p") if debug: commit_hash = 'debug' - else: + else: commit_hash = get_commit_hash() output_folder = os.path.join(root_path, date_str + '-' + model_name + '-' + commit_hash) os.makedirs(output_folder, exist_ok=True) @@ -135,21 +135,6 @@ def lr_decay(init_lr, global_step, warmup_steps): return lr -def create_attn_mask(N, T, g=0.05): - r'''creating attn mask for guided attention - TODO: vectorize''' - M = np.zeros([N, T]) - for t in range(T): - for n in range(N): - val = 20 * np.exp(-pow((n/N)-(t/T), 2.0)/g) - M[n, t] = val - e_x = np.exp(M - np.max(M)) - M = e_x / e_x.sum(axis=0) # only difference - M = torch.FloatTensor(M).t().cuda() - M = torch.stack([M]*32) - return M - - def mk_decay(init_mk, max_epoch, n_epoch): return init_mk * ((max_epoch - n_epoch) / max_epoch) @@ -159,6 +144,20 @@ def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) +# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 +def sequence_mask(sequence_length, max_len=None): + if max_len is None: + max_len = sequence_length.data.max() + batch_size = sequence_length.size(0) + seq_range = torch.arange(0, max_len).long() + seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) + if sequence_length.is_cuda: + seq_range_expand = seq_range_expand.cuda() + seq_length_expand = (sequence_length.unsqueeze(1) + .expand_as(seq_range_expand)) + return seq_range_expand < seq_length_expand + + class Progbar(object): """Displays a progress bar. Args: diff --git a/utils/model.py b/utils/model.py deleted file mode 100644 index d75d61a..0000000 --- a/utils/model.py +++ /dev/null @@ -1,9 +0,0 @@ - -def get_param_size(model): - params = 0 - for p in model.parameters(): - tmp = 1 - for x in p.size(): - tmp *= x - params += tmp - return params