rename audio._normalize and audio._denormalize to audio.normalize and audio.denormalize

This commit is contained in:
Eren Gölge 2021-01-22 02:33:19 +01:00
Родитель c990b3a59c
Коммит ca8ad9c21e
6 изменённых файлов: 33 добавлений и 33 удалений

Просмотреть файл

@ -109,7 +109,7 @@ class AudioProcessor(object):
return hop_length, win_length
### normalization ###
def _normalize(self, S):
def normalize(self, S):
"""Put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]"""
#pylint: disable=no-else-return
S = S.copy()
@ -138,7 +138,7 @@ class AudioProcessor(object):
else:
return S
def _denormalize(self, S):
def denormalize(self, S):
"""denormalize values"""
#pylint: disable=no-else-return
S_denorm = S.copy()
@ -223,7 +223,7 @@ class AudioProcessor(object):
else:
D = self._stft(y)
S = self._amp_to_db(np.abs(D))
return self._normalize(S)
return self.normalize(S)
def melspectrogram(self, y):
if self.preemphasis != 0:
@ -231,11 +231,11 @@ class AudioProcessor(object):
else:
D = self._stft(y)
S = self._amp_to_db(self._linear_to_mel(np.abs(D)))
return self._normalize(S)
return self.normalize(S)
def inv_spectrogram(self, spectrogram):
"""Converts spectrogram to waveform using librosa"""
S = self._denormalize(spectrogram)
S = self.denormalize(spectrogram)
S = self._db_to_amp(S)
# Reconstruct phase
if self.preemphasis != 0:
@ -244,7 +244,7 @@ class AudioProcessor(object):
def inv_melspectrogram(self, mel_spectrogram):
'''Converts melspectrogram to waveform using librosa'''
D = self._denormalize(mel_spectrogram)
D = self.denormalize(mel_spectrogram)
S = self._db_to_amp(D)
S = self._mel_to_linear(S) # Convert back to linear
if self.preemphasis != 0:
@ -252,11 +252,11 @@ class AudioProcessor(object):
return self._griffin_lim(S**self.power)
def out_linear_to_mel(self, linear_spec):
S = self._denormalize(linear_spec)
S = self.denormalize(linear_spec)
S = self._db_to_amp(S)
S = self._linear_to_mel(np.abs(S))
S = self._amp_to_db(S)
mel = self._normalize(S)
mel = self.normalize(S)
return mel
### STFT and ISTFT ###

Просмотреть файл

@ -112,7 +112,7 @@
" t_1 = time.time()\n",
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
" truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
" # mel_postnet_spec = ap._denormalize(mel_postnet_spec.T)\n",
" # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n",
" if not use_gl:\n",
" waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
" waveform = waveform.flatten()\n",

Просмотреть файл

@ -112,7 +112,7 @@
" t_1 = time.time()\n",
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
" truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
" # mel_postnet_spec = ap._denormalize(mel_postnet_spec.T)\n",
" # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n",
" if not use_gl:\n",
" waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
" waveform = waveform.flatten()\n",

Просмотреть файл

@ -230,8 +230,8 @@
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-18-91e8914b5c6a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mspec\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAP\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwav\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Max:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Min:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Mean:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mplot_spectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mAP\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m;\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36mspectrogram\u001b[0;34m(self, y)\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0mD\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_stft\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_amp_to_db\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mabs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mD\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 220\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_normalize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 221\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmelspectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36m_normalize\u001b[0;34m(self, S)\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinear_scaler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' [!] Mean-Var stats does not match the given feature dimensions.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 120\u001b[0m \u001b[0;31m# range normalization\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mref_level_db\u001b[0m \u001b[0;31m# discard certain range of DB assuming it is air noise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36mspectrogram\u001b[0;34m(self, y)\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0mD\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_stft\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_amp_to_db\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mabs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mD\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 220\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnormalize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 221\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmelspectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36mnormalize\u001b[0;34m(self, S)\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinear_scaler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' [!] Mean-Var stats does not match the given feature dimensions.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 120\u001b[0m \u001b[0;31m# range normalization\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mref_level_db\u001b[0m \u001b[0;31m# discard certain range of DB assuming it is air noise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mRuntimeError\u001b[0m: [!] Mean-Var stats does not match the given feature dimensions."
]
}
@ -314,7 +314,7 @@
" exec(set_val_cmd)\n",
" wav = AP.load_wav(file)\n",
" spec = AP.spectrogram(wav)\n",
" spec_norm = AP._denormalize(spec.T)\n",
" spec_norm = AP.denormalize(spec.T)\n",
" plt.subplot(len(values), 2, 2*idx + 1)\n",
" plt.imshow(spec_norm.T, aspect=\"auto\", origin=\"lower\")\n",
" # plt.colorbar()\n",

Просмотреть файл

@ -106,10 +106,10 @@ def pip_install(package_name):
reqs_from_file = open('requirements.txt').readlines()
reqs_without_tf = [r for r in reqs_from_file if not r.startswith('tensorflow')]
tf_req = [r for r in reqs_from_file if r.startswith('tensorflow')]
# reqs_without_tf = [r for r in reqs_from_file if not r.startswith('tensorflow')]
# tf_req = [r for r in reqs_from_file if r.startswith('tensorflow')]
requirements = {'install_requires': reqs_without_tf, 'pip_install': tf_req}
# requirements = {'install_requires': reqs_without_tf, 'pip_install': tf_req}
setup(
name='TTS',
@ -132,7 +132,7 @@ setup(
'build_py': build_py,
'develop': develop,
},
install_requires=requirements['install_requires'],
install_requires=reqs_from_file,
python_requires='>=3.6.0',
classifiers=[
"Programming Language :: Python",
@ -149,6 +149,6 @@ setup(
# for some reason having tensorflow in 'install_requires'
# breaks some of the dependencies.
if 'bdist_wheel' not in unknown_args:
for module in requirements['pip_install']:
pip_install(module)
# if 'bdist_wheel' not in unknown_args:
# for module in requirements['pip_install']:
# pip_install(module)

Просмотреть файл

@ -67,21 +67,21 @@ class TestAudio(unittest.TestCase):
self.ap.symmetric_norm = False
self.ap.clip_norm = False
self.ap.max_norm = 4.0
x_norm = self.ap._normalize(x)
x_norm = self.ap.normalize(x)
print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
assert (x_old - x).sum() == 0
# check value range
assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
assert x_norm.min() >= 0 - 1, x_norm.min()
# check denorm.
x_ = self.ap._denormalize(x_norm)
x_ = self.ap.denormalize(x_norm)
assert (x - x_).sum() < 1e-3, (x - x_).mean()
self.ap.signal_norm = True
self.ap.symmetric_norm = False
self.ap.clip_norm = True
self.ap.max_norm = 4.0
x_norm = self.ap._normalize(x)
x_norm = self.ap.normalize(x)
print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
@ -90,14 +90,14 @@ class TestAudio(unittest.TestCase):
assert x_norm.max() <= self.ap.max_norm, x_norm.max()
assert x_norm.min() >= 0, x_norm.min()
# check denorm.
x_ = self.ap._denormalize(x_norm)
x_ = self.ap.denormalize(x_norm)
assert (x - x_).sum() < 1e-3, (x - x_).mean()
self.ap.signal_norm = True
self.ap.symmetric_norm = True
self.ap.clip_norm = False
self.ap.max_norm = 4.0
x_norm = self.ap._normalize(x)
x_norm = self.ap.normalize(x)
print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
@ -107,14 +107,14 @@ class TestAudio(unittest.TestCase):
assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min() #pylint: disable=invalid-unary-operand-type
assert x_norm.min() <= 0, x_norm.min()
# check denorm.
x_ = self.ap._denormalize(x_norm)
x_ = self.ap.denormalize(x_norm)
assert (x - x_).sum() < 1e-3, (x - x_).mean()
self.ap.signal_norm = True
self.ap.symmetric_norm = True
self.ap.clip_norm = True
self.ap.max_norm = 4.0
x_norm = self.ap._normalize(x)
x_norm = self.ap.normalize(x)
print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
@ -124,26 +124,26 @@ class TestAudio(unittest.TestCase):
assert x_norm.min() >= -self.ap.max_norm, x_norm.min() #pylint: disable=invalid-unary-operand-type
assert x_norm.min() <= 0, x_norm.min()
# check denorm.
x_ = self.ap._denormalize(x_norm)
x_ = self.ap.denormalize(x_norm)
assert (x - x_).sum() < 1e-3, (x - x_).mean()
self.ap.signal_norm = True
self.ap.symmetric_norm = False
self.ap.max_norm = 1.0
x_norm = self.ap._normalize(x)
x_norm = self.ap.normalize(x)
print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
assert (x_old - x).sum() == 0
assert x_norm.max() <= self.ap.max_norm, x_norm.max()
assert x_norm.min() >= 0, x_norm.min()
x_ = self.ap._denormalize(x_norm)
x_ = self.ap.denormalize(x_norm)
assert (x - x_).sum() < 1e-3
self.ap.signal_norm = True
self.ap.symmetric_norm = True
self.ap.max_norm = 1.0
x_norm = self.ap._normalize(x)
x_norm = self.ap.normalize(x)
print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
@ -151,7 +151,7 @@ class TestAudio(unittest.TestCase):
assert x_norm.max() <= self.ap.max_norm, x_norm.max()
assert x_norm.min() >= -self.ap.max_norm, x_norm.min() #pylint: disable=invalid-unary-operand-type
assert x_norm.min() < 0, x_norm.min()
x_ = self.ap._denormalize(x_norm)
x_ = self.ap.denormalize(x_norm)
assert (x - x_).sum() < 1e-3
def test_scaler(self):
@ -172,5 +172,5 @@ class TestAudio(unittest.TestCase):
wav = self.ap.load_wav(WAV_FILE)
mel_reference = self.ap.melspectrogram(wav)
mel_norm = ap.melspectrogram(wav)
mel_denorm = ap._denormalize(mel_norm)
mel_denorm = ap.denormalize(mel_norm)
assert abs(mel_reference - mel_denorm).max() < 1e-4