This commit is contained in:
chandan 2020-03-14 23:41:32 +00:00
Родитель d1f3ef3066 5fd2a7f421
Коммит 714091548d
9 изменённых файлов: 89 добавлений и 43 удалений

Просмотреть файл

@ -1,6 +1,6 @@
# Noise Suppression Net (NSNet) baseline inference script # Noise Suppression Net (NSNet) baseline inference script
* As a baseline for the Interspeech 2020 Deep Noise Suppression challenge, we will use the recently developed speech enhancement (SE) method based on a Recurrent Neural Network (RNN). For ease of reference, we will refer to this method as Noise Suppression Net (NSNet). * As a baseline for the Interspeech 2020 Deep Noise Suppression challenge, we will use the recently developed speech enhancement (SE) method based on a Recurrent Neural Network (RNN). For ease of reference, we will refer to this method as Noise Suppression Net (NSNet). The details about this method can be found in the [published paper](https://arxiv.org/pdf/2001.10601.pdf).
* This method uses log power spectra as input to predict the enhancement gain per frame using a learning machine based on Gated Recurrent Units (GRU) and fully connected layers. Please refer to the paper for more details of the method. * This method uses log power spectra as input to predict the enhancement gain per frame using a learning machine based on Gated Recurrent Units (GRU) and fully connected layers. Please refer to the paper for more details of the method.
* NSNet is computationally efficient. It takes only 0.16 ms to enhance a 20 ms frame on an Intel quad-core i5 machine using ONNX Runtime v1.1. * NSNet is computationally efficient. It takes only 0.16 ms to enhance a 20 ms frame on an Intel quad-core i5 machine using ONNX Runtime v1.1.
@ -22,3 +22,14 @@ From the NSNet-baseline directory, run nsnet_eval_local.py with the following re
- --modelpath "Specify the path to the onnx model provided" - --modelpath "Specify the path to the onnx model provided"
Use default values for the rest. Run the script to enhance the clips. Use default values for the rest. Run the script to enhance the clips.
## Citation:
The baseline NSNet noise suppression:<br />
@misc{xia2020weighted,<br />
title={Weighted Speech Distortion Losses for Neural-network-based Real-time Speech Enhancement},<br />
author={Yangyang Xia and Sebastian Braun and Chandan K. A. Reddy and Harishchandra Dubey and Ross Cutler and Ivan Tashev},<br />
year={2020},<br />
eprint={2001.10601},<br />
archivePrefix={arXiv},<br />
primaryClass={eess.AS}<br />
}

Двоичный файл не отображается.

Двоичные данные
NSNet-baseline/__pycache__/onnx.cpython-36.pyc

Двоичный файл не отображается.

Просмотреть файл

@ -62,10 +62,10 @@ def _main():
logging.debug("NSNet local workers start with %d input files", len(input_filelist)) logging.debug("NSNet local workers start with %d input files", len(input_filelist))
with concurrent.futures.ThreadPoolExecutor(max_workers=args.num_workers) as executor: # with concurrent.futures.ThreadPoolExecutor(max_workers=args.num_workers) as executor:
executor.map(worker, input_filelist, chunksize=args.chunksize) # executor.map(worker, input_filelist, chunksize=args.chunksize)
# for fname in input_filelist: for fname in input_filelist:
# worker(fname) worker(fname)
logging.info("NSNet local workers complete") logging.info("NSNet local workers complete")

Просмотреть файл

@ -100,7 +100,7 @@ class NSNetInference:
mask = model_outputs[0].squeeze() mask = model_outputs[0].squeeze()
x_enh = audiolib.istft( x_enh = audiolib.istft(
(xmag * mask) * xphs, sample_rate, self.wind, self.dft_size, zphase=False) (xmag * mask) * xphs, sample_rate, self.wind, self.dft_size, zphase=False)
sout[frame_sampleindex:frame_sampleindex + hsize] = x_old + x_enh[0:hsize] sout[frame_sampleindex:frame_sampleindex + hsize] = x_old + x_enh[0:hsize]
x_old = x_enh[hsize:fsize] x_old = x_enh[hsize:fsize]

Просмотреть файл

@ -21,15 +21,28 @@ This repository contains the datasets and scripts required for the DNS challenge
* Run python **noisyspeech_synthesizer_singleprocess.py** to synthesize the data. * Run python **noisyspeech_synthesizer_singleprocess.py** to synthesize the data.
## Citation: ## Citation:
@misc{ch2020interspeech, For the datasets and the DNS challenge:<br />
title={The INTERSPEECH 2020 Deep Noise Suppression Challenge: Datasets, Subjective Speech Quality and Testing Framework},
@misc{ch2020interspeech,<br />
title={The INTERSPEECH 2020 Deep Noise Suppression Challenge: Datasets, Subjective Speech Quality and Testing Framework},<br />
author={Chandan K. A. Reddy and Ebrahim Beyrami and Harishchandra Dubey and Vishak Gopal and Roger Cheng and Ross Cutler and Sergiy Matusevych and Robert Aichner and Ashkan Aazami and Sebastian Braun and Puneet Rana and Sriram Srinivasan and Johannes Gehrke}, author={Chandan K. A. Reddy and Ebrahim Beyrami and Harishchandra Dubey and Vishak Gopal and Roger Cheng and Ross Cutler and Sergiy Matusevych and Robert Aichner and Ashkan Aazami and Sebastian Braun and Puneet Rana and Sriram Srinivasan and Johannes Gehrke},
year={2020}, year={2020},<br />
eprint={2001.08662}, eprint={2001.08662},<br />
archivePrefix={arXiv}, archivePrefix={arXiv},<br />
primaryClass={cs.SD} primaryClass={cs.SD}<br />
} }
The baseline NSNet noise suppression:<br />
@misc{xia2020weighted,<br />
title={Weighted Speech Distortion Losses for Neural-network-based Real-time Speech Enhancement},<br />
author={Yangyang Xia and Sebastian Braun and Chandan K. A. Reddy and Harishchandra Dubey and Ross Cutler and Ivan Tashev},<br />
year={2020},<br />
eprint={2001.10601},<br />
archivePrefix={arXiv},<br />
primaryClass={eess.AS}<br />
}
# Contributing # Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a This project welcomes contributions and suggestions. Most contributions require you to agree to a

Просмотреть файл

@ -143,36 +143,48 @@ def main_gen(params, filenum):
'''Calls gen_audio() to generate the audio signals, verifies that they meet '''Calls gen_audio() to generate the audio signals, verifies that they meet
the requirements, and writes the files to storage''' the requirements, and writes the files to storage'''
# generate clean speech print("Generating file #" + str(filenum))
clean, clean_source_files, clean_clipped_files, clean_low_activity_files = \
gen_audio(True, params, filenum)
# generate noise
noise, noise_source_files, noise_clipped_files, noise_low_activity_files = \
gen_audio(False, params, filenum, len(clean))
# mix clean speech and noise clean_clipped_files = []
# if specified, use specified SNR value clean_low_activity_files = []
if not params['randomize_snr']: noise_clipped_files = []
snr = params['snr'] noise_low_activity_files = []
# use a randomly sampled SNR value between the specified bounds
else: while True:
snr = np.random.randint(params['snr_lower'], params['snr_upper']) # generate clean speech
clean, clean_source_files, clean_cf, clean_laf = \
clean_snr, noise_snr, noisy_snr, target_level = snr_mixer(params=params, gen_audio(True, params, filenum)
clean=clean, # generate noise
noise=noise, noise, noise_source_files, noise_cf, noise_laf = \
snr=snr) gen_audio(False, params, filenum, len(clean))
# Uncomment the below lines if you need segmental SNR and comment the above lines using snr_mixer
#clean_snr, noise_snr, noisy_snr, target_level = segmental_snr_mixer(params=params, clean_clipped_files += clean_cf
# clean=clean, clean_low_activity_files += clean_laf
# noise=noise, noise_clipped_files += noise_cf
# snr=snr) noise_low_activity_files += noise_laf
# unexpected clipping
if is_clipped(clean_snr) or is_clipped(noise_snr) or is_clipped(noisy_snr): # mix clean speech and noise
print("Warning: File #" + str(filenum) + " has unexpected clipping, " + \ # if specified, use specified SNR value
"returning without writing audio to disk") if not params['randomize_snr']:
return [], clean_clipped_files, clean_low_activity_files, \ snr = params['snr']
[], noise_clipped_files, noise_low_activity_files # use a randomly sampled SNR value between the specified bounds
else:
snr = np.random.randint(params['snr_lower'], params['snr_upper'])
clean_snr, noise_snr, noisy_snr, target_level = snr_mixer(params=params,
clean=clean,
noise=noise,
snr=snr)
# Uncomment the below lines if you need segmental SNR and comment the above lines using snr_mixer
#clean_snr, noise_snr, noisy_snr, target_level = segmental_snr_mixer(params=params,
# clean=clean,
# noise=noise,
# snr=snr)
# unexpected clipping
if is_clipped(clean_snr) or is_clipped(noise_snr) or is_clipped(noisy_snr):
continue
else:
break
# write resultant audio streams to files # write resultant audio streams to files
hyphen = '-' hyphen = '-'
@ -252,7 +264,9 @@ def main_body():
params['total_hours'] = float(cfg['total_hours']) params['total_hours'] = float(cfg['total_hours'])
if cfg['fileindex_start'] != 'None' and cfg['fileindex_start'] != 'None': if cfg['fileindex_start'] != 'None' and cfg['fileindex_start'] != 'None':
params['num_files'] = int(cfg['fileindex_end'])-int(cfg['fileindex_start']) params['fileindex_start'] = int(cfg['fileindex_start'])
params['fileindex_end'] = int(cfg['fileindex_end'])
params['num_files'] = int(params['fileindex_end'])-int(params['fileindex_start'])
else: else:
params['num_files'] = int((params['total_hours']*60*60)/params['audio_length']) params['num_files'] = int((params['total_hours']*60*60)/params['audio_length'])
@ -262,8 +276,11 @@ def main_body():
params['noise_activity_threshold'] = float(cfg['noise_activity_threshold']) params['noise_activity_threshold'] = float(cfg['noise_activity_threshold'])
params['snr_lower'] = int(cfg['snr_lower']) params['snr_lower'] = int(cfg['snr_lower'])
params['snr_upper'] = int(cfg['snr_upper']) params['snr_upper'] = int(cfg['snr_upper'])
<<<<<<< HEAD
# params['fileindex_start'] = int(cfg['fileindex_start']) # params['fileindex_start'] = int(cfg['fileindex_start'])
# params['fileindex_end'] = int(cfg['fileindex_end']) # params['fileindex_end'] = int(cfg['fileindex_end'])
=======
>>>>>>> f9fdbd480c5c3f8cc5fbaad00028ffecc3e5ea61
params['randomize_snr'] = utils.str2bool(cfg['randomize_snr']) params['randomize_snr'] = utils.str2bool(cfg['randomize_snr'])
params['target_level_lower'] = int(cfg['target_level_lower']) params['target_level_lower'] = int(cfg['target_level_lower'])
params['target_level_upper'] = int(cfg['target_level_upper']) params['target_level_upper'] = int(cfg['target_level_upper'])

Просмотреть файл

@ -23,6 +23,7 @@ MAXTRIES = 50
MAXFILELEN = 100 MAXFILELEN = 100
np.random.seed(2) np.random.seed(2)
random.seed(3)
def build_audio(is_clean, params, index, audio_samples_length=-1): def build_audio(is_clean, params, index, audio_samples_length=-1):
'''Construct an audio signal from source files''' '''Construct an audio signal from source files'''

Просмотреть файл

@ -43,7 +43,11 @@ def test_zeros_beg_end(audio, num_zeros=16000, low_energy_thresh=LOW_ENERGY_THRE
end_segment_energy = 20*np.log10(audio[-num_zeros:]**2).mean()**0.5 end_segment_energy = 20*np.log10(audio[-num_zeros:]**2).mean()**0.5
return beg_segment_energy < low_energy_thresh or end_segment_energy < low_energy_thresh return beg_segment_energy < low_energy_thresh or end_segment_energy < low_energy_thresh
def adsp_filtering_test(adsp, without_adsp):
diff = adsp - without_adsp
if any(val >0.0001 for val in diff):
if __name__=='__main__': if __name__=='__main__':
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--cfg', default='noisyspeech_synthesizer.cfg') parser.add_argument('--cfg', default='noisyspeech_synthesizer.cfg')