diff --git a/NSNet-baseline/Readme.md b/NSNet-baseline/Readme.md index 42f656ae392..02198bd18ff 100644 --- a/NSNet-baseline/Readme.md +++ b/NSNet-baseline/Readme.md @@ -1,6 +1,6 @@ # Noise Suppression Net (NSNet) baseline inference script -* As a baseline for Interspeech 2020 Deep Noise Suppression challenge, we will use the recently developed SE method based on Recurrent Neural Network (RNN). For ease of reference, we will call this method as Noise Suppression Net (NSNet). +* As a baseline for Interspeech 2020 Deep Noise Suppression challenge, we will use the recently developed SE method based on Recurrent Neural Network (RNN). For ease of reference, we will call this method as Noise Suppression Net (NSNet). The details about this method can be found in the [published paper](https://arxiv.org/pdf/2001.10601.pdf) * This method uses log power spectra as input to predict the enhancement gain per frame using a learning machine based on Gated Recurrent Units (GRU) and fully connected layers. Please refer to the paper for more details of the method. * NSNet is computationally efficient. It only takes 0.16ms to enhance a 20ms frame on an Intel quad core i5 machine using the ONNX run time v1.1 . @@ -22,3 +22,14 @@ From the NSNet-baseline directory, run nsnet_eval_local.py with the following re - --modelpath "Specify the path to the onnx model provided" Use default values for the rest. Run to enhance the clips. + +## Citation: +The baseline NSNet noise suppression:
+@misc{xia2020weighted,
+ title={Weighted Speech Distortion Losses for Neural-network-based Real-time Speech Enhancement},
+ author={Yangyang Xia and Sebastian Braun and Chandan K. A. Reddy and Harishchandra Dubey and Ross Cutler and Ivan Tashev},
+ year={2020},
+ eprint={2001.10601},
+ archivePrefix={arXiv},
+ primaryClass={eess.AS}
+} diff --git a/NSNet-baseline/__pycache__/audiolib.cpython-36.pyc b/NSNet-baseline/__pycache__/audiolib.cpython-36.pyc index 2cc2d0b79ba..582fee2ffd5 100644 Binary files a/NSNet-baseline/__pycache__/audiolib.cpython-36.pyc and b/NSNet-baseline/__pycache__/audiolib.cpython-36.pyc differ diff --git a/NSNet-baseline/__pycache__/onnx.cpython-36.pyc b/NSNet-baseline/__pycache__/onnx.cpython-36.pyc index 4dc6798bd4f..32e3bece2fc 100644 Binary files a/NSNet-baseline/__pycache__/onnx.cpython-36.pyc and b/NSNet-baseline/__pycache__/onnx.cpython-36.pyc differ diff --git a/NSNet-baseline/nsnet_eval_local.py b/NSNet-baseline/nsnet_eval_local.py index 70d499250eb..73ab2aa1b43 100644 --- a/NSNet-baseline/nsnet_eval_local.py +++ b/NSNet-baseline/nsnet_eval_local.py @@ -62,10 +62,10 @@ def _main(): logging.debug("NSNet local workers start with %d input files", len(input_filelist)) - with concurrent.futures.ThreadPoolExecutor(max_workers=args.num_workers) as executor: - executor.map(worker, input_filelist, chunksize=args.chunksize) -# for fname in input_filelist: -# worker(fname) +# with concurrent.futures.ThreadPoolExecutor(max_workers=args.num_workers) as executor: +# executor.map(worker, input_filelist, chunksize=args.chunksize) + for fname in input_filelist: + worker(fname) logging.info("NSNet local workers complete") diff --git a/NSNet-baseline/onnx.py b/NSNet-baseline/onnx.py index f3cf05748d8..0e3bce1f56e 100644 --- a/NSNet-baseline/onnx.py +++ b/NSNet-baseline/onnx.py @@ -100,7 +100,7 @@ class NSNetInference: mask = model_outputs[0].squeeze() x_enh = audiolib.istft( (xmag * mask) * xphs, sample_rate, self.wind, self.dft_size, zphase=False) - + sout[frame_sampleindex:frame_sampleindex + hsize] = x_old + x_enh[0:hsize] x_old = x_enh[hsize:fsize] diff --git a/README.md b/README.md index 76b1ee61486..1d21ef8a06b 100644 --- a/README.md +++ b/README.md @@ -21,15 +21,28 @@ This repository contains the datasets and scripts required for the DNS challenge * Run python 
**noisyspeech_synthesizer_singleprocess.py** to synthesize the data. ## Citation: -@misc{ch2020interspeech, - title={The INTERSPEECH 2020 Deep Noise Suppression Challenge: Datasets, Subjective Speech Quality and Testing Framework}, +For the datasets and the DNS challenge:
+ +@misc{ch2020interspeech,
+ title={The INTERSPEECH 2020 Deep Noise Suppression Challenge: Datasets, Subjective Speech Quality and Testing Framework},
author={Chandan K. A. Reddy and Ebrahim Beyrami and Harishchandra Dubey and Vishak Gopal and Roger Cheng and Ross Cutler and Sergiy Matusevych and Robert Aichner and Ashkan Aazami and Sebastian Braun and Puneet Rana and Sriram Srinivasan and Johannes Gehrke}, - year={2020}, - eprint={2001.08662}, - archivePrefix={arXiv}, - primaryClass={cs.SD} + year={2020},
+ eprint={2001.08662},
+ archivePrefix={arXiv},
+ primaryClass={cs.SD}
} +The baseline NSNet noise suppression:
+@misc{xia2020weighted,
+ title={Weighted Speech Distortion Losses for Neural-network-based Real-time Speech Enhancement},
+ author={Yangyang Xia and Sebastian Braun and Chandan K. A. Reddy and Harishchandra Dubey and Ross Cutler and Ivan Tashev},
+ year={2020},
+ eprint={2001.10601},
+ archivePrefix={arXiv},
+ primaryClass={eess.AS}
+} + + # Contributing This project welcomes contributions and suggestions. Most contributions require you to agree to a diff --git a/noisyspeech_synthesizer_multiprocessing.py b/noisyspeech_synthesizer_multiprocessing.py index dba7836b7da..a219e7613c3 100644 --- a/noisyspeech_synthesizer_multiprocessing.py +++ b/noisyspeech_synthesizer_multiprocessing.py @@ -143,36 +143,48 @@ def main_gen(params, filenum): '''Calls gen_audio() to generate the audio signals, verifies that they meet the requirements, and writes the files to storage''' - # generate clean speech - clean, clean_source_files, clean_clipped_files, clean_low_activity_files = \ - gen_audio(True, params, filenum) - # generate noise - noise, noise_source_files, noise_clipped_files, noise_low_activity_files = \ - gen_audio(False, params, filenum, len(clean)) + print("Generating file #" + str(filenum)) - # mix clean speech and noise - # if specified, use specified SNR value - if not params['randomize_snr']: - snr = params['snr'] - # use a randomly sampled SNR value between the specified bounds - else: - snr = np.random.randint(params['snr_lower'], params['snr_upper']) - - clean_snr, noise_snr, noisy_snr, target_level = snr_mixer(params=params, - clean=clean, - noise=noise, - snr=snr) - # Uncomment the below lines if you need segmental SNR and comment the above lines using snr_mixer - #clean_snr, noise_snr, noisy_snr, target_level = segmental_snr_mixer(params=params, - # clean=clean, - # noise=noise, - # snr=snr) - # unexpected clipping - if is_clipped(clean_snr) or is_clipped(noise_snr) or is_clipped(noisy_snr): - print("Warning: File #" + str(filenum) + " has unexpected clipping, " + \ - "returning without writing audio to disk") - return [], clean_clipped_files, clean_low_activity_files, \ - [], noise_clipped_files, noise_low_activity_files + clean_clipped_files = [] + clean_low_activity_files = [] + noise_clipped_files = [] + noise_low_activity_files = [] + + while True: + # generate clean speech + clean, 
clean_source_files, clean_cf, clean_laf = \ + gen_audio(True, params, filenum) + # generate noise + noise, noise_source_files, noise_cf, noise_laf = \ + gen_audio(False, params, filenum, len(clean)) + + clean_clipped_files += clean_cf + clean_low_activity_files += clean_laf + noise_clipped_files += noise_cf + noise_low_activity_files += noise_laf + + # mix clean speech and noise + # if specified, use specified SNR value + if not params['randomize_snr']: + snr = params['snr'] + # use a randomly sampled SNR value between the specified bounds + else: + snr = np.random.randint(params['snr_lower'], params['snr_upper']) + + clean_snr, noise_snr, noisy_snr, target_level = snr_mixer(params=params, + clean=clean, + noise=noise, + snr=snr) + # Uncomment the below lines if you need segmental SNR and comment the above lines using snr_mixer + #clean_snr, noise_snr, noisy_snr, target_level = segmental_snr_mixer(params=params, + # clean=clean, + # noise=noise, + # snr=snr) + # unexpected clipping + if is_clipped(clean_snr) or is_clipped(noise_snr) or is_clipped(noisy_snr): + continue + else: + break # write resultant audio streams to files hyphen = '-' @@ -252,7 +264,9 @@ def main_body(): params['total_hours'] = float(cfg['total_hours']) if cfg['fileindex_start'] != 'None' and cfg['fileindex_start'] != 'None': - params['num_files'] = int(cfg['fileindex_end'])-int(cfg['fileindex_start']) + params['fileindex_start'] = int(cfg['fileindex_start']) + params['fileindex_end'] = int(cfg['fileindex_end']) + params['num_files'] = int(params['fileindex_end'])-int(params['fileindex_start']) else: params['num_files'] = int((params['total_hours']*60*60)/params['audio_length']) @@ -262,8 +276,8 @@ params['noise_activity_threshold'] = float(cfg['noise_activity_threshold']) params['snr_lower'] = int(cfg['snr_lower']) params['snr_upper'] = int(cfg['snr_upper']) # params['fileindex_start'] = int(cfg['fileindex_start']) # params['fileindex_end'] = 
int(cfg['fileindex_end']) params['randomize_snr'] = utils.str2bool(cfg['randomize_snr']) params['target_level_lower'] = int(cfg['target_level_lower']) params['target_level_upper'] = int(cfg['target_level_upper']) diff --git a/noisyspeech_synthesizer_singleprocess.py b/noisyspeech_synthesizer_singleprocess.py index de17d800068..3b31055d7e8 100644 --- a/noisyspeech_synthesizer_singleprocess.py +++ b/noisyspeech_synthesizer_singleprocess.py @@ -23,6 +23,7 @@ MAXTRIES = 50 MAXFILELEN = 100 np.random.seed(2) +random.seed(3) def build_audio(is_clean, params, index, audio_samples_length=-1): '''Construct an audio signal from source files''' diff --git a/unit_tests_synthesizer.py b/unit_tests_synthesizer.py index 577bc747d6d..5b0c95817d3 100644 --- a/unit_tests_synthesizer.py +++ b/unit_tests_synthesizer.py @@ -43,7 +43,11 @@ def test_zeros_beg_end(audio, num_zeros=16000, low_energy_thresh=LOW_ENERGY_THRE end_segment_energy = 20*np.log10(audio[-num_zeros:]**2).mean()**0.5 return beg_segment_energy < low_energy_thresh or end_segment_energy < low_energy_thresh - +def adsp_filtering_test(adsp, without_adsp): + diff = adsp - without_adsp + return not any(val > 0.0001 for val in diff) + + if __name__=='__main__': parser = argparse.ArgumentParser() parser.add_argument('--cfg', default='noisyspeech_synthesizer.cfg')