From 95382bd2b9eba895521a38f4d140a11633c91846 Mon Sep 17 00:00:00 2001 From: Hari Dubey Date: Sat, 12 Sep 2020 03:35:59 +0000 Subject: [PATCH] updated README, config, .gitignore --- .gitignore | 1 + README.md | 15 ++++++++++++++- noisyspeech_synthesizer.cfg | 8 ++++---- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index f8bb1feefda..9075122e0e6 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ training_set5/ logs/ test_set2/ training_set_sept11/ +training_set_sept12/ diff --git a/README.md b/README.md index 7d16ac70eb9..9ac66d02e7e 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Deep Noise Suppression (DNS) Challenge - Interspeech 2020 -This repository contains the datasets and scripts required for the DNS challenge. For more details about the challenge, please visit https://dns-challenge.azurewebsites.net/ and refer to our [paper](https://arxiv.org/ftp/arxiv/papers/2001/2001.08662.pdf). +This repository contains the datasets and scripts required for the DNS challenge. For more details about the challenge, please visit https://dns-challenge.azurewebsites.net/. ## Repo details: * The **datasets** directory contains the clean speech and noise clips. 
@@ -101,11 +101,24 @@ The datasets used in this project are licensed as follows: * https://librivox.org/; License: https://librivox.org/pages/public-domain/ * PTDB-TUG: Pitch Tracking Database from Graz University of Technology https://www.spsc.tugraz.at/databases-and-tools/ptdb-tug-pitch-tracking-database-from-graz-university-of-technology.html; License: http://opendatacommons.org/licenses/odbl/1.0/ * Edinburgh 56 speaker dataset: https://datashare.is.ed.ac.uk/handle/10283/2791; License: https://datashare.is.ed.ac.uk/bitstream/handle/10283/2791/license_text?sequence=11&isAllowed=y +* VocalSet: A Singing Voice Dataset https://zenodo.org/record/1193957#.X1hkxYtlCHs; License: Creative Commons Attribution 4.0 International +* Emotion data corpus: CREMA-D (Crowd-sourced Emotional Multimodal Actors Dataset) +https://github.com/CheyneyComputerScience/CREMA-D; License: http://opendatacommons.org/licenses/dbcl/1.0/ +* The VoxCeleb2 Dataset http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox2.html; License: http://www.robots.ox.ac.uk/~vgg/data/voxceleb/ +The VoxCeleb dataset is available to download for commercial/research purposes under a Creative Commons Attribution 4.0 International License. The copyright remains with the original owners of the video. A complete version of the license can be found at https://creativecommons.org/licenses/by/4.0/. +* VCTK Dataset: https://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html; License: This corpus is licensed under Open Data Commons Attribution License (ODC-By) v1.0. +http://opendatacommons.org/licenses/by/1.0/ + 2. Noise: * Audioset: https://research.google.com/audioset/index.html; License: https://creativecommons.org/licenses/by/4.0/ * Freesound: https://freesound.org/ Only files with CC0 licenses were selected; License: https://creativecommons.org/publicdomain/zero/1.0/ * Demand: https://zenodo.org/record/1227121#.XRKKxYhKiUk; License: https://creativecommons.org/licenses/by-sa/3.0/deed.en_CA +3. 
RIR datasets: OpenSLR26 and OpenSLR28: +* http://www.openslr.org/26/ +* http://www.openslr.org/28/ +* License: Apache 2.0 + ## Code license MIT License diff --git a/noisyspeech_synthesizer.cfg b/noisyspeech_synthesizer.cfg index 517a1d4b1af..1f44e84cd43 100644 --- a/noisyspeech_synthesizer.cfg +++ b/noisyspeech_synthesizer.cfg @@ -52,9 +52,9 @@ noise_dir: datasets\noise speech_dir: datasets\clean\read_speech noise_types_excluded: None -noisy_destination: datasets\training_set_sept11\noisy -clean_destination: datasets\training_set_sept11\clean -noise_destination: datasets\training_set_sept11\noise +noisy_destination: datasets\training_set_sept12\noisy +clean_destination: datasets\training_set_sept12\clean +noise_destination: datasets\training_set_sept12\noise log_dir: logs # Config: add singing voice to clean speech @@ -76,7 +76,7 @@ use_mandarin_data=1 clean_mandarin: datasets\clean\mandarin_speech # Config: add reverb to clean speech -rir_choice: 1 +rir_choice: 3 # 1 for only real rir, 2 for only synthetic rir, 3 (default) use both real and synthetic lower_t60: 0.3 # lower bound of t60 range in seconds