From 0e47d49a28ccbc22799c64212a2130bcdd7b3acf Mon Sep 17 00:00:00 2001 From: Hari Dubey Date: Tue, 13 Dec 2022 23:45:42 +0000 Subject: [PATCH] added training clean speech, noise, ir download scripts for both tracks --- download-dns-challenge-5-headset-training.sh | 133 +++++++++++++ download-dns-challenge-5-headset.sh | 100 ---------- download-dns-challenge-5-noise-ir.sh | 53 +++++ ...d-dns-challenge-5-speakerphone-training.sh | 187 ++++++++++++++++++ download-dns-challenge-5-speakerphone.sh | 134 ------------- 5 files changed, 373 insertions(+), 234 deletions(-) create mode 100644 download-dns-challenge-5-headset-training.sh delete mode 100644 download-dns-challenge-5-headset.sh create mode 100644 download-dns-challenge-5-noise-ir.sh create mode 100644 download-dns-challenge-5-speakerphone-training.sh delete mode 100644 download-dns-challenge-5-speakerphone.sh diff --git a/download-dns-challenge-5-headset-training.sh b/download-dns-challenge-5-headset-training.sh new file mode 100644 index 00000000000..4c41b9db871 --- /dev/null +++ b/download-dns-challenge-5-headset-training.sh @@ -0,0 +1,133 @@ +#!/usr/bin/bash + +# ***** 5th DNS Challenge at ICASSP 2023***** +# Track 1 Headset Clean speech: All Languages +# ------------------------------------------------------------- +# In all, you will need about 1TB to store the UNPACKED data. +# Archived, the same data takes about 550GB total. + +# Please comment out the files you don't need before launching +# the script. + +# NOTE: By default, the script *DOES NOT* DOWNLOAD ANY FILES! +# Please scroll down and edit this script to pick the +# downloading method that works best for you. 
+ +# ------------------------------------------------------------- +# The directory structure of the unpacked data is: + +# datasets_fullband +# \-- clean_fullband 827G +# +-- emotional_speech 2.4G +# +-- french_speech 62G +# +-- german_speech 319G +# +-- italian_speech 42G +# +-- read_speech 299G +# +-- russian_speech 12G +# +-- spanish_speech 65G +# +-- vctk_wav48_silence_trimmed 27G +# \-- VocalSet_48kHz_mono 974M + +BLOB_NAMES=( + + Track1_Headset/VocalSet_48kHz_mono.tgz + Track1_Headset/emotional_speech.tgz + + Track1_Headset/french_speech.tar.gz.partaa + Track1_Headset/french_speech.tar.gz.partab + Track1_Headset/french_speech.tar.gz.partac + Track1_Headset/french_speech.tar.gz.partad + Track1_Headset/french_speech.tar.gz.partae + Track1_Headset/french_speech.tar.gz.partah + + Track1_Headset/german_speech.tgz.partaa + Track1_Headset/german_speech.tgz.partab + Track1_Headset/german_speech.tgz.partac + Track1_Headset/german_speech.tgz.partad + Track1_Headset/german_speech.tgz.partae + Track1_Headset/german_speech.tgz.partaf + Track1_Headset/german_speech.tgz.partag + Track1_Headset/german_speech.tgz.partah + Track1_Headset/german_speech.tgz.partaj + Track1_Headset/german_speech.tgz.partal + Track1_Headset/german_speech.tgz.partam + Track1_Headset/german_speech.tgz.partan + Track1_Headset/german_speech.tgz.partao + Track1_Headset/german_speech.tgz.partap + Track1_Headset/german_speech.tgz.partaq + Track1_Headset/german_speech.tgz.partar + Track1_Headset/german_speech.tgz.partas + Track1_Headset/german_speech.tgz.partat + Track1_Headset/german_speech.tgz.partau + Track1_Headset/german_speech.tgz.partav + Track1_Headset/german_speech.tgz.partaw + + Track1_Headset/italian_speech.tgz.partaa + Track1_Headset/italian_speech.tgz.partab + Track1_Headset/italian_speech.tgz.partac + Track1_Headset/italian_speech.tgz.partad + + Track1_Headset/read_speech.tgz.partaa + Track1_Headset/read_speech.tgz.partab + Track1_Headset/read_speech.tgz.partac + 
Track1_Headset/read_speech.tgz.partad + Track1_Headset/read_speech.tgz.partae + Track1_Headset/read_speech.tgz.partaf + Track1_Headset/read_speech.tgz.partag + Track1_Headset/read_speech.tgz.partah + Track1_Headset/read_speech.tgz.partai + Track1_Headset/read_speech.tgz.partaj + Track1_Headset/read_speech.tgz.partak + Track1_Headset/read_speech.tgz.partal + Track1_Headset/read_speech.tgz.partam + Track1_Headset/read_speech.tgz.partan + Track1_Headset/read_speech.tgz.partao + Track1_Headset/read_speech.tgz.partap + Track1_Headset/read_speech.tgz.partaq + Track1_Headset/read_speech.tgz.partar + Track1_Headset/read_speech.tgz.partas + Track1_Headset/read_speech.tgz.partat + Track1_Headset/read_speech.tgz.partau + + Track1_Headset/russian_speech.tgz + + Track1_Headset/spanish_speech.tgz.partaa + Track1_Headset/spanish_speech.tgz.partab + Track1_Headset/spanish_speech.tgz.partac + Track1_Headset/spanish_speech.tgz.partad + Track1_Headset/spanish_speech.tgz.partae + Track1_Headset/spanish_speech.tgz.partaf + Track1_Headset/spanish_speech.tgz.partag + + Track1_Headset/vctk_wav48_silence_trimmed.tgz.partaa + Track1_Headset/vctk_wav48_silence_trimmed.tgz.partab + Track1_Headset/vctk_wav48_silence_trimmed.tgz.partac +) + +############################################################### +# this data is extracted from datasets used in Track 2. 
+
+AZURE_URL="https://dnschallengepublic.blob.core.windows.net/dns5archive/V5_training_dataset"
+
+OUTPUT_PATH="./datasets_fullband"
+# Blob names start with Track1_Headset/, so that directory must exist before curl -o / wget -O can write.
+mkdir -p "$OUTPUT_PATH"/{Track1_Headset,clean_fullband}
+# For each listed blob: print its HTTP status (dry run) or, once a line below is uncommented, download it.
+for BLOB in "${BLOB_NAMES[@]}"
+do
+    URL="$AZURE_URL/$BLOB"
+    echo "Download: $BLOB"
+
+    # DRY RUN: print HTTP response and Content-Length
+    # WITHOUT downloading the files
+    curl -s -I "$URL" | head -n 2
+
+    # Actually download the files: UNCOMMENT when ready to download
+    # curl "$URL" -o "$OUTPUT_PATH/$BLOB"
+
+    # Same as above, but using wget
+    # wget "$URL" -O "$OUTPUT_PATH/$BLOB"
+    # Unpack on the fly: these blobs are gzip (.tgz), hence -z. Only for unsplit .tgz blobs --
+    # split archives (.partXX) must first be downloaded and concatenated before untarring.
+    # curl "$URL" | tar -C "$OUTPUT_PATH" -f - -x -z
+done
diff --git a/download-dns-challenge-5-headset.sh b/download-dns-challenge-5-headset.sh
deleted file mode 100644
index 0e058b31ea8..00000000000
--- a/download-dns-challenge-5-headset.sh
+++ /dev/null
@@ -1,100 +0,0 @@
-#!/usr/bin/bash
-
-# ***** Datasets for ICASSP 2023 DNS Challenge 5 - Personalized DNS Track *****
-
-# NOTE: Before downloading, make sure you have enough space
-# on your local storage!
-
-# In all, you will need about 380TB to store the UNPACKED data.
-# Archived, the same data takes about 200GB total.
-
-# Please comment out the files you don't need before launching
-# the script.
-
-# NOTE: By default, the script *DOES NOT* DOWNLOAD ANY FILES!
-# Please scroll down and edit this script to pick the
-# downloading method that works best for you.
- -# ------------------------------------------------------------- - -BLOB_NAMES=( - V5_training_dataset/Track1_Headset/VocalSet_48kHz_mono.tgz - - V5_training_dataset/Track1_Headset/vctk_wav48_silence_trimmed.tgz.partac - V5_training_dataset/Track1_Headset/vctk_wav48_silence_trimmed.tgz.partab - V5_training_dataset/Track1_Headset/vctk_wav48_silence_trimmed.tgz.partaa - - V5_training_dataset/Track1_Headset/russian_speech.tgz - - V5_training_dataset/Track1_Headset/read_speech.tgz.partau - V5_training_dataset/Track1_Headset/read_speech.tgz.partat - V5_training_dataset/Track1_Headset/read_speech.tgz.partas - V5_training_dataset/Track1_Headset/read_speech.tgz.partar - V5_training_dataset/Track1_Headset/read_speech.tgz.partaq - V5_training_dataset/Track1_Headset/read_speech.tgz.partap - V5_training_dataset/Track1_Headset/read_speech.tgz.partao - V5_training_dataset/Track1_Headset/read_speech.tgz.partan - V5_training_dataset/Track1_Headset/read_speech.tgz.partam - V5_training_dataset/Track1_Headset/read_speech.tgz.partal - V5_training_dataset/Track1_Headset/read_speech.tgz.partak - V5_training_dataset/Track1_Headset/read_speech.tgz.partaj - V5_training_dataset/Track1_Headset/read_speech.tgz.partai - V5_training_dataset/Track1_Headset/read_speech.tgz.partah - V5_training_dataset/Track1_Headset/read_speech.tgz.partag - V5_training_dataset/Track1_Headset/read_speech.tgz.partaf - V5_training_dataset/Track1_Headset/read_speech.tgz.partae - V5_training_dataset/Track1_Headset/read_speech.tgz.partad - V5_training_dataset/Track1_Headset/read_speech.tgz.partac - V5_training_dataset/Track1_Headset/read_speech.tgz.partab - V5_training_dataset/Track1_Headset/read_speech.tgz.partaa - - V5_training_dataset/Track1_Headset/italian_speech.tgz.partad - V5_training_dataset/Track1_Headset/italian_speech.tgz.partac - V5_training_dataset/Track1_Headset/italian_speech.tgz.partab - V5_training_dataset/Track1_Headset/italian_speech.tgz.partaa - - 
V5_training_dataset/Track1_Headset/french_speech.tar.gz.partah - V5_training_dataset/Track1_Headset/french_speech.tar.gz.partae - V5_training_dataset/Track1_Headset/french_speech.tar.gz.partad - V5_training_dataset/Track1_Headset/french_speech.tar.gz.partac - V5_training_dataset/Track1_Headset/french_speech.tar.gz.partab - V5_training_dataset/Track1_Headset/french_speech.tar.gz.partaa - - V5_training_dataset/Track1_Headset/emotional_speech.tgz - - - V5_training_dataset/noise_fullband/datasets_fullband.noise_fullband.freesound_001.tar.bz2 - V5_training_dataset/noise_fullband/datasets_fullband.noise_fullband.freesound_000.tar.bz2 - - V5_training_dataset/noise_fullband/datasets_fullband.noise_fullband.audioset_006.tar.bz2 - V5_training_dataset/noise_fullband/datasets_fullband.noise_fullband.audioset_005.tar.bz2 - V5_training_dataset/noise_fullband/datasets_fullband.noise_fullband.audioset_004.tar.bz2 - V5_training_dataset/noise_fullband/datasets_fullband.noise_fullband.audioset_003.tar.bz2 - V5_training_dataset/noise_fullband/datasets_fullband.noise_fullband.audioset_002.tar.bz2 - V5_training_dataset/noise_fullband/datasets_fullband.noise_fullband.audioset_001.tar.bz2 - V5_training_dataset/noise_fullband/datasets_fullband.noise_fullband.audioset_000.tar.bz2 - - V5_training_dataset/datasets_fullband.impulse_responses_000.tar.bz2 -) - -############################################################### - -AZURE_URL="https://dnschallengepublic.blob.core.windows.net/dns5archive" - -OUTPUT_PATH="." 
-
-mkdir -p $OUTPUT_PATH/V5_training_dataset/noise_fullband
-mkdir -p $OUTPUT_PATH/V5_training_dataset/Track1_Headset
-
-for BLOB in ${BLOB_NAMES[@]}
-do
-    URL="$AZURE_URL/$BLOB"
-    echo "Download: $BLOB"
-
-    # DRY RUN: print HTTP response and Content-Length
-    # WITHOUT downloading the files
-    curl -s -I "$URL" | head -n 2
-
-    # Actually download the files: UNCOMMENT when ready to download
-    # curl "$URL" -o "$OUTPUT_PATH/$BLOB"
-done
\ No newline at end of file
diff --git a/download-dns-challenge-5-noise-ir.sh b/download-dns-challenge-5-noise-ir.sh
new file mode 100644
index 00000000000..115adf64fb9
--- /dev/null
+++ b/download-dns-challenge-5-noise-ir.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/bash
+
+# ***** 5th DNS Challenge at ICASSP 2023*****
+# Noise data which is used in both tracks
+# Also download the impulse response data
+
+# All compressed noise files together are ~39 GB
+# NOTE: By default, the script *DOES NOT* DOWNLOAD ANY FILES! Edit the loop below to enable a download method.
+# -------------------------------------------------------------
+# The directory structure of the unpacked data is:
+# +-- noise_fullband
+
+BLOB_NAMES=(
+    noise_fullband/datasets_fullband.noise_fullband.audioset_000.tar.bz2
+    noise_fullband/datasets_fullband.noise_fullband.audioset_001.tar.bz2
+    noise_fullband/datasets_fullband.noise_fullband.audioset_002.tar.bz2
+    noise_fullband/datasets_fullband.noise_fullband.audioset_003.tar.bz2
+    noise_fullband/datasets_fullband.noise_fullband.audioset_004.tar.bz2
+    noise_fullband/datasets_fullband.noise_fullband.audioset_005.tar.bz2
+    noise_fullband/datasets_fullband.noise_fullband.audioset_006.tar.bz2
+
+    noise_fullband/datasets_fullband.noise_fullband.freesound_000.tar.bz2
+    noise_fullband/datasets_fullband.noise_fullband.freesound_001.tar.bz2
+
+    datasets_fullband.impulse_responses_000.tar.bz2
+)
+
+###############################################################
+
+AZURE_URL="https://dnschallengepublic.blob.core.windows.net/dns5archive/V5_training_dataset"
+
+OUTPUT_PATH="./"
+# noise_fullband/ must exist so curl -o / wget -O can write the noise_fullband/* blobs.
+mkdir -p "$OUTPUT_PATH/noise_fullband"
+# For each listed blob: print its HTTP status (dry run) or, once a line below is uncommented, download it.
+for BLOB in "${BLOB_NAMES[@]}"
+do
+    URL="$AZURE_URL/$BLOB"
+    echo "Download: $BLOB"
+
+    # DRY RUN: print HTTP response and Content-Length
+    # WITHOUT downloading the files
+    curl -s -I "$URL" | head -n 2
+
+    # Actually download the files: UNCOMMENT when ready to download
+    # curl "$URL" -o "$OUTPUT_PATH/$BLOB"
+
+    # Same as above, but using wget
+    # wget "$URL" -O "$OUTPUT_PATH/$BLOB"
+
+    # Same, + unpack files on the fly
+    # curl "$URL" | tar -C "$OUTPUT_PATH" -f - -x -j
+done
diff --git a/download-dns-challenge-5-speakerphone-training.sh b/download-dns-challenge-5-speakerphone-training.sh
new file mode 100644
index 00000000000..62628f10a22
--- /dev/null
+++ b/download-dns-challenge-5-speakerphone-training.sh
@@ -0,0 +1,187 @@
+#!/usr/bin/bash
+
+# ***** 5th DNS Challenge at ICASSP 2023*****
+# Track 2 Speakerphone Clean speech: All Languages
+# -------------------------------------------------------------
+# In all, you will need about 1TB to store the UNPACKED data.
+# Archived, the same data takes about 550GB total.
+
+# Please comment out the files you don't need before launching
+# the script.
+
+# NOTE: By default, the script *DOES NOT* DOWNLOAD ANY FILES!
+# Please scroll down and edit this script to pick the
+# downloading method that works best for you.
+ +# ------------------------------------------------------------- +# The directory structure of the unpacked data is: + +# datasets_fullband +# \-- clean_fullband 827G +# +-- emotional_speech 2.4G +# +-- french_speech 62G +# +-- german_speech 319G +# +-- italian_speech 42G +# +-- read_speech 299G +# +-- russian_speech 12G +# +-- spanish_speech 65G +# +-- vctk_wav48_silence_trimmed 27G +# \-- VocalSet_48kHz_mono 974M + +BLOB_NAMES=( + + clean_fullband/datasets_fullband.clean_fullband.VocalSet_48kHz_mono_000_NA_NA.tar.bz2 + + clean_fullband/datasets_fullband.clean_fullband.emotional_speech_000_NA_NA.tar.bz2 + + clean_fullband/datasets_fullband.clean_fullband.french_speech_000_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.french_speech_001_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.french_speech_002_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.french_speech_003_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.french_speech_004_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.french_speech_005_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.french_speech_006_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.french_speech_007_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.french_speech_008_NA_NA.tar.bz2 + + clean_fullband/datasets_fullband.clean_fullband.german_speech_000_0.00_3.47.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_001_3.47_3.64.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_002_3.64_3.74.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_003_3.74_3.81.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_004_3.81_3.86.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_005_3.86_3.91.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_006_3.91_3.96.tar.bz2 + 
clean_fullband/datasets_fullband.clean_fullband.german_speech_007_3.96_4.00.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_008_4.00_4.04.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_009_4.04_4.08.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_010_4.08_4.12.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_011_4.12_4.16.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_012_4.16_4.21.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_013_4.21_4.26.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_014_4.26_4.33.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_015_4.33_4.43.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_016_4.43_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_017_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_018_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_019_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_020_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_021_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_022_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_023_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_024_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_025_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_026_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_027_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_028_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_029_NA_NA.tar.bz2 + 
clean_fullband/datasets_fullband.clean_fullband.german_speech_030_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_031_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_032_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_033_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_034_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_035_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_036_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_037_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_038_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_039_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_040_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_041_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.german_speech_042_NA_NA.tar.bz2 + + clean_fullband/datasets_fullband.clean_fullband.italian_speech_000_0.00_3.98.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.italian_speech_001_3.98_4.21.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.italian_speech_002_4.21_4.40.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.italian_speech_003_4.40_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.italian_speech_004_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.italian_speech_005_NA_NA.tar.bz2 + + clean_fullband/datasets_fullband.clean_fullband.read_speech_000_0.00_3.75.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_001_3.75_3.88.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_002_3.88_3.96.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_003_3.96_4.02.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_004_4.02_4.06.tar.bz2 
+ clean_fullband/datasets_fullband.clean_fullband.read_speech_005_4.06_4.10.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_006_4.10_4.13.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_007_4.13_4.16.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_008_4.16_4.19.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_009_4.19_4.21.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_010_4.21_4.24.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_011_4.24_4.26.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_012_4.26_4.29.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_013_4.29_4.31.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_014_4.31_4.33.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_015_4.33_4.35.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_016_4.35_4.38.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_017_4.38_4.40.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_018_4.40_4.42.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_019_4.42_4.45.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_020_4.45_4.48.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_021_4.48_4.52.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_022_4.52_4.57.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_023_4.57_4.67.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_024_4.67_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_025_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_026_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_027_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_028_NA_NA.tar.bz2 + 
clean_fullband/datasets_fullband.clean_fullband.read_speech_029_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_030_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_031_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_032_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_033_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_034_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_035_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_036_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_037_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_038_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.read_speech_039_NA_NA.tar.bz2 + + clean_fullband/datasets_fullband.clean_fullband.russian_speech_000_0.00_4.31.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.russian_speech_001_4.31_NA.tar.bz2 + + clean_fullband/datasets_fullband.clean_fullband.spanish_speech_000_0.00_4.09.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.spanish_speech_001_4.09_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.spanish_speech_002_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.spanish_speech_003_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.spanish_speech_004_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.spanish_speech_005_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.spanish_speech_006_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.spanish_speech_007_NA_NA.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.spanish_speech_008_NA_NA.tar.bz2 + + clean_fullband/datasets_fullband.clean_fullband.vctk_wav48_silence_trimmed_000.tar.bz2 + clean_fullband/datasets_fullband.clean_fullband.vctk_wav48_silence_trimmed_001.tar.bz2 + 
clean_fullband/datasets_fullband.clean_fullband.vctk_wav48_silence_trimmed_002.tar.bz2
+    clean_fullband/datasets_fullband.clean_fullband.vctk_wav48_silence_trimmed_003.tar.bz2
+    clean_fullband/datasets_fullband.clean_fullband.vctk_wav48_silence_trimmed_004.tar.bz2
+
+)
+
+###############################################################
+# This data is identical to the clean speech of the non-personalized track of the 4th DNS Challenge.
+# We recommend re-downloading the data using this script.
+
+AZURE_URL="https://dns4public.blob.core.windows.net/dns4archive/datasets_fullband"
+
+OUTPUT_PATH="./datasets_fullband"
+# Create the target directories up front: curl -o / wget -O do not create missing parent directories.
+mkdir -p "$OUTPUT_PATH"/{clean_fullband,noise_fullband}
+# For each listed blob: print its HTTP status (dry run) or, once a line below is uncommented, download it.
+for BLOB in "${BLOB_NAMES[@]}"
+do
+    URL="$AZURE_URL/$BLOB"
+    echo "Download: $BLOB"
+
+    # DRY RUN: print HTTP response and Content-Length
+    # WITHOUT downloading the files
+    curl -s -I "$URL" | head -n 2
+
+    # Actually download the files: UNCOMMENT when ready to download
+    # curl "$URL" -o "$OUTPUT_PATH/$BLOB"
+
+    # Same as above, but using wget
+    # wget "$URL" -O "$OUTPUT_PATH/$BLOB"
+
+    # Same, + unpack files on the fly
+    # curl "$URL" | tar -C "$OUTPUT_PATH" -f - -x -j
+done
diff --git a/download-dns-challenge-5-speakerphone.sh b/download-dns-challenge-5-speakerphone.sh
deleted file mode 100644
index a62c850b407..00000000000
--- a/download-dns-challenge-5-speakerphone.sh
+++ /dev/null
@@ -1,134 +0,0 @@
-#!/usr/bin/bash
-
-# ***** Datasets for ICASSP 2023 DNS Challenge 5 - Speakerphone DNS Track *****
-
-# NOTE: Before downloading, make sure you have enough space
-# on your local storage!
-
-# In all, you will need about 380TB to store the UNPACKED data.
-# Archived, the same data takes about 200GB total.
-
-# Please comment out the files you don't need before launching
-# the script.
-
-# NOTE: By default, the script *DOES NOT* DOWNLOAD ANY FILES!
-# Please scroll down and edit this script to pick the
-# downloading method that works best for you.
- -# ------------------------------------------------------------- -# The directory structure of the unpacked data is: - -# . 362G -# +-- datasets_fullband 64G -# | +-- impulse_responses 5.9G -# | \-- noise_fullband 58G -# +-- pdns_training_set 294G -# | +-- enrollment_embeddings 115M -# | +-- enrollment_wav 42G -# | +-- raw/clean 252G -# | +-- english 168G -# | +-- french 2.1G -# | +-- german 53G -# | +-- italian 17G -# | +-- russian 6.8G -# | \-- spanish 5.4G -# \-- personalized_dev_testset 3.3G - -BLOB_NAMES=( - - pdns_training_set/raw/pdns_training_set.raw.clean.english_000.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_001.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_002.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_003.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_004.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_005.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_006.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_007.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_008.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_009.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_010.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_011.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_012.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_013.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_014.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_015.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_016.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_017.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_018.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.english_019.tar.bz2 - 
pdns_training_set/raw/pdns_training_set.raw.clean.english_020.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.french_000.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.german_000.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.german_001.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.german_002.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.german_003.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.german_004.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.german_005.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.german_006.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.german_007.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.german_008.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.italian_000.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.italian_001.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.italian_002.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.russian_000.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.spanish_000.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.spanish_001.tar.bz2 - pdns_training_set/raw/pdns_training_set.raw.clean.spanish_002.tar.bz2 - - pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.english_000.tar.bz2 - pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.english_001.tar.bz2 - pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.english_002.tar.bz2 - pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.english_003.tar.bz2 - pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.english_004.tar.bz2 - pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.french_000.tar.bz2 - pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.german_000.tar.bz2 - pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.german_001.tar.bz2 - 
pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.italian_000.tar.bz2 - pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.russian_000.tar.bz2 - pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.spanish_000.tar.bz2 - - pdns_training_set/pdns_training_set.enrollment_embeddings_000.tar.bz2 - - datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_000.tar.bz2 - datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_001.tar.bz2 - datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_002.tar.bz2 - datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_003.tar.bz2 - datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_004.tar.bz2 - datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_005.tar.bz2 - datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_006.tar.bz2 - - datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.freesound_000.tar.bz2 - datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.freesound_001.tar.bz2 - - datasets_fullband/datasets_fullband.impulse_responses_000.tar.bz2 - - personalized_dev_testset/personalized_dev_testset.enrollment.tar.bz2 - personalized_dev_testset/personalized_dev_testset.noisy_testclips.tar.bz2 -) - -############################################################### - -AZURE_URL="https://dns4public.blob.core.windows.net/dns4archive" - -OUTPUT_PATH="." 
- -mkdir -p $OUTPUT_PATH/{pdns_training_set/{raw,enrollment_wav},datasets_fullband/noise_fullband} - -for BLOB in ${BLOB_NAMES[@]} -do - URL="$AZURE_URL/$BLOB" - echo "Download: $BLOB" - - # DRY RUN: print HTTP response and Content-Length - # WITHOUT downloading the files - curl -s -I "$URL" | head -n 2 - - # Actually download the files: UNCOMMENT when ready to download - # curl "$URL" -o "$OUTPUT_PATH/$BLOB" - - # Same as above, but using wget - # wget "$URL" -O "$OUTPUT_PATH/$BLOB" - - # Same, + unpack files on the fly - # curl "$URL" | tar -C "$OUTPUT_PATH" -f - -x -j -done