From 790375fa8ef64857d50074f159db2f4600b28c15 Mon Sep 17 00:00:00 2001
From: Sergiy Matusevych
Date: Sun, 5 Dec 2021 11:15:29 -0800
Subject: [PATCH] first version of the pdns download script - not tested yet as the data is not there

---
 download-dns-challenge-4-pdns.sh | 111 +++++++++++++++++++++++++++++++
 download-dns-challenge-4.sh      |   2 +-
 2 files changed, 112 insertions(+), 1 deletion(-)
 create mode 100644 download-dns-challenge-4-pdns.sh

diff --git a/download-dns-challenge-4-pdns.sh b/download-dns-challenge-4-pdns.sh
new file mode 100644
index 00000000000..9ee463c480c
--- /dev/null
+++ b/download-dns-challenge-4-pdns.sh
@@ -0,0 +1,111 @@
+#!/usr/bin/bash
+
+# ***** Datasets for ICASSP 2022 DNS Challenge 4 - Personalized DNS Track *****
+
+# NOTE: Before downloading, make sure you have enough space
+# on your local storage!
+
+# In all, you will need about 360GB to store the UNPACKED data.
+# Archived, the same data takes about 200GB total.
+
+# Please comment out the files you don't need before launching
+# the script.
+
+# NOTE: By default, the script *DOES NOT* DOWNLOAD ANY FILES!
+# Please scroll down and edit this script to pick the
+# downloading method that works best for you.
+
+# -------------------------------------------------------------
+# The directory structure of the unpacked data is:
+
+# .                                358G
+# +-- datasets_fullband            64G
+# |   +-- impulse_responses        5.9G
+# |   \-- noise_fullband           58G
+# \-- pdns_training_set            294G
+#     +-- enrollment_embeddings    115M
+#     +-- enrollment_wav           42G
+#     +-- raw/clean                252G
+#         +-- english              168G
+#         +-- french               2.1G
+#         +-- german               53G
+#         +-- italian              17G
+#         +-- russian              6.8G
+#         \-- spanish              5.4G
+
+BLOB_NAMES=(
+
+    pdns_training_set/raw/pdns_training_set.raw.clean.english_000.tar.bz2
+    pdns_training_set/raw/pdns_training_set.raw.clean.english_001.tar.bz2
+    pdns_training_set/raw/pdns_training_set.raw.clean.english_002.tar.bz2
+    pdns_training_set/raw/pdns_training_set.raw.clean.english_003.tar.bz2
+    pdns_training_set/raw/pdns_training_set.raw.clean.english_004.tar.bz2
+    pdns_training_set/raw/pdns_training_set.raw.clean.english_005.tar.bz2
+    pdns_training_set/raw/pdns_training_set.raw.clean.english_006.tar.bz2
+    pdns_training_set/raw/pdns_training_set.raw.clean.english_007.tar.bz2
+    pdns_training_set/raw/pdns_training_set.raw.clean.english_008.tar.bz2
+    pdns_training_set/raw/pdns_training_set.raw.clean.french_000.tar.bz2
+    pdns_training_set/raw/pdns_training_set.raw.clean.german_000.tar.bz2
+    pdns_training_set/raw/pdns_training_set.raw.clean.german_001.tar.bz2
+    pdns_training_set/raw/pdns_training_set.raw.clean.german_002.tar.bz2
+    pdns_training_set/raw/pdns_training_set.raw.clean.german_003.tar.bz2
+    pdns_training_set/raw/pdns_training_set.raw.clean.italian_000.tar.bz2
+    pdns_training_set/raw/pdns_training_set.raw.clean.russian_000.tar.bz2
+    pdns_training_set/raw/pdns_training_set.raw.clean.spanish_000.tar.bz2
+
+    pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.english_000.tar.bz2
+    pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.english_001.tar.bz2
+    pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.english_002.tar.bz2
+    pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.english_003.tar.bz2
+    pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.english_004.tar.bz2
+    pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.french_000.tar.bz2
+    pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.german_000.tar.bz2
+    pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.german_001.tar.bz2
+    pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.italian_000.tar.bz2
+    pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.russian_000.tar.bz2
+    pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.spanish_000.tar.bz2
+
+    pdns_training_set/pdns_training_set.enrollment_embeddings_000.tar.bz2
+
+    datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_000.tar.bz2
+    datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_001.tar.bz2
+    datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_002.tar.bz2
+    datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_003.tar.bz2
+    datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_004.tar.bz2
+    datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_005.tar.bz2
+    datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_006.tar.bz2
+
+    datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.freesound_000.tar.bz2
+    datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.freesound_001.tar.bz2
+
+    datasets_fullband/datasets_fullband.impulse_responses_000.tar.bz2
+)
+
+###############################################################
+
+AZURE_URL="https://dns4public.blob.core.windows.net/dns4archive"
+
+OUTPUT_PATH="."
+
+# Create the directories the blob names refer to. OUTPUT_PATH is quoted
+# (outside the braces) so that a path containing spaces still works.
+mkdir -p "$OUTPUT_PATH"/{pdns_training_set/{raw,enrollment_wav},datasets_fullband/noise_fullband}
+
+for BLOB in "${BLOB_NAMES[@]}"
+do
+    URL="$AZURE_URL/$BLOB"
+    echo "Download: $BLOB"
+
+    # DRY RUN: print HTTP response and Content-Length
+    # WITHOUT downloading the files
+    curl -s -I "$URL" | head -n 2
+
+    # Actually download the files: UNCOMMENT when ready to download
+    # curl "$URL" -o "$OUTPUT_PATH/$BLOB"
+
+    # Same as above, but using wget
+    # wget "$URL" -O "$OUTPUT_PATH/$BLOB"
+
+    # Same, + unpack files on the fly
+    # curl "$URL" | tar -C "$OUTPUT_PATH" -f - -x -j
+done
diff --git a/download-dns-challenge-4.sh b/download-dns-challenge-4.sh
index 0f1e44b2fd2..988b0ca8a7b 100644
--- a/download-dns-challenge-4.sh
+++ b/download-dns-challenge-4.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/bash
 
-# ***** Datasets for ICASSP 2022 DNS Challenge 4 *****
+# ***** Datasets for ICASSP 2022 DNS Challenge 4 - Main (Real-Time) Track *****
 
 # NOTE: Before downloading, make sure you have enough space
 # on your local storage!