first version of the pdns download script - not tested yet as the data

is not there
This commit is contained in:
Sergiy Matusevych 2021-12-05 11:15:29 -08:00
Родитель a0d3f95a28
Коммит 790375fa8e
2 изменённых файлов: 110 добавлений и 1 удалений

Просмотреть файл

@ -0,0 +1,109 @@
#!/usr/bin/env bash
# ***** Datasets for ICASSP 2022 DNS Challenge 4 - Personalized DNS Track *****
# NOTE: Before downloading, make sure you have enough space
# on your local storage!
# In all, you will need about 360GB to store the UNPACKED data.
# Archived, the same data takes about 200GB total.
# Please comment out the files you don't need before launching
# the script.
# NOTE: By default, the script *DOES NOT* DOWNLOAD ANY FILES!
# Please scroll down and edit this script to pick the
# downloading method that works best for you.
# -------------------------------------------------------------
# The directory structure of the unpacked data is:
# .                              358G
# +-- datasets_fullband           64G
# |   +-- impulse_responses      5.9G
# |   \-- noise_fullband          58G
# \-- pdns_training_set          294G
#     +-- enrollment_embeddings  115M
#     +-- enrollment_wav          42G
#     \-- raw/clean              252G
#         +-- english            168G
#         +-- french             2.1G
#         +-- german              53G
#         +-- italian             17G
#         +-- russian            6.8G
#         \-- spanish            5.4G
# List of archive blobs to fetch. Each entry is a path relative to the
# storage container; the download loop below appends it to AZURE_URL and
# also uses it as the local path under OUTPUT_PATH.
# To skip an archive, comment out its line.
BLOB_NAMES=(
# --- pdns_training_set/raw: "raw.clean" archives, split by language ---
pdns_training_set/raw/pdns_training_set.raw.clean.english_000.tar.bz2
pdns_training_set/raw/pdns_training_set.raw.clean.english_001.tar.bz2
pdns_training_set/raw/pdns_training_set.raw.clean.english_002.tar.bz2
pdns_training_set/raw/pdns_training_set.raw.clean.english_003.tar.bz2
pdns_training_set/raw/pdns_training_set.raw.clean.english_004.tar.bz2
pdns_training_set/raw/pdns_training_set.raw.clean.english_005.tar.bz2
pdns_training_set/raw/pdns_training_set.raw.clean.english_006.tar.bz2
pdns_training_set/raw/pdns_training_set.raw.clean.english_007.tar.bz2
pdns_training_set/raw/pdns_training_set.raw.clean.english_008.tar.bz2
pdns_training_set/raw/pdns_training_set.raw.clean.french_000.tar.bz2
pdns_training_set/raw/pdns_training_set.raw.clean.german_000.tar.bz2
pdns_training_set/raw/pdns_training_set.raw.clean.german_001.tar.bz2
pdns_training_set/raw/pdns_training_set.raw.clean.german_002.tar.bz2
pdns_training_set/raw/pdns_training_set.raw.clean.german_003.tar.bz2
pdns_training_set/raw/pdns_training_set.raw.clean.italian_000.tar.bz2
pdns_training_set/raw/pdns_training_set.raw.clean.russian_000.tar.bz2
pdns_training_set/raw/pdns_training_set.raw.clean.spanish_000.tar.bz2
# --- pdns_training_set/enrollment_wav: enrollment archives, by language ---
pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.english_000.tar.bz2
pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.english_001.tar.bz2
pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.english_002.tar.bz2
pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.english_003.tar.bz2
pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.english_004.tar.bz2
pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.french_000.tar.bz2
pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.german_000.tar.bz2
pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.german_001.tar.bz2
pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.italian_000.tar.bz2
pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.russian_000.tar.bz2
pdns_training_set/enrollment_wav/pdns_training_set.enrollment_wav.spanish_000.tar.bz2
# --- pdns_training_set: enrollment embeddings (single archive) ---
pdns_training_set/pdns_training_set.enrollment_embeddings_000.tar.bz2
# --- datasets_fullband/noise_fullband: audioset and freesound archives ---
datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_000.tar.bz2
datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_001.tar.bz2
datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_002.tar.bz2
datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_003.tar.bz2
datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_004.tar.bz2
datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_005.tar.bz2
datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.audioset_006.tar.bz2
datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.freesound_000.tar.bz2
datasets_fullband/noise_fullband/datasets_fullband.noise_fullband.freesound_001.tar.bz2
# --- datasets_fullband: impulse responses (single archive) ---
datasets_fullband/datasets_fullband.impulse_responses_000.tar.bz2
)
###############################################################
# Base URL of the storage container; each BLOB_NAMES entry is appended to it.
AZURE_URL="https://dns4public.blob.core.windows.net/dns4archive"

# Local root directory for downloaded archives / unpacked data.
OUTPUT_PATH="."

# Pre-create every directory that appears as a prefix in BLOB_NAMES so that
# `curl -o` / `wget -O` can write straight to "$OUTPUT_PATH/$BLOB".
# Only the variable is quoted: the braces must stay unquoted so that brace
# expansion still produces one argument per directory.
mkdir -p -- "$OUTPUT_PATH"/{pdns_training_set/{raw,enrollment_wav},datasets_fullband/noise_fullband}

# Quote the array expansion so each entry stays a single word even if an
# edited entry contains spaces or glob characters.
for BLOB in "${BLOB_NAMES[@]}"; do
    URL="$AZURE_URL/$BLOB"
    echo "Download: $BLOB"

    # DRY RUN: print HTTP response and Content-Length
    # WITHOUT downloading the files
    curl -s -I "$URL" | head -n 2

    # Actually download the files: UNCOMMENT when ready to download
    # curl "$URL" -o "$OUTPUT_PATH/$BLOB"

    # Same as above, but using wget
    # wget "$URL" -O "$OUTPUT_PATH/$BLOB"

    # Same, + unpack files on the fly
    # curl "$URL" | tar -C "$OUTPUT_PATH" -f - -x -j
done

Просмотреть файл

@ -1,6 +1,6 @@
#!/usr/bin/bash #!/usr/bin/bash
# ***** Datasets for ICASSP 2022 DNS Challenge 4 ***** # ***** Datasets for ICASSP 2022 DNS Challenge 4 - Main (Real-Time) Track *****
# NOTE: Before downloading, make sure you have enough space # NOTE: Before downloading, make sure you have enough space
# on your local storage! # on your local storage!