Merge pull request #109 from motus/sergiym/dns1-2/download

add download scripts for DNS1 and DNS2 datasets
This commit is contained in:
hdubey 2022-02-12 20:42:43 -08:00 committed by GitHub
Parent f8509724d4 7d8f6e669d
Commit 313362324b
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 118 additions and 0 deletions

View file

@@ -0,0 +1,28 @@
#!/usr/bin/bash
# ***** Datasets for INTERSPEECH 2020 DNS Challenge 1 *****
# NOTE: This data is for the *PAST* challenge!
# Current DNS Challenge is ICASSP 2022 DNS Challenge 4, which
# has its own download script, `download-dns-challenge-4.sh`
###############################################################
AZURE_URL="https://dns3public.blob.core.windows.net/dns3archive"
mkdir -p ./datasets/
URL="$AZURE_URL/datasets-interspeech2020.tar.bz2"
echo "Download: $BLOB"
# DRY RUN: print HTTP header WITHOUT downloading the files
curl -s -I "$URL"
# Actually download the archive - UNCOMMENT when ready to download
# curl "$URL" -o "$BLOB"
# Same as above, but using wget
# wget "$URL" -O "$BLOB"
# Same, + unpack files on the fly
# curl "$URL" | tar -f - -x -j

View file

@@ -0,0 +1,90 @@
#!/usr/bin/bash
# ***** Datasets for ICASSP 2021 DNS Challenge 2 *****
# NOTE: This data is for the *PAST* challenge!
# Current DNS Challenge is ICASSP 2022 DNS Challenge 4, which
# has its own download script, `download-dns-challenge-4.sh`
# NOTE: Before downloading, make sure you have enough space
# on your local storage!
# In all, you will need at least 230GB to store UNPACKED data.
# Archived, the same data takes 155GB total.
# Please comment out the files you don't need before launching
# the script.
# NOTE: By default, the script *DOES NOT* DOWNLOAD ANY FILES!
# Please scroll down and edit this script to pick the
# downloading method that works best for you.
# -------------------------------------------------------------
# The directory structure of the unpacked data is:
# datasets 229G
# +-- clean 204G
# | +-- emotional_speech 403M
# | +-- french_data 21G
# | +-- german_speech 66G
# | +-- italian_speech 14G
# | +-- mandarin_speech 21G
# | +-- read_speech 61G
# | +-- russian_speech 5.1G
# | +-- singing_voice 979M
# | \-- spanish_speech 17G
# +-- dev_testset 211M
# +-- impulse_responses 4.3G
# | +-- SLR26 2.1G
# | \-- SLR28 2.3G
# \-- noise 20G
BLOB_NAMES=(
# DEMAND dataset
DEMAND.tar.bz2
# Wideband clean speech
datasets/datasets.clean.read_speech.tar.bz2
# Wideband emotional speech
datasets/datasets.clean.emotional_speech.tar.bz2
# Wideband non-English clean speech
datasets/datasets.clean.french_data.tar.bz2
datasets/datasets.clean.german_speech.tar.bz2
datasets/datasets.clean.italian_speech.tar.bz2
datasets/datasets.clean.mandarin_speech.tar.bz2
datasets/datasets.clean.russian_speech.tar.bz2
datasets/datasets.clean.singing_voice.tar.bz2
datasets/datasets.clean.spanish_speech.tar.bz2
# Wideband noise, IR, and test data
datasets/datasets.impulse_responses.tar.bz2
datasets/datasets.noise.tar.bz2
datasets/datasets.dev_testset.tar.bz2
)
###############################################################
AZURE_URL="https://dns3public.blob.core.windows.net/dns3archive"
mkdir -p ./datasets
for BLOB in "${BLOB_NAMES[@]}"
do
URL="$AZURE_URL/$BLOB"
echo "Download: $BLOB"
# DRY RUN: print HTTP headers WITHOUT downloading the files
curl -s -I "$URL" | head -n 1
# Actually download the files - UNCOMMENT when ready to download
# curl "$URL" -o "$BLOB"
# Same as above, but using wget
# wget "$URL" -O "$BLOB"
# Same, + unpack files on the fly
# curl "$URL" | tar -f - -x -j
done
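
A hedged alternative, not part of the committed script: transfers of this size are often interrupted, so a resumable loop body may be safer. The sketch below reuses the script's own AZURE_URL and BLOB_NAMES but swaps in wget's -c (continue) flag and a bzip2 integrity check before unpacking; these additions are assumptions, not the script's method:

# Sketch (assumed alternative loop, not in this PR): resume partial
# downloads and only unpack archives that arrive intact.
for BLOB in "${BLOB_NAMES[@]}"
do
    URL="$AZURE_URL/$BLOB"
    # -c resumes an interrupted download of "$BLOB" if one exists.
    wget -c "$URL" -O "$BLOB" || { echo "Failed: $BLOB" >&2; continue; }
    # bzip2 -t verifies the archive before extraction.
    bzip2 -t "$BLOB" && tar -xjf "$BLOB"
done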