зеркало из https://github.com/mozilla/kaldi.git
Test set must be handled a bit differently due to the different stm files
git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/nshmyrev@3843 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Родитель
16dee468f3
Коммит
02cc48b327
|
@@ -13,11 +13,20 @@ dir=data/$set
|
|||
|
||||
mkdir -p $dir
|
||||
|
||||
cat db/TEDLIUM_release1/$set/stm/*.stm | sed "s:\([A-Z]\) ':\1':g" > $dir/stm.txt
|
||||
cat db/TEDLIUM_release1/$set/stm/*.stm | grep -v ignore_time_segment_in_scoring |
|
||||
sed "s:\([A-Z]\) ':\1':g" > $dir/stm.txt
|
||||
|
||||
# Build the transcript file $dir/text from $dir/stm.txt.
# The train set and every other set (e.g. test) take the leading token of each
# output line from different stm fields, so each case has its own pipeline.
|
||||
if [ "$set" = "train" ] ; then
|
||||
# train: remove "<sil>" and single-digit "(N)" tags, print the LAST stm field
# first, followed by fields 7..NF-1; then turn {} into [], delete any
# remaining parentheses, sort, and pass the result through
# local/join_suffix.py together with the TEDLIUM dictionary.
cat $dir/stm.txt | sed 's:<sil>::g' | sed 's:([0-9])::g' |
|
||||
awk '{printf ("%s ", $NF); for (i=7;i<NF;i++) printf("%s ", $i); printf("\n");}' |
|
||||
tr '{}' '[]' | tr -d '()' | sort | local/join_suffix.py db/TEDLIUM_release1/TEDLIUM.150K.dic > $dir/text
|
||||
else
|
||||
# other sets: same cleanup, but the leading token is assembled as
# "<field1>-<field4>-<field5>" and the words are fields 7..NF (the last
# field is kept as a word here, unlike in the train branch).
cat $dir/stm.txt | sed 's:<sil>::g' | sed 's:([0-9])::g' |
|
||||
awk '{printf ("%s-%s-%s ", $1, $4, $5); for (i=7;i<=NF;i++) printf("%s ", $i); printf("\n");}' |
|
||||
tr '{}' '[]' | tr -d '()' | sort | local/join_suffix.py db/TEDLIUM_release1/TEDLIUM.150K.dic > $dir/text
|
||||
fi
|
||||
|
||||
|
||||
# Write $dir/segments: take the first space-separated token of every line in
# $dir/text, split it on "-", and print the whole token followed by its first
# three dash-separated parts.
# NOTE(review): this relies on the token having the form a-b-c with no extra
# "-" inside the first part — confirm against the ids produced above.
cat $dir/text | cut -d" " -f 1 | awk -F"-" '{print $0, $1, $2, $3}' > $dir/segments
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче