Test set must be handled a bit differently due to the different stm files

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/nshmyrev@3843 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Nickolay V. Shmyrev 2014-04-05 21:15:43 +00:00
Родитель 16dee468f3
Коммит 02cc48b327
1 изменённый файл: 10 добавлений и 1 удаление

Просмотреть файл

@@ -13,11 +13,20 @@ dir=data/$set
# Build $dir/stm.txt, $dir/text and $dir/segments for one data set from the
# TEDLIUM release 1 STM transcripts. $set and $dir are assigned before this
# fragment (dir=data/$set).

# Create the output directory (and any missing parents).
mkdir -p $dir
# Concatenate every .stm file of the set and remove the space between an
# upper-case letter and a following apostrophe ("X '" -> "X'").
# NOTE(review): the next command writes the same $dir/stm.txt with '>', so the
# output of this line is overwritten immediately. It looks like the pre-change
# side of the diff hunk -- confirm before relying on it.
cat db/TEDLIUM_release1/$set/stm/*.stm | sed "s:\([A-Z]\) ':\1':g" > $dir/stm.txt
# Same as above, but lines containing "ignore_time_segment_in_scoring" are
# dropped before the apostrophe fix-up.
cat db/TEDLIUM_release1/$set/stm/*.stm | grep -v ignore_time_segment_in_scoring |
sed "s:\([A-Z]\) ':\1':g" > $dir/stm.txt
# The two branches differ only in how the first column (the key) of
# $dir/text is produced. Common steps: remove "<sil>" tokens, remove a single
# digit in literal parentheses such as "(2)", map '{' '}' to '[' ']', delete
# any remaining parentheses, sort, and filter through local/join_suffix.py
# together with the TEDLIUM.150K.dic dictionary.
if [ "$set" = "train" ] ; then
# train: the key is the last field of each line; fields 7..NF-1 follow it
# (the loop stops before NF so the key is not repeated).
# Presumably the last field already holds an utterance id -- verify against
# the train STM files.
cat $dir/stm.txt | sed 's:<sil>::g' | sed 's:([0-9])::g' |
awk '{printf ("%s ", $NF); for (i=7;i<NF;i++) printf("%s ", $i); printf("\n");}' |
tr '{}' '[]' | tr -d '()' | sort | local/join_suffix.py db/TEDLIUM_release1/TEDLIUM.150K.dic > $dir/text
else
# other sets: the key is built as "<field1>-<field4>-<field5>"; fields 7..NF
# (including the last one) follow it.
# Presumably fields 1, 4 and 5 are the recording name and the segment
# start/end times -- verify against the STM format.
cat $dir/stm.txt | sed 's:<sil>::g' | sed 's:([0-9])::g' |
awk '{printf ("%s-%s-%s ", $1, $4, $5); for (i=7;i<=NF;i++) printf("%s ", $i); printf("\n");}' |
tr '{}' '[]' | tr -d '()' | sort | local/join_suffix.py db/TEDLIUM_release1/TEDLIUM.150K.dic > $dir/text
fi
# Take the first space-separated column of $dir/text (the key), split it on
# '-' and write "<key> <part1> <part2> <part3>" to $dir/segments.
# NOTE(review): a key whose first component itself contains '-' would shift
# the three parts -- confirm that keys never do.
cat $dir/text | cut -d" " -f 1 | awk -F"-" '{print $0, $1, $2, $3}' > $dir/segments