Added scripts and results for wsj/s1 lattice oracle error rate.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@410 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Gilles Boulianne 2011-08-26 12:38:11 +00:00
Родитель fa573297c2
Коммит b1dded704c
3 изменённых файлов: 72 добавлений и 1 удалений

Просмотреть файл

@ -110,7 +110,13 @@ Test set: | Eval92 | Eval93 | Eval92 | Eval93 | Eval92 | Eval93 |
latrescore| 12.6 18.7 14.3 21.0 12.1 18.2
[beam=15]| 12.5 18.7 14.3 20.9 11.7 18.2
# Oracle lattice results:
On Eval92 lattices generated with a bigram LM (from exp/decode_tri2a_bg_latgen_eval92)
Lattices pruned with varying beam values:
beam=0.01 %WER 16.13 [ 910 / 5641, 213 ins, 59 del, 638 sub ]
beam=1 %WER 13.07 [ 737 / 5641, 171 ins, 46 del, 520 sub ]
beam=5 %WER 6.38 [ 360 / 5641, 86 ins, 20 del, 254 sub ]
beam=10 %WER 4.11 [ 232 / 5641, 53 ins, 9 del, 170 sub ]
# Raw results:
exp/decode_mono_tgpr_eval92/wer:%WER 31.38 [ 1770 / 5641, 108 ins, 386 del, 1276 sub ]

Просмотреть файл

@ -216,6 +216,7 @@ steps/train_tri2a.sh || exit 1;
for year in 92 93; do
scripts/decode.sh exp/decode_tri2a_bg_eval${year} exp/graph_tri2a_bg/HCLG.fst steps/decode_tri2a.sh data/eval_nov${year}.scp
scripts/decode.sh exp/decode_tri2a_bg_latgen_eval${year} exp/graph_tri2a_bg/HCLG.fst steps/decode_tri2a_latgen.sh data/eval_nov${year}.scp
scripts/latoracle.sh exp/decode_tri2a_bg_latgen_eval${year} data/eval_nov${year}.txt exp/decode_tri2a_bg_latoracle_eval${year}
scripts/latrescore.sh exp/decode_tri2a_bg_latgen_eval${year} data/G_bg.fst data/G_tg.fst data/eval_nov${year}.txt exp/decode_tri2a_bg_rescore_tg_eval${year}
scripts/latrescore.sh exp/decode_tri2a_bg_latgen_eval${year} data/G_bg.fst data/G_tg_pruned.fst data/eval_nov${year}.txt exp/decode_tri2a_bg_rescore_tg_pruned_eval${year}
scripts/latrescore.sh exp/decode_tri2a_bg_latgen_eval${year} data/G_bg.fst data/G_bg.fst data/eval_nov${year}.txt exp/decode_tri2a_bg_rescore_bg_eval${year}

64
egs/wsj/s1/scripts/latoracle.sh Executable file
Просмотреть файл

@ -0,0 +1,64 @@
#!/bin/bash
# Copyright 2011 Microsoft Corporation1 Gilles Boulianne
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Compute lattice oracle word error rates for an existing lattice decode.
# Arguments:
#   $1  input decode directory (its *.lats.gz files are read)
#   $2  reference transcript text file
#   $3  output directory for filtered references, oracle output and logs
if [ $# != 3 ]; then
  echo "Usage: scripts/latoracle.sh <input-decode-dir> <transcript-text-file> <output-decode-dir>"
  exit 1;
fi

. path.sh || exit 1;

inputdir=$1    # e.g. /pub/tmp/kaldi2011/dpovey/decode_tri1_latgen/
transcript=$2  # e.g. data_prep/test_sep92_trans.txt
dir=$3         # e.g. exp/decode_tri1_latgen

# Fail early with a clear message instead of letting later stages run on
# missing inputs and silently produce empty results.
if [ ! -d "$inputdir" ]; then
  echo "$0: input decode directory '$inputdir' does not exist" >&2
  exit 1
fi
if [ ! -r "$transcript" ]; then
  echo "$0: transcript file '$transcript' is not readable" >&2
  exit 1
fi

mkdir -p "$dir" || exit 1
# Create reference transcriptions and lattices.
# The <NOISE> and <SPOKEN_NOISE> tokens are removed from the reference text;
# the filtered text is written to $dir/test_trans.filt.
sed -e 's:<NOISE>::g' -e 's:<SPOKEN_NOISE>::g' "$transcript" \
  > "$dir/test_trans.filt" || exit 1

# Convert the filtered text to integer symbols using data/words.txt and feed
# it to string-to-lattice, which writes $dir/test_trans.lats.
# pipefail is enabled only inside the subshell so that a failure in either
# stage is detected without changing shell options for the rest of the script.
(
  set -o pipefail
  scripts/sym2int.pl --ignore-first-field data/words.txt \
    < "$dir/test_trans.filt" \
    | string-to-lattice "ark:$dir/test_trans.lats" \
      2> "$dir/string-to-lattice.log"
) || {
  echo "$0: failed to create reference lattices, see $dir/string-to-lattice.log" >&2
  exit 1
}

# Symbols that don't count as errors
printf '%s\n' '<s>' '</s>' '<UNK>' > "$dir/ignore.txt" || exit 1
# Loop over pruning beams.
# For each beam: prune the input lattices, run lattice-oracle against the
# reference lattices, then score the oracle output with compute-wer.
inv_acwt=10
# The acoustic scale passed to lattice-prune is the reciprocal of inv_acwt.
acwt=$(perl -e "print (1.0/$inv_acwt);") || exit 1

for beam in 0.01 1 5 10; do
  echo "Pruning $inputdir/*.lats.gz with invacwt=$inv_acwt and beam=$beam"
  # NOTE: $dir/lats.pruned.gz is overwritten on every iteration; only the
  # per-beam logs, .tra files and wer_* files are kept.
  lattice-prune --acoustic-scale="$acwt" --beam="$beam" \
    "ark:gunzip -c $inputdir/*.lats.gz|" "ark,t:|gzip -c>$dir/lats.pruned.gz" \
    2> "$dir/prune.$beam.log" || {
    # Stop here: continuing would score a stale or empty pruned archive.
    echo "$0: lattice-prune failed for beam=$beam, see $dir/prune.$beam.log" >&2
    exit 1
  }

  echo "Computing oracle error rate w/r $transcript"
  lattice-oracle --word-symbol-table=data/words.txt \
    --wildcard-symbols-list="$dir/ignore.txt" \
    "ark:$dir/test_trans.lats" "ark:gunzip -c $dir/lats.pruned.gz|" \
    "ark,t:$dir/oracle${beam}.tra" \
    2> "$dir/oracle.$beam.log" || {
    echo "$0: lattice-oracle failed for beam=$beam, see $dir/oracle.$beam.log" >&2
    exit 1
  }

  # Map the oracle output back to words, strip the ignored symbols and score
  # it against the filtered reference text; the WER line is shown and saved.
  # pipefail is limited to the subshell so a failing stage is not masked by
  # tee's exit status.
  (
    set -o pipefail
    scripts/int2sym.pl --ignore-first-field data/words.txt \
      < "$dir/oracle${beam}.tra" \
      | sed -e 's:<s>::' -e 's:</s>::' -e 's:<UNK>::g' \
      | compute-wer --text --mode=present "ark:$dir/test_trans.filt" ark,p:- \
      | tee "$dir/wer_${beam}"
  ) || {
    echo "$0: scoring failed for beam=$beam" >&2
    exit 1
  }
done