diff --git a/egs/wsj/s1/RESULTS b/egs/wsj/s1/RESULTS index 23949745c..0d0d22d8f 100644 --- a/egs/wsj/s1/RESULTS +++ b/egs/wsj/s1/RESULTS @@ -110,7 +110,13 @@ Test set: | Eval92 | Eval93 | Eval92 | Eval93 | Eval92 | Eval93 | latrescore| 12.6 18.7 14.3 21.0 12.1 18.2 [beam=15]| 12.5 18.7 14.3 20.9 11.7 18.2 - +# Oracle lattice results: +On Eval92 lattices generated with a bigram LM (from exp/decode_tri2a_bg_latgen_eval92) +Lattices pruned with varying beam values: +beam=0.01 %WER 16.13 [ 910 / 5641, 213 ins, 59 del, 638 sub ] +beam=1 %WER 13.07 [ 737 / 5641, 171 ins, 46 del, 520 sub ] +beam=5 %WER 6.38 [ 360 / 5641, 86 ins, 20 del, 254 sub ] +beam=10 %WER 4.11 [ 232 / 5641, 53 ins, 9 del, 170 sub ] # Raw results: exp/decode_mono_tgpr_eval92/wer:%WER 31.38 [ 1770 / 5641, 108 ins, 386 del, 1276 sub ] diff --git a/egs/wsj/s1/run.sh b/egs/wsj/s1/run.sh index 1034b5592..2b9624aee 100644 --- a/egs/wsj/s1/run.sh +++ b/egs/wsj/s1/run.sh @@ -216,6 +216,7 @@ steps/train_tri2a.sh || exit 1; for year in 92 93; do scripts/decode.sh exp/decode_tri2a_bg_eval${year} exp/graph_tri2a_bg/HCLG.fst steps/decode_tri2a.sh data/eval_nov${year}.scp scripts/decode.sh exp/decode_tri2a_bg_latgen_eval${year} exp/graph_tri2a_bg/HCLG.fst steps/decode_tri2a_latgen.sh data/eval_nov${year}.scp + scripts/latoracle.sh exp/decode_tri2a_bg_latgen_eval${year} data/eval_nov${year}.txt exp/decode_tri2a_bg_latoracle_eval${year} scripts/latrescore.sh exp/decode_tri2a_bg_latgen_eval${year} data/G_bg.fst data/G_tg.fst data/eval_nov${year}.txt exp/decode_tri2a_bg_rescore_tg_eval${year} scripts/latrescore.sh exp/decode_tri2a_bg_latgen_eval${year} data/G_bg.fst data/G_tg_pruned.fst data/eval_nov${year}.txt exp/decode_tri2a_bg_rescore_tg_pruned_eval${year} scripts/latrescore.sh exp/decode_tri2a_bg_latgen_eval${year} data/G_bg.fst data/G_bg.fst data/eval_nov${year}.txt exp/decode_tri2a_bg_rescore_bg_eval${year} diff --git a/egs/wsj/s1/scripts/latoracle.sh b/egs/wsj/s1/scripts/latoracle.sh new file mode 100755 index 
#!/bin/bash
# Copyright 2011 Microsoft Corporation1 Gilles Boulianne

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Computes oracle word error rates for a directory of decoded lattices.
#
# For each pruning beam in a fixed list, the lattices found in the input
# directory are pruned with lattice-prune, the oracle path against the
# reference transcript is extracted with lattice-oracle, and the result is
# scored with compute-wer.
#
# Arguments:
#   $1  input decode directory containing *.lats.gz
#   $2  reference transcript (text; first field is skipped by sym2int.pl)
#   $3  output directory (created if missing)
#
# Reads:   path.sh, data/words.txt, scripts/sym2int.pl, scripts/int2sym.pl
# Writes (all under $3):
#   test_trans.filt, test_trans.lats, ignore.txt, lats.pruned.gz (overwritten
#   on every beam), oracle<beam>.tra, wer_<beam>, and one log file per tool.
# Exit status: 1 on bad usage or when any step fails.

# Make a pipeline fail when any of its stages fails, not only the last one;
# otherwise a crashed tool upstream of `tee` would go unnoticed.
set -o pipefail

if [ $# != 3 ]; then
  echo "Usage: scripts/latoracle.sh <input-decode-dir> <transcript> <output-dir>"
  exit 1;
fi

. path.sh || exit 1;

inputdir=$1     # e.g. /pub/tmp/kaldi2011/dpovey/decode_tri1_latgen/
transcript=$2   # e.g. data_prep/test_sep92_trans.txt
dir=$3          # e.g. exp/decode_tri1_latgen

mkdir -p "$dir" || exit 1

# Create reference transcriptions and lattices.
# NOTE(review): the two noise markers below were reconstructed; the patterns
# were empty in the reviewed copy (an empty regex makes sed abort). Confirm
# they match the markers used in the transcripts.
sed -e 's:<NOISE>::g' -e 's:<SPOKEN_NOISE>::g' "$transcript" \
  > "$dir/test_trans.filt" || exit 1

scripts/sym2int.pl --ignore-first-field data/words.txt \
  < "$dir/test_trans.filt" \
  | string-to-lattice "ark:$dir/test_trans.lats" \
      2>"$dir/string-to-lattice.log" || exit 1

# Symbols that don't count as errors (passed as --wildcard-symbols-list).
# NOTE(review): symbol names reconstructed (they were empty strings in the
# reviewed copy); confirm against data/words.txt.
printf '%s\n' '<s>' '</s>' '<UNK>' > "$dir/ignore.txt" || exit 1

# Loop over pruning beams.
inv_acwt=10
acwt=$(perl -e "print (1.0/$inv_acwt);")
for beam in 0.01 1 5 10; do

  echo "Pruning $inputdir"'/*.lats.gz'" with invacwt=$inv_acwt and beam=$beam"
  # The *.lats.gz glob is intentionally left inside the quoted specifier: it
  # is part of the piped command string handed to the tool, not expanded by
  # this script.
  lattice-prune --acoustic-scale="$acwt" --beam="$beam" \
    "ark:gunzip -c $inputdir/*.lats.gz|" \
    "ark,t:|gzip -c>$dir/lats.pruned.gz" \
    2>"$dir/prune.$beam.log" || exit 1

  echo "Computing oracle error rate w/r $transcript"
  lattice-oracle --word-symbol-table=data/words.txt \
    --wildcard-symbols-list="$dir/ignore.txt" \
    "ark:$dir/test_trans.lats" \
    "ark:gunzip -c $dir/lats.pruned.gz|" \
    "ark,t:$dir/oracle${beam}.tra" \
    2>"$dir/oracle.$beam.log" || exit 1

  # Map the oracle word ids back to words, drop the ignored symbols, and
  # score against the filtered reference; the WER is shown and saved.
  scripts/int2sym.pl --ignore-first-field data/words.txt \
    < "$dir/oracle${beam}.tra" \
    | sed -e 's:<s>::' -e 's:</s>::' -e 's:<UNK>::g' \
    | compute-wer --text --mode=present \
        "ark:$dir/test_trans.filt" ark,p:- \
    | tee "$dir/wer_${beam}" || exit 1
done