Updated a few scripts to use new arpa2fst options

This commit is contained in:
kkm 2016-03-31 18:20:29 -07:00
Родитель 33b6d6f4b0
Коммит 64ca6aed76
7 изменённых файлов: 28 добавлений и 109 удалений

Просмотреть файл

@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/bash
#
if [ -f path.sh ]; then . path.sh; fi
@ -15,25 +15,12 @@ arpa_lm=$1
cp -r data/lang data/lang_test
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list. Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=data/lang_test/words.txt - >data/lang_test/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
## Check lexicon.
## just have a look and make sure it seems sane.
@ -61,4 +48,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
fstisstochastic || echo LG is not stochastic
echo AMI_format_data succeeded.

Просмотреть файл

@ -18,40 +18,23 @@ fi
lm_dir=$1
tmpdir=data/local/lm_tmp
lexicon=data/local/lang_tmp/lexiconp.txt
mkdir -p $tmpdir
# This loop was taken verbatim from wsj_format_data.sh, and I'm leaving it in place in
# case we decide to add more language models at some point
for lm_suffix in tgpr; do
test=data/lang_test_${lm_suffix}
mkdir -p $test
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones oov.txt oov.int; do
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones topo oov.txt oov.int; do
cp -r data/lang/$f $test
done
gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz |\
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt || exit 1
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in an LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at the beginning/end of an utterance. These can
# cause determinization failures of CLG [ends up being epsilon cycles].
gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - > $test/G.fst
utils/validate_lang.pl $test || exit 1;
done
echo "Succeeded in formatting data."
rm -r $tmpdir
exit 0

Просмотреть файл

@ -49,24 +49,9 @@ for lm_suffix in tgsmall tgmed; do
test=${src_dir}_test_${lm_suffix}
mkdir -p $test
cp -r ${src_dir}/* $test
gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz |\
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt || exit 1
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other
# similar things in an LM from Geoff. Removing all "illegal" combinations of
# <s> and </s>, which are supposed to occur only at the beginning/end of an
# utterance. These can cause determinization failures of CLG [ends up being
# epsilon cycles].
gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - > $test/G.fst
utils/validate_lang.pl --skip-determinization-check $test || exit 1;
done

Просмотреть файл

@ -27,7 +27,7 @@ tmpdir=data/local/lm_tmp
lexicon=data/local/lang${lang_suffix}_tmp/lexiconp.txt
mkdir -p $tmpdir
for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
mkdir -p data/$x
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
cp $srcdir/$x.txt data/$x/text || exit 1;
@ -49,22 +49,8 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
cp -r data/lang${lang_suffix}/* $test || exit 1;
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in an LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at the beginning/end of an utterance. These can
# cause determinization failures of CLG [ends up being epsilon cycles].
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - > $test/G.fst
utils/validate_lang.pl --skip-determinization-check $test || exit 1;
done

Просмотреть файл

@ -45,17 +45,13 @@ fi
# Be careful: this time we dispense with the grep -v '<s> <s>' so this might
# not work for LMs generated from all toolkits.
gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang${lang_suffix}_test_bd_tgpr/G.fst || exit 1;
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$lang/words.txt - > data/lang${lang_suffix}_test_bd_tgpr/G.fst || exit 1;
fstisstochastic data/lang${lang_suffix}_test_bd_tgpr/G.fst
gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang${lang_suffix}_test_bd_tg/G.fst || exit 1;
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$lang/words.txt - > data/lang${lang_suffix}_test_bd_tg/G.fst || exit 1;
fstisstochastic data/lang${lang_suffix}_test_bd_tg/G.fst
# Build ConstArpaLm for the unpruned language model.
@ -65,10 +61,8 @@ gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
--unk-symbol=$unk - data/lang${lang_suffix}_test_bd_tgconst/G.carpa || exit 1
gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang${lang_suffix}_test_bd_fg/G.fst || exit 1;
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$lang/words.txt - > data/lang${lang_suffix}_test_bd_fg/G.fst || exit 1;
fstisstochastic data/lang${lang_suffix}_test_bd_fg/G.fst
# Build ConstArpaLm for the unpruned language model.
@ -78,10 +72,8 @@ gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \
--unk-symbol=$unk - data/lang${lang_suffix}_test_bd_fgconst/G.carpa || exit 1
gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang${lang_suffix}_test_bd_fgpr/G.fst || exit 1;
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$lang/words.txt - > data/lang${lang_suffix}_test_bd_fgpr/G.fst || exit 1;
fstisstochastic data/lang${lang_suffix}_test_bd_fgpr/G.fst
exit 0;

Просмотреть файл

@ -111,10 +111,8 @@ while read line; do
if (invoc[$x]) { printf("%s ", $x); } else { printf("%s ", oov); } }
printf("\n"); }' > $wdir/text
ngram-count -text $wdir/text -order $ngram_order "$srilm_options" -lm - |\
arpa2fst - | fstprint | utils/eps2disambig.pl | utils/s2eps.pl |\
fstcompile --isymbols=$lang/words.txt --osymbols=$lang/words.txt \
--keep_isymbols=false --keep_osymbols=false |\
fstrmepsilon | fstarcsort --sort_type=ilabel > $wdir/G.fst || exit 1;
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$lang/words.txt - > $wdir/G.fst || exit 1;
fi
fstisstochastic $wdir/G.fst || echo "$0: $uttid/G.fst not stochastic."
@ -134,7 +132,7 @@ while read line; do
make-h-transducer --disambig-syms-out=$wdir/disambig_tid.int \
--transition-scale=$tscale $wdir/ilabels_${N}_${P} \
$model_dir/tree $model_dir/final.mdl > $wdir/Ha.fst
$model_dir/tree $model_dir/final.mdl > $wdir/Ha.fst
# Builds HCLGa.fst
fsttablecompose $wdir/Ha.fst $wdir/CLG.fst | \
@ -143,10 +141,10 @@ while read line; do
fstminimizeencoded > $wdir/HCLGa.fst
fstisstochastic $wdir/HCLGa.fst ||\
echo "$0: $uttid/HCLGa.fst is not stochastic"
add-self-loops --self-loop-scale=$loopscale --reorder=true \
$model_dir/final.mdl < $wdir/HCLGa.fst > $wdir/HCLG.fst
if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then
fstisstochastic $wdir/HCLG.fst ||\
echo "$0: $uttid/HCLG.fst is not stochastic."

Просмотреть файл

@ -39,20 +39,9 @@ for f in phones.txt words.txt L.fst L_disambig.fst phones/; do
done
lm_base=$(basename $lm '.gz')
gunzip -c $lm | utils/find_arpa_oovs.pl $out_dir/words.txt \
> $out_dir/oovs_${lm_base}.txt
# Removing all "illegal" combinations of <s> and </s>, which are supposed to
# occur only at the beginning/end of an utterance. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
gunzip -c $lm \
| egrep -v '<s> <s>|</s> <s>|</s> </s>' \
| arpa2fst - | fstprint \
| utils/remove_oovs.pl $out_dir/oovs_${lm_base}.txt \
| utils/eps2disambig.pl | utils/s2eps.pl \
| fstcompile --isymbols=$out_dir/words.txt --osymbols=$out_dir/words.txt \
--keep_isymbols=false --keep_osymbols=false \
| fstrmepsilon | fstarcsort --sort_type=ilabel > $out_dir/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$out_dir/words.txt - > $out_dir/G.fst
set +e
fstisstochastic $out_dir/G.fst
set -e
@ -66,7 +55,7 @@ set -e
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
mkdir -p $out_dir/tmpdir.g
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }}
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }}
END{print "0 0 #0 #0"; print "0";}' \
< "$lexicon" > $out_dir/tmpdir.g/select_empty.fst.txt