зеркало из https://github.com/mozilla/kaldi.git
Updated a few scripts to use new arpa2fst options
This commit is contained in:
Родитель
33b6d6f4b0
Коммит
64ca6aed76
|
@ -1,4 +1,4 @@
|
|||
#!/bin/bash
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
if [ -f path.sh ]; then . path.sh; fi
|
||||
|
@ -15,25 +15,12 @@ arpa_lm=$1
|
|||
|
||||
cp -r data/lang data/lang_test
|
||||
|
||||
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
|
||||
# LM doesn't have these "invalid combinations". These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
|
||||
# our word list. Since our LM doesn't have any, we just give it
|
||||
# /dev/null [we leave it in the script to show how you'd do it].
|
||||
gunzip -c "$arpa_lm" | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl /dev/null | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
|
||||
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=data/lang_test/words.txt - >data/lang_test/G.fst
|
||||
|
||||
echo "Checking how stochastic G is (the first of these numbers should be small):"
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
|
||||
## Check lexicon.
|
||||
## just have a look and make sure it seems sane.
|
||||
|
@ -61,4 +48,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
|
|||
fstisstochastic || echo LG is not stochastic
|
||||
|
||||
echo AMI_format_data succeeded.
|
||||
|
||||
|
|
|
@ -18,40 +18,23 @@ fi
|
|||
|
||||
lm_dir=$1
|
||||
|
||||
tmpdir=data/local/lm_tmp
|
||||
lexicon=data/local/lang_tmp/lexiconp.txt
|
||||
mkdir -p $tmpdir
|
||||
|
||||
# This loop was taken verbatim from wsj_format_data.sh, and I'm leaving it in place in
|
||||
# case we decide to add more language models at some point
|
||||
for lm_suffix in tgpr; do
|
||||
test=data/lang_test_${lm_suffix}
|
||||
mkdir -p $test
|
||||
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones oov.txt oov.int; do
|
||||
for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones topo oov.txt oov.int; do
|
||||
cp -r data/lang/$f $test
|
||||
done
|
||||
gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz |\
|
||||
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt || exit 1
|
||||
|
||||
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
||||
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
||||
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
|
||||
# which are supposed to occur only at beginning/end of utt.  These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$test/words.txt - > $test/G.fst
|
||||
|
||||
utils/validate_lang.pl $test || exit 1;
|
||||
done
|
||||
|
||||
echo "Succeeded in formatting data."
|
||||
rm -r $tmpdir
|
||||
|
||||
exit 0
|
||||
|
|
|
@ -49,24 +49,9 @@ for lm_suffix in tgsmall tgmed; do
|
|||
test=${src_dir}_test_${lm_suffix}
|
||||
mkdir -p $test
|
||||
cp -r ${src_dir}/* $test
|
||||
gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz |\
|
||||
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt || exit 1
|
||||
|
||||
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
||||
# stuff in it with multiple <s>'s in the history. Encountered some other
|
||||
# similar things in a LM from Geoff. Removing all "illegal" combinations of
|
||||
# <s> and </s>, which are supposed to occur only at beginning/end of utt.  These
|
||||
# can cause determinization failures of CLG [ends up being epsilon cycles].
|
||||
gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$test/words.txt - > $test/G.fst
|
||||
utils/validate_lang.pl --skip-determinization-check $test || exit 1;
|
||||
done
|
||||
|
||||
|
|
|
@ -27,7 +27,7 @@ tmpdir=data/local/lm_tmp
|
|||
lexicon=data/local/lang${lang_suffix}_tmp/lexiconp.txt
|
||||
mkdir -p $tmpdir
|
||||
|
||||
for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
|
||||
for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
|
||||
mkdir -p data/$x
|
||||
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
|
||||
cp $srcdir/$x.txt data/$x/text || exit 1;
|
||||
|
@ -49,22 +49,8 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
|
|||
cp -r data/lang${lang_suffix}/* $test || exit 1;
|
||||
|
||||
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
|
||||
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
|
||||
|
||||
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
||||
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
||||
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
|
||||
# which are supposed to occur only at beginning/end of utt.  These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$test/words.txt - > $test/G.fst
|
||||
|
||||
utils/validate_lang.pl --skip-determinization-check $test || exit 1;
|
||||
done
|
||||
|
|
|
@ -45,17 +45,13 @@ fi
|
|||
# Be careful: this time we dispense with the grep -v '<s> <s>' so this might
|
||||
# not work for LMs generated from all toolkits.
|
||||
gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
|
||||
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang${lang_suffix}_test_bd_tgpr/G.fst || exit 1;
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$lang/words.txt - > data/lang${lang_suffix}_test_bd_tgpr/G.fst || exit 1;
|
||||
fstisstochastic data/lang${lang_suffix}_test_bd_tgpr/G.fst
|
||||
|
||||
gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
|
||||
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang${lang_suffix}_test_bd_tg/G.fst || exit 1;
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$lang/words.txt - > data/lang${lang_suffix}_test_bd_tg/G.fst || exit 1;
|
||||
fstisstochastic data/lang${lang_suffix}_test_bd_tg/G.fst
|
||||
|
||||
# Build ConstArpaLm for the unpruned language model.
|
||||
|
@ -65,10 +61,8 @@ gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
|
|||
--unk-symbol=$unk - data/lang${lang_suffix}_test_bd_tgconst/G.carpa || exit 1
|
||||
|
||||
gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
|
||||
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang${lang_suffix}_test_bd_fg/G.fst || exit 1;
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$lang/words.txt - > data/lang${lang_suffix}_test_bd_fg/G.fst || exit 1;
|
||||
fstisstochastic data/lang${lang_suffix}_test_bd_fg/G.fst
|
||||
|
||||
# Build ConstArpaLm for the unpruned language model.
|
||||
|
@ -78,10 +72,8 @@ gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \
|
|||
--unk-symbol=$unk - data/lang${lang_suffix}_test_bd_fgconst/G.carpa || exit 1
|
||||
|
||||
gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
|
||||
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang${lang_suffix}_test_bd_fgpr/G.fst || exit 1;
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$lang/words.txt - > data/lang${lang_suffix}_test_bd_fgpr/G.fst || exit 1;
|
||||
fstisstochastic data/lang${lang_suffix}_test_bd_fgpr/G.fst
|
||||
|
||||
exit 0;
|
||||
|
|
|
@ -111,10 +111,8 @@ while read line; do
|
|||
if (invoc[$x]) { printf("%s ", $x); } else { printf("%s ", oov); } }
|
||||
printf("\n"); }' > $wdir/text
|
||||
ngram-count -text $wdir/text -order $ngram_order "$srilm_options" -lm - |\
|
||||
arpa2fst - | fstprint | utils/eps2disambig.pl | utils/s2eps.pl |\
|
||||
fstcompile --isymbols=$lang/words.txt --osymbols=$lang/words.txt \
|
||||
--keep_isymbols=false --keep_osymbols=false |\
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $wdir/G.fst || exit 1;
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$lang/words.txt - > $wdir/G.fst || exit 1;
|
||||
fi
|
||||
fstisstochastic $wdir/G.fst || echo "$0: $uttid/G.fst not stochastic."
|
||||
|
||||
|
@ -134,7 +132,7 @@ while read line; do
|
|||
|
||||
make-h-transducer --disambig-syms-out=$wdir/disambig_tid.int \
|
||||
--transition-scale=$tscale $wdir/ilabels_${N}_${P} \
|
||||
$model_dir/tree $model_dir/final.mdl > $wdir/Ha.fst
|
||||
$model_dir/tree $model_dir/final.mdl > $wdir/Ha.fst
|
||||
|
||||
# Builds HCLGa.fst
|
||||
fsttablecompose $wdir/Ha.fst $wdir/CLG.fst | \
|
||||
|
@ -143,10 +141,10 @@ while read line; do
|
|||
fstminimizeencoded > $wdir/HCLGa.fst
|
||||
fstisstochastic $wdir/HCLGa.fst ||\
|
||||
echo "$0: $uttid/HCLGa.fst is not stochastic"
|
||||
|
||||
|
||||
add-self-loops --self-loop-scale=$loopscale --reorder=true \
|
||||
$model_dir/final.mdl < $wdir/HCLGa.fst > $wdir/HCLG.fst
|
||||
|
||||
|
||||
if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then
|
||||
fstisstochastic $wdir/HCLG.fst ||\
|
||||
echo "$0: $uttid/HCLG.fst is not stochastic."
|
||||
|
|
|
@ -39,20 +39,9 @@ for f in phones.txt words.txt L.fst L_disambig.fst phones/; do
|
|||
done
|
||||
|
||||
lm_base=$(basename $lm '.gz')
|
||||
gunzip -c $lm | utils/find_arpa_oovs.pl $out_dir/words.txt \
|
||||
> $out_dir/oovs_${lm_base}.txt
|
||||
|
||||
# Removing all "illegal" combinations of <s> and </s>, which are supposed to
|
||||
# occur only at beginning/end of utt. These can cause determinization failures
|
||||
# of CLG [ends up being epsilon cycles].
|
||||
gunzip -c $lm \
|
||||
| egrep -v '<s> <s>|</s> <s>|</s> </s>' \
|
||||
| arpa2fst - | fstprint \
|
||||
| utils/remove_oovs.pl $out_dir/oovs_${lm_base}.txt \
|
||||
| utils/eps2disambig.pl | utils/s2eps.pl \
|
||||
| fstcompile --isymbols=$out_dir/words.txt --osymbols=$out_dir/words.txt \
|
||||
--keep_isymbols=false --keep_osymbols=false \
|
||||
| fstrmepsilon | fstarcsort --sort_type=ilabel > $out_dir/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$out_dir/words.txt - > $out_dir/G.fst
|
||||
set +e
|
||||
fstisstochastic $out_dir/G.fst
|
||||
set -e
|
||||
|
@ -66,7 +55,7 @@ set -e
|
|||
# this might cause determinization failure of CLG.
|
||||
# #0 is treated as an empty word.
|
||||
mkdir -p $out_dir/tmpdir.g
|
||||
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }}
|
||||
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }}
|
||||
END{print "0 0 #0 #0"; print "0";}' \
|
||||
< "$lexicon" > $out_dir/tmpdir.g/select_empty.fst.txt
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче