Update arpa2fst invocations in individual egs/*/local scripts

This commit is contained in:
kkm 2016-04-10 20:15:08 -07:00
Родитель b77e93095b
Коммит 829432d05b
39 изменённых файлов: 177 добавлений и 581 удалений

Просмотреть файл

@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/bash
#
if [ -f path.sh ]; then . path.sh; fi
@ -10,26 +10,12 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
cp -rT data/lang data/lang_test
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list. Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
## Check lexicon.
## just have a look and make sure it seems sane.
@ -61,4 +47,3 @@ utils/build_const_arpa_lm.sh \
data/local/lm/4gram-mincount/lm_unpruned.gz data/lang data/lang_test_fg
echo "$0 succeeded"

Просмотреть файл

@ -21,7 +21,7 @@ tmpdir=data/local/lm_tmp
lexicon=data/local/lang_tmp/lexiconp.txt
mkdir -p $tmpdir
for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do
for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do
mkdir -p data/$x
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
cp $srcdir/$x.txt data/$x/text || exit 1;
@ -42,23 +42,9 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
cp -r data/lang/* $test
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - $test/G.fst
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at the beginning/end of an utterance. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
utils/validate_lang.pl --skip-determinization-check $test || exit 1;
done

Просмотреть файл

@ -39,14 +39,8 @@ destdir=$3
mkdir $destdir 2>/dev/null || true
gunzip -c $lmfile | \
grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
arpa2fst - | \
fstprint | \
utils/eps2disambig.pl | \
utils/s2eps.pl | \
fstcompile --isymbols=$langdir/words.txt \
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$langdir/words.txt - $destdir/G.fst || exit 1
fstisstochastic $destdir/G.fst || true
exit 0

Просмотреть файл

@ -22,7 +22,7 @@ silence_id=`grep -w SIL $langdir/words.txt | awk '{print $2}'` || exit 1;
[ -z $silence_id ] && echo Error getting silence-id from $langdir/words.txt && exit 1;
rho=$[$last_id+1]
# state 0 is start-state. state 1 is state after we saw silence. state 2 is
# state 0 is start-state. state 1 is state after we saw silence. state 2 is
# "dead state/failure state" that is not coaccessible.
cat <<EOF | fstcompile > $destdir/rho.fst
0 1 $silence_id $silence_id
@ -35,16 +35,11 @@ EOF
gunzip -c $lmfile | \
grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
sed 's/<unk>/<oov>/g' | \
arpa2fst - | \
fstprint | \
utils/eps2disambig.pl | \
utils/s2eps.pl | \
fstcompile --isymbols=$langdir/words.txt \
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
arpa2fst --disambig-symbol=#0 --ilabel-sort=false \
--read-symbol-table=$langdir/words.txt - | \
fstrhocompose "$rho" - $destdir/rho.fst | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
fstisstochastic $destdir/G.fst || true

Просмотреть файл

@ -8,7 +8,7 @@
# This script trains LMs on the WSJ LM-training data.
# It requires that you have already run wsj_extend_dict.sh,
# to get the larger-size dictionary including all of CMUdict
# plus any OOVs and possible acronyms that we could easily
# plus any OOVs and possible acronyms that we could easily
# derive pronunciations for.
# This script takes as command-line arguments the relevant data/lang
@ -69,7 +69,7 @@ cat $data/text | awk '{for (n=2;n<NF;n++) printf("%s ", $n); printf "\n";}' | \
gzip -c > $dir/train_in.gz || exit 1;
# Get training data with OOV words (w.r.t. our current vocab) replaced with <unk>.
echo "Getting training data with OOV words replaced with <unk> (train_nounk.gz)"
echo "Getting training data with OOV words replaced with <unk> (train_nounk.gz)"
gunzip -c $dir/train_in.gz | awk -v w=$dir/wordlist \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<unk> ";print ""}'|sed 's/ $//g' \
@ -93,7 +93,7 @@ gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<
# To save disk space, remove the un-mapped training data. We could
# easily generate it again if needed.
rm $dir/train_nounk.gz
rm $dir/train_nounk.gz
##################################################################
@ -177,7 +177,7 @@ prune_lm.sh --arpa 5.0 $dir/4gram
# The default LM chosen to be the last pruned 4gram-mincount
#
# Note: One can cheat and provide an external ARPA LM here!!!
# To do so, make sure that
# To do so, make sure that
# -- its vocabulary is fully covered by $lang/words.txt,
# -- it is gzipped and
# -- it is placed in the $dir directory.
@ -205,14 +205,9 @@ echo "Compiling $gzipped_ARPA_LM into $lang/G.fst"
. ./path.sh || exit 1;
gunzip -c $gzipped_ARPA_LM | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $lang/G.fst || exit 1;
fstisstochastic $lang/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$lang/words.txt - $lang/G.fst || exit 1;
fstisstochastic $lang/G.fst
##################################################################
# Redo the FST step after reviewing perplexities reported by the
@ -220,4 +215,3 @@ gunzip -c $gzipped_ARPA_LM | \
##################################################################
exit 0

Просмотреть файл

@ -38,7 +38,7 @@ if [ $# -ne 3 ]; then
fi
set -e #Exit on non-zero return code from any command
set -o pipefail #Exit if any of the commands in the pipeline will
set -o pipefail #Exit if any of the commands in the pipeline will
#return non-zero return code
lmfile=$1
@ -58,7 +58,7 @@ if [ ! -z "$oov_prob_file" ]; then
exit 1;
fi
min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0;
min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0;
while(<STDIN>) { if (m/\\(\d)-grams:/) { $order = $1; }
if ($order == 1) { @A = split;
if ($A[0] < $minlogprob && $A[0] != -99) { $minlogprob = $A[0]; }}} print $minlogprob')
@ -75,7 +75,7 @@ if [ ! -z "$oov_prob_file" ]; then
while(<STDIN>) {
if (m/^ngram 1=(\d+)/) { $n = $1 + $num_oovs; print "ngram 1=$n\n"; }
else { print; } # print all lines unchanged except the one that says ngram 1=X.
if (m/^\\1-grams:$/) {
if (m/^\\1-grams:$/) {
foreach $l (@OOVS) {
@A = split(" ", $l);
@A == 2 || die "bad line in oov2prob: $_;";
@ -96,16 +96,11 @@ elif [[ $lmfile == *.gz ]] ; then
else
decompress="cat $lmfile"
fi
$decompress | \
grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
arpa2fst - | \
fstprint | \
utils/eps2disambig.pl | \
utils/s2eps.pl | \
fstcompile --isymbols=$langdir/words.txt \
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=olabel > $destdir/G.fst || exit 1
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$langdir/words.txt - $destdir/G.fst || exit 1
fstisstochastic $destdir/G.fst || true;
if $cleanup; then

Просмотреть файл

@ -22,7 +22,7 @@ silence_id=`grep -w SIL $langdir/words.txt | awk '{print $2}'` || exit 1;
[ -z $silence_id ] && echo Error getting silence-id from $langdir/words.txt && exit 1;
rho=$[$last_id+1]
# state 0 is start-state. state 1 is state after we saw silence. state 2 is
# state 0 is start-state. state 1 is state after we saw silence. state 2 is
# "dead state/failure state" that is not coaccessible.
cat <<EOF | fstcompile > $destdir/rho.fst
0 1 $silence_id $silence_id
@ -35,16 +35,11 @@ EOF
gunzip -c $lmfile | \
grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
sed 's/<unk>/<oov>/g' | \
arpa2fst - | \
fstprint | \
utils/eps2disambig.pl | \
utils/s2eps.pl | \
fstcompile --isymbols=$langdir/words.txt \
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
arpa2fst --disambig-symbol=#0 --ilabel-sort=false \
--read-symbol-table=$langdir/words.txt - | \
fstrhocompose "$rho" - $destdir/rho.fst | \
fstrmepsilon > $destdir/G.fst || exit 1
fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
fstisstochastic $destdir/G.fst || true

Просмотреть файл

@ -8,7 +8,7 @@
# This script trains LMs on the WSJ LM-training data.
# It requires that you have already run wsj_extend_dict.sh,
# to get the larger-size dictionary including all of CMUdict
# plus any OOVs and possible acronyms that we could easily
# plus any OOVs and possible acronyms that we could easily
# derive pronunciations for.
# This script takes as command-line arguments the relevant data/lang
@ -69,7 +69,7 @@ cat $data/text | awk '{for (n=2;n<NF;n++) printf("%s ", $n); printf "\n";}' | \
gzip -c > $dir/train_in.gz || exit 1;
# Get training data with OOV words (w.r.t. our current vocab) replaced with <unk>.
echo "Getting training data with OOV words replaced with <unk> (train_nounk.gz)"
echo "Getting training data with OOV words replaced with <unk> (train_nounk.gz)"
gunzip -c $dir/train_in.gz | awk -v w=$dir/wordlist \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<unk> ";print ""}'|sed 's/ $//g' \
@ -93,7 +93,7 @@ gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<
# To save disk space, remove the un-mapped training data. We could
# easily generate it again if needed.
rm $dir/train_nounk.gz
rm $dir/train_nounk.gz
##################################################################
@ -177,7 +177,7 @@ prune_lm.sh --arpa 5.0 $dir/4gram
# The default LM chosen to be the last pruned 4gram-mincount
#
# Note: One can cheat and provide an external ARPA LM here!!!
# To do so, make sure that
# To do so, make sure that
# -- its vocabulary is fully covered by $lang/words.txt,
# -- it is gzipped and
# -- it is placed in the $dir directory.
@ -205,14 +205,9 @@ echo "Compiling $gzipped_ARPA_LM into $lang/G.fst"
. ./path.sh || exit 1;
gunzip -c $gzipped_ARPA_LM | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > $lang/G.fst || exit 1;
fstisstochastic $lang/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$lang/words.txt - $lang/G.fst || exit 1;
fstisstochastic $lang/G.fst
##################################################################
# Redo the FST step after reviewing perplexities reported by the
@ -220,4 +215,3 @@ gunzip -c $gzipped_ARPA_LM | \
##################################################################
exit 0

Просмотреть файл

@ -38,7 +38,7 @@ if [ $# -ne 3 ]; then
fi
set -e #Exit on non-zero return code from any command
set -o pipefail #Exit if any of the commands in the pipeline will
set -o pipefail #Exit if any of the commands in the pipeline will
#return non-zero return code
lmfile=$1
@ -58,7 +58,7 @@ if [ ! -z "$oov_prob_file" ]; then
exit 1;
fi
min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0;
min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0;
while(<STDIN>) { if (m/\\(\d)-grams:/) { $order = $1; }
if ($order == 1) { @A = split;
if ($A[0] < $minlogprob && $A[0] != -99) { $minlogprob = $A[0]; }}} print $minlogprob')
@ -75,7 +75,7 @@ if [ ! -z "$oov_prob_file" ]; then
while(<STDIN>) {
if (m/^ngram 1=(\d+)/) { $n = $1 + $num_oovs; print "ngram 1=$n\n"; }
else { print; } # print all lines unchanged except the one that says ngram 1=X.
if (m/^\\1-grams:$/) {
if (m/^\\1-grams:$/) {
foreach $l (@OOVS) {
@A = split(" ", $l);
@A == 2 || die "bad line in oov2prob: $_;";
@ -96,16 +96,11 @@ elif [[ $lmfile == *.gz ]] ; then
else
decompress="cat $lmfile"
fi
$decompress | \
grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
arpa2fst - | \
fstprint | \
utils/eps2disambig.pl | \
utils/s2eps.pl | \
fstcompile --isymbols=$langdir/words.txt \
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=olabel > $destdir/G.fst || exit 1
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$langdir/words.txt - $destdir/G.fst || exit 1
fstisstochastic $destdir/G.fst || true;
if $cleanup; then

Просмотреть файл

@ -22,7 +22,7 @@ silence_id=`grep -w SIL $langdir/words.txt | awk '{print $2}'` || exit 1;
[ -z $silence_id ] && echo Error getting silence-id from $langdir/words.txt && exit 1;
rho=$[$last_id+1]
# state 0 is start-state. state 1 is state after we saw silence. state 2 is
# state 0 is start-state. state 1 is state after we saw silence. state 2 is
# "dead state/failure state" that is not coaccessible.
cat <<EOF | fstcompile > $destdir/rho.fst
0 1 $silence_id $silence_id
@ -35,16 +35,11 @@ EOF
gunzip -c $lmfile | \
grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
sed 's/<unk>/<oov>/g' | \
arpa2fst - | \
fstprint | \
utils/eps2disambig.pl | \
utils/s2eps.pl | \
fstcompile --isymbols=$langdir/words.txt \
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
arpa2fst --disambig-symbol=#0 --ilabel-sort=false \
--read-symbol-table=$langdir/words.txt - | \
fstrhocompose "$rho" - $destdir/rho.fst | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
fstisstochastic $destdir/G.fst || true

Просмотреть файл

@ -8,7 +8,7 @@
# This script trains LMs on the WSJ LM-training data.
# It requires that you have already run wsj_extend_dict.sh,
# to get the larger-size dictionary including all of CMUdict
# plus any OOVs and possible acronyms that we could easily
# plus any OOVs and possible acronyms that we could easily
# derive pronunciations for.
# This script takes as command-line arguments the relevant data/lang
@ -69,7 +69,7 @@ cat $data/text | awk '{for (n=2;n<NF;n++) printf("%s ", $n); printf "\n";}' | \
gzip -c > $dir/train_in.gz || exit 1;
# Get training data with OOV words (w.r.t. our current vocab) replaced with <unk>.
echo "Getting training data with OOV words replaced with <unk> (train_nounk.gz)"
echo "Getting training data with OOV words replaced with <unk> (train_nounk.gz)"
gunzip -c $dir/train_in.gz | awk -v w=$dir/wordlist \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<unk> ";print ""}'|sed 's/ $//g' \
@ -93,7 +93,7 @@ gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<
# To save disk space, remove the un-mapped training data. We could
# easily generate it again if needed.
rm $dir/train_nounk.gz
rm $dir/train_nounk.gz
##################################################################
@ -177,7 +177,7 @@ prune_lm.sh --arpa 5.0 $dir/4gram
# The default LM chosen to be the last pruned 4gram-mincount
#
# Note: One can cheat and provide an external ARPA LM here!!!
# To do so, make sure that
# To do so, make sure that
# -- its vocabulary is fully covered by $lang/words.txt,
# -- it is gzipped and
# -- it is placed in the $dir directory.
@ -205,14 +205,9 @@ echo "Compiling $gzipped_ARPA_LM into $lang/G.fst"
. ./path.sh || exit 1;
gunzip -c $gzipped_ARPA_LM | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $lang/G.fst || exit 1;
fstisstochastic $lang/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$lang/words.txt - $lang/G.fst || exit 1;
fstisstochastic $lang/G.fst
##################################################################
# Redo the FST step after reviewing perplexities reported by the
@ -220,4 +215,3 @@ gunzip -c $gzipped_ARPA_LM | \
##################################################################
exit 0

Просмотреть файл

@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/bash
#
if [ -f path.sh ]; then . path.sh; fi
@ -12,25 +12,18 @@ mkdir -p data/lang_test
cp -r data/lang/* data/lang_test
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list. Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
## Check lexicon.
## just have a look and make sure it seems sane.
@ -59,4 +52,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
echo "$0 succeeded"

Просмотреть файл

@ -17,11 +17,9 @@
echo "Preparing train and test data"
srcdir=data/local/data
lmdir=data/local/nist_lm
tmpdir=data/local/lm_tmp
lexicon=data/local/lang_tmp/lexiconp.txt
mkdir -p $tmpdir
for x in test_eval92_clean test_eval92_noisy test_eval92_5k_clean test_eval92_5k_noisy dev_dt_05_clean dev_dt_05_reverb dev_dt_05_noisy dev_dt_20_clean dev_dt_20_reverb dev_dt_20_noisy train_si84_clean train_si84_reverb train_si84_noisy; do
for x in test_eval92_clean test_eval92_noisy test_eval92_5k_clean test_eval92_5k_noisy dev_dt_05_clean dev_dt_05_reverb dev_dt_05_noisy dev_dt_20_clean dev_dt_20_reverb dev_dt_20_noisy train_si84_clean train_si84_reverb train_si84_noisy; do
mkdir -p data/$x
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
cp $srcdir/$x.txt data/$x/text || exit 1;
@ -42,25 +40,10 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
cp -r data/lang/* $test
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at the beginning/end of an utterance. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - $test/G.fst
utils/validate_lang.pl $test || exit 1;
done
echo "Succeeded in formatting data."
rm -r $tmpdir

Просмотреть файл

@ -18,7 +18,7 @@ if [ $# -ne 1 ]; then
exit 1;
fi
# check data directories
# check data directories
chime3_data=$1
wsj0_data=$chime3_data/data/WSJ0 # directory of WSJ0 in CHiME3. You can also specify your WSJ0 corpus directory
if [ ! -d $chime3_data ]; then
@ -70,7 +70,7 @@ else
| awk -v voc=$dir/vocab_5k.txt '
BEGIN{ while((getline<voc)>0) { invoc[$1]=1; }}
/^</{next}{
for (x=1;x<=NF;x++) {
for (x=1;x<=NF;x++) {
w=toupper($x);
if (invoc[w]) { printf("%s ",w); } else { printf("<UNK> "); }
}
@ -88,7 +88,7 @@ else
$chime3_data/data/transcriptions/dt05_simu.trn_all \
|gzip -c > $dir/valid.gz
fi
# train a large n-gram language model
lm_suffix=${order}gkn_5k
if [ -f $dir/lm_${lm_suffix}.arpa.gz ]; then
@ -121,22 +121,8 @@ for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
cp -r data/lang/$f $test
done
gunzip -c $dir/lm_${lm_suffix}.arpa.gz | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at the beginning/end of an utterance. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
gunzip -c $dir/lm_${lm_suffix}.arpa.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
@ -154,10 +140,9 @@ awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
echo "Succeeded in preparing a large ${order}-gram LM"
rm -r $tmpdir

Просмотреть файл

@ -20,7 +20,7 @@ tmpdir=data/local/lm_tmp
lexicon=data/local/lang_tmp/lexiconp.txt
mkdir -p $tmpdir
for x in et05_orig_clean dt05_orig_clean tr05_orig_clean; do
for x in et05_orig_clean dt05_orig_clean tr05_orig_clean; do
mkdir -p data/$x
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
cp $srcdir/$x.txt data/$x/text || exit 1;
@ -43,29 +43,15 @@ for lm_suffix in tgpr_5k; do
cp -r data/lang/$f $test
done
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at the beginning/end of an utterance. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# Everything below is only for diagnostic.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
@ -76,7 +62,7 @@ for lm_suffix in tgpr_5k; do
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
done

Просмотреть файл

@ -25,13 +25,10 @@ for lm_suffix in bg; do
test=data/lang_test_${lm_suffix}
mkdir -p $test
cp -r data/lang/* $test
gunzip -c $lmdir/lm_phone_${lm_suffix}.arpa.gz | \
egrep -v '<s> <s>|</s> <s>|</s> </s>' | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
@ -49,7 +46,7 @@ for lm_suffix in bg; do
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
done

Просмотреть файл

@ -25,13 +25,10 @@ for lm_suffix in bg; do
test=data/lang_test_${lm_suffix}
mkdir -p $test
cp -r data/lang/* $test
gunzip -c $lmdir/lm_phone_${lm_suffix}.arpa.gz | \
egrep -v '<s> <s>|</s> <s>|</s> </s>' | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
@ -49,7 +46,7 @@ for lm_suffix in bg; do
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
done

Просмотреть файл

@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/bash
# Copyright 2014 Gaurav Kumar. Apache 2.0
#
@ -12,26 +12,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
mkdir -p data/lang_test
cp -r data/lang/* data/lang_test
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list. Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
## Check lexicon.
## just have a look and make sure it seems sane.
@ -60,4 +47,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
echo "$0 succeeded"

Просмотреть файл

@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/bash
#
if [ -f path.sh ]; then . path.sh; fi
@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
mkdir -p data/lang_test
cp -r data/lang/* data/lang_test
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list. Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
## Check lexicon.
## just have a look and make sure it seems sane.
@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
echo "$0 succeeded"

Просмотреть файл

@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/bash
#
if [ -f path.sh ]; then . path.sh; fi
@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
mkdir -p data/lang_test
cp -r data/lang/* data/lang_test
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list. Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
## Check lexicon.
## just have a look and make sure it seems sane.
@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
echo "$0 succeeded"

Просмотреть файл

@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/bash
#
if [ -f path.sh ]; then . path.sh; fi
@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
mkdir -p data/lang_test_fsh
cp -r data/lang/* data/lang_test_fsh
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list. Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test_fsh/words.txt \
--osymbols=data/lang_test_fsh/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_fsh/G.fst
fstisstochastic data/lang_test_fsh/G.fst
# G.fst must land in data/lang_test_fsh, the directory populated above and
# checked by fstisstochastic below, so words.txt is read from there as well.
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=data/lang_test_fsh/words.txt - data/lang_test_fsh/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test_fsh/G.fst
fstisstochastic data/lang_test_fsh/G.fst
## Check lexicon.
## just have a look and make sure it seems sane.
@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test_fsh/G.fst | \
echo "$0 succeeded"

Просмотреть файл

@ -6,9 +6,9 @@
if [ -f path.sh ]; then
. path.sh; else
echo "missing path.sh"; exit 1;
fi
fi
for dir in test train; do
for dir in test train; do
cp -pr data/local/$dir data/$dir
done
@ -21,26 +21,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
rm -r data/lang_test
cp -r data/lang data/lang_test
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list. Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
## Check lexicon.
## just have a look and make sure it seems sane.

Просмотреть файл

@ -6,9 +6,9 @@
if [ -f path.sh ]; then
. path.sh; else
echo "missing path.sh"; exit 1;
fi
fi
for dir in dev train; do
for dir in dev train; do
cp -pr data/local/$dir data/$dir
done
@ -22,26 +22,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
rm -r data/lang_dev
cp -r data/lang data/lang_dev
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list. Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_dev/words.txt \
--osymbols=data/lang_dev/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_dev/G.fst
fstisstochastic data/lang_dev/G.fst
# G.fst must land in data/lang_dev, the directory populated above and
# checked by fstisstochastic below, so words.txt is read from there as well.
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=data/lang_dev/words.txt - data/lang_dev/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_dev/G.fst
fstisstochastic data/lang_dev/G.fst
## Check lexicon.
## just have a look and make sure it seems sane.

Просмотреть файл

@ -40,20 +40,10 @@ function format_lms () {
cp $work_dir/lang_test/$f $test
done
# kkm: I am removing fstdeterminizelog from the following pipe, no point.
gunzip -c $work_dir/local/lm_${lm_suffix}.arpa.gz \
| find_arpa_oovs.pl $test/words.txt > $test/oovs_${lm_suffix}.txt
# Removing all "illegal" combinations of <s> and </s>, which are supposed to
# occur only at being/end of utt. These can cause determinization failures
# of CLG [ends up being epsilon cycles].
gunzip -c $work_dir/local/lm_${lm_suffix}.arpa.gz \
| egrep -v '<s> <s>|</s> <s>|</s> </s>' \
| arpa2fst - | fstprint \
| remove_oovs.pl $test/oovs_${lm_suffix}.txt \
| eps2disambig.pl | s2eps.pl \
| fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt \
--keep_isymbols=false --keep_osymbols=false \
| fstrmepsilon | fstdeterminizelog > $test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - $test/G.fst
set +e
fstisstochastic $test/G.fst
set -e
@ -73,7 +63,7 @@ function format_lms () {
< $work_dir/local/lexicon_??.txt >tmpdir.g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt tmpdir.g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > tmpdir.g/empty_words.fst
fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' &&
fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r tmpdir.g
@ -99,7 +89,7 @@ echo "Preparing language models for test"
format_lms GE17k_tg $WDIR/GE;
format_lms GE17k_tg_pr $WDIR/GE; } >& $WDIR/GE/format_lms.log
# German - 60K
# German - 60K
{ format_lms GE60k_bg $WDIR/GE;
format_lms GE60k_tg $WDIR/GE;
format_lms GE60k_tg_pr $WDIR/GE; } >> $WDIR/GE/format_lms.log 2>&1
@ -115,7 +105,7 @@ echo "Preparing language models for test"
format_lms SP23k_tg_pr $WDIR/SP; } >& $WDIR/SP/format_lms.log
# Swedish - 24K
# TODO(arnab): Something going wrong with the Swedish trigram LM.
# TODO(arnab): Something going wrong with the Swedish trigram LM.
{ # format_lms SW24k_tg $WDIR/SW;
# format_lms SW24k_tg_pr $WDIR/SW;
format_lms SW24k_bg $WDIR/SW; } >& $WDIR/SW/format_lms.log

Просмотреть файл

@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/bash
#
if [ -f path.sh ]; then . path.sh; fi
@ -23,26 +23,13 @@ done
rm -r data/lang_test
cp -r data/lang data/lang_test
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list. Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
## Check lexicon.
## just have a look and make sure it seems sane.
@ -71,4 +58,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
echo hkust_format_data succeeded.

Просмотреть файл

@ -34,22 +34,10 @@ mfccdir=mfcc
# here.
lang=data/lang
lang_test=data/lang_test
lang_test_tmp=data/local/lang_test_tmp/
mkdir -p $lang_test_tmp
mkdir -p $lang_test
cp -r $lang/* $lang_test
gunzip -c $lm | utils/find_arpa_oovs.pl $lang_test/words.txt \
> $lang_test_tmp/oovs.txt || exit 1
gunzip -c $lm | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $lang_test_tmp/oovs.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | \
fstcompile --isymbols=$lang_test/words.txt --osymbols=$lang_test/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $lang_test/G.fst
gunzip -c $lm | arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$lang_test/words.txt - $lang_test/G.fst
utils/validate_lang.pl --skip-determinization-check $lang_test || exit 1;
# Compiles decoding graph.

Просмотреть файл

@ -50,22 +50,8 @@ for lm_suffix in bg_5k tg_5k; do
cp -r data/lang/$f $test
done
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at being/end of utt. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
@ -83,7 +69,7 @@ for lm_suffix in bg_5k tg_5k; do
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
done

Просмотреть файл

@ -52,25 +52,10 @@ mkdir -p $test
cp -r data/lang/* $test
cat $lmdir/sprak.arpa | \
utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at being/end of utt. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
cat $lmdir/sprak.arpa | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $lmdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - $test/G.fst
utils/validate_lang.pl $test || exit 1;
exit 0;

Просмотреть файл

@ -61,8 +61,8 @@ fi
# Checks if espeak is available on the system. espeak is necessary to extend
# the setup because the original transcriptions were created with espeak and
# filtered
# the setup because the original transcriptions were created with espeak and
# filtered
if ! which espeak >&/dev/null; then
echo "espeak is not available on your system. You must install espeak before proceeding."
@ -95,7 +95,7 @@ if [ ! -f $extdict/lexicon.txt ];
# Filter transcription
# Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove
# Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove
# initial and trailing spaces and collapse 2 or more spaces to one space
cat $dir/plist.txt | perl -pe 's/\([[a-z]{2}\)//g' | perl -pe 's// /g' | perl -pe 's/ a I / aI /g' | perl -pe 's/ d Z / dZ /g' | perl -pe 's/ \? / /g' | perl -pe 's/ ([\#]) /\+ /g' | perl -pe 's/([\@n3]) \- /\1\- /g' | perl -pe "s/[\_\:\!\'\,\|2]//g" | perl -pe 's/ \- / /g' | tr -s ' ' | perl -pe 's/^ +| +$//g' > $dir/plist2.txt
@ -128,7 +128,7 @@ if [ ! -f $lmdir/extra4.ngt ];
grep -P -v '^[\s?|\.|\!]*$' $newtext | \
awk '{if(NF>=4){ printf("%s\n",$0); }}' > $lmdir/text.filt
# Envelop LM training data in context cues
add-start-end.sh < $lmdir/text.filt > $lmdir/lm_input
@ -151,22 +151,8 @@ mkdir -p $test
cp -r $extlang $test
cat $lmdir/extra${N}$lm_suffix | \
utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at being/end of utt. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
cat $lmdir/extra${N}$lm_suffix | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $lmdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - $test/G.fst
utils/validate_lang.pl $test || exit 1;

Просмотреть файл

@ -66,22 +66,8 @@ mkdir -p $test
cp -r $srcdir/* $test
cat $lmdir/train${ngram}.arpa | \
utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at being/end of utt. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
cat $lmdir/train${ngram}.arpa | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $lmdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - $test/G.fst
utils/validate_lang.pl $test || exit 1;

Просмотреть файл

@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/bash
#
if [ -f path.sh ]; then . path.sh; fi
@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
mkdir -p data/lang_test
cp -r data/lang/* data/lang_test
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list. Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
## Check lexicon.
## just have a look and make sure it seems sane.
@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
echo "$0 succeeded"

Просмотреть файл

@ -4,13 +4,5 @@
cd data
#convert to FST format for Kaldi
cat local/swahili.arpa | ../utils/find_arpa_oovs.pl lang/words.txt > lang/oovs.txt
cat local/swahili.arpa | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
../utils/remove_oovs.pl lang/oovs.txt | \
../utils/eps2disambig.pl | ../utils/s2eps.pl | fstcompile --isymbols=lang/words.txt \
--osymbols=lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > lang/G.fst
arpa2fst --disambig-symbol=#0 --read-symbol-table=lang/words.txt \
local/swahili.arpa lang/G.fst

Просмотреть файл

@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/bash
#
if [ -f path.sh ]; then . path.sh; fi
@ -20,26 +20,13 @@ done
rm -r data/lang_test
cp -r data/lang data/lang_test
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list. Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
## Check lexicon.
## just have a look and make sure it seems sane.
@ -68,4 +55,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
echo swbd_p1_format_data succeeded.

Просмотреть файл

@ -1,6 +1,6 @@
#!/bin/bash
#!/bin/bash
#
# Copyright 2014 Nickolay V. Shmyrev
# Copyright 2014 Nickolay V. Shmyrev
# Apache 2.0
@ -12,21 +12,8 @@ arpa_lm=db/cantab-TEDLIUM/cantab-TEDLIUM-pruned.lm3.gz
rm -rf data/lang_nosp_test
cp -r data/lang_nosp data/lang_nosp_test
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list. Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_nosp_test/words.txt \
--osymbols=data/lang_nosp_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_nosp_test/G.fst
gunzip -c "$arpa_lm" | arpa2fst --disambig-symbol=#0 \
--read-symbol-table=data/lang_nosp_test/words.txt - data/lang_nosp_test/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"

Просмотреть файл

@ -16,7 +16,7 @@ tmpdir=data/local/lm_tmp
lexicon=data/local/dict/lexicon.txt
mkdir -p $tmpdir
for x in train dev test; do
for x in train dev test; do
mkdir -p data/$x
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
cp $srcdir/$x.text data/$x/text || exit 1;
@ -37,13 +37,10 @@ for lm_suffix in bg; do
test=data/lang_test_${lm_suffix}
mkdir -p $test
cp -r data/lang/* $test
gunzip -c $lmdir/lm_phone_${lm_suffix}.arpa.gz | \
egrep -v '<s> <s>|</s> <s>|</s> </s>' | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
@ -61,7 +58,7 @@ for lm_suffix in bg; do
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
done

Просмотреть файл

@ -12,7 +12,7 @@ tmpdir=data/local/lm_tmp
lexicon=data/local/dict/lexicon.txt
mkdir -p $tmpdir
for x in train test; do
for x in train test; do
mkdir -p data/$x
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
cp $srcdir/${x}_trans.txt data/$x/text || exit 1;
@ -33,22 +33,8 @@ for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones; do
cp -r data/lang/$f $test
done
cat $lmdir/lm.arpa | \
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at being/end of utt. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
cat $lmdir/lm.arpa | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
@ -67,9 +53,8 @@ awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt \
$tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -rf $tmpdir
echo "*** Succeeded in formatting data."

Просмотреть файл

@ -17,7 +17,7 @@ for lm in $LMs ; do
lmp=$lmdir/`basename $lm`
tmpdir=$tgt/tmp
mkdir -p $tgt
mkdir -p $tgt
mkdir -p $tmpdir
echo "--- Preparing the grammar transducer (G.fst) from $lmp in $tgt ..."
@ -26,21 +26,9 @@ for lm in $LMs ; do
ln -s $langdir/$f $tgt/$f 2> /dev/null
done
cat $lmp | utils/find_arpa_oovs.pl $tgt/words.txt > $tmpdir/oovs.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at being/end of utt. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
cat $lmp | \
grep -v '<s> <s>\|</s> <s>\|</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$tgt/words.txt \
--osymbols=$tgt/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$tgt/words.txt - $tgt/G.fst
fstisstochastic $tgt/G.fst
# The output is like:
# 9.14233e-05 -0.259833
@ -48,7 +36,7 @@ for lm in $LMs ; do
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# Everything below is only for diagnostic.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
@ -59,7 +47,7 @@ for lm in $LMs ; do
fstcompile --isymbols=$tgt/words.txt --osymbols=$tgt/words.txt \
$tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $tgt/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
# rm -rf $tmpdir # TODO debugging

Просмотреть файл

@ -17,7 +17,7 @@ for lm in $LMs ; do
lmp=$lmdir/`basename $lm`
tmpdir=$tgt/tmp
mkdir -p $tgt
mkdir -p $tgt
mkdir -p $tmpdir
echo "--- Preparing the grammar transducer (G.fst) from $lmp in $tgt ..."
@ -26,21 +26,9 @@ for lm in $LMs ; do
ln -s $langdir/$f $tgt/$f 2> /dev/null
done
cat $lmp | utils/find_arpa_oovs.pl $tgt/words.txt > $tmpdir/oovs.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at being/end of utt. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
cat $lmp | \
grep -v '<s> <s>\|</s> <s>\|</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl $tmpdir/oovs.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$tgt/words.txt \
--osymbols=$tgt/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt/G.fst
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$tgt/words.txt - $tgt/G.fst
fstisstochastic $tgt/G.fst
# The output is like:
# 9.14233e-05 -0.259833
@ -48,7 +36,7 @@ for lm in $LMs ; do
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# Everything below is only for diagnostic.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
@ -59,7 +47,7 @@ for lm in $LMs ; do
fstcompile --isymbols=$tgt/words.txt --osymbols=$tgt/words.txt \
$tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $tgt/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
# rm -rf $tmpdir # TODO debugging

Просмотреть файл

@ -1,7 +1,7 @@
#!/bin/bash
. path.sh
echo Preparing language models for test
for lm_suffix in tg; do
@ -10,10 +10,10 @@ for lm_suffix in tg; do
rm -rf data/lang_test_${lm_suffix}
cp -r data/lang data/lang_test_${lm_suffix}
cat input/task.arpabo | arpa2fst - | fstprint | utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
#cat input/G.txt | utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
arpa2fst --disambig-symbol=#0 --read-symbol-table=$test/words.txt input/task.arpabo $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
@ -30,7 +30,7 @@ for lm_suffix in tg; do
< data/local/dict/lexicon.txt >tmpdir.g/select_empty.fst.txt
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt tmpdir.g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > tmpdir.g/empty_words.fst
fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' &&
fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r tmpdir.g
done