зеркало из https://github.com/mozilla/kaldi.git
Update arpa2fst invocations in individual egs/*/local scripts
This commit is contained in:
Родитель
b77e93095b
Коммит
829432d05b
|
@ -1,4 +1,4 @@
|
|||
#!/bin/bash
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
if [ -f path.sh ]; then . path.sh; fi
|
||||
|
@ -10,26 +10,12 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
|
|||
|
||||
cp -rT data/lang data/lang_test
|
||||
|
||||
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
|
||||
# LM doesn't have these "invalid combinations". These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
|
||||
# our word list. Since our LM doesn't have any, we just give it
|
||||
# /dev/null [we leave it in the script to show how you'd do it].
|
||||
gunzip -c "$arpa_lm" | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl /dev/null | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
|
||||
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
|
||||
|
||||
echo "Checking how stochastic G is (the first of these numbers should be small):"
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
|
||||
## Check lexicon.
|
||||
## just have a look and make sure it seems sane.
|
||||
|
@ -61,4 +47,3 @@ utils/build_const_arpa_lm.sh \
|
|||
data/local/lm/4gram-mincount/lm_unpruned.gz data/lang data/lang_test_fg
|
||||
|
||||
echo "$0 succeeded"
|
||||
|
||||
|
|
|
@ -21,7 +21,7 @@ tmpdir=data/local/lm_tmp
|
|||
lexicon=data/local/lang_tmp/lexiconp.txt
|
||||
mkdir -p $tmpdir
|
||||
|
||||
for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do
|
||||
for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do
|
||||
mkdir -p data/$x
|
||||
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
|
||||
cp $srcdir/$x.txt data/$x/text || exit 1;
|
||||
|
@ -42,23 +42,9 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
|
|||
cp -r data/lang/* $test
|
||||
|
||||
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
|
||||
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$test/words.txt - $test/G.fst
|
||||
|
||||
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
||||
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
||||
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
|
||||
# which are supposed to occur only at beginning/end of utt.  These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||
|
||||
utils/validate_lang.pl --skip-determinization-check $test || exit 1;
|
||||
done
|
||||
|
||||
|
|
|
@ -39,14 +39,8 @@ destdir=$3
|
|||
mkdir $destdir 2>/dev/null || true
|
||||
|
||||
gunzip -c $lmfile | \
|
||||
grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
|
||||
arpa2fst - | \
|
||||
fstprint | \
|
||||
utils/eps2disambig.pl | \
|
||||
utils/s2eps.pl | \
|
||||
fstcompile --isymbols=$langdir/words.txt \
|
||||
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$langdir/words.txt - $destdir/G.fst || exit 1
|
||||
fstisstochastic $destdir/G.fst || true
|
||||
|
||||
exit 0
|
||||
|
|
|
@ -22,7 +22,7 @@ silence_id=`grep -w SIL $langdir/words.txt | awk '{print $2}'` || exit 1;
|
|||
[ -z $silence_id ] && echo Error getting silence-id from $langdir/words.txt && exit 1;
|
||||
rho=$[$last_id+1]
|
||||
|
||||
# state 0 is start-state. state 1 is state after we saw silence. state 2 is
|
||||
# state 0 is start-state. state 1 is state after we saw silence. state 2 is
|
||||
# "dead state/failure state" that is not coaccessible.
|
||||
cat <<EOF | fstcompile > $destdir/rho.fst
|
||||
0 1 $silence_id $silence_id
|
||||
|
@ -35,16 +35,11 @@ EOF
|
|||
|
||||
|
||||
gunzip -c $lmfile | \
|
||||
grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
|
||||
sed 's/<unk>/<oov>/g' | \
|
||||
arpa2fst - | \
|
||||
fstprint | \
|
||||
utils/eps2disambig.pl | \
|
||||
utils/s2eps.pl | \
|
||||
fstcompile --isymbols=$langdir/words.txt \
|
||||
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
arpa2fst --disambig-symbol=#0 --ilabel-sort=false \
|
||||
--read-symbol-table=$langdir/words.txt - | \
|
||||
fstrhocompose "$rho" - $destdir/rho.fst | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
|
||||
fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
|
||||
|
||||
fstisstochastic $destdir/G.fst || true
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
# This script trains LMs on the WSJ LM-training data.
|
||||
# It requires that you have already run wsj_extend_dict.sh,
|
||||
# to get the larger-size dictionary including all of CMUdict
|
||||
# plus any OOVs and possible acronyms that we could easily
|
||||
# plus any OOVs and possible acronyms that we could easily
|
||||
# derive pronunciations for.
|
||||
|
||||
# This script takes as command-line arguments the relevant data/lang
|
||||
|
@ -69,7 +69,7 @@ cat $data/text | awk '{for (n=2;n<NF;n++) printf("%s ", $n); printf "\n";}' | \
|
|||
gzip -c > $dir/train_in.gz || exit 1;
|
||||
|
||||
# Get training data with OOV words (w.r.t. our current vocab) replaced with <unk>.
|
||||
echo "Getting training data with OOV words replaced with <unk> (train_nounk.gz)"
|
||||
echo "Getting training data with OOV words replaced with <unk> (train_nounk.gz)"
|
||||
gunzip -c $dir/train_in.gz | awk -v w=$dir/wordlist \
|
||||
'BEGIN{while((getline<w)>0) v[$1]=1;}
|
||||
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<unk> ";print ""}'|sed 's/ $//g' \
|
||||
|
@ -93,7 +93,7 @@ gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<
|
|||
|
||||
# To save disk space, remove the un-mapped training data. We could
|
||||
# easily generate it again if needed.
|
||||
rm $dir/train_nounk.gz
|
||||
rm $dir/train_nounk.gz
|
||||
|
||||
|
||||
##################################################################
|
||||
|
@ -177,7 +177,7 @@ prune_lm.sh --arpa 5.0 $dir/4gram
|
|||
# The default LM chosen to be the last pruned 4gram-mincount
|
||||
#
|
||||
# Note: One can cheat and provide an external ARPA LM here!!!
|
||||
# To do so, make sure that
|
||||
# To do so, make sure that
|
||||
# -- its vocabulary is fully covered by $lang/words.txt,
|
||||
# -- it is gzipped and
|
||||
# -- it is placed in the $dir directory.
|
||||
|
@ -205,14 +205,9 @@ echo "Compiling $gzipped_ARPA_LM into $lang/G.fst"
|
|||
|
||||
. ./path.sh || exit 1;
|
||||
gunzip -c $gzipped_ARPA_LM | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
|
||||
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $lang/G.fst || exit 1;
|
||||
fstisstochastic $lang/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$lang/words.txt - $lang/G.fst || exit 1;
|
||||
fstisstochastic $lang/G.fst
|
||||
|
||||
##################################################################
|
||||
# Redo the FST step after reviewing perplexities reported by the
|
||||
|
@ -220,4 +215,3 @@ gunzip -c $gzipped_ARPA_LM | \
|
|||
##################################################################
|
||||
|
||||
exit 0
|
||||
|
||||
|
|
|
@ -38,7 +38,7 @@ if [ $# -ne 3 ]; then
|
|||
fi
|
||||
|
||||
set -e #Exit on non-zero return code from any command
|
||||
set -o pipefail #Exit if any of the commands in the pipeline will
|
||||
set -o pipefail #Exit if any of the commands in the pipeline will
|
||||
#return non-zero return code
|
||||
|
||||
lmfile=$1
|
||||
|
@ -58,7 +58,7 @@ if [ ! -z "$oov_prob_file" ]; then
|
|||
exit 1;
|
||||
fi
|
||||
|
||||
min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0;
|
||||
min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0;
|
||||
while(<STDIN>) { if (m/\\(\d)-grams:/) { $order = $1; }
|
||||
if ($order == 1) { @A = split;
|
||||
if ($A[0] < $minlogprob && $A[0] != -99) { $minlogprob = $A[0]; }}} print $minlogprob')
|
||||
|
@ -75,7 +75,7 @@ if [ ! -z "$oov_prob_file" ]; then
|
|||
while(<STDIN>) {
|
||||
if (m/^ngram 1=(\d+)/) { $n = $1 + $num_oovs; print "ngram 1=$n\n"; }
|
||||
else { print; } # print all lines unchanged except the one that says ngram 1=X.
|
||||
if (m/^\\1-grams:$/) {
|
||||
if (m/^\\1-grams:$/) {
|
||||
foreach $l (@OOVS) {
|
||||
@A = split(" ", $l);
|
||||
@A == 2 || die "bad line in oov2prob: $_;";
|
||||
|
@ -96,16 +96,11 @@ elif [[ $lmfile == *.gz ]] ; then
|
|||
else
|
||||
decompress="cat $lmfile"
|
||||
fi
|
||||
|
||||
|
||||
$decompress | \
|
||||
grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
|
||||
arpa2fst - | \
|
||||
fstprint | \
|
||||
utils/eps2disambig.pl | \
|
||||
utils/s2eps.pl | \
|
||||
fstcompile --isymbols=$langdir/words.txt \
|
||||
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=olabel > $destdir/G.fst || exit 1
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$langdir/words.txt - $destdir/G.fst || exit 1
|
||||
|
||||
fstisstochastic $destdir/G.fst || true;
|
||||
|
||||
if $cleanup; then
|
||||
|
|
|
@ -22,7 +22,7 @@ silence_id=`grep -w SIL $langdir/words.txt | awk '{print $2}'` || exit 1;
|
|||
[ -z $silence_id ] && echo Error getting silence-id from $langdir/words.txt && exit 1;
|
||||
rho=$[$last_id+1]
|
||||
|
||||
# state 0 is start-state. state 1 is state after we saw silence. state 2 is
|
||||
# state 0 is start-state. state 1 is state after we saw silence. state 2 is
|
||||
# "dead state/failure state" that is not coaccessible.
|
||||
cat <<EOF | fstcompile > $destdir/rho.fst
|
||||
0 1 $silence_id $silence_id
|
||||
|
@ -35,16 +35,11 @@ EOF
|
|||
|
||||
|
||||
gunzip -c $lmfile | \
|
||||
grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
|
||||
sed 's/<unk>/<oov>/g' | \
|
||||
arpa2fst - | \
|
||||
fstprint | \
|
||||
utils/eps2disambig.pl | \
|
||||
utils/s2eps.pl | \
|
||||
fstcompile --isymbols=$langdir/words.txt \
|
||||
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
arpa2fst --disambig-symbol=#0 --ilabel-sort=false \
|
||||
--read-symbol-table=$langdir/words.txt - | \
|
||||
fstrhocompose "$rho" - $destdir/rho.fst | \
|
||||
fstrmepsilon > $destdir/G.fst || exit 1
|
||||
fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
|
||||
|
||||
fstisstochastic $destdir/G.fst || true
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
# This script trains LMs on the WSJ LM-training data.
|
||||
# It requires that you have already run wsj_extend_dict.sh,
|
||||
# to get the larger-size dictionary including all of CMUdict
|
||||
# plus any OOVs and possible acronyms that we could easily
|
||||
# plus any OOVs and possible acronyms that we could easily
|
||||
# derive pronunciations for.
|
||||
|
||||
# This script takes as command-line arguments the relevant data/lang
|
||||
|
@ -69,7 +69,7 @@ cat $data/text | awk '{for (n=2;n<NF;n++) printf("%s ", $n); printf "\n";}' | \
|
|||
gzip -c > $dir/train_in.gz || exit 1;
|
||||
|
||||
# Get training data with OOV words (w.r.t. our current vocab) replaced with <unk>.
|
||||
echo "Getting training data with OOV words replaced with <unk> (train_nounk.gz)"
|
||||
echo "Getting training data with OOV words replaced with <unk> (train_nounk.gz)"
|
||||
gunzip -c $dir/train_in.gz | awk -v w=$dir/wordlist \
|
||||
'BEGIN{while((getline<w)>0) v[$1]=1;}
|
||||
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<unk> ";print ""}'|sed 's/ $//g' \
|
||||
|
@ -93,7 +93,7 @@ gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<
|
|||
|
||||
# To save disk space, remove the un-mapped training data. We could
|
||||
# easily generate it again if needed.
|
||||
rm $dir/train_nounk.gz
|
||||
rm $dir/train_nounk.gz
|
||||
|
||||
|
||||
##################################################################
|
||||
|
@ -177,7 +177,7 @@ prune_lm.sh --arpa 5.0 $dir/4gram
|
|||
# The default LM chosen to be the last pruned 4gram-mincount
|
||||
#
|
||||
# Note: One can cheat and provide an external ARPA LM here!!!
|
||||
# To do so, make sure that
|
||||
# To do so, make sure that
|
||||
# -- its vocabulary is fully covered by $lang/words.txt,
|
||||
# -- it is gzipped and
|
||||
# -- it is placed in the $dir directory.
|
||||
|
@ -205,14 +205,9 @@ echo "Compiling $gzipped_ARPA_LM into $lang/G.fst"
|
|||
|
||||
. ./path.sh || exit 1;
|
||||
gunzip -c $gzipped_ARPA_LM | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
|
||||
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon > $lang/G.fst || exit 1;
|
||||
fstisstochastic $lang/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$lang/words.txt - $lang/G.fst || exit 1;
|
||||
fstisstochastic $lang/G.fst
|
||||
|
||||
##################################################################
|
||||
# Redo the FST step after reviewing perplexities reported by the
|
||||
|
@ -220,4 +215,3 @@ gunzip -c $gzipped_ARPA_LM | \
|
|||
##################################################################
|
||||
|
||||
exit 0
|
||||
|
||||
|
|
|
@ -38,7 +38,7 @@ if [ $# -ne 3 ]; then
|
|||
fi
|
||||
|
||||
set -e #Exit on non-zero return code from any command
|
||||
set -o pipefail #Exit if any of the commands in the pipeline will
|
||||
set -o pipefail #Exit if any of the commands in the pipeline will
|
||||
#return non-zero return code
|
||||
|
||||
lmfile=$1
|
||||
|
@ -58,7 +58,7 @@ if [ ! -z "$oov_prob_file" ]; then
|
|||
exit 1;
|
||||
fi
|
||||
|
||||
min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0;
|
||||
min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0;
|
||||
while(<STDIN>) { if (m/\\(\d)-grams:/) { $order = $1; }
|
||||
if ($order == 1) { @A = split;
|
||||
if ($A[0] < $minlogprob && $A[0] != -99) { $minlogprob = $A[0]; }}} print $minlogprob')
|
||||
|
@ -75,7 +75,7 @@ if [ ! -z "$oov_prob_file" ]; then
|
|||
while(<STDIN>) {
|
||||
if (m/^ngram 1=(\d+)/) { $n = $1 + $num_oovs; print "ngram 1=$n\n"; }
|
||||
else { print; } # print all lines unchanged except the one that says ngram 1=X.
|
||||
if (m/^\\1-grams:$/) {
|
||||
if (m/^\\1-grams:$/) {
|
||||
foreach $l (@OOVS) {
|
||||
@A = split(" ", $l);
|
||||
@A == 2 || die "bad line in oov2prob: $_;";
|
||||
|
@ -96,16 +96,11 @@ elif [[ $lmfile == *.gz ]] ; then
|
|||
else
|
||||
decompress="cat $lmfile"
|
||||
fi
|
||||
|
||||
|
||||
$decompress | \
|
||||
grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
|
||||
arpa2fst - | \
|
||||
fstprint | \
|
||||
utils/eps2disambig.pl | \
|
||||
utils/s2eps.pl | \
|
||||
fstcompile --isymbols=$langdir/words.txt \
|
||||
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=olabel > $destdir/G.fst || exit 1
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$langdir/words.txt - $destdir/G.fst || exit 1
|
||||
|
||||
fstisstochastic $destdir/G.fst || true;
|
||||
|
||||
if $cleanup; then
|
||||
|
|
|
@ -22,7 +22,7 @@ silence_id=`grep -w SIL $langdir/words.txt | awk '{print $2}'` || exit 1;
|
|||
[ -z $silence_id ] && echo Error getting silence-id from $langdir/words.txt && exit 1;
|
||||
rho=$[$last_id+1]
|
||||
|
||||
# state 0 is start-state. state 1 is state after we saw silence. state 2 is
|
||||
# state 0 is start-state. state 1 is state after we saw silence. state 2 is
|
||||
# "dead state/failure state" that is not coaccessible.
|
||||
cat <<EOF | fstcompile > $destdir/rho.fst
|
||||
0 1 $silence_id $silence_id
|
||||
|
@ -35,16 +35,11 @@ EOF
|
|||
|
||||
|
||||
gunzip -c $lmfile | \
|
||||
grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
|
||||
sed 's/<unk>/<oov>/g' | \
|
||||
arpa2fst - | \
|
||||
fstprint | \
|
||||
utils/eps2disambig.pl | \
|
||||
utils/s2eps.pl | \
|
||||
fstcompile --isymbols=$langdir/words.txt \
|
||||
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
arpa2fst --disambig-symbol=#0 --ilabel-sort=false \
|
||||
--read-symbol-table=$langdir/words.txt - | \
|
||||
fstrhocompose "$rho" - $destdir/rho.fst | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
|
||||
fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
|
||||
|
||||
fstisstochastic $destdir/G.fst || true
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
# This script trains LMs on the WSJ LM-training data.
|
||||
# It requires that you have already run wsj_extend_dict.sh,
|
||||
# to get the larger-size dictionary including all of CMUdict
|
||||
# plus any OOVs and possible acronyms that we could easily
|
||||
# plus any OOVs and possible acronyms that we could easily
|
||||
# derive pronunciations for.
|
||||
|
||||
# This script takes as command-line arguments the relevant data/lang
|
||||
|
@ -69,7 +69,7 @@ cat $data/text | awk '{for (n=2;n<NF;n++) printf("%s ", $n); printf "\n";}' | \
|
|||
gzip -c > $dir/train_in.gz || exit 1;
|
||||
|
||||
# Get training data with OOV words (w.r.t. our current vocab) replaced with <unk>.
|
||||
echo "Getting training data with OOV words replaced with <unk> (train_nounk.gz)"
|
||||
echo "Getting training data with OOV words replaced with <unk> (train_nounk.gz)"
|
||||
gunzip -c $dir/train_in.gz | awk -v w=$dir/wordlist \
|
||||
'BEGIN{while((getline<w)>0) v[$1]=1;}
|
||||
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<unk> ";print ""}'|sed 's/ $//g' \
|
||||
|
@ -93,7 +93,7 @@ gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<
|
|||
|
||||
# To save disk space, remove the un-mapped training data. We could
|
||||
# easily generate it again if needed.
|
||||
rm $dir/train_nounk.gz
|
||||
rm $dir/train_nounk.gz
|
||||
|
||||
|
||||
##################################################################
|
||||
|
@ -177,7 +177,7 @@ prune_lm.sh --arpa 5.0 $dir/4gram
|
|||
# The default LM chosen to be the last pruned 4gram-mincount
|
||||
#
|
||||
# Note: One can cheat and provide an external ARPA LM here!!!
|
||||
# To do so, make sure that
|
||||
# To do so, make sure that
|
||||
# -- its vocabulary is fully covered by $lang/words.txt,
|
||||
# -- it is gzipped and
|
||||
# -- it is placed in the $dir directory.
|
||||
|
@ -205,14 +205,9 @@ echo "Compiling $gzipped_ARPA_LM into $lang/G.fst"
|
|||
|
||||
. ./path.sh || exit 1;
|
||||
gunzip -c $gzipped_ARPA_LM | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
|
||||
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $lang/G.fst || exit 1;
|
||||
fstisstochastic $lang/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$lang/words.txt - $lang/G.fst || exit 1;
|
||||
fstisstochastic $lang/G.fst
|
||||
|
||||
##################################################################
|
||||
# Redo the FST step after reviewing perplexities reported by the
|
||||
|
@ -220,4 +215,3 @@ gunzip -c $gzipped_ARPA_LM | \
|
|||
##################################################################
|
||||
|
||||
exit 0
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/bin/bash
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
if [ -f path.sh ]; then . path.sh; fi
|
||||
|
@ -12,25 +12,18 @@ mkdir -p data/lang_test
|
|||
cp -r data/lang/* data/lang_test
|
||||
|
||||
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
|
||||
# LM doesn't have these "invalid combinations". These can cause
|
||||
# LM doesn't have these "invalid combinations". These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
|
||||
# our word list. Since our LM doesn't have any, we just give it
|
||||
# /dev/null [we leave it in the script to show how you'd do it].
|
||||
gunzip -c "$arpa_lm" | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl /dev/null | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
|
||||
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
|
||||
|
||||
|
||||
echo "Checking how stochastic G is (the first of these numbers should be small):"
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
|
||||
## Check lexicon.
|
||||
## just have a look and make sure it seems sane.
|
||||
|
@ -59,4 +52,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
|
|||
|
||||
|
||||
echo "$0 succeeded"
|
||||
|
||||
|
|
|
@ -17,11 +17,9 @@
|
|||
echo "Preparing train and test data"
|
||||
srcdir=data/local/data
|
||||
lmdir=data/local/nist_lm
|
||||
tmpdir=data/local/lm_tmp
|
||||
lexicon=data/local/lang_tmp/lexiconp.txt
|
||||
mkdir -p $tmpdir
|
||||
|
||||
for x in test_eval92_clean test_eval92_noisy test_eval92_5k_clean test_eval92_5k_noisy dev_dt_05_clean dev_dt_05_reverb dev_dt_05_noisy dev_dt_20_clean dev_dt_20_reverb dev_dt_20_noisy train_si84_clean train_si84_reverb train_si84_noisy; do
|
||||
for x in test_eval92_clean test_eval92_noisy test_eval92_5k_clean test_eval92_5k_noisy dev_dt_05_clean dev_dt_05_reverb dev_dt_05_noisy dev_dt_20_clean dev_dt_20_reverb dev_dt_20_noisy train_si84_clean train_si84_reverb train_si84_noisy; do
|
||||
mkdir -p data/$x
|
||||
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
|
||||
cp $srcdir/$x.txt data/$x/text || exit 1;
|
||||
|
@ -42,25 +40,10 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
|
|||
cp -r data/lang/* $test
|
||||
|
||||
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
|
||||
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
|
||||
|
||||
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
||||
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
||||
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
|
||||
# which are supposed to occur only at beginning/end of utt.  These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$test/words.txt - $test/G.fst
|
||||
|
||||
utils/validate_lang.pl $test || exit 1;
|
||||
done
|
||||
|
||||
echo "Succeeded in formatting data."
|
||||
rm -r $tmpdir
|
||||
|
|
|
@ -18,7 +18,7 @@ if [ $# -ne 1 ]; then
|
|||
exit 1;
|
||||
fi
|
||||
|
||||
# check data directories
|
||||
# check data directories
|
||||
chime3_data=$1
|
||||
wsj0_data=$chime3_data/data/WSJ0 # directory of WSJ0 in CHiME3. You can also specify your WSJ0 corpus directory
|
||||
if [ ! -d $chime3_data ]; then
|
||||
|
@ -70,7 +70,7 @@ else
|
|||
| awk -v voc=$dir/vocab_5k.txt '
|
||||
BEGIN{ while((getline<voc)>0) { invoc[$1]=1; }}
|
||||
/^</{next}{
|
||||
for (x=1;x<=NF;x++) {
|
||||
for (x=1;x<=NF;x++) {
|
||||
w=toupper($x);
|
||||
if (invoc[w]) { printf("%s ",w); } else { printf("<UNK> "); }
|
||||
}
|
||||
|
@ -88,7 +88,7 @@ else
|
|||
$chime3_data/data/transcriptions/dt05_simu.trn_all \
|
||||
|gzip -c > $dir/valid.gz
|
||||
fi
|
||||
|
||||
|
||||
# train a large n-gram language model
|
||||
lm_suffix=${order}gkn_5k
|
||||
if [ -f $dir/lm_${lm_suffix}.arpa.gz ]; then
|
||||
|
@ -121,22 +121,8 @@ for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \
|
|||
cp -r data/lang/$f $test
|
||||
done
|
||||
gunzip -c $dir/lm_${lm_suffix}.arpa.gz | \
|
||||
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
|
||||
|
||||
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
||||
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
||||
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
|
||||
# which are supposed to occur only at beginning/end of utt.  These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
gunzip -c $dir/lm_${lm_suffix}.arpa.gz | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$test/words.txt - $test/G.fst
|
||||
fstisstochastic $test/G.fst
|
||||
# The output is like:
|
||||
# 9.14233e-05 -0.259833
|
||||
|
@ -154,10 +140,9 @@ awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "
|
|||
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
|
||||
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
|
||||
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
echo "Language model has cycles with empty words" && exit 1
|
||||
rm -r $tmpdir/g
|
||||
|
||||
echo "Succeeded in preparing a large ${order}-gram LM"
|
||||
rm -r $tmpdir
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@ tmpdir=data/local/lm_tmp
|
|||
lexicon=data/local/lang_tmp/lexiconp.txt
|
||||
mkdir -p $tmpdir
|
||||
|
||||
for x in et05_orig_clean dt05_orig_clean tr05_orig_clean; do
|
||||
for x in et05_orig_clean dt05_orig_clean tr05_orig_clean; do
|
||||
mkdir -p data/$x
|
||||
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
|
||||
cp $srcdir/$x.txt data/$x/text || exit 1;
|
||||
|
@ -43,29 +43,15 @@ for lm_suffix in tgpr_5k; do
|
|||
cp -r data/lang/$f $test
|
||||
done
|
||||
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
|
||||
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
|
||||
|
||||
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
||||
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
||||
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
|
||||
# which are supposed to occur only at beginning/end of utt.  These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$test/words.txt - $test/G.fst
|
||||
fstisstochastic $test/G.fst
|
||||
# The output is like:
|
||||
# 9.14233e-05 -0.259833
|
||||
# we do expect the first of these 2 numbers to be close to zero (the second is
|
||||
# nonzero because the backoff weights make the states sum to >1).
|
||||
# Because of the <s> fiasco for these particular LMs, the first number is not
|
||||
# as close to zero as it could be.
|
||||
# The output is like:
|
||||
# 9.14233e-05 -0.259833
|
||||
# we do expect the first of these 2 numbers to be close to zero (the second is
|
||||
# nonzero because the backoff weights make the states sum to >1).
|
||||
# Because of the <s> fiasco for these particular LMs, the first number is not
|
||||
# as close to zero as it could be.
|
||||
|
||||
# Everything below is only for diagnostic.
|
||||
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
|
||||
|
@ -76,7 +62,7 @@ for lm_suffix in tgpr_5k; do
|
|||
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
|
||||
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
|
||||
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
echo "Language model has cycles with empty words" && exit 1
|
||||
rm -r $tmpdir/g
|
||||
done
|
||||
|
|
|
@ -25,13 +25,10 @@ for lm_suffix in bg; do
|
|||
test=data/lang_test_${lm_suffix}
|
||||
mkdir -p $test
|
||||
cp -r data/lang/* $test
|
||||
|
||||
|
||||
gunzip -c $lmdir/lm_phone_${lm_suffix}.arpa.gz | \
|
||||
egrep -v '<s> <s>|</s> <s>|</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$test/words.txt - $test/G.fst
|
||||
fstisstochastic $test/G.fst
|
||||
# The output is like:
|
||||
# 9.14233e-05 -0.259833
|
||||
|
@ -49,7 +46,7 @@ for lm_suffix in bg; do
|
|||
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
|
||||
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
|
||||
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
echo "Language model has cycles with empty words" && exit 1
|
||||
rm -r $tmpdir/g
|
||||
done
|
||||
|
|
|
@ -25,13 +25,10 @@ for lm_suffix in bg; do
|
|||
test=data/lang_test_${lm_suffix}
|
||||
mkdir -p $test
|
||||
cp -r data/lang/* $test
|
||||
|
||||
|
||||
gunzip -c $lmdir/lm_phone_${lm_suffix}.arpa.gz | \
|
||||
egrep -v '<s> <s>|</s> <s>|</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$test/words.txt - $test/G.fst
|
||||
fstisstochastic $test/G.fst
|
||||
# The output is like:
|
||||
# 9.14233e-05 -0.259833
|
||||
|
@ -49,7 +46,7 @@ for lm_suffix in bg; do
|
|||
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
|
||||
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
|
||||
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
echo "Language model has cycles with empty words" && exit 1
|
||||
rm -r $tmpdir/g
|
||||
done
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/bin/bash
|
||||
#!/bin/bash
|
||||
# Copyright 2014 Gaurav Kumar. Apache 2.0
|
||||
#
|
||||
|
||||
|
@ -12,26 +12,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
|
|||
mkdir -p data/lang_test
|
||||
cp -r data/lang/* data/lang_test
|
||||
|
||||
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
|
||||
# LM doesn't have these "invalid combinations". These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
|
||||
# our word list. Since our LM doesn't have any, we just give it
|
||||
# /dev/null [we leave it in the script to show how you'd do it].
|
||||
gunzip -c "$arpa_lm" | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl /dev/null | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
|
||||
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
|
||||
|
||||
|
||||
echo "Checking how stochastic G is (the first of these numbers should be small):"
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
|
||||
## Check lexicon.
|
||||
## just have a look and make sure it seems sane.
|
||||
|
@ -60,4 +47,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
|
|||
|
||||
|
||||
echo "$0 succeeded"
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/bin/bash
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
if [ -f path.sh ]; then . path.sh; fi
|
||||
|
@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
|
|||
mkdir -p data/lang_test
|
||||
cp -r data/lang/* data/lang_test
|
||||
|
||||
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
|
||||
# LM doesn't have these "invalid combinations". These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
|
||||
# our word list. Since our LM doesn't have any, we just give it
|
||||
# /dev/null [we leave it in the script to show how you'd do it].
|
||||
gunzip -c "$arpa_lm" | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl /dev/null | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
|
||||
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
|
||||
|
||||
|
||||
echo "Checking how stochastic G is (the first of these numbers should be small):"
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
|
||||
## Check lexicon.
|
||||
## just have a look and make sure it seems sane.
|
||||
|
@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
|
|||
|
||||
|
||||
echo "$0 succeeded"
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/bin/bash
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
if [ -f path.sh ]; then . path.sh; fi
|
||||
|
@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
|
|||
mkdir -p data/lang_test
|
||||
cp -r data/lang/* data/lang_test
|
||||
|
||||
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
|
||||
# LM doesn't have these "invalid combinations". These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
|
||||
# our word list. Since our LM doesn't have any, we just give it
|
||||
# /dev/null [we leave it in the script to show how you'd do it].
|
||||
gunzip -c "$arpa_lm" | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl /dev/null | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
|
||||
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
|
||||
|
||||
|
||||
echo "Checking how stochastic G is (the first of these numbers should be small):"
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
|
||||
## Check lexicon.
|
||||
## just have a look and make sure it seems sane.
|
||||
|
@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
|
|||
|
||||
|
||||
echo "$0 succeeded"
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/bin/bash
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
if [ -f path.sh ]; then . path.sh; fi
|
||||
|
@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
|
|||
mkdir -p data/lang_test_fsh
|
||||
cp -r data/lang/* data/lang_test_fsh
|
||||
|
||||
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
|
||||
# LM doesn't have these "invalid combinations". These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
|
||||
# our word list. Since our LM doesn't have any, we just give it
|
||||
# /dev/null [we leave it in the script to show how you'd do it].
|
||||
gunzip -c "$arpa_lm" | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl /dev/null | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test_fsh/words.txt \
|
||||
--osymbols=data/lang_test_fsh/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_fsh/G.fst
|
||||
fstisstochastic data/lang_test_fsh/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=data/lang_test_fsh/words.txt - data/lang_test_fsh/G.fst  # same dir that fstisstochastic checks below
|
||||
|
||||
|
||||
echo "Checking how stochastic G is (the first of these numbers should be small):"
|
||||
fstisstochastic data/lang_test_fsh/G.fst
|
||||
fstisstochastic data/lang_test_fsh/G.fst
|
||||
|
||||
## Check lexicon.
|
||||
## just have a look and make sure it seems sane.
|
||||
|
@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test_fsh/G.fst | \
|
|||
|
||||
|
||||
echo "$0 succeeded"
|
||||
|
||||
|
|
|
@ -6,9 +6,9 @@
|
|||
if [ -f path.sh ]; then
|
||||
. path.sh; else
|
||||
echo "missing path.sh"; exit 1;
|
||||
fi
|
||||
fi
|
||||
|
||||
for dir in test train; do
|
||||
for dir in test train; do
|
||||
cp -pr data/local/$dir data/$dir
|
||||
done
|
||||
|
||||
|
@ -21,26 +21,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
|
|||
rm -r data/lang_test
|
||||
cp -r data/lang data/lang_test
|
||||
|
||||
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
|
||||
# LM doesn't have these "invalid combinations". These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
|
||||
# our word list. Since our LM doesn't have any, we just give it
|
||||
# /dev/null [we leave it in the script to show how you'd do it].
|
||||
gunzip -c "$arpa_lm" | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl /dev/null | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
|
||||
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
|
||||
|
||||
|
||||
echo "Checking how stochastic G is (the first of these numbers should be small):"
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
|
||||
## Check lexicon.
|
||||
## just have a look and make sure it seems sane.
|
||||
|
|
|
@ -6,9 +6,9 @@
|
|||
if [ -f path.sh ]; then
|
||||
. path.sh; else
|
||||
echo "missing path.sh"; exit 1;
|
||||
fi
|
||||
fi
|
||||
|
||||
for dir in dev train; do
|
||||
for dir in dev train; do
|
||||
cp -pr data/local/$dir data/$dir
|
||||
done
|
||||
|
||||
|
@ -22,26 +22,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
|
|||
rm -r data/lang_dev
|
||||
cp -r data/lang data/lang_dev
|
||||
|
||||
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
|
||||
# LM doesn't have these "invalid combinations". These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
|
||||
# our word list. Since our LM doesn't have any, we just give it
|
||||
# /dev/null [we leave it in the script to show how you'd do it].
|
||||
gunzip -c "$arpa_lm" | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl /dev/null | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_dev/words.txt \
|
||||
--osymbols=data/lang_dev/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_dev/G.fst
|
||||
fstisstochastic data/lang_dev/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=data/lang_dev/words.txt - data/lang_dev/G.fst  # same dir that fstisstochastic checks below
|
||||
|
||||
|
||||
echo "Checking how stochastic G is (the first of these numbers should be small):"
|
||||
fstisstochastic data/lang_dev/G.fst
|
||||
fstisstochastic data/lang_dev/G.fst
|
||||
|
||||
## Check lexicon.
|
||||
## just have a look and make sure it seems sane.
|
||||
|
|
|
@ -40,20 +40,10 @@ function format_lms () {
|
|||
cp $work_dir/lang_test/$f $test
|
||||
done
|
||||
|
||||
# kkm: I am removing fstdeterminizelog from the following pipe, no point.
|
||||
gunzip -c $work_dir/local/lm_${lm_suffix}.arpa.gz \
|
||||
| find_arpa_oovs.pl $test/words.txt > $test/oovs_${lm_suffix}.txt
|
||||
|
||||
# Removing all "illegal" combinations of <s> and </s>, which are supposed to
|
||||
# occur only at beginning/end of utt. These can cause determinization failures
|
||||
# of CLG [ends up being epsilon cycles].
|
||||
gunzip -c $work_dir/local/lm_${lm_suffix}.arpa.gz \
|
||||
| egrep -v '<s> <s>|</s> <s>|</s> </s>' \
|
||||
| arpa2fst - | fstprint \
|
||||
| remove_oovs.pl $test/oovs_${lm_suffix}.txt \
|
||||
| eps2disambig.pl | s2eps.pl \
|
||||
| fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt \
|
||||
--keep_isymbols=false --keep_osymbols=false \
|
||||
| fstrmepsilon | fstdeterminizelog > $test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$test/words.txt - $test/G.fst
|
||||
set +e
|
||||
fstisstochastic $test/G.fst
|
||||
set -e
|
||||
|
@ -73,7 +63,7 @@ function format_lms () {
|
|||
< $work_dir/local/lexicon_??.txt >tmpdir.g/select_empty.fst.txt
|
||||
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt tmpdir.g/select_empty.fst.txt | \
|
||||
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > tmpdir.g/empty_words.fst
|
||||
fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
echo "Language model has cycles with empty words" && exit 1
|
||||
rm -r tmpdir.g
|
||||
|
||||
|
@ -99,7 +89,7 @@ echo "Preparing language models for test"
|
|||
format_lms GE17k_tg $WDIR/GE;
|
||||
format_lms GE17k_tg_pr $WDIR/GE; } >& $WDIR/GE/format_lms.log
|
||||
|
||||
# German - 60K
|
||||
# German - 60K
|
||||
{ format_lms GE60k_bg $WDIR/GE;
|
||||
format_lms GE60k_tg $WDIR/GE;
|
||||
format_lms GE60k_tg_pr $WDIR/GE; } >> $WDIR/GE/format_lms.log 2>&1
|
||||
|
@ -115,7 +105,7 @@ echo "Preparing language models for test"
|
|||
format_lms SP23k_tg_pr $WDIR/SP; } >& $WDIR/SP/format_lms.log
|
||||
|
||||
# Swedish - 24K
|
||||
# TODO(arnab): Something going wrong with the Swedish trigram LM.
|
||||
# TODO(arnab): Something going wrong with the Swedish trigram LM.
|
||||
{ # format_lms SW24k_tg $WDIR/SW;
|
||||
# format_lms SW24k_tg_pr $WDIR/SW;
|
||||
format_lms SW24k_bg $WDIR/SW; } >& $WDIR/SW/format_lms.log
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/bin/bash
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
if [ -f path.sh ]; then . path.sh; fi
|
||||
|
@ -23,26 +23,13 @@ done
|
|||
rm -r data/lang_test
|
||||
cp -r data/lang data/lang_test
|
||||
|
||||
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
|
||||
# LM doesn't have these "invalid combinations". These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
|
||||
# our word list. Since our LM doesn't have any, we just give it
|
||||
# /dev/null [we leave it in the script to show how you'd do it].
|
||||
gunzip -c "$arpa_lm" | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl /dev/null | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
|
||||
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
|
||||
|
||||
|
||||
echo "Checking how stochastic G is (the first of these numbers should be small):"
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
|
||||
## Check lexicon.
|
||||
## just have a look and make sure it seems sane.
|
||||
|
@ -71,4 +58,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
|
|||
|
||||
|
||||
echo hkust_format_data succeeded.
|
||||
|
||||
|
|
|
@ -34,22 +34,10 @@ mfccdir=mfcc
|
|||
# here.
|
||||
lang=data/lang
|
||||
lang_test=data/lang_test
|
||||
lang_test_tmp=data/local/lang_test_tmp/
|
||||
mkdir -p $lang_test_tmp
|
||||
mkdir -p $lang_test
|
||||
cp -r $lang/* $lang_test
|
||||
gunzip -c $lm | utils/find_arpa_oovs.pl $lang_test/words.txt \
|
||||
> $lang_test_tmp/oovs.txt || exit 1
|
||||
gunzip -c $lm | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl $lang_test_tmp/oovs.txt | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | \
|
||||
fstcompile --isymbols=$lang_test/words.txt --osymbols=$lang_test/words.txt \
|
||||
--keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $lang_test/G.fst
|
||||
gunzip -c $lm | arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$lang_test/words.txt - $lang_test/G.fst
|
||||
utils/validate_lang.pl --skip-determinization-check $lang_test || exit 1;
|
||||
|
||||
# Compiles decoding graph.
|
||||
|
|
|
@ -50,22 +50,8 @@ for lm_suffix in bg_5k tg_5k; do
|
|||
cp -r data/lang/$f $test
|
||||
done
|
||||
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
|
||||
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
|
||||
|
||||
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
||||
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
||||
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
|
||||
# which are supposed to occur only at beginning/end of utt. These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$test/words.txt - $test/G.fst
|
||||
fstisstochastic $test/G.fst
|
||||
# The output is like:
|
||||
# 9.14233e-05 -0.259833
|
||||
|
@ -83,7 +69,7 @@ for lm_suffix in bg_5k tg_5k; do
|
|||
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
|
||||
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
|
||||
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
echo "Language model has cycles with empty words" && exit 1
|
||||
rm -r $tmpdir/g
|
||||
done
|
||||
|
|
|
@ -52,25 +52,10 @@ mkdir -p $test
|
|||
cp -r data/lang/* $test
|
||||
|
||||
cat $lmdir/sprak.arpa | \
|
||||
utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
|
||||
|
||||
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
||||
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
||||
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
|
||||
# which are supposed to occur only at beginning/end of utt. These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
cat $lmdir/sprak.arpa | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl $lmdir/oovs_${lm_suffix}.txt | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$test/words.txt - $test/G.fst
|
||||
|
||||
|
||||
utils/validate_lang.pl $test || exit 1;
|
||||
|
||||
exit 0;
|
||||
|
||||
|
|
|
@ -61,8 +61,8 @@ fi
|
|||
|
||||
|
||||
# Checks if espeak is available on the system. espeak is necessary to extend
|
||||
# the setup because the original transcriptions were created with espeak and
|
||||
# filtered
|
||||
# the setup because the original transcriptions were created with espeak and
|
||||
# filtered
|
||||
|
||||
if ! which espeak >&/dev/null; then
|
||||
echo "espeak is not available on your system. You must install espeak before proceeding."
|
||||
|
@ -95,7 +95,7 @@ if [ ! -f $extdict/lexicon.txt ];
|
|||
|
||||
|
||||
# Filter transcription
|
||||
# Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove
|
||||
# Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove
|
||||
# initial and trailing spaces and collapse 2 or more spaces to one space
|
||||
|
||||
cat $dir/plist.txt | perl -pe 's/\([[a-z]{2}\)//g' | perl -pe 's// /g' | perl -pe 's/ a I / aI /g' | perl -pe 's/ d Z / dZ /g' | perl -pe 's/ \? / /g' | perl -pe 's/ ([\#]) /\+ /g' | perl -pe 's/([\@n3]) \- /\1\- /g' | perl -pe "s/[\_\:\!\'\,\|2]//g" | perl -pe 's/ \- / /g' | tr -s ' ' | perl -pe 's/^ +| +$//g' > $dir/plist2.txt
|
||||
|
@ -128,7 +128,7 @@ if [ ! -f $lmdir/extra4.ngt ];
|
|||
|
||||
grep -P -v '^[\s?|\.|\!]*$' $newtext | \
|
||||
awk '{if(NF>=4){ printf("%s\n",$0); }}' > $lmdir/text.filt
|
||||
|
||||
|
||||
# Envelop LM training data in context cues
|
||||
add-start-end.sh < $lmdir/text.filt > $lmdir/lm_input
|
||||
|
||||
|
@ -151,22 +151,8 @@ mkdir -p $test
|
|||
cp -r $extlang $test
|
||||
|
||||
cat $lmdir/extra${N}$lm_suffix | \
|
||||
utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
|
||||
|
||||
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
||||
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
||||
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
|
||||
# which are supposed to occur only at beginning/end of utt. These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
cat $lmdir/extra${N}$lm_suffix | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl $lmdir/oovs_${lm_suffix}.txt | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$test/words.txt - $test/G.fst
|
||||
|
||||
utils/validate_lang.pl $test || exit 1;
|
||||
|
||||
|
|
|
@ -66,22 +66,8 @@ mkdir -p $test
|
|||
cp -r $srcdir/* $test
|
||||
|
||||
cat $lmdir/train${ngram}.arpa | \
|
||||
utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt
|
||||
|
||||
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
||||
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
||||
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
|
||||
# which are supposed to occur only at beginning/end of utt. These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
cat $lmdir/train${ngram}.arpa | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl $lmdir/oovs_${lm_suffix}.txt | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$test/words.txt - $test/G.fst
|
||||
|
||||
utils/validate_lang.pl $test || exit 1;
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/bin/bash
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
if [ -f path.sh ]; then . path.sh; fi
|
||||
|
@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
|
|||
mkdir -p data/lang_test
|
||||
cp -r data/lang/* data/lang_test
|
||||
|
||||
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
|
||||
# LM doesn't have these "invalid combinations". These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
|
||||
# our word list. Since our LM doesn't have any, we just give it
|
||||
# /dev/null [we leave it in the script to show how you'd do it].
|
||||
gunzip -c "$arpa_lm" | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl /dev/null | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
|
||||
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
|
||||
|
||||
|
||||
echo "Checking how stochastic G is (the first of these numbers should be small):"
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
|
||||
## Check lexicon.
|
||||
## just have a look and make sure it seems sane.
|
||||
|
@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
|
|||
|
||||
|
||||
echo "$0 succeeded"
|
||||
|
||||
|
|
|
@ -4,13 +4,5 @@
|
|||
|
||||
cd data
|
||||
#convert to FST format for Kaldi
|
||||
cat local/swahili.arpa | ../utils/find_arpa_oovs.pl lang/words.txt > lang/oovs.txt
|
||||
cat local/swahili.arpa | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
../utils/remove_oovs.pl lang/oovs.txt | \
|
||||
../utils/eps2disambig.pl | ../utils/s2eps.pl | fstcompile --isymbols=lang/words.txt \
|
||||
--osymbols=lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon > lang/G.fst
|
||||
arpa2fst --disambig-symbol=#0 --read-symbol-table=lang/words.txt \
|
||||
local/swahili.arpa lang/G.fst
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/bin/bash
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
if [ -f path.sh ]; then . path.sh; fi
|
||||
|
@ -20,26 +20,13 @@ done
|
|||
rm -r data/lang_test
|
||||
cp -r data/lang data/lang_test
|
||||
|
||||
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
|
||||
# LM doesn't have these "invalid combinations". These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
|
||||
# our word list. Since our LM doesn't have any, we just give it
|
||||
# /dev/null [we leave it in the script to show how you'd do it].
|
||||
gunzip -c "$arpa_lm" | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl /dev/null | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
|
||||
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
|
||||
|
||||
|
||||
echo "Checking how stochastic G is (the first of these numbers should be small):"
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
|
||||
## Check lexicon.
|
||||
## just have a look and make sure it seems sane.
|
||||
|
@ -68,4 +55,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
|
|||
|
||||
|
||||
echo swbd_p1_format_data succeeded.
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
#!/bin/bash
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright 2014 Nickolay V. Shmyrev
|
||||
# Copyright 2014 Nickolay V. Shmyrev
|
||||
# Apache 2.0
|
||||
|
||||
|
||||
|
@ -12,21 +12,8 @@ arpa_lm=db/cantab-TEDLIUM/cantab-TEDLIUM-pruned.lm3.gz
|
|||
rm -rf data/lang_nosp_test
|
||||
cp -r data/lang_nosp data/lang_nosp_test
|
||||
|
||||
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
|
||||
# LM doesn't have these "invalid combinations". These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
|
||||
# our word list. Since our LM doesn't have any, we just give it
|
||||
# /dev/null [we leave it in the script to show how you'd do it].
|
||||
gunzip -c "$arpa_lm" | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl /dev/null | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_nosp_test/words.txt \
|
||||
--osymbols=data/lang_nosp_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_nosp_test/G.fst
|
||||
gunzip -c "$arpa_lm" | arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=data/lang_nosp_test/words.txt - data/lang_nosp_test/G.fst
|
||||
|
||||
|
||||
echo "Checking how stochastic G is (the first of these numbers should be small):"
|
||||
|
|
|
@ -16,7 +16,7 @@ tmpdir=data/local/lm_tmp
|
|||
lexicon=data/local/dict/lexicon.txt
|
||||
mkdir -p $tmpdir
|
||||
|
||||
for x in train dev test; do
|
||||
for x in train dev test; do
|
||||
mkdir -p data/$x
|
||||
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
|
||||
cp $srcdir/$x.text data/$x/text || exit 1;
|
||||
|
@ -37,13 +37,10 @@ for lm_suffix in bg; do
|
|||
test=data/lang_test_${lm_suffix}
|
||||
mkdir -p $test
|
||||
cp -r data/lang/* $test
|
||||
|
||||
|
||||
gunzip -c $lmdir/lm_phone_${lm_suffix}.arpa.gz | \
|
||||
egrep -v '<s> <s>|</s> <s>|</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$test/words.txt - $test/G.fst
|
||||
fstisstochastic $test/G.fst
|
||||
# The output is like:
|
||||
# 9.14233e-05 -0.259833
|
||||
|
@ -61,7 +58,7 @@ for lm_suffix in bg; do
|
|||
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
|
||||
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
|
||||
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
echo "Language model has cycles with empty words" && exit 1
|
||||
rm -r $tmpdir/g
|
||||
done
|
||||
|
|
|
@ -12,7 +12,7 @@ tmpdir=data/local/lm_tmp
|
|||
lexicon=data/local/dict/lexicon.txt
|
||||
mkdir -p $tmpdir
|
||||
|
||||
for x in train test; do
|
||||
for x in train test; do
|
||||
mkdir -p data/$x
|
||||
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
|
||||
cp $srcdir/${x}_trans.txt data/$x/text || exit 1;
|
||||
|
@ -33,22 +33,8 @@ for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones; do
|
|||
cp -r data/lang/$f $test
|
||||
done
|
||||
cat $lmdir/lm.arpa | \
|
||||
utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs.txt
|
||||
|
||||
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
||||
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
||||
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
|
||||
# which are supposed to occur only at beginning/end of utt.  These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
cat $lmdir/lm.arpa | \
|
||||
grep -v '<s> <s>' | \
|
||||
grep -v '</s> <s>' | \
|
||||
grep -v '</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl $tmpdir/oovs.txt | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
|
||||
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$test/words.txt - $test/G.fst
|
||||
fstisstochastic $test/G.fst
|
||||
# The output is like:
|
||||
# 9.14233e-05 -0.259833
|
||||
|
@ -67,9 +53,8 @@ awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "
|
|||
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt \
|
||||
$tmpdir/g/select_empty.fst.txt | \
|
||||
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
echo "Language model has cycles with empty words" && exit 1
|
||||
rm -rf $tmpdir
|
||||
|
||||
echo "*** Succeeded in formatting data."
|
||||
|
||||
|
|
|
@ -17,7 +17,7 @@ for lm in $LMs ; do
|
|||
lmp=$lmdir/`basename $lm`
|
||||
|
||||
tmpdir=$tgt/tmp
|
||||
mkdir -p $tgt
|
||||
mkdir -p $tgt
|
||||
mkdir -p $tmpdir
|
||||
|
||||
echo "--- Preparing the grammar transducer (G.fst) from $lmp in $tgt ..."
|
||||
|
@ -26,21 +26,9 @@ for lm in $LMs ; do
|
|||
ln -s $langdir/$f $tgt/$f 2> /dev/null
|
||||
done
|
||||
|
||||
cat $lmp | utils/find_arpa_oovs.pl $tgt/words.txt > $tmpdir/oovs.txt
|
||||
|
||||
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
||||
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
||||
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
|
||||
# which are supposed to occur only at beginning/end of utt.  These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
|
||||
cat $lmp | \
|
||||
grep -v '<s> <s>\|</s> <s>\|</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl $tmpdir/oovs.txt | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$tgt/words.txt \
|
||||
--osymbols=$tgt/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$tgt/words.txt - $tgt/G.fst
|
||||
fstisstochastic $tgt/G.fst
|
||||
# The output is like:
|
||||
# 9.14233e-05 -0.259833
|
||||
|
@ -48,7 +36,7 @@ for lm in $LMs ; do
|
|||
# nonzero because the backoff weights make the states sum to >1).
|
||||
# Because of the <s> fiasco for these particular LMs, the first number is not
|
||||
# as close to zero as it could be.
|
||||
|
||||
|
||||
# Everything below is only for diagnostic.
|
||||
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
|
||||
# this might cause determinization failure of CLG.
|
||||
|
@ -59,7 +47,7 @@ for lm in $LMs ; do
|
|||
fstcompile --isymbols=$tgt/words.txt --osymbols=$tgt/words.txt \
|
||||
$tmpdir/g/select_empty.fst.txt | \
|
||||
fstarcsort --sort_type=olabel | fstcompose - $tgt/G.fst > $tmpdir/g/empty_words.fst
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
echo "Language model has cycles with empty words" && exit 1
|
||||
|
||||
# rm -rf $tmpdir # TODO debugging
|
||||
|
|
|
@ -17,7 +17,7 @@ for lm in $LMs ; do
|
|||
lmp=$lmdir/`basename $lm`
|
||||
|
||||
tmpdir=$tgt/tmp
|
||||
mkdir -p $tgt
|
||||
mkdir -p $tgt
|
||||
mkdir -p $tmpdir
|
||||
|
||||
echo "--- Preparing the grammar transducer (G.fst) from $lmp in $tgt ..."
|
||||
|
@ -26,21 +26,9 @@ for lm in $LMs ; do
|
|||
ln -s $langdir/$f $tgt/$f 2> /dev/null
|
||||
done
|
||||
|
||||
cat $lmp | utils/find_arpa_oovs.pl $tgt/words.txt > $tmpdir/oovs.txt
|
||||
|
||||
# grep -v '<s> <s>' because the LM seems to have some strange and useless
|
||||
# stuff in it with multiple <s>'s in the history. Encountered some other similar
|
||||
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
|
||||
# which are supposed to occur only at beginning/end of utt.  These can cause
|
||||
# determinization failures of CLG [ends up being epsilon cycles].
|
||||
|
||||
cat $lmp | \
|
||||
grep -v '<s> <s>\|</s> <s>\|</s> </s>' | \
|
||||
arpa2fst - | fstprint | \
|
||||
utils/remove_oovs.pl $tmpdir/oovs.txt | \
|
||||
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$tgt/words.txt \
|
||||
--osymbols=$tgt/words.txt --keep_isymbols=false --keep_osymbols=false | \
|
||||
fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt/G.fst
|
||||
arpa2fst --disambig-symbol=#0 \
|
||||
--read-symbol-table=$tgt/words.txt - $tgt/G.fst
|
||||
fstisstochastic $tgt/G.fst
|
||||
# The output is like:
|
||||
# 9.14233e-05 -0.259833
|
||||
|
@ -48,7 +36,7 @@ for lm in $LMs ; do
|
|||
# nonzero because the backoff weights make the states sum to >1).
|
||||
# Because of the <s> fiasco for these particular LMs, the first number is not
|
||||
# as close to zero as it could be.
|
||||
|
||||
|
||||
# Everything below is only for diagnostic.
|
||||
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
|
||||
# this might cause determinization failure of CLG.
|
||||
|
@ -59,7 +47,7 @@ for lm in $LMs ; do
|
|||
fstcompile --isymbols=$tgt/words.txt --osymbols=$tgt/words.txt \
|
||||
$tmpdir/g/select_empty.fst.txt | \
|
||||
fstarcsort --sort_type=olabel | fstcompose - $tgt/G.fst > $tmpdir/g/empty_words.fst
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
echo "Language model has cycles with empty words" && exit 1
|
||||
|
||||
# rm -rf $tmpdir # TODO debugging
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#!/bin/bash
|
||||
|
||||
. path.sh
|
||||
|
||||
|
||||
echo Preparing language models for test
|
||||
|
||||
for lm_suffix in tg; do
|
||||
|
@ -10,10 +10,10 @@ for lm_suffix in tg; do
|
|||
rm -rf data/lang_test_${lm_suffix}
|
||||
cp -r data/lang data/lang_test_${lm_suffix}
|
||||
|
||||
cat input/task.arpabo | arpa2fst - | fstprint | utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||
#cat input/G.txt | utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
|
||||
arpa2fst --disambig-symbol=#0 --read-symbol-table=$test/words.txt input/task.arpabo $test/G.fst
|
||||
|
||||
fstisstochastic $test/G.fst
|
||||
|
||||
|
||||
# The output is like:
|
||||
# 9.14233e-05 -0.259833
|
||||
# we do expect the first of these 2 numbers to be close to zero (the second is
|
||||
|
@ -30,7 +30,7 @@ for lm_suffix in tg; do
|
|||
< data/local/dict/lexicon.txt >tmpdir.g/select_empty.fst.txt
|
||||
fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt tmpdir.g/select_empty.fst.txt | \
|
||||
fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > tmpdir.g/empty_words.fst
|
||||
fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' &&
|
||||
echo "Language model has cycles with empty words" && exit 1
|
||||
rm -r tmpdir.g
|
||||
done
|
||||
|
|
Загрузка…
Ссылка в новой задаче