Basic work towards the KW search -- create the keywords fsts and prepare the index using lattices and the KWfst

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@1940 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Jan Trmal 2013-01-24 23:39:35 +00:00
Parent f88a45bc6b
Commit cc620ded07
11 changed files: 593 additions and 8 deletions

Просмотреть файл

@ -17,3 +17,6 @@ numGaussSGMM=50000
# Change it to the LM with the best perplexity after run.sh finishes
lmForDecoding=$SysDir/sriLM/32hLM.gz
glmFile=`readlink -f ./conf/glm`

Просмотреть файл

@ -17,8 +17,16 @@ dev_data_dir=/export/babel/oguz/10Hsubsets/106B-delivery-v0.2f_10hSubset/convers
lexicon_file=/export/a09/jtrmal/babel/egs/Tagalog-10hSystem2a/Lexicon/lexicon.txt
# Scoring protocols (dummy GLM file to appease the scoring script)
glmFile=`readlink -f ./conf/glm`
glmFile=./conf/glm
train_nj=12
decode_nj=9
#keyword search settings
ecf_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.ecf.xml
kwlist_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.kwlist.xml
kwlist_file=/export/babel/data/scoring/IndusDB.20121102/babel106b-v0.2g_conv-dev.kwlist.xml
rttm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.rttm
#ln -sf `readlink -f babel/subsets/ecf.xml` $kwsdatadir/ecf.xml
. /export/babel/data/software/env.sh

Просмотреть файл

@ -27,6 +27,14 @@ lexiconFlags="-oov <unk>"
# Scoring protocols (dummy GLM file to appease the scoring script)
glmFile=./conf/glm
train_nj=32
train_nj=16
decode_nj=18
#keyword search settings
ecf_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.ecf.xml
# BUGFIX: this line read "kws_list= /path" -- the space after '=' assigned an
# empty value and then tried to execute the xml path as a command.  Also
# renamed to kwlist_file, the name used by the sibling configs and read by
# the kws_setup.sh invocation.
kwlist_file=/export/babel/data/scoring/IndusDB.20121102/babel106b-v0.2g_conv-dev.kwlist.xml
rttm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.rttm
#ln -sf `readlink -f babel/subsets/ecf.xml` $kwsdatadir/ecf.xml
. /export/babel/data/software/env.sh

Просмотреть файл

@ -101,6 +101,33 @@ function CHECKPOINT {
eval export $COUNTER_NAME=$COUNTER
}
# Kill every background job of the current shell that is still "Running".
# Parses the output of the `jobs` builtin with perl to extract the job
# numbers, then signals each one with `kill %N`.
# NOTE(review): the `while read` stage runs in a pipeline subshell; whether
# `kill %N` can resolve the parent's job table there depends on the bash
# version -- confirm this actually kills the jobs in the target environment.
function KILLBG_JOBS {
jobs \
| perl -ne 'print "$1\n" if m/^\[(\d+)\][+-]? +Running/;' \
| while read -r ; do kill %"$REPLY" ; done
}
# Print a colored diagnostic (to stderr) when a checkpointed command fails.
# Scans the shell variables for CHECKPOINT_*_COUNTER values; if any exist,
# prints them rewritten as "LAST_GOOD_<NAME>=<n>" (the DEFAULT counter
# becomes plain "LAST_GOOD=<n>") so the user can resume the script with
# "-c COUNTER_NAME=COUNTER_VALUE".  Otherwise prints a generic failure note.
function ONEXIT_HANDLER {
  local COLOR_RED='\e[00;31m'
  local COLOR_DEFAULT='\e[00m'
  local counters
  # Turn "CHECKPOINT_FOO_COUNTER=n" into "LAST_GOOD_FOO=n".
  counters=$(set | egrep "^CHECKPOINT_[_A-Z]+_COUNTER=" \
    | sed 's/^CHECKPOINT\(_[_A-Z][_A-Z]*\)_COUNTER=/LAST_GOOD\1=/g' \
    | sed "s/^LAST_GOOD_DEFAULT=/LAST_GOOD=/g")
  if [[ -n "$counters" ]]; then
    echo -e "${COLOR_RED}CHECKPOINT FAILURE: The last command returned non-zero status${COLOR_DEFAULT}" >&2
    echo -e "${COLOR_RED}look at the counters and try to rerun this script (after figuring the issue)${COLOR_DEFAULT}" >&2
    echo -e "${COLOR_RED}using the -c COUNTER_NAME=COUNTER_VALUE parameters;${COLOR_DEFAULT}" >&2
    echo -e "${COLOR_RED}You can use -c \"COUNTER_NAME1=COUNTER_VALUE1;COUNTER_NAME2=COUNTER_VALUE2\" as well${COLOR_DEFAULT}" >&2
    # BUGFIX: $counters is now quoted -- unquoted, word splitting collapsed
    # the multi-line counter list onto a single line.
    echo -e "${COLOR_RED}The counters: \n $counters${COLOR_DEFAULT}" >&2
  else
    echo -e "${COLOR_RED}CHECKPOINT FAILURE: The last command returned non-zero status${COLOR_DEFAULT}" >&2
    echo -e "${COLOR_RED}No checkpoint was found. Try to figure out the problem and ${COLOR_DEFAULT}" >&2
    echo -e "${COLOR_RED}run the script again${COLOR_DEFAULT}" >&2
  fi
}
trap "ONEXIT_HANDLER; exit; " SIGINT SIGKILL SIGTERM ERR
while getopts ":c:i" opt; do
case $opt in
c)

Просмотреть файл

@ -3,6 +3,13 @@
# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
# Apache 2.0.
# Begin configuration section.
case_insensitive=true
# End configuration section.
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: local/kws_data_prep.sh <lang-dir> <data-dir> <kws-data-dir>"
@ -13,9 +20,8 @@ fi
langdir=$1;
datadir=$2;
kwsdatadir=$3;
keywords=$kwsdatadir/kws.xml
keywords=$kwsdatadir/kwlist.xml
case_insensitive=true
mkdir -p $kwsdatadir;
@ -39,9 +45,10 @@ cat $keywords | perl -e '
# are not in our $langdir/words.txt, as we won't find them anyway...
#cat $kwsdatadir/keywords.txt | babel/filter_keywords.pl $langdir/words.txt - - | \
# sym2int.pl --map-oov 0 -f 2- $langdir/words.txt | \
if $case_insensitive ; then
if $case_insensitive ; then
echo "Running case insensitive processing"
cat $langdir/words.txt | tr '[:lower:]' '[:upper:]' > $kwsdatadir/words.txt
[ `cut -f 1 -d ' ' $kwsdatadir/words.txt | wc -l` -ne `cat $kwsdatadir/words.txt | wc -l` ] && echo "Warning, multiple words in dictionary differ only in case..."
[ `cut -f 1 -d ' ' $kwsdatadir/words.txt | sort -u | wc -l` -ne `cat $kwsdatadir/words.txt | wc -l` ] && echo "Warning, multiple words in dictionary differ only in case..."
cat $kwsdatadir/keywords.txt | tr '[:lower:]' '[:upper:]' | \
sym2int.pl --map-oov 0 -f 2- $kwsdatadir/words.txt > $kwsdatadir/keywords_all.int
@ -55,11 +62,11 @@ cat $kwsdatadir/keywords_all.int | \
grep -v " 0 " | grep -v " 0$" > $kwsdatadir/keywords.int
cut -f 1 -d ' ' $kwsdatadir/keywords.int | \
babel/subset_kwslist.pl $keywords > $kwsdatadir/keyword_invocab.xml
local/subset_kwslist.pl $keywords > $kwsdatadir/kwlist_invocab.xml
cat $kwsdatadir/keywords_all.int | \
egrep " 0 | 0$" | cut -f 1 -d ' ' | \
babel/subset_kwslist.pl $keywords > $kwsdatadir/keyword_outvocab.xml
local/subset_kwslist.pl $keywords > $kwsdatadir/kwlist_outvocab.xml

Просмотреть файл

@ -0,0 +1,47 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
# Apache 2.0.

# Run keyword search (KWS) against an existing decode directory.
#   Usage: kws_search.sh <lang-dir> <data-dir> <decode-dir>
# Builds an index over the decoding lattices (local/make_index.sh), searches
# it with the prepared keyword FSTs (local/search_index.sh), and writes the
# post-processed hit list to <decode-dir>/kws/kwslist.xml.
. ./path.sh
. ./cmd.sh
# Begin configuration section.
acwt=0.0909091 # acoustic scale (1/11) used when building the index
duptime=0.6 # hits closer than this many seconds are treated as duplicates
cmd=run.pl
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
# NOTE(review): positional args are deliberately read out of order
# (lang=$1, data=$2, decode=$3); there is no argument-count check.
datadir=$2
langdir=$1
decodedir=$3
kwsdatadir=$datadir/kws
kwsoutdir=$decodedir/kws
mkdir -p $kwsdatadir
mkdir -p $kwsoutdir
# Build the inverted index from the lattices, then search it.
local/make_index.sh --cmd "$cmd" --acwt $acwt \
$kwsdatadir $langdir $decodedir $kwsoutdir || exit 1
local/search_index.sh $kwsdatadir $kwsoutdir || exit 1
# Pull the total duration out of the ecf.xml header line; the perl stage
# halves it ($_/2) -- presumably to count only one channel of the
# two-channel conversational audio. TODO confirm.
duration=`head -1 $kwsdatadir/ecf.xml |\
grep -o -E "duration=\"[0-9]*[ \.]*[0-9]*\"" |\
grep -o -E "[0-9]*[\.]*[0-9]*" |\
perl -e 'while(<>) {print $_/2;}'`
# Merge per-job results, normalize scores into a kwslist, and drop
# duplicate hits that fall within $duptime seconds of each other.
cat $kwsoutdir/result.* | \
utils/write_kwslist.pl --flen=0.01 --duration=$duration \
--segments=$datadir/segments --normalize=true \
--map-utter=$kwsdatadir/utter_map \
- - | \
utils/filter_kwslist.pl $duptime > $kwsoutdir/kwslist.xml

32
egs/babel/s5/local/kws_setup.sh Executable file
Просмотреть файл

@ -0,0 +1,32 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen, Yenda Trmal)
# Apache 2.0.

# Set up the keyword-search (KWS) data directory.
#   Usage: kws_setup.sh <ecf-file> <kwlist-file> <rttm-file> <lang-dir> <data-dir>
# Copies the ECF, keyword list and reference RTTM into <data-dir>/kws and
# runs local/kws_data_prep.sh to prepare the keyword FSTs.
. ./path.sh
. ./cmd.sh
# Begin configuration section.
cmd=run.pl
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
ecf_file=$1
kwlist_file=$2
rttm_file=$3
langdir=$4
datadir=$5
kwsdatadir=$datadir/kws
mkdir -p $kwsdatadir
# `readlink -f` resolves symlinks so the copies come from the real files.
cp `readlink -f $ecf_file` $kwsdatadir/ecf.xml || exit 1
cp `readlink -f $kwlist_file` $kwsdatadir/kwlist.xml || exit 1
cp `readlink -f $rttm_file` $kwsdatadir/rttm || exit 1
local/kws_data_prep.sh --case-insensitive true $langdir $datadir $kwsdatadir || exit 1

Просмотреть файл

@ -0,0 +1,35 @@
#!/bin/bash
# Create a subset of a Babel corpus directory by symlinking the audio (.sph)
# and transcription (.txt) files named in a file list into a new directory.
#
# Usage: make_corpus_subset.sh <input-data-dir> <input-filelist> <output-data-dir>
#   <input-data-dir>   source corpus dir containing audio/ and transcription/
#   <input-filelist>   one file basename (no extension) per line
#   <output-data-dir>  target dir; audio/ and transcription/ are created in it
# Exits non-zero if any listed file is missing from the source directory.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
input_data_dir=$1
input_filelist=$2
output_data_dir=$3
mkdir -p $output_data_dir/transcription
mkdir -p $output_data_dir/audio
abs_src_dir=`readlink -f $input_data_dir`
abs_tgt_dir=`readlink -f $output_data_dir`
# BUGFIX: the loop previously read $input_data_list, which is never set (the
# second positional argument is stored as $input_filelist above), so the
# loop body never ran.
for file_basename in `cat $input_filelist`; do
  if [[ -e $abs_src_dir/audio/$file_basename.sph ]] ; then
    ln -sf $abs_src_dir/audio/$file_basename.sph $abs_tgt_dir/audio || exit 1
  else
    echo "File $abs_src_dir/audio/$file_basename.sph does not exist!"
    exit 1
  fi
  if [[ -e $abs_src_dir/transcription/$file_basename.txt ]] ; then
    ln -sf $abs_src_dir/transcription/$file_basename.txt $abs_tgt_dir/transcription || exit 1
  else
    # BUGFIX: this message reported the audio/.sph path instead of the
    # missing transcription file.
    echo "File $abs_src_dir/transcription/$file_basename.txt does not exist!"
    exit 1
  fi
done

Просмотреть файл

@ -0,0 +1,30 @@
#!/bin/bash
# Filter a lexicon down to the words that actually occur in a set of
# transcriptions: collect the unique tokens from all *.txt transcripts
# (skipping pure time-mark lines such as "[12.345]"), then keep only the
# lexicon entries whose head word (first TAB-separated field) occurs there.
#
# Usage: <script> <transcription-dir> <input-lexicon> <output-lexicon>
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
# BUGFIX: these were written as "$transcriptions=$1" etc.; the leading '$'
# expands the (empty) variable so the shell tries to execute "=<arg>" as a
# command instead of performing an assignment.
transcriptions=$1
input_lexicon_file=$2
output_lexicon_file=$3
(
#find $dev_data_dir/transcription/ -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | sed 's/ /\n/g'
find $transcriptions -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | sed 's/ /\n/g'
) | sort -u | awk '
BEGIN {
  # Load the lexicon (ARGV[2]) into an array keyed by head word.
  while(( getline line< ARGV[2] ) > 0 ) {
    split(line, e, "\t")
    LEXICON[ e[1] ]=line
  }
  FILENAME="-"
  i=0
  # Read the sorted word list (ARGV[1] == "-", i.e. stdin) and emit the
  # lexicon entry for every word that has one.
  while(( getline word< ARGV[1] ) > 0 ) {
    if (word in LEXICON)
      print LEXICON[word]
  }
}
' - $input_lexicon_file | sort -u > $output_lexicon_file

384
egs/babel/s5/run-limited-chk.sh Executable file
Просмотреть файл

@ -0,0 +1,384 @@
#!/bin/bash
# Checkpointed limited-language-pack Babel run: data subsetting and
# configuration, followed (further down the script) by the full
# mono -> tri -> SGMM -> MMI training pipeline.  Each major step is wrapped
# in the CHK checkpointing helper so a failed run can be resumed.
# System and data directories
#SCRIPT=$(readlink $0)
#SysDir=`dirname $SCRIPT`
SysDir=`pwd`
echo $SysDir
# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="-oov <unk>"
# Scoring protocols (dummy GLM file to appease the scoring script)
glmFile=`readlink -f ./conf/glm`
# Include the checkpointing facility
. ./local/CHECKPOINT.sh
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
[ -f ./cmd.sh ] && . ./cmd.sh; # source train and decode cmds.
. parse_options.sh || exit 1;
# The single positional argument is a config file defining (at least)
# train_data_dir/train_data_list, dev_data_dir/dev_data_list and the
# numLeaves*/numGauss* model sizes; local.conf may override it.
configfile=$1
[ -f $configfile ] && . $configfile
[ -f ./local.conf ] && . ./local.conf
#Preparing dev and train directories
if test -f $train_data_list ; then
echo ---------------------------------------------------------------------
echo "Subsetting the TRAIN set"
echo ---------------------------------------------------------------------
CHK local/make_corpus_subset.sh $train_data_dir $train_data_list ./data/raw_train_data || exit 1
train_data_dir=`readlink -f ./data/raw_train_data`
# Cap the number of parallel jobs at the number of corpus files.
nj_max=`cat $train_data_list | wc -l`
if [[ "$nj_max" -lt "$train_nj" ]] ; then
echo "The maximum reasonable number of jobs is $nj_max (you have $train_nj)! (The training and decoding process has file-granularity)"
train_nj=$nj_max
fi
fi
if test -f $dev_data_list ; then
echo ---------------------------------------------------------------------
echo "Subsetting the DEV set"
echo ---------------------------------------------------------------------
CHK local/make_corpus_subset.sh $dev_data_dir $dev_data_list ./data/raw_dev_data || exit 1
dev_data_dir=`readlink -f ./data/raw_dev_data`
nj_max=`cat $dev_data_list | wc -l`
if [[ "$nj_max" -lt "$decode_nj" ]] ; then
echo "The maximum reasonable number of jobs is $nj_max "
echo "you have $decode_nj! (The training and decoding process has file-granularity)"
decode_nj=$nj_max
fi
fi
if [[ $filter_lexicon ]]; then
echo ---------------------------------------------------------------------
echo "Subsetting the LEXICON"
echo ---------------------------------------------------------------------
lexicon_dir=./data/raw_lex_data
mkdir -p $lexicon_dir
# NOTE(review): make_corpus_subset.sh takes (src-dir, filelist, out-dir);
# here it is passed (transcription-dir, lexicon, output-file) -- verify this
# is the intended helper for lexicon filtering.
CHK local/make_corpus_subset.sh $train_data_dir/transcriptions \
$lexicon_file $lexicon_dir/lexicon.txt || exit 1
lexicon_file=$lexicon_dir/lexicon.txt
fi
# ---- Lexicon, lang dir, train/dev data lists, STM and language model ----
echo ---------------------------------------------------------------------
echo "Preparing lexicon in data/local on" `date`
echo ---------------------------------------------------------------------
mkdir -p data/local
CHK local/prepare_lexicon.pl \
$lexiconFlags $lexicon_file data/local || exit 1
echo ---------------------------------------------------------------------
echo "Creating L.fst etc in data/lang on" `date`
echo ---------------------------------------------------------------------
mkdir -p data/lang
CHK utils/prepare_lang.sh \
--share-silence-phones true \
data/local $oovSymbol data/local/tmp.lang data/lang || exit 1
echo ---------------------------------------------------------------------
echo "Preparing acoustic training lists in data/train on" `date`
echo ---------------------------------------------------------------------
mkdir -p data/train
# Out-of-vocabulary utterances (tolerating fragment markers -*~) are
# skipped and recorded in skipped_utts.log.
CHK local/prepare_acoustic_training_data.pl \
--vocab data/local/lexicon.txt --fragmentMarkers \-\*\~ \
$train_data_dir data/train > data/train/skipped_utts.log || exit 1
echo ---------------------------------------------------------------------
echo "Preparing dev data lists in data/dev on" `date`
echo ---------------------------------------------------------------------
mkdir -p data/dev
CHK local/prepare_acoustic_training_data.pl \
--fragmentMarkers \-\*\~ \
$dev_data_dir data/dev > data/dev/skipped_utts.log || exit 1
echo ---------------------------------------------------------------------
echo "Preparing dev stm files in data/dev on" `date`
echo ---------------------------------------------------------------------
CHK local/prepare_stm.pl --fragmentMarkers \-\*\~ data/dev || exit 1
test -f "$glmFile" || exit 1
cp $glmFile data/dev/glm || exit 1
echo ---------------------------------------------------------------------
echo "Creating a basic G.fst in data/lang on" `date`
echo ---------------------------------------------------------------------
# We will simply override the default G.fst by the G.fst generated using SRILM
CHK local/train_lms_srilm.sh data data/srilm
CHK local/arpa2G.sh data/srilm/lm.gz data/lang data/lang
# Keyword-search setup/search steps exist but are not enabled yet here.
#CHK local/kws_setup.sh $ecf_file $kwlist_file $rttm_file data/lang data/dev || exit 1
#CHK local/kws_search.sh data/lang data/dev exp/sgmm5/decode_fmllr
cd $SysDir
# ---- PLP feature extraction for train and dev ----
echo ---------------------------------------------------------------------
echo "Starting plp feature extraction in plp on" `date`
echo ---------------------------------------------------------------------
CHK steps/make_plp.sh \
--cmd "$train_cmd" --nj $train_nj \
data/train exp/make_plp/train plp || exit 1
CHK steps/compute_cmvn_stats.sh \
data/train exp/make_plp/train plp || exit 1
# In case plp extraction failed on some utterance, delist them
CHK utils/fix_data_dir.sh data/train
CHK steps/make_plp.sh \
--cmd "$train_cmd" --nj $decode_nj \
data/dev exp/make_plp/dev plp || exit 1
CHK steps/compute_cmvn_stats.sh \
data/dev exp/make_plp/dev plp || exit 1
# In case plp extraction failed on some utterance, delist them
CHK utils/fix_data_dir.sh data/dev
mkdir -p exp
# ---- Monophone training on a 5000-utterance subset ----
echo ---------------------------------------------------------------------
echo "Subsetting monophone training data in data/train_sub1 on" `date`
echo ---------------------------------------------------------------------
CHK utils/subset_data_dir.sh data/train 5000 data/train_sub1 || exit 1
echo ---------------------------------------------------------------------
echo "Starting (small) monophone training in exp/mono on" `date`
echo ---------------------------------------------------------------------
CHK steps/train_mono.sh \
--boost-silence 1.5 --nj 8 --cmd "$train_cmd" \
data/train_sub1 data/lang exp/mono || exit 1
# ---- First triphone pass (tri1), then decode; the subshell/background
# ---- machinery is commented out, so decoding runs in the foreground. ----
echo ---------------------------------------------------------------------
echo "Starting (first) triphone training in exp/tri1 on" `date`
echo ---------------------------------------------------------------------
CHK steps/align_si.sh \
--boost-silence 1.5 --nj $(( $train_nj/2 )) --cmd "$train_cmd" \
data/train data/lang exp/mono exp/mono_ali || exit 1
CHK steps/train_deltas.sh \
--boost-silence 1.5 --cmd "$train_cmd" \
$numLeavesTri1 $numGaussTri1 data/train data/lang exp/mono_ali exp/tri1 || exit 1
echo ---------------------------------------------------------------------
echo "Spawning decoding with first triphone models in exp/tri1 on" `date`
echo ---------------------------------------------------------------------
#(
mkdir -p exp/tri1/graph
CHK utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph &> exp/tri1/mkgraph.log
mkdir -p exp/tri1/decode
CHK steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" \
exp/tri1/graph data/dev exp/tri1/decode &> exp/tri1/decode.log
#) &
#tri1decode=$!; # Grab the PID of the subshell
#sleep 5; # Let any "start-up error" messages from the subshell get logged
echo "See exp/tri1/mkgraph.log and exp/tri1/decode.log for decoding outcomes"
# ---- Second triphone pass (tri2) and foreground decode ----
echo -----------------------------------------------------------------------------
echo "Starting second triphone training in exp/tri2 on" `date`
echo -----------------------------------------------------------------------------
CHK steps/align_si.sh \
--boost-silence 1.5 --nj $(( $train_nj/2 )) --cmd "$train_cmd" \
data/train data/lang exp/tri1 exp/tri1_ali || exit 1
CHK steps/train_deltas.sh \
--boost-silence 1.5 --cmd "$train_cmd" \
$numLeavesTri2 $numGaussTri2 data/train data/lang exp/tri1_ali exp/tri2 || exit 1
echo -----------------------------------------------------------------------------
echo "Spawning decoding with triphone models in exp/tri2 on" `date`
echo -----------------------------------------------------------------------------
#(
mkdir -p exp/tri2/graph
CHK utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph &> exp/tri2/mkgraph.log
mkdir -p exp/tri2/decode
CHK steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" \
exp/tri2/graph data/dev exp/tri2/decode &> exp/tri2/decode.log
#) &
#tri2decode=$!; # Grab the PID of the subshell
#sleep 5; # Let any "start-up error" messages from the subshell get logged
echo "See exp/tri2/mkgraph.log and exp/tri2/decode.log for decoding outcomes"
# ---- LDA+MLLT triphone pass (tri3) ----
echo ---------------------------------------------------------------------------------
echo "Starting (lda_mllt) triphone training in exp/tri4 on" `date`
echo ---------------------------------------------------------------------------------
CHK steps/align_si.sh \
--boost-silence 1.5 --nj $train_nj --cmd "$train_cmd" \
data/train data/lang exp/tri2 exp/tri2_ali || exit 1
CHK steps/train_lda_mllt.sh \
--boost-silence 1.5 --cmd "$train_cmd" \
$numLeavesMLLT $numGaussMLLT data/train data/lang exp/tri2_ali exp/tri3 || exit 1
echo ----------------------------------------------------------------------------------
echo "Spawning decoding with lda_mllt models in exp/tri3 on" `date`
echo ----------------------------------------------------------------------------------
#(
mkdir -p exp/tri3/graph
# NOTE(review): unlike every other step, these two tri3 commands are not
# wrapped in CHK -- verify whether they were meant to be checkpointed too.
utils/mkgraph.sh \
data/lang exp/tri3 exp/tri3/graph &> exp/tri3/mkgraph.log
mkdir -p exp/tri3/decode
steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" \
exp/tri3/graph data/dev exp/tri3/decode &> exp/tri3/decode.log
#) &
#tri3decode=$!; # Grab the PID of the subshell
#sleep 5; # Let any "start-up error" messages from the subshell get logged
echo "See exp/tri3/mkgraph.log and exp/tri3/decode.log for decoding outcomes"
# ---- Speaker-adapted (SAT) triphone pass (tri4); the decode.started /
# ---- decode.finished marker files gate the later SGMM2 decoding. ----
echo ----------------------------------------------------------------------------
echo "Starting (SAT) triphone training in exp/tri4 on" `date`
echo ----------------------------------------------------------------------------
CHK steps/align_si.sh \
--boost-silence 1.5 --nj $train_nj --cmd "$train_cmd" \
data/train data/lang exp/tri3 exp/tri3_ali || exit 1
CHK steps/train_sat.sh \
--boost-silence 1.5 --cmd "$train_cmd" \
$numLeavesSAT $numGaussSAT data/train data/lang exp/tri3_ali exp/tri4 || exit 1
echo ------------------------------------------------------------------
echo "Spawning decoding with SAT models on" `date`
echo ------------------------------------------------------------------
#(
mkdir -p exp/tri4/graph
CHK utils/mkgraph.sh \
data/lang exp/tri4 exp/tri4/graph &> exp/tri4/mkgraph.log
mkdir -p exp/tri4/decode
CHK touch exp/tri4/decode.started # A signal to the SGMM2 decoding step
CHK steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" \
exp/tri4/graph data/dev exp/tri4/decode &> exp/tri4/decode.log
CHK touch exp/tri4/decode.finished # so SGMM2 decoding may proceed
#) &
#tri4decode=$!; # Grab the PID of the subshell; needed for SGMM2 decoding
#sleep 5; # Let any "start-up error" messages from the subshell get logged
echo "See exp/tri4/mkgraph.log and exp/tri4/decode.log for decoding outcomes"
################################################################################
# Ready to start SGMM training
################################################################################
echo -------------------------------------------------
echo "Starting exp/ubm5 on" `date`
echo -------------------------------------------------
CHK steps/align_fmllr.sh --boost-silence 1.5 --nj $train_nj --cmd "$train_cmd" \
data/train data/lang exp/tri4 exp/tri4_ali || exit 1
CHK steps/train_ubm.sh --cmd "$train_cmd" \
$numGaussUBM data/train data/lang exp/tri4_ali exp/ubm5 || exit 1
echo --------------------------------------------------
echo "Starting exp/sgmm5 on" `date`
echo --------------------------------------------------
CHK steps/train_sgmm2.sh --cmd "$train_cmd" \
$numLeavesSGMM $numGaussSGMM data/train data/lang exp/tri4_ali exp/ubm5/final.ubm exp/sgmm5 || exit 1
################################################################################
# Ready to decode with SGMM2 models
################################################################################
echo -----------------------------------------------------------------
echo "Spawning exp/sgmm5/decode[_fmllr] on" `date`
echo -----------------------------------------------------------------
# NOTE(review): $tri4decode is only assigned in commented-out code above, so
# this message always prints an empty PID; everything runs in the foreground.
echo "exp/sgmm5/decode will wait on PID $tri4decode if necessary"
#wait $tri4decode; # Need lattices from the corresponding SGMM decoding passes
#(
# sleep 5; # Let the status message after the subshell get logged
## The next (now commented) block should ensure we starting decoding of sgmm5 only after
## the tri4 decoding finishes. The same can be achieved by "wait"int for tri4decode pid
# while [ ! -f exp/tri4/decode.started -o ! -f exp/tri4/decode.finished ]; do
# echo "exp/sgmm5/decode is waiting on SAT decoding ..." `date`
# sleep 5
# done
# while [ exp/tri4/decode.finished -ot exp/tri4/decode.started ]; do
# echo "exp/tri4/decode.finished is older than exp/tri4/decode.started"; \
# ls -lt exp/tri4/decode.finished exp/tri4/decode.started; \
# echo "Perhaps SAT decoding was restarted and is still running?"; \
# echo "exp/sgmm5/decode is still waiting on SAT decoding ..." `date`
# sleep 5
# done
# rm exp/tri4/decode.started exp/tri4/decode.finished
mkdir -p exp/sgmm5/graph
CHK utils/mkgraph.sh \
data/lang exp/sgmm5 exp/sgmm5/graph &> exp/sgmm5/mkgraph.log
mkdir -p exp/sgmm5/decode
# SGMM2 decoding reuses the fMLLR transforms from the tri4 (SAT) decode.
CHK steps/decode_sgmm2.sh \
--nj $decode_nj --cmd "$decode_cmd" --transform-dir exp/tri4/decode \
exp/sgmm5/graph data/dev/ exp/sgmm5/decode &> exp/sgmm5/decode.log
CHK steps/decode_sgmm2.sh --use-fmllr true --nj $decode_nj --cmd "$decode_cmd" \
--transform-dir exp/tri4/decode \
exp/sgmm5/graph data/dev/ exp/sgmm5/decode_fmllr &> exp/sgmm5/decode_fmllr.log
#) &
#sgmm5decode=$!; # Grab the PID of the subshell; needed for MMI rescoring
#sleep 5; # Let any "start-up error" messages from the subshell get logged
echo "See exp/sgmm5/mkgraph.log, exp/sgmm5/decode.log and exp/sgmm5/decode_fmllr.log for decoding outcomes"
################################################################################
# Ready to start discriminative SGMM training
################################################################################
echo ------------------------------------------------------
echo "Starting exp/sgmm5_ali on" `date`
echo ------------------------------------------------------
CHK steps/align_sgmm2.sh \
--nj $train_nj --cmd "$train_cmd" --transform-dir exp/tri4_ali --use-graphs true --use-gselect true \
data/train data/lang exp/sgmm5 exp/sgmm5_ali || exit 1
echo ----------------------------------------------------------
echo "Starting exp/sgmm5_denlats on" `date`
echo ----------------------------------------------------------
CHK steps/make_denlats_sgmm2.sh \
--nj $train_nj --sub-split $train_nj \
--beam 10.0 --lattice-beam 6 --cmd "$decode_cmd" --transform-dir exp/tri4_ali \
data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats || exit 1
# Two boosted-MMI models are trained, with boost 0.1 and 0.2.
echo -----------------------------------------------------------
echo "Starting exp/sgmm5_mmi_b0.1 on" `date`
echo -----------------------------------------------------------
CHK steps/train_mmi_sgmm2.sh \
--cmd "$decode_cmd" --transform-dir exp/tri4_ali --boost 0.1 \
data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats \
exp/sgmm5_mmi_b0.1 || exit 1
CHK steps/train_mmi_sgmm2.sh \
--cmd "$decode_cmd" --transform-dir exp/tri4_ali --boost 0.2 \
data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats \
exp/sgmm5_mmi_b0.2 || exit 1
################################################################################
# Ready to decode with discriminative SGMM2 models
################################################################################
echo "exp/sgmm5_mmi_b0.1/decode will wait on PID $sgmm5decode if necessary"
# NOTE(review): $sgmm5decode is only set in commented-out code; with an
# empty argument this `wait` waits for all background jobs (there are none).
wait $sgmm5decode; # Need lattices from the corresponding SGMM decoding passes
echo --------------------------------------------------------------------------
echo "Starting exp/sgmm5_mmi_b0.1/decode[_fmllr] on" `date`
echo --------------------------------------------------------------------------
# Rescore the SGMM5 lattices with each MMI model at training iterations 1-4.
for iter in 1 2 3 4; do
CHK steps/decode_sgmm2_rescore.sh \
--cmd "$decode_cmd" --iter $iter --transform-dir exp/tri4/decode \
data/lang data/dev exp/sgmm5/decode exp/sgmm5_mmi_b0.1/decode_it$iter
CHK steps/decode_sgmm2_rescore.sh \
--cmd "$decode_cmd" --iter $iter --transform-dir exp/tri4/decode \
data/lang data/dev exp/sgmm5/decode_fmllr exp/sgmm5_mmi_b0.1/decode_fmllr_it$iter
CHK steps/decode_sgmm2_rescore.sh \
--cmd "$decode_cmd" --iter $iter --transform-dir exp/tri4/decode \
data/lang data/dev exp/sgmm5/decode exp/sgmm5_mmi_b0.2/decode_it$iter
CHK steps/decode_sgmm2_rescore.sh \
--cmd "$decode_cmd" --iter $iter --transform-dir exp/tri4/decode \
data/lang data/dev exp/sgmm5/decode_fmllr exp/sgmm5_mmi_b0.2/decode_fmllr_it$iter
done
wait
# No need to wait on $tri4decode ---> $sgmm5decode ---> sgmm5_mmi_b0.1decode
echo -----------------------------------------------------
echo "Finished successfully on" `date`
echo -----------------------------------------------------
exit 0

Просмотреть файл

@ -99,6 +99,7 @@ if [[ $filter_lexicon ]]; then
lexicon_file=$lexicon_dir/lexicon.txt
fi
echo ---------------------------------------------------------------------
echo "Preparing lexicon in data/local on" `date`
echo ---------------------------------------------------------------------
@ -144,6 +145,9 @@ echo ---------------------------------------------------------------------
local/train_lms_srilm.sh data data/srilm
local/arpa2G.sh data/srilm/lm.gz data/lang data/lang
#local/kws_setup.sh $ecf_file $kwlist_file $rttm_file data/lang data/dev || exit 1
#local/kws_search.sh data/lang data/dev exp/sgmm5/decode_fmllr
cd $SysDir
echo ---------------------------------------------------------------------
echo "Starting plp feature extraction in plp on" `date`