зеркало из https://github.com/mozilla/kaldi.git
Basic work towards the KW search -- create the keywords fsts and prepare the index using lattices and the KWfst
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@1940 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Родитель
f88a45bc6b
Коммит
cc620ded07
|
@ -17,3 +17,6 @@ numGaussSGMM=50000
|
|||
# Change it to the LM with the best perplexity after run.sh finishes
|
||||
lmForDecoding=$SysDir/sriLM/32hLM.gz
|
||||
|
||||
glmFile=`readlink -f ./conf/glm`
|
||||
|
||||
|
||||
|
|
|
@ -17,8 +17,16 @@ dev_data_dir=/export/babel/oguz/10Hsubsets/106B-delivery-v0.2f_10hSubset/convers
|
|||
lexicon_file=/export/a09/jtrmal/babel/egs/Tagalog-10hSystem2a/Lexicon/lexicon.txt
|
||||
|
||||
# Scoring protocols (dummy GLM file to appease the scoring script)
|
||||
glmFile=`readlink -f ./conf/glm`
|
||||
glmFile=./conf/glm
|
||||
|
||||
train_nj=12
|
||||
decode_nj=9
|
||||
|
||||
#keyword search settings
|
||||
ecf_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.ecf.xml
|
||||
kwlist_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.kwlist.xml
|
||||
kwlist_file=/export/babel/data/scoring/IndusDB.20121102/babel106b-v0.2g_conv-dev.kwlist.xml
|
||||
rttm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.rttm
|
||||
#ln -sf `readlink -f babel/subsets/ecf.xml` $kwsdatadir/ecf.xml
|
||||
|
||||
. /export/babel/data/software/env.sh
|
||||
|
|
|
@ -27,6 +27,14 @@ lexiconFlags="-oov <unk>"
|
|||
# Scoring protocols (dummy GLM file to appease the scoring script)
|
||||
glmFile=./conf/glm
|
||||
|
||||
train_nj=32
|
||||
train_nj=16
|
||||
decode_nj=18
|
||||
|
||||
#keyword search settings
|
||||
ecf_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.ecf.xml
|
||||
# Keyword list used for keyword search. No space is allowed after '=':
# "name= /path" would try to execute the XML file instead of assigning it.
kwlist_file=/export/babel/data/scoring/IndusDB.20121102/babel106b-v0.2g_conv-dev.kwlist.xml
|
||||
rttm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.rttm
|
||||
#ln -sf `readlink -f babel/subsets/ecf.xml` $kwsdatadir/ecf.xml
|
||||
|
||||
. /export/babel/data/software/env.sh
|
||||
|
||||
|
|
|
@ -101,6 +101,33 @@ function CHECKPOINT {
|
|||
eval export $COUNTER_NAME=$COUNTER
|
||||
}
|
||||
|
||||
# Send the default signal (SIGTERM) to every background job of the current
# shell that `jobs` reports as "Running".
#
# The job list is captured with $(jobs) and the loop is fed by a here-string,
# so `kill %N` executes in the shell that owns the job table. Running the loop
# as the tail of a pipeline would put it in a subshell where %N cannot be
# resolved.
function KILLBG_JOBS {
  local job_line
  # Matches e.g. "[2]+  Running   cmd &" and captures the job number.
  local running_re='^\[([0-9]+)\][+-]? +Running'

  while IFS= read -r job_line; do
    if [[ $job_line =~ $running_re ]]; then
      kill "%${BASH_REMATCH[1]}"
    fi
  done <<< "$(jobs)"
}
|
||||
|
||||
# Print a failure report on stderr.
#
# Every shell variable named CHECKPOINT_<NAME>_COUNTER is listed as
# LAST_GOOD_<NAME>=<value> (the DEFAULT counter becomes plain LAST_GOOD=<value>)
# together with a hint on how to pass the counters back via -c. When no such
# variable exists a shorter "no checkpoint" message is printed instead.
# All helper variables are local so the handler does not pollute the caller's
# namespace, which it inspects itself through `set`.
function ONEXIT_HANDLER {
  local COLOR_RED='\e[00;31m'
  local COLOR_DEFAULT='\e[00m'
  local counters

  counters=$(set \
    | grep -E "^CHECKPOINT_[_A-Z]+_COUNTER=" \
    | sed 's/^CHECKPOINT\(_[_A-Z][_A-Z]*\)_COUNTER=/LAST_GOOD\1=/g' \
    | sed "s/^LAST_GOOD_DEFAULT=/LAST_GOOD=/g")

  # The first line of the report is the same in both cases.
  echo -e "${COLOR_RED}CHECKPOINT FAILURE: The last command returned non-zero status${COLOR_DEFAULT}" >&2

  if [[ -n "$counters" ]]; then
    echo -e "${COLOR_RED}look at the counters and try to rerun this script (after figuring the issue)${COLOR_DEFAULT}" >&2
    echo -e "${COLOR_RED}using the -c COUNTER_NAME=COUNTER_VALUE parameters;${COLOR_DEFAULT}" >&2
    echo -e "${COLOR_RED}You can use -c \"COUNTER_NAME1=COUNTER_VALUE1;COUNTER_NAME2=COUNTER_VALUE2\" as well${COLOR_DEFAULT}" >&2
    echo -e "${COLOR_RED}The counters: \n $counters${COLOR_DEFAULT}" >&2
  else
    echo -e "${COLOR_RED}No checkpoint was found. Try to figure out the problem and ${COLOR_DEFAULT}" >&2
    echo -e "${COLOR_RED}run the script again${COLOR_DEFAULT}" >&2
  fi
}
|
||||
|
||||
# Print the failure report and exit on Ctrl-C, on termination, or when a
# command fails. SIGKILL is not listed because it cannot be caught.
trap "ONEXIT_HANDLER; exit; " SIGINT SIGTERM ERR
|
||||
|
||||
while getopts ":c:i" opt; do
|
||||
case $opt in
|
||||
c)
|
||||
|
|
|
@ -3,6 +3,13 @@
|
|||
# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
|
||||
# Apache 2.0.
|
||||
|
||||
# Begin configuration section.
|
||||
case_insensitive=true
|
||||
# End configuration section.
|
||||
|
||||
[ -f ./path.sh ] && . ./path.sh; # source the path.
|
||||
. parse_options.sh || exit 1;
|
||||
|
||||
|
||||
if [ $# -ne 3 ]; then
|
||||
echo "Usage: local/kws_data_prep.sh <lang-dir> <data-dir> <kws-data-dir>"
|
||||
|
@ -13,9 +20,8 @@ fi
|
|||
langdir=$1;
|
||||
datadir=$2;
|
||||
kwsdatadir=$3;
|
||||
keywords=$kwsdatadir/kws.xml
|
||||
keywords=$kwsdatadir/kwlist.xml
|
||||
|
||||
case_insensitive=true
|
||||
|
||||
mkdir -p $kwsdatadir;
|
||||
|
||||
|
@ -39,9 +45,10 @@ cat $keywords | perl -e '
|
|||
# are not in our $langdir/words.txt, as we won't find them anyway...
|
||||
#cat $kwsdatadir/keywords.txt | babel/filter_keywords.pl $langdir/words.txt - - | \
|
||||
# sym2int.pl --map-oov 0 -f 2- $langdir/words.txt | \
|
||||
if $case_insensitive ; then
|
||||
if $case_insensitive ; then
|
||||
echo "Running case insensitive processing"
|
||||
cat $langdir/words.txt | tr '[:lower:]' '[:upper:]' > $kwsdatadir/words.txt
|
||||
[ `cut -f 1 -d ' ' $kwsdatadir/words.txt | wc -l` -ne `cat $kwsdatadir/words.txt | wc -l` ] && echo "Warning, multiple words in dictionary differ only in case..."
|
||||
[ `cut -f 1 -d ' ' $kwsdatadir/words.txt | sort -u | wc -l` -ne `cat $kwsdatadir/words.txt | wc -l` ] && echo "Warning, multiple words in dictionary differ only in case..."
|
||||
|
||||
cat $kwsdatadir/keywords.txt | tr '[:lower:]' '[:upper:]' | \
|
||||
sym2int.pl --map-oov 0 -f 2- $kwsdatadir/words.txt > $kwsdatadir/keywords_all.int
|
||||
|
@ -55,11 +62,11 @@ cat $kwsdatadir/keywords_all.int | \
|
|||
grep -v " 0 " | grep -v " 0$" > $kwsdatadir/keywords.int
|
||||
|
||||
cut -f 1 -d ' ' $kwsdatadir/keywords.int | \
|
||||
babel/subset_kwslist.pl $keywords > $kwsdatadir/keyword_invocab.xml
|
||||
local/subset_kwslist.pl $keywords > $kwsdatadir/kwlist_invocab.xml
|
||||
|
||||
cat $kwsdatadir/keywords_all.int | \
|
||||
egrep " 0 | 0$" | cut -f 1 -d ' ' | \
|
||||
babel/subset_kwslist.pl $keywords > $kwsdatadir/keyword_outvocab.xml
|
||||
local/subset_kwslist.pl $keywords > $kwsdatadir/kwlist_outvocab.xml
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
#!/bin/bash

# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
# Apache 2.0.

# Keyword search over a decode directory:
#   1. local/make_index.sh builds the index in <decode-dir>/kws,
#   2. local/search_index.sh searches it,
#   3. the result.* files are piped through utils/write_kwslist.pl and
#      utils/filter_kwslist.pl into <decode-dir>/kws/kwslist.xml.
# The KWS inputs (ecf.xml, utter_map, ...) are read from <data-dir>/kws.

. ./path.sh
. ./cmd.sh

# Begin configuration section.
acwt=0.0909091   # forwarded to local/make_index.sh as --acwt
duptime=0.6      # forwarded to utils/filter_kwslist.pl
cmd=run.pl
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
  echo "Usage: $0 [options] <lang-dir> <data-dir> <decode-dir>" >&2
  exit 1
fi

langdir=$1
datadir=$2
decodedir=$3

kwsdatadir=$datadir/kws
kwsoutdir=$decodedir/kws

mkdir -p "$kwsdatadir" || exit 1
mkdir -p "$kwsoutdir" || exit 1

local/make_index.sh --cmd "$cmd" --acwt "$acwt" \
  "$kwsdatadir" "$langdir" "$decodedir" "$kwsoutdir" || exit 1

local/search_index.sh "$kwsdatadir" "$kwsoutdir" || exit 1

# Take the duration="..." attribute from the first line of ecf.xml and
# divide it by two before handing it to write_kwslist.pl.
duration=$(head -n 1 "$kwsdatadir/ecf.xml" \
  | grep -o -E "duration=\"[0-9]*[ \.]*[0-9]*\"" \
  | grep -o -E "[0-9]*[\.]*[0-9]*" \
  | perl -e 'while(<>) {print $_/2;}')

# Fail early instead of passing an empty --duration= downstream.
if [ -z "$duration" ]; then
  echo "$0: could not read a duration from $kwsdatadir/ecf.xml" >&2
  exit 1
fi

cat "$kwsoutdir"/result.* \
  | utils/write_kwslist.pl --flen=0.01 --duration="$duration" \
      --segments="$datadir/segments" --normalize=true \
      --map-utter="$kwsdatadir/utter_map" \
      - - \
  | utils/filter_kwslist.pl "$duptime" > "$kwsoutdir/kwslist.xml"
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
#!/bin/bash

# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen, Yenda Trmal)
# Apache 2.0.

# Copies the ECF, keyword-list and RTTM files into <data-dir>/kws under the
# fixed names ecf.xml, kwlist.xml and rttm, then runs local/kws_data_prep.sh
# on that directory with case-insensitive processing enabled.

. ./path.sh
. ./cmd.sh

# Begin configuration section.
cmd=run.pl
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# -ne 5 ]; then
  echo "Usage: $0 [options] <ecf-file> <kwlist-file> <rttm-file> <lang-dir> <data-dir>" >&2
  exit 1
fi

ecf_file=$1
kwlist_file=$2
rttm_file=$3
langdir=$4
datadir=$5
kwsdatadir=$datadir/kws

mkdir -p "$kwsdatadir" || exit 1

# readlink -f resolves symlinks so the real file is copied.
cp "$(readlink -f "$ecf_file")" "$kwsdatadir/ecf.xml" || exit 1
cp "$(readlink -f "$kwlist_file")" "$kwsdatadir/kwlist.xml" || exit 1
cp "$(readlink -f "$rttm_file")" "$kwsdatadir/rttm" || exit 1

local/kws_data_prep.sh --case-insensitive true \
  "$langdir" "$datadir" "$kwsdatadir" || exit 1
|
||||
|
|
@ -0,0 +1,35 @@
|
|||
#!/bin/bash

# Creates a subset of a corpus directory by symlinking, for every basename
# listed in <input-filelist>, the files
#   <input-data-dir>/audio/<basename>.sph
#   <input-data-dir>/transcription/<basename>.txt
# into <output-data-dir>/audio and <output-data-dir>/transcription.
# Exits with status 1 as soon as one of the expected files is missing.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
  echo "Usage: $0 <input-data-dir> <input-filelist> <output-data-dir>" >&2
  exit 1
fi

input_data_dir=$1
input_filelist=$2
output_data_dir=$3

mkdir -p "$output_data_dir/transcription" || exit 1
mkdir -p "$output_data_dir/audio" || exit 1

abs_src_dir=$(readlink -f "$input_data_dir")
abs_tgt_dir=$(readlink -f "$output_data_dir")

# One basename per line; blank lines are ignored.
while read -r file_basename; do
  [[ -n "$file_basename" ]] || continue

  audio_file=$abs_src_dir/audio/$file_basename.sph
  if [[ -e "$audio_file" ]]; then
    ln -sf "$audio_file" "$abs_tgt_dir/audio" || exit 1
  else
    echo "File $audio_file does not exist!" >&2
    exit 1
  fi

  transcript_file=$abs_src_dir/transcription/$file_basename.txt
  if [[ -e "$transcript_file" ]]; then
    ln -sf "$transcript_file" "$abs_tgt_dir/transcription" || exit 1
  else
    echo "File $transcript_file does not exist!" >&2
    exit 1
  fi
done < "$input_filelist"
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
#!/bin/bash

# Writes to <output-lexicon> those lines of <input-lexicon> whose first
# tab-separated field occurs as a space-separated token in any *.txt file
# below <transcription-dir>. Lines that consist only of a bracketed number
# such as [12.34] are skipped.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
  echo "Usage: $0 <transcription-dir> <input-lexicon> <output-lexicon>" >&2
  exit 1
fi

transcriptions=$1
input_lexicon_file=$2
output_lexicon_file=$3

# grep -H always prefixes "file:" so that cut can strip it reliably, even
# when only a single transcript file is found.
find "$transcriptions" -name "*.txt" \
    -exec grep -E -H -vx '\[[0-9.]+\]' {} + \
  | cut -f 2- -d ':' \
  | sed 's/ /\n/g' \
  | sort -u \
  | awk '
    BEGIN {
      # ARGV[2] is the full lexicon; index each line by its first field.
      while ((getline line < ARGV[2]) > 0) {
        split(line, e, "\t")
        LEXICON[e[1]] = line
      }

      # ARGV[1] is "-": the sorted word list arriving on stdin.
      while ((getline word < ARGV[1]) > 0) {
        if (word in LEXICON)
          print LEXICON[word]
      }
    }
  ' - "$input_lexicon_file" \
  | sort -u > "$output_lexicon_file"
|
||||
|
|
@ -0,0 +1,384 @@
|
|||
#!/bin/bash
|
||||
|
||||
# System and data directories
|
||||
#SCRIPT=$(readlink $0)
|
||||
#SysDir=`dirname $SCRIPT`
|
||||
SysDir=`pwd`
|
||||
echo $SysDir
|
||||
|
||||
# Lexicon and Language Model parameters
|
||||
oovSymbol="<unk>"
|
||||
lexiconFlags="-oov <unk>"
|
||||
|
||||
# Scoring protocols (dummy GLM file to appease the scoring script)
|
||||
glmFile=`readlink -f ./conf/glm`
|
||||
|
||||
# Include the checkpointing facility
|
||||
. ./local/CHECKPOINT.sh
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
||||
[ -f ./path.sh ] && . ./path.sh; # source the path.
|
||||
[ -f ./cmd.sh ] && . ./cmd.sh; # source train and decode cmds.
|
||||
. parse_options.sh || exit 1;
|
||||
|
||||
configfile=$1
|
||||
[ -f $configfile ] && . $configfile
|
||||
[ -f ./local.conf ] && . ./local.conf
|
||||
|
||||
# Preparing dev and train directories
# The TRAIN set is subset only when $train_data_list names an existing file.
# The variable is quoted on purpose: unquoted and empty, the check collapses
# to the one-argument form `test -f`, which is always true.
if [[ -f "$train_data_list" ]]; then
  echo ---------------------------------------------------------------------
  echo "Subsetting the TRAIN set"
  echo ---------------------------------------------------------------------

  CHK local/make_corpus_subset.sh "$train_data_dir" "$train_data_list" ./data/raw_train_data || exit 1
  train_data_dir=$(readlink -f ./data/raw_train_data)

  # Cap the number of training jobs at the number of lines in the file list.
  nj_max=$(wc -l < "$train_data_list")
  if [[ "$nj_max" -lt "$train_nj" ]]; then
    echo "The maximum reasonable number of jobs is $nj_max (you have $train_nj)! (The training and decoding process has file-granularity)"
    train_nj=$nj_max
  fi
fi
|
||||
|
||||
# The DEV set is subset only when $dev_data_list names an existing file.
# The variable is quoted on purpose: unquoted and empty, the check collapses
# to the one-argument form `test -f`, which is always true.
if [[ -f "$dev_data_list" ]]; then
  echo ---------------------------------------------------------------------
  echo "Subsetting the DEV set"
  echo ---------------------------------------------------------------------

  CHK local/make_corpus_subset.sh "$dev_data_dir" "$dev_data_list" ./data/raw_dev_data || exit 1
  dev_data_dir=$(readlink -f ./data/raw_dev_data)

  # Cap the number of decoding jobs at the number of lines in the file list.
  nj_max=$(wc -l < "$dev_data_list")
  if [[ "$nj_max" -lt "$decode_nj" ]]; then
    echo "The maximum reasonable number of jobs is $nj_max "
    echo "you have $decode_nj! (The training and decoding process has file-granularity)"
    decode_nj=$nj_max
  fi
fi
|
||||
|
||||
# Optionally restrict the lexicon to words seen in the training transcripts.
# NOTE(review): any non-empty $filter_lexicon (even "false") enables this.
if [[ $filter_lexicon ]]; then
  echo ---------------------------------------------------------------------
  echo "Subsetting the LEXICON"
  echo ---------------------------------------------------------------------

  lexicon_dir=./data/raw_lex_data
  mkdir -p "$lexicon_dir"
  # The arguments are <transcription-dir> <input-lexicon> <output-lexicon>,
  # i.e. the lexicon-subset interface, not the corpus-subset one; the corpus
  # layout keeps transcripts in "transcription" (singular).
  # NOTE(review): assumes the lexicon filter is installed as
  # local/make_lexicon_subset.sh -- confirm the file name.
  CHK local/make_lexicon_subset.sh "$train_data_dir/transcription" \
    "$lexicon_file" "$lexicon_dir/lexicon.txt" || exit 1
  lexicon_file=$lexicon_dir/lexicon.txt
fi
|
||||
|
||||
|
||||
echo ---------------------------------------------------------------------
|
||||
echo "Preparing lexicon in data/local on" `date`
|
||||
echo ---------------------------------------------------------------------
|
||||
mkdir -p data/local
|
||||
CHK local/prepare_lexicon.pl \
|
||||
$lexiconFlags $lexicon_file data/local || exit 1
|
||||
|
||||
echo ---------------------------------------------------------------------
|
||||
echo "Creating L.fst etc in data/lang on" `date`
|
||||
echo ---------------------------------------------------------------------
|
||||
mkdir -p data/lang
|
||||
CHK utils/prepare_lang.sh \
|
||||
--share-silence-phones true \
|
||||
data/local $oovSymbol data/local/tmp.lang data/lang || exit 1
|
||||
|
||||
echo ---------------------------------------------------------------------
|
||||
echo "Preparing acoustic training lists in data/train on" `date`
|
||||
echo ---------------------------------------------------------------------
|
||||
mkdir -p data/train
|
||||
CHK local/prepare_acoustic_training_data.pl \
|
||||
--vocab data/local/lexicon.txt --fragmentMarkers \-\*\~ \
|
||||
$train_data_dir data/train > data/train/skipped_utts.log || exit 1
|
||||
|
||||
echo ---------------------------------------------------------------------
|
||||
echo "Preparing dev data lists in data/dev on" `date`
|
||||
echo ---------------------------------------------------------------------
|
||||
mkdir -p data/dev
|
||||
CHK local/prepare_acoustic_training_data.pl \
|
||||
--fragmentMarkers \-\*\~ \
|
||||
$dev_data_dir data/dev > data/dev/skipped_utts.log || exit 1
|
||||
|
||||
echo ---------------------------------------------------------------------
|
||||
echo "Preparing dev stm files in data/dev on" `date`
|
||||
echo ---------------------------------------------------------------------
|
||||
CHK local/prepare_stm.pl --fragmentMarkers \-\*\~ data/dev || exit 1
|
||||
|
||||
|
||||
test -f "$glmFile" || exit 1
|
||||
cp $glmFile data/dev/glm || exit 1
|
||||
|
||||
echo ---------------------------------------------------------------------
|
||||
echo "Creating a basic G.fst in data/lang on" `date`
|
||||
echo ---------------------------------------------------------------------
|
||||
|
||||
# We will simply override the default G.fst by the G.fst generated using SRILM
|
||||
CHK local/train_lms_srilm.sh data data/srilm
|
||||
CHK local/arpa2G.sh data/srilm/lm.gz data/lang data/lang
|
||||
|
||||
#CHK local/kws_setup.sh $ecf_file $kwlist_file $rttm_file data/lang data/dev || exit 1
|
||||
#CHK local/kws_search.sh data/lang data/dev exp/sgmm5/decode_fmllr
|
||||
|
||||
|
||||
cd $SysDir
|
||||
echo ---------------------------------------------------------------------
|
||||
echo "Starting plp feature extraction in plp on" `date`
|
||||
echo ---------------------------------------------------------------------
|
||||
CHK steps/make_plp.sh \
|
||||
--cmd "$train_cmd" --nj $train_nj \
|
||||
data/train exp/make_plp/train plp || exit 1
|
||||
CHK steps/compute_cmvn_stats.sh \
|
||||
data/train exp/make_plp/train plp || exit 1
|
||||
# In case plp extraction failed on some utterance, delist them
|
||||
CHK utils/fix_data_dir.sh data/train
|
||||
|
||||
CHK steps/make_plp.sh \
|
||||
--cmd "$train_cmd" --nj $decode_nj \
|
||||
data/dev exp/make_plp/dev plp || exit 1
|
||||
CHK steps/compute_cmvn_stats.sh \
|
||||
data/dev exp/make_plp/dev plp || exit 1
|
||||
# In case plp extraction failed on some utterance, delist them
|
||||
CHK utils/fix_data_dir.sh data/dev
|
||||
mkdir -p exp
|
||||
|
||||
echo ---------------------------------------------------------------------
|
||||
echo "Subsetting monophone training data in data/train_sub1 on" `date`
|
||||
echo ---------------------------------------------------------------------
|
||||
CHK utils/subset_data_dir.sh data/train 5000 data/train_sub1 || exit 1
|
||||
|
||||
echo ---------------------------------------------------------------------
|
||||
echo "Starting (small) monophone training in exp/mono on" `date`
|
||||
echo ---------------------------------------------------------------------
|
||||
CHK steps/train_mono.sh \
|
||||
--boost-silence 1.5 --nj 8 --cmd "$train_cmd" \
|
||||
data/train_sub1 data/lang exp/mono || exit 1
|
||||
|
||||
echo ---------------------------------------------------------------------
|
||||
echo "Starting (first) triphone training in exp/tri1 on" `date`
|
||||
echo ---------------------------------------------------------------------
|
||||
CHK steps/align_si.sh \
|
||||
--boost-silence 1.5 --nj $(( $train_nj/2 )) --cmd "$train_cmd" \
|
||||
data/train data/lang exp/mono exp/mono_ali || exit 1
|
||||
CHK steps/train_deltas.sh \
|
||||
--boost-silence 1.5 --cmd "$train_cmd" \
|
||||
$numLeavesTri1 $numGaussTri1 data/train data/lang exp/mono_ali exp/tri1 || exit 1
|
||||
|
||||
echo ---------------------------------------------------------------------
|
||||
echo "Spawning decoding with first triphone models in exp/tri1 on" `date`
|
||||
echo ---------------------------------------------------------------------
|
||||
#(
|
||||
mkdir -p exp/tri1/graph
|
||||
CHK utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph &> exp/tri1/mkgraph.log
|
||||
mkdir -p exp/tri1/decode
|
||||
CHK steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" \
|
||||
exp/tri1/graph data/dev exp/tri1/decode &> exp/tri1/decode.log
|
||||
#) &
|
||||
#tri1decode=$!; # Grab the PID of the subshell
|
||||
#sleep 5; # Let any "start-up error" messages from the subshell get logged
|
||||
echo "See exp/tri1/mkgraph.log and exp/tri1/decode.log for decoding outcomes"
|
||||
|
||||
echo -----------------------------------------------------------------------------
|
||||
echo "Starting second triphone training in exp/tri2 on" `date`
|
||||
echo -----------------------------------------------------------------------------
|
||||
CHK steps/align_si.sh \
|
||||
--boost-silence 1.5 --nj $(( $train_nj/2 )) --cmd "$train_cmd" \
|
||||
data/train data/lang exp/tri1 exp/tri1_ali || exit 1
|
||||
CHK steps/train_deltas.sh \
|
||||
--boost-silence 1.5 --cmd "$train_cmd" \
|
||||
$numLeavesTri2 $numGaussTri2 data/train data/lang exp/tri1_ali exp/tri2 || exit 1
|
||||
|
||||
echo -----------------------------------------------------------------------------
|
||||
echo "Spawning decoding with triphone models in exp/tri2 on" `date`
|
||||
echo -----------------------------------------------------------------------------
|
||||
#(
|
||||
mkdir -p exp/tri2/graph
|
||||
CHK utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph &> exp/tri2/mkgraph.log
|
||||
mkdir -p exp/tri2/decode
|
||||
CHK steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" \
|
||||
exp/tri2/graph data/dev exp/tri2/decode &> exp/tri2/decode.log
|
||||
#) &
|
||||
#tri2decode=$!; # Grab the PID of the subshell
|
||||
#sleep 5; # Let any "start-up error" messages from the subshell get logged
|
||||
echo "See exp/tri2/mkgraph.log and exp/tri2/decode.log for decoding outcomes"
|
||||
|
||||
|
||||
echo ---------------------------------------------------------------------------------
|
||||
# The LDA+MLLT stage writes its models to exp/tri3; exp/tri4 is the SAT stage.
echo "Starting (lda_mllt) triphone training in exp/tri3 on" `date`
|
||||
echo ---------------------------------------------------------------------------------
|
||||
CHK steps/align_si.sh \
|
||||
--boost-silence 1.5 --nj $train_nj --cmd "$train_cmd" \
|
||||
data/train data/lang exp/tri2 exp/tri2_ali || exit 1
|
||||
CHK steps/train_lda_mllt.sh \
|
||||
--boost-silence 1.5 --cmd "$train_cmd" \
|
||||
$numLeavesMLLT $numGaussMLLT data/train data/lang exp/tri2_ali exp/tri3 || exit 1
|
||||
|
||||
echo ----------------------------------------------------------------------------------
|
||||
echo "Spawning decoding with lda_mllt models in exp/tri3 on" `date`
|
||||
echo ----------------------------------------------------------------------------------
|
||||
#(
|
||||
mkdir -p exp/tri3/graph
|
||||
utils/mkgraph.sh \
|
||||
data/lang exp/tri3 exp/tri3/graph &> exp/tri3/mkgraph.log
|
||||
mkdir -p exp/tri3/decode
|
||||
steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" \
|
||||
exp/tri3/graph data/dev exp/tri3/decode &> exp/tri3/decode.log
|
||||
#) &
|
||||
#tri3decode=$!; # Grab the PID of the subshell
|
||||
#sleep 5; # Let any "start-up error" messages from the subshell get logged
|
||||
echo "See exp/tri3/mkgraph.log and exp/tri3/decode.log for decoding outcomes"
|
||||
|
||||
echo ----------------------------------------------------------------------------
|
||||
echo "Starting (SAT) triphone training in exp/tri4 on" `date`
|
||||
echo ----------------------------------------------------------------------------
|
||||
|
||||
CHK steps/align_si.sh \
|
||||
--boost-silence 1.5 --nj $train_nj --cmd "$train_cmd" \
|
||||
data/train data/lang exp/tri3 exp/tri3_ali || exit 1
|
||||
CHK steps/train_sat.sh \
|
||||
--boost-silence 1.5 --cmd "$train_cmd" \
|
||||
$numLeavesSAT $numGaussSAT data/train data/lang exp/tri3_ali exp/tri4 || exit 1
|
||||
|
||||
echo ------------------------------------------------------------------
|
||||
echo "Spawning decoding with SAT models on" `date`
|
||||
echo ------------------------------------------------------------------
|
||||
#(
|
||||
mkdir -p exp/tri4/graph
|
||||
CHK utils/mkgraph.sh \
|
||||
data/lang exp/tri4 exp/tri4/graph &> exp/tri4/mkgraph.log
|
||||
mkdir -p exp/tri4/decode
|
||||
CHK touch exp/tri4/decode.started # A signal to the SGMM2 decoding step
|
||||
CHK steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" \
|
||||
exp/tri4/graph data/dev exp/tri4/decode &> exp/tri4/decode.log
|
||||
|
||||
CHK touch exp/tri4/decode.finished # so SGMM2 decoding may proceed
|
||||
#) &
|
||||
#tri4decode=$!; # Grab the PID of the subshell; needed for SGMM2 decoding
|
||||
#sleep 5; # Let any "start-up error" messages from the subshell get logged
|
||||
echo "See exp/tri4/mkgraph.log and exp/tri4/decode.log for decoding outcomes"
|
||||
|
||||
################################################################################
|
||||
# Ready to start SGMM training
|
||||
################################################################################
|
||||
|
||||
echo -------------------------------------------------
|
||||
echo "Starting exp/ubm5 on" `date`
|
||||
echo -------------------------------------------------
|
||||
CHK steps/align_fmllr.sh --boost-silence 1.5 --nj $train_nj --cmd "$train_cmd" \
|
||||
data/train data/lang exp/tri4 exp/tri4_ali || exit 1
|
||||
CHK steps/train_ubm.sh --cmd "$train_cmd" \
|
||||
$numGaussUBM data/train data/lang exp/tri4_ali exp/ubm5 || exit 1
|
||||
|
||||
echo --------------------------------------------------
|
||||
echo "Starting exp/sgmm5 on" `date`
|
||||
echo --------------------------------------------------
|
||||
CHK steps/train_sgmm2.sh --cmd "$train_cmd" \
|
||||
$numLeavesSGMM $numGaussSGMM data/train data/lang exp/tri4_ali exp/ubm5/final.ubm exp/sgmm5 || exit 1
|
||||
|
||||
################################################################################
|
||||
# Ready to decode with SGMM2 models
|
||||
################################################################################
|
||||
|
||||
echo -----------------------------------------------------------------
|
||||
echo "Spawning exp/sgmm5/decode[_fmllr] on" `date`
|
||||
echo -----------------------------------------------------------------
|
||||
echo "exp/sgmm5/decode will wait on PID $tri4decode if necessary"
|
||||
#wait $tri4decode; # Need lattices from the corresponding SGMM decoding passes
|
||||
#(
|
||||
# sleep 5; # Let the status message after the subshell get logged
|
||||
## The next (now commented) block should ensure we starting decoding of sgmm5 only after
|
||||
## the tri4 decoding finishes. The same can be achieved by "wait"int for tri4decode pid
|
||||
# while [ ! -f exp/tri4/decode.started -o ! -f exp/tri4/decode.finished ]; do
|
||||
# echo "exp/sgmm5/decode is waiting on SAT decoding ..." `date`
|
||||
# sleep 5
|
||||
# done
|
||||
# while [ exp/tri4/decode.finished -ot exp/tri4/decode.started ]; do
|
||||
# echo "exp/tri4/decode.finished is older than exp/tri4/decode.started"; \
|
||||
# ls -lt exp/tri4/decode.finished exp/tri4/decode.started; \
|
||||
# echo "Perhaps SAT decoding was restarted and is still running?"; \
|
||||
# echo "exp/sgmm5/decode is still waiting on SAT decoding ..." `date`
|
||||
# sleep 5
|
||||
# done
|
||||
# rm exp/tri4/decode.started exp/tri4/decode.finished
|
||||
mkdir -p exp/sgmm5/graph
|
||||
CHK utils/mkgraph.sh \
|
||||
data/lang exp/sgmm5 exp/sgmm5/graph &> exp/sgmm5/mkgraph.log
|
||||
mkdir -p exp/sgmm5/decode
|
||||
CHK steps/decode_sgmm2.sh \
|
||||
--nj $decode_nj --cmd "$decode_cmd" --transform-dir exp/tri4/decode \
|
||||
exp/sgmm5/graph data/dev/ exp/sgmm5/decode &> exp/sgmm5/decode.log
|
||||
CHK steps/decode_sgmm2.sh --use-fmllr true --nj $decode_nj --cmd "$decode_cmd" \
|
||||
--transform-dir exp/tri4/decode \
|
||||
exp/sgmm5/graph data/dev/ exp/sgmm5/decode_fmllr &> exp/sgmm5/decode_fmllr.log
|
||||
#) &
|
||||
#sgmm5decode=$!; # Grab the PID of the subshell; needed for MMI rescoring
|
||||
#sleep 5; # Let any "start-up error" messages from the subshell get logged
|
||||
echo "See exp/sgmm5/mkgraph.log, exp/sgmm5/decode.log and exp/sgmm5/decode_fmllr.log for decoding outcomes"
|
||||
|
||||
################################################################################
|
||||
# Ready to start discriminative SGMM training
|
||||
################################################################################
|
||||
|
||||
echo ------------------------------------------------------
|
||||
echo "Starting exp/sgmm5_ali on" `date`
|
||||
echo ------------------------------------------------------
|
||||
CHK steps/align_sgmm2.sh \
|
||||
--nj $train_nj --cmd "$train_cmd" --transform-dir exp/tri4_ali --use-graphs true --use-gselect true \
|
||||
data/train data/lang exp/sgmm5 exp/sgmm5_ali || exit 1
|
||||
|
||||
echo ----------------------------------------------------------
|
||||
echo "Starting exp/sgmm5_denlats on" `date`
|
||||
echo ----------------------------------------------------------
|
||||
CHK steps/make_denlats_sgmm2.sh \
|
||||
--nj $train_nj --sub-split $train_nj \
|
||||
--beam 10.0 --lattice-beam 6 --cmd "$decode_cmd" --transform-dir exp/tri4_ali \
|
||||
data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats || exit 1
|
||||
|
||||
echo -----------------------------------------------------------
|
||||
echo "Starting exp/sgmm5_mmi_b0.1 on" `date`
|
||||
echo -----------------------------------------------------------
|
||||
CHK steps/train_mmi_sgmm2.sh \
|
||||
--cmd "$decode_cmd" --transform-dir exp/tri4_ali --boost 0.1 \
|
||||
data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats \
|
||||
exp/sgmm5_mmi_b0.1 || exit 1
|
||||
|
||||
CHK steps/train_mmi_sgmm2.sh \
|
||||
--cmd "$decode_cmd" --transform-dir exp/tri4_ali --boost 0.2 \
|
||||
data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats \
|
||||
exp/sgmm5_mmi_b0.2 || exit 1
|
||||
################################################################################
|
||||
# Ready to decode with discriminative SGMM2 models
|
||||
################################################################################
|
||||
|
||||
echo "exp/sgmm5_mmi_b0.1/decode will wait on PID $sgmm5decode if necessary"
|
||||
wait $sgmm5decode; # Need lattices from the corresponding SGMM decoding passes
|
||||
echo --------------------------------------------------------------------------
|
||||
echo "Starting exp/sgmm5_mmi_b0.1/decode[_fmllr] on" `date`
|
||||
echo --------------------------------------------------------------------------
|
||||
for iter in 1 2 3 4; do
|
||||
CHK steps/decode_sgmm2_rescore.sh \
|
||||
--cmd "$decode_cmd" --iter $iter --transform-dir exp/tri4/decode \
|
||||
data/lang data/dev exp/sgmm5/decode exp/sgmm5_mmi_b0.1/decode_it$iter
|
||||
CHK steps/decode_sgmm2_rescore.sh \
|
||||
--cmd "$decode_cmd" --iter $iter --transform-dir exp/tri4/decode \
|
||||
data/lang data/dev exp/sgmm5/decode_fmllr exp/sgmm5_mmi_b0.1/decode_fmllr_it$iter
|
||||
CHK steps/decode_sgmm2_rescore.sh \
|
||||
--cmd "$decode_cmd" --iter $iter --transform-dir exp/tri4/decode \
|
||||
data/lang data/dev exp/sgmm5/decode exp/sgmm5_mmi_b0.2/decode_it$iter
|
||||
CHK steps/decode_sgmm2_rescore.sh \
|
||||
--cmd "$decode_cmd" --iter $iter --transform-dir exp/tri4/decode \
|
||||
data/lang data/dev exp/sgmm5/decode_fmllr exp/sgmm5_mmi_b0.2/decode_fmllr_it$iter
|
||||
done
|
||||
|
||||
|
||||
wait
|
||||
|
||||
# No need to wait on $tri4decode ---> $sgmm5decode ---> sgmm5_mmi_b0.1decode
|
||||
|
||||
echo -----------------------------------------------------
|
||||
echo "Finished successfully on" `date`
|
||||
echo -----------------------------------------------------
|
||||
|
||||
exit 0
|
|
@ -99,6 +99,7 @@ if [[ $filter_lexicon ]]; then
|
|||
lexicon_file=$lexicon_dir/lexicon.txt
|
||||
fi
|
||||
|
||||
|
||||
echo ---------------------------------------------------------------------
|
||||
echo "Preparing lexicon in data/local on" `date`
|
||||
echo ---------------------------------------------------------------------
|
||||
|
@ -144,6 +145,9 @@ echo ---------------------------------------------------------------------
|
|||
local/train_lms_srilm.sh data data/srilm
|
||||
local/arpa2G.sh data/srilm/lm.gz data/lang data/lang
|
||||
|
||||
#local/kws_setup.sh $ecf_file $kwlist_file $rttm_file data/lang data/dev || exit 1
|
||||
#local/kws_search.sh data/lang data/dev exp/sgmm5/decode_fmllr
|
||||
|
||||
cd $SysDir
|
||||
echo ---------------------------------------------------------------------
|
||||
echo "Starting plp feature extraction in plp on" `date`
|
||||
|
|
Загрузка…
Ссылка в новой задаче