sandbox/dan2: merging changes from trunk; some further small code-level optimizations to determinization code (which I just realized were done in sandbox/dan2; I'll now merge those back to trunk).

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/dan2@3087 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Dan Povey 2013-10-14 23:33:20 +00:00
Parent 4da87782bc
Commit 0bc71728a2
92 changed files: 2361 additions and 2030 deletions

View file

@@ -0,0 +1,77 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;
#speech corpora files location
train_data_dir=/export/babel/data/106-tagalog/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Tagalog_Babel106/train.LimitedLP.official.list
train_nj=16
#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/106-tagalog/release-current/conversational/dev
dev2h_data_list=/export/babel/data/splits/Tagalog_Babel106/dev2hr.list
dev2h_data_cmudb=/export/babel/data/splits/Tagalog_Babel106/uem/v18/db-tag-utt.dat
dev2h_stm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.stm
dev2h_ecf_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.ecf.xml
dev2h_rttm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.mitllfa3.rttm
dev2h_kwlist_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.kwlist.xml
dev2h_subset_ecf=true
dev2h_nj=23
#Official DEV data files
dev10h_data_dir=/export/babel/data/106-tagalog/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Tagalog_Babel106/dev.list
dev10h_data_cmudb=/export/babel/data/splits/Tagalog_Babel106/uem/v18/db-tag-utt.dat
dev10h_stm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.stm
dev10h_ecf_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.ecf.xml
dev10h_rttm_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev/babel106b-v0.2g_conv-dev.mitllfa3.rttm
dev10h_kwlist_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-dev.kwlist.xml
dev10h_nj=32
#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/106-tagalog/release-current/conversational/eval
eval_data_list=/export/babel/data/splits/Tagalog_Babel106/eval.babel106b-v0.2g.list
eval_ecf_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-eval.ecf.xml
eval_kwlist_file=/export/babel/data/scoring/IndusDB/babel106b-v0.2g_conv-eval.kwlist2.xml
eval_data_cmudb=/export/babel/data/splits/Tagalog_Babel106/uem/v18/db-tag-utt.dat
eval_nj=64
#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21
# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000
# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"
use_ffv=true
use_pitch=true
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/106-tagalog/release-current/conversational/reference_materials/lexicon.txt
#keyword search settings
duptime=0.5
case_insensitive=true

View file

@@ -0,0 +1,97 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;
#speech corpora files location
train_data_dir=/export/babel/data/201-haitian/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Haitian_Babel201/train.FullLP.list
train_nj=32
#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev/
dev2h_data_list=/export/babel/data/splits/Haitian_Babel201/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=
dev2h_ecf_file=
dev2h_rttm_file=
dev2h_kwlist_file=
dev2h_subset_ecf=true
dev2h_nj=20
#Official DEV data files
dev10h_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Haitian_Babel201/dev.list
dev10h_data_cmudb=
dev10h_stm_file=
dev10h_ecf_file=
dev10h_rttm_file=
dev10h_kwlist_file=
dev10h_nj=32
#RADICAL DEV data files
dev10h_sph_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev/
dev10h_sph_data_list=/export/babel/data/splits/Haitian_Babel201/dev.sph.list
dev10h_sph_data_cmudb=
dev10h_sph_stm_file=
dev10h_sph_ecf_file=
dev10h_sph_rttm_file=
dev10h_sph_kwlist_file=
dev10h_sph_subset_ecf=true
dev10h_sph_nj=32
#RADICAL DEV data files
dev10h_wav_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev/
dev10h_wav_data_list=/export/babel/data/splits/Haitian_Babel201/dev.wav.list
dev10h_wav_data_cmudb=
dev10h_wav_stm_file=
dev10h_wav_ecf_file=
dev10h_wav_rttm_file=
dev10h_wav_kwlist_file=
dev10h_wav_subset_ecf=true
dev10h_wav_nj=13
#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/201-haitian/release-current/conversational/eval
eval_data_list=
eval_ecf_file=
eval_kwlist_file=
eval_data_cmudb=
eval_nj=64
#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21
# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000
# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"
use_pitch=false
use_ffv=false
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/201-haitian/release-current/conversational/reference_materials/lexicon.txt
#keyword search settings
duptime=0.5
case_insensitive=true

View file

@@ -0,0 +1,97 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;
#speech corpora files location
train_data_dir=/export/babel/data/201-haitian/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Haitian_Babel201/train.FullLP.list
train_nj=32
#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev/
dev2h_data_list=/export/babel/data/splits/Haitian_Babel201/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=
dev2h_ecf_file=
dev2h_rttm_file=
dev2h_kwlist_file=
dev2h_subset_ecf=true
dev2h_nj=20
#Official DEV data files
dev10h_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Haitian_Babel201/dev.list
dev10h_data_cmudb=
dev10h_stm_file=
dev10h_ecf_file=
dev10h_rttm_file=
dev10h_kwlist_file=
dev10h_nj=32
#RADICAL DEV data files
dev10h_sph_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev/
dev10h_sph_data_list=/export/babel/data/splits/Haitian_Babel201/dev.sph.list
dev10h_sph_data_cmudb=
dev10h_sph_stm_file=
dev10h_sph_ecf_file=
dev10h_sph_rttm_file=
dev10h_sph_kwlist_file=
dev10h_sph_subset_ecf=true
dev10h_sph_nj=32
#RADICAL DEV data files
dev10h_wav_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev/
dev10h_wav_data_list=/export/babel/data/splits/Haitian_Babel201/dev.wav.list
dev10h_wav_data_cmudb=
dev10h_wav_stm_file=
dev10h_wav_ecf_file=
dev10h_wav_rttm_file=
dev10h_wav_kwlist_file=
dev10h_wav_subset_ecf=true
dev10h_wav_nj=13
#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/201-haitian/release-current/conversational/eval
eval_data_list=
eval_ecf_file=
eval_kwlist_file=
eval_data_cmudb=
eval_nj=64
#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21
# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000
# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"
use_pitch=true
use_ffv=true
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/201-haitian/release-current/conversational/reference_materials/lexicon.txt
#keyword search settings
duptime=0.5
case_insensitive=true

View file

@@ -0,0 +1,76 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;
#speech corpora files location
train_data_dir=/export/babel/data/201-haitian/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Haitian_Babel201/train.LimitedLP.list
train_nj=16
#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev/
dev2h_data_list=/export/babel/data/splits/Haitian_Babel201/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=
dev2h_ecf_file=
dev2h_rttm_file=
dev2h_kwlist_file=
dev2h_subset_ecf=true
dev2h_nj=20
#Official DEV data files
dev10h_data_dir=/export/babel/data/201-haitian/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Haitian_Babel201/dev.list
dev10h_data_cmudb=
dev10h_stm_file=
dev10h_ecf_file=
dev10h_rttm_file=
dev10h_kwlist_file=
dev10h_nj=32
#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/201-haitian/release-current/conversational/eval
eval_data_list=
eval_ecf_file=
eval_kwlist_file=
eval_data_cmudb=
eval_nj=64
#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21
# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000
# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"
use_pitch=true
use_ffv=true
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/201-haitian/release-current/conversational/reference_materials/lexicon.sub-train.txt
#keyword search settings
duptime=0.5
case_insensitive=true

View file

@@ -85,8 +85,8 @@ numGaussSGMM=80000
# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"
use_pitch=true
use_ffv=true
use_pitch=false
use_ffv=false
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/206-zulu/release-current/conversational/reference_materials/lexicon.txt

View file

@@ -0,0 +1,97 @@
# include common settings for fullLP systems.
. conf/common.fullLP || exit 1;
#speech corpora files location
train_data_dir=/export/babel/data/206-zulu/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Zulu_Babel206/train.FullLP.list
train_nj=32
#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev2h_data_list=/export/babel/data/splits/Zulu_Babel206/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=
dev2h_ecf_file=
dev2h_rttm_file=
dev2h_kwlist_file=
dev2h_subset_ecf=true
dev2h_nj=20
#Official DEV data files
dev10h_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Zulu_Babel206/dev.list
dev10h_data_cmudb=
dev10h_stm_file=
dev10h_ecf_file=
dev10h_rttm_file=
dev10h_kwlist_file=
dev10h_nj=32
#RADICAL DEV data files
dev10h_sph_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev10h_sph_data_list=/export/babel/data/splits/Zulu_Babel206/dev.sph.list
dev10h_sph_data_cmudb=
dev10h_sph_stm_file=
dev10h_sph_ecf_file=
dev10h_sph_rttm_file=
dev10h_sph_kwlist_file=
dev10h_sph_subset_ecf=true
dev10h_sph_nj=32
#RADICAL DEV data files
dev10h_wav_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev10h_wav_data_list=/export/babel/data/splits/Zulu_Babel206/dev.wav.list
dev10h_wav_data_cmudb=
dev10h_wav_stm_file=
dev10h_wav_ecf_file=
dev10h_wav_rttm_file=
dev10h_wav_kwlist_file=
dev10h_wav_subset_ecf=true
dev10h_wav_nj=13
#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/206-zulu/release-current/conversational/eval
eval_data_list=
eval_ecf_file=
eval_kwlist_file=
eval_data_cmudb=
eval_nj=64
#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21
# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=1000
numGaussTri2=20000
numLeavesTri3=6000
numGaussTri3=75000
numLeavesMLLT=6000
numGaussMLLT=75000
numLeavesSAT=6000
numGaussSAT=75000
numGaussUBM=800
numLeavesSGMM=10000
numGaussSGMM=80000
# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"
use_pitch=true
use_ffv=true
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/206-zulu/release-current/conversational/reference_materials/lexicon.txt
#keyword search settings
duptime=0.5
case_insensitive=true

View file

@@ -37,6 +37,28 @@ devtrain_rttm_file=
devtrain_kwlist_file=
devtrain_nj=64
#RADICAL DEV data files
dev10h_sph_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev10h_sph_data_list=/export/babel/data/splits/Zulu_Babel206/dev.sph.list
dev10h_sph_data_cmudb=
dev10h_sph_stm_file=
dev10h_sph_ecf_file=
dev10h_sph_rttm_file=
dev10h_sph_kwlist_file=
dev10h_sph_subset_ecf=true
dev10h_sph_nj=32
#RADICAL DEV data files
dev10h_wav_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev10h_wav_data_list=/export/babel/data/splits/Zulu_Babel206/dev.wav.list
dev10h_wav_data_cmudb=
dev10h_wav_stm_file=
dev10h_wav_ecf_file=
dev10h_wav_rttm_file=
dev10h_wav_kwlist_file=
dev10h_wav_subset_ecf=true
dev10h_wav_nj=13
#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/206-zulu/release-current/conversational/eval
eval_data_list=
@@ -74,8 +96,8 @@ numGaussSGMM=18000
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"
use_pitch=true
use_ffv=true
use_pitch=false
use_ffv=false
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/206-zulu/release-current/conversational/reference_materials/lexicon.sub-train.txt

View file

@@ -0,0 +1,108 @@
# include common settings for limitedLP systems.
. conf/common.limitedLP || exit 1;
#speech corpora files location
train_data_dir=/export/babel/data/206-zulu/release-current/conversational/training/
train_data_list=/export/babel/data/splits/Zulu_Babel206/train.LimitedLP.list
train_nj=16
#RADICAL DEV data files
dev2h_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev2h_data_list=/export/babel/data/splits/Zulu_Babel206/dev.2hr.list
dev2h_data_cmudb=
dev2h_stm_file=
dev2h_ecf_file=
dev2h_rttm_file=
dev2h_kwlist_file=
dev2h_subset_ecf=true
dev2h_nj=20
#Official DEV data files
dev10h_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev
dev10h_data_list=/export/babel/data/splits/Zulu_Babel206/dev.list
dev10h_data_cmudb=
dev10h_stm_file=
dev10h_ecf_file=
dev10h_rttm_file=
dev10h_kwlist_file=
dev10h_nj=32
#RADICAL EVAL data files (difference between TRAIN-FULL and TRAIN-LIMITED)
devtrain_data_dir=/export/babel/data/206-zulu/release-current/conversational/training/
devtrain_data_list=/export/babel/data/splits/Zulu_Babel206/dev.wav.list
devtrain_data_cmudb=
devtrain_stm_file=
devtrain_ecf_file=
devtrain_rttm_file=
devtrain_kwlist_file=
devtrain_nj=64
#RADICAL DEV data files
dev10h_sph_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev10h_sph_data_list=/export/babel/data/splits/Zulu_Babel206/dev.sph.list
dev10h_sph_data_cmudb=
dev10h_sph_stm_file=
dev10h_sph_ecf_file=
dev10h_sph_rttm_file=
dev10h_sph_kwlist_file=
dev10h_sph_subset_ecf=true
dev10h_sph_nj=32
#RADICAL DEV data files
dev10h_wav_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev10h_wav_data_list=/export/babel/data/splits/Zulu_Babel206/dev.wav.list
dev10h_wav_data_cmudb=
dev10h_wav_stm_file=
dev10h_wav_ecf_file=
dev10h_wav_rttm_file=
dev10h_wav_kwlist_file=
dev10h_wav_subset_ecf=true
dev10h_wav_nj=13
#Official EVAL period evaluation data files
eval_data_dir=/export/babel/data/206-zulu/release-current/conversational/eval
eval_data_list=
eval_ecf_file=
eval_kwlist_file=
eval_data_cmudb=
eval_nj=64
#Official (POST-)EVAL evaluation data portion
evalpart1_data_dir=
evalpart1_data_list=
evalpart1_data_cmudb=
evalpart1_stm_file=
evalpart1_ecf_file=
evalpart1_rttm_file=
evalpart1_kwlist_file=
evalpart1_nj=21
# Acoustic model parameters
numLeavesTri1=1000
numGaussTri1=10000
numLeavesTri2=2500
numGaussTri2=36000
numLeavesTri3=2500
numGaussTri3=36000
numLeavesMLLT=2500
numGaussMLLT=36000
numLeavesSAT=2500
numGaussSAT=36000
numGaussUBM=750
numLeavesSGMM=5000
numGaussSGMM=18000
# Lexicon and Language Model parameters
oovSymbol="<unk>"
lexiconFlags="--oov <unk>"
use_pitch=true
use_ffv=true
# Scoring protocols (dummy GLM file to appease the scoring script)
#glmFile=./conf/glm
lexicon_file=/export/babel/data/206-zulu/release-current/conversational/reference_materials/lexicon.sub-train.txt
#keyword search settings
duptime=0.5
case_insensitive=true

View file

@@ -15,7 +15,7 @@ dev2h_ecf_file=
dev2h_rttm_file=
dev2h_kwlist_file=
dev2h_subset_ecf=true
dev2h_nj=20
dev2h_nj=18
#Official DEV data files
dev10h_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev
@@ -27,6 +27,16 @@ dev10h_rttm_file=
dev10h_kwlist_file=
dev10h_nj=32
#RADICAL EVAL data files (difference between TRAIN-FULL and TRAIN-LIMITED)
devtrain_data_dir=/export/babel/data/206-zulu/release-current/conversational/training/
devtrain_data_list=/export/babel/data/splits/Zulu_Babel206/dev.train.list
devtrain_data_cmudb=
devtrain_stm_file=
devtrain_ecf_file=
devtrain_rttm_file=
devtrain_kwlist_file=
devtrain_nj=64
#RADICAL DEV data files
dev10h_sph_data_dir=/export/babel/data/206-zulu/release-current/conversational/dev/
dev10h_sph_data_list=/export/babel/data/splits/Zulu_Babel206/dev.sph.list

View file

@@ -17,9 +17,10 @@ sub KeywordSort {
}
my $Usage = <<EOU;
This script reads an alignment.csv file and computes the oracle ATWV based on the
oracle threshold. The duration of the search collection is supposed to be provided.
In the Babel case, the duration should be half of the total audio duration.
This script reads an alignment.csv file and computes the ATWV, OTWV and MTWV by
sweeping the threshold. The duration of the search collection is supposed to be
provided. In the Babel case, the duration should be half of the total audio
duration.
The alignment.csv file is supposed to have the following fields for each line:
language,file,channel,termid,term,ref_bt,ref_et,sys_bt,sys_et,sys_score,
@@ -42,7 +43,7 @@ GetOptions(
@ARGV == 1 || die $Usage;
# Workout the input/output source.
# Works out the input/output source.
my $alignment_in = shift @ARGV;
# Hash the alignment file. For each instance we store a 3-dimensional vector:
@@ -98,40 +99,66 @@ while (<A>) {
}
close(A);
# Work out the oracle ATWV by sweeping the threshold.
# Works out the oracle ATWV by sweeping the threshold.
my $atwv = 0.0;
my $oracle_atwv = 0.0;
my $otwv = 0.0;
my %mtwv_sweep;
foreach my $kwid (keys %keywords) {
# Sort the instances by confidence score.
my @instances = sort KeywordSort @{$alignment{$kwid}};
my $local_oracle_atwv = 0.0;
my $max_local_oracle_atwv = 0.0;
my $local_otwv = 0.0;
my $max_local_otwv = 0.0;
my $local_atwv = 0.0;
foreach my $instance (@instances) {
my @ins = @{$instance};
# Oracle ATWV.
my $gain = 1.0 / $Ntrue{$kwid};
my $cost = $beta / ($duration - $Ntrue{$kwid});
# ATWV.
if ($ins[1] == 1) {
$local_oracle_atwv += 1.0 / $Ntrue{$kwid};
$local_otwv += $gain;
} else {
$local_oracle_atwv -= $beta / ($duration - $Ntrue{$kwid});
$local_otwv -= $cost;
}
if ($local_oracle_atwv > $max_local_oracle_atwv) {
$max_local_oracle_atwv = $local_oracle_atwv;
if ($local_otwv > $max_local_otwv) {
$max_local_otwv = $local_otwv;
}
# Original ATWV.
# OTWV.
if ($ins[2] == 1) {
$local_atwv -= $beta / ($duration - $Ntrue{$kwid});
$local_atwv -= $cost;
} elsif ($ins[2] == 2) {
$local_atwv += 1.0 / $Ntrue{$kwid};
$local_atwv += $gain;
}
# MTWV.
for (my $threshold = 0.000; $threshold <= $ins[0]; $threshold += 0.001) {
if ($ins[1] == 1) {
$mtwv_sweep{$threshold} += $gain;
} else {
$mtwv_sweep{$threshold} -= $cost;
}
}
}
$atwv += $local_atwv;
$oracle_atwv += $max_local_oracle_atwv;
$otwv += $max_local_otwv;
}
# Works out the MTWV.
my $mtwv = 0.0;
my $mtwv_threshold = 0.0;
for my $threshold (keys %mtwv_sweep) {
if ($mtwv_sweep{$threshold} > $mtwv) {
$mtwv = $mtwv_sweep{$threshold};
$mtwv_threshold = $threshold;
}
}
$atwv /= scalar(keys %keywords);
$atwv = sprintf("%.4f", $atwv);
$oracle_atwv /= scalar(keys %keywords);
$oracle_atwv = sprintf("%.4f", $oracle_atwv);
print "Original ATWV = $atwv\n";
print "Oracle ATWV = $oracle_atwv\n";
$otwv /= scalar(keys %keywords);
$otwv = sprintf("%.4f", $otwv);
$mtwv /= scalar(keys %keywords);
$mtwv = sprintf("%.4f", $mtwv);
print "ATWV = $atwv\n";
print "OTWV = $otwv\n";
print "MTWV = $mtwv, THRESHOLD = $mtwv_threshold\n";

View file

@@ -41,9 +41,10 @@ nnet_8m_6l/decode_eval_iter270/cer_10:%CER 25.72 [ 1945 / 7562, 405 ins, 533 del
nnet_8m_6l/decode_eval_iter280/cer_10:%CER 27.43 [ 2074 / 7562, 424 ins, 605 del, 1045 sub ]
nnet_8m_6l/decode_eval_iter290/cer_10:%CER 26.37 [ 1994 / 7562, 410 ins, 572 del, 1012 sub ]
nnet_8m_6l/decode_eval/cer_10:%CER 25.55 [ 1932 / 7562, 405 ins, 549 del, 978 sub ] # 6 layers neural network
nnet_tanh_6l/decode_eval/cer_10:%CER 21.34 [ 1614 / 7562, 369 ins, 487 del, 758 sub ] # 6 layers neural network (nnet2 script, 1024 neurons)
nnet_4m_3l/decode_eval/cer_10:%CER 22.38 [ 1692 / 7562, 372 ins, 510 del, 810 sub ] # 4 layers neural network
nnet_8m_6l/decode_eval/cer_10:%CER 25.55 [ 1932 / 7562, 405 ins, 549 del, 978 sub ] # 6 hidden layers neural network
nnet_tanh_6l/decode_eval/cer_10:%CER 21.34 [ 1614 / 7562, 369 ins, 487 del, 758 sub ] # 6 hidden layers neural network (nnet2 script, 1024 neurons)
nnet_4m_3l/decode_eval/cer_10:%CER 22.38 [ 1692 / 7562, 372 ins, 510 del, 810 sub ] # 3 hidden layers neural network
nnet_tanh_3l/decode_eval/cer_10:%CER 22.11 [ 1672 / 7562, 391 ins, 489 del, 792 sub ] # 3 hidden layers neural network (nnet2 script, 1024 neurons)
tri5a_pretrain-dbn_dnn/decode/cer_10:%CER 20.48 [ 1549 / 7562, 383 ins, 468 del, 698 sub ] # 6 layers DNN - pretrained RBM, cross entropy trained DNN
tri5a_pretrain-dbn_dnn_smbr/decode_it1/cer_10:%CER 18.73 [ 1416 / 7562, 306 ins, 453 del, 657 sub ] # sMBR trained DNN
@@ -97,6 +98,7 @@ nnet_8m_6l/decode_eval_closelm_iter290/cer_10:%CER 20.40 [ 1543 / 7562, 323 ins,
nnet_8m_6l/decode_eval_closelm/cer_10:%CER 20.68 [ 1564 / 7562, 351 ins, 483 del, 730 sub ]
nnet_tanh_6l/decode_eval_closelm/cer_10:%CER 17.10 [ 1293 / 7562, 337 ins, 448 del, 508 sub ]
nnet_4m_3l/decode_eval_closelm/cer_10:%CER 17.15 [ 1297 / 7562, 335 ins, 439 del, 523 sub ]
nnet_tanh_3l/decode_eval_closelm/cer_10:%CER 17.22 [ 1302 / 7562, 349 ins, 434 del, 519 sub ]
tri5a_pretrain-dbn_dnn/decode_closelm/cer_10:%CER 16.54 [ 1251 / 7562, 346 ins, 413 del, 492 sub ]
tri5a_pretrain-dbn_dnn_smbr/decode_closelm_it1/cer_10:%CER 15.31 [ 1158 / 7562, 280 ins, 410 del, 468 sub ]
@@ -130,6 +132,7 @@ exp/sgmm_5a_mmi_b0.1/decode_wide_eval_4/cer_10:%CER 23.17 [ 1752 / 7562, 373 ins
exp/nnet_8m_6l/decode_wide_eval/cer_10:%CER 24.13 [ 1825 / 7562, 384 ins, 535 del, 906 sub ]
exp/nnet_tanh_6l/decode_wide_eval/cer_10:%CER 21.22 [ 1605 / 7562, 365 ins, 485 del, 755 sub ]
exp/nnet_4m_3l/decode_wide_eval/cer_10:%CER 22.16 [ 1676 / 7562, 365 ins, 505 del, 806 sub ]
exp/nnet_tanh_3l/decode_wide_eval/cer_10:%CER 21.95 [ 1660 / 7562, 382 ins, 488 del, 790 sub ]
exp/tri5a_pretrain-dbn_dnn/decode_dnnwide/cer_10:%CER 20.47 [ 1548 / 7562, 383 ins, 467 del, 698 sub ]
exp/tri5a_pretrain-dbn_dnn_smbr/decode_it1_dnnwide/cer_10:%CER 18.73 [ 1416 / 7562, 306 ins, 453 del, 657 sub ]
exp/tri5a_pretrain-dbn_dnn_smbr/decode_it2_dnnwide/cer_10:%CER 18.73 [ 1416 / 7562, 310 ins, 446 del, 660 sub ]
@@ -157,6 +160,7 @@ exp/sgmm_5a_mmi_b0.1/decode_wide_eval_closelm_4/cer_10:%CER 19.27 [ 1457 / 7562,
exp/nnet_8m_6l/decode_wide_eval_closelm/cer_10:%CER 17.87 [ 1351 / 7562, 343 ins, 453 del, 555 sub ]
exp/nnet_tanh_6l/decode_wide_eval_closelm/cer_10:%CER 17.15 [ 1297 / 7562, 336 ins, 452 del, 509 sub ]
exp/nnet_4m_3l/decode_wide_eval_closelm/cer_10:%CER 17.02 [ 1287 / 7562, 330 ins, 436 del, 521 sub ]
exp/nnet_tanh_3l/decode_wide_eval_closelm/cer_10:%CER 17.31 [ 1309 / 7562, 348 ins, 441 del, 520 sub ]
exp/tri5a_pretrain-dbn_dnn/decode_closelm_dnnwide/cer_10:%CER 16.42 [ 1242 / 7562, 337 ins, 414 del, 491 sub ]
exp/tri5a_pretrain-dbn_dnn_smbr/decode_closelm_it1_dnnwide/cer_10:%CER 15.26 [ 1154 / 7562, 279 ins, 409 del, 466 sub ]
exp/tri5a_pretrain-dbn_dnn_smbr/decode_closelm_it2_dnnwide/cer_10:%CER 15.31 [ 1158 / 7562, 279 ins, 408 del, 471 sub ]

View file

@@ -12,6 +12,7 @@
ulimit -u 10000
# 6 hidden layers DNN
(
steps/nnet2/train_tanh.sh \
--mix-up 8000 \
@@ -36,3 +37,28 @@ local/ext/score.sh data/eval exp/tri5a/graph_closelm exp/nnet_tanh_6l/decode_wid
)
# 3 hidden layers DNN
(
steps/nnet2/train_tanh.sh \
--mix-up 8000 \
--initial-learning-rate 0.01 --final-learning-rate 0.001 \
--num-hidden-layers 3 --hidden-layer-dim 1024 \
--cmd "$decode_cmd" \
data/train data/lang exp/tri5a_ali_dt100k exp/nnet_tanh_3l || exit 1
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 2 --transform-dir exp/tri5a/decode_eval exp/tri5a/graph data/eval exp/nnet_tanh_3l/decode_eval &
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 2 --transform-dir exp/tri5a/decode_eval_closelm exp/tri5a/graph_closelm data/eval exp/nnet_tanh_3l/decode_eval_closelm &
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 2 --config conf/decode_wide.config --transform-dir exp/tri5a/decode_eval exp/tri5a/graph data/eval exp/nnet_tanh_3l/decode_wide_eval &
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 2 --config conf/decode_wide.config --transform-dir exp/tri5a/decode_eval_closelm exp/tri5a/graph_closelm data/eval exp/nnet_tanh_3l/decode_wide_eval_closelm &
wait
local/ext/score.sh data/eval exp/tri5a/graph exp/nnet_tanh_3l/decode_eval
local/ext/score.sh data/eval exp/tri5a/graph_closelm exp/nnet_tanh_3l/decode_eval_closelm
local/ext/score.sh data/eval exp/tri5a/graph exp/nnet_tanh_3l/decode_wide_eval
local/ext/score.sh data/eval exp/tri5a/graph_closelm exp/nnet_tanh_3l/decode_wide_eval_closelm
)

View file

@@ -1,9 +1,8 @@
#!/bin/bash
#
# Copyright 2012 Vassil Panayotov
# modified from a file that was:
# Copyright 2010-2011 Microsoft Corporation
# Copyright 2010-2011 Microsoft Corporation
# Copyright 2012 Vassil Panayotov
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -54,11 +53,10 @@ cat $RMROOT/rm1/etc/rm1_test.fileids | \
# this is needed, because the original "al_sents.snr" file is not available
# (and because CMU's train utterances have tags like '<sil>' added)
cat $RMROOT/rm1/etc/rm1_train.transcription |\
sed -e 's/\(.*\)\(([a-z][a-z][0-9]\+)\)/\1\U\2/' |\
sed -e 's:</\?si\?l\?>::g' -e 's:([0-9])::g' |\
sed -e 's:\([ ][ ]\+\): :g' -e 's:^[ ]\+::g' |\
cat $RMROOT/rm1/etc/rm1_test.transcription - \
> al_sents.snr
tr '[a-z]' '[A-Z]' |\
sed -E -e 's:</?S(IL)?>: :g' -e 's:\([0-9]\): :g' -e 's: +: :g' -e 's:^ +::' |\
cat $RMROOT/rm1/etc/rm1_test.transcription - \
> al_sents.snr
# training set
../../local/make_trans.pl trn train.flist al_sents.snr train_trans.txt train.scp

View file

@@ -4,59 +4,80 @@
# Switchboard portion of eval2000, excluding CallHome, which is
# substantially easier.
# These results are slightly out of date: since then I changed
# the LDA+MLLT to use 7, not 9 frames of context, and also increased
# the learning rate for the "indirect" fMMI.
for x in exp/{mono,tri,sgmm,nnet}*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null
for x in exp/{mono,tri,sgmm,nnet}*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null
# These results are still partial.
exp/tri1/decode_eval2000_sw1_fsh_tgpr/score_13/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 61.5 26.8 11.7 3.2 41.7 70.2 |
exp/tri1/decode_eval2000_sw1_tg/score_13/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 61.1 27.3 11.6 3.5 42.3 70.3 |
exp/tri2/decode_eval2000_sw1_fsh_tgpr/score_13/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 62.8 26.1 11.0 3.2 40.3 70.1 |
exp/tri2/decode_eval2000_sw1_tg/score_12/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 62.7 26.9 10.3 3.7 40.9 70.5 |
exp/tri3a/decode_eval2000_sw1_fsh_tgpr/score_12/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 69.7 21.6 8.6 3.2 33.5 68.0 |
exp/tri3a/decode_eval2000_sw1_tg/score_12/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 69.3 22.0 8.7 3.4 34.1 67.3 |
exp/tri3b/decode_eval2000_sw1_fsh_tgpr/score_13/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 74.3 18.0 7.6 2.9 28.6 65.6 |
exp/tri3b/decode_eval2000_sw1_fsh_tgpr_newcode/score_13/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 74.3 18.0 7.6 2.9 28.6 65.6 |
exp/tri3b/decode_eval2000_sw1_tg/score_13/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 74.0 18.7 7.3 3.0 29.0 66.5 |
exp/tri3b/decode_eval2000_sw1_tg_newcode/score_13/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 74.0 18.7 7.3 3.0 29.0 66.5 |
exp/tri4a/decode_eval2000_sw1_fsh_tgpr/score_13/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 78.4 15.2 6.3 2.6 24.1 61.4 |
exp/tri4a/decode_eval2000_sw1_fsh_tgpr.si/score_11/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 71.8 20.7 7.5 3.6 31.8 67.4 |
exp/tri4a/decode_eval2000_sw1_tg/score_13/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 78.1 15.6 6.3 2.7 24.6 61.7 |
exp/tri4a/decode_eval2000_sw1_tg.si/score_11/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 71.3 21.2 7.5 3.8 32.5 67.7 |
exp/tri4b/decode_eval2000_sw1_fsh_tgpr/score_16/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 80.4 13.3 6.2 2.1 21.7 60.0 |
exp/tri4b/decode_eval2000_sw1_fsh_tgpr.si/score_13/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 73.7 18.9 7.4 3.0 29.3 65.9 |
exp/tri4b/decode_eval2000_sw1_tg/score_14/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 80.1 14.1 5.8 2.5 22.4 60.8 |
exp/tri4b/decode_eval2000_sw1_tg.si/score_12/eval2000.ctm.swbd.filt.sys: | Sum/Avg | 1831 21395 | 73.6 19.3 7.1 3.6 30.0 66.2 |
%WER 37.2 | 1831 21395 | 65.9 24.1 10.0 3.1 37.2 67.8 | exp/tri1/decode_eval2000_sw1_fsh_tgpr/score_13/eval2000.ctm.swbd.filt.sys
%WER 37.4 | 1831 21395 | 65.9 24.1 10.0 3.4 37.4 67.9 | exp/tri1/decode_eval2000_sw1_tg/score_13/eval2000.ctm.swbd.filt.sys
%WER 36.8 | 1831 21395 | 66.5 23.8 9.7 3.3 36.8 68.3 | exp/tri2/decode_eval2000_sw1_fsh_tgpr/score_13/eval2000.ctm.swbd.filt.sys
%WER 37.0 | 1831 21395 | 66.6 24.1 9.2 3.7 37.0 68.4 | exp/tri2/decode_eval2000_sw1_tg/score_12/eval2000.ctm.swbd.filt.sys
%WER 29.1 | 1831 21395 | 74.0 18.3 7.7 3.1 29.1 65.4 | exp/tri3b/decode_eval2000_sw1_fsh_tgpr/score_14/eval2000.ctm.swbd.filt.sys
%WER 29.7 | 1831 21395 | 73.6 18.6 7.7 3.3 29.7 65.3 | exp/tri3b/decode_eval2000_sw1_tg/score_14/eval2000.ctm.swbd.filt.sys
%WER 24.3 | 1831 21395 | 77.9 15.0 7.1 2.3 24.3 61.5 | exp/tri4a/decode_eval2000_sw1_fsh_tgpr/score_17/eval2000.ctm.swbd.filt.sys
%WER 32.6 | 1831 21395 | 71.2 21.4 7.4 3.8 32.6 66.9 | exp/tri4a/decode_eval2000_sw1_fsh_tgpr.si/score_12/eval2000.ctm.swbd.filt.sys
%WER 25.0 | 1831 21395 | 77.7 15.6 6.6 2.8 25.0 62.4 | exp/tri4a/decode_eval2000_sw1_tg/score_15/eval2000.ctm.swbd.filt.sys
%WER 33.2 | 1831 21395 | 70.8 21.7 7.5 4.0 33.2 67.1 | exp/tri4a/decode_eval2000_sw1_tg.si/score_12/eval2000.ctm.swbd.filt.sys
%WER 23.5 | 1831 21395 | 79.0 14.7 6.3 2.5 23.5 61.3 | exp/tri4a_fmmi_b0.1/decode_eval2000_it4_sw1_fsh_tgpr/score_13/eval2000.ctm.swbd.filt.sys
%WER 23.7 | 1831 21395 | 78.7 14.8 6.5 2.4 23.7 62.0 | exp/tri4a_fmmi_b0.1/decode_eval2000_it4_sw1_tg/score_14/eval2000.ctm.swbd.filt.sys
%WER 22.1 | 1831 21395 | 80.0 13.3 6.7 2.1 22.1 60.1 | exp/tri4a_fmmi_b0.1/decode_eval2000_it5_sw1_fsh_tgpr/score_14/eval2000.ctm.swbd.filt.sys
%WER 22.6 | 1831 21395 | 79.5 13.9 6.6 2.2 22.6 60.6 | exp/tri4a_fmmi_b0.1/decode_eval2000_it5_sw1_tg/score_14/eval2000.ctm.swbd.filt.sys
%WER 21.9 | 1831 21395 | 80.6 13.8 5.6 2.5 21.9 59.3 | exp/tri4a_fmmi_b0.1/decode_eval2000_it6_sw1_fsh_tgpr/score_11/eval2000.ctm.swbd.filt.sys
%WER 22.2 | 1831 21395 | 80.2 13.9 5.9 2.4 22.2 60.2 | exp/tri4a_fmmi_b0.1/decode_eval2000_it6_sw1_tg/score_12/eval2000.ctm.swbd.filt.sys
%WER 21.5 | 1831 21395 | 80.9 13.5 5.6 2.5 21.5 59.2 | exp/tri4a_fmmi_b0.1/decode_eval2000_it7_sw1_fsh_tgpr/score_11/eval2000.ctm.swbd.filt.sys
%WER 21.8 | 1831 21395 | 80.7 13.7 5.5 2.5 21.8 59.7 | exp/tri4a_fmmi_b0.1/decode_eval2000_it7_sw1_tg/score_11/eval2000.ctm.swbd.filt.sys
%WER 21.3 | 1831 21395 | 81.2 13.3 5.5 2.6 21.3 59.1 | exp/tri4a_fmmi_b0.1/decode_eval2000_it8_sw1_fsh_tgpr/score_11/eval2000.ctm.swbd.filt.sys
%WER 21.7 | 1831 21395 | 80.7 13.4 5.9 2.4 21.7 59.6 | exp/tri4a_fmmi_b0.1/decode_eval2000_it8_sw1_tg/score_12/eval2000.ctm.swbd.filt.sys
%WER 23.0 | 1831 21395 | 79.2 14.2 6.5 2.2 23.0 60.5 | exp/tri4a_mmi_b0.1/decode_eval2000_1.mdl_sw1_fsh_tgpr/score_14/eval2000.ctm.swbd.filt.sys
%WER 23.5 | 1831 21395 | 79.0 14.8 6.2 2.5 23.5 60.8 | exp/tri4a_mmi_b0.1/decode_eval2000_1.mdl_sw1_tg/score_13/eval2000.ctm.swbd.filt.sys
%WER 22.3 | 1831 21395 | 79.8 13.7 6.5 2.1 22.3 59.4 | exp/tri4a_mmi_b0.1/decode_eval2000_2.mdl_sw1_fsh_tgpr/score_14/eval2000.ctm.swbd.filt.sys
%WER 22.8 | 1831 21395 | 79.5 14.3 6.2 2.3 22.8 60.0 | exp/tri4a_mmi_b0.1/decode_eval2000_2.mdl_sw1_tg/score_13/eval2000.ctm.swbd.filt.sys
%WER 22.0 | 1831 21395 | 80.4 13.8 5.8 2.3 22.0 59.3 | exp/tri4a_mmi_b0.1/decode_eval2000_3.mdl_sw1_fsh_tgpr/score_12/eval2000.ctm.swbd.filt.sys
%WER 22.4 | 1831 21395 | 79.9 13.9 6.2 2.3 22.4 59.4 | exp/tri4a_mmi_b0.1/decode_eval2000_3.mdl_sw1_tg/score_13/eval2000.ctm.swbd.filt.sys
%WER 21.7 | 1831 21395 | 80.6 13.6 5.8 2.3 21.7 59.0 | exp/tri4a_mmi_b0.1/decode_eval2000_4.mdl_sw1_fsh_tgpr/score_12/eval2000.ctm.swbd.filt.sys
%WER 22.1 | 1831 21395 | 80.3 13.9 5.8 2.5 22.1 59.4 | exp/tri4a_mmi_b0.1/decode_eval2000_4.mdl_sw1_tg/score_12/eval2000.ctm.swbd.filt.sys
%WER 21.8 | 1831 21395 | 80.5 13.7 5.8 2.3 21.8 59.3 | exp/tri4b/decode_eval2000_sw1_fsh_tgpr/score_15/eval2000.ctm.swbd.filt.sys
%WER 29.3 | 1831 21395 | 74.1 18.8 7.0 3.4 29.3 64.8 | exp/tri4b/decode_eval2000_sw1_fsh_tgpr.si/score_13/eval2000.ctm.swbd.filt.sys
%WER 22.4 | 1831 21395 | 80.0 13.9 6.1 2.4 22.4 60.0 | exp/tri4b/decode_eval2000_sw1_tg/score_16/eval2000.ctm.swbd.filt.sys
%WER 30.3 | 1831 21395 | 73.1 19.7 7.1 3.4 30.3 64.7 | exp/tri4b/decode_eval2000_sw1_tg.si/score_14/eval2000.ctm.swbd.filt.sys
%WER 20.7 | 1831 21395 | 81.3 12.8 6.0 2.0 20.7 59.3 | exp/tri4b_fmmi_b0.1/decode_eval2000_it4_sw1_fsh_tgpr/score_15/eval2000.ctm.swbd.filt.sys
%WER 21.4 | 1831 21395 | 81.0 13.2 5.8 2.4 21.4 59.3 | exp/tri4b_fmmi_b0.1/decode_eval2000_it4_sw1_tg/score_14/eval2000.ctm.swbd.filt.sys
%WER 19.6 | 1831 21395 | 82.2 12.0 5.8 1.9 19.6 57.2 | exp/tri4b_fmmi_b0.1/decode_eval2000_it5_sw1_fsh_tgpr/score_14/eval2000.ctm.swbd.filt.sys
%WER 20.2 | 1831 21395 | 81.9 12.5 5.6 2.1 20.2 56.9 | exp/tri4b_fmmi_b0.1/decode_eval2000_it5_sw1_tg/score_13/eval2000.ctm.swbd.filt.sys
%WER 19.4 | 1831 21395 | 82.7 12.0 5.3 2.2 19.4 56.9 | exp/tri4b_fmmi_b0.1/decode_eval2000_it6_sw1_fsh_tgpr/score_12/eval2000.ctm.swbd.filt.sys
%WER 19.9 | 1831 21395 | 82.1 12.2 5.6 2.0 19.9 57.0 | exp/tri4b_fmmi_b0.1/decode_eval2000_it6_sw1_tg/score_13/eval2000.ctm.swbd.filt.sys
%WER 19.3 | 1831 21395 | 82.9 12.0 5.2 2.1 19.3 56.4 | exp/tri4b_fmmi_b0.1/decode_eval2000_it7_sw1_fsh_tgpr/score_12/eval2000.ctm.swbd.filt.sys
%WER 19.8 | 1831 21395 | 82.4 12.3 5.3 2.2 19.8 56.6 | exp/tri4b_fmmi_b0.1/decode_eval2000_it7_sw1_tg/score_12/eval2000.ctm.swbd.filt.sys
%WER 19.3 | 1831 21395 | 82.9 11.9 5.2 2.2 19.3 56.5 | exp/tri4b_fmmi_b0.1/decode_eval2000_it8_sw1_fsh_tgpr/score_12/eval2000.ctm.swbd.filt.sys
%WER 19.7 | 1831 21395 | 82.5 12.3 5.2 2.2 19.7 56.7 | exp/tri4b_fmmi_b0.1/decode_eval2000_it8_sw1_tg/score_12/eval2000.ctm.swbd.filt.sys
%WER 20.2 | 1831 21395 | 81.6 12.4 5.9 1.9 20.2 57.6 | exp/tri4b_mmi_b0.1/decode_eval2000_1.mdl_sw1_fsh_tgpr/score_15/eval2000.ctm.swbd.filt.sys
%WER 20.7 | 1831 21395 | 81.4 12.8 5.7 2.1 20.7 57.9 | exp/tri4b_mmi_b0.1/decode_eval2000_1.mdl_sw1_tg/score_14/eval2000.ctm.swbd.filt.sys
%WER 19.7 | 1831 21395 | 82.2 12.1 5.7 1.9 19.7 57.3 | exp/tri4b_mmi_b0.1/decode_eval2000_2.mdl_sw1_fsh_tgpr/score_14/eval2000.ctm.swbd.filt.sys
%WER 20.3 | 1831 21395 | 81.9 12.6 5.5 2.2 20.3 57.8 | exp/tri4b_mmi_b0.1/decode_eval2000_2.mdl_sw1_tg/score_13/eval2000.ctm.swbd.filt.sys
%WER 19.5 | 1831 21395 | 82.5 12.0 5.5 2.0 19.5 56.1 | exp/tri4b_mmi_b0.1/decode_eval2000_3.mdl_sw1_fsh_tgpr/score_13/eval2000.ctm.swbd.filt.sys
%WER 20.0 | 1831 21395 | 82.0 12.4 5.5 2.1 20.0 56.8 | exp/tri4b_mmi_b0.1/decode_eval2000_3.mdl_sw1_tg/score_13/eval2000.ctm.swbd.filt.sys
%WER 19.5 | 1831 21395 | 82.7 12.5 4.8 2.3 19.5 56.4 | exp/tri4b_mmi_b0.1/decode_eval2000_4.mdl_sw1_fsh_tgpr/score_11/eval2000.ctm.swbd.filt.sys
%WER 19.9 | 1831 21395 | 82.3 12.5 5.2 2.2 19.9 56.7 | exp/tri4b_mmi_b0.1/decode_eval2000_4.mdl_sw1_tg/score_12/eval2000.ctm.swbd.filt.sys
%WER 23.4 | 1831 21395 | 79.2 13.6 7.3 2.6 23.4 62.8 | exp/tri4c_reseg/decode_eval2000_sw1_fsh_tgpr/score_15/eval2000.ctm.swbd.filt.sys
%WER 30.4 | 1831 21395 | 73.0 18.7 8.3 3.4 30.4 68.1 | exp/tri4c_reseg/decode_eval2000_sw1_fsh_tgpr.si/score_13/eval2000.ctm.swbd.filt.sys
%WER 23.6 | 1831 21395 | 78.9 13.6 7.5 2.5 23.6 62.8 | exp/tri4c_reseg/decode_eval2000_sw1_tg/score_16/eval2000.ctm.swbd.filt.sys
%WER 31.0 | 1831 21395 | 72.7 19.0 8.3 3.7 31.0 68.5 | exp/tri4c_reseg/decode_eval2000_sw1_tg.si/score_13/eval2000.ctm.swbd.filt.sys
%WER 21.1 | 1831 21395 | 81.2 12.8 6.0 2.3 21.1 59.7 | exp/sgmm2_5a/decode_eval2000_sw1_fsh_tgpr/score_10/eval2000.ctm.swbd.filt.sys
%WER 21.3 | 1831 21395 | 80.9 13.0 6.2 2.2 21.3 59.5 | exp/sgmm2_5a/decode_eval2000_sw1_tg/score_11/eval2000.ctm.swbd.filt.sys
%WER 19.7 | 1831 21395 | 82.4 12.0 5.6 2.2 19.7 57.9 | exp/sgmm2_5a_mmi_b0.1/decode_eval2000_sw1_fsh_tgpr_it1/score_10/eval2000.ctm.swbd.filt.sys
%WER 19.1 | 1831 21395 | 82.8 11.5 5.7 1.9 19.1 56.7 | exp/sgmm2_5a_mmi_b0.1/decode_eval2000_sw1_fsh_tgpr_it2/score_11/eval2000.ctm.swbd.filt.sys
%WER 19.0 | 1831 21395 | 83.2 11.5 5.3 2.1 19.0 56.9 | exp/sgmm2_5a_mmi_b0.1/decode_eval2000_sw1_fsh_tgpr_it3/score_10/eval2000.ctm.swbd.filt.sys
%WER 18.9 | 1831 21395 | 83.3 11.6 5.1 2.2 18.9 56.9 | exp/sgmm2_5a_mmi_b0.1/decode_eval2000_sw1_fsh_tgpr_it4/score_10/eval2000.ctm.swbd.filt.sys
%WER 20.4 | 1831 21395 | 81.9 12.4 5.7 2.3 20.4 57.8 | exp/sgmm2_5a_mmi_b0.1/decode_eval2000_sw1_tg_it1/score_10/eval2000.ctm.swbd.filt.sys
%WER 19.8 | 1831 21395 | 82.5 12.1 5.4 2.3 19.8 57.3 | exp/sgmm2_5a_mmi_b0.1/decode_eval2000_sw1_tg_it2/score_10/eval2000.ctm.swbd.filt.sys
%WER 19.5 | 1831 21395 | 82.8 12.0 5.3 2.3 19.5 56.9 | exp/sgmm2_5a_mmi_b0.1/decode_eval2000_sw1_tg_it3/score_10/eval2000.ctm.swbd.filt.sys
%WER 19.5 | 1831 21395 | 82.9 12.0 5.1 2.4 19.5 56.5 | exp/sgmm2_5a_mmi_b0.1/decode_eval2000_sw1_tg_it4/score_10/eval2000.ctm.swbd.filt.sys
# Some more recent results (Sep 25 2013), from tri4b and tri4c_reseg, to
# see the effect of resegmentation. Note: we're only looking at the "swbd" results here;
# the CallHome and total results are terrible because of huge insertions, since
# it seems that only some segments of the audio files are in the stm. I'm not sure
# where to get the start and end points in the files that they intended us to
# decode.
%WER 22.2 | 1831 21395 | 80.3 13.8 5.9 2.5 22.2 60.1 | exp/tri4b/decode_eval2000_sw1_fsh_tgpr/score_15/eval2000.ctm.swbd.filt.sys
%WER 29.3 | 1831 21395 | 73.5 18.7 7.8 2.9 29.3 65.0 | exp/tri4b/decode_eval2000_sw1_fsh_tgpr.si/score_17/eval2000.ctm.swbd.filt.sys
%WER 22.5 | 1831 21395 | 79.8 13.8 6.4 2.3 22.5 60.3 | exp/tri4b/decode_eval2000_sw1_tg/score_17/eval2000.ctm.swbd.filt.sys
%WER 30.5 | 1831 21395 | 73.1 19.8 7.1 3.6 30.5 65.8 | exp/tri4b/decode_eval2000_sw1_tg.si/score_14/eval2000.ctm.swbd.filt.sys
%WER 22.9 | 1831 21395 | 79.7 13.4 6.9 2.6 22.9 62.8 | exp/tri4c_reseg/decode_eval2000_sw1_fsh_tgpr/score_14/eval2000.ctm.swbd.filt.sys
%WER 29.6 | 1831 21395 | 73.8 18.2 8.1 3.4 29.6 66.8 | exp/tri4c_reseg/decode_eval2000_sw1_fsh_tgpr.si/score_13/eval2000.ctm.swbd.filt.sys
%WER 23.5 | 1831 21395 | 79.1 13.8 7.1 2.6 23.5 63.6 | exp/tri4c_reseg/decode_eval2000_sw1_tg/score_15/eval2000.ctm.swbd.filt.sys
%WER 30.9 | 1831 21395 | 73.1 19.0 7.9 4.0 30.9 67.6 | exp/tri4c_reseg/decode_eval2000_sw1_tg.si/score_12/eval2000.ctm.swbd.filt.sys
# So the resegmented one is about 0.3 to 1.0 worse, but the #sub is actually down; the loss
# comes from more deletions and insertions. This is kind of what we'd expect, since the
# reference segmentation is a kind of "oracle".
# Below are some results where I kept the segments that the segmentation regarded as
# noise (e.g. coughs). Results after adaptation are almost identical, but 0.1% better
# with the switchboard-only LM.
%WER 22.9 | 1831 21395 | 79.7 13.4 6.9 2.6 22.9 62.6 | exp/tri4c_reseg/decode_eval2000_with_noise_sw1_fsh_tgpr/score_14/eval2000_with_noise.ctm.swbd.filt.sys
%WER 29.7 | 1831 21395 | 73.6 18.3 8.0 3.4 29.7 67.0 | exp/tri4c_reseg/decode_eval2000_with_noise_sw1_fsh_tgpr.si/score_13/eval2000_with_noise.ctm.swbd.filt.sys
%WER 23.6 | 1831 21395 | 79.1 14.0 6.9 2.8 23.6 64.2 | exp/tri4c_reseg/decode_eval2000_with_noise_sw1_tg/score_14/eval2000_with_noise.ctm.swbd.filt.sys
%WER 30.8 | 1831 21395 | 72.9 19.0 8.1 3.7 30.8 67.5 | exp/tri4c_reseg/decode_eval2000_with_noise_sw1_tg.si/score_13/eval2000_with_noise.ctm.swbd.filt.sys
%WER 29.10 [ 14382 / 49427, 1963 ins, 3394 del, 9025 sub ] exp/tri4b/decode_train_dev_sw1_fsh_tgpr/wer_15
%WER 37.81 [ 18686 / 49427, 2078 ins, 4625 del, 11983 sub ] exp/tri4b/decode_train_dev_sw1_fsh_tgpr.si/wer_15
%WER 29.53 [ 14598 / 49427, 1885 ins, 3538 del, 9175 sub ] exp/tri4b/decode_train_dev_sw1_tg/wer_16
%WER 38.42 [ 18990 / 49427, 2154 ins, 4461 del, 12375 sub ] exp/tri4b/decode_train_dev_sw1_tg.si/wer_15

View file

@@ -0,0 +1,2 @@
beam=13.0 # beam for decoding. Was 13.0 in the scripts.
latbeam=8.0 # this has the most effect on the size of the lattices.

View file

@@ -116,7 +116,7 @@ acwt=0.08333
{
steps/align_nnet.sh --nj 250 --cmd "$train_cmd" \
data-fmllr-tri4b/train_nodup data/lang $srcdir ${srcdir}_ali_all || exit 1;
steps/make_denlats_nnet.sh --nj 250 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \
steps/make_denlats_nnet.sh --nj 10 --sub-split 100 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \
data-fmllr-tri4b/train_nodup data/lang $srcdir ${srcdir}_denlats_all || exit 1;
}
# Now we re-train the hybrid by single iteration of sMBR
@@ -156,7 +156,7 @@ acwt=0.08333
{
steps/align_nnet.sh --nj 250 --cmd "$train_cmd" \
data-fmllr-tri4b/train_nodup data/lang $srcdir ${srcdir}_ali_all || exit 1;
steps/make_denlats_nnet.sh --nj 250 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \
steps/make_denlats_nnet.sh --nj 10 --sub-split 100 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \
data-fmllr-tri4b/train_nodup data/lang $srcdir ${srcdir}_denlats_all || exit 1;
}
# Now we re-train the hybrid by several iterations of sMBR

View file

@@ -13,8 +13,10 @@
. cmd.sh
. path.sh
set -e # exit on error
local/swbd1_data_prep.sh /export/corpora3/LDC/LDC97S62
local/swbd1_data_prep.sh /home/dpovey/data/LDC97S62
# local/swbd1_data_prep.sh /home/dpovey/data/LDC97S62
# local/swbd1_data_prep.sh /data/corpora0/LDC97S62
# local/swbd1_data_prep.sh /mnt/matylda2/data/SWITCHBOARD_1R2
# local/swbd1_data_prep.sh /exports/work/inf_hcrc_cstr_general/corpora/switchboard/switchboard1
@@ -29,9 +31,12 @@ utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
# If you have the Fisher data, you can set this "fisher_opt" variable.
fisher_opt="--fisher /export/corpora3/LDC/LDC2004T19/fe_03_p1_tran/"
#fisher_opt="--fisher /home/dpovey/data/LDC2004T19/fe_03_p1_tran/"
#fisher_opt="--fisher /data/corpora0/LDC2004T19/fe_03_p1_tran/"
# edinburgh:
# fisher_opt="--fisher /exports/work/inf_hcrc_cstr_general/corpora/fisher/transcripts"
# brno:
# fisher_opt="--fisher /mnt/matylda2/data/FISHER/fe_03_p1_tran" # BUT
local/swbd1_train_lms.sh $fisher_opt \
data/local/train/text data/local/dict/lexicon.txt data/local/lm
# We don't really need all these options for SRILM, since the LM training script
@@ -48,7 +53,7 @@ utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
# For some funny reason we are still using IRSTLM for doing LM pruning :)
export PATH=$PATH:../../../tools/irstlm/bin/
prune-lm --threshold=1e-7 data/local/lm/sw1_fsh.o3g.kn.gz /dev/stdout \
| gzip -c > data/local/lm/sw1_fsh.o3g.pr1-7.kn.gz
| gzip -c > data/local/lm/sw1_fsh.o3g.pr1-7.kn.gz || exit 1
LM=data/local/lm/sw1_fsh.o3g.pr1-7.kn.gz
utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
data/lang $LM data/local/dict/lexicon.txt data/lang_sw1_fsh_tgpr
@@ -61,23 +66,24 @@ utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
# local/eval2000_data_prep.sh /data/corpora0/LDC2002S09/hub5e_00 /data/corpora0/LDC2002T43
# local/eval2000_data_prep.sh /mnt/matylda2/data/HUB5_2000/ /mnt/matylda2/data/HUB5_2000/2000_hub5_eng_eval_tr
# local/eval2000_data_prep.sh /exports/work/inf_hcrc_cstr_general/corpora/switchboard/hub5/2000 /exports/work/inf_hcrc_cstr_general/corpora/switchboard/hub5/2000/transcr
# local/eval2000_data_prep.sh /home/dpovey/data/LDC2002S09/hub5e_00 /home/dpovey/data/LDC2002T43
local/eval2000_data_prep.sh /export/corpora2/LDC/LDC2002S09/hub5e_00 /export/corpora2/LDC/LDC2002T43
# mfccdir should be some place with a largish disk where you
# want to store MFCC features.
mfccdir=mfcc
steps/make_mfcc.sh --compress true --nj 20 --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir || exit 1;
steps/make_mfcc.sh --compress true --nj 20 --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir
steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir
# Remove the small number of utterances that couldn't be extracted for some
# reason (e.g. too short; no such file).
utils/fix_data_dir.sh data/train || exit 1;
utils/fix_data_dir.sh data/train
# Create MFCCs for the eval set
steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/eval2000 exp/make_mfcc/eval2000 $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/eval2000 exp/make_mfcc/eval2000 $mfccdir || exit 1;
utils/fix_data_dir.sh data/eval2000 || exit 1 # remove segments with problems
steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/eval2000 exp/make_mfcc/eval2000 $mfccdir
steps/compute_cmvn_stats.sh data/eval2000 exp/make_mfcc/eval2000 $mfccdir
utils/fix_data_dir.sh data/eval2000 # remove segments with problems
# Use the first 4k sentences as dev set. Note: when we trained the LM, we used
# the 1st 10k sentences as dev set, so the 1st 4k won't have been used in the
@@ -114,13 +120,13 @@ local/remove_dup_utts.sh 300 data/train_nodev data/train_nodup # 286hr
## Starting basic training on MFCC features
steps/train_mono.sh --nj 10 --cmd "$train_cmd" \
data/train_10k_nodup data/lang exp/mono || exit 1;
data/train_10k_nodup data/lang exp/mono
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train_30k_nodup data/lang exp/mono exp/mono_ali || exit 1;
data/train_30k_nodup data/lang exp/mono exp/mono_ali
steps/train_deltas.sh --cmd "$train_cmd" \
3200 30000 data/train_30k_nodup data/lang exp/mono_ali exp/tri1 || exit 1;
3200 30000 data/train_30k_nodup data/lang exp/mono_ali exp/tri1
for lm_suffix in tg fsh_tgpr; do
(
@@ -133,10 +139,10 @@ for lm_suffix in tg fsh_tgpr; do
done
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train_30k_nodup data/lang exp/tri1 exp/tri1_ali || exit 1;
data/train_30k_nodup data/lang exp/tri1 exp/tri1_ali
steps/train_deltas.sh --cmd "$train_cmd" \
3200 30000 data/train_30k_nodup data/lang exp/tri1_ali exp/tri2 || exit 1;
3200 30000 data/train_30k_nodup data/lang exp/tri1_ali exp/tri2
for lm_suffix in tg fsh_tgpr; do
@@ -156,11 +162,11 @@ done
# From now, we start building a bigger system (on train_100k_nodup, which has
# 110hrs of data). We start with the LDA+MLLT system
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train_100k_nodup data/lang exp/tri2 exp/tri2_ali_100k_nodup || exit 1;
data/train_100k_nodup data/lang exp/tri2 exp/tri2_ali_100k_nodup
# Train tri3b, which is LDA+MLLT, on 100k_nodup data.
steps/train_lda_mllt.sh --cmd "$train_cmd" \
5500 90000 data/train_100k_nodup data/lang exp/tri2_ali_100k_nodup exp/tri3b || exit 1;
5500 90000 data/train_100k_nodup data/lang exp/tri2_ali_100k_nodup exp/tri3b
for lm_suffix in tg fsh_tgpr; do
(
@@ -174,12 +180,12 @@ done
# Train tri4a, which is LDA+MLLT+SAT, on 100k_nodup data.
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train_100k_nodup data/lang exp/tri3b exp/tri3b_ali_100k_nodup || exit 1;
data/train_100k_nodup data/lang exp/tri3b exp/tri3b_ali_100k_nodup
steps/train_sat.sh --cmd "$train_cmd" \
5500 90000 data/train_100k_nodup data/lang exp/tri3b_ali_100k_nodup \
exp/tri4a || exit 1;
exp/tri4a
for lm_suffix in tg fsh_tgpr; do
(
@@ -198,11 +204,11 @@ done
# 286 hours)
# Train tri4b, which is LDA+MLLT+SAT, on train_nodup data.
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train_nodup data/lang exp/tri3b exp/tri3b_ali_nodup || exit 1;
data/train_nodup data/lang exp/tri3b exp/tri3b_ali_nodup
steps/train_sat.sh --cmd "$train_cmd" \
11500 200000 data/train_nodup data/lang exp/tri3b_ali_nodup exp/tri4b || exit 1;
11500 200000 data/train_nodup data/lang exp/tri3b_ali_nodup exp/tri4b
for lm_suffix in tg fsh_tgpr; do
(
@@ -215,7 +221,9 @@ for lm_suffix in tg fsh_tgpr; do
$graph_dir data/train_dev exp/tri4b/decode_train_dev_sw1_${lm_suffix}
) &
done
wait
steps/lmrescore.sh --mode 3 --cmd "$mkgraph_cmd" data/lang_sw1_fsh_tgpr data/lang_sw1_fsh_tg data/eval2000 \
exp/tri4b/decode_eval2000_sw1_fsh_tgpr exp/tri4b/decode_eval2000_sw1_fsh_tg.3 || exit 1
# MMI training starting from the LDA+MLLT+SAT systems on both the
@@ -229,11 +237,11 @@ steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
steps/make_denlats.sh --nj 50 --cmd "$decode_cmd" --config conf/decode.config \
--transform-dir exp/tri4a_ali_100k_nodup \
data/train_100k_nodup data/lang exp/tri4a exp/tri4a_denlats_100k_nodup \
|| exit 1;
steps/make_denlats.sh --nj 100 --cmd "$decode_cmd" --config conf/decode.config \
--transform-dir exp/tri4b_ali_nodup \
data/train_nodup data/lang exp/tri4b exp/tri4b_denlats_nodup || exit 1;
data/train_nodup data/lang exp/tri4b exp/tri4b_denlats_nodup
# 4 iterations of MMI seems to work well overall. The number of iterations is
# used as an explicit argument even though train_mmi.sh will use 4 iterations by
@@ -241,11 +249,11 @@ steps/make_denlats.sh --nj 100 --cmd "$decode_cmd" --config conf/decode.config \
num_mmi_iters=4
steps/train_mmi.sh --cmd "$decode_cmd" --boost 0.1 --num-iters $num_mmi_iters \
data/train_100k_nodup data/lang exp/tri4a_{ali,denlats}_100k_nodup \
exp/tri4a_mmi_b0.1 || exit 1;
exp/tri4a_mmi_b0.1
steps/train_mmi.sh --cmd "$decode_cmd" --boost 0.1 --num-iters $num_mmi_iters \
data/train_nodup data/lang exp/tri4b_{ali,denlats}_nodup \
exp/tri4b_mmi_b0.1 || exit 1;
exp/tri4b_mmi_b0.1
for iter in 1 2 3 4; do
for lm_suffix in tg fsh_tgpr; do
@@ -283,11 +291,11 @@ steps/train_diag_ubm.sh --silence-weight 0.5 --nj 100 --cmd "$train_cmd" \
steps/train_mmi_fmmi.sh --learning-rate 0.005 --boost 0.1 --cmd "$train_cmd" \
data/train_100k_nodup data/lang exp/tri4a_ali_100k_nodup exp/tri4a_dubm \
exp/tri4a_denlats_100k_nodup exp/tri4a_fmmi_b0.1 || exit 1;
exp/tri4a_denlats_100k_nodup exp/tri4a_fmmi_b0.1
steps/train_mmi_fmmi.sh --learning-rate 0.005 --boost 0.1 --cmd "$train_cmd" \
data/train_nodup data/lang exp/tri4b_ali_nodup exp/tri4b_dubm \
exp/tri4b_denlats_nodup exp/tri4b_fmmi_b0.1 || exit 1;
exp/tri4b_denlats_nodup exp/tri4b_fmmi_b0.1
for iter in 4 5 6 7 8; do
for lm_suffix in tg fsh_tgpr; do

View file

@@ -1,28 +1,76 @@
for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
# Use caution when comparing these results with other published results.
Training Set : TIMIT training set (4620 sentences)
Test Set : TIMIT test set (1680 sentences)
Training Set : 3696 sentences
Dev Set : 400 sentences
Test Set : 192 sentences
Language Model : Bigram phoneme language model extracted from the training set
# monophone, deltas.
%PER 28.94 [ 18201 / 62901, 1598 ins, 5644 del, 10959 sub ] exp/mono/decode_bg_test/wer_4
---------------------------------Dev Set------------------------------------------
%WER 33.53 [ 5048 / 15057, 397 ins, 1674 del, 2977 sub ] exp/mono/decode_dev/wer_3
--------------------------------Test Set------------------------------------------
%WER 34.77 [ 2509 / 7215, 193 ins, 826 del, 1490 sub ] exp/mono/decode_test/wer_3
# tri1 : first triphone system (delta+delta-delta features)
%PER 22.60 [ 14215 / 62901, 1796 ins, 3466 del, 8953 sub ] exp/tri1/decode_bg_test/wer_8
---------------------------------Dev Set------------------------------------------
%WER 29.26 [ 4405 / 15057, 541 ins, 1281 del, 2583 sub ] exp/tri1/decode_dev/wer_6
--------------------------------Test Set------------------------------------------
%WER 30.53 [ 2203 / 7215, 259 ins, 654 del, 1290 sub ] exp/tri1/decode_test/wer_6
#tri2 : an LDA+MLLT system
---------------------------------Dev Set------------------------------------------
%WER 26.38 [ 3972 / 15057, 421 ins, 1269 del, 2282 sub ] exp/tri2/decode_dev/wer_7
--------------------------------Test Set------------------------------------------
%WER 28.41 [ 2050 / 7215, 220 ins, 664 del, 1166 sub ] exp/tri2/decode_test/wer_7
#tri2 : an LDA+MLLT system.
%PER 20.36 [ 12807 / 62901, 1872 ins, 2914 del, 8021 sub ] exp/tri2/decode_bg_test/wer_7
#tri3 : Speaker Adaptive Training (SAT) system
%PER 18.27 [ 11489 / 62901, 1681 ins, 2810 del, 6998 sub ] exp/tri3/decode_bg_test/wer_6
---------------------------------Dev Set------------------------------------------
%WER 23.36 [ 3517 / 15057, 464 ins, 1001 del, 2052 sub ] exp/tri3/decode_dev/wer_4
%WER 26.53 [ 3995 / 15057, 394 ins, 1289 del, 2312 sub ] exp/tri3/decode_dev.si/wer_7
--------------------------------Test Set------------------------------------------
%WER 24.96 [ 1801 / 7215, 245 ins, 529 del, 1027 sub ] exp/tri3/decode_test/wer_4
%WER 27.96 [ 2017 / 7215, 214 ins, 650 del, 1153 sub ] exp/tri3/decode_test.si/wer_7
# SGMM2 Training
%PER 16.17 [ 10171 / 62901, 1309 ins, 2708 del, 6154 sub ] exp/sgmm2_4/decode_bg_test/wer_6
---------------------------------Dev Set------------------------------------------
%WER 20.66 [ 3111 / 15057, 347 ins, 1022 del, 1742 sub ] exp/sgmm2_4/decode_dev/wer_5
--------------------------------Test Set------------------------------------------
%WER 22.88 [ 1651 / 7215, 189 ins, 519 del, 943 sub ] exp/sgmm2_4/decode_test/wer_5
# SGMM2 + MMI Training
%PER 16.14 [ 10154 / 62901, 1845 ins, 2074 del, 6235 sub ] exp/sgmm2_4_mmi_b0.1_z/decode_bg_test_it1/wer_6
%PER 16.58 [ 10430 / 62901, 2032 ins, 2031 del, 6367 sub ] exp/sgmm2_4_mmi_b0.1_z/decode_bg_test_it2/wer_7
%PER 16.80 [ 10570 / 62901, 2071 ins, 2096 del, 6403 sub ] exp/sgmm2_4_mmi_b0.1_z/decode_bg_test_it3/wer_8
%PER 17.02 [ 10706 / 62901, 2154 ins, 2048 del, 6504 sub ] exp/sgmm2_4_mmi_b0.1_z/decode_bg_test_it4/wer_8
---------------------------------Dev Set------------------------------------------
%WER 20.48 [ 3084 / 15057, 450 ins, 849 del, 1785 sub ] exp/sgmm2_4_mmi_b0.1/decode_dev_it1/wer_5
%WER 20.20 [ 3042 / 15057, 508 ins, 740 del, 1794 sub ] exp/sgmm2_4_mmi_b0.1/decode_dev_it2/wer_5
%WER 20.36 [ 3065 / 15057, 548 ins, 711 del, 1806 sub ] exp/sgmm2_4_mmi_b0.1/decode_dev_it3/wer_5
%WER 20.40 [ 3071 / 15057, 506 ins, 762 del, 1803 sub ] exp/sgmm2_4_mmi_b0.1/decode_dev_it4/wer_6
--------------------------------Test Set------------------------------------------
%WER 22.66 [ 1635 / 7215, 250 ins, 420 del, 965 sub ] exp/sgmm2_4_mmi_b0.1/decode_test_it1/wer_5
%WER 22.44 [ 1619 / 7215, 282 ins, 384 del, 953 sub ] exp/sgmm2_4_mmi_b0.1/decode_test_it2/wer_5
%WER 22.62 [ 1632 / 7215, 298 ins, 369 del, 965 sub ] exp/sgmm2_4_mmi_b0.1/decode_test_it3/wer_5
%WER 22.48 [ 1622 / 7215, 277 ins, 386 del, 959 sub ] exp/sgmm2_4_mmi_b0.1/decode_test_it4/wer_6
# Hybrid System :
---------------------------------Dev Set------------------------------------------
%WER 22.77 [ 3429 / 15057, 411 ins, 1057 del, 1961 sub ] exp/tri4_nnet/decode_dev/wer_3
--------------------------------Test Set------------------------------------------
%WER 24.84 [ 1792 / 7215, 197 ins, 579 del, 1016 sub ] exp/tri4_nnet/decode_test/wer_2
# Combination :
---------------------------------Dev Set------------------------------------------
%WER 20.26 [ 3051 / 15057, 371 ins, 937 del, 1743 sub ] exp/combine_2/decode_dev_it1/wer_4
%WER 19.91 [ 2998 / 15057, 397 ins, 870 del, 1731 sub ] exp/combine_2/decode_dev_it2/wer_4
%WER 19.75 [ 2974 / 15057, 422 ins, 825 del, 1727 sub ] exp/combine_2/decode_dev_it3/wer_4
%WER 19.79 [ 2980 / 15057, 373 ins, 886 del, 1721 sub ] exp/combine_2/decode_dev_it4/wer_5
--------------------------------Test Set------------------------------------------
%WER 21.90 [ 1580 / 7215, 191 ins, 474 del, 915 sub ] exp/combine_2/decode_test_it1/wer_4
%WER 21.73 [ 1568 / 7215, 218 ins, 442 del, 908 sub ] exp/combine_2/decode_test_it2/wer_4
%WER 21.62 [ 1560 / 7215, 223 ins, 423 del, 914 sub ] exp/combine_2/decode_test_it3/wer_4
%WER 21.68 [ 1564 / 7215, 197 ins, 476 del, 891 sub ] exp/combine_2/decode_test_it4/wer_5
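Each line above follows the same arithmetic: the bracketed numbers are total-errors / reference-tokens, errors = insertions + deletions + substitutions, and the leading percentage is their ratio. A quick check on the monophone test-set line (pure arithmetic, no Kaldi tools needed):
# 193 ins + 826 del + 1490 sub = 2509 errors; 2509 / 7215 reference words = 34.77 %WER
perl -e 'printf("%.2f\n", 100*(193+826+1490)/7215);'   # prints 34.77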

View file

@ -1,62 +0,0 @@
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program takes as its standard input an .ndx file from the WSJ corpus that looks
# like this:
#;; File: tr_s_wv1.ndx, updated 04/26/94
#;;
#;; Index for WSJ0 SI-short Sennheiser training data
#;; Data is read WSJ sentences, Sennheiser mic.
#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
#;; per speaker TI) = 7236 utts
#;;
#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1
#and as command-line arguments it takes the names of the WSJ disk locations, e.g.:
#/mnt/matylda2/data/WSJ0/11-1.1 /mnt/matylda2/data/WSJ0/11-10.1 ... etc.
# It outputs a list of absolute pathnames (it does this by replacing e.g. 11_1_1 with
# /mnt/matylda2/data/WSJ0/11-1.1).
# It also does a slight fix because one of the WSJ disks (WSJ1/13-16.1) was distributed with
# uppercase rather than lower case filenames.
foreach $fn (@ARGV) {
$fn =~ m:.+/([0-9\.\-]+)/?$: || die "Bad command-line argument $fn\n";
$disk_id=$1;
$disk_id =~ tr/-\./__/; # replace - and . with _ so 11-10.1 becomes 11_10_1
$fn =~ s:/$::; # Remove final slash, just in case it is present.
$disk2fn{$disk_id} = $fn;
}
while(<STDIN>){
if(m/^;/){ next; } # Comment. Ignore it.
else {
m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_";
$disk=$1;
if(!defined $disk2fn{$disk}) {
die "Disk id $disk not found";
}
$filename = $2; # as a subdirectory of the distributed disk.
if($disk eq "13_16_1" && `hostname` =~ m/fit.vutbr.cz/) {
# The disk 13-16.1 has been uppercased for some reason, on the
# BUT system. This is a fix specifically for that case.
$filename =~ tr/a-z/A-Z/; # This disk contains all uppercase filenames. Why?
}
print "$disk2fn{$disk}/$filename\n";
}
}
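For orientation, a sketch of how this deleted script was invoked (the script's file name is not shown in this diff, so flist_to_abs_paths.pl below is a placeholder; the mount points are the ones from the comments above):
# cat tr_s_wv1.ndx | local/flist_to_abs_paths.pl \
#   /mnt/matylda2/data/WSJ0/11-1.1 /mnt/matylda2/data/WSJ0/11-10.1 > train_wav.flist
# turns '11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1' into
# '/mnt/matylda2/data/WSJ0/11-1.1/wsj0/si_tr_s/01i/01ic0201.wv1'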

View file

@ -1,36 +0,0 @@
#!/bin/bash
. cmd.sh
mfccdir=mfcc
# Make "per-utterance" versions of the test sets where the speaker
# information corresponds to utterances-- to demonstrate adaptation on
# short utterances, particularly for basis fMLLR
for x in "test" ; do
y=${x}_utt
rm -r data/$y
cp -r data/$x data/$y
cat data/$x/utt2spk | awk '{print $1, $1;}' > data/$y/utt2spk;
cp data/$y/utt2spk data/$y/spk2utt;
steps/compute_cmvn_stats.sh data/$y exp/make_mfcc/$y $mfccdir || exit 1;
done
# basis fMLLR experiments.
# First a baseline: decode per-utterance with normal fMLLR.
steps/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" \
exp/tri3b/graph_bg data/test_utt exp/tri3b/decode_bg_test_utt || exit 1;
# get the fMLLR basis.
steps/get_fmllr_basis.sh --cmd "$train_cmd" data/train data/lang exp/tri3b
# decoding tri3b with basis fMLLR
steps/decode_basis_fmllr.sh --nj 30 --cmd "$decode_cmd" \
exp/tri3b/graph_bg data/test exp/tri3b/decode_bg_test_basis || exit 1;
# The same, per-utterance.
steps/decode_basis_fmllr.sh --nj 30 --cmd "$decode_cmd" \
exp/tri3b/graph_bg data/test_utt exp/tri3b/decode_bg_test_basis_utt || exit 1;
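To make the per-utterance trick above concrete (illustrative IDs, not taken from this diff): the awk line maps each utterance to itself as its own "speaker", so fMLLR statistics are accumulated per utterance rather than per speaker.
# utt2spk before:               utt2spk (= spk2utt) after awk '{print $1, $1;}':
#   fadg0_si1279 fadg0            fadg0_si1279 fadg0_si1279
#   fadg0_si1909 fadg0            fadg0_si1909 fadg0_si1909
# With exactly one utterance per "speaker", spk2utt is identical to utt2spk,
# which is why the plain cp of utt2spk to spk2utt above is valid.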

View file

@ -1,41 +0,0 @@
#prepare reverse lexicon and language model for backwards decoding
utils/prepare_lang.sh --reverse true data/local/dict "<SPOKEN_NOISE>" data/local/lang_tmp.reverse data/lang.reverse || exit 1;
utils/reverse_lm.sh data/local/nist_lm/lm_bg_5k.arpa.gz data/lang.reverse data/lang_test_bg_5k.reverse || exit 1;
utils/reverse_lm_test.sh data/lang_test_bg_5k data/lang_test_bg_5k.reverse || exit 1;
# normal forward decoding
utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg5k
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --nj 8 --cmd "$decode_cmd" \
exp/tri2a/graph_bg5k data/test_eval92 exp/tri2a/decode_eval92_bg5k_10 || exit 1;
# backward decoding
utils/mkgraph.sh --reverse data/lang_test_bg_5k.reverse exp/tri2a exp/tri2a/graph_bg5k_r
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --reverse true --nj 8 --cmd "$decode_cmd" \
exp/tri2a/graph_bg5k_r data/test_eval92 exp/tri2a/decode_eval92_bg5k_reverse10 || exit 1;
# pingpong decoding
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --reverse true --nj 8 --cmd "$decode_cmd" \
--first_pass exp/tri2a/decode_eval92_bg5k_10 exp/tri2a/graph_bg5k_r data/test_eval92 exp/tri2a/decode_eval92_bg5k_pingpong10 || exit 1;
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --nj 8 --cmd "$decode_cmd" \
--first_pass exp/tri2a/decode_eval92_bg5k_reverse10 exp/tri2a/graph_bg5k data/test_eval92 exp/tri2a/decode_eval92_bg5k_pongping10 || exit 1;
# same for bigger language models (on machine with 8GB RAM, you can run the whole decoding in 3-4 min without SGE)
utils/prepare_lang.sh --reverse true data/local/dict_larger "<SPOKEN_NOISE>" data/local/lang_larger.reverse data/lang_bd.reverse || exit;
utils/reverse_lm.sh --lexicon data/local/dict_larger/lexicon.txt data/local/local_lm/3gram-mincount/lm_pr6.0.gz data/lang_bd.reverse data/lang_test_bd_tgpr.reverse || exit 1;
utils/reverse_lm_test.sh data/lang_test_bd_tgpr data/lang_test_bd_tgpr.reverse || exit 1;
utils/mkgraph.sh data/lang_test_bd_tgpr exp/tri2a exp/tri2a/graph_bd_tgpr
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --nj 4 --cmd run.pl \
exp/tri2a/graph_bd_tgpr data/test_eval92 exp/tri2a/decode_eval92_bdtgpr4_10 || exit 1;
utils/mkgraph.sh --reverse data/lang_test_bd_tgpr.reverse exp/tri2a exp/tri2a/graph_bd_tgpr_r
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --reverse true --nj 4 --cmd run.pl \
exp/tri2a/graph_bd_tgpr_r data/test_eval92 exp/tri2a/decode_eval92_bdtgpr4_reverse10 || exit 1;
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --reverse true --nj 4 --cmd run.pl \
--first_pass exp/tri2a/decode_eval92_bdtgpr4_10 exp/tri2a/graph_bd_tgpr_r data/test_eval92 \
exp/tri2a/decode_eval92_bdtgpr4_pingpong10 || exit 1;
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --nj 4 --cmd run.pl \
--first_pass exp/tri2a/decode_eval92_bdtgpr4_reverse10 exp/tri2a/graph_bd_tgpr data/test_eval92 \
exp/tri2a/decode_eval92_bdtgpr4_pongping10 || exit 1;

View file

@ -1,96 +0,0 @@
#!/bin/bash
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
. ./path.sh ## Source the tools/utils (import the queue.pl)
###
### Now we can train the Deep Neural Network in a hybrid setup
###
### The fMLLR features are
### -spliced,
### -decorrelated by LDA
### -rescaled by CMVN over dataset
###
#( # Train the MLP
dir=exp/tri3b_dnn
$cuda_cmd $dir/_train_nnet.log \
steps/train_nnet.sh --hid-layers 4 --hid-dim 1200 \
--apply-cmvn false --splice-lr 4 --feat-type lda --lda-dim 300 \
--learn-rate 0.008 --bunch-size 256 \
data-fmllr/train data-fmllr/test_test_sup data/lang exp/tri3b exp/tri3b_ali_test $dir || exit 1;
# we can use the graph from the baseline system, tri3b.
# decode. Note: these results are not valid as testing results, because
# the fMLLR transforms were estimated from the training transcripts.
steps/decode_nnet.sh --nj 10 --cmd "$decode_cmd" --acwt 0.10 \
exp/tri3b/graph_bg data-fmllr/test exp/tri3b_dnn/decode_bg_test
# decode with big dictionary.
utils/mkgraph.sh data/lang_test_bg exp/tri3b_dnn exp/tri3b_dnn/graph_bg || exit 1;
steps/decode_nnet.sh --nj 10 --cmd "$decode_cmd" --acwt 0.10 \
exp/tri3b_dnn/graph_bg data-fmllr/test exp/tri3b_dnn/decode_bg_test
#)
# Getting results [see RESULTS file]
# for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
#from here
#false && \
(
###
### First we need to generate the alignments,
###
### these are used as DNN training targets,
### also the fMLLR transforms are needed
###
# We don't really need the alignment directory, as tri4a was trained
# on si284 and already contains alignments.
#steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
# data/train_si284 data/lang exp/tri4a exp/tri4a_ali_si284 || exit 1
steps/align_fmllr.sh --nj 10 --cmd "$train_cmd" \
data/test data/lang exp/tri3b exp/tri3b_ali_test || exit 1 #dev
###
### As next step we store the fMLLR features, so we can train on them easily
###
gmmdir=exp/tri3b
# test set (using alignment-based fMLLR transforms)
dir=data-fmllr/test_test_sup
# generate the features
steps/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \
--transform-dir exp/tri3b_ali_test \
$dir data/test $gmmdir $dir/_log $dir/_data || exit 1
# train set
# generate the features
dir=data-fmllr/train
steps/make_fmllr_feats.sh --nj 20 --cmd "$train_cmd" \
--transform-dir exp/tri3b \
$dir data/train $gmmdir $dir/_log $dir/_data || exit 1
# test set (using fMLLR transforms from decoding)
dir=data-fmllr/test
steps/make_fmllr_feats.sh --nj 8 --cmd "$train_cmd" \
--transform-dir exp/tri3b/decode_bg_test \
$dir data/test $gmmdir $dir/_log $dir/_data || exit 1
)

View file

@ -1,57 +0,0 @@
#!/bin/bash
. ./cmd.sh
# Train and test MMI (and boosted MMI) on tri2b system.
steps/make_denlats.sh --sub-split 20 --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri2b exp/tri2b_denlats_train || exit 1;
# train the basic MMI system.
steps/train_mmi.sh --cmd "$train_cmd" \
data/train data/lang exp/tri2b_ali_train \
exp/tri2b_denlats_train exp/tri2b_mmi || exit 1;
for iter in 1 2 3 4; do
steps/decode_si.sh --nj 30 --cmd "$decode_cmd" --iter $iter \
exp/tri2b/graph_bg data/test exp/tri2b_mmi/decode_bg_test_it$iter &
done
# MMI with 0.1 boosting factor.
steps/train_mmi.sh --cmd "$train_cmd" --boost 0.1 \
data/train data/lang exp/tri2b_ali_train exp/tri2b_denlats_train \
exp/tri2b_mmi_b0.1 || exit 1;
for iter in 1 2 3 4; do
steps/decode_si.sh --nj 30 --cmd "$decode_cmd" --iter $iter \
exp/tri2b/graph_bg data/test exp/tri2b_mmi_b0.1/decode_bg_test_it$iter &
done
# Train a UBM with 400 components, for fMMI.
steps/train_diag_ubm.sh --silence-weight 0.5 --nj 30 --cmd "$train_cmd" \
400 data/train data/lang exp/tri2b_ali_train exp/dubm2b
steps/train_mmi_fmmi.sh --boost 0.1 --cmd "$train_cmd" \
data/train data/lang exp/tri2b_ali_train exp/dubm2b exp/tri2b_denlats_train \
exp/tri2b_fmmi_b0.1
for iter in 1 2 3 4; do
steps/decode_fmmi.sh --nj 30 --cmd "$decode_cmd" --iter $iter \
exp/tri2b/graph_bg data/test exp/tri2b_fmmi_b0.1/decode_bg_test_it$iter &
done
steps/train_mmi_fmmi.sh --learning-rate 0.005 --boost 0.1 --cmd "$train_cmd" \
data/train data/lang exp/tri2b_ali_train exp/dubm2b exp/tri2b_denlats_train \
exp/tri2b_fmmi_b0.1_lr0.005 || exit 1;
for iter in 1 2 3 4; do
steps/decode_fmmi.sh --nj 30 --cmd "$decode_cmd" --iter $iter \
exp/tri2b/graph_bg data/test exp/tri2b_fmmi_b0.1_lr0.005/decode_bg_test_it$iter &
done
steps/train_mmi_fmmi_indirect.sh --boost 0.1 --cmd "$train_cmd" \
data/train data/lang exp/tri2b_ali_train exp/dubm2b exp/tri2b_denlats_train \
exp/tri2b_fmmi_indirect_b0.1
for iter in 1 2 3 4; do
steps/decode_fmmi.sh --nj 30 --cmd "$decode_cmd" --iter $iter \
exp/tri2b/graph_bg data/test exp/tri2b_fmmi_indirect_b0.1/decode_bg_test_it$iter &
done
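Background on --boost 0.1 (a summary of the standard boosted-MMI objective from the literature, not text from this commit): boosting scales down each denominator path's likelihood by exp(-b * A(s, s_r)), where A is a phone-level accuracy of hypothesis s against the reference s_r, so hypotheses with more errors take a larger share of the denominator and training gains a margin against them. Schematically, per utterance r:

  F_bMMI = sum_r log [ p(x_r | s_r)^k * P(s_r) / sum_s p(x_r | s)^k * P(s) * exp(-b * A(s, s_r)) ]

with b = 0.1 in the runs above and k the acoustic scale.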

View file

@ -1,41 +0,0 @@
#!/bin/bash
. ./cmd.sh
[ -f path.sh ] && . ./path.sh
steps/make_denlats.sh --nj 30 --sub-split 30 --cmd "$train_cmd" \
--transform-dir exp/tri3b_ali_train \
data/train data/lang exp/tri3b exp/tri3b_denlats_train || exit 1;
steps/train_mmi.sh --cmd "$train_cmd" --boost 0.1 \
data/train data/lang exp/tri3b_ali_train exp/tri3b_denlats_train \
exp/tri3b_mmi_b0.1 || exit 1;
steps/decode.sh --nj 30 --cmd "$decode_cmd" --transform-dir exp/tri3b/decode_bg_test \
exp/tri3b/graph_tgpr data/test exp/tri3b_mmi_b0.1/decode_bg_test
#first, train UBM for fMMI experiments.
steps/train_diag_ubm.sh --silence-weight 0.5 --nj 30 --cmd "$train_cmd" \
600 data/train data/lang exp/tri3b_ali_train exp/dubm3b
# Next, fMMI+MMI.
steps/train_mmi_fmmi.sh \
--boost 0.1 --cmd "$train_cmd" data/train data/lang exp/tri3b_ali_train exp/dubm3b exp/tri3b_denlats_train exp/tri3b_fmmi_a || exit 1;
for iter in 1 2 3 4; do
steps/decode_fmmi.sh --nj 30 --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri3b/decode_bg_test exp/tri3b/graph_bg data/test \
exp/tri3b_fmmi_a/decode_bg_test_it$iter
done
# fMMI + mmi with indirect differential.
steps/train_mmi_fmmi_indirect.sh --boost 0.1 --cmd "$train_cmd" \
data/train data/lang exp/tri3b_ali_train exp/dubm3b exp/tri3b_denlats_train \
exp/tri3b_fmmi_indirect || exit 1;
for iter in 1 2 3 4; do
steps/decode_fmmi.sh --nj 30 --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri3b/decode_bg_test exp/tri3b/graph_bg data/test \
exp/tri3b_fmmi_indirect/decode_bg_test_it$iter
done

View file

@ -1,35 +0,0 @@
#!/bin/bash
. ./cmd.sh
( # I'm using basically the same setup as for Switchboard 100 hours,
# but slightly fewer parameters (8M -> 7M) as we have slightly less
# data (81 hours).
steps/train_nnet_cpu.sh \
--mix-up 8000 \
--initial-learning-rate 0.01 --final-learning-rate 0.001 \
--num-jobs-nnet 16 --num-hidden-layers 4 \
--num-parameters 7000000 \
--cmd "$decode_cmd" \
data/train_si284 data/lang exp/tri4b_ali_si284 exp/nnet5c1 || exit 1
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 10 \
--transform-dir exp/tri4b/decode_bd_tgpr_dev93 \
exp/tri4b/graph_bd_tgpr data/test_dev93 exp/nnet5c1/decode_bd_tgpr_dev93
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 8 \
--transform-dir exp/tri4b/decode_bd_tgpr_dev93 \
exp/tri4b/graph_bd_tgpr data/test_dev93 exp/nnet5c1/decode_bd_tgpr_dev93
)
(
steps/train_nnet_cpu_mmi.sh --boost 0.1 --initial-learning-rate 0.001 \
--minibatch-size 128 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 \
data/train data/lang exp/tri5c1_nnet exp/tri5c1_nnet exp/tri5c1_denlats exp/tri5c1_mmi_a
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test exp/tri5c1_mmi_a/decode
)&

View file

@ -1,42 +0,0 @@
#!/bin/bash
for test in dev93 eval92; do
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_bd_tgpr data/lang_test_bd_fg \
data/test_${test} exp/sgmm5b_mmi_b0.1/decode_bd_tgpr_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 || exit 1;
# Note: for N-best-list generation, choosing the acoustic scale (12) that gave
# the best WER on this test set. Ideally we should do this on a dev set.
# This step interpolates a small RNNLM (with weight 0.25) with the 4-gram LM.
steps/rnnlmrescore.sh \
--N 100 --cmd "$decode_cmd" --inv-acwt 12 \
0.25 data/lang_test_bd_fg data/local/rnnlm.h30.voc10k data/test_${test} \
exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm30_0.25 \
|| exit 1;
steps/rnnlmrescore.sh \
--N 100 --cmd "$decode_cmd" --inv-acwt 12 \
0.5 data/lang_test_bd_fg data/local/rnnlm.h100.voc20k data/test_${test} \
exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm100_0.5 \
|| exit 1;
steps/rnnlmrescore.sh \
--N 100 --cmd "$decode_cmd" --inv-acwt 12 \
0.5 data/lang_test_bd_fg data/local/rnnlm.h200.voc30k data/test_${test} \
exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm200_0.5 \
|| exit 1;
steps/rnnlmrescore.sh \
--N 100 --cmd "$decode_cmd" --inv-acwt 12 \
0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_${test} \
exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm300_0.5 \
|| exit 1;
steps/rnnlmrescore.sh \
--N 100 --cmd "$decode_cmd" --inv-acwt 12 \
0.75 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_${test} \
exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm300_0.75 \
|| exit 1;
done
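For clarity on the positional weight (0.25/0.5/0.75 above): steps/rnnlmrescore.sh linearly interpolates the two language models over the N-best lists, which (assuming the standard formulation; the mechanics live inside the script) amounts to

  P(W) = w * P_rnnlm(W) + (1 - w) * P_4gram(W)

so w = 0.25 leans mostly on the 4-gram LM and w = 0.75 mostly on the RNNLM.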

View file

@ -1,64 +0,0 @@
#!/bin/bash
. cmd.sh
# This step interpolates a small RNNLM (with weight 0.25) with the 4-gram LM.
steps/rnnlmrescore.sh \
--N 100 --cmd "$decode_cmd" --inv-acwt 17 \
0.25 data/lang_test_bd_fg data/local/rnnlm.h30.voc10k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm30_0.25 \
|| exit 1;
steps/rnnlmrescore.sh \
--N 100 --cmd "$decode_cmd" --inv-acwt 17 \
0.5 data/lang_test_bd_fg data/local/rnnlm.h100.voc20k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm100_0.5 \
|| exit 1;
steps/rnnlmrescore.sh \
--N 100 --cmd "$decode_cmd" --inv-acwt 17 \
0.5 data/lang_test_bd_fg data/local/rnnlm.h200.voc30k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm200_0.5 \
|| exit 1;
steps/rnnlmrescore.sh \
--N 100 --cmd "$decode_cmd" --inv-acwt 17 \
0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5 \
|| exit 1;
steps/rnnlmrescore.sh \
--N 1000 --cmd "$decode_cmd" --inv-acwt 17 \
0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5_N1000
dir=exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.75_N1000
rm -rf $dir
cp -r exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5_N1000 $dir
steps/rnnlmrescore.sh \
--stage 7 --N 1000 --cmd "$decode_cmd" --inv-acwt 17 \
0.75 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg $dir
dir=exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.75
rm -rf $dir
cp -r exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5 $dir
steps/rnnlmrescore.sh \
--stage 7 --N 100 --cmd "$decode_cmd" --inv-acwt 17 \
0.75 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg $dir
dir=exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.25
rm -rf $dir
cp -r exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5 $dir
steps/rnnlmrescore.sh \
--stage 7 --N 100 --cmd "$decode_cmd" --inv-acwt 17 \
0.25 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg $dir
steps/rnnlmrescore.sh \
--N 10 --cmd "$decode_cmd" --inv-acwt 17 \
0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \
exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5_N10 \
|| exit 1;

View file

@ -1,113 +0,0 @@
#!/bin/bash
# This script is invoked from ../run.sh
# It contains some SGMM-related scripts that I am breaking out of the main run.sh for clarity.
. cmd.sh
# SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for
# training, but this shouldn't have much effect.
(
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train_si84 data/lang exp/tri4b exp/tri4b_ali_si84 || exit 1;
steps/train_ubm.sh --cmd "$train_cmd" \
400 data/train_si84 data/lang exp/tri4b_ali_si84 exp/ubm5a || exit 1;
steps/train_sgmm.sh --cmd "$train_cmd" \
3500 10000 data/train_si84 data/lang exp/tri4b_ali_si84 \
exp/ubm5b/final.ubm exp/sgmm5a || exit 1;
(
utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5a exp/sgmm5a/graph_tgpr
steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
exp/sgmm5a/graph_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93
) &
steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si84 \
--use-graphs true --use-gselect true data/train_si84 data/lang exp/sgmm5a exp/sgmm5a_ali_si84 || exit 1;
steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 \
data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84
steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \
data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1
for iter in 1 2 3 4; do
steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \
exp/sgmm5a_mmi_b0.1/decode_tgpr_dev93_it$iter &
done
steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \
--update-opts "--cov-min-value=0.9" data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1_m0.9
for iter in 1 2 3 4; do
steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \
exp/sgmm5a_mmi_b0.1_m0.9/decode_tgpr_dev93_it$iter &
done
) &
(
# The next commands are the same thing on all the si284 data.
# SGMM system on the si284 data [sgmm5b]
steps/train_ubm.sh --cmd "$train_cmd" \
600 data/train_si284 data/lang exp/tri4b_ali_si284 exp/ubm5b || exit 1;
steps/train_sgmm.sh --cmd "$train_cmd" \
5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \
exp/ubm5b/final.ubm exp/sgmm5b || exit 1;
(
utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5b exp/sgmm5b/graph_tgpr
steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
exp/sgmm5b/graph_tgpr data/test_dev93 exp/sgmm5b/decode_tgpr_dev93
steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_eval92 \
exp/sgmm5b/graph_tgpr data/test_eval92 exp/sgmm5b/decode_tgpr_eval92
utils/mkgraph.sh data/lang_test_bd_tgpr exp/sgmm5b exp/sgmm5b/graph_bd_tgpr || exit 1;
steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_dev93 \
exp/sgmm5b/graph_bd_tgpr data/test_dev93 exp/sgmm5b/decode_bd_tgpr_dev93
steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_eval92 \
exp/sgmm5b/graph_bd_tgpr data/test_eval92 exp/sgmm5b/decode_bd_tgpr_eval92
) &
steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si284 \
--use-graphs true --use-gselect true data/train_si284 data/lang exp/sgmm5b exp/sgmm5b_ali_si284
steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 \
data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284
steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \
data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284 exp/sgmm5b_mmi_b0.1
for iter in 1 2 3 4; do
for test in dev93 eval92; do
steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri4b/decode_tgpr_${test} data/lang_test_tgpr data/test_${test} exp/sgmm5b/decode_tgpr_${test} \
exp/sgmm5b_mmi_b0.1/decode_tgpr_${test}_it$iter &
steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_tgpr data/test_${test} exp/sgmm5b/decode_bd_tgpr_${test} \
exp/sgmm5b_mmi_b0.1/decode_bd_tgpr_${test}_it$iter &
done
done
) &
# Train quinphone SGMM system.
steps/train_sgmm.sh --cmd "$train_cmd" \
--context-opts "--context-width=5 --central-position=2" \
5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \
exp/ubm5b/final.ubm exp/sgmm5c || exit 1;
# Decode from lattices in exp/sgmm5a/decode_tgpr_dev93.
steps/decode_sgmm_fromlats.sh --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
data/test_dev93 data/lang_test_tgpr exp/sgmm5a/decode_tgpr_dev93 exp/sgmm5c/decode_tgpr_dev93
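A note on the quinphone options, following the usual Kaldi convention for --context-opts (not spelled out in the script itself):
# --context-width=5 --central-position=2 (0-based): for a phone sequence a b c d e,
# the unit for c is modeled with the window (a, b, [c], d, e);
# the triphone default is --context-width=3 --central-position=1, i.e. (b, [c], d).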

View file

@ -1,74 +0,0 @@
#!/bin/bash
# This script is invoked from ../run.sh
# It contains some SGMM-related scripts that I am breaking out of the main run.sh for clarity.
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
[ -f path.sh ] && . ./path.sh
# Note: you might want to try to give the option --spk-dep-weights=false to train_sgmm2.sh;
# this takes out the "symmetric SGMM" part which is not always helpful.
# SGMM system on train data [sgmm4a]. Note: the system we aligned from used the train data for training, but this shouldn't have much effect.
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri3b exp/tri3b_ali_train || exit 1;
steps/train_ubm.sh --cmd "$train_cmd" \
400 data/train data/lang exp/tri3b_ali_train exp/ubm4a || exit 1;
steps/train_sgmm2.sh --cmd "$train_cmd" \
7000 9000 data/train data/lang exp/tri3b_ali_train \
exp/ubm4a/final.ubm exp/sgmm2_4a || exit 1;
utils/mkgraph.sh data/lang_test_bg exp/sgmm2_4a exp/sgmm2_4a/graph_bg
steps/decode_sgmm2.sh --nj 30 --cmd "$decode_cmd" --transform-dir exp/tri3b/decode_bg_test \
exp/sgmm2_4a/graph_bg data/test exp/sgmm2_4a/decode_bg_test
steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri3b_ali_train \
--use-graphs true --use-gselect true data/train data/lang exp/sgmm2_4a exp/sgmm2_4a_ali_train || exit 1;
steps/make_denlats_sgmm2.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri3b_ali_train \
data/train data/lang exp/sgmm2_4a_ali_train exp/sgmm2_4a_denlats_train
steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri3b_ali_train --boost 0.1 \
data/train data/lang exp/sgmm2_4a_ali_train exp/sgmm2_4a_denlats_train exp/sgmm2_4a_mmi_b0.1
for iter in 1 2 3 4; do
for test in "test"; do # dev93
steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri3b/decode_bg_${test} data/lang_test_bg data/${test} exp/sgmm2_4a/decode_bg_${test} exp/sgmm2_4a_mmi_b0.1/decode_bg_${test}_it$iter
done
done
# steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri3b_ali_train --boost 0.1 \
# --update-opts "--cov-min-value=0.9" data/train data/lang exp/sgmm2_4a_ali_train exp/sgmm2_4a_denlats_train exp/sgmm2_4a_mmi_b0.1_m0.9
steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri3b_ali_train --boost 0.1 \
--zero-if-disjoint true data/train data/lang exp/sgmm2_4a_ali_train exp/sgmm2_4a_denlats_train exp/sgmm2_4a_mmi_b0.1_z
for iter in 1 2 3 4; do
for test in "test"; do #dev93
steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri3b/decode_bg_${test} data/lang_test_bg data/${test} exp/sgmm2_4a/decode_bg_${test} \
exp/sgmm2_4a_mmi_b0.1_z/decode_bg_${test}_it$iter
done
done
# Examples of combining some of the best decodings: SGMM+MMI with
# MMI+fMMI on a conventional system.
local/score_combine.sh data/test \
data/lang_test_bg \
exp/tri3b_fmmi_a/decode_bg_test_it1 \
exp/sgmm2_4a_mmi_b0.1/decode_bg_test_it1 \
exp/combine_tri3b_fmmi_a_sgmm2_4a_mmi_b0.1/decode_bg_test_it1_1
# Checking MBR decode of baseline:
cp -r -T exp/sgmm2_4a_mmi_b0.1/decode_bg_test_it3{,.mbr}
local/score_mbr.sh data/test data/lang_test_bg exp/sgmm2_4a_mmi_b0.1/decode_bg_test_it3.mbr

View file

@ -1,61 +0,0 @@
#!/bin/bash
# Script for minimum bayes risk decoding.
[ -f ./path.sh ] && . ./path.sh;
# begin configuration section.
cmd=run.pl
min_lmwt=1
max_lmwt=10
#end configuration section.
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: local/score_mbr.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
echo " Options:"
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
exit 1;
fi
data=$1
lang_or_graph=$2
dir=$3
symtab=$lang_or_graph/words.txt
for f in $symtab $dir/lat.1.gz $data/text; do
[ ! -f $f ] && echo "score_mbr.sh: no such file $f" && exit 1;
done
mkdir -p $dir/scoring/log
phonemap="conf/phones.60-48-39.map"
cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt
# We submit the jobs separately, not as an array, because it's hard
# to get the inverse of the LM scales.
rm $dir/.error 2>/dev/null
for inv_acwt in `seq $min_lmwt $max_lmwt`; do
acwt=`perl -e "print (1.0/$inv_acwt);"`
$cmd $dir/scoring/rescore_mbr.${inv_acwt}.log \
lattice-mbr-decode --acoustic-scale=$acwt --word-symbol-table=$symtab \
"ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/${inv_acwt}.tra \
|| touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo "score_mbr.sh: error getting MBR output.";
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
local/timit_norm_trans.pl -i - -m $phonemap -from 48 -to 39 \| \
compute-wer --text --mode=present \
ark:$dir/scoring/test_filt.txt ark,p:- ">" $dir/wer_LMWT || exit 1;
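The loop's inversion is plain arithmetic: the lattices carry the LM scores unscaled, so sweeping the acoustic scale over 1/N is equivalent to sweeping the LM weight over N, and each wer_N output corresponds to acwt = 1/N:
# inv_acwt (LM weight):  1     4      10
# acwt passed in:        1.0   0.25   0.1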

View file

@ -1,97 +1,109 @@
#!/bin/bash
# Copyright 2013 (Author: Bagher BabaAli)
# Copyright 2013 (Authors: Bagher BabaAli, Daniel Povey, Arnab Ghoshal)
# Apache 2.0.
if [ $# -ne 1 ]; then
echo "Argument should be the Timit directory, see ../run.sh for example."
exit 1;
fi
dir=`pwd`/data/local/data
mkdir -p $dir
lmdir=`pwd`/data/local/nist_lm
mkdir -p $dir $lmdir
local=`pwd`/local
utils=`pwd`/utils
conf=`pwd`/conf
. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
cd $dir
[ -f $conf/test_spk.list ] || error_exit "$PROG: Eval-set speaker list not found.";
[ -f $conf/dev_spk.list ] || error_exit "$PROG: Dev-set speaker list not found.";
# Make directory of links to the TIMIT disk. This relies on the command
# line arguments being absolute pathnames.
rm -r links/ 2>/dev/null
mkdir links/
ln -s $* links
# Do some basic checks that we have what we expected.
if [ ! -d $*/TRAIN -o ! -d $*/TEST ]; then
# First check if the train & test directories exist (these can either be upper-
# or lower-cased
if [ ! -d $*/TRAIN -o ! -d $*/TEST ] && [ ! -d $*/train -o ! -d $*/test ]; then
echo "timit_data_prep.sh: Spot check of command line argument failed"
echo "Command line argument must be absolute pathname to TIMIT directory"
echo "with name like /export/corpora5/LDC/LDC93S1/timit/TIMIT"
exit 1;
fi
# This version for TRAIN
# Now check what case the directory structure is
uppercased=false
train_dir=train
test_dir=test
if [ -d $*/TRAIN ]; then
[ -d $*/train -o -d $*/test ] \
&& echo "Error: Found both upper- & lower-cased directories" && exit 1;
uppercased=true
train_dir=TRAIN
test_dir=TEST
fi
TrainDir=$*/TRAIN
find -L $TrainDir \( -iname '*.WAV' -o -iname '*.wav' \) > train.flist
nl=`cat train.flist | wc -l`
[ "$nl" -eq 4620 ] || echo "Warning: expected 4620 lines in train.flist, got $nl"
tmpdir=$(mktemp -d);
trap 'rm -rf "$tmpdir"' EXIT
# Now for the TEST.
# Get the list of speakers. The list of speakers in the 24-speaker core test
# set and the 50-speaker development set must be supplied to the script. All
# speakers in the 'train' directory are used for training.
if $uppercased; then
tr '[:lower:]' '[:upper:]' < $conf/dev_spk.list > $tmpdir/dev_spk
tr '[:lower:]' '[:upper:]' < $conf/test_spk.list > $tmpdir/test_spk
ls -d "$*"/TRAIN/DR*/* | sed -e "s:^.*/::" > $tmpdir/train_spk
else
tr '[:upper:]' '[:lower:]' < $conf/dev_spk.list > $tmpdir/dev_spk
tr '[:upper:]' '[:lower:]' < $conf/test_spk.list > $tmpdir/test_spk
ls -d "$*"/train/dr*/* | sed -e "s:^.*/::" > $tmpdir/train_spk
fi
TestDir=$*/TEST
find -L $TestDir \( -iname '*.WAV' -o -iname '*.wav' \) > test.flist
cd $dir
for x in train dev test; do
# First, find the list of audio files (use only si & sx utterances).
# Note: train & test sets are under different directories, but doing find on
# both and grepping for the speakers will work correctly.
nl=`cat test.flist | wc -l`
[ "$nl" -eq 1680 ] || echo "Warning: expected 1680 lines in test.flist, got $nl"
find $*/{$train_dir,$test_dir} -not \( -iname 'SA*' \) -iname '*.WAV' \
| grep -f $tmpdir/${x}_spk > ${x}_sph.flist
sed -e 's:.*/\(.*\)/\(.*\).WAV$:\1_\2:i' ${x}_sph.flist \
> $tmpdir/${x}_sph.uttids
paste $tmpdir/${x}_sph.uttids ${x}_sph.flist \
| sort -k1,1 > ${x}_sph.scp
# Finding the transcript files:
find -L $TrainDir \( -iname '*.PHN' -o -iname '*.phn' \) > train_phn.flist
find -L $TestDir \( -iname '*.PHN' -o -iname '*.phn' \) > test_phn.flist
cat ${x}_sph.scp | awk '{print $1}' > ${x}.uttids
# Convert the transcripts into our format (no normalization yet)
for x in train test; do
$local/timit_flist2scp.pl $x.flist | sort > ${x}_sph.scp
cat ${x}_sph.scp | awk '{print $1}' > ${x}.uttids
cat ${x}.uttids | $local/timit_find_transcripts.pl ${x}_phn.flist > ${x}_phn.trans
done
# Now, convert the transcripts into our format (no normalization yet)
# Get the transcripts: each line of the output contains an utterance
# ID followed by the transcript.
find $*/{$train_dir,$test_dir} -not \( -iname 'SA*' \) -iname '*.PHN' \
| grep -f $tmpdir/${x}_spk > $tmpdir/${x}_phn.flist
sed -e 's:.*/\(.*\)/\(.*\).PHN$:\1_\2:i' $tmpdir/${x}_phn.flist \
> $tmpdir/${x}_phn.uttids
while read line; do
[ -f $line ] || error_exit "Cannot find transcription file '$line'";
cut -f3 -d' ' "$line" | tr '\n' ' ' | sed -e 's: *$:\n:'
done < $tmpdir/${x}_phn.flist > $tmpdir/${x}_phn.trans
paste $tmpdir/${x}_phn.uttids $tmpdir/${x}_phn.trans \
| sort -k1,1 > ${x}.trans
# Do normalization steps.
cat train_phn.trans | $local/timit_norm_trans.pl -i - -m $conf/phones.60-48-39.map -to 48 | sort > train.txt || exit 1;
cat ${x}.trans | $local/timit_norm_trans.pl -i - -m $conf/phones.60-48-39.map -to 39 | sort > $x.txt || exit 1;
for x in test; do
cat ${x}_phn.trans | $local/timit_norm_trans.pl -i - -m $conf/phones.60-48-39.map -to 39 | sort > $x.txt || exit 1;
done
# Create scp's with wav's.
for x in train test; do
awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp > ${x}_wav.scp
done
# Make the utt2spk and spk2utt files.
for x in train test; do
cut -f1 -d'_' $x.uttids | paste -d' ' $x.uttids - > $x.utt2spk
cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
cut -f1 -d'_' $x.uttids | paste -d' ' $x.uttids - > $x.utt2spk
cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
cat $x.spk2utt | awk '{print $1}' | perl -ane 'chop; m:^.:; print "$_ $&\n";' > $x.spk2gender
done
# Make the spk2gender files.
for x in train test; do
cat $x.spk2utt | awk '{print $1}' | perl -ane 'chop; m:^.:; print "$_ $&\n";' > $x.spk2gender
done
echo "Data preparation succeeded"
echo "Data preparation succeeded"

View file

@ -1,60 +0,0 @@
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program takes on its standard input a list of utterance
# id's, one for each line. (e.g. 4k0c030a is a an utterance id).
# It takes as
# Extracts from the phn files the transcripts for a given
# dataset (represented by a file list).
#
@ARGV == 1 || die "timit_find_transcripts.pl phn_trans_flist < utterance_ids > transcripts";
$phn_flist = shift @ARGV;
open(L, "<$phn_flist") || die "Opening file list of phn files: $phn_flist\n";
while(<L>){
chop;
m:^\S+/(\w+)/(\w+)\.[pP][hH][nN]$: || die "Bad line in phn file list: $_";
$spk = $1 . "_" . $2;
$spk2phn{$spk} = $_;
}
%utt2trans = ();
while(<STDIN>){
chop;
$uttid = $_;
$uttid =~ m:(\w+)_(\w+): || die "Bad utterance id $_";
$phnfile = $spk2phn{$uttid};
defined $phnfile || die "No phn file for utterance $uttid\n";
open(F, "<$phnfile") || die "Error opening phn file $phnfile\n";
@trans = ();
while(<F>) {
$_ =~ m:\d+\s\d+\s(.+)$: || die "Bad line $_ in phn file $phnfile (line $.)\n";
push (@trans,$1);
}
$utt2trans{$uttid} = join(" ",@trans);
if(!defined $utt2trans{$uttid}) {
print STDERR "No transcript for utterance $uttid (current phn file is $phnfile)\n";
} else {
print "$uttid $utt2trans{$uttid}\n";
}
close(F);
}

View file

@ -1,30 +0,0 @@
#!/usr/bin/perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# takes in a file list with lines like
# /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
# and outputs an scp in kaldi format with lines like
# 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
# (the first thing is the utterance-id, which is the same as the basename of the file.
while(<>){
m:^\S+/(\w+)/(\w+)\.[wW][aA][vV]$: || die "Bad line $_";
$id = $1 . "_" . $2;
print "$id $_";
}

View file

@ -1,6 +1,6 @@
#!/bin/bash
# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Copyright 2013 (Author: Daniel Povey)
# Apache 2.0
# This script takes data prepared in a corpus-dependent way
@ -9,14 +9,14 @@
. ./path.sh || exit 1;
echo "Preparing train and test data"
echo "Preparing train, dev and test data"
srcdir=data/local/data
lmdir=data/local/nist_lm
tmpdir=data/local/lm_tmp
lexicon=data/local/dict/lexicon.txt
mkdir -p $tmpdir
for x in train test; do
for x in train dev test; do
mkdir -p data/$x
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
cp $srcdir/$x.txt data/$x/text || exit 1;
@ -25,7 +25,6 @@ for x in train test; do
utils/filter_scp.pl data/$x/spk2utt $srcdir/$x.spk2gender > data/$x/spk2gender || exit 1;
done
# Next, for each type of language model, create the corresponding FST
# and the corresponding lang_test_* directory.
@ -53,10 +52,10 @@ for lm_suffix in bg; do
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# Everything below is only for diagnostics.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
mkdir -p $tmpdir/g
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lexicon" >$tmpdir/g/select_empty.fst.txt

View file

@ -1,6 +1,6 @@
#!/bin/bash
# Copyright 2013 (Author: Bagher BabaAli)
# Copyright 2013 (Authors: Daniel Povey, Bagher BabaAli)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -60,9 +60,7 @@ cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_))
$p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
>> $dir/extra_questions.txt || exit 1;
# (2) Create the phone bigram LM
#(
[ -z "$IRSTLM" ] && \
echo "LM building won't work without setting the IRSTLM env variable" && exit 1;
! which build-lm.sh 2>/dev/null && \
@ -76,8 +74,4 @@ cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_))
compile-lm $tmpdir/lm_phone_bg.ilm.gz --text yes /dev/stdout | \
grep -v unk | gzip -c > $lmdir/lm_phone_bg.arpa.gz
#) >& data/prepare_lm.log
echo "Dictionary preparation succeeded"
echo "Dictionary & language model preparation succeeded"

View file

@ -5,13 +5,27 @@
. ./cmd.sh
[ -f path.sh ] && . ./path.sh
# Acoustic model parameters
numLeavesTri1=2500
numGaussTri1=15000
numLeavesMLLT=2500
numGaussMLLT=15000
numLeavesSAT=2500
numGaussSAT=15000
numGaussUBM=400
numLeavesSGMM=7000
numGaussSGMM=9000
decode_nj=20
train_nj=30
echo ============================================================================
echo " Data & Lexicon & Language Preparation "
echo ============================================================================
timit=/export/corpora5/LDC/LDC93S1/timit/TIMIT
local/timit_data_prep.sh $timit || exit 1;
local/timit_prepare_dict.sh || exit 1;
@ -21,119 +35,178 @@ utils/prepare_lang.sh --position-dependent-phones false --num-sil-states 3 \
local/timit_format_data.sh || exit 1;
echo ============================================================================
echo " MFCC Feature Extration & CMVN for Training and Test set "
echo " MFCC Feature Extration & CMVN for Training and Test set "
echo ============================================================================
# Now make MFCC features.
mfccdir=mfcc
for x in test train; do
steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 \
data/$x exp/make_mfcc/$x $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
use_pitch=false
use_ffv=false
for x in train dev test; do
steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 data/$x exp/make_mfcc/$x $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
done
echo ============================================================================
echo " MonoPhone Training & Decoding "
echo ============================================================================
steps/train_mono.sh --nj 30 --cmd "$train_cmd" data/train data/lang exp/mono || exit 1;
steps/train_mono.sh --nj "$train_nj" --cmd "$train_cmd" data/train data/lang exp/mono || exit 1;
utils/mkgraph.sh --mono data/lang_test_bg exp/mono exp/mono/graph_bg || exit 1;
utils/mkgraph.sh --mono data/lang_test_bg exp/mono exp/mono/graph || exit 1;
steps/decode.sh --nj 30 --cmd "$decode_cmd" \
exp/mono/graph_bg data/test exp/mono/decode_bg_test || exit 1;
steps/decode.sh --nj "$decode_nj" --cmd "$decode_cmd" \
exp/mono/graph data/dev exp/mono/decode_dev || exit 1;
steps/decode.sh --nj "$decode_nj" --cmd "$decode_cmd" \
exp/mono/graph data/test exp/mono/decode_test || exit 1;
echo ============================================================================
echo " tri1 : Deltas + Delta-Deltas Training & Decoding "
echo ============================================================================
steps/align_si.sh --boost-silence 1.25 --nj 30 --cmd "$train_cmd" \
steps/align_si.sh --boost-silence 1.25 --nj "$train_nj" --cmd "$train_cmd" \
data/train data/lang exp/mono exp/mono_ali || exit 1;
# Train tri1, which is deltas + delta-deltas, on train data.
steps/train_deltas.sh --cmd "$train_cmd" \
2500 15000 data/train data/lang exp/mono_ali exp/tri1 || exit 1;
$numLeavesTri1 $numGaussTri1 data/train data/lang exp/mono_ali exp/tri1 || exit 1;
utils/mkgraph.sh data/lang_test_bg exp/tri1 exp/tri1/graph_bg || exit 1;
utils/mkgraph.sh data/lang_test_bg exp/tri1 exp/tri1/graph || exit 1;
steps/decode.sh --nj 30 --cmd "$decode_cmd" \
exp/tri1/graph_bg data/test exp/tri1/decode_bg_test || exit 1;
steps/decode.sh --nj "$decode_nj" --cmd "$decode_cmd" \
exp/tri1/graph data/dev exp/tri1/decode_dev || exit 1;
steps/decode.sh --nj "$decode_nj" --cmd "$decode_cmd" \
exp/tri1/graph data/test exp/tri1/decode_test || exit 1;
echo ============================================================================
echo " tri2 : LDA + MLLT Training & Decoding "
echo ============================================================================
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri1 exp/tri1_ali_train || exit 1;
steps/align_si.sh --nj "$train_nj" --cmd "$train_cmd" \
data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
steps/train_lda_mllt.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" \
2500 15000 data/train data/lang exp/tri1_ali_train exp/tri2 || exit 1;
$numLeavesMLLT $numGaussMLLT data/train data/lang exp/tri1_ali exp/tri2 || exit 1;
utils/mkgraph.sh data/lang_test_bg exp/tri2 exp/tri2/graph_bg || exit 1;
utils/mkgraph.sh data/lang_test_bg exp/tri2 exp/tri2/graph || exit 1;
steps/decode.sh --nj 30 --cmd "$decode_cmd" \
exp/tri2/graph_bg data/test exp/tri2/decode_bg_test || exit 1;
steps/decode.sh --nj "$decode_nj" --cmd "$decode_cmd" \
exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1;
steps/decode.sh --nj "$decode_nj" --cmd "$decode_cmd" \
exp/tri2/graph data/test exp/tri2/decode_test || exit 1;
echo ============================================================================
echo " tri3 : LDA + MLLT + SAT Training & Decoding "
echo ============================================================================
# Align tri2 system with train data.
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
--use-graphs true data/train data/lang exp/tri2 exp/tri2_ali_train || exit 1;
steps/align_si.sh --nj "$train_nj" --cmd "$train_cmd" \
--use-graphs true data/train data/lang exp/tri2 exp/tri2_ali || exit 1;
# From tri2 system, train tri3 which is LDA + MLLT + SAT.
steps/train_sat.sh --cmd "$train_cmd" \
2500 15000 data/train data/lang exp/tri2_ali_train exp/tri3 || exit 1;
$numLeavesSAT $numGaussSAT data/train data/lang exp/tri2_ali exp/tri3 || exit 1;
utils/mkgraph.sh data/lang_test_bg exp/tri3 exp/tri3/graph_bg || exit 1;
utils/mkgraph.sh data/lang_test_bg exp/tri3 exp/tri3/graph || exit 1;
steps/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" \
exp/tri3/graph_bg data/test exp/tri3/decode_bg_test || exit 1;
steps/decode_fmllr.sh --nj "$decode_nj" --cmd "$decode_cmd" \
exp/tri3/graph data/dev exp/tri3/decode_dev || exit 1;
steps/decode_fmllr.sh --nj "$decode_nj" --cmd "$decode_cmd" \
exp/tri3/graph data/test exp/tri3/decode_test || exit 1;
echo ============================================================================
echo " SGMM2 Training & Decoding "
echo ============================================================================
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri3 exp/tri3_ali_train || exit 1;
steps/align_fmllr.sh --nj "$train_nj" --cmd "$train_cmd" \
data/train data/lang exp/tri3 exp/tri3_ali || exit 1;
steps/train_ubm.sh --cmd "$train_cmd" \
400 data/train data/lang exp/tri3_ali_train exp/ubm4 || exit 1;
$numGaussUBM data/train data/lang exp/tri3_ali exp/ubm4 || exit 1;
steps/train_sgmm2.sh --cmd "$train_cmd" 7000 9000 \
data/train data/lang exp/tri3_ali_train exp/ubm4/final.ubm exp/sgmm2_4 || exit 1;
steps/train_sgmm2.sh --cmd "$train_cmd" $numLeavesSGMM $numGaussSGMM \
data/train data/lang exp/tri3_ali exp/ubm4/final.ubm exp/sgmm2_4 || exit 1;
utils/mkgraph.sh data/lang_test_bg exp/sgmm2_4 exp/sgmm2_4/graph_bg || exit 1;
utils/mkgraph.sh data/lang_test_bg exp/sgmm2_4 exp/sgmm2_4/graph || exit 1;
steps/decode_sgmm2.sh --nj 30 --cmd "$decode_cmd"\
--transform-dir exp/tri3/decode_bg_test exp/sgmm2_4/graph_bg data/test \
exp/sgmm2_4/decode_bg_test || exit 1;
steps/decode_sgmm2.sh --nj "$decode_nj" --cmd "$decode_cmd"\
--transform-dir exp/tri3/decode_dev exp/sgmm2_4/graph data/dev \
exp/sgmm2_4/decode_dev || exit 1;
steps/decode_sgmm2.sh --nj "$decode_nj" --cmd "$decode_cmd"\
--transform-dir exp/tri3/decode_test exp/sgmm2_4/graph data/test \
exp/sgmm2_4/decode_test || exit 1;
echo ============================================================================
echo " MMI + SGMM2 Training & Decoding "
echo ============================================================================
steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" \
--transform-dir exp/tri3_ali_train --use-graphs true --use-gselect true data/train \
data/lang exp/sgmm2_4 exp/sgmm2_4_ali_train || exit 1;
steps/align_sgmm2.sh --nj "$train_nj" --cmd "$train_cmd" \
--transform-dir exp/tri3_ali --use-graphs true --use-gselect true data/train \
data/lang exp/sgmm2_4 exp/sgmm2_4_ali || exit 1;
steps/make_denlats_sgmm2.sh --nj 30 --sub-split 30 --cmd "$decode_cmd"\
--transform-dir exp/tri3_ali_train data/train data/lang exp/sgmm2_4_ali_train \
exp/sgmm2_4_denlats_train || exit 1;
steps/make_denlats_sgmm2.sh --nj "$train_nj" --sub-split "$train_nj" --cmd "$decode_cmd"\
--transform-dir exp/tri3_ali data/train data/lang exp/sgmm2_4_ali \
exp/sgmm2_4_denlats || exit 1;
steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" \
--transform-dir exp/tri3_ali_train --boost 0.1 --zero-if-disjoint true \
data/train data/lang exp/sgmm2_4_ali_train exp/sgmm2_4_denlats_train \
exp/sgmm2_4_mmi_b0.1_z || exit 1;
--transform-dir exp/tri3_ali --boost 0.1 --zero-if-disjoint true \
data/train data/lang exp/sgmm2_4_ali exp/sgmm2_4_denlats \
exp/sgmm2_4_mmi_b0.1 || exit 1;
for iter in 1 2 3 4; do
steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri3/decode_bg_test data/lang_test_bg data/test \
exp/sgmm2_4/decode_bg_test exp/sgmm2_4_mmi_b0.1_z/decode_bg_test_it$iter || exit 1;
--transform-dir exp/tri3/decode_dev data/lang_test_bg data/dev \
exp/sgmm2_4/decode_dev exp/sgmm2_4_mmi_b0.1/decode_dev_it$iter || exit 1;
steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri3/decode_test data/lang_test_bg data/test \
exp/sgmm2_4/decode_test exp/sgmm2_4_mmi_b0.1/decode_test_it$iter || exit 1;
done
echo ============================================================================
echo " DNN Hybrid Training & Decoding "
echo ============================================================================
# DNN hybrid system training parameters
dnn_mem_reqs="mem_free=1.0G,ram_free=0.2G"
dnn_train_extra_opts=(--num_epochs 20 --num-epochs-extra 10 --add-layers-period 1 --shrink-interval 3) # bash array; name matches the "${dnn_train_extra_opts[@]}" expansion below
steps/train_nnet_cpu.sh --mix-up 5000 --initial-learning-rate 0.015 \
--final-learning-rate 0.002 --num-hidden-layers 2 --num-parameters 1500000 \
--num-jobs-nnet "$train_nj" --cmd "$train_cmd" "${dnn_train_extra_opts[@]}" \
data/train data/lang exp/tri3_ali exp/tri4_nnet || exit 1;
decode_extra_opts=(--num-threads 6 --parallel-opts "-pe smp 6 -l mem_free=4G,ram_free=0.7G")
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj "$decode_nj" "${decode_extra_opts[@]}" \
--transform-dir exp/tri3/decode_dev exp/tri3/graph data/dev \
exp/tri4_nnet/decode_dev | tee exp/tri4_nnet/decode_dev/decode.log
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj "$decode_nj" "${decode_extra_opts[@]}" \
--transform-dir exp/tri3/decode_test exp/tri3/graph data/test \
exp/tri4_nnet/decode_test | tee exp/tri4_nnet/decode_test/decode.log
echo ============================================================================
echo " System Combination (DNN+SGMM) "
echo ============================================================================
for iter in 1 2 3 4; do
local/score_combine.sh --cmd "$decode_cmd" \
data/dev data/lang_test_bg exp/tri4_nnet/decode_dev \
exp/sgmm2_4_mmi_b0.1/decode_dev_it$iter exp/combine_2/decode_dev_it$iter
local/score_combine.sh --cmd "$decode_cmd" \
data/test data/lang_test_bg exp/tri4_nnet/decode_test \
exp/sgmm2_4_mmi_b0.1/decode_test_it$iter exp/combine_2/decode_test_it$iter
done
echo ============================================================================
echo " Getting Results [see RESULTS file] "
echo ============================================================================
@ -142,8 +215,8 @@ for x in exp/*/decode*; do
[ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh
done
exit 0;
echo ============================================================================
echo "Finished successfully on" `date`
echo ============================================================================
exit 0

View file

@ -13,6 +13,7 @@ stage=0
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
align_to_lats=false # optionally produce alignment in lattice format
lats_decode_opts="--acoustic-scale=0.1 --beam=20 --latbeam=10"
lats_graph_scales="--transition-scale=1.0 --self-loop-scale=0.1"
@ -48,22 +49,19 @@ sdata=$data/split$nj
cp $srcdir/{tree,final.mdl} $dir || exit 1;
#Get the files we will need
# Select default locations to model files
nnet=$srcdir/final.nnet;
[ ! -s "$nnet" ] && echo "Missing nnet '$nnet'" && exit 1;
class_frame_counts=$srcdir/ali_train_pdf.counts
[ ! -s "$class_frame_counts" ] && echo "Missing class_frame_counts '$class_frame_counts'" && exit 1;
feature_transform=$srcdir/final.feature_transform
[ ! -s $feature_transform ] && echo "Missing feature_transform '$feature_transform'" && exit 1
model=$dir/final.mdl
[ ! -s "$model" ] && echo "Missing transtion-model '$model'" && exit 1;
###
### Prepare feature pipeline (same as for decoding)
###
# Check that files exist
for f in $sdata/1/feats.scp $sdata/1/text $lang/L.fst $nnet $model $feature_transform $class_frame_counts; do
[ ! -f $f ] && echo "$0: missing file $f" && exit 1;
done
# PREPARE FEATURE EXTRACTION PIPELINE
# Create the feature stream:
feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |"
# Optionally add cmvn
@ -77,15 +75,11 @@ if [ -f $srcdir/delta_order ]; then
delta_order=$(cat $srcdir/delta_order)
feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
fi
# Finally add feature_transform and the MLP
feats="$feats nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet ark:- ark:- |"
###
###
###
echo "$0: aligning data '$data' using nnet/model '$srcdir', putting alignments in '$dir'"
echo "$0: aligning data '$data' using nnet/model '$srcdir', putting alignments in '$dir'"
# Map oovs in reference transcription
tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
# We could just use align-mapped in the next line, but it's less efficient as it compiles the

Просмотреть файл

@ -87,8 +87,8 @@ cp $cmvndir/cmvn_$name.scp $data/cmvn.scp || exit 1;
nc=`cat $data/cmvn.scp | wc -l`
nu=`cat $data/spk2utt | wc -l`
if [ $nc -ne $nu ]; then
echo "Error: it seems not all of the speakers got cmvn stats ($nc != $nu);"
exit 1;
echo "$0: warning: it seems not all of the speakers got cmvn stats ($nc != $nu);"
[ $nc -eq 0 ] && exit 1;
fi
echo "Succeeded creating CMVN stats for $name"

View file

@ -3,25 +3,29 @@
# Copyright 2012-2013 Karel Vesely, Daniel Povey
# Apache 2.0
# Begin configuration section.
nnet= # Optionally pre-select network to use for getting state-likelihoods
feature_transform= # Optionally pre-select feature transform (in front of nnet)
model= # Optionally pre-select transition model
class_frame_counts= # Optionally pre-select class-counts used to compute PDF priors
# Begin configuration section.
nnet= # non-default location of DNN (optional)
feature_transform= # non-default location of feature_transform (optional)
model= # non-default location of transition model (optional)
class_frame_counts= # non-default location of PDF counts (optional)
srcdir= # non-default location of DNN-dir (decouples model dir from decode dir)
stage=0 # stage=1 skips lattice generation
nj=4
cmd=run.pl
max_active=7000 # maximum of active tokens
max_mem=50000000 # limit the fst-size to 50MB (larger fsts are minimized)
beam=13.0 # GMM:13.0
latbeam=8.0 # GMM:6.0
acwt=0.10 # GMM:0.0833, note: only really affects pruning (scoring is on lattices).
scoring_opts="--min-lmwt 4 --max-lmwt 15"
acwt=0.10 # note: only really affects pruning (scoring is on lattices).
beam=13.0
latbeam=8.0
max_active=7000 # limit of active tokens
max_mem=50000000 # approx. limit to memory consumption during minimization in bytes
skip_scoring=false
use_gpu_id=-1 # disable gpu
parallel_opts="-pe smp 2" # use 2 CPUs (1 DNN-forward, 1 decoder)
srcdir= # optionally select dir with DNN model
scoring_opts="--min-lmwt 4 --max-lmwt 15"
num_threads=1 # if >1, will use latgen-faster-parallel
parallel_opts="-pe smp $((num_threads+1))" # use 2 CPUs (1 DNN-forward, 1 decoder)
use_gpu_id=-1 # -1 disable gpu
# End configuration section.
echo "$0 $@" # Print the command line for logging
@ -32,7 +36,7 @@ echo "$0 $@" # Print the command line for logging
if [ $# != 3 ]; then
echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
echo " where the DNN + transition model is."
echo " where the DNN and transition model is."
echo "e.g.: $0 exp/dnn1/graph_tgpr data/test exp/dnn1/decode_tgpr"
echo ""
echo "This script works on plain or modified features (CMN,delta+delta-delta),"
@ -44,13 +48,13 @@ if [ $# != 3 ]; then
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo ""
echo " --nnet <nnet> # which nnet to use (opt.)"
echo " --feature-transform <nnet> # select transform in front of nnet (opt.)"
echo " --class-frame-counts <file> # file with frame counts (used to compute priors) (opt.)"
echo " --model <model> # which transition model to use (opt.)"
echo " --nnet <nnet> # non-default location of DNN (opt.)"
echo " --srcdir <dir> # non-default dir with DNN/models, can be different"
echo " # from parent dir of <decode-dir>' (opt.)"
echo ""
echo " --acwt <float> # select acoustic scale for decoding"
echo " --scoring-opts <opts> # options forwarded to local/score.sh"
echo " --num-threads <N> # N>1: run multi-threaded decoder"
exit 1;
fi
@ -58,43 +62,31 @@ fi
graphdir=$1
data=$2
dir=$3
[ -z $srcdir ] && srcdir=`dirname $dir`; # Or back-off to: model directory one level up from decoding directory.
[ -z $srcdir ] && srcdir=`dirname $dir`; # Default model directory one level up from decoding directory.
sdata=$data/split$nj;
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
if [ -z "$nnet" ]; then # if --nnet <nnet> was not specified on the command line...
nnet=$srcdir/final.nnet;
fi
[ -z "$nnet" ] && echo "Error nnet '$nnet' does not exist!" && exit 1;
# Select default locations to model files (if not already set externally)
if [ -z "$nnet" ]; then nnet=$srcdir/final.nnet; fi
if [ -z "$model" ]; then model=$srcdir/final.mdl; fi
if [ -z "$feature_transform" ]; then feature_transform=$srcdir/final.feature_transform; fi
if [ -z "$class_frame_counts" ]; then class_frame_counts=$srcdir/ali_train_pdf.counts; fi
if [ -z "$model" ]; then # if --model <mdl> was not specified on the command line...
model=$srcdir/final.mdl;
fi
# find the feature_transform to use
if [ -z "$feature_transform" ]; then
feature_transform=$srcdir/final.feature_transform
fi
if [ ! -f $feature_transform ]; then
echo "Missing feature_transform '$feature_transform'"
exit 1
fi
# check that files exist
for f in $sdata/1/feats.scp $nnet_i $nnet $model $graphdir/HCLG.fst; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
# Check that files exist
for f in $sdata/1/feats.scp $nnet $model $feature_transform $class_frame_counts $graphdir/HCLG.fst; do
[ ! -f $f ] && echo "$0: missing file $f" && exit 1;
done
# PREPARE THE LOG-POSTERIOR COMPUTATION PIPELINE
if [ -z "$class_frame_counts" ]; then
class_frame_counts=$srcdir/ali_train_pdf.counts
else
echo "Overriding class_frame_counts by $class_frame_counts"
fi
# Possibly use multi-threaded decoder
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
# PREPARE FEATURE EXTRACTION PIPELINE
# Create the feature stream:
feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |"
# Optionally add cmvn
@ -109,13 +101,12 @@ if [ -f $srcdir/delta_order ]; then
feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
fi
# Run the decoding in the queue
if [ $stage -le 0 ]; then
$cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet "$feats" ark:- \| \
latgen-faster-mapped --max-active=$max_active --max-mem=$max_mem --beam=$beam --lattice-beam=$latbeam \
--acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
latgen-faster-mapped$thread_string --max-active=$max_active --max-mem=$max_mem --beam=$beam \
--lattice-beam=$latbeam --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
$model $graphdir/HCLG.fst ark:- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi
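For illustration: with --num-threads 4, thread_string becomes "-parallel --num-threads=4", so the job runs latgen-faster-mapped-parallel --num-threads=4, and parallel_opts requests $((4+1)) = 5 queue slots (four decoder threads plus the nnet-forward process). With the default num_threads=1 the string stays empty and the plain latgen-faster-mapped binary is used, with 2 slots as noted above.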


@ -79,28 +79,21 @@ else
fi
#Get the files we will need
cp $srcdir/{tree,final.mdl} $dir
# Select default locations to model files
[ -z "$nnet" ] && nnet=$srcdir/final.nnet;
[ ! -f "$nnet" ] && echo "Error nnet '$nnet' does not exist!" && exit 1;
class_frame_counts=$srcdir/ali_train_pdf.counts
[ -z "$class_frame_counts" ] && echo "Error class_frame_counts '$class_frame_counts' does not exist!" && exit 1;
feature_transform=$srcdir/final.feature_transform
if [ ! -f $feature_transform ]; then
echo "Missing feature_transform '$feature_transform'"
exit 1
fi
model=$dir/final.mdl
[ -z "$model" ] && echo "Error transition model '$model' does not exist!" && exit 1;
###
### Prepare feature pipeline (same as for decoding)
###
# Check that files exist
for f in $sdata/1/feats.scp $nnet $model $feature_transform $class_frame_counts; do
[ ! -f $f ] && echo "$0: missing file $f" && exit 1;
done
# PREPARE FEATURE EXTRACTION PIPELINE
# Create the feature stream:
feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |"
# Optionally add cmvn
@ -114,28 +107,23 @@ if [ -f $srcdir/delta_order ]; then
delta_order=$(cat $srcdir/delta_order)
feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
fi
# Finally add feature_transform and the MLP
feats="$feats nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet ark:- ark:- |"
###
###
###
###
### We will produce lattices, where the correct path is not necessarily present
###
#1) We don't use reference path here...
echo "Generating the denlats"
#2) Generate the denominator lattices
if [ $sub_split -eq 1 ]; then
echo "$0: generating denlats from data '$data', putting lattices in '$dir'"
#1) Generate the denominator lattices
if [ $sub_split -eq 1 ]; then
# Prepare 'scp' for storing lattices separately and gzipped
for n in `seq $nj`; do
[ ! -d $dir/lat$n ] && mkdir $dir/lat$n;
cat $sdata/$n/feats.scp | awk '{ print $1" | gzip -c >'$dir'/lat'$n'/"$1".gz"; }'
done >$dir/lat.store_separately_as_gz.scp
# Generate the lattices
$cmd $parallel_opts JOB=1:$nj $dir/log/decode_den.JOB.log \
latgen-faster-mapped --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
--max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \
$dir/dengraph/HCLG.fst "$feats" "ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp" || exit 1;
$dir/dengraph/HCLG.fst "$feats" "scp:$dir/lat.store_separately_as_gz.scp" || exit 1;
else
for n in `seq $nj`; do
if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $alidir/final.mdl ]; then
@ -146,32 +134,25 @@ else
split_data.sh --per-utt $sdata/$n $sub_split || exit 1;
fi
mkdir -p $dir/log/$n
mkdir -p $dir/part
feats_subset=$(echo $feats | sed s:JOB/:$n/split$sub_split/JOB/:g)
# Prepare 'scp' for storing lattices separately and gzipped
for k in `seq $sub_split`; do
[ ! -d $dir/lat$n/$k ] && mkdir -p $dir/lat$n/$k;
cat $sdata2/$k/feats.scp | awk '{ print $1" | gzip -c >'$dir'/lat'$n'/'$k'/"$1".gz"; }'
done >$dir/lat.$n.store_separately_as_gz.scp
# Generate lattices
$cmd $parallel_opts JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
latgen-faster-mapped --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
--max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \
$dir/dengraph/HCLG.fst "$feats_subset" "ark,scp:$dir/lat.$n.JOB.ark,$dir/lat.$n.JOB.scp" || exit 1;
echo Merging lists for data subset $n
for k in `seq $sub_split`; do
cat $dir/lat.$n.$k.scp
done > $dir/lat.$n.all.scp
echo Merge the ark $n
lattice-copy scp:$dir/lat.$n.all.scp ark,scp:$dir/lat.$n.ark,$dir/lat.$n.scp || exit 1;
#remove the data
rm $dir/lat.$n.*.ark $dir/lat.$n.*.scp $dir/lat.$n.all.scp
$dir/dengraph/HCLG.fst "$feats_subset" scp:$dir/lat.$n.store_separately_as_gz.scp || exit 1;
touch $dir/.done.$n
fi
done
fi
#3) Merge the SCPs to create full list of lattices (will use random access)
echo Merging to single list $dir/lat.scp
for ((n=1; n<=nj; n++)); do
cat $dir/lat.$n.scp
done > $dir/lat.scp
#2) Generate 'scp' for reading the lattices
for n in `seq $nj`; do
find $dir/lat${n} -name "*.gz" | awk -v FS="/" '{ print gensub(".gz","","",$NF)" gunzip -c "$0" |"; }'
done >$dir/lat.scp
echo "$0: done generating denominator lattices."


@ -51,7 +51,7 @@ realign_iters=""
beam=10 # for realignment.
retry_beam=40
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
parallel_opts="-pe smp 16" # by default we use 16 threads; this lets the queue know.
parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" # by default we use 16 threads; this lets the queue know.
io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time.
nnet_config_opts=
splice_width=4 # meaning +- 4 frames on each side for second LDA
@ -110,8 +110,9 @@ if [ $# != 4 ]; then
echo " --num-threads <num-threads|16> # Number of parallel threads per job (will affect results"
echo " # as well as speed; may interact with batch size; if you increase"
echo " # this, you may want to decrease the batch size."
echo " --parallel-opts <opts|\"-pe smp 16\"> # extra options to pass to e.g. queue.pl for processes that"
echo " # use multiple threads."
echo " --parallel-opts <opts|\"-pe smp 16 -l ram_free=1G,mem_free=1G\"> # extra options to pass to e.g. queue.pl for processes that"
echo " # use multiple threads... note, you might have to reduce mem_free,ram_free"
echo " # versus your defaults, because it gets multiplied by the -pe smp argument."
echo " --io-opts <opts|\"-tc 10\"> # Options given to e.g. queue.pl for jobs that do a lot of I/O."
echo " --minibatch-size <minibatch-size|128> # Size of minibatch to process (note: product with --num-threads"
echo " # should not get too large, e.g. >2k)."


@ -35,6 +35,7 @@ for ((n=1; n<$#; n++)); do
if [ "${!n}" == "--config" ]; then
n_plus1=$((n+1))
config=${!n_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done


@ -86,7 +86,8 @@ mkdir -p $dir $tmpdir $dir/phones
[ -f path.sh ] && . ./path.sh
! utils/validate_dict_dir.pl $srcdir && echo "*Error validating directory $srcdir*" && exit 1;
! utils/validate_dict_dir.pl $srcdir && \
echo "*Error validating directory $srcdir*" && exit 1;
if [[ ! -f $srcdir/lexicon.txt ]]; then
echo "**Creating $dir/lexicon.txt from $dir/lexiconp.txt"
@ -97,8 +98,12 @@ if [[ ! -f $srcdir/lexiconp.txt ]]; then
perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $srcdir/lexiconp.txt || exit 1;
fi
! utils/validate_dict_dir.pl $srcdir >&/dev/null && \
echo "Validation failed (second time)" && exit 1;
if ! utils/validate_dict_dir.pl $srcdir >&/dev/null; then
utils/validate_dict_dir.pl $srcdir # show the output.
echo "Validation failed (second time)"
exit 1;
fi
if $position_dependent_phones; then
# Create $tmpdir/lexicon.original from $srcdir/lexicon.txt by


@ -163,8 +163,8 @@ print Q "EOF\n"; # without having to escape things like "|" and quote characters
print Q ") >$logfile\n";
print Q "time1=\`date +\"%s\"\`\n";
print Q " ( $cmd ) 2>>$logfile >>$logfile\n";
print Q "time2=\`date +\"%s\"\`\n";
print Q "ret=\$?\n";
print Q "time2=\`date +\"%s\"\`\n";
print Q "echo '#' Accounting: time=\$((\$time2-\$time1)) threads=$nof_threads >>$logfile\n";
print Q "echo '#' Finished at \`date\` with status \$ret >>$logfile\n";
print Q "[ \$ret -eq 137 ] && exit 100;\n"; # If process was killed (e.g. oom) it will exit with status 137;
@ -232,7 +232,7 @@ if (! $sync) { # We're not submitting with -sync y, so we
# Check that the job exists in SGE. Job can be killed if duration
# exceeds some hard limit, or in case of a machine shutdown.
if(($check_sge_job_ctr++ % 10) == 0) { # Don't run qstat too often, avoid stress on SGE.
if ( -f $f ) { next; }; #syncfile appeared, ok
if ( -f $f ) { next; }; #syncfile appeared: OK.
$ret = system("qstat -j $sge_job_id >/dev/null 2>/dev/null");
if($ret != 0) {
# Don't consider immediately missing job as error, first wait some
@ -245,7 +245,7 @@ if (! $sync) { # We're not submitting with -sync y, so we
if ( -f $f ) { next; }; #syncfile appeared, ok
#Otherwise it is an error
if (defined $jobname) { $logfile =~ s/\$SGE_TASK_ID/*/g; }
print STDERR "queue.pl: Error, unfinished job no longer exists, log is in $logfile\n";
print STDERR "queue.pl: Error, unfinished job no longer exists, log is in $logfile, syncfile is $f, return status of qstat was $ret\n";
print STDERR " Possible reasons: a) Exceeded time limit? -> Use more jobs! b) Shutdown/Frozen machine? -> Run again!\n";
exit(1);
}


@ -195,7 +195,7 @@ if ( (-f "$dict/lexicon.txt") && (-f "$dict/lexiconp.txt")) {
# Checking extra_questions.txt -------------------------------
print "Checking $dict/extra_questions.txt ...\n";
if(-s "$dict/extra_questions.txt") {
if (-s "$dict/extra_questions.txt") {
if(!open(EX, "<$dict/extra_questions.txt")) {$exit = 1; print "--> ERROR: fail to open $dict/extra_questions.txt\n";}
$idx = 1;
$success = 1;
@ -213,6 +213,8 @@ if(-s "$dict/extra_questions.txt") {
}
close(EX);
$success == 0 || print "--> $dict/extra_questions.txt is OK\n";
} else {print "--> $dict/extra_phones.txt is empty\n";}
} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";}
if($exit == 1) { print " [Error detected ]\n"; exit 1;}
exit 0;


@ -141,6 +141,6 @@ preprocessor variables, setting compile options, linking with libraries, and so
\section build_setup_platforms Which platforms has Kaldi been compiled on?
We have compiled Kaldi on Windows, Cygwin, various flavors of Linux (including
Ubuntu and CentOS), and Darwin.
Ubuntu, CentOS and SUSE), and Darwin.
*/


@ -69,7 +69,7 @@ int main(int argc, char *argv[]) {
SlidingWindowCmn(opts, feat, &cmvn_feat);
feat_writer.Write(utt, feat);
feat_writer.Write(utt, cmvn_feat);
num_done++;
}


@ -47,7 +47,7 @@ template<class IntType> class LatticeStringRepository {
inline bool operator == (const Entry &other) const {
return (parent == other.parent && i == other.i);
}
Entry(const Entry *parent, IntType i): parent(parent), i(i) {}
Entry() { }
Entry(const Entry &e): parent(e.parent), i(e.i) {}
};
// Note: all Entry* pointers returned in function calls are
@ -59,16 +59,22 @@ template<class IntType> class LatticeStringRepository {
// Returns string of "parent" with i appended. Pointer
// owned by repository
const Entry *Successor(const Entry *parent, IntType i) {
Entry entry(parent, i);
typename SetType::iterator iter = set_.find(&entry);
if(iter == set_.end()) { // no such entry already...
Entry *entry_ptr = new Entry(entry);
set_.insert(entry_ptr);
return entry_ptr;
} else {
return *iter;
new_entry_->parent = parent;
new_entry_->i = i;
std::pair<typename SetType::iterator, bool> pr = set_.insert(new_entry_);
if (pr.second) { // Was successfully inserted (was not there). We need to
// replace the element we inserted, which resides on the
// stack, with one from the heap.
const Entry *ans = new_entry_;
new_entry_ = new Entry();
return ans;
} else { // Was not inserted because an equivalent Entry already
// existed.
return *pr.first;
}
}
const Entry *Concatenate (const Entry *a, const Entry *b) {
if (a == NULL) return b;
else if (b == NULL) return a;
@ -94,15 +100,22 @@ template<class IntType> class LatticeStringRepository {
// a common prefix with a.
void ReduceToCommonPrefix(const Entry *a,
vector<IntType> *b) {
vector<IntType> a_vec;
ConvertToVector(a, &a_vec);
if (b->size() > a_vec.size())
b->resize(a_vec.size());
size_t b_sz = 0, max_sz = std::min(a_vec.size(), b->size());
while (b_sz < max_sz && (*b)[b_sz] == a_vec[b_sz])
b_sz++;
if (b_sz != b->size())
b->resize(b_sz);
size_t a_size = Size(a), b_size = b->size();
while (a_size > b_size) {
a = a->parent;
a_size--;
}
if (b_size > a_size)
b_size = a_size;
typename vector<IntType>::iterator b_begin = b->begin();
while (a_size != 0) {
if (a->i != *(b_begin + a_size - 1))
b_size = a_size - 1;
a = a->parent;
a_size--;
}
if (b_size != b->size())
b->resize(b_size);
}
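// Worked example for the in-place version above (illustrative values, not
// part of the original source): with a = [3,1,4,1], stored as a
// parent-linked Entry chain whose head holds the *last* element, and
// b = [3,1,5,9,2], b_size is first capped to Size(a) = 4; the backward
// scan then resets b_size at every mismatch (here at positions 4 and 3),
// the matches at positions 2 and 1 leave b_size = 2, and b is resized to
// the common prefix [3,1], all without materializing a as a temporary
// vector.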
// removes the first n elements of a.
@ -156,8 +169,8 @@ template<class IntType> class LatticeStringRepository {
return e;
}
LatticeStringRepository() { }
LatticeStringRepository() { new_entry_ = new Entry; }
void Destroy() {
for (typename SetType::iterator iter = set_.begin();
iter != set_.end();
@ -165,6 +178,10 @@ template<class IntType> class LatticeStringRepository {
delete *iter;
SetType tmp;
tmp.swap(set_);
if (new_entry_) {
delete new_entry_;
new_entry_ = NULL;
}
}
// Rebuild will rebuild this object, guaranteeing only
@ -220,6 +237,8 @@ template<class IntType> class LatticeStringRepository {
}
DISALLOW_COPY_AND_ASSIGN(LatticeStringRepository);
Entry *new_entry_; // We always have a pre-allocated Entry ready to use,
// to avoid unnecessary news and deletes.
SetType set_;
};
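The new Successor() above is a standard interning idiom: keep one spare heap node, hand it to the hash set, and allocate a replacement only when the insert actually takes, so lookups of existing strings cause no allocation at all. A self-contained sketch of the same pattern, with a simplified Node and std::unordered_set standing in for Kaldi's Entry and SetType (illustrative only; assumes C++11, not the original code):

#include <cstdio>
#include <unordered_set>
#include <utility>

struct Node { int parent_id; int i; };
struct NodeHash {
  size_t operator () (const Node *n) const {
    return static_cast<size_t>(n->parent_id) * 7853 + n->i;
  }
};
struct NodeEqual {
  bool operator () (const Node *a, const Node *b) const {
    return a->parent_id == b->parent_id && a->i == b->i;
  }
};

class Interner {
 public:
  typedef std::unordered_set<Node*, NodeHash, NodeEqual> SetType;
  Interner(): spare_(new Node) { }
  // Returns the canonical Node* for (parent_id, i); allocates only when a
  // genuinely new key is inserted, so repeated lookups cause no new/delete.
  const Node *Intern(int parent_id, int i) {
    spare_->parent_id = parent_id;
    spare_->i = i;
    std::pair<SetType::iterator, bool> pr = set_.insert(spare_);
    if (pr.second) {  // spare_ was consumed; replenish it.
      const Node *ans = spare_;
      spare_ = new Node;
      return ans;
    } else {  // an equivalent node was already interned.
      return *pr.first;
    }
  }
  ~Interner() {
    for (SetType::iterator it = set_.begin(); it != set_.end(); ++it)
      delete *it;
    delete spare_;
  }
 private:
  Node *spare_;
  SetType set_;
};

int main() {
  Interner interner;
  const Node *a = interner.Intern(0, 5);
  const Node *b = interner.Intern(0, 5);  // same key: no extra allocation
  std::printf("%s\n", a == b ? "interned" : "bug");
  return 0;
}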


@ -310,7 +310,7 @@ inline int Compare (const LatticeWeightTpl<FloatType> &w1,
template<class FloatType>
inline LatticeWeightTpl<FloatType> Plus(const LatticeWeightTpl<FloatType> &w1,
const LatticeWeightTpl<FloatType> &w2) {
const LatticeWeightTpl<FloatType> &w2) {
return (Compare(w1, w2) >= 0 ? w1 : w2);
}


@ -123,7 +123,7 @@ template<class Arc> VectorFst<Arc>* RandPairFst(RandFstOptions opts = RandFstOpt
all_states.push_back(this_state);
}
// Set final states.
for (size_t j = 0;j < (size_t)opts.n_final;j++) {
for (size_t j = 0; j < (size_t)opts.n_final; j++) {
StateId id = all_states[rand() % opts.n_states];
Weight weight (opts.weight_multiplier*(rand() % 5), opts.weight_multiplier*(rand() % 5));
fst->SetFinal(id, weight);


@ -54,10 +54,10 @@ void DiagGmmNormal::CopyFromDiagGmm(const DiagGmm &diaggmm) {
means_.MulElements(vars_);
}
void DiagGmmNormal::CopyToDiagGmm(DiagGmm *diaggmm, GmmFlagsType flags) {
void DiagGmmNormal::CopyToDiagGmm(DiagGmm *diaggmm, GmmFlagsType flags) const {
KALDI_ASSERT((static_cast<int32>(diaggmm->Dim()) == means_.NumCols())
&& (static_cast<int32>(diaggmm->weights_.Dim()) == weights_.Dim()));
&& (static_cast<int32>(diaggmm->weights_.Dim()) == weights_.Dim()));
DiagGmmNormal oldg(*diaggmm);
if (flags & kGmmWeights)


@ -58,7 +58,7 @@ class DiagGmmNormal {
void CopyFromDiagGmm(const DiagGmm &diaggmm);
/// Copies to DiagGmm the requested parameters
void CopyToDiagGmm(DiagGmm *diaggmm, GmmFlagsType flags = kGmmAll);
void CopyToDiagGmm(DiagGmm *diaggmm, GmmFlagsType flags = kGmmAll) const;
int32 NumGauss() { return weights_.Dim(); }
int32 Dim() { return means_.NumCols(); }


@ -821,6 +821,10 @@ BaseFloat DiagGmm::GaussianSelectionPreselect(
return tot_loglike;
}
void DiagGmm::CopyFromNormal(const DiagGmmNormal &diag_gmm_normal) {
diag_gmm_normal.CopyToDiagGmm(this);
}
void DiagGmm::Generate(VectorBase<BaseFloat> *output) {
KALDI_ASSERT(static_cast<int32>(output->Dim()) == Dim());
BaseFloat tot = weights_.Sum();


@ -51,6 +51,9 @@ class DiagGmm {
CopyFromDiagGmm(gmm);
}
/// Copies from DiagGmmNormal; does not resize.
void CopyFromNormal(const DiagGmmNormal &diag_gmm_normal);
DiagGmm(int32 nMix, int32 dim): valid_gconsts_(false) { Resize(nMix, dim); }
/// Constructor that allows us to merge GMMs with weights. Weights must sum


@ -26,7 +26,7 @@ BINFILES = gmm-init-mono gmm-est gmm-acc-stats-ali gmm-align \
gmm-diff-accs gmm-basis-fmllr-accs gmm-basis-fmllr-training gmm-est-basis-fmllr \
gmm-est-map gmm-adapt-map gmm-latgen-map gmm-basis-fmllr-accs-gpost \
gmm-est-basis-fmllr-gpost gmm-latgen-tracking gmm-latgen-faster-parallel \
gmm-est-fmllr-raw gmm-est-fmllr-raw-gpost
gmm-est-fmllr-raw gmm-est-fmllr-raw-gpost gmm-global-init-from-feats
OBJFILES =


@ -0,0 +1,180 @@
// gmmbin/gmm-global-init-from-feats.cc
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "gmm/model-common.h"
#include "gmm/full-gmm.h"
#include "gmm/diag-gmm.h"
#include "gmm/mle-full-gmm.h"
namespace kaldi {
// We initialize the GMM parameters by setting the variance to the global
// variance of the features, and the means to distinct randomly chosen frames.
void InitGmmFromRandomFrames(const Matrix<BaseFloat> &feats, DiagGmm *gmm) {
int32 num_gauss = gmm->NumGauss(), num_frames = feats.NumRows(),
dim = feats.NumCols();
KALDI_ASSERT(num_frames >= 10 * num_gauss && "Too few frames to train on");
Vector<double> mean(dim), var(dim);
for (int32 i = 0; i < num_frames; i++) {
mean.AddVec(1.0 / num_frames, feats.Row(i));
var.AddVec2(1.0 / num_frames, feats.Row(i));
}
var.AddVec2(-1.0, mean);
if (var.Max() <= 0.0)
KALDI_ERR << "Features do not have positive variance " << var;
DiagGmmNormal gmm_normal(*gmm);
std::set<int32> used_frames;
for (int32 g = 0; g < num_gauss; g++) {
int32 random_frame = RandInt(0, num_frames - 1);
while (used_frames.count(random_frame) != 0)
random_frame = RandInt(0, num_frames - 1);
used_frames.insert(random_frame);
gmm_normal.weights_(g) = 1.0 / num_gauss;
gmm_normal.means_.Row(g).CopyFromVec(feats.Row(random_frame));
gmm_normal.vars_.Row(g).CopyFromVec(var);
}
gmm->CopyFromNormal(gmm_normal);
gmm->ComputeGconsts();
}
void TrainOneIter(const Matrix<BaseFloat> &feats,
const MleDiagGmmOptions &gmm_opts,
int32 iter,
DiagGmm *gmm) {
AccumDiagGmm gmm_acc(*gmm, kGmmAll);
double tot_like = 0.0;
for (int32 t = 0; t < feats.NumRows(); t++)
tot_like += gmm_acc.AccumulateFromDiag(*gmm, feats.Row(t), 1.0);
KALDI_LOG << "Likelihood per frame on iteration " << iter
<< " was " << (tot_like / feats.NumRows()) << " over "
<< feats.NumRows() << " frames.";
BaseFloat objf_change, count;
MleDiagGmmUpdate(gmm_opts, gmm_acc, kGmmAll, gmm, &objf_change, &count);
KALDI_LOG << "Objective-function change on iteration " << iter << " was "
<< (objf_change / count) << " over " << count << " frames.";
}
} // namespace kaldi
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
const char *usage =
"This program initializes a single diagonal GMM and does multiple iterations of\n"
"training from features stored in memory.\n"
"Usage: gmm-global-init-feats [options] <feature-rspecifier> <model-out>\n"
"e.g.: gmm-global-init-feats scp:train.scp 1.mdl\n";
ParseOptions po(usage);
MleDiagGmmOptions gmm_opts;
bool binary = true;
int32 num_gauss = 100;
int32 num_iters = 50;
int32 num_frames = 200000;
int32 srand_seed = 0;
po.Register("binary", &binary, "Write output in binary mode");
po.Register("num-gauss", &num_gauss, "Number of Gaussians in the model");
po.Register("num-iters", &num_iters, "Number of iterations of training");
po.Register("num-frames", &num_frames, "Number of feature vectors to store in "
"memory and train on (randomly chosen from the input features)");
po.Register("srand", &srand_seed, "Seed for random number generator ");
gmm_opts.Register(&po);
po.Read(argc, argv);
srand(srand_seed);
if (po.NumArgs() != 2) {
po.PrintUsage();
exit(1);
}
std::string feature_rspecifier = po.GetArg(1),
model_wxfilename = po.GetArg(2);
Matrix<BaseFloat> feats;
SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
KALDI_ASSERT(num_frames > 0);
int64 num_read = 0, dim = 0;
KALDI_LOG << "Reading features (will keep " << num_frames << " frames.)";
for (; !feature_reader.Done(); feature_reader.Next()) {
const Matrix<BaseFloat> &this_feats = feature_reader.Value();
for (int32 t = 0; t < this_feats.NumRows(); t++) {
num_read++;
if (dim == 0) {
dim = this_feats.NumCols();
feats.Resize(num_frames, dim);
} else if (this_feats.NumCols() != dim) {
KALDI_ERR << "Features have inconsistent dims "
<< this_feats.NumCols() << " vs. " << dim
<< " (current utt is) " << feature_reader.Key();
}
if (num_read <= num_frames) {
feats.Row(num_read - 1).CopyFromVec(this_feats.Row(t));
} else {
BaseFloat keep_prob = num_frames / static_cast<BaseFloat>(num_read);
if (WithProb(keep_prob)) { // With probability "keep_prob"
feats.Row(RandInt(0, num_frames - 1)).CopyFromVec(this_feats.Row(t));
}
}
}
}
if (num_read < num_frames) {
KALDI_WARN << "Number of frames read " << num_read << " was less than "
<< "target number " << num_frames << ", using all we read.";
feats.Resize(num_read, dim, kCopyData);
}
DiagGmm gmm(num_gauss, dim);
KALDI_LOG << "Initializing GMM means from random frames";
InitGmmFromRandomFrames(feats, &gmm);
for (int32 iter = 0; iter < num_iters; iter++)
TrainOneIter(feats, gmm_opts, iter, &gmm);
WriteKaldiObject(gmm, model_wxfilename, binary);
KALDI_LOG << "Wrote model to " << model_wxfilename;
return 0;
} catch(const std::exception &e) {
std::cerr << e.what();
return -1;
}
}
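The frame-selection loop above is reservoir sampling: the first num_frames frames fill the buffer, after which frame number num_read replaces a uniformly chosen kept frame with probability num_frames/num_read, so every frame seen so far is equally likely to survive. A standalone sketch of the single-draw variant of the same scheme (the tool uses the equivalent two-draw form via WithProb plus RandInt; names here are illustrative):

#include <cstdlib>
#include <iostream>
#include <vector>

// Keeps a uniform random sample of up to 'capacity' items from a stream
// whose length is not known in advance (classic reservoir sampling).
std::vector<int> ReservoirSample(const std::vector<int> &stream,
                                 size_t capacity) {
  std::vector<int> reservoir;
  reservoir.reserve(capacity);
  size_t num_read = 0;
  for (size_t t = 0; t < stream.size(); t++) {
    num_read++;
    if (reservoir.size() < capacity) {
      reservoir.push_back(stream[t]);  // fill phase: keep everything.
    } else {
      // Keep this item with probability capacity / num_read; if kept, it
      // evicts a uniformly chosen resident, which preserves uniformity
      // over everything read so far.  (rand() % n has slight modulo bias;
      // acceptable for a sketch.)
      size_t j = std::rand() % num_read;
      if (j < capacity)
        reservoir[j] = stream[t];
    }
  }
  return reservoir;
}

int main() {
  std::vector<int> stream;
  for (int i = 0; i < 1000; i++) stream.push_back(i);
  std::vector<int> sample = ReservoirSample(stream, 10);
  for (size_t i = 0; i < sample.size(); i++)
    std::cout << sample[i] << " ";
  std::cout << "\n";
  return 0;
}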


@ -5,11 +5,11 @@ include ../kaldi.mk
EXTRA_CXXFLAGS += -Wno-sign-compare
TESTFILES = kaldi-lattice-test
TESTFILES = kaldi-lattice-test push-lattice-test minimize-lattice-test
OBJFILES = kaldi-lattice.o lattice-functions.o word-align-lattice.o \
phone-align-lattice.o word-align-lattice-lexicon.o sausages.o \
kws-functions.o push-lattice.o
kws-functions.o push-lattice.o minimize-lattice.o
LIBNAME = kaldi-lat


@ -311,6 +311,7 @@ void ConvertLatticeToPhones(const TransitionModel &trans,
void ConvertCompactLatticeToPhones(const TransitionModel &trans,
CompactLattice *clat) {
typedef CompactLatticeArc Arc;
typedef Arc::Weight Weight;
int32 num_states = clat->NumStates();
for (int32 state = 0; state < num_states; state++) {
for (fst::MutableArcIterator<CompactLattice> aiter(clat, state);
@ -327,6 +328,18 @@ void ConvertCompactLatticeToPhones(const TransitionModel &trans,
arc.weight.SetString(phone_seq);
aiter.SetValue(arc);
} // end looping over arcs
Weight f = clat->Final(state);
if (f != Weight::Zero()) {
std::vector<int32> phone_seq;
const std::vector<int32> &tid_seq = f.String();
for (std::vector<int32>::const_iterator iter = tid_seq.begin();
iter != tid_seq.end(); ++iter) {
if (trans.IsFinal(*iter)) // note: there is one of these per phone...
phone_seq.push_back(trans.TransitionIdToPhone(*iter));
}
f.SetString(phone_seq);
clat->SetFinal(state, f);
}
} // end looping over states
}


@ -0,0 +1,72 @@
// lat/minimize-lattice-test.cc
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "lat/kaldi-lattice.h"
#include "lat/minimize-lattice.h"
#include "lat/push-lattice.h"
#include "fstext/rand-fst.h"
namespace kaldi {
using namespace fst;
CompactLattice *RandDeterministicCompactLattice() {
RandFstOptions opts;
opts.acyclic = true;
while (1) {
Lattice *fst = fst::RandPairFst<LatticeArc>(opts);
CompactLattice *cfst = new CompactLattice;
if (!DeterminizeLattice(*fst, cfst)) {
delete fst;
delete cfst;
KALDI_WARN << "Determinization failed, trying again.";
} else {
delete fst;
return cfst;
}
}
}
void TestMinimizeCompactLattice() {
CompactLattice *clat = RandDeterministicCompactLattice();
CompactLattice clat2(*clat);
BaseFloat delta = (rand() % 2 == 0 ? 1.0 : 1.0e-05);
// Minimization will only work well on determinized and pushed lattices.
PushCompactLatticeStrings(&clat2);
PushCompactLatticeWeights(&clat2);
MinimizeCompactLattice(&clat2, delta);
KALDI_ASSERT(fst::RandEquivalent(*clat, clat2, 5, delta, rand(), 10));
delete clat;
}
} // end namespace kaldi
int main() {
using namespace kaldi;
SetVerboseLevel(4);
for (int32 i = 0; i < 1000; i++) {
TestMinimizeCompactLattice();
}
KALDI_LOG << "Success.";
}

src/lat/minimize-lattice.cc (new file, 283 lines)

@ -0,0 +1,283 @@
// lat/minimize-lattice.cc
// Copyright 2009-2011 Saarland University (Author: Arnab Ghoshal)
// 2012-2013 Johns Hopkins University (Author: Daniel Povey); Chao Weng;
// Bagher BabaAli
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "lat/minimize-lattice.h"
#include "hmm/transition-model.h"
#include "util/stl-utils.h"
namespace kaldi {
/*
Process the states in reverse topological order.
For each state, compute a hash-value that will be the same for states
that can be combined. Then for each pair of states with the
same hash value, check that the "to-states" map to the
same equivalence class and that the weights are sufficiently similar.
*/
using fst::ArcIterator;
using fst::MutableArcIterator;
using fst::kNoStateId;
class CompactLatticeMinimizer {
public:
typedef CompactLattice::StateId StateId;
typedef CompactLatticeArc Arc;
typedef Arc::Label Label;
typedef CompactLatticeWeight Weight;
typedef size_t HashType;
CompactLatticeMinimizer(CompactLattice *clat, float delta = fst::kDelta):
clat_(clat), delta_(delta) { }
bool Minimize() {
if (clat_->Properties(fst::kTopSorted, true) == 0) {
if (!TopSort(clat_)) {
KALDI_WARN << "Topological sorting of state-level lattice failed "
"(probably your lexicon has empty words or your LM has epsilon cycles; this "
" is a bad idea.)";
return false;
}
}
ComputeStateHashValues();
ComputeStateMap();
ModifyModel();
return true;
}
static HashType ConvertStringToHashValue(const std::vector<int32> &vec) {
const HashType prime = 53281;
VectorHasher<int32> h;
HashType ans = static_cast<HashType>(h(vec));
if (ans == 0) ans = prime;
// We don't allow a zero answer, as this can cause too many values to be the
// same.
return ans;
}
static void InitHashValue(const Weight &final_weight, HashType *h) {
const HashType prime1 = 33317, prime2 = 607; // it's pretty random.
if (final_weight == Weight::Zero()) *h = prime1;
else *h = prime2 * ConvertStringToHashValue(final_weight.String());
}
// It's important that this function and UpdateHashValueForFinalProb be
// insensitive to the order in which it's called, as the order of the arcs
// won't necessarily be the same for different equivalent states.
static void UpdateHashValueForTransition(const Weight &weight,
Label label,
HashType &next_state_hash,
HashType *h) {
const HashType prime1 = 1447, prime2 = 51907;
if (label == 0) label = prime2; // Zeros will cause problems.
*h += prime1 * label *
(1 + ConvertStringToHashValue(weight.String()) * next_state_hash);
// Above, the "1 +" is to ensure that if somehow we get zeros due to
// weird word sequences, they don't propagate.
}
void ComputeStateHashValues() {
// Note: clat_ is topologically sorted, and StateId is
// signed. Each state's hash value is only a function of topologically-later
// states' hash values.
state_hashes_.resize(clat_->NumStates());
for (StateId s = clat_->NumStates() - 1; s >= 0; s--) {
HashType this_hash;
InitHashValue(clat_->Final(s), &this_hash);
for (ArcIterator<CompactLattice> aiter(*clat_, s); !aiter.Done();
aiter.Next()) {
const Arc &arc = aiter.Value();
HashType next_hash;
if (arc.nextstate > s) {
next_hash = state_hashes_[arc.nextstate];
} else {
KALDI_ASSERT(s == arc.nextstate &&
"Lattice not topologically sorted [code error]");
next_hash = 1;
KALDI_WARN << "Minimizing lattice with self-loops "
"(lattices should not have self-loops)";
}
UpdateHashValueForTransition(arc.weight, arc.ilabel,
next_hash, &this_hash);
}
state_hashes_[s] = this_hash;
}
}
struct EquivalenceSorter {
// This struct has an operator () which you can interpret as a less-than (<)
// operator for arcs. We sort on ilabel; since the lattice is supposed to
// be deterministic, this should completely determine the ordering (there
// should not be more than one arc with the same ilabel, out of the same
// state). For identical ilabels we next sort on the nextstate, simply to
// better handle non-deterministic input (we do our best on this, without
// guaranteeing full minimization). We could sort on the strings next, but
// this would be an unnecessary hassle as we only really need good
// performance on deterministic input.
bool operator () (const Arc &a, const Arc &b) const {
if (a.ilabel < b.ilabel) return true;
else if (a.ilabel > b.ilabel) return false;
else if (a.nextstate < b.nextstate) return true;
else return false;
}
};
// This function works out whether s and t are equivalent, assuming
// we have already partitioned all topologically-later states into
// equivalence classes (i.e. set up state_map_).
bool Equivalent(StateId s, StateId t) const {
if (!ApproxEqual(clat_->Final(s), clat_->Final(t), delta_))
return false;
if (clat_->NumArcs(s) != clat_->NumArcs(t))
return false;
std::vector<Arc> s_arcs;
std::vector<Arc> t_arcs;
for (int32 iter = 0; iter <= 1; iter++) {
StateId state = (iter == 0 ? s : t);
std::vector<Arc> &arcs = (iter == 0 ? s_arcs : t_arcs);
arcs.reserve(clat_->NumArcs(s));
for (ArcIterator<CompactLattice> aiter(*clat_, state); !aiter.Done();
aiter.Next()) {
Arc arc = aiter.Value();
if (arc.nextstate == state) {
// This is a special case for states that have self-loops. If two
// states have an identical self-loop arc, they may be equivalent.
arc.nextstate = kNoStateId;
} else {
KALDI_ASSERT(arc.nextstate > state);
//while (state_map_[arc.nextstate] != arc.nextstate)
arc.nextstate = state_map_[arc.nextstate];
arcs.push_back(arc);
}
}
EquivalenceSorter sorter;
std::sort(arcs.begin(), arcs.end(), sorter);
}
KALDI_ASSERT(s_arcs.size() == t_arcs.size());
for (size_t i = 0; i < s_arcs.size(); i++) {
// Note: the nextstates were already mapped to their equivalence classes.
if (s_arcs[i].nextstate != t_arcs[i].nextstate) return false;
KALDI_ASSERT(s_arcs[i].ilabel == s_arcs[i].olabel); // CompactLattices are
// supposed to be
// acceptors.
if (s_arcs[i].ilabel != t_arcs[i].ilabel) return false;
if (!ApproxEqual(s_arcs[i].weight, t_arcs[i].weight, delta_)) return false;
}
return true;
}
void ComputeStateMap() {
// We have to compute the state mapping in reverse topological order also,
// since the equivalence test relies on later states being already sorted
// out into equivalence classes (by state_map_).
StateId num_states = clat_->NumStates();
unordered_map<HashType, std::vector<StateId> > hash_groups_;
for (StateId s = 0; s < num_states; s++)
hash_groups_[state_hashes_[s]].push_back(s);
state_map_.resize(num_states);
for (StateId s = 0; s < num_states; s++)
state_map_[s] = s; // Default mapping.
{ // This block is just diagnostic.
typedef unordered_map<HashType, std::vector<StateId> >::const_iterator
HashIter;
size_t max_size = 0;
for (HashIter iter = hash_groups_.begin(); iter != hash_groups_.end();
++iter)
max_size = std::max(max_size, iter->second.size());
if (max_size > 1000) {
KALDI_WARN << "Largest equivalence group (using hash) is " << max_size
<< ", minimization might be slow.";
}
}
for (StateId s = num_states - 1; s >= 0; s--) {
HashType hash = state_hashes_[s];
const std::vector<StateId> &equivalence_class = hash_groups_[hash];
KALDI_ASSERT(!equivalence_class.empty());
for (size_t i = 0; i < equivalence_class.size(); i++) {
StateId t = equivalence_class[i];
// Below, there is no point doing the test if state_map_[t] != t, because
// in that case we will, either before or after this, be comparing with
// another state that is equivalent to t.
if (t > s && state_map_[t] == t && Equivalent(s, t)) {
state_map_[s] = t;
break;
}
}
}
}
void ModifyModel() {
// Modifies the model according to state_map_;
StateId num_removed = 0;
StateId num_states = clat_->NumStates();
for (StateId s = 0; s < num_states; s++)
if (state_map_[s] != s)
num_removed++;
KALDI_VLOG(3) << "Removing " << num_removed << " of "
<< num_states << " states.";
if (num_removed == 0) return; // Nothing to do.
clat_->SetStart(state_map_[clat_->Start()]);
for (StateId s = 0; s < num_states; s++) {
if (state_map_[s] != s)
continue; // There is no point modifying states we're removing.
for (MutableArcIterator<CompactLattice> aiter(clat_, s); !aiter.Done();
aiter.Next()) {
Arc arc = aiter.Value();
StateId mapped_nextstate = state_map_[arc.nextstate];
if (mapped_nextstate != arc.nextstate) {
arc.nextstate = mapped_nextstate;
aiter.SetValue(arc);
}
}
}
fst::Connect(clat_);
}
private:
CompactLattice *clat_;
float delta_;
std::vector<HashType> state_hashes_;
std::vector<StateId> state_map_; // maps each state to itself or to some
// equivalent state. Within each equivalence
// class, we pick one arbitrarily.
};
bool MinimizeCompactLattice(CompactLattice *clat, float delta) {
CompactLatticeMinimizer minimizer(clat, delta);
return minimizer.Minimize();
}
} // namespace kaldi
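A detail worth flagging in ComputeStateHashValues() above: per-arc terms are accumulated with +=, so the state hash is independent of arc enumeration order, which matters because two equivalent states need not list their arcs identically (this is also why UpdateHashValueForTransition is documented as order-insensitive). A toy demonstration of that property, with arbitrary primes and values; not the original code:

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

typedef size_t HashType;

// One arc's contribution to a state hash: depends on the arc label and the
// hash of the destination state, mirroring UpdateHashValueForTransition.
HashType ArcTerm(int label, HashType next_state_hash) {
  const HashType prime = 1447;
  return prime * static_cast<HashType>(label) * (1 + next_state_hash);
}

// Terms are combined with +=, so any permutation of the arc list yields
// the same state hash.
HashType StateHash(const std::vector<std::pair<int, HashType> > &arcs) {
  HashType h = 33317;  // stand-in for the final-weight initialization.
  for (size_t k = 0; k < arcs.size(); k++)
    h += ArcTerm(arcs[k].first, arcs[k].second);
  return h;
}

int main() {
  std::vector<std::pair<int, HashType> > a, b;
  a.push_back(std::make_pair(3, (HashType)101));
  a.push_back(std::make_pair(7, (HashType)55));
  b.push_back(std::make_pair(7, (HashType)55));   // same arcs, other order.
  b.push_back(std::make_pair(3, (HashType)101));
  assert(StateHash(a) == StateHash(b));  // summation commutes.
  return 0;
}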


@ -0,0 +1,48 @@
// lat/minimize-lattice.h
// Copyright 2013 Johns Hopkins University (Author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_LAT_MINIMIZE_LATTICE_H_
#define KALDI_LAT_MINIMIZE_LATTICE_H_
#include <vector>
#include <map>
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "fstext/fstext-lib.h"
#include "hmm/transition-model.h"
#include "lat/kaldi-lattice.h"
namespace kaldi {
/// This function minimizes the compact lattice. It is to be called after
/// determinization (see ../fstext/determinize-lattice-pruned.h) and pushing
/// (see ./push-lattice.h). If the lattice is not determinized and pushed this
/// function will not combine as many states as it could, but it won't crash.
/// Returns true on success, and false if it failed due to topological sorting
/// failing.
bool MinimizeCompactLattice(CompactLattice *clat, float delta = fst::kDelta);
} // namespace kaldi
#endif // KALDI_LAT_MINIMIZE_LATTICE_H_
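A minimal calling sequence, mirroring what the latbin tools in this commit do when --minimize is given (the wrapper name is hypothetical and error handling is elided):

#include "lat/kaldi-lattice.h"
#include "lat/push-lattice.h"
#include "lat/minimize-lattice.h"

// Push strings and weights first so that minimization can combine as many
// states as possible, then minimize in place.
void PostprocessDeterminizedLattice(kaldi::CompactLattice *clat) {
  kaldi::PushCompactLatticeStrings(clat);
  kaldi::PushCompactLatticeWeights(clat);
  kaldi::MinimizeCompactLattice(clat);  // delta defaults to fst::kDelta.
}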


@ -0,0 +1,119 @@
// lat/push-lattice-test.cc
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "lat/kaldi-lattice.h"
#include "lat/push-lattice.h"
#include "fstext/rand-fst.h"
namespace kaldi {
using namespace fst;
CompactLattice *RandCompactLattice() {
RandFstOptions opts;
opts.acyclic = true;
Lattice *fst = fst::RandPairFst<LatticeArc>(opts);
CompactLattice *cfst = new CompactLattice;
ConvertLattice(*fst, cfst);
delete fst;
return cfst;
}
void TestPushCompactLatticeStrings() {
CompactLattice *clat = RandCompactLattice();
CompactLattice clat2(*clat);
PushCompactLatticeStrings(&clat2);
KALDI_ASSERT(fst::RandEquivalent(*clat, clat2, 5, 0.001, rand(), 10));
for (CompactLatticeArc::StateId s = 0; s < clat2.NumStates(); s++) {
if (s == 0)
continue; // We don't check state zero, as the "leftover string" stays
// there.
int32 first_label;
bool ok = false;
bool first_label_set = false;
for (ArcIterator<CompactLattice> aiter(clat2, s); !aiter.Done();
aiter.Next()) {
if (aiter.Value().weight.String().size() == 0) {
ok = true;
} else {
int32 this_label = aiter.Value().weight.String().front();
if (first_label_set) {
if (this_label != first_label) ok = true;
} else {
first_label = this_label;
first_label_set = true;
}
}
}
if (clat2.Final(s) != CompactLatticeWeight::Zero()) {
if (clat2.Final(s).String().size() == 0) ok = true;
else {
int32 this_label = clat2.Final(s).String().front();
if (first_label_set && this_label != first_label) ok = true;
}
}
KALDI_ASSERT(ok);
}
delete clat;
}
void TestPushCompactLatticeWeights() {
CompactLattice *clat = RandCompactLattice();
CompactLattice clat2(*clat);
PushCompactLatticeWeights(&clat2);
KALDI_ASSERT(fst::RandEquivalent(*clat, clat2, 5, 0.001, rand(), 10));
for (CompactLatticeArc::StateId s = 0; s < clat2.NumStates(); s++) {
if (s == 0)
continue; // We don't check state zero, as the "leftover string" stays
// there.
LatticeWeight sum = clat2.Final(s).Weight();
for (ArcIterator<CompactLattice> aiter(clat2, s); !aiter.Done();
aiter.Next()) {
sum = Plus(sum, aiter.Value().weight.Weight());
}
if (!ApproxEqual(sum, LatticeWeight::One())) {
{
fst::FstPrinter<CompactLatticeArc> printer(clat2, NULL, NULL,
NULL, true, true);
printer.Print(&std::cerr, "<unknown>");
}
{
fst::FstPrinter<CompactLatticeArc> printer(*clat, NULL, NULL,
NULL, true, true);
printer.Print(&std::cerr, "<unknown>");
}
KALDI_ERR << "Bad lattice being pushed.";
}
}
delete clat;
}
} // end namespace kaldi
int main() {
using namespace kaldi;
for (int32 i = 0; i < 15; i++) {
TestPushCompactLatticeStrings();
TestPushCompactLatticeWeights();
}
KALDI_LOG << "Success.";
}


@ -202,12 +202,75 @@ class CompactLatticePusher {
std::vector<int32> shift_vec_;
};
bool PushCompactLattice(CompactLattice *clat) {
bool PushCompactLatticeStrings(CompactLattice *clat) {
CompactLatticePusher pusher(clat);
return pusher.Push();
}
bool PushCompactLatticeWeights(CompactLattice *clat) {
if (clat->Properties(fst::kTopSorted, true) == 0) {
if (!TopSort(clat)) {
KALDI_WARN << "Topological sorting of state-level lattice failed "
"(probably your lexicon has empty words or your LM has epsilon cycles; this "
" is a bad idea.)";
return false;
}
}
typedef CompactLattice::StateId StateId; // Note: this is guaranteed to be
// signed.
typedef CompactLatticeArc Arc;
typedef CompactLatticeWeight Weight;
StateId num_states = clat->NumStates();
if (num_states == 0) {
KALDI_WARN << "Pushing weights of empty compact lattice";
return true; // this is technically success because an empty
// lattice is already pushed.
}
std::vector<LatticeWeight> weight_to_end(num_states); // Note: LatticeWeight
// contains two floats.
for (StateId s = num_states - 1; s >= 0; s--) {
LatticeWeight this_weight_to_end = clat->Final(s).Weight();
for (fst::ArcIterator<CompactLattice> aiter(*clat, s); !aiter.Done();
aiter.Next()) {
const Arc &arc = aiter.Value();
KALDI_ASSERT(arc.nextstate > s && "Cyclic lattices not allowed.");
this_weight_to_end = Plus(this_weight_to_end,
Times(aiter.Value().weight.Weight(),
weight_to_end[arc.nextstate]));
}
if (this_weight_to_end == LatticeWeight::Zero()) {
KALDI_WARN << "Lattice has non-coaccessible states.";
}
weight_to_end[s] = this_weight_to_end;
}
weight_to_end[0] = LatticeWeight::One(); // We leave the "leftover weight" on
// the start state, which won't
// necessarily end up summing to one.
for (StateId s = 0; s < num_states; s++) {
LatticeWeight this_weight_to_end = weight_to_end[s];
if (this_weight_to_end == LatticeWeight::Zero())
continue;
for (fst::MutableArcIterator<CompactLattice> aiter(clat, s); !aiter.Done();
aiter.Next()) {
Arc arc = aiter.Value();
LatticeWeight next_weight_to_end = weight_to_end[arc.nextstate];
if (next_weight_to_end != LatticeWeight::Zero()) {
arc.weight.SetWeight(Times(arc.weight.Weight(),
Divide(next_weight_to_end,
this_weight_to_end)));
aiter.SetValue(arc);
}
}
Weight final_weight = clat->Final(s);
if (final_weight != Weight::Zero()) {
final_weight.SetWeight(Divide(final_weight.Weight(), this_weight_to_end));
clat->SetFinal(s, final_weight);
}
}
return true;
}
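A worked toy case, stated in plain probabilities for readability. Since LatticeWeight's Plus (see lattice-weight.h above) keeps the better of two weights, weight_to_end[s] is a best-path quantity rather than a summed one: if state s has arcs of probability 0.6 and 0.2 into fully pushed successors (weight_to_end = One) and no final weight, then weight_to_end[s] corresponds to 0.6, and dividing each arc weight by it leaves arcs of 1.0 and 1/3. The best path out of s now carries weight One, and the residual 0.6 migrates toward the start state, which by construction keeps whatever leftover weight remains.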


@ -41,7 +41,16 @@ namespace kaldi {
/// It returns true on success, false if it failed due to TopSort failing,
/// which should never happen, but we handle it gracefully by just leaving the
/// lattice the same.
bool PushCompactLattice(CompactLattice *clat);
/// This function used to be called just PushCompactLattice.
bool PushCompactLatticeStrings(CompactLattice *clat);
/// This function pushes the weights in the CompactLattice so that all states
/// except possibly the start state, have Weight components (of type
/// LatticeWeight) that "sum to one" in the LatticeWeight (i.e. interpreting the
/// weights as negated log-probs). It returns true on success, false if it
/// failed due to TopSort failing, which should never happen, but we handle it
/// gracefully by just leaving the lattice the same.
bool PushCompactLatticeWeights(CompactLattice *clat);
} // namespace kaldi


@ -16,7 +16,8 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \
lattice-to-ctm-conf lattice-arcgraph lattice-combine lattice-reverse \
lattice-rescore-mapped lattice-depth lattice-align-phones \
lattice-to-smbr-post lattice-determinize-pruned-parallel \
lattice-add-penalty lattice-align-words-lexicon lattice-push
lattice-add-penalty lattice-align-words-lexicon lattice-push \
lattice-minimize
OBJFILES =


@ -82,8 +82,8 @@ int main(int argc, char *argv[]) {
num_done++;
}
KALDI_LOG << "Done " << num_done << " lattices.";
KALDI_LOG << "The average density is "
<< static_cast<float> ((float)sum_depth / total_t);
KALDI_LOG << "Overall density is "
<< (static_cast<BaseFloat>(sum_depth) / total_t);
if (num_done != 0) return 0;
else return 1;
} catch (const std::exception &e) {


@ -21,6 +21,8 @@
#include "lat/kaldi-lattice.h"
#include "fstext/determinize-lattice-pruned.h"
#include "lat/lattice-functions.h"
#include "lat/push-lattice.h"
#include "lat/minimize-lattice.h"
#include "thread/kaldi-task-sequence.h"
namespace kaldi {
@ -33,11 +35,13 @@ class DeterminizeLatticeTask {
std::string key,
BaseFloat acoustic_scale,
BaseFloat beam,
bool minimize,
Lattice *lat,
CompactLatticeWriter *clat_writer,
int32 *num_warn):
opts_(opts), key_(key), acoustic_scale_(acoustic_scale), beam_(beam),
lat_(lat), clat_writer_(clat_writer), num_warn_(num_warn) { }
minimize_(minimize), lat_(lat), clat_writer_(clat_writer),
num_warn_(num_warn) { }
void operator () () {
Invert(lat_); // to get word labels on the input side.
@ -58,6 +62,11 @@ class DeterminizeLatticeTask {
}
delete lat_; // This is no longer needed so we can delete it now;
lat_ = NULL;
if (minimize_) {
PushCompactLatticeStrings(&det_clat_);
PushCompactLatticeWeights(&det_clat_);
MinimizeCompactLattice(&det_clat_);
}
// Invert the original acoustic scaling
fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale_),
&det_clat_);
@ -72,6 +81,7 @@ class DeterminizeLatticeTask {
std::string key_;
BaseFloat acoustic_scale_;
BaseFloat beam_;
bool minimize_;
Lattice *lat_; // The lattice we're working on. Owned locally.
CompactLattice det_clat_; // The output of our process. Will be written
// to clat_writer_ in the destructor.
@ -101,6 +111,7 @@ int main(int argc, char *argv[]) {
ParseOptions po(usage);
BaseFloat acoustic_scale = 1.0;
BaseFloat beam = 10.0;
bool minimize = false;
TaskSequencerConfig sequencer_config; // has --num-threads option
fst::DeterminizeLatticePrunedOptions determinize_config; // Options used in DeterminizeLatticePruned--
// this options class does not have its own Register function as it's viewed as
@ -108,8 +119,11 @@ int main(int argc, char *argv[]) {
determinize_config.max_mem = 50000000;
determinize_config.max_loop = 0; // was 500000;
po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods");
po.Register("acoustic-scale", &acoustic_scale,
"Scaling factor for acoustic likelihoods");
po.Register("beam", &beam, "Pruning beam [applied after acoustic scaling].");
po.Register("minimize", &minimize,
"If true, push and minimize after determinization");
determinize_config.Register(&po);
sequencer_config.Register(&po);
po.Read(argc, argv);
@ -142,7 +156,7 @@ int main(int argc, char *argv[]) {
Lattice *lat = lat_reader.Value().Copy(); // will give ownership to "task"
// below
DeterminizeLatticeTask *task = new DeterminizeLatticeTask(
determinize_config, key, acoustic_scale, beam,
determinize_config, key, acoustic_scale, beam, minimize,
lat, &compact_lat_writer, &n_warn);
sequencer.Run(task);
n_done++;


@ -21,6 +21,8 @@
#include "lat/kaldi-lattice.h"
#include "fstext/determinize-lattice-pruned.h"
#include "lat/lattice-functions.h"
#include "lat/push-lattice.h"
#include "lat/minimize-lattice.h"
int main(int argc, char *argv[]) {
try {
@ -39,14 +41,18 @@ int main(int argc, char *argv[]) {
ParseOptions po(usage);
BaseFloat acoustic_scale = 1.0;
BaseFloat beam = 10.0;
bool minimize = false;
fst::DeterminizeLatticePrunedOptions opts; // Options used in DeterminizeLatticePruned--
// this options class does not have its own Register function as it's viewed as
// being more part of "fst world", so we register its elements independently.
opts.max_mem = 50000000;
opts.max_loop = 0; // was 500000;
po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods");
po.Register("acoustic-scale", &acoustic_scale,
"Scaling factor for acoustic likelihoods");
po.Register("beam", &beam, "Pruning beam [applied after acoustic scaling].");
po.Register("minimize", &minimize,
"If true, push and minimize after determinization");
opts.Register(&po);
po.Read(argc, argv);
@ -89,6 +95,11 @@ int main(int argc, char *argv[]) {
"(partial output will be pruned tighter than the specified beam.)";
n_warn++;
}
if (minimize) {
PushCompactLatticeStrings(&det_clat);
PushCompactLatticeWeights(&det_clat);
MinimizeCompactLattice(&det_clat);
}
fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &det_clat);
compact_lat_writer.Write(key, det_clat);
n_done++;


@ -1,6 +1,7 @@
// latbin/lattice-determinize.cc
// Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
// Copyright 2009-2012 Microsoft Corporation
// 2012-2013 Johns Hopkins University (Author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
@ -23,6 +24,8 @@
#include "fstext/fstext-lib.h"
#include "lat/kaldi-lattice.h"
#include "lat/lattice-functions.h"
#include "lat/push-lattice.h"
#include "lat/minimize-lattice.h"
namespace kaldi {
@ -111,16 +114,27 @@ int main(int argc, char *argv[]) {
int32 max_loop = 500000;
BaseFloat delta = fst::kDelta;
bool prune = false;
bool minimize = false;
po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods");
po.Register("beam", &beam, "Pruning beam [applied after acoustic scaling]-- also used "
"to handle determinization failures, set --prune=false to disable routine pruning");
po.Register("acoustic-scale", &acoustic_scale,
"Scaling factor for acoustic likelihoods");
po.Register("beam", &beam,
"Pruning beam [applied after acoustic scaling]-- also used "
"to handle determinization failures, set --prune=false to "
"disable routine pruning");
po.Register("delta", &delta, "Tolerance used in determinization");
po.Register("prune", &prune, "If true, prune determinized lattices with the --beam option.");
po.Register("max-mem", &max_mem, "Maximum approximate memory usage in determinization (real usage might be many times this)");
po.Register("max-loop", &max_loop, "Option to detect a certain type of failure in lattice determinization (not critical)");
po.Register("beam-ratio", &beam_ratio, "Ratio by which to decrease beam if we reach the max-arcs.");
po.Register("num-loops", &num_loops, "Number of times to decrease beam by beam-ratio if determinization fails.");
po.Register("prune", &prune, "If true, prune determinized lattices "
"with the --beam option.");
po.Register("max-mem", &max_mem, "Maximum approximate memory usage in "
"determinization (real usage might be many times this)");
po.Register("max-loop", &max_loop, "Option to detect a certain "
"type of failure in lattice determinization (not critical)");
po.Register("beam-ratio", &beam_ratio, "Ratio by which to "
"decrease beam if we reach the max-arcs.");
po.Register("num-loops", &num_loops, "Number of times to "
"decrease beam by beam-ratio if determinization fails.");
po.Register("minimize", &minimize,
"If true, push and minimize after determinization");
po.Read(argc, argv);
@ -158,6 +172,11 @@ int main(int argc, char *argv[]) {
if (DeterminizeLatticeWrapper(lat, key, prune,
beam, beam_ratio, max_mem, max_loop,
delta, num_loops, &clat)) {
if (minimize) {
PushCompactLatticeStrings(&clat);
PushCompactLatticeWeights(&clat);
MinimizeCompactLattice(&clat);
}
fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &clat);
compact_lattice_writer.Write(key, clat);
n_done++;


@ -0,0 +1,110 @@
// latbin/lattice-minimize.cc
// Copyright 2013 Johns Hopkins University (Author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "fstext/fstext-lib.h"
#include "lat/kaldi-lattice.h"
#include "lat/minimize-lattice.h"
#include "lat/push-lattice.h"
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
typedef kaldi::int32 int32;
typedef kaldi::int64 int64;
using fst::SymbolTable;
using fst::VectorFst;
using fst::StdArc;
const char *usage =
"Minimize lattices, in CompactLattice format. Should be applied to\n"
"determinized lattices (e.g. produced with --determinize-lattice=true)\n"
"Note: by default this program\n"
"pushes the strings and weights prior to minimization."
"Usage: lattice-minimize [options] lattice-rspecifier lattice-wspecifier\n"
" e.g.: lattice-minimize ark:1.lats ark:2.lats\n";
ParseOptions po(usage);
bool push_strings = true;
bool push_weights = true;
po.Register("push-strings", &push_strings, "If true, push the strings in the "
"lattice to the start.");
po.Register("push-weights", &push_weights, "If true, push the weights in the "
"lattice to the start.");
po.Read(argc, argv);
if (po.NumArgs() != 2) {
po.PrintUsage();
exit(1);
}
std::string lats_rspecifier = po.GetArg(1),
lats_wspecifier = po.GetArg(2);
SequentialCompactLatticeReader clat_reader(lats_rspecifier);
CompactLatticeWriter clat_writer(lats_wspecifier);
int32 n_done = 0, n_err = 0;
for (; !clat_reader.Done(); clat_reader.Next()) {
std::string key = clat_reader.Key();
CompactLattice clat = clat_reader.Value();
KALDI_VLOG(1) << "Processing lattice for utterance " << key;
if (push_strings && !PushCompactLatticeStrings(&clat)) {
KALDI_WARN << "Failure in pushing lattice strings (bad lattice?), "
<< "for key " << key;
n_err++;
continue;
}
if (push_weights && !PushCompactLatticeWeights(&clat)) {
KALDI_WARN << "Failure in pushing lattice weights (bad lattice?),"
<< "for key " << key ;
n_err++;
continue;
}
if (!MinimizeCompactLattice(&clat)) {
KALDI_WARN << "Failure in minimizing lattice (bad lattice?),"
<< "for key " << key ;
n_err++;
continue;
}
if (clat.NumStates() == 0) {
KALDI_WARN << "Empty lattice for key " << key;
n_err++;
continue;
}
clat_writer.Write(key, clat);
n_done++;
}
KALDI_LOG << "Minimized " << n_done << " lattices, errors on " << n_err;
return (n_done != 0 ? 0 : 1);
} catch(const std::exception &e) {
std::cerr << e.what();
return -1;
}
}


@ -1,6 +1,7 @@
// latbin/lattice-oracle.cc
// Copyright 2011 Gilles Boulianne
// 2013 Johns Hopkins University (author: Daniel Povey)
//
// See ../../COPYING for clarification regarding multiple authors
//
@ -95,10 +96,10 @@ void CreateEditDistance(const fst::StdVectorFst &fst1,
typedef fst::StdArc StdArc;
typedef StdArc::Weight Weight;
typedef StdArc::Label Label;
Weight correct_cost(0.0);
Weight substitution_cost(1.0);
Weight insertion_cost(1.0);
Weight deletion_cost(1.0);
// create set of output symbols in fst1
std::vector<Label> fst1syms, fst2syms;
@ -108,17 +109,17 @@ void CreateEditDistance(const fst::StdVectorFst &fst1,
pfst->AddState();
pfst->SetStart(0);
for (size_t i = 0; i < fst1syms.size(); i++)
pfst->AddArc(0, StdArc(fst1syms[i], 0, deletion_cost, 0)); // deletions
for (size_t i = 0; i < fst2syms.size(); i++)
pfst->AddArc(0, StdArc(0, fst2syms[i], insertion_cost, 0)); // insertions
// stupid implementation O(N^2)
for (size_t i = 0; i < fst1syms.size(); i++) {
Label label1 = fst1syms[i];
for (size_t j = 0; j < fst2syms.size(); j++) {
Label label2 = fst2syms[j];
Weight cost( label1 == label2 ? correct_cost : substitution_cost);
pfst->AddArc(0, StdArc(label1, label2, cost, 0)); // substitutions
}
}
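To make the construction above concrete, here is the transducer it builds for a two-symbol alphabet (a sketch in comment form; costs follow correct_cost = 0 and the unit costs registered above):

// Single state 0 (start, also final).  For alphabet {a, b} the arcs are:
//   a:a /0      b:b /0        (matches, correct_cost)
//   a:b /1      b:a /1        (substitutions, substitution_cost)
//   a:<eps> /1  b:<eps> /1    (lattice symbol consumed, no reference symbol)
//   <eps>:a /1  <eps>:b /1    (reference symbol emitted, no lattice symbol)
// Composing lattice o E o reference therefore scores every alignment by its
// Levenshtein cost, and the shortest path gives the oracle alignment.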
@ -127,30 +128,33 @@ void CreateEditDistance(const fst::StdVectorFst &fst1,
}
void CountErrors(fst::StdVectorFst &fst,
int32 *correct,
int32 *substitutions,
int32 *insertions,
int32 *deletions,
int32 *num_words) {
typedef fst::StdArc::StateId StateId;
typedef fst::StdArc::Weight Weight;
*correct = *substitutions = *insertions = *deletions = *num_words = 0;
// go through the first complete path in fst (there should be only one)
StateId src = fst.Start();
while (fst.Final(src)== Weight::Zero()) { // while not final
for (fst::ArcIterator<fst::StdVectorFst> aiter(fst, src); !aiter.Done(); aiter.Next()) {
fst::StdArc arc = aiter.Value();
if (arc.ilabel == arc.olabel && arc.ilabel != 0) {
(*correct)++;
(*num_words)++;
} else if (arc.ilabel == 0 && arc.olabel != 0) {
(*deletions)++;
(*num_words)++;
} else if (arc.ilabel != 0 && arc.olabel == 0) {
(*insertions)++;
} else if (arc.ilabel != 0 && arc.olabel != 0) {
(*substitutions)++;
(*num_words)++;
} else {
KALDI_ASSERT(arc.ilabel == 0 && arc.olabel == 0);
}
src = arc.nextstate;
continue; // jump to next state
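In tabular form, the case analysis above maps composed arcs to error types as follows (the orientation, input = lattice side and output = reference side, is inferred from how num_words is accumulated):

//   (w : w), w != 0              -> correct,       num_words++
//   (0 : w), w != 0              -> deletion,      num_words++
//   (w : 0), w != 0              -> insertion
//   (v : w), v != w, both != 0   -> substitution,  num_words++
//   (0 : 0)                      -> epsilon arc, skipped (the KALDI_ASSERT)
// so num_words = correct + deletions + substitutions = reference length.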
@ -175,7 +179,7 @@ bool CheckFst(fst::StdVectorFst &fst, string name, string key) {
// Guoguo Chen added the implementation for option "write-lattices". This
// function does a depth first search on the lattice and remove the arcs that
// don't correspond to the oracle path. By "remove" I actually point the next
// state of the arc to some state that is not in the lattice and then use the
// openfst connect function. This makes things much easier.
bool GetOracleLattice(Lattice *oracle_lat,
@ -229,8 +233,9 @@ int main(int argc, char *argv[]) {
const char *usage =
"Finds the path having the smallest edit-distance between two lattices.\n"
"For efficiency put the smallest lattices first (for example reference strings).\n"
"Usage: lattice-oracle [options] test-lattice-rspecifier reference-rspecifier transcriptions-wspecifier\n"
" e.g.: lattice-oracle ark:ref.lats ark:1.tra ark:2.tra\n";
"Usage: lattice-oracle [options] test-lattice-rspecifier reference-rspecifier "
"transcriptions-wspecifier [edit-distance-wspecifier]\n"
" e.g.: lattice-oracle ark:lat.1 'ark:sym2int.pl -f 2- data/lang/words.txt <data/test/text' ark,t:-\n";
ParseOptions po(usage);
@ -238,27 +243,28 @@ int main(int argc, char *argv[]) {
std::string wild_syms_filename;
std::string lats_wspecifier;
po.Register("word-symbol-table", &word_syms_filename, "Symbol table for words [for debug output]");
po.Register("wildcard-symbols-list", &wild_syms_filename, "List of symbols that don't count as errors");
po.Register("write-lattices", &lats_wspecifier, "If supplied, write 1-best path as lattices to this wspecifier");
po.Read(argc, argv);
if (po.NumArgs() != 3) {
po.PrintUsage();
exit(1);
}
std::string lats_rspecifier = po.GetArg(1),
reference_rspecifier = po.GetArg(2),
transcriptions_wspecifier = po.GetArg(3);
// will read input as lattices
SequentialLatticeReader lattice_reader(lats_rspecifier);
RandomAccessInt32VectorReader reference_reader(reference_rspecifier);
Int32VectorWriter transcriptions_writer(transcriptions_wspecifier);
// Guoguo Chen added the implementation for option "write-lattices".
CompactLatticeWriter lats_writer(lats_wspecifier);
@ -276,7 +282,8 @@ int main(int argc, char *argv[]) {
}
int32 n_done = 0, n_fail = 0;
int32 tot_correct=0, tot_substitutions=0, tot_insertions=0, tot_deletions=0,
tot_words=0;
for (; !lattice_reader.Done(); lattice_reader.Next()) {
std::string key = lattice_reader.Key();
@ -284,9 +291,9 @@ int main(int argc, char *argv[]) {
cerr << "Lattice "<<key<<" read."<<endl;
// remove all weights while creating a standard FST
VectorFst<StdArc> lattice_fst;
ConvertLatticeToUnweightedAcceptor(lat, wild_syms, &lattice_fst);
CheckFst(lattice_fst, "lattice_fst_", key);
// TODO: map certain symbols (using an FST created with CreateMapFst())
@ -296,44 +303,49 @@ int main(int argc, char *argv[]) {
continue;
}
const std::vector<int32> &reference = reference_reader.Value(key);
VectorFst<StdArc> reference_fst;
MakeLinearAcceptor(reference, &reference_fst);
CheckFst(reference_fst, "reference_fst_", key);
// recreate edit distance fst if necessary
fst::StdVectorFst edit_distance_fst;
CreateEditDistance(lattice_fst, reference_fst, &edit_distance_fst);
// compose with edit distance transducer
VectorFst<StdArc> edit_ref_fst;
fst::Compose(edit_distance_fst, reference_fst, &edit_ref_fst);
CheckFst(edit_ref_fst, "composed_", key);
// make sure composed FST is input sorted
fst::ArcSort(&edit_ref_fst, fst::StdILabelCompare());
// compose with previous result
VectorFst<StdArc> result_fst;
fst::Compose(lattice_fst, edit_ref_fst, &result_fst);
CheckFst(result_fst, "result_", key);
// find out best path
VectorFst<StdArc> best_path;
fst::ShortestPath(result_fst, &best_path);
CheckFst(best_path, "best_path_", key);
if (best_path.Start() == fst::kNoStateId) {
KALDI_WARN << "Best-path failed for key " << key;
n_fail++;
} else {
// count errors
int32 correct, substitutions, insertions, deletions, num_words;
CountErrors(best_path, &correct, &substitutions, &insertions, &deletions, &num_words);
int32 toterrs = substitutions + insertions + deletions;
KALDI_LOG << "%WER " << (100.*toterrs) / num_words << " [ " << toterrs
<< " / " << num_words << ", " << insertions << " insertions, " << deletions
<< " deletions, " << substitutions << " sub ]";
tot_correct += correct;
tot_substitutions += substitutions;
tot_insertions += insertions;
tot_deletions += deletions;
tot_words += num_words;
std::vector<int32> oracle_words;
std::vector<int32> reference_words;
@ -354,7 +366,8 @@ int main(int argc, char *argv[]) {
for (size_t i = 0; i < reference_words.size(); i++) {
std::string s = word_syms->Find(reference_words[i]);
if (s == "")
KALDI_ERR << "Word-id " << reference_words[i] <<" not in symbol table.";
KALDI_ERR << "Word-id " << reference_words[i]
<< " not in symbol table.";
std::cerr << s << ' ';
}
std::cerr << '\n';
@ -367,8 +380,10 @@ int main(int argc, char *argv[]) {
if (lats_wspecifier != "") {
Lattice oracle_lat = lat;
LatticeArc::StateId bad_state = oracle_lat.AddState();
if (!GetOracleLattice(&oracle_lat, oracle_words,
bad_state, oracle_lat.Start(), 0))
KALDI_WARN << "Failed to find the oracle path in the original "
<< "lattice: " << key;
CompactLattice oracle_clat;
ConvertLattice(oracle_lat, &oracle_clat);
lats_writer.Write(key, oracle_clat);
@ -377,9 +392,13 @@ int main(int argc, char *argv[]) {
n_done++;
}
if (word_syms) delete word_syms;
int32 tot_errs = tot_substitutions + tot_deletions + tot_insertions;
KALDI_LOG << "Overall %WER " << (100.*tot_errs)/tot_words << " [ "
<< tot_errs << " / " << tot_words << ", " << tot_insertions
<< " insertions, " << tot_deletions << " deletions, "
<< tot_substitutions << " substitutions ]";
KALDI_LOG << "Scored " << n_done << " lattices, " << n_fail
<< " not present in ref.";
} catch(const std::exception &e) {
std::cerr << e.what();
return -1;
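Condensed, the oracle computation in the per-utterance loop above is the following FST pipeline (a sketch reusing CreateEditDistance from this file; wildcard mapping and error handling omitted):

// best_path accepts the lattice word sequence closest to the reference;
// its arc labels are what CountErrors consumes.
void ComputeOraclePath(const fst::StdVectorFst &lattice_fst,    // unweighted acceptor from the lattice
                       const fst::StdVectorFst &reference_fst,  // linear acceptor for the reference
                       fst::StdVectorFst *best_path) {
  fst::StdVectorFst edit_distance_fst, edit_ref_fst, result_fst;
  CreateEditDistance(lattice_fst, reference_fst, &edit_distance_fst);
  fst::Compose(edit_distance_fst, reference_fst, &edit_ref_fst);
  fst::ArcSort(&edit_ref_fst, fst::StdILabelCompare());  // composition needs sorted labels
  fst::Compose(lattice_fst, edit_ref_fst, &result_fst);
  fst::ShortestPath(result_fst, best_path);  // cheapest alignment = oracle
}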


@ -37,13 +37,22 @@ int main(int argc, char *argv[]) {
const char *usage =
"Push lattices, in CompactLattice format, so that the strings are as\n"
"close to the start as possible. Does not affect the weights. This can\n"
"be helpful prior to word-alignment.\n"
"close to the start as possible, and the lowest cost weight for each\n"
"state except the start state is (0, 0). This can be helpful prior to\n"
"word-alignment (in this case, only strings need to be pushed)\n"
"\n"
"Usage: lattice-push [options] lattice-rspecifier lattice-wspecifier\n"
" e.g.: lattice-push ark:1.lats ark:2.lats\n";
ParseOptions po(usage);
bool push_strings = true;
bool push_weights = true;
po.Register("push-strings", &push_strings, "If true, push the strings in the "
"lattice to the start.");
po.Register("push-weights", &push_weights, "If true, push the weights in the "
"lattice to the start.");
po.Read(argc, argv);
@ -66,18 +75,25 @@ int main(int argc, char *argv[]) {
std::string key = clat_reader.Key();
CompactLattice clat = clat_reader.Value();
KALDI_VLOG(1) << "Processing lattice for utterance " << key;
if (push_strings && !PushCompactLatticeStrings(&clat)) {
KALDI_WARN << "Failure in pushing lattice strings (bad lattice?), "
<< "for key " << key;
n_err++;
continue;
}
if (push_weights && !PushCompactLatticeWeights(&clat)) {
KALDI_WARN << "Failure in pushing lattice weights (bad lattice?),"
<< "for key " << key ;
n_err++;
continue;
}
if (clat.NumStates() == 0) {
KALDI_WARN << "Empty lattice for key " << key;
n_err++;
continue;
}
clat_writer.Write(key, clat);
n_done++;
}
KALDI_LOG << "Pushed " << n_done << " lattices, errors on " << n_err;
return (n_done != 0 ? 0 : 1);


@ -44,10 +44,11 @@ void CompressedMatrix::CopyFromMat(
// we need to ensure that the percentile_0 through percentile_100
// are in strictly increasing order.
float min_value = mat.Min(), max_value = mat.Max();
if (max_value == min_value)
max_value = min_value + (1.0 + fabs(min_value)); // ensure it's strictly
// greater than min_value,
// even if matrix is
// constant.
global_header.min_value = min_value;
global_header.range = max_value - min_value;
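The guard matters because range is the divisor in the per-element quantization; a hypothetical helper mirroring FloatToUint16 (illustrative, not the verbatim implementation) makes that visible:

// Maps value into [0, 65535] relative to [min_value, min_value + range];
// with range == 0 this would divide by zero, hence the guard above.
uint16 QuantizeToUint16(float min_value, float range, float value) {
  float f = (value - min_value) / range;
  if (f < 0.0f) f = 0.0f;  // clamp: value may lie outside the
  if (f > 1.0f) f = 1.0f;  // [min, min + range] interval
  return static_cast<uint16>(f * 65535.0f + 0.5f);
}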
@ -125,17 +126,25 @@ void CompressedMatrix::ComputeColHeader(
if (num_rows >= 5) {
int quarter_nr = num_rows/4;
// std::sort(sdata.begin(), sdata.end());
// The elements at positions 0, quarter_nr,
// 3*quarter_nr, and num_rows-1 need to be in sorted order.
std::nth_element(sdata.begin(), sdata.begin() + quarter_nr, sdata.end());
// Now, sdata.begin() + quarter_nr contains the element that would appear
// in sorted order, in that position.
std::nth_element(sdata.begin(), sdata.begin(), sdata.begin() + quarter_nr);
// Now, sdata.begin() and sdata.begin() + quarter_nr contain the elements
// that would appear at those positions in sorted order.
std::nth_element(sdata.begin() + quarter_nr + 1,
sdata.begin() + (3*quarter_nr), sdata.end());
// Now, sdata.begin(), sdata.begin() + quarter_nr, and sdata.begin() +
// 3*quarter_nr, contain the elements that would appear at those positions
// in sorted order.
std::nth_element(sdata.begin() + (3*quarter_nr) + 1, sdata.end() - 1,
sdata.end());
// Now, sdata.begin(), sdata.begin() + quarter_nr, and sdata.begin() +
// 3*quarter_nr, and sdata.end() - 1, contain the elements that would appear
// at those positions in sorted order.
header->percentile_0 = FloatToUint16(global_header, sdata[0]);
header->percentile_25 = std::max<uint16>(
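The chained std::nth_element calls above replace the commented-out full sort; each call only partially orders its range so that one more of the four percentile positions holds the element a full sort would put there. A self-contained illustration (standalone example, not Kaldi code):

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> v = {9, 1, 7, 3, 8, 2, 6, 4, 5, 0};
  size_t q = v.size() / 4;  // plays the role of quarter_nr
  // After each call, v[nth] is correct, with smaller elements to its left
  // and larger ones to its right.
  std::nth_element(v.begin(), v.begin() + q, v.end());
  std::nth_element(v.begin(), v.begin(), v.begin() + q);          // minimum
  std::nth_element(v.begin() + q + 1, v.begin() + 3 * q, v.end());
  std::nth_element(v.begin() + 3 * q + 1, v.end() - 1, v.end());  // maximum
  std::printf("%g %g %g %g\n", v[0], v[q], v[3 * q], v.back());   // prints: 0 2 6 9
  return 0;
}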
@ -228,7 +237,7 @@ void CompressedMatrix::CompressColumn(
unsigned char *byte_data) {
ComputeColHeader(global_header, data, stride,
num_rows, header);
float p0 = Uint16ToFloat(global_header, header->percentile_0),
p25 = Uint16ToFloat(global_header, header->percentile_25),
p75 = Uint16ToFloat(global_header, header->percentile_75),
@ -248,6 +257,9 @@ void* CompressedMatrix::AllocateData(int32 num_bytes) {
return reinterpret_cast<void*>(new float[(num_bytes/3) + 4]);
}
#define DEBUG_COMPRESSED_MATRIX 0 // Must be zero for Kaldi to work; use 1 only
// for debugging.
void CompressedMatrix::Write(std::ostream &os, bool binary) const {
if (binary) { // Binary-mode write:
WriteToken(os, binary, "CM");
@ -264,12 +276,13 @@ void CompressedMatrix::Write(std::ostream &os, bool binary) const {
}
} else {
// In text mode, just use the same format as a regular matrix.
// This is not compressed.
#if DEBUG_COMPRESSED_MATRIX == 0
Matrix<BaseFloat> temp_mat(this->NumRows(), this->NumCols(),
kUndefined);
this->CopyToMat(&temp_mat);
temp_mat.Write(os, binary);
#else
// Text-mode writing. Only really useful for debug, but we'll implement it.
if (data_ == NULL) {
os << 0.0 << ' ' << 0.0 << ' ' << 0 << ' ' << 0 << '\n';
@ -288,7 +301,9 @@ void CompressedMatrix::Write(std::ostream &os, bool binary) const {
for (int32 j = 0; j < h.num_rows; j++, c++)
os << static_cast<int>(*c) << ' ';
os << '\n';
}
}
#endif
}
if (os.fail())
KALDI_ERR << "Error writing compressed matrix to stream.";
@ -316,11 +331,12 @@ void CompressedMatrix::Read(std::istream &is, bool binary) {
is.read(reinterpret_cast<char*>(data_) + sizeof(GlobalHeader),
remaining_size);
} else { // Text-mode read.
#if DEBUG_COMPRESSED_MATRIX == 0
Matrix<BaseFloat> temp;
temp.Read(is, binary);
this->CopyFromMat(temp);
#else
// The old reading code...
GlobalHeader h;
is >> h.min_value >> h.range >> h.num_rows >> h.num_cols;
if (is.fail())
@ -346,7 +362,8 @@ void CompressedMatrix::Read(std::istream &is, bool binary) {
assert(i >= 0 && i <= 255);
*c = static_cast<unsigned char>(i);
}
}
#endif
}
if (is.fail())
KALDI_ERR << "Failed to read data.";


@ -123,7 +123,6 @@ class CompressedMatrix {
uint16 percentile_100;
};
template<typename Real>
static void CompressColumn(const GlobalHeader &global_header,
const Real *data, MatrixIndexT stride,


@ -1346,7 +1346,7 @@ template<typename Real> static void UnitTestEig() {
Pinv.Invert();
Matrix<Real> D(dimM, dimM);
CreateEigenvalueMatrix(real_eigs, imag_eigs, &D);
// check that M = P D P^{-1}.
Matrix<Real> tmp(dimM, dimM);
tmp.AddMatMat(1.0, P, kNoTrans, D, kNoTrans, 0.0); // tmp = P * D
@ -3838,6 +3838,22 @@ template<typename Real> static void UnitTestCompressedMatrix() {
Matrix<Real> diff(M2);
diff.AddMat(-1.0, M);
{ // Check that when compressing a matrix that has already been compressed,
// and uncompressing, we get the same answer.
CompressedMatrix cmat2(M2);
Matrix<Real> M3(cmat.NumRows(), cmat.NumCols());
cmat2.CopyToMat(&M3);
if (!M2.ApproxEqual(M3, 1.0e-05)) {
KALDI_LOG << "cmat is: ";
cmat.Write(std::cout, false);
KALDI_LOG << "cmat2 is: ";
cmat2.Write(std::cout, false);
KALDI_ERR << "Matrices differ " << M2 << " vs. " << M3 << ", M2 range is "
<< M2.Min() << " to " << M2.Max() << ", M3 range is "
<< M3.Min() << " to " << M3.Max();
}
}
// test CopyRowToVec
for (MatrixIndexT i = 0; i < num_rows; i++) {
Vector<Real> V(num_cols);
@ -3891,6 +3907,7 @@ template<typename Real> static void UnitTestCompressedMatrix() {
InitKaldiInputStream(ins, &binary_in);
cmat2.Read(ins, binary_in);
}
#if 1
{ // check that compressed-matrix can be read as matrix.
bool binary_in;
std::ifstream ins("tmpf", std::ios_base::in | std::ios_base::binary);
@ -3900,6 +3917,7 @@ template<typename Real> static void UnitTestCompressedMatrix() {
Matrix<Real> mat2(cmat2);
AssertEqual(mat1, mat2);
}
#endif
Matrix<Real> M3(cmat2.NumRows(), cmat2.NumCols());
cmat2.CopyToMat(&M3);


@ -1,6 +1,6 @@
// nnet/nnet-example.cc
// Copyright 2012-2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
@ -34,7 +34,8 @@ void NnetTrainingExample::Write(std::ostream &os, bool binary) const {
WriteBasicType(os, binary, labels[i].second);
}
WriteToken(os, binary, "<InputFrames>");
CompressedMatrix compressed(input_frames);
compressed.Write(os, binary); // can be read as regular Matrix.
WriteToken(os, binary, "<LeftContext>");
WriteBasicType(os, binary, left_context);
WriteToken(os, binary, "<SpkInfo>");
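The space saving comes from quantizing each element to roughly one byte; a sketch of the intended round trip (hypothetical helper names; that Matrix::Read accepts the compressed stream is taken from the comment above and the matrix unit test in this commit):

#include "matrix/compressed-matrix.h"
#include "matrix/kaldi-matrix.h"

// Writer side: compress before writing, as NnetTrainingExample::Write now does.
void WriteFrames(std::ostream &os, bool binary,
                 const kaldi::Matrix<kaldi::BaseFloat> &input_frames) {
  kaldi::CompressedMatrix compressed(input_frames);  // lossy, ~1 byte/element
  compressed.Write(os, binary);
}

// Reader side: unchanged -- the regular Matrix reader uncompresses on the fly.
void ReadFrames(std::istream &is, bool binary,
                kaldi::Matrix<kaldi::BaseFloat> *input_frames) {
  input_frames->Read(is, binary);
}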


@ -8,7 +8,6 @@ LDLIBS += $(CUDA_LDLIBS)
BINFILES = nnet-train-xent-hardlab-perutt \
nnet-train-xent-hardlab-frmshuff \
nnet-train-mse-tgtmat-frmshuff \
nnet-train-mmi-sequential \
nnet-train-mpe-sequential \


@ -35,7 +35,7 @@ int main(int argc, char *argv[]) {
ParseOptions po(usage);
bool binary_write = true;
po.Register("binary", &binary_write, "Write output in binary mode");
po.Read(argc, argv);


@ -55,7 +55,7 @@ int main(int argc, char *argv[]) {
int32 use_gpu_id=-2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#else
int32 use_gpu_id=0;
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
#endif


@ -1,393 +0,0 @@
// nnetbin/nnet-mpe.cc
// Copyright 2011-2013 Karel Vesely; Arnab Ghoshal
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "tree/context-dep.h"
#include "hmm/transition-model.h"
#include "fstext/fstext-lib.h"
#include "decoder/faster-decoder.h"
#include "decoder/decodable-matrix.h"
#include "lat/kaldi-lattice.h"
#include "lat/lattice-functions.h"
#include "nnet/nnet-component.h"
#include "nnet/nnet-activation.h"
#include "nnet/nnet-nnet.h"
#include "util/timer.h"
#include "cudamatrix/cu-device.h"
namespace kaldi {
namespace nnet1 {
void LatticeAcousticRescore(const Matrix<BaseFloat> &log_like,
const TransitionModel &trans_model,
const std::vector<int32> state_times,
Lattice *lat) {
kaldi::uint64 props = lat->Properties(fst::kFstProperties, false);
if (!(props & fst::kTopSorted))
KALDI_ERR << "Input lattice must be topologically sorted.";
KALDI_ASSERT(!state_times.empty());
std::vector<std::vector<int32> > time_to_state(log_like.NumRows());
for (size_t i = 0; i < state_times.size(); i++) {
KALDI_ASSERT(state_times[i] >= 0);
if (state_times[i] < log_like.NumRows()) // end state may be past this..
time_to_state[state_times[i]].push_back(i);
else
KALDI_ASSERT(state_times[i] == log_like.NumRows()
&& "There appears to be lattice/feature mismatch.");
}
for (int32 t = 0; t < log_like.NumRows(); t++) {
for (size_t i = 0; i < time_to_state[t].size(); i++) {
int32 state = time_to_state[t][i];
for (fst::MutableArcIterator<Lattice> aiter(lat, state); !aiter.Done();
aiter.Next()) {
LatticeArc arc = aiter.Value();
int32 trans_id = arc.ilabel;
if (trans_id != 0) { // Non-epsilon input label on arc
int32 pdf_id = trans_model.TransitionIdToPdf(trans_id);
arc.weight.SetValue2(-log_like(t, pdf_id) + arc.weight.Value2());
aiter.SetValue(arc);
}
}
}
}
}
} // namespace nnet1
} // namespace kaldi
int main(int argc, char *argv[]) {
using namespace kaldi;
using namespace kaldi::nnet1;
typedef kaldi::int32 int32;
try {
const char *usage =
"Perform iteration of Neural Network MPE/sMBR training by stochastic "
"gradient descent.\n"
"Usage: nnet-mpe [options] <model-in> <transition-model-in> "
"<feature-rspecifier> <den-lat-rspecifier> <ali-rspecifier> [<model-out>]\n"
"e.g.: \n"
" nnet-mpe nnet.init trans.mdl scp:train.scp scp:denlats.scp ark:train.ali "
"nnet.iter1\n";
ParseOptions po(usage);
bool binary = false,
crossvalidate = false;
po.Register("binary", &binary, "Write output in binary mode");
po.Register("cross-validate", &crossvalidate,
"Perform cross-validation (don't backpropagate)");
BaseFloat learn_rate = 0.00001,
momentum = 0.0,
l2_penalty = 0.0,
l1_penalty = 0.0;
po.Register("learn-rate", &learn_rate, "Learning rate");
po.Register("momentum", &momentum, "Momentum");
po.Register("l2-penalty", &l2_penalty, "L2 penalty (weight decay)");
po.Register("l1-penalty", &l1_penalty, "L1 penalty (promote sparsity)");
std::string feature_transform, class_frame_counts, silence_phones_str;
po.Register("feature-transform", &feature_transform,
"Feature transform Neural Network");
po.Register("class-frame-counts", &class_frame_counts,
"Class frame counts to compute the class priors");
po.Register("silence-phones", &silence_phones_str, "Colon-separated list "
"of integer id's of silence phones, e.g. 46:47");
BaseFloat acoustic_scale = 1.0,
lm_scale = 1.0,
old_acoustic_scale = 0.0;
po.Register("acoustic-scale", &acoustic_scale,
"Scaling factor for acoustic likelihoods");
po.Register("lm-scale", &lm_scale,
"Scaling factor for \"graph costs\" (including LM costs)");
po.Register("old-acoustic-scale", &old_acoustic_scale,
"Add in the scores in the input lattices with this scale, rather "
"than discarding them.");
bool do_smbr = false;
po.Register("do-smbr", &do_smbr, "Use state-level accuracies instead of "
"phone accuracies.");
#if HAVE_CUDA == 1
kaldi::int32 use_gpu_id=-2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID "
"(-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#endif
po.Read(argc, argv);
if (po.NumArgs() != 6-(crossvalidate?1:0)) {
po.PrintUsage();
exit(1);
}
std::string model_filename = po.GetArg(1),
transition_model_filename = po.GetArg(2),
feature_rspecifier = po.GetArg(3),
den_lat_rspecifier = po.GetArg(4),
ref_ali_rspecifier = po.GetArg(5);
std::string target_model_filename;
if (!crossvalidate) {
target_model_filename = po.GetArg(6);
}
std::vector<int32> silence_phones;
if (!kaldi::SplitStringToIntegers(silence_phones_str, ":", false,
&silence_phones))
KALDI_ERR << "Invalid silence-phones string " << silence_phones_str;
kaldi::SortAndUniq(&silence_phones);
if (silence_phones.empty())
KALDI_LOG << "No silence phones specified.";
// Select the GPU
#if HAVE_CUDA == 1
if (use_gpu_id > -2)
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
#endif
Nnet nnet_transf;
if (feature_transform != "") {
nnet_transf.Read(feature_transform);
}
Nnet nnet;
nnet.Read(model_filename);
// using activations directly: remove softmax, if present
if (nnet.Layer(nnet.LayerCount()-1)->GetType() == Component::kSoftmax) {
KALDI_LOG << "Removing softmax from the nnet " << model_filename;
nnet.RemoveLayer(nnet.LayerCount()-1);
} else {
KALDI_LOG << "The nnet was without softmax " << model_filename;
}
nnet.SetLearnRate(learn_rate, NULL);
nnet.SetMomentum(momentum);
nnet.SetL2Penalty(l2_penalty);
nnet.SetL1Penalty(l1_penalty);
TransitionModel trans_model;
ReadKaldiObject(transition_model_filename, &trans_model);
SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
RandomAccessLatticeReader den_lat_reader(den_lat_rspecifier);
RandomAccessInt32VectorReader ref_ali_reader(ref_ali_rspecifier);
CuMatrix<BaseFloat> feats, feats_transf, nnet_out, nnet_diff;
Matrix<BaseFloat> nnet_out_h, nnet_diff_h;
// Read the class-counts, compute priors
CuVector<BaseFloat> log_priors;
if (class_frame_counts != "") {
Vector<BaseFloat> tmp_priors;
Input in;
in.OpenTextMode(class_frame_counts);
tmp_priors.Read(in.Stream(), false);
in.Close();
// create inv. priors, or log inv priors
BaseFloat sum = tmp_priors.Sum();
tmp_priors.Scale(1.0 / sum);
tmp_priors.ApplyLog();
// push priors to GPU
log_priors.Resize(tmp_priors.Dim());
log_priors.CopyFromVec(tmp_priors);
}
Timer time;
double time_now = 0;
KALDI_LOG << (crossvalidate?"CROSSVALIDATE":"TRAINING") << " STARTED";
int32 num_done = 0, num_no_ref_ali = 0, num_no_den_lat = 0,
num_other_error = 0;
kaldi::int64 total_frames = 0;
double total_frame_acc = 0.0, utt_frame_acc;
// do per-utterance processing
for (; !feature_reader.Done(); feature_reader.Next()) {
std::string utt = feature_reader.Key();
if (!den_lat_reader.HasKey(utt)) {
KALDI_WARN << "Utterance " << utt << ": found no lattice.";
num_no_den_lat++;
continue;
}
if (!ref_ali_reader.HasKey(utt)) {
KALDI_WARN << "Utterance " << utt << ": found no reference alignment.";
num_no_ref_ali++;
continue;
}
// 1) get the features, numerator alignment
const Matrix<BaseFloat> &mat = feature_reader.Value();
const std::vector<int32> &ref_ali = ref_ali_reader.Value(utt);
// check for temporal length of numerator alignments
if (static_cast<MatrixIndexT>(ref_ali.size()) != mat.NumRows()) {
KALDI_WARN << "Numerator alignment has wrong length "
<< ref_ali.size() << " vs. "<< mat.NumRows();
num_other_error++;
continue;
}
// 2) get the denominator lattice, preprocess
Lattice den_lat = den_lat_reader.Value(utt);
if (old_acoustic_scale != 1.0) {
fst::ScaleLattice(fst::AcousticLatticeScale(old_acoustic_scale),
&den_lat);
}
// sort it topologically if not already so
kaldi::uint64 props = den_lat.Properties(fst::kFstProperties, false);
if (!(props & fst::kTopSorted)) {
if (fst::TopSort(&den_lat) == false)
KALDI_ERR << "Cycles detected in lattice.";
}
// get the lattice length and times of states
vector<int32> state_times;
int32 max_time = kaldi::LatticeStateTimes(den_lat, &state_times);
// check for temporal length of denominator lattices
if (max_time != mat.NumRows()) {
KALDI_WARN << "Denominator lattice has wrong length " << max_time
<< " vs. " << mat.NumRows();
num_other_error++;
continue;
}
// 3) propagate the feature to get the log-posteriors (nnet w/o softmax)
// push features to GPU
feats = mat;
// possibly apply transform
nnet_transf.Feedforward(feats, &feats_transf);
// propagate through the nnet (assuming w/o softmax)
nnet.Propagate(feats_transf, &nnet_out);
// subtract the log_priors
if (log_priors.Dim() > 0) {
nnet_out.AddVecToRows(-1.0, log_priors);
}
// transfer it back to the host
int32 num_frames = nnet_out.NumRows(),
num_pdfs = nnet_out.NumCols();
nnet_out_h.Resize(num_frames, num_pdfs, kUndefined);
nnet_out.CopyToMat(&nnet_out_h);
// 4) rescore the lattice
LatticeAcousticRescore(nnet_out_h, trans_model, state_times, &den_lat);
if (acoustic_scale != 1.0 || lm_scale != 1.0)
fst::ScaleLattice(fst::LatticeScale(lm_scale, acoustic_scale), &den_lat);
// 5) get the posteriors
vector< std::map<int32, char> > arc_accs;
arc_accs.resize(ref_ali.size());
kaldi::Posterior post;
if (do_smbr) { // use state-level accuracies, i.e. sMBR estimation
for (size_t i = 0; i < ref_ali.size(); i++) {
int32 pdf = trans_model.TransitionIdToPdf(ref_ali[i]);
arc_accs[i][pdf] = 1;
}
utt_frame_acc = LatticeForwardBackwardSmbr(den_lat, trans_model,
arc_accs, silence_phones,
&post);
} else { // use phone-level accuracies, i.e. regular MPE
for (size_t i = 0; i < ref_ali.size(); i++) {
int32 phone = trans_model.TransitionIdToPhone(ref_ali[i]);
arc_accs[i][phone] = 1;
}
utt_frame_acc = kaldi::LatticeForwardBackwardMpe(den_lat, trans_model,
arc_accs, &post,
silence_phones);
}
// 6) convert the Posterior to a matrix
nnet_diff_h.Resize(num_frames, num_pdfs, kSetZero);
for (int32 t = 0; t < post.size(); t++) {
for (int32 arc = 0; arc < post[t].size(); arc++) {
int32 pdf = trans_model.TransitionIdToPdf(post[t][arc].first);
nnet_diff_h(t, pdf) -= post[t][arc].second;
}
}
KALDI_VLOG(1) << "Processed lattice for utterance " << num_done + 1
<< " (" << utt << "): found " << den_lat.NumStates()
<< " states and " << fst::NumArcs(den_lat) << " arcs.";
KALDI_VLOG(1) << "Utterance " << utt << ": Average frame accuracy = "
<< (utt_frame_acc/num_frames) << " over " << num_frames
<< " frames.";
// 9) backpropagate through the nnet
if (!crossvalidate) {
nnet_diff = nnet_diff_h;
nnet.Backpropagate(nnet_diff, NULL);
}
// increase time counter
total_frame_acc += utt_frame_acc;
total_frames += num_frames;
num_done++;
if (num_done % 100 == 0) {
time_now = time.Elapsed();
KALDI_VLOG(1) << "After " << num_done << "utterances: time elapsed = "
<< time_now/60 << " min; processed " << total_frames/time_now
<< " frames per second.";
}
}
if (!crossvalidate) {
// add the softmax layer back before writing
KALDI_LOG << "Appending the softmax " << target_model_filename;
nnet.AppendLayer(new Softmax(nnet.OutputDim(),nnet.OutputDim(),&nnet));
//store the nnet
nnet.Write(target_model_filename, binary);
}
time_now = time.Elapsed();
KALDI_LOG << (crossvalidate?"CROSSVALIDATE":"TRAINING") << " FINISHED; "
<< "Time taken = " << time_now/60 << " min; processed "
<< (total_frames/time_now) << " frames per second.";
KALDI_LOG << "Done " << num_done << " files, "
<< num_no_ref_ali << " with no reference alignments, "
<< num_no_den_lat << " with no lattices, "
<< num_other_error << " with other errors.";
KALDI_LOG << "Overall average frame-accuracy is "
<< (total_frame_acc/total_frames) << " over " << total_frames
<< " frames.";
#if HAVE_CUDA == 1
CuDevice::Instantiate().PrintProfile();
#endif
return 0;
} catch(const std::exception &e) {
std::cerr << e.what();
return -1;
}
}


@ -102,7 +102,7 @@ int main(int argc, char *argv[]) {
NnetTrainOptions trn_opts; trn_opts.learn_rate=0.00001;
trn_opts.Register(&po);
bool binary = true;
po.Register("binary", &binary, "Write output in binary mode");
std::string feature_transform;
@ -134,6 +134,9 @@ int main(int argc, char *argv[]) {
kaldi::int32 use_gpu_id=-2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID "
"(-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#else
int32 use_gpu_id=0;
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
#endif
po.Read(argc, argv);


@ -100,7 +100,7 @@ int main(int argc, char *argv[]) {
NnetTrainOptions trn_opts; trn_opts.learn_rate=0.00001;
trn_opts.Register(&po);
bool binary = true;
po.Register("binary", &binary, "Write output in binary mode");
std::string feature_transform;
@ -133,6 +133,9 @@ int main(int argc, char *argv[]) {
kaldi::int32 use_gpu_id=-2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID "
"(-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#else
int32 use_gpu_id=0;
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
#endif
po.Read(argc, argv);


@ -46,7 +46,7 @@ int main(int argc, char *argv[]) {
NnetTrainOptions trn_opts;
trn_opts.Register(&po);
bool binary = true,
crossvalidate = false,
randomize = true;
po.Register("binary", &binary, "Write output in binary mode");
@ -64,6 +64,9 @@ int main(int argc, char *argv[]) {
#if HAVE_CUDA==1
int32 use_gpu_id=-2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#else
int32 use_gpu_id=0;
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
#endif
po.Read(argc, argv);


@ -1,223 +0,0 @@
// nnetbin/nnet-train-xent-hardlab-frmshuff-prior.cc
// Copyright 2011-2013 Karel Vesely, Brno University of Technology
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "nnet/nnet-trnopts.h"
#include "nnet/nnet-nnet.h"
#include "nnet/nnet-loss-prior.h"
#include "nnet/nnet-cache.h"
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "util/timer.h"
#include "cudamatrix/cu-device.h"
int main(int argc, char *argv[]) {
using namespace kaldi;
using namespace kaldi::nnet1;
try {
const char *usage =
"Perform one iteration of Neural Network training by stochastic gradient descent.\n"
"Usage: nnet-train-xent-hardlab-frmshuff-prior [options] <model-in> <feature-rspecifier> <alignments-rspecifier> [<model-out>]\n"
"e.g.: \n"
" nnet-train-xent-hardlab-frmshuff-prior nnet.init scp:train.scp ark:train.ali nnet.iter1\n";
ParseOptions po(usage);
NnetTrainOptions trn_opts;
trn_opts.Register(&po);
bool binary = false,
crossvalidate = false,
randomize = true;
po.Register("binary", &binary, "Write output in binary mode");
po.Register("cross-validate", &crossvalidate, "Perform cross-validation (don't backpropagate)");
po.Register("randomize", &randomize, "Perform the frame-level shuffling within the Cache::");
std::string feature_transform;
po.Register("feature-transform", &feature_transform, "Feature transform in Nnet format");
int32 bunchsize=512, cachesize=32768, seed=777;
po.Register("bunchsize", &bunchsize, "Size of weight update block");
po.Register("cachesize", &cachesize, "Size of cache for frame level shuffling (max 8388479)");
std::string prior_rxfile;
po.Register("prior", &prior_rxfile, "Priors of the training data to scale down gradients of represented PDFs [REQUIRED]");
BaseFloat prior_softener = 1000; // ie. use uniform prior (disable reweighting)
BaseFloat prior_silence_amount = 1.0; // ie. disable silence downscaling (use all the silence data available)
po.Register("prior-softener", &prior_softener, "Prior softener, scales uniform part added to prior before doing the inverse");
po.Register("prior-silence-amount", &prior_silence_amount, "Define how much of ``effective silence data'' should be used for training, (1.0 will bypass silence scaling)");
int32 prior_silence_numpdf = 5;
po.Register("prior-silence-numpdf", &prior_silence_numpdf, "Number of initial PDFs which model the silence");
#if HAVE_CUDA==1
int32 use_gpu_id=-2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#endif
po.Read(argc, argv);
if (po.NumArgs() != 4-(crossvalidate?1:0)) {
po.PrintUsage();
exit(1);
}
std::string model_filename = po.GetArg(1),
feature_rspecifier = po.GetArg(2),
alignments_rspecifier = po.GetArg(3);
std::string target_model_filename;
if (!crossvalidate) {
target_model_filename = po.GetArg(4);
}
//set the seed to the pre-defined value
srand(seed);
using namespace kaldi;
using namespace kaldi::nnet1;
typedef kaldi::int32 int32;
//Select the GPU
#if HAVE_CUDA==1
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
#endif
Nnet nnet_transf;
if(feature_transform != "") {
nnet_transf.Read(feature_transform);
}
Nnet nnet;
nnet.Read(model_filename);
nnet.SetTrainOptions(trn_opts);
kaldi::int64 total_frames = 0;
SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
RandomAccessInt32VectorReader alignments_reader(alignments_rspecifier);
Cache cache;
cachesize = (cachesize/bunchsize)*bunchsize; // ensure divisibility
cache.Init(cachesize, bunchsize);
XentPrior xent;
if(prior_rxfile != "") {
xent.ReadPriors(prior_rxfile, prior_softener, prior_silence_amount, prior_silence_numpdf);
} else {
KALDI_ERR << "Missing prior file!";
}
CuMatrix<BaseFloat> feats, feats_transf, nnet_in, nnet_out, obj_diff;
std::vector<int32> targets;
Timer time;
double time_now = 0;
double time_next = 0;
KALDI_LOG << (crossvalidate?"CROSSVALIDATE":"TRAINING") << " STARTED";
int32 num_done = 0, num_no_alignment = 0, num_other_error = 0, num_cache = 0;
while (1) {
// fill the cache
while (!cache.Full() && !feature_reader.Done()) {
std::string utt = feature_reader.Key();
if (!alignments_reader.HasKey(utt)) {
num_no_alignment++;
} else {
// get feature alignment pair
const Matrix<BaseFloat> &mat = feature_reader.Value();
const std::vector<int32> &alignment = alignments_reader.Value(utt);
// check the length of the data
if ((int32)alignment.size() != mat.NumRows()) {
KALDI_WARN << "Alignment has wrong length, ali "<< (alignment.size()) << " vs. feats "<< (mat.NumRows()) << ", " << utt;
num_other_error++;
} else { //length OK
// push features to GPU
feats.Resize(mat.NumRows(), mat.NumCols(), kUndefined);
feats.CopyFromMat(mat);
// possibly apply transform
nnet_transf.Feedforward(feats, &feats_transf);
// add to cache
cache.AddData(feats_transf, alignment);
num_done++;
}
}
Timer t_features;
feature_reader.Next();
time_next += t_features.Elapsed();
//report the speed
if (num_done % 1000 == 0) {
time_now = time.Elapsed();
KALDI_VLOG(1) << "After " << num_done << " utterances: time elapsed = "
<< time_now/60 << " min; processed " << total_frames/time_now
<< " frames per second.";
}
}
// randomize
if (!crossvalidate && randomize) {
cache.Randomize();
}
// report
KALDI_VLOG(1) << "Cache #" << ++num_cache << " "
<< (cache.Randomized()?"[RND]":"[NO-RND]")
<< " segments: " << num_done
<< " frames: " << static_cast<double>(total_frames)/360000 << "h";
// train with the cache
while (!cache.Empty()) {
// get block of feature/target pairs
cache.GetBunch(&nnet_in, &targets);
// train
nnet.Propagate(nnet_in, &nnet_out);
xent.EvalVec(nnet_out, targets, &obj_diff);
if (!crossvalidate) {
nnet.Backpropagate(obj_diff, NULL);
}
total_frames += nnet_in.NumRows();
}
// stop training when no more data
if (feature_reader.Done()) break;
}
if (!crossvalidate) {
nnet.Write(target_model_filename, binary);
}
KALDI_LOG << (crossvalidate?"CROSSVALIDATE":"TRAINING") << " FINISHED "
<< time.Elapsed()/60 << "min, fps" << total_frames/time.Elapsed()
<< ", feature wait " << time_next << "s";
KALDI_LOG << "Done " << num_done << " files, " << num_no_alignment
<< " with no alignments, " << num_other_error
<< " with other errors.";
KALDI_LOG << xent.Report();
#if HAVE_CUDA==1
CuDevice::Instantiate().PrintProfile();
#endif
return 0;
} catch(const std::exception &e) {
std::cerr << e.what();
return -1;
}
}


@ -42,7 +42,7 @@ int main(int argc, char *argv[]) {
NnetTrainOptions trn_opts;
trn_opts.Register(&po);
bool binary = true,
crossvalidate = false,
randomize = true;
po.Register("binary", &binary, "Write output in binary mode");
@ -63,6 +63,9 @@ int main(int argc, char *argv[]) {
#if HAVE_CUDA==1
int32 use_gpu_id=-2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#else
int32 use_gpu_id=0;
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
#endif
po.Read(argc, argv);


@ -41,7 +41,7 @@ int main(int argc, char *argv[]) {
NnetTrainOptions trn_opts;
trn_opts.Register(&po);
bool binary = true,
crossvalidate = false;
po.Register("binary", &binary, "Write output in binary mode");
po.Register("cross-validate", &crossvalidate, "Perform cross-validation (don't backpropagate)");
@ -52,6 +52,9 @@ int main(int argc, char *argv[]) {
#if HAVE_CUDA==1
int32 use_gpu_id=-2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#else
int32 use_gpu_id=0;
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
#endif
po.Read(argc, argv);


@ -1,76 +0,0 @@
// nnetbin/nnet-trim-last-n-layers.cc
// Copyright 2012 Karel Vesely
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "nnet/nnet-nnet.h"
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
using namespace kaldi::nnet1;
typedef kaldi::int32 int32;
const char *usage =
"Trim ending part of the MLP\n"
"Usage: nnet-trim-last-n-layers [options] <model-in> <model-out>\n"
"e.g.:\n"
" nnet-trim-last-n-layers --binary=false nnet.mdl nnet_txt.mdl\n";
bool binary_write = false;
ParseOptions po(usage);
po.Register("binary", &binary_write, "Write output in binary mode");
int32 trim_num = 0;
po.Register("n", &trim_num, "Number of transforms to be trimmed (include simgoid/softmax)");
po.Read(argc, argv);
if (po.NumArgs() != 2) {
po.PrintUsage();
exit(1);
}
std::string model_in_filename = po.GetArg(1),
model_out_filename = po.GetArg(2);
Nnet nnet;
{
bool binary_read;
Input ki(model_in_filename, &binary_read);
nnet.Read(ki.Stream(), binary_read);
}
{
Output ko(model_out_filename, binary_write);
int32 write_num_layers = nnet.LayerCount() - trim_num;
nnet.WriteFrontLayers(ko.Stream(), binary_write, write_num_layers);
}
KALDI_LOG << "Written model to " << model_out_filename;
return 0;
} catch(const std::exception& e) {
std::cerr << e.what() << '\n';
return -1;
}
}


@ -35,7 +35,7 @@ int main(int argc, char *argv[]) {
" rbm-convert-to-nnet --binary=false rbm.mdl nnet.mdl\n";
bool binary_write = true;
ParseOptions po(usage);
po.Register("binary", &binary_write, "Write output in binary mode");


@ -68,6 +68,9 @@ int main(int argc, char *argv[]) {
#if HAVE_CUDA==1
int32 use_gpu_id=-2 ;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#else
int32 use_gpu_id=0;
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
#endif
po.Read(argc, argv);


@ -5,7 +5,12 @@ include ../kaldi.mk
# The PA_RingBuffer interface is internal and is not exported in the .so library
# so we have to link against the static one
ifneq "$(wildcard ../../tools/portaudio/install/lib/libportaudio.a)" ""
EXTRA_LDLIBS = ../../tools/portaudio/install/lib/libportaudio.a
else
EXTRA_LDLIBS = ../../tools/portaudio/install/lib64/libportaudio.a
endif
UNAME=$(shell uname)
ifeq ($(UNAME), Linux)


@ -178,7 +178,11 @@ void SplitStatsByMap(const BuildTreeStatsType &stats, const EventMap &e, std::ve
const EventType &evec = iter->first;
EventAnswerType ans;
if (!e.Map(evec, &ans)) // this is an error--could not map it.
KALDI_ERR << "SplitStatsByMap: could not map event vector " << EventTypeToString(evec);
KALDI_ERR << "SplitStatsByMap: could not map event vector " << EventTypeToString(evec)
<< "if error seen during tree-building, check that "
<< "--context-width and --central-position match stats, "
<< "and that phones that are context-independent (CI) during "
<< "stats accumulation do not share roots with non-CI phones.";
size = std::max(size, (size_t)(ans+1));
}
stats_out->resize(size);