Adding fMPE scripts; changes to fMPE code.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@772 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2012-02-21 16:08:04 +00:00
Родитель 138a71faf4
Коммит bad94ae0bc
56 изменённых файлов: 1655 добавлений и 2080 удалений

21
COPYING
Просмотреть файл

@ -1,4 +1,24 @@
Update to legal notice, made Feb. 2012. We would like to clarify that we
are using a convention where multiple names in the Apache copyright headers,
for example
// Copyright 2009-2012 Yanmin Qian Arnab Ghoshal
does not necessarily signify joint ownership of copyright of that file, except
in cases where all those names were present in the original release made in
March 2011-- you can use the version history to work this out, if this matters
to you. Instead, we intend that those contributors who later modified the file,
agree to release their changes under the Apache license, but do not claim to
jointly own the copyright of the original material (which would require an agreement
with the original contributors). The conventional way of signifying
this is to duplicate the Apache headers at the top of each file each time
a change is made by a different author, but this would quickly become impractical.
The original legal notice is below. Note: we are continuing to modify it by
adding the names of new contributors.
---
Legal Notices
Each of the files comprising Kaldi v1.0 have been separately licensed by
@ -18,6 +38,7 @@ Individual Contributors (in alphabetical order)
Arnab Ghoshal
Go Vivace Inc.
Mirko Hannemann
Navdeep Jaitly
Microsoft Corporation
Petr Motlicek
Ariya Rastrow

Просмотреть файл

@ -26,4 +26,7 @@ Recipes in progress:
sampling rate).
This directory is a work in progress.
gp: GlobalPhone. This is a multilingual speech corpus.
timit: TIMIT, which is an old corpus of carefully read speech.

Просмотреть файл

@ -28,7 +28,7 @@ exit 1;
# shorten to WAV to take out the empty files and those with compression errors.
# So set WORKDIR to someplace with enough disk space. That is where MFCCs will
# get created, as well as the FST versions of LMs.
WORKDIR=/path/with/disk/space
WORKDIR=/mnt/matylda6/jhu09/qpovey/temp_gp
cp -r conf local utils steps install.sh path.sh $WORKDIR
cd $WORKDIR
# INSTALLING REQUIRED TOOLS:
@ -39,7 +39,7 @@ cd $WORKDIR
{ echo "shorten and/or sox not found on PATH. Installing...";
install.sh }
local/gp_data_prep.sh --config-dir=$PWD/conf --corpus-dir=/path/to/GlobalPhone --lm-dir=/path/to/lms --work-dir=$WORKDIR
local/gp_data_prep.sh --config-dir=$PWD/conf --corpus-dir=/mnt/matylda2/data/GLOBALPHONE --lm-dir=/path/to/lms --work-dir=$WORKDIR
# On Eddie: local/gp_data_prep.sh --config-dir=$PWD/conf --corpus-dir=$PWD/corpus --lm-dir=$PWD/corpus/language_models --work-dir=$PWD
local/gp_format_data.sh --hmm-proto=conf/topo.proto --work-dir=$PWD

Просмотреть файл

@ -5,38 +5,38 @@ for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | scripts/best_wer.sh;
# monophone; delta+accel
exp/mono/decode/wer_4:%WER 9.830049 [ 1232 / 12533, 143 ins, 289 del, 800 sub ]
# First triphone pass; delta+accel
exp/tri1/decode/wer_6:%WER 3.694247 [ 463 / 12533, 69 ins, 100 del, 294 sub ]
exp/tri1/decode/wer_6:%WER 3.893721 [ 488 / 12533, 69 ins, 96 del, 323 sub ]
# Second triphone pass; delta+accel
exp/tri2a/decode/wer_7:%WER 3.638395 [ 456 / 12533, 61 ins, 107 del, 288 sub ]
exp/tri2a/decode/wer_7:%WER 3.486795 [ 437 / 12533, 65 ins, 91 del, 281 sub ]
# [as tri2a, but] LDA+MLLT
exp/tri2b/decode/wer_7:%WER 3.534668 [ 443 / 12533, 74 ins, 88 del, 281 sub ]
exp/tri2b/decode/wer_6:%WER 3.359132 [ 421 / 12533, 73 ins, 71 del, 277 sub ]
# LDA + exponential transform (note: this is with speaker adaptation)
exp/tri2c/decode/wer_5:%WER 2.848480 [ 357 / 12533, 62 ins, 61 del, 234 sub ]
exp/tri2c/decode/wer_5:%WER 2.905492 [ 364 / 12528, 68 ins, 59 del, 237 sub ]
# LDA+MLLT+MMI.
exp/tri3a/decode/wer_7:%WER 3.502753 [ 439 / 12533, 75 ins, 83 del, 281 sub ]
exp/tri3a/decode/wer_7:%WER 3.084052 [ 386 / 12516, 54 ins, 67 del, 265 sub ]
# LDA+MLLT+boosted MMI [note: errors are not identical, although WER is same]
exp/tri3b/decode/wer_7:%WER 3.454879 [ 433 / 12533, 75 ins, 80 del, 278 sub ]
exp/tri3b/decode/wer_5:%WER 3.155960 [ 395 / 12516, 74 ins, 50 del, 271 sub ]
# LDA+MLLT+MCE
exp/tri3c/decode/wer_7:%WER 3.183595 [ 399 / 12533, 62 ins, 79 del, 258 sub ]
exp/tri3c/decode/wer_6:%WER 3.047953 [ 382 / 12533, 56 ins, 69 del, 257 sub ]
# LDA+MLLT+SAT
exp/tri3d/decode/wer_6:%WER 2.553259 [ 320 / 12533, 43 ins, 63 del, 214 sub ]
exp/tri3d/decode/wer_7:%WER 2.234102 [ 280 / 12533, 35 ins, 62 del, 183 sub ]
# LDA+MLLT+SAT+MMI
exp/tri4a/decode/wer_6:%WER 2.473470 [ 310 / 12533, 43 ins, 62 del, 205 sub ]
exp/tri4a/decode/wer_6:%WER 2.146334 [ 269 / 12533, 37 ins, 43 del, 189 sub ]
# LDA+MLLT+SAT, extra phase of building on top of 3d (no help)
exp/tri4d/decode/wer_5:%WER 2.800606 [ 351 / 12533, 47 ins, 68 del, 236 sub ]
exp/tri4d/decode/wer_5:%WER 2.457512 [ 308 / 12533, 50 ins, 54 del, 204 sub ]
# LDA+MLLT + SGMM with speaker vectors
exp/sgmm3d/decode/wer_4:%WER 2.186228 [ 274 / 12533, 41 ins, 42 del, 191 sub ]
exp/sgmm3d/decode/wer_6:%WER 2.305912 [ 289 / 12533, 53 ins, 52 del, 184 sub ]
# LDA+ET + SGMM with speaker vectors.
exp/sgmm3e/decode/wer_5:%WER 2.242081 [ 281 / 12533, 44 ins, 47 del, 190 sub ]
exp/sgmm3e/decode/wer_4:%WER 2.042608 [ 256 / 12533, 39 ins, 38 del, 179 sub ]
# LDA+MLLT+SAT + SGMM with speaker vectors.
exp/sgmm4f/decode/wer_5:%WER 2.226123 [ 279 / 12533, 56 ins, 49 del, 174 sub ]
exp/sgmm4f/decode/wer_7:%WER 1.970797 [ 247 / 12533, 36 ins, 56 del, 155 sub ]
# + FMLLR on top of it all.
exp/sgmm4f/decode_fmllr/wer_6:%WER 2.202186 [ 276 / 12533, 39 ins, 59 del, 178 sub ]
exp/sgmm4f/decode_fmllr/wer_5:%WER 1.954839 [ 245 / 12533, 40 ins, 47 del, 158 sub ]
# System combination via lattices: combine tri1 and tri2a
exp/combine_1_2a/decode/wer_6:%WER 3.518711 [ 441 / 12533, 62 ins, 97 del, 282 sub ]
# System combination via lattices: combine sgmm4f and tri3d.
exp/combine_sgmm4f_tri3d/decode/wer_5:%WER 2.082502 [ 261 / 12533, 36 ins, 48 del, 177 sub ]
exp/combine_sgmm4f_tri3d/decode/wer_5:%WER 1.763345 [ 221 / 12533, 32 ins, 42 del, 147 sub ]
# System combination via lattices: combine sgmm4f and tri4a.
exp/combine_sgmm4f_tri4a/decode/wer_5:%WER 2.082502 [ 261 / 12533, 37 ins, 49 del, 175 sub ]
exp/combine_sgmm4f_tri4a/decode/wer_6:%WER 1.715471 [ 215 / 12533, 31 ins, 39 del, 145 sub ]

Просмотреть файл

@ -1,5 +1,5 @@
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation
# Copyright 2010-2012 Microsoft Corporation Daniel Povey
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -27,13 +27,24 @@
# ali, final.mdl, final.mat
boost=0 # boosting constant, for boosted MMI.
tau=100 # Tau value.
tau=200 # Tau value.
merge=true # if true, cancel num and den counts as described in
# the boosted MMI paper.
if [ $1 == "--boost" ]; then # e.g. "--boost 0.05"
shift;
boost=$1;
shift;
fi
for x in `seq 4`; do
if [ $1 == "--boost" ]; then # e.g. "--boost 0.05"
boost=$2;
shift 2;
fi
if [ $1 == "--smooth-to-model" ]; then
shift;
smooth_to_model=true
fi
if [ $1 == "--tau" ]; then # e.g. "--tau 200"
tau=$2
shift 2;
fi
done
if [ $# != 4 ]; then
echo "Usage: steps/train_lda_etc_mmi.sh <data-dir> <lang-dir> <ali-dir> <exp-dir>"
@ -99,7 +110,7 @@ scripts/mkgraph.sh $dir/lang $alidir $dir/dengraph || exit 1;
echo "Making denominator lattices"
if false; then ##temp
rm $dir/.error 2>/dev/null
for n in 0 1 2 3; do
gmm-latgen-simple --beam=$beam --lattice-beam=$latticebeam --acoustic-scale=$acwt \
@ -113,45 +124,33 @@ if [ -f $dir/.error ]; then
echo "Error creating denominator lattices"
exit 1;
fi
fi ##temp
# No need to create "numerator" alignments/lattices: we just use the
# alignments in $alidir.
echo "Note: ignore absolute offsets in the objective function values"
echo "This is caused by not having LM, lexicon or transition-probs in numerator"
x=0;
while [ $x -lt $num_iters ]; do
echo "Iteration $x: getting denominator stats."
# Get denominator stats...
if [ $x -eq 0 ]; then
( lattice-to-post --acoustic-scale=$acwt "ark:gunzip -c $dir/lat?.gz|" ark:- | \
gmm-acc-stats $dir/$x.mdl "$feats" ark:- $dir/den_acc.$x.acc ) \
2>$dir/acc_den.$x.log || exit 1;
else # Need to recompute acoustic likelihoods...
( gmm-rescore-lattice $dir/$x.mdl "ark:gunzip -c $dir/lat?.gz|" "$feats" ark:- | \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- | \
gmm-acc-stats $dir/$x.mdl "$feats" ark:- $dir/den_acc.$x.acc ) \
2>$dir/acc_den.$x.log || exit 1;
fi
echo "Iteration $x: getting numerator stats."
# Get numerator stats...
gmm-acc-stats-ali $dir/$x.mdl "$feats" ark:$alidir/ali $dir/num_acc.$x.acc \
2>$dir/acc_num.$x.log || exit 1;
echo "Iteration $x: getting stats."
( gmm-rescore-lattice $dir/$x.mdl "ark:gunzip -c $dir/lat?.gz|" "$feats" ark:- | \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- | \
sum-post --merge=$merge --scale1=-1 \
ark:- "ark,s,cs:ali-to-post ark:$alidir/ali ark:- |" ark:- | \
gmm-acc-stats2 $dir/$x.mdl "$feats" ark:- $dir/num_acc.$x.acc $dir/den_acc.$x.acc ) \
2>$dir/acc.$x.log || exit 1;
( gmm-est-gaussians-ebw $dir/$x.mdl "gmm-ismooth-stats --tau=$tau $dir/num_acc.$x.acc $dir/num_acc.$x.acc -|" \
$dir/den_acc.$x.acc - | \
# This tau is only used for smoothing "to the model".
( gmm-est-gaussians-ebw --tau=$tau $dir/$x.mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc - | \
gmm-est-weights-ebw - $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl ) \
2>$dir/update.$x.log || exit 1;
den=`grep Overall $dir/acc_den.$x.log | grep lattice-to-post | awk '{print $7}'`
num=`grep Overall $dir/acc_num.$x.log | grep gmm-acc-stats-ali | awk '{print $11}'`
diff=`perl -e "print ($num * $acwt - $den);"`
impr=`grep Overall $dir/update.$x.log | head -1 | awk '{print $10;}'`
impr=`perl -e "print ($impr * $acwt);"` # auxf impr normalized by multiplying by
# kappa, so it's comparable to an objective-function change.
echo On iter $x, objf was $diff, auxf improvement was $impr | tee $dir/objf.$x.log
objf=`grep Overall $dir/acc.$x.log | grep gmm-acc-stats2 | awk '{print $10}'`
nf=`grep Overall $dir/acc.$x.log | grep gmm-acc-stats2 | awk '{print $12}'`
impr=`grep Overall $dir/log/update.$x.log | head -1 | awk '{print $10*$12;}'`
impr=`perl -e "print ($impr/$nf);"` # renormalize by "real" #frames, to correct
# for the canceling of stats.
echo On iter $x, objf was $objf, auxf improvement from MMI was $impr | tee $dir/objf.$x.log
rm $dir/*.acc
x=$[$x+1]
done

Просмотреть файл

@ -0,0 +1 @@
--use-energy=false # only non-default option.

Просмотреть файл

@ -0,0 +1,22 @@
<Topology>
<TopologyEntry>
<ForPhones>
NONSILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.75 <Transition> 1 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.75 <Transition> 2 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 2 0.75 <Transition> 3 0.25 </State>
<State> 3 </State>
</TopologyEntry>
<TopologyEntry>
<ForPhones>
SILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.25 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 3 <PdfClass> 3 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 4 <PdfClass> 4 <Transition> 4 0.25 <Transition> 5 0.75 </State>
<State> 5 </State>
</TopologyEntry>
</Topology>

Просмотреть файл

@ -103,4 +103,3 @@ done
# example of showing the alignments:
# show-alignments data/lang/phones.txt $dir/30.mdl ark:$dir/cur.ali | head -4

Просмотреть файл

@ -1,7 +1,8 @@
. path.sh
local/timit_data_prep.sh /ais/gobi2/speech/TIMIT
local/timit_train_lms.sh data/local
local/timit_format_data.sh
#local/timit_data_prep.sh /ais/gobi2/speech/TIMIT
local/timit_data_prep.sh /mnt/matylda2/data/TIMIT || exit 1;
local/timit_train_lms.sh data/local || exit 1;
local/timit_format_data.sh || exit 1;
# mfccdir should be some place with a largish disk where you
# want to store MFCC features.
@ -9,13 +10,13 @@ mfccdir=mfccs
steps/make_mfcc.sh data/train exp/make_mfcc/train $mfccdir 4
for test in train test dev ; do
steps/make_mfcc.sh data/$test exp/make_mfcc/$test $mfccdir 4
steps/make_mfcc.sh data/$test exp/make_mfcc/$test $mfccdir 4 || exit 1;
done
# train monophone system.
steps/train_mono.sh data/train data/lang exp/mono
steps/train_mono.sh data/train data/lang exp/mono || exit 1;
scripts/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph
scripts/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph || exit 1;
echo "Decoding test datasets."
for test in dev test ; do
steps/decode_deltas.sh exp/mono data/$test data/lang exp/mono/decode_$test &
@ -25,8 +26,7 @@ scripts/average_wer.sh exp/mono/decode_*/wer > exp/mono/wer
# Get alignments from monophone system.
echo "Creating training alignments to use to train other systems such as ANN-HMM."
steps/align_deltas.sh data/train data/lang exp/mono exp/mono_ali
steps/align_deltas.sh data/train data/lang exp/mono exp/mono_ali || exit 1;
echo "Creating dev alignments to use to train other systems such as ANN-HMM."
steps/align_deltas.sh data/dev data/lang exp/mono exp/mono_ali_dev
steps/align_deltas.sh data/dev data/lang exp/mono exp/mono_ali_dev || exit 1;

Просмотреть файл

@ -22,12 +22,15 @@ exp/tri2b/decode_tgpr_dev93_fromlats/wer_15:%WER 16.71 [ 1376 / 8234, 267 ins, 1
exp/tri2b/decode_tgpr_dev93_tg/wer_16:%WER 16.26 [ 1339 / 8234, 267 ins, 141 del, 931 sub ]
exp/tri2b/decode_tgpr_dev93_tg_biglm/wer_16:%WER 16.42 [ 1352 / 8234, 269 ins, 142 del, 941 sub ]
exp/tri2b/decode_tgpr_eval92/wer_16:%WER 11.54 [ 651 / 5643, 146 ins, 42 del, 463 sub ]
exp/tri2b/decode_tgpr_eval92/wer_17:%WER 11.45 [ 646 / 5643, 140 ins, 46 del, 460 sub ]
# +MMI
exp/tri2b_mmi/decode_tgpr_eval92/wer_16:%WER 11.08 [ 625 / 5643, 125 ins, 44 del, 456 sub ]
exp/tri2b_mmi/decode_tgpr_eval92/wer_14:%WER 10.63 [ 600 / 5643, 124 ins, 45 del, 431 sub ]
# +boosting
exp/tri2b_mmi_b0.1/decode_tgpr_eval92/wer_16:%WER 10.83 [ 611 / 5643, 122 ins, 43 del, 446 sub ]
exp/tri2b_mmi_b0.1/decode_tgpr_eval92/wer_16:%WER 10.69 [ 603 / 5643, 119 ins, 48 del, 436 sub ]
# +fMMI
exp/tri2b_fmmi_b0.1/decode_tgpr_eval92/wer_15:%WER 10.26 [ 579 / 5643, 111 ins, 39 del, 429 sub ]
# +MCE
exp/tri2b_mce/decode_tgpr_eval92/wer_16:%WER 11.15 [ 629 / 5643, 132 ins, 45 del, 452 sub ]
@ -69,8 +72,17 @@ exp/tri4b/decode_tgpr_dev93/wer_13:%WER 12.53 [ 1032 / 8234, 242 ins, 79 del, 71
exp/tri4b/decode_tgpr_eval92/wer_16:%WER 8.05 [ 454 / 5643, 119 ins, 23 del, 312 sub ]
# +MMI
exp/tri4b_mmi/decode_tgpr_dev93/wer_14:%WER 11.53 [ 949 / 8234, 203 ins, 82 del, 664 sub ]
exp/tri4b_mmi_b0.1/decode_tgpr_dev93/wer_16:%WER 11.45 [ 943 / 8234, 191 ins, 87 del, 665 sub ]
exp/tri4b_mmi/decode_tgpr_dev93/wer_12:%WER 11.28 [ 929 / 8234, 206 ins, 76 del, 647 sub ]
#+boosting
exp/tri4b_mmi_b0.1/decode_tgpr_dev93/wer_16:%WER 11.25 [ 926 / 8234, 176 ins, 94 del, 656 sub ]
# increasing beam from 13 to 15 to see effect.
exp/tri4b_mmi_b0.1/decode_tgpr_dev93_b15/wer_14:%WER 10.72 [ 883 / 8234, 172 ins, 84 del, 627 sub ]
exp/tri4b_mmi_b0.1/decode_tgpr_eval92/wer_14:%WER 7.34 [ 414 / 5643, 105 ins, 20 del, 289 sub ]
#+fMMI
exp/tri4b_fmmi_b0.1/decode_tgpr_dev93/wer_13:%WER 10.86 [ 894 / 8234, 167 ins, 89 del, 638 sub ]
exp/tri4b_fmmi_b0.1/decode_tgpr_eval92/wer_12:%WER 7.25 [ 409 / 5643, 111 ins, 14 del, 284 sub ]
# LDA+MLLT+SAT, SI-284, full retraining starting from 3b [c.f. 4b]
exp/tri4c/decode_tgpr_dev93/wer_16:%WER 12.10 [ 996 / 8234, 220 ins, 83 del, 693 sub ]

Просмотреть файл

@ -164,6 +164,18 @@ steps/train_lda_etc_mmi.sh --num-jobs 10 --boost 0.1 --cmd "$train_cmd" \
data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b exp/tri2b_mmi_b0.1
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt.sh exp/tri2b/graph_tgpr data/test_eval92 exp/tri2b_mmi_b0.1/decode_tgpr_eval92
# The next 3 commands train and test fMMI+MMI (on top of LDA+MLLT).
steps/train_dubm_lda_etc.sh --silence-weight 0.5 \
--num-jobs 10 --cmd "$train_cmd" 400 data/train_si84 \
data/lang exp/tri2b_ali_si84 exp/dubm2b
steps/train_lda_etc_mmi_fmmi.sh \
--num-jobs 10 --boost 0.1 --cmd "$train_cmd" \
data/train_si84 data/lang exp/tri2b_ali_si84 exp/dubm2b exp/tri2b_denlats_si84 \
exp/tri2b exp/tri2b_fmmi_b0.1
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt_fmpe.sh \
exp/tri2b/graph_tgpr data/test_eval92 exp/tri2b_fmmi_b0.1/decode_tgpr_eval92
steps/train_lda_etc_mce.sh --cmd "$train_cmd" --num-jobs 10 data/train_si84 data/lang \
exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b exp/tri2b_mce
scripts/decode.sh --num-jobs 10 --cmd "$decode_cmd" steps/decode_lda_mllt.sh \
@ -222,7 +234,8 @@ scripts/mkgraph.sh data/lang_test_tgpr exp/tri4b exp/tri4b/graph_tgpr
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt_sat.sh exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b/decode_tgpr_dev93
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt_sat.sh exp/tri4b/graph_tgpr data/test_eval92 exp/tri4b/decode_tgpr_eval92
# Train and test MMI, and boosted MMI, on tri4b.
# Train and test MMI, and boosted MMI, on tri4b (LDA+MLLT+SAT on
# all the data).
# Making num-jobs 40 as want to keep them under 4 hours long (or will fail
# on regular queue at BUT).
steps/align_lda_mllt_sat.sh --num-jobs 40 --cmd "$train_cmd" \
@ -235,6 +248,25 @@ scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_etc.sh exp/tri4b/graph_tg
steps/train_lda_etc_mmi.sh --boost 0.1 --num-jobs 40 --cmd "$train_cmd" \
data/train_si284 data/lang exp/tri4b_ali_si284 exp/tri4b_denlats_si284 exp/tri4b exp/tri4b_mmi_b0.1
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_etc.sh exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b_mmi_b0.1/decode_tgpr_dev93 exp/tri4b/decode_tgpr_dev93
scripts/decode.sh --opts "--beam 15" --cmd "$decode_cmd" steps/decode_lda_etc.sh exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b_mmi_b0.1/decode_tgpr_dev93_b15 exp/tri4b/decode_tgpr_dev93
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_etc.sh exp/tri4b/graph_tgpr data/test_eval92 exp/tri4b_mmi_b0.1/decode_tgpr_eval92 exp/tri4b/decode_tgpr_eval92
# Train fMMI+MMI system on top of 4b.
steps/train_dubm_lda_etc.sh --silence-weight 0.5 \
--num-jobs 40 --cmd "$train_cmd" 600 data/train_si284 \
data/lang exp/tri4b_ali_si284 exp/dubm4b
steps/train_lda_etc_mmi_fmmi.sh \
--num-jobs 40 --boost 0.1 --cmd "$train_cmd" \
data/train_si284 data/lang exp/tri4b_ali_si284 exp/dubm4b exp/tri4b_denlats_si284 \
exp/tri4b exp/tri4b_fmmi_b0.1
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_etc_fmpe.sh \
exp/tri4b/graph_tgpr data/test_eval92 exp/tri4b_fmmi_b0.1/decode_tgpr_eval92 \
exp/tri4b/decode_tgpr_eval92
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_etc_fmpe.sh \
exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b_fmmi_b0.1/decode_tgpr_dev93 \
exp/tri4b/decode_tgpr_dev93
# Train UBM, for SGMM system on top of LDA+MLLT.
steps/train_ubm_lda_etc.sh --num-jobs 10 --cmd "$train_cmd" \
@ -245,6 +277,7 @@ scripts/mkgraph.sh data/lang_test_tgpr exp/sgmm3c exp/sgmm3c/graph_tgpr
scripts/decode.sh --cmd "$decode_cmd" steps/decode_sgmm_lda_etc.sh exp/sgmm3c/graph_tgpr \
data/test_dev93 exp/sgmm3c/decode_tgpr_dev93
# Decode using 3 Gaussians (not 15) for gselect in 1st pass, for fast decoding.
scripts/decode.sh --opts "--first-pass-gselect 3" --cmd "$decode_cmd" \
steps/decode_sgmm_lda_etc.sh exp/sgmm3c/graph_tgpr data/test_dev93 exp/sgmm3c/decode_tgpr_dev93_gs3

Просмотреть файл

@ -62,7 +62,7 @@ fi
requirements="$mydata/feats.scp $srcdir/final.mdl $srcdir/final.mat $graphdir/HCLG.fst $transdir/$jobid.trans"
for f in $requirements; do
if [ ! -f $f ]; then
echo "decode_lda_mllt.sh: no such file $f";
echo "decode_lda_etc.sh: no such file $f";
exit 1;
fi
done

Просмотреть файл

@ -0,0 +1,73 @@
#!/bin/bash
# Decoding script for LDA + optionally MLLT + [some speaker-specific transforms]
# + fMPE.
# This decoding script takes as an argument a previous decoding directory where it
# can find some transforms.
if [ -f ./path.sh ]; then . ./path.sh; fi
numjobs=1
jobid=0
beam=13.0
rescore=false
for x in `seq 3`; do
if [ "$1" == "-j" ]; then
shift;
numjobs=$1;
jobid=$2;
shift 2;
fi
if [ "$1" == "--beam" ]; then
beam=$2;
shift 2;
fi
done
if [ $# != 4 ]; then
# Note: transform-dir has to be last because scripts/decode.sh expects decode-dir to be #3 arg.
echo "Usage: steps/decode_lda_etc.sh [-j num-jobs job-number] <graph-dir> <data-dir> <decode-dir> <transform-dir>"
echo " e.g.: steps/decode_lda_etc.sh -j 8 0 exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b_mmi/decode_tgpr_dev93 exp/tri4b/decode_tgpr_dev93"
exit 1;
fi
graphdir=$1
data=$2
dir=$3
transdir=$4
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
mkdir -p $dir
if [ $numjobs -gt 1 ]; then
mydata=$data/split$numjobs/$jobid
else
mydata=$data
fi
requirements="$mydata/feats.scp $srcdir/final.mdl $srcdir/final.fmpe $srcdir/final.mat $graphdir/HCLG.fst $transdir/$jobid.trans"
for f in $requirements; do
if [ ! -f $f ]; then
echo "decode_lda_etc_fmpe.sh: no such file $f";
exit 1;
fi
done
basefeats="ark:compute-cmvn-stats --spk2utt=ark:$mydata/spk2utt scp:$mydata/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$mydata/utt2spk ark:- scp:$mydata/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- | transform-feats --utt2spk=ark:$mydata/utt2spk ark:$transdir/$jobid.trans ark:- ark:- |"
# Get the Gaussian-selection info for the fMPE.
ngselect=2; # Just the 2 top Gaussians.
gmm-gselect --n=$ngselect $srcdir/final.fmpe "$basefeats" \
"ark:|gzip -c >$dir/gselect.$jobid.gz" 2>$dir/gselect.$jobid.log
# Now set up the fMPE features.
feats="$basefeats fmpe-apply-transform $srcdir/final.fmpe ark:- 'ark,s,cs:gunzip -c $dir/gselect.$jobid.gz|' ark:- |"
gmm-latgen-faster --max-active=7000 --beam=$beam --lattice-beam=6.0 \
--acoustic-scale=0.083333 \
--allow-partial=true --word-symbol-table=$graphdir/words.txt \
$srcdir/final.mdl $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.$jobid.gz" \
2> $dir/decode$jobid.log || exit 1;

Просмотреть файл

@ -0,0 +1,64 @@
#!/bin/bash
# Decoding script that works with a GMM model and the baseline
# [e.g. MFCC] features plus cepstral mean subtraction plus
# LDA+MLLT or similar transform, plus fMPE/FMMI.
# This script just generates lattices for a single broken-up
# piece of the data.
if [ -f ./path.sh ]; then . ./path.sh; fi
numjobs=1
jobid=0
rescore=false
if [ "$1" == "-j" ]; then
shift;
numjobs=$1;
jobid=$2;
shift; shift;
fi
if [ $# != 3 ]; then
echo "Usage: steps/decode_lda_mllt_fmpe.sh [-j num-jobs job-number] <graph-dir> <data-dir> <decode-dir>"
echo " e.g.: steps/decode_lda_mllt_fmpe.sh -j 8 0 exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b_fmmi/decode_dev93_tgpr"
exit 1;
fi
graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
mkdir -p $dir
if [ $numjobs -gt 1 ]; then
mydata=$data/split$numjobs/$jobid
else
mydata=$data
fi
requirements="$mydata/feats.scp $srcdir/final.mdl $srcdir/final.fmpe $srcdir/final.mat $graphdir/HCLG.fst"
for f in $requirements; do
if [ ! -f $f ]; then
echo "decode_lda_mllt_fmpe.sh: no such file $f";
exit 1;
fi
done
basefeats="ark:compute-cmvn-stats --spk2utt=ark:$mydata/spk2utt scp:$mydata/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$mydata/utt2spk ark:- scp:$mydata/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
# Get the Gaussian-selection info for the fMPE.
ngselect=2; # Just the 2 top Gaussians.
gmm-gselect --n=$ngselect $srcdir/final.fmpe "$basefeats" \
"ark:|gzip -c >$dir/gselect.$jobid.gz" 2>$dir/gselect.$jobid.log
# Now set up the fMPE features.
feats="$basefeats fmpe-apply-transform $srcdir/final.fmpe ark:- 'ark,s,cs:gunzip -c $dir/gselect.$jobid.gz|' ark:- |"
gmm-latgen-faster --max-active=7000 --beam=13.0 --lattice-beam=6.0 --acoustic-scale=0.083333 \
--allow-partial=true --word-symbol-table=$graphdir/words.txt \
$srcdir/final.mdl $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.$jobid.gz" \
2> $dir/decode.$jobid.log || exit 1;

Просмотреть файл

@ -0,0 +1,114 @@
#!/bin/bash
# This trains a diagonal-covariance UBM (i.e. just a global
# mixture of Gaussians, or GMM).
# Train UBM from a trained HMM/GMM system [with splice+LDA+[MLLT/ET/MLLT+SAT] features]
# Alignment directory is used for the CMN and transforms.
# A UBM is just a single mixture of Gaussians (full-covariance, in our case), that's trained
# on all the data. This will later be used in Subspace Gaussian Mixture Model (SGMM)
# training.
nj=4
cmd=scripts/run.pl
silweight=
for x in 1 2; do
if [ $1 == "--num-jobs" ]; then
shift
nj=$1
shift
fi
if [ $1 == "--cmd" ]; then
shift
cmd=$1
shift
fi
if [ $1 == "--silence-weight" ]; then
shift
silweight=$1 # e.g. to weight down silence in training.
shift
fi
done
if [ $# != 5 ]; then
echo "Usage: steps/train_ubm_lda_etc.sh <num-comps> <data-dir> <lang-dir> <ali-dir> <exp-dir>"
echo " e.g.: steps/train_ubm_lda_etc.sh 400 data/train_si84 data/lang exp/tri2b_ali_si84 exp/ubm3c"
exit 1;
fi
if [ -f path.sh ]; then . path.sh; fi
numcomps=$1
data=$2
lang=$3
alidir=$4
dir=$5
silphonelist=`cat $lang/silphones.csl`
mkdir -p $dir/log
if [ ! -d $data/split$nj -o $data/split$nj -ot $data/feats.scp ]; then
scripts/split_data.sh $data $nj
fi
n1=`get_splits.pl $nj | awk '{print $1}'`
[ -f $alidir/$n1.trans ] && echo "Using speaker transforms from $alidir"
for n in `get_splits.pl $nj`; do
featspart[$n]="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$alidir/$n.cmvn scp:$data/split$nj/$n/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
if [ -f $alidir/$n1.trans ]; then
featspart[$n]="${featspart[$n]} transform-feats --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$alidir/$n.trans ark:- ark:- |"
fi
if [ ! -z "$silweight" ]; then
weightspart[$n]="--weights='ark,s,cs:gunzip -c $alidir/$n.ali.gz | ali-to-post ark:- ark:- | weight-silence-post $silweight $silphonelist $alidir/final.mdl ark:- ark:- | post-to-weights ark:- ark:- |'"
fi
done
ngselect=50
intermediate=2000
if [ $[$numcomps*2] -gt $intermediate ]; then
intermediate=$[$numcomps*2];
fi
echo "Clustering model $alidir/final.mdl to get initial UBM"
# typically: --intermediate-numcomps=2000 --ubm-numcomps=400
if [ ! -s $dir/0.dubm ]; then
$cmd $dir/log/cluster.log \
init-ubm --intermediate-numcomps=$intermediate --ubm-numcomps=$numcomps \
--verbose=2 --fullcov-ubm=false $alidir/final.mdl $alidir/final.occs \
$dir/0.dubm || exit 1;
fi
rm $dir/.error 2>/dev/null
# First do Gaussian selection to 50 components, which will be used
# as the initial screen for all further passes.
for n in `get_splits.pl $nj`; do
$cmd $dir/log/gselect.$n.log \
gmm-gselect --n=$ngselect $dir/0.dubm "${featspart[$n]}" \
"ark:|gzip -c >$dir/gselect.$n.gz" &
done
wait
[ -f $dir/.error ] && echo "Error doing GMM selection" && exit 1;
for x in 0 1 2 3; do
echo "Pass $x"
for n in `get_splits.pl $nj`; do
$cmd $dir/log/acc.$x.$n.log \
gmm-global-acc-stats ${weightspart[$n]} "--gselect=ark,s,cs:gunzip -c $dir/gselect.$n.gz|" \
$dir/$x.dubm "${featspart[$n]}" $dir/$x.$n.acc || touch $dir/.error &
done
wait
[ -f $dir/.error ] && echo "Error accumulating stats for UBM estimation on pass $x" && exit 1;
lowcount_opt="--remove-low-count-gaussians=false"
[ $x -eq 3 ] && lowcount_opt= # Only remove low-count Gaussians on last iter-- keeps gselect info valid.
$cmd $dir/log/update.$x.log \
gmm-global-est $lowcount_opt --verbose=2 $dir/$x.dubm "gmm-global-sum-accs - $dir/$x.*.acc |" \
$dir/$[$x+1].dubm || exit 1;
rm $dir/$x.*.acc $dir/$x.dubm
done
rm $dir/gselect.*.gz
rm $dir/final.dubm 2>/dev/null
mv $dir/4.dubm $dir/final.dubm || exit 1;

Просмотреть файл

@ -20,10 +20,8 @@
# [something] may be MLLT, or ET, or MLLT + SAT. Any speaker-specific
# transforms are expected to be located in the alignment directory.
# This script never re-estimates any transforms, it just does model
# training. To make this faster, it initializes the model from the
# old system's model, i.e. for each p.d.f., it takes the best-match pdf
# from the old system (based on overlap of tree-stats counts), and
# uses that GMM to initialize the current GMM.
# training.
# Basically we are doing 4 iterations of Extended Baum-Welch (EBW)
# estimation, as described in Dan Povey's thesis, with a few differences:
# (i) we have the option of "boosting", as in "Boosted MMI", which increases
@ -47,7 +45,9 @@
niters=4
nj=4
boost=0.0
tau=100
tau=200
merge=true # if true, cancel num and den counts as described in
# the boosted MMI paper.
cmd=scripts/run.pl
acwt=0.1
stage=0
@ -69,6 +69,9 @@ for x in `seq 8`; do
if [ $1 == "--acwt" ]; then
shift; acwt=$1; shift
fi
if [ $1 == "--tau" ]; then
shift; tau=$1; shift
fi
if [ $1 == "--stage" ]; then
shift; stage=$1; shift
fi
@ -121,58 +124,60 @@ rm $dir/.error 2>/dev/null
cur_mdl=$srcdir/final.mdl
x=0
while [ $x -lt $niters ]; do
echo "Iteration $x: getting denominator stats."
# Get denominator stats... For simplicity we rescore the lattice
echo "Iteration $x: getting stats."
# Get denominator and numerator stats together... This involves
# merging the num and den posteriors, and (if $merge==true), canceling
# the +ve and -ve occupancies on each frame.
# For simplicity we rescore the lattice
# on all iterations, even though it shouldn't be necessary on the zeroth
# (but we want this script to work even if $srcdir doesn't contain the
# model used to generate the lattice).
# model used to generate the lattice).
if [ $stage -le $x ]; then
for n in `get_splits.pl $nj`; do
$cmd $dir/log/acc_den.$x.$n.log \
$cmd $dir/log/acc.$x.$n.log \
gmm-rescore-lattice $cur_mdl "${latspart[$n]}" "${featspart[$n]}" ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
gmm-acc-stats $cur_mdl "${featspart[$n]}" ark:- $dir/den_acc.$x.$n.acc \
|| touch $dir/.error &
sum-post --merge=$merge --scale1=-1 \
ark:- "ark,s,cs:gunzip -c $alidir/$n.ali.gz | ali-to-post ark:- ark:- |" ark:- \| \
gmm-acc-stats2 $cur_mdl "${featspart[$n]}" ark,s,cs:- \
$dir/num_acc.$x.$n.acc $dir/den_acc.$x.$n.acc || touch $dir/.error &
done
wait
[ -f $dir/.error ] && echo Error accumulating den stats on iter $x && exit 1;
[ -f $dir/.error ] && echo Error accumulating stats on iter $x && exit 1;
$cmd $dir/log/den_acc_sum.$x.log \
gmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1;
rm $dir/den_acc.$x.*.acc
echo "Iteration $x: getting numerator stats."
for n in `get_splits.pl $nj`; do
$cmd $dir/log/acc_num.$x.$n.log \
gmm-acc-stats-ali $cur_mdl "${featspart[$n]}" "ark:gunzip -c $alidir/$n.ali.gz|" \
$dir/num_acc.$x.$n.acc || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo Error accumulating num stats on iter $x && exit 1;
$cmd $dir/log/num_acc_sum.$x.log \
gmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1;
rm $dir/num_acc.$x.*.acc
# note: this tau value is for smoothing to model parameters;
# you need to use gmm-ismooth-stats to smooth to the ML stats,
# but anyway this script does canceling of num and den stats on
# each frame (as suggested in the Boosted MMI paper) which would
# make smoothing to ML impossible without accumulating extra stats.
$cmd $dir/log/update.$x.log \
gmm-est-gaussians-ebw $cur_mdl "gmm-ismooth-stats --tau=$tau $dir/num_acc.$x.acc $dir/num_acc.$x.acc -|" \
$dir/den_acc.$x.acc - \| \
gmm-est-gaussians-ebw --tau=$tau $cur_mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc - \| \
gmm-est-weights-ebw - $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1;
else
echo "not doing this iteration because --stage=$stage"
fi
cur_mdl=$dir/$[$x+1].mdl
# Some diagnostics
den=`grep Overall $dir/log/acc_den.$x.*.log | grep lattice-to-post | awk '{p+=$7*$9; nf+=$9;} END{print p/nf;}'`
num=`grep Overall $dir/log/acc_num.$x.*.log | grep gmm-acc-stats-ali | awk '{p+=$11*$13; nf+=$13;} END{print p/nf}'`
diff=`perl -e "print ($num * $acwt - $den);"`
impr=`grep Overall $dir/log/update.$x.log | head -1 | awk '{print $10;}'`
impr=`perl -e "print ($impr * $acwt);"` # auxf impr normalized by multiplying by
# kappa, so it's comparable to an objective-function change.
echo On iter $x, objf was $diff, auxf improvement was $impr | tee $dir/objf.$x.log
# Some diagnostics.. note, this objf is somewhat comparable to the
# MMI objective function divided by the acoustic weight, and differences in it
# are comparable to the auxf improvement printed by the update program.
objf=`grep Overall $dir/log/acc.$x.*.log | grep gmm-acc-stats2 | awk '{ p+=$10*$12; nf+=$12; } END{print p/nf;}'`
nf=`grep Overall $dir/log/acc.$x.*.log | grep gmm-acc-stats2 | awk '{ nf+=$12; } END{print nf;}'`
impr=`grep Overall $dir/log/update.$x.log | head -1 | awk '{print $10*$12;}'`
impr=`perl -e "print ($impr/$nf);"` # renormalize by "real" #frames, to correct
# for the canceling of stats.
echo On iter $x, objf was $objf, auxf improvement from MMI was $impr | tee $dir/objf.$x.log
x=$[$x+1]
done
echo "Succeeded with $niters iterations of MMI training (boosting factor = $boost)"
( cd $dir; ln -s $x.mdl final.mdl )
( cd $dir; rm final.mdl; ln -s $x.mdl final.mdl )

Просмотреть файл

@ -0,0 +1,236 @@
#!/bin/bash
# by Dan Povey, 2012. Apache.

# This script does MMI discriminative training, including
# feature-space (like fMPE) and model-space components.
# If you give the --boost option it does "boosted MMI" (BMMI).
# On the iterations of training it alternates feature-space
# and model-space training. We do 8 iterations in total--
# 4 of each type ((B)MMI, f(B)MMI)
# The features it uses are LDA + [something], where the something
# may be just a global transform like MLLT, or may also include
# speaker-specific transforms such as SAT. This script just uses
# transforms computed in the alignment directory, so it doesn't
# need to know what the transform type is (it isn't re-estimating
# them itself)

# Defaults for the configurable parameters; each can be overridden by the
# corresponding --option in the parsing loop below.
niters=8      # total number of iterations (alternating fMMI / MMI).
nj=4          # number of parallel jobs.
boost=0.0     # boosting factor; nonzero enables boosted MMI lattices.
lrate=0.01    # learning rate passed to fmpe-est.
tau=200 # Note: we're doing smoothing "to the previous iteration"
# --smooth-from-model so 200 seems like a more sensible default
# than 100. We smooth to the previous iteration because now
# we are discriminatively training the features (and not using
# the indirect differential), so it seems like it wouldn't make
# sense to use any element of ML.
ngauss=400    # #Gauss in the GMM used for the fMPE projections.
merge=true # if true, cancel num and den counts as described in
# the boosted MMI paper.
cmd=scripts/run.pl  # job-dispatch wrapper (run.pl = run locally).
acwt=0.1      # acoustic scale used when computing lattice posteriors.
stage=-1      # for restarting partial runs; iterations < stage are skipped.
# Parse command-line options.  As in other Kaldi scripts we make a fixed
# number of passes over the front of "$@"; each pass may consume several
# "--option value" pairs.
# Fix: "$1" is quoted in every test so that once the options are exhausted
# (or fewer than 8 passes' worth were given), the empty/unset $1 compares
# as an ordinary empty string instead of producing "[: unary operator
# expected" errors from the test builtin.
for x in `seq 8`; do
  if [ "$1" == "--num-jobs" ]; then
    shift; nj=$1; shift
  fi
  if [ "$1" == "--learning-rate" ]; then
    shift; lrate=$1; shift
  fi
  if [ "$1" == "--num-gauss" ]; then
    shift; ngauss=$1; shift # #Gauss in GMM for fMPE.
  fi
  if [ "$1" == "--num-iters" ]; then
    shift; niters=$1; shift
  fi
  if [ "$1" == "--boost" ]; then
    shift; boost=$1; shift
  fi
  if [ "$1" == "--cmd" ]; then
    shift; cmd=$1; shift
    [ -z "$cmd" ] && echo Empty argument to --cmd option && exit 1;
  fi
  if [ "$1" == "--acwt" ]; then
    shift; acwt=$1; shift
  fi
  if [ "$1" == "--tau" ]; then
    shift; tau=$1; shift
  fi
  if [ "$1" == "--stage" ]; then # used for finishing partial runs.
    shift; stage=$1; shift
  fi
done
# After option parsing, exactly 7 positional arguments must remain.
if [ $# != 7 ]; then
  echo "Usage: steps/train_lda_etc_mmi_fmmi.sh <data-dir> <lang-dir> <ali-dir> <dubm-dir> <denlat-dir> <model-dir> <exp-dir>"
  echo " e.g.: steps/train_lda_etc_mmi_fmmi.sh data/train_si84 data/lang exp/tri2b_ali_si84 exp/ubm2d exp/tri2b_denlats_si84 exp/tri2b exp/tri2b_fmmi"
  exit 1;
fi

if [ -f path.sh ]; then . path.sh; fi

data=$1
lang=$2
alidir=$3       # alignments (and possibly per-speaker transforms) come from here.
dubmdir=$4      # where diagonal UBM is.
denlatdir=$5    # denominator lattices.
srcdir=$6       # may be same model as in alidir, but may not be, e.g.
                # if you want to test MMI with different #iters.
dir=$7          # experiment (output) directory.
silphonelist=`cat $lang/silphones.csl`
ngselect=2;     # Just the 2 top Gaussians. Beyond that wouldn't make much
                # difference since the posteriors would be very small.
mkdir -p $dir/log

# Fix: this test inspects $srcdir, so the error message must name $srcdir
# (the original message misleadingly blamed the alignment dir $alidir).
if [ ! -f $srcdir/final.mdl -o ! -f $srcdir/final.mat ]; then
  echo "Error: model dir $srcdir does not contain one of final.mdl or final.mat"
  exit 1;
fi
# Copy the LDA/MLLT matrix and tree into the experiment dir so later
# decoding scripts can find them alongside the models.
cp $srcdir/final.mat $srcdir/tree $dir

# Probe one split id to see whether per-speaker transforms (e.g. fMLLR)
# exist in the alignment dir.
n=`get_splits.pl $nj | awk '{print $1}'`
if [ -f $alidir/$n.trans ]; then
  use_trans=true
  echo Using transforms from directory $alidir
else
  echo No transforms present in alignment directory: assuming speaker independent.
  use_trans=false
fi

# Note: ${basefeatspart[$n]} is the features before fMPE.
for n in `get_splits.pl $nj`; do
  basefeatspart[$n]="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$alidir/$n.cmvn scp:$data/split$nj/$n/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
  $use_trans && basefeatspart[$n]="${basefeatspart[$n]} transform-feats --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$alidir/$n.trans ark:- ark:- |"
  featspart[$n]="${basefeatspart[$n]}" # before 1st iter of fMPE..
  [ ! -f $denlatdir/lat.$n.gz ] && echo No such file $denlatdir/lat.$n.gz && exit 1;
  latspart[$n]="ark:gunzip -c $denlatdir/lat.$n.gz|"
  # note: in next line, doesn't matter which model we use, it's only used to map to phones.
  # If --boost is nonzero, pipe the denominator lattices through
  # lattice-boost-ali (boosted MMI).
  [ $boost != "0.0" -a $boost != "0" ] && latspart[$n]="${latspart[$n]} lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/$n.ali.gz|' ark:- |"
done
# Initialize the fMPE object. Note: we call it .fmpe because
# that's what it was called in the original paper, but since
# we're using the MMI objective function, it's really fMMI.
fmpe-init $dubmdir/final.dubm $dir/0.fmpe || exit 1;

rm $dir/.error 2>/dev/null

if [ $stage -le -1 ]; then
  # Get the gselect (Gaussian selection) info for fMPE.
  # Note: fMPE object starts with GMM object, so can be read
  # as one.
  for n in `get_splits.pl $nj`; do
    $cmd $dir/log/gselect.$n.log \
      gmm-gselect --n=$ngselect $dir/0.fmpe "${featspart[$n]}" \
      "ark:|gzip -c >$dir/gselect.$n.gz" || touch $dir/.error &
  done
  wait
  [ -f $dir/.error ] && echo "Error in Gaussian selection phase" && exit 1;
fi

# State carried through the training loop: current model, current fMPE
# object, and the iteration counter.
cur_mdl=$srcdir/final.mdl
cur_fmpe=$dir/0.fmpe
x=0
# Main training loop: even iterations update the features (fMMI),
# odd iterations update the model (MMI / boosted MMI).
while [ $x -lt $niters ]; do
  if [ $[$x%2] == 0 ]; then
    echo "Iteration $x: doing fMMI"
    if [ $stage -le $x ]; then
      for n in `get_splits.pl $nj`; do
        numpost="ark,s,cs:gunzip -c $alidir/$n.ali.gz| ali-to-post ark:- ark:-|"
        # Note: the command gmm-fmpe-acc-stats below requires the "base" features
        # (without fMPE), not the fMPE features.
        # Pipeline: rescore lattice with current model -> posteriors ->
        # (den - num) posteriors via sum-post --scale1=-1 -> fMPE stats.
        $cmd $dir/log/acc_fmmi.$x.$n.log \
          gmm-rescore-lattice $cur_mdl "${latspart[$n]}" "${featspart[$n]}" ark:- \| \
          lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
          sum-post --scale1=-1 ark:- "$numpost" ark:- \| \
          gmm-fmpe-acc-stats $cur_mdl $cur_fmpe "${basefeatspart[$n]}" \
            "ark,s,cs:gunzip -c $dir/gselect.$n.gz|" ark,s,cs:- \
            $dir/$x.$n.fmpe_acc || touch $dir/.error &
      done
      wait
      [ -f $dir/.error ] && echo Error doing fMPE accumulation && exit 1;
      # Sum the per-job stats and estimate the next fMPE object.
      ( sum-matrices $dir/$x.fmpe_acc $dir/$x.*.fmpe_acc && \
        rm $dir/$x.*.fmpe_acc && \
        fmpe-est --learning-rate=$lrate $cur_fmpe $dir/$x.fmpe_acc $dir/$[$x+1].fmpe ) \
        2>$dir/log/est_fmpe.$x.log || exit 1;
      # Remove any stale model left over from a previous (partial) run.
      rm $dir/$[$x+1].mdl 2>/dev/null
    fi
    # We need to set the features to use the correct fMPE object.
    for n in `get_splits.pl $nj`; do
      featspart[$n]="${basefeatspart[$n]} fmpe-apply-transform $dir/$[$x+1].fmpe ark:- 'ark,s,cs:gunzip -c $dir/gselect.$n.gz|' ark:- |"
    done
    cur_fmpe=$dir/$[$x+1].fmpe
    # Now, diagnostics.
    objf=`grep Overall $dir/log/acc_fmmi.$x.*.log | grep gmm-fmpe-acc-stats | awk '{ p+=$10*$12; nf+=$12; } END{print p/nf;}'`
    nf=`grep Overall $dir/log/acc_fmmi.$x.*.log | grep gmm-fmpe-acc-stats | awk '{ nf+=$12; } END{print nf;}'`
    impr=`grep Objf $dir/log/est_fmpe.$x.log | awk '{print $NF}'`
    impr=`perl -e "print ($impr/$nf);"` # normalize by #frames.
    echo On iter $x, objf was $objf, auxf improvement from fMMI was $impr | tee $dir/objf.$x.log
  else
    echo "Iteration $x: doing MMI (getting stats)..."
    # Get denominator stats... For simplicity we rescore the lattice
    # on all iterations, even though it shouldn't be necessary on the zeroth
    # (but we want this script to work even if $srcdir doesn't contain the
    # model used to generate the lattice).
    if [ $stage -le $x ]; then
      for n in `get_splits.pl $nj`; do
        # gmm-acc-stats2 takes the signed (den - num) posteriors and writes
        # separate numerator and denominator accumulators.
        $cmd $dir/log/acc.$x.$n.log \
          gmm-rescore-lattice $cur_mdl "${latspart[$n]}" "${featspart[$n]}" ark:- \| \
          lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
          sum-post --merge=$merge --scale1=-1 \
            ark:- "ark,s,cs:gunzip -c $alidir/$n.ali.gz | ali-to-post ark:- ark:- |" ark:- \| \
          gmm-acc-stats2 $cur_mdl "${featspart[$n]}" ark,s,cs:- \
            $dir/num_acc.$x.$n.acc $dir/den_acc.$x.$n.acc || touch $dir/.error &
      done
      wait
      [ -f $dir/.error ] && echo Error accumulating stats on iter $x && exit 1;
      $cmd $dir/log/den_acc_sum.$x.log \
        gmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1;
      rm $dir/den_acc.$x.*.acc
      $cmd $dir/log/num_acc_sum.$x.log \
        gmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1;
      rm $dir/num_acc.$x.*.acc
      # note: this tau value is for smoothing to model parameters;
      # you need to use gmm-ismooth-stats to smooth to the ML stats,
      # but anyway this script does canceling of num and den stats on
      # each frame (as suggested in the Boosted MMI paper) which would
      # make smoothing to ML impossible without accumulating extra stats.
      $cmd $dir/log/update.$x.log \
        gmm-est-gaussians-ebw --tau=$tau $cur_mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc - \| \
        gmm-est-weights-ebw - $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1;
    else
      echo "not doing this iteration because --stage=$stage"
    fi
    # Some diagnostics.. note, this objf is somewhat comparable to the
    # MMI objective function divided by the acoustic weight, and differences in it
    # are comparable to the auxf improvement printed by the update program.
    objf=`grep Overall $dir/log/acc.$x.*.log | grep gmm-acc-stats2 | awk '{ p+=$10*$12; nf+=$12; } END{print p/nf;}'`
    nf=`grep Overall $dir/log/acc.$x.*.log | grep gmm-acc-stats2 | awk '{ nf+=$12; } END{print nf;}'`
    impr=`grep Overall $dir/log/update.$x.log | head -1 | awk '{print $10*$12;}'`
    impr=`perl -e "print ($impr/$nf);"` # renormalize by "real" #frames, to correct
    # for the canceling of stats.
    echo On iter $x, objf was $objf, auxf improvement was $impr | tee $dir/objf.$x.log
    cur_mdl=$dir/$[$x+1].mdl
  fi
  x=$[$x+1]
done

echo "Succeeded with $niters iterations of MMI+fMMI training (boosting factor = $boost)"

# Link the final model and fMPE object under fixed names.
( cd $dir; rm final.mdl 2>/dev/null; ln -s `basename $cur_mdl` final.mdl;
  rm final.fmpe 2>/dev/null; ln -s `basename $cur_fmpe` final.fmpe )

# Now do some cleanup.
rm $dir/gselect.*.gz $dir/*.acc $dir/*.fmpe_acc

Просмотреть файл

@ -22,6 +22,7 @@
nj=4
cmd=scripts/run.pl
silweight=
for x in 1 2; do
if [ $1 == "--num-jobs" ]; then
shift
@ -33,6 +34,11 @@ for x in 1 2; do
cmd=$1
shift
fi
if [ $1 == "--silence-weight" ]; then
shift
silweight=$1 # e.g. to weight down silence in training.
shift
fi
done
if [ $# != 5 ]; then
@ -48,6 +54,7 @@ data=$2
lang=$3
alidir=$4
dir=$5
silphonelist=`cat $lang/silphones.csl`
mkdir -p $dir/log
@ -63,6 +70,9 @@ for n in `get_splits.pl $nj`; do
if [ -f $alidir/$n1.trans ]; then
featspart[$n]="${featspart[$n]} transform-feats --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$alidir/$n.trans ark:- ark:- |"
fi
if [ ! -z "$silweight" ]; then
weightspart[$n]="--weights='gunzip -c $alidir/$n.ali.gz | ali-to-post ark:- ark:- | weight-silence-post $silweight $silphonelist $alidir/final.mdl ark:- ark:- | post-to-weights ark:- ark:- |'"
fi
done
ngselect1=50
@ -98,7 +108,7 @@ for x in 0 1 2 3; do
$cmd $dir/log/acc.$x.$n.log \
gmm-gselect --n=$ngselect2 "--gselect=ark,s,cs:gunzip -c $dir/gselect_diag.$n.gz|" \
"fgmm-global-to-gmm $dir/$x.ubm - |" "${featspart[$n]}" ark:- \| \
fgmm-global-acc-stats --gselect=ark,s,cs:- $dir/$x.ubm "${featspart[$n]}" \
fgmm-global-acc-stats ${weightspart[$n]} --gselect=ark,s,cs:- $dir/$x.ubm "${featspart[$n]}" \
$dir/$x.$n.acc || touch $dir/.error &
done
wait

Просмотреть файл

@ -63,7 +63,7 @@ int main(int argc, char *argv[]) {
"e.g.: \n"
" build-tree-two-level treeacc roots.txt 1.qst topo tree tree.map\n";
bool binary = false;
bool binary = true;
int32 P = 1, N = 3;
bool cluster_leaves = true;

Просмотреть файл

@ -39,6 +39,7 @@ void ScalePosteriors(BaseFloat scale, Posterior *post) {
// note: Posterior is vector<vector<pair<int,BaseFloat> > >
void MergePosteriors(const Posterior &post1,
const Posterior &post2,
bool merge,
Posterior *post) {
KALDI_ASSERT(post1.size() == post2.size()); // precondition.
post->resize(post1.size());
@ -49,10 +50,14 @@ void MergePosteriors(const Posterior &post1,
post1[i].begin(), post1[i].end());
(*post)[i].insert((*post)[i].end(),
post2[i].begin(), post2[i].end());
MergePairVectorSumming(&((*post)[i])); // This sorts on
// the transition-id merges the entries with the same
// key (i.e. same .first element; same transition-id), and
// gets rid of entries with zero .second element.
if (merge) { // combine and sum up entries with same transition-id.
MergePairVectorSumming(&((*post)[i])); // This sorts on
// the transition-id merges the entries with the same
// key (i.e. same .first element; same transition-id), and
// gets rid of entries with zero .second element.
} else { // just to keep them pretty, merge them.
std::sort( (*post)[i].begin(), (*post)[i].end() );
}
}
}
@ -70,10 +75,12 @@ int main(int argc, char *argv[]) {
"Usage: sum-post post-rspecifier1 post-rspecifier2 post-wspecifier\n";
BaseFloat scale1 = 1.0, scale2 = 1.0;
bool merge = true;
ParseOptions po(usage);
po.Register("scale1", &scale1, "Scale for first set of posteriors");
po.Register("scale2", &scale2, "Scale for second set of posteriors");
po.Register("merge", &merge, "If true, merge posterior entries for "
"same transition-id (canceling positive and negative parts)");
po.Read(argc, argv);
if (po.NumArgs() != 3) {
@ -111,7 +118,7 @@ int main(int argc, char *argv[]) {
ScalePosteriors(scale1, &posterior1);
ScalePosteriors(scale2, &posterior2);
kaldi::Posterior posterior_out;
MergePosteriors(posterior1, posterior2, &posterior_out);
MergePosteriors(posterior1, posterior2, merge, &posterior_out);
posterior_writer.Write(key, posterior_out);
num_done++;
}

Просмотреть файл

@ -3,10 +3,11 @@ all:
EXTRA_CXXFLAGS = -Wno-sign-compare
include ../kaldi.mk
BINFILES = compute-mfcc-feats compute-plp-feats compute-fbank-feats compute-cmvn-stats \
add-deltas remove-mean apply-cmvn transform-feats copy-feats compose-transforms \
splice-feats extract-segments subset-feats feat-to-len feat-to-dim \
fmpe-apply-transform fmpe-acc-stats fmpe-init fmpe-update
BINFILES = compute-mfcc-feats compute-plp-feats compute-fbank-feats \
compute-cmvn-stats add-deltas remove-mean apply-cmvn transform-feats \
copy-feats compose-transforms splice-feats extract-segments subset-feats \
feat-to-len feat-to-dim fmpe-apply-transform fmpe-acc-stats fmpe-init \
fmpe-est fmpe-copy
OBJFILES =
@ -17,8 +18,8 @@ all: $(BINFILES)
TESTFILES =
$(BINFILES): ../feat/kaldi-feature.a ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
../tree/kaldi-tree.a ../matrix/kaldi-matrix.a ../util/kaldi-util.a \
../base/kaldi-base.a
../tree/kaldi-tree.a ../matrix/kaldi-matrix.a ../util/kaldi-util.a \
../base/kaldi-base.a
# Rule below would expand to, e.g.:
# ../base/kaldi-base.a:

Просмотреть файл

@ -21,6 +21,7 @@
int main(int argc, char *argv[]) {
using namespace kaldi;
using kaldi::int32;
try {
const char *usage =
"Apply fMPE transform to features\n"
@ -55,13 +56,13 @@ int main(int argc, char *argv[]) {
RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
// fmpe stats...
Matrix<BaseFloat> stats(fmpe.ProjectionNumRows() * 2,
fmpe.ProjectionNumCols());
SubMatrix<BaseFloat> stats_plus(stats, 0, fmpe.ProjectionNumRows(),
0, fmpe.ProjectionNumCols());
SubMatrix<BaseFloat> stats_minus(stats, fmpe.ProjectionNumRows(),
fmpe.ProjectionNumRows(),
0, fmpe.ProjectionNumCols());
Matrix<BaseFloat> stats(fmpe.ProjectionTNumRows() * 2,
fmpe.ProjectionTNumCols());
SubMatrix<BaseFloat> stats_plus(stats, 0, fmpe.ProjectionTNumRows(),
0, fmpe.ProjectionTNumCols());
SubMatrix<BaseFloat> stats_minus(stats, fmpe.ProjectionTNumRows(),
fmpe.ProjectionTNumRows(),
0, fmpe.ProjectionTNumCols());
int32 num_done = 0, num_err = 0;

Просмотреть файл

@ -1,6 +1,6 @@
// featbin/fmpe-apply-transform.cc
// Copyright 2012 Daniel Povey
// Copyright 2012 Daniel Povey Yanmin Qian
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -21,6 +21,7 @@
int main(int argc, char *argv[]) {
using namespace kaldi;
using kaldi::int32;
try {
const char *usage =
"Apply fMPE transform to features\n"
@ -34,7 +35,7 @@ int main(int argc, char *argv[]) {
// no non-default options.
po.Read(argc, argv);
if (po.NumArgs() != 3) {
if (po.NumArgs() != 4) {
po.PrintUsage();
exit(1);
}

62
src/featbin/fmpe-copy.cc Normal file
Просмотреть файл

@ -0,0 +1,62 @@
// featbin/fmpe-copy.cc

// Copyright 2012  Daniel Povey  Yanmin Qian

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "transform/fmpe.h"

// Command-line tool that copies an fMPE transform object; mainly useful for
// converting between binary and text formats (via --binary).
int main(int argc, char *argv[]) {
  using namespace kaldi;
  try {
    const char *usage =
        "Copy fMPE transform\n"
        // Fix: usage previously named the wrong program ("fmpe-init").
        "Usage: fmpe-copy [options...] <fmpe-in> <fmpe-out>\n"
        "E.g. fmpe-copy --binary=false 1.fmpe text.fmpe\n";

    ParseOptions po(usage);
    FmpeOptions opts;  // Options of the Fmpe object itself; registered so the
                       // tool accepts them, though a plain copy doesn't use them.
    bool binary = true;
    po.Register("binary", &binary, "If true, output fMPE object in binary mode.");
    opts.Register(&po);
    po.Read(argc, argv);

    if (po.NumArgs() != 2) {
      po.PrintUsage();
      exit(1);
    }

    std::string fmpe_rxfilename = po.GetArg(1),
        fmpe_wxfilename = po.GetArg(2);

    // Read the fMPE object (auto-detecting binary vs. text input).
    Fmpe fmpe;
    {
      bool binary_in;
      Input ki(fmpe_rxfilename, &binary_in);
      fmpe.Read(ki.Stream(), binary_in);
    }
    // Write it back out in the requested mode.
    Output ko(fmpe_wxfilename, binary);
    fmpe.Write(ko.Stream(), binary);
    KALDI_LOG << "Copied fMPE object to " << fmpe_wxfilename;  // fix typo "Copyied"
    return 0;
  } catch(const std::exception& e) {
    std::cerr << e.what();
    return -1;
  }
}

Просмотреть файл

@ -1,6 +1,6 @@
// featbin/fmpe-update.cc
// featbin/fmpe-est.cc
// Copyright 2012 Daniel Povey
// Copyright 2012 Daniel Povey Yanmin Qian
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -24,8 +24,8 @@ int main(int argc, char *argv[]) {
try {
const char *usage =
"Initialize fMPE transform (to zeo)\n"
"Usage: fmpe-update [options...] <fmpe-in> <stats-in> <fmpe-out>\n"
"E.g. fmpe-update 1.fmpe 1.accs 2.fmpe\n";
"Usage: fmpe-est [options...] <fmpe-in> <stats-in> <fmpe-out>\n"
"E.g. fmpe-est 1.fmpe 1.accs 2.fmpe\n";
ParseOptions po(usage);
FmpeUpdateOptions opts;
@ -58,18 +58,18 @@ int main(int argc, char *argv[]) {
}
// the matrix is in two parts, for the "plus" and "minus"
// parts of the gradient that we stored separately.
SubMatrix<BaseFloat> stats_plus(stats, 0, fmpe.ProjectionNumRows(),
0, fmpe.ProjectionNumCols());
SubMatrix<BaseFloat> stats_minus(stats, fmpe.ProjectionNumRows(),
fmpe.ProjectionNumRows(),
0, fmpe.ProjectionNumCols());
SubMatrix<BaseFloat> stats_plus(stats, 0, fmpe.ProjectionTNumRows(),
0, fmpe.ProjectionTNumCols());
SubMatrix<BaseFloat> stats_minus(stats, fmpe.ProjectionTNumRows(),
fmpe.ProjectionTNumRows(),
0, fmpe.ProjectionTNumCols());
fmpe.Update(opts, stats_plus, stats_minus);
Output ko(fmpe_wxfilename, binary);
fmpe.Write(ko.Stream(), binary);
KALDI_LOG << "Initialized fMPE object and wrote to"
KALDI_LOG << "Updated fMPE object and wrote to "
<< fmpe_wxfilename;
return 0;
} catch(const std::exception& e) {

Просмотреть файл

@ -1,6 +1,6 @@
// featbin/fmpe-init.cc
// Copyright 2012 Daniel Povey
// Copyright 2012 Daniel Povey Yanmin Qian
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -23,7 +23,7 @@ int main(int argc, char *argv[]) {
using namespace kaldi;
try {
const char *usage =
"Initialize fMPE transform (to zeo)\n"
"Initialize fMPE transform (to zero)\n"
"Usage: fmpe-init [options...] <diag-gmm-in> <fmpe-out>\n"
"E.g. fmpe-init 1.ubm 1.fmpe\n";
@ -55,7 +55,7 @@ int main(int argc, char *argv[]) {
Output ko(fmpe_wxfilename, binary);
fmpe.Write(ko.Stream(), binary);
KALDI_LOG << "Initialized fMPE object and wrote to"
KALDI_LOG << "Initialized fMPE object and wrote to "
<< fmpe_wxfilename;
return 0;
} catch(const std::exception& e) {

Просмотреть файл

@ -8,7 +8,7 @@ TESTFILES = diag-gmm-test mle-diag-gmm-test full-gmm-test mle-full-gmm-test \
am-diag-gmm-test ebw-diag-gmm-test
OBJFILES = diag-gmm.o diag-gmm-normal.o mle-diag-gmm.o am-diag-gmm.o mle-am-diag-gmm.o \
full-gmm.o full-gmm-normal.o mle-full-gmm.o fmpe-am-diag-gmm.o model-common.o \
full-gmm.o full-gmm-normal.o mle-full-gmm.o model-common.o \
model-test-common.o ebw-diag-gmm.o
LIBFILE = kaldi-gmm.a

Просмотреть файл

@ -148,11 +148,10 @@ void UpdateEbwDiagGmm(const AccumDiagGmm &num_stats, // with I-smoothing, if use
if (den_has_stats)
var_stats.AddVec(-1.0, den_stats.variance_accumulator().Row(g));
}
double D = opts.E * den_count / 2; // E*gamma_den/2 where E = 2;
// We initialize to half the value of D that would be dictated by
// E; this is part of the strategy used to ensure that the value of
// D we use is at least twice the value that would ensure positive
// variances.
double D = (opts.tau + opts.E * den_count) / 2;
// We initialize to half the value of D that would be dictated by E (and
// tau); this is part of the strategy used to ensure that the value of D we
// use is at least twice the value that would ensure positive variances.
int32 iter, max_iter = 100;
for (iter = 0; iter < max_iter; iter++) { // will normally break from the loop
@ -184,7 +183,7 @@ void UpdateEbwDiagGmm(const AccumDiagGmm &num_stats, // with I-smoothing, if use
D *= 1.1;
}
}
if (iter > 0 && num_floored_out != NULL) *num_floored_out++;
if (iter > 0 && num_floored_out != NULL) (*num_floored_out)++;
if (iter == max_iter) KALDI_WARN << "Dropped off end of loop, recomputing D. (unexpected.)";
}
// copy to natural representation according to flags.

Просмотреть файл

@ -31,10 +31,14 @@ namespace kaldi {
// Options for Extended Baum-Welch Gaussian update.
struct EbwOptions {
BaseFloat E;
EbwOptions(): E(2.0) { }
BaseFloat tau; // This is only useful for smoothing "to the model":
// if you want to smooth to ML stats, you need to use gmm-ismooth-stats
EbwOptions(): E(2.0), tau(0.0) { }
void Register(ParseOptions *po) {
std::string module = "EbwOptions: ";
po->Register("E", &E, module+"Constant E for Extended Baum-Welch (EBW) update");
po->Register("tau", &tau, module+"Tau value for smoothing to the model "
"parameters only (for smoothing to ML stats, use gmm-ismooth-stats");
}
};

Просмотреть файл

@ -1,892 +0,0 @@
// gmm/fmpe-am-diag-gmm.cc
// Copyright 2009-2011 Yanmin Qian
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include <set>
#include <algorithm>
#include "gmm/diag-gmm.h"
#include "gmm/fmpe-am-diag-gmm.h"
#include "util/stl-utils.h"
#include "tree/clusterable-classes.h"
#include "tree/cluster-utils.h"
namespace kaldi {
// Reads a FmpeAccumModelDiff object from a stream, in the format produced by
// Write() below.  The header gives the dimension and #components (used to
// Resize()); the three accumulator fields may then appear in any order, each
// introduced by its own token, until the closing tag.
void FmpeAccumModelDiff::Read(std::istream &in_stream, bool binary) {
  int32 dimension, num_components;
  std::string token;

  ExpectToken(in_stream, binary, "<FMPEMODELDIFFS>");
  ExpectToken(in_stream, binary, "<VECSIZE>");
  ReadBasicType(in_stream, binary, &dimension);
  ExpectToken(in_stream, binary, "<NUMCOMPONENTS>");
  ReadBasicType(in_stream, binary, &num_components);
  Resize(num_components, dimension);  // allocate accumulators to match header.

  ReadToken(in_stream, binary, &token);
  while (token != "</FMPEMODELDIFFS>") {
    if (token == "<MLE_OCCUPANCY>") {
      mle_occupancy_.Read(in_stream, binary);
    } else if (token == "<MEANDIFFS>") {
      mean_diff_accumulator_.Read(in_stream, binary);
    } else if (token == "<DIAGVARDIFFS>") {
      variance_diff_accumulator_.Read(in_stream, binary);
    } else {
      KALDI_ERR << "Unexpected token '" << token << "' in model file ";
    }
    ReadToken(in_stream, binary, &token);
  }
}
// Writes this object's accumulators in the token-delimited format that
// Read() above expects.  The accumulators are copied into BaseFloat
// containers before writing.
void FmpeAccumModelDiff::Write(std::ostream &out_stream, bool binary) const {
  WriteToken(out_stream, binary, "<FMPEMODELDIFFS>");
  WriteToken(out_stream, binary, "<VECSIZE>");
  WriteBasicType(out_stream, binary, dim_);
  WriteToken(out_stream, binary, "<NUMCOMPONENTS>");
  WriteBasicType(out_stream, binary, num_comp_);

  // convert into BaseFloat before writing things
  Vector<BaseFloat> occupancy_bf(mle_occupancy_.Dim());
  Matrix<BaseFloat> mean_diff_accumulator_bf(mean_diff_accumulator_.NumRows(),
                                             mean_diff_accumulator_.NumCols());
  Matrix<BaseFloat> variance_diff_accumulator_bf(variance_diff_accumulator_.NumRows(),
                                                 variance_diff_accumulator_.NumCols());
  occupancy_bf.CopyFromVec(mle_occupancy_);
  mean_diff_accumulator_bf.CopyFromMat(mean_diff_accumulator_);
  variance_diff_accumulator_bf.CopyFromMat(variance_diff_accumulator_);

  WriteToken(out_stream, binary, "<MLE_OCCUPANCY>");
  occupancy_bf.Write(out_stream, binary);
  WriteToken(out_stream, binary, "<MEANDIFFS>");
  mean_diff_accumulator_bf.Write(out_stream, binary);
  WriteToken(out_stream, binary, "<DIAGVARDIFFS>");
  variance_diff_accumulator_bf.Write(out_stream, binary);
  WriteToken(out_stream, binary, "</FMPEMODELDIFFS>");
}
// Allocates the accumulators for num_comp Gaussians of dimension dim, and
// records those sizes in num_comp_ / dim_.  Both arguments must be positive.
void FmpeAccumModelDiff::Resize(int32 num_comp, int32 dim) {
  KALDI_ASSERT(num_comp > 0 && dim > 0);
  num_comp_ = num_comp;
  dim_ = dim;
  mle_occupancy_.Resize(num_comp);
  mean_diff_accumulator_.Resize(num_comp, dim);
  variance_diff_accumulator_.Resize(num_comp, dim);
}
// Zeroes all accumulated statistics (occupancies, mean and variance
// differentials) without changing the allocated sizes.
void FmpeAccumModelDiff::SetZero() {
  mle_occupancy_.SetZero();
  mean_diff_accumulator_.SetZero();
  variance_diff_accumulator_.SetZero();
}
// Computes, from the numerator, denominator and ML accumulators of one GMM,
// the differentials of the discriminative objective w.r.t. the Gaussian
// means and variances, and stores them (plus the ML occupancies) in this
// object.  Arithmetic is done in double and copied into the accumulators at
// the end.
void FmpeAccumModelDiff::ComputeModelParaDiff(const DiagGmm& diag_gmm,
                                              const AccumDiagGmm& num_acc,
                                              const AccumDiagGmm& den_acc,
                                              const AccumDiagGmm& mle_acc) {
  KALDI_ASSERT(num_acc.NumGauss() == num_comp_ && num_acc.Dim() == dim_);
  KALDI_ASSERT(den_acc.NumGauss() == num_comp_); // den_acc.Dim() may not be defined,
  // if we used the "compressed form" of accs where den only has counts.
  KALDI_ASSERT(mle_acc.NumGauss() == num_comp_ && mle_acc.Dim() == dim_);

  Matrix<double> mean_diff_tmp(num_comp_, dim_);
  Matrix<double> var_diff_tmp(num_comp_, dim_);
  Matrix<double> mat_tmp(num_comp_, dim_);
  Vector<double> occ_diff(num_comp_);
  Matrix<double> means_invvars(num_comp_, dim_);
  Matrix<double> inv_vars(num_comp_, dim_);

  // occ_diff = numerator occupancy - denominator occupancy.
  occ_diff.CopyFromVec(num_acc.occupancy());
  occ_diff.AddVec(-1.0, den_acc.occupancy());

  means_invvars.CopyFromMat(diag_gmm.means_invvars(), kNoTrans);
  inv_vars.CopyFromMat(diag_gmm.inv_vars(), kNoTrans);

  /// compute the means differentials first
  mean_diff_tmp.CopyFromMat(num_acc.mean_accumulator(), kNoTrans);
  if (den_acc.Flags() & kGmmMeans) // probably will be false.
    mean_diff_tmp.AddMat(-1.0, den_acc.mean_accumulator(), kNoTrans);
  mean_diff_tmp.MulElements(inv_vars);

  mat_tmp.CopyFromMat(means_invvars, kNoTrans);
  mat_tmp.MulRowsVec(occ_diff);

  mean_diff_tmp.AddMat(-1.0, mat_tmp, kNoTrans);

  /// store the means differentials
  mean_diff_accumulator_.CopyFromMat(mean_diff_tmp, kNoTrans);

  /// compute the vars differentials second
  var_diff_tmp.CopyFromMat(num_acc.variance_accumulator(), kNoTrans);
  if (den_acc.Flags() & kGmmVariances) // probably will be false.
    var_diff_tmp.AddMat(-1.0, den_acc.variance_accumulator(), kNoTrans);
  var_diff_tmp.MulElements(inv_vars);
  var_diff_tmp.MulElements(inv_vars);

  mat_tmp.CopyFromMat(num_acc.mean_accumulator(), kNoTrans);
  if (den_acc.Flags() & kGmmMeans) // probably will be false.
    mat_tmp.AddMat(-1.0, den_acc.mean_accumulator(), kNoTrans);
  mat_tmp.MulElements(inv_vars);
  mat_tmp.MulElements(means_invvars);

  var_diff_tmp.AddMat(-2.0, mat_tmp, kNoTrans);

  mat_tmp.CopyFromMat(means_invvars, kNoTrans);
  mat_tmp.MulElements(means_invvars);
  mat_tmp.AddMat(-1.0, inv_vars, kNoTrans);
  mat_tmp.MulRowsVec(occ_diff);

  var_diff_tmp.AddMat(1.0, mat_tmp, kNoTrans);
  var_diff_tmp.Scale(0.5);

  /// store the vars differentials
  variance_diff_accumulator_.CopyFromMat(var_diff_tmp, kNoTrans);

  /// copy to obtain the mle occupation probability
  mle_occupancy_.CopyFromVec(mle_acc.occupancy());
}
// Writes the fMPE accumulators: a header with the #Gaussians, the number of
// context windows and the feature dimension, then the per-Gaussian,
// per-context "p" and "n" matrices (each of size dim x (dim+1)), and finally
// the total / direct / indirect differential vectors.
void FmpeAccs::Write(std::ostream &out_stream, bool binary) const {
  uint32 tmp_uint32;

  WriteToken(out_stream, binary, "<FMPEACCS>");

  WriteToken(out_stream, binary, "<NumGaussians>");
  tmp_uint32 = static_cast<uint32>(config_.gmm_num_comps);
  WriteBasicType(out_stream, binary, tmp_uint32);
  WriteToken(out_stream, binary, "<LengthContextExp>");
  tmp_uint32 = static_cast<uint32>(config_.context_windows.NumRows());
  WriteBasicType(out_stream, binary, tmp_uint32);

  WriteToken(out_stream, binary, "<DIMENSION>");
  WriteBasicType(out_stream, binary, dim_);
  if (!binary) out_stream << "\n";

  // convert into BaseFloat before writing things
  Matrix<BaseFloat> mat_bf(dim_, dim_ + 1);

  if (p_.size() != 0) {  // <P> section only written if p_ was allocated.
    WriteToken(out_stream, binary, "<P>");
    for (int32 i = 0; i < config_.gmm_num_comps; ++i) {
      for (int32 j = 0; j < config_.context_windows.NumRows(); ++j) {
        mat_bf.CopyFromMat(p_[i][j], kNoTrans);
        mat_bf.Write(out_stream, binary);
      }
    }
  }

  if (n_.size() != 0) {  // likewise for the <N> section.
    WriteToken(out_stream, binary, "<N>");
    for (int32 i = 0; i < config_.gmm_num_comps; ++i) {
      for (int32 j = 0; j < config_.context_windows.NumRows(); ++j) {
        mat_bf.CopyFromMat(n_[i][j], kNoTrans);
        mat_bf.Write(out_stream, binary);
      }
    }
  }

  // convert into BaseFloat before writing things
  Vector<BaseFloat> diff_bf(diff_.Dim());
  Vector<BaseFloat> direct_diff_bf(direct_diff_.Dim());
  Vector<BaseFloat> indirect_diff_bf(indirect_diff_.Dim());
  diff_bf.CopyFromVec(diff_);
  direct_diff_bf.CopyFromVec(direct_diff_);
  indirect_diff_bf.CopyFromVec(indirect_diff_);

  WriteToken(out_stream, binary, "<DIFFERENTIAL>");
  diff_bf.Write(out_stream, binary);
  WriteToken(out_stream, binary, "<DIRECTDIFFERENTIAL>");
  direct_diff_bf.Write(out_stream, binary);
  WriteToken(out_stream, binary, "<INDIRECTDIFFERENTIAL>");
  indirect_diff_bf.Write(out_stream, binary);
  WriteToken(out_stream, binary, "</FMPEACCS>");
}
// Reads fMPE accumulators in the format produced by FmpeAccs::Write().
// If add == true, the stats read are added to any existing contents of the
// member matrices/vectors (via their Read(..., add) overloads).
//
// Fix: the tokens expected here now match the ones Write() actually emits.
// Previously this function expected "<FMPACCS>", "<LengthContExp>" and the
// plural "<...DIFFERENTIALS>" tokens, none of which Write() produces
// ("<FMPEACCS>", "<LengthContextExp>", "<DIFFERENTIAL>", etc.), so reading
// back written accumulators always failed in ExpectToken/KALDI_ERR.
void FmpeAccs::Read(std::istream &in_stream, bool binary,
                    bool add) {
  uint32 tmp_uint32;
  std::string token;

  ExpectToken(in_stream, binary, "<FMPEACCS>");

  ExpectToken(in_stream, binary, "<NumGaussians>");
  ReadBasicType(in_stream, binary, &tmp_uint32);
  int32 num_gaussians = static_cast<int32>(tmp_uint32);
  ExpectToken(in_stream, binary, "<LengthContextExp>");
  ReadBasicType(in_stream, binary, &tmp_uint32);
  int32 length_cont_exp = static_cast<int32>(tmp_uint32);

  ExpectToken(in_stream, binary, "<DIMENSION>");
  ReadBasicType(in_stream, binary, &dim_);

  ReadToken(in_stream, binary, &token);

  while (token != "</FMPEACCS>") {
    if (token == "<P>") {
      p_.resize(num_gaussians);
      for (size_t i = 0; i < p_.size(); ++i) {
        p_[i].resize(length_cont_exp);
        for (size_t j = 0; j < p_[i].size(); ++j) {
          p_[i][j].Read(in_stream, binary, add);
        }
      }
    } else if (token == "<N>") {
      n_.resize(num_gaussians);
      for (size_t i = 0; i < n_.size(); ++i) {
        n_[i].resize(length_cont_exp);
        for (size_t j = 0; j < n_[i].size(); ++j) {
          n_[i][j].Read(in_stream, binary, add);
        }
      }
    } else if (token == "<DIFFERENTIAL>") {
      diff_.Read(in_stream, binary, add);
    } else if (token == "<DIRECTDIFFERENTIAL>") {
      direct_diff_.Read(in_stream, binary, add);
    } else if (token == "<INDIRECTDIFFERENTIAL>") {
      indirect_diff_.Read(in_stream, binary, add);
    } else {
      KALDI_ERR << "Unexpected token '" << token << "' in model file ";
    }
    ReadToken(in_stream, binary, &token);
  }
}
// Reads the per-pdf model differentials: a <DIMENSION>/<NUMPDFS> header
// followed by one FmpeAccumModelDiff per pdf.  The accumulators must already
// have been allocated (see InitModelDiff) with a matching number of pdfs,
// otherwise this raises an error.
void FmpeAccs::ReadModelDiffs(std::istream &in_stream, bool binary) {
  int32 num_pdfs;
  int32 dim;
  ExpectToken(in_stream, binary, "<DIMENSION>");
  ReadBasicType(in_stream, binary, &dim);
  ExpectToken(in_stream, binary, "<NUMPDFS>");
  ReadBasicType(in_stream, binary, &num_pdfs);
  KALDI_ASSERT((num_pdfs > 0) && (dim > 0));

  if (model_diff_accumulators_.size() != static_cast<size_t> (num_pdfs))
    KALDI_ERR << "Reading model differentials but num-pdfs do not match: "
              << (model_diff_accumulators_.size()) << " vs. "
              << (num_pdfs);

  for (std::vector<FmpeAccumModelDiff*>::iterator it = model_diff_accumulators_.begin(),
           end = model_diff_accumulators_.end(); it != end; ++it) {
    (*it)->Read(in_stream, binary);
  }
}
// Allocates the accumulators: for each of the num_gmm_gauss Gaussians and
// each of the con_exp context windows, a dim x (dim+1) matrix in both the
// "p" and "n" stores; plus the three dim-sized differential vectors.
void FmpeAccs::InitPNandDiff(int32 num_gmm_gauss, int32 con_exp, int32 dim) {
  p_.resize(num_gmm_gauss);
  n_.resize(num_gmm_gauss);
  // One pass sizes both stores identically.
  for (int32 g = 0; g < num_gmm_gauss; ++g) {
    p_[g].resize(con_exp);
    n_[g].resize(con_exp);
    for (int32 c = 0; c < con_exp; ++c) {
      p_[g][c].Resize(dim, dim + 1);
      n_[g][c].Resize(dim, dim + 1);
    }
  }

  diff_.Resize(dim);
  direct_diff_.Resize(dim);
  indirect_diff_.Resize(dim);
}
void FmpeAccs::InitModelDiff(const AmDiagGmm &model) {
DeletePointers(&model_diff_accumulators_); // in case was non-empty when called.
model_diff_accumulators_.resize(model.NumPdfs(), NULL);
for (int32 i = 0; i < model.NumPdfs(); i++) {
model_diff_accumulators_[i] = new FmpeAccumModelDiff();
model_diff_accumulators_[i]->Resize(model.GetPdf(i));
}
}
/// Sets up the P/N statistics and differential accumulators; additionally
/// allocates the per-pdf model-differential accumulators when update == true
/// (i.e. when accumulating stats, not when summing existing accumulators).
void FmpeAccs::Init(const AmDiagGmm &am_model, bool update) {
  dim_ = am_model.Dim();
  InitPNandDiff(config_.gmm_num_comps, config_.context_windows.NumRows(), dim_);
  if (update)
    InitModelDiff(am_model);
}
// Stores the GMMs used to compute the high-dimensional offset features:
// `gmm` is the full background GMM, `gmm_cluster_centers` the clustered
// version used for fast posterior pruning, and the map gives each full-GMM
// Gaussian's cluster-center index.
void FmpeAccs::InitializeGMMs(const DiagGmm &gmm, const DiagGmm &gmm_cluster_centers,
                              std::vector<int32> &gaussian_cluster_center_map) {
  gmm_.CopyFromDiagGmm(gmm);
  gmm_cluster_centers_.CopyFromDiagGmm(gmm_cluster_centers);
  // Vector assignment sizes the destination itself; the explicit resize()
  // that used to precede it was redundant.
  gaussian_cluster_center_map_ = gaussian_cluster_center_map;
}
void FmpeAccs::ComputeOneFrameOffsetFeature(const VectorBase<BaseFloat>& data,
std::vector<std::pair<int32, Vector<double> > > *offset) const {
KALDI_ASSERT((data.Dim() == gmm_.Dim()) && (data.Dim() == gmm_cluster_centers_.Dim()));
KALDI_ASSERT((gmm_.NumGauss() != 0) && (gmm_cluster_centers_.NumGauss() != 0)
&& (gmm_.NumGauss() > gmm_cluster_centers_.NumGauss())
&& (config_.gmm_cluster_centers_nbest < gmm_cluster_centers_.NumGauss())
&& (config_.gmm_gaussian_nbest < gmm_.NumGauss()))
int32 dim = data.Dim();
int32 num_gauss = gmm_.NumGauss();
int32 num_cluster_centers = gmm_cluster_centers_.NumGauss();
int32 gmm_cluster_centers_nbest = config_.gmm_cluster_centers_nbest;
std::set<int32> pruned_centers;
Vector<BaseFloat> loglikes(num_cluster_centers);
gmm_cluster_centers_.LogLikelihoods(data, &loglikes);
Vector<BaseFloat> loglikes_copy(loglikes);
BaseFloat *ptr = loglikes_copy.Data();
std::nth_element(ptr, ptr+num_cluster_centers-gmm_cluster_centers_nbest, ptr+num_cluster_centers);
BaseFloat thresh = ptr[num_cluster_centers-gmm_cluster_centers_nbest];
for (int32 g = 0; g < num_cluster_centers; g++) {
if (loglikes(g) >= thresh)
pruned_centers.insert(g);
}
std::vector< std::pair<double, int32> > pruned_gauss;
for (int32 gauss_index = 0; gauss_index < num_gauss; gauss_index++) {
int32 current_cluster = gaussian_cluster_center_map_[gauss_index];
if (pruned_centers.end() != pruned_centers.find(current_cluster)) {
double loglike = gmm_.ComponentLogLikelihood(data, gauss_index);
pruned_gauss.push_back(std::make_pair(loglike, gauss_index));
}
}
KALDI_ASSERT(!pruned_gauss.empty());
int32 gmm_gaussian_nbest = config_.gmm_gaussian_nbest;
std::nth_element(pruned_gauss.begin(),
pruned_gauss.end() - gmm_gaussian_nbest,
pruned_gauss.end());
pruned_gauss.erase(pruned_gauss.begin(),
pruned_gauss.end() - gmm_gaussian_nbest);
double weight = 0.0;
for (int32 i = 0; i < pruned_gauss.size(); ++i) {
weight += exp(pruned_gauss[i].first);
}
for (int32 i = 0; i < pruned_gauss.size(); ++i) {
pruned_gauss[i].first = exp(pruned_gauss[i].first) / weight;
}
Vector<BaseFloat> tmp_offset(dim + 1);
SubVector<BaseFloat> sub_tmp_offset(tmp_offset, 1, dim);
Vector<BaseFloat> tmp_mean(dim);
Vector<BaseFloat> tmp_var(dim);
for (int32 i = 0; i < pruned_gauss.size(); ++i) {
tmp_offset(0) = pruned_gauss[i].first * 5.0;
sub_tmp_offset.CopyFromVec(data);
gmm_.GetComponentMean(pruned_gauss[i].second, &tmp_mean);
sub_tmp_offset.AddVec(-1.0, tmp_mean);
gmm_.GetComponentVariance(pruned_gauss[i].second, &tmp_var);
tmp_var.ApplyPow(0.5);
sub_tmp_offset.DivElemByElem(tmp_var);
sub_tmp_offset.Scale(pruned_gauss[i].first);
offset->push_back(std::make_pair(pruned_gauss[i].second, tmp_offset));
}
}
void FmpeAccs::ComputeWholeFileOffsetFeature(const MatrixBase<BaseFloat>& data,
std::vector<std::vector<std::pair<int32, Vector<double> > > > *whole_file_offset) const {
int32 nframe = data.NumRows();
whole_file_offset->reserve(nframe);
for (int32 i = 0; i < nframe; i++) {
std::vector<std::pair<int32, Vector<double> > > offset;
ComputeOneFrameOffsetFeature(data.Row(i), &offset);
whole_file_offset->push_back(offset);
}
}
bool Gauss_index_lower(std::pair<int32, Vector<double> > M,
std::pair<int32, Vector<double> > N) {
return M.first < N.first;
}
// Builds the context-expanded sparse feature ht from the offset features of
// the frames in the context window.  For each context row with at least one
// contributing frame, the offsets are weighted by the context weight,
// sorted by Gaussian index, and entries sharing a Gaussian index are merged
// by addition, so each Gaussian appears at most once per context.
void FmpeAccs::ComputeContExpOffsetFeature(
         const std::vector<std::vector<std::pair<int32, Vector<double> > >* > &offset_win,
         std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > *ht) const {
  KALDI_ASSERT(static_cast<size_t>(config_.context_windows.NumCols())
               == offset_win.size());
  std::vector<std::pair<int32, Vector<double> > > offset_tmp;
  std::vector<std::pair<int32, Vector<double> > > offset_uniq_tmp;
  for (int32 i = 0; i < config_.context_windows.NumRows(); i++) {
    // Gather the weighted offsets of all frames with non-zero weight in
    // this context row.
    for (int32 j = 0; j < config_.context_windows.NumCols(); j++) {
      if (config_.context_windows(i, j) > 0.0 && !offset_win[j]->empty()) {
        for (size_t k = 0; k < offset_win[j]->size(); k++) {
          offset_tmp.push_back((*offset_win[j])[k]);
          offset_tmp.back().second.Scale(config_.context_windows(i, j));
        }
      }
    }
    if (!offset_tmp.empty()) {
      // Sort by Gaussian index, then merge duplicates by adding their
      // offset vectors.
      std::sort(offset_tmp.begin(), offset_tmp.end(), Gauss_index_lower);
      offset_uniq_tmp.push_back(offset_tmp[0]);
      for (size_t g = 1; g < offset_tmp.size(); g++) {
        if (offset_tmp[g].first == offset_tmp[g - 1].first) {
          offset_uniq_tmp.back().second.AddVec(1.0, offset_tmp[g].second);
        } else {
          offset_uniq_tmp.push_back(offset_tmp[g]);
        }
      }
      ht->push_back(std::make_pair(i, offset_uniq_tmp));
      offset_tmp.clear();
      offset_uniq_tmp.clear();
    }
  }
}
// Builds the context-expanded high-dimensional feature for one frame, using
// the precomputed per-frame offset features for the whole file.  Frames of
// the context window that fall outside the file contribute an empty offset
// list (zero padding).
void FmpeAccs::ComputeHighDimemsionFeature(
       const std::vector<std::vector<std::pair<int32, Vector<double> > > > &whole_file_offset_feat,
       int32 frame_index,
       std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > *ht) const {
  KALDI_ASSERT((frame_index >= 0) && (frame_index < whole_file_offset_feat.size()));
  const int32 window_length = config_.context_windows.NumCols();
  const int32 half_window = window_length / 2;
  const int32 num_frames = whole_file_offset_feat.size();
  std::vector<std::vector<std::pair<int32, Vector<double> > >* > offset_win;
  std::vector<std::pair<int32, Vector<double> > > empty_feat;
  // Collect pointers to the offset features of the frames inside the window
  // centred on frame_index.
  for (int32 t = frame_index - half_window;
       t < frame_index - half_window + window_length; ++t) {
    if (t < 0 || t >= num_frames) {
      offset_win.push_back(&empty_feat);
    } else {
      // const_cast is needed only to match the pointer-vector signature of
      // ComputeContExpOffsetFeature; the pointees are not modified.
      offset_win.push_back(
          const_cast<std::vector<std::pair<int32, Vector<double> > >* >(
              &(whole_file_offset_feat[t])));
    }
  }
  ComputeContExpOffsetFeature(offset_win, ht);
}
// Projects the sparse high-dimensional feature ht down to the original
// feature dimension: fea_out = sum over (context, gaussian) entries of
// M[gauss][context] * ht_entry.  Only the non-zero entries of ht are
// visited (sparse multiply with the dense projection matrices).
void FmpeAccs::ProjectHighDimensionFeature(
       const std::vector< std::vector< Matrix<double> > > &M,
       const std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > &ht,
       Vector<double> *fea_out) const {
  KALDI_ASSERT((M.size() == gmm_.NumGauss())
               && (M[0].size() == ht.size())
               && (M[0][0].NumRows() == gmm_.Dim())
               && (M[0][0].NumCols() == gmm_.Dim() + 1));
  Vector<double> accum(gmm_.Dim());  // Resize zero-initializes.
  for (size_t c = 0; c < ht.size(); ++c) {
    const int32 cont_index = ht[c].first;
    const std::vector<std::pair<int32, Vector<double> > > &entries = ht[c].second;
    for (size_t e = 0; e < entries.size(); ++e) {
      const int32 gauss_index = entries[e].first;
      accum.AddMatVec(1.0, M[gauss_index][cont_index], kNoTrans,
                      entries[e].second, 1.0);
    }
  }
  fea_out->CopyFromVec(accum);
}
void FmpeAccs::ObtainNewFmpeFeature(
const VectorBase<BaseFloat> &data,
const std::vector< std::vector< Matrix<double> > > &M,
const std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > &ht,
Vector<double> *fea_new) const {
KALDI_ASSERT((data.Dim() == gmm_.Dim()));
Vector<double> tmp_fea(data.Dim());
ProjectHighDimensionFeature(M, ht, &tmp_fea);
fea_new->CopyFromVec(data);
fea_new->AddVec(1.0, tmp_fea);
}
// Accumulates into *direct_diff the "direct" part of the fMPE differential
// for one frame: for each Gaussian i the contribution is
//   posterior(i) * lat_prob_scale * (mean_i - data) / var_i,
// summed over all components of `gmm`.
void FmpeAccs::AccumulateDirectDiffFromPosteriors(const DiagGmm &gmm,
                                  const VectorBase<BaseFloat> &data,
                                  const VectorBase<BaseFloat> &posteriors,
                                  Vector<double> *direct_diff) {
  KALDI_ASSERT(gmm.Dim() == Dim());
  KALDI_ASSERT(gmm.NumGauss() == posteriors.Dim());
  KALDI_ASSERT(static_cast<int32>(data.Dim()) == Dim());
  KALDI_ASSERT(direct_diff->Dim() == Dim());
  Matrix<double> means_invvars(gmm.NumGauss(), gmm.Dim());
  Matrix<double> inv_vars(gmm.NumGauss(), gmm.Dim());
  Matrix<double> data_tmp(gmm.NumGauss(), gmm.Dim());
  Matrix<double> mat_tmp(gmm.NumGauss(), gmm.Dim());
  Vector<double> post_scale(gmm.NumGauss());
  means_invvars.CopyFromMat(gmm.means_invvars(), kNoTrans);
  inv_vars.CopyFromMat(gmm.inv_vars(), kNoTrans);
  // Replicate the frame's feature vector on every row (one row per
  // Gaussian); data_tmp starts zeroed, so each row becomes `data`.
  for (int32 i = 0; i < data_tmp.NumRows(); i++) {
    data_tmp.Row(i).AddVec(1.0, data);
  }
  // data_tmp(i,:) = data / var_i; then mat_tmp(i,:) = (mean_i - data) / var_i.
  data_tmp.MulElements(inv_vars);
  mat_tmp.CopyFromMat(means_invvars, kNoTrans);
  mat_tmp.AddMat(-1.0, data_tmp, kNoTrans);
  // Posteriors scaled by the lattice probability scale.
  post_scale.CopyFromVec(posteriors);
  post_scale.Scale(config_.lat_prob_scale);
  // direct_diff += mat_tmp^T * post_scale (weighted sum over Gaussians).
  direct_diff->AddMatVec(1.0, mat_tmp, kTrans, post_scale, 1.0);
}
// Accumulates into *indirect_diff the "indirect" part of the fMPE
// differential for one frame: the effect of a feature change on the
// objective through the model parameters, using the per-component mean and
// variance differentials stored in fmpe_diaggmm_diff_acc.
void FmpeAccs::AccumulateInDirectDiffFromPosteriors(const DiagGmm &gmm,
                                  const FmpeAccumModelDiff &fmpe_diaggmm_diff_acc,
                                  const VectorBase<BaseFloat> &data,
                                  const VectorBase<BaseFloat> &posteriors,
                                  Vector<double> *indirect_diff) {
  KALDI_ASSERT(gmm.NumGauss() == fmpe_diaggmm_diff_acc.NumGauss());
  KALDI_ASSERT(gmm.NumGauss() == posteriors.Dim());
  KALDI_ASSERT(gmm.Dim() == fmpe_diaggmm_diff_acc.Dim());
  KALDI_ASSERT(gmm.Dim() == Dim());
  KALDI_ASSERT(static_cast<int32>(data.Dim()) == Dim());
  KALDI_ASSERT(indirect_diff->Dim() == Dim());
  Matrix<double> mat_tmp(gmm.NumGauss(), gmm.Dim());
  Vector<double> vec_tmp(gmm.NumGauss());
  // mat_tmp(i,:) = -2 * (mean_i - data) .* var_diff_i + mean_diff_i.
  gmm.GetMeans(&mat_tmp);
  for (int32 i = 0; i < mat_tmp.NumRows(); i++) {
    mat_tmp.Row(i).AddVec(-1.0, data);
  }
  mat_tmp.MulElements(fmpe_diaggmm_diff_acc.variance_diff_accumulator());
  mat_tmp.Scale(-2.0);
  mat_tmp.AddMat(1.0, fmpe_diaggmm_diff_acc.mean_diff_accumulator(), kNoTrans);
  // This scaling logically belongs in the model-differential computation;
  // it is applied here for convenience.
  mat_tmp.Scale(config_.lat_prob_scale);
  // Weight each Gaussian's row by posterior / MLE occupancy, then sum.
  vec_tmp.CopyFromVec(posteriors);
  vec_tmp.DivElemByElem(fmpe_diaggmm_diff_acc.mle_occupancy());
  indirect_diff->AddMatVec(1.0, mat_tmp, kTrans, vec_tmp, 1.0);
}
void FmpeAccs::AccumulateInDirectDiffFromDiag(const DiagGmm &gmm,
const FmpeAccumModelDiff &fmpe_diaggmm_diff_acc,
const VectorBase<BaseFloat> &data,
BaseFloat frame_posterior,
Vector<double> *indirect_diff) {
KALDI_ASSERT(gmm.NumGauss() == fmpe_diaggmm_diff_acc.NumGauss());
KALDI_ASSERT(gmm.Dim() == fmpe_diaggmm_diff_acc.Dim());
KALDI_ASSERT(gmm.Dim() == Dim());
KALDI_ASSERT(static_cast<int32>(data.Dim()) == Dim());
KALDI_ASSERT(indirect_diff->Dim() == Dim());
Vector<BaseFloat> posteriors(gmm.NumGauss());
gmm.ComponentPosteriors(data, &posteriors);
posteriors.Scale(frame_posterior);
AccumulateInDirectDiffFromPosteriors(gmm, fmpe_diaggmm_diff_acc,
data, posteriors, indirect_diff);
}
// Accumulates the positive (p_) and negative (n_) statistics from the total
// per-frame differential (direct + indirect) and the sparse feature ht.
// For each ht entry, the outer product diff * ht_vec^T is split
// element-wise: positive elements go into p_, negated negative elements
// into n_ (so both accumulators hold non-negative values).
void FmpeAccs::AccumulateFromDifferential(const VectorBase<double> &direct_diff,
                                          const VectorBase<double> &indirect_diff,
                                          const std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > &ht) {
  KALDI_ASSERT((direct_diff.Dim() == indirect_diff.Dim()));
  KALDI_ASSERT(direct_diff.Dim() == Dim());
  // Total differential for this frame.
  Vector<double> diff(direct_diff);
  diff.AddVec(1.0, indirect_diff);
  int32 dim = gmm_.Dim();
  Matrix<double> tmp(dim, dim + 1);
  tmp.SetZero();
  /// accumulate the p and n statistics
  for (int32 i = 0; i < ht.size(); i++) {
    int32 cont_index = ht[i].first;
    for (int32 j = 0; j < ht[i].second.size(); j++) {
      int32 gauss_index = ht[i].second[j].first;
      // tmp = diff * ht_vec^T (rank-one outer product; tmp was zeroed).
      tmp.AddVecVec(1.0, diff, ht[i].second[j].second);
      for (int32 r = 0; r < dim; r++) {
        for (int32 c = 0;c < (dim + 1); c++) {
          if (tmp(r, c) > 0.0) {
            p_[gauss_index][cont_index](r, c) += tmp(r, c);
          }
          else {
            // Stored with positive sign: n_ accumulates |negative part|.
            n_[gauss_index][cont_index](r, c) -= tmp(r, c);
          }
        }
      }
      tmp.SetZero();  // reset before the next outer product
    }
  }
  /// accumulate the direct/indirect and total differentials
  diff_.AddVec(1.0, diff);
  direct_diff_.AddVec(1.0, direct_diff);
  indirect_diff_.AddVec(1.0, indirect_diff);
}
// Constructs an updater sized to match the accumulators: allocates the
// projection matrices M_ with shape [nGauss][nContExp][dim][dim + 1].
// (Also removes the stray ';' that followed the constructor body.)
FmpeUpdater::FmpeUpdater(const FmpeAccs &accs)
    : config_(accs.config()), dim_(accs.Dim()) {
  Init(config_.gmm_num_comps, config_.context_windows.NumRows(), dim_);
}
// Copy constructor: copies the config, average standard deviation and
// dimension, and deep-copies the projection matrices M_ (possibly empty).
FmpeUpdater::FmpeUpdater(const FmpeUpdater &other)
    : config_(other.config_), avg_std_var_(other.avg_std_var_),
      dim_(other.dim_) {
  M_.resize(other.M_.size());
  for (size_t g = 0; g < M_.size(); ++g) {
    M_[g].resize(other.M_[g].size());
    for (size_t c = 0; c < M_[g].size(); ++c) {
      M_[g][c].Resize(other.M_[g][c].NumRows(), other.M_[g][c].NumCols());
      M_[g][c].CopyFromMat(other.M_[g][c], kNoTrans);
    }
  }
}
// Allocates (and zeroes) the projection matrices M_:
// [num_gmm_gauss][con_exp] blocks, each of size dim x (dim + 1),
// plus the average-standard-deviation vector.
void FmpeUpdater::Init(int32 num_gmm_gauss, int32 con_exp, int32 dim) {
  M_.resize(num_gmm_gauss);
  for (std::vector<std::vector<Matrix<double> > >::iterator it = M_.begin();
       it != M_.end(); ++it) {
    it->resize(con_exp);
    for (int32 j = 0; j < con_exp; ++j)
      (*it)[j].Resize(dim, dim + 1);
  }
  avg_std_var_.Resize(dim);
}
// Serializes the updater: Gaussian count, context-expansion row count and
// feature dimension, then (if allocated) the projection matrices M_,
// converted to BaseFloat.  Counterpart of FmpeUpdater::Read.
void FmpeUpdater::Write(std::ostream &out_stream, bool binary) const {
  uint32 tmp_uint32;
  WriteToken(out_stream, binary, "<FMPE>");
  WriteToken(out_stream, binary, "<NumGaussians>");
  tmp_uint32 = static_cast<uint32>(config_.gmm_num_comps);
  WriteBasicType(out_stream, binary, tmp_uint32);
  WriteToken(out_stream, binary, "<LengthContExp>");
  tmp_uint32 = static_cast<uint32>(config_.context_windows.NumRows());
  WriteBasicType(out_stream, binary, tmp_uint32);
  WriteToken(out_stream, binary, "<DIMENSION>");
  WriteBasicType(out_stream, binary, dim_);
  if (!binary) out_stream << "\n";
  // convert into BaseFloat before writing things
  Matrix<BaseFloat> mat_bf(dim_, dim_ + 1);
  if (M_.size() != 0) {
    WriteToken(out_stream, binary, "<PROJ_MAT>");
    // Matrices are written in [gaussian][context] order.
    for (int32 i = 0; i < config_.gmm_num_comps; ++i) {
      for (int32 j = 0; j < config_.context_windows.NumRows(); ++j) {
        mat_bf.CopyFromMat(M_[i][j], kNoTrans);
        mat_bf.Write(out_stream, binary);
      }
    }
  }
  WriteToken(out_stream, binary, "</FMPE>");
}
// Deserializes the updater (counterpart of Write): reads the header fields,
// then token-identified sections until "</FMPE>".  Unknown tokens are an
// error.
void FmpeUpdater::Read(std::istream &in_stream, bool binary) {
  uint32 tmp_uint32;
  std::string token;
  ExpectToken(in_stream, binary, "<FMPE>");
  ExpectToken(in_stream, binary, "<NumGaussians>");
  ReadBasicType(in_stream, binary, &tmp_uint32);
  int32 num_gaussians = static_cast<int32>(tmp_uint32);
  ExpectToken(in_stream, binary, "<LengthContExp>");
  ReadBasicType(in_stream, binary, &tmp_uint32);
  int32 length_cont_exp = static_cast<int32>(tmp_uint32);
  ExpectToken(in_stream, binary, "<DIMENSION>");
  ReadBasicType(in_stream, binary, &dim_);
  ReadToken(in_stream, binary, &token);
  while (token != "</FMPE>") {
    if (token == "<PROJ_MAT>") {
      // Projection matrices in [gaussian][context] order, matching Write().
      M_.resize(num_gaussians);
      for (size_t i = 0; i < M_.size(); ++i) {
        M_[i].resize(length_cont_exp);
        for (size_t j = 0; j < M_[i].size(); ++j) {
          M_[i][j].Read(in_stream, binary);
        }
      }
    } else {
      KALDI_ERR << "Unexpected token '" << token << "' in model file ";
    }
    ReadToken(in_stream, binary, &token);
  }
}
// Computes avg_std_var_: the average, over all Gaussians in the acoustic
// model, of the per-dimension standard deviation (sqrt of the variance).
void FmpeUpdater::ComputeAvgStandardDeviation(const AmDiagGmm &am) {
  Matrix<double> vars_tmp;
  Vector<double> vec_tmp(am.Dim());
  for (int32 i = 0; i < am.NumPdfs(); i++) {
    const DiagGmm &gmm = am.GetPdf(i);
    gmm.GetVars(&vars_tmp);
    vars_tmp.ApplyPow(0.5);  // variance -> standard deviation
    vec_tmp.AddRowSumMat(vars_tmp);  // sum over this pdf's components
  }
  // BUG FIX: '1 / am.NumGauss()' was integer division, which evaluates to 0
  // for any model with more than one Gaussian and zeroed out avg_std_var_.
  vec_tmp.Scale(1.0 / am.NumGauss());
  avg_std_var_.CopyFromVec(vec_tmp);
}
// Performs one fMPE update step on the projection matrices:
//   M += ((p - n) / (p + n)) row-scaled by (avg_std_var / E),
// where p and n are the accumulated positive/negative statistics.
// obj_change_out / count_out are not yet filled in (see TODO at the end).
void FmpeUpdater::Update(const FmpeAccs &accs,
                         BaseFloat *obj_change_out,
                         BaseFloat *count_out) {
  KALDI_ASSERT((M_.size() == accs.pos().size()) && (M_.size() == accs.neg().size()));
  KALDI_ASSERT((M_[0].size() == accs.pos()[0].size()) && (M_[0].size() == accs.neg()[0].size())
               && M_[0].size() == config_.context_windows.NumRows());
  KALDI_ASSERT((M_[0][0].NumRows() == accs.pos()[0][0].NumRows())
               && (M_[0][0].NumRows() == accs.neg()[0][0].NumRows())
               && (M_[0][0].NumRows() == avg_std_var_.Dim()));
  KALDI_ASSERT((M_[0][0].NumCols() == accs.pos()[0][0].NumCols())
               && (M_[0][0].NumCols() == accs.neg()[0][0].NumCols())
               && (M_[0][0].NumCols() == (M_[0][0].NumRows() + 1)));
  int32 ngauss = M_.size();
  int32 n_cont_exp = M_[0].size();
  int32 dim = M_[0][0].NumRows();
  Matrix<double> pandn_add_tmp(dim, dim + 1);
  Matrix<double> pandn_sub_tmp(dim, dim + 1);
  // Per-dimension learning rate: average standard deviation divided by E
  // (E is double, so this division is floating-point).
  Vector<double> vec_tmp(avg_std_var_);
  vec_tmp.Scale(1 / config_.E);
  KALDI_LOG << "Updating the projection matrix M, the dim is: [ "
            << ngauss << " ][ " << n_cont_exp << " ][ " << dim << " ][ " << dim + 1
            << " ] -> [nGauss][nContExp][fea_dim][fea_dim + 1]";
  for (int32 gauss_index = 0; gauss_index < ngauss; gauss_index++) {
    for (int32 icon_exp = 0; icon_exp < n_cont_exp; icon_exp++) {
      // Element-wise (p - n) / (p + n) for this [gaussian][context] block.
      pandn_add_tmp.CopyFromMat(accs.pos()[gauss_index][icon_exp], kNoTrans);
      pandn_add_tmp.AddMat(1.0, accs.neg()[gauss_index][icon_exp], kNoTrans);
      pandn_sub_tmp.CopyFromMat(accs.pos()[gauss_index][icon_exp], kNoTrans);
      pandn_sub_tmp.AddMat(-1.0, accs.neg()[gauss_index][icon_exp], kNoTrans);
      pandn_sub_tmp.DivElements(pandn_add_tmp);
      // Scale row r by avg_std_var_(r) / E, then apply the additive step.
      pandn_sub_tmp.MulRowsVec(vec_tmp);
      M_[gauss_index][icon_exp].AddMat(1.0, pandn_sub_tmp, kNoTrans);
    }
  }
  /// add some code to calculate the objective function change // TODO
}
// Clusters the Gaussians of `gmm` bottom-up into num_cluster_centers
// centers, writes the resulting centers as a DiagGmm into
// *ubm_cluster_centers, and records each original Gaussian's assigned
// center index in *cluster_center_map.
void ClusterGmmToClusterCenters(const DiagGmm &gmm,
                                int32 num_cluster_centers,
                                BaseFloat cluster_varfloor,
                                DiagGmm *ubm_cluster_centers,
                                std::vector<int32> *cluster_center_map) {
  // Bottom-up clustering of the Gaussians in the gmm model
  KALDI_ASSERT(num_cluster_centers < gmm.NumGauss());
  int32 dim = gmm.Dim();
  Vector<BaseFloat> tmp_mean(dim);
  Vector<BaseFloat> tmp_var(dim);
  int32 num_gaussians = gmm.NumGauss();
  std::vector<Clusterable*> gauss_clusters;
  gauss_clusters.reserve(num_cluster_centers);
  // Convert each Gaussian into weighted (x, x^2) statistics for clustering.
  for (int32 gauss_index = 0; gauss_index < num_gaussians; gauss_index++) {
    gmm.GetComponentMean(gauss_index, &tmp_mean);
    gmm.GetComponentVariance(gauss_index, &tmp_var);
    tmp_var.AddVec2(1.0, tmp_mean);  // make it x^2 stats.
    BaseFloat this_weight = gmm.weights()(gauss_index);
    tmp_mean.Scale(this_weight);
    tmp_var.Scale(this_weight);
    gauss_clusters.push_back(new GaussClusterable(tmp_mean, tmp_var,
                                                  cluster_varfloor, this_weight));
  }
  std::vector<Clusterable*> gauss_clusters_out;
  KALDI_VLOG(1) << "Creating " << num_cluster_centers << " gaussian clusters centers.";
  ClusterBottomUp(gauss_clusters, kBaseFloatMax, num_cluster_centers,
                  &gauss_clusters_out,
                  cluster_center_map /*get the cluster assignments*/);
  DeletePointers(&gauss_clusters);
  // Next, put the clustered Gaussians centers into a single GMM.
  KALDI_VLOG(1) << "Putting " << num_cluster_centers << " Gaussians cluster centers"
                << "into a single GMM model.";
  Matrix<BaseFloat> tmp_means(num_cluster_centers, dim);
  Matrix<BaseFloat> tmp_vars(num_cluster_centers, dim);
  Vector<BaseFloat> tmp_weights(num_cluster_centers);
  Vector<BaseFloat> tmp_vec(dim);
  DiagGmm tmp_gmm;
  // Convert each cluster's accumulated stats back to (weight, mean, var).
  for (int32 gauss_index = 0; gauss_index < num_cluster_centers; gauss_index++) {
    GaussClusterable *this_cluster = static_cast<GaussClusterable*>(
        gauss_clusters_out[gauss_index]);
    BaseFloat weight = this_cluster->count();
    tmp_weights(gauss_index) = weight;
    tmp_vec.CopyFromVec(this_cluster->x_stats());
    tmp_vec.Scale(1/weight);
    tmp_means.CopyRowFromVec(tmp_vec, gauss_index);
    tmp_vec.CopyFromVec(this_cluster->x2_stats());
    tmp_vec.Scale(1/weight);
    tmp_vec.AddVec2(-1.0, tmp_means.Row(gauss_index));  // x^2 stats to var.
    tmp_vars.CopyRowFromVec(tmp_vec, gauss_index);
  }
  DeletePointers(&gauss_clusters_out);
  tmp_gmm.Resize(num_cluster_centers, dim);
  tmp_weights.Scale(1.0/tmp_weights.Sum());  // normalize weights to sum to 1
  tmp_gmm.SetWeights(tmp_weights);
  tmp_vars.InvertElements();  // need inverse vars...
  tmp_gmm.SetInvVarsAndMeans(tmp_vars, tmp_means);
  KALDI_VLOG(1) << "Obtain " << tmp_gmm.NumGauss() << " Gaussians cluster centers.";
  ubm_cluster_centers->CopyFromDiagGmm(tmp_gmm);
}
// Builds the two GMMs needed for fMPE offset-feature computation.
void ObtainUbmAndSomeClusterCenters(
         const AmDiagGmm &am,
         const Vector<BaseFloat> &state_occs,
         const FmpeConfig &config,
         DiagGmm *gmm_out,
         DiagGmm *gmm_cluster_centers_out,
         std::vector<int32> *gaussian_cluster_center_map_out) {
  // Step 1: collapse the whole acoustic model into one GMM with
  // config.gmm_num_comps components (same algorithm as the SGMM UBM
  // initialization).
  kaldi::UbmClusteringOptions ubm_opts;
  ubm_opts.ubm_numcomps = config.gmm_num_comps;
  ClusterGaussiansToUbm(am, state_occs, ubm_opts, gmm_out);
  // Step 2: cluster that GMM's Gaussians into centers, so Gaussian
  // posteriors can later be evaluated against only the most likely centers.
  ClusterGmmToClusterCenters(*gmm_out, config.gmm_num_cluster_centers,
                             config.cluster_varfloor,
                             gmm_cluster_centers_out,
                             gaussian_cluster_center_map_out);
}
} // End of namespace kaldi

Просмотреть файл

@ -1,388 +0,0 @@
// gmm/fmpe-am-diag-gmm.h
// Copyright 2009-2011 Yanmin Qian
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_GMM_FMPE_AM_DIAG_GMM_H_
#define KALDI_GMM_FMPE_AM_DIAG_GMM_H_ 1
#include <vector>
#include "gmm/am-diag-gmm.h"
#include "gmm/mle-diag-gmm.h"
#include "gmm/ebw-diag-gmm.h"
namespace kaldi {
/// Configuration for fMPE training; registered on the command line.
struct FmpeConfig {
  /// Number of the Gaussian components in the gmm model
  int32 gmm_num_comps;
  /// Number of the Gaussian cluster centers for fast evaluation
  int32 gmm_num_cluster_centers;
  /// Variance floor used when clustering Gaussians into centers
  BaseFloat cluster_varfloor;
  /// Number of highest-scoring of the best cluster centers
  int32 gmm_cluster_centers_nbest;
  /// Number of highest-scoring of the best gaussians
  int32 gmm_gaussian_nbest;
  /// The lattice probability scale
  double lat_prob_scale;
  /// The constant that controls the overall learning rate
  double E;
  /// The Matrix indicates the length of context expansion
  /// and the weight of each corresponding context frame. e.g.[9][17]
  Matrix<BaseFloat> context_windows;
  /*
   Matrix<BaseFloat> context_windows;
   // Normal dimension is [9][17]
   // Example would be
   // context_windows = [ 0 0 0 0 0 0 0 0 1.0 0 0 0 0 0 0 0 0
   //                     0 0 0 0 0 0 0 0 0 1.0 0 0 0 0 0 0 0
   //  .... etc.
   // Then your nlength_context_expansion variable equals
   // the NumRows() of this.
   // Then you don't have to hard-code the computation in ComputeContExpOffsetFeature.
   // Note: the code in ComputeContExpOffsetFeature that iterates over
   // context_windows will check for zeros, so it will not have to do any work if
   // it finds a zero feature.
   // Also be careful when the same Gaussian index is present on more than one frame,
   // that you are adding the values together, not replacing one with the other or
   // creating duplicates with the same index. [maybe use function DeDuplicateVector(
   // std::vector<std::pair<int32, Vector<BaseFloat> >*), that would first sort on the
   // int32 and then add together and combine any sets of elements with the same
   // integer value.
  */
  FmpeConfig() {
    gmm_num_comps = 2048;
    gmm_num_cluster_centers = 128;
    cluster_varfloor = 0.01;
    gmm_cluster_centers_nbest = 25;
    gmm_gaussian_nbest = 2;
    lat_prob_scale = 0.083;
    E = 10.0;
  }
  void Register(ParseOptions *po) {
    po->Register("gmm-num-comps", &gmm_num_comps, "Number of the Gaussian"
                 " components in the gmm model to calculate the gaussian posteriors.");
    po->Register("gmm-num-cluster-centers", &gmm_num_cluster_centers, "Number"
                 " of the Gaussian cluster centers for fast posteriors evaluation.");
    po->Register("cluster-varfloor", &cluster_varfloor,
                 "Variance floor used in bottom-up state clustering.");
    po->Register("gmm-cluster-centers-nbest", &gmm_cluster_centers_nbest,
                 "Number of highest-scoring of the best cluster centers.");
    po->Register("gmm-gaussian-nbest", &gmm_gaussian_nbest, "Number of"
                 " of highest-scoring of the best gaussians.");
    po->Register("lat-prob-scale", &lat_prob_scale,
                 "The lattice probability scale, very important.");
    // Typo fix in the user-visible help text: "contrals" -> "controls".
    po->Register("E", &E, "The constant that controls the overall learning rate.");
  }
};
/** \class FmpeAccumModelDiff
 * Class for computing the basic model parameter differentials from
 * the MPE statistics produced in the first pass of fMPE training.
 */
class FmpeAccumModelDiff {
 public:
  FmpeAccumModelDiff(): dim_(0), num_comp_(0) {}
  explicit FmpeAccumModelDiff(const DiagGmm &gmm) {
    Resize(gmm);
  }
  void Read(std::istream &in_stream, bool binary);
  void Write(std::ostream &out_stream, bool binary) const;
  /// Allocates memory for accumulators
  void Resize(int32 num_comp, int32 dim);
  /// Calls Resize(num_comp, dim) with the sizes taken from gmm
  void Resize(const DiagGmm &gmm);
  /// Returns the number of mixture components
  int32 NumGauss() const { return num_comp_; }
  /// Returns the dimensionality of the feature vectors
  int32 Dim() const { return dim_; }
  /// Zeroes all accumulators
  void SetZero();
  // Accessors
  /// Per-component occupancies from the MLE accumulators
  const Vector<double>& mle_occupancy() const { return mle_occupancy_; }
  /// d(objective)/d(mean), one row per component
  const Matrix<double>& mean_diff_accumulator() const { return mean_diff_accumulator_; }
  /// d(objective)/d(variance), one row per component
  const Matrix<double>& variance_diff_accumulator() const { return variance_diff_accumulator_; }
  /// Computes the model parameter differentials using the statistics from
  /// the MPE training (numerator and denominator accumulators), applying
  /// I-smoothing to the numerator accs, if needed, using mle_acc.
  void ComputeModelParaDiff(const DiagGmm &diag_gmm,
                            const AccumDiagGmm &num_acc,
                            const AccumDiagGmm &den_acc,
                            const AccumDiagGmm &mle_acc);
 private:
  int32 dim_;
  int32 num_comp_;
  /// Accumulators
  Vector<double> mle_occupancy_;
  Matrix<double> mean_diff_accumulator_;
  Matrix<double> variance_diff_accumulator_;
  // Cannot have copy constructor and assignment operator
  KALDI_DISALLOW_COPY_AND_ASSIGN(FmpeAccumModelDiff);
};
// Sizes the accumulators to match the GMM's component count and dimension.
inline void FmpeAccumModelDiff::Resize(const DiagGmm &gmm) {
  Resize(gmm.NumGauss(), gmm.Dim());
}
/** \class FmpeAccs
 * Class for accumulating the positive and negative statistics used in the
 * feature-level minimum phone error (fMPE) estimate of the projection
 * matrix M.  The acoustic model is a set of diagonal Gaussian mixture
 * models.
 */
class FmpeAccs {
 public:
  explicit FmpeAccs(const FmpeConfig &config)
    : config_(config) {};
  ~FmpeAccs() {}
  void Read(std::istream &in_stream, bool binary, bool add);
  void Write(std::ostream &out_stream, bool binary) const;
  /// Read the am model's parameters differentials
  void ReadModelDiffs(std::istream &in_stream, bool binary);
  /// Initializes the P and N statistics, and model parameter differentials if needed
  void Init(const AmDiagGmm &am_model, bool update);
  /// Initializes the P and N statistics, and diff statistics
  void InitPNandDiff(int32 num_gmm_gauss, int32 con_exp, int32 dim);
  /// Initializes the model parameter differentials
  void InitModelDiff(const AmDiagGmm &model);
  /// Initializes the GMMs for computing the high dimensional features
  void InitializeGMMs(const DiagGmm &gmm, const DiagGmm &gmm_cluster_centers,
                      std::vector<int32> &gaussian_cluster_center_map);
  /// Compute the offset feature given one frame data
  void ComputeOneFrameOffsetFeature(const VectorBase<BaseFloat>& data,
               std::vector<std::pair<int32, Vector<double> > > *offset) const;
  /// Compute all the offset features given the whole file data
  void ComputeWholeFileOffsetFeature(const MatrixBase<BaseFloat>& data,
               std::vector<std::vector<std::pair<int32, Vector<double> > > > *whole_file_offset) const;
  /// Compute the context-expansion high dimension feature.
  /// The high dimension offset feature with the context expansion: "ht";
  /// the vector "ht" stores the expanded offset feature corresponding to
  /// each context. Each element of "ht" is the relative context's
  /// offset feature, stored as a pair of the used
  /// gaussian index and the corresponding offset feature
  /// vector. This structure is designed for the sparse vector ht.
  /// dim is [nContExp * nGaussian * (fea_dim + 1)]
  /// "offset_win" stores the current corresponding offset features
  /// which are used to compute "ht"
  void ComputeContExpOffsetFeature(
         const std::vector<std::vector<std::pair<int32, Vector<double> > >* > &offset_win,
         std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > *ht) const;
  /// Obtain the currently-needed context-expansion high dimension feature,
  /// using the whole file's offset features as the inputs, indexed
  /// by the current frame's number frame_index
  void ComputeHighDimemsionFeature(
         const std::vector<std::vector<std::pair<int32, Vector<double> > > > &whole_file_offset_feat,
         int32 frame_index,
         std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > *ht) const;
  /// Project the high dimension features down to the dimension of the original
  /// features and add them to the original features.
  /// This is the sparse multiply using the non-sparse matrix M and
  /// the sparse high dimension vector ht
  void ProjectHighDimensionFeature(
         const std::vector< std::vector< Matrix<double> > > &M,
         const std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > &ht,
         Vector<double> *fea_out) const;
  /// Add the projected feature to the old feature and obtain the new fmpe feature
  void ObtainNewFmpeFeature(const VectorBase<BaseFloat> &data,
         const std::vector< std::vector< Matrix<double> > > &M,
         const std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > &ht,
         Vector<double> *fea_new) const;
  /// Accumulate the direct differentials
  void AccumulateDirectDiffFromPosteriors(const DiagGmm &gmm,
                                          const VectorBase<BaseFloat> &data,
                                          const VectorBase<BaseFloat> &posteriors,
                                          Vector<double> *direct_diff);
  /// Accumulate the indirect differentials from posteriors
  void AccumulateInDirectDiffFromPosteriors(const DiagGmm &gmm,
                                  const FmpeAccumModelDiff &fmpe_diaggmm_diff_acc,
                                  const VectorBase<BaseFloat> &data,
                                  const VectorBase<BaseFloat> &posteriors,
                                  Vector<double> *indirect_diff);
  /// Accumulate the indirect differentials from a DiagGmm model
  void AccumulateInDirectDiffFromDiag(const DiagGmm &gmm,
                               const FmpeAccumModelDiff &fmpe_diaggmm_diff_acc,
                               const VectorBase<BaseFloat> &data,
                               BaseFloat frame_posterior,
                               Vector<double> *indirect_diff);
  /// Accumulate the statistics about the positive and negative differential
  void AccumulateFromDifferential(const VectorBase<double> &direct_diff,
                                  const VectorBase<double> &indirect_diff,
                                  const std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > &ht);
  // Accessors
  FmpeAccumModelDiff& GetAccsModelDiff(int32 pdf_index);
  const FmpeAccumModelDiff& GetAccsModelDiff(int32 pdf_index) const;
  const std::vector< std::vector< Matrix<double> > >& pos() const { return p_; }
  const std::vector< std::vector< Matrix<double> > >& neg() const { return n_; }
  const FmpeConfig& config() const { return config_; }
  /// Returns the number of mixture components in the GMM model
  int32 NumGaussInGmm() const { return gmm_.NumGauss(); }
  /// Returns the number of cluster centers in the cluster center GMM
  int32 NumClusterCenter() const { return gmm_cluster_centers_.NumGauss(); }
  /// Returns the dimensionality of the feature vectors
  int32 Dim() const { return dim_; }
 private:
  FmpeConfig config_;
  /// These contain the gmm models used to calculate the high dimension
  /// offset feature: one computes the high dimension vector's gaussian
  /// posteriors, and the other one is just for more efficient computing
  /// using the most likely cluster centers
  DiagGmm gmm_;
  DiagGmm gmm_cluster_centers_;
  /// The mapping between the gmm_ model and the cluster centers of gmm_cluster_centers_
  std::vector<int32> gaussian_cluster_center_map_;
  /// The basic model parameter differentials for the AmDiagGmm
  std::vector<FmpeAccumModelDiff*> model_diff_accumulators_;
  /// The positive accumulated matrix p_ij; dim is [nGauss][nContExp][fea_dim][fea_dim + 1].
  std::vector< std::vector< Matrix<double> > > p_;
  /// The negative accumulated matrix n_ij; dim is [nGauss][nContExp][fea_dim][fea_dim + 1].
  std::vector< std::vector< Matrix<double> > > n_;
  /// The summation of the differential
  Vector<double> diff_;
  /// The summation of the direct differential
  Vector<double> direct_diff_;
  /// The summation of the indirect differential
  Vector<double> indirect_diff_;
  /// The feature dim
  int32 dim_;
  // Cannot have copy constructor and assignment operator
  KALDI_DISALLOW_COPY_AND_ASSIGN(FmpeAccs);
};
// Mutable accessor for the per-pdf model-parameter-differential accumulator.
// Checks that the pdf index is in range and that the slot has actually been
// allocated before dereferencing it.
inline FmpeAccumModelDiff& FmpeAccs::GetAccsModelDiff(int32 pdf_index) {
  KALDI_ASSERT(static_cast<size_t>(pdf_index) < model_diff_accumulators_.size());
  FmpeAccumModelDiff *acc = model_diff_accumulators_[pdf_index];
  KALDI_ASSERT(acc != NULL);
  return *acc;
}
// Const overload of GetAccsModelDiff(): returns the accumulated
// model-parameter differentials for one pdf.  Asserts that the index is in
// range and the accumulator slot is non-NULL (presumably allocated by
// Init() -- confirm against the Init() implementation).
inline const FmpeAccumModelDiff& FmpeAccs::GetAccsModelDiff(int32 pdf_index) const {
  KALDI_ASSERT((static_cast<size_t>(pdf_index) < model_diff_accumulators_.size())
               && (model_diff_accumulators_[pdf_index] != NULL));
  return *(model_diff_accumulators_[pdf_index]);
}
/** \class FmpeUpdater
 *  Class containing the functions that update the feature-level minimum
 *  phone error (fMPE) estimate of the projection matrix M, which adds
 *  offsets to the original features.
 *  The underlying acoustic model is a set of diagonal-covariance Gaussian
 *  mixture models.
 */
class FmpeUpdater {
 public:
  explicit FmpeUpdater(const FmpeAccs &accs);

  ~FmpeUpdater() {}

  // provide copy constructor.
  explicit FmpeUpdater(const FmpeUpdater &other);

  void Read(std::istream &in_stream, bool binary);
  void Write(std::ostream &out_stream, bool binary) const;

  /// Initializes the feature projection matrix M for the given number of
  /// Gaussians, context expansions, and feature dimension.
  void Init(int32 num_gmm_gauss, int32 con_exp, int32 dim);

  /// compute the average standard deviation of gaussians
  /// in the current AmDiagGmm set (stored in avg_std_var_;
  /// presumably consumed by Update() -- confirm).
  void ComputeAvgStandardDeviation(const AmDiagGmm &am);

  /// Update the projection matrix M from the accumulated statistics;
  /// outputs the objective-function change and the frame count.
  void Update(const FmpeAccs &accs,
              BaseFloat *obj_change_out,
              BaseFloat *count_out);

  // Accessors
  const std::vector< std::vector< Matrix<double> > >& ProjMat() const { return M_; }
  const FmpeConfig& config() const { return config_; }

 private:
  FmpeConfig config_;

  /// The average standard deviation of gaussians in the current AmDiagGmm set
  Vector<double> avg_std_var_;

  /// The feature projection matrix; dim is [nGauss][nContExp][fea_dim][fea_dim + 1].
  std::vector< std::vector< Matrix<double> > > M_;

  /// The feature dim
  int32 dim_;
};
/** Clusters the Gaussians in the gmm model down to num_cluster_centers
 *  cluster centers.  The clustered GMM is written to *ubm_cluster_centers
 *  and the Gaussian-index -> cluster-center-index mapping to
 *  *cluster_center_map; cluster_varfloor is the variance floor used
 *  during clustering.
 */
void ClusterGmmToClusterCenters(const DiagGmm &gmm,
                                int32 num_cluster_centers,
                                BaseFloat cluster_varfloor,
                                DiagGmm *ubm_cluster_centers,
                                std::vector<int32> *cluster_center_map);

/** First clusters the Gaussians in an acoustic model to a single GMM with a
 *  specified number of components, using the same algorithm as the SGMM's
 *  UBM initialization, and then clusters the Gaussians of that GMM to some
 *  cluster centers.  The cluster centers allow more efficient evaluation of
 *  Gaussian posteriors using only the most likely clusters.
 */
void ObtainUbmAndSomeClusterCenters(
    const AmDiagGmm &am,
    const Vector<BaseFloat> &state_occs,
    const FmpeConfig &config,
    DiagGmm *gmm_out,
    DiagGmm *gmm_cluster_centers_out,
    std::vector<int32> *gaussian_cluster_center_map_out);
} // End namespace kaldi
#endif // KALDI_GMM_FMPE_AM_DIAG_GMM_H_

Просмотреть файл

@ -15,13 +15,13 @@ BINFILES = gmm-init-mono gmm-est gmm-acc-stats-ali gmm-align \
gmm-est-fmllr-gpost gmm-est-fmllr gmm-est-regtree-fmllr-ali \
gmm-est-regtree-mllr gmm-decode-kaldi gmm-compute-likes \
gmm-decode-faster-regtree-mllr gmm-et-apply-c gmm-latgen-simple \
gmm-rescore-lattice gmm-decode-biglm-faster fmpe-gmm-model-diffs-est \
fmpe-gmm-acc-stats-gpost fmpe-gmm-sum-accs fmpe-init-gmms fmpe-gmm-est \
gmm-rescore-lattice gmm-decode-biglm-faster \
gmm-est-gaussians-ebw gmm-est-weights-ebw gmm-latgen-faster gmm-copy \
gmm-global-acc-stats gmm-global-est gmm-global-sum-accs gmm-gselect \
gmm-latgen-biglm-faster gmm-ismooth-stats gmm-global-get-frame-likes \
gmm-global-est-fmllr gmm-global-to-fgmm gmm-global-acc-stats-twofeats \
gmm-global-copy gmm-align-compiled-plusphones gmm-get-feat-deriv
gmm-global-copy gmm-align-compiled-plusphones gmm-get-feat-deriv \
gmm-fmpe-acc-stats gmm-acc-stats2
OBJFILES =

Просмотреть файл

@ -1,186 +0,0 @@
// gmmbin/fmpe-gmm-acc-stats-gpost.cc
// Copyright 2009-2011 Yanmin Qian
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "gmm/diag-gmm.h"
#include "gmm/am-diag-gmm.h"
#include "hmm/transition-model.h"
#include "gmm/fmpe-am-diag-gmm.h"
// Accumulates positive/negative fMPE stats from per-frame Gaussian-level
// posteriors.  Requires BOTH an EBW-style (discriminative) and an MLE
// posterior stream per utterance: the former drives the direct
// differential, the latter the indirect differential.
int main(int argc, char *argv[]) {
  using namespace kaldi;
  try {
    const char *usage =
        "Accumulate positive and negative stats for Fmpe training (reading in gaussian-level posteriors).\n"
        "Note: not yet tested.\n"
        "Usage: fmpe-gmm-acc-stats-gpost [options] <model-in> <model-diffs-in> <gmms-model-in> <feature-rspecifier> <gposteriors-ebw-rspecifier> <gposteriors-mle-rspecifier> <stats-out>\n"
        "e.g.: \n"
        " fmpe-gmm-acc-stats-gpost 1.mdl 1.model.diffs 1.gmm scp:train.scp ark:1.ebw.gpost ark:1.mle.gpost 1.fmpe.acc\n";

    typedef kaldi::int32 int32;
    bool binary = false;
    FmpeConfig fmpe_opts;
    int32 gmm_cluster_centers_nbest = 25;
    int32 gmm_gaussian_nbest = 2;
    double lat_prob_scale = 0.083;
    double E = 10.0;

    ParseOptions po(usage);
    po.Register("binary", &binary, "Write output in binary mode");
    po.Register("gmm-cluster-centers-nbest", &gmm_cluster_centers_nbest,
                "Number of highest-scoring of the best cluster centers.");
    po.Register("gmm-gaussian-nbest", &gmm_gaussian_nbest, "Number of"
                " of highest-scoring of the best gaussians.");
    po.Register("lat-prob-scale", &lat_prob_scale,
                "The lattice probability scale, very important.");
    // [fix: "contrals" -> "controls" in the help text]
    po.Register("E", &E, "The constant that controls the overall learning rate.");
    fmpe_opts.Register(&po);
    po.Read(argc, argv);

    if (po.NumArgs() != 7) {
      po.PrintUsage();
      exit(1);
    }

    std::string model_filename = po.GetArg(1),
        model_diffs_filename = po.GetArg(2),
        gmms_model_filename = po.GetArg(3),
        feature_rspecifier = po.GetArg(4),
        gposteriors_ebw_rspecifier = po.GetArg(5),
        gposteriors_mle_rspecifier = po.GetArg(6),
        accs_wxfilename = po.GetArg(7);

    using namespace kaldi;

    // Read the transition model and GMM-based acoustic model.
    AmDiagGmm am_gmm;
    TransitionModel trans_model;
    {
      bool binary;
      Input ki(model_filename, &binary);
      trans_model.Read(ki.Stream(), binary);
      am_gmm.Read(ki.Stream(), binary);
    }

    // Initialize the fMPE accumulators, including the per-pdf model
    // differentials estimated in a previous step.
    FmpeAccs fmpe_accs(fmpe_opts);
    fmpe_accs.Init(am_gmm, true);
    {
      bool binary;
      Input ki(model_diffs_filename, &binary);
      fmpe_accs.ReadModelDiffs(ki.Stream(), binary);
    }

    // Read the GMM used to compute posteriors, the cluster-center GMM used
    // to speed up evaluation, and the Gaussian -> cluster-center mapping.
    kaldi::DiagGmm gmm;
    kaldi::DiagGmm gmm_clusters;
    std::vector<int32> gaussian_cluster_center_map;
    {
      bool binary;
      Input ki(gmms_model_filename, &binary);
      gmm.Read(ki.Stream(), binary);
      gmm_clusters.Read(ki.Stream(), binary);
      ReadIntegerVector(ki.Stream(), binary, &gaussian_cluster_center_map);
    }
    fmpe_accs.InitializeGMMs(gmm, gmm_clusters, gaussian_cluster_center_map);

    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
    RandomAccessGauPostReader gposteriors_ebw_reader(gposteriors_ebw_rspecifier);
    RandomAccessGauPostReader gposteriors_mle_reader(gposteriors_mle_rspecifier);

    int32 num_done = 0, num_no_posterior = 0, num_other_error = 0;
    for (; !feature_reader.Done(); feature_reader.Next()) {
      std::string key = feature_reader.Key();
      // Both posterior streams are needed below, so skip the utterance if
      // either one is missing.
      // [fix: was &&, which let utterances with only one stream through and
      // would then die inside Value() on the missing key]
      if (!gposteriors_ebw_reader.HasKey(key) ||
          !gposteriors_mle_reader.HasKey(key)) {
        num_no_posterior++;
      } else {
        const Matrix<BaseFloat> &mat = feature_reader.Value();
        const GauPost &gpost_ebw = gposteriors_ebw_reader.Value(key);
        // [fix: the MLE posteriors were mistakenly read from the EBW reader]
        const GauPost &gpost_mle = gposteriors_mle_reader.Value(key);

        // Each posterior sequence must have one entry per feature frame.
        // [fix: was &&, which only caught the error when BOTH had wrong size]
        if ((static_cast<int32>(gpost_ebw.size()) != mat.NumRows()) ||
            (static_cast<int32>(gpost_mle.size()) != mat.NumRows())) {
          KALDI_WARN << "Gaussian Posterior vector has wrong size : gpost-ebw. " <<
              (gpost_ebw.size()) << "gpost-mle. " << (gpost_mle.size()) << " vs. "<< (mat.NumRows());
          num_other_error++;
          continue;
        }

        num_done++;

        std::vector<std::vector<std::pair<int32, Vector<double> > > > whole_file_offset;
        std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > ht;

        // Precompute the offset features once for the whole utterance.
        fmpe_accs.ComputeWholeFileOffsetFeature(mat, &whole_file_offset);

        for (size_t i = 0; i < mat.NumRows(); i++) {
          fmpe_accs.ComputeHighDimemsionFeature(whole_file_offset, i, &ht);

          Vector<double> direct_diff(mat.NumCols()), indirect_diff(mat.NumCols());

          /// compute the direct differentials (from the EBW posteriors)
          for (size_t j = 0; j < gpost_ebw[i].size(); j++) {
            int32 tid = gpost_ebw[i][j].first,  // transition identifier.
                pdf_id = trans_model.TransitionIdToPdf(tid);

            fmpe_accs.AccumulateDirectDiffFromPosteriors(am_gmm.GetPdf(pdf_id),
                                                         mat.Row(i),
                                                         gpost_ebw[i][j].second,
                                                         &direct_diff);
          }

          /// compute the indirect differentials (from the MLE posteriors)
          for (size_t j = 0; j < gpost_mle[i].size(); j++) {
            int32 tid = gpost_mle[i][j].first,  // transition identifier.
                pdf_id = trans_model.TransitionIdToPdf(tid);

            fmpe_accs.AccumulateInDirectDiffFromPosteriors(am_gmm.GetPdf(pdf_id),
                                                           fmpe_accs.GetAccsModelDiff(pdf_id),
                                                           mat.Row(i),
                                                           gpost_mle[i][j].second,
                                                           &indirect_diff);
          }

          fmpe_accs.AccumulateFromDifferential(direct_diff, indirect_diff, ht);
          ht.clear();
        }

        if (num_done % 50 == 0) {
          KALDI_LOG << "Processed " << num_done << " utterances.";
        }
      }
    }

    KALDI_LOG << "Done " << num_done << " files, " << num_no_posterior
              << " with no posteriors, " << num_other_error
              << " with other errors.";

    {
      Output ko(accs_wxfilename, binary);
      fmpe_accs.Write(ko.Stream(), binary);
    }
    KALDI_LOG << "Written accs.";
    if (num_done != 0) return 0;
    else return 1;
  } catch(const std::exception& e) {
    std::cerr << e.what();
    return -1;
  }
}

Просмотреть файл

@ -1,97 +0,0 @@
// gmmbin/fmpe-gmm-est.cc
// Copyright 2009-2011 Yanmin Qian
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "gmm/am-diag-gmm.h"
#include "tree/context-dep.h"
#include "hmm/transition-model.h"
#include "gmm/fmpe-am-diag-gmm.h"
// Re-estimates the fMPE feature-projection matrix from accumulated stats.
int main(int argc, char *argv[]) {
  try {
    using namespace kaldi;
    typedef kaldi::int32 int32;

    const char *usage =
        "Estimate fMPE transforms.\n"
        "Note: not yet tested.\n"
        "Usage: fmpe-gmm-est [options] <am-model-in> <fmpe-proj-matrix-in> <stats-in> <fmpe-proj-matrix-out>\n"
        "e.g.: fmpe-gmm-est 1.mdl 1.mat 1.acc 2.mat\n";  // [fix: example named the wrong program, "gmm-est"]

    bool binary_write = false;
    FmpeConfig fmpe_opts;

    ParseOptions po(usage);
    po.Register("binary", &binary_write, "Write output in binary mode");
    fmpe_opts.Register(&po);
    po.Read(argc, argv);

    if (po.NumArgs() != 4) {
      po.PrintUsage();
      exit(1);
    }

    std::string model_in_filename = po.GetArg(1),
        fmpe_proj_mat_in_filename = po.GetArg(2),
        stats_filename = po.GetArg(3),
        fmpe_proj_mat_out_filename = po.GetArg(4);

    // Read the transition model and GMM-based acoustic model.
    AmDiagGmm am_gmm;
    TransitionModel trans_model;
    {
      bool binary_read;
      Input ki(model_in_filename, &binary_read);
      trans_model.Read(ki.Stream(), binary_read);
      am_gmm.Read(ki.Stream(), binary_read);
    }

    // Read the accumulated fMPE stats.
    FmpeAccs fmpe_accs(fmpe_opts);
    {
      bool binary;
      Input ki(stats_filename, &binary);
      fmpe_accs.Read(ki.Stream(), binary, true);  // true == add; doesn't matter here.
    }

    // Read the current projection matrix.
    FmpeUpdater fmpe_updater(fmpe_accs);
    {
      bool binary;
      Input ki(fmpe_proj_mat_in_filename, &binary);
      fmpe_updater.Read(ki.Stream(), binary);
    }

    { // update the Fmpe projection matrix
      BaseFloat obj_change_out, count_out;
      fmpe_updater.ComputeAvgStandardDeviation(am_gmm);
      fmpe_updater.Update(fmpe_accs, &obj_change_out, &count_out);
    }

    {
      Output ko(fmpe_proj_mat_out_filename, binary_write);
      fmpe_updater.Write(ko.Stream(), binary_write);
    }

    KALDI_LOG << "Written Fmpe projection matrix to " << fmpe_proj_mat_out_filename;
  } catch(const std::exception& e) {
    std::cerr << e.what() << '\n';
    return -1;
  }
}

Просмотреть файл

@ -1,112 +0,0 @@
// gmmbin/fmpe-gmm-model-diffs-est.cc
// Copyright 2009-2011 Yanmin Qian
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "gmm/am-diag-gmm.h"
#include "tree/context-dep.h"
#include "hmm/transition-model.h"
#include "gmm/mle-am-diag-gmm.h"
//#include "gmm/ebw-am-diag-gmm.h" // TODO wait Arnab to finish the AccumAmEbwDiagGmm Class, then make it active
#include "gmm/fmpe-am-diag-gmm.h"
// Computes per-pdf model-parameter differentials (for fMPE's indirect
// differential) from EBW and MLE accumulators, and writes them out.
// NOTE: the actual differential computation is still disabled pending the
// AccumAmEbwDiagGmm class; the written diffs are currently just-initialized.
int main(int argc, char *argv[]) {
  try {
    using namespace kaldi;
    typedef kaldi::int32 int32;

    const char *usage =
        "Compute the model parameters differentials from the ebw accumulators (in mpe training) for fmpe training.\n"
        "Usage: fmpe-gmm-model-diffs-est [options] <model-in> <ebw-stats-in> <mle-stats-in> <model-diffs-out>\n"
        "e.g.: fmpe-gmm-model-diff-est 1.mdl 1.ebw.acc 1.mle.acc 1.model.diffs\n";

    bool binary = false;
    ParseOptions po(usage);
    po.Register("binary", &binary, "Write output in binary mode");
    po.Read(argc, argv);

    if (po.NumArgs() != 4) {
      po.PrintUsage();
      exit(1);
    }

    std::string model_in_filename = po.GetArg(1),
        ebw_stats_in_filename = po.GetArg(2),
        mle_stats_in_filename = po.GetArg(3),
        model_diffs_out_filename = po.GetArg(4);

    // Read the transition model and GMM-based acoustic model.
    AmDiagGmm am_gmm;
    TransitionModel trans_model;
    {
      bool binary_read;
      Input ki(model_in_filename, &binary_read);
      trans_model.Read(ki.Stream(), binary_read);
      am_gmm.Read(ki.Stream(), binary_read);
    }

    // Read the EBW (discriminative) stats; the GMM part is disabled for now.
    Vector<double> transition_ebw_accs;
    // AccumAmEbwDiagGmm gmm_ebw_accs; // TODO wait Arnab to finish the AccumAmEbwDiagGmm Class, then make it active
    {
      bool binary;
      Input ki(ebw_stats_in_filename, &binary);
      transition_ebw_accs.Read(ki.Stream(), binary);
      // TODO wait Arnab to finish the AccumAmEbwDiagGmm Class, then make it active
      // gmm_ebw_accs.Read(ki.Stream(), binary, true);  // true == add; doesn't matter here.
    }

    // Read the MLE stats.
    Vector<double> transition_mle_accs;
    AccumAmDiagGmm gmm_mle_accs;
    {
      bool binary;
      Input ki(mle_stats_in_filename, &binary);
      transition_mle_accs.Read(ki.Stream(), binary);
      gmm_mle_accs.Read(ki.Stream(), binary, true);  // true == add; doesn't matter here.
    }

    // Allocate one differential accumulator per pdf.
    std::vector<FmpeAccumModelDiff*> model_diffs;
    model_diffs.reserve(am_gmm.NumPdfs());
    for (int32 i = 0; i < am_gmm.NumPdfs(); i++) {
      model_diffs.push_back(new FmpeAccumModelDiff(am_gmm.GetPdf(i)));
      // TODO wait Arnab to finish the AccumAmEbwDiagGmm Class, then make it active
      // model_diffs.back()->ComputeModelParaDiff(am_gmm.GetPdf(i), gmm_ebw_accs.GetAcc(i), gmm_mle_accs.GetAcc(i));
    }

    // Write out the model diffs
    {
      kaldi::Output ko(model_diffs_out_filename, binary);
      WriteToken(ko.Stream(), binary, "<DIMENSION>");
      WriteBasicType(ko.Stream(), binary, static_cast<int32>(am_gmm.Dim()));
      WriteToken(ko.Stream(), binary, "<NUMPDFS>");
      WriteBasicType(ko.Stream(), binary, static_cast<int32>(model_diffs.size()));
      for (std::vector<FmpeAccumModelDiff*>::const_iterator it = model_diffs.begin(),
          end = model_diffs.end(); it != end; ++it) {
        (*it)->Write(ko.Stream(), binary);
      }
    }

    // [fix: free the accumulators allocated with new above -- previously leaked]
    for (size_t i = 0; i < model_diffs.size(); i++)
      delete model_diffs[i];

    KALDI_LOG << "Written model diffs to " << model_diffs_out_filename;
  } catch(const std::exception& e) {
    std::cerr << e.what() << '\n';
    return -1;
  }
}

Просмотреть файл

@ -1,66 +0,0 @@
// gmmbin/fmpe-gmm-sum-accs.cc
// Copyright 2009-2011 Yanmin Qian
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "util/common-utils.h"
#include "gmm/fmpe-am-diag-gmm.h"
#include "hmm/transition-model.h"
int main(int argc, char *argv[]) {
try {
typedef kaldi::int32 int32;
const char *usage =
"Sum multiple accumulated stats files for Fmpe training.\n"
"Usage: fmpe-gmm-sum-accs [options] stats-out stats-in1 stats-in2 ...\n";
bool binary = false;
kaldi::FmpeConfig fmpe_opts;
kaldi::ParseOptions po(usage);
po.Register("binary", &binary, "Write output in binary mode");
po.Read(argc, argv);
if (po.NumArgs() < 3) {
po.PrintUsage();
exit(1);
}
std::string stats_out_filename = po.GetArg(1);
kaldi::FmpeAccs fmpe_accs(fmpe_opts);
for (int i = 2, max = po.NumArgs(); i <= max; ++i) {
std::string stats_in_filename = po.GetArg(i);
bool binary_read;
kaldi::Input ki(stats_in_filename, &binary_read);
fmpe_accs.Read(ki.Stream(), binary_read, true /*add read values*/);
}
// Write out the accs
{
kaldi::Output ko(stats_out_filename, binary);
fmpe_accs.Write(ko.Stream(), binary);
}
KALDI_LOG << "Written stats to " << stats_out_filename;
} catch(const std::exception& e) {
std::cerr << e.what() << '\n';
return -1;
}
}

Просмотреть файл

@ -1,110 +0,0 @@
// gmmbin/fmpe-init-gmms.cc
// Copyright 2009-2011 Yanmin Qian
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/kaldi-io.h"
#include "gmm/diag-gmm.h"
#include "gmm/am-diag-gmm.h"
#include "hmm/transition-model.h"
#include "gmm/fmpe-am-diag-gmm.h"
// Initializes the two GMMs used by fMPE: a "UBM"-style GMM for computing
// Gaussian posteriors, and a smaller cluster-center GMM (plus a
// Gaussian -> cluster-center map) used to prune the posterior computation.
int main(int argc, char *argv[]) {
  try {
    typedef kaldi::int32 int32;
    typedef kaldi::BaseFloat BaseFloat;
    // NOTE(review): the usage string is missing a "\n" after
    // "calculations", so "Usage:" runs into the previous line when printed.
    const char *usage =
        "Cluster the Gaussians in a diagonal-GMM acoustic model\n"
        "to two single diag-covariance GMMs used in fmpe: one is the gmm model\n"
        "for compute gaussian posteriors and one is the gaussian\n"
        "cluster centers which is used to speed up gaussian calculations"
        "Usage: fmpe-init-gmms [options] <model-file> <state-occs> <gmm-out> <gmm-cluster-centers-out> <gaussian-cluster-center-map-out>\n";

    bool binary_write = false;
    int32 gmm_num_comps = 2048;
    int32 gmm_num_cluster_centers = 128;
    BaseFloat cluster_varfloor = 0.01;
    kaldi::FmpeConfig fmpe_opts;

    kaldi::ParseOptions po(usage);
    po.Register("binary", &binary_write, "Write output in binary mode");
    // NOTE(review): gmm_num_comps, gmm_num_cluster_centers and
    // cluster_varfloor are registered as options but are not obviously
    // passed to ObtainUbmAndSomeClusterCenters() below -- confirm whether
    // fmpe_opts is supposed to carry these values instead.
    po.Register("gmm-num-comps", &gmm_num_comps, "Number of the Gaussian"
                " components in the gmm model to calculate the gaussian posteriors.");
    po.Register("gmm-num-cluster-centers", &gmm_num_cluster_centers, "Number"
                " of the Gaussian cluster centers for fast posteriors evaluation.");
    po.Register("cluster-varfloor", &cluster_varfloor,
                "Variance floor used in bottom-up state clustering.");
    fmpe_opts.Register(&po);
    po.Read(argc, argv);

    if (po.NumArgs() != 5) {
      po.PrintUsage();
      exit(1);
    }

    std::string model_in_filename = po.GetArg(1),
        occs_in_filename = po.GetArg(2),
        gmm_out_filename = po.GetArg(3),
        gmm_cluster_centers_out_filename = po.GetArg(4),
        gauss_cluster_center_map_out_filename = po.GetArg(5);

    // Read the transition model and GMM-based acoustic model.
    kaldi::AmDiagGmm am_gmm;
    kaldi::TransitionModel trans_model;
    {
      bool binary_read;
      kaldi::Input ki(model_in_filename, &binary_read);
      trans_model.Read(ki.Stream(), binary_read);
      am_gmm.Read(ki.Stream(), binary_read);
    }

    // Read the per-pdf occupancies, used to weight Gaussians in clustering.
    kaldi::Vector<BaseFloat> state_occs;
    state_occs.Resize(am_gmm.NumPdfs());
    {
      bool binary_read;
      kaldi::Input ki(occs_in_filename, &binary_read);
      state_occs.Read(ki.Stream(), binary_read);
    }

    // Cluster the acoustic-model Gaussians into a single GMM, then cluster
    // that GMM's Gaussians into cluster centers with an index map.
    kaldi::DiagGmm gmm;
    kaldi::DiagGmm gmm_cluster_centers;
    std::vector<int32> gaussian_cluster_center_map;
    ObtainUbmAndSomeClusterCenters(
        am_gmm,
        state_occs,
        fmpe_opts,
        &gmm,
        &gmm_cluster_centers,
        &gaussian_cluster_center_map);

    // Write out the gmms model: GMM, cluster-center GMM, then the map,
    // all into a single output file.
    {
      kaldi::Output ko(gmm_out_filename, binary_write);
      gmm.Write(ko.Stream(), binary_write);
      gmm_cluster_centers.Write(ko.Stream(), binary_write);
      kaldi::WriteIntegerVector(ko.Stream(), binary_write, gaussian_cluster_center_map);
    }

    KALDI_LOG << "Written GMMs to " << gmm_out_filename;
  } catch(const std::exception& e) {
    std::cerr << e.what() << '\n';
    return -1;
  }
}

Просмотреть файл

@ -0,0 +1,153 @@
// gmmbin/gmm-acc-stats.cc
// Copyright 2009-2012 Daniel Povey
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "gmm/am-diag-gmm.h"
#include "hmm/transition-model.h"
#include "gmm/mle-am-diag-gmm.h"
// Accumulates numerator and denominator GMM/transition stats in one pass:
// posterior entries with positive weight go to the "num" accumulators and
// negative ones (with |weight|) to the "den" accumulators.
int main(int argc, char *argv[]) {
  using namespace kaldi;
  typedef kaldi::int32 int32;
  typedef kaldi::int64 int64;
  try {
    const char *usage =
        "Accumulate stats for GMM training (from posteriors)\n"
        "This version writes two accumulators (e.g. num and den),\n"
        "and puts the positive accumulators in num, negative in den\n"
        "Usage: gmm-acc-stats2 [options] <model> <feature-rspecifier>"
        "<posteriors-rspecifier> <num-stats-out> <den-stats-out>\n"
        "e.g.:\n"
        // [fix: example used the wrong program name "gmm-acc-stats"]
        "gmm-acc-stats2 1.mdl \"$feats\" ark:1.post 1.num_acc 1.den_acc\n";

    ParseOptions po(usage);
    bool binary = true;
    std::string update_flags_str = "mvwt"; // note: t is ignored, we acc
    // transition stats regardless.
    po.Register("binary", &binary, "Write stats in binary mode");
    po.Register("update-flags", &update_flags_str, "Which GMM parameters to "
                "update: subset of mvwt.");
    po.Read(argc, argv);

    if (po.NumArgs() != 5) {
      po.PrintUsage();
      exit(1);
    }

    std::string model_rxfilename = po.GetArg(1),
        feature_rspecifier = po.GetArg(2),
        posteriors_rspecifier = po.GetArg(3),
        num_accs_wxfilename = po.GetArg(4),
        den_accs_wxfilename = po.GetArg(5);

    // Read the transition model and GMM-based acoustic model.
    AmDiagGmm am_gmm;
    TransitionModel trans_model;
    {
      bool binary;
      Input ki(model_rxfilename, &binary);
      trans_model.Read(ki.Stream(), binary);
      am_gmm.Read(ki.Stream(), binary);
    }

    // Separate num/den accumulators for both transitions and GMMs.
    Vector<double> num_trans_accs, den_trans_accs;
    trans_model.InitStats(&num_trans_accs);
    trans_model.InitStats(&den_trans_accs);
    AccumAmDiagGmm num_gmm_accs, den_gmm_accs;
    num_gmm_accs.Init(am_gmm, StringToGmmFlags(update_flags_str));
    den_gmm_accs.Init(am_gmm, StringToGmmFlags(update_flags_str));

    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
    RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);

    BaseFloat tot_like = 0.0, tot_weight = 0.0;
    // tot_like is total weighted likelihood (note: weighted
    // by both +ve and -ve numbers)
    // tot_weight is total weight in posteriors (will often be about zero).
    // [fix: comment referred to a nonexistent "tot_t"]
    int64 tot_frames = 0;  // [fix: was initialized with the double literal 0.0]
    int32 num_done = 0, num_err = 0;

    for (; !feature_reader.Done(); feature_reader.Next()) {
      std::string key = feature_reader.Key();
      if (!posteriors_reader.HasKey(key)) {
        num_err++;
      } else {
        const Matrix<BaseFloat> &mat = feature_reader.Value();
        const Posterior &posterior = posteriors_reader.Value(key);

        if (static_cast<int32>(posterior.size()) != mat.NumRows()) {
          KALDI_WARN << "Posterior vector has wrong size "
                     << (posterior.size()) << " vs. "
                     << (mat.NumRows());
          num_err++;
          continue;
        }

        // [fix: num_done was incremented inside the inner per-posterior-entry
        // loop, inflating the "Done N files" count and the return status]
        num_done++;

        BaseFloat tot_like_this_file = 0.0, tot_weight_this_file = 0.0;
        for (size_t i = 0; i < posterior.size(); i++) {
          for (size_t j = 0; j < posterior[i].size(); j++) {
            int32 tid = posterior[i][j].first,
                pdf_id = trans_model.TransitionIdToPdf(tid);
            BaseFloat weight = posterior[i][j].second;
            // Route positive weights to num, negative to den, always
            // accumulating with |weight|.
            trans_model.Accumulate(fabs(weight), tid,
                                   (weight > 0.0 ?
                                    &num_trans_accs : &den_trans_accs));
            tot_like_this_file +=
                (weight > 0.0 ? &num_gmm_accs : &den_gmm_accs) ->
                AccumulateForGmm(am_gmm, mat.Row(i), pdf_id, fabs(weight)) * weight;
            tot_weight_this_file += weight;
          }
        }
        tot_like += tot_like_this_file;
        tot_weight += tot_weight_this_file;
        tot_frames += static_cast<int32>(posterior.size());
      }
    }

    KALDI_LOG << "Done " << num_done << " files, " << num_err
              << " had errors.";
    KALDI_LOG << "Overall weighted acoustic likelihood per frame was "
              << (tot_like/tot_frames) << " over " << tot_frames << " frames;"
              << " average weight per frame was " << (tot_weight / tot_frames);

    {
      Output ko(num_accs_wxfilename, binary);
      num_trans_accs.Write(ko.Stream(), binary);
      num_gmm_accs.Write(ko.Stream(), binary);
    }
    {
      Output ko(den_accs_wxfilename, binary);
      den_trans_accs.Write(ko.Stream(), binary);
      den_gmm_accs.Write(ko.Stream(), binary);
    }
    KALDI_LOG << "Written accs.";
    return (num_done != 0 ? 0 : 1);
  } catch(const std::exception& e) {
    std::cerr << e.what();
    return -1;
  }
}

Просмотреть файл

@ -125,7 +125,7 @@ int main(int argc, char *argv[]) {
power, min_count);
if (!occs_out_filename.empty()) {
bool binary = false; // write this in text mode-- useful to look at.
bool binary = true; // write this in text mode-- useful to look at.
kaldi::Output ko(occs_out_filename, binary);
state_occs.Write(ko.Stream(), binary);
}

Просмотреть файл

@ -0,0 +1,152 @@
// gmmbin/gmm-fmpe-acc-stats.cc
// Copyright 2012 Daniel Povey
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "gmm/am-diag-gmm.h"
#include "hmm/transition-model.h"
#include "transform/fmpe.h"
// Accumulates fMPE projection-matrix stats in one pass: computes fMPE
// features from the pre-fMPE input features, gets the likelihood derivative
// w.r.t. the features from the GMM model and posteriors, and feeds that
// derivative back into the fMPE stats.
int main(int argc, char *argv[]) {
  using namespace kaldi;
  using kaldi::int32;
  try {
    const char *usage =
        "Accumulate stats for fMPE training, using GMM model. Note: this could\n"
        "be done using gmm-get-feat-deriv and fmpe-acc-stats (but you'd be computing\n"
        "the features twice). Features input should be pre-fMPE features.\n"
        "\n"
        "Usage: gmm-fmpe-acc-stats [options] <model-in> <fmpe-in> <feature-rspecifier> "
        "<gselect-rspecifier> <posteriors-rspecifier> <fmpe-stats-out>\n"
        "e.g.: \n"
        " gmm-fmpe-acc-stats 1.mdl 1.fmpe \"$feats\" ark:1.gselect ark:1.post 1.fmpe_stats\n";

    ParseOptions po(usage);
    bool binary = true;
    po.Register("binary", &binary, "If true, write stats in binary mode.");
    po.Read(argc, argv);

    if (po.NumArgs() != 6) {
      po.PrintUsage();
      exit(1);
    }

    std::string model_rxfilename = po.GetArg(1),
        fmpe_rxfilename = po.GetArg(2),
        feature_rspecifier = po.GetArg(3),
        gselect_rspecifier = po.GetArg(4),
        posteriors_rspecifier = po.GetArg(5),
        stats_wxfilename = po.GetArg(6);

    // Read the transition model and GMM-based acoustic model.
    AmDiagGmm am_gmm;
    TransitionModel trans_model;
    {
      bool binary;
      Input ki(model_rxfilename, &binary);
      trans_model.Read(ki.Stream(), binary);
      am_gmm.Read(ki.Stream(), binary);
    }

    // Read the fMPE object (projection matrix + configuration).
    Fmpe fmpe;
    {
      bool binary_in;
      Input ki(fmpe_rxfilename, &binary_in);
      fmpe.Read(ki.Stream(), binary_in);
    }

    // fmpe stats...
    // The positive and negative halves are stored stacked in one matrix:
    // rows [0, N) are the "plus" part, rows [N, 2N) the "minus" part,
    // where N = fmpe.ProjectionTNumRows().
    Matrix<BaseFloat> stats(fmpe.ProjectionTNumRows() * 2,
                            fmpe.ProjectionTNumCols());
    SubMatrix<BaseFloat> stats_plus(stats, 0, fmpe.ProjectionTNumRows(),
                                    0, fmpe.ProjectionTNumCols());
    SubMatrix<BaseFloat> stats_minus(stats, fmpe.ProjectionTNumRows(),
                                     fmpe.ProjectionTNumRows(),
                                     0, fmpe.ProjectionTNumCols());

    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
    RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
    RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);

    BaseFloat tot_like = 0.0; // tot like weighted by posterior.
    int32 num_frames = 0;
    int32 num_done = 0, num_err = 0;

    for (; !feature_reader.Done(); feature_reader.Next()) {
      std::string key = feature_reader.Key();
      // Posteriors, features and gselect info must all be present and have
      // one entry per frame; otherwise skip the utterance.
      if (!posteriors_reader.HasKey(key)) {
        num_err++;
        KALDI_WARN << "No posteriors for utterance " << key;
        continue;
      }
      const Matrix<BaseFloat> &feat_in = feature_reader.Value();
      const Posterior &posterior = posteriors_reader.Value(key);

      if (static_cast<int32>(posterior.size()) != feat_in.NumRows()) {
        KALDI_WARN << "Posterior vector has wrong size " <<
            (posterior.size()) << " vs. "<< (feat_in.NumRows());
        num_err++;
        continue;
      }

      if (!gselect_reader.HasKey(key)) {
        KALDI_WARN << "No gselect information for key " << key;
        num_err++;
        continue;
      }
      const std::vector<std::vector<int32> > &gselect =
          gselect_reader.Value(key);
      if (static_cast<int32>(gselect.size()) != feat_in.NumRows()) {
        KALDI_WARN << "gselect information has wrong size";
        num_err++;
        continue;
      }

      num_done++;

      // fMPE features = input features + fMPE offset.
      Matrix<BaseFloat> fmpe_feat(feat_in.NumRows(), feat_in.NumCols());
      fmpe.ComputeFeatures(feat_in, gselect, &fmpe_feat);
      fmpe_feat.AddMat(1.0, feat_in);

      // Derivative of the (posterior-weighted) likelihood w.r.t. the fMPE
      // features; also returns the weighted likelihood itself.
      Matrix<BaseFloat> feat_deriv;

      tot_like += ComputeAmGmmFeatureDeriv(am_gmm, trans_model, posterior,
                                           fmpe_feat, &feat_deriv);
      num_frames += feat_in.NumRows();

      // Fold the derivative back into the positive/negative fMPE stats.
      fmpe.AccStats(feat_in, gselect, feat_deriv, &stats_plus, &stats_minus);

      if (num_done % 100 == 0)
        KALDI_LOG << "Processed " << num_done << " utterances.";
    }

    KALDI_LOG << "Done " << num_done << " files, " << num_err
              << " with errors.";
    KALDI_LOG << "Overall weighted acoustic likelihood per frame is "
              << (tot_like/num_frames) << " over " << num_frames << " frames.";

    Output ko(stats_wxfilename, binary);
    stats.Write(ko.Stream(), binary);
    return (num_done != 0 ? 0 : 1);
  } catch(const std::exception& e) {
    std::cerr << e.what();
    return -1;
  }
}

Просмотреть файл

@ -0,0 +1,110 @@
// gmmbin/gmm-get-feat-deriv.cc
// Copyright 2012 Daniel Povey
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "gmm/am-diag-gmm.h"
#include "hmm/transition-model.h"
#include "transform/fmpe.h"
int main(int argc, char *argv[]) {
using namespace kaldi;
using kaldi::int32;
try {
const char *usage =
"From GMM model and posteriors (which don't have to be positive),\n"
"output for each utterance a matrix of likelihood derivatives w.r.t.\n"
"the features.\n"
"E.g. used in feature-space discriminative training.\n"
"\n"
"Usage: gmm-get-feat-deriv [options] <model-in> <feature-rspecifier> "
"<posteriors-rspecifier> <feature-deriv-wspecifier>\n"
"e.g.: \n"
" gmm-get-feat-deriv 1.mdl \"$feats\" ark:1.post ark:1.deriv\n";
ParseOptions po(usage);
po.Read(argc, argv);
if (po.NumArgs() != 4) {
po.PrintUsage();
exit(1);
}
std::string model_filename = po.GetArg(1),
feature_rspecifier = po.GetArg(2),
posteriors_rspecifier = po.GetArg(3),
deriv_wspecifier = po.GetArg(4);
AmDiagGmm am_gmm;
TransitionModel trans_model;
{
bool binary;
Input ki(model_filename, &binary);
trans_model.Read(ki.Stream(), binary);
am_gmm.Read(ki.Stream(), binary);
}
SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);
BaseFloatMatrixWriter deriv_writer(deriv_wspecifier);
int32 num_done = 0, num_err = 0;
for (; !feature_reader.Done(); feature_reader.Next()) {
std::string key = feature_reader.Key();
if (!posteriors_reader.HasKey(key)) {
KALDI_WARN << "No posteriors for utterance " << key;
num_err++;
} else {
const Matrix<BaseFloat> &mat = feature_reader.Value();
const Posterior &posterior = posteriors_reader.Value(key);
if (static_cast<int32>(posterior.size()) != mat.NumRows()) {
KALDI_WARN << "Posterior vector has wrong size " <<
(posterior.size()) << " vs. "<< (mat.NumRows());
num_err++;
continue;
}
num_done++;
// Derivative of likelihood (or whatever objective func.)
// w.r.t. features.
Matrix<BaseFloat> deriv;
ComputeAmGmmFeatureDeriv(am_gmm, trans_model, posterior,
mat, &deriv);
deriv_writer.Write(key, deriv);
if (num_done % 100 == 0)
KALDI_LOG << "Processed " << num_done << " utterances.";
}
}
KALDI_LOG << "Done " << num_done << " files, " << num_err
<< " with errors.";
if (num_done != 0) return 0;
else return 1;
} catch(const std::exception& e) {
std::cerr << e.what();
return -1;
}
}

Просмотреть файл

@ -30,7 +30,7 @@ int main(int argc, char *argv[]) {
"Convert a full covariance GMM into a diagonal one.\n"
"Usage: full-to-tied <full-gmm-in> <diag-gmm-out>\n";
bool binary = false;
bool binary = true;
ParseOptions po(usage);
po.Register("binary", &binary, "Write output in binary mode");
po.Read(argc, argv);

Просмотреть файл

@ -106,7 +106,7 @@ int main(int argc, char *argv[]) {
"e.g.: \n"
" init-tied-codebooks tree tree.acc ubm-full tree.map\n";
bool binary = false;
bool binary = true;
int max_num_gaussians = 512;
bool split_gaussians = false;
BaseFloat perturb = 0.01;

Просмотреть файл

@ -34,7 +34,7 @@ int main(int argc, char *argv[]) {
" tied-diag-gmm-acc-stats-ali 1.mdl 1.ali scp:train.scp ark:1.ali 1.acc\n";
ParseOptions po(usage);
bool binary = false;
bool binary = true;
po.Register("binary", &binary, "Write output in binary mode");
po.Read(argc, argv);

Просмотреть файл

@ -45,7 +45,7 @@ int main(int argc, char *argv[]) {
" tied-diag-gmm-align-compiled 1.mdl ark:- scp:train.scp t, ark:1.ali\n";
ParseOptions po(usage);
bool binary = false;
bool binary = true;
BaseFloat beam = 200.0;
BaseFloat retry_beam = 0.0;
BaseFloat acoustic_scale = 1.0;

Просмотреть файл

@ -72,7 +72,7 @@ int main(int argc, char *argv[]) {
"e.g.: \n"
" tied-diag-gmm-init-model tree topo tree.map diag0.ubm diag1.ubm 1.mdl\n";
bool binary = false;
bool binary = true;
ParseOptions po(usage);
po.Register("binary", &binary, "Write output in binary mode");

Просмотреть файл

@ -55,7 +55,7 @@ int main(int argc, char *argv[]) {
"e.g.: \n"
" tied-diag-gmm-init-mono topo cb.pdf mono.mdl mono.tree\n";
bool binary = false;
bool binary = true;
ParseOptions po(usage);
po.Register("binary", &binary, "Write output in binary mode");
po.Read(argc, argv);

Просмотреть файл

@ -34,7 +34,7 @@ int main(int argc, char *argv[]) {
" tied-full-gmm-acc-stats-ali 1.mdl 1.ali scp:train.scp ark:1.ali 1.acc\n";
ParseOptions po(usage);
bool binary = false;
bool binary = true;
po.Register("binary", &binary, "Write output in binary mode");
po.Read(argc, argv);

Просмотреть файл

@ -45,7 +45,7 @@ int main(int argc, char *argv[]) {
" tied-full-gmm-align-compiled 1.mdl ark:- scp:train.scp t, ark:1.ali\n";
ParseOptions po(usage);
bool binary = false;
bool binary = true;
BaseFloat beam = 200.0;
BaseFloat retry_beam = 0.0;
BaseFloat acoustic_scale = 1.0;

Просмотреть файл

@ -71,7 +71,7 @@ int main(int argc, char *argv[]) {
"e.g.: \n"
" tied-full-gmm-init-model tree topo tree.map full0.ubm full1.ubm 1.mdl\n";
bool binary = false;
bool binary = true;
ParseOptions po(usage);
po.Register("binary", &binary, "Write output in binary mode");

Просмотреть файл

@ -55,7 +55,7 @@ int main(int argc, char *argv[]) {
"e.g.: \n"
" tied-full-gmm-init-mono topo cb.pdf mono.mdl mono.tree\n";
bool binary = false;
bool binary = true;
ParseOptions po(usage);
po.Register("binary", &binary, "Write output in binary mode");
po.Read(argc, argv);

Просмотреть файл

@ -167,7 +167,7 @@ try {
" tied-lbg tree-old tree-tied topo scp:train.scp ark:ali ubm-full "
"tree.map\n";
bool binary = false;
bool binary = true;
bool full = true;
BaseFloat perturb = 0.01;

Просмотреть файл

@ -4,7 +4,7 @@ include ../kaldi.mk
TESTFILES = regtree-fmllr-diag-gmm-test lda-estimate-test \
regression-tree-test fmllr-diag-gmm-test exponential-transform-test \
regtree-mllr-diag-gmm-test
regtree-mllr-diag-gmm-test fmpe-test
OBJFILES = regression-tree.o regtree-mllr-diag-gmm.o lda-estimate.o \
regtree-fmllr-diag-gmm.o cmvn.o transform-common.o fmllr-diag-gmm.o \

174
src/transform/fmpe-test.cc Normal file
Просмотреть файл

@ -0,0 +1,174 @@
// transform/fmpe-test.cc
// Copyright 2012 Daniel Povey
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "util/common-utils.h"
#include "gmm/diag-gmm.h"
#include "gmm/diag-gmm-normal.h"
#include "gmm/model-test-common.h"
#include "transform/fmpe.h"
namespace kaldi {
// Compute derivative of GMM log-likelihood w.r.t. features.
// Note: this code copied from gmm-get-feat-deriv.cc; had
// to simplify a bit.
// Compute derivative of GMM log-likelihood w.r.t. features.
// Note: this code copied from gmm-get-feat-deriv.cc; had
// to simplify a bit.
//
// "deriv" is resized to the shape of "feats"; row i holds the
// derivative of the total log-likelihood w.r.t. the i'th feature row.
void GetFeatDeriv(const DiagGmm &gmm,
                  const Matrix<BaseFloat> &feats,
                  Matrix<BaseFloat> *deriv) {
  deriv->Resize(feats.NumRows(), feats.NumCols());

  Vector<BaseFloat> gauss_posteriors;
  Vector<BaseFloat> temp_vec(feats.NumCols());
  for (int32 i = 0; i < feats.NumRows(); i++) {
    SubVector<BaseFloat> this_feat(feats, i);
    SubVector<BaseFloat> this_deriv(*deriv, i);
    gmm.ComponentPosteriors(this_feat, &gauss_posteriors);
    // [Removed a vestigial no-op left over from the code this was
    // copied from: "BaseFloat weight = 1.0;
    // gauss_posteriors.Scale(weight);" -- scaling by 1.0 does nothing.]

    // The next line does: to i'th row of deriv, add
    // means_invvars^T * gauss_posteriors,
    // where each row of means_invvars is the mean times
    // diagonal inverse covariance... after transposing,
    // this becomes a weighted sum of these rows, weighted by
    // the posteriors.  This comes from the term
    //  feat^T * inv_var * mean
    // in the objective function.
    this_deriv.AddMatVec(1.0, gmm.means_invvars(), kTrans,
                         gauss_posteriors, 1.0);

    // next line does temp_vec == inv_vars^T * gauss_posteriors,
    // which sets temp_vec to a weighted sum of the inv_vars,
    // weighted by the Gaussian posteriors.
    temp_vec.AddMatVec(1.0, gmm.inv_vars(), kTrans,
                       gauss_posteriors, 0.0);

    // Add to the derivative, -(this_feat .* temp_vec),
    // which is the term that comes from the -0.5 * inv_var^T feat_sq,
    // in the objective function (where inv_var is a vector, and feat_sq
    // is a vector of squares of the feature values).
    this_deriv.AddVecVec(-1.0, this_feat, temp_vec, 1.0);
  }
}
// Gets total log-likelihood, summed over all frames.
// Gets total log-likelihood, summed over all frames.
BaseFloat GetGmmLike(const DiagGmm &gmm,
                     const Matrix<BaseFloat> &feats) {
  BaseFloat total = 0.0;
  const int32 num_frames = feats.NumRows();
  for (int32 t = 0; t < num_frames; ++t)
    total += gmm.LogLikelihood(feats.Row(t));
  return total;
}
// End-to-end test of the Fmpe class on a randomly generated diagonal
// GMM and random features.  Exercises: Write/Read round-trip, the
// accuracy of the feature-space derivative (finite-difference check),
// that a freshly initialized fMPE transform produces a zero offset,
// and that Update()'s predicted objective improvement matches the
// directly measured likelihood change.
void TestFmpe() {
  // Random model size: feature dim and #Gaussians each in [10, 19].
  int32 dim = 10 + (rand() % 10);
  int32 num_comp = 10 + (rand() % 10);
  DiagGmm gmm;
  unittest::InitRandDiagGmm(dim, num_comp, &gmm);
  // Random standard-normal features to test on.
  int32 num_frames = 20;
  Matrix<BaseFloat> feats(num_frames, dim);
  for (int32 i = 0; i < num_frames; i++)
    for (int32 j = 0; j < dim; j++)
      feats(i,j) = RandGauss();
  FmpeOptions opts; // Default.
  // Write a freshly constructed Fmpe object to disk (randomly in
  // binary or text mode)...
  {
    Fmpe fmpe(gmm, opts);
    {
      bool binary = (rand() % 2 == 1);
      Output ko("tmpf", binary);
      fmpe.Write(ko.Stream(), binary);
    }
  }
  // ... and read it back into a new object, so the rest of the test
  // also exercises the I/O code.
  Fmpe fmpe(gmm, opts);
  {
    bool binary_in;
    Input ki("tmpf", &binary_in);
    fmpe.Read(ki.Stream(), binary_in);
  }
  // We'll first be testing that the feature derivative is
  // accurate, by measuring a small random offset in feature space.
  {
    Matrix<BaseFloat> deriv;
    Matrix<BaseFloat> random_offset(feats.NumRows(), feats.NumCols());
    for (int32 i = 0; i < feats.NumRows(); i++)
      for (int32 j = 0; j < feats.NumCols(); j++)
        random_offset(i,j) = 1.0e-03 * RandGauss();
    BaseFloat like_before = GetGmmLike(gmm, feats);
    feats.AddMat(1.0, random_offset);
    BaseFloat like_after = GetGmmLike(gmm, feats);
    feats.AddMat(-1.0, random_offset); // undo the change.
    GetFeatDeriv(gmm, feats, &deriv);
    // change1: measured likelihood change; change2: change predicted
    // by the first-order Taylor expansion <offset, deriv>.
    BaseFloat change1 = like_after - like_before,
        change2 = TraceMatMat(random_offset, deriv, kTrans);
    KALDI_LOG << "Random offset led to like change "
              << change1 << " (manually), and " << change2
              << " (derivative)";
    // note: not making this threshold smaller, as don't want
    // spurious failures.  Seems to be OK though.
    KALDI_ASSERT( fabs(change1-change2) < 0.15*fabs(change1+change2));
  }
  std::vector<std::vector<int32> > gselect(feats.NumRows()); // make it have all Gaussians...
  for (int32 i = 0; i < feats.NumRows(); i++)
    for (int32 j = 0; j < gmm.NumGauss(); j++)
      gselect[i].push_back(j);
  Matrix<BaseFloat> fmpe_offset;
  // Check that the fMPE feature offset is zero.
  fmpe.ComputeFeatures(feats, gselect, &fmpe_offset);
  KALDI_ASSERT(fmpe_offset.IsZero());
  // Note: we're just using the ML objective function here.
  // This is just to make sure the derivatives are all computed
  // correctly.
  BaseFloat like_before_update = GetGmmLike(gmm, feats);
  // Now get stats for update.
  int32 nr = fmpe.ProjectionTNumRows(), nc = fmpe.ProjectionTNumCols();
  Matrix<BaseFloat> plus_stats(nr, nc), minus_stats(nr, nc);
  Matrix<BaseFloat> deriv;
  GetFeatDeriv(gmm, feats, &deriv);
  fmpe.AccStats(feats, gselect, deriv, &plus_stats, &minus_stats);
  FmpeUpdateOptions update_opts;
  update_opts.learning_rate = 0.001; // so linear assumption is more valid.
  // delta: objective improvement predicted by Update() under a linear
  // assumption; delta2: improvement measured by re-running the
  // transform and re-scoring the (now offset) features.
  BaseFloat delta = fmpe.Update(update_opts, plus_stats, minus_stats);
  fmpe.ComputeFeatures(feats, gselect, &fmpe_offset);
  feats.AddMat(1.0, fmpe_offset);
  BaseFloat like_after_update = GetGmmLike(gmm, feats);
  BaseFloat delta2 = like_after_update - like_before_update;
  KALDI_LOG << "Change predicted by fMPE Update function is "
            << delta << ", change computed directly is "
            << delta2;
  KALDI_ASSERT(fabs(delta-delta2) < 0.15 * fabs(delta+delta2));
}
}
// Runs the fMPE unit test several times (different random model
// sizes each time) at high verbosity.
int main() {
  kaldi::g_kaldi_verbose_level = 5;
  int iters_left = 11;
  while (iters_left-- > 0)
    kaldi::TestFmpe();
  std::cout << "Test OK.\n";
  return 0;
}

Просмотреть файл

@ -19,6 +19,8 @@
#include "transform/fmpe.h"
#include "util/text-utils.h"
#include "gmm/diag-gmm-normal.h"
#include "gmm/am-diag-gmm.h"
#include "hmm/transition-model.h"
namespace kaldi {
@ -73,7 +75,7 @@ void Fmpe::ComputeC() {
// to get centered covariance.
C_.Resize(dim);
try {
TpMatrix<double> Ctmp; Ctmp.Cholesky(x2_stats);
TpMatrix<double> Ctmp(dim); Ctmp.Cholesky(x2_stats);
C_.CopyFromTp(Ctmp);
} catch (...) {
KALDI_ERR << "Error initializing fMPE object: cholesky of "
@ -94,9 +96,9 @@ void Fmpe::ApplyContext(const MatrixBase<BaseFloat> &intermed_feat,
// Applies the temporal-context part of the transformation.
int32 dim = FeatDim(), ncontexts = NumContexts(),
T = intermed_feat.NumRows();
KALDI_ASSERT(intermed_feat.NumRows() == dim * ncontexts &&
intermed_feat.NumCols() == feat_out->NumCols()
&& feat_out->NumRows() == dim);
KALDI_ASSERT(intermed_feat.NumCols() == dim * ncontexts &&
intermed_feat.NumRows() == feat_out->NumRows()
&& feat_out->NumCols() == dim);
// note: ncontexts == contexts_.size().
for (int32 i = 0; i < ncontexts; i++) {
// this_intermed_feat is the chunk of the "intermediate features"
@ -125,9 +127,9 @@ void Fmpe::ApplyContextReverse(const MatrixBase<BaseFloat> &feat_deriv,
// in reverse, for getting derivatives for training.
int32 dim = FeatDim(), ncontexts = NumContexts(),
T = feat_deriv.NumRows();
KALDI_ASSERT(intermed_feat_deriv->NumRows() == dim * ncontexts &&
intermed_feat_deriv->NumCols() == feat_deriv.NumCols()
&& feat_deriv.NumRows() == dim);
KALDI_ASSERT(intermed_feat_deriv->NumCols() == dim * ncontexts &&
intermed_feat_deriv->NumRows() == feat_deriv.NumRows()
&& feat_deriv.NumCols() == dim);
// note: ncontexts == contexts_.size().
for (int32 i = 0; i < ncontexts; i++) {
// this_intermed_feat is the chunk of the derivative of
@ -142,7 +144,7 @@ void Fmpe::ApplyContextReverse(const MatrixBase<BaseFloat> &feat_deriv,
// but this doesn't dominate the computation and I think this is
// clearer.
for (int32 t_out = 0; t_out < T; t_out++) { // t_out indexes the output
int32 t_in = t_in + t_offset; // t_in indexes the input.
int32 t_in = t_out + t_offset; // t_in indexes the input.
if (t_in >= 0 && t_in < T) // Discard frames outside range.
this_intermed_feat_deriv.Row(t_in).AddVec(weight,
feat_deriv.Row(t_out));
@ -164,7 +166,16 @@ void Fmpe::ApplyC(MatrixBase<BaseFloat> *feat_out, bool reverse) const {
}
}
// Constructs the high-dim features and applies the main projection matrix proj_.
// Constructs the high-dim features and applies the main projection matrix
// projT_. This projects from dimension ngauss*(dim+1) to dim*ncontexts. Note:
// because the input vector of size ngauss*(dim+1) is sparse in a blocky way
// (i.e. each frame only has a couple of nonzero posteriors), we deal with
// sub-matrices of the projection matrix projT_. We actually further optimize
// the code by taking all frames in a file that had nonzero posteriors for a
// particular Gaussian, and forming a matrix out of the corresponding
// high-dimensional features; we can then use a matrix-matrix multiply rather
// than using vector-matrix operations.
void Fmpe::ApplyProjection(const MatrixBase<BaseFloat> &feat_in,
const std::vector<std::vector<int32> > &gselect,
MatrixBase<BaseFloat> *intermed_feat) const {
@ -173,17 +184,44 @@ void Fmpe::ApplyProjection(const MatrixBase<BaseFloat> &feat_in,
Vector<BaseFloat> post; // will be posteriors of selected Gaussians.
Vector<BaseFloat> input_chunk(dim+1); // will be a segment of
// the high-dimensional features.
// "all_posts" is a vector of ((gauss-index, time-index), gaussian
// posterior).
// We'll compute the posterior information, sort it, and then
// go through it in sorted order, which maintains memory locality
// when accessing the projection matrix.
// Note: if we really cared we could make this use level-3 BLAS
// (matrix-matrix multiply), but we'd need to have a temporary
// matrix for the output and input.
std::vector<std::pair<std::pair<int32,int32>, BaseFloat> > all_posts;
for (int32 t = 0; t < feat_in.NumRows(); t++) {
SubVector<BaseFloat> this_feat(feat_in, t);
SubVector<BaseFloat> this_intermed_feat(*intermed_feat, t);
gmm_.LogLikelihoodsPreselect(this_feat, gselect[t], &post);
// At this point, post will contain log-likes of the selected
// Gaussians.
post.ApplySoftMax(); // Now they are posteriors (which sum to one).
for (int32 i = 0; i < post.Dim(); i++) {
int32 gauss = gselect[t][i];
all_posts.push_back(std::make_pair(std::make_pair(gauss, t), post(i)));
}
}
std::sort(all_posts.begin(), all_posts.end());
bool optimize = true;
if (!optimize) { // Why do we keep this un-optimized code around?
// For clarity, so you can see what's going on, and for easier
// comparison with ApplyProjectionReverse which is similar to this
// un-optimized segment. Both un-optimized and optimized versions
// should give identical transforms (up to tiny roundoff differences).
for (size_t i = 0; i < all_posts.size(); i++) {
int32 gauss = all_posts[i].first.first, t = all_posts[i].first.second;
SubVector<BaseFloat> this_feat(feat_in, t);
SubVector<BaseFloat> this_intermed_feat(*intermed_feat, t);
BaseFloat this_post = all_posts[i].second;
SubVector<BaseFloat> this_stddev(stddevs_, gauss);
BaseFloat this_post = post(i);
// The next line is equivalent to setting input_chunk to
// -this_post * the gaussian mean / (gaussian stddev). Note: we use
// the fact that mean * inv_var * stddev == mean / stddev.
@ -196,12 +234,55 @@ void Fmpe::ApplyProjection(const MatrixBase<BaseFloat> &feat_in,
1.0);
// The last element of this input_chunk is the posterior itself
// (between 0 and 1).
input_chunk(dim) = this_post;
input_chunk(dim) = this_post * config_.post_scale;
// this_intermed_feat += [appropriate chunk of proj_] * input_chunk.
this_intermed_feat.AddMatVec(1.0, proj_.Range(0, dim*ncontexts,
gauss*(dim+1), dim+1),
kNoTrans, input_chunk, 1.0);
// this_intermed_feat += [appropriate chunk of projT_] * input_chunk.
this_intermed_feat.AddMatVec(1.0, projT_.Range(gauss*(dim+1), dim+1,
0, dim*ncontexts),
kTrans, input_chunk, 1.0);
}
} else {
size_t i = 0;
while (i < all_posts.size()) {
int32 gauss = all_posts[i].first.first;
SubVector<BaseFloat> this_stddev(stddevs_, gauss),
this_mean_invvar(gmm_.means_invvars(), gauss);
SubMatrix<BaseFloat> this_projT_chunk(projT_, gauss*(dim+1), dim+1,
0, dim*ncontexts);
int32 batch_size; // number of posteriors with same Gaussian..
for (batch_size = 0;
batch_size+i < static_cast<int32>(all_posts.size()) &&
all_posts[batch_size+i].first.first == gauss;
batch_size++); // empty loop body.
Matrix<BaseFloat> input_chunks(batch_size, dim+1);
Matrix<BaseFloat> intermed_temp(batch_size, dim*ncontexts);
for (int32 j = 0; j < batch_size; j++) { // set up "input_chunks"
int32 t = all_posts[i+j].first.second;
SubVector<BaseFloat> this_feat(feat_in, t);
SubVector<BaseFloat> this_input_chunk(input_chunks, j);
BaseFloat this_post = all_posts[i+j].second;
this_input_chunk.Range(0, dim).AddVecVec(-this_post,
this_mean_invvar,
this_stddev, 0.0);
this_input_chunk.Range(0, dim).AddVecDivVec(this_post, this_feat,
this_stddev, 1.0);
this_input_chunk(dim) = this_post * config_.post_scale;
}
// The next line is where most of the computation will happen,
// during the feature computation phase. We have rearranged
// stuff so it's a matrix-matrix operation, for greater
// efficiency (when using optimized libraries like ATLAS).
intermed_temp.AddMatMat(1.0, input_chunks, kNoTrans,
this_projT_chunk, kNoTrans, 0.0);
for (int32 j = 0; j < batch_size; j++) { // add data from
// intermed_temp to the output "intermed_feat"
int32 t = all_posts[i+j].first.second;
SubVector<BaseFloat> this_intermed_feat(*intermed_feat, t);
SubVector<BaseFloat> this_intermed_temp(intermed_temp, j);
// this_intermed_feat += this_intermed_temp.
this_intermed_feat.AddVec(1.0, this_intermed_temp);
}
i += batch_size;
}
}
}
@ -221,9 +302,16 @@ void Fmpe::ApplyProjectionReverse(const MatrixBase<BaseFloat> &feat_in,
Vector<BaseFloat> post; // will be posteriors of selected Gaussians.
Vector<BaseFloat> input_chunk(dim+1); // will be a segment of
// the high-dimensional features.
// "all_posts" is a vector of ((gauss-index, time-index), gaussian
// posterior).
// We'll compute the posterior information, sort it, and then
// go through it in sorted order, which maintains memory locality
// when accessing the projection matrix.
std::vector<std::pair<std::pair<int32,int32>, BaseFloat> > all_posts;
for (int32 t = 0; t < feat_in.NumRows(); t++) {
SubVector<BaseFloat> this_feat(feat_in, t);
SubVector<BaseFloat> this_intermed_feat_deriv(intermed_feat_deriv, t);
gmm_.LogLikelihoodsPreselect(this_feat, gselect[t], &post);
// At this point, post will contain log-likes of the selected
// Gaussians.
@ -232,35 +320,44 @@ void Fmpe::ApplyProjectionReverse(const MatrixBase<BaseFloat> &feat_in,
// The next few lines (where we set up "input_chunk") are identical
// to ApplyProjection.
int32 gauss = gselect[t][i];
SubVector<BaseFloat> this_stddev(stddevs_, gauss);
BaseFloat this_post = post(i);
input_chunk.Range(0, dim).AddVecVec(-this_post, gmm_.means_invvars().Row(gauss),
this_stddev, 0.0);
input_chunk.Range(0, dim).AddVecDivVec(this_post, this_feat, this_stddev,
1.0);
input_chunk(dim) = this_post;
// If not for accumulating the + and - parts separately, we would be
// doing something like:
// proj_deriv_.Range(0, dim*ncontexts, gauss*(dim+1), dim+1).AddVecVec(
// 1.0, this_intermed_feat_deriv, input_chunk);
SubMatrix<BaseFloat> plus_chunk(*proj_deriv_plus, 0, dim*ncontexts,
gauss*(dim+1), dim+1),
minus_chunk(*proj_deriv_minus, 0, dim*ncontexts,
gauss*(dim+1), dim+1);
// This next function takes the rank-one matrix
// (this_intermed_deriv * input_chunk') and adds the positive
// part to proj_deriv_plus, and minus the negative part to
// proj_deriv_minus.
AddOuterProductPlusMinus(static_cast<BaseFloat>(1.0),
this_intermed_feat_deriv,
input_chunk,
&plus_chunk, &minus_chunk);
all_posts.push_back(std::make_pair(std::make_pair(gauss, t), post(i)));
}
}
std::sort(all_posts.begin(), all_posts.end());
for (size_t i = 0; i < all_posts.size(); i++) {
int32 gauss = all_posts[i].first.first, t = all_posts[i].first.second;
BaseFloat this_post = all_posts[i].second;
SubVector<BaseFloat> this_feat(feat_in, t);
SubVector<BaseFloat> this_intermed_feat_deriv(intermed_feat_deriv, t);
SubVector<BaseFloat> this_stddev(stddevs_, gauss);
input_chunk.Range(0, dim).AddVecVec(-this_post, gmm_.means_invvars().Row(gauss),
this_stddev, 0.0);
input_chunk.Range(0, dim).AddVecDivVec(this_post, this_feat, this_stddev,
1.0);
input_chunk(dim) = this_post * config_.post_scale;
// If not for accumulating the + and - parts separately, we would be
// doing something like:
// proj_deriv_.Range(0, dim*ncontexts, gauss*(dim+1), dim+1).AddVecVec(
// 1.0, this_intermed_feat_deriv, input_chunk);
SubMatrix<BaseFloat> plus_chunk(*proj_deriv_plus,
gauss*(dim+1), dim+1,
0, dim*ncontexts),
minus_chunk(*proj_deriv_minus,
gauss*(dim+1), dim+1,
0, dim*ncontexts);
// This next function takes the rank-one matrix
// (input_chunk * this_intermed_deriv'), and adds the positive
// part to proj_deriv_plus, and minus the negative part to
// proj_deriv_minus.
AddOuterProductPlusMinus(static_cast<BaseFloat>(1.0),
input_chunk,
this_intermed_feat_deriv,
&plus_chunk, &minus_chunk);
}
}
void Fmpe::ComputeFeatures(const MatrixBase<BaseFloat> &feat_in,
@ -296,8 +393,8 @@ void Fmpe::AccStats(const MatrixBase<BaseFloat> &feat_in,
int32 dim = FeatDim(), ncontexts = NumContexts();
KALDI_ASSERT(feat_in.NumRows() != 0 && feat_in.NumCols() == dim);
KALDI_ASSERT(feat_in.NumRows() == static_cast<int32>(gselect.size()));
AssertSameDim(*proj_deriv_plus, proj_);
AssertSameDim(*proj_deriv_minus, proj_);
AssertSameDim(*proj_deriv_plus, projT_);
AssertSameDim(*proj_deriv_minus, projT_);
AssertSameDim(feat_in, feat_deriv_in);
// We do everything in reverse now, in reverse order.
@ -326,28 +423,29 @@ Fmpe::Fmpe(const DiagGmm &gmm, const FmpeOptions &config): gmm_(gmm),
SetContexts(config.context_expansion);
ComputeC();
ComputeStddevs();
proj_.Resize(NumGauss() * (FeatDim()+1), FeatDim() * NumContexts());
projT_.Resize(NumGauss() * (FeatDim()+1), FeatDim() * NumContexts());
}
void Fmpe::Update(const FmpeUpdateOptions &config,
MatrixBase<BaseFloat> &proj_deriv_plus,
MatrixBase<BaseFloat> &proj_deriv_minus) {
BaseFloat Fmpe::Update(const FmpeUpdateOptions &config,
MatrixBase<BaseFloat> &proj_deriv_plus,
MatrixBase<BaseFloat> &proj_deriv_minus) {
// tot_linear_objf_impr is the change in the actual
// objective function if it were linear, i.e.
// objf-gradient . parameter-change
// Note: none of this is normalized by the #frames (we don't have
// this info here), so that is done at the script level.
BaseFloat tot_linear_objf_impr = 0.0;
AssertSameDim(proj_deriv_plus, proj_);
AssertSameDim(proj_deriv_minus, proj_);
int32 changed = 0; // Keep track of how many elements change sign.
AssertSameDim(proj_deriv_plus, projT_);
AssertSameDim(proj_deriv_minus, projT_);
KALDI_ASSERT(proj_deriv_plus.Min() >= 0);
KALDI_ASSERT(proj_deriv_minus.Min() >= 0);
BaseFloat learning_rate = config.learning_rate,
l2_weight = config.l2_weight;
for (int32 i = 0; i < proj_.NumRows(); i++) {
for (int32 j = 0; j < proj_.NumCols(); j++) {
for (int32 i = 0; i < projT_.NumRows(); i++) {
for (int32 j = 0; j < projT_.NumCols(); j++) {
BaseFloat p = proj_deriv_plus(i,j), n = proj_deriv_minus(i,j),
x = proj_(i,j);
x = projT_(i,j);
// Suppose the basic update (before regularization) is:
// z <-- x + learning_rate * (p - n) / (p + n),
// where z is the new parameter and x is the old one.
@ -371,10 +469,14 @@ void Fmpe::Update(const FmpeUpdateOptions &config,
// z is the new parameter value.
tot_linear_objf_impr += (z-x) * (p-n); // objf impr based on linear assumption.
proj_(i,j) = z;
projT_(i,j) = z;
if (z*x < 0) changed++;
}
}
KALDI_LOG << "Objf impr (assuming linear) is " << tot_linear_objf_impr;
KALDI_LOG << ((100.0*changed)/(projT_.NumRows()*projT_.NumCols()))
<< "% of matrix elements changed sign.";
return tot_linear_objf_impr;
}
// Note: we write the GMM first, without any other header.
@ -386,7 +488,7 @@ void Fmpe::Write(std::ostream &os, bool binary) const {
gmm_.Write(os, binary);
config_.Write(os, binary);
// stddevs_ are derived, don't write them.
proj_.Write(os, binary);
projT_.Write(os, binary);
C_.Write(os, binary);
// contexts_ are derived from config, don't write them.
}
@ -396,11 +498,59 @@ void Fmpe::Read(std::istream &is, bool binary) {
gmm_.Read(is, binary);
config_.Read(is, binary);
ComputeStddevs(); // computed from gmm.
proj_.Read(is, binary);
projT_.Read(is, binary);
C_.Read(is, binary);
SetContexts(config_.context_expansion);
}
// Computes the derivative of the weighted GMM log-likelihood w.r.t.
// the features; see the declaration in fmpe.h.  The posterior weights
// may be of either sign (e.g. numerator minus denominator occupancies
// in discriminative training).  Resizes "deriv" to the shape of
// "features" and returns sum of (weight * per-frame GMM likelihood).
BaseFloat ComputeAmGmmFeatureDeriv(const AmDiagGmm &am_gmm,
                                   const TransitionModel &trans_model,
                                   const Posterior &posterior,
                                   const MatrixBase<BaseFloat> &features,
                                   Matrix<BaseFloat> *deriv) {
  BaseFloat ans = 0.0;
  KALDI_ASSERT(posterior.size() == static_cast<size_t>(features.NumRows()));
  deriv->Resize(features.NumRows(), features.NumCols());
  Vector<BaseFloat> temp_vec(features.NumCols());
  // Declared outside the loops to avoid re-constructing it for every
  // posterior entry; ComponentPosteriors resizes it as needed.
  Vector<BaseFloat> gauss_posteriors;
  for (size_t i = 0; i < posterior.size(); i++) {
    for (size_t j = 0; j < posterior[i].size(); j++) {
      int32 tid = posterior[i][j].first,  // transition identifier.
          pdf_id = trans_model.TransitionIdToPdf(tid);
      BaseFloat weight = posterior[i][j].second;
      const DiagGmm &gmm = am_gmm.GetPdf(pdf_id);
      SubVector<BaseFloat> this_feat(features, i);
      SubVector<BaseFloat> this_deriv(*deriv, i);
      ans += weight *
          gmm.ComponentPosteriors(this_feat, &gauss_posteriors);
      gauss_posteriors.Scale(weight);

      // The next line does: to i'th row of deriv, add
      // means_invvars^T * gauss_posteriors,
      // where each row of means_invvars is the mean times
      // diagonal inverse covariance... after transposing,
      // this becomes a weighted sum of these rows, weighted by
      // the posteriors.  This comes from the term
      //  feat^T * inv_var * mean
      // in the objective function.
      this_deriv.AddMatVec(1.0, gmm.means_invvars(), kTrans,
                           gauss_posteriors, 1.0);

      // next line does temp_vec == inv_vars^T * gauss_posteriors,
      // which sets temp_vec to a weighted sum of the inv_vars,
      // weighted by the Gaussian posteriors.
      temp_vec.AddMatVec(1.0, gmm.inv_vars(), kTrans,
                         gauss_posteriors, 0.0);

      // Add to the derivative, -(this_feat .* temp_vec),
      // which is the term that comes from the -0.5 * inv_var^T feat_sq,
      // in the objective function (where inv_var is a vector, and feat_sq
      // is a vector of squares of the feature values).
      this_deriv.AddVecVec(-1.0, this_feat, temp_vec, 1.0);
    }
  }
  return ans;
}
} // End of namespace kaldi

Просмотреть файл

@ -22,6 +22,8 @@
#include <vector>
#include "gmm/am-diag-gmm.h"
#include "hmm/transition-model.h"
#include "util/kaldi-holder.h" // for Posterior
namespace kaldi {
@ -104,8 +106,13 @@ class Fmpe {
int32 NumGauss() const { return gmm_.NumGauss(); }
int32 NumContexts() const { return static_cast<int32>(contexts_.size()); }
int32 ProjectionNumRows() { return FeatDim() * NumContexts(); }
int32 ProjectionNumCols() { return (FeatDim()+1) * NumGauss(); }
// Note: this returns the number of rows and columns in projT_,
// which is the transpose of the high->intermediate dimensional
// projection matrix. This is the dimension we want for the
// stats.
int32 ProjectionTNumRows() { return (FeatDim()+1) * NumGauss(); }
int32 ProjectionTNumCols() { return FeatDim() * NumContexts(); }
// Computes the fMPE feature offsets and outputs them.
// You can add feat_in to this afterwards, if you want.
@ -131,9 +138,10 @@ class Fmpe {
void Write(std::ostream &os, bool binary) const;
void Read(std::istream &is, bool binary);
void Update(const FmpeUpdateOptions &config,
MatrixBase<BaseFloat> &proj_deriv_plus,
MatrixBase<BaseFloat> &proj_deriv_minus);
// Returns total objf improvement, based on linear assumption.
BaseFloat Update(const FmpeUpdateOptions &config,
MatrixBase<BaseFloat> &proj_deriv_plus,
MatrixBase<BaseFloat> &proj_deriv_minus);
private:
void SetContexts(std::string context_str);
@ -180,8 +188,9 @@ class Fmpe {
// variances of the GMM -- computed to avoid taking a square root
// in the fMPE computation. Derived variable-- not stored on
// disk.
Matrix<BaseFloat> proj_; // The projection matrix, of dimension
// (FeatDim() * NumContexts()) x (NumGauss() * (FeatDim()+1))
Matrix<BaseFloat> projT_; // The transpose of the projection matrix;
// this is of dimension
// (NumGauss() * (FeatDim()+1)) * (FeatDim() * NumContexts()).
TpMatrix<BaseFloat> C_; // Cholesky factor of the variance Sigma of
// features around their mean (as estimated from GMM)... applied
@ -197,6 +206,17 @@ class Fmpe {
};
/// Computes derivatives of the likelihood of these states (weighted),
/// w.r.t. the feature values. Used in fMPE training. Note, the
/// weights "posterior" may be positive or negative-- for MMI, MPE,
/// etc., they will typically be of both signs. Will resize "deriv".
/// Returns the sum of (GMM likelihood * weight), which may be used
/// as an approximation to the objective function.
BaseFloat ComputeAmGmmFeatureDeriv(const AmDiagGmm &am_gmm,
const TransitionModel &trans_model,
const Posterior &posterior,
const MatrixBase<BaseFloat> &features,
Matrix<BaseFloat> *deriv);
} // End namespace kaldi