Adding fMPE scripts; changes to fMPE code.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@772 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
2012-02-21 16:08:04 +00:00 · 2012-02-21 16:08:04 +00:00 · bad94ae0bc
--- a/21
+++ b/21
@ -1,4 +1,24 @@

+ Update to legal notice, made Feb. 2012.  We would like to clarify that we
+ are using a convention where the presence of multiple names in an Apache copyright header,
+ for example
+
+  // Copyright 2009-2012  Yanmin Qian  Arnab Ghoshal
+
+ does not necessarily signify joint ownership of copyright of that file, except
+ in cases where all those names were present in the original release made in
+ March 2011-- you can use the version history to work this out, if this matters
+ to you.  Instead, we intend that those contributors who later modified the file
+ agree to release their changes under the Apache license, but do not claim to 
+ jointly own the copyright of the original material (which would require an agreement 
+ with the original contributors).  The conventional way of signifying 
+ this is to duplicate the Apache headers at the top of each file each time
+ a change is made by a different author, but this would quickly become impractical.
+
+ The original legal notice is below.  Note: we are continuing to modify it by
+ adding the names of new contributors.
+
+---
                          Legal Notices

 Each of the files comprising Kaldi v1.0 have been separately licensed by
@ -18,6 +38,7 @@ Individual Contributors (in alphabetical order)
      Arnab Ghoshal
      Go Vivace Inc.
      Mirko Hannemann
+      Navdeep Jaitly
      Microsoft Corporation
      Petr Motlicek
      Ariya Rastrow
--- a/egs/README.txt
+++ b/egs/README.txt
@ -26,4 +26,7 @@ Recipes in progress:
    sampling rate).
    This directory is a work in progress.
  
+
 gp: GlobalPhone.  This is a multilingual speech corpus.
+
+ timit: TIMIT, which is an old corpus of carefully read speech.  
--- a/egs/gp/s1/run.sh
+++ b/egs/gp/s1/run.sh
@ -28,7 +28,7 @@ exit 1;
 # shorten to WAV to take out the empty files and those with compression errors. 
 # So set WORKDIR to someplace with enough disk space. That is where MFCCs will 
 # get created, as well as the FST versions of LMs.
-WORKDIR=/path/with/disk/space
+WORKDIR=/mnt/matylda6/jhu09/qpovey/temp_gp
 cp -r conf local utils steps install.sh path.sh $WORKDIR
 cd $WORKDIR
 # INSTALLING REQUIRED TOOLS:
@ -39,7 +39,7 @@ cd $WORKDIR
  { echo "shorten and/or sox not found on PATH. Installing..."; 
    install.sh }

-local/gp_data_prep.sh --config-dir=$PWD/conf --corpus-dir=/path/to/GlobalPhone --lm-dir=/path/to/lms --work-dir=$WORKDIR
+local/gp_data_prep.sh --config-dir=$PWD/conf --corpus-dir=/mnt/matylda2/data/GLOBALPHONE --lm-dir=/path/to/lms --work-dir=$WORKDIR
 # On Eddie: local/gp_data_prep.sh --config-dir=$PWD/conf --corpus-dir=$PWD/corpus --lm-dir=$PWD/corpus/language_models --work-dir=$PWD

 local/gp_format_data.sh --hmm-proto=conf/topo.proto --work-dir=$PWD
--- a/egs/rm/s3/RESULTS
+++ b/egs/rm/s3/RESULTS
@ -5,38 +5,38 @@ for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | scripts/best_wer.sh;
 # monophone; delta+accel
 exp/mono/decode/wer_4:%WER 9.830049 [ 1232 / 12533, 143 ins, 289 del, 800 sub ]
 # First triphone pass; delta+accel
-exp/tri1/decode/wer_6:%WER 3.694247 [ 463 / 12533, 69 ins, 100 del, 294 sub ]
+exp/tri1/decode/wer_6:%WER 3.893721 [ 488 / 12533, 69 ins, 96 del, 323 sub ]
 # Second triphone pass; delta+accel
-exp/tri2a/decode/wer_7:%WER 3.638395 [ 456 / 12533, 61 ins, 107 del, 288 sub ]
+exp/tri2a/decode/wer_7:%WER 3.486795 [ 437 / 12533, 65 ins, 91 del, 281 sub ]
 # [as tri2a, but] LDA+MLLT
-exp/tri2b/decode/wer_7:%WER 3.534668 [ 443 / 12533, 74 ins, 88 del, 281 sub ]
+exp/tri2b/decode/wer_6:%WER 3.359132 [ 421 / 12533, 73 ins, 71 del, 277 sub ]
 # LDA + exponential transform (note: this is with speaker adaptation)
-exp/tri2c/decode/wer_5:%WER 2.848480 [ 357 / 12533, 62 ins, 61 del, 234 sub ]
+exp/tri2c/decode/wer_5:%WER 2.905492 [ 364 / 12528, 68 ins, 59 del, 237 sub ]
 # LDA+MLLT+MMI.
-exp/tri3a/decode/wer_7:%WER 3.502753 [ 439 / 12533, 75 ins, 83 del, 281 sub ]
+exp/tri3a/decode/wer_7:%WER 3.084052 [ 386 / 12516, 54 ins, 67 del, 265 sub ]
 # LDA+MLLT+boosted MMI [note: errors are not identical, although WER is same]
-exp/tri3b/decode/wer_7:%WER 3.454879 [ 433 / 12533, 75 ins, 80 del, 278 sub ]
+exp/tri3b/decode/wer_5:%WER 3.155960 [ 395 / 12516, 74 ins, 50 del, 271 sub ]
 # LDA+MLLT+MCE
-exp/tri3c/decode/wer_7:%WER 3.183595 [ 399 / 12533, 62 ins, 79 del, 258 sub ]
+exp/tri3c/decode/wer_6:%WER 3.047953 [ 382 / 12533, 56 ins, 69 del, 257 sub ]
 # LDA+MLLT+SAT
-exp/tri3d/decode/wer_6:%WER 2.553259 [ 320 / 12533, 43 ins, 63 del, 214 sub ]
+exp/tri3d/decode/wer_7:%WER 2.234102 [ 280 / 12533, 35 ins, 62 del, 183 sub ]
 # LDA+MLLT+SAT+MMI
-exp/tri4a/decode/wer_6:%WER 2.473470 [ 310 / 12533, 43 ins, 62 del, 205 sub ]
+exp/tri4a/decode/wer_6:%WER 2.146334 [ 269 / 12533, 37 ins, 43 del, 189 sub ]
 # LDA+MLLT+SAT, extra phase of builting on top of 3d (no help)
-exp/tri4d/decode/wer_5:%WER 2.800606 [ 351 / 12533, 47 ins, 68 del, 236 sub ]
+exp/tri4d/decode/wer_5:%WER 2.457512 [ 308 / 12533, 50 ins, 54 del, 204 sub ]
 # LDA+MLLT + SGMM with speaker vectors
-exp/sgmm3d/decode/wer_4:%WER 2.186228 [ 274 / 12533, 41 ins, 42 del, 191 sub ]
+exp/sgmm3d/decode/wer_6:%WER 2.305912 [ 289 / 12533, 53 ins, 52 del, 184 sub ]
 # LDA+ET + SGMM with speaker vectors.
-exp/sgmm3e/decode/wer_5:%WER 2.242081 [ 281 / 12533, 44 ins, 47 del, 190 sub ]
+exp/sgmm3e/decode/wer_4:%WER 2.042608 [ 256 / 12533, 39 ins, 38 del, 179 sub ]
 # LDA+MLLT+SAT + SGMM with speaker vectors.
-exp/sgmm4f/decode/wer_5:%WER 2.226123 [ 279 / 12533, 56 ins, 49 del, 174 sub ]
+exp/sgmm4f/decode/wer_7:%WER 1.970797 [ 247 / 12533, 36 ins, 56 del, 155 sub ]
 # + FMLLR on top of it all.
-exp/sgmm4f/decode_fmllr/wer_6:%WER 2.202186 [ 276 / 12533, 39 ins, 59 del, 178 sub ]
+exp/sgmm4f/decode_fmllr/wer_5:%WER 1.954839 [ 245 / 12533, 40 ins, 47 del, 158 sub ]

 # System combination via lattices: combine tri1 and tri2a
 exp/combine_1_2a/decode/wer_6:%WER 3.518711 [ 441 / 12533, 62 ins, 97 del, 282 sub ]
 # System combination via lattices: combine sgmm4f and tri3d.
-exp/combine_sgmm4f_tri3d/decode/wer_5:%WER 2.082502 [ 261 / 12533, 36 ins, 48 del, 177 sub ]
+exp/combine_sgmm4f_tri3d/decode/wer_5:%WER 1.763345 [ 221 / 12533, 32 ins, 42 del, 147 sub ]
 # System combination via lattices: combine sgmm4f and tri4a.
-exp/combine_sgmm4f_tri4a/decode/wer_5:%WER 2.082502 [ 261 / 12533, 37 ins, 49 del, 175 sub ]
+exp/combine_sgmm4f_tri4a/decode/wer_6:%WER 1.715471 [ 215 / 12533, 31 ins, 39 del, 145 sub ]

--- a/egs/rm/s3/steps/train_lda_etc_mmi.sh
+++ b/egs/rm/s3/steps/train_lda_etc_mmi.sh
@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2010-2011 Microsoft Corporation
+# Copyright 2010-2012 Microsoft Corporation  Daniel Povey

 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -27,13 +27,24 @@
 #  ali, final.mdl, final.mat

 boost=0 # boosting constant, for boosted MMI. 
-tau=100 # Tau value.
+tau=200 # Tau value.
+merge=true # if true, cancel num and den counts as described in 
+    # the boosted MMI paper. 

-if [ $1 == "--boost" ]; then # e.g. "--boost 0.05"
-   shift;
-   boost=$1;
-   shift;
-fi
+for x in `seq 4`; do # repeated passes, so the options below may come in any order
+  if [ $1 == "--boost" ]; then # e.g. "--boost 0.05"
+    boost=$2;
+    shift 2;
+  fi
+  if [ $1 == "--smooth-to-model" ]; then # flag only; takes no value
+    shift;
+    smooth_to_model=true # NOTE(review): no use of this variable is visible in this diff -- confirm
+  fi
+  if [ $1 == "--tau" ]; then # e.g. "--tau 200"
+    tau=$2
+    shift 2;
+  fi
+done

 if [ $# != 4 ]; then
   echo "Usage: steps/train_lda_etc_mmi.sh <data-dir> <lang-dir> <ali-dir> <exp-dir>"
@ -99,7 +110,7 @@ scripts/mkgraph.sh $dir/lang $alidir $dir/dengraph || exit 1;

 echo "Making denominator lattices"

-
+ if false; then ##temp
 rm $dir/.error 2>/dev/null
 for n in 0 1 2 3; do
   gmm-latgen-simple --beam=$beam --lattice-beam=$latticebeam --acoustic-scale=$acwt \
@ -113,45 +124,33 @@ if [ -f $dir/.error ]; then
   echo "Error creating denominator lattices"
   exit 1;
 fi
+ fi ##temp

 # No need to create "numerator" alignments/lattices: we just use the 
 # alignments in $alidir.

-echo "Note: ignore absolute offsets in the objective function values"
-echo "This is caused by not having LM, lexicon or transition-probs in numerator"
-
 x=0;
 while [ $x -lt $num_iters ]; do
-  echo "Iteration $x: getting denominator stats."
-  # Get denominator stats...
-  if [ $x -eq 0 ]; then
-    ( lattice-to-post --acoustic-scale=$acwt "ark:gunzip -c $dir/lat?.gz|" ark:- | \
-      gmm-acc-stats $dir/$x.mdl "$feats" ark:- $dir/den_acc.$x.acc ) \
-     2>$dir/acc_den.$x.log || exit 1;
-  else # Need to recompute acoustic likelihoods...
-   ( gmm-rescore-lattice $dir/$x.mdl "ark:gunzip -c $dir/lat?.gz|" "$feats" ark:- | \
-      lattice-to-post --acoustic-scale=$acwt ark:- ark:- | \
-      gmm-acc-stats $dir/$x.mdl "$feats" ark:- $dir/den_acc.$x.acc ) \
-     2>$dir/acc_den.$x.log || exit 1;
-  fi
-  echo "Iteration $x: getting numerator stats."
-  # Get numerator stats...
-  gmm-acc-stats-ali $dir/$x.mdl "$feats" ark:$alidir/ali $dir/num_acc.$x.acc \
-   2>$dir/acc_num.$x.log || exit 1;
+  echo "Iteration $x: getting  stats."
+  ( gmm-rescore-lattice $dir/$x.mdl "ark:gunzip -c $dir/lat?.gz|" "$feats" ark:- | \
+   lattice-to-post --acoustic-scale=$acwt ark:- ark:- | \
+   sum-post --merge=$merge --scale1=-1 \
+    ark:- "ark,s,cs:ali-to-post ark:$alidir/ali ark:- |" ark:- | \
+   gmm-acc-stats2 $dir/$x.mdl "$feats" ark:- $dir/num_acc.$x.acc $dir/den_acc.$x.acc ) \
+     2>$dir/acc.$x.log || exit 1;

-  ( gmm-est-gaussians-ebw $dir/$x.mdl "gmm-ismooth-stats --tau=$tau $dir/num_acc.$x.acc $dir/num_acc.$x.acc -|" \
-         $dir/den_acc.$x.acc - | \
+  # This tau is only used for smoothing "to the model".
+  ( gmm-est-gaussians-ebw --tau=$tau $dir/$x.mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc - | \
   gmm-est-weights-ebw - $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl ) \
    2>$dir/update.$x.log || exit 1;

-  den=`grep Overall $dir/acc_den.$x.log  | grep lattice-to-post | awk '{print $7}'`
-  num=`grep Overall $dir/acc_num.$x.log  | grep gmm-acc-stats-ali | awk '{print $11}'`
-  diff=`perl -e "print ($num * $acwt - $den);"`
-  impr=`grep Overall $dir/update.$x.log | head -1 | awk '{print $10;}'`
-  impr=`perl -e "print ($impr * $acwt);"` # auxf impr normalized by multiplying by
-  # kappa, so it's comparable to an objective-function change.
-  echo On iter $x, objf was $diff, auxf improvement was $impr | tee $dir/objf.$x.log
-
+  objf=`grep Overall $dir/acc.$x.log  | grep gmm-acc-stats2 | awk '{print $10}'`
+  nf=`grep Overall $dir/acc.$x.log  | grep gmm-acc-stats2 | awk '{print $12}'` # frame count, used to renormalize below
+  impr=`grep Overall $dir/update.$x.log | head -1 | awk '{print $10*$12;}'` # the update step logs to $dir, not $dir/log
+  impr=`perl -e "print ($impr/$nf);"` # renormalize by "real" #frames, to correct
+    # for the canceling of stats.
+  echo On iter $x, objf was $objf, auxf improvement from MMI was $impr | tee $dir/objf.$x.log
+  rm $dir/*.acc
  x=$[$x+1]
 done

--- a/egs/timit/s1/conf/mfcc.conf
+++ b/egs/timit/s1/conf/mfcc.conf
@ -0,0 +1 @@
+--use-energy=false   # only non-default option.
--- a/egs/timit/s1/conf/topo.proto
+++ b/egs/timit/s1/conf/topo.proto
@ -0,0 +1,22 @@
+<Topology> 
+<TopologyEntry> 
+<ForPhones>
+NONSILENCEPHONES
+</ForPhones> 
+<State> 0 <PdfClass> 0 <Transition> 0 0.75 <Transition> 1 0.25 </State> 
+<State> 1 <PdfClass> 1 <Transition> 1 0.75 <Transition> 2 0.25 </State> 
+<State> 2 <PdfClass> 2 <Transition> 2 0.75 <Transition> 3 0.25 </State> 
+<State> 3 </State>
+</TopologyEntry> 
+<TopologyEntry> 
+<ForPhones>
+SILENCEPHONES
+</ForPhones> 
+<State> 0 <PdfClass> 0 <Transition> 0 0.25 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 </State> 
+<State> 1 <PdfClass> 1 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State> 
+<State> 2 <PdfClass> 2 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State> 
+<State> 3 <PdfClass> 3 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State> 
+<State> 4 <PdfClass> 4 <Transition> 4 0.25 <Transition> 5 0.75 </State> 
+<State> 5 </State>
+</TopologyEntry> 
+</Topology> 
--- a/egs/timit/s1/steps/train_mono.sh
+++ b/egs/timit/s1/steps/train_mono.sh
@ -103,4 +103,3 @@ done

 # example of showing the alignments:
 # show-alignments data/lang/phones.txt $dir/30.mdl ark:$dir/cur.ali | head -4
-
--- a/egs/timit/s3/run.sh
+++ b/egs/timit/s3/run.sh
@ -1,7 +1,8 @@
 . path.sh
-local/timit_data_prep.sh /ais/gobi2/speech/TIMIT
-local/timit_train_lms.sh data/local
-local/timit_format_data.sh
+#local/timit_data_prep.sh /ais/gobi2/speech/TIMIT
+local/timit_data_prep.sh /mnt/matylda2/data/TIMIT || exit 1;
+local/timit_train_lms.sh data/local || exit 1;
+local/timit_format_data.sh || exit 1;

 # mfccdir should be some place with a largish disk where you
 # want to store MFCC features. 
@ -9,13 +10,13 @@ mfccdir=mfccs

 steps/make_mfcc.sh data/train exp/make_mfcc/train $mfccdir 4
 for test in train test dev ; do
-  steps/make_mfcc.sh data/$test exp/make_mfcc/$test $mfccdir 4
+  steps/make_mfcc.sh data/$test exp/make_mfcc/$test $mfccdir 4 || exit 1;
 done

 # train monophone system.
-steps/train_mono.sh data/train data/lang exp/mono
+steps/train_mono.sh data/train data/lang exp/mono || exit 1;

-scripts/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph
+scripts/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph || exit 1;
 echo "Decoding test datasets."
 for test in dev test ; do
  steps/decode_deltas.sh exp/mono data/$test data/lang exp/mono/decode_$test &
@ -25,8 +26,7 @@ scripts/average_wer.sh exp/mono/decode_*/wer > exp/mono/wer

 # Get alignments from monophone system.
 echo "Creating training alignments to use to train other systems such as ANN-HMM."
-steps/align_deltas.sh data/train data/lang exp/mono exp/mono_ali
+steps/align_deltas.sh data/train data/lang exp/mono exp/mono_ali || exit 1;
 echo "Creating dev alignments to use to train other systems such as ANN-HMM."
-steps/align_deltas.sh data/dev data/lang exp/mono exp/mono_ali_dev
-
+steps/align_deltas.sh data/dev data/lang exp/mono exp/mono_ali_dev || exit 1;

--- a/egs/wsj/s3/RESULTS
+++ b/egs/wsj/s3/RESULTS
@ -22,12 +22,15 @@ exp/tri2b/decode_tgpr_dev93_fromlats/wer_15:%WER 16.71 [ 1376 / 8234, 267 ins, 1
 exp/tri2b/decode_tgpr_dev93_tg/wer_16:%WER 16.26 [ 1339 / 8234, 267 ins, 141 del, 931 sub ]
 exp/tri2b/decode_tgpr_dev93_tg_biglm/wer_16:%WER 16.42 [ 1352 / 8234, 269 ins, 142 del, 941 sub ]

-exp/tri2b/decode_tgpr_eval92/wer_16:%WER 11.54 [ 651 / 5643, 146 ins, 42 del, 463 sub ]
+exp/tri2b/decode_tgpr_eval92/wer_17:%WER 11.45 [ 646 / 5643, 140 ins, 46 del, 460 sub ]

 # +MMI
-exp/tri2b_mmi/decode_tgpr_eval92/wer_16:%WER 11.08 [ 625 / 5643, 125 ins, 44 del, 456 sub ]
+exp/tri2b_mmi/decode_tgpr_eval92/wer_14:%WER 10.63 [ 600 / 5643, 124 ins, 45 del, 431 sub ]
 #  +boosting
-exp/tri2b_mmi_b0.1/decode_tgpr_eval92/wer_16:%WER 10.83 [ 611 / 5643, 122 ins, 43 del, 446 sub ]
+exp/tri2b_mmi_b0.1/decode_tgpr_eval92/wer_16:%WER 10.69 [ 603 / 5643, 119 ins, 48 del, 436 sub ]
+# +fMMI
+exp/tri2b_fmmi_b0.1/decode_tgpr_eval92/wer_15:%WER 10.26 [ 579 / 5643, 111 ins, 39 del, 429 sub ]
+
 # +MCE
 exp/tri2b_mce/decode_tgpr_eval92/wer_16:%WER 11.15 [ 629 / 5643, 132 ins, 45 del, 452 sub ]

@ -69,8 +72,17 @@ exp/tri4b/decode_tgpr_dev93/wer_13:%WER 12.53 [ 1032 / 8234, 242 ins, 79 del, 71
 exp/tri4b/decode_tgpr_eval92/wer_16:%WER 8.05 [ 454 / 5643, 119 ins, 23 del, 312 sub ]

 # +MMI
-exp/tri4b_mmi/decode_tgpr_dev93/wer_14:%WER 11.53 [ 949 / 8234, 203 ins, 82 del, 664 sub ]
-exp/tri4b_mmi_b0.1/decode_tgpr_dev93/wer_16:%WER 11.45 [ 943 / 8234, 191 ins, 87 del, 665 sub ]
+exp/tri4b_mmi/decode_tgpr_dev93/wer_12:%WER 11.28 [ 929 / 8234, 206 ins, 76 del, 647 sub ]
+#+boosting
+exp/tri4b_mmi_b0.1/decode_tgpr_dev93/wer_16:%WER 11.25 [ 926 / 8234, 176 ins, 94 del, 656 sub ]
+ # increasing beam from 13 to 15 to see effect. 
+ exp/tri4b_mmi_b0.1/decode_tgpr_dev93_b15/wer_14:%WER 10.72 [ 883 / 8234, 172 ins, 84 del, 627 sub ]
+exp/tri4b_mmi_b0.1/decode_tgpr_eval92/wer_14:%WER 7.34 [ 414 / 5643, 105 ins, 20 del, 289 sub ]
+
+#+fMMI
+exp/tri4b_fmmi_b0.1/decode_tgpr_dev93/wer_13:%WER 10.86 [ 894 / 8234, 167 ins, 89 del, 638 sub ]
+exp/tri4b_fmmi_b0.1/decode_tgpr_eval92/wer_12:%WER 7.25 [ 409 / 5643, 111 ins, 14 del, 284 sub ]
+

 # LDA+MLLT+SAT, SI-284, full retraining starting from 3b [c.f. 4b]
 exp/tri4c/decode_tgpr_dev93/wer_16:%WER 12.10 [ 996 / 8234, 220 ins, 83 del, 693 sub ]
--- a/egs/wsj/s3/run.sh
+++ b/egs/wsj/s3/run.sh
@ -164,6 +164,18 @@ steps/train_lda_etc_mmi.sh --num-jobs 10 --boost 0.1 --cmd "$train_cmd" \
  data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b exp/tri2b_mmi_b0.1
 scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt.sh exp/tri2b/graph_tgpr data/test_eval92 exp/tri2b_mmi_b0.1/decode_tgpr_eval92

+ # The next 3 commands train and test fMMI+MMI (on top of LDA+MLLT).
+ steps/train_dubm_lda_etc.sh --silence-weight 0.5 \
+   --num-jobs 10 --cmd "$train_cmd" 400 data/train_si84 \
+   data/lang exp/tri2b_ali_si84 exp/dubm2b
+ steps/train_lda_etc_mmi_fmmi.sh \
+   --num-jobs 10 --boost 0.1 --cmd "$train_cmd" \
+   data/train_si84 data/lang exp/tri2b_ali_si84 exp/dubm2b exp/tri2b_denlats_si84 \
+   exp/tri2b exp/tri2b_fmmi_b0.1
+ scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt_fmpe.sh \
+   exp/tri2b/graph_tgpr data/test_eval92 exp/tri2b_fmmi_b0.1/decode_tgpr_eval92
+
+
 steps/train_lda_etc_mce.sh --cmd "$train_cmd" --num-jobs 10 data/train_si84 data/lang \
 exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b exp/tri2b_mce
 scripts/decode.sh --num-jobs 10 --cmd "$decode_cmd" steps/decode_lda_mllt.sh \
@ -222,7 +234,8 @@ scripts/mkgraph.sh data/lang_test_tgpr exp/tri4b exp/tri4b/graph_tgpr
 scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt_sat.sh exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b/decode_tgpr_dev93
 scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt_sat.sh exp/tri4b/graph_tgpr data/test_eval92 exp/tri4b/decode_tgpr_eval92

-# Train and test MMI, and boosted MMI, on tri4b.
+# Train and test MMI, and boosted MMI, on tri4b (LDA+MLLT+SAT on
+# all the data).
 # Making num-jobs 40 as want to keep them under 4 hours long (or will fail
 # on regular queue at BUT).
 steps/align_lda_mllt_sat.sh --num-jobs 40 --cmd "$train_cmd" \
@ -235,6 +248,25 @@ scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_etc.sh exp/tri4b/graph_tg
 steps/train_lda_etc_mmi.sh --boost 0.1 --num-jobs 40 --cmd "$train_cmd" \
  data/train_si284 data/lang exp/tri4b_ali_si284 exp/tri4b_denlats_si284 exp/tri4b exp/tri4b_mmi_b0.1
 scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_etc.sh exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b_mmi_b0.1/decode_tgpr_dev93 exp/tri4b/decode_tgpr_dev93
+ scripts/decode.sh --opts "--beam 15" --cmd "$decode_cmd" steps/decode_lda_etc.sh exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b_mmi_b0.1/decode_tgpr_dev93_b15 exp/tri4b/decode_tgpr_dev93
+scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_etc.sh exp/tri4b/graph_tgpr data/test_eval92 exp/tri4b_mmi_b0.1/decode_tgpr_eval92 exp/tri4b/decode_tgpr_eval92
+
+ # Train fMMI+MMI system on top of 4b.
+ steps/train_dubm_lda_etc.sh --silence-weight 0.5 \
+   --num-jobs 40 --cmd "$train_cmd" 600 data/train_si284 \
+   data/lang exp/tri4b_ali_si284 exp/dubm4b
+ steps/train_lda_etc_mmi_fmmi.sh \
+   --num-jobs 40 --boost 0.1 --cmd "$train_cmd" \
+   data/train_si284 data/lang exp/tri4b_ali_si284 exp/dubm4b exp/tri4b_denlats_si284 \
+   exp/tri4b exp/tri4b_fmmi_b0.1 
+ scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_etc_fmpe.sh \
+   exp/tri4b/graph_tgpr data/test_eval92 exp/tri4b_fmmi_b0.1/decode_tgpr_eval92 \
+   exp/tri4b/decode_tgpr_eval92
+ scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_etc_fmpe.sh \
+   exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b_fmmi_b0.1/decode_tgpr_dev93 \
+   exp/tri4b/decode_tgpr_dev93
+
+

 # Train UBM, for SGMM system on top of LDA+MLLT.
 steps/train_ubm_lda_etc.sh --num-jobs 10 --cmd "$train_cmd" \
@ -245,6 +277,7 @@ scripts/mkgraph.sh data/lang_test_tgpr exp/sgmm3c exp/sgmm3c/graph_tgpr
 scripts/decode.sh --cmd "$decode_cmd" steps/decode_sgmm_lda_etc.sh exp/sgmm3c/graph_tgpr \
  data/test_dev93 exp/sgmm3c/decode_tgpr_dev93

+
 # Decode using 3 Gaussians (not 15) for gselect in 1st pass, for fast decoding.
 scripts/decode.sh --opts "--first-pass-gselect 3" --cmd "$decode_cmd" \
  steps/decode_sgmm_lda_etc.sh exp/sgmm3c/graph_tgpr data/test_dev93 exp/sgmm3c/decode_tgpr_dev93_gs3
--- a/egs/wsj/s3/steps/decode_lda_etc.sh
+++ b/egs/wsj/s3/steps/decode_lda_etc.sh
@ -62,7 +62,7 @@ fi
 requirements="$mydata/feats.scp $srcdir/final.mdl $srcdir/final.mat $graphdir/HCLG.fst $transdir/$jobid.trans"
 for f in $requirements; do
  if [ ! -f $f ]; then
-     echo "decode_lda_mllt.sh: no such file $f";
+     echo "decode_lda_etc.sh: no such file $f";
     exit 1;
  fi
 done
--- a/egs/wsj/s3/steps/decode_lda_etc_fmpe.sh
+++ b/egs/wsj/s3/steps/decode_lda_etc_fmpe.sh
@ -0,0 +1,73 @@
+#!/bin/bash
+
+# Decoding script for LDA + optionally MLLT + [some speaker-specific transforms]
+# + fMPE.
+# This decoding script takes as an argument a previous decoding directory where it
+# can find some transforms.
+
+if [ -f ./path.sh ]; then . ./path.sh; fi
+
+numjobs=1 # total number of parallel jobs; set by -j
+jobid=0 # index of this job; set by -j
+beam=13.0 # decoding beam; override with --beam
+rescore=false # NOTE(review): not referenced anywhere else in this script
+for x in `seq 3`; do # repeated passes, so -j and --beam may come in either order
+  if [ "$1" == "-j" ]; then # "-j <num-jobs> <job-number>"
+    shift;
+    numjobs=$1;
+    jobid=$2;
+    shift 2;
+  fi
+  if [ "$1" == "--beam" ]; then # "--beam <beam>"
+    beam=$2;
+    shift 2;
+  fi
+done
+
+if [ $# != 4 ]; then
+   # Note: transform-dir has to be last because scripts/decode.sh expects decode-dir to be #3 arg.
+   echo "Usage: steps/decode_lda_etc_fmpe.sh [-j num-jobs job-number] [--beam beam] <graph-dir> <data-dir> <decode-dir> <transform-dir>"
+   echo " e.g.: steps/decode_lda_etc_fmpe.sh -j 8 0 exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b_fmmi_b0.1/decode_tgpr_dev93 exp/tri4b/decode_tgpr_dev93"
+   exit 1;
+fi
+
+
+graphdir=$1 # directory containing HCLG.fst and words.txt
+data=$2 # data directory (feats.scp, spk2utt, utt2spk)
+dir=$3 # decoding output directory
+transdir=$4 # earlier decoding directory holding the $jobid.trans transforms
+srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
+
+mkdir -p $dir
+
+if [ $numjobs -gt 1 ]; then # with several jobs, this job reads only its own split of the data
+  mydata=$data/split$numjobs/$jobid
+else
+  mydata=$data
+fi
+
+requirements="$mydata/feats.scp $srcdir/final.mdl $srcdir/final.fmpe $srcdir/final.mat $graphdir/HCLG.fst $transdir/$jobid.trans" # inputs that must exist before we start
+for f in $requirements; do
+  if [ ! -f $f ]; then
+     echo "decode_lda_etc_fmpe.sh: no such file $f";
+     exit 1;
+  fi
+done
+
+
+# Base features: per-speaker CMN, splicing, the global transform final.mat, then
+# the speaker transforms taken from the earlier decoding directory.
+basefeats="ark:compute-cmvn-stats --spk2utt=ark:$mydata/spk2utt scp:$mydata/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$mydata/utt2spk ark:- scp:$mydata/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- | transform-feats --utt2spk=ark:$mydata/utt2spk ark:$transdir/$jobid.trans ark:- ark:- |"
+
+# Get the Gaussian-selection info for the fMPE.
+# Stop here if it fails: the decoding pipeline below reads the gselect archive.
+ngselect=2; # Just the 2 top Gaussians. 
+gmm-gselect --n=$ngselect $srcdir/final.fmpe "$basefeats" \
+  "ark:|gzip -c >$dir/gselect.$jobid.gz" 2>$dir/gselect.$jobid.log || exit 1;
+
+
+# Now set up the fMPE features.
+feats="$basefeats fmpe-apply-transform $srcdir/final.fmpe ark:- 'ark,s,cs:gunzip -c $dir/gselect.$jobid.gz|' ark:- |"
+
+gmm-latgen-faster --max-active=7000 --beam=$beam --lattice-beam=6.0 \
+  --acoustic-scale=0.083333 \
+  --allow-partial=true --word-symbol-table=$graphdir/words.txt \
+  $srcdir/final.mdl $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.$jobid.gz" \
+     2> $dir/decode$jobid.log || exit 1;
--- a/egs/wsj/s3/steps/decode_lda_mllt_fmpe.sh
+++ b/egs/wsj/s3/steps/decode_lda_mllt_fmpe.sh
@ -0,0 +1,64 @@
+#!/bin/bash
+
+# Decoding script that works with a GMM model and the baseline
+# [e.g. MFCC] features plus cepstral mean subtraction plus
+# LDA+MLLT or similar transform, plus fMPE/FMMI.
+# This script just generates lattices for a single broken-up
+# piece of the data.
+
+if [ -f ./path.sh ]; then . ./path.sh; fi
+
+numjobs=1 # total number of parallel jobs; set by -j
+jobid=0 # index of this job; set by -j
+rescore=false # NOTE(review): not referenced anywhere else in this script
+if [ "$1" == "-j" ]; then # "-j <num-jobs> <job-number>"
+  shift;
+  numjobs=$1;
+  jobid=$2;
+  shift; shift;
+fi
+
+if [ $# != 3 ]; then # exactly three positional arguments must remain
+   echo "Usage: steps/decode_lda_mllt_fmpe.sh [-j num-jobs job-number] <graph-dir> <data-dir> <decode-dir>"
+   echo " e.g.: steps/decode_lda_mllt_fmpe.sh -j 8 0 exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b_fmmi/decode_dev93_tgpr"
+   exit 1;
+fi
+
+
+graphdir=$1 # directory containing HCLG.fst and words.txt
+data=$2 # data directory (feats.scp, spk2utt, utt2spk)
+dir=$3 # decoding output directory
+srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
+
+mkdir -p $dir
+
+if [ $numjobs -gt 1 ]; then # with several jobs, this job reads only its own split of the data
+  mydata=$data/split$numjobs/$jobid
+else
+  mydata=$data
+fi
+
+requirements="$mydata/feats.scp $srcdir/final.mdl $srcdir/final.fmpe $srcdir/final.mat $graphdir/HCLG.fst" # inputs that must exist before we start
+for f in $requirements; do
+  if [ ! -f $f ]; then
+     echo "decode_lda_mllt_fmpe.sh: no such file $f";
+     exit 1;
+  fi
+done
+
+
+# Base features: per-speaker CMN, splicing, then the global transform final.mat.
+basefeats="ark:compute-cmvn-stats --spk2utt=ark:$mydata/spk2utt scp:$mydata/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$mydata/utt2spk ark:- scp:$mydata/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
+
+# Get the Gaussian-selection info for the fMPE.
+# Stop here if it fails: the decoding pipeline below reads the gselect archive.
+ngselect=2; # Just the 2 top Gaussians. 
+gmm-gselect --n=$ngselect $srcdir/final.fmpe "$basefeats" \
+  "ark:|gzip -c >$dir/gselect.$jobid.gz" 2>$dir/gselect.$jobid.log || exit 1;
+
+# Now set up the fMPE features.
+feats="$basefeats fmpe-apply-transform $srcdir/final.fmpe ark:- 'ark,s,cs:gunzip -c $dir/gselect.$jobid.gz|' ark:- |"
+
+gmm-latgen-faster --max-active=7000 --beam=13.0 --lattice-beam=6.0 --acoustic-scale=0.083333 \
+  --allow-partial=true --word-symbol-table=$graphdir/words.txt \
+  $srcdir/final.mdl $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.$jobid.gz" \
+     2> $dir/decode.$jobid.log || exit 1;
+
--- a/egs/wsj/s3/steps/train_dubm_lda_etc.sh
+++ b/egs/wsj/s3/steps/train_dubm_lda_etc.sh
@ -0,0 +1,114 @@
+#!/bin/bash
+
+# This trains a diagonal-covariance UBM (i.e. just a global
+# mixture of Gaussians, or GMM).
+
+# Train UBM from a trained HMM/GMM system [with splice+LDA+[MLLT/ET/MLLT+SAT] features]
+# Alignment directory is used for the CMN and transforms.
+# A UBM is just a single mixture of Gaussians (diagonal-covariance, in our case), that's trained
+# on all the data.  This will later be used in fMPE/fMMI training
+# (see steps/train_lda_etc_mmi_fmmi.sh).
+
+nj=4
+cmd=scripts/run.pl
+silweight=
+for x in 1 2; do
+  if [ $1 == "--num-jobs" ]; then
+     shift
+     nj=$1
+     shift
+  fi
+  if [ $1 == "--cmd" ]; then
+     shift
+     cmd=$1
+     shift
+  fi  
+  if [ $1 == "--silence-weight" ]; then
+     shift
+     silweight=$1 # e.g. to weight down silence in training.
+     shift
+  fi  
+done
+
+if [ $# != 5 ]; then # exactly five positional arguments must remain after option parsing
+  echo "Usage: steps/train_dubm_lda_etc.sh [--num-jobs <n>] [--cmd <cmd>] [--silence-weight <w>] <num-comps> <data-dir> <lang-dir> <ali-dir> <exp-dir>"
+  echo " e.g.: steps/train_dubm_lda_etc.sh 400 data/train_si84 data/lang exp/tri2b_ali_si84 exp/dubm2b"
+  exit 1;
+fi
+
+if [ -f path.sh ]; then . path.sh; fi
+
+numcomps=$1 # number of Gaussians in the final UBM
+data=$2 # training data directory
+lang=$3 # lang directory; silphones.csl is read from here
+alidir=$4 # alignment directory: supplies final.mdl, final.mat, final.occs, per-job CMVN, alignments and optional transforms
+dir=$5 # output directory
+silphonelist=`cat $lang/silphones.csl`
+
+mkdir -p $dir/log
+
+if [ ! -d $data/split$nj -o $data/split$nj -ot $data/feats.scp ]; then # (re)split if the split dir is missing or older than feats.scp
+  scripts/split_data.sh $data $nj
+fi
+
+n1=`get_splits.pl $nj | awk '{print $1}'` # first job id; its .trans file is used to test for speaker transforms
+[ -f $alidir/$n1.trans ] && echo "Using speaker transforms from $alidir"
+
+for n in `get_splits.pl $nj`; do # build the per-job feature pipeline and, if requested, the per-job frame weights
+  featspart[$n]="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$alidir/$n.cmvn scp:$data/split$nj/$n/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
+  if [ -f $alidir/$n1.trans ]; then # append the speaker transforms when the alignment dir has them
+    featspart[$n]="${featspart[$n]} transform-feats --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$alidir/$n.trans ark:- ark:- |"
+  fi
+  if [ ! -z "$silweight" ]; then # only when --silence-weight was given
+    weightspart[$n]="--weights='ark,s,cs:gunzip -c $alidir/$n.ali.gz | ali-to-post ark:- ark:- | weight-silence-post $silweight $silphonelist $alidir/final.mdl ark:- ark:- | post-to-weights ark:- ark:- |'"
+  fi
+done
+
+ngselect=50 # number of Gaussians kept per frame by gmm-gselect
+
+intermediate=2000 # default for init-ubm's --intermediate-numcomps
+if [ $[$numcomps*2] -gt $intermediate ]; then # use at least twice the final number of components
+  intermediate=$[$numcomps*2];
+fi
+
+echo "Clustering model $alidir/final.mdl to get initial UBM"
+# typically: --intermediate-numcomps=2000 --ubm-numcomps=400
+
+if [ ! -s  $dir/0.dubm ]; then # skip the clustering if a non-empty 0.dubm is already there
+ $cmd $dir/log/cluster.log \
+  init-ubm --intermediate-numcomps=$intermediate --ubm-numcomps=$numcomps \
+   --verbose=2 --fullcov-ubm=false $alidir/final.mdl $alidir/final.occs \
+    $dir/0.dubm   || exit 1;
+fi
+rm $dir/.error 2>/dev/null
+# First do Gaussian selection to 50 components, which will be used
+# as the initial screen for all further passes.
+# The jobs run in the background, so a failure is recorded by creating
+# $dir/.error; that file is checked once all the jobs have finished.
+for n in `get_splits.pl $nj`; do
+  $cmd $dir/log/gselect.$n.log \
+    gmm-gselect --n=$ngselect $dir/0.dubm "${featspart[$n]}" \
+      "ark:|gzip -c >$dir/gselect.$n.gz" || touch $dir/.error &
+done
+wait
+[ -f $dir/.error ] && echo "Error doing GMM selection" && exit 1;
+
+for x in 0 1 2 3; do # four accumulate/update passes: $x.dubm -> $[$x+1].dubm
+  echo "Pass $x"
+  for n in `get_splits.pl $nj`; do
+    $cmd $dir/log/acc.$x.$n.log \
+      gmm-global-acc-stats ${weightspart[$n]} "--gselect=ark,s,cs:gunzip -c $dir/gselect.$n.gz|" \
+        $dir/$x.dubm "${featspart[$n]}" $dir/$x.$n.acc || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "Error accumulating stats for UBM estimation on pass $x" && exit 1;
+  lowcount_opt="--remove-low-count-gaussians=false"
+  [ $x -eq 3 ] && lowcount_opt=   # Only remove low-count Gaussians on last iter-- keeps gselect info valid.
+  $cmd $dir/log/update.$x.log \
+    gmm-global-est $lowcount_opt --verbose=2 $dir/$x.dubm "gmm-global-sum-accs - $dir/$x.*.acc |" \
+      $dir/$[$x+1].dubm || exit 1;
+  rm $dir/$x.*.acc $dir/$x.dubm # this pass's stats and input model are no longer needed
+done
+
+rm $dir/gselect.*.gz
+rm $dir/final.dubm 2>/dev/null
+mv $dir/4.dubm $dir/final.dubm || exit 1; # the output of the last pass (x=3) becomes the final model
+
--- a/egs/wsj/s3/steps/train_lda_etc_mmi.sh
+++ b/egs/wsj/s3/steps/train_lda_etc_mmi.sh
@ -20,10 +20,8 @@
 # [something] may be MLLT, or ET, or MLLT + SAT.  Any speaker-specific
 # transforms are expected to be located in the alignment directory. 
 # This script never re-estimates any transforms, it just does model 
-# training.  To make this faster, it initializes the model from the
-# old system's model, i.e. for each p.d.f., it takes the best-match pdf
-# from the old system (based on overlap of tree-stats counts), and 
-# uses that GMM to initialize the current GMM.
+# training.  
+
 # Basically we are doing 4 iterations of Extended Baum-Welch (EBW)
 # estimation, as described in Dan Povey's thesis, with a few differences:
 # (i) we have the option of "boosting", as in "Boosted MMI", which increases
@ -47,7 +45,9 @@
 niters=4
 nj=4
 boost=0.0
-tau=100
+tau=200
+merge=true # if true, cancel num and den counts as described in 
+    # the boosted MMI paper. 
 cmd=scripts/run.pl
 acwt=0.1
 stage=0
@ -69,6 +69,9 @@ for x in `seq 8`; do
  if [ $1 == "--acwt" ]; then
    shift; acwt=$1; shift
  fi  
+  if [ $1 == "--tau" ]; then
+    shift; tau=$1; shift
+  fi  
  if [ $1 == "--stage" ]; then
    shift; stage=$1; shift
  fi  
@ -121,58 +124,60 @@ rm $dir/.error 2>/dev/null
 cur_mdl=$srcdir/final.mdl
 x=0
 while [ $x -lt $niters ]; do
-  echo "Iteration $x: getting denominator stats."
-  # Get denominator stats...  For simplicity we rescore the lattice
+  echo "Iteration $x: getting stats."
+  # Get denominator and numerator stats together...    This involves
+  # merging the num and den posteriors, and (if $merge==true), canceling
+  # the +ve and -ve occupancies on each frame. 
+  # For simplicity we rescore the lattice
  # on all iterations, even though it shouldn't be necessary on the zeroth
  # (but we want this script to work even if $srcdir doesn't contain the
-  # model used to generate the lattice).
+  #  model used to generate the lattice).
  if [ $stage -le $x ]; then
    for n in `get_splits.pl $nj`; do  
-      $cmd $dir/log/acc_den.$x.$n.log \
+      $cmd $dir/log/acc.$x.$n.log \
        gmm-rescore-lattice $cur_mdl "${latspart[$n]}" "${featspart[$n]}" ark:- \| \
        lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
-        gmm-acc-stats $cur_mdl "${featspart[$n]}" ark:- $dir/den_acc.$x.$n.acc \
-         || touch $dir/.error &
+        sum-post --merge=$merge --scale1=-1 \
+         ark:- "ark,s,cs:gunzip -c $alidir/$n.ali.gz | ali-to-post ark:- ark:- |" ark:- \| \
+        gmm-acc-stats2 $cur_mdl "${featspart[$n]}" ark,s,cs:- \
+          $dir/num_acc.$x.$n.acc $dir/den_acc.$x.$n.acc  || touch $dir/.error &
    done 
    wait
-    [ -f $dir/.error ] && echo Error accumulating den stats on iter $x && exit 1;
+    [ -f $dir/.error ] && echo Error accumulating stats on iter $x && exit 1;
    $cmd $dir/log/den_acc_sum.$x.log \
      gmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1;
    rm $dir/den_acc.$x.*.acc
-
-    echo "Iteration $x: getting numerator stats."
-    for n in `get_splits.pl $nj`; do  
-      $cmd $dir/log/acc_num.$x.$n.log \
-        gmm-acc-stats-ali $cur_mdl "${featspart[$n]}" "ark:gunzip -c $alidir/$n.ali.gz|" \
-          $dir/num_acc.$x.$n.acc || touch $dir/.error &
-    done
-    wait;
-    [ -f $dir/.error ] && echo Error accumulating num stats on iter $x && exit 1;
    $cmd $dir/log/num_acc_sum.$x.log \
      gmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1;
    rm $dir/num_acc.$x.*.acc

+    # note: this tau value is for smoothing to model parameters;
+    # you need to use gmm-ismooth-stats to smooth to the ML stats,
+    # but anyway this script does canceling of num and den stats on
+    # each frame (as suggested in the Boosted MMI paper) which would
+    # make smoothing to ML impossible without accumulating extra stats.
+
    $cmd $dir/log/update.$x.log \
-      gmm-est-gaussians-ebw $cur_mdl "gmm-ismooth-stats --tau=$tau $dir/num_acc.$x.acc $dir/num_acc.$x.acc -|" \
-        $dir/den_acc.$x.acc - \| \
+      gmm-est-gaussians-ebw --tau=$tau $cur_mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc - \| \
      gmm-est-weights-ebw - $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1;
  else 
    echo "not doing this iteration because --stage=$stage"
  fi
  cur_mdl=$dir/$[$x+1].mdl

-  # Some diagnostics
-  den=`grep Overall $dir/log/acc_den.$x.*.log  | grep lattice-to-post | awk '{p+=$7*$9; nf+=$9;} END{print p/nf;}'`
-  num=`grep Overall $dir/log/acc_num.$x.*.log  | grep gmm-acc-stats-ali | awk '{p+=$11*$13; nf+=$13;} END{print p/nf}'`
-  diff=`perl -e "print ($num * $acwt - $den);"`
-  impr=`grep Overall $dir/log/update.$x.log | head -1 | awk '{print $10;}'`
-  impr=`perl -e "print ($impr * $acwt);"` # auxf impr normalized by multiplying by
-  # kappa, so it's comparable to an objective-function change.
-  echo On iter $x, objf was $diff, auxf improvement was $impr | tee $dir/objf.$x.log
+  # Some diagnostics.. note, this objf is somewhat comparable to the
+  # MMI objective function divided by the acoustic weight, and differences in it
+  # are comparable to the auxf improvement printed by the update program.
+  objf=`grep Overall $dir/log/acc.$x.*.log | grep gmm-acc-stats2 | awk '{ p+=$10*$12; nf+=$12; } END{print p/nf;}'`
+  nf=`grep Overall $dir/log/acc.$x.*.log | grep gmm-acc-stats2 | awk '{ nf+=$12; } END{print nf;}'`
+  impr=`grep Overall $dir/log/update.$x.log | head -1 | awk '{print $10*$12;}'`
+  impr=`perl -e "print ($impr/$nf);"` # renormalize by "real" #frames, to correct
+    # for the canceling of stats.
+  echo On iter $x, objf was $objf, auxf improvement from MMI was $impr | tee $dir/objf.$x.log

  x=$[$x+1]
 done

 echo "Succeeded with $niters iterations of MMI training (boosting factor = $boost)"

-( cd $dir; ln -s $x.mdl final.mdl )
+( cd $dir; rm final.mdl 2>/dev/null; ln -s $x.mdl final.mdl ) # re-link; rm is silenced since final.mdl may not exist yet.
--- a/egs/wsj/s3/steps/train_lda_etc_mmi_fmmi.sh
+++ b/egs/wsj/s3/steps/train_lda_etc_mmi_fmmi.sh
@ -0,0 +1,236 @@
+#!/bin/bash
+# by Dan Povey, 2012.  Apache.
+
+# This script does MMI discriminative training, including
+# feature-space (like fMPE) and model-space components. 
+# If you give the --boost option it does "boosted MMI" (BMMI).
+# On the iterations of training it alternates feature-space
+# and model-space training.  We do 8 iterations in total--
+# 4 of each type ((B)MMI, f(B)MMI)
+
+# The features it uses are LDA + [something], where the something
+# may be just a global transform like MLLT, or may also include
+# speaker-specific transforms such as SAT.  This script just uses
+# transforms computed in the alignment directory, so it doesn't
+# need to know what the transform type is (it isn't re-estimating
+# them itself)
+
+
+niters=8
+nj=4
+boost=0.0
+lrate=0.01
+tau=200 # Note: we're doing smoothing "to the previous iteration"
+    # --smooth-from-model so 200 seems like a more sensible default
+    # than 100.  We smooth to the previous iteration because now
+    # we are discriminatively training the features (and not using
+    # the indirect differential), so it seems like it wouldn't make 
+    # sense to use any element of ML.
+ngauss=400
+merge=true # if true, cancel num and den counts as described in 
+    # the boosted MMI paper. 
+
+
+cmd=scripts/run.pl
+acwt=0.1
+stage=-1
+
+for x in `seq 9`; do  # 9 options below; 9 passes let them be given in any order.
+  if [ "$1" == "--num-jobs" ]; then  # "$1" is quoted so an empty/absent arg cannot break the test.
+    shift; nj=$1; shift
+  fi
+  if [ "$1" == "--learning-rate" ]; then
+    shift; lrate=$1; shift
+  fi
+  if [ "$1" == "--num-gauss" ]; then
+    shift; ngauss=$1; shift  # #Gauss in GMM for fMPE.
+  fi
+  if [ "$1" == "--num-iters" ]; then
+    shift; niters=$1; shift
+  fi
+  if [ "$1" == "--boost" ]; then
+    shift; boost=$1; shift
+  fi
+  if [ "$1" == "--cmd" ]; then
+    shift; cmd=$1; shift
+    [ -z "$cmd" ] && echo Empty argument to --cmd option && exit 1;
+  fi  
+  if [ "$1" == "--acwt" ]; then
+    shift; acwt=$1; shift
+  fi  
+  if [ "$1" == "--tau" ]; then
+    shift; tau=$1; shift
+  fi  
+  if [ "$1" == "--stage" ]; then # used for finishing partial runs.
+    shift; stage=$1; shift
+  fi  
+done
+
+if [ $# != 7 ]; then
+   echo "Usage: steps/train_lda_etc_mmi_fmmi.sh <data-dir> <lang-dir> <ali-dir> <dubm-dir> <denlat-dir> <model-dir> <exp-dir>"
+   echo " e.g.: steps/train_lda_etc_mmi_fmmi.sh data/train_si84 data/lang exp/tri2b_ali_si84 exp/ubm2d exp/tri2b_denlats_si84 exp/tri2b exp/tri2b_fmmi"
+   exit 1;
+fi
+
+if [ -f path.sh ]; then . path.sh; fi
+
+data=$1
+lang=$2
+alidir=$3
+dubmdir=$4  # where diagonal UBM is.
+denlatdir=$5
+srcdir=$6 # may be same model as in alidir, but may not be, e.g.
+      # if you want to test MMI with different #iters.
+dir=$7
+silphonelist=`cat $lang/silphones.csl`
+ngselect=2; # Just the 2 top Gaussians.  Beyond that wouldn't make much
+   # difference since the posteriors would be very small.
+mkdir -p $dir/log
+
+if [ ! -f $srcdir/final.mdl -o ! -f $srcdir/final.mat ]; then  # these inputs are read from $srcdir, not $alidir.
+  echo "Error: model dir $srcdir does not contain one of final.mdl or final.mat"
+  exit 1;
+fi
+cp $srcdir/final.mat $srcdir/tree $dir  # keep the transform and tree next to the models written to $dir.
+
+n=`get_splits.pl $nj | awk '{print $1}'`
+if [ -f $alidir/$n.trans ]; then
+  use_trans=true
+  echo Using transforms from directory $alidir
+else
+  echo No transforms present in alignment directory: assuming speaker independent.
+  use_trans=false
+fi
+
+# Note: ${basefeatspart[$n]} is the features before fMPE.
+
+for n in `get_splits.pl $nj`; do
+  basefeatspart[$n]="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$alidir/$n.cmvn scp:$data/split$nj/$n/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
+  $use_trans && basefeatspart[$n]="${basefeatspart[$n]} transform-feats --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$alidir/$n.trans ark:- ark:- |"
+  featspart[$n]="${basefeatspart[$n]}" # before 1st iter of fMPE..
+
+  [ ! -f $denlatdir/lat.$n.gz ] && echo No such file $denlatdir/lat.$n.gz && exit 1;
+  latspart[$n]="ark:gunzip -c $denlatdir/lat.$n.gz|"
+  # note: in next line, doesn't matter which model we use, it's only used to map to phones.
+  [ $boost != "0.0" -a $boost != "0" ] && latspart[$n]="${latspart[$n]} lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/$n.ali.gz|' ark:- |"
+done
+
+
+# Initialize the fMPE object.  Note: we call it .fmpe because
+# that's what it was called in the original paper, but since
+# we're using the MMI objective function, it's really fMMI.
+fmpe-init $dubmdir/final.dubm $dir/0.fmpe || exit 1;
+
+rm $dir/.error 2>/dev/null
+
+if [ $stage -le -1 ]; then
+# Get the gselect (Gaussian selection) info for fMPE.
+# Note: fMPE object starts with GMM object, so can be read
+# as one.
+  for n in `get_splits.pl $nj`; do
+    $cmd $dir/log/gselect.$n.log \
+      gmm-gselect --n=$ngselect $dir/0.fmpe "${featspart[$n]}" \
+      "ark:|gzip -c >$dir/gselect.$n.gz" || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "Error in Gaussian selection phase" && exit 1;
+fi
+
+
+cur_mdl=$srcdir/final.mdl
+cur_fmpe=$dir/0.fmpe
+x=0
+while [ $x -lt $niters ]; do
+  if [ $[$x%2] == 0 ]; then
+    echo "Iteration $x: doing fMMI"
+    if [ $stage -le $x ]; then
+      for n in `get_splits.pl $nj`; do  
+        numpost="ark,s,cs:gunzip -c $alidir/$n.ali.gz| ali-to-post ark:- ark:-|"
+        # Note: the command gmm-fmpe-acc-stats below requires the "base" features
+        # (without fMPE), not the fMPE features.
+        $cmd $dir/log/acc_fmmi.$x.$n.log \
+         gmm-rescore-lattice $cur_mdl "${latspart[$n]}" "${featspart[$n]}" ark:- \| \
+          lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
+          sum-post --scale1=-1 ark:- "$numpost" ark:- \| \
+          gmm-fmpe-acc-stats $cur_mdl $cur_fmpe "${basefeatspart[$n]}" \
+           "ark,s,cs:gunzip -c $dir/gselect.$n.gz|" ark,s,cs:- \
+           $dir/$x.$n.fmpe_acc || touch $dir/.error &
+      done
+      wait
+      [ -f $dir/.error ] && echo Error doing fMPE accumulation && exit 1;
+      ( sum-matrices $dir/$x.fmpe_acc $dir/$x.*.fmpe_acc && \
+        rm $dir/$x.*.fmpe_acc && \
+        fmpe-est --learning-rate=$lrate $cur_fmpe $dir/$x.fmpe_acc $dir/$[$x+1].fmpe ) \
+       2>$dir/log/est_fmpe.$x.log || exit 1;
+      rm $dir/$[$x+1].mdl 2>/dev/null
+    fi
+    # We need to set the features to use the correct fMPE object.
+    for n in `get_splits.pl $nj`; do
+      featspart[$n]="${basefeatspart[$n]} fmpe-apply-transform $dir/$[$x+1].fmpe ark:- 'ark,s,cs:gunzip -c $dir/gselect.$n.gz|' ark:- |" 
+    done      
+    cur_fmpe=$dir/$[$x+1].fmpe
+    # Now, diagnostics.
+    objf=`grep Overall $dir/log/acc_fmmi.$x.*.log | grep gmm-fmpe-acc-stats | awk '{ p+=$10*$12; nf+=$12; } END{print p/nf;}'`
+    nf=`grep Overall $dir/log/acc_fmmi.$x.*.log | grep gmm-fmpe-acc-stats | awk '{ nf+=$12; } END{print nf;}'`
+    impr=`grep Objf $dir/log/est_fmpe.$x.log | awk '{print $NF}'`
+    impr=`perl -e "print ($impr/$nf);"` # normalize by #frames.
+    echo On iter $x, objf was $objf, auxf improvement from fMMI was $impr | tee $dir/objf.$x.log
+  else
+    echo "Iteration $x: doing MMI (getting stats)..."
+    # Get denominator and numerator stats together...  For simplicity we rescore the lattice
+    # on all iterations, even though it shouldn't be necessary on the zeroth
+    # (but we want this script to work even if $srcdir doesn't contain the
+    # model used to generate the lattice).
+    if [ $stage -le $x ]; then
+      for n in `get_splits.pl $nj`; do  
+        $cmd $dir/log/acc.$x.$n.log \
+          gmm-rescore-lattice $cur_mdl "${latspart[$n]}" "${featspart[$n]}" ark:- \| \
+          lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
+          sum-post --merge=$merge --scale1=-1 \
+          ark:- "ark,s,cs:gunzip -c $alidir/$n.ali.gz | ali-to-post ark:- ark:- |" ark:- \| \
+          gmm-acc-stats2 $cur_mdl "${featspart[$n]}" ark,s,cs:- \
+          $dir/num_acc.$x.$n.acc $dir/den_acc.$x.$n.acc  || touch $dir/.error &
+      done 
+      wait
+      [ -f $dir/.error ] && echo Error accumulating stats on iter $x && exit 1;
+      $cmd $dir/log/den_acc_sum.$x.log \
+        gmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1;
+      rm $dir/den_acc.$x.*.acc
+      $cmd $dir/log/num_acc_sum.$x.log \
+        gmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1;
+      rm $dir/num_acc.$x.*.acc
+
+
+      # note: this tau value is for smoothing to model parameters;
+      # you need to use gmm-ismooth-stats to smooth to the ML stats,
+      # but anyway this script does canceling of num and den stats on
+      # each frame (as suggested in the Boosted MMI paper) which would
+      # make smoothing to ML impossible without accumulating extra stats.
+      $cmd $dir/log/update.$x.log \
+        gmm-est-gaussians-ebw --tau=$tau $cur_mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc - \| \
+        gmm-est-weights-ebw - $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1;
+    else 
+      echo "not doing this iteration because --stage=$stage"
+    fi
+  
+    # Some diagnostics.. note, this objf is somewhat comparable to the
+    # MMI objective function divided by the acoustic weight, and differences in it
+    # are comparable to the auxf improvement printed by the update program.
+    objf=`grep Overall $dir/log/acc.$x.*.log | grep gmm-acc-stats2 | awk '{ p+=$10*$12; nf+=$12; } END{print p/nf;}'`
+    nf=`grep Overall $dir/log/acc.$x.*.log | grep gmm-acc-stats2 | awk '{ nf+=$12; } END{print nf;}'`
+    impr=`grep Overall $dir/log/update.$x.log | head -1 | awk '{print $10*$12;}'`
+    impr=`perl -e "print ($impr/$nf);"` # renormalize by "real" #frames, to correct
+    # for the canceling of stats.
+    echo On iter $x, objf was $objf, auxf improvement was $impr | tee $dir/objf.$x.log
+    cur_mdl=$dir/$[$x+1].mdl
+  fi
+  x=$[$x+1]
+done
+
+echo "Succeeded with $niters iterations of MMI+fMMI training (boosting factor = $boost)"
+
+( cd $dir; rm final.mdl 2>/dev/null; ln -s `basename $cur_mdl` final.mdl;
+  rm final.fmpe 2>/dev/null; ln -s `basename $cur_fmpe` final.fmpe )
+
+# Now do some cleanup.
+rm $dir/gselect.*.gz $dir/*.acc $dir/*.fmpe_acc
--- a/egs/wsj/s3/steps/train_ubm_lda_etc.sh
+++ b/egs/wsj/s3/steps/train_ubm_lda_etc.sh
@ -22,6 +22,7 @@

 nj=4
 cmd=scripts/run.pl
+silweight=
 for x in 1 2; do
  if [ $1 == "--num-jobs" ]; then
     shift
@ -33,6 +34,11 @@ for x in 1 2; do
     cmd=$1
     shift
  fi  
+  if [ $1 == "--silence-weight" ]; then
+     shift
+     silweight=$1 # e.g. to weight down silence in training.
+     shift
+  fi  
 done

 if [ $# != 5 ]; then
@ -48,6 +54,7 @@ data=$2
 lang=$3
 alidir=$4
 dir=$5
+silphonelist=`cat $lang/silphones.csl`

 mkdir -p $dir/log

@ -63,6 +70,9 @@ for n in `get_splits.pl $nj`; do
  if [ -f $alidir/$n1.trans ]; then
    featspart[$n]="${featspart[$n]} transform-feats --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$alidir/$n.trans ark:- ark:- |"
  fi
+  if [ ! -z "$silweight" ]; then  # optional: per-frame weights derived from the alignments, with silence scaled by $silweight.
+    weightspart[$n]="--weights='ark,s,cs:gunzip -c $alidir/$n.ali.gz | ali-to-post ark:- ark:- | weight-silence-post $silweight $silphonelist $alidir/final.mdl ark:- ark:- | post-to-weights ark:- ark:- |'"
+  fi
 done

 ngselect1=50
@ -98,7 +108,7 @@ for x in 0 1 2 3; do
    $cmd $dir/log/acc.$x.$n.log \
      gmm-gselect --n=$ngselect2 "--gselect=ark,s,cs:gunzip -c $dir/gselect_diag.$n.gz|" \
        "fgmm-global-to-gmm $dir/$x.ubm - |" "${featspart[$n]}" ark:- \| \
-      fgmm-global-acc-stats --gselect=ark,s,cs:- $dir/$x.ubm "${featspart[$n]}" \
+      fgmm-global-acc-stats ${weightspart[$n]} --gselect=ark,s,cs:- $dir/$x.ubm "${featspart[$n]}" \
        $dir/$x.$n.acc || touch $dir/.error &
  done
  wait
--- a/src/bin/build-tree-two-level.cc
+++ b/src/bin/build-tree-two-level.cc
@ -63,7 +63,7 @@ int main(int argc, char *argv[]) {
        "e.g.: \n"
        " build-tree-two-level treeacc roots.txt 1.qst topo tree tree.map\n";

-    bool binary = false;
+    bool binary = true;
    int32 P = 1, N = 3;

    bool cluster_leaves = true;
--- a/src/bin/sum-post.cc
+++ b/src/bin/sum-post.cc
@ -39,6 +39,7 @@ void ScalePosteriors(BaseFloat scale, Posterior *post) {
 // note: Posterior is vector<vector<pair<int,BaseFloat> > >
 void MergePosteriors(const Posterior &post1,
                     const Posterior &post2,
+                     bool merge,
                     Posterior *post) {
  KALDI_ASSERT(post1.size() == post2.size()); // precondition.
  post->resize(post1.size());
@ -49,10 +50,14 @@ void MergePosteriors(const Posterior &post1,
                      post1[i].begin(), post1[i].end());
    (*post)[i].insert((*post)[i].end(),
                      post2[i].begin(), post2[i].end());
-    MergePairVectorSumming(&((*post)[i])); // This sorts on
-    // the transition-id merges the entries with the same
-    // key (i.e. same .first element; same transition-id), and
-    // gets rid of entries with zero .second element.
+    if (merge) { // combine and sum up entries with same transition-id.
+      MergePairVectorSumming(&((*post)[i])); // This sorts on
+      // the transition-id, merges the entries with the same
+      // key (i.e. same .first element; same transition-id), and
+      // gets rid of entries with zero .second element.
+    } else { // no merging: just sort the entries, to keep them tidy.
+      std::sort( (*post)[i].begin(), (*post)[i].end() );
+    }
  }
 }

@ -70,10 +75,12 @@ int main(int argc, char *argv[]) {
        "Usage: sum-post post-rspecifier1 post-rspecifier2 post-wspecifier\n";

    BaseFloat scale1 = 1.0, scale2 = 1.0;
-
+    bool merge = true;
    ParseOptions po(usage);
    po.Register("scale1", &scale1, "Scale for first set of posteriors");
    po.Register("scale2", &scale2, "Scale for second set of posteriors");
+    po.Register("merge", &merge, "If true, merge posterior entries for "
+                "same transition-id (canceling positive and negative parts)");
    po.Read(argc, argv);

    if (po.NumArgs() != 3) {
@ -111,7 +118,7 @@ int main(int argc, char *argv[]) {
      ScalePosteriors(scale1, &posterior1);
      ScalePosteriors(scale2, &posterior2);
      kaldi::Posterior posterior_out;
-      MergePosteriors(posterior1, posterior2, &posterior_out);
+      MergePosteriors(posterior1, posterior2, merge, &posterior_out);
      posterior_writer.Write(key, posterior_out);
      num_done++;
    }
--- a/src/featbin/Makefile
+++ b/src/featbin/Makefile
@ -3,10 +3,11 @@ all:
 EXTRA_CXXFLAGS = -Wno-sign-compare
 include ../kaldi.mk

-BINFILES = compute-mfcc-feats compute-plp-feats compute-fbank-feats compute-cmvn-stats \
-           add-deltas remove-mean apply-cmvn transform-feats copy-feats compose-transforms \
-           splice-feats extract-segments subset-feats feat-to-len feat-to-dim \
-           fmpe-apply-transform fmpe-acc-stats fmpe-init fmpe-update 
+BINFILES = compute-mfcc-feats compute-plp-feats compute-fbank-feats \
+    compute-cmvn-stats add-deltas remove-mean apply-cmvn transform-feats \
+    copy-feats compose-transforms splice-feats extract-segments subset-feats \
+    feat-to-len feat-to-dim fmpe-apply-transform fmpe-acc-stats fmpe-init \
+    fmpe-est fmpe-copy


 OBJFILES = 
@ -17,8 +18,8 @@ all:  $(BINFILES)
 TESTFILES =

 $(BINFILES): ../feat/kaldi-feature.a ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
- 	     ../tree/kaldi-tree.a ../matrix/kaldi-matrix.a ../util/kaldi-util.a \
-             ../base/kaldi-base.a
+         ../tree/kaldi-tree.a ../matrix/kaldi-matrix.a ../util/kaldi-util.a \
+         ../base/kaldi-base.a

 # Rule below would expand to, e.g.:
 # ../base/kaldi-base.a:
--- a/src/featbin/fmpe-acc-stats.cc
+++ b/src/featbin/fmpe-acc-stats.cc
@ -21,6 +21,7 @@

 int main(int argc, char *argv[]) {
  using namespace kaldi;
+  using kaldi::int32;
  try {
    const char *usage =
        "Apply fMPE transform to features\n"
@ -55,13 +56,13 @@ int main(int argc, char *argv[]) {
    RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);

    // fmpe stats...
-    Matrix<BaseFloat> stats(fmpe.ProjectionNumRows() * 2,
-                            fmpe.ProjectionNumCols());
-    SubMatrix<BaseFloat> stats_plus(stats, 0, fmpe.ProjectionNumRows(),
-                                    0, fmpe.ProjectionNumCols());
-    SubMatrix<BaseFloat> stats_minus(stats, fmpe.ProjectionNumRows(),
-                                    fmpe.ProjectionNumRows(),
-                                    0, fmpe.ProjectionNumCols());
+    Matrix<BaseFloat> stats(fmpe.ProjectionTNumRows() * 2,
+                            fmpe.ProjectionTNumCols());
+    SubMatrix<BaseFloat> stats_plus(stats, 0, fmpe.ProjectionTNumRows(),
+                                    0, fmpe.ProjectionTNumCols());
+    SubMatrix<BaseFloat> stats_minus(stats, fmpe.ProjectionTNumRows(),
+                                    fmpe.ProjectionTNumRows(),
+                                    0, fmpe.ProjectionTNumCols());
    
    int32 num_done = 0, num_err = 0;
    
--- a/src/featbin/fmpe-apply-transform.cc
+++ b/src/featbin/fmpe-apply-transform.cc
@ -1,6 +1,6 @@
 // featbin/fmpe-apply-transform.cc

-// Copyright 2012  Daniel Povey
+// Copyright 2012  Daniel Povey  Yanmin Qian

 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -21,6 +21,7 @@

 int main(int argc, char *argv[]) {
  using namespace kaldi;
+  using kaldi::int32;
  try {
    const char *usage =
        "Apply fMPE transform to features\n"
@ -34,7 +35,7 @@ int main(int argc, char *argv[]) {
    // no non-default options.
    po.Read(argc, argv);

-    if (po.NumArgs() != 3) {
+    if (po.NumArgs() != 4) {
      po.PrintUsage();
      exit(1);
    }
--- a/src/featbin/fmpe-copy.cc
+++ b/src/featbin/fmpe-copy.cc
@ -0,0 +1,62 @@
+// featbin/fmpe-copy.cc
+
+// Copyright 2012  Daniel Povey  Yanmin Qian
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "transform/fmpe.h"
+
+int main(int argc, char *argv[]) {  // Reads an fMPE object and writes it back out (e.g. binary -> text).
+  using namespace kaldi;
+  try {
+    const char *usage =
+        "Copy fMPE transform\n"
+        "Usage: fmpe-copy [options...] <fmpe-in> <fmpe-out>\n"
+        "E.g. fmpe-copy --binary=false 1.fmpe text.fmpe\n";
+
+    ParseOptions po(usage);
+    FmpeOptions opts;  // NOTE(review): registered below but not otherwise used in this program.
+    bool binary = true;
+    po.Register("binary", &binary, "If true, output fMPE object in binary mode.");
+    opts.Register(&po);
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 2) {  // exactly <fmpe-in> and <fmpe-out> are required.
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string fmpe_rxfilename = po.GetArg(1),
+        fmpe_wxfilename = po.GetArg(2);
+
+    Fmpe fmpe;
+    {  // scope the Input so it is closed before the output is opened.
+      bool binary_in;  // set by Input to the mode detected for the input file.
+      Input ki(fmpe_rxfilename, &binary_in);
+      fmpe.Read(ki.Stream(), binary_in);
+    }
+
+    // Write the object in the mode selected by --binary.
+    Output ko(fmpe_wxfilename, binary);
+    fmpe.Write(ko.Stream(), binary);
+
+    KALDI_LOG << "Copied fMPE object to " << fmpe_wxfilename;
+    return 0;
+  } catch(const std::exception& e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
--- a/src/featbin/fmpe-update.cc
+++ b/src/featbin/fmpe-update.cc
@ -1,6 +1,6 @@
-// featbin/fmpe-update.cc
+// featbin/fmpe-est.cc

-// Copyright 2012  Daniel Povey
+// Copyright 2012  Daniel Povey  Yanmin Qian

 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -24,8 +24,8 @@ int main(int argc, char *argv[]) {
  try {
    const char *usage =
        "Initialize fMPE transform (to zeo)\n"
-        "Usage: fmpe-update [options...] <fmpe-in> <stats-in> <fmpe-out>\n"
-        "E.g. fmpe-update 1.fmpe 1.accs 2.fmpe\n";
+        "Usage: fmpe-est [options...] <fmpe-in> <stats-in> <fmpe-out>\n"
+        "E.g. fmpe-est 1.fmpe 1.accs 2.fmpe\n";

    ParseOptions po(usage);
    FmpeUpdateOptions opts;
@ -58,18 +58,18 @@ int main(int argc, char *argv[]) {
    }
    // the matrix is in two parts, for the "plus" and "minus"
    // parts of the gradient that we stored separately.
-    SubMatrix<BaseFloat> stats_plus(stats, 0, fmpe.ProjectionNumRows(),
-                                    0, fmpe.ProjectionNumCols());
-    SubMatrix<BaseFloat> stats_minus(stats, fmpe.ProjectionNumRows(),
-                                    fmpe.ProjectionNumRows(),
-                                    0, fmpe.ProjectionNumCols());
+    SubMatrix<BaseFloat> stats_plus(stats, 0, fmpe.ProjectionTNumRows(),
+                                    0, fmpe.ProjectionTNumCols());
+    SubMatrix<BaseFloat> stats_minus(stats, fmpe.ProjectionTNumRows(),
+                                    fmpe.ProjectionTNumRows(),
+                                    0, fmpe.ProjectionTNumCols());
    
    fmpe.Update(opts, stats_plus, stats_minus);

    Output ko(fmpe_wxfilename, binary);
    fmpe.Write(ko.Stream(), binary);

-    KALDI_LOG << "Initialized fMPE object and wrote to"
+    KALDI_LOG << "Updated fMPE object and wrote to "
              << fmpe_wxfilename;
    return 0;
  } catch(const std::exception& e) {
--- a/src/featbin/fmpe-init.cc
+++ b/src/featbin/fmpe-init.cc
@ -1,6 +1,6 @@
 // featbin/fmpe-init.cc

-// Copyright 2012  Daniel Povey
+// Copyright 2012  Daniel Povey  Yanmin Qian

 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -23,7 +23,7 @@ int main(int argc, char *argv[]) {
  using namespace kaldi;
  try {
    const char *usage =
-        "Initialize fMPE transform (to zeo)\n"
+        "Initialize fMPE transform (to zero)\n"
        "Usage: fmpe-init [options...] <diag-gmm-in> <fmpe-out>\n"
        "E.g. fmpe-init 1.ubm 1.fmpe\n";

@ -55,7 +55,7 @@ int main(int argc, char *argv[]) {
    Output ko(fmpe_wxfilename, binary);
    fmpe.Write(ko.Stream(), binary);

-    KALDI_LOG << "Initialized fMPE object and wrote to"
+    KALDI_LOG << "Initialized fMPE object and wrote to "
              << fmpe_wxfilename;
    return 0;
  } catch(const std::exception& e) {
--- a/src/gmm/Makefile
+++ b/src/gmm/Makefile
@ -8,7 +8,7 @@ TESTFILES = diag-gmm-test mle-diag-gmm-test full-gmm-test mle-full-gmm-test \
 		am-diag-gmm-test ebw-diag-gmm-test

 OBJFILES = diag-gmm.o diag-gmm-normal.o mle-diag-gmm.o am-diag-gmm.o mle-am-diag-gmm.o \
-		full-gmm.o full-gmm-normal.o mle-full-gmm.o fmpe-am-diag-gmm.o model-common.o \
+		full-gmm.o full-gmm-normal.o mle-full-gmm.o model-common.o \
 		model-test-common.o ebw-diag-gmm.o

 LIBFILE = kaldi-gmm.a
--- a/src/gmm/ebw-diag-gmm.cc
+++ b/src/gmm/ebw-diag-gmm.cc
@ -148,11 +148,10 @@ void UpdateEbwDiagGmm(const AccumDiagGmm &num_stats, // with I-smoothing, if use
      if (den_has_stats)
        var_stats.AddVec(-1.0, den_stats.variance_accumulator().Row(g));
    }
-    double D = opts.E * den_count / 2; // E*gamma_den/2 where E = 2;
-    // We initialize to half the value of D that would be dictated by
-    // E; this is part of the strategy used to ensure that the value of
-    // D we use is at least twice the value that would ensure positive
-    // variances.
+    double D = (opts.tau + opts.E * den_count) / 2;
+    // We initialize to half the value of D that would be dictated by E (and
+    // tau); this is part of the strategy used to ensure that the value of D we
+    // use is at least twice the value that would ensure positive variances.

    int32 iter, max_iter = 100;
    for (iter = 0; iter < max_iter; iter++) { // will normally break from the loop
@ -184,7 +183,7 @@ void UpdateEbwDiagGmm(const AccumDiagGmm &num_stats, // with I-smoothing, if use
        D *= 1.1; 
      }
    }
-    if (iter > 0 && num_floored_out != NULL) *num_floored_out++;
+    if (iter > 0 && num_floored_out != NULL) (*num_floored_out)++;
    if (iter == max_iter) KALDI_WARN << "Dropped off end of loop, recomputing D. (unexpected.)";
  }
  // copy to natural representation according to flags.
--- a/src/gmm/ebw-diag-gmm.h
+++ b/src/gmm/ebw-diag-gmm.h
@ -31,10 +31,14 @@ namespace kaldi {
 // Options for Extended Baum-Welch Gaussian update.
 struct EbwOptions {
  BaseFloat E;
-  EbwOptions(): E(2.0) { }
+  BaseFloat tau; // This is only useful for smoothing "to the model":
+  // if you want to smooth to ML stats, you need to use gmm-ismooth-stats
+  EbwOptions(): E(2.0), tau(0.0) { }
  void Register(ParseOptions *po) {
    std::string module = "EbwOptions: ";
    po->Register("E", &E, module+"Constant E for Extended Baum-Welch (EBW) update");
+    po->Register("tau", &tau, module+"Tau value for smoothing to the model "
+                 "parameters only (for smoothing to ML stats, use gmm-ismooth-stats)");
  }
 };

--- a/src/gmm/fmpe-am-diag-gmm.cc
+++ b/src/gmm/fmpe-am-diag-gmm.cc
@ -1,892 +0,0 @@
-// gmm/fmpe-am-diag-gmm.cc
-
-// Copyright 2009-2011  Yanmin Qian
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <vector>
-#include <set>
-#include <algorithm>
-
-#include "gmm/diag-gmm.h"
-#include "gmm/fmpe-am-diag-gmm.h"
-#include "util/stl-utils.h"
-#include "tree/clusterable-classes.h"
-#include "tree/cluster-utils.h"
-
-namespace kaldi {
-
-void FmpeAccumModelDiff::Read(std::istream &in_stream, bool binary) {
-  int32 dimension, num_components;
-  std::string token;
-
-  ExpectToken(in_stream, binary, "<FMPEMODELDIFFS>");
-  ExpectToken(in_stream, binary, "<VECSIZE>");
-  ReadBasicType(in_stream, binary, &dimension);
-  ExpectToken(in_stream, binary, "<NUMCOMPONENTS>");
-  ReadBasicType(in_stream, binary, &num_components);
-
-  Resize(num_components, dimension);
-
-  ReadToken(in_stream, binary, &token);
-  while (token != "</FMPEMODELDIFFS>") {
-    if (token == "<MLE_OCCUPANCY>") {
-      mle_occupancy_.Read(in_stream, binary);
-    } else if (token == "<MEANDIFFS>") {
-      mean_diff_accumulator_.Read(in_stream, binary);
-    } else if (token == "<DIAGVARDIFFS>") {
-      variance_diff_accumulator_.Read(in_stream, binary);
-    } else {
-      KALDI_ERR << "Unexpected token '" << token << "' in model file ";
-    }
-    ReadToken(in_stream, binary, &token);
-  }
-}
-
-void FmpeAccumModelDiff::Write(std::ostream &out_stream, bool binary) const {
-  WriteToken(out_stream, binary, "<FMPEMODELDIFFS>");
-  WriteToken(out_stream, binary, "<VECSIZE>");
-  WriteBasicType(out_stream, binary, dim_);
-  WriteToken(out_stream, binary, "<NUMCOMPONENTS>");
-  WriteBasicType(out_stream, binary, num_comp_);
-
-  // convert into BaseFloat before writing things
-  Vector<BaseFloat> occupancy_bf(mle_occupancy_.Dim());
-  Matrix<BaseFloat> mean_diff_accumulator_bf(mean_diff_accumulator_.NumRows(),
-      mean_diff_accumulator_.NumCols());
-  Matrix<BaseFloat> variance_diff_accumulator_bf(variance_diff_accumulator_.NumRows(),
-      variance_diff_accumulator_.NumCols());
-  occupancy_bf.CopyFromVec(mle_occupancy_);
-  mean_diff_accumulator_bf.CopyFromMat(mean_diff_accumulator_);
-  variance_diff_accumulator_bf.CopyFromMat(variance_diff_accumulator_);
-
-  WriteToken(out_stream, binary, "<MLE_OCCUPANCY>");
-  occupancy_bf.Write(out_stream, binary);
-  WriteToken(out_stream, binary, "<MEANDIFFS>");
-  mean_diff_accumulator_bf.Write(out_stream, binary);
-  WriteToken(out_stream, binary, "<DIAGVARDIFFS>");
-  variance_diff_accumulator_bf.Write(out_stream, binary);
-  WriteToken(out_stream, binary, "</FMPEMODELDIFFS>");
-}
-
-void FmpeAccumModelDiff::Resize(int32 num_comp, int32 dim) {
-  KALDI_ASSERT(num_comp > 0 && dim > 0);
-  num_comp_ = num_comp;
-  dim_ = dim;
-  mle_occupancy_.Resize(num_comp);
-  mean_diff_accumulator_.Resize(num_comp, dim);
-  variance_diff_accumulator_.Resize(num_comp, dim);
-}
-
-void FmpeAccumModelDiff::SetZero() {
-  mle_occupancy_.SetZero();
-  mean_diff_accumulator_.SetZero();
-  variance_diff_accumulator_.SetZero();
-}
-
-void FmpeAccumModelDiff::ComputeModelParaDiff(const DiagGmm& diag_gmm,
-                                              const AccumDiagGmm& num_acc,
-                                              const AccumDiagGmm& den_acc,
-                                              const AccumDiagGmm& mle_acc) {
-  KALDI_ASSERT(num_acc.NumGauss() == num_comp_ && num_acc.Dim() == dim_);
-  KALDI_ASSERT(den_acc.NumGauss() == num_comp_); // den_acc.Dim() may not be defined,
-  // if we used the "compressed form" of accs where den only has counts.
-  KALDI_ASSERT(mle_acc.NumGauss() == num_comp_ && mle_acc.Dim() == dim_);
-
-  Matrix<double> mean_diff_tmp(num_comp_, dim_);
-  Matrix<double> var_diff_tmp(num_comp_, dim_);
-  Matrix<double> mat_tmp(num_comp_, dim_);
-  Vector<double> occ_diff(num_comp_);
-  Matrix<double> means_invvars(num_comp_, dim_);
-  Matrix<double> inv_vars(num_comp_, dim_);
-
-  occ_diff.CopyFromVec(num_acc.occupancy());
-  occ_diff.AddVec(-1.0, den_acc.occupancy());
-
-  means_invvars.CopyFromMat(diag_gmm.means_invvars(), kNoTrans);
-  inv_vars.CopyFromMat(diag_gmm.inv_vars(), kNoTrans);
-  /// compute the means differentials first
-  mean_diff_tmp.CopyFromMat(num_acc.mean_accumulator(), kNoTrans);
-  if (den_acc.Flags() & kGmmMeans) // probably will be false.
-    mean_diff_tmp.AddMat(-1.0, den_acc.mean_accumulator(), kNoTrans);
-  mean_diff_tmp.MulElements(inv_vars);
-
-  mat_tmp.CopyFromMat(means_invvars, kNoTrans);
-  mat_tmp.MulRowsVec(occ_diff);
-
-  mean_diff_tmp.AddMat(-1.0, mat_tmp, kNoTrans);
-
-  /// compute the means differetials
-  mean_diff_accumulator_.CopyFromMat(mean_diff_tmp, kNoTrans);
-
-  /// compute the vars differentials second
-  var_diff_tmp.CopyFromMat(num_acc.variance_accumulator(), kNoTrans);
-  if (den_acc.Flags() & kGmmVariances) // probably will be false.
-    var_diff_tmp.AddMat(-1.0, den_acc.variance_accumulator(), kNoTrans);
-
-  var_diff_tmp.MulElements(inv_vars);
-  var_diff_tmp.MulElements(inv_vars);
-                      
-  mat_tmp.CopyFromMat(num_acc.mean_accumulator(), kNoTrans);
-  if (den_acc.Flags() & kGmmMeans) // probably will be false.
-    mat_tmp.AddMat(-1.0, den_acc.mean_accumulator(), kNoTrans);
-  mat_tmp.MulElements(inv_vars);
-  mat_tmp.MulElements(means_invvars);
-
-  var_diff_tmp.AddMat(-2.0, mat_tmp, kNoTrans);
-
-  mat_tmp.CopyFromMat(means_invvars, kNoTrans);
-  mat_tmp.MulElements(means_invvars);
-  mat_tmp.AddMat(-1.0, inv_vars, kNoTrans);
-  mat_tmp.MulRowsVec(occ_diff);
-
-  var_diff_tmp.AddMat(1.0, mat_tmp, kNoTrans);
-  var_diff_tmp.Scale(0.5);
-
-  /// compute the vars differentials
-  variance_diff_accumulator_.CopyFromMat(var_diff_tmp, kNoTrans);
-
-  /// copy to obtain the mle occupation probapility
-  mle_occupancy_.CopyFromVec(mle_acc.occupancy());
-}
-
-void FmpeAccs::Write(std::ostream &out_stream, bool binary) const {
-  uint32 tmp_uint32;
-
-  WriteToken(out_stream, binary, "<FMPEACCS>");
-
-  WriteToken(out_stream, binary, "<NumGaussians>");
-  tmp_uint32 = static_cast<uint32>(config_.gmm_num_comps);
-  WriteBasicType(out_stream, binary, tmp_uint32);
-  WriteToken(out_stream, binary, "<LengthContextExp>");
-  tmp_uint32 = static_cast<uint32>(config_.context_windows.NumRows());
-  WriteBasicType(out_stream, binary, tmp_uint32);
-  WriteToken(out_stream, binary, "<DIMENSION>");
-  WriteBasicType(out_stream, binary, dim_);
-  if (!binary) out_stream << "\n";
-
-  // convert into BaseFloat before writing things
-  Matrix<BaseFloat> mat_bf(dim_, dim_ + 1);
-
-  if (p_.size() != 0) {
-    WriteToken(out_stream, binary, "<P>");
-    for (int32 i = 0; i < config_.gmm_num_comps; ++i) {
-      for (int32 j = 0; j < config_.context_windows.NumRows(); ++j) {
-		mat_bf.CopyFromMat(p_[i][j], kNoTrans);
-        mat_bf.Write(out_stream, binary);
-	  }
-    }
-  }
-  if (n_.size() != 0) {
-    WriteToken(out_stream, binary, "<N>");
-    for (int32 i = 0; i < config_.gmm_num_comps; ++i) {
-      for (int32 j = 0; j < config_.context_windows.NumRows(); ++j) {
-		mat_bf.CopyFromMat(n_[i][j], kNoTrans);
-        mat_bf.Write(out_stream, binary);
-	  }
-    }
-  }
-
-  // convert into BaseFloat before writing things
-  Vector<BaseFloat> diff_bf(diff_.Dim());
-  Vector<BaseFloat> direct_diff_bf(direct_diff_.Dim());
-  Vector<BaseFloat> indirect_diff_bf(indirect_diff_.Dim());
-  diff_bf.CopyFromVec(diff_);
-  direct_diff_bf.CopyFromVec(direct_diff_);
-  indirect_diff_bf.CopyFromVec(indirect_diff_);
-
-  WriteToken(out_stream, binary, "<DIFFERENTIAL>");
-  diff_bf.Write(out_stream, binary);
-  WriteToken(out_stream, binary, "<DIRECTDIFFERENTIAL>");
-  direct_diff_bf.Write(out_stream, binary);
-  WriteToken(out_stream, binary, "<INDIRECTDIFFERENTIAL>");
-  indirect_diff_bf.Write(out_stream, binary);
-
-  WriteToken(out_stream, binary, "</FMPEACCS>");
-}
-
-void FmpeAccs::Read(std::istream &in_stream, bool binary,
-                         bool add) {
-  uint32 tmp_uint32;
-  std::string token;
-
-  ExpectToken(in_stream, binary, "<FMPACCS>");
-
-  ExpectToken(in_stream, binary, "<NumGaussians>");
-  ReadBasicType(in_stream, binary, &tmp_uint32);
-  int32 num_gaussians = static_cast<int32>(tmp_uint32);
-  ExpectToken(in_stream, binary, "<LengthContExp>");
-  ReadBasicType(in_stream, binary, &tmp_uint32);
-  int32 length_cont_exp = static_cast<int32>(tmp_uint32);
-  ExpectToken(in_stream, binary, "<DIMENSION>");
-  ReadBasicType(in_stream, binary, &dim_);
-
-  ReadToken(in_stream, binary, &token);
-
-  while (token != "</FMPEACCS>") {
-    if (token == "<P>") {
-      p_.resize(num_gaussians);
-      for (size_t i = 0; i < p_.size(); ++i) {
-        p_[i].resize(length_cont_exp);
-		for (size_t j = 0; j < p_[i].size(); ++j) {
-          p_[i][j].Read(in_stream, binary, add);
-		}
-      }
-    } else if (token == "<N>") {
-      n_.resize(num_gaussians);
-      for (size_t i = 0; i < n_.size(); ++i) {
-        n_[i].resize(length_cont_exp);
-		for (size_t j = 0; j < n_[i].size(); ++j) {
-          n_[i][j].Read(in_stream, binary, add);
-		}
-      }
-    } else if (token == "<DIFFERENTIALS>") {
-      diff_.Read(in_stream, binary, add);
-    } else if (token == "<DIRECTDIFFERENTIALS>") {
-      direct_diff_.Read(in_stream, binary, add);
-    } else if (token == "<INDIRECTDIFFERENTIALS>") {
-      indirect_diff_.Read(in_stream, binary, add);
-    } else {
-      KALDI_ERR << "Unexpected token '" << token << "' in model file ";
-    }
-    ReadToken(in_stream, binary, &token);
-  }
-}
-
-void FmpeAccs::ReadModelDiffs(std::istream &in_stream, bool binary) {
-  int32 num_pdfs;
-  int32 dim;
-  ExpectToken(in_stream, binary, "<DIMENSION>");
-  ReadBasicType(in_stream, binary, &dim);
-  ExpectToken(in_stream, binary, "<NUMPDFS>");
-  ReadBasicType(in_stream, binary, &num_pdfs);
-  KALDI_ASSERT((num_pdfs > 0) && (dim > 0));
-
-  if (model_diff_accumulators_.size() != static_cast<size_t> (num_pdfs))
-    KALDI_ERR << "Reading model differentials but num-pdfs do not match: "
-              << (model_diff_accumulators_.size()) << " vs. "
-              << (num_pdfs);
-  for (std::vector<FmpeAccumModelDiff*>::iterator it = model_diff_accumulators_.begin(),
-           end = model_diff_accumulators_.end(); it != end; ++it) {
-    (*it)->Read(in_stream, binary);
-  }
-
-}
-
-void FmpeAccs::InitPNandDiff(int32 num_gmm_gauss, int32 con_exp, int32 dim) {
-    p_.resize(num_gmm_gauss);
-    for (int32 i = 0; i < num_gmm_gauss; ++i) {
-      p_[i].resize(con_exp);
-      for (int32 j = 0; j < con_exp; ++j) {
-        p_[i][j].Resize(dim, dim + 1);
-      }
-    }
-
-    n_.resize(num_gmm_gauss);
-    for (int32 i = 0; i < num_gmm_gauss; ++i) {
-      n_[i].resize(con_exp);
-      for (int32 j = 0; j < con_exp; ++j) {
-        n_[i][j].Resize(dim, dim + 1);
-      }
-    }
-
-	diff_.Resize(dim);
-	direct_diff_.Resize(dim);
-	indirect_diff_.Resize(dim);
-}
-
-void FmpeAccs::InitModelDiff(const AmDiagGmm &model) {
-  DeletePointers(&model_diff_accumulators_);  // in case was non-empty when called.
-  model_diff_accumulators_.resize(model.NumPdfs(), NULL);
-  for (int32 i = 0; i < model.NumPdfs(); i++) {
-    model_diff_accumulators_[i] = new FmpeAccumModelDiff();
-    model_diff_accumulators_[i]->Resize(model.GetPdf(i));
-  }
-}
-
-/// Initialization, do InitModelDiff if true when accumulating,
-/// and otherwise don't do when sum accumulations
-void FmpeAccs::Init(const AmDiagGmm &am_model, bool update) {
-  dim_ = am_model.Dim();
-
-  InitPNandDiff(config_.gmm_num_comps, config_.context_windows.NumRows(), dim_);
-
-  if (update) {
-	InitModelDiff(am_model);
-  }
-}
-
-void FmpeAccs::InitializeGMMs(const DiagGmm &gmm, const DiagGmm &gmm_cluster_centers,
-                              std::vector<int32> &gaussian_cluster_center_map) {
-  gmm_.CopyFromDiagGmm(gmm);
-  gmm_cluster_centers_.CopyFromDiagGmm(gmm_cluster_centers);
-  gaussian_cluster_center_map_.resize(gaussian_cluster_center_map.size());
-  gaussian_cluster_center_map_ = gaussian_cluster_center_map;
-}
-
-void FmpeAccs::ComputeOneFrameOffsetFeature(const VectorBase<BaseFloat>& data,
-                        std::vector<std::pair<int32, Vector<double> > > *offset) const {
-  KALDI_ASSERT((data.Dim() == gmm_.Dim()) && (data.Dim() == gmm_cluster_centers_.Dim()));
-  KALDI_ASSERT((gmm_.NumGauss() != 0) && (gmm_cluster_centers_.NumGauss() != 0)
-               && (gmm_.NumGauss() > gmm_cluster_centers_.NumGauss())
-               && (config_.gmm_cluster_centers_nbest < gmm_cluster_centers_.NumGauss())
-               && (config_.gmm_gaussian_nbest < gmm_.NumGauss()))
-
-  int32 dim = data.Dim();
-  int32 num_gauss = gmm_.NumGauss();
-  int32 num_cluster_centers = gmm_cluster_centers_.NumGauss();
-  int32 gmm_cluster_centers_nbest = config_.gmm_cluster_centers_nbest;
-
-  std::set<int32> pruned_centers;
-  Vector<BaseFloat> loglikes(num_cluster_centers);
-  gmm_cluster_centers_.LogLikelihoods(data, &loglikes);
-  Vector<BaseFloat> loglikes_copy(loglikes);
-  BaseFloat *ptr = loglikes_copy.Data();
-  std::nth_element(ptr, ptr+num_cluster_centers-gmm_cluster_centers_nbest, ptr+num_cluster_centers);
-  BaseFloat thresh = ptr[num_cluster_centers-gmm_cluster_centers_nbest];
-  for (int32 g = 0; g < num_cluster_centers; g++) {
-    if (loglikes(g) >= thresh)
-      pruned_centers.insert(g);
-  }
-
-  std::vector< std::pair<double, int32> > pruned_gauss;
-  for (int32 gauss_index = 0; gauss_index < num_gauss; gauss_index++) {
-    int32 current_cluster = gaussian_cluster_center_map_[gauss_index];
-    if (pruned_centers.end() != pruned_centers.find(current_cluster)) {
-      double loglike = gmm_.ComponentLogLikelihood(data, gauss_index);
-      pruned_gauss.push_back(std::make_pair(loglike, gauss_index));
-    }
-  }
-  KALDI_ASSERT(!pruned_gauss.empty());
-
-  int32 gmm_gaussian_nbest = config_.gmm_gaussian_nbest;
-  std::nth_element(pruned_gauss.begin(),
-                   pruned_gauss.end() - gmm_gaussian_nbest,
-                   pruned_gauss.end());
-  pruned_gauss.erase(pruned_gauss.begin(),
-                     pruned_gauss.end() - gmm_gaussian_nbest);
-
-  double weight = 0.0;
-  for (int32 i = 0; i < pruned_gauss.size(); ++i) {
-    weight += exp(pruned_gauss[i].first);
-  }
-  for (int32 i = 0; i < pruned_gauss.size(); ++i) {
-    pruned_gauss[i].first = exp(pruned_gauss[i].first) / weight;
-  }
-
-  Vector<BaseFloat> tmp_offset(dim + 1);
-  SubVector<BaseFloat> sub_tmp_offset(tmp_offset, 1, dim);
-  Vector<BaseFloat> tmp_mean(dim);
-  Vector<BaseFloat> tmp_var(dim);
-  for (int32 i = 0; i < pruned_gauss.size(); ++i) {
-	tmp_offset(0) = pruned_gauss[i].first * 5.0;
-    sub_tmp_offset.CopyFromVec(data);
-    gmm_.GetComponentMean(pruned_gauss[i].second, &tmp_mean);
-    sub_tmp_offset.AddVec(-1.0, tmp_mean);
-    gmm_.GetComponentVariance(pruned_gauss[i].second, &tmp_var);
-    tmp_var.ApplyPow(0.5);
-    sub_tmp_offset.DivElemByElem(tmp_var);
-    sub_tmp_offset.Scale(pruned_gauss[i].first);
-
-    offset->push_back(std::make_pair(pruned_gauss[i].second, tmp_offset));
-  }
-}
-
-void FmpeAccs::ComputeWholeFileOffsetFeature(const MatrixBase<BaseFloat>& data,
-           std::vector<std::vector<std::pair<int32, Vector<double> > > > *whole_file_offset) const {
-  int32 nframe = data.NumRows();
-  whole_file_offset->reserve(nframe);
-
-  for (int32 i = 0; i < nframe; i++) {
-	std::vector<std::pair<int32, Vector<double> > > offset;
-    ComputeOneFrameOffsetFeature(data.Row(i), &offset);
-    whole_file_offset->push_back(offset);
-  }
-}
-
-bool Gauss_index_lower(std::pair<int32, Vector<double> > M,
-					   std::pair<int32, Vector<double> >  N) {
-  return M.first < N.first;
-}
-
-void FmpeAccs::ComputeContExpOffsetFeature(
-       const std::vector<std::vector<std::pair<int32, Vector<double> > >* > &offset_win,
-       std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > *ht) const {
-  KALDI_ASSERT((config_.context_windows.NumCols() == offset_win.size()));
-
-  std::vector<std::pair<int32, Vector<double> > > offset_tmp;
-  std::vector<std::pair<int32, Vector<double> > > offset_uniq_tmp;
-
-  for (int32 i = 0; i < config_.context_windows.NumRows(); i++) {
-	// for every context
-	for (int32 j = 0; j < config_.context_windows.NumCols(); j++) {
-	  if (config_.context_windows(i, j) > 0.0) {
-		if (offset_win[j]->empty() == 0) {
-		  for (int32 k = 0; k < offset_win[j]->size(); k++) {
-	        offset_tmp.push_back((*offset_win[j])[k]);
-	        offset_tmp.back().second.Scale(config_.context_windows(i, j));
-	      }
-		}
-	  }
-	}
-
-	if (offset_tmp.empty() == 0) {
-	  std::sort(offset_tmp.begin(), offset_tmp.end(), Gauss_index_lower);
-	  offset_uniq_tmp.push_back(offset_tmp[0]);
-	  for (int32 igauss = 1; igauss < offset_tmp.size(); igauss++) {
-	    if (offset_tmp[igauss].first == offset_tmp[igauss - 1].first) {
-		  offset_uniq_tmp.back().second.AddVec(1.0, offset_tmp[igauss].second);
-	    } else {
-          offset_uniq_tmp.push_back(offset_tmp[igauss]);
-	    }
-	  }
-
-	  ht->push_back(std::make_pair(i, offset_uniq_tmp));
-      offset_tmp.clear();
-	  offset_uniq_tmp.clear();
-	}
-  }
-}
-
-void FmpeAccs::ComputeHighDimemsionFeature(
-     const std::vector<std::vector<std::pair<int32, Vector<double> > > > &whole_file_offset_feat,
-	 int32 frame_index,
-     std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > *ht) const {
-  KALDI_ASSERT((frame_index >= 0) && (frame_index < whole_file_offset_feat.size()));
-
-  int32 lenght_context_windows = config_.context_windows.NumCols();
-  int32 half_len_win = lenght_context_windows / 2;
-  int32 num_frame = whole_file_offset_feat.size();
-  std::vector<std::vector<std::pair<int32, Vector<double> > >* > offset_win;
-  std::vector<std::pair<int32, Vector<double> > > empty_feat;
-
-  for (int32 i = (frame_index - half_len_win);
-	   i < (frame_index - half_len_win + lenght_context_windows); i++) {
-	/// we append zero if the index is out of the whole file feature lenght
-	if ((i < 0) || (i >= num_frame)) {
-	  offset_win.push_back(&empty_feat);
-	} else {
-	  offset_win.push_back(
-                 const_cast<std::vector<std::pair<int32, Vector<double> > >* >
-				 (&(whole_file_offset_feat[i])));
-	}
-  }
-
-  ComputeContExpOffsetFeature(offset_win, ht);
-}
-
-void FmpeAccs::ProjectHighDimensionFeature(
-         const std::vector< std::vector< Matrix<double> > > &M,
-         const std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > &ht,
-         Vector<double> *fea_out) const {
-  KALDI_ASSERT((M.size() == gmm_.NumGauss())
-			   && (M[0].size() == ht.size())
-			   && (M[0][0].NumRows() == gmm_.Dim())
-			   && (M[0][0].NumCols() == gmm_.Dim() + 1));
-
-  int32 dim = gmm_.Dim();
-  Vector<double> tmp_fea(dim);
-  tmp_fea.SetZero();
-
-  for(int32 i = 0; i < ht.size(); i++) {
-	int32 cont_index = ht[i].first;
-	for (int32 j = 0; j < ht[i].second.size(); j++) {
-      int32 gauss_index = ht[i].second[j].first;
-	  tmp_fea.AddMatVec(1.0, M[gauss_index][cont_index], kNoTrans, ht[i].second[j].second, 1.0);
-	}
-  }
-
-  fea_out->CopyFromVec(tmp_fea);
-}
-
-void FmpeAccs::ObtainNewFmpeFeature(
-    const VectorBase<BaseFloat> &data,
-    const std::vector< std::vector< Matrix<double> > > &M,
-    const std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > &ht,
-    Vector<double> *fea_new) const {
-  KALDI_ASSERT((data.Dim() == gmm_.Dim()));
-
-  Vector<double> tmp_fea(data.Dim());
-  ProjectHighDimensionFeature(M, ht, &tmp_fea);
-
-  fea_new->CopyFromVec(data);
-  fea_new->AddVec(1.0, tmp_fea);
-}
-
-void FmpeAccs::AccumulateDirectDiffFromPosteriors(const DiagGmm &gmm,
-                                            const VectorBase<BaseFloat> &data,
-											const VectorBase<BaseFloat> &posteriors,
-                                            Vector<double> *direct_diff) {
-  KALDI_ASSERT(gmm.Dim() == Dim());
-  KALDI_ASSERT(gmm.NumGauss() == posteriors.Dim());
-  KALDI_ASSERT(static_cast<int32>(data.Dim()) == Dim());
-  KALDI_ASSERT(direct_diff->Dim() == Dim());
-
-  Matrix<double> means_invvars(gmm.NumGauss(), gmm.Dim());
-  Matrix<double> inv_vars(gmm.NumGauss(), gmm.Dim());
-  Matrix<double> data_tmp(gmm.NumGauss(), gmm.Dim());
-  Matrix<double> mat_tmp(gmm.NumGauss(), gmm.Dim());
-  Vector<double> post_scale(gmm.NumGauss());
-
-  means_invvars.CopyFromMat(gmm.means_invvars(), kNoTrans);
-  inv_vars.CopyFromMat(gmm.inv_vars(), kNoTrans);
-
-  for (int32 i = 0; i < data_tmp.NumRows(); i++) {
-	data_tmp.Row(i).AddVec(1.0, data);
-  }
-  data_tmp.MulElements(inv_vars);
-
-  mat_tmp.CopyFromMat(means_invvars, kNoTrans);
-  mat_tmp.AddMat(-1.0, data_tmp, kNoTrans);
-
-  post_scale.CopyFromVec(posteriors);
-  post_scale.Scale(config_.lat_prob_scale);
-
-  direct_diff->AddMatVec(1.0, mat_tmp, kTrans, post_scale, 1.0);
-}
-
-void FmpeAccs::AccumulateInDirectDiffFromPosteriors(const DiagGmm &gmm,
-                             const FmpeAccumModelDiff &fmpe_diaggmm_diff_acc,
-                             const VectorBase<BaseFloat> &data,
-                             const VectorBase<BaseFloat> &posteriors,
-							 Vector<double> *indirect_diff) {
-  KALDI_ASSERT(gmm.NumGauss() == fmpe_diaggmm_diff_acc.NumGauss());
-  KALDI_ASSERT(gmm.NumGauss() == posteriors.Dim());
-  KALDI_ASSERT(gmm.Dim() == fmpe_diaggmm_diff_acc.Dim());
-  KALDI_ASSERT(gmm.Dim() == Dim());
-  KALDI_ASSERT(static_cast<int32>(data.Dim()) == Dim());
-  KALDI_ASSERT(indirect_diff->Dim() == Dim());
-
-  Matrix<double> mat_tmp(gmm.NumGauss(), gmm.Dim());
-  Vector<double> vec_tmp(gmm.NumGauss());
-
-  gmm.GetMeans(&mat_tmp);
-  for (int32 i = 0; i < mat_tmp.NumRows(); i++) {
-	mat_tmp.Row(i).AddVec(-1.0, data);
-  }
-  mat_tmp.MulElements(fmpe_diaggmm_diff_acc.variance_diff_accumulator());
-  mat_tmp.Scale(-2.0);
-  mat_tmp.AddMat(1.0, fmpe_diaggmm_diff_acc.mean_diff_accumulator(), kNoTrans);
-  // should be scaled in compute model difficientials,
-  // but used here just for convenient
-  mat_tmp.Scale(config_.lat_prob_scale);
-
-  vec_tmp.CopyFromVec(posteriors);
-  vec_tmp.DivElemByElem(fmpe_diaggmm_diff_acc.mle_occupancy());
-
-  indirect_diff->AddMatVec(1.0, mat_tmp, kTrans, vec_tmp, 1.0);
-}
-
-void FmpeAccs::AccumulateInDirectDiffFromDiag(const DiagGmm &gmm,
-                             const FmpeAccumModelDiff &fmpe_diaggmm_diff_acc,
-                             const VectorBase<BaseFloat> &data,
-                             BaseFloat frame_posterior,
-							 Vector<double> *indirect_diff) {
-  KALDI_ASSERT(gmm.NumGauss() == fmpe_diaggmm_diff_acc.NumGauss());
-  KALDI_ASSERT(gmm.Dim() == fmpe_diaggmm_diff_acc.Dim());
-  KALDI_ASSERT(gmm.Dim() == Dim());
-  KALDI_ASSERT(static_cast<int32>(data.Dim()) == Dim());
-  KALDI_ASSERT(indirect_diff->Dim() == Dim());
-
-  Vector<BaseFloat> posteriors(gmm.NumGauss());
-  gmm.ComponentPosteriors(data, &posteriors);
-  posteriors.Scale(frame_posterior);
-
-  AccumulateInDirectDiffFromPosteriors(gmm, fmpe_diaggmm_diff_acc,
-									   data, posteriors, indirect_diff);
-}
-
-void FmpeAccs::AccumulateFromDifferential(const VectorBase<double> &direct_diff,
-										  const VectorBase<double> &indirect_diff,
-       const std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > &ht) {
-  KALDI_ASSERT((direct_diff.Dim() == indirect_diff.Dim()));
-  KALDI_ASSERT(direct_diff.Dim() == Dim());
-
-  Vector<double> diff(direct_diff);
-  diff.AddVec(1.0, indirect_diff);
-
-  int32 dim = gmm_.Dim();
-  Matrix<double> tmp(dim, dim + 1);
-  tmp.SetZero();
-
-  /// accumulate the p and n statistics
-  for (int32 i = 0; i < ht.size(); i++) {
-	int32 cont_index = ht[i].first;
-	for (int32 j = 0; j < ht[i].second.size(); j++) {
-      int32 gauss_index = ht[i].second[j].first;
-	  tmp.AddVecVec(1.0, diff, ht[i].second[j].second);
-
-      for (int32 r = 0; r < dim; r++) {
-        for (int32 c = 0;c < (dim + 1); c++) {
-			if (tmp(r, c) > 0.0) {
-		      p_[gauss_index][cont_index](r, c) += tmp(r, c);
-			}
-	        else {
-		      n_[gauss_index][cont_index](r, c) -= tmp(r, c);
-			}
-		}
-	  }
-
-	  tmp.SetZero();
-	}
-  }
-
-  /// accumulate the direct/indirect and total differentials
-  diff_.AddVec(1.0, diff);
-  direct_diff_.AddVec(1.0, direct_diff);
-  indirect_diff_.AddVec(1.0, indirect_diff);
-}
-
-FmpeUpdater::FmpeUpdater(const FmpeAccs &accs)
-      : config_(accs.config()), dim_(accs.Dim()) {
-  Init(config_.gmm_num_comps, config_.context_windows.NumRows(), dim_);
-};
-
-FmpeUpdater::FmpeUpdater(const FmpeUpdater &other)
-	: config_(other.config_), avg_std_var_(other.avg_std_var_),
-	  dim_(other.dim_) {
-  if (other.M_.size() != 0) {
-    M_.resize(other.M_.size());
-    for (int32 i = 0; i < other.M_.size(); ++i) {
-      M_[i].resize(other.M_[i].size());
-      for (int32 j = 0; j < other.M_[i].size(); ++j) {
-        M_[i][j].Resize(other.M_[i][j].NumRows(), other.M_[i][j].NumCols());
-        M_[i][j].CopyFromMat(other.M_[i][j], kNoTrans);
-      }
-    }
-  }
-}
-
-void FmpeUpdater::Init(int32 num_gmm_gauss, int32 con_exp, int32 dim) {
-    M_.resize(num_gmm_gauss);
-    for (int32 i = 0; i < num_gmm_gauss; ++i) {
-      M_[i].resize(con_exp);
-      for (int32 j = 0; j < con_exp; ++j) {
-        M_[i][j].Resize(dim, dim + 1);
-      }
-    }
-
-	avg_std_var_.Resize(dim);
-}
-
-void FmpeUpdater::Write(std::ostream &out_stream, bool binary) const {
-  uint32 tmp_uint32;
-
-  WriteToken(out_stream, binary, "<FMPE>");
-
-  WriteToken(out_stream, binary, "<NumGaussians>");
-  tmp_uint32 = static_cast<uint32>(config_.gmm_num_comps);
-  WriteBasicType(out_stream, binary, tmp_uint32);
-  WriteToken(out_stream, binary, "<LengthContExp>");
-  tmp_uint32 = static_cast<uint32>(config_.context_windows.NumRows());
-  WriteBasicType(out_stream, binary, tmp_uint32);
-  WriteToken(out_stream, binary, "<DIMENSION>");
-  WriteBasicType(out_stream, binary, dim_);
-  if (!binary) out_stream << "\n";
-
-  // convert into BaseFloat before writing things
-  Matrix<BaseFloat> mat_bf(dim_, dim_ + 1);
-
-  if (M_.size() != 0) {
-    WriteToken(out_stream, binary, "<PROJ_MAT>");
-    for (int32 i = 0; i < config_.gmm_num_comps; ++i) {
-      for (int32 j = 0; j < config_.context_windows.NumRows(); ++j) {
-		mat_bf.CopyFromMat(M_[i][j], kNoTrans);
-        mat_bf.Write(out_stream, binary);
-	  }
-    }
-  }
-
-  WriteToken(out_stream, binary, "</FMPE>");
-}
-
-void FmpeUpdater::Read(std::istream &in_stream, bool binary) {
-  uint32 tmp_uint32;
-  std::string token;
-
-  ExpectToken(in_stream, binary, "<FMPE>");
-
-  ExpectToken(in_stream, binary, "<NumGaussians>");
-  ReadBasicType(in_stream, binary, &tmp_uint32);
-  int32 num_gaussians = static_cast<int32>(tmp_uint32);
-  ExpectToken(in_stream, binary, "<LengthContExp>");
-  ReadBasicType(in_stream, binary, &tmp_uint32);
-  int32 length_cont_exp = static_cast<int32>(tmp_uint32);
-  ExpectToken(in_stream, binary, "<DIMENSION>");
-  ReadBasicType(in_stream, binary, &dim_);
-
-  ReadToken(in_stream, binary, &token);
-
-  while (token != "</FMPE>") {
-    if (token == "<PROJ_MAT>") {
-      M_.resize(num_gaussians);
-      for (size_t i = 0; i < M_.size(); ++i) {
-        M_[i].resize(length_cont_exp);
-		for (size_t j = 0; j < M_[i].size(); ++j) {
-          M_[i][j].Read(in_stream, binary);
-		}
-      }
-    } else {
-      KALDI_ERR << "Unexpected token '" << token << "' in model file ";
-    }
-    ReadToken(in_stream, binary, &token);
-  }
-}
-
-void FmpeUpdater::ComputeAvgStandardDeviation(const AmDiagGmm &am) {
-  Matrix<double> vars_tmp;
-  Vector<double> vec_tmp(am.Dim());
-
-  for (int32 i = 0; i < am.NumPdfs(); i++) {
-	const DiagGmm &gmm = am.GetPdf(i);
-	gmm.GetVars(&vars_tmp);
-	vars_tmp.ApplyPow(0.5);
-	vec_tmp.AddRowSumMat(vars_tmp);
-  }
-
-  vec_tmp.Scale(1 / am.NumGauss());
-
-  avg_std_var_.CopyFromVec(vec_tmp);
-}
-
-void FmpeUpdater::Update(const FmpeAccs &accs,
-					     BaseFloat *obj_change_out,
-                         BaseFloat *count_out) {
-  KALDI_ASSERT((M_.size() == accs.pos().size()) && (M_.size() == accs.neg().size()));
-  KALDI_ASSERT((M_[0].size() == accs.pos()[0].size()) && (M_[0].size() == accs.neg()[0].size())
-			   && M_[0].size() == config_.context_windows.NumRows());
-  KALDI_ASSERT((M_[0][0].NumRows() == accs.pos()[0][0].NumRows())
-			   && (M_[0][0].NumRows() == accs.neg()[0][0].NumRows())
-			   && (M_[0][0].NumRows() == avg_std_var_.Dim()));
-  KALDI_ASSERT((M_[0][0].NumCols() == accs.pos()[0][0].NumCols())
-			   && (M_[0][0].NumCols() == accs.neg()[0][0].NumCols())
-			   && (M_[0][0].NumCols() == (M_[0][0].NumRows() + 1)));
-
-  int32 ngauss = M_.size();
-  int32 n_cont_exp = M_[0].size();
-  int32 dim = M_[0][0].NumRows();
-
-  Matrix<double> pandn_add_tmp(dim, dim + 1);
-  Matrix<double> pandn_sub_tmp(dim, dim + 1);
-  Vector<double> vec_tmp(avg_std_var_);
-  vec_tmp.Scale(1 / config_.E);
-
-  KALDI_LOG << "Updating the projection matrix M, the dim is: [ "
-	        << ngauss << " ][ " << n_cont_exp << " ][ " << dim << " ][ " << dim + 1
-			<< " ] -> [nGauss][nContExp][fea_dim][fea_dim + 1]";
-
-  for (int32 gauss_index = 0; gauss_index < ngauss; gauss_index++) {
-	for (int32 icon_exp = 0; icon_exp < n_cont_exp; icon_exp++) {
-		pandn_add_tmp.CopyFromMat(accs.pos()[gauss_index][icon_exp], kNoTrans);
-		pandn_add_tmp.AddMat(1.0, accs.neg()[gauss_index][icon_exp], kNoTrans);
-		pandn_sub_tmp.CopyFromMat(accs.pos()[gauss_index][icon_exp], kNoTrans);
-		pandn_sub_tmp.AddMat(-1.0, accs.neg()[gauss_index][icon_exp], kNoTrans);
-		pandn_sub_tmp.DivElements(pandn_add_tmp);
-		pandn_sub_tmp.MulRowsVec(vec_tmp);
-
-		M_[gauss_index][icon_exp].AddMat(1.0, pandn_sub_tmp, kNoTrans);
-	}
-  }
-
-  /// add some code to calculate the objective function change // TODO
-}
-
-void ClusterGmmToClusterCenters(const DiagGmm &gmm,
-                                int32 num_cluster_centers,
-                                BaseFloat cluster_varfloor,
-                                DiagGmm *ubm_cluster_centers,
-                                std::vector<int32> *cluster_center_map) {
-  // Bottom-up clustering of the Gaussians in the gmm model
-  KALDI_ASSERT(num_cluster_centers < gmm.NumGauss());
-  int32 dim = gmm.Dim();
-  Vector<BaseFloat> tmp_mean(dim);
-  Vector<BaseFloat> tmp_var(dim);
-  int32 num_gaussians = gmm.NumGauss();
-  std::vector<Clusterable*> gauss_clusters;
-  gauss_clusters.reserve(num_cluster_centers);
-
-  for (int32 gauss_index = 0; gauss_index < num_gaussians; gauss_index++) {
-    gmm.GetComponentMean(gauss_index, &tmp_mean);
-    gmm.GetComponentVariance(gauss_index, &tmp_var);
-    tmp_var.AddVec2(1.0, tmp_mean);  // make it x^2 stats.
-    BaseFloat this_weight = gmm.weights()(gauss_index);
-    tmp_mean.Scale(this_weight);
-    tmp_var.Scale(this_weight);
-    gauss_clusters.push_back(new GaussClusterable(tmp_mean, tmp_var,
-                          cluster_varfloor, this_weight));
-  }
-
-  std::vector<Clusterable*> gauss_clusters_out;
-  KALDI_VLOG(1) << "Creating " << num_cluster_centers << " gaussian clusters centers.";
-  ClusterBottomUp(gauss_clusters, kBaseFloatMax, num_cluster_centers,
-                  &gauss_clusters_out,
-                  cluster_center_map /*get the cluster assignments*/);
-  DeletePointers(&gauss_clusters);
-
-  // Next, put the clustered Gaussians centers into a single GMM.
-  KALDI_VLOG(1) << "Putting " << num_cluster_centers << " Gaussians cluster centers"
-                << "into a single GMM model.";
-  Matrix<BaseFloat> tmp_means(num_cluster_centers, dim);
-  Matrix<BaseFloat> tmp_vars(num_cluster_centers, dim);
-  Vector<BaseFloat> tmp_weights(num_cluster_centers);
-  Vector<BaseFloat> tmp_vec(dim);
-  DiagGmm tmp_gmm;
-  for (int32 gauss_index = 0; gauss_index < num_cluster_centers; gauss_index++) {
-    GaussClusterable *this_cluster = static_cast<GaussClusterable*>(
-        gauss_clusters_out[gauss_index]);
-    BaseFloat weight = this_cluster->count();
-    tmp_weights(gauss_index) = weight;
-    tmp_vec.CopyFromVec(this_cluster->x_stats());
-    tmp_vec.Scale(1/weight);
-    tmp_means.CopyRowFromVec(tmp_vec, gauss_index);
-    tmp_vec.CopyFromVec(this_cluster->x2_stats());
-    tmp_vec.Scale(1/weight);
-    tmp_vec.AddVec2(-1.0, tmp_means.Row(gauss_index));  // x^2 stats to var.
-    tmp_vars.CopyRowFromVec(tmp_vec, gauss_index);
-  }
-  DeletePointers(&gauss_clusters_out);
-
-  tmp_gmm.Resize(num_cluster_centers, dim);
-  tmp_weights.Scale(1.0/tmp_weights.Sum());
-  tmp_gmm.SetWeights(tmp_weights);
-  tmp_vars.InvertElements();  // need inverse vars...
-  tmp_gmm.SetInvVarsAndMeans(tmp_vars, tmp_means);
-
-  KALDI_VLOG(1) << "Obtain " << tmp_gmm.NumGauss() << " Gaussians cluster centers.";
-  ubm_cluster_centers->CopyFromDiagGmm(tmp_gmm);
-}
-
-void ObtainUbmAndSomeClusterCenters(
-			         const AmDiagGmm &am,
-                     const Vector<BaseFloat> &state_occs,
-                     const FmpeConfig &config,
-                     DiagGmm *gmm_out,
-                     DiagGmm *gmm_cluster_centers_out,
-                     std::vector<int32> *gaussian_cluster_center_map_out) {
-  /// First clusters the Gaussians in an acoustic model to a single GMM with specified
-  /// number of components. Using the same algorithm in the SGMM's UBM
-  /// initialization
-  kaldi::UbmClusteringOptions ubm_opts;
-  ubm_opts.ubm_numcomps = config.gmm_num_comps;
-  ClusterGaussiansToUbm(am, state_occs, ubm_opts, gmm_out);
-
-  /// Clusters the Gaussians in the gmm model to some cluster centers, which is for
-  /// more efficient evaluation of the gaussian posteriors just with
-  /// the most likely cluster centers
-  ClusterGmmToClusterCenters(*gmm_out, config.gmm_num_cluster_centers, config.cluster_varfloor,
-                             gmm_cluster_centers_out, gaussian_cluster_center_map_out);
-
-}
-
-}  // End of namespace kaldi
--- a/src/gmm/fmpe-am-diag-gmm.h
+++ b/src/gmm/fmpe-am-diag-gmm.h
@ -1,388 +0,0 @@
-// gmm/fmpe-am-diag-gmm.h
-
-// Copyright 2009-2011  Yanmin Qian
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_GMM_FMPE_AM_DIAG_GMM_H_
-#define KALDI_GMM_FMPE_AM_DIAG_GMM_H_ 1
-
-#include <vector>
-
-#include "gmm/am-diag-gmm.h"
-#include "gmm/mle-diag-gmm.h"
-#include "gmm/ebw-diag-gmm.h"
-
-namespace kaldi {
-
-struct FmpeConfig {
-  /// Number of the Gaussian components in the gmm model
-  int32 gmm_num_comps;
-  /// Number of the Gaussian cluster centers for fast evaluation
-  int32 gmm_num_cluster_centers;
-  /// the cluster var floor
-  BaseFloat cluster_varfloor;
-  /// Number of highest-scoring of the best cluster centers
-  int32 gmm_cluster_centers_nbest;
-  /// Number of highest-scoring of the best gaussians
-  int32 gmm_gaussian_nbest;
-  /// The lat prob scale
-  double lat_prob_scale;
-  /// The constant that contrals the overall learning rate
-  double E;
-  /// The Matrix indicates the length of context expansion
-  /// and the weight of each corresponding context frame. e.g.[9][17]
-  Matrix<BaseFloat> context_windows;
-
-  /*
-    Matrix<BaseFloat> context_windows;
-    // Normal dimension is [9][17]
-    // Example would be
-    // context_windows = [ 0 0 0 0 0 0 0 0 1.0 0 0 0 0 0 0 0 0
-    //                     0 0 0 0 0 0 0 0 0 1.0 0 0 0 0 0 0 0
-    //  .... etc.
-    // Then your nlength_context_expansion variable equals
-    // the NumRows() of this.
-    // Then you don't have to hard-code the computation in ComputeContExpOffsetFeature.
-    // Note: the code in ComputeContExpOffsetFeature that iterates over
-    // context_windows will check for zeros, so it will not have to do any work if
-    // it finds a zero feature.
-    // Also be careful when the same Gaussian index is present on more than one frame,
-    // that you are adding the values together, not replacing one with the other or
-    // creating duplicates with the same index. [maybe use function DeDuplicateVector(
-    //  std::vector<std::pair<int32, Vector<BaseFloat> >*), that would first sort on the
-    // int32 and then add together and combine any sets of elements with the same
-    // integer value.
-  */
-  FmpeConfig() {
-    gmm_num_comps = 2048;
-    gmm_num_cluster_centers = 128;
-    cluster_varfloor = 0.01;
-    gmm_cluster_centers_nbest = 25;
-    gmm_gaussian_nbest = 2;
-    lat_prob_scale = 0.083;
-    E = 10.0;
-  }
-
-  void Register(ParseOptions *po) {
-    po->Register("gmm-num-comps", &gmm_num_comps, "Number of the Gaussian"
-        " components in the gmm model to calculate the gaussian posteriors.");
-    po->Register("gmm-num-cluster-centers", &gmm_num_cluster_centers, "Number"
-        " of the Gaussian cluster centers for fast posteriors evaluation.");
-    po->Register("cluster-varfloor", &cluster_varfloor,
-      "Variance floor used in bottom-up state clustering.");
-    po->Register("gmm-cluster-centers-nbest", &gmm_cluster_centers_nbest,
-        "Number of highest-scoring of the best cluster centers.");
-    po->Register("gmm-gaussian-nbest", &gmm_gaussian_nbest, "Number of"
-        " of highest-scoring of the best gaussians.");
-    po->Register("lat-prob-scale", &lat_prob_scale,
-        "The lattice probability scale, very important.");
-    po->Register("E", &E, "The constant that contrals the overall learning rate.");
-  }
-};
-
-/** \class FmpeAccumModelDiff
- * Class for computing the basic model parameter differentials from
- *  the mpe statistics produced in the first pass of fmpe training
- */
-class FmpeAccumModelDiff {
- public:
-  FmpeAccumModelDiff(): dim_(0), num_comp_(0) {}
-  explicit FmpeAccumModelDiff(const DiagGmm &gmm) {
-    Resize(gmm);
-  }
-
-  void Read(std::istream &in_stream, bool binary);
-  void Write(std::ostream &out_stream, bool binary) const;
-
-  /// Allocates memory for accumulators
-  void Resize(int32 num_comp, int32 dim);
-  /// Calls ResizeAccumulators based on gmm
-  void Resize(const DiagGmm &gmm);
-
-  /// Returns the number of mixture components
-  int32 NumGauss() const { return num_comp_; }
-  /// Returns the dimensionality of the feature vectors
-  int32 Dim() const { return dim_; }
-
-  void SetZero();
-
-  // Accessors
-  const Vector<double>& mle_occupancy() const { return mle_occupancy_; }
-  const Matrix<double>& mean_diff_accumulator() const { return mean_diff_accumulator_; }
-  const Matrix<double>& variance_diff_accumulator() const { return variance_diff_accumulator_; }
-
-  /// Computes the Model parameter differentials using the statistics from
-  /// the MPE training, including the numerator and denominator accumulators
-  /// and applies I-smoothing to the numerator accs, if needed,
-  /// which using mle_acc.
-  void ComputeModelParaDiff(const DiagGmm &diag_gmm,
-                            const AccumDiagGmm &num_acc,
-                            const AccumDiagGmm &den_acc,
-                            const AccumDiagGmm &mle_acc);
-
-
- private:
-  int32 dim_;
-  int32 num_comp_;
-
-  /// Accumulators
-  Vector<double> mle_occupancy_;
-  Matrix<double> mean_diff_accumulator_;
-  Matrix<double> variance_diff_accumulator_;
-
-  // Cannot have copy constructor and assigment operator
-  KALDI_DISALLOW_COPY_AND_ASSIGN(FmpeAccumModelDiff);
-};
-
-inline void FmpeAccumModelDiff::Resize(const DiagGmm &gmm) {
-  Resize(gmm.NumGauss(), gmm.Dim());
-}
-
-/** \class FmpeAccs
- *  Class for accumulate the positive and negative statistics
- *  for computing the feature-level minimum phone error estimate of the
- *  parameters of projection M matrix.
- *  The acoustic model is diagonal Gaussian mixture models
- */
-class FmpeAccs {
- public:
-  explicit FmpeAccs(const FmpeConfig &config)
-      : config_(config) {};
-
-  ~FmpeAccs() {}
-
-  void Read(std::istream &in_stream, bool binary, bool add);
-  void Write(std::ostream &out_stream, bool binary) const;
-
-  /// Read the am model's parameters differentials
-  void ReadModelDiffs(std::istream &in_stream, bool binary);
-
-  /// Initializes the P and N statistics, and model parameter differentials if needed
-  void Init(const AmDiagGmm &am_model, bool update);
-
-  /// Initializes the P and N statistics, and diff statistics
-  void InitPNandDiff(int32 num_gmm_gauss, int32 con_exp, int32 dim);
-
-  /// Initializes the model parameter differentials
-  void InitModelDiff(const AmDiagGmm &model);
-
-  /// Initializes the GMMs for computing the high dimensional features
-  void InitializeGMMs(const DiagGmm &gmm, const DiagGmm &gmm_cluster_centers,
-                      std::vector<int32> &gaussian_cluster_center_map);
-
-  /// Compute the offset feature given one frame data
-  void ComputeOneFrameOffsetFeature(const VectorBase<BaseFloat>& data,
-                           std::vector<std::pair<int32, Vector<double> > > *offset) const;
-
-  /// Compute all the offset features given the whole file data
-  void ComputeWholeFileOffsetFeature(const MatrixBase<BaseFloat>& data,
-                           std::vector<std::vector<std::pair<int32, Vector<double> > > > *whole_file_offset) const;
-
-  /// Compute the context expansion high dimension feature
-  /// The high dimension offset feature with the context expansion: "ht";
-  /// the vector "ht" store the expanded offset feature corresponding
-  /// each context. And each element of "ht" is the relative context's
-  /// offset feature, which stored as the pair, including the used
-  /// gaussian index and the corresponding offset feature
-  /// vector. This structure is designed for the sparse vector ht.
-  /// dim is [nContExp * nGaussian * (fea_dim + 1)]
-  /// "offset_win" stores the current corresponding offset features
-  /// which are used to compute "ht"
-  void ComputeContExpOffsetFeature(
-       const std::vector<std::vector<std::pair<int32, Vector<double> > >* > &offset_win,
-       std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > *ht) const;
-
-  /// obtain the current needed context expension high dimension feature using
-  /// the whole file offset features as the inputs which is indexed
-  /// by the current frame's number frame_index
-  void ComputeHighDimemsionFeature(
-       const std::vector<std::vector<std::pair<int32, Vector<double> > > > &whole_file_offset_feat,
-       int32 frame_index,
-       std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > *ht) const;
-
-  /// Prject the high dimension features down to the dimension of the original
-  /// features and add them to the origianl features.
-  /// This is the sparse multiply using the non-sparse matrix M and
-  /// the sparse high dimension vector ht
-  void ProjectHighDimensionFeature(
-         const std::vector< std::vector< Matrix<double> > > &M,
-         const std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > &ht,
-         Vector<double> *fea_out) const;
-
-  /// Add the projected feature to the old feature and obtain the new fmpe feature
-  void ObtainNewFmpeFeature(const VectorBase<BaseFloat> &data,
-         const std::vector< std::vector< Matrix<double> > > &M,
-         const std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > &ht,
-         Vector<double> *fea_new) const;
-
-  /// Accumulate the direct differentials
-  void AccumulateDirectDiffFromPosteriors(const DiagGmm &gmm,
-                                    const VectorBase<BaseFloat> &data,
-                                    const VectorBase<BaseFloat> &posteriors,
-                                    Vector<double> *direct_diff);
-
-  /// Accumulate the indirect differentials from posteriors
-  void AccumulateInDirectDiffFromPosteriors(const DiagGmm &gmm,
-                                      const FmpeAccumModelDiff &fmpe_diaggmm_diff_acc,
-                                      const VectorBase<BaseFloat> &data,
-                                      const VectorBase<BaseFloat> &posteriors,
-                                      Vector<double> *indirect_diff);
-
-  /// Accumulate the indirect differentials from a DiagGmm model
-  void AccumulateInDirectDiffFromDiag(const DiagGmm &gmm,
-                                      const FmpeAccumModelDiff &fmpe_diaggmm_diff_acc,
-                                      const VectorBase<BaseFloat> &data,
-                                      BaseFloat frame_posterior,
-                                      Vector<double> *indirect_diff);
-
-  /// Accumulate the statistics about the positive and negative differential
-  void AccumulateFromDifferential(const VectorBase<double> &direct_diff,
-                                  const VectorBase<double> &indirect_diff,
-         const std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > &ht);
-
-  // Accessors
-  FmpeAccumModelDiff& GetAccsModelDiff(int32 pdf_index);
-  const FmpeAccumModelDiff& GetAccsModelDiff(int32 pdf_index) const;
-
-  const std::vector< std::vector< Matrix<double> > >& pos() const { return p_; }
-  const std::vector< std::vector< Matrix<double> > >& neg() const { return n_; }
-  const FmpeConfig& config() const { return config_; }
-
-  /// Returns the number of mixture components in the GMM model
-  int32 NumGaussInGmm() const { return gmm_.NumGauss(); }
-  /// Returns the number of cluster centers in the cluster center GMM
-  int32 NumClusterCenter() const { return gmm_cluster_centers_.NumGauss(); }
-  /// Returns the dimensionality of the feature vectors
-  int32 Dim() const { return dim_; }
-
- private:
-  FmpeConfig config_;
-  /// These contain the gmm models used to calculate the high deminsion
-  /// offet feature : one compute the high dimension vector gaussian posteriors,
-  /// and the other one is just for more efficient computing using
-  /// the most likely cluster centers
-  DiagGmm gmm_;
-  DiagGmm gmm_cluster_centers_;
-
-  /// The mapping between the gmm_ model and the cluster centers of gmm_cluster_centers_
-  std::vector<int32> gaussian_cluster_center_map_;
-
-  /// The basic model parameter differentials for the AmDiagGmm
-  std::vector<FmpeAccumModelDiff*> model_diff_accumulators_;
-
-  /// The positive accumulated matrix p_ij; dim is [nGauss][nContExp][fea_dim][fea_dim + 1].
-  std::vector< std::vector< Matrix<double> > > p_;
-  /// The negative accumulated matrix n_ij; dim is [nGauss][nContExp][fea_dim][fea_dim + 1].
-  std::vector< std::vector< Matrix<double> > > n_;
-  /// The summation of the differential
-  Vector<double> diff_;
-  /// The summation of the direct differential
-  Vector<double> direct_diff_;
-  /// The summation of the indirect differential
-  Vector<double> indirect_diff_;
-
-  /// The feature dim
-  int32 dim_;
-
-  // Cannot have copy constructor and assigment operator
-  KALDI_DISALLOW_COPY_AND_ASSIGN(FmpeAccs);
-};
-
-inline FmpeAccumModelDiff& FmpeAccs::GetAccsModelDiff(int32 pdf_index) {
-  KALDI_ASSERT((static_cast<size_t>(pdf_index) < model_diff_accumulators_.size())
-               && (model_diff_accumulators_[pdf_index] != NULL));
-  return *(model_diff_accumulators_[pdf_index]);
-}
-
-inline const FmpeAccumModelDiff& FmpeAccs::GetAccsModelDiff(int32 pdf_index) const {
-  KALDI_ASSERT((static_cast<size_t>(pdf_index) < model_diff_accumulators_.size())
-               && (model_diff_accumulators_[pdf_index] != NULL));
-  return *(model_diff_accumulators_[pdf_index]);
-}
-
-/** \class FmpeUpdater
- *  Class for containing the functions that updating the feature-level
- *  minimum phone error estimate of the parameters of projection M matrix
- *  that adds offsets to the original feature.
- *  The acoustic model is diagonal Gaussian mixture models
- */
-class FmpeUpdater {
- public:
-  explicit FmpeUpdater(const FmpeAccs &accs);
-  ~FmpeUpdater() {}
-
-  // provide copy constructor.
-  explicit FmpeUpdater(const FmpeUpdater &other);
-
-  void Read(std::istream &in_stream, bool binary);
-  void Write(std::ostream &out_stream, bool binary) const;
-
-  /// Initializes feature projection Matrix M
-  void Init(int32 num_gmm_gauss, int32 con_exp, int32 dim);
-
-  /// compute the average standard deviation of gaussians
-  /// in the current AmDiagGmm set
-  void ComputeAvgStandardDeviation(const AmDiagGmm &am);
-
-  /// Update the projection matrix M
-  void Update(const FmpeAccs &accs,
-              BaseFloat *obj_change_out,
-              BaseFloat *count_out);
-
-  // Accessors
-  const std::vector< std::vector< Matrix<double> > >& ProjMat() const { return M_; }
-  const FmpeConfig& config() const { return config_; }
-
- private:
-  FmpeConfig config_;
-
-  /// The average standard deviation of gaussians in the current AmDiagGmm set
-  Vector<double> avg_std_var_;
-
-  /// The feature projection matrix; dim is [nGauss][nContExp][fea_dim][fea_dim + 1].
-  std::vector< std::vector< Matrix<double> > > M_;
-
-  /// The feature dim
-  int32 dim_;
-};
-
-/** Clusters the Gaussians in the gmm model to some cluster centers
- */
-void ClusterGmmToClusterCenters(const DiagGmm &gmm,
-                                int32 num_cluster_centers,
-                                BaseFloat cluster_varfloor,
-                                DiagGmm *ubm_cluster_centers,
-                                std::vector<int32> *cluster_center_map);
-
-/** First clusters the Gaussians in an acoustic model to a single GMM with specified
- * number of components. Using the same algorithm in the SGMM's UBM
- * initialization, and then Clusters the Gaussians in the gmm model
- * to some cluster centers, which is for more efficient evaluation of the
- * gaussian posteriors just with the most likely cluster centers
- */
-void ObtainUbmAndSomeClusterCenters(
-                     const AmDiagGmm &am,
-                     const Vector<BaseFloat> &state_occs,
-                     const FmpeConfig &config,
-                     DiagGmm *gmm_out,
-                     DiagGmm *gmm_cluster_centers_out,
-                     std::vector<int32> *gaussian_cluster_center_map_out);
-
-
-}  // End namespace kaldi
-
-
-#endif  // KALDI_GMM_FMPE_AM_DIAG_GMM_H_
--- a/src/gmmbin/Makefile
+++ b/src/gmmbin/Makefile
@ -15,13 +15,13 @@ BINFILES = gmm-init-mono gmm-est gmm-acc-stats-ali gmm-align \
           gmm-est-fmllr-gpost gmm-est-fmllr gmm-est-regtree-fmllr-ali \
           gmm-est-regtree-mllr gmm-decode-kaldi gmm-compute-likes \
           gmm-decode-faster-regtree-mllr gmm-et-apply-c gmm-latgen-simple \
-           gmm-rescore-lattice gmm-decode-biglm-faster fmpe-gmm-model-diffs-est \
-           fmpe-gmm-acc-stats-gpost fmpe-gmm-sum-accs fmpe-init-gmms fmpe-gmm-est \
+           gmm-rescore-lattice gmm-decode-biglm-faster \
           gmm-est-gaussians-ebw gmm-est-weights-ebw gmm-latgen-faster gmm-copy \
           gmm-global-acc-stats gmm-global-est gmm-global-sum-accs gmm-gselect \
           gmm-latgen-biglm-faster gmm-ismooth-stats gmm-global-get-frame-likes \
           gmm-global-est-fmllr gmm-global-to-fgmm gmm-global-acc-stats-twofeats \
-           gmm-global-copy gmm-align-compiled-plusphones gmm-get-feat-deriv
+           gmm-global-copy gmm-align-compiled-plusphones gmm-get-feat-deriv \
+           gmm-fmpe-acc-stats gmm-acc-stats2

 OBJFILES =

--- a/src/gmmbin/fmpe-gmm-acc-stats-gpost.cc
+++ b/src/gmmbin/fmpe-gmm-acc-stats-gpost.cc
@ -1,186 +0,0 @@
-// gmmbin/fmpe-gmm-acc-stats-gpost.cc
-
-// Copyright 2009-2011  Yanmin Qian
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "gmm/diag-gmm.h"
-#include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
-#include "gmm/fmpe-am-diag-gmm.h"
-
-
-
-
-int main(int argc, char *argv[]) {
-  using namespace kaldi;
-  try {
-    const char *usage =
-        "Accumulate positive and negative stats for Fmpe training (reading in gaussian-level posteriors).\n"
-        "Note: not yet tested.\n"
-        "Usage:  fmpe-gmm-acc-stats-gpost [options] <model-in> <model-diffs-in> <gmms-model-in> <feature-rspecifier> <gposteriors-ebw-rspecifier> <gposteriors-mle-rspecifier> <stats-out>\n"
-        "e.g.: \n"
-        " fmpe-gmm-acc-stats-gpost 1.mdl 1.model.diffs 1.gmm scp:train.scp ark:1.ebw.gpost ark:1.mle.gpost 1.fmpe.acc\n";
-
-    typedef kaldi::int32 int32;
-
-    bool binary = false;
-    FmpeConfig fmpe_opts;
-    int32 gmm_cluster_centers_nbest = 25;
-    int32 gmm_gaussian_nbest = 2;
-    double lat_prob_scale = 0.083;
-    double E = 10.0;
-
-    ParseOptions po(usage);
-    po.Register("binary", &binary, "Write output in binary mode");
-    po.Register("gmm-cluster-centers-nbest", &gmm_cluster_centers_nbest,
-        "Number of highest-scoring of the best cluster centers.");
-    po.Register("gmm-gaussian-nbest", &gmm_gaussian_nbest, "Number of"
-        " of highest-scoring of the best gaussians.");
-    po.Register("lat-prob-scale", &lat_prob_scale,
-        "The lattice probability scale, very important.");
-    po.Register("E", &E, "The constant that contrals the overall learning rate.");
-
-    fmpe_opts.Register(&po);
-
-    po.Read(argc, argv);
-
-
-    if (po.NumArgs() != 7) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_filename = po.GetArg(1),
-        model_diffs_filename = po.GetArg(2),
-        gmms_model_filename = po.GetArg(3),
-        feature_rspecifier = po.GetArg(4),
-        gposteriors_ebw_rspecifier = po.GetArg(5),
-        gposteriors_mle_rspecifier = po.GetArg(6),
-        accs_wxfilename = po.GetArg(7);
-
-    using namespace kaldi;
-
-    AmDiagGmm am_gmm;
-    TransitionModel trans_model;
-    {
-      bool binary;
-      Input ki(model_filename, &binary);
-      trans_model.Read(ki.Stream(), binary);
-      am_gmm.Read(ki.Stream(), binary);
-    }
-
-    FmpeAccs fmpe_accs(fmpe_opts);
-    fmpe_accs.Init(am_gmm, true);
-    {
-      bool binary;
-      Input ki(model_diffs_filename, &binary);
-      fmpe_accs.ReadModelDiffs(ki.Stream(), binary);
-    }
-
-    kaldi::DiagGmm gmm;
-    kaldi::DiagGmm gmm_clusters;
-    std::vector<int32> gaussian_cluster_center_map;
-    {
-      bool binary;
-      Input ki(gmms_model_filename, &binary);
-      gmm.Read(ki.Stream(), binary);
-      gmm_clusters.Read(ki.Stream(), binary);
-      ReadIntegerVector(ki.Stream(), binary, &gaussian_cluster_center_map);
-    }
-
-    fmpe_accs.InitializeGMMs(gmm, gmm_clusters, gaussian_cluster_center_map);
-
-    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    RandomAccessGauPostReader gposteriors_ebw_reader(gposteriors_ebw_rspecifier);
-    RandomAccessGauPostReader gposteriors_mle_reader(gposteriors_mle_rspecifier);
-
-    int32 num_done = 0, num_no_posterior = 0, num_other_error = 0;
-    for (; !feature_reader.Done(); feature_reader.Next()) {
-      std::string key = feature_reader.Key();
-      if ((!gposteriors_ebw_reader.HasKey(key)) &&
-		  (!gposteriors_mle_reader.HasKey(key))) {
-        num_no_posterior++;
-      } else {
-        const Matrix<BaseFloat> &mat = feature_reader.Value();
-        const GauPost &gpost_ebw = gposteriors_ebw_reader.Value(key);
-        const GauPost &gpost_mle = gposteriors_ebw_reader.Value(key);
-
-        if ((static_cast<int32>(gpost_ebw.size()) != mat.NumRows()) &&
-			(static_cast<int32>(gpost_mle.size()) != mat.NumRows())) {
-          KALDI_WARN << "Gaussian Posterior vector has wrong size : gpost-ebw. " <<
-			  (gpost_ebw.size()) << "gpost-mle. " << (gpost_mle.size()) << " vs. "<< (mat.NumRows());
-          num_other_error++;
-          continue;
-        }
-
-        num_done++;
-
-        std::vector<std::vector<std::pair<int32, Vector<double> > > > whole_file_offset;
-        std::vector<std::pair<int32, std::vector<std::pair<int32, Vector<double> > > > > ht;
-
-        fmpe_accs.ComputeWholeFileOffsetFeature(mat, &whole_file_offset);
-
-        for (size_t i = 0; i < mat.NumRows(); i++) {
-          fmpe_accs.ComputeHighDimemsionFeature(whole_file_offset, i, &ht);
-          Vector<double> direct_diff(mat.NumCols()), indirect_diff(mat.NumCols());
-		  /// compute the direct differentials
-          for (size_t j = 0; j < gpost_ebw[i].size(); j++) {
-            int32 tid = gpost_ebw[i][j].first,  // transition identifier.
-                pdf_id = trans_model.TransitionIdToPdf(tid);
-            fmpe_accs.AccumulateDirectDiffFromPosteriors(am_gmm.GetPdf(pdf_id),
-														 mat.Row(i),
-														 gpost_ebw[i][j].second,
-														 &direct_diff);
-          }
-		  /// compute the indirect differentials
-          for (size_t j = 0; j < gpost_mle[i].size(); j++) {
-            int32 tid = gpost_mle[i][j].first,  // transition identifier.
-                pdf_id = trans_model.TransitionIdToPdf(tid);
-            fmpe_accs.AccumulateInDirectDiffFromPosteriors(am_gmm.GetPdf(pdf_id),
-														   fmpe_accs.GetAccsModelDiff(pdf_id),
-														   mat.Row(i),
-														   gpost_mle[i][j].second,
-														   &indirect_diff);
-          }
-          fmpe_accs.AccumulateFromDifferential(direct_diff, indirect_diff, ht);
-          ht.clear();
-        }
-        if (num_done % 50 == 0) {
-          KALDI_LOG << "Processed " << num_done << " utterances.";
-        }
-      }
-    }
-
-    KALDI_LOG << "Done " << num_done << " files, " << num_no_posterior
-              << " with no posteriors, " << num_other_error
-              << " with other errors.";
-
-    {
-      Output ko(accs_wxfilename, binary);
-      fmpe_accs.Write(ko.Stream(), binary);
-    }
-    KALDI_LOG << "Written accs.";
-    if (num_done != 0) return 0;
-    else return 1;
-  } catch(const std::exception& e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
--- a/src/gmmbin/fmpe-gmm-est.cc
+++ b/src/gmmbin/fmpe-gmm-est.cc
@ -1,97 +0,0 @@
-// gmmbin/fmpe-gmm-est.cc
-
-// Copyright 2009-2011  Yanmin Qian
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "gmm/am-diag-gmm.h"
-#include "tree/context-dep.h"
-#include "hmm/transition-model.h"
-#include "gmm/fmpe-am-diag-gmm.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Estimate fMPE transforms.\n"
-        "Note: not yet tested.\n"
-        "Usage:  fmpe-gmm-est [options] <am-model-in> <fmpe-proj-matrix-in> <stats-in> <fmpe-proj-matrix-out>\n"
-        "e.g.: gmm-est 1.mdl 1.mat 1.acc 2.mat\n";
-
-    bool binary_write = false;
-    FmpeConfig fmpe_opts;
-
-    ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    fmpe_opts.Register(&po);
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 4) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-
-    std::string model_in_filename = po.GetArg(1),
-        fmpe_proj_mat_in_filename = po.GetArg(2),
-        stats_filename = po.GetArg(3),
-        fmpe_proj_mat_out_filename = po.GetArg(4);
-
-    AmDiagGmm am_gmm;
-    TransitionModel trans_model;
-    {
-      bool binary_read;
-      Input ki(model_in_filename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_gmm.Read(ki.Stream(), binary_read);
-    }
-
-    FmpeAccs fmpe_accs(fmpe_opts);
-    {
-      bool binary;
-      Input ki(stats_filename, &binary);
-      fmpe_accs.Read(ki.Stream(), binary, true);  // true == add; doesn't matter here.
-    }
-
-    FmpeUpdater fmpe_updater(fmpe_accs);
-    {
-      bool binary;
-      Input ki(fmpe_proj_mat_in_filename, &binary);
-      fmpe_updater.Read(ki.Stream(), binary);
-    }
-
-    {  // update the Fmpe projection matrix
-      BaseFloat obj_change_out, count_out;
-      fmpe_updater.ComputeAvgStandardDeviation(am_gmm);
-      fmpe_updater.Update(fmpe_accs, &obj_change_out, &count_out);
-    }
-
-    {
-      Output ko(fmpe_proj_mat_out_filename, binary_write);
-      fmpe_updater.Write(ko.Stream(), binary_write);
-    }
-
-    KALDI_LOG << "Written Fmpe projection matrix to " << fmpe_proj_mat_out_filename;
-  } catch(const std::exception& e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
--- a/src/gmmbin/fmpe-gmm-model-diffs-est.cc
+++ b/src/gmmbin/fmpe-gmm-model-diffs-est.cc
@ -1,112 +0,0 @@
-// gmmbin/fmpe-gmm-model-diffs-est.cc
-
-// Copyright 2009-2011  Yanmin Qian
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "gmm/am-diag-gmm.h"
-#include "tree/context-dep.h"
-#include "hmm/transition-model.h"
-#include "gmm/mle-am-diag-gmm.h"
-//#include "gmm/ebw-am-diag-gmm.h"   // TODO wait Arnab to finish the AccumAmEbwDiagGmm Class, then make it active
-#include "gmm/fmpe-am-diag-gmm.h"
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Compute the model parameters differentials from the ebw accumulators (in mpe training) for fmpe training.\n"
-        "Usage:  fmpe-gmm-model-diffs-est [options] <model-in> <ebw-stats-in> <mle-stats-in> <model-diffs-out>\n"
-        "e.g.: fmpe-gmm-model-diff-est 1.mdl 1.ebw.acc 1.mle.acc 1.model.diffs\n";
-
-    bool binary = false;
-
-    ParseOptions po(usage);
-    po.Register("binary", &binary, "Write output in binary mode");
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 4) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-
-    std::string model_in_filename = po.GetArg(1),
-        ebw_stats_in_filename = po.GetArg(2),
-        mle_stats_in_filename = po.GetArg(3),
-        model_diffs_out_filename = po.GetArg(4);
-
-
-    AmDiagGmm am_gmm;
-    TransitionModel trans_model;
-    {
-      bool binary_read;
-      Input ki(model_in_filename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_gmm.Read(ki.Stream(), binary_read);
-    }
-
-    Vector<double> transition_ebw_accs;
-//    AccumAmEbwDiagGmm gmm_ebw_accs;  // TODO wait Arnab to finish the AccumAmEbwDiagGmm Class, then make it active
-    {
-      bool binary;
-      Input ki(ebw_stats_in_filename, &binary);
-      transition_ebw_accs.Read(ki.Stream(), binary);
-      // TODO wait Arnab to finish the AccumAmEbwDiagGmm Class, then make it active
- //     gmm_ebw_accs.Read(ki.Stream(), binary, true);  // true == add; doesn't matter here.
-    }
-
-    Vector<double> transition_mle_accs;
-    AccumAmDiagGmm gmm_mle_accs;
-    {
-      bool binary;
-      Input ki(mle_stats_in_filename, &binary);
-      transition_mle_accs.Read(ki.Stream(), binary);
-      gmm_mle_accs.Read(ki.Stream(), binary, true);  // true == add; doesn't matter here.
-    }
-
-    std::vector<FmpeAccumModelDiff*> model_diffs;
-    model_diffs.reserve(am_gmm.NumPdfs());
-    for (int32 i = 0; i < am_gmm.NumPdfs(); i++) {
-      model_diffs.push_back(new FmpeAccumModelDiff(am_gmm.GetPdf(i)));
-      // TODO wait Arnab to finish the AccumAmEbwDiagGmm Class, then make it active
-//      model_diff.back()->ComputeModelParaDiff(am_gmm.GetPdf(i), gmm_ebw_acc.GetAcc(i), gmm_mle_accs.GetAcc(i));
-    }
-
-    // Write out the model diffs
-    {
-      kaldi::Output ko(model_diffs_out_filename, binary);
-      WriteToken(ko.Stream(), binary, "<DIMENSION>");
-      WriteBasicType(ko.Stream(), binary, static_cast<int32>(am_gmm.Dim()));
-      WriteToken(ko.Stream(), binary, "<NUMPDFS>");
-      WriteBasicType(ko.Stream(), binary, static_cast<int32>(model_diffs.size()));
-      for (std::vector<FmpeAccumModelDiff*>::const_iterator it = model_diffs.begin(),
-        end = model_diffs.end(); it != end; ++it) {
-        (*it)->Write(ko.Stream(), binary);
-      }
-    }
-
-    KALDI_LOG << "Written model diffs to " << model_diffs_out_filename;
-  } catch(const std::exception& e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
--- a/src/gmmbin/fmpe-gmm-sum-accs.cc
+++ b/src/gmmbin/fmpe-gmm-sum-accs.cc
@ -1,66 +0,0 @@
-// gmmbin/fmpe-gmm-sum-accs.cc
-
-// Copyright 2009-2011  Yanmin Qian
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "util/common-utils.h"
-#include "gmm/fmpe-am-diag-gmm.h"
-#include "hmm/transition-model.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    typedef kaldi::int32 int32;
-
-    const char *usage =
-        "Sum multiple accumulated stats files for Fmpe training.\n"
-        "Usage: fmpe-gmm-sum-accs [options] stats-out stats-in1 stats-in2 ...\n";
-
-    bool binary = false;
-    kaldi::FmpeConfig fmpe_opts;
-
-    kaldi::ParseOptions po(usage);
-    po.Register("binary", &binary, "Write output in binary mode");
-    po.Read(argc, argv);
-
-    if (po.NumArgs() < 3) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string stats_out_filename = po.GetArg(1);
-    kaldi::FmpeAccs fmpe_accs(fmpe_opts);
-
-    for (int i = 2, max = po.NumArgs(); i <= max; ++i) {
-      std::string stats_in_filename = po.GetArg(i);
-      bool binary_read;
-      kaldi::Input ki(stats_in_filename, &binary_read);
-      fmpe_accs.Read(ki.Stream(), binary_read, true /*add read values*/);
-    }
-
-    // Write out the accs
-    {
-      kaldi::Output ko(stats_out_filename, binary);
-      fmpe_accs.Write(ko.Stream(), binary);
-    }
-
-    KALDI_LOG << "Written stats to " << stats_out_filename;
-  } catch(const std::exception& e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
--- a/src/gmmbin/fmpe-init-gmms.cc
+++ b/src/gmmbin/fmpe-init-gmms.cc
@ -1,110 +0,0 @@
-// gmmbin/fmpe-init-gmms.cc
-
-// Copyright 2009-2011   Yanmin Qian
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/kaldi-io.h"
-#include "gmm/diag-gmm.h"
-#include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
-#include "gmm/fmpe-am-diag-gmm.h"
-
-
-int main(int argc, char *argv[]) {
-  try {
-    typedef kaldi::int32 int32;
-    typedef kaldi::BaseFloat BaseFloat;
-
-    const char *usage =
-        "Cluster the Gaussians in a diagonal-GMM acoustic model\n"
-        "to two single diag-covariance GMMs used in fmpe: one is the gmm model\n"
-        "for compute gaussian posteriors and one is the gaussian\n"
-        "cluster centers which is used to speed up gaussian calculations"
-        "Usage: fmpe-init-gmms [options] <model-file> <state-occs> <gmm-out> <gmm-cluster-centers-out> <gaussian-cluster-center-map-out>\n";
-
-    bool binary_write = false;
-    int32 gmm_num_comps = 2048;
-    int32 gmm_num_cluster_centers = 128;
-    BaseFloat cluster_varfloor = 0.01;
-    kaldi::FmpeConfig fmpe_opts;
-    kaldi::ParseOptions po(usage);
-    po.Register("binary", &binary_write, "Write output in binary mode");
-    po.Register("gmm-num-comps", &gmm_num_comps, "Number of the Gaussian"
-        " components in the gmm model to calculate the gaussian posteriors.");
-    po.Register("gmm-num-cluster-centers", &gmm_num_cluster_centers, "Number"
-        " of the Gaussian cluster centers for fast posteriors evaluation.");
-    po.Register("cluster-varfloor", &cluster_varfloor,
-      "Variance floor used in bottom-up state clustering.");
-
-    fmpe_opts.Register(&po);
-
-    po.Read(argc, argv);
-
-    if (po.NumArgs() != 5) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_in_filename = po.GetArg(1),
-        occs_in_filename = po.GetArg(2),
-        gmm_out_filename = po.GetArg(3),
-        gmm_cluster_centers_out_filename = po.GetArg(4),
-        gauss_cluster_center_map_out_filename = po.GetArg(5);
-
-    kaldi::AmDiagGmm am_gmm;
-    kaldi::TransitionModel trans_model;
-    {
-      bool binary_read;
-      kaldi::Input ki(model_in_filename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_gmm.Read(ki.Stream(), binary_read);
-    }
-
-    kaldi::Vector<BaseFloat> state_occs;
-    state_occs.Resize(am_gmm.NumPdfs());
-    {
-      bool binary_read;
-      kaldi::Input ki(occs_in_filename, &binary_read);
-      state_occs.Read(ki.Stream(), binary_read);
-    }
-
-    kaldi::DiagGmm gmm;
-    kaldi::DiagGmm gmm_cluster_centers;
-    std::vector<int32> gaussian_cluster_center_map;
-    ObtainUbmAndSomeClusterCenters(
-                     am_gmm,
-                     state_occs,
-                     fmpe_opts,
-                     &gmm,
-                     &gmm_cluster_centers,
-                     &gaussian_cluster_center_map);
-
-    // Write out the gmms model
-    {
-      kaldi::Output ko(gmm_out_filename, binary_write);
-      gmm.Write(ko.Stream(), binary_write);
-      gmm_cluster_centers.Write(ko.Stream(), binary_write);
-      kaldi::WriteIntegerVector(ko.Stream(), binary_write, gaussian_cluster_center_map);
-    }
-
-    KALDI_LOG << "Written GMMs to " << gmm_out_filename;
-  } catch(const std::exception& e) {
-    std::cerr << e.what() << '\n';
-    return -1;
-  }
-}
-
-
--- a/src/gmmbin/gmm-acc-stats2.cc
+++ b/src/gmmbin/gmm-acc-stats2.cc
@ -0,0 +1,153 @@
+// gmmbin/gmm-acc-stats2.cc
+
+// Copyright 2009-2012  Daniel Povey
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "gmm/am-diag-gmm.h"
+#include "hmm/transition-model.h"
+#include "gmm/mle-am-diag-gmm.h"
+
+
+
+// Accumulates posterior-weighted stats, split by the sign of the weight.
+int main(int argc, char *argv[]) {
+  using namespace kaldi;
+  typedef kaldi::int32 int32;
+  typedef kaldi::int64 int64;
+  try {
+    const char *usage =
+        "Accumulate stats for GMM training (from posteriors)\n"
+        "This version writes two accumulators (e.g. num and den),\n"
+        "and puts the positive accumulators in num, negative in den\n"
+        "Usage:  gmm-acc-stats2 [options] <model> <feature-rspecifier> "
+        "<posteriors-rspecifier> <num-stats-out> <den-stats-out>\n"
+        "e.g.:\n"
+        "gmm-acc-stats2 1.mdl \"$feats\" ark:1.post 1.num_acc 1.den_acc\n";
+
+    ParseOptions po(usage);
+    bool binary = true;
+    std::string update_flags_str = "mvwt"; // note: t is ignored, we acc
+    // transition stats regardless.
+    po.Register("binary", &binary, "Write stats in binary mode");
+    po.Register("update-flags", &update_flags_str, "Which GMM parameters to "
+                "update: subset of mvwt.");
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 5) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string model_rxfilename = po.GetArg(1),
+        feature_rspecifier = po.GetArg(2),
+        posteriors_rspecifier = po.GetArg(3),
+        num_accs_wxfilename = po.GetArg(4),
+        den_accs_wxfilename = po.GetArg(5);
+
+    // The model file holds the transition model followed by the GMMs.
+    AmDiagGmm am_gmm;
+    TransitionModel trans_model;
+    {
+      bool binary_in;  // input format; distinct from the output flag above.
+      Input ki(model_rxfilename, &binary_in);
+      trans_model.Read(ki.Stream(), binary_in);
+      am_gmm.Read(ki.Stream(), binary_in);
+    }
+
+    Vector<double> num_trans_accs, den_trans_accs;
+    trans_model.InitStats(&num_trans_accs);
+    trans_model.InitStats(&den_trans_accs);
+    AccumAmDiagGmm num_gmm_accs, den_gmm_accs;
+    num_gmm_accs.Init(am_gmm, StringToGmmFlags(update_flags_str));
+    den_gmm_accs.Init(am_gmm, StringToGmmFlags(update_flags_str));
+
+    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
+    RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);
+
+    // tot_like is total weighted likelihood (note: weighted
+    // by both +ve and -ve numbers); tot_weight is total weight in
+    // posteriors (will often be about zero).  Both are kept in double
+    // to limit rounding error when summing over many utterances.
+    double tot_like = 0.0, tot_weight = 0.0;
+    int64 tot_frames = 0;
+    int32 num_done = 0, num_err = 0;  // utterance counts.
+    for (; !feature_reader.Done(); feature_reader.Next()) {
+      std::string key = feature_reader.Key();
+      if (!posteriors_reader.HasKey(key)) {
+        KALDI_WARN << "No posteriors for utterance " << key;
+        num_err++;
+        continue;
+      }
+      const Matrix<BaseFloat> &mat = feature_reader.Value();
+      const Posterior &posterior = posteriors_reader.Value(key);
+
+      if (static_cast<int32>(posterior.size()) != mat.NumRows()) {
+        KALDI_WARN << "Posterior vector has wrong size "
+                   << (posterior.size()) << " vs. "
+                   << (mat.NumRows());
+        num_err++;
+        continue;
+      }
+
+      BaseFloat tot_like_this_file = 0.0, tot_weight_this_file = 0.0;
+      for (size_t i = 0; i < posterior.size(); i++) {
+        for (size_t j = 0; j < posterior[i].size(); j++) {
+          int32 tid = posterior[i][j].first,
+              pdf_id = trans_model.TransitionIdToPdf(tid);
+          BaseFloat weight = posterior[i][j].second;
+          // +ve weights go to the num accs, the rest to den, both as |weight|.
+          trans_model.Accumulate(fabs(weight), tid,
+                                 (weight > 0.0 ?
+                                  &num_trans_accs : &den_trans_accs));
+          tot_like_this_file +=
+              (weight > 0.0 ? &num_gmm_accs : &den_gmm_accs) ->
+              AccumulateForGmm(am_gmm, mat.Row(i), pdf_id, fabs(weight)) * weight;
+          tot_weight_this_file += weight;
+        }
+      }
+      tot_like += tot_like_this_file;
+      tot_weight += tot_weight_this_file;
+      tot_frames += static_cast<int32>(posterior.size());
+      num_done++;  // count utterances, not individual posterior entries.
+    }
+
+    KALDI_LOG << "Done " << num_done << " files, " << num_err
+              << " had errors.";
+    KALDI_LOG << "Overall weighted acoustic likelihood per frame was "
+              << (tot_like/tot_frames) << " over " << tot_frames << " frames;"
+              << " average weight per frame was " << (tot_weight / tot_frames);
+
+    {
+      Output ko(num_accs_wxfilename, binary);
+      num_trans_accs.Write(ko.Stream(), binary);
+      num_gmm_accs.Write(ko.Stream(), binary);
+    }
+    {
+      Output ko(den_accs_wxfilename, binary);
+      den_trans_accs.Write(ko.Stream(), binary);
+      den_gmm_accs.Write(ko.Stream(), binary);
+    }
+    KALDI_LOG << "Written accs.";
+    return (num_done != 0 ? 0 : 1);
+  } catch(const std::exception& e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
+
+
--- a/src/gmmbin/gmm-est.cc
+++ b/src/gmmbin/gmm-est.cc
@ -125,7 +125,7 @@ int main(int argc, char *argv[]) {
                            power, min_count);

      if (!occs_out_filename.empty()) {
-        bool binary = false; // write this in text mode-- useful to look at.
+        bool binary = true; // write this in binary mode.
        kaldi::Output ko(occs_out_filename, binary);
        state_occs.Write(ko.Stream(), binary);
      }
--- a/src/gmmbin/gmm-fmpe-acc-stats.cc
+++ b/src/gmmbin/gmm-fmpe-acc-stats.cc
@ -0,0 +1,152 @@
+// gmmbin/gmm-fmpe-acc-stats.cc
+
+// Copyright 2012  Daniel Povey
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "gmm/am-diag-gmm.h"
+#include "hmm/transition-model.h"
+#include "transform/fmpe.h"
+
+
+int main(int argc, char *argv[]) {
+  using namespace kaldi;
+  using kaldi::int32;
+  try {
+    const char *usage =
+        "Accumulate stats for fMPE training, using GMM model.  Note: this could\n"
+        "be done using gmm-get-feat-deriv and fmpe-acc-stats (but you'd be computing\n"
+        "the features twice).  Features input should be pre-fMPE features.\n"
+        "\n"
+        "Usage:  gmm-fmpe-acc-stats [options] <model-in> <fmpe-in> <feature-rspecifier> "
+        "<gselect-rspecifier> <posteriors-rspecifier> <fmpe-stats-out>\n"
+        "e.g.: \n"
+        " gmm-fmpe-acc-stats 1.mdl 1.fmpe \"$feats\" ark:1.gselect ark:1.post 1.fmpe_stats\n";
+        
+    ParseOptions po(usage);
+    bool binary = true;  // output format of the stats file.
+    po.Register("binary", &binary, "If true, write stats in binary mode.");
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 6) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string model_rxfilename = po.GetArg(1),
+        fmpe_rxfilename = po.GetArg(2),
+        feature_rspecifier = po.GetArg(3),
+        gselect_rspecifier = po.GetArg(4),
+        posteriors_rspecifier = po.GetArg(5),
+        stats_wxfilename = po.GetArg(6);
+    // The model file holds the transition model followed by the GMMs.
+    AmDiagGmm am_gmm;
+    TransitionModel trans_model;
+    {
+      bool binary;  // input format (shadows the output flag above).
+      Input ki(model_rxfilename, &binary);
+      trans_model.Read(ki.Stream(), binary);
+      am_gmm.Read(ki.Stream(), binary);
+    }
+    // Read the fMPE object.
+    Fmpe fmpe;
+    {
+      bool binary_in;
+      Input ki(fmpe_rxfilename, &binary_in);
+      fmpe.Read(ki.Stream(), binary_in);
+    }
+    // fmpe stats: one matrix, of which stats_plus is the sub-matrix holding
+    // the first half of the rows and stats_minus the second half.
+    Matrix<BaseFloat> stats(fmpe.ProjectionTNumRows() * 2,
+                            fmpe.ProjectionTNumCols());
+    SubMatrix<BaseFloat> stats_plus(stats, 0, fmpe.ProjectionTNumRows(),
+                                    0, fmpe.ProjectionTNumCols());
+    SubMatrix<BaseFloat> stats_minus(stats, fmpe.ProjectionTNumRows(),
+                                    fmpe.ProjectionTNumRows(),
+                                    0, fmpe.ProjectionTNumCols());
+
+    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
+    RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
+    RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);
+
+    BaseFloat tot_like = 0.0; // tot like weighted by posterior.
+    int32 num_frames = 0;
+    int32 num_done = 0, num_err = 0;
+    
+    for (; !feature_reader.Done(); feature_reader.Next()) {
+      std::string key = feature_reader.Key();
+      if (!posteriors_reader.HasKey(key)) {
+        num_err++;
+        KALDI_WARN << "No posteriors for utterance " << key;
+        continue;
+      } 
+      const Matrix<BaseFloat> &feat_in = feature_reader.Value();
+      const Posterior &posterior = posteriors_reader.Value(key);
+
+      if (static_cast<int32>(posterior.size()) != feat_in.NumRows()) {
+        KALDI_WARN << "Posterior vector has wrong size " <<
+            (posterior.size()) << " vs. "<< (feat_in.NumRows());
+        num_err++;
+        continue;
+      }
+
+      if (!gselect_reader.HasKey(key)) {
+        KALDI_WARN << "No gselect information for key " << key;
+        num_err++;
+        continue;
+      }
+      const std::vector<std::vector<int32> > &gselect =
+          gselect_reader.Value(key);
+      if (static_cast<int32>(gselect.size()) != feat_in.NumRows()) {
+        KALDI_WARN << "gselect information has wrong size";
+        num_err++;
+        continue;
+      }
+      
+      num_done++;
+      Matrix<BaseFloat> fmpe_feat(feat_in.NumRows(), feat_in.NumCols());
+      fmpe.ComputeFeatures(feat_in, gselect, &fmpe_feat);  // the fMPE offset,
+      fmpe_feat.AddMat(1.0, feat_in);  // plus the input features.
+      // Derivative w.r.t. the fMPE features (set by the call below).
+      Matrix<BaseFloat> feat_deriv;
+
+      tot_like += ComputeAmGmmFeatureDeriv(am_gmm, trans_model, posterior,
+                                           fmpe_feat, &feat_deriv);
+      num_frames += feat_in.NumRows();
+
+      fmpe.AccStats(feat_in, gselect, feat_deriv, &stats_plus, &stats_minus);
+      
+      if (num_done % 100 == 0)
+        KALDI_LOG << "Processed " << num_done << " utterances.";
+    }
+
+    KALDI_LOG << "Done " << num_done << " files, " << num_err
+              << " with errors.";
+    KALDI_LOG << "Overall weighted acoustic likelihood per frame is "
+              << (tot_like/num_frames) << " over " << num_frames << " frames.";
+    // "stats" contains both stats_plus and stats_minus.
+    Output ko(stats_wxfilename, binary);
+    stats.Write(ko.Stream(), binary);
+    
+    return (num_done != 0 ? 0 : 1);
+  } catch(const std::exception& e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
+
+
--- a/src/gmmbin/gmm-get-feat-deriv.cc
+++ b/src/gmmbin/gmm-get-feat-deriv.cc
@ -0,0 +1,110 @@
+// gmmbin/gmm-get-feat-deriv.cc
+
+// Copyright 2012  Daniel Povey
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "gmm/am-diag-gmm.h"
+#include "hmm/transition-model.h"
+#include "transform/fmpe.h"
+
+
+int main(int argc, char *argv[]) {
+  using namespace kaldi;
+  using kaldi::int32;
+  try {
+    const char *usage =
+        "From GMM model and posteriors (which don't have to be positive),\n"
+        "output for each utterance a matrix of likelihood derivatives w.r.t.\n"
+        "the features.\n"
+        "E.g. used in feature-space discriminative training.\n"
+        "\n"
+        "Usage:  gmm-get-feat-deriv [options] <model-in> <feature-rspecifier> "
+        "<posteriors-rspecifier> <feature-deriv-wspecifier>\n"
+        "e.g.: \n"
+        " gmm-get-feat-deriv 1.mdl \"$feats\" ark:1.post ark:1.deriv\n";
+
+    ParseOptions po(usage);
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 4) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string model_filename = po.GetArg(1);
+    std::string feature_rspecifier = po.GetArg(2);
+    std::string posteriors_rspecifier = po.GetArg(3);
+    std::string deriv_wspecifier = po.GetArg(4);
+
+    // Load the transition model, then the GMMs, from the model file.
+    AmDiagGmm am_gmm;
+    TransitionModel trans_model;
+    {
+      bool binary_in;
+      Input model_input(model_filename, &binary_in);
+      trans_model.Read(model_input.Stream(), binary_in);
+      am_gmm.Read(model_input.Stream(), binary_in);
+    }
+
+    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
+    RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);
+    BaseFloatMatrixWriter deriv_writer(deriv_wspecifier);
+
+    int32 num_done = 0, num_err = 0;
+
+    // Utterances without usable posteriors are counted in num_err and skipped.
+    for (; !feature_reader.Done(); feature_reader.Next()) {
+      const std::string utt = feature_reader.Key();
+      if (!posteriors_reader.HasKey(utt)) {
+        KALDI_WARN << "No posteriors for utterance " << utt;
+        num_err++;
+        continue;
+      }
+      const Matrix<BaseFloat> &feats = feature_reader.Value();
+      const Posterior &post = posteriors_reader.Value(utt);
+
+      if (static_cast<int32>(post.size()) != feats.NumRows()) {
+        KALDI_WARN << "Posterior vector has wrong size " <<
+            (post.size()) << " vs. "<< (feats.NumRows());
+        num_err++;
+        continue;
+      }
+
+      // The derivative of the likelihood (or whatever objective function
+      // the posteriors represent) with respect to the features.
+      Matrix<BaseFloat> feat_deriv;
+      ComputeAmGmmFeatureDeriv(am_gmm, trans_model, post,
+                               feats, &feat_deriv);
+      deriv_writer.Write(utt, feat_deriv);
+
+      num_done++;
+      if (num_done % 100 == 0) {
+        KALDI_LOG << "Processed " << num_done << " utterances.";
+      }
+    }
+
+    KALDI_LOG << "Done " << num_done << " files, " << num_err
+              << " with errors.";
+    return (num_done != 0 ? 0 : 1);
+  } catch(const std::exception& e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
+
+
--- a/src/tiedbin/full-to-diag.cc
+++ b/src/tiedbin/full-to-diag.cc
@ -30,7 +30,7 @@ int main(int argc, char *argv[]) {
        "Convert a full covariance GMM into a diagonal one.\n"
        "Usage: full-to-tied <full-gmm-in> <diag-gmm-out>\n";

-    bool binary = false;
+    bool binary = true;
    ParseOptions po(usage);
    po.Register("binary", &binary, "Write output in binary mode");
    po.Read(argc, argv);
--- a/src/tiedbin/init-tied-codebooks.cc
+++ b/src/tiedbin/init-tied-codebooks.cc
@ -106,7 +106,7 @@ int main(int argc, char *argv[]) {
        "e.g.: \n"
        "  init-tied-codebooks tree tree.acc ubm-full tree.map\n";

-    bool binary = false;
+    bool binary = true;
    int max_num_gaussians = 512;
    bool split_gaussians = false;
    BaseFloat perturb = 0.01;
--- a/src/tiedbin/tied-diag-gmm-acc-stats-ali.cc
+++ b/src/tiedbin/tied-diag-gmm-acc-stats-ali.cc
@ -34,7 +34,7 @@ int main(int argc, char *argv[]) {
        " tied-diag-gmm-acc-stats-ali 1.mdl 1.ali scp:train.scp ark:1.ali 1.acc\n";

    ParseOptions po(usage);
-    bool binary = false;
+    bool binary = true;
    po.Register("binary", &binary, "Write output in binary mode");
    po.Read(argc, argv);

--- a/src/tiedbin/tied-diag-gmm-align-compiled.cc
+++ b/src/tiedbin/tied-diag-gmm-align-compiled.cc
@ -45,7 +45,7 @@ int main(int argc, char *argv[]) {
        "   tied-diag-gmm-align-compiled 1.mdl ark:- scp:train.scp t, ark:1.ali\n";

    ParseOptions po(usage);
-    bool binary = false;
+    bool binary = true;
    BaseFloat beam = 200.0;
    BaseFloat retry_beam = 0.0;
    BaseFloat acoustic_scale = 1.0;
--- a/src/tiedbin/tied-diag-gmm-init-model.cc
+++ b/src/tiedbin/tied-diag-gmm-init-model.cc
@ -72,7 +72,7 @@ int main(int argc, char *argv[]) {
        "e.g.: \n"
        "  tied-diag-gmm-init-model tree topo tree.map diag0.ubm diag1.ubm 1.mdl\n";

-    bool binary = false;
+    bool binary = true;

    ParseOptions po(usage);
    po.Register("binary", &binary, "Write output in binary mode");
--- a/src/tiedbin/tied-diag-gmm-init-mono.cc
+++ b/src/tiedbin/tied-diag-gmm-init-mono.cc
@ -55,7 +55,7 @@ int main(int argc, char *argv[]) {
        "e.g.: \n"
        " tied-diag-gmm-init-mono topo cb.pdf mono.mdl mono.tree\n";

-    bool binary = false;
+    bool binary = true;
    ParseOptions po(usage);
    po.Register("binary", &binary, "Write output in binary mode");
    po.Read(argc, argv);
--- a/src/tiedbin/tied-full-gmm-acc-stats-ali.cc
+++ b/src/tiedbin/tied-full-gmm-acc-stats-ali.cc
@ -34,7 +34,7 @@ int main(int argc, char *argv[]) {
        " tied-full-gmm-acc-stats-ali 1.mdl 1.ali scp:train.scp ark:1.ali 1.acc\n";

    ParseOptions po(usage);
-    bool binary = false;
+    bool binary = true;
    po.Register("binary", &binary, "Write output in binary mode");
    po.Read(argc, argv);

--- a/src/tiedbin/tied-full-gmm-align-compiled.cc
+++ b/src/tiedbin/tied-full-gmm-align-compiled.cc
@ -45,7 +45,7 @@ int main(int argc, char *argv[]) {
        "   tied-full-gmm-align-compiled 1.mdl ark:- scp:train.scp t, ark:1.ali\n";

    ParseOptions po(usage);
-    bool binary = false;
+    bool binary = true;
    BaseFloat beam = 200.0;
    BaseFloat retry_beam = 0.0;
    BaseFloat acoustic_scale = 1.0;
--- a/src/tiedbin/tied-full-gmm-init-model.cc
+++ b/src/tiedbin/tied-full-gmm-init-model.cc
@ -71,7 +71,7 @@ int main(int argc, char *argv[]) {
        "e.g.: \n"
        "  tied-full-gmm-init-model tree topo tree.map full0.ubm full1.ubm 1.mdl\n";

-    bool binary = false;
+    bool binary = true;

    ParseOptions po(usage);
    po.Register("binary", &binary, "Write output in binary mode");
--- a/src/tiedbin/tied-full-gmm-init-mono.cc
+++ b/src/tiedbin/tied-full-gmm-init-mono.cc
@ -55,7 +55,7 @@ int main(int argc, char *argv[]) {
        "e.g.: \n"
        " tied-full-gmm-init-mono topo cb.pdf mono.mdl mono.tree\n";

-    bool binary = false;
+    bool binary = true;
    ParseOptions po(usage);
    po.Register("binary", &binary, "Write output in binary mode");
    po.Read(argc, argv);
--- a/src/tiedbin/tied-lbg.cc
+++ b/src/tiedbin/tied-lbg.cc
@ -167,7 +167,7 @@ try {
        "  tied-lbg tree-old tree-tied topo scp:train.scp ark:ali ubm-full "
        "tree.map\n";

-    bool binary = false;
+    bool binary = true;
    bool full = true;
    
    BaseFloat perturb = 0.01;
--- a/src/transform/Makefile
+++ b/src/transform/Makefile
@ -4,7 +4,7 @@ include ../kaldi.mk

 TESTFILES = regtree-fmllr-diag-gmm-test lda-estimate-test \
      regression-tree-test fmllr-diag-gmm-test exponential-transform-test \
-      regtree-mllr-diag-gmm-test
+      regtree-mllr-diag-gmm-test fmpe-test

 OBJFILES = regression-tree.o regtree-mllr-diag-gmm.o lda-estimate.o \
    regtree-fmllr-diag-gmm.o cmvn.o transform-common.o fmllr-diag-gmm.o \
--- a/src/transform/fmpe-test.cc
+++ b/src/transform/fmpe-test.cc
@ -0,0 +1,174 @@
+// transform/fmpe-test.cc
+
+// Copyright 2012  Daniel Povey
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "util/common-utils.h"
+#include "gmm/diag-gmm.h"
+#include "gmm/diag-gmm-normal.h"
+#include "gmm/model-test-common.h"
+#include "transform/fmpe.h"
+
+namespace kaldi {
+
+
+// Computes the derivative of the GMM log-likelihood w.r.t. the features:
+// row i of *deriv (resized to match feats) is the derivative for frame i.
+// Note: this code copied from gmm-get-feat-deriv.cc; had to simplify a bit.
+void GetFeatDeriv(const DiagGmm &gmm,
+                  const Matrix<BaseFloat> &feats,
+                  Matrix<BaseFloat> *deriv) {
+  // The rows are accumulated into below; assumes Resize() zeroes them (confirm).
+  deriv->Resize(feats.NumRows(), feats.NumCols());
+
+  Vector<BaseFloat> gauss_posteriors;
+  Vector<BaseFloat> temp_vec(feats.NumCols());
+  for (int32 i = 0; i < feats.NumRows(); i++) {
+    SubVector<BaseFloat> this_feat(feats, i);
+    SubVector<BaseFloat> this_deriv(*deriv, i);
+    gmm.ComponentPosteriors(this_feat, &gauss_posteriors);
+    BaseFloat weight = 1.0;  // every frame has unit weight here,
+    gauss_posteriors.Scale(weight);  // so this scaling changes nothing.
+    // The next line does: to i'th row of deriv, add
+    // means_invvars^T * gauss_posteriors,
+    // where each row of means_invvars is the mean times
+    // diagonal inverse covariance... after transposing,
+    // this becomes a weighted sum of these rows, weighted by
+    // the posteriors.  This comes from the term
+    //  feat^T * inv_var * mean
+    // in the objective function.
+    this_deriv.AddMatVec(1.0, gmm.means_invvars(), kTrans,
+                         gauss_posteriors, 1.0);
+
+    // next line does temp_vec = inv_vars^T * gauss_posteriors,
+    // which sets temp_vec to a weighted sum of the inv_vars,
+    // weighted by Gaussian posterior.
+    temp_vec.AddMatVec(1.0, gmm.inv_vars(), kTrans,
+                       gauss_posteriors, 0.0);
+    // Add to the derivative, -(this_feat .* temp_vec),
+    // which is the term that comes from the -0.5 * inv_var^T feat_sq,
+    // in the objective function (where inv_var is a vector, and feat_sq
+    // is a vector of squares of the feature values).
+    this_deriv.AddVecVec(-1.0, this_feat, temp_vec, 1.0);
+  }
+}
+
+// Returns the log-likelihood of "feats" under "gmm", summed over all rows.
+BaseFloat GetGmmLike(const DiagGmm &gmm, const Matrix<BaseFloat> &feats) {
+  const int32 num_frames = feats.NumRows();
+  BaseFloat total_loglike = 0.0;
+  for (int32 t = 0; t < num_frames; t++)
+    total_loglike += gmm.LogLikelihood(feats.Row(t));
+  return total_loglike;
+}
+
+void TestFmpe() {
+  int32 dim = 10 + (rand() % 10);
+  int32 num_comp = 10 + (rand() % 10);
+  DiagGmm gmm;
+  unittest::InitRandDiagGmm(dim, num_comp, &gmm);
+  // Features for the test: num_frames rows of RandGauss() values.
+  int32 num_frames = 20;
+  Matrix<BaseFloat> feats(num_frames, dim);
+
+  for (int32 i = 0; i < num_frames; i++)
+    for (int32 j = 0; j < dim; j++)
+      feats(i,j) = RandGauss();
+  // Write an Fmpe object to file "tmpf" (text or binary, chosen at random)...
+  FmpeOptions opts; // Default.
+  {
+    Fmpe fmpe(gmm, opts);
+    {
+      bool binary = (rand() % 2 == 1);
+      Output ko("tmpf", binary);
+      fmpe.Write(ko.Stream(), binary);
+    }
+  }
+  Fmpe fmpe(gmm, opts);  // ... and read it back into a second object.
+  {
+    bool binary_in;
+    Input ki("tmpf", &binary_in);
+    fmpe.Read(ki.Stream(), binary_in);
+  }
+  // We'll first be testing that the feature derivative is accurate, by
+  // comparing the likelihood change caused by a small random feature offset
+  // with the first-order change predicted by the derivative.
+  {
+    Matrix<BaseFloat> deriv;
+    Matrix<BaseFloat> random_offset(feats.NumRows(), feats.NumCols());
+    for (int32 i = 0; i < feats.NumRows(); i++)
+      for (int32 j = 0; j < feats.NumCols(); j++)
+        random_offset(i,j) = 1.0e-03 * RandGauss();
+    BaseFloat like_before = GetGmmLike(gmm, feats);
+    feats.AddMat(1.0, random_offset);
+    BaseFloat like_after = GetGmmLike(gmm, feats);
+    feats.AddMat(-1.0, random_offset); // undo the change.
+    GetFeatDeriv(gmm, feats, &deriv);
+    BaseFloat change1 = like_after - like_before,
+        change2 = TraceMatMat(random_offset, deriv, kTrans);
+    KALDI_LOG << "Random offset led to like change "
+              << change1 << " (manually), and " << change2
+              << " (derivative)";
+    // note: not making this threshold smaller, as don't want
+    // spurious failures.  Seems to be OK though.
+    KALDI_ASSERT( fabs(change1-change2) < 0.15*fabs(change1+change2));
+  }
+
+  std::vector<std::vector<int32> > gselect(feats.NumRows()); // make it have all Gaussians...
+  for (int32 i = 0; i < feats.NumRows(); i++)
+    for (int32 j = 0; j < gmm.NumGauss(); j++)
+      gselect[i].push_back(j);
+
+  Matrix<BaseFloat> fmpe_offset;
+  // Check that the fMPE feature offset is zero (no update applied yet).
+  fmpe.ComputeFeatures(feats, gselect, &fmpe_offset);
+  KALDI_ASSERT(fmpe_offset.IsZero());
+  
+  // Note: we're just using the ML objective function here.
+  // This is just to make sure the derivatives are all computed
+  // correctly.
+  BaseFloat like_before_update = GetGmmLike(gmm, feats);
+  // Now get stats for update.
+  int32 nr = fmpe.ProjectionTNumRows(), nc = fmpe.ProjectionTNumCols();
+  Matrix<BaseFloat> plus_stats(nr, nc), minus_stats(nr, nc);
+  Matrix<BaseFloat> deriv;
+  GetFeatDeriv(gmm, feats, &deriv);
+  fmpe.AccStats(feats, gselect, deriv, &plus_stats, &minus_stats);
+  FmpeUpdateOptions update_opts;
+  update_opts.learning_rate = 0.001; // so linear assumption is more valid.
+  BaseFloat delta = fmpe.Update(update_opts, plus_stats, minus_stats);
+  // Apply the updated fMPE offset and re-measure the likelihood.
+  fmpe.ComputeFeatures(feats, gselect, &fmpe_offset);
+  feats.AddMat(1.0, fmpe_offset);
+
+  BaseFloat like_after_update = GetGmmLike(gmm, feats);
+
+  BaseFloat delta2 = like_after_update - like_before_update;
+  KALDI_LOG << "Change predicted by fMPE Update function is "
+            << delta << ", change computed directly is "
+            << delta2;
+  KALDI_ASSERT(fabs(delta-delta2) < 0.15 * fabs(delta+delta2));
+}
+
+}
+
+
+int main() {
+  kaldi::g_kaldi_verbose_level = 5;  // set the global verbosity level.
+  int runs_left = 11;  // each run draws new random dimensions via rand().
+  while (runs_left-- > 0) kaldi::TestFmpe();
+  std::cout << "Test OK.\n";
+}
+
--- a/src/transform/fmpe.cc
+++ b/src/transform/fmpe.cc
@ -19,6 +19,8 @@
 #include "transform/fmpe.h"
 #include "util/text-utils.h"
 #include "gmm/diag-gmm-normal.h"
+#include "gmm/am-diag-gmm.h"
+#include "hmm/transition-model.h"

 namespace kaldi {

@ -73,7 +75,7 @@ void Fmpe::ComputeC() {
  // to get centered covariance.
  C_.Resize(dim);
  try {
-    TpMatrix<double> Ctmp; Ctmp.Cholesky(x2_stats);
+    TpMatrix<double> Ctmp(dim); Ctmp.Cholesky(x2_stats);
    C_.CopyFromTp(Ctmp);
  } catch (...) {
    KALDI_ERR << "Error initializing fMPE object: cholesky of "
@ -94,9 +96,9 @@ void Fmpe::ApplyContext(const MatrixBase<BaseFloat> &intermed_feat,
  // Applies the temporal-context part of the transformation.
  int32 dim = FeatDim(), ncontexts = NumContexts(),
      T = intermed_feat.NumRows();
-  KALDI_ASSERT(intermed_feat.NumRows() == dim * ncontexts &&
-               intermed_feat.NumCols() == feat_out->NumCols()
-               && feat_out->NumRows() == dim);
+  KALDI_ASSERT(intermed_feat.NumCols() == dim * ncontexts &&
+               intermed_feat.NumRows() == feat_out->NumRows()
+               && feat_out->NumCols() == dim);
  // note: ncontexts == contexts_.size().
  for (int32 i = 0; i < ncontexts; i++) {
    // this_intermed_feat is the chunk of the "intermediate features"
@ -125,9 +127,9 @@ void Fmpe::ApplyContextReverse(const MatrixBase<BaseFloat> &feat_deriv,
  // in reverse, for getting derivatives for training.
  int32 dim = FeatDim(), ncontexts = NumContexts(),
      T = feat_deriv.NumRows();
-  KALDI_ASSERT(intermed_feat_deriv->NumRows() == dim * ncontexts &&
-               intermed_feat_deriv->NumCols() == feat_deriv.NumCols()
-               && feat_deriv.NumRows() == dim);
+  KALDI_ASSERT(intermed_feat_deriv->NumCols() == dim * ncontexts &&
+               intermed_feat_deriv->NumRows() == feat_deriv.NumRows()
+               && feat_deriv.NumCols() == dim);
  // note: ncontexts == contexts_.size().
  for (int32 i = 0; i < ncontexts; i++) {
    // this_intermed_feat is the chunk of the derivative of
@ -142,7 +144,7 @@ void Fmpe::ApplyContextReverse(const MatrixBase<BaseFloat> &feat_deriv,
      // but this doesn't dominate the computation and I think this is
      // clearer.
      for (int32 t_out = 0; t_out < T; t_out++) { // t_out indexes the output
-        int32 t_in = t_in + t_offset; // t_in indexes the input.
+        int32 t_in = t_out + t_offset; // t_in indexes the input.
        if (t_in >= 0 && t_in < T) // Discard frames outside range.
          this_intermed_feat_deriv.Row(t_in).AddVec(weight,
                                                    feat_deriv.Row(t_out));
@ -164,7 +166,16 @@ void Fmpe::ApplyC(MatrixBase<BaseFloat> *feat_out, bool reverse) const {
  }
 }

-// Constructs the high-dim features and applies the main projection matrix proj_.
+// Constructs the high-dim features and applies the main projection matrix
+// projT_.  This projects from dimension ngauss*(dim+1) to dim*ncontexts.  Note:
+// because the input vector of size ngauss*(dim+1) is sparse in a blocky way
+// (i.e. each frame only has a couple of nonzero posteriors), we deal with
+// sub-matrices of the projection matrix projT_.  We actually further optimize
+// the code by taking all frames in a file that had nonzero posteriors for a
+// particular Gaussian, and forming a matrix out of the corresponding
+// high-dimensional features; we can then use a matrix-matrix multiply rather
+// than using vector-matrix operations.
+
 void Fmpe::ApplyProjection(const MatrixBase<BaseFloat> &feat_in,
                           const std::vector<std::vector<int32> > &gselect,
                           MatrixBase<BaseFloat> *intermed_feat) const {
@ -173,17 +184,44 @@ void Fmpe::ApplyProjection(const MatrixBase<BaseFloat> &feat_in,
  Vector<BaseFloat> post; // will be posteriors of selected Gaussians.
  Vector<BaseFloat> input_chunk(dim+1); // will be a segment of
  // the high-dimensional features.
+
+  // "all_posts" is a vector of ((gauss-index, time-index), gaussian
+  // posterior).
+  // We'll compute the posterior information, sort it, and then
+  // go through it in sorted order, which maintains memory locality
+  // when accessing the projection matrix.
+  // Note: the optimized branch below uses level-3 BLAS (matrix-matrix
+  // multiply) by batching all frames that share a Gaussian; this needs
+  // temporary matrices for the input chunks and for the output.
+  std::vector<std::pair<std::pair<int32,int32>, BaseFloat> > all_posts;
+  
  for (int32 t = 0; t < feat_in.NumRows(); t++) {
    SubVector<BaseFloat> this_feat(feat_in, t);
-    SubVector<BaseFloat> this_intermed_feat(*intermed_feat, t);
    gmm_.LogLikelihoodsPreselect(this_feat, gselect[t], &post);
    // At this point, post will contain log-likes of the selected
    // Gaussians.
    post.ApplySoftMax(); // Now they are posteriors (which sum to one).
    for (int32 i = 0; i < post.Dim(); i++) {
      int32 gauss = gselect[t][i];
+      all_posts.push_back(std::make_pair(std::make_pair(gauss, t), post(i)));
+    }
+  }
+  std::sort(all_posts.begin(), all_posts.end());
+  
+  bool optimize = true;
+
+  if (!optimize) { // Why do we keep this un-optimized code around?
+    // For clarity, so you can see what's going on, and for easier
+    // comparison with ApplyProjectionReverse, which is similar to this
+    // un-optimized segment.  Both un-optimized and optimized versions
+    // should give identical transforms (up to tiny roundoff differences).
+    for (size_t i = 0; i < all_posts.size(); i++) {
+      int32 gauss = all_posts[i].first.first, t = all_posts[i].first.second;
+      SubVector<BaseFloat> this_feat(feat_in, t);
+      SubVector<BaseFloat> this_intermed_feat(*intermed_feat, t);
+      BaseFloat this_post = all_posts[i].second;
      SubVector<BaseFloat> this_stddev(stddevs_, gauss);
-      BaseFloat this_post = post(i);
+
      // The next line is equivalent to setting input_chunk to
      // -this_post * the gaussian mean / (gaussian stddev).  Note: we use
      // the fact that mean * inv_var *  stddev == mean / stddev.
@ -196,12 +234,55 @@ void Fmpe::ApplyProjection(const MatrixBase<BaseFloat> &feat_in,
                                             1.0);
      // The last element of this input_chunk is the posterior itself
      // (between 0 and 1).
-      input_chunk(dim) = this_post;
+      input_chunk(dim) = this_post * config_.post_scale;

-      // this_intermed_feat += [appropriate chjunk of proj_] * input_chunk.
-      this_intermed_feat.AddMatVec(1.0, proj_.Range(0, dim*ncontexts,
-                                                    gauss*(dim+1), dim+1),
-                                   kNoTrans, input_chunk, 1.0);
+      // this_intermed_feat += [appropriate chunk of projT_]^T * input_chunk.
+      this_intermed_feat.AddMatVec(1.0, projT_.Range(gauss*(dim+1), dim+1,
+                                                     0, dim*ncontexts),
+                                   kTrans, input_chunk, 1.0);
+    }
+  } else {
+    size_t i = 0;
+    while (i < all_posts.size()) {
+      int32 gauss = all_posts[i].first.first;
+      SubVector<BaseFloat> this_stddev(stddevs_, gauss),
+          this_mean_invvar(gmm_.means_invvars(), gauss);
+      SubMatrix<BaseFloat> this_projT_chunk(projT_, gauss*(dim+1), dim+1,
+                                            0, dim*ncontexts);
+      int32 batch_size; // number of posteriors with same Gaussian..
+      for (batch_size = 0;
+           batch_size+i < static_cast<int32>(all_posts.size()) &&
+               all_posts[batch_size+i].first.first == gauss;
+           batch_size++); // empty loop body.
+      Matrix<BaseFloat> input_chunks(batch_size, dim+1);
+      Matrix<BaseFloat> intermed_temp(batch_size, dim*ncontexts);
+      for (int32 j = 0; j < batch_size; j++) { // set up "input_chunks"
+        int32 t = all_posts[i+j].first.second;
+        SubVector<BaseFloat> this_feat(feat_in, t);
+        SubVector<BaseFloat> this_input_chunk(input_chunks, j);
+        BaseFloat this_post = all_posts[i+j].second;
+        this_input_chunk.Range(0, dim).AddVecVec(-this_post,
+                                                 this_mean_invvar,
+                                                 this_stddev, 0.0);
+        this_input_chunk.Range(0, dim).AddVecDivVec(this_post, this_feat,
+                                                    this_stddev, 1.0);
+        this_input_chunk(dim) = this_post * config_.post_scale;
+      }
+      // The next line is where most of the computation will happen,
+      // during the feature computation phase.  We have rearranged
+      // stuff so it's a matrix-matrix operation, for greater
+      // efficiency (when using optimized libraries like ATLAS).
+      intermed_temp.AddMatMat(1.0, input_chunks, kNoTrans,
+                              this_projT_chunk, kNoTrans, 0.0);
+      for (int32 j = 0; j < batch_size; j++) { // add data from
+        // intermed_temp to the output "intermed_feat"
+        int32 t = all_posts[i+j].first.second;
+        SubVector<BaseFloat> this_intermed_feat(*intermed_feat, t);
+        SubVector<BaseFloat> this_intermed_temp(intermed_temp, j);
+        // this_intermed_feat += this_intermed_temp.
+        this_intermed_feat.AddVec(1.0, this_intermed_temp);
+      }
+      i += batch_size;
    }
  }
 }      
@ -221,9 +302,16 @@ void Fmpe::ApplyProjectionReverse(const MatrixBase<BaseFloat> &feat_in,
  Vector<BaseFloat> post; // will be posteriors of selected Gaussians.
  Vector<BaseFloat> input_chunk(dim+1); // will be a segment of
  // the high-dimensional features.
+
+  // "all_posts" is a vector of ((gauss-index, time-index), gaussian
+  // posterior).
+  // We'll compute the posterior information, sort it, and then
+  // go through it in sorted order, which maintains memory locality
+  // when accessing the projection matrix.
+  std::vector<std::pair<std::pair<int32,int32>, BaseFloat> > all_posts;
+  
  for (int32 t = 0; t < feat_in.NumRows(); t++) {
    SubVector<BaseFloat> this_feat(feat_in, t);
-    SubVector<BaseFloat> this_intermed_feat_deriv(intermed_feat_deriv, t);
    gmm_.LogLikelihoodsPreselect(this_feat, gselect[t], &post);
    // At this point, post will contain log-likes of the selected
    // Gaussians.
@ -232,35 +320,44 @@ void Fmpe::ApplyProjectionReverse(const MatrixBase<BaseFloat> &feat_in,
      // The next few lines (where we set up "input_chunk") are identical
      // to ApplyProjection.
      int32 gauss = gselect[t][i];
-      SubVector<BaseFloat> this_stddev(stddevs_, gauss);
-      BaseFloat this_post = post(i);
-      input_chunk.Range(0, dim).AddVecVec(-this_post, gmm_.means_invvars().Row(gauss),
-                                          this_stddev, 0.0);
-      input_chunk.Range(0, dim).AddVecDivVec(this_post, this_feat, this_stddev,
-                                             1.0);
-      input_chunk(dim) = this_post;
-
-      // If not for accumulating the + and - parts separately, we would be
-      // doing something like:
-      // proj_deriv_.Range(0, dim*ncontexts, gauss*(dim+1), dim+1).AddVecVec(
-      //                    1.0, this_intermed_feat_deriv, input_chunk);
-
-
-      SubMatrix<BaseFloat> plus_chunk(*proj_deriv_plus, 0, dim*ncontexts,
-                                      gauss*(dim+1), dim+1),
-          minus_chunk(*proj_deriv_minus, 0, dim*ncontexts,
-                      gauss*(dim+1), dim+1);
-          
-      // This next function takes the rank-one matrix
-      //  (this_intermed_deriv * input_chunk') and adds the positive
-      // part to proj_deriv_plus, and minus the negative part to
-      // proj_deriv_minus.
-      AddOuterProductPlusMinus(static_cast<BaseFloat>(1.0),
-                               this_intermed_feat_deriv,
-                               input_chunk,
-                               &plus_chunk, &minus_chunk);
+      all_posts.push_back(std::make_pair(std::make_pair(gauss, t), post(i)));
    }
  }
+  std::sort(all_posts.begin(), all_posts.end());
+  for (size_t i = 0; i < all_posts.size(); i++) {
+    int32 gauss = all_posts[i].first.first, t = all_posts[i].first.second;
+    BaseFloat this_post = all_posts[i].second;
+    SubVector<BaseFloat> this_feat(feat_in, t);    
+    SubVector<BaseFloat> this_intermed_feat_deriv(intermed_feat_deriv, t);
+    SubVector<BaseFloat> this_stddev(stddevs_, gauss);
+    input_chunk.Range(0, dim).AddVecVec(-this_post, gmm_.means_invvars().Row(gauss),
+                                        this_stddev, 0.0);
+    input_chunk.Range(0, dim).AddVecDivVec(this_post, this_feat, this_stddev,
+                                           1.0);
+    input_chunk(dim) = this_post * config_.post_scale;
+
+    // If not for accumulating the + and - parts separately, we would be
+    // doing something like:
+    // proj_deriv_.Range(gauss*(dim+1), dim+1, 0, dim*ncontexts).AddVecVec(
+    //                    1.0, input_chunk, this_intermed_feat_deriv);
+
+
+    SubMatrix<BaseFloat> plus_chunk(*proj_deriv_plus, 
+                                    gauss*(dim+1), dim+1,
+                                    0, dim*ncontexts),
+        minus_chunk(*proj_deriv_minus, 
+                    gauss*(dim+1), dim+1,
+                    0, dim*ncontexts);
+          
+    // This next function takes the rank-one matrix
+    //  (input_chunk * this_intermed_feat_deriv'), and adds the positive
+    // part to proj_deriv_plus, and minus the negative part to
+    // proj_deriv_minus.
+    AddOuterProductPlusMinus(static_cast<BaseFloat>(1.0),
+                             input_chunk,
+                             this_intermed_feat_deriv,
+                             &plus_chunk, &minus_chunk);
+  }
 }      

 void Fmpe::ComputeFeatures(const MatrixBase<BaseFloat> &feat_in,
@ -296,8 +393,8 @@ void Fmpe::AccStats(const MatrixBase<BaseFloat> &feat_in,
  int32 dim = FeatDim(), ncontexts = NumContexts();
  KALDI_ASSERT(feat_in.NumRows() != 0 && feat_in.NumCols() == dim);
  KALDI_ASSERT(feat_in.NumRows() == static_cast<int32>(gselect.size()));
-  AssertSameDim(*proj_deriv_plus, proj_);
-  AssertSameDim(*proj_deriv_minus, proj_);
+  AssertSameDim(*proj_deriv_plus, projT_);
+  AssertSameDim(*proj_deriv_minus, projT_);
  AssertSameDim(feat_in, feat_deriv_in);

  // We do everything in reverse now, in reverse order.
@ -326,28 +423,29 @@ Fmpe::Fmpe(const DiagGmm &gmm, const FmpeOptions &config): gmm_(gmm),
  SetContexts(config.context_expansion);
  ComputeC();
  ComputeStddevs();
-  proj_.Resize(NumGauss() * (FeatDim()+1), FeatDim() * NumContexts());
+  projT_.Resize(NumGauss() * (FeatDim()+1), FeatDim() * NumContexts());
 }

-void Fmpe::Update(const FmpeUpdateOptions &config,
-                  MatrixBase<BaseFloat> &proj_deriv_plus,
-                  MatrixBase<BaseFloat> &proj_deriv_minus) {
+BaseFloat Fmpe::Update(const FmpeUpdateOptions &config,
+                       MatrixBase<BaseFloat> &proj_deriv_plus,
+                       MatrixBase<BaseFloat> &proj_deriv_minus) {
  // tot_linear_objf_impr is the change in the actual
  // objective function if it were linear, i.e.
  //   objf-gradient . parameter-change  // Note: none of this is normalized by the #frames (we don't have
  // this info here), so that is done at the script level.
  BaseFloat tot_linear_objf_impr = 0.0;
-  AssertSameDim(proj_deriv_plus, proj_);
-  AssertSameDim(proj_deriv_minus, proj_);
+  int32 changed = 0; // Keep track of how many elements change sign.
+  AssertSameDim(proj_deriv_plus, projT_);
+  AssertSameDim(proj_deriv_minus, projT_);
  KALDI_ASSERT(proj_deriv_plus.Min() >= 0);
  KALDI_ASSERT(proj_deriv_minus.Min() >= 0);
  BaseFloat learning_rate = config.learning_rate,
      l2_weight = config.l2_weight;
  
-  for (int32 i = 0; i < proj_.NumRows(); i++) {
-    for (int32 j = 0; j < proj_.NumCols(); j++) {
+  for (int32 i = 0; i < projT_.NumRows(); i++) {
+    for (int32 j = 0; j < projT_.NumCols(); j++) {
      BaseFloat p = proj_deriv_plus(i,j), n = proj_deriv_minus(i,j),
-          x = proj_(i,j);
+          x = projT_(i,j);
      // Suppose the basic update (before regularization) is:
      // z <-- x  +   learning_rate * (p - n) / (p + n),
      // where z is the new parameter and x is the old one.
@ -371,10 +469,14 @@ void Fmpe::Update(const FmpeUpdateOptions &config,
      // z is the new parameter value.

      tot_linear_objf_impr += (z-x) * (p-n); // objf impr based on linear assumption.
-      proj_(i,j) = z;
+      projT_(i,j) = z;
+      if (z*x < 0) changed++;
    }
  }
  KALDI_LOG << "Objf impr (assuming linear) is " << tot_linear_objf_impr;
+  KALDI_LOG << ((100.0*changed)/(projT_.NumRows()*projT_.NumCols()))
+            << "% of matrix elements changed sign.";
+  return tot_linear_objf_impr;
 }

 // Note: we write the GMM first, without any other header.
@ -386,7 +488,7 @@ void Fmpe::Write(std::ostream &os, bool binary) const {
  gmm_.Write(os, binary);
  config_.Write(os, binary);
  // stddevs_ are derived, don't write them.
-  proj_.Write(os, binary);
+  projT_.Write(os, binary);
  C_.Write(os, binary);
  // contexts_ are derived from config, don't write them.
 }
@ -396,11 +498,59 @@ void Fmpe::Read(std::istream &is, bool binary) {
  gmm_.Read(is, binary);
  config_.Read(is, binary);
  ComputeStddevs(); // computed from gmm.
-  proj_.Read(is, binary);
+  projT_.Read(is, binary);
  C_.Read(is, binary);
  SetContexts(config_.context_expansion);
 }


+BaseFloat ComputeAmGmmFeatureDeriv(const AmDiagGmm &am_gmm,
+                                   const TransitionModel &trans_model,
+                                   const Posterior &posterior,
+                                   const MatrixBase<BaseFloat> &features,
+                                   Matrix<BaseFloat> *deriv) {
+  BaseFloat ans = 0.0;
+  KALDI_ASSERT(posterior.size() == static_cast<size_t>(features.NumRows()));
+  deriv->Resize(features.NumRows(), features.NumCols());
+  Vector<BaseFloat> temp_vec(features.NumCols());
+  for (size_t i = 0; i < posterior.size(); i++) {
+    for (size_t j = 0; j < posterior[i].size(); j++) {
+      int32 tid = posterior[i][j].first,  // transition identifier.
+          pdf_id = trans_model.TransitionIdToPdf(tid);
+      BaseFloat weight = posterior[i][j].second;
+      const DiagGmm &gmm = am_gmm.GetPdf(pdf_id);
+      Vector<BaseFloat> gauss_posteriors;
+      SubVector<BaseFloat> this_feat(features, i);
+      SubVector<BaseFloat> this_deriv(*deriv, i);
+      ans += weight * 
+          gmm.ComponentPosteriors(this_feat, &gauss_posteriors);
+      
+      gauss_posteriors.Scale(weight);
+      // The next line does: to i'th row of deriv, add
+      // means_invvars^T * gauss_posteriors,
+      // where each row of means_invvars is the mean times
+      // diagonal inverse covariance... after transposing,
+      // this becomes a weighted sum of these rows, weighted by
+      // the posteriors.  This comes from the term
+      //  feat^T * inv_var * mean
+      // in the objective function.
+      this_deriv.AddMatVec(1.0, gmm.means_invvars(), kTrans,
+                           gauss_posteriors, 1.0);
+
+      // The next line does temp_vec = inv_vars^T * gauss_posteriors,
+      // which sets temp_vec to a weighted sum of the rows of inv_vars,
+      // weighted by the Gaussian posteriors.
+      temp_vec.AddMatVec(1.0, gmm.inv_vars(), kTrans,
+                         gauss_posteriors, 0.0);
+      // Add to the derivative, -(this_feat .* temp_vec),
+      // which is the term that comes from the -0.5 * inv_var^T feat_sq,
+      // in the objective function (where inv_var is a vector, and feat_sq
+      // is a vector of squares of the feature values).
+      this_deriv.AddVecVec(-1.0, this_feat, temp_vec, 1.0);
+    }
+  }
+  return ans;
+}
+

 }  // End of namespace kaldi
--- a/src/transform/fmpe.h
+++ b/src/transform/fmpe.h
@ -22,6 +22,8 @@
 #include <vector>

 #include "gmm/am-diag-gmm.h"
+#include "hmm/transition-model.h"
+#include "util/kaldi-holder.h" // for Posterior

 namespace kaldi {

@ -104,8 +106,13 @@ class Fmpe {
  int32 NumGauss() const { return gmm_.NumGauss(); }
  int32 NumContexts() const { return static_cast<int32>(contexts_.size()); }

-  int32 ProjectionNumRows() { return FeatDim() * NumContexts(); }
-  int32 ProjectionNumCols() { return (FeatDim()+1) * NumGauss(); }
+  // Note: this returns the number of rows and columns in projT_,
+  // which is the transpose of the high->intermediate dimensional
+  // projection matrix.  This is the dimension we want for the
+  // stats.
+  int32 ProjectionTNumRows() { return (FeatDim()+1) * NumGauss(); }
+  int32 ProjectionTNumCols() { return FeatDim() * NumContexts(); }
+
  
  // Computes the fMPE feature offsets and outputs them.
  // You can add feat_in to this afterwards, if you want.
@ -131,9 +138,10 @@ class Fmpe {
  void Write(std::ostream &os, bool binary) const;
  void Read(std::istream &is, bool binary);

-  void Update(const FmpeUpdateOptions &config,
-              MatrixBase<BaseFloat> &proj_deriv_plus,
-              MatrixBase<BaseFloat> &proj_deriv_minus);
+  // Returns total objf improvement, based on linear assumption.
+  BaseFloat Update(const FmpeUpdateOptions &config,
+                   MatrixBase<BaseFloat> &proj_deriv_plus,
+                   MatrixBase<BaseFloat> &proj_deriv_minus);
  
 private:
  void SetContexts(std::string context_str);
@ -180,8 +188,9 @@ class Fmpe {
  // variances of the GMM -- computed to avoid taking a square root
  // in the fMPE computation.   Derived variable-- not stored on
  // disk.
-  Matrix<BaseFloat> proj_; // The projection matrix, of dimension
-  // (FeatDim() * NumContexts()) x (NumGauss() * (FeatDim()+1))
+  Matrix<BaseFloat> projT_; // The transpose of the projection matrix;
+  // this is of dimension
+  // (NumGauss() * (FeatDim()+1)) x (FeatDim() * NumContexts()).
  
  TpMatrix<BaseFloat> C_; // Cholesky factor of the variance Sigma of
  // features around their mean (as estimated from GMM)... applied
@ -197,6 +206,17 @@ class Fmpe {
  
 };

+/// Computes derivatives of the likelihood of these states (weighted),
+/// w.r.t. the feature values.  Used in fMPE training.  Note, the
+/// weights "posterior" may be positive or negative-- for MMI, MPE,
+/// etc., they will typically be of both signs.  Will resize "deriv".
+/// Returns the sum of (GMM likelihood * weight), which may be used
+/// as an approximation to the objective function. 
+BaseFloat ComputeAmGmmFeatureDeriv(const AmDiagGmm &am_gmm,
+                                   const TransitionModel &trans_model,
+                                   const Posterior &posterior,
+                                   const MatrixBase<BaseFloat> &features,
+                                   Matrix<BaseFloat> *deriv);


 }  // End namespace kaldi
				`@ -0,0 +1 @@`
				`--use-energy=false # only non-default option.`