From 4fd9c20c6aac4ac035627590970dafdf1172d23c Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Sat, 19 Jul 2014 19:01:45 +0000 Subject: [PATCH] sandbox/language_id: getting VTLN model estimation working given a UBM. git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@4173 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8 --- egs/lre/v1/conf/mfcc_vtln.conf | 5 + egs/lre/v1/lid/train_lvtln_model.sh | 282 +++++++++++++++++++++++ egs/lre/v1/run.sh | 20 ++ egs/lre/v1/sid | 1 + egs/wsj/s5/steps/compute_cmvn_stats.sh | 4 +- egs/wsj/s5/steps/train_lvtln.sh | 6 +- egs/wsj/s5/utils/validate_data_dir.sh | 6 +- src/gmmbin/Makefile | 3 +- src/gmmbin/gmm-est-lvtln-trans.cc | 9 +- src/gmmbin/gmm-global-est-lvtln-trans.cc | 235 +++++++++++++++++++ src/transform/fmllr-diag-gmm.h | 2 +- 11 files changed, 559 insertions(+), 14 deletions(-) create mode 100644 egs/lre/v1/conf/mfcc_vtln.conf create mode 100755 egs/lre/v1/lid/train_lvtln_model.sh create mode 120000 egs/lre/v1/sid create mode 100644 src/gmmbin/gmm-global-est-lvtln-trans.cc diff --git a/egs/lre/v1/conf/mfcc_vtln.conf b/egs/lre/v1/conf/mfcc_vtln.conf new file mode 100644 index 000000000..4c0db12e9 --- /dev/null +++ b/egs/lre/v1/conf/mfcc_vtln.conf @@ -0,0 +1,5 @@ +--sample-frequency=8000 +--frame-length=20 # the default is 25. +--low-freq=20 # the default. +--high-freq=3700 # the default is zero meaning use the Nyquist (4k in this case). +--num-ceps=13 diff --git a/egs/lre/v1/lid/train_lvtln_model.sh b/egs/lre/v1/lid/train_lvtln_model.sh new file mode 100755 index 000000000..d1cf111aa --- /dev/null +++ b/egs/lre/v1/lid/train_lvtln_model.sh @@ -0,0 +1,282 @@ +#!/bin/bash + +# Copyright 2014 Daniel Povey +# Apache 2.0 + +# +# This training script computes some things you will need in order to +# extract VTLN-warped features. It takes as input the data directory +# and an already-trained diagonal-covariance UBM. 
Note: although this +# script is in the lid/ directory, because it is intended to be +# used in language identification, it uses features of the +# same type as those used in the speaker-id scripts (see ../sid/), +# i.e. double-delta features, rather than the "shifted delta cepstra" +# features commonly used in language id. +# +# This script works with either mfcc or plp features; for plp features, you will +# need to set the --base-feat-type option. Regardless, you will need to set the +# --mfcc-config or --plp-config option if your feature-extraction config is not +# called conf/${base_feat_type}.conf. The output of this script will be in +# $dir/final.lvtln and $dir/final.dubm and $dir/final.ali_dubm; the directory +# can be passed to ./get_vtln_warps.sh to get VTLN warps for a data directory, +# or (for data passed to this script) you can use the warping factors this +# script outputs in $dir/final.warp +# + +# Begin configuration. +stage=-4 # This allows restarting after partway, when something went wrong. +config= +cmd=run.pl +num_iters=15 # Number of iterations of training. +num_utt_lvtln_init=400; # number of utterances (subset) to initialize + # LVTLN transform. Not too critical. +min_warp=0.85 +max_warp=1.25 +warp_step=0.01 +base_feat_type=mfcc # or could be PLP. +mfcc_config=conf/mfcc.conf # default, can be overridden. +plp_config=conf/plp.conf # default, can be overridden. +logdet_scale=0.0 +subsample=5 # We use every 5th frame by default; this is more + # CPU-efficient. +min_gaussian_weight=0.0001 # does not matter; inherited from diag-ubm training script. +nj=4 +cleanup=true +num_gselect=15 +# End configuration. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh; +. 
parse_options.sh || exit 1; + +num_classes=$(perl -e "print int(1.5 + ($max_warp - $min_warp) / $warp_step);") || exit 1; +default_class=$(perl -e "print int(0.5 + (1.0 - $min_warp) / $warp_step);") || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 <data-dir> <ubm-dir> <exp-dir>" + echo "e.g.: $0 data/train_vtln exp/diag_ubm_vtln exp/vtln" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." + echo " --nj # number of jobs to use (default 4)" + echo " --config # config containing options" + echo " --stage # stage to do partial re-run from." + echo " --num-iters # number of iterations of training" + echo " --base-feat-type # mfcc or plp, mfcc is default" + echo " --mfcc-config # config for MFCC extraction, default is" + echo " # conf/mfcc.conf" + echo " --plp-config # config for PLP extraction, default is" + echo " # conf/plp.conf" + exit 1; +fi + +data=$1 +ubmdir=$2 +dir=$3 + +for f in $data/feats.scp $ubmdir/final.dubm; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + + +mkdir -p $dir/log +echo $nj > $dir/num_jobs + +sdata=$data/split$nj; +split_data.sh $data $nj || exit 1; + +cmvn_sliding_opts="--norm-vars=false --center=true --cmn-window=300" +# don't change $cmvn_sliding_opts, it should probably match the +# options used in ../sid/train_diag_ubm.sh. +sifeats="ark,s,cs:add-deltas scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding $cmvn_sliding_opts ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |" + + +# for the subsets of features that we use to estimate the linear transforms, we +# don't bother with CMN. This will give us wrong offsets on the transforms, +# but it won't matter because we will allow an arbitrary bias term when we apply +# these transforms. + +# you need to define CLASS when invoking $cmd on featsub_warped. 
+featsub_warped="ark:add-deltas ark:$dir/feats.CLASS.ark ark:- | select-voiced-frames ark:- scp,s,cs:$data/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |" +featsub_unwarped="ark:add-deltas ark:$dir/feats.$default_class.ark ark:- | select-voiced-frames ark:- scp,s,cs:$data/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |" + + +if [ -f $data/utt2warp ]; then + echo "$0: source data directory $data appears to already have VTLN."; + exit 1; +fi + +# create a small subset of utterances for purposes of initializing the LVTLN transform +# utils/shuffle_list.pl is deterministic, unlike sort -R. +cat $data/utt2spk | awk '{print $1}' | utils/shuffle_list.pl | \ + head -n $num_utt_lvtln_init > $dir/utt_subset + +if [ $stage -le -4 ]; then + echo "$0: computing warped subset of features" + if [ -f $data/segments ]; then + echo "$0 [info]: segments file exists: using that." + subset_feats="utils/filter_scp.pl $dir/utt_subset $data/segments | extract-segments scp:$data/wav.scp - ark:- " + else + echo "$0 [info]: no segments file exists: using wav.scp directly." + subset_feats="utils/filter_scp.pl $dir/utt_subset $data/wav.scp | wav-copy scp:- ark:- " + fi + rm $dir/.error 2>/dev/null + for c in $(seq 0 $[$num_classes-1]); do + this_warp=$(perl -e "print ($min_warp + ($c*$warp_step));") + config_name=${base_feat_type}_config # e.g. mfcc_config or plp_config + this_config=$(eval echo \$$config_name) # e.g. conf/mfcc.conf or conf/plp.conf by default. + $cmd $dir/log/compute_warped_feats.$c.log \ + $subset_feats \| compute-${base_feat_type}-feats --verbose=2 \ + --config=$this_config --vtln-warp=$this_warp ark:- ark:- \| \ + copy-feats --compress=true ark:- ark:$dir/feats.$c.ark || touch $dir/.error & + done + wait; + if [ -f $dir/.error ]; then + echo "$0: Computing warped features failed: check $dir/log/compute_warped_feats.*.log" + exit 1; + fi +fi + +if ! 
utils/filter_scp.pl $dir/utt_subset $data/feats.scp | \ + compare-feats --threshold=0.98 scp:- ark:$dir/feats.$default_class.ark >&/dev/null; then + echo "$0: features stored on disk differ from those computed with no warping." + echo " Possibly your feature type is wrong (--base-feat-type option)" + exit 1; +fi + +if [ -f $data/segments ]; then + subset_utts="ark:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |" +else + echo "$0 [info]: no segments file exists: using wav.scp directly." + subset_utts="ark:wav-copy scp:$sdata/JOB/wav.scp ark:- |" +fi + +if [ $stage -le -3 ]; then + echo "$0: initializing base LVTLN transforms in $dir/0.lvtln (ignore warnings below)" + dim=$(feat-to-dim "$featsub_unwarped" - ) || exit 1; + + $cmd $dir/log/init_lvtln.log \ + gmm-init-lvtln --dim=$dim --num-classes=$num_classes --default-class=$default_class \ + $dir/0.lvtln || exit 1; + + for c in $(seq 0 $[$num_classes-1]); do + this_warp=$(perl -e "print ($min_warp + ($c*$warp_step));") + orig_feats=ark:$dir/feats.$default_class.ark + warped_feats=ark:$dir/feats.$c.ark + logfile=$dir/log/train_special.$c.log + this_featsub_warped="$(echo $featsub_warped | sed s/CLASS/$c/)" + if ! gmm-train-lvtln-special --warp=$this_warp --normalize-var=true \ + $c $dir/0.lvtln $dir/0.lvtln \ + "$featsub_unwarped" "$this_featsub_warped" 2>$logfile; then + echo "$0: Error training LVTLN transform, see $logfile"; + exit 1; + fi + done + rm $dir/final.lvtln 2>/dev/null + ln -s 0.lvtln $dir/final.lvtln +fi + +cp $ubmdir/final.dubm $dir/0.dubm + +if [ $stage -le -2 ]; then + echo "$0: computing Gaussian selection info." + + $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ + gmm-gselect --n=$num_gselect $ubmdir/final.dubm "$sifeats" \ + "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; +fi + + +if [ $stage -le -1 ]; then + echo "$0: computing initial LVTLN transforms" # do this per-utt. 
+ + $cmd JOB=1:$nj $dir/log/lvtln.0.JOB.log \ + gmm-global-gselect-to-post $dir/0.dubm "$sifeats" \ + "ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark:- \| \ + gmm-global-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 \ + $dir/0.dubm $dir/0.lvtln "$sifeats" ark,s,cs:- ark:$dir/trans.0.JOB ark,t:$dir/warp.0.JOB || exit 1 + + # consolidate the warps into one file. + for j in $(seq $nj); do cat $dir/warp.0.$j; done > $dir/warp.0 + rm $dir/warp.0.* +fi + + +x=0 +while [ $x -lt $num_iters ]; do + feats="$sifeats transform-feats ark:$dir/trans.$x.JOB ark:- ark:- |" + + # First update the model. + if [ $stage -le $x ]; then + echo "$0: Updating model on pass $x" + # Accumulate stats. + $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \ + gmm-global-acc-stats "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \ + $dir/$x.dubm "$feats" $dir/$x.JOB.acc || exit 1; + + $cmd $dir/log/update.$x.log \ + gmm-global-est --remove-low-count-gaussians=false --min-gaussian-weight=$min_gaussian_weight \ + $dir/$x.dubm "gmm-global-sum-accs - $dir/$x.*.acc|" \ + $dir/$[$x+1].dubm || exit 1; + $cleanup && rm $dir/$x.*.acc $dir/$x.dubm + fi + + # Now update the LVTLN transforms (and warps.) + if [ $stage -le $x ]; then + echo "$0: re-estimating LVTLN transforms on pass $x" + $cmd JOB=1:$nj $dir/log/lvtln.$x.JOB.log \ + gmm-global-gselect-to-post $dir/$[$x+1].dubm "$feats" \ + "ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark:- \| \ + gmm-global-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 \ + $dir/$[$x+1].dubm $dir/0.lvtln "$sifeats" ark,s,cs:- \ + ark:$dir/trans.$[$x+1].JOB ark,t:$dir/warp.$[$x+1].JOB || exit 1 + + # consolidate the warps into one file. 
+ for j in $(seq $nj); do cat $dir/warp.$[$x+1].$j; done > $dir/warp.$[$x+1] + rm $dir/warp.$[$x+1].* + $cleanup && rm $dir/trans.$x.* + fi + x=$[$x+1] +done + +feats="$sifeats transform-feats ark:$dir/trans.$x.JOB ark:- ark:- |" + +if [ $stage -le $x ]; then + # Accumulate stats for "alignment model"-- this model is computed with the + # speaker-independent features, but matches Gaussian-for-Gaussian with the + # final speaker-adapted model. + $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \ + gmm-global-acc-stats-twofeats "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \ + $dir/$x.dubm "$feats" "$sifeats" $dir/$x.JOB.acc || exit 1 + [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1; + # Update model. + $cmd $dir/log/est_alimdl.log \ + gmm-global-est --min-gaussian-weight=$min_gaussian_weight \ + --remove-low-count-gaussians=false $dir/$x.dubm \ + "gmm-global-sum-accs - $dir/$x.*.acc|" $dir/$x.ali_dubm || exit 1; + $cleanup && rm $dir/$x.*.acc +fi + +if true; then # Diagnostics + ln -sf warp.$x $dir/final.warp + if [ -f $data/spk2gender ]; then + # To make it easier to eyeball the male and female speakers' warps + # separately, separate them out. + for g in m f; do # means: for gender in male female + cat $dir/final.warp | \ + utils/filter_scp.pl <(grep -w $g $data/spk2gender | awk '{print $1}') > $dir/final.warp.$g + echo -n "The last few warp factors for gender $g are: " + tail -n 10 $dir/final.warp.$g | awk '{printf("%s ", $2);}'; + echo + done + fi +fi + +ln -sf $x.dubm $dir/final.dubm +ln -sf $x.ali_dubm $dir/final.ali_dubm +ln -sf 0.lvtln $dir/final.lvtln + +# Summarize warning messages... 
+utils/summarize_warnings.pl $dir/log + +echo "$0: Done training LVTLN model in $dir" diff --git a/egs/lre/v1/run.sh b/egs/lre/v1/run.sh index af577dd0b..d21f13d9e 100755 --- a/egs/lre/v1/run.sh +++ b/egs/lre/v1/run.sh @@ -54,6 +54,26 @@ local/split_long_utts.sh --max-utt-len 120 data/train_unsplit data/train # max_voiced=3000 # local/vad_split_utts.sh --max-voiced $max_voiced data/train_unsplit $mfccdir data/train + # Vtln-related things: + # We'll use a subset of utterances to train the GMM we'll use for VTLN + # warping. + utils/subset_data_dir.sh data/train 5000 data/train_5k_novtln + + # for the features we use to estimate VTLN warp factors, we use more cepstra + # (13 instead of just 7); this needs to be tuned. + steps/make_mfcc.sh --mfcc-config conf/mfcc_vtln.conf --nj 50 --cmd "$train_cmd" \ + data/train_5k_novtln exp/make_mfcc $mfccdir + lid/compute_vad_decision.sh data/train_5k_novtln exp/make_mfcc $mfccdir + # note, we're using the speaker-id version of the train_diag_ubm.sh script, which + # uses double-delta instead of SDC features. We train a 256-Gaussian UBM; this + # has to be tuned. 
+ sid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train_5k_novtln 256 \ + exp/diag_ubm_vtln + lid/train_lvtln_model.sh --mfcc-config conf/mfcc_vtln.conf --nj 30 --cmd "$train_cmd" \ + data/train_5k_novtln exp/diag_ubm_vtln exp/vtln + +) + steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 100 --cmd "$train_cmd" \ data/train exp/make_mfcc $mfccdir steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \ diff --git a/egs/lre/v1/sid b/egs/lre/v1/sid new file mode 120000 index 000000000..893a12f30 --- /dev/null +++ b/egs/lre/v1/sid @@ -0,0 +1 @@ +../../sre08/v1/sid \ No newline at end of file diff --git a/egs/wsj/s5/steps/compute_cmvn_stats.sh b/egs/wsj/s5/steps/compute_cmvn_stats.sh index ebcc072c8..dddec0b9a 100755 --- a/egs/wsj/s5/steps/compute_cmvn_stats.sh +++ b/egs/wsj/s5/steps/compute_cmvn_stats.sh @@ -23,11 +23,11 @@ echo "$0 $@" # Print the command line for logging fake=false two_channel=false -if [ $1 == "--fake" ]; then +if [ "$1" == "--fake" ]; then fake=true shift fi -if [ $1 == "--two-channel" ]; then +if [ "$1" == "--two-channel" ]; then two_channel=true shift fi diff --git a/egs/wsj/s5/steps/train_lvtln.sh b/egs/wsj/s5/steps/train_lvtln.sh index e5ade3ac4..a26fb1575 100755 --- a/egs/wsj/s5/steps/train_lvtln.sh +++ b/egs/wsj/s5/steps/train_lvtln.sh @@ -5,7 +5,7 @@ # This training script trains linear-VTLN models starting from an existing # system based on either LDA+MLLT or delta+delta-delta features. # Works with either mfcc or plp features, but you need to set the -# --base-feature-type option. +# --base-feat-type option. # The resulting system can be used with align_lvtln.sh and/or decode_lvtln.sh # to get VTLN warping factors for data, for warped data extraction, or (for # the training data) you can use the warping factors this script outputs @@ -65,7 +65,7 @@ alidir=$5 dir=$6 for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt $data/wav.scp; do - [ ! 
-f $f ] && echo "train_deltas.sh: no such file $f" && exit 1; + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done numgauss=$numleaves @@ -111,7 +111,7 @@ else fi if [ -f $data/utt2warp ]; then - echo "$0: source directory appears to already have VTLN."; + echo "$0: source data directory $data appears to already have VTLN."; exit 1; fi diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh index 4f56ed3e2..9a80162ce 100755 --- a/egs/wsj/s5/utils/validate_data_dir.sh +++ b/egs/wsj/s5/utils/validate_data_dir.sh @@ -6,15 +6,15 @@ no_wav=false no_text=false for x in `seq 3`; do - if [ $1 == "--no-feats" ]; then + if [ "$1" == "--no-feats" ]; then no_feats=true shift; fi - if [ $1 == "--no-text" ]; then + if [ "$1" == "--no-text" ]; then no_text=true shift; fi - if [ $1 == "--no-wav" ]; then + if [ "$1" == "--no-wav" ]; then no_wav=true shift; fi diff --git a/src/gmmbin/Makefile b/src/gmmbin/Makefile index 3eba1f830..cf637d189 100644 --- a/src/gmmbin/Makefile +++ b/src/gmmbin/Makefile @@ -27,7 +27,8 @@ BINFILES = gmm-init-mono gmm-est gmm-acc-stats-ali gmm-align \ gmm-est-basis-fmllr-gpost gmm-latgen-tracking gmm-latgen-faster-parallel \ gmm-est-fmllr-raw gmm-est-fmllr-raw-gpost gmm-global-init-from-feats \ gmm-global-info gmm-latgen-faster-regtree-fmllr gmm-est-fmllr-global \ - gmm-acc-mllt-global gmm-transform-means-global gmm-global-gselect-to-post + gmm-acc-mllt-global gmm-transform-means-global gmm-global-gselect-to-post \ + gmm-global-est-lvtln-trans OBJFILES = diff --git a/src/gmmbin/gmm-est-lvtln-trans.cc b/src/gmmbin/gmm-est-lvtln-trans.cc index 913aea6bb..45aa90596 100644 --- a/src/gmmbin/gmm-est-lvtln-trans.cc +++ b/src/gmmbin/gmm-est-lvtln-trans.cc @@ -1,6 +1,7 @@ // gmmbin/gmm-est-lvtln-trans.cc // Copyright 2009-2011 Microsoft Corporation; Saarland University +// 2014 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -125,8 +126,8 @@ int 
main(int argc, char *argv[]) { const Matrix &feats = feature_reader.Value(utt); const GaussPost &gpost = gpost_reader.Value(utt); if (static_cast(gpost.size()) != feats.NumRows()) { - KALDI_WARN << "GauPost vector has wrong size " << (gpost.size()) - << " vs. " << (feats.NumRows()); + KALDI_WARN << "GauPost vector has wrong size " << gpost.size() + << " vs. " << feats.NumRows(); num_other_error++; continue; } @@ -172,8 +173,8 @@ int main(int argc, char *argv[]) { const GaussPost &gpost = gpost_reader.Value(utt); if (static_cast(gpost.size()) != feats.NumRows()) { - KALDI_WARN << "GauPost has wrong size " << (gpost.size()) - << " vs. " << (feats.NumRows()); + KALDI_WARN << "GauPost has wrong size " << gpost.size() + << " vs. " << feats.NumRows(); num_other_error++; continue; } diff --git a/src/gmmbin/gmm-global-est-lvtln-trans.cc b/src/gmmbin/gmm-global-est-lvtln-trans.cc new file mode 100644 index 000000000..18bb51095 --- /dev/null +++ b/src/gmmbin/gmm-global-est-lvtln-trans.cc @@ -0,0 +1,235 @@ +// gmmbin/gmm-global-est-lvtln-trans.cc + +// Copyright 2009-2011 Microsoft Corporation; Saarland University +// 2014 Daniel Povey + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include +using std::string; +#include +using std::vector; + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "gmm/am-diag-gmm.h" +#include "hmm/transition-model.h" +#include "transform/lvtln.h" +#include "hmm/posterior.h" + +namespace kaldi { +void AccumulateForUtterance(const Matrix &feats, + const Posterior &post, + const DiagGmm &gmm, + FmllrDiagGmmAccs *spk_stats) { + KALDI_ASSERT(static_cast(post.size()) == feats.NumRows()); + for (size_t i = 0; i < post.size(); i++) { + std::vector gselect(post[i].size()); + Vector this_post(post[i].size()); + for (size_t j = 0; j < post[i].size(); j++) { + int32 g = post[i][j].first; + BaseFloat weight = post[i][j].second; + gselect[j] = g; + this_post(j) = weight; + } + spk_stats->AccumulateFromPosteriorsPreselect(gmm, gselect, + feats.Row(i), + this_post); + } +} + + +} + +int main(int argc, char *argv[]) { + try { + typedef kaldi::int32 int32; + using namespace kaldi; + const char *usage = + "Estimate linear-VTLN transforms, either per utterance or for " + "the supplied set of speakers (spk2utt option); this version\n" + "is for a global diagonal GMM (also known as a UBM). 
Reads posteriors\n" + "indicating Gaussian indexes in the UBM.\n" + "\n" + "Usage: gmm-global-est-lvtln-trans [options] " + " []\n" + "e.g.: gmm-global-est-lvtln-trans 0.ubm 0.lvtln '$feats' ark,s,cs:- ark:1.trans ark:1.warp\n" + "(where the will likely come from gmm-global-get-post or\n" + "gmm-global-gselect-to-post\n"; + + ParseOptions po(usage); + string spk2utt_rspecifier; + BaseFloat logdet_scale = 1.0; + std::string norm_type = "offset"; + po.Register("norm-type", &norm_type, "type of fMLLR applied (\"offset\"|\"none\"|\"diag\")"); + po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to " + "utterance-list map"); + po.Register("logdet-scale", &logdet_scale, "Scale on log-determinant term in auxiliary function"); + + po.Read(argc, argv); + + if (po.NumArgs() < 5 || po.NumArgs() > 6) { + po.PrintUsage(); + exit(1); + } + + string + model_rxfilename = po.GetArg(1), + lvtln_rxfilename = po.GetArg(2), + feature_rspecifier = po.GetArg(3), + post_rspecifier = po.GetArg(4), + trans_wspecifier = po.GetArg(5), + warp_wspecifier = po.GetOptArg(6); + + DiagGmm gmm; + ReadKaldiObject(model_rxfilename, &gmm); + LinearVtln lvtln; + ReadKaldiObject(lvtln_rxfilename, &lvtln); + + + RandomAccessPosteriorReader post_reader(post_rspecifier); + + double tot_lvtln_impr = 0.0, tot_t = 0.0; + + BaseFloatMatrixWriter transform_writer(trans_wspecifier); + + BaseFloatWriter warp_writer(warp_wspecifier); + + std::vector class_counts(lvtln.NumClasses(), 0); + int32 num_done = 0, num_no_post = 0, num_other_error = 0; + if (spk2utt_rspecifier != "") { // per-speaker adaptation + SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); + RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); + + for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { + FmllrDiagGmmAccs spk_stats(lvtln.Dim()); + string spk = spk2utt_reader.Key(); + const vector &uttlist = spk2utt_reader.Value(); + for (size_t i = 0; i < uttlist.size(); i++) { + std::string utt = 
uttlist[i]; + if (!feature_reader.HasKey(utt)) { + KALDI_WARN << "Did not find features for utterance " << utt; + continue; + } + if (!post_reader.HasKey(utt)) { + KALDI_WARN << "Did not find posteriors for utterance " << utt; + num_no_post++; + continue; + } + const Matrix &feats = feature_reader.Value(utt); + const Posterior &post = post_reader.Value(utt); + if (static_cast(post.size()) != feats.NumRows()) { + KALDI_WARN << "Posterior vector has wrong size " << post.size() + << " vs. " << feats.NumRows(); + num_other_error++; + continue; + } + + AccumulateForUtterance(feats, post, gmm, &spk_stats); + + num_done++; + } // end looping over all utterances of the current speaker + + BaseFloat impr, spk_tot_t; + { // Compute the transform and write it out. + Matrix transform(lvtln.Dim(), lvtln.Dim()+1); + int32 class_idx; + lvtln.ComputeTransform(spk_stats, + norm_type, + logdet_scale, + &transform, + &class_idx, + NULL, + &impr, + &spk_tot_t); + class_counts[class_idx]++; + transform_writer.Write(spk, transform); + if (warp_wspecifier != "") + warp_writer.Write(spk, lvtln.GetWarp(class_idx)); + } + KALDI_LOG << "For speaker " << spk << ", auxf-impr from LVTLN is " + << (impr/spk_tot_t) << ", over " << spk_tot_t << " frames."; + tot_lvtln_impr += impr; + tot_t += spk_tot_t; + } // end looping over speakers + } else { // per-utterance adaptation + SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); + for (; !feature_reader.Done(); feature_reader.Next()) { + string utt = feature_reader.Key(); + if (!post_reader.HasKey(utt)) { + KALDI_WARN << "Did not find posterior for utterance " + << utt; + num_no_post++; + continue; + } + const Matrix &feats = feature_reader.Value(); + const Posterior &post = post_reader.Value(utt); + + if (static_cast(post.size()) != feats.NumRows()) { + KALDI_WARN << "Posterior has wrong size " << post.size() + << " vs. 
" << feats.NumRows(); + num_other_error++; + continue; + } + num_done++; + + FmllrDiagGmmAccs spk_stats(lvtln.Dim()); + + AccumulateForUtterance(feats, post, gmm, + &spk_stats); + BaseFloat impr, utt_tot_t = spk_stats.beta_; + { // Compute the transform and write it out. + Matrix transform(lvtln.Dim(), lvtln.Dim()+1); + int32 class_idx; + lvtln.ComputeTransform(spk_stats, + norm_type, + logdet_scale, + &transform, + &class_idx, + NULL, + &impr, + &utt_tot_t); + class_counts[class_idx]++; + transform_writer.Write(utt, transform); + if (warp_wspecifier != "") + warp_writer.Write(utt, lvtln.GetWarp(class_idx)); + } + + KALDI_LOG << "For utterance " << utt << ", auxf-impr from LVTLN is " + << (impr/utt_tot_t) << ", over " << utt_tot_t << " frames."; + tot_lvtln_impr += impr; + tot_t += utt_tot_t; + } + } + + { + std::ostringstream s; + for (size_t i = 0; i < class_counts.size(); i++) + s << ' ' << class_counts[i]; + KALDI_LOG << "Distribution of classes is: " << s.str(); + } + + KALDI_LOG << "Done " << num_done << " files, " << num_no_post + << " with no posteriors, " << num_other_error << " with other errors."; + KALDI_LOG << "Overall LVTLN auxf impr per frame is " + << (tot_lvtln_impr / tot_t) << " over " << tot_t << " frames."; + return 0; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + diff --git a/src/transform/fmllr-diag-gmm.h b/src/transform/fmllr-diag-gmm.h index 90211de9b..1c203bdc8 100644 --- a/src/transform/fmllr-diag-gmm.h +++ b/src/transform/fmllr-diag-gmm.h @@ -100,7 +100,7 @@ class FmllrDiagGmmAccs: public AffineXformStats { const VectorBase &posteriors); /// Accumulate stats for a GMM, given supplied posteriors. The "posteriors" - /// vector should be have the same size as "gselect".n + /// vector should have the same size as "gselect". void AccumulateFromPosteriorsPreselect( const DiagGmm &gmm, const std::vector &gselect,