sandbox/online: merging changes from trunk

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/online@4243 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2014-08-03 01:07:56 +00:00
Parent c45978c068 3ef595c743
Commit 3b2a6582b6
61 changed files with 2488 additions and 324 deletions

View File

@ -25,6 +25,8 @@ do
utils/fix_data_dir.sh $data_dir/$split
utils/validate_data_dir.sh $data_dir/$split
rm $data_dir/$split/*.tmp
if ls $data_dir/$split/*.tmp &> /dev/null; then
rm $data_dir/$split/*.tmp
fi
done
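Note on the guard above: checking with ls first keeps rm from failing (and from aborting the script if it is ever run under set -e) when no *.tmp files are present. A roughly equivalent one-liner under bash's default globbing, sketched here purely for comparison, is:

    rm -f $data_dir/$split/*.tmp    # rm -f stays silent and exits 0 for nonexistent operands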

View File

@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Gets lattice oracles
if [ $# -lt 3 ]; then
echo "Specify lattice dir, symbol table and text file for partition"
exit 1;
fi
latticeDir=$1
textFile=$3
symTable=$2
oracleDir=$latticeDir/oracle
echo $latticeDir
echo $oracleDir
. path.sh
if [ ! -f $textFile -o ! -f $symTable -o ! -d $latticeDir ]; then
echo "Required files not found"
exit 1;
fi
mkdir -p $oracleDir
cat $textFile | sed 's:\[laughter\]::g' | sed 's:\[noise\]::g' | \
utils/sym2int.pl -f 2- $symTable | \
$KALDI_ROOT/src/latbin/lattice-oracle --word-symbol-table=$symTable "ark:gunzip -c $latticeDir/lat.*.gz|" ark:- ark,t:$oracleDir/oracle.tra 2>$oracleDir/oracle.log
sort -k1,1 -u $oracleDir/oracle.tra -o $oracleDir/oracle.tra
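For reference, the run script later in this commit calls this as, for example:

    local/get_oracle.sh exp/tri5a/decode_dev exp/tri5a/graph/words.txt data/dev/text

The per-utterance oracle transcripts end up in $oracleDir/oracle.tra, and lattice-oracle's log output (including its overall error summary) is redirected to $oracleDir/oracle.log.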

View File

@ -10,7 +10,8 @@ if [ $# -lt 3 ]; then
exit 1
fi
prunebeam=2
prunebeam=50
maxProcesses=10
latdir=$1
decode_dir=$2
@ -33,6 +34,7 @@ then
mkdir -p $latdir/$compiledLatDir
mkdir -p $latdir/$preplfLatDir
runningProcesses=0
for l in $decode_dir/lat.*.gz
do
(
@ -69,11 +71,19 @@ then
continue
fi
# Replace laugh, unk, oov, noise with eps
echo "$line" | awk '{if ($3 == 2038 || $3 == 2039 || $3 == 2040) {$3 = 0; $4 = 0} print}' >> "$latdir/$rawLatDir/$fileName.lat"
echo "$line" | awk '{if ($3 == 1157 || $3 == 5327 || $3 == 5328 || $3 == 5329 || $3 ==5326) {$3 = 0; $4 = 0} print}' >> "$latdir/$rawLatDir/$fileName.lat"
done < $bname.ark.fst
echo "Done isolating lattices"
fi
) &
runningProcesses=$((runningProcesses+1))
echo "#### Processes running = " $runningProcesses " ####"
if [ $runningProcesses -eq $maxProcesses ]; then
echo "#### Waiting for slot ####"
wait
runningProcesses=0
echo "#### Done waiting ####"
fi
done
wait
rm $latdir/*.bin
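A note on the runningProcesses/maxProcesses logic added throughout this file: wait with no arguments blocks until all outstanding background jobs have finished, so the loop runs in batches of at most $maxProcesses jobs (a new batch only starts once the whole previous batch completes) rather than keeping a constant number of jobs in flight. A minimal sketch of the pattern, with illustrative names:

    runningProcesses=0
    for f in $inputs; do
      ( process "$f" ) &
      runningProcesses=$((runningProcesses+1))
      if [ $runningProcesses -eq $maxProcesses ]; then
        wait; runningProcesses=0
      fi
    done
    wait   # pick up the final, possibly partial batch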
@ -82,6 +92,7 @@ then
if [ $stage -le 2 ]; then
#Compile lattices
runningProcesses=0
for l in $latdir/$rawLatDir/*.lat
do
(
@ -89,6 +100,14 @@ then
bname=${l##*/}
fstcompile --arc_type=log $latdir/$rawLatDir/$bname $latdir/$compiledLatDir/$bname
) &
runningProcesses=$((runningProcesses+1))
echo "#### Processes running = " $runningProcesses " ####"
if [ $runningProcesses -eq $maxProcesses ]; then
echo "#### Waiting for slot ####"
wait
runningProcesses=0
echo "#### Done waiting ####"
fi
done
wait
echo "Done compiling lattices."
@ -99,6 +118,7 @@ then
# Create a dummy FST with one state and no arcs first
echo 0 | fstcompile --arc_type=log - $latdir/$preplfLatDir/dummy.fst
# Push Lattice weights towards initial state
runningProcesses=0
for l in $latdir/$compiledLatDir/*.lat
do
(
@ -112,6 +132,14 @@ then
fstrmepsilon - | \
fstreverse - $latdir/$preplfLatDir/$bname
) &
runningProcesses=$((runningProcesses+1))
echo "#### Processes running = " $runningProcesses " ####"
if [ $runningProcesses -eq $maxProcesses ]; then
echo "#### Waiting for slot ####"
wait
runningProcesses=0
echo "#### Done waiting ####"
fi
done
wait
# Let's take a moment to thank the dummy FST for playing its

View File

@ -0,0 +1,32 @@
#!/usr/bin/env bash
outDir=exp/lat
mkdir -p $outDir
stage=1
if [ $stage -lt 1 ]; then
# First convert all lattices into the pruned, minimized version
decodeDir=exp/tri5a/decode_dev
acousticScale=0.8333
local/latconvert.sh $outDir $decodeDir $acousticScale
decodeDir=exp/tri5a/decode_test
acousticScale=0.8333
local/latconvert.sh $outDir $decodeDir $acousticScale
fi
if [ $stage -lt 2 ]; then
# Get oracles
latticeDir=exp/tri5a/decode_dev
textFile=data/dev/text
symTable=exp/tri5a/graph/words.txt
local/get_oracle.sh $latticeDir $symTable $textFile
latticeDir=exp/tri5a/decode_test
textFile=data/test/text
symTable=exp/tri5a/graph/words.txt
local/get_oracle.sh $latticeDir $symTable $textFile
fi

View File

@ -56,14 +56,14 @@ steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir
# ones (mostly uh-huh). So take the 100k shortest ones, and then take 10k random
# utterances from those.
steps/train_mono.sh --nj 10 --cmd "$train_cmd" \
steps/train_mono.sh --nj 10 --cmd "$train_cmd" \
data/train data/lang exp/mono0a
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/mono0a exp/mono0a_ali || exit 1;
steps/train_deltas.sh --cmd "$train_cmd" \
1000 10000 data/train data/lang exp/mono0a_ali exp/tri1 || exit 1;
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/mono0a exp/mono0a_ali || exit 1;
steps/train_deltas.sh --cmd "$train_cmd" \
1000 10000 data/train data/lang exp/mono0a_ali exp/tri1 || exit 1;
(utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph

View File

@ -153,9 +153,14 @@ steps/train_sat.sh --cmd "$train_cmd" \
(
utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri5a/graph data/dev exp/tri5a/decode_dev
exp/tri5a/graph data/dev exp/tri5a/decode_dev
)&
#
# steps/cleanup/find_bad_utts.sh --nj 200 --cmd "$train_cmd" data/train data/lang \
# exp/tri5a exp/tri5a_cleanup
# local/run_for_spkid.sh
# we don't have the results for the step below yet.

View File

@ -118,14 +118,17 @@ exit 0
%WER 1.80 [ 226 / 12533, 29 ins, 44 del, 153 sub ] exp/nnet4c/decode/wer_4
%WER 8.49 [ 1064 / 12533, 80 ins, 175 del, 809 sub ] exp/nnet4c/decode_ug/wer_11
%WER 1.61 [ 202 / 12533, 25 ins, 47 del, 130 sub ] exp/nnet4d/decode/wer_5
%WER 8.17 [ 1024 / 12533, 83 ins, 179 del, 762 sub ] exp/nnet4d/decode_ug/wer_11
%WER 1.68 [ 211 / 12533, 29 ins, 39 del, 143 sub ] exp/nnet4d/decode/wer_4
%WER 8.40 [ 1053 / 12533, 101 ins, 153 del, 799 sub ] exp/nnet4d/decode_ug/wer_10
%WER 1.63 [ 204 / 12533, 29 ins, 42 del, 133 sub ] exp/nnet4d_gpu/decode/wer_4
%WER 8.11 [ 1016 / 12533, 80 ins, 168 del, 768 sub ] exp/nnet4d_gpu/decode_ug/wer_10
%WER 1.74 [ 218 / 12533, 25 ins, 48 del, 145 sub ] exp/nnet4d_gpu/decode/wer_6
%WER 8.39 [ 1051 / 12533, 106 ins, 149 del, 796 sub ] exp/nnet4d_gpu/decode_ug/wer_10
%WER 1.63 [ 204 / 12533, 29 ins, 42 del, 133 sub ] exp/nnet4d_gpu/decode/wer_4
%WER 8.11 [ 1016 / 12533, 80 ins, 168 del, 768 sub ] exp/nnet4d_gpu/decode_ug/wer_10
%WER 1.53 [ 192 / 12533, 22 ins, 42 del, 128 sub ] exp/nnet4d2/decode/wer_3
%WER 8.06 [ 1010 / 12533, 79 ins, 152 del, 779 sub ] exp/nnet4d2/decode_ug/wer_8
%WER 1.51 [ 189 / 12533, 25 ins, 34 del, 130 sub ] exp/nnet4d2_gpu/decode/wer_3
%WER 7.97 [ 999 / 12533, 78 ins, 152 del, 769 sub ] exp/nnet4d2_gpu/decode_ug/wer_8
%WER 1.37 [ 172 / 12533, 14 ins, 36 del, 122 sub ] exp/nnet4e_gpu/decode/wer_3
%WER 8.03 [ 1006 / 12533, 61 ins, 179 del, 766 sub ] exp/nnet4e_gpu/decode_ug/wer_8
@ -153,8 +156,8 @@ exit 0
# Discriminatively trained system (using p-norm rather than tanh nonlinearities, using SMBR, on GPU)
%WER 1.56 [ 195 / 12533, 28 ins, 31 del, 136 sub ] exp/nnet5d_mpe_gpu/decode_epoch2/wer_2
%WER 8.35 [ 1047 / 12533, 77 ins, 171 del, 799 sub ] exp/nnet5d_mpe_gpu/decode_ug_epoch4/wer_10
%WER 1.74 [ 218 / 12533, 25 ins, 48 del, 145 sub ] exp/nnet5d_mpe_gpu/decode_epoch1/wer_6
%WER 8.40 [ 1053 / 12533, 108 ins, 148 del, 797 sub ] exp/nnet5d_mpe_gpu/decode_ug_epoch1/wer_10
# Discriminatively trained system on top of ensemble trained p-norm network (using SMBR, on GPU)
%WER 1.36 [ 170 / 12533, 15 ins, 34 del, 121 sub ] exp/nnet5e_mpe_gpu/decode_epoch2/wer_3

View File

@ -0,0 +1,62 @@
#!/bin/bash
# 4d2 is as 4d but adding perturbed training with multiplier=1.0
train_stage=-10
use_gpu=true
. cmd.sh
. ./path.sh
. utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet4d2_gpu
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet4d2
fi
if [ ! -f $dir/final.mdl ]; then
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--target-multiplier 1.0 \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--num-jobs-nnet 4 \
--num-epochs-extra 10 --add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
--cmd "$decode_cmd" \
--pnorm-input-dim 1000 \
--pnorm-output-dim 200 \
data/train data/lang exp/tri3b_ali $dir || exit 1;
fi
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test $dir/decode &
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test $dir/decode_ug
wait

egs/rm/s5/local/nnet2/run_5d.sh (new executable file, 126 lines)
View File

@ -0,0 +1,126 @@
#!/bin/bash
# This script demonstrates discriminative training of p-norm neural nets.
# It's on top of run_4d_gpu.sh which uses adapted 40-dimensional features.
# This version of the script uses GPUs. We distinguish it by putting "_gpu"
# at the end of the directory name.
use_gpu=true
stage=0
transform_dir=exp/tri3b_ali
. cmd.sh
. ./path.sh
. utils/parse_options.sh
[ ! -f $transform_dir/num_jobs ] && \
echo "Expected $transform_dir/num_jobs to exist" && exit 1;
nj_orig=$(cat $transform_dir/num_jobs)
# The queue options in this script are for the CLSP network, and might not work
# for you.
if $use_gpu; then
. ./cmd.sh
. ./path.sh
! cuda-compiled && cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
align_gpu_opts="-l gpu=1"
use_gpu_flag="--use-gpu yes"
train_parallel_opts="-l gpu=1"
train_num_threads=1
srcdir=exp/nnet4d_gpu
dir=exp/nnet5d_mpe_gpu
nj=$nj_orig
else
align_gpu_opts=
use_gpu_flag="--use-gpu no"
train_parallel_opts="-pe smp 6"
train_num_threads=6
srcdir=exp/nnet4d
dir=exp/nnet5d_mpe
if [ "$decode_cmd" != "run.pl" ]; then
nj=$[$nj_orig*5]; # use more jobs, or it will be slow in the alignment
# phase. But if we are just running everything on
# one machine this won't help us
else
nj=$nj_orig
fi
fi
if [ ! -f $srcdir/final.mdl ]; then
echo "$0: expected $srcdir/final.mdl to exist."
exit 1;
fi
# The denominator lattice creation currently doesn't use GPUs; that would be
# wasteful since the lattice determinization and graph search use up a fair
# amount of CPU, and we'd be idling the GPU much of the time.
# We specify 1G each for mem_free and ram_free, which is per thread... it
# will likely be less than the default. Increase the beam relative to the
# defaults; this is just for this RM setup, where the default beams will likely
# generate very thin lattices.
# Note: the transform-dir is important to
# specify, since this system is on top of fMLLR features.
if [ $stage -le 0 ]; then
steps/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G" \
--nj $nj --sub-split 20 --num-threads 6 --parallel-opts "-pe smp 6" \
--beam 20.0 --lattice-beam 10.0 \
--transform-dir $transform_dir \
data/train data/lang $srcdir ${srcdir}_denlats
fi
if [ $stage -le 1 ]; then
steps/nnet2/align.sh --cmd "$decode_cmd $align_gpu_opts" $use_gpu_flag \
--transform-dir $transform_dir \
--nj $nj data/train data/lang $srcdir ${srcdir}_ali
fi
if [ $stage -le 2 ]; then
steps/nnet2/train_discriminative.sh --cmd "$decode_cmd" \
--num-jobs-nnet 2 --transform-dir $transform_dir \
--num-threads "$train_num_threads" --parallel-opts "$train_parallel_opts" data/train data/lang \
${srcdir}_ali ${srcdir}_denlats $srcdir/final.mdl $dir
fi
if [ $stage -le 3 ]; then
for epoch in 1 2 3 4; do
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 --iter epoch$epoch \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test $dir/decode_epoch$epoch &
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 --iter epoch$epoch \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test $dir/decode_ug_epoch$epoch &
done
wait
fi
exit 0;
# The following are some test commands that I ran in order to verify that
# the neural-net splitting and excising code was working as intended.
# (
# acoustic_scale=0.1
# for criterion in smbr mmi mpfe; do
# for drop_frames in true false; do
# nnet-get-egs-discriminative --drop-frames=$drop_frames --criterion=$criterion --excise=true exp/tri5c_mpe/0.mdl 'ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:data/train/split8/1/utt2spk scp:data/train/split8/1/cmvn.scp "scp:head -n 40 data/train/split8/1/feats.scp|" ark:- | splice-feats --left-context=3 --right-context=3 ark:- ark:- | transform-feats exp/tri5c_mpe/final.mat ark:- ark:- | transform-feats --utt2spk=ark:data/train/split8/1/utt2spk ark:$transform_dir/trans.1 ark:- ark:- |' 'ark,s,cs:gunzip -c exp/${dir}_ali/ali.1.gz |' 'ark,s,cs:gunzip -c exp/${dir}_denlats/lat.1.gz|' "ark:|nnet-combine-egs-discriminative ark:- ark:1.egs"
# nnet-get-egs-discriminative --drop-frames=$drop_frames --criterion=$criterion --split=false --excise=false exp/tri5c_mpe/0.mdl 'ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:data/train/split8/1/utt2spk scp:data/train/split8/1/cmvn.scp "scp:head -n 40 data/train/split8/1/feats.scp|" ark:- | splice-feats --left-context=3 --right-context=3 ark:- ark:- | transform-feats exp/tri5c_mpe/final.mat ark:- ark:- | transform-feats --utt2spk=ark:data/train/split8/1/utt2spk ark:$transform_dir/trans.1 ark:- ark:- |' 'ark,s,cs:gunzip -c exp/${dir}_ali/ali.1.gz |' 'ark,s,cs:gunzip -c exp/${dir}_denlats/lat.1.gz|' ark:2.egs
# nnet-compare-hash-discriminative --acoustic-scale=$acoustic_scale --drop-frames=$drop_frames --criterion=$criterion $dir/final.mdl ark:1.egs ark:2.egs || exit 1;
# done
# done
# )

View File

@ -21,12 +21,15 @@ if $use_gpu; then
# This one is for training pnorm nnets on top of 40-dim + fMLLR features
# **THIS IS THE PRIMARY RECIPE**
local/nnet2/run_4d.sh --use-gpu true
# as above with 'perturbed training'. A bit better results, a bit slower.
local/nnet2/run_4d2.sh --use-gpu true
# This is discriminative training on top of 4c.
# This is discriminative training on top of 4c. (hardly helps)
local/nnet2/run_5c_gpu.sh
# This is discriminative training on top of 4d.
local/nnet2/run_5d_gpu.sh
local/nnet2/run_5d.sh --use-gpu true
else
# This example runs on top of "raw-fMLLR" features;
# you have to run local/run_raw_fmllr.sh first.
@ -42,9 +45,15 @@ else
# **THIS IS THE PRIMARY RECIPE (40-dim + fMLLR + p-norm neural net)**
local/nnet2/run_4d.sh --use-gpu false
# as above with 'perturbed training'. A bit better results, a bit slower.
local/nnet2/run_4d2.sh --use-gpu false
# This is discriminative training on top of 4c.
local/nnet2/run_5c.sh
# This is discriminative training on top of 4d.
local/nnet2/run_5d.sh --use-gpu false
# This is p-norm on top of raw-fMLLR.
#local/nnet2/run_4e.sh

View File

@ -146,6 +146,15 @@ steps/decode_fmllr.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
steps/align_fmllr.sh --nj 8 --cmd "$train_cmd" --use-graphs true \
data/train data/lang exp/tri3b exp/tri3b_ali
# # We have now added a script that will help you find portions of your data that
# # have bad transcripts, so you can filter them out. Below we demonstrate how to
# # run this script.
# steps/cleanup/find_bad_utts.sh --nj 20 --cmd "$train_cmd" data/train data/lang \
# exp/tri3b_ali exp/tri3b_cleanup
# # The following command will show you some of the hardest-to-align utterances in the data.
# head exp/tri3b_cleanup/all_info.sorted.txt
## MMI on top of tri3b (i.e. LDA+MLLT+SAT+MMI)
steps/make_denlats.sh --config conf/decode.config \
--nj 8 --cmd "$train_cmd" --transform-dir exp/tri3b_ali \

View File

@ -20,6 +20,9 @@
. ./path.sh ## Source the tools/utils (import the queue.pl)
nj=80
decode_nj=8
# Config:
gmmdir=exp/tri3
data_fmllr=data-fmllr-tri3
@ -69,10 +72,10 @@ if [ $stage -le 2 ]; then
steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \
$data_fmllr/train_tr90 $data_fmllr/train_cv10 data/lang $ali $ali $dir || exit 1;
# Decode (reuse HCLG graph)
steps/nnet/decode.sh --nj 8 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
--num-threads 3 --parallel-opts "-pe smp 4" \
$gmmdir/graph $data_fmllr/dev $dir/decode_dev || exit 1;
steps/nnet/decode.sh --nj 11 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
--num-threads 3 --parallel-opts "-pe smp 4" \
$gmmdir/graph $data_fmllr/test $dir/decode_test || exit 1;
fi
@ -87,9 +90,9 @@ acwt=0.1
if [ $stage -le 3 ]; then
# First we generate lattices and alignments:
steps/nnet/align.sh --nj 80 --cmd "$train_cmd" \
steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \
$data_fmllr/train data/lang $srcdir ${srcdir}_ali || exit 1;
steps/nnet/make_denlats.sh --nj 3 --sub-split 100 --cmd "$decode_cmd" --config conf/decode_dnn.config \
steps/nnet/make_denlats.sh --nj 6 --sub-split $nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--acwt $acwt $data_fmllr/train data/lang $srcdir ${srcdir}_denlats || exit 1;
fi
@ -99,11 +102,11 @@ if [ $stage -le 4 ]; then
$data_fmllr/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1
# Decode (reuse HCLG graph)
for ITER in 1; do
steps/nnet/decode.sh --nj 8 --cmd "$decode_cmd" --config conf/decode_dnn.config \
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--num-threads 3 --parallel-opts "-pe smp 4" \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph $data_fmllr/dev $dir/decode_dev || exit 1;
steps/nnet/decode.sh --nj 11 --cmd "$decode_cmd" --config conf/decode_dnn.config \
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--num-threads 3 --parallel-opts "-pe smp 4" \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph $data_fmllr/test $dir/decode_test || exit 1;
@ -117,9 +120,9 @@ acwt=0.1
if [ $stage -le 5 ]; then
# First we generate lattices and alignments:
steps/nnet/align.sh --nj 80 --cmd "$train_cmd" \
steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \
$data_fmllr/train data/lang $srcdir ${srcdir}_ali || exit 1;
steps/nnet/make_denlats.sh --nj 3 --sub-split 100 --cmd "$decode_cmd" --config conf/decode_dnn.config \
steps/nnet/make_denlats.sh --nj 6 --sub-split $nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--acwt $acwt $data_fmllr/train data/lang $srcdir ${srcdir}_denlats || exit 1;
fi
@ -129,11 +132,11 @@ if [ $stage -le 6 ]; then
$data_fmllr/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1
# Decode (reuse HCLG graph)
for ITER in 1 2 3 4; do
steps/nnet/decode.sh --nj 8 --cmd "$decode_cmd" --config conf/decode_dnn.config \
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--num-threads 3 --parallel-opts "-pe smp 4" \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph $data_fmllr/dev $dir/decode_dev || exit 1;
steps/nnet/decode.sh --nj 11 --cmd "$decode_cmd" --config conf/decode_dnn.config \
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--num-threads 3 --parallel-opts "-pe smp 4" \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph $data_fmllr/test $dir/decode_test || exit 1;

View File

@ -27,8 +27,9 @@ numGaussUBM=400
numLeavesSGMM=7000
numGaussSGMM=9000
decode_nj=5
feats_nj=10
train_nj=30
decode_nj=5
echo ============================================================================
echo " Data & Lexicon & Language Preparation "
@ -60,7 +61,7 @@ mfccdir=mfcc
for x in train dev test; do
steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 data/$x exp/make_mfcc/$x $mfccdir
steps/make_mfcc.sh --cmd "$train_cmd" --nj $feats_nj data/$x exp/make_mfcc/$x $mfccdir
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
done

View File

@ -272,6 +272,8 @@ steps/train_sat.sh --cmd "$train_cmd" \
) &
# This step is just to demonstrate the train_quick.sh script, in which we
# initialize the GMMs from the old system's GMMs.
steps/train_quick.sh --cmd "$train_cmd" \
4200 40000 data/train_si284 data/lang exp/tri3b_ali_si284 exp/tri4b || exit 1;

View File

@ -56,6 +56,7 @@ echo $nj > $dir/num_jobs
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.alimdl $dir 2>/dev/null
cp $srcdir/final.occs $dir;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.

View File

@ -42,6 +42,11 @@ lang=$2
srcdir=$3
dir=$4
for f in $data/text $lang/oov.int $srcdir/tree $srcdir/final.mdl; do
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done
oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
@ -57,6 +62,7 @@ cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

View File

@ -0,0 +1,165 @@
#!/bin/bash
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Computes training alignments using a model with delta or
# LDA+MLLT features. This version, rather than just using the
# text to align, computes mini-language models (unigram) from the text
# and a few common words in the LM, and allows
# Begin configuration section.
nj=4
cmd=run.pl
use_graphs=false
# Begin configuration.
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
acoustic_scale=0.1
beam=20.0
lattice_beam=10.0
transform_dir= # directory to find fMLLR transforms in.
top_n_words=100 # Number of common words that we compile into each graph (most frequent
# in $lang/text).
stage=0
cleanup=true
# End configuration options.
echo "$0 $@" # Print the command line for logging
[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "usage: $0 <data-dir> <lang-dir> <src-dir> <align-dir>"
echo "e.g.: $0 data/train data/lang exp/tri1 exp/tri1_ali"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --use-graphs true # use graphs in src-dir"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
data=$1
lang=$2
srcdir=$3
dir=$4
for f in $data/text $lang/oov.int $srcdir/tree $srcdir/final.mdl \
$lang/L_disambig.fst $lang/phones/disambig.int; do
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done
oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;
utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt <$data/text | \
awk '{for(x=2;x<=NF;x++) print $x;}' | sort | uniq -c | \
sort -rn > $dir/word_counts.int || exit 1;
num_words=$(awk '{x+=$1} END{print x}' < $dir/word_counts.int) || exit 1;
# print top-n words with their unigram probabilities.
head -n $top_n_words $dir/word_counts.int | awk -v tot=$num_words '{print $1/tot, $2;}' >$dir/top_words.int
utils/int2sym.pl -f 2 $lang/words.txt <$dir/top_words.int >$dir/top_words.txt
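# For illustration (counts hypothetical): if word_counts.int contains the line
# "250 1437" and the summed count over all words is 10000, the corresponding
# line of top_words.int is "0.025 1437"; top_words.txt is the same list with
# field 2 mapped back to the written word via words.txt.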
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
cp $srcdir/final.mat $srcdir/full.mat $dir
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
if [ -z "$transform_dir" ] && [ -f $srcdir/trans.1 ]; then
transform_dir=$srcdir
fi
if [ ! -z "$transform_dir" ]; then
echo "$0: using transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
nj_orig=$(cat $transform_dir/num_jobs)
if [ $nj -ne $nj_orig ]; then
# Copy the transforms into an archive with an index.
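# (trans.scp keeps the original archive keys, so transform-feats can look the
# transforms up via --utt2spk regardless of how the data is now split)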
for n in $(seq $nj_orig); do cat $transform_dir/trans.$n; done | \
copy-feats ark:- ark,scp:$dir/trans.ark,$dir/trans.scp || exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |"
else
# number of jobs matches with alignment dir.
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
fi
elif [ -f $srcdir/final.alimdl ]; then
echo "$0: **WARNING**: you seem to be using an fMLLR system as input,"
echo " but you are not providing the --transform-dir option during alignment."
fi
echo "$0: decoding $data using utterance-specific decoding graphs using model from $srcdir, output in $dir"
if [ $stage -le 0 ]; then
rm $dir/edits.*.txt $dir/aligned_ref.*.txt 2>/dev/null
$cmd JOB=1:$nj $dir/log/decode.JOB.log \
utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text \| \
steps/cleanup/make_utterance_fsts.pl $dir/top_words.int \| \
compile-train-graphs-fsts $scale_opts --read-disambig-syms=$lang/phones/disambig.int \
$dir/tree $dir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \
gmm-latgen-faster --acoustic-scale=$acoustic_scale --beam=$beam \
--lattice-beam=$lattice_beam --word-symbol-table=$lang/words.txt \
$dir/final.mdl ark:- "$feats" ark:- \| \
lattice-oracle ark:- "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|" \
ark,t:- ark,t:$dir/edits.JOB.txt \| \
utils/int2sym.pl -f 2- $lang/words.txt '>' $dir/aligned_ref.JOB.txt || exit 1;
fi
if [ $stage -le 1 ]; then
if [ -f $dir/edits.1.txt ]; then
for x in $(seq $nj); do cat $dir/edits.$x.txt; done > $dir/edits.txt
for x in $(seq $nj); do cat $dir/aligned_ref.$x.txt; done > $dir/aligned_ref.txt
else
echo "$0: warning: no file $dir/edits.1.txt, using previously concatenated file if present."
fi
# in case any utterances failed to align, get a filtered copy of $data/text.
utils/filter_scp.pl $dir/edits.txt < $data/text > $dir/text
cat $dir/text | awk '{print $1, (NF-1);}' > $dir/length.txt
n1=$(wc -l < $dir/edits.txt)
n2=$(wc -l < $dir/aligned_ref.txt)
n3=$(wc -l < $dir/text)
n4=$(wc -l < $dir/length.txt)
if [ $n1 -ne $n2 ] || [ $n2 -ne $n3 ] || [ $n3 -ne $n4 ]; then
echo "$0: mismatch in lengths of files:"
wc $dir/edits.txt $dir/aligned_ref.txt $dir/text $dir/length.txt
exit 1;
fi
# note: the format of all_info.txt is:
# <utterance-id> <number of errors> <reference-length> <decoded-output> <reference>
# with the fields separated by tabs, e.g.
# adg04_sr009_trn 1 12 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED
paste $dir/edits.txt \
<(awk '{print $2}' $dir/length.txt) \
<(awk '{$1="";print;}' <$dir/aligned_ref.txt) \
<(awk '{$1="";print;}' <$dir/text) > $dir/all_info.txt
sort -nr -k2 $dir/all_info.txt > $dir/all_info.sorted.txt
if $cleanup; then
rm $dir/edits.*.txt $dir/aligned_ref.*.txt
fi
fi

View File

@ -0,0 +1,45 @@
#!/usr/bin/perl -w
# makes unigram decoding-graph FSTs specific to each utterance, where the
# supplied top-n-words list together with the supervision text of the utterance are
# combined.
if (@ARGV != 1) {
print STDERR "Usage: make_utterance_fsts.pl top-words-file.txt < text-archive > fsts-archive\n" .
"e.g.: utils/sym2int.pl -f 2- data/lang/words.txt data/train/text | \\\n" .
" make_utterance_fsts.pl exp/foo/top_words.int | compile-train-graphs-fsts ... \n";
}
($top_words_file) = @ARGV;
open(F, "<$top_words_file") || die "opening $top_words_file";
%top_word_probs = ( );
while(<F>) {
@A = split;
(@A == 2 && $A[0] > 0.0) || die "Bad line $_ in $top_words_file";
$A[1] =~ m/^[0-9]+$/ || die "Expecting numeric word-ids in $top_words_file: $_\n";
$top_word_probs{$A[1]} += $A[0];
}
while (<STDIN>) {
@A = split;
$utterance_id = shift @A;
print "$utterance_id\n";
$num_words = @A + 0; # length of array @A
%word_probs = %top_word_probs;
foreach $w (@A) {
$w =~ m/^[0-9]+$/ || die "Expecting numeric word-ids as stdin: $_";
$word_probs{$w} += 1.0 / $num_words;
}
foreach $w (keys %word_probs) {
$prob = $word_probs{$w};
$prob > 0.0 || die "Word $w with bad probability $prob, utterance-id = $utterance_id\n";
$cost = -log($prob);
print "0 0 $w $w $cost\n";
}
$final_cost = -log(1.0 / $num_words);
print "0 $final_cost\n";
print "\n"; # Empty line terminates the FST in the text-archive format.
}
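To make the emitted format concrete, a small hypothetical case (word-ids and probabilities invented for illustration): with top_words.int containing the single line "0.5 7" and the input line "utt1 10 11 10" (three words), the script prints, with arcs in whatever order the hash yields and costs rounded here:

    utt1
    0 0 7 7 0.693147      (-log 0.5)
    0 0 10 10 0.405465    (-log 2/3)
    0 0 11 11 1.098612    (-log 1/3)
    0 1.098612            (final cost, -log 1/num_words)

followed by the empty line that terminates this FST in the text-archive format.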

View File

@ -4,7 +4,8 @@
# Apache 2.0
# Begin configuration section.
transform_dir=
transform_dir= # this option won't normally be used, but it can be used if you want to
# supply existing fMLLR transforms when decoding.
iter=
model= # You can specify the model to use (e.g. if you want to use the .alimdl)
stage=0

View File

@ -77,20 +77,31 @@ case $feat_type in
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then
if ! [ $nj -eq `cat $transform_dir/num_jobs` ]; then
echo "$0: Number of jobs mismatch with transform-dir: $nj versus `cat $transform_dir/num_jobs`";
echo "$0: using transforms from $transform_dir"
[ ! -s $transform_dir/num_jobs ] && \
echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
nj_orig=$(cat $transform_dir/num_jobs)
if [ $feat_type == "raw" ]; then trans=raw_trans;
else trans=trans; fi
if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then
echo "$0: LDA transforms differ between $srcdir and $transform_dir"
exit 1;
fi
if [ $feat_type == "lda" ]; then
[ ! -f $transform_dir/trans.1 ] && echo "No such file $transform_dir/raw_trans.1" && exit 1;
echo "$0: using transforms from $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
if [ ! -f $transform_dir/$trans.1 ]; then
echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)"
exit 1;
fi
if [ $feat_type == "raw" ]; then
[ ! -f $transform_dir/raw_trans.1 ] && echo "No such file $transform_dir/raw_trans.1" && exit 1;
echo "$0: using raw-fMLLR transforms from $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |"
if [ $nj -ne $nj_orig ]; then
# Copy the transforms into an archive with an index.
for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \
copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |"
else
# number of jobs matches with alignment dir.
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |"
fi
fi

View File

@ -145,6 +145,7 @@ fi
if [ $stage -le 0 ]; then
echo "$0: Accumulating LDA statistics."
rm $dir/lda.*.acc 2>/dev/null # in case any left over from before.
$cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
@ -157,11 +158,19 @@ echo $lda_dim > $dir/lda_dim
echo $ivector_dim > $dir/ivector_dim
if [ $stage -le 1 ]; then
nnet-get-feature-transform --write-cholesky=$dir/cholesky.tpmat \
--within-class-factor=$within_class_factor --dim=$lda_dim \
$dir/lda.mat $dir/lda.*.acc \
2>$dir/log/lda_est.log || exit 1;
sum-lda-accs $dir/lda.acc $dir/lda.*.acc 2>$dir/log/lda_sum.log || exit 1;
rm $dir/lda.*.acc
fi
if [ $stage -le 2 ]; then
# There are various things that we sometimes (but not always) need
# the within-class covariance and its Cholesky factor for, and we
# write these to disk just in case.
nnet-get-feature-transform --write-cholesky=$dir/cholesky.tpmat \
--write-within-covar=$dir/within_covar.spmat \
--within-class-factor=$within_class_factor --dim=$lda_dim \
$dir/lda.mat $dir/lda.acc \
2>$dir/log/lda_est.log || exit 1;
fi
echo "$0: Finished estimating LDA"

View File

@ -95,25 +95,39 @@ echo "align_si.sh: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
;;
lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
cp $srcdir/final.mat $dir
;;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then # add transforms to features...
echo "$0: using fMLLR transforms from $transform_dir"
[ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist."
[ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \
&& echo "$0: mismatch in number of jobs with $transform_dir" && exit 1;
[ -f $srcdir/final.mat ] && ! cmp $transform_dir/final.mat $srcdir/final.mat && \
echo "$0: LDA transforms differ between $srcdir and $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
else
if [ -f $srcdir/final.alimdl ]; then
echo "$0: you seem to have a SAT system but you did not supply the --transform-dir option.";
if [ ! -z "$transform_dir" ]; then
echo "$0: using transforms from $transform_dir"
[ ! -s $transform_dir/num_jobs ] && \
echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
nj_orig=$(cat $transform_dir/num_jobs)
if [ $feat_type == "raw" ]; then trans=raw_trans;
else trans=trans; fi
if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then
echo "$0: LDA transforms differ between $srcdir and $transform_dir"
exit 1;
fi
if [ ! -f $transform_dir/$trans.1 ]; then
echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)"
exit 1;
fi
if [ $nj -ne $nj_orig ]; then
# Copy the transforms into an archive with an index.
for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \
copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |"
else
# number of jobs matches with alignment dir.
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |"
fi
fi
if [ $sub_split -eq 1 ]; then

View File

@ -22,7 +22,7 @@ num_jobs_nnet=4 # Number of neural net jobs to run in parallel. Note: this
samples_per_iter=400000 # measured in frames, not in "examples"
spk_vecs_dir=
modify_learning_rates=false
modify_learning_rates=true
last_layer_factor=1.0 # relates to modify-learning-rates
first_layer_factor=1.0 # relates to modify-learning-rates
shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
@ -140,17 +140,38 @@ case $feat_type in
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
[ -z "$transform_dir" ] && transform_dir=$alidir
if [ -z "$transform_dir" ]; then
if [ -f $transform_dir/trans.1 ] || [ -f $transform_dir/raw_trans.1 ]; then
transform_dir=$alidir
fi
fi
if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then
if [ ! -z "$transform_dir" ]; then
echo "$0: using transforms from $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
[ ! -s $transform_dir/num_jobs ] && \
echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
nj_orig=$(cat $transform_dir/num_jobs)
if [ $feat_type == "raw" ]; then trans=raw_trans;
else trans=trans; fi
if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $alidir/final.mat; then
echo "$0: LDA transforms differ between $alidir and $transform_dir"
exit 1;
fi
if [ ! -f $transform_dir/$trans.1 ]; then
echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)"
exit 1;
fi
if [ $nj -ne $nj_orig ]; then
# Copy the transforms into an archive with an index.
for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \
copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |"
else
# number of jobs matches with alignment dir.
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |"
fi
fi
if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then
echo "$0: using raw-fMLLR transforms from $transform_dir"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |"
fi
if [ -z "$degs_dir" ]; then
if [ $stage -le -8 ]; then

View File

@ -64,6 +64,10 @@ max_change_per_sample=0.075
precondition_rank_in=20 # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning
# this relates to perturbed training.
min_target_objf_change=0.1
target_multiplier=0 # Set this to e.g. 1.0 to enable perturbed training.
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
# specified.)
num_threads=16
@ -262,24 +266,49 @@ echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo "$0: (while reducing learning rate) + (with constant learning rate)."
function set_target_objf_change {
# nothing to do if $target_multiplier not set.
[ "$target_multiplier" == "0" -o "$target_multiplier" == "0.0" ] && return;
[ $x -le $finish_add_layers_iter ] && return;
wait=2 # the compute_prob_{train,valid} from 2 iterations ago should
# most likely be done even though we backgrounded them.
[ $[$x-$wait] -le 0 ] && return;
while true; do
# Note: awk 'some-expression' is the same as: awk '{if(some-expression) print;}'
train_prob=$(awk '(NF == 1)' < $dir/log/compute_prob_train.$[$x-$wait].log)
valid_prob=$(awk '(NF == 1)' < $dir/log/compute_prob_valid.$[$x-$wait].log)
if [ -z "$train_prob" ] || [ -z "$valid_prob" ]; then
echo "$0: waiting until $dir/log/compute_prob_{train,valid}.$[$x-$wait].log are done"
sleep 60
else
target_objf_change=$(perl -e '($train,$valid,$min_change,$multiplier)=@ARGV; if (!($train < 0.0) || !($valid < 0.0)) { print "0\n"; print STDERR "Error: invalid train or valid prob: $train_prob, $valid_prob\n"; exit(0); } else { print STDERR "train,valid=$train,$valid\n"; $proposed_target = $multiplier * ($train-$valid); if ($proposed_target < $min_change) { print "0"; } else { print $proposed_target; }}' -- "$train_prob" "$valid_prob" "$min_target_objf_change" "$target_multiplier")
echo "On iter $x, (train,valid) probs from iter $[$x-$wait] were ($train_prob,$valid_prob), and setting target-objf-change to $target_objf_change."
return;
fi
done
}
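# Worked example (numbers hypothetical): with target_multiplier=1.0 and
# min_target_objf_change=0.1, train/valid log-probs of -1.20 and -1.55 from
# iteration x-2 give a proposed target of 1.0 * (-1.20 - (-1.55)) = 0.35; since
# 0.35 >= 0.1 the target-objf-change is set to 0.35, while a proposed value
# below 0.1 would leave it at 0.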
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
# This is when we decide to mix up from: halfway between when we've finished
# adding the hidden layers and the end of training.
mix_up_iter=$[($num_iters + $finish_add_layers_iter)/2]
if [ $num_threads -eq 1 ]; then
train_suffix="-simple" # this enables us to use GPU code if
parallel_suffix="-simple" # this enables us to use GPU code if
# we have just one thread.
parallel_train_opts=
if ! cuda-compiled; then
echo "$0: WARNING: you are running with one thread but you have not compiled"
echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
fi
else
train_suffix="-parallel --num-threads=$num_threads"
parallel_suffix="-parallel"
parallel_train_opts="--num-threads=$num_threads"
fi
x=0
target_objf_change=0 # relates to perturbed training.
while [ $x -lt $num_iters ]; do
if [ $x -ge 0 ] && [ $stage -le $x ]; then
@ -316,11 +345,19 @@ while [ $x -lt $num_iters ]; do
this_minibatch_size=$minibatch_size
do_average=true
fi
set_target_objf_change; # only has effect if target_multiplier != 0
if [ "$target_objf_change" != "0" ]; then
[ ! -f $dir/within_covar.spmat ] && \
echo "$0: expected $dir/within_covar.spmat to exist." && exit 1;
perturb_suffix="-perturbed"
perturb_opts="--target-objf-change=$target_objf_change --within-covar=$dir/within_covar.spmat"
fi
$cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
nnet-train$train_suffix \
nnet-train$parallel_suffix$perturb_suffix $parallel_train_opts $perturb_opts \
--minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
ark:- $dir/$[$x+1].JOB.mdl \
|| exit 1;

View File

@ -12,7 +12,6 @@
# Begin configuration section.
stage=-5
fmllr_update_type=full
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
@ -197,9 +196,9 @@ while [ $x -lt $num_iters ]; do
if echo $fmllr_iters | grep -w $x >/dev/null; then
if [ $stage -le $x ]; then
echo Estimating fMLLR transforms
# We estimate a transform that's additional to the previous transform;
# we'll compose them.
# Note: it's not really necessary to re-estimate the basis each time
# but this is the way the script does it right now.
echo Estimating basis and fMLLR transforms
$cmd JOB=1:$nj $dir/log/fmllr_est.$x.JOB.log \
ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
weight-silence-post $silence_weight $silphonelist $dir/$x.mdl ark:- ark:- \| \
@ -209,7 +208,7 @@ while [ $x -lt $num_iters ]; do
# Compute the basis matrices.
$cmd $dir/log/basis_training.log \
gmm-basis-fmllr-training $dir/$x.mdl $dir/fmllr.basis $dir/basis.acc.* || exit 1;
gmm-basis-fmllr-training $dir/$x.mdl $dir/fmllr.basis $dir/basis.acc.* || exit 1;
$cmd JOB=1:$nj $dir/log/fmllr_app.$x.JOB.log \
ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \

View File

@ -28,7 +28,7 @@ for($x = 0; $x < 2; $x++) {
}
}
if ($ARGV[0] eq "-f") {
shift @ARGV;
shift @ARGV;
$field_spec = shift @ARGV;
if ($field_spec =~ m/^\d+$/) {
$field_begin = $field_spec - 1; $field_end = $field_spec - 1;

View File

@ -46,6 +46,14 @@ done
! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \
echo "$0: $data/utt2spk has wrong format." && exit;
ns=$(wc -l < $data/spk2utt)
if [ "$ns" == 1 ]; then
echo "$0: WARNING: you have only one speaker. This probably a bad idea."
echo " Search for the word 'bold' in http://kaldi.sourceforge.net/data_prep.html"
echo " for more information."
fi
tmpdir=$(mktemp -d kaldi.XXXX);
trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM

View File

@ -12,27 +12,39 @@ if(@ARGV != 1) {
}
$dict = shift @ARGV;
$dict =~ s:/$::;
$exit = 0;
$success = 1; # this is re-set each time we read a file.
sub set_to_fail { $exit = 1; $success = 0; }
# Checking silence_phones.txt -------------------------------
print "Checking $dict/silence_phones.txt ...\n";
if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;}
if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;}
$idx = 1;
%silence = ();
$success = 1;
print "--> reading $dict/silence_phones.txt\n";
while(<S>) {
chomp;
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n";
set_to_fail();
}
my @col = split(" ", $_);
if (@col == 0) {
set_to_fail();
print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n";
}
foreach(0 .. @col-1) {
my $p = $col[$_];
if($silence{$p}) {$exit = 1; print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; $success = 0;}
if($silence{$p}) {set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; }
else {$silence{$p} = 1;}
if ($p =~ m/_$/ || $p =~ m/#/ || $p =~ m/_[BESI]$/){
$exit = 1;
set_to_fail();
print "--> ERROR: phone \"$p\" has disallowed written form";
$success = 0;
}
}
$idx ++;
@ -52,9 +64,9 @@ while(<OS>) {
chomp;
my @col = split(" ", $_);
if ($idx > 1 or @col > 1) {
$exit = 1; print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; $success = 0;
set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n";
} elsif (!$silence{$col[0]}) {
$exit = 1; print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; $success = 0;
set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n";
}
$idx ++;
}
@ -71,22 +83,29 @@ $idx = 1;
$success = 1;
print "--> reading $dict/nonsilence_phones.txt\n";
while(<NS>) {
chomp;
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n";
set_to_fail();
}
my @col = split(" ", $_);
if (@col == 0) {
set_to_fail();
print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n";
}
foreach(0 .. @col-1) {
my $p = $col[$_];
if($nonsilence{$p}) {$exit = 1; print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; $success = 0;}
if($nonsilence{$p}) {set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; }
else {$nonsilence{$p} = 1;}
if ($p =~ m/_$/ || $p =~ m/#/ || $p =~ m/_[BESI]$/){
$exit = 1;
set_to_fail();
print "--> ERROR: phone \"$p\" has disallowed written form";
$success = 0;
}
}
$idx ++;
}
close(NS);
$success == 0 || print "--> $dict/silence_phones.txt is OK\n";
$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n";
print "\n";
# Checking disjoint -------------------------------
@ -106,37 +125,37 @@ sub intersect {
print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
@itset = intersect(\%silence, \%nonsilence);
if(@itset == 0) {print "--> disjoint property is OK.\n";}
else {$exit = 1; print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";}
else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";}
print "\n";
sub check_lexicon {
my ($lexfn, $pron_probs) = @_;
print "Checking $lexfn\n";
if(-z "$lexfn") {$exit = 1; print "--> ERROR: $lexfn is empty or not exists\n";}
if(!open(L, "<$lexfn")) {$exit = 1; print "--> ERROR: fail to open $lexfn\n";}
if(-z "$lexfn") {set_to_fail(); print "--> ERROR: $lexfn is empty or not exists\n";}
if(!open(L, "<$lexfn")) {set_to_fail(); print "--> ERROR: fail to open $lexfn\n";}
$idx = 1;
$success = 1;
print "--> reading $lexfn\n";
while (<L>) {
chomp;
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $lexfn does not end in newline.\n";
set_to_fail();
}
my @col = split(" ", $_);
$word = shift @col;
if (!defined $word) {
$exit = 1; print "--> ERROR: empty lexicon line in $lexfn\n";
$success = 0;
set_to_fail(); print "--> ERROR: empty lexicon line in $lexfn\n";
}
if ($pron_probs) {
$prob = shift @col;
if (!($prob > 0.0 && $prob <= 1.0)) {
$exit = 1; print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lexfn\n";
$success = 0;
set_to_fail(); print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lexfn\n";
}
}
foreach (0 .. @col-1) {
if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
$exit = 1; print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx)\n";
$success = 0;
set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx)\n";
}
}
$idx ++;
@ -150,7 +169,7 @@ if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0); }
if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1); }
if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) {
print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n";
$exit = 1;
set_to_fail();
}
# If both lexicon.txt and lexiconp.txt exist, we check that they correspond to
# each other. If not, it could be that the user overwrote one and we need to
@ -161,11 +180,21 @@ if ( (-f "$dict/lexicon.txt") && (-f "$dict/lexiconp.txt")) {
die "Error opening lexicon.txt and/or lexiconp.txt"; # already checked, so would be code error.
}
while(<L>) {
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $dict/lexicon.txt does not end in newline.\n";
set_to_fail();
last;
}
@A = split;
$x = <P>;
if ($x !~ s/\n$//) {
print "--> ERROR: last line '$x' of $dict/lexiconp.txt does not end in newline.\n";
set_to_fail();
last;
}
if (!defined $x) {
print "--> ERROR: lexicon.txt and lexiconp.txt have different numbers of lines (mismatch); delete one.\n";
$exit = 1;
set_to_fail();
last;
}
@B = split(" ", $x);
@ -175,13 +204,13 @@ if ( (-f "$dict/lexicon.txt") && (-f "$dict/lexiconp.txt")) {
# now @A and @B should be the same.
if ($#A != $#B) {
print "--> ERROR: lexicon.txt and lexiconp.txt have mismatched lines '$_' versus '$x'; delete one.\n";
$exit = 1;
set_to_fail();
last;
}
for ($n = 0; $n < @A; $n++) {
if ($A[$n] ne $B[$n]) {
print "--> ERROR: lexicon.txt and lexiconp.txt have mismatched lines '$_' versus '$x'; delete one.\n";
$exit = 1;
set_to_fail();
last;
}
}
@ -189,32 +218,40 @@ if ( (-f "$dict/lexicon.txt") && (-f "$dict/lexiconp.txt")) {
$x = <P>;
if (defined $x && $exit == 0) {
print "--> ERROR: lexicon.txt and lexiconp.txt have different numbers of lines (mismatch); delete one.\n";
$exit = 1;
set_to_fail();
}
}
# Checking extra_questions.txt -------------------------------
print "Checking $dict/extra_questions.txt ...\n";
if (-s "$dict/extra_questions.txt") {
if(!open(EX, "<$dict/extra_questions.txt")) {$exit = 1; print "--> ERROR: fail to open $dict/extra_questions.txt\n";}
if (!open(EX, "<$dict/extra_questions.txt")) {
set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n";
}
$idx = 1;
$success = 1;
print "--> reading $dict/extra_questions.txt\n";
while(<EX>) {
chomp;
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n";
set_to_fail();
}
my @col = split(" ", $_);
foreach(0 .. @col-1) {
if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
$exit = 1; print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx, block ", $_+1, ")\n";
$success = 0;
}
if (@col == 0) {
set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n";
}
}
foreach(0 .. @col-1) {
if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx, block ", $_+1, ")\n";
}
$idx ++;
}
}
close(EX);
$success == 0 || print "--> $dict/extra_questions.txt is OK\n";
} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";}
if($exit == 1) { print " [Error detected ]\n"; exit 1;}
if ($exit == 1) { print "--> ERROR validating dictionary directory $dict (see detailed error messages above)\n"; exit 1;}
else { print "--> SUCCESS [validating dictionary directory $dict]\n"; }
exit 0;
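As context for the checks above, a minimal dictionary directory that passes this script looks roughly like the following (contents purely illustrative):

    silence_phones.txt       SIL
    optional_silence.txt     SIL
    nonsilence_phones.txt    AA
                             B
                             K
    lexicon.txt              CAB  K AA B
                             (or lexiconp.txt, with a pron-prob in (0,1] as field 2)
    extra_questions.txt      (may be empty)

Every phone used in the lexicon must appear in either silence_phones.txt or nonsilence_phones.txt, and the two phone lists must not overlap.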

View File

@ -1,6 +1,8 @@
#!/usr/bin/perl
# Guoguo Chen (guoguo@jhu.edu)
# Apache 2.0.
# Copyright 2012 Guoguo Chen
# 2014 Neil Nelson
#
# Validation script for data/lang
@ -132,7 +134,7 @@ sub check_txt_int_csl {
}
sub check_txt_int {
my ($cat, $symtab) = @_;
my ($cat, $symtab, $sym_check) = @_;
print "Checking $cat.\{txt, int\} ...\n";
if (-z "$cat.txt") {$exit = 1; return print "--> ERROR: $cat.txt is empty or does not exist\n";}
if (-z "$cat.int") {$exit = 1; return print "--> ERROR: $cat.int is empty or does not exist\n";}
@ -154,6 +156,7 @@ sub check_txt_int {
close(TXT); $idx1 --;
print "--> $idx1 entry/entries in $cat.txt\n";
my %used_syms = ();
$idx2 = 1;
while(<INT>) {
chomp;
@ -168,6 +171,8 @@ sub check_txt_int {
if (@set != @col) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";}
foreach(0 .. @set-1) {
if ($symtab->{@set[$_]} ne @col[$_]) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2, block " ,$_+1, ")\n";}
if ($sym_check && defined $used_syms{@set[$_]}) {$exit = 1; return print "--> ERROR: $cat.txt and $cat.int contain duplicate symbols (break at line $idx2, block " ,$_+1, ")\n";}
$used_syms{@set[$_]} = 1;
}
$idx2 ++;
}
@ -175,31 +180,16 @@ sub check_txt_int {
if ($idx1 != $idx2) {$exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";}
print "--> $cat.int corresponds to $cat.txt\n";
return print "--> $cat.\{txt, int\} are OK\n";
}
if ($sym_check) {
while ( my ($key, $value) = each(%silence) ) {
if (!defined $used_syms{$key}) {$exit = 1; return print "--> ERROR: $cat.txt and $cat.int do not contain all silence phones\n";}
}
while ( my ($key, $value) = each(%nonsilence) ) {
if (!defined $used_syms{$key}) {$exit = 1; return print "--> ERROR: $cat.txt and $cat.int do not contain all non-silence phones\n";}
}
}
@list1 = ("context_indep", "disambig", "nonsilence", "silence", "optional_silence");
@list2 = ("roots", "sets");
foreach(@list1) {
check_txt_int_csl("$lang/phones/$_", \%psymtab); print "\n";
}
foreach(@list2) {
check_txt_int("$lang/phones/$_", \%psymtab); print "\n";
}
if ((-s "$lang/phones/extra_questions.txt") || (-s "$lang/phones/extra_questions.int")) {
check_txt_int("$lang/phones/extra_questions", \%psymtab); print "\n";
} else {
print "Checking $lang/phones/extra_questions.\{txt, int\} ...\n";
if ((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int")) {
print "--> WARNING: the optional $lang/phones/extra_questions.\{txt, int\} are empty!\n\n";
$warning = 1;
} else {
print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n";
$exit = 1;
}
}
if (-e "$lang/phones/word_boundary.txt") {
check_txt_int("$lang/phones/word_boundary", \%psymtab); print "\n";
return print "--> $cat.\{txt, int\} are OK\n";
}
# Check disjoint and summation -------------------------------
@ -217,7 +207,7 @@ sub intersect {
}
sub check_disjoint {
print "Checking disjoint: silence.txt, nosilenct.txt, disambig.txt ...\n";
print "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n";
if (!open(S, "<$lang/phones/silence.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/silence.txt\n";}
if (!open(N, "<$lang/phones/nonsilence.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/nonsilence.txt\n";}
if (!open(D, "<$lang/phones/disambig.txt")) {$exit = 1; return print "--> ERROR: fail to open $lang/phones/disambig.txt\n";}
@ -336,6 +326,30 @@ sub check_summation {
check_disjoint; print "\n";
check_summation; print "\n";
@list1 = ("context_indep", "disambig", "nonsilence", "silence", "optional_silence");
@list2 = ("roots", "sets");
foreach(@list1) {
check_txt_int_csl("$lang/phones/$_", \%psymtab); print "\n";
}
foreach(@list2) {
check_txt_int("$lang/phones/$_", \%psymtab, 1); print "\n";
}
if ((-s "$lang/phones/extra_questions.txt") || (-s "$lang/phones/extra_questions.int")) {
check_txt_int("$lang/phones/extra_questions", \%psymtab, 0); print "\n";
} else {
print "Checking $lang/phones/extra_questions.\{txt, int\} ...\n";
if ((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int")) {
print "--> WARNING: the optional $lang/phones/extra_questions.\{txt, int\} are empty!\n\n";
$warning = 1;
} else {
print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n";
$exit = 1;
}
}
if (-e "$lang/phones/word_boundary.txt") {
check_txt_int("$lang/phones/word_boundary", \%psymtab, 0); print "\n";
}
# Checking optional_silence.txt -------------------------------
print "Checking optional_silence.txt ...\n";
$idx = 1;
@ -550,7 +564,7 @@ if (-s "$lang/phones/word_boundary.int") {
}
# Check oov -------------------------------
check_txt_int("$lang/oov", \%wsymtab); print "\n";
check_txt_int("$lang/oov", \%wsymtab, 0); print "\n";
# Check determinizability of G.fst
@ -580,7 +594,6 @@ if (-e "$lang/G.fst" && -e "$lang/L_disambig.fst") {
if ($exit == 1) { print "--> ERROR (see error messages above)\n"; exit 1;}
else {
if ($warning == 1) { print "--> WARNING (check output above for warnings)\n"; exit 0; }
else { print "--> SUCCESS\n"; exit 0; }
else { print "--> SUCCESS [validating lang directory $lang]\n"; exit 0; }
}

Просмотреть файл

@ -43,8 +43,8 @@ int main(int argc, char *argv[]) {
"of disambiguation symbols.\n"
"Warning: you probably want to set the --transition-scale and --self-loop-scale\n"
"options; the defaults (zero) are probably not appropriate.\n"
"Usage: compile-train-graphs-fsts [options] tree-in model-in lexicon-fst-in "
" graphs-rspecifier graphs-wspecifier\n"
"Usage: compile-train-graphs-fsts [options] <tree-in> <model-in> <lexicon-fst-in> "
" <graphs-rspecifier> <graphs-wspecifier>\n"
"e.g.: \n"
" compile-train-graphs-fsts --read-disambig-syms=disambig.list\\\n"
" tree 1.mdl lex.fst ark:train.fsts ark:graphs.fsts\n";

Просмотреть файл

@ -37,7 +37,7 @@ int main(int argc, char *argv[]) {
const char *usage =
"Creates training graphs (without transition-probabilities, by default)\n"
"\n"
"Usage: compile-train-graphs [options] tree-in model-in lexicon-fst-in transcriptions-rspecifier graphs-wspecifier\n"
"Usage: compile-train-graphs [options] <tree-in> <model-in> <lexicon-fst-in> <transcriptions-rspecifier> <graphs-wspecifier>\n"
"e.g.: \n"
" compile-train-graphs tree 1.mdl lex.fst ark:train.tra ark:graphs.fsts\n";
ParseOptions po(usage);

Просмотреть файл

@ -1,6 +1,7 @@
// bin/get-post-on-ali.cc
// Copyright 2013 Brno University of Technology (Author: Karel Vesely)
// 2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
@ -31,18 +32,24 @@ int main(int argc, char *argv[]) {
typedef kaldi::int32 int32;
try {
const char *usage =
"This program extracts a vector of per-frame posteriors that are selected\n"
"by an alignment (ie. posteriors that are under the alignment path).\n"
"This can be used as a per-frame confidence measure.\n"
"Given input posteriors, e.g. derived from lattice-to-post, and an alignment\n"
"typically derived from the best path of a lattice, outputs the probability in\n"
"the posterior of the corresponding index in the alignment, or zero if it was\n"
"not there. These are output as a vector of weights, one per utterance.\n"
"While, by default, lattice-to-post (as a source of posteriors) and sources of\n"
"alignments such as lattice-best-path will output transition-ids as the index,\n"
"it will generally make sense to either convert these to pdf-ids using\n"
"post-to-pdf-post and ali-to-pdf respectively, or to phones using post-to-phone-post\n"
"and (ali-to-phones --per-frame=true). Since this program only sees the integer\n"
"indexes, it does not care what they represent-- but of course they should match\n"
"(e.g. don't input posteriors with transition-ids and alignments with pdf-ids).\n"
"See http://kaldi.sourceforge.net/hmm.html#transition_model_identifiers for an\n"
"explanation of these types of indexes.\n"
"\n"
"By intuition, it is better to use pdf-posteriors and pdf-alignments,\n"
"because the posteriors of competing hypothesis that are in the same frame\n"
"at same 'pdf-state' are summed up, which is in some sense similar\n"
"to what is done by C-max which sums the posteriors of overlapping words.\n"
"The difference here is that the granularity is per-frame.\n"
"See also: weight-post, post-to-weights, reverse-weights\n"
"\n"
"Usage: get-post-on-ali [options] <posteriors-rspecifier> <ali-rspecifier> <conf-wspecifier>\n"
"e.g.: get-post-on-ali ark:post.ark ark:ali.ark ark:conf.ark\n";
"Usage: get-post-on-ali [options] <posteriors-rspecifier> <ali-rspecifier> <weights-wspecifier>\n"
"e.g.: get-post-on-ali ark:post.ark ark,s,cs:ali.ark ark:weights.ark\n";
ParseOptions po(usage);
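// Illustrative sketch (not part of this file) of the per-utterance computation
// the usage message describes, assuming "post" is a Posterior
// (std::vector<std::vector<std::pair<int32, BaseFloat> > >) and "ali" is the
// alignment for the same utterance, both of the same length:
//   Vector<BaseFloat> weights(post.size());
//   for (size_t t = 0; t < post.size(); t++)
//     for (size_t i = 0; i < post[t].size(); i++)
//       if (post[t][i].first == ali[t])
//         weights(t) += post[t][i].second;  // stays 0.0 if ali[t] is absent.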

Просмотреть файл

@ -32,6 +32,7 @@ int main(int argc, char *argv[]) {
const char *usage =
"This program turns per-frame posteriors, which have transition-ids as\n"
"the integers, into pdf-level posteriors\n"
"See also: post-to-phone-post, post-to-weights, get-post-on-ali\n"
"\n"
"Usage: post-to-pdf-post [options] <model-file> <posteriors-rspecifier> <posteriors-wspecifier>\n"
"e.g.: post-to-pdf-post 1.mdl ark:- ark:-\n";

Просмотреть файл

@ -30,6 +30,7 @@ int main(int argc, char *argv[]) {
const char *usage =
"Convert posteriors to phone-level posteriors\n"
"See also: post-to-pdf-post, post-to-weights, get-post-on-ali\n"
"\n"
"Usage: post-to-phone-post [options] <model> <post-rspecifier> <phone-post-wspecifier>\n"
" e.g.: post-to-phone-post --binary=false 1.mdl \"ark:ali-to-post 1.ali|\" ark,t:-\n";

Просмотреть файл

@ -31,7 +31,9 @@ int main(int argc, char *argv[]) {
const char *usage =
"Turn posteriors into per-frame weights (typically most useful after\n"
"weight-silence-post, to get silence weights)\n"
"Usage: post-to-weights post-rspecifier weights-wspecifier\n";
"See also: weight-silence-post, post-to-pdf-post, post-to-phone-post\n"
"get-post-on-ali\n"
"Usage: post-to-weights <post-rspecifier> <weights-wspecifier>\n";
ParseOptions po(usage);
po.Read(argc, argv);

Просмотреть файл

@ -1128,7 +1128,6 @@ void CuMatrix<Real>::CompObjfAndDeriv(const std::vector<MatrixElement<Real> >& s
*tot_objf += weight * log(this_prob);
*tot_weight += weight;
(*this)(m, label) += weight / this_prob;
}
}
}

Просмотреть файл

@ -580,7 +580,7 @@ class CuMatrix: public CuMatrixBase<Real> {
void CompObjfAndDeriv(const std::vector<MatrixElement<Real> > &elements,
const CuMatrix<Real> &A,
Real *tot_objf,
Real* tot_weight);
Real *tot_weight);
private:
void Destroy();

Просмотреть файл

@ -28,28 +28,39 @@ int main(int argc, char *argv[]) {
const char *usage =
"Reads an archive of features and writes a corresponding archive\n"
"that maps utterance-id to utterance length in frames.\n"
"Usage: feat-to-len [options] in-rspecifier out-wspecifier\n"
"e.g.: feat-to-len scp:feats.scp ark,t:feats.lengths\n";
"that maps utterance-id to utterance length in frames, or (with\n"
"one argument) print to stdout the total number of frames in the\n"
"input archive.\n"
"Usage: feat-to-len [options] <in-rspecifier> [<out-wspecifier>]\n"
"e.g.: feat-to-len scp:feats.scp ark,t:feats.lengths\n"
"or: feat-to-len scp:feats.scp\n";
ParseOptions po(usage);
po.Read(argc, argv);
if (po.NumArgs() != 2) {
if (po.NumArgs() != 1 && po.NumArgs() != 2) {
po.PrintUsage();
exit(1);
}
std::string rspecifier = po.GetArg(1);
std::string wspecifier = po.GetArg(2);
if (po.NumArgs() == 2) {
std::string rspecifier = po.GetArg(1);
std::string wspecifier = po.GetArg(2);
Int32Writer length_writer(wspecifier);
Int32Writer length_writer(wspecifier);
SequentialBaseFloatMatrixReader kaldi_reader(rspecifier);
for (; !kaldi_reader.Done(); kaldi_reader.Next())
length_writer.Write(kaldi_reader.Key(), kaldi_reader.Value().NumRows());
SequentialBaseFloatMatrixReader matrix_reader(rspecifier);
for (; !matrix_reader.Done(); matrix_reader.Next())
length_writer.Write(matrix_reader.Key(), matrix_reader.Value().NumRows());
} else {
int64 tot = 0;
std::string rspecifier = po.GetArg(1);
SequentialBaseFloatMatrixReader matrix_reader(rspecifier);
for (; !matrix_reader.Done(); matrix_reader.Next())
tot += matrix_reader.Value().NumRows();
std::cout << tot << std::endl;
}
return 0;
} catch(const std::exception &e) {
std::cerr << e.what();

Просмотреть файл

@ -234,8 +234,8 @@ int main(int argc, char *argv[]) {
const char *usage =
"Finds the path having the smallest edit-distance between two lattices.\n"
"For efficiency put the smallest lattices first (for example reference strings).\n"
"Usage: lattice-oracle [options] test-lattice-rspecifier reference-rspecifier "
"transcriptions-wspecifier [edit-distance-wspecifier]\n"
"Usage: lattice-oracle [options] <test-lattice-rspecifier> <reference-rspecifier> "
"<transcriptions-wspecifier> [<edit-distance-wspecifier>]\n"
" e.g.: lattice-oracle ark:lat.1 'ark:sym2int.pl -f 2- data/lang/words.txt <data/test/text' ark,t:-\n";
ParseOptions po(usage);
@ -260,20 +260,21 @@ int main(int argc, char *argv[]) {
po.Read(argc, argv);
if (po.NumArgs() != 3) {
if (po.NumArgs() != 3 && po.NumArgs() != 4) {
po.PrintUsage();
exit(1);
}
std::string lats_rspecifier = po.GetArg(1),
reference_rspecifier = po.GetArg(2),
transcriptions_wspecifier = po.GetArg(3);
transcriptions_wspecifier = po.GetArg(3),
edit_distance_wspecifier = po.GetOptArg(4);
// will read input as lattices
SequentialLatticeReader lattice_reader(lats_rspecifier);
RandomAccessInt32VectorReader reference_reader(reference_rspecifier);
Int32VectorWriter transcriptions_writer(transcriptions_wspecifier);
Int32Writer edit_distance_writer(edit_distance_wspecifier);
// Guoguo Chen added the implementation for option "write-lattices".
CompactLatticeWriter lats_writer(lats_wspecifier);
@ -360,8 +361,10 @@ int main(int argc, char *argv[]) {
// count errors
int32 correct, substitutions, insertions, deletions, num_words;
CountErrors(best_path, &correct, &substitutions, &insertions, &deletions, &num_words);
int32 toterrs = substitutions + insertions + deletions;
KALDI_LOG << "%WER " << (100.*toterrs) / num_words << " [ " << toterrs
int32 tot_errs = substitutions + insertions + deletions;
if (edit_distance_wspecifier != "")
edit_distance_writer.Write(key, tot_errs);
KALDI_LOG << "%WER " << (100.*tot_errs) / num_words << " [ " << tot_errs
<< " / " << num_words << ", " << insertions << " insertions, " << deletions
<< " deletions, " << substitutions << " sub ]";
tot_correct += correct;
@ -397,7 +400,7 @@ int main(int argc, char *argv[]) {
}
// Guoguo Chen added the implementation for option "write-lattices".
// Currently it's just a naive implementation: traversal the original
// Currently it's just a naive implementation: traverse the original
// lattice and get the path corresponding to the oracle word sequence.
// Note that this new lattice has the alignment information.
if (lats_wspecifier != "") {

Просмотреть файл

@ -1002,53 +1002,52 @@ void MatrixBase<Real>::MulRowsVec(const VectorBase<Real> &scale) {
}
}
template<typename Real>
void MatrixBase<Real>::MulRowsGroupMat(const MatrixBase<Real> &src) {
KALDI_ASSERT(src.NumCols() > 0 && src.NumCols() <= this->NumCols());
KALDI_ASSERT(this->NumCols() % src.NumCols() == 0 ||
this->NumCols() % (src.NumCols() - 1) < this->NumCols() / (src.NumCols() - 1));
int group_size = 0;
if (this->NumCols() % src.NumCols() == 0) {
group_size = this->NumCols() / src.NumCols();
} else {
group_size = this->NumCols() / src.NumCols() + 1;
}
MatrixIndexT M = num_rows_, N = num_cols_;
KALDI_ASSERT(src.NumRows() == this->NumRows() &&
this->NumCols() % src.NumCols() == 0);
int32 group_size = this->NumCols() / src.NumCols(),
num_groups = this->NumCols() / group_size,
num_rows = this->NumRows();
for (MatrixIndexT i = 0; i < M; i++)
for (MatrixIndexT j = 0; j < N; j++)
(*this)(i, j) *= src(i, j / group_size);
for (MatrixIndexT i = 0; i < num_rows; i++) {
Real *data = this->RowData(i);
for (MatrixIndexT j = 0; j < num_groups; j++, data += group_size) {
Real scale = src(i, j);
cblas_Xscal(group_size, scale, data, 1);
}
}
}
template<typename Real>
void MatrixBase<Real>::GroupPnormDeriv(const MatrixBase<Real> &src1,
const MatrixBase<Real> &src2,
void MatrixBase<Real>::GroupPnormDeriv(const MatrixBase<Real> &input,
const MatrixBase<Real> &output,
Real power) {
KALDI_ASSERT(src2.NumCols() > 0 && src2.NumCols() <= this->NumCols());
KALDI_ASSERT(this->NumCols() % src2.NumCols() == 0 ||
this->NumCols() % (src2.NumCols() - 1) < this->NumCols() / (src2.NumCols() - 1));
int group_size = 0;
if (this->NumCols() % src2.NumCols() == 0) {
group_size = this->NumCols() / src2.NumCols();
} else {
group_size = this->NumCols() / src2.NumCols() + 1;
}
MatrixIndexT M = this->NumRows(), N = this->NumCols();
KALDI_ASSERT(input.NumCols() == this->NumCols() && input.NumRows() == this->NumRows());
KALDI_ASSERT(this->NumCols() % output.NumCols() == 0 &&
this->NumRows() == output.NumRows());
int group_size = this->NumCols() / output.NumCols(),
num_rows = this->NumRows(), num_cols = this->NumCols();
if (power == 1.0) {
for (MatrixIndexT i = 0; i < M; i++)
for (MatrixIndexT j = 0; j < N; j++)
(*this)(i, j) = (src1(i, j) == 0 ? 0 : (src1(i, j) > 0 ? 1 : -1));
for (MatrixIndexT i = 0; i < num_rows; i++) {
for (MatrixIndexT j = 0; j < num_cols; j++) {
Real input_val = input(i, j);
(*this)(i, j) = (input_val == 0 ? 0 : (input_val > 0 ? 1 : -1));
}
}
} else {
for (MatrixIndexT i = 0; i < M; i++) {
for (MatrixIndexT j = 0; j < N; j++) {
if (src2(i, j / group_size) == 0) {
for (MatrixIndexT i = 0; i < num_rows; i++) {
for (MatrixIndexT j = 0; j < num_cols; j++) {
Real output_val = output(i, j / group_size),
input_val = input(i, j);
if (output_val == 0)
(*this)(i, j) = 0;
} else {
(*this)(i, j) = pow(std::abs(src1(i, j)), power - 1) *
(src2(i, j / group_size) > 0 ? pow(src2(i, j / group_size), 1 - power) : 1) *
(src1(i, j) >= 0 ? 1 : -1) ;
}
else
(*this)(i, j) = pow(std::abs(input_val), power - 1) *
pow(output_val, 1 - power) * (input_val >= 0 ? 1 : -1) ;
}
}
}
@ -2428,12 +2427,15 @@ void MatrixBase<Real>::SoftHinge(const MatrixBase<Real> &src) {
}
}
}
template<typename Real>
void MatrixBase<Real>::GroupPnorm(const MatrixBase<Real> &src, Real power) {
int group_size = src.NumCols() / this->NumCols();
KALDI_ASSERT(src.NumCols() == this->NumCols() * group_size);
for (MatrixIndexT i = 0; i < src.NumRows(); i++)
for (MatrixIndexT j = 0; j < this->NumCols(); j++)
KALDI_ASSERT(src.NumCols() % this->NumCols() == 0 &&
src.NumRows() == this->NumRows());
int group_size = src.NumCols() / this->NumCols(),
num_rows = this->NumRows(), num_cols = this->NumCols();
for (MatrixIndexT i = 0; i < num_rows; i++)
for (MatrixIndexT j = 0; j < num_cols; j++)
(*this)(i, j) = src.Row(i).Range(j * group_size, group_size).Norm(power);
}

Просмотреть файл

@ -240,8 +240,9 @@ class MatrixBase {
/// each row by a scalar taken from that dimension of the vector.
void MulRowsVec(const VectorBase<Real> &scale);
/// divide each row into src.NumCols() groups,
/// and then scale i'th row's jth group of elements by src[i, j].
/// Divide each row into src.NumCols() equal groups, and then scale i'th row's
/// j'th group of elements by src(i, j). Requires src.NumRows() ==
/// this->NumRows() and this->NumCols() % src.NumCols() == 0.
void MulRowsGroupMat(const MatrixBase<Real> &src);
/// Returns logdet of matrix.
@ -418,8 +419,8 @@ class MatrixBase {
/// Set each element to y = log(1 + exp(x))
void SoftHinge(const MatrixBase<Real> &src);
/// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j ^ (power)) ^ (1 / p)
/// where G = x.NumCols() / y.NumCols() must be an integer.
/// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j^(power))^(1 / power).
/// Requires src.NumRows() == this->NumRows() and src.NumCols() % this->NumCols() == 0.
void GroupPnorm(const MatrixBase<Real> &src, Real power);
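// A short worked example of the convention above (illustrative only, not part
// of this header): with src of size 1 x 4 and *this of size 1 x 2, the group
// size G is 2, and with power = 2.0 each output element is the 2-norm of one
// group of the corresponding source row:
//   Matrix<BaseFloat> src(1, 4), dst(1, 2);
//   src(0, 0) = 3.0; src(0, 1) = 4.0;   // first group {3, 4}
//   src(0, 2) = 1.0; src(0, 3) = 0.0;   // second group {1, 0}
//   dst.GroupPnorm(src, 2.0);
//   // now dst(0, 0) == 5.0 (= sqrt(3^2 + 4^2)) and dst(0, 1) == 1.0.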

Просмотреть файл

@ -469,9 +469,9 @@ int32 LinearCgd(const LinearCgdOptions &opts,
residual_factor = opts.recompute_residual_factor *
opts.recompute_residual_factor;
// Note: although from a mathematical point of view the method should
// converge after M iterations, in practice it does not always converge
// to good precision after that many iterations so we let the maximum
// Note: although from a mathematical point of view the method should converge
// after M iterations, in practice (due to roundoff) it does not always
// converge to good precision after that many iterations so we let the maximum
// be 1.5 * M + 5 instead.
int32 k = 0;
for (; k < M + M / 2 + 5 && k != opts.max_iters; k++) {

Просмотреть файл

@ -86,8 +86,8 @@ void House(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
} else {
cblas_Xscal(dim, inv_v1, v, 1);
}
if (!KALDI_ISFINITE(inv_v1) || !KALDI_ISFINITE(x1)) {
KALDI_ERR << "NaN or inf encountered in HouseBackward";
if (KALDI_ISNAN(inv_v1)) {
KALDI_ERR << "NaN encountered in HouseBackward";
}
}
}
@ -142,8 +142,8 @@ void HouseBackward(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
} else {
cblas_Xscal(dim, inv_v1, v, 1);
}
if (!KALDI_ISFINITE(inv_v1) || !KALDI_ISFINITE(x1)) {
KALDI_ERR << "NaN or inf encountered in HouseBackward";
if (KALDI_ISNAN(inv_v1)) {
KALDI_ERR << "NaN encountered in HouseBackward";
}
}
}

Просмотреть файл

@ -19,7 +19,8 @@ OBJFILES = nnet-component.o nnet-nnet.o train-nnet.o train-nnet-ensemble.o nnet-
nnet-fix.o nnet-stats.o rescale-nnet.o nnet-limit-rank.o nnet-example.o \
get-feature-transform.o widen-nnet.o nnet-precondition-online.o \
nnet-example-functions.o nnet-compute-discriminative.o \
nnet-compute-discriminative-parallel.o online-nnet2-decodable.o
nnet-compute-discriminative-parallel.o online-nnet2-decodable.o \
train-nnet-perturbed.o
LIBNAME = kaldi-nnet2

Просмотреть файл

@ -1595,6 +1595,9 @@ class FixedAffineComponent: public Component {
virtual Component* Copy() const;
virtual void Read(std::istream &is, bool binary);
virtual void Write(std::ostream &os, bool binary) const;
// Function to provide access to linear_params_.
const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
protected:
friend class AffineComponent;
CuMatrix<BaseFloat> linear_params_;

Просмотреть файл

@ -67,6 +67,40 @@ void NnetExample::Read(std::istream &is, bool binary) {
}
void ExamplesRepository::AcceptExamples(
std::vector<NnetExample> *examples) {
KALDI_ASSERT(!examples->empty());
empty_semaphore_.Wait();
KALDI_ASSERT(examples_.empty());
examples_.swap(*examples);
full_semaphore_.Signal();
}
void ExamplesRepository::ExamplesDone() {
empty_semaphore_.Wait();
KALDI_ASSERT(examples_.empty());
done_ = true;
full_semaphore_.Signal();
}
bool ExamplesRepository::ProvideExamples(
std::vector<NnetExample> *examples) {
full_semaphore_.Wait();
if (done_) {
KALDI_ASSERT(examples_.empty());
full_semaphore_.Signal(); // Increment the semaphore so
// the call by the next thread will not block.
return false; // no examples to return-- all finished.
} else {
KALDI_ASSERT(!examples_.empty() && examples->empty());
examples->swap(examples_);
empty_semaphore_.Signal();
return true;
}
}
void DiscriminativeNnetExample::Write(std::ostream &os,
bool binary) const {
// Note: weight, num_ali, den_lat, input_frames, left_context and spk_info are

Просмотреть файл

@ -23,6 +23,7 @@
#include "nnet2/nnet-nnet.h"
#include "util/table-types.h"
#include "lat/kaldi-lattice.h"
#include "thread/kaldi-semaphore.h"
namespace kaldi {
namespace nnet2 {
@ -64,6 +65,35 @@ typedef SequentialTableReader<KaldiObjectHolder<NnetExample > > SequentialNnetEx
typedef RandomAccessTableReader<KaldiObjectHolder<NnetExample > > RandomAccessNnetExampleReader;
/** This class stores neural net training examples to be used in
multi-threaded training. */
class ExamplesRepository {
public:
/// The following function is called by the code that reads in the examples,
/// with a batch of examples. (It will empty the vector "examples".)
void AcceptExamples(std::vector<NnetExample> *examples);
/// The following function is called by the code that reads in the examples,
/// when we're done reading examples.
void ExamplesDone();
/// This function is called by the code that does the training. It gets the
/// training examples, and if they are available, puts them in "examples" and
/// returns true. It returns false when there are no examples left and
/// ExamplesDone() has been called.
bool ProvideExamples(std::vector<NnetExample> *examples);
ExamplesRepository(): empty_semaphore_(1), done_(false) { }
private:
Semaphore full_semaphore_;
Semaphore empty_semaphore_;
std::vector<NnetExample> examples_;
bool done_;
KALDI_DISALLOW_COPY_AND_ASSIGN(ExamplesRepository);
};
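// Illustrative usage sketch (not part of this header): a single reader thread
// batches up examples and hands them to the repository, while one or more
// training threads consume them; the function names below are hypothetical.
//
//   void ReaderThreadSketch(SequentialNnetExampleReader *reader,
//                           int32 minibatch_size,
//                           ExamplesRepository *repository) {
//     std::vector<NnetExample> examples;
//     for (; !reader->Done(); reader->Next()) {
//       examples.push_back(reader->Value());
//       if (static_cast<int32>(examples.size()) == minibatch_size)
//         repository->AcceptExamples(&examples);   // empties "examples".
//     }
//     if (!examples.empty())
//       repository->AcceptExamples(&examples);     // partial minibatch.
//     repository->ExamplesDone();
//   }
//
//   void TrainingThreadSketch(ExamplesRepository *repository) {
//     std::vector<NnetExample> examples;
//     while (repository->ProvideExamples(&examples)) {
//       // ... train on "examples", e.g. via DoBackprop() ...
//       examples.clear();
//     }
//   }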
/**
This struct is used to store the information we need for discriminative training
(MMI or MPE). Each example corresponds to one chunk of a file (for better randomization
@ -116,7 +146,7 @@ struct DiscriminativeNnetExample {
void Read(std::istream &is, bool binary);
};
// Tes, the length of typenames is getting out of hand.
// Yes, the length of typenames is getting out of hand.
typedef TableWriter<KaldiObjectHolder<DiscriminativeNnetExample > >
DiscriminativeNnetExampleWriter;
typedef SequentialTableReader<KaldiObjectHolder<DiscriminativeNnetExample > >

Просмотреть файл

@ -432,6 +432,12 @@ void Nnet::RemovePreconditioning() {
*(dynamic_cast<AffineComponent*>(components_[i])));
delete components_[i];
components_[i] = ac;
} else if (dynamic_cast<AffineComponentPreconditionedOnline*>(
components_[i]) != NULL) {
AffineComponent *ac = new AffineComponent(
*(dynamic_cast<AffineComponent*>(components_[i])));
delete components_[i];
components_[i] = ac;
}
}
SetIndexes();

Просмотреть файл

@ -26,68 +26,6 @@
namespace kaldi {
namespace nnet2 {
/** This struct stores neural net training examples to be used in
multi-threaded training. */
class ExamplesRepository {
public:
/// The following function is called by the code that reads in the examples,
/// with a batch of examples. [It will empty the vector "examples").
void AcceptExamples(std::vector<NnetExample> *examples);
/// The following function is called by the code that reads in the examples,
/// when we're done reading examples.
void ExamplesDone();
/// This function is called by the code that does the training. It gets the
/// training examples, and if they are available, puts them in "examples" and
/// returns true. It returns false when there are no examples left and
/// ExamplesDone() has been called.
bool ProvideExamples(std::vector<NnetExample> *examples);
ExamplesRepository(): empty_semaphore_(1), done_(false) { }
private:
Semaphore full_semaphore_;
Semaphore empty_semaphore_;
std::vector<NnetExample> examples_;
bool done_;
KALDI_DISALLOW_COPY_AND_ASSIGN(ExamplesRepository);
};
void ExamplesRepository::AcceptExamples(
std::vector<NnetExample> *examples) {
KALDI_ASSERT(!examples->empty());
empty_semaphore_.Wait();
KALDI_ASSERT(examples_.empty());
examples_.swap(*examples);
full_semaphore_.Signal();
}
void ExamplesRepository::ExamplesDone() {
empty_semaphore_.Wait();
KALDI_ASSERT(examples_.empty());
done_ = true;
full_semaphore_.Signal();
}
bool ExamplesRepository::ProvideExamples(
std::vector<NnetExample> *examples) {
full_semaphore_.Wait();
if (done_) {
KALDI_ASSERT(examples_.empty());
full_semaphore_.Signal(); // Increment the semaphore so
// the call by the next thread will not block.
return false; // no examples to return-- all finished.
} else {
KALDI_ASSERT(!examples_.empty() && examples->empty());
examples->swap(examples_);
empty_semaphore_.Signal();
return true;
}
}
class DoBackpropParallelClass: public MultiThreadable {
public:

Просмотреть файл

@ -39,8 +39,8 @@ double NnetUpdater::ComputeForMinibatch(
CuMatrix<BaseFloat> tmp_deriv;
double ans = ComputeObjfAndDeriv(data, &tmp_deriv, tot_accuracy);
if (nnet_to_update_ != NULL)
Backprop(data, &tmp_deriv); // this is summed (after weighting), not
// averaged.
Backprop(&tmp_deriv); // this is summed (after weighting), not
// averaged.
return ans;
}
@ -133,9 +133,7 @@ double NnetUpdater::ComputeTotAccuracy(
}
void NnetUpdater::Backprop(const std::vector<NnetExample> &data,
CuMatrix<BaseFloat> *deriv) const {
int32 num_chunks = data.size();
void NnetUpdater::Backprop(CuMatrix<BaseFloat> *deriv) const {
// We assume ComputeObjfAndDeriv has already been called.
for (int32 c = nnet_.NumComponents() - 1; c >= 0; c--) {
const Component &component = nnet_.GetComponent(c);
@ -146,7 +144,7 @@ void NnetUpdater::Backprop(const std::vector<NnetExample> &data,
CuMatrix<BaseFloat> input_deriv(input.NumRows(), input.NumCols());
const CuMatrix<BaseFloat> &output_deriv(*deriv);
component.Backprop(input, output, output_deriv, num_chunks,
component.Backprop(input, output, output_deriv, num_chunks_,
component_to_update, &input_deriv);
input_deriv.Swap(deriv);
}

Просмотреть файл

@ -29,22 +29,20 @@
namespace kaldi {
namespace nnet2 {
/* This header provides functionality for sample-by-sample stochastic
/** @file
This header provides functionality for sample-by-sample stochastic
gradient descent and gradient computation with a neural net.
See also nnet-compute.h which is the same thing but for
See also \ref nnet-compute.h which is the same thing but for
whole utterances.
This is the inner part of the training code; see nnet-train.h
which contains a wrapper for this, with functionality for
automatically keeping the learning rates for each layer updated
using a heuristic involving validation-set gradients.
*/
class NnetEnsembleTrainer;
// This class NnetUpdater contains functions for updating the neural net or
// computing its gradient, given a set of NnetExamples. We
// define it in the header file because it's needed by the ensemble training.
// But in normal cases its functionality should be used by calling DoBackprop(),
// and by ComputeNnetObjf()
class NnetEnsembleTrainer;
class NnetUpdater {
public:
// Note: in the case of training with SGD, "nnet" and "nnet_to_update" will
@ -84,8 +82,7 @@ class NnetUpdater {
/// contain, at input, the derivative w.r.t. the output layer (as computed by
/// ComputeObjfAndDeriv), but will be used as a temporary variable by this
/// function.
void Backprop(const std::vector<NnetExample> &data,
CuMatrix<BaseFloat> *deriv) const;
void Backprop(CuMatrix<BaseFloat> *deriv) const;
friend class NnetEnsembleTrainer;
private:
@ -100,10 +97,6 @@ class NnetUpdater {
std::vector<CuMatrix<BaseFloat> > forward_data_; // The forward data
// for the outputs of each of the components.
// These weights are one per parameter; they equal to the "weight"
// member variables in the NnetExample structures. These
// will typically be about one on average.
CuVector<BaseFloat> chunk_weights_;
};
/// This function computes the objective function and either updates the model

Просмотреть файл

@ -90,12 +90,13 @@ void NnetEnsembleTrainer::TrainOneMinibatch() {
post_mat[i].ApplyLog();
std::vector<BaseFloat> log_post_correct;
post_mat[i].Lookup(sv_labels_ind, &log_post_correct);
BaseFloat log_prob_this_net = std::accumulate(log_post_correct.begin(), log_post_correct.end(), static_cast<BaseFloat>(0));
BaseFloat log_prob_this_net = std::accumulate(log_post_correct.begin(),
log_post_correct.end(),
static_cast<BaseFloat>(0));
avg_logprob_this_phase_ += log_prob_this_net;
tmp_deriv.InvertElements();
tmp_deriv.MulElements(post_avg);
updater_ensemble_[i]->Backprop(buffer_, &tmp_deriv);
updater_ensemble_[i]->Backprop(&tmp_deriv);
}
count_this_phase_ += buffer_.size();
buffer_.clear();

Просмотреть файл

@ -0,0 +1,710 @@
// nnet2/train-nnet-perturbed.cc
// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "nnet2/train-nnet-perturbed.h"
#include "nnet2/nnet-update.h"
#include "thread/kaldi-thread.h"
namespace kaldi {
namespace nnet2 {
class NnetPerturbedUpdater {
public:
// Note: in the case of training with SGD, "nnet" and "nnet_to_update" will be
// identical. They'd be different if we're accumulating the gradient for a
// held-out set and don't want to update the model, but this shouldn't happen
// for this "perturbed" update. nnet_to_update may be NULL if you don't
// want to do backprop, but this probably doesn't make sense.
// num_layers_before_input is the number of layers to ignore before what
// we consider to be the input (x) for purposes of this technique. This will
// likely equal 2: one for the feature-splicing layer (SpliceComponent) and
// one for the preconditioning layer (FixedAffineComponent).
//
// within_class_covar is the within-class covariance matrix in the space we
// regard as the input x, i.e. after the first num_layers_before_input layers.
NnetPerturbedUpdater(const Nnet &nnet,
int32 num_layers_before_input,
const CuMatrix<BaseFloat> &within_class_covar,
Nnet *nnet_to_update);
// This function does the entire forward and backward computation for this
// minibatch. Outputs to tot_objf_orig and tot_objf_perturbed the total
// objective function (including any weighting factors) over this minibatch,
// and the same after perturbing the data.
void ComputeForMinibatch(const std::vector<NnetExample> &data,
BaseFloat D,
double *tot_objf_orig,
double *tot_objf_perturbed);
protected:
/// takes the input and formats as a single matrix, in forward_data_[0].
void FormatInput(const std::vector<NnetExample> &data);
/// Do the forward propagation for layers 0 ... num_layers_before_input_ - 1,
/// typically the first two layers. This will be called once per minibatch.
void PropagateInitial() { Propagate(0, num_layers_before_input_); }
/// Do the forward propagation for layers num_layers_before_input_
/// ... num-layers-1, typically all but the first two layers. This will be
/// called twice per minibatch, once before and once after perturbing the
/// inputs.
void PropagateRemaining() { Propagate(num_layers_before_input_,
nnet_.NumComponents()); }
/// Internal Propagate function, does the forward computation for
/// layers begin_layer ... end_layer - 1.
void Propagate(int32 begin_layer, int32 end_layer);
/// Computes objective function and derivative at output layer, but does not
/// do the backprop [for that, see Backprop()]. This will be called twice per
/// minibatch, once before and once after perturbing the inputs.
void ComputeObjfAndDeriv(const std::vector<MatrixElement<BaseFloat> > &sv_labels,
CuMatrix<BaseFloat> *deriv,
BaseFloat *tot_objf,
BaseFloat *tot_weight) const;
/// Computes supervision labels from data.
void ComputeSupervisionLabels(const std::vector<NnetExample> &data,
std::vector<MatrixElement<BaseFloat> > *sv_labels);
/// Backprop must be called after ComputeObjfAndDeriv (it will be called
/// twice, the first time with a NULL nnet_to_update pointer). It does the
/// backpropagation (not including the first num_layers_before_input_ layers).
/// "nnet_to_update" is updated, if non-NULL. Note: "deriv" will contain, at
/// input, the derivative w.r.t. the output layer (as computed by
/// ComputeObjfAndDeriv), but will be used as a temporary variable by this
/// function, and at exit will contain the derivative of the objective function
/// w.r.t. the input of layer num_layers_before_input_.
void Backprop(Nnet *nnet_to_update,
CuMatrix<BaseFloat> *deriv) const;
/// Perturb the input features (actually, the features at the input of layer
/// num_layers_before_input_). This modifies the value of
/// forward_data_[num_layers_before_input_]. For the math, see \ref
/// train-nnet-perturbed.h
void PerturbInput(const CuMatrix<BaseFloat> &deriv_at_input,
BaseFloat D);
private:
const Nnet &nnet_;
Nnet *nnet_to_update_;
int32 num_layers_before_input_; // Number of layers before whichever layer we
// regard as the input for purposes of this
// method (normally 2, to include splicing
// layer and preconditioning layer)
const CuMatrix<BaseFloat> &within_class_covar_;
int32 num_chunks_; // same as the minibatch size.
std::vector<CuMatrix<BaseFloat> > forward_data_; // The forward data
// for the outputs of each of the components.
};
NnetPerturbedUpdater::NnetPerturbedUpdater(const Nnet &nnet,
int32 num_layers_before_input,
const CuMatrix<BaseFloat> &within_class_covar,
Nnet *nnet_to_update):
nnet_(nnet),
nnet_to_update_(nnet_to_update),
num_layers_before_input_(num_layers_before_input),
within_class_covar_(within_class_covar) {
KALDI_ASSERT(num_layers_before_input_ >= 0 &&
num_layers_before_input < nnet.NumComponents());
for (int32 c = 0; c < num_layers_before_input_; c++) {
const Component *comp = &(nnet.GetComponent(c));
const UpdatableComponent *uc = dynamic_cast<const UpdatableComponent*>(comp);
if (uc != NULL) {
KALDI_ERR << "One of the pre-input layers is updatable.";
}
}
}
void NnetPerturbedUpdater::PerturbInput(
const CuMatrix<BaseFloat> &deriv_at_input,
BaseFloat D) {
// The code doesn't handle the case where there is further splicing after the
// input.
KALDI_ASSERT(num_chunks_ == deriv_at_input.NumRows());
// For the math, see train-nnet-perturbed.h.
// deriv_at_input is \nabla in the math.
// "input" is the input features, currently unmodified, but we'll
// modify them.
CuMatrix<BaseFloat> &input(forward_data_[num_layers_before_input_]);
KALDI_ASSERT(SameDim(input, deriv_at_input));
// Each row of deriv_w will equal (W \nabla_t)', where ' is transpose.
CuMatrix<BaseFloat> deriv_w(input.NumRows(), input.NumCols());
// note: for the second transpose-ness argument below we can choose either
// kTrans or kNoTrans because the matrix is symmetric. I'm guessing that
// kTrans will be faster.
deriv_w.AddMatMat(1.0, deriv_at_input, kNoTrans,
within_class_covar_, kTrans, 0.0);
// k will be used to compute and store the gradient-scaling factor k_t.
CuVector<BaseFloat> k(deriv_at_input.NumRows());
// after the next call, each element of k will contain (\nabla_t^T W \nabla_t)
// We want k_t = D / sqrt(\nabla_t^T W \nabla_t)
// so we need to take this to the power -0.5.
// We can't do this if it's zero, so we first floor to a very small value.
k.AddDiagMatMat(1.0, deriv_w, kNoTrans, deriv_at_input, kTrans, 0.0);
int32 num_floored = k.ApplyFloor(1.0e-20);
if (num_floored > 0.0) {
// Should only happen at the very start of training,
KALDI_WARN << num_floored << " gradients floored (derivative at input was "
<< "close to zero); this should only happen at the start of training "
<< "or when adding a new layer.";
}
k.ApplyPow(-0.5);
// now we have k_t = 1.0 / sqrt(\nabla_t^T W \nabla_t).
// in the math, k_t contains an additional factor of D, but we'll
// add this later.
// Below, we will do x'_t = x_t - k_t W \nabla_t
// Here, each row of deriv_w contains the transpose of W \nabla_t.
// The factor of D is because it was missing in k.
input.AddDiagVecMat(-1.0 * D, k, deriv_w, kNoTrans, 1.0);
}
void NnetPerturbedUpdater::ComputeForMinibatch(
const std::vector<NnetExample> &data,
BaseFloat D,
double *tot_objf_orig,
double *tot_objf_perturbed) {
FormatInput(data);
PropagateInitial();
PropagateRemaining();
CuMatrix<BaseFloat> tmp_deriv;
std::vector<MatrixElement<BaseFloat> > sv_labels;
ComputeSupervisionLabels(data, &sv_labels);
BaseFloat tot_objf, tot_weight;
ComputeObjfAndDeriv(sv_labels, &tmp_deriv, &tot_objf, &tot_weight);
KALDI_VLOG(4) << "Objective function (original) is " << (tot_objf/tot_weight)
<< " per sample, over " << tot_weight << " samples (weighted).";
*tot_objf_orig = tot_objf;
// only backprops till layer number num_layers_before_input_,
// and derivative at that layer is in tmp_deriv.
Backprop(NULL, &tmp_deriv);
// perturb forward_data_[num_layers_before_input_].
PerturbInput(tmp_deriv, D);
// Now propagate forward again from that point.
PropagateRemaining();
ComputeObjfAndDeriv(sv_labels, &tmp_deriv, &tot_objf, &tot_weight);
KALDI_VLOG(4) << "Objective function (perturbed) is " << (tot_objf/tot_weight)
<< " per sample, over " << tot_weight << " samples (weighted).";
*tot_objf_perturbed = tot_objf;
// The actual model updating would happen in the next call.
if (nnet_to_update_ != NULL)
Backprop(nnet_to_update_, &tmp_deriv);
}
void NnetPerturbedUpdater::Propagate(int32 begin_layer, int32 end_layer) {
static int32 num_times_printed = 0;
for (int32 c = begin_layer; c < end_layer; c++) {
const Component &component = nnet_.GetComponent(c);
const CuMatrix<BaseFloat> &input = forward_data_[c];
CuMatrix<BaseFloat> &output = forward_data_[c+1];
// Note: the Propagate function will automatically resize the
// output.
component.Propagate(input, num_chunks_, &output);
KALDI_VLOG(4) << "Propagating: sum at output of " << c << " is " << output.Sum();
// If we won't need the output of the previous layer for
// backprop, delete it to save memory.
bool need_last_output =
(c>0 && nnet_.GetComponent(c-1).BackpropNeedsOutput()) ||
component.BackpropNeedsInput();
if (g_kaldi_verbose_level >= 3 && num_times_printed < 100) {
KALDI_VLOG(3) << "Stddev of data for component " << c
<< " for this minibatch is "
<< (TraceMatMat(forward_data_[c], forward_data_[c], kTrans) /
(forward_data_[c].NumRows() * forward_data_[c].NumCols()));
num_times_printed++;
}
if (!need_last_output && c != num_layers_before_input_)
forward_data_[c].Resize(0, 0); // We won't need this data.
}
}
void NnetPerturbedUpdater::ComputeSupervisionLabels(
const std::vector<NnetExample> &data,
std::vector<MatrixElement<BaseFloat> > *sv_labels) {
sv_labels->clear();
sv_labels->reserve(num_chunks_); // We must have at least this many labels.
for (int32 m = 0; m < num_chunks_; m++) {
for (size_t i = 0; i < data[m].labels.size(); i++) {
MatrixElement<BaseFloat>
tmp = {m, data[m].labels[i].first, data[m].labels[i].second};
sv_labels->push_back(tmp);
}
}
}
void NnetPerturbedUpdater::ComputeObjfAndDeriv(
const std::vector<MatrixElement<BaseFloat> > &sv_labels,
CuMatrix<BaseFloat> *deriv,
BaseFloat *tot_objf,
BaseFloat *tot_weight) const {
int32 num_components = nnet_.NumComponents();
deriv->Resize(num_chunks_, nnet_.OutputDim()); // sets to zero.
const CuMatrix<BaseFloat> &output(forward_data_[num_components]);
KALDI_ASSERT(SameDim(output, *deriv));
deriv->CompObjfAndDeriv(sv_labels, output, tot_objf, tot_weight);
}
void NnetPerturbedUpdater::Backprop(Nnet *nnet_to_update,
CuMatrix<BaseFloat> *deriv) const {
// We assume ComputeObjfAndDeriv has already been called.
for (int32 c = nnet_.NumComponents() - 1; c >= num_layers_before_input_; c--) {
const Component &component = nnet_.GetComponent(c);
Component *component_to_update = (nnet_to_update == NULL ? NULL :
&(nnet_to_update->GetComponent(c)));
const CuMatrix<BaseFloat> &input = forward_data_[c],
&output = forward_data_[c+1];
CuMatrix<BaseFloat> input_deriv(input.NumRows(), input.NumCols());
const CuMatrix<BaseFloat> &output_deriv(*deriv);
component.Backprop(input, output, output_deriv, num_chunks_,
component_to_update, &input_deriv);
input_deriv.Swap(deriv);
}
}
void NnetPerturbedUpdater::FormatInput(const std::vector<NnetExample> &data) {
KALDI_ASSERT(data.size() > 0);
int32 num_splice = nnet_.LeftContext() + 1 + nnet_.RightContext();
KALDI_ASSERT(data[0].input_frames.NumRows() >= num_splice);
int32 feat_dim = data[0].input_frames.NumCols(),
spk_dim = data[0].spk_info.Dim(),
tot_dim = feat_dim + spk_dim; // we append these at the neural net
// input... note, spk_dim might be 0.
KALDI_ASSERT(tot_dim == nnet_.InputDim());
KALDI_ASSERT(data[0].left_context >= nnet_.LeftContext());
int32 ignore_frames = data[0].left_context - nnet_.LeftContext(); // If
// the NnetExample has more left-context than we need, ignore some.
// this may happen in settings where we increase the amount of context during
// training, e.g. by adding layers that require more context.
num_chunks_ = data.size();
forward_data_.resize(nnet_.NumComponents() + 1);
// First copy to a single matrix on the CPU, so we can copy to
// GPU with a single copy command.
Matrix<BaseFloat> temp_forward_data(num_splice * num_chunks_,
tot_dim);
for (int32 chunk = 0; chunk < num_chunks_; chunk++) {
SubMatrix<BaseFloat> dest(temp_forward_data,
chunk * num_splice, num_splice,
0, feat_dim);
Matrix<BaseFloat> full_src(data[chunk].input_frames);
SubMatrix<BaseFloat> src(full_src, ignore_frames, num_splice, 0, feat_dim);
dest.CopyFromMat(src);
if (spk_dim != 0) {
SubMatrix<BaseFloat> spk_dest(temp_forward_data,
chunk * num_splice, num_splice,
feat_dim, spk_dim);
spk_dest.CopyRowsFromVec(data[chunk].spk_info);
}
}
forward_data_[0].Swap(&temp_forward_data); // Copy to GPU, if being used.
}
void DoBackpropPerturbed(const Nnet &nnet,
int32 num_layers_before_input,
const CuMatrix<BaseFloat> &within_class_covar,
BaseFloat D,
const std::vector<NnetExample> &examples,
Nnet *nnet_to_update,
double *tot_objf_orig,
double *tot_objf_perturbed) {
try {
NnetPerturbedUpdater updater(nnet, num_layers_before_input,
within_class_covar, nnet_to_update);
updater.ComputeForMinibatch(examples, D, tot_objf_orig, tot_objf_perturbed);
} catch (...) {
KALDI_LOG << "Error doing backprop, nnet info is: " << nnet.Info();
throw;
}
}
NnetPerturbedTrainer::NnetPerturbedTrainer(
const NnetPerturbedTrainerConfig &config,
const SpMatrix<BaseFloat> &within_class_covar,
Nnet *nnet):
config_(config), nnet_(nnet), logprob_this_phase_(0.0),
logprob_perturbed_this_phase_(0.0), weight_this_phase_(0.0),
logprob_total_(0.0), logprob_perturbed_total_(0.0),
weight_total_(0.0),
D_(config.initial_d) {
InitWithinClassCovar(within_class_covar);
num_phases_ = 0;
bool first_time = true;
BeginNewPhase(first_time);
}
// This function is used in class NnetPerturbedTrainer
// and the function DoBackpropPerturbedParallel.
void InitWithinClassCovar(
const SpMatrix<BaseFloat> &within_class_covar,
const Nnet &nnet,
int32 *num_layers_before_input,
CuMatrix<BaseFloat> *within_class_covar_out) {
CuSpMatrix<BaseFloat> orig_covar(within_class_covar);
*num_layers_before_input = 0;
KALDI_ASSERT(nnet.NumComponents() > *num_layers_before_input);
const Component *comp = &(nnet.GetComponent(*num_layers_before_input));
// Skip over any SpliceComponent that appears at the beginning of
// the network.
if (dynamic_cast<const SpliceComponent*>(comp) != NULL)
(*num_layers_before_input)++;
KALDI_ASSERT(nnet.NumComponents() > *num_layers_before_input);
comp = &(nnet.GetComponent(*num_layers_before_input));
const FixedAffineComponent *fa =
dynamic_cast<const FixedAffineComponent*>(comp);
if (fa != NULL) {
(*num_layers_before_input)++;
const CuMatrix<BaseFloat> &linear_params = fa->LinearParams();
if (linear_params.NumCols() != orig_covar.NumCols()) {
KALDI_ERR << "The neural network seems to expect a (spliced) feature "
<< "dimension of " << linear_params.NumCols() << ", but your "
<< "LDA stats have a dimension of " << orig_covar.NumCols();
}
CuMatrix<BaseFloat> temp(linear_params.NumRows(), orig_covar.NumRows());
// temp = linear_params . orig_covar
temp.AddMatSp(1.0, linear_params, kNoTrans, orig_covar, 0.0);
within_class_covar_out->Resize(linear_params.NumRows(),
linear_params.NumRows());
// temp = linear_params . orig_covar . linear_params^T
within_class_covar_out->AddMatMat(1.0, temp, kNoTrans,
linear_params, kTrans, 0.0);
// note: this should be symmetric, spot-test it like this:
KALDI_ASSERT(ApproxEqual(TraceMatMat(*within_class_covar_out,
*within_class_covar_out, kNoTrans),
TraceMatMat(*within_class_covar_out,
*within_class_covar_out, kTrans)));
} else {
if (comp->InputDim() != orig_covar.NumCols()) {
KALDI_ERR << "The neural network seems to expect a (spliced) feature "
<< "dimension of " << comp->InputDim() << ", but your "
<< "LDA stats have a dimension of " << orig_covar.NumCols();
}
within_class_covar_out->Resize(orig_covar.NumRows(), orig_covar.NumCols());
within_class_covar_out->CopyFromSp(orig_covar);
}
}
void NnetPerturbedTrainer::InitWithinClassCovar(
const SpMatrix<BaseFloat> &within_class_covar) {
kaldi::nnet2::InitWithinClassCovar(within_class_covar, *nnet_,
&num_layers_before_input_,
&within_class_covar_);
}
void NnetPerturbedTrainer::TrainOnExample(const NnetExample &value) {
buffer_.push_back(value);
if (static_cast<int32>(buffer_.size()) == config_.minibatch_size)
TrainOneMinibatch();
}
void NnetPerturbedTrainer::TrainOneMinibatch() {
KALDI_ASSERT(!buffer_.empty());
double tot_objf_orig, tot_objf_perturbed;
DoBackpropPerturbed(*nnet_, num_layers_before_input_, within_class_covar_, D_,
buffer_, nnet_, &tot_objf_orig, &tot_objf_perturbed);
logprob_this_phase_ += tot_objf_orig;
logprob_perturbed_this_phase_ += tot_objf_perturbed;
double weight = TotalNnetTrainingWeight(buffer_);
UpdateD(tot_objf_orig / weight, tot_objf_perturbed / weight);
weight_this_phase_ += weight;
buffer_.clear();
minibatches_seen_this_phase_++;
if (minibatches_seen_this_phase_ == config_.minibatches_per_phase) {
bool first_time = false;
BeginNewPhase(first_time);
}
}
void NnetPerturbedTrainer::UpdateD(BaseFloat orig_objf_per_example,
BaseFloat perturbed_objf_per_example) {
BaseFloat diff = orig_objf_per_example - perturbed_objf_per_example;
// note: diff should be positive in the normal case.
KALDI_ASSERT(config_.target_objf_change > 0.0 && config_.max_d_factor > 1.0);
BaseFloat objf_ratio = config_.target_objf_change /
std::max<BaseFloat>(1.0e-20, diff),
D_ratio = pow(objf_ratio, config_.tune_d_power);
if (D_ratio > config_.max_d_factor)
D_ratio = config_.max_d_factor;
else if (D_ratio < 1.0 / config_.max_d_factor)
D_ratio = 1.0 / config_.max_d_factor;
BaseFloat D_new = D_ * D_ratio;
KALDI_VLOG(3) << "Training objective function normal/perturbed is "
<< orig_objf_per_example << '/' << perturbed_objf_per_example
<< ", diff " << diff << " vs. target "
<< config_.target_objf_change
<< ", changing D by factor " << D_ratio << " to " << D_new;
D_ = D_new;
}
void NnetPerturbedTrainer::BeginNewPhase(bool first_time) {
if (!first_time) {
BaseFloat logprob = logprob_this_phase_/weight_this_phase_,
logprob_perturbed = logprob_perturbed_this_phase_/weight_this_phase_,
diff = logprob - logprob_perturbed;
KALDI_LOG << "Training objective function normal->perturbed is "
<< logprob << " -> " << logprob_perturbed << ", diff "
<< diff << " vs. target " << config_.target_objf_change
<< ", over " << weight_this_phase_ << " frames, D is "
<< D_;
}
logprob_total_ += logprob_this_phase_;
logprob_perturbed_total_ += logprob_perturbed_this_phase_;
weight_total_ += weight_this_phase_;
logprob_this_phase_ = 0.0;
logprob_perturbed_this_phase_ = 0.0;
weight_this_phase_ = 0.0;
minibatches_seen_this_phase_ = 0;
num_phases_++;
}
NnetPerturbedTrainer::~NnetPerturbedTrainer() {
if (!buffer_.empty()) {
KALDI_LOG << "Doing partial minibatch of size "
<< buffer_.size();
TrainOneMinibatch();
if (minibatches_seen_this_phase_ != 0) {
bool first_time = false;
BeginNewPhase(first_time);
}
}
if (weight_total_ == 0.0) {
KALDI_WARN << "No data seen.";
} else {
KALDI_LOG << "Did backprop on " << weight_total_
<< " examples, average log-prob normal->perturbed per frame is "
<< (logprob_total_ / weight_total_) << " -> "
<< (logprob_perturbed_total_ / weight_total_);
KALDI_LOG << "[this line is to be parsed by a script:] log-prob-per-frame="
<< (logprob_total_ / weight_total_);
}
}
// compare with DoBackpropParallelClass
class TrainParallelPerturbedClass: public MultiThreadable {
public:
// This constructor is only called for a temporary object
// that we pass to the RunMultiThreaded function.
TrainParallelPerturbedClass(const NnetPerturbedTrainerConfig &config,
const CuMatrix<BaseFloat> &within_class_covar,
int32 num_layers_before_input,
BaseFloat *D,
Nnet *nnet,
ExamplesRepository *repository,
double *log_prob_orig_ptr,
double *log_prob_perturbed_ptr,
double *tot_weight_ptr):
config_(config), within_class_covar_(within_class_covar),
num_layers_before_input_(num_layers_before_input), D_(D),
nnet_(nnet), repository_(repository),
log_prob_orig_ptr_(log_prob_orig_ptr),
log_prob_perturbed_ptr_(log_prob_perturbed_ptr),
tot_weight_ptr_(tot_weight_ptr),
log_prob_orig_(0.0),
log_prob_perturbed_(0.0),
tot_weight_(0.0) { }
// Use the default copy constructor.
// This does the main function of the class.
void operator () () {
std::vector<NnetExample> examples;
while (repository_->ProvideExamples(&examples)) {
double objf_orig, objf_perturbed,
weight = TotalNnetTrainingWeight(examples);
DoBackpropPerturbed(*nnet_, num_layers_before_input_,
within_class_covar_, *D_,
examples, nnet_,
&objf_orig, &objf_perturbed);
UpdateD(objf_orig / weight, objf_perturbed / weight);
tot_weight_ += weight;
log_prob_orig_ += objf_orig;
log_prob_perturbed_ += objf_perturbed;
KALDI_VLOG(4) << "Thread " << thread_id_ << " saw "
<< tot_weight_ << " frames so far (weighted); likelihood "
<< "per frame (orig->perturbed) so far is "
<< (log_prob_orig_ / tot_weight_) << " -> "
<< (log_prob_perturbed_ / tot_weight_);
examples.clear();
}
}
~TrainParallelPerturbedClass() {
*log_prob_orig_ptr_ += log_prob_orig_;
*log_prob_perturbed_ptr_ += log_prob_perturbed_;
*tot_weight_ptr_ += tot_weight_;
}
private:
void UpdateD(BaseFloat orig_logprob, BaseFloat perturbed_logprob) {
BaseFloat diff = orig_logprob - perturbed_logprob;
// note: diff should be positive in the normal case.
KALDI_ASSERT(config_.target_objf_change > 0.0 && config_.max_d_factor > 1.0);
// divide the power we raise the ratio to when tuning D, by the
// number of threads; this should ensure stability of the update.
BaseFloat tune_d_power = config_.tune_d_power / g_num_threads;
BaseFloat objf_ratio = config_.target_objf_change /
std::max<BaseFloat>(1.0e-20, diff),
D_ratio = pow(objf_ratio, tune_d_power);
if (D_ratio > config_.max_d_factor)
D_ratio = config_.max_d_factor;
else if (D_ratio < 1.0 / config_.max_d_factor)
D_ratio = 1.0 / config_.max_d_factor;
BaseFloat D_new = (*D_) * D_ratio;
*D_ = D_new;
// Note: we are accessing *D_ from multiple threads without
// locking, but the negative consequences of this contention are
// very small (at worst, some threads briefly use a slightly stale
// value of D).
KALDI_VLOG(3) << "Training objective function normal->perturbed is "
<< orig_logprob << " -> " << perturbed_logprob
<< ", diff " << diff << " vs. target "
<< config_.target_objf_change
<< ", changing D by factor " << D_ratio << " to " << D_new;
}
const NnetPerturbedTrainerConfig &config_;
const CuMatrix<BaseFloat> &within_class_covar_;
int32 num_layers_before_input_;
BaseFloat *D_; // Constant D that controls how much to perturb the data. We
// update this as well as use it.
Nnet *nnet_;
ExamplesRepository *repository_;
double *log_prob_orig_ptr_;
double *log_prob_perturbed_ptr_;
double *tot_weight_ptr_;
double log_prob_orig_; // log-like times num frames (before perturbing features)
double log_prob_perturbed_; // log-like times num frames (after perturbing features)
double tot_weight_; // normalizing factor for the above.
};
void DoBackpropPerturbedParallel(const NnetPerturbedTrainerConfig &config,
const SpMatrix<BaseFloat> &within_class_covar,
SequentialNnetExampleReader *example_reader,
double *tot_objf_orig,
double *tot_objf_perturbed,
double *tot_weight,
Nnet *nnet) {
// within_class_covar_processed is the within-class covar as CuMatrix, possibly
// projected by the preconditioning transform in any FixedAffineComponent.
CuMatrix<BaseFloat> within_class_covar_processed;
int32 num_layers_before_input;
InitWithinClassCovar(within_class_covar, *nnet,
&num_layers_before_input,
&within_class_covar_processed);
BaseFloat D = config.initial_d;
ExamplesRepository repository;  // handles parallel programming issues regarding the queue of example minibatches.
*tot_objf_orig = *tot_objf_perturbed = *tot_weight = 0.0;
TrainParallelPerturbedClass trainer_proto(config,
within_class_covar_processed,
num_layers_before_input, &D,
nnet, &repository,
tot_objf_orig,
tot_objf_perturbed,
tot_weight);
{
// The initialization of the following class spawns the threads that
// process the examples. They get re-joined in its destructor.
MultiThreader<TrainParallelPerturbedClass> m(g_num_threads, trainer_proto);
std::vector<NnetExample> examples;
for (; !example_reader->Done(); example_reader->Next()) {
examples.push_back(example_reader->Value());
if (examples.size() == config.minibatch_size)
repository.AcceptExamples(&examples);
}
if (!examples.empty()) // partial minibatch.
repository.AcceptExamples(&examples);
// Here, the destructor of "m" re-joins the threads, and
// does the summing of the gradients if we're doing gradient
// computation (i.e. &nnet != nnet_to_update). This gets
// done in the destructors of the objects of type
// DoBackpropParallelClass.
repository.ExamplesDone();
}
KALDI_LOG << "Did backprop on " << *tot_weight << " examples, average log-prob "
<< "per frame (orig->perturbed) is "
<< (*tot_objf_orig / *tot_weight) << " -> "
<< (*tot_objf_perturbed / *tot_weight) << " over "
<< *tot_weight << " samples (weighted).";
KALDI_LOG << "[this line is to be parsed by a script:] log-prob-per-frame="
<< (*tot_objf_orig / *tot_weight);
}
} // namespace nnet2
} // namespace kaldi

Просмотреть файл

@ -0,0 +1,327 @@
// nnet2/train-nnet-perturbed.h
// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_NNET2_TRAIN_NNET_PERTURBED_H_
#define KALDI_NNET2_TRAIN_NNET_PERTURBED_H_
#include "nnet2/nnet-nnet.h"
#include "nnet2/nnet-example.h"
#include "itf/options-itf.h"
namespace kaldi {
namespace nnet2 {
/**
@file
This file was modified from train-nnet.h in order to implement an idea
about perturbing the training examples slightly, in a direction that's
opposite to the gradient of the objective function w.r.t. those examples.
It's a bit like the idea in "Intriguing properties of neural networks", the
training method they mention, except they have a more complicated formulation
with L-BFGS. We can justify our idea by approximating the neural network
plus objective-function evaluation as a linear function.
Note: before doing this, we want to make sure the input features have a
reasonable distribution, and our choice for this is to make the within-class
covariance matrix unit. [note: we don't have to normalize the mean to zero,
this won't matter.] Rather than explicitly transforming the features using
a transform T, it turns out that we have to multiply the gradients by something
like T T'. We'll describe this later.
Suppose the actual input features are x. Typically we do frame splicing
as part of the network, and it's more convenient to do the perturbation on
the spliced features, so x may actually be the output of the network's
first (splicing) layer. Suppose the within-class covariance matrix of
x is W. If we do the Cholesky transform
W = C C^T,
then C^{-1} W C^{-T} = I, so if we define
T =(def) C^{-1} and transformed features
\hat{x} =(def) T x,
then the within-class covariance matrix of the transformed features \hat{x}
is T W T^T = C^{-1} (C C^T) C^{-T} = I.
The way we formulate the perturbed-feature thing is somewhat similar to the
"Intriguing properties of neural networks" paper, except we're not in image
recognition so no need to keep features in the range [0, 1]. Given a training
example \hat{x}_t, we want to find a perturbed example
\hat{x}'_t = \hat{x}_t + d_t
that gives the worst possible loss-value, such that ||d_t|| <= D, where D is
a scalar length parameter (e.g. D = 0.1), and ||.|| is the 2-norm. This means
that we want to perturb the training example in the most damaging way possible,
given that it should not change by more than a certain amount. Because we've
normalized the within-class covariance we believe that using a normal 2-norm
on d_t, rather than a more general form of inner-product, is suitable.
Anyway, we make a simplifying assumption that the loss function for a particular
sample is just a linear function of the input, and when we get to the space of
\hat{x}, it just means we go a certain distance D down the gradient. How we
set a suitable value for D, we'll come to later.
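(Under that linear approximation, the worst-case perturbation in the whitened
space is just a step of length D against the gradient \hat{\nabla}_t of the
objective w.r.t. \hat{x}_t (defined just below):
   d_t = -D \hat{\nabla}_t / ||\hat{\nabla}_t||_2,
which is the form we now derive in terms of the original features x.)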
Suppose by backpropagating the
derivative to x we get a derivative \nabla_t of the objective function (e.g. a
log-probability) w.r.t. x_t. Then we can get the derivative \hat{\nabla}_t of
the objective function w.r.t. \hat{x}_t, by identifying
x_t^T \nabla_t = \hat{x}_t^T \hat{\nabla}_t
x_t^T \nabla_t = x_t^T T^T \hat{\nabla}_t
x_t^T \nabla_t = x_t^T T^T T^{-T} \nabla_t, since T^T T^{-T} = I.
[note, ^T is transpose and ^{-T} is inverse-of-transpose.]
so \hat{\nabla}_t = T^{-T} \nabla_t.
(this is not the formal way of getting these derivatives, it's just how I remember).
Anyway, we now have
\hat{x}'_t =(def) \hat{x}_t - k_t T^{-T} \nabla_t
where k_t is chosen to ensure that
k_t || T^{-T} \nabla_t ||_2 = D
k_t sqrt( \nabla_t^T T^{-1} T^{-T} \nabla_t ) = D
so
k_t = D / sqrt(\nabla_t^T T^{-1} T^{-T} \nabla_t)
= D / sqrt(\nabla_t^T C C^T \nabla_t)
= D / sqrt(\nabla_t^T W \nabla_t)
Now, we actually want the update in terms of the original variable x instead of \hat{x},
so multiplying the definition of \hat{x}'_t above by T^{-1} on the left, we have:
x'_t = x_t - k_t T^{-1} T^{-T} \nabla_t
= x_t - k_t W \nabla_t
(note: we can also use W \nabla_t for efficiently computing k_t).
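To make the update concrete, here is a minimal single-frame sketch of
x'_t = x_t - k_t W \nabla_t with k_t = D / sqrt(\nabla_t^T W \nabla_t). It is
illustrative only: the real implementation works on whole minibatches with
CuMatrix, and the function name and plain std::vector types here are assumptions.

#include <cmath>
#include <cstddef>
#include <vector>

// Sketch only: perturb one example by (whitened-space) distance D in the most
// damaging direction, i.e. down the gradient of the objective function.
std::vector<double> PerturbExample(const std::vector<double> &x,     // x_t
                                   const std::vector<double> &grad,  // \nabla_t
                                   const std::vector<std::vector<double> > &W,  // within-class covariance
                                   double D) {
  std::size_t dim = x.size();
  std::vector<double> W_grad(dim, 0.0);  // W \nabla_t (reused below to compute k_t)
  for (std::size_t i = 0; i < dim; i++)
    for (std::size_t j = 0; j < dim; j++)
      W_grad[i] += W[i][j] * grad[j];
  double quad = 0.0;  // \nabla_t^T W \nabla_t
  for (std::size_t i = 0; i < dim; i++)
    quad += grad[i] * W_grad[i];
  double k = (quad > 0.0) ? D / std::sqrt(quad) : 0.0;  // k_t
  std::vector<double> x_perturbed(x);
  for (std::size_t i = 0; i < dim; i++)
    x_perturbed[i] -= k * W_grad[i];  // x'_t = x_t - k_t W \nabla_t
  return x_perturbed;
}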
It will actually be more efficient to do this after the FixedAffineTransform
layer that we used to "precondition" the features, so after the second layer
of the input rather than the first. All we need to do is to get the
within-class covariance matrix W in that space (after the
FixedAffineTransform) instead. We'll use the name x for that space, and forget
about the original input space.
Next, we want to discuss how we'll set the constant D. D is a proportion of
the within-class covariance. However, it's not clear a priori how to set
this, or that we can tune it just once and then leave it fixed for other
setups. For one thing, if the input features contain a lot of "nuisance"
dimensions that are not very informative about the class, it may be necessary
for D to be smaller (because hopefully the gradients will be small in those
nuisance directions). There is another issue that this whole method is
intended to improve generalization, so we only want to use it strongly if
generalization is actually a problem. For example, if we have so much
training data and so few parameters that we have no trouble generalizing, we
might not want to apply this method too strongly. Our method will be to set D
so that we get, on average, a certain per-frame degradation in the objective
function, which we'll call "target-objf-change". Each time we
apply this perturbation to a minibatch, we'll see whether the degradation in
objective is greater or less than "target-objf-change", and we'll change
D accordingly. We'll use a simple heuristic that D should change proportionally
to the 0.5'th power of the ratio between the "target-objf-change" and the
observed objective function change for this minibatch, but never by more than
a factor of two. Note: the only significance of 0.5 here is that 0.5 <= 1; a
smaller power means slower changes in D. With 0.5, D should converge to the
right value over about 2 minibatches; if this proves unstable, we'll change it.
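As an illustrative sketch of that heuristic (the trainer class below has an
UpdateD() method for this; the function name, default arguments, and handling of
a non-positive observed change here are assumptions, not the exact code):

#include <algorithm>
#include <cmath>

// Sketch only: scale D by (target / observed)^tune_power, clipped so that D
// never changes by more than a factor of max_factor per minibatch.
double TuneD(double D, double target_objf_change, double observed_objf_change,
             double tune_power = 0.5, double max_factor = 2.0) {
  double factor;
  if (observed_objf_change <= 0.0) {
    factor = max_factor;  // perturbation did no damage at all: grow D as fast as allowed
  } else {
    factor = std::pow(target_objf_change / observed_objf_change, tune_power);
    factor = std::max(1.0 / max_factor, std::min(max_factor, factor));
  }
  return D * factor;  // value of D to use for the next minibatch
}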
Next, it's not absolutely clear how we should set target-objf-change-- the
value which determines how much objective-function degradation we want the
perturbation to produce on average (per sample). To put this in perspective,
for speech tasks with small amounts of data (say, <30 hours) and a couple thousand
classes
we typically see objective values like: training-set -0.6 and validation-set -1.1.
These are average log-probabilities per frame of the correct class.
The two numbers are quite different because there is substantial overtraining. Note: for Karel's
nnet1 setup, the difference is typically smaller, more like -0.8 vs. -1.0, as
that setup monitors the validation-set objective and decreases the learning rate
when it starts to degrade. Now, for much larger training sets, we might
see smaller differences in training-set versus validation-set objective function:
for instance: say, -1.40 versus -1.45. (For larger training sets the objectives tend
to be more negative simply because we have more leaves). We measure these values each
iteration: see the files compute_prob_train.*.log and compute_prob_valid.*.log produced
by the example scripts. The reason why I discuss these values
is that if the training-set and validation-set objective functions are very close, then
it means that there is not much overtraining going on and we don't want to apply this
method too strongly; on the other hand, if they are very different, it means we are
overtraining badly and we may want to apply this method more.
So we plan to set target-objf-change to the following value, at the script level:
target-objf-change = target-multiplier * (training-objf - validation-objf)
(e.g. target-multiplier = 1.0).
Note that if target-objf-change is less than a specified min-target-objf-change
(e.g. 0.1) then we won't apply the perturbed training at all, which will save
time. The method is intended to help generalization, and if we're generalizing
well then we don't need to apply it.
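To make this concrete with the illustrative numbers above: with target-multiplier = 1.0,
the small setup (training -0.6, validation -1.1) would get target-objf-change = 0.5,
while the large setup (-1.40 versus -1.45) would get 0.05; the latter is below a
min-target-objf-change of 0.1, so perturbed training would simply be skipped there.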
The training and validation objective functions are computed over
different (randomly chosen) sets, each with about 3000 samples, and it can
sometimes happen that the validation objective function is better than the
training-set objective function. Also, the validation set is sampled from a
held-out subset of 300 utterances by default; this is done out of a concern
that the correlations within an utterance can be very high, so if we use the
same utterances for training and validation, then the validation set is not
really held-out. But the smallish number (300) of validation utterances
increases the randomness in the training and validation objectives.
*/
struct NnetPerturbedTrainerConfig {
int32 minibatch_size;
int32 minibatches_per_phase;
// target_objf_change will be set from the command line to a value >0.0.
BaseFloat target_objf_change;
BaseFloat initial_d;
// tune_d_power is not configurable from the command line.
BaseFloat tune_d_power;
// max_d_factor is not configurable from the command line.
BaseFloat max_d_factor;
NnetPerturbedTrainerConfig(): minibatch_size(500),
minibatches_per_phase(50),
target_objf_change(0.1),
initial_d(0.05),
tune_d_power(0.5),
max_d_factor(2.0){ }
void Register (OptionsItf *po) {
po->Register("minibatch-size", &minibatch_size,
"Number of samples per minibatch of training data.");
po->Register("minibatches-per-phase", &minibatches_per_phase,
"Number of minibatches to wait before printing training-set "
"objective.");
po->Register("target-objf-change", &target_objf_change, "Target objective "
"function change from feature perturbation, used to set "
"feature distance parameter D");
po->Register("initial-d", &initial_d, "Initial value of parameter D "
"It will ultimately be set according to --target-objf-change");
}
};
/// Class NnetPerturbedTrainer is as NnetSimpleTrainer but implements feature
/// perturbation; see the comment at the top of this file (\ref
/// train-nnet-perturbed.h) for more details.
class NnetPerturbedTrainer {
public:
NnetPerturbedTrainer(const NnetPerturbedTrainerConfig &config,
const SpMatrix<BaseFloat> &within_class_covar,
Nnet *nnet);
/// TrainOnExample will take the example and add it to a buffer;
/// if we've reached the minibatch size it will do the training.
void TrainOnExample(const NnetExample &value);
~NnetPerturbedTrainer();
private:
KALDI_DISALLOW_COPY_AND_ASSIGN(NnetPerturbedTrainer);
void TrainOneMinibatch();
// This function initializes within_class_covar_ and num_layers_before_input_.
// The input within_class_covar is the within-class covariance on the original
// raw features, computed from LDA stats, but if this neural network has
// a data-preconditioning layer of type FixedAffineComponent then we will
// project the transform with that and treat the output of that transform
// as the input x (this is more efficient).
void InitWithinClassCovar(const SpMatrix<BaseFloat> &within_class_covar);
void UpdateD(BaseFloat orig_objf_per_example,
BaseFloat perturbed_objf_per_example);
// The following function is called by TrainOneMinibatch() when we enter a new
// phase. A phase is just a certain number of minibatches, and now matters only
// for diagnostics (originally it meant something more).
void BeginNewPhase(bool first_time);
// Things we were given in the initializer:
NnetPerturbedTrainerConfig config_;
Nnet *nnet_; // the nnet we're training.
// static information:
// num_layers_before_input_ is the number of initial layers before what we
// consider to be the input for this method: normally 2, for the splicing
// layer and the (FixedAffineComponent) data-preconditioning layer.
int32 num_layers_before_input_;
// The within_class_covar_ variable below is the within-class covariance; if
// we have a (FixedAffineComponent) data-preconditioning layer, we'd project
// the within-class-covariance with that and store it as within_class_covar_.
CuMatrix<BaseFloat> within_class_covar_;
// State information:
int32 num_phases_;
int32 minibatches_seen_this_phase_;
std::vector<NnetExample> buffer_;
double logprob_this_phase_; // Needed for accumulating train log-prob on each phase.
double logprob_perturbed_this_phase_; // same for perturbed log-prob
double weight_this_phase_; // count corresponding to the above.
double logprob_total_;
double logprob_perturbed_total_;
double weight_total_;
BaseFloat D_; // The distance factor D.
};
/// This function computes the objective function and either updates the model
/// or adds to parameter gradients. It returns the cross-entropy objective
/// function summed over all samples (normalize this by dividing by
/// TotalNnetTrainingWeight(examples)). It is mostly a wrapper for
/// a class NnetPerturbedUpdater that's defined in train-nnet-perturbed.cc, but we
/// don't want to expose that complexity at this level.
/// All these examples will be treated as one minibatch.
///
/// D is the distance factor that determines how much to perturb examples;
/// this is optimized in outer-level code (see class NnetPerturbedTrainer).
/// num_layers_before_input determines how many layers to skip before we find
/// the activation that we regard as the input x to the network, for purposes
/// of this method (e.g. we might skip over the splicing layer and a layer
/// that preconditions the input).
/// within_class_covar (actually a symmetric matrix, but represented as CuMatrix),
/// is the within-class covariance of the features, measured at that level,
/// which ultimately will be derived from LDA stats on the data.
void DoBackpropPerturbed(const Nnet &nnet,
int32 num_layers_before_input,
const CuMatrix<BaseFloat> &within_class_covar,
BaseFloat D,
const std::vector<NnetExample> &examples,
Nnet *nnet_to_update,
double *tot_objf_orig,
double *tot_objf_perturbed);
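// As a caller-side illustration (not an actual Kaldi call site): run one
// minibatch of perturbed training and return the per-frame degradation that
// feeds the D-tuning heuristic. The wrapper name is made up, and the include
// of nnet-update.h for TotalNnetTrainingWeight() is an assumption.
//
// #include "nnet2/nnet-update.h"  // for TotalNnetTrainingWeight()
//
// double OnePerturbedMinibatch(const std::vector<NnetExample> &examples,
//                              const CuMatrix<BaseFloat> &within_class_covar,
//                              int32 num_layers_before_input, BaseFloat D,
//                              Nnet *nnet) {
//   double tot_objf_orig, tot_objf_perturbed;
//   DoBackpropPerturbed(*nnet, num_layers_before_input, within_class_covar, D,
//                       examples, nnet, &tot_objf_orig, &tot_objf_perturbed);
//   double tot_weight = TotalNnetTrainingWeight(examples);
//   return (tot_objf_orig - tot_objf_perturbed) / tot_weight;  // per-frame degradation
// }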
/// This function is similar to "DoBackpropParallel" as declared in
/// nnet-update-parallel.h, but supports "perturbed" training. It's intended
/// for multi-threaded CPU-based training. The number of threads will be
/// set to g_num_threads.
/// within_class_covar is the within-class covariance after any splicing
/// but before preconditioning, as needed for the LDA computation.
/// All pointer arguments must be non-NULL.
void DoBackpropPerturbedParallel(const NnetPerturbedTrainerConfig &config,
const SpMatrix<BaseFloat> &within_class_covar,
SequentialNnetExampleReader *example_reader,
double *tot_objf_orig,
double *tot_objf_perturbed,
double *tot_weight,
Nnet *nnet);
} // namespace nnet2
} // namespace kaldi
#endif

Просмотреть файл

@ -48,7 +48,7 @@ struct NnetSimpleTrainerConfig {
// Class NnetSimpleTrainer doesn't do much apart from batching up the
// input into minibatches and giving it to the neural net code
// to call Update(), which will typically do stochastic gradient
// descent. It also reports training-set
// descent. It also reports training-set objective-function values.
// It takes in the training examples through the call
// "TrainOnExample()".
class NnetSimpleTrainer {
@ -66,8 +66,9 @@ class NnetSimpleTrainer {
void TrainOneMinibatch();
// The following function is called by TrainOneMinibatch()
// when we enter a new phase.
// The following function is called by TrainOneMinibatch() when we enter a new
// phase. A phase is just a certain number of minibatches, and now matters only
// for diagnostics (originally it meant something more).
void BeginNewPhase(bool first_time);
// Things we were given in the initializer:

Просмотреть файл

@ -25,7 +25,8 @@ BINFILES = nnet-randomize-frames nnet-am-info nnet-init \
nnet-train-discriminative-simple nnet-train-discriminative-parallel \
nnet-modify-learning-rates nnet-normalize-stddev nnet-perturb-egs \
nnet-perturb-egs-fmllr nnet-get-weighted-egs nnet-adjust-priors \
cuda-compiled nnet-replace-last-layers nnet-am-switch-preconditioning
cuda-compiled nnet-replace-last-layers nnet-am-switch-preconditioning \
nnet-train-simple-perturbed nnet-train-parallel-perturbed
OBJFILES =

Просмотреть файл

@ -36,12 +36,16 @@ int main(int argc, char *argv[]) {
bool binary = true;
FeatureTransformEstimateOptions opts;
std::string write_cholesky;
std::string write_within_covar;
ParseOptions po(usage);
po.Register("binary", &binary, "Write accumulators in binary mode.");
po.Register("binary", &binary, "Write outputs in binary mode.");
po.Register("write-cholesky", &write_cholesky, "If supplied, write to this "
"wxfilename the Cholesky factor of the within-class covariance."
"wxfilename the Cholesky factor of the within-class covariance. "
"Can be used for perturbing features. E.g. "
"--write-cholesky=exp/nnet5/cholesky.tpmat");
po.Register("write-within-covar", &write_within_covar, "If supplied, write "
"to this wxfilename the within-class covariance (as a symmetric "
"matrix). E.g. --write-within-covar=exp/nnet5/within_covar.mat");
opts.Register(&po);
po.Read(argc, argv);
@ -61,10 +65,18 @@ int main(int argc, char *argv[]) {
Matrix<BaseFloat> mat;
TpMatrix<BaseFloat> cholesky;
fte.Estimate(opts, &mat, write_cholesky != "" ? &cholesky : NULL);
fte.Estimate(opts, &mat,
(write_cholesky != "" || write_within_covar != "" ?
&cholesky : NULL));
WriteKaldiObject(mat, projection_wxfilename, binary);
if (write_cholesky != "")
if (write_cholesky != "") {
WriteKaldiObject(cholesky, write_cholesky, binary);
}
if (write_within_covar != "") {
SpMatrix<BaseFloat> within_var(cholesky.NumRows());
within_var.AddTp2(1.0, cholesky, kNoTrans, 0.0);
WriteKaldiObject(within_var, write_within_covar, binary);
}
return 0;
} catch(const std::exception &e) {
std::cerr << e.what();

Просмотреть файл

@ -0,0 +1,127 @@
// nnet2bin/nnet-train-parallel-perturbed.cc
// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "nnet2/train-nnet-perturbed.h"
#include "nnet2/am-nnet.h"
#include "thread/kaldi-thread.h"
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
using namespace kaldi::nnet2;
typedef kaldi::int32 int32;
typedef kaldi::int64 int64;
const char *usage =
"Train the neural network parameters with backprop and stochastic\n"
"gradient descent using minibatches. The training frames and labels\n"
"are read via a pipe from nnet-randomize-frames. This is like nnet-train-parallel,\n"
"using multiple threads in a Hogwild type of update, but also adding\n"
"perturbed training (see src/nnet2/train-nnet-perturbed.h for info)\n"
"\n"
"Usage: nnet-train-parallel-perturbed [options] <model-in> <training-examples-in> <model-out>\n"
"\n"
"e.g.:\n"
"nnet-randomize-frames [args] | nnet-train-parallel-pertured \\\n"
" --within-covar=within.spmat --num-threads=8 --target-objf-change=0.2 1.nnet ark:- 2.nnet\n";
bool binary_write = true;
bool zero_stats = true;
int32 srand_seed = 0;
std::string within_covar_rxfilename;
NnetPerturbedTrainerConfig train_config;
ParseOptions po(usage);
po.Register("binary", &binary_write, "Write output in binary mode");
po.Register("within-covar", &within_covar_rxfilename,
"rxfilename of within-class covariance-matrix, written as "
"SpMatrix. Must be specified.");
po.Register("zero-stats", &zero_stats, "If true, zero stats "
"stored with the neural net (only affects mixing up).");
po.Register("srand", &srand_seed,
"Seed for random number generator (e.g., for dropout)");
po.Register("num-threads", &g_num_threads, "Number of training threads to use "
"in the parallel update. [Note: if you use a parallel "
"implementation of BLAS, the actual number of threads may be larger.]");
train_config.Register(&po);
po.Read(argc, argv);
srand(srand_seed);
if (po.NumArgs() != 3) {
po.PrintUsage();
exit(1);
}
std::string nnet_rxfilename = po.GetArg(1),
examples_rspecifier = po.GetArg(2),
nnet_wxfilename = po.GetArg(3);
if (within_covar_rxfilename == "") {
KALDI_ERR << "The option --within-covar is required.";
}
TransitionModel trans_model;
AmNnet am_nnet;
{
bool binary_read;
Input ki(nnet_rxfilename, &binary_read);
trans_model.Read(ki.Stream(), binary_read);
am_nnet.Read(ki.Stream(), binary_read);
}
KALDI_ASSERT(train_config.minibatch_size > 0);
SpMatrix<BaseFloat> within_covar;
ReadKaldiObject(within_covar_rxfilename, &within_covar);
if (zero_stats) am_nnet.GetNnet().ZeroStats();
SequentialNnetExampleReader example_reader(examples_rspecifier);
double tot_objf_orig, tot_objf_perturbed, tot_weight;
// logging info will be printed from within the next call.
DoBackpropPerturbedParallel(train_config,
within_covar,
&example_reader,
&tot_objf_orig,
&tot_objf_perturbed,
&tot_weight,
&(am_nnet.GetNnet()));
{
Output ko(nnet_wxfilename, binary_write);
trans_model.Write(ko.Stream(), binary_write);
am_nnet.Write(ko.Stream(), binary_write);
}
KALDI_LOG << "Finished training, processed " << tot_weight
<< " training examples (weighted). Wrote model to "
<< nnet_wxfilename;
return (tot_weight == 0 ? 1 : 0);
} catch(const std::exception &e) {
std::cerr << e.what() << '\n';
return -1;
}
}

Просмотреть файл

@ -41,7 +41,7 @@ int main(int argc, char *argv[]) {
"Usage: nnet-train-parallel [options] <model-in> <training-examples-in> <model-out>\n"
"\n"
"e.g.:\n"
"nnet-randomize-frames [args] | nnet-train-simple 1.nnet ark:- 2.nnet\n";
"nnet-randomize-frames [args] | nnet-train-parallel --num-threads=8 1.nnet ark:- 2.nnet\n";
bool binary_write = true;
bool zero_stats = true;

Просмотреть файл

@ -0,0 +1,137 @@
// nnet2bin/nnet-train-perturbed.cc
// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "nnet2/nnet-randomize.h"
#include "nnet2/train-nnet-perturbed.h"
#include "nnet2/am-nnet.h"
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
using namespace kaldi::nnet2;
typedef kaldi::int32 int32;
typedef kaldi::int64 int64;
const char *usage =
"Train the neural network parameters with backprop and stochastic\n"
"gradient descent using minibatches. The training frames and labels\n"
"are read via a pipe from nnet-randomize-frames. This version of the\n"
"training program does not update the learning rate, but uses\n"
"the learning rates stored in the neural nets.\n"
"\n"
"Usage: nnet-train-perturbed [options] <model-in> <training-examples-in> <model-out>\n"
"note: the option --within-covar=<file> is needed\n"
"\n"
"e.g.:\n"
"nnet-randomize-frames [args] | nnet-train-perturbed --within-covar=within.spmat 1.nnet ark:- 2.nnet\n";
bool binary_write = true;
bool zero_stats = true;
int32 srand_seed = 0;
std::string use_gpu = "yes";
std::string within_covar_rxfilename;
NnetPerturbedTrainerConfig train_config;
ParseOptions po(usage);
po.Register("binary", &binary_write, "Write output in binary mode");
po.Register("within-covar", &within_covar_rxfilename,
"rxfilename of within-class covariance-matrix, written as "
"SpMatrix. Must be specified.");
po.Register("zero-stats", &zero_stats, "If true, zero occupation "
"counts stored with the neural net (only affects mixing up).");
po.Register("srand", &srand_seed, "Seed for random number generator "
"(relevant if you have layers of type AffineComponentPreconditioned "
"with l2-penalty != 0.0");
po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA");
train_config.Register(&po);
po.Read(argc, argv);
if (po.NumArgs() != 3) {
po.PrintUsage();
exit(1);
}
srand(srand_seed);
#if HAVE_CUDA==1
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
if (within_covar_rxfilename == "") {
KALDI_ERR << "The option --within-covar is required.";
}
std::string nnet_rxfilename = po.GetArg(1),
examples_rspecifier = po.GetArg(2),
nnet_wxfilename = po.GetArg(3);
int64 num_examples = 0;
{
TransitionModel trans_model;
AmNnet am_nnet;
{
bool binary_read;
Input ki(nnet_rxfilename, &binary_read);
trans_model.Read(ki.Stream(), binary_read);
am_nnet.Read(ki.Stream(), binary_read);
}
SpMatrix<BaseFloat> within_covar;
ReadKaldiObject(within_covar_rxfilename, &within_covar);
if (zero_stats) am_nnet.GetNnet().ZeroStats();
{ // want to make sure this object deinitializes before
// we write the model, as it does something in the destructor.
NnetPerturbedTrainer trainer(train_config,
within_covar,
&(am_nnet.GetNnet()));
SequentialNnetExampleReader example_reader(examples_rspecifier);
for (; !example_reader.Done(); example_reader.Next(), num_examples++)
trainer.TrainOnExample(example_reader.Value()); // It all happens here!
}
{
Output ko(nnet_wxfilename, binary_write);
trans_model.Write(ko.Stream(), binary_write);
am_nnet.Write(ko.Stream(), binary_write);
}
}
#if HAVE_CUDA==1
CuDevice::Instantiate().PrintProfile();
#endif
KALDI_LOG << "Finished training, processed " << num_examples
<< " training examples. Wrote model to "
<< nnet_wxfilename;
return (num_examples == 0 ? 1 : 0);
} catch(const std::exception &e) {
std::cerr << e.what() << '\n';
return -1;
}
}

Просмотреть файл

@ -0,0 +1,138 @@
// nnet2bin/nnet-train-simple-perturbed.cc
// Copyright 2012-2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "nnet2/nnet-randomize.h"
#include "nnet2/train-nnet-perturbed.h"
#include "nnet2/am-nnet.h"
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
using namespace kaldi::nnet2;
typedef kaldi::int32 int32;
typedef kaldi::int64 int64;
const char *usage =
"Train the neural network parameters with backprop and stochastic\n"
"gradient descent using minibatches. The training frames and labels\n"
"are read via a pipe from nnet-randomize-frames. This is as nnet-train-simple\n"
"but implements perturbed training (see src/nnet2/train-nnet-perturbed.h for\n"
"details)\n"
"\n"
"Usage: nnet-train-simple-perturbed [options] <model-in> <training-examples-in> <model-out>\n"
"note: the option --within-covar=<file> is needed\n"
"\n"
"e.g.:\n"
"nnet-randomize-frames [args] | nnet-train-simple-perturbed \\\n"
" --within-covar=within.spmat --target-objf-change=0.2 1.nnet ark:- 2.nnet\n";
bool binary_write = true;
bool zero_stats = true;
int32 srand_seed = 0;
std::string use_gpu = "yes";
std::string within_covar_rxfilename;
NnetPerturbedTrainerConfig train_config;
ParseOptions po(usage);
po.Register("binary", &binary_write, "Write output in binary mode");
po.Register("within-covar", &within_covar_rxfilename,
"rxfilename of within-class covariance-matrix, written as "
"SpMatrix. Must be specified.");
po.Register("zero-stats", &zero_stats, "If true, zero occupation "
"counts stored with the neural net (only affects mixing up).");
po.Register("srand", &srand_seed, "Seed for random number generator "
"(relevant if you have layers of type AffineComponentPreconditioned "
"with l2-penalty != 0.0");
po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA");
train_config.Register(&po);
po.Read(argc, argv);
if (po.NumArgs() != 3) {
po.PrintUsage();
exit(1);
}
srand(srand_seed);
#if HAVE_CUDA==1
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
if (within_covar_rxfilename == "") {
KALDI_ERR << "The option --within-covar is required.";
}
std::string nnet_rxfilename = po.GetArg(1),
examples_rspecifier = po.GetArg(2),
nnet_wxfilename = po.GetArg(3);
int64 num_examples = 0;
{
TransitionModel trans_model;
AmNnet am_nnet;
{
bool binary_read;
Input ki(nnet_rxfilename, &binary_read);
trans_model.Read(ki.Stream(), binary_read);
am_nnet.Read(ki.Stream(), binary_read);
}
SpMatrix<BaseFloat> within_covar;
ReadKaldiObject(within_covar_rxfilename, &within_covar);
if (zero_stats) am_nnet.GetNnet().ZeroStats();
{ // want to make sure this object deinitializes before
// we write the model, as it does something in the destructor.
NnetPerturbedTrainer trainer(train_config,
within_covar,
&(am_nnet.GetNnet()));
SequentialNnetExampleReader example_reader(examples_rspecifier);
for (; !example_reader.Done(); example_reader.Next(), num_examples++)
trainer.TrainOnExample(example_reader.Value()); // It all happens here!
}
{
Output ko(nnet_wxfilename, binary_write);
trans_model.Write(ko.Stream(), binary_write);
am_nnet.Write(ko.Stream(), binary_write);
}
}
#if HAVE_CUDA==1
CuDevice::Instantiate().PrintProfile();
#endif
KALDI_LOG << "Finished training, processed " << num_examples
<< " training examples. Wrote model to "
<< nnet_wxfilename;
return (num_examples == 0 ? 1 : 0);
} catch(const std::exception &e) {
std::cerr << e.what() << '\n';
return -1;
}
}