diff --git a/egs/rm/s5/run.sh b/egs/rm/s5/run.sh index 347d01d22..4ca29ea9b 100755 --- a/egs/rm/s5/run.sh +++ b/egs/rm/s5/run.sh @@ -1,11 +1,5 @@ #!/bin/bash -# CAUTION: I changed e.g. 1.trans to trans.1 in the scripts. If you ran it -# part-way through prior to this, to convert to the new naming -# convention, run: -# for x in `find . -name '*.trans'`; do mv $x `echo $x | perl -ane 's/(\d+)\.trans/trans.$1/;print;'`; done -# but be careful as this will not follow soft links. - . cmd.sh # call the next line with the directory where the RM data is diff --git a/egs/swbd/s5/conf/decode.config b/egs/swbd/s5/conf/decode.config index d91f86183..1940883b2 100644 --- a/egs/swbd/s5/conf/decode.config +++ b/egs/swbd/s5/conf/decode.config @@ -1,5 +1,2 @@ beam=11.0 # beam for decoding. Was 13.0 in the scripts. first_beam=8.0 # beam for 1st-pass decoding in SAT. - - - diff --git a/egs/swbd/s5/local/run_nnet_cpu.sh b/egs/swbd/s5/local/run_nnet_cpu.sh index b8b109f6c..367575eb3 100755 --- a/egs/swbd/s5/local/run_nnet_cpu.sh +++ b/egs/swbd/s5/local/run_nnet_cpu.sh @@ -21,8 +21,8 @@ ) # Here are the results (copied from RESULTS file) -#exp/nnet6a/decode_train_dev/wer_10:%WER 24.87 [ 12053 / 48460, 1590 ins, 3017 del, 7446 sub ] -#exp/nnet6a/decode_eval2000/score_10/eval2000.ctm.filt.sys: | Sum/Avg | 4459 42989 | 77.1 16.0 6.9 2.7 25.6 62.6 | +#exp/nnet6a/decode_train_dev/wer_11:%WER 24.30 [ 11774 / 48460, 1619 ins, 2877 del, 7278 sub ] +#exp/nnet6a/decode_eval2000/score_10/eval2000.ctm.filt.sys: | Sum/Avg | 4459 42989 | 77.8 16.0 6.3 3.0 25.3 62.6 | # Here are some older results when the system had 2k not 4k leaves and ran from a worse SAT diff --git a/egs/wsj/s5/RESULTS b/egs/wsj/s5/RESULTS index df84bbf98..15f03d268 100644 --- a/egs/wsj/s5/RESULTS +++ b/egs/wsj/s5/RESULTS @@ -191,6 +191,7 @@ exp/tri4a_dnn/decode_bd_tgpr_eval92/wer_10:%WER 4.00 [ 226 / 5643, 34 ins, 12 de # and for eval92 is 3.79, the same system. (On this setup, discriminative training helped a lot, # which seems to be the reason we can't beat the SGMM+MMI numbers here.) -exp/nnet5c1/decode_bd_tgpr_dev93/wer_10:%WER 7.48 [ 616 / 8234, 73 ins, 98 del, 445 sub ] -exp/nnet5c1/decode_bd_tgpr_eval92/wer_11:%WER 4.41 [ 249 / 5643, 29 ins, 19 del, 201 sub ] -# Note: my 4.41% result is worse than Karel's 4.00%. + +exp/nnet5c1/decode_bd_tgpr_dev93/wer_14:%WER 7.32 [ 603 / 8234, 61 ins, 101 del, 441 sub ] +exp/nnet5c1/decode_bd_tgpr_eval92/wer_14:%WER 4.39 [ 248 / 5643, 32 ins, 17 del, 199 sub ] +# Note: my 4.39% result is worse than Karel's 4.00%. diff --git a/egs/wsj/s5/local/wsj_data_prep.sh b/egs/wsj/s5/local/wsj_data_prep.sh index 65143694f..685b57aa7 100755 --- a/egs/wsj/s5/local/wsj_data_prep.sh +++ b/egs/wsj/s5/local/wsj_data_prep.sh @@ -47,7 +47,7 @@ cat links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \ grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si84.flist nl=`cat train_si84.flist | wc -l` -[ "$nl" -eq 7138 ] || echo "Warning: expected 37416 lines in train_si84.flist, got $nl" +[ "$nl" -eq 7138 ] || echo "Warning: expected 7138 lines in train_si84.flist, got $nl" # This version for SI-284 cat links/13-34.1/wsj1/doc/indices/si_tr_s.ndx \ diff --git a/egs/wsj/s5/run.sh b/egs/wsj/s5/run.sh index bfc0ea132..44010b266 100755 --- a/egs/wsj/s5/run.sh +++ b/egs/wsj/s5/run.sh @@ -281,7 +281,6 @@ steps/train_quick.sh --cmd "$train_cmd" \ exp/tri4b/graph_bd_tgpr data/test_eval92 exp/tri4b/decode_bd_tgpr_eval92 || exit 1; ) & - # Train and test MMI, and boosted MMI, on tri4b (LDA+MLLT+SAT on # all the data). Use 30 jobs. 
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ diff --git a/egs/wsj/s5/steps/append_feats.sh b/egs/wsj/s5/steps/append_feats.sh new file mode 100755 index 000000000..370638cd4 --- /dev/null +++ b/egs/wsj/s5/steps/append_feats.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 +# This script appends the features in two data directories. + +# To be run from .. (one directory up from here) +# see ../run.sh for example +# This config creates MFCC features with half the window size and window shift, +# and splices and sub-samples them. We'll use another script append_feats.sh +# to combine (append) the data directories. + +# Begin configuration section. +cmd=run.pl +nj=4 +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 5 ]; then + echo "usage: append_feats.sh [options] "; + echo "options: " + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data_src1=$1 +data_src2=$2 +data=$3 +logdir=$4 +mfccdir=$5 + +utils/split_data.sh $data_src1 $nj || exit 1; +utils/split_data.sh $data_src2 $nj || exit 1; + +mkdir -p $mfccdir $logdir + +rm -rf $data +mkdir -p `basename $data` # Make sure directory one level up exists. +cp -r $data_src1 $data # so we get the other files, such as utt2spk. +rm $data/cmvn.scp +rm -r $data/split* 2>/dev/null + +# use "name" as part of name of the archive. +name=`basename $data` + +$cmd JOB=1:$nj $logdir/append.JOB.log \ + append-feats --truncate-frames=true \ + scp:$data_src1/split$nj/JOB/feats.scp scp:$data_src2/split$nj/JOB/feats.scp \ + ark,scp:$mfccdir/appended_$name.JOB.ark,$mfccdir/appended_$name.JOB.scp || exit 1; + +# concatenate the .scp files together. +for ((n=1; n<=nj; n++)); do + cat $mfccdir/appended_$name.$n.scp >> $data/feats.scp || exit 1; +done > $data/feats.scp + + +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` +if [ $nf -ne $nu ]; then + echo "It seems not all of the feature files were successfully ($nf != $nu);" + echo "consider using utils/fix_data_dir.sh $data" +fi + +echo "Succeeded creating MFCC features for $name" diff --git a/egs/wsj/s5/steps/make_denlats_nnet_cpu.sh b/egs/wsj/s5/steps/make_denlats_nnet_cpu.sh index ffb0a25d5..e87d6cded 100755 --- a/egs/wsj/s5/steps/make_denlats_nnet_cpu.sh +++ b/egs/wsj/s5/steps/make_denlats_nnet_cpu.sh @@ -18,6 +18,10 @@ max_mem=20000000 # This will stop the processes getting too large. # This is in bytes, but not "real" bytes-- you have to multiply # by something like 5 or 10 to get real bytes (not sure why so large) # End configuration section. +num_threads=1 # Number of threads used in nnet-logprob computation. If you set + # this to a different value, make sure to also set the appropriate + # queue options. If you set this too high it won't use all the + # threads as most of the time will be taken in the decoder. 
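Note: the new --num-threads option above controls the nnet-logprob-parallel stage of the two-process decoding pipeline introduced in the hunks below, where per-frame log-probabilities are piped into latgen-faster-mapped. A minimal sketch of that pipeline, assuming placeholder directories and beam values (the actual invocation, with $cmd and JOB substitution plus --max-mem/--max-active, is in the hunk that follows):

  num_threads=4   # if you raise this, also request matching queue slots
  nnet-logprob-parallel --num-threads=$num_threads exp/nnet/final.mdl "$feats" ark:- | \
    latgen-faster-mapped --beam=13.0 --lattice-beam=7.0 --acoustic-scale=0.1 \
      --word-symbol-table=data/lang/words.txt exp/nnet/final.mdl \
      exp/nnet/dengraph/HCLG.fst ark:- "ark:|gzip -c > exp/nnet_denlats/lat.1.gz"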
echo "$0 $@" # Print the command line for logging @@ -104,9 +108,10 @@ fi if [ $sub_split -eq 1 ]; then $cmd JOB=1:$nj $dir/log/decode_den.JOB.log \ - nnet-latgen-faster --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \ + nnet-logprob-parallel --num-threads=$num_threads $srcdir/final.mdl "$feats" ark:- \| \ + latgen-faster-mapped --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \ --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \ - $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; + $dir/dengraph/HCLG.fst ark:- "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; else for n in `seq $nj`; do if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $alidir/final.mdl ]; then @@ -120,9 +125,10 @@ else mkdir -p $dir/part feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g` $cmd JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \ - nnet-latgen-faster --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \ + nnet-logprob-parallel --num-threads=$num_threads $srcdir/final.mdl "$feats_subset" ark:- \| \ + latgen-faster-mapped --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \ --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \ - $dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1; + $dir/dengraph/HCLG.fst ark:- "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1; echo Merging archives for data subset $n rm $dir/.error 2>/dev/null; for k in `seq $sub_split`; do diff --git a/egs/wsj/s5/steps/train_nnet_cpu.sh b/egs/wsj/s5/steps/train_nnet_cpu.sh index 28e47695e..9e4f086bc 100755 --- a/egs/wsj/s5/steps/train_nnet_cpu.sh +++ b/egs/wsj/s5/steps/train_nnet_cpu.sh @@ -14,36 +14,42 @@ num_iters_final=10 # Number of final iterations to give to the # optimization over the validation set. initial_learning_rate=0.02 # for RM; or 0.01 is suitable for Swbd. final_learning_rate=0.004 # for RM; or 0.001 is suitable for Swbd. -num_valid_utts=300 # held-out utterances, used only for diagnostics. -num_valid_frames_shrink=2000 # a subset of the frames in "valid_utts", used only - # for estimating shrinkage parameters and for - # objective-function reporting. +num_utts_subset=300 # number of utterances in validation and training + # subsets used for shrinkage and diagnostics +num_valid_frames_shrink=0 # number of validation frames in the subset + # used for shrinking +num_train_frames_shrink=2000 # number of training frames in the subset used + # for shrinking (by default we use all training + # frames for this.) shrink_interval=3 # shrink every $shrink_interval iters, # except at the start of training when we do it every iter. -num_valid_frames_combine=10000 # combination weights at the very end. +num_valid_frames_combine=0 # #valid frames for combination weights at the very end. +num_train_frames_combine=10000 # # train frames for the above. +num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs minibatch_size=128 # by default use a smallish minibatch size for neural net training; this controls instability # which would otherwise be a problem with multi-threaded update. Note: # it also interacts with the "preconditioned" update, so it's not completely cost free. -samples_per_iteration=400000 # each iteration of training, see this many samples - # per job. +samples_per_iter=400000 # each iteration of training, see this many samples + # per job. 
This is just a guideline; it will pick a number + # that divides the number of samples in the entire data. shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples - # on each iter. You could set it to 0 or to a large value for complete - # randomization, but this would both consume memory and cause spikes in - # disk I/O. Smaller is easier on disk and memory but less random. It's - # not a huge deal though, as samples are anyway randomized right at the start. + # on each iter. You could set it to 0 or to a large value for complete + # randomization, but this would both consume memory and cause spikes in + # disk I/O. Smaller is easier on disk and memory but less random. It's + # not a huge deal though, as samples are anyway randomized right at the start. num_jobs_nnet=8 # Number of neural net jobs to run in parallel. add_layers_period=2 # by default, add new layers every 2 iterations. num_hidden_layers=2 initial_num_hidden_layers=1 # we'll add the rest one by one. num_parameters=2000000 # 2 million parameters by default. -stage=-7 +stage=-9 realign_iters="" beam=10 # for realignment. retry_beam=40 scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" parallel_opts="-pe smp 16" # by default we use 16 threads; this lets the queue know. -shuffle_opts="-tc 5" # max 5 jobs running at one time (a lot of I/O.) +io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. nnet_config_opts= splice_width=4 # meaning +- 4 frames on each side for second LDA lda_dim=250 @@ -54,7 +60,11 @@ shrink=true mix_up=0 # Number of components to mix up to (should be > #tree leaves, if # specified.) num_threads=16 -mkl_num_threads=1 + +valid_is_heldout=false # For some reason, holding out the validation set from the training set + # seems to hurt, so by default we don't do it (i.e. it's included in training) +random_copy=false +cleanup=true # End configuration section. echo "$0 $@" # Print the command line for logging @@ -72,7 +82,7 @@ if [ $# != 4 ]; then echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." echo " --num-epochs <#epochs|15> # Number of epochs of main training" echo " # while reducing learning rate (determines #iterations, together" - echo " # with --samples-per-iteration and --num-jobs-nnet)" + echo " # with --samples-per-iter and --num-jobs-nnet)" echo " --num-epochs-extra <#epochs-extra|5> # Number of extra epochs of training" echo " # after learning rate fully reduced" echo " --initial-learning-rate # Learning rate at start of training, e.g. 0.02 for small" @@ -95,21 +105,27 @@ if [ $# != 4 ]; then echo " # this, you may want to decrease the batch size." echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" echo " # use multiple threads." - echo " --shuffle-opts # Options given to e.g. queue.pl for the job that shuffles the " - echo " # data. (prevents stressing the disk). " + echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" echo " # should not get too large, e.g. >2k)." - echo " --samples-per-iteration <#samples|400000> # Number of samples of data to process per iteration, per" + echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" echo " # process." 
echo " --splice-width # Number of frames on each side to append for feature input" echo " # (note: we splice processed, typically 40-dimensional frames" echo " --lda-dim # Dimension to reduce spliced features to with LDA" echo " --num-iters-final <#iters|10> # Number of final iterations to give to nnet-combine-fast to " echo " # interpolate parameters (the weights are learned with a validation set)" - echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " --num-utts-subset <#utts|300> # Number of utterances in subsets used for validation and diagnostics" + echo " # (the validation subset is held out from training)" + echo " --num-valid-frames-shrink <#frames|2000> # Number of frames from the validation set used for shrinking" + echo " --num-train-frames-shrink <#frames|0> # Number of frames from the training set used for shrinking" + echo " # (by default it's included in training, which for some reason helps)." + echo " --num-frames-diagnostic <#frames|4000> # Number of frames used in computing (train,valid) diagnostics" + echo " --num-valid-frames-combine <#frames|10000> # Number of frames used in getting combination weights at the" + echo " # very end." + echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - exit 1; fi @@ -144,8 +160,11 @@ cp $alidir/tree $dir # Get list of validation utterances. -awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_valid_utts \ +awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ > $dir/valid_uttlist || exit 1; +awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; + ## Set up features. 
Note: these are different from the normal features ## because we have one rspecifier that has the features for the entire @@ -154,33 +173,49 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | add-deltas ark:- ark:- |" - split_feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |" + delta) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | add-deltas ark:- ark:- |" valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | add-deltas ark:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | add-deltas ark:- ark:- |" ;; - lda) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" - split_feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + lda) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir ;; *) echo "$0: invalid feature type $feat_type" && exit 1; esac if [ -f $alidir/trans.1 ]; then echo "$0: using transforms from $alidir" - feats="$feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $alidir/trans.*|' ark:- ark:- |" - split_feats="$split_feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$alidir/trans.JOB ark:- ark:- |" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$alidir/trans.JOB ark:- ark:- |" valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $alidir/trans.*|' ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $alidir/trans.*|' ark:- ark:- |" fi +if [ $stage -le -9 ]; then + echo "$0: working out number of frames of training data" + num_frames=`feat-to-len scp:$data/feats.scp ark,t:- | awk '{x += $2;} END{print x;}'` || exit 1; + echo $num_frames > $dir/num_frames +else + 
num_frames=`cat $dir/num_frames` || exit 1; +fi + +# Working out number of iterations per epoch. +iters_per_epoch=`perl -e "print int($num_frames/($samples_per_iter * $num_jobs_nnet) + 0.5);"` || exit 1; +[ $iters_per_epoch -eq 0 ] && iters_per_epoch=1 +samples_per_iter_real=$[$num_frames/($num_jobs_nnet*$iters_per_epoch)] +echo "Every epoch, splitting the data up into $iters_per_epoch iterations," +echo "giving samples-per-iteration of $samples_per_iter_real (you requested $samples_per_iter)." + + ## Do LDA on top of whatever features we already have; store the matrix which ## we'll put into the neural network as a constant. -if [ $stage -le -7 ]; then - echo "Accumulating LDA statistics." +if [ $stage -le -8 ]; then + echo "$0: Accumulating LDA statistics." $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \ ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \ - acc-lda --rand-prune=$randprune $alidir/final.mdl "$split_feats splice-feats --left-context=$splice_width --right-context=$splice_width ark:- ark:- |" ark,s,cs:- \ + acc-lda --rand-prune=$randprune $alidir/final.mdl "$feats splice-feats --left-context=$splice_width --right-context=$splice_width ark:- ark:- |" ark,s,cs:- \ $dir/lda.JOB.acc || exit 1; est-lda --dim=$lda_dim $dir/lda.mat $dir/lda.*.acc \ 2>$dir/log/lda_est.log || exit 1; @@ -195,10 +230,10 @@ if [ $initial_num_hidden_layers -gt $num_hidden_layers ]; then fi -if [ $stage -le -6 ]; then +if [ $stage -le -7 ]; then echo "$0: initializing neural net"; # to hidden.config it will write the part of the config corresponding to a - # single hidden layer; we need this to add new layers. + # single hidden layer; we need this to add new layers. if [ ! -z "$alpha" ]; then utils/nnet-cpu/make_nnet_config_preconditioned.pl --alpha $alpha $nnet_config_opts \ --learning-rate $initial_learning_rate \ @@ -219,14 +254,14 @@ if [ $stage -le -6 ]; then $dir/0.mdl || exit 1; fi -if [ $stage -le -5 ]; then +if [ $stage -le -6 ]; then echo "Training transition probabilities and setting priors" $cmd $dir/log/train_trans.log \ nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \ || exit 1; fi -if [ $stage -le -4 ]; then +if [ $stage -le -5 ]; then echo "Compiling graphs of transcripts" $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ compile-train-graphs $dir/tree $dir/0.mdl $lang/L.fst \ @@ -239,118 +274,113 @@ cp $alidir/ali.*.gz $dir nnet_context_opts="--left-context=`nnet-am-info $dir/0.mdl 2>/dev/null | grep -w left-context | awk '{print $2}'` --right-context=`nnet-am-info $dir/0.mdl 2>/dev/null | grep -w right-context | awk '{print $2}'`" || exit 1; -if [ $stage -le -3 ]; then - echo "Getting validation examples." - $cmd $dir/log/create_valid_subset_shrink.log \ +if [ $stage -le -4 ]; then + echo "Getting validation and training subset examples." + rm $dir/.error 2>/dev/null + $cmd $dir/log/create_valid_subset.log \ nnet-get-egs $nnet_context_opts "$valid_feats" \ "ark,cs:gunzip -c $dir/ali.*.gz | ali-to-pdf $dir/0.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \ - "ark:$dir/valid_all.egs" || exit 1; - echo "Getting subsets of validation examples for shrinking and combination." 
+ "ark:$dir/valid_all.egs" || touch $dir/.error & + $cmd $dir/log/create_train_subset.log \ + nnet-get-egs $nnet_context_opts "$train_subset_feats" \ + "ark,cs:gunzip -c $dir/ali.*.gz | ali-to-pdf $dir/0.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \ + "ark:$dir/train_subset_all.egs" || touch $dir/.error & + wait; + [ -f $dir/.error ] && exit 1; + echo "Getting subsets of validation examples for shrinking, diagnostics and combination." $cmd $dir/log/create_valid_subset_shrink.log \ - nnet-subset-egs --n=$num_valid_frames_shrink ark:$dir/valid_all.egs ark:$dir/valid_shrink.egs & + nnet-subset-egs --n=$num_valid_frames_shrink ark:$dir/valid_all.egs \ + ark:$dir/valid_shrink.egs || touch $dir/.error & $cmd $dir/log/create_valid_subset_combine.log \ - nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs ark:$dir/valid_combine.egs & + nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \ + ark:$dir/valid_combine.egs || touch $dir/.error & + $cmd $dir/log/create_valid_subset_diagnostic.log \ + nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/valid_all.egs \ + ark:$dir/valid_diagnostic.egs || touch $dir/.error & + + $cmd $dir/log/create_train_subset_shrink.log \ + nnet-subset-egs --n=$num_train_frames_shrink ark:$dir/train_subset_all.egs \ + ark:$dir/train_shrink.egs || touch $dir/.error & + $cmd $dir/log/create_train_subset_combine.log \ + nnet-subset-egs --n=$num_train_frames_combine ark:$dir/train_subset_all.egs \ + ark:$dir/train_combine.egs || touch $dir/.error & + $cmd $dir/log/create_train_subset_diagnostic.log \ + nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/train_subset_all.egs \ + ark:$dir/train_diagnostic.egs || touch $dir/.error & wait - [ ! -s $dir/valid_shrink.egs ] && echo "No validation examples for shrinking" && exit 1; - [ ! -s $dir/valid_combine.egs ] && echo "No validation examples for combination" && exit 1; - rm $dir/valid_all.egs + cat $dir/valid_shrink.egs $dir/train_shrink.egs > $dir/shrink.egs + cat $dir/valid_combine.egs $dir/train_combine.egs > $dir/combine.egs + + for f in $dir/{shrink,combine,train_diagnostic,valid_diagnostic}.egs; do + [ ! -s $f ] && echo "No examples in file $f" && exit 1; + done + rm $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_{shrink,combine}.egs fi -if [ $stage -le -2 ]; then +if [ $stage -le -3 ]; then mkdir -p $dir/egs mkdir -p $dir/temp echo "Creating training examples"; - # in $dir/egs, create $num_jobs_nnet separate files with training examples, - # with randomly shuffled order. We shuffle the order of examples in each - # file. Then on each iteration, for each training process, we'll take a - # random subset of blocks of examples within that process's file. - # We take them in blocks, because it avoids the overhead of fseek() while - # creating the examples. + # in $dir/egs, create $num_jobs_nnet separate files with training examples. + # The order is not randomized at this point. egs_list= for n in `seq 1 $num_jobs_nnet`; do - egs_list="$egs_list ark,scp:$dir/egs/egs_orig.$n.ark,$dir/egs/egs_orig.$n.scp" + egs_list="$egs_list ark:$dir/egs/egs_orig.$n.JOB.ark" done echo "Generating training examples on disk" # The examples will go round-robin to egs_list. 
- $cmd $dir/log/get_egs.log \ + $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \ nnet-get-egs $nnet_context_opts "$feats" \ - "ark,cs:gunzip -c $dir/ali.*.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \ + "ark,cs:gunzip -c $dir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \ nnet-copy-egs ark:- $egs_list || exit 1; fi +if [ $stage -le -2 ]; then + # combine all the "egs_orig.JOB.*.scp" (over the $nj splits of the data) and + # then split into multiple parts egs.JOB.*.scp for different parts of the + # data, 0 .. $iters_per_epoch-1. + + if [ $iters_per_epoch -eq 1 ]; then + echo "Since iters-per-epoch == 1, just concatenating the data." + for n in `seq 1 $num_jobs_nnet`; do + cat $dir/egs/egs_orig.$n.*.ark > $dir/egs/egs_tmp.$n.0.ark || exit 1; + rm $dir/egs/egs_orig.$n.*.ark || exit 1; + done + else # We'll have to split it up using nnet-copy-egs. + egs_list= + for n in `seq 0 $[$iters_per_epoch-1]`; do + egs_list="$egs_list ark:$dir/egs/egs_tmp.JOB.$n.ark" + done + $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/split_egs.JOB.log \ + nnet-copy-egs --random=$random_copy --srand=JOB \ + "ark:cat $dir/egs/egs_orig.JOB.*.ark|" $egs_list '&&' \ + rm $dir/egs/egs_orig.JOB.*.ark || exit 1; + fi +fi + if [ $stage -le -1 ]; then # Next, shuffle the order of the examples in each of those files. - # In order to not use too much memory (in case the size of the files is - # huge) we do this by randomizing the order of the .scp file and then - # just call nnet-copy-egs. If the file system is willing to store - # stuff in memory, it is free to do so. This is not super-optimal in - # terms of file system performance but it's simple and it won't fail when - # the data gets large. + # Each one should not be too large, so we can do this in memory. echo "Shuffling the order of training examples" echo "(in order to avoid stressing the disk, these won't all run at once)." - $cmd $shuffle_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.JOB.log \ - utils/shuffle_list.pl --srand JOB $dir/egs/egs_orig.JOB.scp \| \ - nnet-copy-egs scp:- ark,scp:$dir/egs/egs.JOB.ark,$dir/egs/egs.JOB.scp \ - '&&' rm $dir/egs/egs_orig.JOB.ark $dir/egs/egs_orig.JOB.scp - smallest_len=`wc -l $dir/egs/egs.*.scp | sort -n -k1 | awk '{print $1}' | head -1` - # If the $samples_per_iteration is more than each split of the data, - # append to each .scp file the .scp files from the next one or two - # splits (or more), so each one is larger... - rm $dir/egs/egs.*.scp.orig 2>/dev/null - if [ $samples_per_iteration -gt $smallest_len ]; then - extra_files=$[($samples_per_iteration-1) / $smallest_len] - echo Each part of the data has about $smallest_len lines which is less than the - echo samples per iteration $samples_per_iteration, so appending next $extra_files - echo files to each scp file - for n in `seq $num_jobs_nnet`; do mv $dir/egs/egs.$n.scp $dir/egs/egs.$n.scp.orig; done - for n in `seq $num_jobs_nnet`; do - for e in `seq 0 $extra_files`; do - m=$[(($n + $e - 1)%$num_jobs_nnet)+1] - cat $dir/egs/egs.$m.scp.orig - done > $dir/egs/egs.$n.scp - done - fi + + for n in `seq 0 $[$iters_per_epoch-1]`; do + $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$n.JOB.log \ + nnet-shuffle-egs "--srand=\$[JOB+($num_jobs_nnet*$n)]" \ + ark:$dir/egs/egs_tmp.JOB.$n.ark ark:$dir/egs/egs.JOB.$n.ark '&&' \ + rm $dir/egs/egs_tmp.JOB.$n.ark || exit 1; + done fi -num_egs=`grep wrote $dir/log/get_egs.log | tail -1 | awk '{print $NF}'` || exit 1; -! 
[ $num_egs -gt 0 ] && echo "bad num_egs $num_egs" && exit 1; -num_iters_reduce=$[ 1 + (($num_egs * $num_epochs)/($num_jobs_nnet * $samples_per_iteration))] -num_iters_extra=$[1 + (($num_egs * $num_epochs_extra)/($num_jobs_nnet * $samples_per_iteration))] +num_iters_reduce=$[$num_epochs * $iters_per_epoch]; +num_iters_extra=$[$num_epochs_extra * $iters_per_epoch]; num_iters=$[$num_iters_reduce+$num_iters_extra] echo "Will train for $num_epochs + $num_epochs_extra epochs, equalling " echo " $num_iters_reduce + $num_iters_extra = $num_iters iterations, " echo " (while reducing learning rate) + (with constant learning rate)." -function get_list { - # usage: get_list >output - # - # Outputs an scp file for this job for this iteration. The - # output will have lines, and will contain lines from - # egs.JOB.scp, possibly with repeats. It will be sorted numerically on its - # first field, so the .ark file is accessed in order (we then pipe to - # nnet-shuffle-egs to randomize the order). The way we do it is, we imagine - # we had concatenated the file $dir/egs/egs.JOB.scp infinite times, and - # taken from the concatenated file, the lines - # * ... * ( + 1) - 1, - # and then sorted them on the first field (which is a number). - # We don't actually implement it this way, we do it a bit more efficiently. - # We require that samples-per-iter <= (#lines in input-file). - [ $# -ne 3 ] && echo "get_list: bad usage" && exit 1; - samples_per_iter=$1 - my_iter=$2 - input_file=$3 - start=$[$my_iter * $samples_per_iter]; # starting-point in concatenated file. - input_len=`cat $input_file | wc -l` - start=$[$start - $input_len*($start/$input_len)]; # remove whole multiples of input_len - # we have to concatenate the input file to itself. - cat $input_file $input_file | \ - head -n $[$start + $samples_per_iter] | tail -n $samples_per_iter | \ - sort -k2 -k1n -} - - # up till $last_normal_shrink_iter we will shrink the parameters # in the normal way using the dev set, but after that we will # only re-compute the shrinkage parameters periodically. @@ -361,22 +391,19 @@ x=0 while [ $x -lt $num_iters ]; do if [ $x -ge 0 ] && [ $stage -le $x ]; then - # Set off a job that does diagnostics, in the background. - $cmd $parallel_opts $dir/log/compute_prob.$x.log \ - nnet-compute-prob $dir/$x.mdl ark:$dir/valid_shrink.egs & + # Set off jobs doing some diagnostics, in the background. + $cmd $dir/log/compute_prob_valid.$x.log \ + nnet-compute-prob $dir/$x.mdl ark:$dir/valid_diagnostic.egs & + $cmd $dir/log/compute_prob_train.$x.log \ + nnet-compute-prob $dir/$x.mdl ark:$dir/train_diagnostic.egs & if echo $realign_iters | grep -w $x >/dev/null; then echo "Realigning data (pass $x)" $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \ nnet-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$dir/$x.mdl" \ - "ark:gunzip -c $dir/fsts.JOB.gz|" "$split_feats" \ + "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \ "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; fi - for n in `seq $num_jobs_nnet`; do - # the following command gets a subset of the n'th scp file, containing - # $samples_per_iteration lines. 
- get_list $samples_per_iteration $x $dir/egs/egs.$n.scp > $dir/temp/egs.$x.$n.scp - done echo "Training neural net (pass $x)" if [ $x -gt 0 ] && \ @@ -388,11 +415,10 @@ while [ $x -lt $num_iters ]; do fi $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \ - MKL_NUM_THREADS=$mkl_num_threads \ - nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \ - scp:$dir/temp/egs.$x.JOB.scp ark:- \| \ - nnet-train-parallel --num-threads=$num_threads --minibatch-size=$minibatch_size \ - "$mdl" ark:- $dir/$[$x+1].JOB.mdl \ + nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \ + ark:$dir/egs/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \ + nnet-train-parallel --num-threads=$num_threads --minibatch-size=$minibatch_size \ + "$mdl" ark:- $dir/$[$x+1].JOB.mdl \ || exit 1; nnets_list= @@ -410,10 +436,10 @@ while [ $x -lt $num_iters ]; do if [ $x -le $last_normal_shrink_iter ] || [ $[$x % $shrink_interval] -eq 0 ]; then # For earlier iterations (while we've recently beeen adding layers), or every # $shrink_interval=3 iters , just do shrinking normally. + mb=$[($num_valid_frames_shrink+$num_train_frames_shrink+$num_threads-1)/$num_threads] $cmd $parallel_opts $dir/log/shrink.$x.log \ - MKL_NUM_THREADS=$mkl_num_threads nnet-combine-fast --num-threads=$num_threads --verbose=3 \ - --minibatch-size=$[($num_valid_frames_shrink+$num_threads-1)/$num_threads] \ - $dir/$[$x+1].mdl ark:$dir/valid_shrink.egs $dir/$[$x+1].mdl || exit 1; + nnet-combine-fast --num-threads=$num_threads --verbose=3 --minibatch-size=$mb \ + $dir/$[$x+1].mdl ark:$dir/shrink.egs $dir/$[$x+1].mdl || exit 1; fi fi if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then @@ -423,7 +449,7 @@ while [ $x -lt $num_iters ]; do nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \ $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1; fi - rm $nnets_list $dir/temp/egs.$x.*.scp + rm $nnets_list fi x=$[$x+1] done @@ -435,15 +461,32 @@ nnets_list= for x in `seq $[$num_iters-$num_iters_final+1] $num_iters`; do [ $x -gt $mix_up_iter ] && nnets_list="$nnets_list $dir/$x.mdl" done -$cmd $parallel_opts $dir/log/combine.log \ - MKL_NUM_THREADS=$mkl_num_threads nnet-combine-fast --num-threads=$num_threads \ - --verbose=3 --minibatch-size=$[($num_valid_frames_shrink+$num_threads-1)/$num_threads] \ - $nnets_list ark:$dir/valid_combine.egs $dir/final.mdl || exit 1; +if [ $stage -le $num_iters ]; then + mb=$[($num_valid_frames_combine+$num_train_frames_combine+$num_threads-1)/$num_threads] + $cmd $parallel_opts $dir/log/combine.log \ + nnet-combine-fast --num-threads=$num_threads --verbose=3 --minibatch-size=$mb \ + $nnets_list ark:$dir/combine.egs $dir/final.mdl || exit 1; +fi # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. -$cmd $parallel_opts $dir/log/compute_prob.final.log \ - nnet-compute-prob $dir/final.mdl ark:$dir/valid_shrink.egs || exit 1; +$cmd $dir/log/compute_prob_valid.final.log \ + nnet-compute-prob $dir/final.mdl ark:$dir/valid_diagnostic.egs & +$cmd $dir/log/compute_prob_train.final.log \ + nnet-compute-prob $dir/final.mdl ark:$dir/train_diagnostic.egs & echo Done + +if $cleanup; then + echo Cleaning up data + echo Removing training examples + rm -r $dir/egs + echo Removing most of the models + for x in `seq 0 $num_iters`; do + if [ $[$x%10] -ne 0 ] && [ $x -lt $[$num_iters-$num_iters_final+1] ]; then + # delete all but every 10th model; don't delete the ones which combine to form the final model. 
+ rm $dir/$x.mdl + fi + done +fi diff --git a/egs/wsj/s5/steps/train_nnet_cpu_mmi.sh b/egs/wsj/s5/steps/train_nnet_cpu_mmi.sh index 85ad1b759..7d1e17110 100755 --- a/egs/wsj/s5/steps/train_nnet_cpu_mmi.sh +++ b/egs/wsj/s5/steps/train_nnet_cpu_mmi.sh @@ -31,16 +31,17 @@ num_jobs_nnet=8 # Number of neural net training jobs to run in parallel. # not the same as the num-jobs (nj) which will be the same as the # alignment and denlat directories. stage=0 -sub_stage=-2 # this can be used to start from a particular sub-iteration of an +sub_stage=-3 # this can be used to start from a particular sub-iteration of an # iteration acwt=0.1 boost=0.0 # boosting for BMMI (you can try 0.1).. this is applied per frame. transform_dir= # Note: by default any transforms in $alidir will be used. parallel_opts="-pe smp 16" # by default we use 16 threads; this lets the queue know. -shuffle_opts="-tc 5" # max 5 jobs running at one time (a lot of I/O.) +io_opts="-tc 10" # max 5 jobs running at one time (a lot of I/O.) num_threads=16 # number of threads for neural net trainer.. mkl_num_threads=1 +random_copy=false # End configuration section. echo "$0 $@" # Print the command line for logging @@ -71,8 +72,7 @@ if [ $# != 6 ]; then echo " # this, you may want to decrease the batch size." echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" echo " # use multiple threads." - echo " --shuffle-opts # Options given to e.g. queue.pl for the job that shuffles the " - echo " # data. (prevents stressing the disk). " + echo " --io-opts # Options given to e.g. queue.pl for any especially I/O intensive jobs" echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" echo " # should not get too large, e.g. >2k)." echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, for each" @@ -181,34 +181,37 @@ while [ $x -lt $num_epochs ]; do echo "Epoch $x of $num_epochs" if [ $stage -le $x ] && $first_iter_of_epoch; then - if [ $stage -lt $x ] || [ $sub_stage -le -2 ]; then + if [ $stage -lt $x ] || [ $sub_stage -le -3 ]; then # First get the per-frame posteriors, by rescoring the lattices; this # process also gives us at the same time the posteriors of each state for # each frame (by default, pruned to 0.01 with a randomized algorithm). # The matrix-logprob stage produces a diagnostic and passes the pseudo-log-like - # matrix through unchanged. - $cmd JOB=1:$nj $dir/log/post.$z.JOB.log \ - nnet-logprob2 $dir/$x.1.mdl "$feats" "ark:|prob-to-post ark:- ark:- | gzip -c >$dir/post/smooth_post.$z.JOB.gz" ark:- \| \ + # matrix through unchanged. (Note: nnet-logprob2-parallel can use up to + # $num_threads threads, but in practice it may be limited by the speed of + # the other elements of the pipe. 
+ $cmd $parallel_opts JOB=1:$nj $dir/log/post.$z.JOB.log \ + nnet-logprob2-parallel --num-threads=$num_threads $dir/$x.1.mdl "$feats" \ + "ark:|prob-to-post ark:- ark:- | gzip -c >$dir/post/smooth_post.$z.JOB.gz" ark:- \| \ matrix-logprob ark:- "ark:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $dir/$x.1.mdl ark:- ark:-|" ark:- \| \ lattice-rescore-mapped $dir/$x.1.mdl "ark:gunzip -c $denlatdir/lat.JOB.gz|" ark:- ark:- \| \ lattice-boost-ali --b=$boost --silence-phones=$silphonelist $dir/$x.1.mdl ark:- "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ post-to-pdf-post $dir/$x.1.mdl ark:- "ark:|gzip -c >$dir/post/den_post.$z.JOB.gz" || exit 1; fi - if [ $stage -lt $x ] || [ $sub_stage -le -1 ]; then + if [ $stage -lt $x ] || [ $sub_stage -le -2 ]; then # run nnet-get-egs for all files, to get the training examples for each frame-- # combines the feature and label/posterior information. The posterior information # consists of 2 things: the numerator posteriors from the alignments, the denominator # posteriors from the lattices (times -1), and the smoothing posteriors from the # neural net log-probs (times E). # We copy the examples for each job round-robin to multiple archives, one for each - # of 1...$num_jobs_nnet. We write these along with .scp files, for more convenient - # and memory-efficient randomization. + # of 1...$num_jobs_nnet. egs_out="" for n in `seq 1 $num_jobs_nnet`; do - egs_out="$egs_out ark,scp:$dir/egs/egs.$z.$n.JOB.ark,$dir/egs/egs.$z.$n.JOB.scp" + # indexes are egs_orig.$z.$num_jobs_nnet.$nj + egs_out="$egs_out ark:$dir/egs/egs_orig.$z.$n.JOB.ark" done - $cmd JOB=1:$nj $dir/log/egs.$z.JOB.log \ + $cmd JOB=1:$nj $dir/log/get_egs.$z.JOB.log \ ali-to-pdf $dir/$x.1.mdl "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ ali-to-post ark:- ark:- \| \ sum-post --scale2=$E ark:- "ark:gunzip -c $dir/post/smooth_post.$z.JOB.gz|" ark:- \| \ @@ -223,23 +226,33 @@ while [ $x -lt $num_epochs ]; do tail -n 50 $dir/log/post.$z.*.log | perl -e '$acwt=shift @ARGV; $acwt>0.0 || die "bad acwt"; while() { if (m|lattice-to-post.+Overall average log-like/frame is (\S+) over (\S+) frames. Average acoustic like/frame is (\S+)|) { $tot_den_lat_like += $1*$2; $tot_frames += $2; } if (m|matrix-logprob.+Average log-prob per frame is (\S+) over (\S+) frames|) { $tot_num_like += $1*$2; $tot_num_frames += $2; } } if (abs($tot_frames - $tot_num_frames) > 0.01*($tot_frames + $tot_num_frames)) { print STDERR "#frames differ $tot_frames vs $tot_num_frames\n"; } $tot_den_lat_like /= $tot_frames; $tot_num_like /= $tot_num_frames; $objf = $acwt * $tot_num_like - $tot_den_lat_like; print $objf."\n"; ' $acwt > $dir/log/objf.$z.log echo "Objf on EBW iter $z is `cat $dir/log/objf.$z.log`" fi - if [ $stage -lt $x ] || [ $sub_stage -le 0 ]; then - echo "Shuffling the order of training examples and splitting them up" - echo "(in order to avoid stressing the disk, these won't all run at once)." - + if [ $stage -lt $x ] || [ $sub_stage -le -1 ]; then + echo "Merging training examples across original #jobs ($nj), and " + echo "splitting across number of nnet jobs $num_jobs_nnet" egs_out2="" for n in `seq 1 $iters_per_epoch`; do - egs_out2="$egs_out2 ark:$dir/egs/egs_split.$z.$n.JOB.ark" + # indexes of egs_merged are: egs_merged.$z.$iters_per_epoch.$num_jobs_nnet + egs_out2="$egs_out2 ark:$dir/egs/egs_merged.$z.$n.JOB.ark" done # Note: in the following command, JOB goes from 1 to $num_jobs_nnet, so one # job per parallel training job (different from the previous command). 
# We sum up over the index JOB in the previous $cmd, and write to multiple # archives, this time one for each "sub-iter". - $cmd $shuffle_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.JOB.log \ - cat $dir/egs/egs.$z.JOB.*.scp \| \ - utils/shuffle_list.pl --srand "\$[($z*$num_jobs_nnet)+JOB]" \| \ - nnet-copy-egs scp:- $egs_out2 || exit 1; ##'&&' \ - ##rm $dir/egs/egs.$z.JOB.*.scp $dir/egs/egs.$z.JOB.*.ark || exit 1; + # indexes of egs_orig are: egs_orig.$z.$num_jobs_nnet.$nj + $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/merge_and_split.$x.JOB.log \ + cat $dir/egs/egs_orig.$z.JOB.*.ark \| \ + nnet-copy-egs --random=$random_copy "--srand=\$[JOB+($x*$num_jobs_nnet)]" \ + ark:- $egs_out2 '&&' rm $dir/egs/egs_orig.$z.JOB.*.ark || exit 1; + fi + if [ $stage -lt $x ] || [ $sub_stage -le 0 ]; then + echo "Randomizing order of examples in each job" + for n in `seq 1 $iters_per_epoch`; do + s=$[$num_jobs_nnet*($n+($iters_per_epoch*$z))] # for srand + $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$z.$n.JOB.log \ + nnet-shuffle-egs "--srand=\$[JOB+$s]" \ + ark:$dir/egs/egs_merged.$z.$n.JOB.ark ark:$dir/egs/egs.$z.$n.JOB.ark '&&' \ + rm $dir/egs/egs_merged.$z.$n.JOB.ark || exit 1; + done fi fi if [ $stage -le $x ]; then @@ -250,7 +263,7 @@ while [ $x -lt $num_epochs ]; do if [ $stage -lt $x ] || [ $sub_stage -le $y ]; then $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.$y.JOB.log \ nnet-train-parallel --num-threads=$num_threads --minibatch-size=$minibatch_size \ - $dir/$x.$y.mdl ark:$dir/egs/egs_split.$z.$y.JOB.ark $dir/$x.$y.JOB.mdl \ + $dir/$x.$y.mdl ark:$dir/egs/egs.$z.$y.JOB.ark $dir/$x.$y.JOB.mdl \ || exit 1; nnets_list= for n in `seq 1 $num_jobs_nnet`; do diff --git a/egs/wsj/s5/utils/nnet-cpu/make_nnet_config.pl b/egs/wsj/s5/utils/nnet-cpu/make_nnet_config.pl index be1c602da..9efc97479 100755 --- a/egs/wsj/s5/utils/nnet-cpu/make_nnet_config.pl +++ b/egs/wsj/s5/utils/nnet-cpu/make_nnet_config.pl @@ -68,7 +68,7 @@ Options: --input-left-context # #frames of left context for input features; default 0. --input-right-context # #frames of right context for input features; default 0. --param-stdddev-factor # Factor which can be used to modify the standard deviation of - # randomly nitialized features (default, 1. Gets multiplied by + # randomly initialized features (default, 1. Gets multiplied by # 1/sqrt of number of inputs). --initial-num-hidden-layers # If >0, number of hidden layers to initialize the network with. # In this case, the positional parameter is only diff --git a/src/bin/Makefile b/src/bin/Makefile index 6ab09c6de..35c65a308 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -19,7 +19,7 @@ BINFILES = align-equal align-equal-compiled acc-tree-stats \ align-mapped align-compiled-mapped latgen-faster-mapped \ hmm-info pdf-to-counts analyze-counts extract-ctx post-to-phone-post \ post-to-pdf-post duplicate-matrix logprob-to-post prob-to-post copy-post \ - matrix-logprob + matrix-logprob matrix-sum OBJFILES = diff --git a/src/bin/matrix-sum.cc b/src/bin/matrix-sum.cc new file mode 100644 index 000000000..5f20a9492 --- /dev/null +++ b/src/bin/matrix-sum.cc @@ -0,0 +1,87 @@ +// bin/matrix-sum.cc + +// Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "matrix/kaldi-matrix.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + + const char *usage = + "Sum (and optionally scale) two archives of input matrices\n" + "of the same dimension\n" + "\n" + "Usage: matrix-sum [options] \n"; + + BaseFloat scale1 = 1.0, scale2 = 1.0; + + ParseOptions po(usage); + + po.Register("scale1", &scale1, "Scale applied to first matrix"); + po.Register("scale2", &scale2, "Scale applied to second matrix"); + + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + std::string rspecifier1 = po.GetArg(1); + std::string rspecifier2 = po.GetArg(2); + std::string wspecifier = po.GetArg(3); + + SequentialBaseFloatMatrixReader mat1_reader(rspecifier1); + RandomAccessBaseFloatMatrixReader mat2_reader(rspecifier2); + BaseFloatMatrixWriter mat_writer(wspecifier); + + int32 num_done = 0, num_err = 0; + + for (; !mat1_reader.Done(); mat1_reader.Next()) { + std::string key = mat1_reader.Key(); + Matrix mat1 (mat1_reader.Value()); + if (!mat2_reader.HasKey(key)) { + KALDI_WARN << "No such key " << key << " in second table."; + num_err++; + continue; + } + const Matrix &mat2 (mat2_reader.Value(key)); + if (!SameDim(mat1, mat2)) { + KALDI_WARN << "Matrices for key " << key << " have different dims " + << mat1.NumRows() << " x " << mat1.NumCols() << " vs. " + << mat2.NumRows() << " x " << mat2.NumCols(); + num_err++; + continue; + } + if (scale1 != 1.0) mat1.Scale(scale1); + mat1.AddMat(scale2, mat2); + mat_writer.Write(key, mat1); + num_done++; + } + KALDI_LOG << "Added " << num_done << " matrices; " << num_err + << " had errors."; + + return (num_done != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + + diff --git a/src/cudamatrix/cu-matrix-inl.h b/src/cudamatrix/cu-matrix-inl.h index cb594edeb..3bdf4f78a 100644 --- a/src/cudamatrix/cu-matrix-inl.h +++ b/src/cudamatrix/cu-matrix-inl.h @@ -93,10 +93,13 @@ Real* CuMatrix::RowData(MatrixIndexT r) { template -CuMatrix& CuMatrix::Resize(MatrixIndexT rows, MatrixIndexT cols) { +void CuMatrix::Resize(MatrixIndexT rows, MatrixIndexT cols, + MatrixResizeType resize_type) { + // This code does not currently support the other resize_type options. 
+ KALDI_ASSERT(resize_type == kSetZero || resize_type == kUndefined); if (num_rows_ == rows && num_cols_ == cols) { - // SetZero(); - return *this; + if (resize_type == kSetZero) SetZero(); + return; } Destroy(); @@ -108,17 +111,15 @@ CuMatrix& CuMatrix::Resize(MatrixIndexT rows, MatrixIndexT cols) { cuSafeCall(cudaMallocPitch((void**)&data_, &pitch, row_bytes, rows)); num_rows_ = rows; num_cols_ = cols; stride_ = pitch/sizeof(Real); - SetZero(); + if (resize_type == kSetZero) SetZero(); } else #endif { - mat_.Resize(rows, cols); + mat_.Resize(rows, cols, resize_type); num_rows_=rows; num_cols_=cols; - stride_=mat_.Stride(); + stride_= mat_.Stride(); } - - return *this; } @@ -134,7 +135,7 @@ void CuMatrix::Destroy() { } else #endif { - mat_.Destroy(); + mat_.Resize(0, 0); } num_rows_ = num_cols_ = stride_ = 0; } @@ -142,9 +143,8 @@ void CuMatrix::Destroy() { template -CuMatrix& CuMatrix::CopyFromMat(const CuMatrix &src) { - Resize(src.NumRows(), src.NumCols()); - +void CuMatrix::CopyFromMat(const CuMatrix &src) { + KALDI_ASSERT(src.NumRows() == num_rows_ && src.NumCols() == num_cols_); #if HAVE_CUDA==1 if (CuDevice::Instantiate().Enabled()) { Timer tim; @@ -152,7 +152,8 @@ CuMatrix& CuMatrix::CopyFromMat(const CuMatrix &src) { MatrixIndexT dst_pitch = stride_*sizeof(Real); MatrixIndexT src_pitch = src.Stride()*sizeof(Real); MatrixIndexT width = src.NumCols()*sizeof(Real); - cuSafeCall(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch, width, src.NumRows(), cudaMemcpyDeviceToDevice)); + cuSafeCall(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch, + width, src.NumRows(), cudaMemcpyDeviceToDevice)); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromMatD2D",tim.Elapsed()); } else @@ -160,16 +161,13 @@ CuMatrix& CuMatrix::CopyFromMat(const CuMatrix &src) { { mat_.CopyFromMat(src.mat_); } - - return *this; } template -CuMatrix& CuMatrix::CopyFromMat(const Matrix &src) { - Resize(src.NumRows(), src.NumCols()); - +void CuMatrix::CopyFromMat(const Matrix &src) { + KALDI_ASSERT(src.NumRows() == num_rows_ && src.NumCols() == num_cols_); #if HAVE_CUDA==1 if (CuDevice::Instantiate().Enabled()) { Timer tim; @@ -177,7 +175,8 @@ CuMatrix& CuMatrix::CopyFromMat(const Matrix &src) { MatrixIndexT dst_pitch = stride_*sizeof(Real); MatrixIndexT src_pitch = src.Stride()*sizeof(Real); MatrixIndexT width = src.NumCols()*sizeof(Real); - cuSafeCall(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch, width, src.NumRows(), cudaMemcpyHostToDevice)); + cuSafeCall(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch, + width, src.NumRows(), cudaMemcpyHostToDevice)); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromMatH2D",tim.Elapsed()); } else @@ -185,18 +184,13 @@ CuMatrix& CuMatrix::CopyFromMat(const Matrix &src) { { mat_.CopyFromMat(src); } - - return *this; } - template void CuMatrix::CopyToMat(Matrix *dst) const { - if (dst->NumRows() != NumRows() || dst->NumCols() != NumCols()) { - dst->Resize(NumRows(), NumCols()); - } - + KALDI_ASSERT(dst->NumRows() == NumRows() && dst->NumCols() == NumCols()); + #if HAVE_CUDA==1 if (CuDevice::Instantiate().Enabled()) { @@ -257,7 +251,7 @@ void CuMatrix::Read(std::istream &is, bool binary) { template void CuMatrix::Write(std::ostream &os, bool binary) const { - Matrix tmp; + Matrix tmp(NumRows(), NumCols(), kUndefined); CopyToMat(&tmp); tmp.Write(os, binary); } diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 228c4151a..f1e744198 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -46,15 +46,41 @@ class 
CuMatrix { public: /// Default Constructor - CuMatrix() - : num_rows_(0), num_cols_(0), stride_(0), data_(NULL) { - } + CuMatrix(): + num_rows_(0), num_cols_(0), stride_(0), data_(NULL) { } + /// Constructor with memory initialisation - CuMatrix(MatrixIndexT rows, MatrixIndexT cols) - : num_rows_(0), num_cols_(0), stride_(0), data_(NULL) { + CuMatrix(MatrixIndexT rows, MatrixIndexT cols): + num_rows_(0), num_cols_(0), stride_(0), data_(NULL) { Resize(rows, cols); } + // Note: we had to remove the "explicit" keyword due + // to problems with STL vectors of CuMatrix. + CuMatrix(const CuMatrix &other): + num_rows_(0), num_cols_(0), stride_(0), data_(NULL) { + Resize(other.NumRows(), other.NumCols(), kUndefined); + CopyFromMat(other); + } + + explicit CuMatrix(const Matrix &other): + num_rows_(0), num_cols_(0), stride_(0), data_(NULL) { + Resize(other.NumRows(), other.NumCols(), kUndefined); + CopyFromMat(other); + } + + CuMatrix &operator = (const CuMatrix &other) { + Resize(other.NumRows(), other.NumCols(), kUndefined); + CopyFromMat(other); + return *this; + } + + CuMatrix &operator = (const Matrix &other) { + Resize(other.NumRows(), other.NumCols(), kUndefined); + CopyFromMat(other); + return *this; + } + /// Destructor ~CuMatrix() { Destroy(); @@ -65,14 +91,12 @@ class CuMatrix { return num_rows_; } - MatrixIndexT NumCols() const { - return num_cols_; - } + MatrixIndexT NumCols() const { return num_cols_; } - MatrixIndexT Stride() const { - return stride_; - } + MatrixIndexT Stride() const { return stride_; } + // MatrixDim is a struct containing "rows", "cols" and "stride", + // that is an argument of most CUDA kernels. ::MatrixDim Dim() const { ::MatrixDim d = { num_rows_, num_cols_, stride_ }; return d; @@ -87,41 +111,34 @@ class CuMatrix { Real* RowData(MatrixIndexT r); /// Get size of matrix in bytes - MatrixIndexT SizeInBytes() const { - return num_rows_*stride_*sizeof(Real); - } - + MatrixIndexT SizeInBytes() const { return num_rows_*stride_*sizeof(Real); } + /// Get size of matrix row in bytes - MatrixIndexT RowSizeInBytes() const { - return num_cols_*sizeof(Real); - } + MatrixIndexT RowSizeInBytes() const { return num_cols_*sizeof(Real); } /// Get size of matrix stride in bytes - MatrixIndexT StrideSizeInBytes() const { - return stride_*sizeof(Real); - } + MatrixIndexT StrideSizeInBytes() const { return stride_*sizeof(Real); } /// Allocate the memory - ThisType& Resize(MatrixIndexT rows, MatrixIndexT cols); - - /// Deallocate the memory - void Destroy(); - - /// Copy functions (reallocates when needed) - ThisType& CopyFromMat(const CuMatrix &src); - ThisType& CopyFromMat(const Matrix &src); - void CopyToMat(Matrix *dst) const; - + void Resize(MatrixIndexT rows, MatrixIndexT cols, + MatrixResizeType resize_type = kSetZero); + + /// Copy functions (reallocates when needed, but note from Dan: eventually + /// I'll change it to just die if the sizes don't match, like the Matrix class.) + void CopyFromMat(const CuMatrix &src); + void CopyFromMat(const Matrix &src); + void CopyToMat(Matrix *dst) const; + /// Copy row interval from matrix /// @param r [in] number of rows to copy. /// @param src [in] source matrix. /// @param src_ro [in] source matrix row offset. /// @param dst_ro [in] destination matrix row offset. 
- void CopyRowsFromMat(int32 r, const CuMatrix &src, int32 src_ro, int32 dst_ro); + void CopyRowsFromMat(int32 r, const CuMatrix &src, int32 src_ro, int32 dst_ro); /// I/O functions - void Read(std::istream &is, bool binary); - void Write(std::ostream &os, bool binary) const; + void Read(std::istream &is, bool binary); + void Write(std::ostream &os, bool binary) const; /// Math operations, some calling kernels void SetZero(); @@ -154,6 +171,8 @@ class CuMatrix { } private: + void Destroy(); + MatrixIndexT num_rows_; MatrixIndexT num_cols_; MatrixIndexT stride_; diff --git a/src/cudamatrix/cu-vector-inl.h b/src/cudamatrix/cu-vector-inl.h index 1bea81e87..91f140177 100644 --- a/src/cudamatrix/cu-vector-inl.h +++ b/src/cudamatrix/cu-vector-inl.h @@ -44,8 +44,6 @@ const Real* CuVector::Data() const { } } - - template Real* CuVector::Data() { #if HAVE_CUDA==1 @@ -58,15 +56,12 @@ Real* CuVector::Data() { } } - - template -CuVector& CuVector::Resize(MatrixIndexT dim) { +void CuVector::Resize(MatrixIndexT dim) { if (dim_ == dim) { - // SetZero(); - return *this; + SetZero(); + return; } - Destroy(); #if HAVE_CUDA==1 @@ -80,8 +75,6 @@ CuVector& CuVector::Resize(MatrixIndexT dim) { dim_ = dim; SetZero(); - - return *this; } @@ -106,10 +99,8 @@ void CuVector::Destroy() { template -CuVector& CuVector::CopyFromVec(const CuVector &src) { - Resize(src.Dim()); - - #if HAVE_CUDA==1 +void CuVector::CopyFromVec(const CuVector &src) { +#if HAVE_CUDA==1 if (CuDevice::Instantiate().Enabled()) { Timer tim; cuSafeCall(cudaMemcpy(data_, src.Data(), src.Dim()*sizeof(Real), cudaMemcpyDeviceToDevice)); @@ -119,16 +110,13 @@ CuVector& CuVector::CopyFromVec(const CuVector &src) { { vec_.CopyFromVec(src.vec_); } - - return *this; } template -CuVector& CuVector::CopyFromVec(const Vector &src) { - Resize(src.Dim()); - +void CuVector::CopyFromVec(const Vector &src) { + KALDI_ASSERT(src.Dim() == dim_); #if HAVE_CUDA==1 if (CuDevice::Instantiate().Enabled()) { Timer tim; @@ -141,16 +129,14 @@ CuVector& CuVector::CopyFromVec(const Vector &src) { { vec_.CopyFromVec(src); } - return *this; } template void CuVector::CopyToVec(Vector *dst) const { - if (dst->Dim() != dim_) { - dst->Resize(dim_); - } + KALDI_ASSERT(dst->Dim() == dim_); + #if HAVE_CUDA==1 if (CuDevice::Instantiate().Enabled()) { @@ -177,7 +163,7 @@ void CuVector::Read(std::istream &is, bool binary) { template void CuVector::Write(std::ostream &os, bool binary) const { - Vector tmp; + Vector tmp(Dim()); CopyToVec(&tmp); tmp.Write(os, binary); } diff --git a/src/cudamatrix/cu-vector.h b/src/cudamatrix/cu-vector.h index 388d5ade8..1320ea729 100644 --- a/src/cudamatrix/cu-vector.h +++ b/src/cudamatrix/cu-vector.h @@ -46,6 +46,16 @@ class CuVector { Resize(dim); } + CuVector(const CuVector &v): dim_(0), data_(NULL) { + Resize(v.dim_); + CopyFromVec(v); + } + + CuVector(const Vector &v): dim_(0), data_(NULL) { + Resize(v.Dim()); + CopyFromVec(v); + } + /// Destructor ~CuVector() { Destroy(); @@ -61,19 +71,16 @@ class CuVector { Real* Data(); /// Allocate the memory - ThisType& Resize(MatrixIndexT dim); - - /// Deallocate the memory - void Destroy(); + void Resize(MatrixIndexT dim); /// Copy functions (lazy reallocation when needed) - ThisType& CopyFromVec(const CuVector &src); - ThisType& CopyFromVec(const Vector &src); - void CopyToVec(Vector *dst) const; + void CopyFromVec(const CuVector &src); + void CopyFromVec(const Vector &src); + void CopyToVec(Vector *dst) const; /// I/O - void Read(std::istream &is, bool binary); - void Write(std::ostream &is, bool binary) 
const; + void Read(std::istream &is, bool binary); + void Write(std::ostream &is, bool binary) const; /// Math operations void SetZero(); @@ -94,6 +101,7 @@ class CuVector { } private: + void Destroy(); MatrixIndexT dim_; ///< dimension of the vector Real *data_; ///< GPU data pointer Vector vec_; ///< non-GPU vector as back-up diff --git a/src/decoder/simple-decoder.h b/src/decoder/simple-decoder.h index 0a5f3e825..b4b345617 100644 --- a/src/decoder/simple-decoder.h +++ b/src/decoder/simple-decoder.h @@ -234,7 +234,7 @@ class SimpleDecoder { const Arc &arc = aiter.Value(); if (arc.ilabel == 0) { // propagate nonemitting only... Token *new_tok = new Token(arc, tok); - if (new_tok->arc_.weight.Value() > cutoff) { + if (new_tok->weight_.Value() > cutoff) { Token::TokenDelete(new_tok); } else { unordered_map::iterator find_iter diff --git a/src/feat/mel-computations.cc b/src/feat/mel-computations.cc index 3b39e384d..608fbb793 100644 --- a/src/feat/mel-computations.cc +++ b/src/feat/mel-computations.cc @@ -213,6 +213,10 @@ void MelBanks::Compute(const VectorBase &power_spectrum, int32 offset = bins_[i].first; const Vector &v (bins_[i].second); (*mel_energies_out)(i) = VecVec(v, power_spectrum.Range(offset, v.Dim())); + // The following assert was added due to a problem with OpenBlas that + // we had at one point (it was a bug in that library). Just to detect + // it early. + KALDI_ASSERT(!KALDI_ISNAN((*mel_energies_out)(i))); } if (debug_) { diff --git a/src/featbin/Makefile b/src/featbin/Makefile index d50cf0b95..d65c2be14 100644 --- a/src/featbin/Makefile +++ b/src/featbin/Makefile @@ -9,7 +9,7 @@ BINFILES = compute-mfcc-feats compute-plp-feats compute-fbank-feats \ feat-to-len feat-to-dim fmpe-apply-transform fmpe-acc-stats fmpe-init \ fmpe-est fmpe-copy fmpe-sum-accs append-feats extend-transform-dim \ get-full-lda-mat compute-spectrogram-feats extract-feature-segments \ - reverse-feats paste-feats select-feats + reverse-feats paste-feats select-feats subsample-feats OBJFILES = diff --git a/src/featbin/append-feats.cc b/src/featbin/append-feats.cc index e7b6467a8..46450ae96 100644 --- a/src/featbin/append-feats.cc +++ b/src/featbin/append-feats.cc @@ -1,6 +1,7 @@ // featbin/append-feats.cc -// Copyright 2012 Petr Motlicek; Pawel Swietojanski +// Copyright 2012 Petr Motlicek Pawel Swietojanski +// Johns Hopkins University (author: Daniel Povey) // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
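
A note on the copy-semantics change running through cu-vector-inl.h and cu-vector.h above: Resize() and the new copy constructors now do the allocation, while CopyFromVec()/CopyToVec() only copy and assert that the dimensions already match (the angle-bracketed template arguments are elided in the quoted headers). A minimal caller-side sketch under those assumptions; the RoundTrip helper is illustrative, not part of the patch:

#include "cudamatrix/cu-vector.h"
#include "matrix/kaldi-vector.h"

namespace kaldi {

// Round-trip a host vector through a CuVector under the new conventions:
// size the destination first, then copy.
void RoundTrip(const Vector<BaseFloat> &src, Vector<BaseFloat> *dst) {
  CuVector<BaseFloat> gpu_vec(src);  // new copy constructor: Resize + CopyFromVec
  dst->Resize(gpu_vec.Dim());        // CopyToVec() no longer resizes *dst
  gpu_vec.CopyToVec(dst);
}

}  // namespace kaldi
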
@@ -32,15 +33,11 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); - int32 feats_offset_in1 = 0; - int32 feats_offset_in2 = 0; - int32 num_feats_in1 = 0; - int32 num_feats_in2 = 0; - - po.Register("feats-offset-in1", &feats_offset_in1, "Feats 1 offset"); - po.Register("num-feats-in1", &num_feats_in1, "Take num-feats from in1-rspeciifier"); - po.Register("feats-offset-in2", &feats_offset_in2, "Feats 2 offset"); - po.Register("num-feats-in2", &num_feats_in2, "Take num-feats from in2-rspeciifier"); + bool truncate_frames = false; + + po.Register("truncate-frames", &truncate_frames, "If true, do not treat it " + "as an error when files differ in number of frames, but truncate " + "the longest one."); po.Read(argc, argv); @@ -53,80 +50,47 @@ int main(int argc, char *argv[]) { std::string rspecifier2 = po.GetArg(2); std::string wspecifier = po.GetArg(3); - KALDI_ASSERT(feats_offset_in1 >= 0 && feats_offset_in2 >= 0); + BaseFloatMatrixWriter feats_writer(wspecifier); + SequentialBaseFloatMatrixReader feats_reader1(rspecifier1); + RandomAccessBaseFloatMatrixReader feats_reader2(rspecifier2); - BaseFloatMatrixWriter kaldi_writer(wspecifier); - SequentialBaseFloatMatrixReader kaldi_reader1(rspecifier1); - RandomAccessBaseFloatMatrixReader kaldi_reader2(rspecifier2); + int32 num_done = 0, num_err = 0; - // Peeking in the archives to get the feature dimensions - if (kaldi_reader1.Done()) { - KALDI_ERR << "Could not read any features from " << rspecifier1 - << ". (empty archive?)"; - } - std::string utt = kaldi_reader1.Key(); - if (!kaldi_reader2.HasKey(utt)) { - KALDI_ERR << "Could not read features for key " << utt << " from " - << rspecifier2 << ". (empty archive?)"; - } - - int32 dim_feats_in1 = kaldi_reader1.Value().NumCols(); - int32 dim_feats_in2 = kaldi_reader2.Value(utt).NumCols(); - if (num_feats_in1 == 0) - num_feats_in1 = dim_feats_in1 - feats_offset_in1; - if (num_feats_in2 == 0) - num_feats_in2 = dim_feats_in2 - feats_offset_in2; - - KALDI_LOG << "Reading features from " << rspecifier1 << " and " << rspecifier2; - KALDI_LOG << "\tdim1 = " << dim_feats_in1 << "; offset1 = " << feats_offset_in1 - << "; num1 = " << num_feats_in1 << "; dim2 = " << dim_feats_in2 - << "; offset2 = " << feats_offset_in2 << "; num2 = " << num_feats_in2; - - KALDI_ASSERT((feats_offset_in1 + num_feats_in1) <= dim_feats_in1); - KALDI_ASSERT((feats_offset_in2 + num_feats_in2) <= dim_feats_in2); - - for (; !kaldi_reader1.Done(); kaldi_reader1.Next()) { - utt = kaldi_reader1.Key(); - if (!kaldi_reader2.HasKey(utt)) { + for (; !feats_reader1.Done(); feats_reader1.Next()) { + std::string utt = feats_reader1.Key(); + if (!feats_reader2.HasKey(utt)) { KALDI_WARN << "Could not find features for " << utt << " in " << rspecifier2 << ": producing no output for the utterance"; + num_err++; continue; } - - const Matrix &feats1 = kaldi_reader1.Value(); - const Matrix &feats2 = kaldi_reader2.Value(utt); - int32 num_frames = feats1.NumRows(); - KALDI_VLOG(1) << "Utterance : " << utt << ": # of frames = " << num_frames; - - KALDI_ASSERT(feats1.NumCols() == dim_feats_in1 && - feats2.NumCols() == dim_feats_in2); - if (num_frames != feats2.NumRows()) { - KALDI_WARN << "Utterance " << utt << ": " << num_frames - << " frames read from " << rspecifier1 << " and " - << feats2.NumRows() << " frames read from " << rspecifier2 - << ": producing no output for the utterance"; + + const Matrix &feats1 = feats_reader1.Value(); + const Matrix &feats2 = feats_reader2.Value(utt); + if (feats1.NumRows() != feats2.NumRows() && 
!truncate_frames) { + KALDI_WARN << "For utterance " << utt << ", features have different " + << "#frames " << feats1.NumRows() << " vs. " + << feats2.NumRows() << ", producing no output (use " + << "--truncate-frames=true if you want output)"; + num_err++; continue; } - - SubMatrix new_feats1(feats1, 0, num_frames, feats_offset_in1, - num_feats_in1); - SubMatrix new_feats2(feats2, 0, num_frames, feats_offset_in2, - num_feats_in2); - Matrix output_feats(num_frames, new_feats1.NumCols() + - new_feats2.NumCols()); - output_feats.Range(0, num_frames, 0, - new_feats1.NumCols()).CopyFromMat(new_feats1); - output_feats.Range(0, num_frames, new_feats1.NumCols(), - new_feats2.NumCols()).CopyFromMat(new_feats2); - kaldi_writer.Write(utt, output_feats); + int32 num_frames = std::min(feats1.NumRows(), feats2.NumRows()), + dim1 = feats1.NumCols(), dim2 = feats2.NumCols(); + Matrix output(num_frames, dim1 + dim2, kUndefined); + output.Range(0, num_frames, 0, dim1).CopyFromMat( + feats1.Range(0, num_frames, 0, dim1)); + output.Range(0, num_frames, dim1, dim2).CopyFromMat( + feats2.Range(0, num_frames, 0, dim2)); + + feats_writer.Write(utt, output); + num_done++; } - - return 0; - } - catch (const std::exception& e) { + KALDI_LOG << "Appended " << num_done << " feats; " << num_err + << " with errors."; + return (num_done != 0 ? 0 : 1); + } catch (const std::exception& e) { std::cerr << e.what(); return -1; } } - - diff --git a/src/featbin/compute-mfcc-feats.cc b/src/featbin/compute-mfcc-feats.cc index 35f3c95f5..e51ffc748 100644 --- a/src/featbin/compute-mfcc-feats.cc +++ b/src/featbin/compute-mfcc-feats.cc @@ -21,7 +21,6 @@ #include "feat/feature-mfcc.h" #include "feat/wave-reader.h" - int main(int argc, char *argv[]) { try { using namespace kaldi; diff --git a/src/featbin/select-feats.cc b/src/featbin/select-feats.cc index c1d36dbc5..db3543461 100644 --- a/src/featbin/select-feats.cc +++ b/src/featbin/select-feats.cc @@ -31,10 +31,10 @@ int main(int argc, char *argv[]) { using namespace std; const char *usage = - "Select certain dimensions of the feature file; think of it as the unix\n" - "command cut -f ...\n" - "Usage: select-feats selection in-rspecifier out-wspecifier\n" - " e.g. select-feats 0,24-22,3-12 scp:feats.scp ark,scp:feat-red.ark,feat-red.scp\n"; + "Select certain dimensions of the feature file; think of it as the unix\n" + "command cut -f ...\n" + "Usage: select-feats selection in-rspecifier out-wspecifier\n" + " e.g. select-feats 0,24-22,3-12 scp:feats.scp ark,scp:feat-red.ark,feat-red.scp\n"; ParseOptions po(usage); diff --git a/src/featbin/subsample-feats.cc b/src/featbin/subsample-feats.cc new file mode 100644 index 000000000..e66a81307 --- /dev/null +++ b/src/featbin/subsample-feats.cc @@ -0,0 +1,96 @@ +// featbin/select-feats.cc + +// Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
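
The rewritten append-feats loop above pairs up utterances by key, optionally truncates to the shorter of the two matrices, and writes the column-wise concatenation (invoked with the three positional arguments read above, e.g. append-feats --truncate-frames=true scp:feats1.scp scp:feats2.scp ark:-). A rough stand-alone sketch of the concatenation step using plain std::vector rather than the Kaldi matrix classes; FeatMatrix and AppendFeats are illustrative names only:

#include <algorithm>
#include <cstddef>
#include <vector>

// A feature matrix as a vector of frames; each frame is a vector of floats.
typedef std::vector<std::vector<float> > FeatMatrix;

// Column-wise append of two feature matrices, truncating to the shorter one
// (the behavior that --truncate-frames=true permits in the loop above).
FeatMatrix AppendFeats(const FeatMatrix &a, const FeatMatrix &b) {
  std::size_t num_frames = std::min(a.size(), b.size());
  FeatMatrix out(num_frames);
  for (std::size_t t = 0; t < num_frames; t++) {
    out[t] = a[t];                                          // dim1 columns
    out[t].insert(out[t].end(), b[t].begin(), b[t].end());  // then dim2 columns
  }
  return out;
}
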
+ +#include +#include +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "matrix/kaldi-matrix.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace std; + + const char *usage = + "Sub-samples features by taking every n'th frame" + "\n" + "Usage: subsample-feats [options] in-rspecifier out-wspecifier\n" + " e.g. subsample-feats --n=2 ark:- ark:-\n"; + + ParseOptions po(usage); + + int32 n = 1, offset = 0; + + po.Register("n", &n, "Take every n'th feature, for this value of n"); + po.Register("offset", &offset, "Start with the feature with this offset, " + "then take every n'th feature."); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + string rspecifier = po.GetArg(1); + string wspecifier = po.GetArg(2); + + SequentialBaseFloatMatrixReader feat_reader(rspecifier); + BaseFloatMatrixWriter feat_writer(wspecifier); + + int32 num_done = 0, num_err = 0; + + // process all keys + for (; !feat_reader.Done(); feat_reader.Next()) { + std::string utt = feat_reader.Key(); + const Matrix feats(feat_reader.Value()); + + // This code could, of course, be much more efficient; I'm just + // keeping it simple. + int32 num_indexes = 0; + for (int32 k = offset; k < feats.NumRows(); k += n) + num_indexes++; // k is the index. + + if (num_indexes == 0) { + KALDI_WARN << "For utterance " << utt << ", output would have no rows, " + << "producing no output."; + num_err++; + continue; + } + Matrix output(num_indexes, feats.NumCols()); + int32 i = 0; + for (int32 k = offset; k < feats.NumRows(); k += n, i++) { + SubVector src(feats, k), dest(output, i); + dest.CopyFromVec(src); + } + KALDI_ASSERT(i == num_indexes); + feat_writer.Write(utt, output); + num_done++; + } + KALDI_LOG << "Sub-sampled " << num_done << " feats; " << num_err + << " with errors."; + return (num_done != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/featbin/subset-feats.cc b/src/featbin/subset-feats.cc index 9733c227c..7f80a4c68 100644 --- a/src/featbin/subset-feats.cc +++ b/src/featbin/subset-feats.cc @@ -25,11 +25,11 @@ int main(int argc, char *argv[]) { using namespace kaldi; const char *usage = - "Copy a subset of features\n" + "Copy a subset of features (the first n features)\n" "Usage: subset-feats [options] in-rspecifier out-wspecifier\n"; ParseOptions po(usage); - + int32 n = 10; po.Register("n", &n, "If nonnegative, copy the first n feature files."); diff --git a/src/featbin/transform-feats.cc b/src/featbin/transform-feats.cc index 594b4fcb0..0d7aa63d0 100644 --- a/src/featbin/transform-feats.cc +++ b/src/featbin/transform-feats.cc @@ -160,11 +160,9 @@ int main(int argc, char *argv[]) { KALDI_LOG << "Applied transform to " << num_done << " utterances; " << num_error << " had errors."; - return 0; + return (num_done != 0 ? 
0 : 1); } catch(const std::exception &e) { std::cerr << e.what(); return -1; } } - - diff --git a/src/matrix/cblas-wrappers.h b/src/matrix/cblas-wrappers.h index a922a7e4a..4c3f526b4 100644 --- a/src/matrix/cblas-wrappers.h +++ b/src/matrix/cblas-wrappers.h @@ -27,9 +27,24 @@ namespace kaldi { -inline void cblas_Xscal(const int N, float *X, const int incX, float *Y, - const int incY, const float c, const float s) { - cblas_srot(N, X, incX, Y, incY, c, s); + +inline void cblas_Xcopy(const int N, const float *X, const int incX, float *Y, + const int incY) { + cblas_scopy(N, X, incX, Y, incY); +} + +inline void cblas_Xcopy(const int N, const double *X, const int incX, double *Y, + const int incY) { + cblas_dcopy(N, X, incX, Y, incY); +} + + +inline float cblas_Xasum(const int N, const float *X, const int incX) { + return cblas_sasum(N, X, incX); +} + +inline double cblas_Xasum(const int N, const double *X, const int incX) { + return cblas_dasum(N, X, incX); } inline void cblas_Xrot(const int N, float *X, const int incX, float *Y, @@ -58,11 +73,11 @@ inline void cblas_Xaxpy(const int N, const double alpha, const double *X, const int incX, double *Y, const int incY) { cblas_daxpy(N, alpha, X, incX, Y, incY); } -inline void cblas_Xscal(const int N,const float alpha, float *data, +inline void cblas_Xscal(const int N, const float alpha, float *data, const int inc) { cblas_sscal(N, alpha, data, inc); } -inline void cblas_Xscal(const int N,const double alpha, double *data, +inline void cblas_Xscal(const int N, const double alpha, double *data, const int inc) { cblas_dscal(N, alpha, data, inc); } @@ -226,6 +241,78 @@ inline void cblas_Xsyrk( cblas_dsyrk(CblasRowMajor, CblasLower, static_cast(trans), dim_c, other_dim_a, alpha, A, a_stride, beta, C, c_stride); } + +/// matrix-vector multiply using a banded matrix; we always call this +/// with b = 1 meaning we're multiplying by a diagonal matrix. This is used for +/// elementwise multiplication. We miss some of the arguments out of this +/// wrapper. +inline void cblas_Xsbmv1( + const MatrixIndexT dim, + const double *A, + const double alpha, + const double *x, + const double beta, + double *y) { + cblas_dsbmv(CblasRowMajor, CblasLower, dim, 0, alpha, A, + 1, x, 1, beta, y, 1); +} + +inline void cblas_Xsbmv1( + const MatrixIndexT dim, + const float *A, + const float alpha, + const float *x, + const float beta, + float *y) { + cblas_ssbmv(CblasRowMajor, CblasLower, dim, 0, alpha, A, + 1, x, 1, beta, y, 1); +} + + +/// This is not really a wrapper for CBLAS as CBLAS does not have this; in future we could +/// extend this somehow. +inline void mul_elements( + const MatrixIndexT dim, + const double *a, + double *b) { // does b *= a, elementwise. + double c1, c2, c3, c4; + MatrixIndexT i; + for (i = 0; i + 4 <= dim; i += 4) { + c1 = a[i] * b[i]; + c2 = a[i+1] * b[i+1]; + c3 = a[i+2] * b[i+2]; + c4 = a[i+3] * b[i+3]; + b[i] = c1; + b[i+1] = c2; + b[i+2] = c3; + b[i+3] = c4; + } + for (; i < dim; i++) + b[i] *= a[i]; +} + +inline void mul_elements( + const MatrixIndexT dim, + const float *a, + float *b) { // does b *= a, elementwise. 
+ float c1, c2, c3, c4; + MatrixIndexT i; + for (i = 0; i + 4 <= dim; i += 4) { + c1 = a[i] * b[i]; + c2 = a[i+1] * b[i+1]; + c3 = a[i+2] * b[i+2]; + c4 = a[i+3] * b[i+3]; + b[i] = c1; + b[i+1] = c2; + b[i+2] = c3; + b[i+3] = c4; + } + for (; i < dim; i++) + b[i] *= a[i]; +} + + + // add clapack here #ifndef HAVE_ATLAS inline void clapack_Xtptri(KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *result) { diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc index e4f9320ef..e983a7a8c 100644 --- a/src/matrix/kaldi-matrix.cc +++ b/src/matrix/kaldi-matrix.cc @@ -495,6 +495,41 @@ template void MatrixBase::CopyFromMat(const MatrixBase & M, MatrixTransposeType Trans); +// Specialize the template for CopyFromSp for float, float. +template<> +template<> +void MatrixBase::CopyFromSp(const SpMatrix & M) { + KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); + MatrixIndexT num_rows = num_rows_, stride = stride_; + const float *Mdata = M.Data(); + float *row_data = data_, *col_data = data_; + for (MatrixIndexT i = 0; i < num_rows; i++) { + cblas_scopy(i+1, Mdata, 1, row_data, 1); // copy to the row. + cblas_scopy(i, Mdata, 1, col_data, stride); // copy to the column. + Mdata += i+1; + row_data += stride; + col_data += 1; + } +} + +// Specialize the template for CopyFromSp for double, double. +template<> +template<> +void MatrixBase::CopyFromSp(const SpMatrix & M) { + KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); + MatrixIndexT num_rows = num_rows_, stride = stride_; + const double *Mdata = M.Data(); + double *row_data = data_, *col_data = data_; + for (MatrixIndexT i = 0; i < num_rows; i++) { + cblas_dcopy(i+1, Mdata, 1, row_data, 1); // copy to the row. + cblas_dcopy(i, Mdata, 1, col_data, stride); // copy to the column. + Mdata += i+1; + row_data += stride; + col_data += 1; + } +} + + template template void MatrixBase::CopyFromSp(const SpMatrix & M) { @@ -711,12 +746,16 @@ void Matrix::Destroy() { template void MatrixBase::MulElements(const MatrixBase &a) { KALDI_ASSERT(a.NumRows() == num_rows_ && a.NumCols() == num_cols_); - MatrixIndexT i; - MatrixIndexT j; - - for (i = 0; i < num_rows_; i++) { - for (j = 0; j < num_cols_; j++) { - (*this)(i, j) *= a(i, j); + + if (num_cols_ == stride_ && num_cols_ == a.stride_) { + mul_elements(num_rows_ * num_cols_, a.data_, data_); + } else { + MatrixIndexT a_stride = a.stride_, stride = stride_; + Real *data = data_, *a_data = a.data_; + for (MatrixIndexT i = 0; i < num_rows_; i++) { + mul_elements(num_cols_, a_data, data); + a_data += a_stride; + data += stride; } } } @@ -1052,10 +1091,10 @@ bad: // would not allow its contents to be changed. 
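
The MulElements() rewrite above hinges on a contiguous-memory fast path: when the number of columns equals the stride, all rows sit back to back and the elementwise product can be done in one pass over rows*cols values; otherwise it falls back to one mul_elements() call per row. A rough stand-alone sketch of the same idea on plain arrays (MulElementsStrided is an illustrative stand-in, and it assumes for brevity that both matrices share one stride):

#include <cstddef>

// b *= a elementwise, for row-major matrices stored with a common stride.
void MulElementsStrided(std::size_t rows, std::size_t cols, std::size_t stride,
                        const float *a, float *b) {
  if (cols == stride) {
    // Rows are contiguous: treat both matrices as one long vector.
    for (std::size_t i = 0; i < rows * cols; i++) b[i] *= a[i];
  } else {
    // Padded rows: multiply row by row, skipping the padding.
    for (std::size_t r = 0; r < rows; r++)
      for (std::size_t c = 0; c < cols; c++)
        b[r * stride + c] *= a[r * stride + c];
  }
}
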
template SubMatrix::SubMatrix(const MatrixBase &M, - const MatrixIndexT ro, - const MatrixIndexT r, - const MatrixIndexT co, - const MatrixIndexT c) { + const MatrixIndexT ro, + const MatrixIndexT r, + const MatrixIndexT co, + const MatrixIndexT c) { KALDI_ASSERT(static_cast(ro) < static_cast(M.num_rows_) && static_cast(co) < @@ -1985,6 +2024,13 @@ Real MatrixBase::ApplySoftMax() { return max + log(sum); } +template +void MatrixBase::ApplyTanh() { + for (MatrixIndexT r = 0; r < num_rows_; r++) { + SubVector v(*this, r); + v.ApplyTanh(); + } +} template template diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h index 2ce317a01..7ac095ff3 100644 --- a/src/matrix/kaldi-matrix.h +++ b/src/matrix/kaldi-matrix.h @@ -340,6 +340,9 @@ class MatrixBase { /// Apply soft-max to the collection of all elements of the /// matrix and return normalizer (log sum of exponentials). Real ApplySoftMax(); + + /// Apply the tanh function to each element of the matrix. + void ApplyTanh(); /** Uses Svd to compute the eigenvalue decomposition of a symmetric positive * semi-definite matrix: (*this) = rP * diag(rS) * rP^T, with rP an @@ -530,7 +533,7 @@ class Matrix : public MatrixBase { /// Basic constructor. Sets to zero by default. /// if set_zero == false, memory contents are undefined. - Matrix(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type = kSetZero) : + Matrix(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type = kSetZero): MatrixBase() { Resize(r, c, resize_type); } /// Swaps the contents of *this and *other. Shallow swap. @@ -553,7 +556,7 @@ class Matrix : public MatrixBase { /// It is symmetric, so no option for transpose, and NumRows == Cols template explicit Matrix(const SpMatrix & M) : MatrixBase() { - Resize(M.NumRows(), M.NumRows()); + Resize(M.NumRows(), M.NumRows(), kUndefined); this->CopyFromSp(M); } @@ -562,10 +565,10 @@ class Matrix : public MatrixBase { explicit Matrix(const TpMatrix & M, MatrixTransposeType trans = kNoTrans) : MatrixBase() { if (trans == kNoTrans) { - Resize(M.NumRows(), M.NumCols()); + Resize(M.NumRows(), M.NumCols(), kUndefined); this->CopyFromTp(M); } else { - Resize(M.NumCols(), M.NumRows()); + Resize(M.NumCols(), M.NumRows(), kUndefined); this->CopyFromTp(M, kTrans); } } @@ -584,9 +587,6 @@ class Matrix : public MatrixBase { /// Distructor to free matrices. ~Matrix() { Destroy(); } - /// Deallocates memory and sets to empty matrix. - void Destroy(); - /// Sets matrix to a specified size (zero is OK as long as both r and c are /// zero). The value of the new data depends on resize_type: /// -if kSetZero, the new data will be zero @@ -601,9 +601,8 @@ class Matrix : public MatrixBase { /// Assignment operator that takes MatrixBase. Matrix &operator = (const MatrixBase &other) { if (MatrixBase::NumRows() != other.NumRows() || - MatrixBase::NumCols() != other.NumCols()) { - Resize(other.NumRows(), other.NumCols()); - } + MatrixBase::NumCols() != other.NumCols()) + Resize(other.NumRows(), other.NumCols(), kUndefined); MatrixBase::CopyFromMat(other); return *this; } @@ -611,15 +610,17 @@ class Matrix : public MatrixBase { /// Assignment operator. Needed for inclusion in std::vector. 
Matrix &operator = (const Matrix &other) { if (MatrixBase::NumRows() != other.NumRows() || - MatrixBase::NumCols() != other.NumCols()) { - Resize(other.NumRows(), other.NumCols()); - } + MatrixBase::NumCols() != other.NumCols()) + Resize(other.NumRows(), other.NumCols(), kUndefined); MatrixBase::CopyFromMat(other); return *this; } private: + /// Deallocates memory and sets to empty matrix (dimension 0, 0). + void Destroy(); + /// Init assumes the current class contents are invalid (i.e. junk or have /// already been freed), and it sets the matrix to newly allocated memory with /// the specified number of rows and columns. r == c == 0 is acceptable. The data @@ -677,8 +678,8 @@ class SubMatrix : public MatrixBase { const MatrixIndexT ro, // row offset, 0 < ro < NumRows() const MatrixIndexT r, // number of rows, r > 0 const MatrixIndexT co, // column offset, 0 < co < NumCols() - const MatrixIndexT c); // number of columns, c > 0 - + const MatrixIndexT c); // number of columns, c > 0 + ~SubMatrix() {} /// This type of constructor is needed for Range() to work [in Matrix base diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc index bf18d25b4..f0df755fe 100644 --- a/src/matrix/kaldi-vector.cc +++ b/src/matrix/kaldi-vector.cc @@ -375,7 +375,13 @@ template void VectorBase::CopyRowFromSp(const SpMatrix &mat, MatrixIndexT row); -// takes elements to a power. Throws exception if could not (but only for power != 1 ad power != 2). +#ifdef HAVE_MKL +template<> +void VectorBase::ApplyPow(float power) { vsPowx(dim_, data_, power, data_); } +template<> +void VectorBase::ApplyPow(double power) { vdPowx(dim_, data_, power, data_); } +#else +// takes elements to a power. Throws exception if could not (but only for power != 1 and power != 2). template void VectorBase::ApplyPow(Real power) { if (power == 1.0) return; @@ -399,6 +405,7 @@ void VectorBase::ApplyPow(Real power) { } } } +#endif // Computes the p-th norm. Throws exception if could not. 
template @@ -534,14 +541,13 @@ template void VectorBase::AddRowSumMat(Real alpha, const MatrixBase &M, Real beta) { // note the double accumulator KALDI_ASSERT(dim_ == M.NumCols()); - MatrixIndexT num_rows = M.NumRows(), stride = M.Stride(); - for (MatrixIndexT i = 0; i < dim_; i++) { - double sum = 0.0; - const Real *src = M.Data() + i; - for (MatrixIndexT j = 0; j < num_rows; j++) - sum += src[j*stride]; - data_[i] = alpha * sum + beta * data_[i]; - } + MatrixIndexT num_rows = M.NumRows(), stride = M.Stride(), dim = dim_; + Real *data = data_; + cblas_Xscal(dim, beta, data, 1); + const Real *m_data = M.Data(); + + for (MatrixIndexT i = 0; i < num_rows; i++, m_data += stride) + cblas_Xaxpy(dim, alpha, m_data, 1, data, 1); } template @@ -651,6 +657,25 @@ Real VectorBase::ApplySoftMax() { return max + log(sum); } +#ifdef HAVE_MKL +template<> +void VectorBase::ApplyTanh() { vsTanh(dim_, data_, data_); } +template<> +void VectorBase::ApplyTanh() { vdTanh(dim_, data_, data_); } +#else +template +void VectorBase::ApplyTanh() { + for (MatrixIndexT i = 0; i < dim_; i++) { + Real x = data_[i]; + if (x > 0.0) { + x = -1.0 + 2.0 / (1.0 + exp(-2.0 * x)); + } else { + x = 1.0 - 2.0 / (1.0 + exp(2.0 * x)); + } + data_[i] = x; + } +} +#endif template void VectorBase::Add(Real c) { diff --git a/src/matrix/kaldi-vector.h b/src/matrix/kaldi-vector.h index 14de5781f..3ab0b270a 100644 --- a/src/matrix/kaldi-vector.h +++ b/src/matrix/kaldi-vector.h @@ -126,6 +126,10 @@ class VectorBase { /// This is the same as: \f$ x(i) = exp(x(i)) / \sum_i exp(x(i)) \f$ Real ApplySoftMax(); + /// Apply the tanh function to each element of a vector. If using MKL, does + /// it using the "less accurate" options. + void ApplyTanh(); + /// Take all elements of vector to a power. void ApplyPow(Real power); @@ -322,20 +326,20 @@ class Vector: public VectorBase { /// Copy constructor. The need for this is controversial. Vector(const Vector &v) : VectorBase() { // (cannot be explicit) - Resize(v.Dim()); + Resize(v.Dim(), kUndefined); this->CopyFromVec(v); } /// Copy-constructor from base-class, needed to copy from SubVector. explicit Vector(const VectorBase &v) : VectorBase() { - Resize(v.Dim()); + Resize(v.Dim(), kUndefined); this->CopyFromVec(v); } /// Type conversion constructor. template explicit Vector(const VectorBase &v): VectorBase() { - Resize(v.Dim()); + Resize(v.Dim(), kUndefined); this->CopyFromVec(v); } @@ -372,14 +376,14 @@ class Vector: public VectorBase { /// Assignment operator, protected so it can only be used by std::vector Vector &operator = (const Vector &other) { - Resize(other.Dim()); + Resize(other.Dim(), kUndefined); this->CopyFromVec(other); return *this; } /// Assignment operator that takes VectorBase. Vector &operator = (const VectorBase &other) { - Resize(other.Dim()); + Resize(other.Dim(), kUndefined); this->CopyFromVec(other); return *this; } diff --git a/src/matrix/matrix-lib-test.cc b/src/matrix/matrix-lib-test.cc index ef9562526..4255a0a8e 100644 --- a/src/matrix/matrix-lib-test.cc +++ b/src/matrix/matrix-lib-test.cc @@ -679,6 +679,28 @@ template static void UnitTestAxpy() { } } +template static void UnitTestCopySp() { + // Checking that the various versions of copying + // matrix to SpMatrix work the same in the symmetric case. 
+ for (MatrixIndexT iter = 0;iter < 5;iter++) { + int32 dim = 5 + rand() % 10; + SpMatrix S(dim), T(dim); + S.SetRandn(); + Matrix M(S); + T.CopyFromMat(M, kTakeMeanAndCheck); + AssertEqual(S, T); + T.SetZero(); + T.CopyFromMat(M, kTakeMean); + AssertEqual(S, T); + T.SetZero(); + T.CopyFromMat(M, kTakeLower); + AssertEqual(S, T); + T.SetZero(); + T.CopyFromMat(M, kTakeUpper); + AssertEqual(S, T); + } +} + template static void UnitTestPower() { for (MatrixIndexT iter = 0;iter < 5;iter++) { @@ -1415,7 +1437,7 @@ template static void UnitTestInverse() { template static void UnitTestMulElements() { - for (MatrixIndexT iter = 0;iter < 5;iter++) { + for (MatrixIndexT iter = 0; iter < 5; iter++) { MatrixIndexT dimM = 20 + rand()%10, dimN = 20 + rand()%10; Matrix A(dimM, dimN), B(dimM, dimN), C(dimM, dimN); InitRand(&A); @@ -1430,6 +1452,7 @@ template static void UnitTestMulElements() { } } + template static void UnitTestSpLogExp() { for (MatrixIndexT i = 0; i < 5; i++) { MatrixIndexT dimM = 10 + rand() % 10; @@ -1860,6 +1883,27 @@ template static void UnitTestLimitCond() { } } +template static void UnitTestTanh() { + for (MatrixIndexT i = 0; i < 10; i++) { + MatrixIndexT dimM = 5 + rand() % 10, dimN = 5 + rand() % 10; + Matrix M(dimM, dimN); + Matrix N(M); + for(int32 r = 0; r < dimM; r++) { + for (int32 c = 0; c < dimN; c++) { + Real x = N(r, c); + if (x > 0.0) { + x = -1.0 + 2.0 / (1.0 + exp(-2.0 * x)); + } else { + x = 1.0 - 2.0 / (1.0 + exp(2.0 * x)); + } + N(r, c) = x; + } + } + M.ApplyTanh(); + AssertEqual(M, N); + } +} + template static void UnitTestSimple() { for (MatrixIndexT i = 0;i < 5;i++) { MatrixIndexT dimM = 20 + rand()%10, dimN = 20 + rand()%20; @@ -3541,6 +3585,7 @@ template static void MatrixUnitTest(bool full_test) { UnitTestDotprod(); // UnitTestSvdVariants(); UnitTestPower(); + UnitTestCopySp(); UnitTestDeterminant(); KALDI_LOG << " Point F"; UnitTestDeterminantSign(); @@ -3566,6 +3611,7 @@ template static void MatrixUnitTest(bool full_test) { UnitTestRange(); UnitTestSimpleForVec(); UnitTestSimpleForMat(); + UnitTestTanh(); UnitTestNorm(); UnitTestMul(); KALDI_LOG << " Point I"; diff --git a/src/matrix/sp-matrix.cc b/src/matrix/sp-matrix.cc index c9901c4ea..bc57bed0d 100644 --- a/src/matrix/sp-matrix.cc +++ b/src/matrix/sp-matrix.cc @@ -169,9 +169,17 @@ void SpMatrix::CopyFromMat(const MatrixBase &M, break; } case kTakeLower: - for (MatrixIndexT i = 0; i < D; i++) - for (MatrixIndexT j = 0; j <= i; j++) - (*this)(i, j) = M(i, j); + { // making this one a bit more efficient. 
+ const Real *src = M.Data(); + Real *dest = this->data_; + MatrixIndexT stride = M.Stride(); + for (MatrixIndexT i = 0; i < D; i++) { + for (MatrixIndexT j = 0; j <= i; j++) + dest[j] = src[j]; + dest += i + 1; + src += stride; + } + } break; case kTakeUpper: for (MatrixIndexT i = 0; i < D; i++) diff --git a/src/nnet-cpu/Makefile b/src/nnet-cpu/Makefile index 99da7c07d..e17448a98 100644 --- a/src/nnet-cpu/Makefile +++ b/src/nnet-cpu/Makefile @@ -10,7 +10,7 @@ OBJFILES = nnet-component.o nnet-nnet.o nnet-update.o train-nnet.o \ nnet-randomize.o nnet-compute.o am-nnet.o nnet-functions.o \ nnet-precondition.o shrink-nnet.o combine-nnet.o combine-nnet-a.o \ mixup-nnet.o nnet-lbfgs.o nnet-update-parallel.o combine-nnet-fast.o \ - nnet-fix.o + nnet-fix.o nnet-stats.o rescale-nnet.o nnet-limit-rank.o #nnet-compute.o nnet-train.o # nnet-nnet.o nnet-loss.o nnet-rnnlm.o diff --git a/src/nnet-cpu/nnet-component-test.cc b/src/nnet-cpu/nnet-component-test.cc index 320f3e069..68e282104 100644 --- a/src/nnet-cpu/nnet-component-test.cc +++ b/src/nnet-cpu/nnet-component-test.cc @@ -183,6 +183,23 @@ void UnitTestSigmoidComponent() { } } +void UnitTestReduceComponent() { + // We're testing that the gradients are computed correctly: + // the input gradients and the model gradients. + + int32 input_dim = 10 + rand() % 50, n = 1 + rand() % 3; + { + ReduceComponent reduce_component(input_dim, n); + UnitTestGenericComponentInternal(reduce_component); + } + { + ReduceComponent reduce_component; + reduce_component.InitFromString("dim=15 n=3"); + UnitTestGenericComponentInternal(reduce_component); + } +} + + template void UnitTestGenericComponent() { // works if it has an initializer from int, // e.g. tanh, sigmoid. @@ -463,6 +480,8 @@ int main() { UnitTestGenericComponent(); UnitTestGenericComponent(); UnitTestGenericComponent(); + UnitTestSigmoidComponent(); + UnitTestReduceComponent(); UnitTestAffineComponent(); UnitTestAffinePreconInputComponent(); UnitTestBlockAffineComponent(); diff --git a/src/nnet-cpu/nnet-component.cc b/src/nnet-cpu/nnet-component.cc index e291d67ad..9d0d4629b 100644 --- a/src/nnet-cpu/nnet-component.cc +++ b/src/nnet-cpu/nnet-component.cc @@ -47,6 +47,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new TanhComponent(); } else if (component_type == "SoftmaxComponent") { ans = new SoftmaxComponent(); + } else if (component_type == "ReduceComponent") { + ans = new ReduceComponent(); } else if (component_type == "AffineComponent") { ans = new AffineComponent(); } else if (component_type == "AffineComponentA") { @@ -407,20 +409,8 @@ void TanhComponent::Propagate(const MatrixBase &in, // Apply tanh function to each element of the output... // the tanh function may be written as -1 + ( 2 / (1 + e^{-2 x})), // which is a scaled and shifted sigmoid. 
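
To spell out the identity in the comment above: tanh(x) = -1 + 2/(1 + e^{-2x}), i.e. a scaled and shifted logistic sigmoid, and branching on the sign of x keeps the argument of exp() non-positive so neither branch can overflow. A small self-contained check of the two branches used here and in the ApplyTanh()/UnitTestTanh() code earlier in this patch (StableTanh is an illustrative name):

#include <cassert>
#include <cmath>

// tanh written as a scaled, shifted sigmoid; exp() only ever sees a
// non-positive argument, so neither branch can overflow.
static double StableTanh(double x) {
  if (x > 0.0) return -1.0 + 2.0 / (1.0 + std::exp(-2.0 * x));
  else         return  1.0 - 2.0 / (1.0 + std::exp(2.0 * x));
}

int main() {
  for (int i = 0; i <= 1000; i++) {
    double x = -50.0 + 0.1 * i;  // covers [-50, 50]
    assert(std::fabs(StableTanh(x) - std::tanh(x)) < 1e-12);
  }
  return 0;
}
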
- out->Resize(in.NumRows(), in.NumCols()); - int32 num_rows = in.NumRows(), num_cols = in.NumCols(); - for(int32 r = 0; r < num_rows; r++) { - const BaseFloat *in_data = in.RowData(r), - *in_data_end = in_data + num_cols; - BaseFloat *out_data = out->RowData(r); - for (; in_data != in_data_end; ++in_data, ++out_data) { - if (*in_data > 0.0) { - *out_data = -1.0 + 2.0 / (1.0 + exp(-2.0 * *in_data)); - } else { - *out_data = 1.0 - 2.0 / (1.0 + exp(2.0 * *in_data)); - } - } - } + *out = in; + out->ApplyTanh(); } void TanhComponent::Backprop(const MatrixBase &, // in_value @@ -502,6 +492,67 @@ void SoftmaxComponent::Backprop(const MatrixBase &, // in_value } } +void ReduceComponent::InitFromString(std::string args) { + std::string orig_args(args); + int32 dim, n; + bool ok = ParseFromString("dim", &args, &dim) && + ParseFromString("n", &args, &n); + if (!args.empty()) + KALDI_ERR << "Could not process these elements in initializer: " + << args; + if (!ok) + KALDI_ERR << "Bad initializer " << orig_args; + Init(dim, n); +} + +void ReduceComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &n_); + ExpectToken(is, binary, ""); +} + +void ReduceComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, n_); + WriteToken(os, binary, ""); +} + +void ReduceComponent::Propagate(const MatrixBase &in, + int32 num_chunks, + Matrix *out) const { + KALDI_ASSERT(in.NumRows() > 0 && in.NumCols() == InputDim()); + out->Resize(in.NumRows(), OutputDim()); + int32 num_frames = in.NumRows(), input_dim = in.NumCols(), n = n_; + for (int32 r = 0; r < num_frames; r++) { + const BaseFloat *src = in.RowData(r); + BaseFloat *dest = out->RowData(r); + for (int32 c = 0; c < input_dim; c++) + dest[c / n] += src[c]; + } +} + +void ReduceComponent::Backprop(const MatrixBase &, // in_value, + const MatrixBase &, // out_value, + const MatrixBase &out_deriv, + int32, // num_chunks + Component *, // to_update + Matrix *in_deriv) const { + int32 num_frames = out_deriv.NumRows(), + input_dim = InputDim(), n = n_; + in_deriv->Resize(num_frames, input_dim, kUndefined); + for (int32 r = 0; r < num_frames; r++) { + const BaseFloat *src = out_deriv.RowData(r); + BaseFloat *dest = in_deriv->RowData(r); + for (int32 c = 0; c < input_dim; c++) + dest[c] = src[c / n]; + } +} + void AffineComponent::Scale(BaseFloat scale) { linear_params_.Scale(scale); bias_params_.Scale(scale); @@ -857,11 +908,11 @@ void AffineComponentPreconditioned::Update( // Add the 1.0 at the end of each row "in_value_temp" for (int32 i = 0; i < in_value.NumRows(); i++) in_value_temp(i, in_value.NumCols()) = 1.0; - + Matrix in_value_precon(in_value_temp.NumRows(), - in_value_temp.NumCols()), + in_value_temp.NumCols(), kUndefined), out_deriv_precon(out_deriv.NumRows(), - out_deriv.NumCols()); + out_deriv.NumCols(), kUndefined); // each row of in_value_precon will be that same row of // in_value, but multiplied by the inverse of a Fisher // matrix that has been estimated from all the other rows, diff --git a/src/nnet-cpu/nnet-component.h b/src/nnet-cpu/nnet-component.h index 9dbcb4d49..743f9e09e 100644 --- a/src/nnet-cpu/nnet-component.h +++ b/src/nnet-cpu/nnet-component.h @@ -225,6 +225,8 @@ class NonlinearComponent: public Component { void Scale(BaseFloat scale); void Add(BaseFloat 
alpha, const NonlinearComponent &other); + // The following functions are unique to NonlinearComponent. + // They mostly relate to diagnostics. const Vector &ValueSum() const { return value_sum_; } const Vector &DerivSum() const { return deriv_sum_; } double Count() const { return count_; } @@ -324,6 +326,37 @@ class SoftmaxComponent: public NonlinearComponent { SoftmaxComponent &operator = (const SoftmaxComponent &other); // Disallow. }; +/// This layer just sums up groups of n inputs to produce one output. +class ReduceComponent: public Component { + public: + void Init(int32 dim, int32 n) { KALDI_ASSERT(dim > 0 && n > 0);dim_ = dim; n_ = n; } + ReduceComponent(int32 dim, int32 n) { Init(dim, n); } + ReduceComponent(): dim_(0), n_(0) { } // e.g. prior to Read() + explicit ReduceComponent(const ReduceComponent &other): + dim_(other.dim_), n_(other.n_) {} + virtual Component* Copy() const { return new ReduceComponent(*this); } + virtual std::string Type() const { return "ReduceComponent"; } + virtual int32 InputDim() const { return dim_; } + virtual int32 OutputDim() const { return (dim_ + n_ - 1) / n_; } + virtual void InitFromString(std::string args); + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + virtual void Propagate(const MatrixBase &in, + int32 num_chunks, + Matrix *out) const; + virtual void Backprop(const MatrixBase &in_value, + const MatrixBase &out_value, + const MatrixBase &out_deriv, + int32 num_chunks, + Component *to_update, // may be identical to "this". + Matrix *in_deriv) const; + virtual bool BackpropNeedsInput() const { return false; } + virtual bool BackpropNeedsOutput() const { return false; } + private: + int32 dim_; + int32 n_; +}; + // Affine means a linear function plus an offset. // Note: although this class can be instantiated, it also diff --git a/src/nnet-cpu/nnet-limit-rank.cc b/src/nnet-cpu/nnet-limit-rank.cc new file mode 100644 index 000000000..12ab967f4 --- /dev/null +++ b/src/nnet-cpu/nnet-limit-rank.cc @@ -0,0 +1,108 @@ +// nnet/nnet-limit-rank.cc + +// Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "nnet-cpu/nnet-limit-rank.h" +#include "thread/kaldi-task-sequence.h" + +namespace kaldi { + +class LimitRankClass { + public: + LimitRankClass(const NnetLimitRankOpts &opts, + int32 c, + Nnet *nnet): opts_(opts), c_(c), nnet_(nnet) { } + void operator () () { + AffineComponent *ac = dynamic_cast( + &(nnet_->GetComponent(c_))); + KALDI_ASSERT(ac != NULL); + + // We'll limit the rank of just the linear part, keeping the bias vector full. + Matrix M (ac->LinearParams()); + int32 rows = M.NumRows(), cols = M.NumCols(), rc_min = std::min(rows, cols); + Vector s(rc_min); + Matrix U(rows, rc_min), Vt(rc_min, cols); + // Do the destructive svd M = U diag(s) V^T. It actually outputs the transpose of V. 
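
For reference, the SVD-based truncation that this comment introduces (the DestructiveSvd / SortSvd / kCopyData-resize / MulRowsVec / AddMatMat sequence that follows) can be isolated into a free function. This is only a sketch mirroring those calls, with an illustrative name, not a drop-in replacement for LimitRankClass:

#include <algorithm>
#include "matrix/kaldi-matrix.h"
#include "matrix/kaldi-vector.h"

namespace kaldi {

// Replace M by its best rank-d approximation (d <= min(rows, cols)).
void LimitRankOfMatrix(int32 d, Matrix<BaseFloat> *M) {
  int32 rows = M->NumRows(), cols = M->NumCols(),
      rc_min = std::min(rows, cols);
  KALDI_ASSERT(d > 0 && d <= rc_min);
  Vector<BaseFloat> s(rc_min);
  Matrix<BaseFloat> U(rows, rc_min), Vt(rc_min, cols);
  M->DestructiveSvd(&s, &U, &Vt);  // M = U diag(s) Vt; destroys M's contents.
  SortSvd(&s, &U, &Vt);            // Largest singular values first.
  U.Resize(rows, d, kCopyData);
  s.Resize(d, kCopyData);
  Vt.Resize(d, cols, kCopyData);
  Vt.MulRowsVec(s);                // Vt <-- diag(s) * Vt.
  M->AddMatMat(1.0, U, kNoTrans, Vt, kNoTrans, 0.0);  // Rebuild M at rank d.
}

}  // namespace kaldi
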
+ M.DestructiveSvd(&s, &U, &Vt); + SortSvd(&s, &U, &Vt); // Sort the singular values from largest to smallest. + + int32 d = GetRetainedDim(rows, cols); + BaseFloat old_svd_sum = s.Sum(); + U.Resize(rows, d, kCopyData); + s.Resize(d, kCopyData); + Vt.Resize(d, cols, kCopyData); + BaseFloat new_svd_sum = s.Sum(); + KALDI_LOG << "For component " << c_ << " of dimension " << rows + << " x " << cols << ", reduced rank from " + << rc_min << " to " << d << ", SVD sum reduced from " + << old_svd_sum << " to " << new_svd_sum; + Vt.MulRowsVec(s); // Vt <-- diag(s) Vt. + M.AddMatMat(1.0, U, kNoTrans, Vt, kNoTrans, 0.0); // Reconstruct with reduced + // rank. + Vector bias_params(ac->BiasParams()); + ac->SetParams(bias_params, M); + } + + int32 GetRetainedDim(int32 rows, int32 cols) { + if (opts_.parameter_proportion <= 0.0 || opts_.parameter_proportion > 1.0) + KALDI_ERR << "bad --parameter-proportion " << opts_.parameter_proportion; + // If we do SVD to dimension d, so that it's U diag(s) V^T where + // U is rows * d, s is d, and V is cols * d, then the #params is as follows... + // the first column of U has free parameters (#rows - 1) [the -1 is due to + // the length constraint]; the second has (#rows - 2) [subtract 1 for the + // length constraint and one for orthogonality with the previous row], etc. + // Total is params(U) = (rows * d) - ((d(d+1))/2), + // params(s) = d, + // params(V) = (cols * d) - ((d(d+1))/2), + // So total is (rows + cols) * d - d * d . + // For example, if d = #rows, this equals (#rows * #cols) + // We are solving for: + // (rows * cols) * parameter_proportion = (rows + cols) * d - d * d, or + // d^2 - d * (rows + cols) + (rows*cols)*parameter_proportion + // In quadratic equation + // a = 1.0, + // b = -(rows + cols) + // c = rows * cols * parameter_proportion. + // Take smaller solution. + BaseFloat a = 1.0, b = -(rows + cols), + c = rows * cols * opts_.parameter_proportion; + BaseFloat x = (-b - sqrt(b * b - 4 * a * c)) / (2.0 * a); + int32 ans = static_cast(x); + KALDI_ASSERT(ans > 0 && ans <= std::min(rows, cols)); + return ans; + } + + ~LimitRankClass() { } + private: + const NnetLimitRankOpts &opts_; + int32 c_; + Nnet *nnet_; +}; + + +void LimitRankParallel(const NnetLimitRankOpts &opts, + Nnet *nnet) { + TaskSequencerConfig task_config; + task_config.num_threads = opts.num_threads; + TaskSequencer tc(task_config); + for (int32 c = 0; c < nnet->NumComponents(); c++) { + if (dynamic_cast(&(nnet->GetComponent(c))) != NULL) + tc.Run(new LimitRankClass(opts, c, nnet)); + } +} + + +} // namespace diff --git a/src/nnet-cpu/nnet-limit-rank.h b/src/nnet-cpu/nnet-limit-rank.h new file mode 100644 index 000000000..3e835049b --- /dev/null +++ b/src/nnet-cpu/nnet-limit-rank.h @@ -0,0 +1,56 @@ +// nnet-cpu/nnet-limit-rank.h + +// Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
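
The parameter-counting argument in GetRetainedDim() above boils down to solving d^2 - (rows + cols)*d + rows*cols*p = 0 and taking the smaller root. A quick stand-alone check of that arithmetic; the values of rows, cols and p below are arbitrary examples:

#include <cmath>
#include <cstdio>

int main() {
  int rows = 1000, cols = 400;
  double p = 0.75;  // proportion of the full parameter count to keep
  double a = 1.0, b = -(rows + cols), c = rows * cols * p;
  double d = (-b - std::sqrt(b * b - 4.0 * a * c)) / (2.0 * a);  // smaller root
  // Parameter count of the rank-d factorization, counted as above:
  // (rows + cols) * d - d * d, which should be close to p * rows * cols.
  double params = (rows + cols) * d - d * d;
  std::printf("d = %d, params/full = %.4f (target %.2f)\n",
              static_cast<int>(d), params / (rows * cols), p);
  return 0;
}
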
+ +#ifndef KALDI_NNET_CPU_NNET_LIMIT_RANK_H_ +#define KALDI_NNET_CPU_NNET_LIMIT_RANK_H_ + +#include "nnet-cpu/nnet-nnet.h" +#include "util/table-types.h" +#include "thread/kaldi-semaphore.h" +#include "thread/kaldi-thread.h" +#include "nnet-cpu/nnet-update.h" + +namespace kaldi { + +struct NnetLimitRankOpts { + int32 num_threads; + BaseFloat parameter_proportion; + + NnetLimitRankOpts(): num_threads(1), parameter_proportion(0.75) { } + + void Register(ParseOptions *po) { + po->Register("num-threads", &num_threads, "Number of threads used for " + "rank-limiting operation; note, will never use more than " + "#layers."); + po->Register("parameter-proportion", ¶meter_proportion, "Proportion of " + "dimension of each transform to limit the rank to."); + } +}; + + +/// This function limits the rank of each affine transform in the +/// neural net, by zeroing out the smallest singular values. The number of +/// singular values to zero out is determined on a layer by layer basis, using +/// "parameter_proportion" to set the proportion of parameters to remove. +void LimitRankParallel(const NnetLimitRankOpts &opts, + Nnet *nnet); + + + +} // namespace + +#endif // KALDI_NNET_CPU_NNET_LIMIT_RANK_H_ diff --git a/src/nnet-cpu/nnet-nnet.cc b/src/nnet-cpu/nnet-nnet.cc index 2c51ab9d3..ba00a4236 100644 --- a/src/nnet-cpu/nnet-nnet.cc +++ b/src/nnet-cpu/nnet-nnet.cc @@ -361,6 +361,17 @@ void Nnet::RemoveDropout() { KALDI_LOG << "Removed " << removed << " dropout components."; } +void Nnet::RemovePreconditioning() { + for (size_t i = 0; i < components_.size(); i++) { + if (dynamic_cast(components_[i]) != NULL) { + AffineComponent *ac = new AffineComponent( + *(dynamic_cast(components_[i]))); + delete components_[i]; + components_[i] = ac; + } + } +} + void Nnet::AddNnet(const VectorBase &scale_params, const Nnet &other) { KALDI_ASSERT(scale_params.Dim() == this->NumUpdatableComponents()); diff --git a/src/nnet-cpu/nnet-nnet.h b/src/nnet-cpu/nnet-nnet.h index 7656803f8..1aec44fa7 100644 --- a/src/nnet-cpu/nnet-nnet.h +++ b/src/nnet-cpu/nnet-nnet.h @@ -104,6 +104,10 @@ class Nnet { /// Excise any components of type DropoutComponent. void RemoveDropout(); + + /// Replace any components of type AffineComponentPreconditioned with + /// components of type AffineComponent. + void RemovePreconditioning(); /// For each updatatable component, adds to it /// the corresponding element of "other" times the diff --git a/src/nnet-cpu/nnet-precondition.cc b/src/nnet-cpu/nnet-precondition.cc index 82cf5eb18..8ba982d97 100644 --- a/src/nnet-cpu/nnet-precondition.cc +++ b/src/nnet-cpu/nnet-precondition.cc @@ -25,7 +25,13 @@ void PreconditionDirections(const MatrixBase &R, MatrixBase *P) { int32 N = R.NumRows(), D = R.NumCols(); - KALDI_ASSERT(SameDim(R, *P) && N > 1); + KALDI_ASSERT(SameDim(R, *P) && N > 0); + if (N == 1) { + KALDI_WARN << "Trying to precondition set of only one frames: returning " + << "unchanged. Ignore this warning if infrequent."; + P->CopyFromMat(R); + return; + } MatrixBase &Q = *P; if (N >= D) { diff --git a/src/nnet-cpu/nnet-stats.h b/src/nnet-cpu/nnet-stats.h new file mode 100644 index 000000000..604a18622 --- /dev/null +++ b/src/nnet-cpu/nnet-stats.h @@ -0,0 +1,93 @@ +// nnet-cpu/nnet-stats.h + +// Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_NNET_CPU_NNET_FIX_H_ +#define KALDI_NNET_CPU_NNET_FIX_H_ + +#include "nnet-cpu/nnet-nnet.h" + +namespace kaldi { + +/* This program computes various statistics from a neural net. These are + summaries of certain quantities already present in the network as + stored on disk, especially regarding certain average values and + derivatives of the sigmoids. +*/ + +struct NnetStatsConfig { + BaseFloat bucket_width; + NnetStatsConfig(): bucket_width(0.025) { } + + void Register(ParseOptions *po) { + po->Register("bucket-width", &bucket_width, "Width of bucket in average-derivative " + "stats for analysis."); + } +}; + +class NnetStats { + public: + NnetStats(int32 affine_component_index, BaseFloat bucket_width): + affine_component_index_(affine_component_index), + bucket_width_(bucket_width), global_(0, -1) { } + + // Use default copy constructor and assignment operator. + + void AddStats(BaseFloat avg_deriv, BaseFloat avg_value); + + void AddStatsFromNnet(const Nnet &nnet); + + void PrintStats(std::ostream &os); + private: + + struct StatsElement { + BaseFloat deriv_begin; // avg-deriv, beginning of bucket. + BaseFloat deriv_end; // avg-deriv, end of bucket. + BaseFloat deriv_sum; // sum of avg-deriv within bucket. + BaseFloat deriv_sumsq; // Sum-squared of avg-deriv within bucket. + BaseFloat abs_value_sum; // Sum of abs(avg-value). Tells us whether it's + // saturating at one or both ends. + BaseFloat abs_value_sumsq; // Sum-squared of abs(avg-value). + int32 count; // Number of nonlinearities in this bucket. + + StatsElement(BaseFloat deriv_begin, + BaseFloat deriv_end): + deriv_begin(deriv_begin), deriv_end(deriv_end), deriv_sum(0.0), + deriv_sumsq(0.0), abs_value_sum(0.0), abs_value_sumsq(0.0), count(0) { } + void AddStats(BaseFloat avg_deriv, BaseFloat avg_value); + // Outputs stats for this bucket; no newline + void PrintStats(std::ostream &os); + }; + int32 BucketFor(BaseFloat avg_deriv); // returns the bucket + // for this avg-derivative value, and makes sure it is allocated. + + int32 affine_component_index_; // Component index of the affine component + // associated with this nonlinearity. + BaseFloat bucket_width_; // width of buckets of stats we store (in derivative values). + + std::vector buckets_; // Stats divided into buckets by avg_deriv. + StatsElement global_; // All the stats. + +}; + +void GetNnetStats(const NnetStatsConfig &config, + const Nnet &nnet, + std::vector *stats); + + +} // namespace + +#endif // KALDI_NNET_CPU_NNET_FIX_H_ diff --git a/src/nnet-cpu/nnet-update.cc b/src/nnet-cpu/nnet-update.cc index 755624d11..80d83e3a0 100644 --- a/src/nnet-cpu/nnet-update.cc +++ b/src/nnet-cpu/nnet-update.cc @@ -44,7 +44,6 @@ class NnetUpdater { // Possibly splices input together from forward_data_[component]. // MatrixBase &GetSplicedInput(int32 component, Matrix *temp_matrix); - void Propagate(); /// Computes objective function and derivative at output layer. 
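
Each StatsElement above keeps only running sums (deriv_sum, deriv_sumsq, abs_value_sum, abs_value_sumsq) and a count per bucket, which is enough to report a mean and standard deviation when the stats are printed. A small stand-alone helper showing that standard computation; MeanAndStddev is illustrative and the real printing code lives in nnet-stats.cc, which is not quoted in this hunk:

#include <cmath>
#include <cstdio>

// Mean and standard deviation recovered from running sums, as one could do
// when printing per-bucket stats such as deriv_sum / deriv_sumsq / count.
void MeanAndStddev(double sum, double sumsq, int count,
                   double *mean, double *stddev) {
  *mean = sum / count;
  double variance = sumsq / count - (*mean) * (*mean);
  if (variance < 0.0) variance = 0.0;  // guard against rounding error
  *stddev = std::sqrt(variance);
}

int main() {
  // e.g. three average-derivative values 0.10, 0.12, 0.11 landing in a bucket:
  double sum = 0.33, sumsq = 0.10 * 0.10 + 0.12 * 0.12 + 0.11 * 0.11;
  double mean, stddev;
  MeanAndStddev(sum, sumsq, 3, &mean, &stddev);
  std::printf("mean = %.4f stddev = %.4f\n", mean, stddev);
  return 0;
}
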
@@ -153,10 +152,10 @@ void NnetUpdater::Backprop(const std::vector &data, &output = forward_data_[c+1]; Matrix input_deriv(input.NumRows(), input.NumCols()); const Matrix &output_deriv(*deriv); - + component.Backprop(input, output, output_deriv, num_chunks, component_to_update, &input_deriv); - *deriv = input_deriv; + input_deriv.Swap(deriv); } } diff --git a/src/nnet-cpu/rescale-nnet.cc b/src/nnet-cpu/rescale-nnet.cc new file mode 100644 index 000000000..682de723b --- /dev/null +++ b/src/nnet-cpu/rescale-nnet.cc @@ -0,0 +1,212 @@ +// nnet/rescale-nnet.cc + +// Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "nnet-cpu/rescale-nnet.h" + +namespace kaldi { + + +class NnetRescaler { + public: + NnetRescaler(const NnetRescaleConfig &config, + const std::vector &examples, + Nnet *nnet): + config_(config), examples_(examples), nnet_(nnet) {} + + void Rescale(); + + private: + /// takes the input and formats as a single matrix, in forward_data_[0]. + void FormatInput(const std::vector &data, + Matrix *input); + void RescaleComponent(int32 c, int32 num_chunks, + MatrixBase *cur_data_in, + Matrix *next_data); + + void ComputeRelevantIndexes(); + + BaseFloat GetTargetAvgDeriv(int32 c); + + const NnetRescaleConfig &config_; + const std::vector &examples_; + Nnet *nnet_; + std::set relevant_indexes_; // values of c with AffineComponent followed + // by (at c+1) NonlinearComponent that is not SoftmaxComponent. +}; + + +void NnetRescaler::FormatInput(const std::vector &data, + Matrix *input) { + KALDI_ASSERT(data.size() > 0); + int32 num_splice = nnet_->LeftContext() + 1 + nnet_->RightContext(); + KALDI_ASSERT(data[0].input_frames.NumRows() == num_splice); + + int32 feat_dim = data[0].input_frames.NumCols(), + spk_dim = data[0].spk_info.Dim(), + tot_dim = feat_dim + spk_dim; // we append these at the neural net + // input... note, spk_dim might be 0. 
+ KALDI_ASSERT(tot_dim == nnet_->InputDim()); + int32 num_chunks = data.size(); + + input->Resize(num_splice * num_chunks, + tot_dim); + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + SubMatrix dest(*input, + chunk * num_splice, num_splice, + 0, feat_dim); + const Matrix &src(data[chunk].input_frames); + dest.CopyFromMat(src); + if (spk_dim != 0) { + SubMatrix spk_dest(*input, + chunk * num_splice, num_splice, + feat_dim, spk_dim); + spk_dest.CopyRowsFromVec(data[chunk].spk_info); + } + } +} + +void NnetRescaler::ComputeRelevantIndexes() { + for (int32 c = 0; c + 1 < nnet_->NumComponents(); c++) + if (dynamic_cast(&nnet_->GetComponent(c)) != NULL && + (dynamic_cast(&nnet_->GetComponent(c+1)) != NULL && + dynamic_cast(&nnet_->GetComponent(c+1)) == NULL)) + relevant_indexes_.insert(c); +} + + +BaseFloat NnetRescaler::GetTargetAvgDeriv(int32 c) { + KALDI_ASSERT(relevant_indexes_.count(c) == 1); + BaseFloat factor; + if (dynamic_cast(&(nnet_->GetComponent(c + 1))) != NULL) + factor = 0.25; + else if (dynamic_cast(&(nnet_->GetComponent(c + 1))) != NULL) + factor = 1.0; + else + KALDI_ERR << "This type of nonlinear component is not handled: index " << c; + + int32 last_c = *std::max_element(relevant_indexes_.begin(), relevant_indexes_.end()), + first_c = *std::min_element(relevant_indexes_.begin(), relevant_indexes_.end()); + if (c == first_c) + return factor * config_.target_first_layer_avg_deriv; + else if (c == last_c) + return factor * config_.target_last_layer_avg_deriv; + else + return factor * config_.target_avg_deriv; +} + +// Here, c is the index of the affine component, and +// c + 1 is the index of the nonlinear component; *cur_data is the +// output of the affine component. +void NnetRescaler::RescaleComponent( + int32 c, + int32 num_chunks, + MatrixBase *cur_data_in, + Matrix *next_data) { + int32 rows = cur_data_in->NumRows(), cols = cur_data_in->NumCols(); + // Only handle sigmoid or tanh here. + if (dynamic_cast(&(nnet_->GetComponent(c + 1))) == NULL && + dynamic_cast(&(nnet_->GetComponent(c + 1))) == NULL) + KALDI_ERR << "This type of nonlinear component is not handled: index " << c; + // the nonlinear component: + NonlinearComponent &nc = + *(dynamic_cast(&(nnet_->GetComponent(c + 1)))); + + BaseFloat orig_avg_deriv, target_avg_deriv = GetTargetAvgDeriv(c); + BaseFloat cur_scaling = 1.0; // current rescaling factor (on input). + int32 num_iters = 10; + + Matrix cur_data(*cur_data_in), + ones(rows, cols), in_deriv(rows, cols); + + ones.Set(1.0); + nc.Propagate(cur_data, num_chunks, next_data); + nc.Backprop(cur_data, *next_data, ones, num_chunks, NULL, &in_deriv); + BaseFloat cur_avg_deriv; + cur_avg_deriv = in_deriv.Sum() / (rows * cols); + orig_avg_deriv = cur_avg_deriv; + for (int32 iter = 0; iter < num_iters; iter++) { + // We already have "cur_avg_deriv"; perturb the scale and compute + // the next avg_deriv, so we can see how it changes with the scale. + cur_data.CopyFromMat(*cur_data_in); + cur_data.Scale(cur_scaling + config_.delta); + nc.Propagate(cur_data, num_chunks, next_data); + nc.Backprop(cur_data, *next_data, ones, num_chunks, NULL, &in_deriv); + BaseFloat next_avg_deriv = in_deriv.Sum() / (rows * cols); + KALDI_ASSERT(next_avg_deriv < cur_avg_deriv); + // "gradient" is how avg_deriv changes as we change the scale. + // should be negative. 
+ BaseFloat gradient = (next_avg_deriv - cur_avg_deriv) / config_.delta; + KALDI_ASSERT(gradient < 0.0); + BaseFloat proposed_change = (target_avg_deriv - cur_avg_deriv) / gradient; + KALDI_VLOG(2) << "cur_avg_deriv = " << cur_avg_deriv << ", target_avg_deriv = " + << target_avg_deriv << ", gradient = " << gradient + << ", proposed_change " << proposed_change; + // Limit size of proposed change in "cur_scaling", to ensure stability. + if (fabs(proposed_change / cur_scaling) > config_.max_change) + proposed_change = cur_scaling * config_.max_change * + (proposed_change > 0.0 ? 1.0 : -1.0); + cur_scaling += proposed_change; + + cur_data.CopyFromMat(*cur_data_in); + cur_data.Scale(cur_scaling); + nc.Propagate(cur_data, num_chunks, next_data); + nc.Backprop(cur_data, *next_data, ones, num_chunks, NULL, &in_deriv); + cur_avg_deriv = in_deriv.Sum() / (rows * cols); + if (fabs(proposed_change) < config_.min_change) break; // Terminate the + // optimization + } + UpdatableComponent *uc = dynamic_cast( + &nnet_->GetComponent(c)); + KALDI_ASSERT(uc != NULL); + uc->Scale(cur_scaling); // scale the parameters of the previous + // AffineComponent. + + KALDI_LOG << "For component " << c << ", scaling parameters by " + << cur_scaling << "; average " + << "derivative changed from " << orig_avg_deriv << " to " + << cur_avg_deriv << "; target was " << target_avg_deriv; +} + + + +void NnetRescaler::Rescale() { + ComputeRelevantIndexes(); // set up relevant_indexes_. + Matrix cur_data, next_data; + FormatInput(examples_, &cur_data); + int32 num_chunks = examples_.size(); + for (int32 c = 0; c < nnet_->NumComponents(); c++) { + Component &component = nnet_->GetComponent(c); + if (relevant_indexes_.count(c - 1) == 1) { + // the following function call also appropriately sets "next_data" + // after doing the rescaling + RescaleComponent(c - 1, num_chunks, &cur_data, &next_data); + } else { + component.Propagate(cur_data, num_chunks, &next_data); + } + cur_data.Swap(&next_data); + } +} + +void RescaleNnet(const NnetRescaleConfig &rescale_config, + const std::vector &examples, + Nnet *nnet) { + NnetRescaler rescaler(rescale_config, examples, nnet); + rescaler.Rescale(); +} + + +} // namespace diff --git a/src/nnet-cpu/rescale-nnet.h b/src/nnet-cpu/rescale-nnet.h new file mode 100644 index 000000000..f8c6ecced --- /dev/null +++ b/src/nnet-cpu/rescale-nnet.h @@ -0,0 +1,76 @@ +// nnet-cpu/rescale-nnet.h + +// Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_NNET_CPU_RESCALE_NNET_H_ +#define KALDI_NNET_CPU_RESCALE_NNET_H_ + +#include "nnet-cpu/nnet-update.h" +#include "nnet-cpu/nnet-compute.h" +#include "util/parse-options.h" + +// Neural net rescaling is a rescaling of the parameters of the various layers +// of a neural net, done so as to match certain specified statistics on the +// average derivative of the sigmoid, measured on sample data. 
+// how "saturated" the sigmoid is.
+
+namespace kaldi {
+
+
+struct NnetRescaleConfig {
+  BaseFloat target_avg_deriv;
+  BaseFloat target_first_layer_avg_deriv;
+  BaseFloat target_last_layer_avg_deriv;
+
+  // These are relatively unimportant; for now they have no
+  // command line options.
+  BaseFloat num_iters;
+  BaseFloat delta;
+  BaseFloat max_change;  // maximum change on any one iteration (to
+                         // ensure stability).
+  BaseFloat min_change;  // minimum change on any one iteration (controls
+                         // termination).
+
+  NnetRescaleConfig(): target_avg_deriv(0.2),
+                       target_first_layer_avg_deriv(0.3),
+                       target_last_layer_avg_deriv(0.1),
+                       num_iters(10),
+                       delta(0.01),
+                       max_change(0.2), min_change(1.0e-05) { }
+
+  void Register(ParseOptions *po) {
+    po->Register("target-avg-deriv", &target_avg_deriv, "Target average derivative "
+                 "for hidden layers that are not the first or last hidden layer "
+                 "(as fraction of maximum derivative of the nonlinearity)");
+    po->Register("target-first-layer-avg-deriv", &target_first_layer_avg_deriv,
+                 "Target average derivative for the first hidden layer "
+                 "(as fraction of maximum derivative of the nonlinearity)");
+    po->Register("target-last-layer-avg-deriv", &target_last_layer_avg_deriv,
+                 "Target average derivative for the last hidden layer, if "
+                 "#hid-layers > 1 "
+                 "(as fraction of maximum derivative of the nonlinearity)");
+  }
+};
+
+void RescaleNnet(const NnetRescaleConfig &rescale_config,
+                 const std::vector<NnetTrainingExample> &examples,
+                 Nnet *nnet);
+
+
+
+} // namespace
+
+#endif
diff --git a/src/nnet-cpubin/Makefile b/src/nnet-cpubin/Makefile
index 25e937288..6583f9d07 100644
--- a/src/nnet-cpubin/Makefile
+++ b/src/nnet-cpubin/Makefile
@@ -13,7 +13,8 @@ BINFILES = nnet-randomize-frames nnet-am-info nnet-train nnet-init \
    nnet-train-lbfgs nnet-get-egs nnet-train-parallel nnet-gradient \
    nnet-get-preconditioner nnet-precondition nnet-select-egs nnet-combine-fast \
    nnet-subset-egs nnet-shuffle-egs nnet-am-fix nnet-logprob nnet-logprob2 \
-   nnet-logprob2-parallel nnet-logprob-parallel
+   nnet-logprob2-parallel nnet-logprob-parallel nnet-am-stats nnet-am-rescale \
+   nnet-am-limit-rank
 
 OBJFILES =
diff --git a/src/nnet-cpubin/nnet-am-copy.cc b/src/nnet-cpubin/nnet-am-copy.cc
index cd458a7ac..b3ab7bfcf 100644
--- a/src/nnet-cpubin/nnet-am-copy.cc
+++ b/src/nnet-cpubin/nnet-am-copy.cc
@@ -41,6 +41,7 @@ int main(int argc, char *argv[]) {
     int32 truncate = -1;
     bool binary_write = true;
     bool remove_dropout = false;
+    bool remove_preconditioning = false;
     BaseFloat learning_rate_factor = 1.0, learning_rate = -1;
     std::string learning_rates = "";
     std::string scales = "";
@@ -64,6 +65,8 @@ int main(int argc, char *argv[]) {
                 "to this many components by removing the last components.");
     po.Register("remove-dropout", &remove_dropout, "Set this to true to remove "
                 "any dropout components.");
+    po.Register("remove-preconditioning", &remove_preconditioning, "Set this to true to replace "
+                "components of type AffineComponentPreconditioned with AffineComponent.");
     po.Register("stats-from", &stats_from, "Before copying neural net, copy the "
                 "statistics in any layer of type NonlinearComponent, from this "
                 "neural network: provide the extended filename.");
@@ -133,6 +136,8 @@ int main(int argc, char *argv[]) {
     if (remove_dropout) am_nnet.GetNnet().RemoveDropout();
 
+    if (remove_preconditioning) am_nnet.GetNnet().RemovePreconditioning();
+
     if (stats_from != "") {
       // Copy the stats associated with the layers descending from
       // NonlinearComponent.
diff --git a/src/nnet-cpubin/nnet-am-fix.cc b/src/nnet-cpubin/nnet-am-fix.cc
index d8b92992a..24c1d8785 100644
--- a/src/nnet-cpubin/nnet-am-fix.cc
+++ b/src/nnet-cpubin/nnet-am-fix.cc
@@ -41,7 +41,7 @@ int main(int argc, char *argv[]) {
         "e.g.:\n"
         " nnet-am-fix 1.mdl 1_fixed.mdl\n"
         "or:\n"
-        " nnet-am-shrink-rows --get-counts-from=1.gradient 1.mdl 1_shrunk.mdl\n";
+        " nnet-am-fix --get-counts-from=1.gradient 1.mdl 1_shrunk.mdl\n";
 
     bool binary_write = true;
     NnetFixConfig config;
diff --git a/src/nnet-cpubin/nnet-am-limit-rank.cc b/src/nnet-cpubin/nnet-am-limit-rank.cc
new file mode 100644
index 000000000..c5f34f1c9
--- /dev/null
+++ b/src/nnet-cpubin/nnet-am-limit-rank.cc
@@ -0,0 +1,81 @@
+// nnet-cpubin/nnet-am-limit-rank.cc
+
+// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet-cpu/nnet-limit-rank.h"
+#include "nnet-cpu/am-nnet.h"
+#include "hmm/transition-model.h"
+#include "tree/context-dep.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    typedef kaldi::int32 int32;
+
+    const char *usage =
+        "Copy a (cpu-based) neural net and its associated transition model,\n"
+        "but modify it to reduce the effective parameter count by limiting\n"
+        "the rank of weight matrices.\n"
+        "\n"
+        "Usage: nnet-am-limit-rank [options] <nnet-in> <nnet-out>\n"
+        "e.g.:\n"
+        " nnet-am-limit-rank 1.mdl 1_limited.mdl\n";
+
+
+    bool binary_write = true;
+    NnetLimitRankOpts config;
+
+    ParseOptions po(usage);
+    po.Register("binary", &binary_write, "Write output in binary mode");
+    config.Register(&po);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string nnet_rxfilename = po.GetArg(1),
+        nnet_wxfilename = po.GetArg(2);
+
+    TransitionModel trans_model;
+    AmNnet am_nnet;
+    {
+      bool binary;
+      Input ki(nnet_rxfilename, &binary);
+      trans_model.Read(ki.Stream(), binary);
+      am_nnet.Read(ki.Stream(), binary);
+    }
+
+    LimitRankParallel(config, &am_nnet.GetNnet());
+
+    {
+      Output ko(nnet_wxfilename, binary_write);
+      trans_model.Write(ko.Stream(), binary_write);
+      am_nnet.Write(ko.Stream(), binary_write);
+    }
+    KALDI_LOG << "Copied neural net from " << nnet_rxfilename
+              << " to " << nnet_wxfilename;
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
diff --git a/src/nnet-cpubin/nnet-am-rescale.cc b/src/nnet-cpubin/nnet-am-rescale.cc
new file mode 100644
index 000000000..17e78bfc8
--- /dev/null
+++ b/src/nnet-cpubin/nnet-am-rescale.cc
@@ -0,0 +1,92 @@
+// nnet-cpubin/nnet-am-rescale.cc
+
+// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet-cpu/rescale-nnet.h"
+#include "nnet-cpu/am-nnet.h"
+#include "hmm/transition-model.h"
+#include "tree/context-dep.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    typedef kaldi::int32 int32;
+
+    const char *usage =
+        "Rescale the parameters in a neural net to achieve certain target\n"
+        "statistics, relating to the average derivative of the sigmoids\n"
+        "measured at some supplied data.  This relates to how saturated\n"
+        "the sigmoids are (we try to match the statistics of `good' neural\n"
+        "nets).\n"
+        "\n"
+        "Usage: nnet-am-rescale [options] <nnet-in> <examples-in> <nnet-out>\n"
+        "e.g.:\n"
+        " nnet-am-rescale 1.mdl valid.egs 1_rescaled.mdl\n";
+
+    bool binary_write = true;
+    NnetRescaleConfig config;
+
+    ParseOptions po(usage);
+    po.Register("binary", &binary_write, "Write output in binary mode");
+    config.Register(&po);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string nnet_rxfilename = po.GetArg(1),
+        egs_rspecifier = po.GetArg(2),
+        nnet_wxfilename = po.GetArg(3);
+
+    TransitionModel trans_model;
+    AmNnet am_nnet;
+    {
+      bool binary;
+      Input ki(nnet_rxfilename, &binary);
+      trans_model.Read(ki.Stream(), binary);
+      am_nnet.Read(ki.Stream(), binary);
+    }
+
+    std::vector<NnetTrainingExample> egs;
+
+    // This block adds samples to "egs".
+    SequentialNnetTrainingExampleReader example_reader(
+        egs_rspecifier);
+    for (; !example_reader.Done(); example_reader.Next())
+      egs.push_back(example_reader.Value());
+    KALDI_LOG << "Read " << egs.size() << " examples.";
+    KALDI_ASSERT(!egs.empty());
+
+    RescaleNnet(config, egs, &am_nnet.GetNnet());
+
+    {
+      Output ko(nnet_wxfilename, binary_write);
+      trans_model.Write(ko.Stream(), binary_write);
+      am_nnet.Write(ko.Stream(), binary_write);
+    }
+    KALDI_LOG << "Rescaled neural net and wrote it to " << nnet_wxfilename;
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
diff --git a/src/nnet-cpubin/nnet-am-stats.cc b/src/nnet-cpubin/nnet-am-stats.cc
new file mode 100644
index 000000000..b078090af
--- /dev/null
+++ b/src/nnet-cpubin/nnet-am-stats.cc
@@ -0,0 +1,72 @@
+// nnet-cpubin/nnet-am-stats.cc
+
+// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet-cpu/nnet-stats.h"
+#include "nnet-cpu/am-nnet.h"
+#include "hmm/transition-model.h"
+#include "tree/context-dep.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    typedef kaldi::int32 int32;
+
+    const char *usage =
+        "Print some statistics about the average derivatives of the sigmoid layers\n"
+        "of the neural net, that are stored in the net\n"
+        "\n"
+        "Usage: nnet-am-stats [options] <nnet-in>\n"
+        "e.g.:\n"
+        " nnet-am-stats 1.mdl\n";
+
+    NnetStatsConfig config;
+
+    ParseOptions po(usage);
+    config.Register(&po);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 1) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string nnet_rxfilename = po.GetArg(1);
+
+    TransitionModel trans_model;
+    AmNnet am_nnet;
+    {
+      bool binary;
+      Input ki(nnet_rxfilename, &binary);
+      trans_model.Read(ki.Stream(), binary);
+      am_nnet.Read(ki.Stream(), binary);
+    }
+
+    std::vector<NnetStats> stats;
+    GetNnetStats(config, am_nnet.GetNnet(), &stats);
+    KALDI_ASSERT(!stats.empty());
+    for (size_t i = 0; i < stats.size(); i++)
+      stats[i].PrintStats(std::cout);
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
diff --git a/src/nnet-cpubin/nnet-subset-egs.cc b/src/nnet-cpubin/nnet-subset-egs.cc
index f4a957886..a8958d1e9 100644
--- a/src/nnet-cpubin/nnet-subset-egs.cc
+++ b/src/nnet-cpubin/nnet-subset-egs.cc
@@ -85,7 +85,7 @@ int main(int argc, char *argv[]) {
     KALDI_LOG << "Selected a subset of " << egs.size() << " out of "
               << num_read << " neural-network training examples ";
 
-    return (static_cast<size_t>(n) == egs.size() ? 0 : 1);
+    return (num_read != 0 ? 0 : 1);
   } catch(const std::exception &e) {
     std::cerr << e.what() << '\n';
     return -1;
diff --git a/src/nnet/nnet-cache-tgtmat.cc b/src/nnet/nnet-cache-tgtmat.cc
index 5a42bf1e8..c70dc2513 100644
--- a/src/nnet/nnet-cache-tgtmat.cc
+++ b/src/nnet/nnet-cache-tgtmat.cc
@@ -88,8 +88,8 @@ void CacheTgtMat::AddData(const CuMatrix &features, const CuMatrix &features, const std::vector &net_out, const CuMatrix &t
   diff->Resize(net_out.NumRows(), net_out.NumCols());
   // compute derivative wrt. activations of last layer of neurons
-  diff->CopyFromMat(net_out);
+  *diff = net_out;
   diff->AddMat(-1.0, target);
 
   // we'll not produce per-frame classification accuracy for soft labels
@@ -40,7 +40,8 @@ void Xent::Eval(const CuMatrix<BaseFloat> &net_out, const CuMatrix<BaseFloat> &t
   // :TODO: reimplement when needed
   // compute xentropy (ON CPU)
-  Matrix<BaseFloat> target_host, net_out_host;
+  Matrix<BaseFloat> target_host(target.NumRows(), target.NumCols(), kUndefined),
+      net_out_host(net_out.NumRows(), net_out.NumCols(), kUndefined);
   target.CopyToMat(&target_host);
   net_out.CopyToMat(&net_out_host);
   BaseFloat val;
@@ -69,7 +70,7 @@ void Xent::EvalVec(const CuMatrix<BaseFloat> &net_out, const std::vector
   // get the xentropy and global error
   target_device_.CopyFromVec(target);
   if(&net_out != diff) {
-    diff->CopyFromMat(net_out);
+    *diff = net_out;
   }
   cu::DiffXent(target_device_, diff, &log_post_tgt_);
   //
@@ -83,7 +84,8 @@ void Xent::EvalVec(const CuMatrix<BaseFloat> &net_out, const std::vector
   // The frame-level xentropy statistics are computed as:
   // log(sum_row(net_out.*target_mat)))
   // they now are stored in vector log_post_tgt_
-  //
+  //
+  log_post_tgt_host_.Resize(log_post_tgt_.Dim());
   log_post_tgt_.CopyToVec(&log_post_tgt_host_);
   loss_ -= log_post_tgt_host_.Sum();
@@ -110,9 +112,10 @@ std::string Xent::Report() {
 void Mse::Eval(const CuMatrix<BaseFloat> &net_out, const CuMatrix<BaseFloat> &target, CuMatrix<BaseFloat> *diff) {
   KALDI_ASSERT(net_out.NumCols() == target.NumCols());
   KALDI_ASSERT(net_out.NumRows() == target.NumRows());
-  diff->Resize(net_out.NumRows(), net_out.NumCols());
+  // compute derivative w.r.t. neural network outputs
+  diff->Resize(net_out.NumRows(), net_out.NumCols());
   diff->CopyFromMat(net_out);
   diff->AddMat(-1.0, target);
@@ -147,9 +150,9 @@ std::string Mse::Report() {
 void MseProgress::Eval(const CuMatrix<BaseFloat>& net_out, const CuMatrix<BaseFloat>& target, CuMatrix<BaseFloat>* diff) {
   KALDI_ASSERT(net_out.NumCols() == target.NumCols());
   KALDI_ASSERT(net_out.NumRows() == target.NumRows());
-  diff->Resize(net_out.NumRows(),net_out.NumCols()); //compute derivative w.r.t. neural nerwork outputs
+  diff->Resize(net_out.NumRows(),net_out.NumCols());
   diff->CopyFromMat(net_out);
   diff->AddMat(-1.0,target);
diff --git a/src/nnetbin/cmvn-to-nnet.cc b/src/nnetbin/cmvn-to-nnet.cc
index 807ac3460..5140533bf 100644
--- a/src/nnetbin/cmvn-to-nnet.cc
+++ b/src/nnetbin/cmvn-to-nnet.cc
@@ -94,8 +94,7 @@ int main(int argc, char *argv[]) {
     //the pointer will be given to the nnet, so we don't need to call delete
 
     //convert Vector to CuVector
-    CuVector<BaseFloat> cu_shift;
-    cu_shift.CopyFromVec(shift);
+    CuVector<BaseFloat> cu_shift(shift);
 
     //set the weights
     shift_component->SetShiftVec(cu_shift);
@@ -110,8 +109,7 @@ int main(int argc, char *argv[]) {
     //the pointer will be given to the nnet, so we don't need to call delete
 
     //convert Vector to CuVector
-    CuVector<BaseFloat> cu_scale;
-    cu_scale.CopyFromVec(scale);
+    CuVector<BaseFloat> cu_scale(scale);
 
     //set the weights
     scale_component->SetScaleVec(cu_scale);
diff --git a/src/nnetbin/nnet-forward.cc b/src/nnetbin/nnet-forward.cc
index ee7ff09f7..2eabbc638 100644
--- a/src/nnetbin/nnet-forward.cc
+++ b/src/nnetbin/nnet-forward.cc
@@ -129,6 +129,7 @@ int main(int argc, char *argv[]) {
       }
 
       // push priors to GPU
+      priors.Resize(tmp_priors.Dim());
       priors.CopyFromVec(tmp_priors);
     }
@@ -150,7 +151,7 @@ int main(int argc, char *argv[]) {
         }
       }
       // push it to gpu
-      feats.CopyFromMat(mat);
+      feats = mat;
       // fwd-pass
       nnet_transf.Feedforward(feats, &feats_transf);
       nnet.Feedforward(feats_transf, &nnet_out);
@@ -169,7 +170,8 @@ int main(int argc, char *argv[]) {
        }
      }
 
-      //download from GPU
+      //download from GPU
+      nnet_out_host.Resize(nnet_out.NumRows(), nnet_out.NumCols());
       nnet_out.CopyToMat(&nnet_out_host);
       //check for NaN/inf
       for(int32 r=0; r 0.0) {
-        Matrix<BaseFloat> mat2;
+        Matrix<BaseFloat> mat2(feats_transf.NumRows(), feats_transf.NumCols(),
+                               kUndefined);
         feats_transf.CopyToMat(&mat2);
         for(int32 r=mat2.NumRows()-1; r >= 0; r--) {
           if(RandUniform() < drop_data) {
diff --git a/src/nnetbin/transf-to-nnet.cc b/src/nnetbin/transf-to-nnet.cc
index f85460db0..f607338ec 100644
--- a/src/nnetbin/transf-to-nnet.cc
+++ b/src/nnetbin/transf-to-nnet.cc
@@ -62,8 +62,7 @@ int main(int argc, char *argv[]) {
     //the pointer will be given to the nnet, so we don't need to call delete
 
     //convert Matrix to CuMatrix
-    CuMatrix<BaseFloat> cu_transform;
-    cu_transform.CopyFromMat(transform);
+    CuMatrix<BaseFloat> cu_transform(transform);
 
     //set the weights
     layer->SetLinearity(cu_transform);
diff --git a/windows/INSTALL b/windows/INSTALL
index d651486ed..e5e531f5b 100644
--- a/windows/INSTALL
+++ b/windows/INSTALL
@@ -17,7 +17,7 @@
 # Also we have not been checking that the code compiles in Visual Studio.
 # If anyone would like to maintain the Windows setup, we would like that,
 # but unfortunately, the situation right now is that it is not being
-# maintained.
+# maintained.
 
 (A) Installing the Windows version of OpenFst.  This is maintained by
 Paul Dixon; it has some small code changes versus