A lot of changes: script changes relating to neural nets (more efficient I/O, slightly better WERs); various new nnet functionality; improved interfaces for some feature-related binaries.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@1976 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Dan Povey 2013-02-04 21:57:06 +00:00
Parent c1944a7209
Commit f699fd2be1
65 changed files: 1879 additions and 462 deletions

View file

@ -1,11 +1,5 @@
#!/bin/bash
# CAUTION: I changed e.g. 1.trans to trans.1 in the scripts. If you ran it
# part-way through prior to this, to convert to the new naming
# convention, run:
# for x in `find . -name '*.trans'`; do mv $x `echo $x | perl -ane 's/(\d+)\.trans/trans.$1/;print;'`; done
# but be careful as this will not follow soft links.
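# A hypothetical safer variant of the rename above (not from the original
# script): restrict it to regular files so that soft links are left untouched:
# find . -type f -name '*.trans' | while read -r x; do
#   mv "$x" "`echo $x | perl -pe 's/(\d+)\.trans$/trans.$1/'`"
# done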
. cmd.sh
# call the next line with the directory where the RM data is

View file

@ -1,5 +1,2 @@
beam=11.0 # beam for decoding. Was 13.0 in the scripts.
first_beam=8.0 # beam for 1st-pass decoding in SAT.

View file

@ -21,8 +21,8 @@
)
# Here are the results (copied from RESULTS file)
#exp/nnet6a/decode_train_dev/wer_10:%WER 24.87 [ 12053 / 48460, 1590 ins, 3017 del, 7446 sub ]
#exp/nnet6a/decode_eval2000/score_10/eval2000.ctm.filt.sys: | Sum/Avg | 4459 42989 | 77.1 16.0 6.9 2.7 25.6 62.6 |
#exp/nnet6a/decode_train_dev/wer_11:%WER 24.30 [ 11774 / 48460, 1619 ins, 2877 del, 7278 sub ]
#exp/nnet6a/decode_eval2000/score_10/eval2000.ctm.filt.sys: | Sum/Avg | 4459 42989 | 77.8 16.0 6.3 3.0 25.3 62.6 |
# Here are some older results when the system had 2k not 4k leaves and ran from a worse SAT

View file

@ -191,6 +191,7 @@ exp/tri4a_dnn/decode_bd_tgpr_eval92/wer_10:%WER 4.00 [ 226 / 5643, 34 ins, 12 de
# and for eval92 is 3.79, the same system. (On this setup, discriminative training helped a lot,
# which seems to be the reason we can't beat the SGMM+MMI numbers here.)
exp/nnet5c1/decode_bd_tgpr_dev93/wer_10:%WER 7.48 [ 616 / 8234, 73 ins, 98 del, 445 sub ]
exp/nnet5c1/decode_bd_tgpr_eval92/wer_11:%WER 4.41 [ 249 / 5643, 29 ins, 19 del, 201 sub ]
# Note: my 4.41% result is worse than Karel's 4.00%.
exp/nnet5c1/decode_bd_tgpr_dev93/wer_14:%WER 7.32 [ 603 / 8234, 61 ins, 101 del, 441 sub ]
exp/nnet5c1/decode_bd_tgpr_eval92/wer_14:%WER 4.39 [ 248 / 5643, 32 ins, 17 del, 199 sub ]
# Note: my 4.39% result is worse than Karel's 4.00%.

View file

@ -47,7 +47,7 @@ cat links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \
grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si84.flist
nl=`cat train_si84.flist | wc -l`
[ "$nl" -eq 7138 ] || echo "Warning: expected 37416 lines in train_si84.flist, got $nl"
[ "$nl" -eq 7138 ] || echo "Warning: expected 7138 lines in train_si84.flist, got $nl"
# This version for SI-284
cat links/13-34.1/wsj1/doc/indices/si_tr_s.ndx \

View file

@ -281,7 +281,6 @@ steps/train_quick.sh --cmd "$train_cmd" \
exp/tri4b/graph_bd_tgpr data/test_eval92 exp/tri4b/decode_bd_tgpr_eval92 || exit 1;
) &
# Train and test MMI, and boosted MMI, on tri4b (LDA+MLLT+SAT on
# all the data). Use 30 jobs.
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \

View file

@ -0,0 +1,68 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# This script appends the features in two data directories.
# To be run from .. (one directory up from here)
# see ../run.sh for example
# This config creates MFCC features with half the window size and window shift,
# and splices and sub-samples them. We'll use another script append_feats.sh
# to combine (append) the data directories.
# Begin configuration section.
cmd=run.pl
nj=4
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 5 ]; then
echo "usage: append_feats.sh [options] <src-data-dir1> <src-data-dir2> <dest-data-dir> <log-dir> <path-to-storage-dir>";
echo "options: "
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
data_src1=$1
data_src2=$2
data=$3
logdir=$4
mfccdir=$5
utils/split_data.sh $data_src1 $nj || exit 1;
utils/split_data.sh $data_src2 $nj || exit 1;
mkdir -p $mfccdir $logdir
rm -rf $data
mkdir -p `dirname $data` # Make sure the directory one level up exists.
cp -r $data_src1 $data # so we get the other files, such as utt2spk.
rm $data/cmvn.scp
rm -r $data/split* 2>/dev/null
# use "name" as part of name of the archive.
name=`basename $data`
$cmd JOB=1:$nj $logdir/append.JOB.log \
append-feats --truncate-frames=true \
scp:$data_src1/split$nj/JOB/feats.scp scp:$data_src2/split$nj/JOB/feats.scp \
ark,scp:$mfccdir/appended_$name.JOB.ark,$mfccdir/appended_$name.JOB.scp || exit 1;
# concatenate the .scp files together.
for ((n=1; n<=nj; n++)); do
cat $mfccdir/appended_$name.$n.scp || exit 1;
done > $data/feats.scp
nf=`cat $data/feats.scp | wc -l`
nu=`cat $data/utt2spk | wc -l`
if [ $nf -ne $nu ]; then
echo "It seems not all of the feature files were successfully ($nf != $nu);"
echo "consider using utils/fix_data_dir.sh $data"
fi
echo "Succeeded creating MFCC features for $name"

View file

@ -18,6 +18,10 @@ max_mem=20000000 # This will stop the processes getting too large.
# This is in bytes, but not "real" bytes-- you have to multiply
# by something like 5 or 10 to get real bytes (not sure why so large)
# End configuration section.
num_threads=1 # Number of threads used in nnet-logprob computation. If you set
# this to a different value, make sure to also set the appropriate
# queue options. If you set this too high it won't use all the
# threads as most of the time will be taken in the decoder.
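# Illustration only (a hypothetical invocation; the option-passing follows the
# usual parse_options.sh convention assumed here): with num_threads=4 you would
# also request 4 slots from the queue, e.g.
# <this-script> --num-threads 4 --cmd "queue.pl -pe smp 4" ...
# where "-pe smp 4" is an SGE-style request; adjust it for your queue.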
echo "$0 $@" # Print the command line for logging
@ -104,9 +108,10 @@ fi
if [ $sub_split -eq 1 ]; then
$cmd JOB=1:$nj $dir/log/decode_den.JOB.log \
nnet-latgen-faster --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
nnet-logprob-parallel --num-threads=$num_threads $srcdir/final.mdl "$feats" ark:- \| \
latgen-faster-mapped --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
--max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \
$dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
$dir/dengraph/HCLG.fst ark:- "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
else
for n in `seq $nj`; do
if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $alidir/final.mdl ]; then
@ -120,9 +125,10 @@ else
mkdir -p $dir/part
feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g`
$cmd JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
nnet-latgen-faster --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
nnet-logprob-parallel --num-threads=$num_threads $srcdir/final.mdl "$feats_subset" ark:- \| \
latgen-faster-mapped --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
--max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \
$dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1;
$dir/dengraph/HCLG.fst ark:- "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1;
echo Merging archives for data subset $n
rm $dir/.error 2>/dev/null;
for k in `seq $sub_split`; do

View file

@ -14,36 +14,42 @@ num_iters_final=10 # Number of final iterations to give to the
# optimization over the validation set.
initial_learning_rate=0.02 # for RM; or 0.01 is suitable for Swbd.
final_learning_rate=0.004 # for RM; or 0.001 is suitable for Swbd.
num_valid_utts=300 # held-out utterances, used only for diagnostics.
num_valid_frames_shrink=2000 # a subset of the frames in "valid_utts", used only
# for estimating shrinkage parameters and for
# objective-function reporting.
num_utts_subset=300 # number of utterances in validation and training
# subsets used for shrinkage and diagnostics
num_valid_frames_shrink=0 # number of validation frames in the subset
# used for shrinking
num_train_frames_shrink=2000 # number of training frames in the subset used
# for shrinking (by default we use all training
# frames for this.)
shrink_interval=3 # shrink every $shrink_interval iters,
# except at the start of training when we do it every iter.
num_valid_frames_combine=10000 # combination weights at the very end.
num_valid_frames_combine=0 # #valid frames for combination weights at the very end.
num_train_frames_combine=10000 # # train frames for the above.
num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs
minibatch_size=128 # by default use a smallish minibatch size for neural net training; this controls instability
# which would otherwise be a problem with multi-threaded update. Note:
# it also interacts with the "preconditioned" update, so it's not completely cost free.
samples_per_iteration=400000 # each iteration of training, see this many samples
# per job.
samples_per_iter=400000 # each iteration of training, see this many samples
# per job. This is just a guideline; it will pick a number
# that divides the number of samples in the entire data.
shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
# on each iter. You could set it to 0 or to a large value for complete
# randomization, but this would both consume memory and cause spikes in
# disk I/O. Smaller is easier on disk and memory but less random. It's
# not a huge deal though, as samples are anyway randomized right at the start.
# on each iter. You could set it to 0 or to a large value for complete
# randomization, but this would both consume memory and cause spikes in
# disk I/O. Smaller is easier on disk and memory but less random. It's
# not a huge deal though, as samples are anyway randomized right at the start.
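# For illustration, this buffer size is what gets passed further down in this
# script, in a pipeline of roughly this shape (archive names simplified,
# default values substituted in):
# nnet-shuffle-egs --buffer-size=5000 --srand=$x ark:egs.JOB.ark ark:- | \
#   nnet-train-parallel --num-threads=16 --minibatch-size=128 "$mdl" ark:- next.JOB.mdl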
num_jobs_nnet=8 # Number of neural net jobs to run in parallel.
add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=2
initial_num_hidden_layers=1 # we'll add the rest one by one.
num_parameters=2000000 # 2 million parameters by default.
stage=-7
stage=-9
realign_iters=""
beam=10 # for realignment.
retry_beam=40
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
parallel_opts="-pe smp 16" # by default we use 16 threads; this lets the queue know.
shuffle_opts="-tc 5" # max 5 jobs running at one time (a lot of I/O.)
io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time.
nnet_config_opts=
splice_width=4 # meaning +- 4 frames on each side for second LDA
lda_dim=250
@ -54,7 +60,11 @@ shrink=true
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
# specified.)
num_threads=16
mkl_num_threads=1
valid_is_heldout=false # For some reason, holding out the validation set from the training set
# seems to hurt, so by default we don't do it (i.e. it's included in training)
random_copy=false
cleanup=true
# End configuration section.
echo "$0 $@" # Print the command line for logging
@ -72,7 +82,7 @@ if [ $# != 4 ]; then
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --num-epochs <#epochs|15> # Number of epochs of main training"
echo " # while reducing learning rate (determines #iterations, together"
echo " # with --samples-per-iteration and --num-jobs-nnet)"
echo " # with --samples-per-iter and --num-jobs-nnet)"
echo " --num-epochs-extra <#epochs-extra|5> # Number of extra epochs of training"
echo " # after learning rate fully reduced"
echo " --initial-learning-rate <initial-learning-rate|0.02> # Learning rate at start of training, e.g. 0.02 for small"
@ -95,21 +105,27 @@ if [ $# != 4 ]; then
echo " # this, you may want to decrease the batch size."
echo " --parallel-opts <opts|\"-pe smp 16\"> # extra options to pass to e.g. queue.pl for processes that"
echo " # use multiple threads."
echo " --shuffle-opts <opts|\"-tc 5\"> # Options given to e.g. queue.pl for the job that shuffles the "
echo " # data. (prevents stressing the disk). "
echo " --io-opts <opts|\"-tc 10\"> # Options given to e.g. queue.pl for jobs that do a lot of I/O."
echo " --minibatch-size <minibatch-size|128> # Size of minibatch to process (note: product with --num-threads"
echo " # should not get too large, e.g. >2k)."
echo " --samples-per-iteration <#samples|400000> # Number of samples of data to process per iteration, per"
echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per"
echo " # process."
echo " --splice-width <width|4> # Number of frames on each side to append for feature input"
echo " # (note: we splice processed, typically 40-dimensional frames"
echo " --lda-dim <dim|250> # Dimension to reduce spliced features to with LDA"
echo " --num-iters-final <#iters|10> # Number of final iterations to give to nnet-combine-fast to "
echo " # interpolate parameters (the weights are learned with a validation set)"
echo " --stage <stage|-7> # Used to run a partially-completed training process from somewhere in"
echo " --num-utts-subset <#utts|300> # Number of utterances in subsets used for validation and diagnostics"
echo " # (the validation subset is held out from training)"
echo " --num-valid-frames-shrink <#frames|2000> # Number of frames from the validation set used for shrinking"
echo " --num-train-frames-shrink <#frames|0> # Number of frames from the training set used for shrinking"
echo " # (by default it's included in training, which for some reason helps)."
echo " --num-frames-diagnostic <#frames|4000> # Number of frames used in computing (train,valid) diagnostics"
echo " --num-valid-frames-combine <#frames|10000> # Number of frames used in getting combination weights at the"
echo " # very end."
echo " --stage <stage|-9> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
exit 1;
fi
@ -144,8 +160,11 @@ cp $alidir/tree $dir
# Get list of validation utterances.
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_valid_utts \
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \
> $dir/valid_uttlist || exit 1;
awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \
head -$num_utts_subset > $dir/train_subset_uttlist || exit 1;
## Set up features. Note: these are different from the normal features
## because we have one rspecifier that has the features for the entire
@ -154,33 +173,49 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | add-deltas ark:- ark:- |"
split_feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
delta) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | add-deltas ark:- ark:- |"
valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | add-deltas ark:- ark:- |"
train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | add-deltas ark:- ark:- |"
;;
lda) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
split_feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
lda) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
cp $alidir/final.mat $dir
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
if [ -f $alidir/trans.1 ]; then
echo "$0: using transforms from $alidir"
feats="$feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $alidir/trans.*|' ark:- ark:- |"
split_feats="$split_feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$alidir/trans.JOB ark:- ark:- |"
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$alidir/trans.JOB ark:- ark:- |"
valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $alidir/trans.*|' ark:- ark:- |"
train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $alidir/trans.*|' ark:- ark:- |"
fi
if [ $stage -le -9 ]; then
echo "$0: working out number of frames of training data"
num_frames=`feat-to-len scp:$data/feats.scp ark,t:- | awk '{x += $2;} END{print x;}'` || exit 1;
echo $num_frames > $dir/num_frames
else
num_frames=`cat $dir/num_frames` || exit 1;
fi
# Working out number of iterations per epoch.
iters_per_epoch=`perl -e "print int($num_frames/($samples_per_iter * $num_jobs_nnet) + 0.5);"` || exit 1;
[ $iters_per_epoch -eq 0 ] && iters_per_epoch=1
samples_per_iter_real=$[$num_frames/($num_jobs_nnet*$iters_per_epoch)]
echo "Every epoch, splitting the data up into $iters_per_epoch iterations,"
echo "giving samples-per-iteration of $samples_per_iter_real (you requested $samples_per_iter)."
## Do LDA on top of whatever features we already have; store the matrix which
## we'll put into the neural network as a constant.
if [ $stage -le -7 ]; then
echo "Accumulating LDA statistics."
if [ $stage -le -8 ]; then
echo "$0: Accumulating LDA statistics."
$cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
acc-lda --rand-prune=$randprune $alidir/final.mdl "$split_feats splice-feats --left-context=$splice_width --right-context=$splice_width ark:- ark:- |" ark,s,cs:- \
acc-lda --rand-prune=$randprune $alidir/final.mdl "$feats splice-feats --left-context=$splice_width --right-context=$splice_width ark:- ark:- |" ark,s,cs:- \
$dir/lda.JOB.acc || exit 1;
est-lda --dim=$lda_dim $dir/lda.mat $dir/lda.*.acc \
2>$dir/log/lda_est.log || exit 1;
@ -195,10 +230,10 @@ if [ $initial_num_hidden_layers -gt $num_hidden_layers ]; then
fi
if [ $stage -le -6 ]; then
if [ $stage -le -7 ]; then
echo "$0: initializing neural net";
# to hidden.config it will write the part of the config corresponding to a
# single hidden layer; we need this to add new layers.
# single hidden layer; we need this to add new layers.
if [ ! -z "$alpha" ]; then
utils/nnet-cpu/make_nnet_config_preconditioned.pl --alpha $alpha $nnet_config_opts \
--learning-rate $initial_learning_rate \
@ -219,14 +254,14 @@ if [ $stage -le -6 ]; then
$dir/0.mdl || exit 1;
fi
if [ $stage -le -5 ]; then
if [ $stage -le -6 ]; then
echo "Training transition probabilities and setting priors"
$cmd $dir/log/train_trans.log \
nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
|| exit 1;
fi
if [ $stage -le -4 ]; then
if [ $stage -le -5 ]; then
echo "Compiling graphs of transcripts"
$cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
compile-train-graphs $dir/tree $dir/0.mdl $lang/L.fst \
@ -239,118 +274,113 @@ cp $alidir/ali.*.gz $dir
nnet_context_opts="--left-context=`nnet-am-info $dir/0.mdl 2>/dev/null | grep -w left-context | awk '{print $2}'` --right-context=`nnet-am-info $dir/0.mdl 2>/dev/null | grep -w right-context | awk '{print $2}'`" || exit 1;
if [ $stage -le -3 ]; then
echo "Getting validation examples."
$cmd $dir/log/create_valid_subset_shrink.log \
if [ $stage -le -4 ]; then
echo "Getting validation and training subset examples."
rm $dir/.error 2>/dev/null
$cmd $dir/log/create_valid_subset.log \
nnet-get-egs $nnet_context_opts "$valid_feats" \
"ark,cs:gunzip -c $dir/ali.*.gz | ali-to-pdf $dir/0.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark:$dir/valid_all.egs" || exit 1;
echo "Getting subsets of validation examples for shrinking and combination."
"ark:$dir/valid_all.egs" || touch $dir/.error &
$cmd $dir/log/create_train_subset.log \
nnet-get-egs $nnet_context_opts "$train_subset_feats" \
"ark,cs:gunzip -c $dir/ali.*.gz | ali-to-pdf $dir/0.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark:$dir/train_subset_all.egs" || touch $dir/.error &
wait;
[ -f $dir/.error ] && exit 1;
echo "Getting subsets of validation examples for shrinking, diagnostics and combination."
$cmd $dir/log/create_valid_subset_shrink.log \
nnet-subset-egs --n=$num_valid_frames_shrink ark:$dir/valid_all.egs ark:$dir/valid_shrink.egs &
nnet-subset-egs --n=$num_valid_frames_shrink ark:$dir/valid_all.egs \
ark:$dir/valid_shrink.egs || touch $dir/.error &
$cmd $dir/log/create_valid_subset_combine.log \
nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs ark:$dir/valid_combine.egs &
nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \
ark:$dir/valid_combine.egs || touch $dir/.error &
$cmd $dir/log/create_valid_subset_diagnostic.log \
nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/valid_all.egs \
ark:$dir/valid_diagnostic.egs || touch $dir/.error &
$cmd $dir/log/create_train_subset_shrink.log \
nnet-subset-egs --n=$num_train_frames_shrink ark:$dir/train_subset_all.egs \
ark:$dir/train_shrink.egs || touch $dir/.error &
$cmd $dir/log/create_train_subset_combine.log \
nnet-subset-egs --n=$num_train_frames_combine ark:$dir/train_subset_all.egs \
ark:$dir/train_combine.egs || touch $dir/.error &
$cmd $dir/log/create_train_subset_diagnostic.log \
nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/train_subset_all.egs \
ark:$dir/train_diagnostic.egs || touch $dir/.error &
wait
[ ! -s $dir/valid_shrink.egs ] && echo "No validation examples for shrinking" && exit 1;
[ ! -s $dir/valid_combine.egs ] && echo "No validation examples for combination" && exit 1;
rm $dir/valid_all.egs
cat $dir/valid_shrink.egs $dir/train_shrink.egs > $dir/shrink.egs
cat $dir/valid_combine.egs $dir/train_combine.egs > $dir/combine.egs
for f in $dir/{shrink,combine,train_diagnostic,valid_diagnostic}.egs; do
[ ! -s $f ] && echo "No examples in file $f" && exit 1;
done
rm $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_{shrink,combine}.egs
fi
if [ $stage -le -2 ]; then
if [ $stage -le -3 ]; then
mkdir -p $dir/egs
mkdir -p $dir/temp
echo "Creating training examples";
# in $dir/egs, create $num_jobs_nnet separate files with training examples,
# with randomly shuffled order. We shuffle the order of examples in each
# file. Then on each iteration, for each training process, we'll take a
# random subset of blocks of examples within that process's file.
# We take them in blocks, because it avoids the overhead of fseek() while
# creating the examples.
# in $dir/egs, create $num_jobs_nnet separate files with training examples.
# The order is not randomized at this point.
egs_list=
for n in `seq 1 $num_jobs_nnet`; do
egs_list="$egs_list ark,scp:$dir/egs/egs_orig.$n.ark,$dir/egs/egs_orig.$n.scp"
egs_list="$egs_list ark:$dir/egs/egs_orig.$n.JOB.ark"
done
echo "Generating training examples on disk"
# The examples will go round-robin to egs_list.
$cmd $dir/log/get_egs.log \
$cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \
nnet-get-egs $nnet_context_opts "$feats" \
"ark,cs:gunzip -c $dir/ali.*.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \
"ark,cs:gunzip -c $dir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \
nnet-copy-egs ark:- $egs_list || exit 1;
fi
if [ $stage -le -2 ]; then
# combine all the "egs_orig.JOB.*.scp" (over the $nj splits of the data) and
# then split into multiple parts egs.JOB.*.scp for different parts of the
# data, 0 .. $iters_per_epoch-1.
if [ $iters_per_epoch -eq 1 ]; then
echo "Since iters-per-epoch == 1, just concatenating the data."
for n in `seq 1 $num_jobs_nnet`; do
cat $dir/egs/egs_orig.$n.*.ark > $dir/egs/egs_tmp.$n.0.ark || exit 1;
rm $dir/egs/egs_orig.$n.*.ark || exit 1;
done
else # We'll have to split it up using nnet-copy-egs.
egs_list=
for n in `seq 0 $[$iters_per_epoch-1]`; do
egs_list="$egs_list ark:$dir/egs/egs_tmp.JOB.$n.ark"
done
$cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/split_egs.JOB.log \
nnet-copy-egs --random=$random_copy --srand=JOB \
"ark:cat $dir/egs/egs_orig.JOB.*.ark|" $egs_list '&&' \
rm $dir/egs/egs_orig.JOB.*.ark || exit 1;
fi
fi
if [ $stage -le -1 ]; then
# Next, shuffle the order of the examples in each of those files.
# In order to not use too much memory (in case the size of the files is
# huge) we do this by randomizing the order of the .scp file and then
# just call nnet-copy-egs. If the file system is willing to store
# stuff in memory, it is free to do so. This is not super-optimal in
# terms of file system performance but it's simple and it won't fail when
# the data gets large.
# Each one should not be too large, so we can do this in memory.
echo "Shuffling the order of training examples"
echo "(in order to avoid stressing the disk, these won't all run at once)."
$cmd $shuffle_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.JOB.log \
utils/shuffle_list.pl --srand JOB $dir/egs/egs_orig.JOB.scp \| \
nnet-copy-egs scp:- ark,scp:$dir/egs/egs.JOB.ark,$dir/egs/egs.JOB.scp \
'&&' rm $dir/egs/egs_orig.JOB.ark $dir/egs/egs_orig.JOB.scp
smallest_len=`wc -l $dir/egs/egs.*.scp | sort -n -k1 | awk '{print $1}' | head -1`
# If the $samples_per_iteration is more than each split of the data,
# append to each .scp file the .scp files from the next one or two
# splits (or more), so each one is larger...
rm $dir/egs/egs.*.scp.orig 2>/dev/null
if [ $samples_per_iteration -gt $smallest_len ]; then
extra_files=$[($samples_per_iteration-1) / $smallest_len]
echo Each part of the data has about $smallest_len lines which is less than the
echo samples per iteration $samples_per_iteration, so appending next $extra_files
echo files to each scp file
for n in `seq $num_jobs_nnet`; do mv $dir/egs/egs.$n.scp $dir/egs/egs.$n.scp.orig; done
for n in `seq $num_jobs_nnet`; do
for e in `seq 0 $extra_files`; do
m=$[(($n + $e - 1)%$num_jobs_nnet)+1]
cat $dir/egs/egs.$m.scp.orig
done > $dir/egs/egs.$n.scp
done
fi
for n in `seq 0 $[$iters_per_epoch-1]`; do
$cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$n.JOB.log \
nnet-shuffle-egs "--srand=\$[JOB+($num_jobs_nnet*$n)]" \
ark:$dir/egs/egs_tmp.JOB.$n.ark ark:$dir/egs/egs.JOB.$n.ark '&&' \
rm $dir/egs/egs_tmp.JOB.$n.ark || exit 1;
done
fi
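# To make the file naming above concrete (a hypothetical configuration): with
# nj=30 alignment jobs, num_jobs_nnet=8 and iters_per_epoch=3,
#   stage -3 writes egs_orig.n.j.ark   (n = nnet job 1..8, j = data split 1..30),
#   stage -2 merges these into egs_tmp.n.i.ark   (i = sub-iteration 0..2),
#   stage -1 shuffles them into egs.n.i.ark,
# and on training iteration x, nnet job n reads egs.n.$[x%3].ark.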
num_egs=`grep wrote $dir/log/get_egs.log | tail -1 | awk '{print $NF}'` || exit 1;
! [ $num_egs -gt 0 ] && echo "bad num_egs $num_egs" && exit 1;
num_iters_reduce=$[ 1 + (($num_egs * $num_epochs)/($num_jobs_nnet * $samples_per_iteration))]
num_iters_extra=$[1 + (($num_egs * $num_epochs_extra)/($num_jobs_nnet * $samples_per_iteration))]
num_iters_reduce=$[$num_epochs * $iters_per_epoch];
num_iters_extra=$[$num_epochs_extra * $iters_per_epoch];
num_iters=$[$num_iters_reduce+$num_iters_extra]
echo "Will train for $num_epochs + $num_epochs_extra epochs, equalling "
echo " $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo " (while reducing learning rate) + (with constant learning rate)."
function get_list {
# usage: get_list <samples-per-iter> <iter> <input-file> >output
#
# Outputs an scp file for this job for this iteration. The
# output will have <samples-per-iter> lines, and will contain lines from
# egs.JOB.scp, possibly with repeats. It will be sorted numerically on its
# first field, so the .ark file is accessed in order (we then pipe to
# nnet-shuffle-egs to randomize the order). The way we do it is, we imagine
# we had concatenated the file $dir/egs/egs.JOB.scp infinite times, and
# taken from the concatenated file, the lines
# <samples-per-iter> * <iter> ... <samples-per-iter> * (<iter> + 1) - 1,
# and then sorted them on the first field (which is a number).
# We don't actually implement it this way, we do it a bit more efficiently.
# We require that samples-per-iter <= (#lines in input-file).
[ $# -ne 3 ] && echo "get_list: bad usage" && exit 1;
samples_per_iter=$1
my_iter=$2
input_file=$3
start=$[$my_iter * $samples_per_iter]; # starting-point in concatenated file.
input_len=`cat $input_file | wc -l`
start=$[$start - $input_len*($start/$input_len)]; # remove whole multiples of input_len
# we have to concatenate the input file to itself.
cat $input_file $input_file | \
head -n $[$start + $samples_per_iter] | tail -n $samples_per_iter | \
sort -k2 -k1n
}
# up till $last_normal_shrink_iter we will shrink the parameters
# in the normal way using the dev set, but after that we will
# only re-compute the shrinkage parameters periodically.
@ -361,22 +391,19 @@ x=0
while [ $x -lt $num_iters ]; do
if [ $x -ge 0 ] && [ $stage -le $x ]; then
# Set off a job that does diagnostics, in the background.
$cmd $parallel_opts $dir/log/compute_prob.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$dir/valid_shrink.egs &
# Set off jobs doing some diagnostics, in the background.
$cmd $dir/log/compute_prob_valid.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$dir/train_diagnostic.egs &
if echo $realign_iters | grep -w $x >/dev/null; then
echo "Realigning data (pass $x)"
$cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
nnet-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$dir/$x.mdl" \
"ark:gunzip -c $dir/fsts.JOB.gz|" "$split_feats" \
"ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
"ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi
for n in `seq $num_jobs_nnet`; do
# the following command gets a subset of the n'th scp file, containing
# $samples_per_iteration lines.
get_list $samples_per_iteration $x $dir/egs/egs.$n.scp > $dir/temp/egs.$x.$n.scp
done
echo "Training neural net (pass $x)"
if [ $x -gt 0 ] && \
@ -388,11 +415,10 @@ while [ $x -lt $num_iters ]; do
fi
$cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
MKL_NUM_THREADS=$mkl_num_threads \
nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
scp:$dir/temp/egs.$x.JOB.scp ark:- \| \
nnet-train-parallel --num-threads=$num_threads --minibatch-size=$minibatch_size \
"$mdl" ark:- $dir/$[$x+1].JOB.mdl \
nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
ark:$dir/egs/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
nnet-train-parallel --num-threads=$num_threads --minibatch-size=$minibatch_size \
"$mdl" ark:- $dir/$[$x+1].JOB.mdl \
|| exit 1;
nnets_list=
@ -410,10 +436,10 @@ while [ $x -lt $num_iters ]; do
if [ $x -le $last_normal_shrink_iter ] || [ $[$x % $shrink_interval] -eq 0 ]; then
# For earlier iterations (while we've recently been adding layers), or every
# $shrink_interval=3 iters, just do shrinking normally.
mb=$[($num_valid_frames_shrink+$num_train_frames_shrink+$num_threads-1)/$num_threads]
$cmd $parallel_opts $dir/log/shrink.$x.log \
MKL_NUM_THREADS=$mkl_num_threads nnet-combine-fast --num-threads=$num_threads --verbose=3 \
--minibatch-size=$[($num_valid_frames_shrink+$num_threads-1)/$num_threads] \
$dir/$[$x+1].mdl ark:$dir/valid_shrink.egs $dir/$[$x+1].mdl || exit 1;
nnet-combine-fast --num-threads=$num_threads --verbose=3 --minibatch-size=$mb \
$dir/$[$x+1].mdl ark:$dir/shrink.egs $dir/$[$x+1].mdl || exit 1;
fi
fi
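# Worked example of the minibatch size used for shrinking above, with this
# script's defaults (num_valid_frames_shrink=0, num_train_frames_shrink=2000,
# num_threads=16): mb = (0 + 2000 + 16 - 1) / 16 = 125 frames per minibatch.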
if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
@ -423,7 +449,7 @@ while [ $x -lt $num_iters ]; do
nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
$dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
fi
rm $nnets_list $dir/temp/egs.$x.*.scp
rm $nnets_list
fi
x=$[$x+1]
done
@ -435,15 +461,32 @@ nnets_list=
for x in `seq $[$num_iters-$num_iters_final+1] $num_iters`; do
[ $x -gt $mix_up_iter ] && nnets_list="$nnets_list $dir/$x.mdl"
done
$cmd $parallel_opts $dir/log/combine.log \
MKL_NUM_THREADS=$mkl_num_threads nnet-combine-fast --num-threads=$num_threads \
--verbose=3 --minibatch-size=$[($num_valid_frames_shrink+$num_threads-1)/$num_threads] \
$nnets_list ark:$dir/valid_combine.egs $dir/final.mdl || exit 1;
if [ $stage -le $num_iters ]; then
mb=$[($num_valid_frames_combine+$num_train_frames_combine+$num_threads-1)/$num_threads]
$cmd $parallel_opts $dir/log/combine.log \
nnet-combine-fast --num-threads=$num_threads --verbose=3 --minibatch-size=$mb \
$nnets_list ark:$dir/combine.egs $dir/final.mdl || exit 1;
fi
# Compute the probability of the final, combined model with
# the same subset we used for the previous compute_probs, as the
# different subsets will lead to different probs.
$cmd $parallel_opts $dir/log/compute_prob.final.log \
nnet-compute-prob $dir/final.mdl ark:$dir/valid_shrink.egs || exit 1;
$cmd $dir/log/compute_prob_valid.final.log \
nnet-compute-prob $dir/final.mdl ark:$dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.final.log \
nnet-compute-prob $dir/final.mdl ark:$dir/train_diagnostic.egs &
echo Done
if $cleanup; then
echo Cleaning up data
echo Removing training examples
rm -r $dir/egs
echo Removing most of the models
for x in `seq 0 $num_iters`; do
if [ $[$x%10] -ne 0 ] && [ $x -lt $[$num_iters-$num_iters_final+1] ]; then
# delete all but every 10th model; don't delete the ones which combine to form the final model.
rm $dir/$x.mdl
fi
done
fi

View file

@ -31,16 +31,17 @@ num_jobs_nnet=8 # Number of neural net training jobs to run in parallel.
# not the same as the num-jobs (nj) which will be the same as the
# alignment and denlat directories.
stage=0
sub_stage=-2 # this can be used to start from a particular sub-iteration of an
sub_stage=-3 # this can be used to start from a particular sub-iteration of an
# iteration
acwt=0.1
boost=0.0 # boosting for BMMI (you can try 0.1); this is applied per frame.
transform_dir= # Note: by default any transforms in $alidir will be used.
parallel_opts="-pe smp 16" # by default we use 16 threads; this lets the queue know.
shuffle_opts="-tc 5" # max 5 jobs running at one time (a lot of I/O.)
io_opts="-tc 10" # max 5 jobs running at one time (a lot of I/O.)
num_threads=16 # number of threads for neural net trainer..
mkl_num_threads=1
random_copy=false
# End configuration section.
echo "$0 $@" # Print the command line for logging
@ -71,8 +72,7 @@ if [ $# != 6 ]; then
echo " # this, you may want to decrease the batch size."
echo " --parallel-opts <opts|\"-pe smp 16\"> # extra options to pass to e.g. queue.pl for processes that"
echo " # use multiple threads."
echo " --shuffle-opts <opts|\"-tc 5\"> # Options given to e.g. queue.pl for the job that shuffles the "
echo " # data. (prevents stressing the disk). "
echo " --io-opts <opts|\"-tc 10\"> # Options given to e.g. queue.pl for any especially I/O intensive jobs"
echo " --minibatch-size <minibatch-size|128> # Size of minibatch to process (note: product with --num-threads"
echo " # should not get too large, e.g. >2k)."
echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, for each"
@ -181,34 +181,37 @@ while [ $x -lt $num_epochs ]; do
echo "Epoch $x of $num_epochs"
if [ $stage -le $x ] && $first_iter_of_epoch; then
if [ $stage -lt $x ] || [ $sub_stage -le -2 ]; then
if [ $stage -lt $x ] || [ $sub_stage -le -3 ]; then
# First get the per-frame posteriors, by rescoring the lattices; this
# process also gives us at the same time the posteriors of each state for
# each frame (by default, pruned to 0.01 with a randomized algorithm).
# The matrix-logprob stage produces a diagnostic and passes the pseudo-log-like
# matrix through unchanged.
$cmd JOB=1:$nj $dir/log/post.$z.JOB.log \
nnet-logprob2 $dir/$x.1.mdl "$feats" "ark:|prob-to-post ark:- ark:- | gzip -c >$dir/post/smooth_post.$z.JOB.gz" ark:- \| \
# matrix through unchanged. (Note: nnet-logprob2-parallel can use up to
# $num_threads threads, but in practice it may be limited by the speed of
# the other elements of the pipe.)
$cmd $parallel_opts JOB=1:$nj $dir/log/post.$z.JOB.log \
nnet-logprob2-parallel --num-threads=$num_threads $dir/$x.1.mdl "$feats" \
"ark:|prob-to-post ark:- ark:- | gzip -c >$dir/post/smooth_post.$z.JOB.gz" ark:- \| \
matrix-logprob ark:- "ark:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $dir/$x.1.mdl ark:- ark:-|" ark:- \| \
lattice-rescore-mapped $dir/$x.1.mdl "ark:gunzip -c $denlatdir/lat.JOB.gz|" ark:- ark:- \| \
lattice-boost-ali --b=$boost --silence-phones=$silphonelist $dir/$x.1.mdl ark:- "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
post-to-pdf-post $dir/$x.1.mdl ark:- "ark:|gzip -c >$dir/post/den_post.$z.JOB.gz" || exit 1;
fi
if [ $stage -lt $x ] || [ $sub_stage -le -1 ]; then
if [ $stage -lt $x ] || [ $sub_stage -le -2 ]; then
# run nnet-get-egs for all files, to get the training examples for each frame--
# combines the feature and label/posterior information. The posterior information
# consists of 3 things: the numerator posteriors from the alignments, the denominator
# posteriors from the lattices (times -1), and the smoothing posteriors from the
# neural net log-probs (times E).
# We copy the examples for each job round-robin to multiple archives, one for each
# of 1...$num_jobs_nnet. We write these along with .scp files, for more convenient
# and memory-efficient randomization.
# of 1...$num_jobs_nnet.
egs_out=""
for n in `seq 1 $num_jobs_nnet`; do
egs_out="$egs_out ark,scp:$dir/egs/egs.$z.$n.JOB.ark,$dir/egs/egs.$z.$n.JOB.scp"
# indexes are egs_orig.$z.$num_jobs_nnet.$nj
egs_out="$egs_out ark:$dir/egs/egs_orig.$z.$n.JOB.ark"
done
$cmd JOB=1:$nj $dir/log/egs.$z.JOB.log \
$cmd JOB=1:$nj $dir/log/get_egs.$z.JOB.log \
ali-to-pdf $dir/$x.1.mdl "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
ali-to-post ark:- ark:- \| \
sum-post --scale2=$E ark:- "ark:gunzip -c $dir/post/smooth_post.$z.JOB.gz|" ark:- \| \
@ -223,23 +226,33 @@ while [ $x -lt $num_epochs ]; do
tail -n 50 $dir/log/post.$z.*.log | perl -e '$acwt=shift @ARGV; $acwt>0.0 || die "bad acwt"; while(<STDIN>) { if (m|lattice-to-post.+Overall average log-like/frame is (\S+) over (\S+) frames. Average acoustic like/frame is (\S+)|) { $tot_den_lat_like += $1*$2; $tot_frames += $2; } if (m|matrix-logprob.+Average log-prob per frame is (\S+) over (\S+) frames|) { $tot_num_like += $1*$2; $tot_num_frames += $2; } } if (abs($tot_frames - $tot_num_frames) > 0.01*($tot_frames + $tot_num_frames)) { print STDERR "#frames differ $tot_frames vs $tot_num_frames\n"; } $tot_den_lat_like /= $tot_frames; $tot_num_like /= $tot_num_frames; $objf = $acwt * $tot_num_like - $tot_den_lat_like; print $objf."\n"; ' $acwt > $dir/log/objf.$z.log
echo "Objf on EBW iter $z is `cat $dir/log/objf.$z.log`"
fi
if [ $stage -lt $x ] || [ $sub_stage -le 0 ]; then
echo "Shuffling the order of training examples and splitting them up"
echo "(in order to avoid stressing the disk, these won't all run at once)."
if [ $stage -lt $x ] || [ $sub_stage -le -1 ]; then
echo "Merging training examples across original #jobs ($nj), and "
echo "splitting across number of nnet jobs $num_jobs_nnet"
egs_out2=""
for n in `seq 1 $iters_per_epoch`; do
egs_out2="$egs_out2 ark:$dir/egs/egs_split.$z.$n.JOB.ark"
# indexes of egs_merged are: egs_merged.$z.$iters_per_epoch.$num_jobs_nnet
egs_out2="$egs_out2 ark:$dir/egs/egs_merged.$z.$n.JOB.ark"
done
# Note: in the following command, JOB goes from 1 to $num_jobs_nnet, so one
# job per parallel training job (different from the previous command).
# We sum up over the index JOB in the previous $cmd, and write to multiple
# archives, this time one for each "sub-iter".
$cmd $shuffle_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.JOB.log \
cat $dir/egs/egs.$z.JOB.*.scp \| \
utils/shuffle_list.pl --srand "\$[($z*$num_jobs_nnet)+JOB]" \| \
nnet-copy-egs scp:- $egs_out2 || exit 1; ##'&&' \
##rm $dir/egs/egs.$z.JOB.*.scp $dir/egs/egs.$z.JOB.*.ark || exit 1;
# indexes of egs_orig are: egs_orig.$z.$num_jobs_nnet.$nj
$cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/merge_and_split.$x.JOB.log \
cat $dir/egs/egs_orig.$z.JOB.*.ark \| \
nnet-copy-egs --random=$random_copy "--srand=\$[JOB+($x*$num_jobs_nnet)]" \
ark:- $egs_out2 '&&' rm $dir/egs/egs_orig.$z.JOB.*.ark || exit 1;
fi
if [ $stage -lt $x ] || [ $sub_stage -le 0 ]; then
echo "Randomizing order of examples in each job"
for n in `seq 1 $iters_per_epoch`; do
s=$[$num_jobs_nnet*($n+($iters_per_epoch*$z))] # for srand
$cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$z.$n.JOB.log \
nnet-shuffle-egs "--srand=\$[JOB+$s]" \
ark:$dir/egs/egs_merged.$z.$n.JOB.ark ark:$dir/egs/egs.$z.$n.JOB.ark '&&' \
rm $dir/egs/egs_merged.$z.$n.JOB.ark || exit 1;
done
fi
fi
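# To make the archive naming above concrete (hypothetical sizes): with nj=30,
# num_jobs_nnet=8 and iters_per_epoch=2, EBW iteration z produces
#   egs_orig.$z.n.j.ark   (n = nnet job 1..8, j = data split 1..30)  from get_egs,
#   egs_merged.$z.i.n.ark (i = sub-iteration 1..2)                   from merge_and_split,
#   egs.$z.i.n.ark                                                   after shuffling,
# and sub-iteration y of the training loop below reads egs.$z.$y.n.ark for nnet job n.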
if [ $stage -le $x ]; then
@ -250,7 +263,7 @@ while [ $x -lt $num_epochs ]; do
if [ $stage -lt $x ] || [ $sub_stage -le $y ]; then
$cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.$y.JOB.log \
nnet-train-parallel --num-threads=$num_threads --minibatch-size=$minibatch_size \
$dir/$x.$y.mdl ark:$dir/egs/egs_split.$z.$y.JOB.ark $dir/$x.$y.JOB.mdl \
$dir/$x.$y.mdl ark:$dir/egs/egs.$z.$y.JOB.ark $dir/$x.$y.JOB.mdl \
|| exit 1;
nnets_list=
for n in `seq 1 $num_jobs_nnet`; do

View file

@ -68,7 +68,7 @@ Options:
--input-left-context <n> # #frames of left context for input features; default 0.
--input-right-context <n> # #frames of right context for input features; default 0.
--param-stdddev-factor <f> # Factor which can be used to modify the standard deviation of
# randomly nitialized features (default, 1. Gets multiplied by
# randomly initialized features (default, 1. Gets multiplied by
# 1/sqrt of number of inputs).
--initial-num-hidden-layers <n> <config-file> # If >0, number of hidden layers to initialize the network with.
# In this case, the positional parameter <num-hidden-layers> is only

Просмотреть файл

@ -19,7 +19,7 @@ BINFILES = align-equal align-equal-compiled acc-tree-stats \
align-mapped align-compiled-mapped latgen-faster-mapped \
hmm-info pdf-to-counts analyze-counts extract-ctx post-to-phone-post \
post-to-pdf-post duplicate-matrix logprob-to-post prob-to-post copy-post \
matrix-logprob
matrix-logprob matrix-sum
OBJFILES =

src/bin/matrix-sum.cc (new file, 87 lines)
View file

@ -0,0 +1,87 @@
// bin/matrix-sum.cc
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "matrix/kaldi-matrix.h"
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
const char *usage =
"Sum (and optionally scale) two archives of input matrices\n"
"of the same dimension\n"
"\n"
"Usage: matrix-sum [options] <matrix-rspecifier1> <matrix-rspecifier2> <sum-wspecifier>\n";
BaseFloat scale1 = 1.0, scale2 = 1.0;
ParseOptions po(usage);
po.Register("scale1", &scale1, "Scale applied to first matrix");
po.Register("scale2", &scale2, "Scale applied to second matrix");
po.Read(argc, argv);
if (po.NumArgs() != 3) {
po.PrintUsage();
exit(1);
}
std::string rspecifier1 = po.GetArg(1);
std::string rspecifier2 = po.GetArg(2);
std::string wspecifier = po.GetArg(3);
SequentialBaseFloatMatrixReader mat1_reader(rspecifier1);
RandomAccessBaseFloatMatrixReader mat2_reader(rspecifier2);
BaseFloatMatrixWriter mat_writer(wspecifier);
int32 num_done = 0, num_err = 0;
for (; !mat1_reader.Done(); mat1_reader.Next()) {
std::string key = mat1_reader.Key();
Matrix<BaseFloat> mat1 (mat1_reader.Value());
if (!mat2_reader.HasKey(key)) {
KALDI_WARN << "No such key " << key << " in second table.";
num_err++;
continue;
}
const Matrix<BaseFloat> &mat2 (mat2_reader.Value(key));
if (!SameDim(mat1, mat2)) {
KALDI_WARN << "Matrices for key " << key << " have different dims "
<< mat1.NumRows() << " x " << mat1.NumCols() << " vs. "
<< mat2.NumRows() << " x " << mat2.NumCols();
num_err++;
continue;
}
if (scale1 != 1.0) mat1.Scale(scale1);
mat1.AddMat(scale2, mat2);
mat_writer.Write(key, mat1);
num_done++;
}
KALDI_LOG << "Added " << num_done << " matrices; " << num_err
<< " had errors.";
return (num_done != 0 ? 0 : 1);
} catch(const std::exception &e) {
std::cerr << e.what();
return -1;
}
}
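A quick usage sketch for this new binary (the archive names here are invented for the example; --scale1 and --scale2 are the options registered above):
matrix-sum --scale1=0.5 --scale2=0.5 ark:a.ark ark:b.ark ark:mean.ark   # elementwise 0.5*A + 0.5*B
matrix-sum --scale2=-1.0 ark:a.ark ark:b.ark ark:diff.ark               # A - B, per matching key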

View file

@ -93,10 +93,13 @@ Real* CuMatrix<Real>::RowData(MatrixIndexT r) {
template<typename Real>
CuMatrix<Real>& CuMatrix<Real>::Resize(MatrixIndexT rows, MatrixIndexT cols) {
void CuMatrix<Real>::Resize(MatrixIndexT rows, MatrixIndexT cols,
MatrixResizeType resize_type) {
// This code does not currently support the other resize_type options.
KALDI_ASSERT(resize_type == kSetZero || resize_type == kUndefined);
if (num_rows_ == rows && num_cols_ == cols) {
// SetZero();
return *this;
if (resize_type == kSetZero) SetZero();
return;
}
Destroy();
@ -108,17 +111,15 @@ CuMatrix<Real>& CuMatrix<Real>::Resize(MatrixIndexT rows, MatrixIndexT cols) {
cuSafeCall(cudaMallocPitch((void**)&data_, &pitch, row_bytes, rows));
num_rows_ = rows; num_cols_ = cols;
stride_ = pitch/sizeof(Real);
SetZero();
if (resize_type == kSetZero) SetZero();
} else
#endif
{
mat_.Resize(rows, cols);
mat_.Resize(rows, cols, resize_type);
num_rows_=rows;
num_cols_=cols;
stride_=mat_.Stride();
stride_= mat_.Stride();
}
return *this;
}
@ -134,7 +135,7 @@ void CuMatrix<Real>::Destroy() {
} else
#endif
{
mat_.Destroy();
mat_.Resize(0, 0);
}
num_rows_ = num_cols_ = stride_ = 0;
}
@ -142,9 +143,8 @@ void CuMatrix<Real>::Destroy() {
template<typename Real>
CuMatrix<Real>& CuMatrix<Real>::CopyFromMat(const CuMatrix<Real> &src) {
Resize(src.NumRows(), src.NumCols());
void CuMatrix<Real>::CopyFromMat(const CuMatrix<Real> &src) {
KALDI_ASSERT(src.NumRows() == num_rows_ && src.NumCols() == num_cols_);
#if HAVE_CUDA==1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
@ -152,7 +152,8 @@ CuMatrix<Real>& CuMatrix<Real>::CopyFromMat(const CuMatrix<Real> &src) {
MatrixIndexT dst_pitch = stride_*sizeof(Real);
MatrixIndexT src_pitch = src.Stride()*sizeof(Real);
MatrixIndexT width = src.NumCols()*sizeof(Real);
cuSafeCall(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch, width, src.NumRows(), cudaMemcpyDeviceToDevice));
cuSafeCall(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch,
width, src.NumRows(), cudaMemcpyDeviceToDevice));
CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromMatD2D",tim.Elapsed());
} else
@ -160,16 +161,13 @@ CuMatrix<Real>& CuMatrix<Real>::CopyFromMat(const CuMatrix<Real> &src) {
{
mat_.CopyFromMat(src.mat_);
}
return *this;
}
template<typename Real>
CuMatrix<Real>& CuMatrix<Real>::CopyFromMat(const Matrix<Real> &src) {
Resize(src.NumRows(), src.NumCols());
void CuMatrix<Real>::CopyFromMat(const Matrix<Real> &src) {
KALDI_ASSERT(src.NumRows() == num_rows_ && src.NumCols() == num_cols_);
#if HAVE_CUDA==1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
@ -177,7 +175,8 @@ CuMatrix<Real>& CuMatrix<Real>::CopyFromMat(const Matrix<Real> &src) {
MatrixIndexT dst_pitch = stride_*sizeof(Real);
MatrixIndexT src_pitch = src.Stride()*sizeof(Real);
MatrixIndexT width = src.NumCols()*sizeof(Real);
cuSafeCall(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch, width, src.NumRows(), cudaMemcpyHostToDevice));
cuSafeCall(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch,
width, src.NumRows(), cudaMemcpyHostToDevice));
CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromMatH2D",tim.Elapsed());
} else
@ -185,18 +184,13 @@ CuMatrix<Real>& CuMatrix<Real>::CopyFromMat(const Matrix<Real> &src) {
{
mat_.CopyFromMat(src);
}
return *this;
}
template<typename Real>
void CuMatrix<Real>::CopyToMat(Matrix<Real> *dst) const {
if (dst->NumRows() != NumRows() || dst->NumCols() != NumCols()) {
dst->Resize(NumRows(), NumCols());
}
KALDI_ASSERT(dst->NumRows() == NumRows() && dst->NumCols() == NumCols());
#if HAVE_CUDA==1
if (CuDevice::Instantiate().Enabled()) {
@ -257,7 +251,7 @@ void CuMatrix<Real>::Read(std::istream &is, bool binary) {
template<typename Real>
void CuMatrix<Real>::Write(std::ostream &os, bool binary) const {
Matrix<BaseFloat> tmp;
Matrix<BaseFloat> tmp(NumRows(), NumCols(), kUndefined);
CopyToMat(&tmp);
tmp.Write(os, binary);
}

View file

@ -46,15 +46,41 @@ class CuMatrix {
public:
/// Default Constructor
CuMatrix<Real>()
: num_rows_(0), num_cols_(0), stride_(0), data_(NULL) {
}
CuMatrix<Real>():
num_rows_(0), num_cols_(0), stride_(0), data_(NULL) { }
/// Constructor with memory initialisation
CuMatrix<Real>(MatrixIndexT rows, MatrixIndexT cols)
: num_rows_(0), num_cols_(0), stride_(0), data_(NULL) {
CuMatrix<Real>(MatrixIndexT rows, MatrixIndexT cols):
num_rows_(0), num_cols_(0), stride_(0), data_(NULL) {
Resize(rows, cols);
}
// Note: we had to remove the "explicit" keyword due
// to problems with STL vectors of CuMatrix.
CuMatrix<Real>(const CuMatrix<Real> &other):
num_rows_(0), num_cols_(0), stride_(0), data_(NULL) {
Resize(other.NumRows(), other.NumCols(), kUndefined);
CopyFromMat(other);
}
explicit CuMatrix<Real>(const Matrix<Real> &other):
num_rows_(0), num_cols_(0), stride_(0), data_(NULL) {
Resize(other.NumRows(), other.NumCols(), kUndefined);
CopyFromMat(other);
}
CuMatrix<Real> &operator = (const CuMatrix<Real> &other) {
Resize(other.NumRows(), other.NumCols(), kUndefined);
CopyFromMat(other);
return *this;
}
CuMatrix<Real> &operator = (const Matrix<Real> &other) {
Resize(other.NumRows(), other.NumCols(), kUndefined);
CopyFromMat(other);
return *this;
}
/// Destructor
~CuMatrix() {
Destroy();
@ -65,14 +91,12 @@ class CuMatrix {
return num_rows_;
}
MatrixIndexT NumCols() const {
return num_cols_;
}
MatrixIndexT NumCols() const { return num_cols_; }
MatrixIndexT Stride() const {
return stride_;
}
MatrixIndexT Stride() const { return stride_; }
// MatrixDim is a struct containing "rows", "cols" and "stride",
// that is an argument of most CUDA kernels.
::MatrixDim Dim() const {
::MatrixDim d = { num_rows_, num_cols_, stride_ };
return d;
@ -87,41 +111,34 @@ class CuMatrix {
Real* RowData(MatrixIndexT r);
/// Get size of matrix in bytes
MatrixIndexT SizeInBytes() const {
return num_rows_*stride_*sizeof(Real);
}
MatrixIndexT SizeInBytes() const { return num_rows_*stride_*sizeof(Real); }
/// Get size of matrix row in bytes
MatrixIndexT RowSizeInBytes() const {
return num_cols_*sizeof(Real);
}
MatrixIndexT RowSizeInBytes() const { return num_cols_*sizeof(Real); }
/// Get size of matrix stride in bytes
MatrixIndexT StrideSizeInBytes() const {
return stride_*sizeof(Real);
}
MatrixIndexT StrideSizeInBytes() const { return stride_*sizeof(Real); }
/// Allocate the memory
ThisType& Resize(MatrixIndexT rows, MatrixIndexT cols);
/// Deallocate the memory
void Destroy();
/// Copy functions (reallocates when needed)
ThisType& CopyFromMat(const CuMatrix<Real> &src);
ThisType& CopyFromMat(const Matrix<Real> &src);
void CopyToMat(Matrix<Real> *dst) const;
void Resize(MatrixIndexT rows, MatrixIndexT cols,
MatrixResizeType resize_type = kSetZero);
/// Copy functions (reallocates when needed, but note from Dan: eventually
/// I'll change it to just die if the sizes don't match, like the Matrix class.)
void CopyFromMat(const CuMatrix<Real> &src);
void CopyFromMat(const Matrix<Real> &src);
void CopyToMat(Matrix<Real> *dst) const;
/// Copy row interval from matrix
/// @param r [in] number of rows to copy.
/// @param src [in] source matrix.
/// @param src_ro [in] source matrix row offset.
/// @param dst_ro [in] destination matrix row offset.
void CopyRowsFromMat(int32 r, const CuMatrix<Real> &src, int32 src_ro, int32 dst_ro);
void CopyRowsFromMat(int32 r, const CuMatrix<Real> &src, int32 src_ro, int32 dst_ro);
/// I/O functions
void Read(std::istream &is, bool binary);
void Write(std::ostream &os, bool binary) const;
void Read(std::istream &is, bool binary);
void Write(std::ostream &os, bool binary) const;
/// Math operations, some calling kernels
void SetZero();
@ -154,6 +171,8 @@ class CuMatrix {
}
private:
void Destroy();
MatrixIndexT num_rows_;
MatrixIndexT num_cols_;
MatrixIndexT stride_;

View file

@ -44,8 +44,6 @@ const Real* CuVector<Real>::Data() const {
}
}
template<typename Real>
Real* CuVector<Real>::Data() {
#if HAVE_CUDA==1
@ -58,15 +56,12 @@ Real* CuVector<Real>::Data() {
}
}
template<typename Real>
CuVector<Real>& CuVector<Real>::Resize(MatrixIndexT dim) {
void CuVector<Real>::Resize(MatrixIndexT dim) {
if (dim_ == dim) {
// SetZero();
return *this;
SetZero();
return;
}
Destroy();
#if HAVE_CUDA==1
@ -80,8 +75,6 @@ CuVector<Real>& CuVector<Real>::Resize(MatrixIndexT dim) {
dim_ = dim;
SetZero();
return *this;
}
@ -106,10 +99,8 @@ void CuVector<Real>::Destroy() {
template<typename Real>
CuVector<Real>& CuVector<Real>::CopyFromVec(const CuVector<Real> &src) {
Resize(src.Dim());
#if HAVE_CUDA==1
void CuVector<Real>::CopyFromVec(const CuVector<Real> &src) {
#if HAVE_CUDA==1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
cuSafeCall(cudaMemcpy(data_, src.Data(), src.Dim()*sizeof(Real), cudaMemcpyDeviceToDevice));
@ -119,16 +110,13 @@ CuVector<Real>& CuVector<Real>::CopyFromVec(const CuVector<Real> &src) {
{
vec_.CopyFromVec(src.vec_);
}
return *this;
}
template<typename Real>
CuVector<Real>& CuVector<Real>::CopyFromVec(const Vector<Real> &src) {
Resize(src.Dim());
void CuVector<Real>::CopyFromVec(const Vector<Real> &src) {
KALDI_ASSERT(src.Dim() == dim_);
#if HAVE_CUDA==1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
@ -141,16 +129,14 @@ CuVector<Real>& CuVector<Real>::CopyFromVec(const Vector<Real> &src) {
{
vec_.CopyFromVec(src);
}
return *this;
}
template<typename Real>
void CuVector<Real>::CopyToVec(Vector<Real> *dst) const {
if (dst->Dim() != dim_) {
dst->Resize(dim_);
}
KALDI_ASSERT(dst->Dim() == dim_);
#if HAVE_CUDA==1
if (CuDevice::Instantiate().Enabled()) {
@ -177,7 +163,7 @@ void CuVector<Real>::Read(std::istream &is, bool binary) {
template<typename Real>
void CuVector<Real>::Write(std::ostream &os, bool binary) const {
Vector<BaseFloat> tmp;
Vector<BaseFloat> tmp(Dim());
CopyToVec(&tmp);
tmp.Write(os, binary);
}

Просмотреть файл

@ -46,6 +46,16 @@ class CuVector {
Resize(dim);
}
CuVector<Real>(const CuVector<Real> &v): dim_(0), data_(NULL) {
Resize(v.dim_);
CopyFromVec(v);
}
CuVector<Real>(const Vector<Real> &v): dim_(0), data_(NULL) {
Resize(v.Dim());
CopyFromVec(v);
}
/// Destructor
~CuVector() {
Destroy();
@ -61,19 +71,16 @@ class CuVector {
Real* Data();
/// Allocate the memory
ThisType& Resize(MatrixIndexT dim);
/// Deallocate the memory
void Destroy();
void Resize(MatrixIndexT dim);
/// Copy functions (lazy reallocation when needed)
ThisType& CopyFromVec(const CuVector<Real> &src);
ThisType& CopyFromVec(const Vector<Real> &src);
void CopyToVec(Vector<Real> *dst) const;
void CopyFromVec(const CuVector<Real> &src);
void CopyFromVec(const Vector<Real> &src);
void CopyToVec(Vector<Real> *dst) const;
/// I/O
void Read(std::istream &is, bool binary);
void Write(std::ostream &is, bool binary) const;
void Read(std::istream &is, bool binary);
void Write(std::ostream &is, bool binary) const;
/// Math operations
void SetZero();
@ -94,6 +101,7 @@ class CuVector {
}
private:
void Destroy();
MatrixIndexT dim_; ///< dimension of the vector
Real *data_; ///< GPU data pointer
Vector<Real> vec_; ///< non-GPU vector as back-up

Просмотреть файл

@ -234,7 +234,7 @@ class SimpleDecoder {
const Arc &arc = aiter.Value();
if (arc.ilabel == 0) { // propagate nonemitting only...
Token *new_tok = new Token(arc, tok);
if (new_tok->arc_.weight.Value() > cutoff) {
if (new_tok->weight_.Value() > cutoff) {
Token::TokenDelete(new_tok);
} else {
unordered_map<StateId, Token*>::iterator find_iter

View file

@ -213,6 +213,10 @@ void MelBanks::Compute(const VectorBase<BaseFloat> &power_spectrum,
int32 offset = bins_[i].first;
const Vector<BaseFloat> &v (bins_[i].second);
(*mel_energies_out)(i) = VecVec(v, power_spectrum.Range(offset, v.Dim()));
// The following assert was added due to a problem with OpenBlas that
// we had at one point (it was a bug in that library). Just to detect
// it early.
KALDI_ASSERT(!KALDI_ISNAN((*mel_energies_out)(i)));
}
if (debug_) {


@ -9,7 +9,7 @@ BINFILES = compute-mfcc-feats compute-plp-feats compute-fbank-feats \
feat-to-len feat-to-dim fmpe-apply-transform fmpe-acc-stats fmpe-init \
fmpe-est fmpe-copy fmpe-sum-accs append-feats extend-transform-dim \
get-full-lda-mat compute-spectrogram-feats extract-feature-segments \
reverse-feats paste-feats select-feats
reverse-feats paste-feats select-feats subsample-feats
OBJFILES =


@ -1,6 +1,7 @@
// featbin/append-feats.cc
// Copyright 2012 Petr Motlicek; Pawel Swietojanski
// Copyright 2012 Petr Motlicek Pawel Swietojanski
// Johns Hopkins University (author: Daniel Povey)
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -32,15 +33,11 @@ int main(int argc, char *argv[]) {
ParseOptions po(usage);
int32 feats_offset_in1 = 0;
int32 feats_offset_in2 = 0;
int32 num_feats_in1 = 0;
int32 num_feats_in2 = 0;
po.Register("feats-offset-in1", &feats_offset_in1, "Feats 1 offset");
po.Register("num-feats-in1", &num_feats_in1, "Take num-feats from in1-rspeciifier");
po.Register("feats-offset-in2", &feats_offset_in2, "Feats 2 offset");
po.Register("num-feats-in2", &num_feats_in2, "Take num-feats from in2-rspeciifier");
bool truncate_frames = false;
po.Register("truncate-frames", &truncate_frames, "If true, do not treat it "
"as an error when files differ in number of frames, but truncate "
"the longest one.");
po.Read(argc, argv);
@ -53,80 +50,47 @@ int main(int argc, char *argv[]) {
std::string rspecifier2 = po.GetArg(2);
std::string wspecifier = po.GetArg(3);
KALDI_ASSERT(feats_offset_in1 >= 0 && feats_offset_in2 >= 0);
BaseFloatMatrixWriter feats_writer(wspecifier);
SequentialBaseFloatMatrixReader feats_reader1(rspecifier1);
RandomAccessBaseFloatMatrixReader feats_reader2(rspecifier2);
BaseFloatMatrixWriter kaldi_writer(wspecifier);
SequentialBaseFloatMatrixReader kaldi_reader1(rspecifier1);
RandomAccessBaseFloatMatrixReader kaldi_reader2(rspecifier2);
int32 num_done = 0, num_err = 0;
// Peeking in the archives to get the feature dimensions
if (kaldi_reader1.Done()) {
KALDI_ERR << "Could not read any features from " << rspecifier1
<< ". (empty archive?)";
}
std::string utt = kaldi_reader1.Key();
if (!kaldi_reader2.HasKey(utt)) {
KALDI_ERR << "Could not read features for key " << utt << " from "
<< rspecifier2 << ". (empty archive?)";
}
int32 dim_feats_in1 = kaldi_reader1.Value().NumCols();
int32 dim_feats_in2 = kaldi_reader2.Value(utt).NumCols();
if (num_feats_in1 == 0)
num_feats_in1 = dim_feats_in1 - feats_offset_in1;
if (num_feats_in2 == 0)
num_feats_in2 = dim_feats_in2 - feats_offset_in2;
KALDI_LOG << "Reading features from " << rspecifier1 << " and " << rspecifier2;
KALDI_LOG << "\tdim1 = " << dim_feats_in1 << "; offset1 = " << feats_offset_in1
<< "; num1 = " << num_feats_in1 << "; dim2 = " << dim_feats_in2
<< "; offset2 = " << feats_offset_in2 << "; num2 = " << num_feats_in2;
KALDI_ASSERT((feats_offset_in1 + num_feats_in1) <= dim_feats_in1);
KALDI_ASSERT((feats_offset_in2 + num_feats_in2) <= dim_feats_in2);
for (; !kaldi_reader1.Done(); kaldi_reader1.Next()) {
utt = kaldi_reader1.Key();
if (!kaldi_reader2.HasKey(utt)) {
for (; !feats_reader1.Done(); feats_reader1.Next()) {
std::string utt = feats_reader1.Key();
if (!feats_reader2.HasKey(utt)) {
KALDI_WARN << "Could not find features for " << utt << " in "
<< rspecifier2 << ": producing no output for the utterance";
num_err++;
continue;
}
const Matrix<BaseFloat> &feats1 = kaldi_reader1.Value();
const Matrix<BaseFloat> &feats2 = kaldi_reader2.Value(utt);
int32 num_frames = feats1.NumRows();
KALDI_VLOG(1) << "Utterance : " << utt << ": # of frames = " << num_frames;
KALDI_ASSERT(feats1.NumCols() == dim_feats_in1 &&
feats2.NumCols() == dim_feats_in2);
if (num_frames != feats2.NumRows()) {
KALDI_WARN << "Utterance " << utt << ": " << num_frames
<< " frames read from " << rspecifier1 << " and "
<< feats2.NumRows() << " frames read from " << rspecifier2
<< ": producing no output for the utterance";
const Matrix<BaseFloat> &feats1 = feats_reader1.Value();
const Matrix<BaseFloat> &feats2 = feats_reader2.Value(utt);
if (feats1.NumRows() != feats2.NumRows() && !truncate_frames) {
KALDI_WARN << "For utterance " << utt << ", features have different "
<< "#frames " << feats1.NumRows() << " vs. "
<< feats2.NumRows() << ", producing no output (use "
<< "--truncate-frames=true if you want output)";
num_err++;
continue;
}
SubMatrix<BaseFloat> new_feats1(feats1, 0, num_frames, feats_offset_in1,
num_feats_in1);
SubMatrix<BaseFloat> new_feats2(feats2, 0, num_frames, feats_offset_in2,
num_feats_in2);
Matrix<BaseFloat> output_feats(num_frames, new_feats1.NumCols() +
new_feats2.NumCols());
output_feats.Range(0, num_frames, 0,
new_feats1.NumCols()).CopyFromMat(new_feats1);
output_feats.Range(0, num_frames, new_feats1.NumCols(),
new_feats2.NumCols()).CopyFromMat(new_feats2);
kaldi_writer.Write(utt, output_feats);
int32 num_frames = std::min(feats1.NumRows(), feats2.NumRows()),
dim1 = feats1.NumCols(), dim2 = feats2.NumCols();
Matrix<BaseFloat> output(num_frames, dim1 + dim2, kUndefined);
output.Range(0, num_frames, 0, dim1).CopyFromMat(
feats1.Range(0, num_frames, 0, dim1));
output.Range(0, num_frames, dim1, dim2).CopyFromMat(
feats2.Range(0, num_frames, 0, dim2));
feats_writer.Write(utt, output);
num_done++;
}
return 0;
}
catch (const std::exception& e) {
KALDI_LOG << "Appended " << num_done << " feats; " << num_err
<< " with errors.";
return (num_done != 0 ? 0 : 1);
} catch (const std::exception& e) {
std::cerr << e.what();
return -1;
}
}
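For example (illustrative numbers only): if the first archive holds a 500 x 13 matrix for some utterance and the second holds a 498 x 40 matrix, the default behaviour is to warn and skip that utterance, while with --truncate-frames=true both are cut to 498 frames and a 498 x 53 matrix is written.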


@ -21,7 +21,6 @@
#include "feat/feature-mfcc.h"
#include "feat/wave-reader.h"
int main(int argc, char *argv[]) {
try {
using namespace kaldi;


@ -31,10 +31,10 @@ int main(int argc, char *argv[]) {
using namespace std;
const char *usage =
"Select certain dimensions of the feature file; think of it as the unix\n"
"command cut -f ...\n"
"Usage: select-feats selection in-rspecifier out-wspecifier\n"
" e.g. select-feats 0,24-22,3-12 scp:feats.scp ark,scp:feat-red.ark,feat-red.scp\n";
"Select certain dimensions of the feature file; think of it as the unix\n"
"command cut -f ...\n"
"Usage: select-feats selection in-rspecifier out-wspecifier\n"
" e.g. select-feats 0,24-22,3-12 scp:feats.scp ark,scp:feat-red.ark,feat-red.scp\n";
ParseOptions po(usage);


@ -0,0 +1,96 @@
// featbin/subsample-feats.cc
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <sstream>
#include <algorithm>
#include <iterator>
#include <utility>
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "matrix/kaldi-matrix.h"
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
using namespace std;
const char *usage =
"Sub-samples features by taking every n'th frame"
"\n"
"Usage: subsample-feats [options] in-rspecifier out-wspecifier\n"
" e.g. subsample-feats --n=2 ark:- ark:-\n";
ParseOptions po(usage);
int32 n = 1, offset = 0;
po.Register("n", &n, "Take every n'th feature, for this value of n");
po.Register("offset", &offset, "Start with the feature with this offset, "
"then take every n'th feature.");
po.Read(argc, argv);
if (po.NumArgs() != 2) {
po.PrintUsage();
exit(1);
}
string rspecifier = po.GetArg(1);
string wspecifier = po.GetArg(2);
SequentialBaseFloatMatrixReader feat_reader(rspecifier);
BaseFloatMatrixWriter feat_writer(wspecifier);
int32 num_done = 0, num_err = 0;
// process all keys
for (; !feat_reader.Done(); feat_reader.Next()) {
std::string utt = feat_reader.Key();
const Matrix<BaseFloat> feats(feat_reader.Value());
// This code could, of course, be much more efficient; I'm just
// keeping it simple.
int32 num_indexes = 0;
for (int32 k = offset; k < feats.NumRows(); k += n)
num_indexes++; // k is the index.
if (num_indexes == 0) {
KALDI_WARN << "For utterance " << utt << ", output would have no rows, "
<< "producing no output.";
num_err++;
continue;
}
Matrix<BaseFloat> output(num_indexes, feats.NumCols());
int32 i = 0;
for (int32 k = offset; k < feats.NumRows(); k += n, i++) {
SubVector<BaseFloat> src(feats, k), dest(output, i);
dest.CopyFromVec(src);
}
KALDI_ASSERT(i == num_indexes);
feat_writer.Write(utt, output);
num_done++;
}
KALDI_LOG << "Sub-sampled " << num_done << " feats; " << num_err
<< " with errors.";
return (num_done != 0 ? 0 : 1);
} catch(const std::exception &e) {
std::cerr << e.what();
return -1;
}
}
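A quick worked example of the selection loop above (illustrative only): with 10 input frames, --n=3 and --offset=1 the kept frame indices are 1, 4 and 7, so the output has 3 rows. The count also has the closed form sketched below (not part of the tool):
// Closed-form equivalent of the frame-counting loop in subsample-feats.
int32 NumFramesKept(int32 num_rows, int32 offset, int32 n) {
  if (offset >= num_rows) return 0;          // nothing falls inside the range
  return (num_rows - offset + n - 1) / n;    // e.g. (10 - 1 + 3 - 1) / 3 == 3
}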


@ -25,11 +25,11 @@ int main(int argc, char *argv[]) {
using namespace kaldi;
const char *usage =
"Copy a subset of features\n"
"Copy a subset of features (the first n features)\n"
"Usage: subset-feats [options] in-rspecifier out-wspecifier\n";
ParseOptions po(usage);
int32 n = 10;
po.Register("n", &n, "If nonnegative, copy the first n feature files.");


@ -160,11 +160,9 @@ int main(int argc, char *argv[]) {
KALDI_LOG << "Applied transform to " << num_done << " utterances; " << num_error
<< " had errors.";
return 0;
return (num_done != 0 ? 0 : 1);
} catch(const std::exception &e) {
std::cerr << e.what();
return -1;
}
}


@ -27,9 +27,24 @@
namespace kaldi {
inline void cblas_Xscal(const int N, float *X, const int incX, float *Y,
const int incY, const float c, const float s) {
cblas_srot(N, X, incX, Y, incY, c, s);
inline void cblas_Xcopy(const int N, const float *X, const int incX, float *Y,
const int incY) {
cblas_scopy(N, X, incX, Y, incY);
}
inline void cblas_Xcopy(const int N, const double *X, const int incX, double *Y,
const int incY) {
cblas_dcopy(N, X, incX, Y, incY);
}
inline float cblas_Xasum(const int N, const float *X, const int incX) {
return cblas_sasum(N, X, incX);
}
inline double cblas_Xasum(const int N, const double *X, const int incX) {
return cblas_dasum(N, X, incX);
}
inline void cblas_Xrot(const int N, float *X, const int incX, float *Y,
@ -58,11 +73,11 @@ inline void cblas_Xaxpy(const int N, const double alpha, const double *X,
const int incX, double *Y, const int incY) {
cblas_daxpy(N, alpha, X, incX, Y, incY);
}
inline void cblas_Xscal(const int N,const float alpha, float *data,
inline void cblas_Xscal(const int N, const float alpha, float *data,
const int inc) {
cblas_sscal(N, alpha, data, inc);
}
inline void cblas_Xscal(const int N,const double alpha, double *data,
inline void cblas_Xscal(const int N, const double alpha, double *data,
const int inc) {
cblas_dscal(N, alpha, data, inc);
}
@ -226,6 +241,78 @@ inline void cblas_Xsyrk(
cblas_dsyrk(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
dim_c, other_dim_a, alpha, A, a_stride, beta, C, c_stride);
}
/// matrix-vector multiply using a banded matrix; we always call this
/// with b = 1 meaning we're multiplying by a diagonal matrix. This is used for
/// elementwise multiplication. We miss some of the arguments out of this
/// wrapper.
inline void cblas_Xsbmv1(
const MatrixIndexT dim,
const double *A,
const double alpha,
const double *x,
const double beta,
double *y) {
cblas_dsbmv(CblasRowMajor, CblasLower, dim, 0, alpha, A,
1, x, 1, beta, y, 1);
}
inline void cblas_Xsbmv1(
const MatrixIndexT dim,
const float *A,
const float alpha,
const float *x,
const float beta,
float *y) {
cblas_ssbmv(CblasRowMajor, CblasLower, dim, 0, alpha, A,
1, x, 1, beta, y, 1);
}
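Because the bandwidth argument passed to ?sbmv here is zero, the call reduces to a diagonal (i.e. elementwise) multiply; a plain scalar reference of what it computes (sketch only):
// Reference semantics of cblas_Xsbmv1: y = beta * y + alpha * diag(A) * x.
template<typename Real>
void Xsbmv1Reference(const MatrixIndexT dim, const Real *A, const Real alpha,
                     const Real *x, const Real beta, Real *y) {
  for (MatrixIndexT i = 0; i < dim; i++)
    y[i] = beta * y[i] + alpha * A[i] * x[i];
}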
/// This is not really a wrapper for CBLAS as CBLAS does not have this; in future we could
/// extend this somehow.
inline void mul_elements(
const MatrixIndexT dim,
const double *a,
double *b) { // does b *= a, elementwise.
double c1, c2, c3, c4;
MatrixIndexT i;
for (i = 0; i + 4 <= dim; i += 4) {
c1 = a[i] * b[i];
c2 = a[i+1] * b[i+1];
c3 = a[i+2] * b[i+2];
c4 = a[i+3] * b[i+3];
b[i] = c1;
b[i+1] = c2;
b[i+2] = c3;
b[i+3] = c4;
}
for (; i < dim; i++)
b[i] *= a[i];
}
inline void mul_elements(
const MatrixIndexT dim,
const float *a,
float *b) { // does b *= a, elementwise.
float c1, c2, c3, c4;
MatrixIndexT i;
for (i = 0; i + 4 <= dim; i += 4) {
c1 = a[i] * b[i];
c2 = a[i+1] * b[i+1];
c3 = a[i+2] * b[i+2];
c4 = a[i+3] * b[i+3];
b[i] = c1;
b[i+1] = c2;
b[i+2] = c3;
b[i+3] = c4;
}
for (; i < dim; i++)
b[i] *= a[i];
}
// add clapack here
#ifndef HAVE_ATLAS
inline void clapack_Xtptri(KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *result) {


@ -495,6 +495,41 @@ template
void MatrixBase<double>::CopyFromMat(const MatrixBase<double> & M,
MatrixTransposeType Trans);
// Specialize the template for CopyFromSp for float, float.
template<>
template<>
void MatrixBase<float>::CopyFromSp(const SpMatrix<float> & M) {
KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_);
MatrixIndexT num_rows = num_rows_, stride = stride_;
const float *Mdata = M.Data();
float *row_data = data_, *col_data = data_;
for (MatrixIndexT i = 0; i < num_rows; i++) {
cblas_scopy(i+1, Mdata, 1, row_data, 1); // copy to the row.
cblas_scopy(i, Mdata, 1, col_data, stride); // copy to the column.
Mdata += i+1;
row_data += stride;
col_data += 1;
}
}
// Specialize the template for CopyFromSp for double, double.
template<>
template<>
void MatrixBase<double>::CopyFromSp(const SpMatrix<double> & M) {
KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_);
MatrixIndexT num_rows = num_rows_, stride = stride_;
const double *Mdata = M.Data();
double *row_data = data_, *col_data = data_;
for (MatrixIndexT i = 0; i < num_rows; i++) {
cblas_dcopy(i+1, Mdata, 1, row_data, 1); // copy to the row.
cblas_dcopy(i, Mdata, 1, col_data, stride); // copy to the column.
Mdata += i+1;
row_data += stride;
col_data += 1;
}
}
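As a concrete illustration of the packed layout these specializations expand (illustrative 3 x 3 case): the SpMatrix stores its lower triangle row by row, and each packed row is copied once into a row and once into a column of the full matrix:
// packed storage:  a          full symmetric matrix:  a b d
//                  b c                                 b c e
//                  d e f                               d e f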
template<typename Real>
template<typename OtherReal>
void MatrixBase<Real>::CopyFromSp(const SpMatrix<OtherReal> & M) {
@ -711,12 +746,16 @@ void Matrix<Real>::Destroy() {
template<typename Real>
void MatrixBase<Real>::MulElements(const MatrixBase<Real> &a) {
KALDI_ASSERT(a.NumRows() == num_rows_ && a.NumCols() == num_cols_);
MatrixIndexT i;
MatrixIndexT j;
for (i = 0; i < num_rows_; i++) {
for (j = 0; j < num_cols_; j++) {
(*this)(i, j) *= a(i, j);
if (num_cols_ == stride_ && num_cols_ == a.stride_) {
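// Both matrices are laid out contiguously (stride == num_cols), so treat
// them as single arrays of num_rows_ * num_cols_ elements.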
mul_elements(num_rows_ * num_cols_, a.data_, data_);
} else {
MatrixIndexT a_stride = a.stride_, stride = stride_;
Real *data = data_, *a_data = a.data_;
for (MatrixIndexT i = 0; i < num_rows_; i++) {
mul_elements(num_cols_, a_data, data);
a_data += a_stride;
data += stride;
}
}
}
@ -1052,10 +1091,10 @@ bad:
// would not allow its contents to be changed.
template<typename Real>
SubMatrix<Real>::SubMatrix(const MatrixBase<Real> &M,
const MatrixIndexT ro,
const MatrixIndexT r,
const MatrixIndexT co,
const MatrixIndexT c) {
const MatrixIndexT ro,
const MatrixIndexT r,
const MatrixIndexT co,
const MatrixIndexT c) {
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(ro) <
static_cast<UnsignedMatrixIndexT>(M.num_rows_) &&
static_cast<UnsignedMatrixIndexT>(co) <
@ -1985,6 +2024,13 @@ Real MatrixBase<Real>::ApplySoftMax() {
return max + log(sum);
}
template<typename Real>
void MatrixBase<Real>::ApplyTanh() {
for (MatrixIndexT r = 0; r < num_rows_; r++) {
SubVector<Real> v(*this, r);
v.ApplyTanh();
}
}
template<class Real>
template<class OtherReal>


@ -340,6 +340,9 @@ class MatrixBase {
/// Apply soft-max to the collection of all elements of the
/// matrix and return normalizer (log sum of exponentials).
Real ApplySoftMax();
/// Apply the tanh function to each element of the matrix.
void ApplyTanh();
/** Uses Svd to compute the eigenvalue decomposition of a symmetric positive
* semi-definite matrix: (*this) = rP * diag(rS) * rP^T, with rP an
@ -530,7 +533,7 @@ class Matrix : public MatrixBase<Real> {
/// Basic constructor. Sets to zero by default.
/// if set_zero == false, memory contents are undefined.
Matrix(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type = kSetZero) :
Matrix(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type = kSetZero):
MatrixBase<Real>() { Resize(r, c, resize_type); }
/// Swaps the contents of *this and *other. Shallow swap.
@ -553,7 +556,7 @@ class Matrix : public MatrixBase<Real> {
/// It is symmetric, so no option for transpose, and NumRows == Cols
template<typename OtherReal>
explicit Matrix(const SpMatrix<OtherReal> & M) : MatrixBase<Real>() {
Resize(M.NumRows(), M.NumRows());
Resize(M.NumRows(), M.NumRows(), kUndefined);
this->CopyFromSp(M);
}
@ -562,10 +565,10 @@ class Matrix : public MatrixBase<Real> {
explicit Matrix(const TpMatrix<OtherReal> & M,
MatrixTransposeType trans = kNoTrans) : MatrixBase<Real>() {
if (trans == kNoTrans) {
Resize(M.NumRows(), M.NumCols());
Resize(M.NumRows(), M.NumCols(), kUndefined);
this->CopyFromTp(M);
} else {
Resize(M.NumCols(), M.NumRows());
Resize(M.NumCols(), M.NumRows(), kUndefined);
this->CopyFromTp(M, kTrans);
}
}
@ -584,9 +587,6 @@ class Matrix : public MatrixBase<Real> {
/// Destructor to free matrices.
~Matrix() { Destroy(); }
/// Deallocates memory and sets to empty matrix.
void Destroy();
/// Sets matrix to a specified size (zero is OK as long as both r and c are
/// zero). The value of the new data depends on resize_type:
/// -if kSetZero, the new data will be zero
@ -601,9 +601,8 @@ class Matrix : public MatrixBase<Real> {
/// Assignment operator that takes MatrixBase.
Matrix<Real> &operator = (const MatrixBase<Real> &other) {
if (MatrixBase<Real>::NumRows() != other.NumRows() ||
MatrixBase<Real>::NumCols() != other.NumCols()) {
Resize(other.NumRows(), other.NumCols());
}
MatrixBase<Real>::NumCols() != other.NumCols())
Resize(other.NumRows(), other.NumCols(), kUndefined);
MatrixBase<Real>::CopyFromMat(other);
return *this;
}
@ -611,15 +610,17 @@ class Matrix : public MatrixBase<Real> {
/// Assignment operator. Needed for inclusion in std::vector.
Matrix<Real> &operator = (const Matrix<Real> &other) {
if (MatrixBase<Real>::NumRows() != other.NumRows() ||
MatrixBase<Real>::NumCols() != other.NumCols()) {
Resize(other.NumRows(), other.NumCols());
}
MatrixBase<Real>::NumCols() != other.NumCols())
Resize(other.NumRows(), other.NumCols(), kUndefined);
MatrixBase<Real>::CopyFromMat(other);
return *this;
}
private:
/// Deallocates memory and sets to empty matrix (dimension 0, 0).
void Destroy();
/// Init assumes the current class contents are invalid (i.e. junk or have
/// already been freed), and it sets the matrix to newly allocated memory with
/// the specified number of rows and columns. r == c == 0 is acceptable. The data
@ -677,8 +678,8 @@ class SubMatrix : public MatrixBase<Real> {
const MatrixIndexT ro, // row offset, 0 < ro < NumRows()
const MatrixIndexT r, // number of rows, r > 0
const MatrixIndexT co, // column offset, 0 < co < NumCols()
const MatrixIndexT c); // number of columns, c > 0
const MatrixIndexT c); // number of columns, c > 0
~SubMatrix<Real>() {}
/// This type of constructor is needed for Range() to work [in Matrix base


@ -375,7 +375,13 @@ template
void VectorBase<double>::CopyRowFromSp(const SpMatrix<double> &mat, MatrixIndexT row);
// takes elements to a power. Throws exception if could not (but only for power != 1 ad power != 2).
#ifdef HAVE_MKL
template<>
void VectorBase<float>::ApplyPow(float power) { vsPowx(dim_, data_, power, data_); }
template<>
void VectorBase<double>::ApplyPow(double power) { vdPowx(dim_, data_, power, data_); }
#else
// takes elements to a power. Throws exception if could not (but only for power != 1 and power != 2).
template<typename Real>
void VectorBase<Real>::ApplyPow(Real power) {
if (power == 1.0) return;
@ -399,6 +405,7 @@ void VectorBase<Real>::ApplyPow(Real power) {
}
}
}
#endif
// Computes the p-th norm. Throws exception if could not.
template<typename Real>
@ -534,14 +541,13 @@ template<typename Real>
void VectorBase<Real>::AddRowSumMat(Real alpha, const MatrixBase<Real> &M, Real beta) {
// note the double accumulator
KALDI_ASSERT(dim_ == M.NumCols());
MatrixIndexT num_rows = M.NumRows(), stride = M.Stride();
for (MatrixIndexT i = 0; i < dim_; i++) {
double sum = 0.0;
const Real *src = M.Data() + i;
for (MatrixIndexT j = 0; j < num_rows; j++)
sum += src[j*stride];
data_[i] = alpha * sum + beta * data_[i];
}
MatrixIndexT num_rows = M.NumRows(), stride = M.Stride(), dim = dim_;
Real *data = data_;
cblas_Xscal(dim, beta, data, 1);
const Real *m_data = M.Data();
for (MatrixIndexT i = 0; i < num_rows; i++, m_data += stride)
cblas_Xaxpy(dim, alpha, m_data, 1, data, 1);
}
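For reference, the rewritten body above computes (*this) = beta * (*this) + alpha * (sum of the rows of M), using one scal call plus one axpy per row; a scalar sketch of the same operation (ignoring the precision of the accumulator):
// Scalar reference for AddRowSumMat (sketch only).
template<typename Real>
void AddRowSumMatReference(Real alpha, const MatrixBase<Real> &M, Real beta,
                           VectorBase<Real> *v) {
  for (MatrixIndexT i = 0; i < M.NumCols(); i++) {
    Real sum = 0.0;
    for (MatrixIndexT r = 0; r < M.NumRows(); r++) sum += M(r, i);
    (*v)(i) = beta * (*v)(i) + alpha * sum;
  }
}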
template<typename Real>
@ -651,6 +657,25 @@ Real VectorBase<Real>::ApplySoftMax() {
return max + log(sum);
}
#ifdef HAVE_MKL
template<>
void VectorBase<float>::ApplyTanh() { vsTanh(dim_, data_, data_); }
template<>
void VectorBase<double>::ApplyTanh() { vdTanh(dim_, data_, data_); }
#else
template<typename Real>
void VectorBase<Real>::ApplyTanh() {
for (MatrixIndexT i = 0; i < dim_; i++) {
Real x = data_[i];
if (x > 0.0) {
x = -1.0 + 2.0 / (1.0 + exp(-2.0 * x));
} else {
x = 1.0 - 2.0 / (1.0 + exp(2.0 * x));
}
data_[i] = x;
}
}
#endif
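The two branches above are the same function written so that exp() is only ever called with a non-positive argument (avoiding overflow): tanh(x) = -1 + 2 / (1 + e^{-2x}) for x > 0 and tanh(x) = 1 - 2 / (1 + e^{2x}) for x <= 0. A small standalone check of the identity (sketch only):
#include <cmath>
#include <cassert>
void CheckTanhBranches() {
  for (double x = -5.0; x <= 5.0; x += 0.5) {
    double y = (x > 0.0) ? -1.0 + 2.0 / (1.0 + std::exp(-2.0 * x))
                         : 1.0 - 2.0 / (1.0 + std::exp(2.0 * x));
    assert(std::fabs(y - std::tanh(x)) < 1e-12);  // both branches equal tanh(x)
  }
}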
template<typename Real>
void VectorBase<Real>::Add(Real c) {


@ -126,6 +126,10 @@ class VectorBase {
/// This is the same as: \f$ x(i) = exp(x(i)) / \sum_i exp(x(i)) \f$
Real ApplySoftMax();
/// Apply the tanh function to each element of a vector. If using MKL, does
/// it using the "less accurate" options.
void ApplyTanh();
/// Take all elements of vector to a power.
void ApplyPow(Real power);
@ -322,20 +326,20 @@ class Vector: public VectorBase<Real> {
/// Copy constructor. The need for this is controversial.
Vector(const Vector<Real> &v) : VectorBase<Real>() { // (cannot be explicit)
Resize(v.Dim());
Resize(v.Dim(), kUndefined);
this->CopyFromVec(v);
}
/// Copy-constructor from base-class, needed to copy from SubVector.
explicit Vector(const VectorBase<Real> &v) : VectorBase<Real>() {
Resize(v.Dim());
Resize(v.Dim(), kUndefined);
this->CopyFromVec(v);
}
/// Type conversion constructor.
template<typename OtherReal>
explicit Vector(const VectorBase<OtherReal> &v): VectorBase<Real>() {
Resize(v.Dim());
Resize(v.Dim(), kUndefined);
this->CopyFromVec(v);
}
@ -372,14 +376,14 @@ class Vector: public VectorBase<Real> {
/// Assignment operator, protected so it can only be used by std::vector
Vector<Real> &operator = (const Vector<Real> &other) {
Resize(other.Dim());
Resize(other.Dim(), kUndefined);
this->CopyFromVec(other);
return *this;
}
/// Assignment operator that takes VectorBase.
Vector<Real> &operator = (const VectorBase<Real> &other) {
Resize(other.Dim());
Resize(other.Dim(), kUndefined);
this->CopyFromVec(other);
return *this;
}


@ -679,6 +679,28 @@ template<class Real> static void UnitTestAxpy() {
}
}
template<class Real> static void UnitTestCopySp() {
// Checking that the various versions of copying
// matrix to SpMatrix work the same in the symmetric case.
for (MatrixIndexT iter = 0;iter < 5;iter++) {
int32 dim = 5 + rand() % 10;
SpMatrix<Real> S(dim), T(dim);
S.SetRandn();
Matrix<Real> M(S);
T.CopyFromMat(M, kTakeMeanAndCheck);
AssertEqual(S, T);
T.SetZero();
T.CopyFromMat(M, kTakeMean);
AssertEqual(S, T);
T.SetZero();
T.CopyFromMat(M, kTakeLower);
AssertEqual(S, T);
T.SetZero();
T.CopyFromMat(M, kTakeUpper);
AssertEqual(S, T);
}
}
template<class Real> static void UnitTestPower() {
for (MatrixIndexT iter = 0;iter < 5;iter++) {
@ -1415,7 +1437,7 @@ template<class Real> static void UnitTestInverse() {
template<class Real> static void UnitTestMulElements() {
for (MatrixIndexT iter = 0;iter < 5;iter++) {
for (MatrixIndexT iter = 0; iter < 5; iter++) {
MatrixIndexT dimM = 20 + rand()%10, dimN = 20 + rand()%10;
Matrix<Real> A(dimM, dimN), B(dimM, dimN), C(dimM, dimN);
InitRand(&A);
@ -1430,6 +1452,7 @@ template<class Real> static void UnitTestMulElements() {
}
}
template<class Real> static void UnitTestSpLogExp() {
for (MatrixIndexT i = 0; i < 5; i++) {
MatrixIndexT dimM = 10 + rand() % 10;
@ -1860,6 +1883,27 @@ template<class Real> static void UnitTestLimitCond() {
}
}
template<class Real> static void UnitTestTanh() {
for (MatrixIndexT i = 0; i < 10; i++) {
MatrixIndexT dimM = 5 + rand() % 10, dimN = 5 + rand() % 10;
Matrix<Real> M(dimM, dimN);
Matrix<Real> N(M);
for(int32 r = 0; r < dimM; r++) {
for (int32 c = 0; c < dimN; c++) {
Real x = N(r, c);
if (x > 0.0) {
x = -1.0 + 2.0 / (1.0 + exp(-2.0 * x));
} else {
x = 1.0 - 2.0 / (1.0 + exp(2.0 * x));
}
N(r, c) = x;
}
}
M.ApplyTanh();
AssertEqual(M, N);
}
}
template<class Real> static void UnitTestSimple() {
for (MatrixIndexT i = 0;i < 5;i++) {
MatrixIndexT dimM = 20 + rand()%10, dimN = 20 + rand()%20;
@ -3541,6 +3585,7 @@ template<class Real> static void MatrixUnitTest(bool full_test) {
UnitTestDotprod<Real>();
// UnitTestSvdVariants<Real>();
UnitTestPower<Real>();
UnitTestCopySp<Real>();
UnitTestDeterminant<Real>();
KALDI_LOG << " Point F";
UnitTestDeterminantSign<Real>();
@ -3566,6 +3611,7 @@ template<class Real> static void MatrixUnitTest(bool full_test) {
UnitTestRange<Real>();
UnitTestSimpleForVec<Real>();
UnitTestSimpleForMat<Real>();
UnitTestTanh<Real>();
UnitTestNorm<Real>();
UnitTestMul<Real>();
KALDI_LOG << " Point I";


@ -169,9 +169,17 @@ void SpMatrix<Real>::CopyFromMat(const MatrixBase<Real> &M,
break;
}
case kTakeLower:
for (MatrixIndexT i = 0; i < D; i++)
for (MatrixIndexT j = 0; j <= i; j++)
(*this)(i, j) = M(i, j);
{ // making this one a bit more efficient.
const Real *src = M.Data();
Real *dest = this->data_;
MatrixIndexT stride = M.Stride();
for (MatrixIndexT i = 0; i < D; i++) {
for (MatrixIndexT j = 0; j <= i; j++)
dest[j] = src[j];
dest += i + 1;
src += stride;
}
}
break;
case kTakeUpper:
for (MatrixIndexT i = 0; i < D; i++)


@ -10,7 +10,7 @@ OBJFILES = nnet-component.o nnet-nnet.o nnet-update.o train-nnet.o \
nnet-randomize.o nnet-compute.o am-nnet.o nnet-functions.o \
nnet-precondition.o shrink-nnet.o combine-nnet.o combine-nnet-a.o \
mixup-nnet.o nnet-lbfgs.o nnet-update-parallel.o combine-nnet-fast.o \
nnet-fix.o
nnet-fix.o nnet-stats.o rescale-nnet.o nnet-limit-rank.o
#nnet-compute.o nnet-train.o
# nnet-nnet.o nnet-loss.o nnet-rnnlm.o


@ -183,6 +183,23 @@ void UnitTestSigmoidComponent() {
}
}
void UnitTestReduceComponent() {
// We're testing that the input gradients are computed correctly
// (ReduceComponent has no trainable parameters).
int32 input_dim = 10 + rand() % 50, n = 1 + rand() % 3;
{
ReduceComponent reduce_component(input_dim, n);
UnitTestGenericComponentInternal(reduce_component);
}
{
ReduceComponent reduce_component;
reduce_component.InitFromString("dim=15 n=3");
UnitTestGenericComponentInternal(reduce_component);
}
}
template<class T>
void UnitTestGenericComponent() { // works if it has an initializer from int,
// e.g. tanh, sigmoid.
@ -463,6 +480,8 @@ int main() {
UnitTestGenericComponent<TanhComponent>();
UnitTestGenericComponent<PermuteComponent>();
UnitTestGenericComponent<SoftmaxComponent>();
UnitTestSigmoidComponent();
UnitTestReduceComponent();
UnitTestAffineComponent();
UnitTestAffinePreconInputComponent();
UnitTestBlockAffineComponent();


@ -47,6 +47,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) {
ans = new TanhComponent();
} else if (component_type == "SoftmaxComponent") {
ans = new SoftmaxComponent();
} else if (component_type == "ReduceComponent") {
ans = new ReduceComponent();
} else if (component_type == "AffineComponent") {
ans = new AffineComponent();
} else if (component_type == "AffineComponentA") {
@ -407,20 +409,8 @@ void TanhComponent::Propagate(const MatrixBase<BaseFloat> &in,
// Apply tanh function to each element of the output...
// the tanh function may be written as -1 + ( 2 / (1 + e^{-2 x})),
// which is a scaled and shifted sigmoid.
out->Resize(in.NumRows(), in.NumCols());
int32 num_rows = in.NumRows(), num_cols = in.NumCols();
for(int32 r = 0; r < num_rows; r++) {
const BaseFloat *in_data = in.RowData(r),
*in_data_end = in_data + num_cols;
BaseFloat *out_data = out->RowData(r);
for (; in_data != in_data_end; ++in_data, ++out_data) {
if (*in_data > 0.0) {
*out_data = -1.0 + 2.0 / (1.0 + exp(-2.0 * *in_data));
} else {
*out_data = 1.0 - 2.0 / (1.0 + exp(2.0 * *in_data));
}
}
}
*out = in;
out->ApplyTanh();
}
void TanhComponent::Backprop(const MatrixBase<BaseFloat> &, // in_value
@ -502,6 +492,67 @@ void SoftmaxComponent::Backprop(const MatrixBase<BaseFloat> &, // in_value
}
}
void ReduceComponent::InitFromString(std::string args) {
std::string orig_args(args);
int32 dim, n;
bool ok = ParseFromString("dim", &args, &dim) &&
ParseFromString("n", &args, &n);
if (!args.empty())
KALDI_ERR << "Could not process these elements in initializer: "
<< args;
if (!ok)
KALDI_ERR << "Bad initializer " << orig_args;
Init(dim, n);
}
void ReduceComponent::Read(std::istream &is, bool binary) {
ExpectOneOrTwoTokens(is, binary, "<ReduceComponent>", "<Dim>");
ReadBasicType(is, binary, &dim_);
ExpectToken(is, binary, "<N>");
ReadBasicType(is, binary, &n_);
ExpectToken(is, binary, "</ReduceComponent>");
}
void ReduceComponent::Write(std::ostream &os, bool binary) const {
WriteToken(os, binary, "<ReduceComponent>");
WriteToken(os, binary, "<Dim>");
WriteBasicType(os, binary, dim_);
WriteToken(os, binary, "<N>");
WriteBasicType(os, binary, n_);
WriteToken(os, binary, "</ReduceComponent>");
}
void ReduceComponent::Propagate(const MatrixBase<BaseFloat> &in,
int32 num_chunks,
Matrix<BaseFloat> *out) const {
KALDI_ASSERT(in.NumRows() > 0 && in.NumCols() == InputDim());
out->Resize(in.NumRows(), OutputDim());
int32 num_frames = in.NumRows(), input_dim = in.NumCols(), n = n_;
for (int32 r = 0; r < num_frames; r++) {
const BaseFloat *src = in.RowData(r);
BaseFloat *dest = out->RowData(r);
for (int32 c = 0; c < input_dim; c++)
dest[c / n] += src[c];
}
}
void ReduceComponent::Backprop(const MatrixBase<BaseFloat> &, // in_value,
const MatrixBase<BaseFloat> &, // out_value,
const MatrixBase<BaseFloat> &out_deriv,
int32, // num_chunks
Component *, // to_update
Matrix<BaseFloat> *in_deriv) const {
int32 num_frames = out_deriv.NumRows(),
input_dim = InputDim(), n = n_;
in_deriv->Resize(num_frames, input_dim, kUndefined);
for (int32 r = 0; r < num_frames; r++) {
const BaseFloat *src = out_deriv.RowData(r);
BaseFloat *dest = in_deriv->RowData(r);
for (int32 c = 0; c < input_dim; c++)
dest[c] = src[c / n];
}
}
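A small worked example of the two loops above (illustrative; it uses only the constructor and the Propagate signature shown in this diff): with dim = 5 and n = 2 the output dimension is (5 + 2 - 1) / 2 = 3, each output sums up to n consecutive inputs, and Backprop copies each output derivative back to all of its inputs.
void ReduceComponentExample() {
  ReduceComponent rc(5, 2);               // InputDim() == 5, OutputDim() == 3
  Matrix<BaseFloat> in(1, 5), out;
  for (int32 c = 0; c < 5; c++) in(0, c) = c + 1;   // input row [1 2 3 4 5]
  rc.Propagate(in, 1, &out);              // output row [1+2, 3+4, 5] == [3 7 5]
}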
void AffineComponent::Scale(BaseFloat scale) {
linear_params_.Scale(scale);
bias_params_.Scale(scale);
@ -857,11 +908,11 @@ void AffineComponentPreconditioned::Update(
// Add the 1.0 at the end of each row "in_value_temp"
for (int32 i = 0; i < in_value.NumRows(); i++)
in_value_temp(i, in_value.NumCols()) = 1.0;
Matrix<BaseFloat> in_value_precon(in_value_temp.NumRows(),
in_value_temp.NumCols()),
in_value_temp.NumCols(), kUndefined),
out_deriv_precon(out_deriv.NumRows(),
out_deriv.NumCols());
out_deriv.NumCols(), kUndefined);
// each row of in_value_precon will be that same row of
// in_value, but multiplied by the inverse of a Fisher
// matrix that has been estimated from all the other rows,


@ -225,6 +225,8 @@ class NonlinearComponent: public Component {
void Scale(BaseFloat scale);
void Add(BaseFloat alpha, const NonlinearComponent &other);
// The following functions are unique to NonlinearComponent.
// They mostly relate to diagnostics.
const Vector<double> &ValueSum() const { return value_sum_; }
const Vector<double> &DerivSum() const { return deriv_sum_; }
double Count() const { return count_; }
@ -324,6 +326,37 @@ class SoftmaxComponent: public NonlinearComponent {
SoftmaxComponent &operator = (const SoftmaxComponent &other); // Disallow.
};
/// This layer just sums up groups of n inputs to produce one output.
class ReduceComponent: public Component {
public:
void Init(int32 dim, int32 n) { KALDI_ASSERT(dim > 0 && n > 0); dim_ = dim; n_ = n; }
ReduceComponent(int32 dim, int32 n) { Init(dim, n); }
ReduceComponent(): dim_(0), n_(0) { } // e.g. prior to Read()
explicit ReduceComponent(const ReduceComponent &other):
dim_(other.dim_), n_(other.n_) {}
virtual Component* Copy() const { return new ReduceComponent(*this); }
virtual std::string Type() const { return "ReduceComponent"; }
virtual int32 InputDim() const { return dim_; }
virtual int32 OutputDim() const { return (dim_ + n_ - 1) / n_; }
virtual void InitFromString(std::string args);
virtual void Read(std::istream &is, bool binary);
virtual void Write(std::ostream &os, bool binary) const;
virtual void Propagate(const MatrixBase<BaseFloat> &in,
int32 num_chunks,
Matrix<BaseFloat> *out) const;
virtual void Backprop(const MatrixBase<BaseFloat> &in_value,
const MatrixBase<BaseFloat> &out_value,
const MatrixBase<BaseFloat> &out_deriv,
int32 num_chunks,
Component *to_update, // may be identical to "this".
Matrix<BaseFloat> *in_deriv) const;
virtual bool BackpropNeedsInput() const { return false; }
virtual bool BackpropNeedsOutput() const { return false; }
private:
int32 dim_;
int32 n_;
};
// Affine means a linear function plus an offset.
// Note: although this class can be instantiated, it also

Просмотреть файл

@ -0,0 +1,108 @@
// nnet/nnet-limit-rank.cc
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "nnet-cpu/nnet-limit-rank.h"
#include "thread/kaldi-task-sequence.h"
namespace kaldi {
class LimitRankClass {
public:
LimitRankClass(const NnetLimitRankOpts &opts,
int32 c,
Nnet *nnet): opts_(opts), c_(c), nnet_(nnet) { }
void operator () () {
AffineComponent *ac = dynamic_cast<AffineComponent*>(
&(nnet_->GetComponent(c_)));
KALDI_ASSERT(ac != NULL);
// We'll limit the rank of just the linear part, keeping the bias vector full.
Matrix<BaseFloat> M (ac->LinearParams());
int32 rows = M.NumRows(), cols = M.NumCols(), rc_min = std::min(rows, cols);
Vector<BaseFloat> s(rc_min);
Matrix<BaseFloat> U(rows, rc_min), Vt(rc_min, cols);
// Do the destructive svd M = U diag(s) V^T. It actually outputs the transpose of V.
M.DestructiveSvd(&s, &U, &Vt);
SortSvd(&s, &U, &Vt); // Sort the singular values from largest to smallest.
int32 d = GetRetainedDim(rows, cols);
BaseFloat old_svd_sum = s.Sum();
U.Resize(rows, d, kCopyData);
s.Resize(d, kCopyData);
Vt.Resize(d, cols, kCopyData);
BaseFloat new_svd_sum = s.Sum();
KALDI_LOG << "For component " << c_ << " of dimension " << rows
<< " x " << cols << ", reduced rank from "
<< rc_min << " to " << d << ", SVD sum reduced from "
<< old_svd_sum << " to " << new_svd_sum;
Vt.MulRowsVec(s); // Vt <-- diag(s) Vt.
M.AddMatMat(1.0, U, kNoTrans, Vt, kNoTrans, 0.0); // Reconstruct with reduced
// rank.
Vector<BaseFloat> bias_params(ac->BiasParams());
ac->SetParams(bias_params, M);
}
int32 GetRetainedDim(int32 rows, int32 cols) {
if (opts_.parameter_proportion <= 0.0 || opts_.parameter_proportion > 1.0)
KALDI_ERR << "bad --parameter-proportion " << opts_.parameter_proportion;
// If we do SVD to dimension d, so that it's U diag(s) V^T where
// U is rows * d, s is d, and V is cols * d, then the #params is as follows...
// the first column of U has free parameters (#rows - 1) [the -1 is due to
// the length constraint]; the second has (#rows - 2) [subtract 1 for the
// length constraint and one for orthogonality with the previous row], etc.
// Total is params(U) = (rows * d) - ((d(d+1))/2),
// params(s) = d,
// params(V) = (cols * d) - ((d(d+1))/2),
// So total is (rows + cols) * d - d * d .
// For example, if d = #rows, this equals (#rows * #cols)
// We are solving for:
// (rows * cols) * parameter_proportion = (rows + cols) * d - d * d, or
// d^2 - d * (rows + cols) + (rows*cols)*parameter_proportion
// In quadratic equation
// a = 1.0,
// b = -(rows + cols)
// c = rows * cols * parameter_proportion.
// Take smaller solution.
BaseFloat a = 1.0, b = -(rows + cols),
c = rows * cols * opts_.parameter_proportion;
BaseFloat x = (-b - sqrt(b * b - 4 * a * c)) / (2.0 * a);
int32 ans = static_cast<int32>(x);
KALDI_ASSERT(ans > 0 && ans <= std::min(rows, cols));
return ans;
}
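A worked instance of the quadratic above (illustrative numbers only):
// rows = 1000, cols = 500, parameter_proportion = 0.75:
//   a = 1, b = -(1000 + 500) = -1500, c = 1000 * 500 * 0.75 = 375000
//   d = (1500 - sqrt(1500^2 - 4 * 375000)) / 2 = (1500 - 866.03) / 2 ~= 316.99 -> 316
//   check: (1000 + 500) * 316 - 316 * 316 = 374144 ~= 0.75 * 500000 parameters.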
~LimitRankClass() { }
private:
const NnetLimitRankOpts &opts_;
int32 c_;
Nnet *nnet_;
};
void LimitRankParallel(const NnetLimitRankOpts &opts,
Nnet *nnet) {
TaskSequencerConfig task_config;
task_config.num_threads = opts.num_threads;
TaskSequencer<LimitRankClass> tc(task_config);
for (int32 c = 0; c < nnet->NumComponents(); c++) {
if (dynamic_cast<AffineComponent*>(&(nnet->GetComponent(c))) != NULL)
tc.Run(new LimitRankClass(opts, c, nnet));
}
}
} // namespace


@ -0,0 +1,56 @@
// nnet-cpu/nnet-limit-rank.h
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_NNET_CPU_NNET_LIMIT_RANK_H_
#define KALDI_NNET_CPU_NNET_LIMIT_RANK_H_
#include "nnet-cpu/nnet-nnet.h"
#include "util/table-types.h"
#include "thread/kaldi-semaphore.h"
#include "thread/kaldi-thread.h"
#include "nnet-cpu/nnet-update.h"
namespace kaldi {
struct NnetLimitRankOpts {
int32 num_threads;
BaseFloat parameter_proportion;
NnetLimitRankOpts(): num_threads(1), parameter_proportion(0.75) { }
void Register(ParseOptions *po) {
po->Register("num-threads", &num_threads, "Number of threads used for "
"rank-limiting operation; note, will never use more than "
"#layers.");
po->Register("parameter-proportion", &parameter_proportion, "Proportion of "
"dimension of each transform to limit the rank to.");
}
};
/// This function limits the rank of each affine transform in the
/// neural net, by zeroing out the smallest singular values. The number of
/// singular values to zero out is determined on a layer by layer basis, using
/// "parameter_proportion" to set the proportion of parameters to remove.
void LimitRankParallel(const NnetLimitRankOpts &opts,
Nnet *nnet);
} // namespace
#endif // KALDI_NNET_CPU_NNET_LIMIT_RANK_H_


@ -361,6 +361,17 @@ void Nnet::RemoveDropout() {
KALDI_LOG << "Removed " << removed << " dropout components.";
}
void Nnet::RemovePreconditioning() {
for (size_t i = 0; i < components_.size(); i++) {
if (dynamic_cast<AffineComponentPreconditioned*>(components_[i]) != NULL) {
AffineComponent *ac = new AffineComponent(
*(dynamic_cast<AffineComponent*>(components_[i])));
delete components_[i];
components_[i] = ac;
}
}
}
void Nnet::AddNnet(const VectorBase<BaseFloat> &scale_params,
const Nnet &other) {
KALDI_ASSERT(scale_params.Dim() == this->NumUpdatableComponents());


@ -104,6 +104,10 @@ class Nnet {
/// Excise any components of type DropoutComponent.
void RemoveDropout();
/// Replace any components of type AffineComponentPreconditioned with
/// components of type AffineComponent.
void RemovePreconditioning();
/// For each updatatable component, adds to it
/// the corresponding element of "other" times the


@ -25,7 +25,13 @@ void PreconditionDirections(const MatrixBase<BaseFloat> &R,
MatrixBase<BaseFloat> *P) {
int32 N = R.NumRows(), D = R.NumCols();
KALDI_ASSERT(SameDim(R, *P) && N > 1);
KALDI_ASSERT(SameDim(R, *P) && N > 0);
if (N == 1) {
KALDI_WARN << "Trying to precondition set of only one frames: returning "
<< "unchanged. Ignore this warning if infrequent.";
P->CopyFromMat(R);
return;
}
MatrixBase<BaseFloat> &Q = *P;
if (N >= D) {

src/nnet-cpu/nnet-stats.h (new file, 93 lines added)

@ -0,0 +1,93 @@
// nnet-cpu/nnet-stats.h
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_NNET_CPU_NNET_STATS_H_
#define KALDI_NNET_CPU_NNET_STATS_H_
#include "nnet-cpu/nnet-nnet.h"
namespace kaldi {
/* This header supports computing various statistics from a neural net. These are
summaries of certain quantities already present in the network as
stored on disk, especially regarding certain average values and
derivatives of the sigmoids.
*/
struct NnetStatsConfig {
BaseFloat bucket_width;
NnetStatsConfig(): bucket_width(0.025) { }
void Register(ParseOptions *po) {
po->Register("bucket-width", &bucket_width, "Width of bucket in average-derivative "
"stats for analysis.");
}
};
class NnetStats {
public:
NnetStats(int32 affine_component_index, BaseFloat bucket_width):
affine_component_index_(affine_component_index),
bucket_width_(bucket_width), global_(0, -1) { }
// Use default copy constructor and assignment operator.
void AddStats(BaseFloat avg_deriv, BaseFloat avg_value);
void AddStatsFromNnet(const Nnet &nnet);
void PrintStats(std::ostream &os);
private:
struct StatsElement {
BaseFloat deriv_begin; // avg-deriv, beginning of bucket.
BaseFloat deriv_end; // avg-deriv, end of bucket.
BaseFloat deriv_sum; // sum of avg-deriv within bucket.
BaseFloat deriv_sumsq; // Sum-squared of avg-deriv within bucket.
BaseFloat abs_value_sum; // Sum of abs(avg-value). Tells us whether it's
// saturating at one or both ends.
BaseFloat abs_value_sumsq; // Sum-squared of abs(avg-value).
int32 count; // Number of nonlinearities in this bucket.
StatsElement(BaseFloat deriv_begin,
BaseFloat deriv_end):
deriv_begin(deriv_begin), deriv_end(deriv_end), deriv_sum(0.0),
deriv_sumsq(0.0), abs_value_sum(0.0), abs_value_sumsq(0.0), count(0) { }
void AddStats(BaseFloat avg_deriv, BaseFloat avg_value);
// Outputs stats for this bucket; no newline
void PrintStats(std::ostream &os);
};
int32 BucketFor(BaseFloat avg_deriv); // returns the bucket
// for this avg-derivative value, and makes sure it is allocated.
int32 affine_component_index_; // Component index of the affine component
// associated with this nonlinearity.
BaseFloat bucket_width_; // width of buckets of stats we store (in derivative values).
std::vector<StatsElement> buckets_; // Stats divided into buckets by avg_deriv.
StatsElement global_; // All the stats.
};
void GetNnetStats(const NnetStatsConfig &config,
const Nnet &nnet,
std::vector<NnetStats> *stats);
} // namespace
#endif // KALDI_NNET_CPU_NNET_STATS_H_
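A hedged sketch of what AddStatsFromNnet presumably does for one nonlinearity, using only the ValueSum()/DerivSum()/Count() accessors added to NonlinearComponent elsewhere in this commit (the actual nnet-stats.cc is not shown in this diff, so the helper below is illustrative):
void AddStatsFromNonlinearity(const NonlinearComponent &nc, NnetStats *stats) {
  double count = nc.Count();
  if (count <= 0.0) return;                        // no frames accumulated yet
  for (int32 d = 0; d < nc.ValueSum().Dim(); d++) {
    BaseFloat avg_value = nc.ValueSum()(d) / count,   // average activation
              avg_deriv = nc.DerivSum()(d) / count;   // average derivative
    stats->AddStats(avg_deriv, avg_value);         // bucketed by avg_deriv
  }
}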


@ -44,7 +44,6 @@ class NnetUpdater {
// Possibly splices input together from forward_data_[component].
// MatrixBase<BaseFloat> &GetSplicedInput(int32 component, Matrix<BaseFloat> *temp_matrix);
void Propagate();
/// Computes objective function and derivative at output layer.
@ -153,10 +152,10 @@ void NnetUpdater::Backprop(const std::vector<NnetTrainingExample> &data,
&output = forward_data_[c+1];
Matrix<BaseFloat> input_deriv(input.NumRows(), input.NumCols());
const Matrix<BaseFloat> &output_deriv(*deriv);
component.Backprop(input, output, output_deriv, num_chunks,
component_to_update, &input_deriv);
*deriv = input_deriv;
input_deriv.Swap(deriv);
}
}


@ -0,0 +1,212 @@
// nnet-cpu/rescale-nnet.cc
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "nnet-cpu/rescale-nnet.h"
namespace kaldi {
class NnetRescaler {
public:
NnetRescaler(const NnetRescaleConfig &config,
const std::vector<NnetTrainingExample> &examples,
Nnet *nnet):
config_(config), examples_(examples), nnet_(nnet) {}
void Rescale();
private:
/// takes the input and formats as a single matrix, in forward_data_[0].
void FormatInput(const std::vector<NnetTrainingExample> &data,
Matrix<BaseFloat> *input);
void RescaleComponent(int32 c, int32 num_chunks,
MatrixBase<BaseFloat> *cur_data_in,
Matrix<BaseFloat> *next_data);
void ComputeRelevantIndexes();
BaseFloat GetTargetAvgDeriv(int32 c);
const NnetRescaleConfig &config_;
const std::vector<NnetTrainingExample> &examples_;
Nnet *nnet_;
std::set<int32> relevant_indexes_; // values of c with AffineComponent followed
// by (at c+1) NonlinearComponent that is not SoftmaxComponent.
};
void NnetRescaler::FormatInput(const std::vector<NnetTrainingExample> &data,
Matrix<BaseFloat> *input) {
KALDI_ASSERT(data.size() > 0);
int32 num_splice = nnet_->LeftContext() + 1 + nnet_->RightContext();
KALDI_ASSERT(data[0].input_frames.NumRows() == num_splice);
int32 feat_dim = data[0].input_frames.NumCols(),
spk_dim = data[0].spk_info.Dim(),
tot_dim = feat_dim + spk_dim; // we append these at the neural net
// input... note, spk_dim might be 0.
KALDI_ASSERT(tot_dim == nnet_->InputDim());
int32 num_chunks = data.size();
input->Resize(num_splice * num_chunks,
tot_dim);
for (int32 chunk = 0; chunk < num_chunks; chunk++) {
SubMatrix<BaseFloat> dest(*input,
chunk * num_splice, num_splice,
0, feat_dim);
const Matrix<BaseFloat> &src(data[chunk].input_frames);
dest.CopyFromMat(src);
if (spk_dim != 0) {
SubMatrix<BaseFloat> spk_dest(*input,
chunk * num_splice, num_splice,
feat_dim, spk_dim);
spk_dest.CopyRowsFromVec(data[chunk].spk_info);
}
}
}
void NnetRescaler::ComputeRelevantIndexes() {
for (int32 c = 0; c + 1 < nnet_->NumComponents(); c++)
if (dynamic_cast<AffineComponent*>(&nnet_->GetComponent(c)) != NULL &&
(dynamic_cast<NonlinearComponent*>(&nnet_->GetComponent(c+1)) != NULL &&
dynamic_cast<SoftmaxComponent*>(&nnet_->GetComponent(c+1)) == NULL))
relevant_indexes_.insert(c);
}
BaseFloat NnetRescaler::GetTargetAvgDeriv(int32 c) {
KALDI_ASSERT(relevant_indexes_.count(c) == 1);
BaseFloat factor;
if (dynamic_cast<SigmoidComponent*>(&(nnet_->GetComponent(c + 1))) != NULL)
factor = 0.25;
else if (dynamic_cast<TanhComponent*>(&(nnet_->GetComponent(c + 1))) != NULL)
factor = 1.0;
else
KALDI_ERR << "This type of nonlinear component is not handled: index " << c;
int32 last_c = *std::max_element(relevant_indexes_.begin(), relevant_indexes_.end()),
first_c = *std::min_element(relevant_indexes_.begin(), relevant_indexes_.end());
if (c == first_c)
return factor * config_.target_first_layer_avg_deriv;
else if (c == last_c)
return factor * config_.target_last_layer_avg_deriv;
else
return factor * config_.target_avg_deriv;
}
// Here, c is the index of the affine component, and
// c + 1 is the index of the nonlinear component; *cur_data is the
// output of the affine component.
void NnetRescaler::RescaleComponent(
int32 c,
int32 num_chunks,
MatrixBase<BaseFloat> *cur_data_in,
Matrix<BaseFloat> *next_data) {
int32 rows = cur_data_in->NumRows(), cols = cur_data_in->NumCols();
// Only handle sigmoid or tanh here.
if (dynamic_cast<SigmoidComponent*>(&(nnet_->GetComponent(c + 1))) == NULL &&
dynamic_cast<TanhComponent*>(&(nnet_->GetComponent(c + 1))) == NULL)
KALDI_ERR << "This type of nonlinear component is not handled: index " << c;
// the nonlinear component:
NonlinearComponent &nc =
*(dynamic_cast<NonlinearComponent*>(&(nnet_->GetComponent(c + 1))));
BaseFloat orig_avg_deriv, target_avg_deriv = GetTargetAvgDeriv(c);
BaseFloat cur_scaling = 1.0; // current rescaling factor (on input).
int32 num_iters = 10;
Matrix<BaseFloat> cur_data(*cur_data_in),
ones(rows, cols), in_deriv(rows, cols);
ones.Set(1.0);
nc.Propagate(cur_data, num_chunks, next_data);
nc.Backprop(cur_data, *next_data, ones, num_chunks, NULL, &in_deriv);
BaseFloat cur_avg_deriv;
cur_avg_deriv = in_deriv.Sum() / (rows * cols);
orig_avg_deriv = cur_avg_deriv;
for (int32 iter = 0; iter < num_iters; iter++) {
// We already have "cur_avg_deriv"; perturb the scale and compute
// the next avg_deriv, so we can see how it changes with the scale.
cur_data.CopyFromMat(*cur_data_in);
cur_data.Scale(cur_scaling + config_.delta);
nc.Propagate(cur_data, num_chunks, next_data);
nc.Backprop(cur_data, *next_data, ones, num_chunks, NULL, &in_deriv);
BaseFloat next_avg_deriv = in_deriv.Sum() / (rows * cols);
KALDI_ASSERT(next_avg_deriv < cur_avg_deriv);
// "gradient" is how avg_deriv changes as we change the scale.
// should be negative.
BaseFloat gradient = (next_avg_deriv - cur_avg_deriv) / config_.delta;
KALDI_ASSERT(gradient < 0.0);
BaseFloat proposed_change = (target_avg_deriv - cur_avg_deriv) / gradient;
KALDI_VLOG(2) << "cur_avg_deriv = " << cur_avg_deriv << ", target_avg_deriv = "
<< target_avg_deriv << ", gradient = " << gradient
<< ", proposed_change " << proposed_change;
// Limit size of proposed change in "cur_scaling", to ensure stability.
if (fabs(proposed_change / cur_scaling) > config_.max_change)
proposed_change = cur_scaling * config_.max_change *
(proposed_change > 0.0 ? 1.0 : -1.0);
cur_scaling += proposed_change;
cur_data.CopyFromMat(*cur_data_in);
cur_data.Scale(cur_scaling);
nc.Propagate(cur_data, num_chunks, next_data);
nc.Backprop(cur_data, *next_data, ones, num_chunks, NULL, &in_deriv);
cur_avg_deriv = in_deriv.Sum() / (rows * cols);
if (fabs(proposed_change) < config_.min_change) break; // Terminate the
// optimization
}
UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(
&nnet_->GetComponent(c));
KALDI_ASSERT(uc != NULL);
uc->Scale(cur_scaling); // scale the parameters of the previous
// AffineComponent.
KALDI_LOG << "For component " << c << ", scaling parameters by "
<< cur_scaling << "; average "
<< "derivative changed from " << orig_avg_deriv << " to "
<< cur_avg_deriv << "; target was " << target_avg_deriv;
}
void NnetRescaler::Rescale() {
ComputeRelevantIndexes(); // set up relevant_indexes_.
Matrix<BaseFloat> cur_data, next_data;
FormatInput(examples_, &cur_data);
int32 num_chunks = examples_.size();
for (int32 c = 0; c < nnet_->NumComponents(); c++) {
Component &component = nnet_->GetComponent(c);
if (relevant_indexes_.count(c - 1) == 1) {
// the following function call also appropriately sets "next_data"
// after doing the rescaling
RescaleComponent(c - 1, num_chunks, &cur_data, &next_data);
} else {
component.Propagate(cur_data, num_chunks, &next_data);
}
cur_data.Swap(&next_data);
}
}
void RescaleNnet(const NnetRescaleConfig &rescale_config,
const std::vector<NnetTrainingExample> &examples,
Nnet *nnet) {
NnetRescaler rescaler(rescale_config, examples, nnet);
rescaler.Rescale();
}
} // namespace
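The inner loop of RescaleComponent above is a one-dimensional secant/Newton update on the input scale; a stripped-down sketch of a single update step (the avg_deriv callback and the function name are illustrative, not Kaldi API; BaseFloat is Kaldi's floating type):
#include <cmath>
#include <functional>
BaseFloat UpdateScale(BaseFloat cur_scaling, BaseFloat target_avg_deriv,
                      BaseFloat delta, BaseFloat max_change,
                      const std::function<BaseFloat(BaseFloat)> &avg_deriv) {
  BaseFloat cur = avg_deriv(cur_scaling);
  BaseFloat gradient = (avg_deriv(cur_scaling + delta) - cur) / delta;  // < 0
  BaseFloat proposed = (target_avg_deriv - cur) / gradient;  // Newton step
  if (std::fabs(proposed / cur_scaling) > max_change)        // keep step stable
    proposed = cur_scaling * max_change * (proposed > 0.0 ? 1.0 : -1.0);
  return cur_scaling + proposed;
}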

Просмотреть файл

@ -0,0 +1,76 @@
// nnet-cpu/rescale-nnet.h
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_NNET_CPU_RESCALE_NNET_H_
#define KALDI_NNET_CPU_RESCALE_NNET_H_
#include "nnet-cpu/nnet-update.h"
#include "nnet-cpu/nnet-compute.h"
#include "util/parse-options.h"
// Neural net rescaling is a rescaling of the parameters of the various layers
// of a neural net, done so as to match certain specified statistics on the
// average derivative of the sigmoid, measured on sample data. This relates to
// how "saturated" the sigmoid is.
namespace kaldi {
struct NnetRescaleConfig {
BaseFloat target_avg_deriv;
BaseFloat target_first_layer_avg_deriv;
BaseFloat target_last_layer_avg_deriv;
// These are relatively unimportant; for now they have no
// command line options.
BaseFloat num_iters;
BaseFloat delta;
BaseFloat max_change; // maximum change on any one iteration (to
// ensure stability).
BaseFloat min_change; // minimum change on any one iteration (controls
// termination).
NnetRescaleConfig(): target_avg_deriv(0.2),
target_first_layer_avg_deriv(0.3),
target_last_layer_avg_deriv(0.1),
num_iters(10),
delta(0.01),
max_change(0.2), min_change(1.0e-05) { }
void Register(ParseOptions *po) {
po->Register("target-avg-deriv", &target_avg_deriv, "Target average derivative "
"for hidden layers that are the not the first or last hidden layer "
"(as fraction of maximum derivative of the nonlinearity)");
po->Register("target-first-layer-avg-deriv", &target_first_layer_avg_deriv,
"Target average derivative for the first hidden layer"
"(as fraction of maximum derivative of the nonlinearity)");
po->Register("target-last-layer-avg-deriv", &target_last_layer_avg_deriv,
"Target average derivative for the last hidden layer, if "
"#hid-layers > 1"
"(as fraction of maximum derivative of the nonlinearity)");
}
};
void RescaleNnet(const NnetRescaleConfig &rescale_config,
const std::vector<NnetTrainingExample> &examples,
Nnet *nnet);
} // namespace
#endif


@ -13,7 +13,8 @@ BINFILES = nnet-randomize-frames nnet-am-info nnet-train nnet-init \
nnet-train-lbfgs nnet-get-egs nnet-train-parallel nnet-gradient \
nnet-get-preconditioner nnet-precondition nnet-select-egs nnet-combine-fast \
nnet-subset-egs nnet-shuffle-egs nnet-am-fix nnet-logprob nnet-logprob2 \
nnet-logprob2-parallel nnet-logprob-parallel
nnet-logprob2-parallel nnet-logprob-parallel nnet-am-stats nnet-am-rescale \
nnet-am-limit-rank
OBJFILES =


@ -41,6 +41,7 @@ int main(int argc, char *argv[]) {
int32 truncate = -1;
bool binary_write = true;
bool remove_dropout = false;
bool remove_preconditioning = false;
BaseFloat learning_rate_factor = 1.0, learning_rate = -1;
std::string learning_rates = "";
std::string scales = "";
@ -64,6 +65,8 @@ int main(int argc, char *argv[]) {
"to this many components by removing the last components.");
po.Register("remove-dropout", &remove_dropout, "Set this to true to remove "
"any dropout components.");
po.Register("remove-preconditioning", &remove_preconditioning, "Set this to true to replace "
"components of type AffineComponentPreconditioned with AffineComponent.");
po.Register("stats-from", &stats_from, "Before copying neural net, copy the "
"statistics in any layer of type NonlinearComponent, from this "
"neural network: provide the extended filename.");
@ -133,6 +136,8 @@ int main(int argc, char *argv[]) {
if (remove_dropout) am_nnet.GetNnet().RemoveDropout();
if (remove_preconditioning) am_nnet.GetNnet().RemovePreconditioning();
if (stats_from != "") {
// Copy the stats associated with the layers descending from
// NonlinearComponent.


@ -41,7 +41,7 @@ int main(int argc, char *argv[]) {
"e.g.:\n"
" nnet-am-fix 1.mdl 1_fixed.mdl\n"
"or:\n"
" nnet-am-shrink-rows --get-counts-from=1.gradient 1.mdl 1_shrunk.mdl\n";
" nnet-am-fix --get-counts-from=1.gradient 1.mdl 1_shrunk.mdl\n";
bool binary_write = true;
NnetFixConfig config;

Просмотреть файл

@ -0,0 +1,81 @@
// nnet-cpubin/nnet-am-limit-rank.cc
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "nnet-cpu/nnet-limit-rank.h"
#include "nnet-cpu/am-nnet.h"
#include "hmm/transition-model.h"
#include "tree/context-dep.h"
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
typedef kaldi::int32 int32;
const char *usage =
"Copy a (cpu-based) neural net and its associated transition model,\n"
"but modify it to reduce the effective parameter count by limiting\n"
"the rank of weight matrices.\n"
"\n"
"Usage: nnet-am-limit-rank [options] <nnet-in> <nnet-out>\n"
"e.g.:\n"
" nnet-am-limit-rank 1.mdl 1_limited.mdl\n";
bool binary_write = true;
NnetLimitRankOpts config;
ParseOptions po(usage);
po.Register("binary", &binary_write, "Write output in binary mode");
config.Register(&po);
po.Read(argc, argv);
if (po.NumArgs() != 2) {
po.PrintUsage();
exit(1);
}
std::string nnet_rxfilename = po.GetArg(1),
nnet_wxfilename = po.GetArg(2);
TransitionModel trans_model;
AmNnet am_nnet;
{
bool binary;
Input ki(nnet_rxfilename, &binary);
trans_model.Read(ki.Stream(), binary);
am_nnet.Read(ki.Stream(), binary);
}
LimitRankParallel(config, &am_nnet.GetNnet());
{
Output ko(nnet_wxfilename, binary_write);
trans_model.Write(ko.Stream(), binary_write);
am_nnet.Write(ko.Stream(), binary_write);
}
KALDI_LOG << "Copied neural net from " << nnet_rxfilename
<< " to " << nnet_wxfilename;
return 0;
} catch(const std::exception &e) {
std::cerr << e.what() << '\n';
return -1;
}
}
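A sketch of the parameter-count arithmetic this implies, assuming each d_out x d_in weight matrix W is approximated by a rank-k factorization (the exact scheme used by LimitRankParallel lives in nnet-limit-rank.h and is not shown in this diff):

    W \approx A B, \qquad A \in \mathbb{R}^{d_{out} \times k}, \; B \in \mathbb{R}^{k \times d_{in}},

which reduces that layer from d_out * d_in to k * (d_out + d_in) parameters; for instance a 2000 x 2000 matrix limited to rank 250 goes from 4.0M to 1.0M parameters.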

Просмотреть файл

@ -0,0 +1,92 @@
// nnet-cpubin/nnet-am-rescale.cc
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "nnet-cpu/rescale-nnet.h"
#include "nnet-cpu/am-nnet.h"
#include "hmm/transition-model.h"
#include "tree/context-dep.h"
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
typedef kaldi::int32 int32;
const char *usage =
"Rescale the parameters in a neural net to achieve certain target\n"
"statistics, relating to the average derivative of the sigmoids\n"
"measured at some supplied data. This relates to how saturated\n"
"the sigmoids are (we try to match the statistics of `good' neural\n"
"nets).\n"
"\n"
"Usage: nnet-am-rescale [options] <nnet-in> <examples-in> <nnet-out>\n"
"e.g.:\n"
" nnet-am-rescale 1.mdl valid.egs 1_rescaled.mdl\n";
bool binary_write = true;
NnetRescaleConfig config;
ParseOptions po(usage);
po.Register("binary", &binary_write, "Write output in binary mode");
config.Register(&po);
po.Read(argc, argv);
if (po.NumArgs() != 3) {
po.PrintUsage();
exit(1);
}
std::string nnet_rxfilename = po.GetArg(1),
egs_rspecifier = po.GetArg(2),
nnet_wxfilename = po.GetArg(3);
TransitionModel trans_model;
AmNnet am_nnet;
{
bool binary;
Input ki(nnet_rxfilename, &binary);
trans_model.Read(ki.Stream(), binary);
am_nnet.Read(ki.Stream(), binary);
}
std::vector<NnetTrainingExample> egs;
// This block adds samples to "egs".
SequentialNnetTrainingExampleReader example_reader(
egs_rspecifier);
for (; !example_reader.Done(); example_reader.Next())
egs.push_back(example_reader.Value());
KALDI_LOG << "Read " << egs.size() << " examples.";
KALDI_ASSERT(!egs.empty());
RescaleNnet(config, egs, &am_nnet.GetNnet());
{
Output ko(nnet_wxfilename, binary_write);
trans_model.Write(ko.Stream(), binary_write);
am_nnet.Write(ko.Stream(), binary_write);
}
KALDI_LOG << "Rescaled neural net and wrote it to " << nnet_wxfilename;
return 0;
} catch(const std::exception &e) {
std::cerr << e.what() << '\n';
return -1;
}
}
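A usage sketch combining the options registered in NnetRescaleConfig above (values shown are the defaults; file names follow the example in the usage string):

    # Rescale 1.mdl so the hidden-layer sigmoids reach the requested average
    # derivatives, measured on the held-out examples in valid.egs.
    nnet-am-rescale --target-avg-deriv=0.2 --target-first-layer-avg-deriv=0.3 \
      --target-last-layer-avg-deriv=0.1 1.mdl valid.egs 1_rescaled.mdl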

Просмотреть файл

@ -0,0 +1,72 @@
// nnet-cpubin/nnet-am-stats.cc
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "nnet-cpu/nnet-stats.h"
#include "nnet-cpu/am-nnet.h"
#include "hmm/transition-model.h"
#include "tree/context-dep.h"
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
typedef kaldi::int32 int32;
const char *usage =
"Print some statistics about the average derivatives of the sigmoid layers\n"
"of the neural net, that are stored in the net\n"
"\n"
"Usage: nnet-am-stats [options] <nnet-in>\n"
"e.g.:\n"
" nnet-am-stats 1.mdl\n";
NnetStatsConfig config;
ParseOptions po(usage);
config.Register(&po);
po.Read(argc, argv);
if (po.NumArgs() != 1) {
po.PrintUsage();
exit(1);
}
std::string nnet_rxfilename = po.GetArg(1);
TransitionModel trans_model;
AmNnet am_nnet;
{
bool binary;
Input ki(nnet_rxfilename, &binary);
trans_model.Read(ki.Stream(), binary);
am_nnet.Read(ki.Stream(), binary);
}
std::vector<NnetStats> stats;
GetNnetStats(config, am_nnet.GetNnet(), &stats);
KALDI_ASSERT(!stats.empty());
for (size_t i = 0; i < stats.size(); i++)
stats[i].PrintStats(std::cout);
return 0;
} catch(const std::exception &e) {
std::cerr << e.what() << '\n';
return -1;
}
}
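A possible workflow tying this to nnet-am-rescale above (a sketch; file names are illustrative):

    # Print the per-layer average-derivative statistics stored in the model.
    nnet-am-stats 1.mdl
    # If the hidden layers look over-saturated, rescale towards the default targets.
    nnet-am-rescale 1.mdl valid.egs 1_rescaled.mdl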

Просмотреть файл

@ -85,7 +85,7 @@ int main(int argc, char *argv[]) {
KALDI_LOG << "Selected a subset of " << egs.size() << " out of " << num_read
<< " neural-network training examples ";
return (static_cast<size_t>(n) == egs.size() ? 0 : 1);
return (num_read != 0 ? 0 : 1);
} catch(const std::exception &e) {
std::cerr << e.what() << '\n';
return -1;

Просмотреть файл

@ -88,8 +88,8 @@ void CacheTgtMat::AddData(const CuMatrix<BaseFloat> &features, const CuMatrix<Ba
features_.CopyRowsFromMat(leftover, features_leftover_, 0, 0);
targets_.CopyRowsFromMat(leftover, targets_leftover_, 0, 0);
features_leftover_.Destroy();
targets_leftover_.Destroy();
features_leftover_.Resize(0, 0);
targets_leftover_.Resize(0, 0);
filling_pos_ += leftover;
}
}

Просмотреть файл

@ -91,7 +91,7 @@ void Cache::AddData(const CuMatrix<BaseFloat> &features, const std::vector<int32
targets_leftover_.begin()+leftover,
targets_.begin());
features_leftover_.Destroy();
features_leftover_.Resize(0, 0);
targets_leftover_.resize(0);
filling_pos_ += leftover;
}

Просмотреть файл

@ -32,7 +32,7 @@ void Xent::Eval(const CuMatrix<BaseFloat> &net_out, const CuMatrix<BaseFloat> &t
diff->Resize(net_out.NumRows(), net_out.NumCols());
// compute derivative wrt. activations of last layer of neurons
diff->CopyFromMat(net_out);
*diff = net_out;
diff->AddMat(-1.0, target);
// we'll not produce per-frame classification accuracy for soft labels
@ -40,7 +40,8 @@ void Xent::Eval(const CuMatrix<BaseFloat> &net_out, const CuMatrix<BaseFloat> &t
// :TODO: reimplement when needed
// compute xentropy (ON CPU)
Matrix<BaseFloat> target_host, net_out_host;
Matrix<BaseFloat> target_host(target.NumRows(), target.NumCols(), kUndefined),
net_out_host(net_out.NumRows(), net_out.NumCols(), kUndefined);
target.CopyToMat(&target_host);
net_out.CopyToMat(&net_out_host);
BaseFloat val;
@ -69,7 +70,7 @@ void Xent::EvalVec(const CuMatrix<BaseFloat> &net_out, const std::vector<int32>
// get the xentropy and global error
target_device_.CopyFromVec(target);
if(&net_out != diff) { //<allow no-copy speedup
diff->CopyFromMat(net_out);
*diff = net_out;
}
cu::DiffXent(target_device_, diff, &log_post_tgt_);
//
@ -83,7 +84,8 @@ void Xent::EvalVec(const CuMatrix<BaseFloat> &net_out, const std::vector<int32>
// The frame-level xentropy statistics are computed as:
// log(sum_row(net_out.*target_mat)))
// they now are stored in vector log_post_tgt_
//
//
log_post_tgt_host_.Resize(log_post_tgt_.Dim());
log_post_tgt_.CopyToVec(&log_post_tgt_host_);
loss_ -= log_post_tgt_host_.Sum();
@ -110,9 +112,10 @@ std::string Xent::Report() {
void Mse::Eval(const CuMatrix<BaseFloat> &net_out, const CuMatrix<BaseFloat> &target, CuMatrix<BaseFloat> *diff) {
KALDI_ASSERT(net_out.NumCols() == target.NumCols());
KALDI_ASSERT(net_out.NumRows() == target.NumRows());
diff->Resize(net_out.NumRows(), net_out.NumCols());
// compute derivative w.r.t. neural network outputs
diff->Resize(net_out.NumRows(), net_out.NumCols());
diff->CopyFromMat(net_out);
diff->AddMat(-1.0, target);
@ -147,9 +150,9 @@ std::string Mse::Report() {
void MseProgress::Eval(const CuMatrix<BaseFloat>& net_out, const CuMatrix<BaseFloat>& target, CuMatrix<BaseFloat>* diff) {
KALDI_ASSERT(net_out.NumCols() == target.NumCols());
KALDI_ASSERT(net_out.NumRows() == target.NumRows());
diff->Resize(net_out.NumRows(),net_out.NumCols());
//compute derivative w.r.t. neural network outputs
diff->Resize(net_out.NumRows(),net_out.NumCols());
diff->CopyFromMat(net_out);
diff->AddMat(-1.0,target);
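For reference, the per-frame cross-entropy term described in the Xent::EvalVec comment above is, in matrix notation,

    \mathcal{L}_t = -\log \sum_j y_{tj}\, \tau_{tj},

where y_t is the row of network outputs (posteriors) for frame t and \tau_t the corresponding target row; loss_ accumulates these terms over frames, which is why the code subtracts log_post_tgt_host_.Sum().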

Просмотреть файл

@ -94,8 +94,7 @@ int main(int argc, char *argv[]) {
//the pointer will be given to the nnet, so we don't need to call delete
//convert Vector to CuVector
CuVector<BaseFloat> cu_shift;
cu_shift.CopyFromVec(shift);
CuVector<BaseFloat> cu_shift(shift);
//set the weights
shift_component->SetShiftVec(cu_shift);
@ -110,8 +109,7 @@ int main(int argc, char *argv[]) {
//the pointer will be given to the nnet, so we don't need to call delete
//convert Vector to CuVector
CuVector<BaseFloat> cu_scale;
cu_scale.CopyFromVec(scale);
CuVector<BaseFloat> cu_scale(scale);
//set the weights
scale_component->SetScaleVec(cu_scale);

Просмотреть файл

@ -129,6 +129,7 @@ int main(int argc, char *argv[]) {
}
// push priors to GPU
priors.Resize(tmp_priors.Dim());
priors.CopyFromVec(tmp_priors);
}
@ -150,7 +151,7 @@ int main(int argc, char *argv[]) {
}
}
// push it to gpu
feats.CopyFromMat(mat);
feats = mat;
// fwd-pass
nnet_transf.Feedforward(feats, &feats_transf);
nnet.Feedforward(feats_transf, &nnet_out);
@ -169,7 +170,8 @@ int main(int argc, char *argv[]) {
}
}
//download from GPU
//download from GPU
nnet_out_host.Resize(nnet_out.NumRows(), nnet_out.NumCols());
nnet_out.CopyToMat(&nnet_out_host);
//check for NaN/inf
for(int32 r=0; r<nnet_out_host.NumRows(); r++) {

Просмотреть файл

@ -223,12 +223,13 @@ int main(int argc, char *argv[]) {
//3) propagate the feature to get the log-posteriors (nnet w/o softmax)
// push features to GPU
feats.CopyFromMat(mat);
feats = mat;
// possibly apply transform
nnet_transf.Feedforward(feats, &feats_transf);
// propagate through the nnet (assuming w/o softmax)
nnet.Propagate(feats_transf, &nnet_out);
// pop it back to the HOST
// transfer it back to the host
nnet_out_h.Resize(nnet_out.NumRows(), nnet_out.NumCols(), kUndefined);
nnet_out.CopyToMat(&nnet_out_h);
// TODO: possibly divide by priors
@ -277,7 +278,7 @@ int main(int argc, char *argv[]) {
//7) backpropagate through the nnet
if (!crossvalidate) {
nnet_diff.CopyFromMat(nnet_diff_h);
nnet_diff = nnet_diff_h;
nnet.Backpropagate(nnet_diff, NULL);
}

Просмотреть файл

@ -139,8 +139,8 @@ int main(int argc, char *argv[]) {
continue;
}
// push features/targets to GPU
feats.CopyFromMat(fea_mat);
targets.CopyFromMat(tgt_mat);
feats = fea_mat;
targets = tgt_mat;
// possibly apply feature transform
nnet_transf.Feedforward(feats, &feats_transf);
// add to cache

Просмотреть файл

@ -142,7 +142,7 @@ int main(int argc, char *argv[]) {
continue;
}
// push features to GPU
feats.CopyFromMat(mat);
feats = mat;
// possibly apply transform
nnet_transf.Feedforward(feats, &feats_transf);
// add to cache

Просмотреть файл

@ -138,6 +138,7 @@ int main(int argc, char *argv[]) {
num_other_error++;
} else { //dimension OK
// push features to GPU
feats.Resize(mat.NumRows(), mat.NumCols(), kUndefined);
feats.CopyFromMat(mat);
// possibly apply transform
nnet_transf.Feedforward(feats, &feats_transf);

Просмотреть файл

@ -132,7 +132,8 @@ int main(int argc, char *argv[]) {
rbm_transf.Feedforward(feats, &feats_transf);
// subsample the feats to get faster epochs
if(drop_data > 0.0) {
Matrix<BaseFloat> mat2;
Matrix<BaseFloat> mat2(feats_transf.NumRows(), feats_transf.NumCols(),
kUndefined);
feats_transf.CopyToMat(&mat2);
for(int32 r=mat2.NumRows()-1; r >= 0; r--) {
if(RandUniform() < drop_data) {

Просмотреть файл

@ -62,8 +62,7 @@ int main(int argc, char *argv[]) {
//the pointer will be given to the nnet, so we don't need to call delete
//convert Matrix to CuMatrix
CuMatrix<BaseFloat> cu_transform;
cu_transform.CopyFromMat(transform);
CuMatrix<BaseFloat> cu_transform(transform);
//set the weights
layer->SetLinearity(cu_transform);

Просмотреть файл

@ -17,7 +17,7 @@
# Also we have not been checking that the code compiles in Visual Studio.
# If anyone would like to maintain the Windows setup, we would like that,
# but unfortunately, the situation right now is that it is not being
# maintained.
# maintained.
(A) Installing the Windows version of OpenFst.
This is maintained by Paul Dixon; it has some small code changes versus