Mirror of https://github.com/mozilla/kaldi.git
A lot of changes: script changes re: neural nets (more efficient I/O, slightly better WERs); various new functionality for nnets, and improved interfaces for some feature-related binaries.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@1976 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Parent: c1944a7209
Commit: f699fd2be1
@@ -1,11 +1,5 @@
 #!/bin/bash

-# CAUTION: I changed e.g. 1.trans to trans.1 in the scripts. If you ran it
-# part-way through prior to this, to convert to the new naming
-# convention, run:
-# for x in `find . -name '*.trans'`; do mv $x `echo $x | perl -ane 's/(\d+)\.trans/trans.$1/;print;'`; done
-# but be careful as this will not follow soft links.
-
 . cmd.sh

 # call the next line with the directory where the RM data is
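
Side note on the rename one-liner removed above: as its own caution says, it does not descend through soft links. A minimal sketch of a variant that does, assuming GNU find (-L is the only change; try it on a copy first):

for x in `find -L . -name '*.trans'`; do
  # same rename as the original one-liner, but find -L also walks symlinked dirs
  mv $x `echo $x | perl -ane 's/(\d+)\.trans/trans.$1/;print;'`
done
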
@@ -1,5 +1,2 @@
 beam=11.0 # beam for decoding. Was 13.0 in the scripts.
 first_beam=8.0 # beam for 1st-pass decoding in SAT.
-
-
-
@@ -21,8 +21,8 @@
 )

 # Here are the results (copied from RESULTS file)
-#exp/nnet6a/decode_train_dev/wer_10:%WER 24.87 [ 12053 / 48460, 1590 ins, 3017 del, 7446 sub ]
-#exp/nnet6a/decode_eval2000/score_10/eval2000.ctm.filt.sys: | Sum/Avg | 4459 42989 | 77.1 16.0 6.9 2.7 25.6 62.6 |
+#exp/nnet6a/decode_train_dev/wer_11:%WER 24.30 [ 11774 / 48460, 1619 ins, 2877 del, 7278 sub ]
+#exp/nnet6a/decode_eval2000/score_10/eval2000.ctm.filt.sys: | Sum/Avg | 4459 42989 | 77.8 16.0 6.3 3.0 25.3 62.6 |


 # Here are some older results when the system had 2k not 4k leaves and ran from a worse SAT
@@ -191,6 +191,7 @@ exp/tri4a_dnn/decode_bd_tgpr_eval92/wer_10:%WER 4.00 [ 226 / 5643, 34 ins, 12 de
 # and for eval92 is 3.79, the same system. (On this setup, discriminative training helped a lot,
 # which seems to be the reason we can't beat the SGMM+MMI numbers here.)

-exp/nnet5c1/decode_bd_tgpr_dev93/wer_10:%WER 7.48 [ 616 / 8234, 73 ins, 98 del, 445 sub ]
-exp/nnet5c1/decode_bd_tgpr_eval92/wer_11:%WER 4.41 [ 249 / 5643, 29 ins, 19 del, 201 sub ]
-# Note: my 4.41% result is worse than Karel's 4.00%.
+exp/nnet5c1/decode_bd_tgpr_dev93/wer_14:%WER 7.32 [ 603 / 8234, 61 ins, 101 del, 441 sub ]
+exp/nnet5c1/decode_bd_tgpr_eval92/wer_14:%WER 4.39 [ 248 / 5643, 32 ins, 17 del, 199 sub ]
+# Note: my 4.39% result is worse than Karel's 4.00%.
+
@@ -47,7 +47,7 @@ cat links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \
 grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si84.flist

 nl=`cat train_si84.flist | wc -l`
-[ "$nl" -eq 7138 ] || echo "Warning: expected 37416 lines in train_si84.flist, got $nl"
+[ "$nl" -eq 7138 ] || echo "Warning: expected 7138 lines in train_si84.flist, got $nl"

 # This version for SI-284
 cat links/13-34.1/wsj1/doc/indices/si_tr_s.ndx \
@@ -281,7 +281,6 @@ steps/train_quick.sh --cmd "$train_cmd" \
   exp/tri4b/graph_bd_tgpr data/test_eval92 exp/tri4b/decode_bd_tgpr_eval92 || exit 1;
 ) &

-
 # Train and test MMI, and boosted MMI, on tri4b (LDA+MLLT+SAT on
 # all the data). Use 30 jobs.
 steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0
+# This script appends the features in two data directories.
+
+# To be run from .. (one directory up from here)
+# see ../run.sh for example
+# This config creates MFCC features with half the window size and window shift,
+# and splices and sub-samples them. We'll use another script append_feats.sh
+# to combine (append) the data directories.
+
+# Begin configuration section.
+cmd=run.pl
+nj=4
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 5 ]; then
+  echo "usage: append_feats.sh [options] <src-data-dir1> <src-data-dir2> <dest-data-dir> <log-dir> <path-to-storage-dir>";
+  echo "options: "
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  exit 1;
+fi
+
+data_src1=$1
+data_src2=$2
+data=$3
+logdir=$4
+mfccdir=$5
+
+utils/split_data.sh $data_src1 $nj || exit 1;
+utils/split_data.sh $data_src2 $nj || exit 1;
+
+mkdir -p $mfccdir $logdir
+
+rm -rf $data
+mkdir -p `dirname $data` # Make sure directory one level up exists.
+cp -r $data_src1 $data # so we get the other files, such as utt2spk.
+rm $data/cmvn.scp
+rm -r $data/split* 2>/dev/null
+
+# use "name" as part of name of the archive.
+name=`basename $data`
+
+$cmd JOB=1:$nj $logdir/append.JOB.log \
+  append-feats --truncate-frames=true \
+    scp:$data_src1/split$nj/JOB/feats.scp scp:$data_src2/split$nj/JOB/feats.scp \
+    ark,scp:$mfccdir/appended_$name.JOB.ark,$mfccdir/appended_$name.JOB.scp || exit 1;
+
+# concatenate the .scp files together.
+for ((n=1; n<=nj; n++)); do
+  cat $mfccdir/appended_$name.$n.scp || exit 1;
+done > $data/feats.scp
+
+nf=`cat $data/feats.scp | wc -l`
+nu=`cat $data/utt2spk | wc -l`
+if [ $nf -ne $nu ]; then
+  echo "It seems not all of the feature files were successfully processed ($nf != $nu);"
+  echo "consider using utils/fix_data_dir.sh $data"
+fi
+
+echo "Succeeded creating MFCC features for $name"
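
For orientation, a hypothetical invocation of the new append_feats.sh (the directory names here are illustrative, not from this commit, and the steps/ location is an assumption based on where the other wrappers live). It appends the features from two data directories utterance-by-utterance into a new one:

steps/append_feats.sh --cmd run.pl --nj 4 \
  data/train_mfcc data/train_mfcc_half data/train_appended \
  exp/make_appended/log mfcc/appended
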
@@ -18,6 +18,10 @@ max_mem=20000000 # This will stop the processes getting too large.
 # This is in bytes, but not "real" bytes-- you have to multiply
 # by something like 5 or 10 to get real bytes (not sure why so large)
 # End configuration section.
+num_threads=1 # Number of threads used in nnet-logprob computation. If you set
+  # this to a different value, make sure to also set the appropriate
+  # queue options. If you set this too high it won't use all the
+  # threads as most of the time will be taken in the decoder.

 echo "$0 $@" # Print the command line for logging
@@ -104,9 +108,10 @@ fi

 if [ $sub_split -eq 1 ]; then
   $cmd JOB=1:$nj $dir/log/decode_den.JOB.log \
-    nnet-latgen-faster --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
+    nnet-logprob-parallel --num-threads=$num_threads $srcdir/final.mdl "$feats" ark:- \| \
+    latgen-faster-mapped --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
      --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \
-     $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
+     $dir/dengraph/HCLG.fst ark:- "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
 else
   for n in `seq $nj`; do
     if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $alidir/final.mdl ]; then
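
The change above splits denominator-lattice decoding into a multi-threaded log-prob stage piped into a generic mapped decoder. A minimal standalone sketch of the same pipe, outside $cmd, with illustrative paths and beams (the binaries and flags are the ones used in the diff):

# compute per-frame pseudo-log-likelihoods, then decode them with the
# model-independent mapped decoder
nnet-logprob-parallel --num-threads=4 exp/nnet/final.mdl "$feats" ark:- | \
  latgen-faster-mapped --beam=13.0 --lattice-beam=7.0 --acoustic-scale=0.1 \
    --word-symbol-table=data/lang/words.txt exp/nnet/final.mdl \
    exp/nnet/dengraph/HCLG.fst ark:- "ark:|gzip -c >exp/nnet/lat.1.gz"
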
@@ -120,9 +125,10 @@ else
       mkdir -p $dir/part
       feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g`
       $cmd JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
-        nnet-latgen-faster --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
+        nnet-logprob-parallel --num-threads=$num_threads $srcdir/final.mdl "$feats_subset" ark:- \| \
+        latgen-faster-mapped --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
          --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \
-         $dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1;
+         $dir/dengraph/HCLG.fst ark:- "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1;
       echo Merging archives for data subset $n
       rm $dir/.error 2>/dev/null;
       for k in `seq $sub_split`; do
@@ -14,18 +14,24 @@ num_iters_final=10 # Number of final iterations to give to the
   # optimization over the validation set.
 initial_learning_rate=0.02 # for RM; or 0.01 is suitable for Swbd.
 final_learning_rate=0.004 # for RM; or 0.001 is suitable for Swbd.
-num_valid_utts=300 # held-out utterances, used only for diagnostics.
-num_valid_frames_shrink=2000 # a subset of the frames in "valid_utts", used only
-  # for estimating shrinkage parameters and for
-  # objective-function reporting.
+num_utts_subset=300 # number of utterances in validation and training
+  # subsets used for shrinkage and diagnostics
+num_valid_frames_shrink=0 # number of validation frames in the subset
+  # used for shrinking
+num_train_frames_shrink=2000 # number of training frames in the subset used
+  # for shrinking (by default we use all training
+  # frames for this.)
 shrink_interval=3 # shrink every $shrink_interval iters,
   # except at the start of training when we do it every iter.
-num_valid_frames_combine=10000 # combination weights at the very end.
+num_valid_frames_combine=0 # #valid frames for combination weights at the very end.
+num_train_frames_combine=10000 # # train frames for the above.
+num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs
 minibatch_size=128 # by default use a smallish minibatch size for neural net training; this controls instability
   # which would otherwise be a problem with multi-threaded update. Note:
   # it also interacts with the "preconditioned" update, so it's not completely cost free.
-samples_per_iteration=400000 # each iteration of training, see this many samples
-  # per job.
+samples_per_iter=400000 # each iteration of training, see this many samples
+  # per job. This is just a guideline; it will pick a number
+  # that divides the number of samples in the entire data.
 shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
   # on each iter. You could set it to 0 or to a large value for complete
   # randomization, but this would both consume memory and cause spikes in
@@ -37,13 +43,13 @@ add_layers_period=2 # by default, add new layers every 2 iterations.
 num_hidden_layers=2
 initial_num_hidden_layers=1 # we'll add the rest one by one.
 num_parameters=2000000 # 2 million parameters by default.
-stage=-7
+stage=-9
 realign_iters=""
 beam=10 # for realignment.
 retry_beam=40
 scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
 parallel_opts="-pe smp 16" # by default we use 16 threads; this lets the queue know.
-shuffle_opts="-tc 5" # max 5 jobs running at one time (a lot of I/O.)
+io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time.
 nnet_config_opts=
 splice_width=4 # meaning +- 4 frames on each side for second LDA
 lda_dim=250
@@ -54,7 +60,11 @@ shrink=true
 mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
   # specified.)
 num_threads=16
-mkl_num_threads=1
+valid_is_heldout=false # For some reason, holding out the validation set from the training set
+  # seems to hurt, so by default we don't do it (i.e. it's included in training)
+random_copy=false
+cleanup=true
 # End configuration section.

 echo "$0 $@" # Print the command line for logging
@@ -72,7 +82,7 @@ if [ $# != 4 ]; then
 echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
 echo " --num-epochs <#epochs|15> # Number of epochs of main training"
 echo " # while reducing learning rate (determines #iterations, together"
-echo " # with --samples-per-iteration and --num-jobs-nnet)"
+echo " # with --samples-per-iter and --num-jobs-nnet)"
 echo " --num-epochs-extra <#epochs-extra|5> # Number of extra epochs of training"
 echo " # after learning rate fully reduced"
 echo " --initial-learning-rate <initial-learning-rate|0.02> # Learning rate at start of training, e.g. 0.02 for small"
@@ -95,21 +105,27 @@ if [ $# != 4 ]; then
 echo " # this, you may want to decrease the batch size."
 echo " --parallel-opts <opts|\"-pe smp 16\"> # extra options to pass to e.g. queue.pl for processes that"
 echo " # use multiple threads."
-echo " --shuffle-opts <opts|\"-tc 5\"> # Options given to e.g. queue.pl for the job that shuffles the "
-echo " # data. (prevents stressing the disk). "
+echo " --io-opts <opts|\"-tc 10\"> # Options given to e.g. queue.pl for jobs that do a lot of I/O."
 echo " --minibatch-size <minibatch-size|128> # Size of minibatch to process (note: product with --num-threads"
 echo " # should not get too large, e.g. >2k)."
-echo " --samples-per-iteration <#samples|400000> # Number of samples of data to process per iteration, per"
+echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per"
 echo " # process."
 echo " --splice-width <width|4> # Number of frames on each side to append for feature input"
 echo " # (note: we splice processed, typically 40-dimensional frames"
 echo " --lda-dim <dim|250> # Dimension to reduce spliced features to with LDA"
 echo " --num-iters-final <#iters|10> # Number of final iterations to give to nnet-combine-fast to "
 echo " # interpolate parameters (the weights are learned with a validation set)"
-echo " --stage <stage|-7> # Used to run a partially-completed training process from somewhere in"
+echo " --num-utts-subset <#utts|300> # Number of utterances in subsets used for validation and diagnostics"
+echo " # (the validation subset is held out from training)"
+echo " --num-valid-frames-shrink <#frames|2000> # Number of frames from the validation set used for shrinking"
+echo " --num-train-frames-shrink <#frames|0> # Number of frames from the training set used for shrinking"
+echo " # (by default it's included in training, which for some reason helps)."
+echo " --num-frames-diagnostic <#frames|4000> # Number of frames used in computing (train,valid) diagnostics"
+echo " --num-valid-frames-combine <#frames|10000> # Number of frames used in getting combination weights at the"
+echo " # very end."
+echo " --stage <stage|-9> # Used to run a partially-completed training process from somewhere in"
 echo " # the middle."


 exit 1;
 fi

@@ -144,8 +160,11 @@ cp $alidir/tree $dir


 # Get list of validation utterances.
-awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_valid_utts \
+awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \
   > $dir/valid_uttlist || exit 1;
+awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \
+  head -$num_utts_subset > $dir/train_subset_uttlist || exit 1;
+

 ## Set up features. Note: these are different from the normal features
 ## because we have one rspecifier that has the features for the entire
@@ -154,33 +173,49 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
 echo "$0: feature type is $feat_type"

 case $feat_type in
-  delta) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | add-deltas ark:- ark:- |"
-    split_feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
+  delta) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | add-deltas ark:- ark:- |"
     valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | add-deltas ark:- ark:- |"
+    train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | add-deltas ark:- ark:- |"
   ;;
-  lda) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
-    split_feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
+  lda) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
     valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
+    train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
     cp $alidir/final.mat $dir
   ;;
   *) echo "$0: invalid feature type $feat_type" && exit 1;
 esac
 if [ -f $alidir/trans.1 ]; then
   echo "$0: using transforms from $alidir"
-  feats="$feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $alidir/trans.*|' ark:- ark:- |"
-  split_feats="$split_feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$alidir/trans.JOB ark:- ark:- |"
+  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$alidir/trans.JOB ark:- ark:- |"
   valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $alidir/trans.*|' ark:- ark:- |"
+  train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $alidir/trans.*|' ark:- ark:- |"
 fi

+if [ $stage -le -9 ]; then
+  echo "$0: working out number of frames of training data"
+  num_frames=`feat-to-len scp:$data/feats.scp ark,t:- | awk '{x += $2;} END{print x;}'` || exit 1;
+  echo $num_frames > $dir/num_frames
+else
+  num_frames=`cat $dir/num_frames` || exit 1;
+fi
+
+# Working out number of iterations per epoch.
+iters_per_epoch=`perl -e "print int($num_frames/($samples_per_iter * $num_jobs_nnet) + 0.5);"` || exit 1;
+[ $iters_per_epoch -eq 0 ] && iters_per_epoch=1
+samples_per_iter_real=$[$num_frames/($num_jobs_nnet*$iters_per_epoch)]
+echo "Every epoch, splitting the data up into $iters_per_epoch iterations,"
+echo "giving samples-per-iteration of $samples_per_iter_real (you requested $samples_per_iter)."
+

 ## Do LDA on top of whatever features we already have; store the matrix which
 ## we'll put into the neural network as a constant.

-if [ $stage -le -7 ]; then
-  echo "Accumulating LDA statistics."
+if [ $stage -le -8 ]; then
+  echo "$0: Accumulating LDA statistics."
   $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
     ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
     weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
-    acc-lda --rand-prune=$randprune $alidir/final.mdl "$split_feats splice-feats --left-context=$splice_width --right-context=$splice_width ark:- ark:- |" ark,s,cs:- \
+    acc-lda --rand-prune=$randprune $alidir/final.mdl "$feats splice-feats --left-context=$splice_width --right-context=$splice_width ark:- ark:- |" ark,s,cs:- \
       $dir/lda.JOB.acc || exit 1;
   est-lda --dim=$lda_dim $dir/lda.mat $dir/lda.*.acc \
     2>$dir/log/lda_est.log || exit 1;
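
To make the iterations-per-epoch arithmetic added above concrete, a worked example with assumed values (num_frames=3,600,000, samples_per_iter=400,000, num_jobs_nnet=4):

num_frames=3600000; samples_per_iter=400000; num_jobs_nnet=4
# 3600000/(400000*4) = 2.25, plus 0.5 and truncated gives 2 iterations per epoch
iters_per_epoch=`perl -e "print int($num_frames/($samples_per_iter * $num_jobs_nnet) + 0.5);"`
# 3600000/(4*2) = 450000 actual samples per iteration per job
samples_per_iter_real=$[$num_frames/($num_jobs_nnet*$iters_per_epoch)]
echo "$iters_per_epoch iters/epoch, $samples_per_iter_real samples per iter per job"
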
@@ -195,7 +230,7 @@ if [ $initial_num_hidden_layers -gt $num_hidden_layers ]; then
 fi


-if [ $stage -le -6 ]; then
+if [ $stage -le -7 ]; then
   echo "$0: initializing neural net";
   # to hidden.config it will write the part of the config corresponding to a
   # single hidden layer; we need this to add new layers.
@@ -219,14 +254,14 @@ if [ $stage -le -6 ]; then
     $dir/0.mdl || exit 1;
 fi

-if [ $stage -le -5 ]; then
+if [ $stage -le -6 ]; then
   echo "Training transition probabilities and setting priors"
   $cmd $dir/log/train_trans.log \
     nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
     || exit 1;
 fi

-if [ $stage -le -4 ]; then
+if [ $stage -le -5 ]; then
   echo "Compiling graphs of transcripts"
   $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
     compile-train-graphs $dir/tree $dir/0.mdl $lang/L.fst \
@@ -239,118 +274,113 @@ cp $alidir/ali.*.gz $dir

 nnet_context_opts="--left-context=`nnet-am-info $dir/0.mdl 2>/dev/null | grep -w left-context | awk '{print $2}'` --right-context=`nnet-am-info $dir/0.mdl 2>/dev/null | grep -w right-context | awk '{print $2}'`" || exit 1;

-if [ $stage -le -3 ]; then
-  echo "Getting validation examples."
-  $cmd $dir/log/create_valid_subset_shrink.log \
+if [ $stage -le -4 ]; then
+  echo "Getting validation and training subset examples."
+  rm $dir/.error 2>/dev/null
+  $cmd $dir/log/create_valid_subset.log \
     nnet-get-egs $nnet_context_opts "$valid_feats" \
     "ark,cs:gunzip -c $dir/ali.*.gz | ali-to-pdf $dir/0.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
-    "ark:$dir/valid_all.egs" || exit 1;
-  echo "Getting subsets of validation examples for shrinking and combination."
+    "ark:$dir/valid_all.egs" || touch $dir/.error &
+  $cmd $dir/log/create_train_subset.log \
+    nnet-get-egs $nnet_context_opts "$train_subset_feats" \
+    "ark,cs:gunzip -c $dir/ali.*.gz | ali-to-pdf $dir/0.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
+    "ark:$dir/train_subset_all.egs" || touch $dir/.error &
+  wait;
+  [ -f $dir/.error ] && exit 1;
+  echo "Getting subsets of validation examples for shrinking, diagnostics and combination."
   $cmd $dir/log/create_valid_subset_shrink.log \
-    nnet-subset-egs --n=$num_valid_frames_shrink ark:$dir/valid_all.egs ark:$dir/valid_shrink.egs &
+    nnet-subset-egs --n=$num_valid_frames_shrink ark:$dir/valid_all.egs \
+    ark:$dir/valid_shrink.egs || touch $dir/.error &
   $cmd $dir/log/create_valid_subset_combine.log \
-    nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs ark:$dir/valid_combine.egs &
+    nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \
+    ark:$dir/valid_combine.egs || touch $dir/.error &
+  $cmd $dir/log/create_valid_subset_diagnostic.log \
+    nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/valid_all.egs \
+    ark:$dir/valid_diagnostic.egs || touch $dir/.error &
+
+  $cmd $dir/log/create_train_subset_shrink.log \
+    nnet-subset-egs --n=$num_train_frames_shrink ark:$dir/train_subset_all.egs \
+    ark:$dir/train_shrink.egs || touch $dir/.error &
+  $cmd $dir/log/create_train_subset_combine.log \
+    nnet-subset-egs --n=$num_train_frames_combine ark:$dir/train_subset_all.egs \
+    ark:$dir/train_combine.egs || touch $dir/.error &
+  $cmd $dir/log/create_train_subset_diagnostic.log \
+    nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/train_subset_all.egs \
+    ark:$dir/train_diagnostic.egs || touch $dir/.error &
   wait
-  [ ! -s $dir/valid_shrink.egs ] && echo "No validation examples for shrinking" && exit 1;
-  [ ! -s $dir/valid_combine.egs ] && echo "No validation examples for combination" && exit 1;
-  rm $dir/valid_all.egs
+  cat $dir/valid_shrink.egs $dir/train_shrink.egs > $dir/shrink.egs
+  cat $dir/valid_combine.egs $dir/train_combine.egs > $dir/combine.egs
+  for f in $dir/{shrink,combine,train_diagnostic,valid_diagnostic}.egs; do
+    [ ! -s $f ] && echo "No examples in file $f" && exit 1;
+  done
+  rm $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_{shrink,combine}.egs
 fi

-if [ $stage -le -2 ]; then
+if [ $stage -le -3 ]; then
   mkdir -p $dir/egs
   mkdir -p $dir/temp
   echo "Creating training examples";
-  # in $dir/egs, create $num_jobs_nnet separate files with training examples,
-  # with randomly shuffled order. We shuffle the order of examples in each
-  # file. Then on each iteration, for each training process, we'll take a
-  # random subset of blocks of examples within that process's file.
-  # We take them in blocks, because it avoids the overhead of fseek() while
-  # creating the examples.
+  # in $dir/egs, create $num_jobs_nnet separate files with training examples.
+  # The order is not randomized at this point.

   egs_list=
   for n in `seq 1 $num_jobs_nnet`; do
-    egs_list="$egs_list ark,scp:$dir/egs/egs_orig.$n.ark,$dir/egs/egs_orig.$n.scp"
+    egs_list="$egs_list ark:$dir/egs/egs_orig.$n.JOB.ark"
   done
   echo "Generating training examples on disk"
   # The examples will go round-robin to egs_list.
-  $cmd $dir/log/get_egs.log \
+  $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \
     nnet-get-egs $nnet_context_opts "$feats" \
-    "ark,cs:gunzip -c $dir/ali.*.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \
+    "ark,cs:gunzip -c $dir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \
     nnet-copy-egs ark:- $egs_list || exit 1;
 fi

+if [ $stage -le -2 ]; then
+  # combine all the "egs_orig.JOB.*.scp" (over the $nj splits of the data) and
+  # then split into multiple parts egs.JOB.*.scp for different parts of the
+  # data, 0 .. $iters_per_epoch-1.
+
+  if [ $iters_per_epoch -eq 1 ]; then
+    echo "Since iters-per-epoch == 1, just concatenating the data."
+    for n in `seq 1 $num_jobs_nnet`; do
+      cat $dir/egs/egs_orig.$n.*.ark > $dir/egs/egs_tmp.$n.0.ark || exit 1;
+      rm $dir/egs/egs_orig.$n.*.ark || exit 1;
+    done
+  else # We'll have to split it up using nnet-copy-egs.
+    egs_list=
+    for n in `seq 0 $[$iters_per_epoch-1]`; do
+      egs_list="$egs_list ark:$dir/egs/egs_tmp.JOB.$n.ark"
+    done
+    $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/split_egs.JOB.log \
+      nnet-copy-egs --random=$random_copy --srand=JOB \
+      "ark:cat $dir/egs/egs_orig.JOB.*.ark|" $egs_list '&&' \
+      rm $dir/egs/egs_orig.JOB.*.ark || exit 1;
+  fi
+fi
+
 if [ $stage -le -1 ]; then
   # Next, shuffle the order of the examples in each of those files.
-  # In order to not use too much memory (in case the size of the files is
-  # huge) we do this by randomizing the order of the .scp file and then
-  # just call nnet-copy-egs. If the file system is willing to store
-  # stuff in memory, it is free to do so. This is not super-optimal in
-  # terms of file system performance but it's simple and it won't fail when
-  # the data gets large.
+  # Each one should not be too large, so we can do this in memory.
   echo "Shuffling the order of training examples"
   echo "(in order to avoid stressing the disk, these won't all run at once)."
-  $cmd $shuffle_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.JOB.log \
-    utils/shuffle_list.pl --srand JOB $dir/egs/egs_orig.JOB.scp \| \
-    nnet-copy-egs scp:- ark,scp:$dir/egs/egs.JOB.ark,$dir/egs/egs.JOB.scp \
-    '&&' rm $dir/egs/egs_orig.JOB.ark $dir/egs/egs_orig.JOB.scp
-  smallest_len=`wc -l $dir/egs/egs.*.scp | sort -n -k1 | awk '{print $1}' | head -1`
-  # If the $samples_per_iteration is more than each split of the data,
-  # append to each .scp file the .scp files from the next one or two
-  # splits (or more), so each one is larger...
-  rm $dir/egs/egs.*.scp.orig 2>/dev/null
-  if [ $samples_per_iteration -gt $smallest_len ]; then
-    extra_files=$[($samples_per_iteration-1) / $smallest_len]
-    echo Each part of the data has about $smallest_len lines which is less than the
-    echo samples per iteration $samples_per_iteration, so appending next $extra_files
-    echo files to each scp file
-    for n in `seq $num_jobs_nnet`; do mv $dir/egs/egs.$n.scp $dir/egs/egs.$n.scp.orig; done
-    for n in `seq $num_jobs_nnet`; do
-      for e in `seq 0 $extra_files`; do
-        m=$[(($n + $e - 1)%$num_jobs_nnet)+1]
-        cat $dir/egs/egs.$m.scp.orig
-      done > $dir/egs/egs.$n.scp
-    done
+  for n in `seq 0 $[$iters_per_epoch-1]`; do
+    $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$n.JOB.log \
+      nnet-shuffle-egs "--srand=\$[JOB+($num_jobs_nnet*$n)]" \
+      ark:$dir/egs/egs_tmp.JOB.$n.ark ark:$dir/egs/egs.JOB.$n.ark '&&' \
+      rm $dir/egs/egs_tmp.JOB.$n.ark || exit 1;
   done
 fi
-fi

-num_egs=`grep wrote $dir/log/get_egs.log | tail -1 | awk '{print $NF}'` || exit 1;
-! [ $num_egs -gt 0 ] && echo "bad num_egs $num_egs" && exit 1;
-num_iters_reduce=$[ 1 + (($num_egs * $num_epochs)/($num_jobs_nnet * $samples_per_iteration))]
-num_iters_extra=$[1 + (($num_egs * $num_epochs_extra)/($num_jobs_nnet * $samples_per_iteration))]
+num_iters_reduce=$[$num_epochs * $iters_per_epoch];
+num_iters_extra=$[$num_epochs_extra * $iters_per_epoch];
 num_iters=$[$num_iters_reduce+$num_iters_extra]

 echo "Will train for $num_epochs + $num_epochs_extra epochs, equalling "
 echo " $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
 echo " (while reducing learning rate) + (with constant learning rate)."

-function get_list {
-  # usage: get_list <samples-per-iter> <iter> <input-file> >output
-  #
-  # Outputs an scp file for this job for this iteration. The
-  # output will have <samples-per-iter> lines, and will contain lines from
-  # egs.JOB.scp, possibly with repeats. It will be sorted numerically on its
-  # first field, so the .ark file is accessed in order (we then pipe to
-  # nnet-shuffle-egs to randomize the order). The way we do it is, we imagine
-  # we had concatenated the file $dir/egs/egs.JOB.scp infinite times, and
-  # taken from the concatenated file, the lines
-  # <samples-per-iter> * <iter> ... <samples-per-iter> * (<iter> + 1) - 1,
-  # and then sorted them on the first field (which is a number).
-  # We don't actually implement it this way, we do it a bit more efficiently.
-  # We require that samples-per-iter <= (#lines in input-file).
-  [ $# -ne 3 ] && echo "get_list: bad usage" && exit 1;
-  samples_per_iter=$1
-  my_iter=$2
-  input_file=$3
-  start=$[$my_iter * $samples_per_iter]; # starting-point in concatenated file.
-  input_len=`cat $input_file | wc -l`
-  start=$[$start - $input_len*($start/$input_len)]; # remove whole multiples of input_len
-  # we have to concatenate the input file to itself.
-  cat $input_file $input_file | \
-    head -n $[$start + $samples_per_iter] | tail -n $samples_per_iter | \
-    sort -k2 -k1n
-}
-

 # up till $last_normal_shrink_iter we will shrink the parameters
 # in the normal way using the dev set, but after that we will
 # only re-compute the shrinkage parameters periodically.
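
The shuffle stage above seeds each nnet-shuffle-egs job with JOB+(num_jobs_nnet*n), which gives every (job, sub-iteration) archive a distinct seed. A quick runnable illustration with assumed values:

num_jobs_nnet=4; iters_per_epoch=3
for n in `seq 0 $[$iters_per_epoch-1]`; do
  for JOB in `seq $num_jobs_nnet`; do
    # prints seeds 1..12, one per archive, with no repeats
    echo "sub-iter $n, job $JOB: srand=$[$JOB+($num_jobs_nnet*$n)]"
  done
done
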
@@ -361,22 +391,19 @@ x=0
 while [ $x -lt $num_iters ]; do
   if [ $x -ge 0 ] && [ $stage -le $x ]; then

-    # Set off a job that does diagnostics, in the background.
-    $cmd $parallel_opts $dir/log/compute_prob.$x.log \
-      nnet-compute-prob $dir/$x.mdl ark:$dir/valid_shrink.egs &
+    # Set off jobs doing some diagnostics, in the background.
+    $cmd $dir/log/compute_prob_valid.$x.log \
+      nnet-compute-prob $dir/$x.mdl ark:$dir/valid_diagnostic.egs &
+    $cmd $dir/log/compute_prob_train.$x.log \
+      nnet-compute-prob $dir/$x.mdl ark:$dir/train_diagnostic.egs &

     if echo $realign_iters | grep -w $x >/dev/null; then
       echo "Realigning data (pass $x)"
       $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
         nnet-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$dir/$x.mdl" \
-        "ark:gunzip -c $dir/fsts.JOB.gz|" "$split_feats" \
+        "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
         "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
     fi
-    for n in `seq $num_jobs_nnet`; do
-      # the following command gets a subset of the n'th scp file, containing
-      # $samples_per_iteration lines.
-      get_list $samples_per_iteration $x $dir/egs/egs.$n.scp > $dir/temp/egs.$x.$n.scp
-    done

     echo "Training neural net (pass $x)"
     if [ $x -gt 0 ] && \
@@ -388,9 +415,8 @@ while [ $x -lt $num_iters ]; do
     fi

     $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
-      MKL_NUM_THREADS=$mkl_num_threads \
       nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
-      scp:$dir/temp/egs.$x.JOB.scp ark:- \| \
+      ark:$dir/egs/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
       nnet-train-parallel --num-threads=$num_threads --minibatch-size=$minibatch_size \
       "$mdl" ark:- $dir/$[$x+1].JOB.mdl \
       || exit 1;
@@ -410,10 +436,10 @@ while [ $x -lt $num_iters ]; do
       if [ $x -le $last_normal_shrink_iter ] || [ $[$x % $shrink_interval] -eq 0 ]; then
         # For earlier iterations (while we've recently been adding layers), or every
         # $shrink_interval=3 iters, just do shrinking normally.
+        mb=$[($num_valid_frames_shrink+$num_train_frames_shrink+$num_threads-1)/$num_threads]
         $cmd $parallel_opts $dir/log/shrink.$x.log \
-          MKL_NUM_THREADS=$mkl_num_threads nnet-combine-fast --num-threads=$num_threads --verbose=3 \
-          --minibatch-size=$[($num_valid_frames_shrink+$num_threads-1)/$num_threads] \
-          $dir/$[$x+1].mdl ark:$dir/valid_shrink.egs $dir/$[$x+1].mdl || exit 1;
+          nnet-combine-fast --num-threads=$num_threads --verbose=3 --minibatch-size=$mb \
+          $dir/$[$x+1].mdl ark:$dir/shrink.egs $dir/$[$x+1].mdl || exit 1;
       fi
     fi
     if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
@@ -423,7 +449,7 @@ while [ $x -lt $num_iters ]; do
         nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
         $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
     fi
-    rm $nnets_list $dir/temp/egs.$x.*.scp
+    rm $nnets_list
   fi
   x=$[$x+1]
 done
@@ -435,15 +461,32 @@ nnets_list=
 for x in `seq $[$num_iters-$num_iters_final+1] $num_iters`; do
   [ $x -gt $mix_up_iter ] && nnets_list="$nnets_list $dir/$x.mdl"
 done
+if [ $stage -le $num_iters ]; then
+  mb=$[($num_valid_frames_combine+$num_train_frames_combine+$num_threads-1)/$num_threads]
   $cmd $parallel_opts $dir/log/combine.log \
-    MKL_NUM_THREADS=$mkl_num_threads nnet-combine-fast --num-threads=$num_threads \
-    --verbose=3 --minibatch-size=$[($num_valid_frames_shrink+$num_threads-1)/$num_threads] \
-    $nnets_list ark:$dir/valid_combine.egs $dir/final.mdl || exit 1;
+    nnet-combine-fast --num-threads=$num_threads --verbose=3 --minibatch-size=$mb \
+    $nnets_list ark:$dir/combine.egs $dir/final.mdl || exit 1;
+fi

 # Compute the probability of the final, combined model with
 # the same subset we used for the previous compute_probs, as the
 # different subsets will lead to different probs.
-$cmd $parallel_opts $dir/log/compute_prob.final.log \
-  nnet-compute-prob $dir/final.mdl ark:$dir/valid_shrink.egs || exit 1;
+$cmd $dir/log/compute_prob_valid.final.log \
+  nnet-compute-prob $dir/final.mdl ark:$dir/valid_diagnostic.egs &
+$cmd $dir/log/compute_prob_train.final.log \
+  nnet-compute-prob $dir/final.mdl ark:$dir/train_diagnostic.egs &

 echo Done

+if $cleanup; then
+  echo Cleaning up data
+  echo Removing training examples
+  rm -r $dir/egs
+  echo Removing most of the models
+  for x in `seq 0 $num_iters`; do
+    if [ $[$x%10] -ne 0 ] && [ $x -lt $[$num_iters-$num_iters_final+1] ]; then
+      # delete all but every 10th model; don't delete the ones which combine to form the final model.
+      rm $dir/$x.mdl
+    fi
+  done
+fi
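
The cleanup block added above keeps every 10th model plus the final num_iters_final models that feed nnet-combine-fast. A dry-run sketch with assumed values (num_iters=25, num_iters_final=10) that only prints what the loop would delete:

dir=exp/nnet_example; num_iters=25; num_iters_final=10
for x in `seq 0 $num_iters`; do
  if [ $[$x%10] -ne 0 ] && [ $x -lt $[$num_iters-$num_iters_final+1] ]; then
    # deletes 1-9 and 11-15; keeps 0, 10, and 16-25
    echo "would remove $dir/$x.mdl"
  fi
done
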
@@ -31,16 +31,17 @@ num_jobs_nnet=8 # Number of neural net training jobs to run in parallel.
   # not the same as the num-jobs (nj) which will be the same as the
   # alignment and denlat directories.
 stage=0
-sub_stage=-2 # this can be used to start from a particular sub-iteration of an
+sub_stage=-3 # this can be used to start from a particular sub-iteration of an
   # iteration
 acwt=0.1
 boost=0.0 # boosting for BMMI (you can try 0.1).. this is applied per frame.
 transform_dir= # Note: by default any transforms in $alidir will be used.

 parallel_opts="-pe smp 16" # by default we use 16 threads; this lets the queue know.
-shuffle_opts="-tc 5" # max 5 jobs running at one time (a lot of I/O.)
+io_opts="-tc 10" # max 10 jobs running at one time (a lot of I/O.)
 num_threads=16 # number of threads for neural net trainer..
 mkl_num_threads=1
+random_copy=false
 # End configuration section.

 echo "$0 $@" # Print the command line for logging
@@ -71,8 +72,7 @@ if [ $# != 6 ]; then
 echo " # this, you may want to decrease the batch size."
 echo " --parallel-opts <opts|\"-pe smp 16\"> # extra options to pass to e.g. queue.pl for processes that"
 echo " # use multiple threads."
-echo " --shuffle-opts <opts|\"-tc 5\"> # Options given to e.g. queue.pl for the job that shuffles the "
-echo " # data. (prevents stressing the disk). "
+echo " --io-opts <opts|\"-tc 10\"> # Options given to e.g. queue.pl for any especially I/O intensive jobs"
 echo " --minibatch-size <minibatch-size|128> # Size of minibatch to process (note: product with --num-threads"
 echo " # should not get too large, e.g. >2k)."
 echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, for each"
@@ -181,34 +181,37 @@ while [ $x -lt $num_epochs ]; do
   echo "Epoch $x of $num_epochs"

   if [ $stage -le $x ] && $first_iter_of_epoch; then
-    if [ $stage -lt $x ] || [ $sub_stage -le -2 ]; then
+    if [ $stage -lt $x ] || [ $sub_stage -le -3 ]; then
       # First get the per-frame posteriors, by rescoring the lattices; this
       # process also gives us at the same time the posteriors of each state for
       # each frame (by default, pruned to 0.01 with a randomized algorithm).
       # The matrix-logprob stage produces a diagnostic and passes the pseudo-log-like
-      # matrix through unchanged.
-      $cmd JOB=1:$nj $dir/log/post.$z.JOB.log \
-        nnet-logprob2 $dir/$x.1.mdl "$feats" "ark:|prob-to-post ark:- ark:- | gzip -c >$dir/post/smooth_post.$z.JOB.gz" ark:- \| \
+      # matrix through unchanged. (Note: nnet-logprob2-parallel can use up to
+      # $num_threads threads, but in practice it may be limited by the speed of
+      # the other elements of the pipe.
+      $cmd $parallel_opts JOB=1:$nj $dir/log/post.$z.JOB.log \
+        nnet-logprob2-parallel --num-threads=$num_threads $dir/$x.1.mdl "$feats" \
+        "ark:|prob-to-post ark:- ark:- | gzip -c >$dir/post/smooth_post.$z.JOB.gz" ark:- \| \
         matrix-logprob ark:- "ark:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $dir/$x.1.mdl ark:- ark:-|" ark:- \| \
         lattice-rescore-mapped $dir/$x.1.mdl "ark:gunzip -c $denlatdir/lat.JOB.gz|" ark:- ark:- \| \
         lattice-boost-ali --b=$boost --silence-phones=$silphonelist $dir/$x.1.mdl ark:- "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
         lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
         post-to-pdf-post $dir/$x.1.mdl ark:- "ark:|gzip -c >$dir/post/den_post.$z.JOB.gz" || exit 1;
     fi
-    if [ $stage -lt $x ] || [ $sub_stage -le -1 ]; then
+    if [ $stage -lt $x ] || [ $sub_stage -le -2 ]; then
       # run nnet-get-egs for all files, to get the training examples for each frame--
       # combines the feature and label/posterior information. The posterior information
       # consists of 2 things: the numerator posteriors from the alignments, the denominator
       # posteriors from the lattices (times -1), and the smoothing posteriors from the
       # neural net log-probs (times E).
       # We copy the examples for each job round-robin to multiple archives, one for each
-      # of 1...$num_jobs_nnet. We write these along with .scp files, for more convenient
-      # and memory-efficient randomization.
+      # of 1...$num_jobs_nnet.
       egs_out=""
       for n in `seq 1 $num_jobs_nnet`; do
-        egs_out="$egs_out ark,scp:$dir/egs/egs.$z.$n.JOB.ark,$dir/egs/egs.$z.$n.JOB.scp"
+        # indexes are egs_orig.$z.$num_jobs_nnet.$nj
+        egs_out="$egs_out ark:$dir/egs/egs_orig.$z.$n.JOB.ark"
       done
-      $cmd JOB=1:$nj $dir/log/egs.$z.JOB.log \
+      $cmd JOB=1:$nj $dir/log/get_egs.$z.JOB.log \
         ali-to-pdf $dir/$x.1.mdl "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
         ali-to-post ark:- ark:- \| \
         sum-post --scale2=$E ark:- "ark:gunzip -c $dir/post/smooth_post.$z.JOB.gz|" ark:- \| \
@@ -223,23 +226,33 @@ while [ $x -lt $num_epochs ]; do
     tail -n 50 $dir/log/post.$z.*.log | perl -e '$acwt=shift @ARGV; $acwt>0.0 || die "bad acwt"; while(<STDIN>) { if (m|lattice-to-post.+Overall average log-like/frame is (\S+) over (\S+) frames. Average acoustic like/frame is (\S+)|) { $tot_den_lat_like += $1*$2; $tot_frames += $2; } if (m|matrix-logprob.+Average log-prob per frame is (\S+) over (\S+) frames|) { $tot_num_like += $1*$2; $tot_num_frames += $2; } } if (abs($tot_frames - $tot_num_frames) > 0.01*($tot_frames + $tot_num_frames)) { print STDERR "#frames differ $tot_frames vs $tot_num_frames\n"; } $tot_den_lat_like /= $tot_frames; $tot_num_like /= $tot_num_frames; $objf = $acwt * $tot_num_like - $tot_den_lat_like; print $objf."\n"; ' $acwt > $dir/log/objf.$z.log
     echo "Objf on EBW iter $z is `cat $dir/log/objf.$z.log`"
   fi

-  if [ $stage -lt $x ] || [ $sub_stage -le 0 ]; then
-    echo "Shuffling the order of training examples and splitting them up"
-    echo "(in order to avoid stressing the disk, these won't all run at once)."
+  if [ $stage -lt $x ] || [ $sub_stage -le -1 ]; then
+    echo "Merging training examples across original #jobs ($nj), and "
+    echo "splitting across number of nnet jobs $num_jobs_nnet"

     egs_out2=""
     for n in `seq 1 $iters_per_epoch`; do
-      egs_out2="$egs_out2 ark:$dir/egs/egs_split.$z.$n.JOB.ark"
+      # indexes of egs_merged are: egs_merged.$z.$iters_per_epoch.$num_jobs_nnet
+      egs_out2="$egs_out2 ark:$dir/egs/egs_merged.$z.$n.JOB.ark"
     done
     # Note: in the following command, JOB goes from 1 to $num_jobs_nnet, so one
     # job per parallel training job (different from the previous command).
     # We sum up over the index JOB in the previous $cmd, and write to multiple
     # archives, this time one for each "sub-iter".
-    $cmd $shuffle_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.JOB.log \
-      cat $dir/egs/egs.$z.JOB.*.scp \| \
-      utils/shuffle_list.pl --srand "\$[($z*$num_jobs_nnet)+JOB]" \| \
-      nnet-copy-egs scp:- $egs_out2 || exit 1; ##'&&' \
-      ##rm $dir/egs/egs.$z.JOB.*.scp $dir/egs/egs.$z.JOB.*.ark || exit 1;
+    # indexes of egs_orig are: egs_orig.$z.$num_jobs_nnet.$nj
+    $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/merge_and_split.$x.JOB.log \
+      cat $dir/egs/egs_orig.$z.JOB.*.ark \| \
+      nnet-copy-egs --random=$random_copy "--srand=\$[JOB+($x*$num_jobs_nnet)]" \
+        ark:- $egs_out2 '&&' rm $dir/egs/egs_orig.$z.JOB.*.ark || exit 1;
+  fi
+
+  if [ $stage -lt $x ] || [ $sub_stage -le 0 ]; then
+    echo "Randomizing order of examples in each job"
+    for n in `seq 1 $iters_per_epoch`; do
+      s=$[$num_jobs_nnet*($n+($iters_per_epoch*$z))] # for srand
+      $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$z.$n.JOB.log \
+        nnet-shuffle-egs "--srand=\$[JOB+$s]" \
+          ark:$dir/egs/egs_merged.$z.$n.JOB.ark ark:$dir/egs/egs.$z.$n.JOB.ark '&&' \
+          rm $dir/egs/egs_merged.$z.$n.JOB.ark || exit 1;
+    done
   fi
   fi
   if [ $stage -le $x ]; then
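A simplified picture of the round-robin distribution that nnet-copy-egs performs over its output archives in the merge-and-split stage above (the writer type here is a stand-in, not Kaldi's Table API):

    // Illustrative only: distribute examples cyclically over N output
    // archives, one per parallel nnet-training job.
    #include <cstddef>
    #include <vector>

    template <typename Writer, typename Example>
    void CopyEgsRoundRobin(const std::vector<Example> &egs,
                           std::vector<Writer> *writers) {
      const std::size_t n = writers->size();
      for (std::size_t i = 0; i < egs.size(); i++)
        (*writers)[i % n].Write(egs[i]);  // example i -> archive i mod n
    }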
@@ -250,7 +263,7 @@ while [ $x -lt $num_epochs ]; do
     if [ $stage -lt $x ] || [ $sub_stage -le $y ]; then
       $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.$y.JOB.log \
         nnet-train-parallel --num-threads=$num_threads --minibatch-size=$minibatch_size \
-          $dir/$x.$y.mdl ark:$dir/egs/egs_split.$z.$y.JOB.ark $dir/$x.$y.JOB.mdl \
+          $dir/$x.$y.mdl ark:$dir/egs/egs.$z.$y.JOB.ark $dir/$x.$y.JOB.mdl \
           || exit 1;
       nnets_list=
       for n in `seq 1 $num_jobs_nnet`; do
@@ -68,7 +68,7 @@ Options:
   --input-left-context <n>   # #frames of left context for input features; default 0.
   --input-right-context <n>  # #frames of right context for input features; default 0.
   --param-stdddev-factor <f> # Factor which can be used to modify the standard deviation of
-                             # randomly nitialized features (default, 1. Gets multiplied by
+                             # randomly initialized features (default, 1. Gets multiplied by
                              # 1/sqrt of number of inputs).
   --initial-num-hidden-layers <n> <config-file> # If >0, number of hidden layers to initialize the network with.
                              # In this case, the positional parameter <num-hidden-layers> is only
@@ -19,7 +19,7 @@ BINFILES = align-equal align-equal-compiled acc-tree-stats \
            align-mapped align-compiled-mapped latgen-faster-mapped \
            hmm-info pdf-to-counts analyze-counts extract-ctx post-to-phone-post \
            post-to-pdf-post duplicate-matrix logprob-to-post prob-to-post copy-post \
-           matrix-logprob
+           matrix-logprob matrix-sum

 OBJFILES =
@@ -0,0 +1,87 @@
+// bin/matrix-sum.cc
+
+// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "matrix/kaldi-matrix.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+
+    const char *usage =
+        "Sum (and optionally scale) two archives of input matrices\n"
+        "of the same dimension\n"
+        "\n"
+        "Usage: matrix-sum [options] <matrix-rspecifier1> <matrix-rspecifier2> <sum-wspecifier>\n";
+
+    BaseFloat scale1 = 1.0, scale2 = 1.0;
+
+    ParseOptions po(usage);
+
+    po.Register("scale1", &scale1, "Scale applied to first matrix");
+    po.Register("scale2", &scale2, "Scale applied to second matrix");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+    std::string rspecifier1 = po.GetArg(1);
+    std::string rspecifier2 = po.GetArg(2);
+    std::string wspecifier = po.GetArg(3);
+
+    SequentialBaseFloatMatrixReader mat1_reader(rspecifier1);
+    RandomAccessBaseFloatMatrixReader mat2_reader(rspecifier2);
+    BaseFloatMatrixWriter mat_writer(wspecifier);
+
+    int32 num_done = 0, num_err = 0;
+
+    for (; !mat1_reader.Done(); mat1_reader.Next()) {
+      std::string key = mat1_reader.Key();
+      Matrix<BaseFloat> mat1(mat1_reader.Value());
+      if (!mat2_reader.HasKey(key)) {
+        KALDI_WARN << "No such key " << key << " in second table.";
+        num_err++;
+        continue;
+      }
+      const Matrix<BaseFloat> &mat2(mat2_reader.Value(key));
+      if (!SameDim(mat1, mat2)) {
+        KALDI_WARN << "Matrices for key " << key << " have different dims "
+                   << mat1.NumRows() << " x " << mat1.NumCols() << " vs. "
+                   << mat2.NumRows() << " x " << mat2.NumCols();
+        num_err++;
+        continue;
+      }
+      if (scale1 != 1.0) mat1.Scale(scale1);
+      mat1.AddMat(scale2, mat2);
+      mat_writer.Write(key, mat1);
+      num_done++;
+    }
+    KALDI_LOG << "Added " << num_done << " matrices; " << num_err
+              << " had errors.";
+
+    return (num_done != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
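Going by the options and positional arguments registered above, an invocation along the lines of matrix-sum --scale2=-1.0 ark:a.ark ark:b.ark ark:diff.ark should write scale1*A + scale2*B (here A - B) for each key present in both archives; keys missing from the second archive, or with mismatched dimensions, are skipped with a warning. (Illustrative invocation, not taken from the patch.)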
@@ -93,10 +93,13 @@ Real* CuMatrix<Real>::RowData(MatrixIndexT r) {

 template<typename Real>
-CuMatrix<Real>& CuMatrix<Real>::Resize(MatrixIndexT rows, MatrixIndexT cols) {
+void CuMatrix<Real>::Resize(MatrixIndexT rows, MatrixIndexT cols,
+                            MatrixResizeType resize_type) {
+  // This code does not currently support the other resize_type options.
+  KALDI_ASSERT(resize_type == kSetZero || resize_type == kUndefined);
   if (num_rows_ == rows && num_cols_ == cols) {
-    // SetZero();
-    return *this;
+    if (resize_type == kSetZero) SetZero();
+    return;
   }

   Destroy();
@@ -108,17 +111,15 @@ void CuMatrix<Real>::Resize(MatrixIndexT rows, MatrixIndexT cols,
     cuSafeCall(cudaMallocPitch((void**)&data_, &pitch, row_bytes, rows));
     num_rows_ = rows; num_cols_ = cols;
     stride_ = pitch/sizeof(Real);
-    SetZero();
+    if (resize_type == kSetZero) SetZero();
   } else
 #endif
   {
-    mat_.Resize(rows, cols);
+    mat_.Resize(rows, cols, resize_type);
     num_rows_=rows;
     num_cols_=cols;
     stride_= mat_.Stride();
   }
-
-  return *this;
 }
@@ -134,7 +135,7 @@ void CuMatrix<Real>::Destroy() {
   } else
 #endif
   {
-    mat_.Destroy();
+    mat_.Resize(0, 0);
   }
   num_rows_ = num_cols_ = stride_ = 0;
 }
@@ -142,9 +143,8 @@ void CuMatrix<Real>::Destroy() {

 template<typename Real>
-CuMatrix<Real>& CuMatrix<Real>::CopyFromMat(const CuMatrix<Real> &src) {
-  Resize(src.NumRows(), src.NumCols());
+void CuMatrix<Real>::CopyFromMat(const CuMatrix<Real> &src) {
+  KALDI_ASSERT(src.NumRows() == num_rows_ && src.NumCols() == num_cols_);

 #if HAVE_CUDA==1
 if (CuDevice::Instantiate().Enabled()) {
   Timer tim;
@@ -152,7 +152,8 @@ void CuMatrix<Real>::CopyFromMat(const CuMatrix<Real> &src) {
   MatrixIndexT dst_pitch = stride_*sizeof(Real);
   MatrixIndexT src_pitch = src.Stride()*sizeof(Real);
   MatrixIndexT width = src.NumCols()*sizeof(Real);
-  cuSafeCall(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch, width, src.NumRows(), cudaMemcpyDeviceToDevice));
+  cuSafeCall(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch,
+                          width, src.NumRows(), cudaMemcpyDeviceToDevice));

   CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromMatD2D",tim.Elapsed());
 } else
@@ -160,16 +161,13 @@ void CuMatrix<Real>::CopyFromMat(const CuMatrix<Real> &src) {
 {
   mat_.CopyFromMat(src.mat_);
 }
-
- return *this;
}

 template<typename Real>
-CuMatrix<Real>& CuMatrix<Real>::CopyFromMat(const Matrix<Real> &src) {
-  Resize(src.NumRows(), src.NumCols());
+void CuMatrix<Real>::CopyFromMat(const Matrix<Real> &src) {
+  KALDI_ASSERT(src.NumRows() == num_rows_ && src.NumCols() == num_cols_);

 #if HAVE_CUDA==1
 if (CuDevice::Instantiate().Enabled()) {
   Timer tim;
@@ -177,7 +175,8 @@ void CuMatrix<Real>::CopyFromMat(const Matrix<Real> &src) {
   MatrixIndexT dst_pitch = stride_*sizeof(Real);
   MatrixIndexT src_pitch = src.Stride()*sizeof(Real);
   MatrixIndexT width = src.NumCols()*sizeof(Real);
-  cuSafeCall(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch, width, src.NumRows(), cudaMemcpyHostToDevice));
+  cuSafeCall(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch,
+                          width, src.NumRows(), cudaMemcpyHostToDevice));

   CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromMatH2D",tim.Elapsed());
 } else
@@ -185,17 +184,12 @@ void CuMatrix<Real>::CopyFromMat(const Matrix<Real> &src) {
 {
   mat_.CopyFromMat(src);
 }
-
- return *this;
}

 template<typename Real>
 void CuMatrix<Real>::CopyToMat(Matrix<Real> *dst) const {
-  if (dst->NumRows() != NumRows() || dst->NumCols() != NumCols()) {
-    dst->Resize(NumRows(), NumCols());
-  }
+  KALDI_ASSERT(dst->NumRows() == NumRows() && dst->NumCols() == NumCols());

 #if HAVE_CUDA==1
 if (CuDevice::Instantiate().Enabled()) {
@@ -257,7 +251,7 @@ void CuMatrix<Real>::Read(std::istream &is, bool binary) {

 template<typename Real>
 void CuMatrix<Real>::Write(std::ostream &os, bool binary) const {
-  Matrix<BaseFloat> tmp;
+  Matrix<BaseFloat> tmp(NumRows(), NumCols(), kUndefined);
   CopyToMat(&tmp);
   tmp.Write(os, binary);
 }
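Since CopyFromMat/CopyToMat now assert on size rather than resizing, callers size the destination up front, as Write() does above. A minimal sketch of the new calling pattern (the DownloadToHost helper is hypothetical, not part of the patch):

    #include "cudamatrix/cu-matrix.h"

    namespace kaldi {

    // Hypothetical helper illustrating the "copies never resize" contract:
    // size the destination first, then copy.  kUndefined skips the zeroing
    // pass, which is safe because CopyToMat overwrites every element.
    Matrix<BaseFloat> DownloadToHost(const CuMatrix<BaseFloat> &gpu_mat) {
      Matrix<BaseFloat> host_mat(gpu_mat.NumRows(), gpu_mat.NumCols(),
                                 kUndefined);
      gpu_mat.CopyToMat(&host_mat);  // asserts the dimensions already match
      return host_mat;
    }

    }  // namespace kaldi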
@@ -46,15 +46,41 @@ class CuMatrix {
  public:

   /// Default Constructor
-  CuMatrix<Real>()
-   : num_rows_(0), num_cols_(0), stride_(0), data_(NULL) {
-  }
+  CuMatrix<Real>():
+   num_rows_(0), num_cols_(0), stride_(0), data_(NULL) { }

   /// Constructor with memory initialisation
-  CuMatrix<Real>(MatrixIndexT rows, MatrixIndexT cols)
-   : num_rows_(0), num_cols_(0), stride_(0), data_(NULL) {
-    Resize(rows, cols);
-  }
+  CuMatrix<Real>(MatrixIndexT rows, MatrixIndexT cols):
+   num_rows_(0), num_cols_(0), stride_(0), data_(NULL) {
+    Resize(rows, cols);
+  }
+
+  // Note: we had to remove the "explicit" keyword due
+  // to problems with STL vectors of CuMatrix.
+  CuMatrix<Real>(const CuMatrix<Real> &other):
+   num_rows_(0), num_cols_(0), stride_(0), data_(NULL) {
+    Resize(other.NumRows(), other.NumCols(), kUndefined);
+    CopyFromMat(other);
+  }
+
+  explicit CuMatrix<Real>(const Matrix<Real> &other):
+   num_rows_(0), num_cols_(0), stride_(0), data_(NULL) {
+    Resize(other.NumRows(), other.NumCols(), kUndefined);
+    CopyFromMat(other);
+  }
+
+  CuMatrix<Real> &operator = (const CuMatrix<Real> &other) {
+    Resize(other.NumRows(), other.NumCols(), kUndefined);
+    CopyFromMat(other);
+    return *this;
+  }
+
+  CuMatrix<Real> &operator = (const Matrix<Real> &other) {
+    Resize(other.NumRows(), other.NumCols(), kUndefined);
+    CopyFromMat(other);
+    return *this;
+  }

   /// Destructor
   ~CuMatrix() {
     Destroy();
@@ -65,14 +91,12 @@ class CuMatrix {
     return num_rows_;
   }

-  MatrixIndexT NumCols() const {
-    return num_cols_;
-  }
+  MatrixIndexT NumCols() const { return num_cols_; }

-  MatrixIndexT Stride() const {
-    return stride_;
-  }
+  MatrixIndexT Stride() const { return stride_; }

+  // MatrixDim is a struct containing "rows", "cols" and "stride",
+  // that is an argument of most CUDA kernels.
   ::MatrixDim Dim() const {
     ::MatrixDim d = { num_rows_, num_cols_, stride_ };
     return d;
@@ -87,29 +111,22 @@ class CuMatrix {
   Real* RowData(MatrixIndexT r);

   /// Get size of matrix in bytes
-  MatrixIndexT SizeInBytes() const {
-    return num_rows_*stride_*sizeof(Real);
-  }
+  MatrixIndexT SizeInBytes() const { return num_rows_*stride_*sizeof(Real); }

   /// Get size of matrix row in bytes
-  MatrixIndexT RowSizeInBytes() const {
-    return num_cols_*sizeof(Real);
-  }
+  MatrixIndexT RowSizeInBytes() const { return num_cols_*sizeof(Real); }

   /// Get size of matrix stride in bytes
-  MatrixIndexT StrideSizeInBytes() const {
-    return stride_*sizeof(Real);
-  }
+  MatrixIndexT StrideSizeInBytes() const { return stride_*sizeof(Real); }

   /// Allocate the memory
-  ThisType& Resize(MatrixIndexT rows, MatrixIndexT cols);
+  void Resize(MatrixIndexT rows, MatrixIndexT cols,
+              MatrixResizeType resize_type = kSetZero);

-  /// Deallocate the memory
-  void Destroy();
-
-  /// Copy functions (reallocates when needed)
-  ThisType& CopyFromMat(const CuMatrix<Real> &src);
-  ThisType& CopyFromMat(const Matrix<Real> &src);
+  /// Copy functions (reallocates when needed, but note from Dan: eventually
+  /// I'll change it to just die if the sizes don't match, like the Matrix class.)
+  void CopyFromMat(const CuMatrix<Real> &src);
+  void CopyFromMat(const Matrix<Real> &src);
   void CopyToMat(Matrix<Real> *dst) const;

   /// Copy row interval from matrix
@@ -154,6 +171,8 @@ class CuMatrix {
   }

  private:
+  void Destroy();
+
   MatrixIndexT num_rows_;
   MatrixIndexT num_cols_;
   MatrixIndexT stride_;
@@ -44,8 +44,6 @@ const Real* CuVector<Real>::Data() const {
   }
 }

-
-
 template<typename Real>
 Real* CuVector<Real>::Data() {
 #if HAVE_CUDA==1
@@ -58,15 +56,12 @@ Real* CuVector<Real>::Data() {
   }
 }

 template<typename Real>
-CuVector<Real>& CuVector<Real>::Resize(MatrixIndexT dim) {
+void CuVector<Real>::Resize(MatrixIndexT dim) {
   if (dim_ == dim) {
-    // SetZero();
-    return *this;
+    SetZero();
+    return;
   }

   Destroy();

 #if HAVE_CUDA==1
@@ -80,8 +75,6 @@ void CuVector<Real>::Resize(MatrixIndexT dim) {

   dim_ = dim;
   SetZero();
-
-  return *this;
 }
@@ -106,9 +99,7 @@ void CuVector<Real>::Destroy() {

 template<typename Real>
-CuVector<Real>& CuVector<Real>::CopyFromVec(const CuVector<Real> &src) {
-  Resize(src.Dim());
+void CuVector<Real>::CopyFromVec(const CuVector<Real> &src) {

 #if HAVE_CUDA==1
   if (CuDevice::Instantiate().Enabled()) {
     Timer tim;
@@ -119,16 +110,13 @@ void CuVector<Real>::CopyFromVec(const CuVector<Real> &src) {
   {
     vec_.CopyFromVec(src.vec_);
   }
-
-  return *this;
 }

 template<typename Real>
-CuVector<Real>& CuVector<Real>::CopyFromVec(const Vector<Real> &src) {
-  Resize(src.Dim());
+void CuVector<Real>::CopyFromVec(const Vector<Real> &src) {
+  KALDI_ASSERT(src.Dim() == dim_);

 #if HAVE_CUDA==1
   if (CuDevice::Instantiate().Enabled()) {
     Timer tim;
@@ -141,16 +129,14 @@ void CuVector<Real>::CopyFromVec(const Vector<Real> &src) {
   {
     vec_.CopyFromVec(src);
   }
-  return *this;
 }

 template<typename Real>
 void CuVector<Real>::CopyToVec(Vector<Real> *dst) const {
-  if (dst->Dim() != dim_) {
-    dst->Resize(dim_);
-  }
+  KALDI_ASSERT(dst->Dim() == dim_);

 #if HAVE_CUDA==1
   if (CuDevice::Instantiate().Enabled()) {
@@ -177,7 +163,7 @@ void CuVector<Real>::Read(std::istream &is, bool binary) {

 template<typename Real>
 void CuVector<Real>::Write(std::ostream &os, bool binary) const {
-  Vector<BaseFloat> tmp;
+  Vector<BaseFloat> tmp(Dim());
   CopyToVec(&tmp);
   tmp.Write(os, binary);
 }
@@ -46,6 +46,16 @@ class CuVector {
     Resize(dim);
   }

+  CuVector<Real>(const CuVector<Real> &v): dim_(0), data_(NULL) {
+    Resize(v.dim_);
+    CopyFromVec(v);
+  }
+
+  CuVector<Real>(const Vector<Real> &v): dim_(0), data_(NULL) {
+    Resize(v.Dim());
+    CopyFromVec(v);
+  }
+
   /// Destructor
   ~CuVector() {
     Destroy();
@@ -61,14 +71,11 @@ class CuVector {
   Real* Data();

   /// Allocate the memory
-  ThisType& Resize(MatrixIndexT dim);
-
-  /// Deallocate the memory
-  void Destroy();
+  void Resize(MatrixIndexT dim);

   /// Copy functions (lazy reallocation when needed)
-  ThisType& CopyFromVec(const CuVector<Real> &src);
-  ThisType& CopyFromVec(const Vector<Real> &src);
+  void CopyFromVec(const CuVector<Real> &src);
+  void CopyFromVec(const Vector<Real> &src);
   void CopyToVec(Vector<Real> *dst) const;

   /// I/O
@@ -94,6 +101,7 @@ class CuVector {
   }

  private:
+  void Destroy();
   MatrixIndexT dim_;    ///< dimension of the vector
   Real *data_;          ///< GPU data pointer
   Vector<Real> vec_;    ///< non-GPU vector as back-up
@@ -234,7 +234,7 @@ class SimpleDecoder {
       const Arc &arc = aiter.Value();
       if (arc.ilabel == 0) {  // propagate nonemitting only...
         Token *new_tok = new Token(arc, tok);
-        if (new_tok->arc_.weight.Value() > cutoff) {
+        if (new_tok->weight_.Value() > cutoff) {
           Token::TokenDelete(new_tok);
         } else {
           unordered_map<StateId, Token*>::iterator find_iter
@@ -213,6 +213,10 @@ void MelBanks::Compute(const VectorBase<BaseFloat> &power_spectrum,
     int32 offset = bins_[i].first;
     const Vector<BaseFloat> &v (bins_[i].second);
     (*mel_energies_out)(i) = VecVec(v, power_spectrum.Range(offset, v.Dim()));
+    // The following assert was added due to a problem with OpenBlas that
+    // we had at one point (it was a bug in that library).  Just to detect
+    // it early.
+    KALDI_ASSERT(!KALDI_ISNAN((*mel_energies_out)(i)));
   }

   if (debug_) {
@@ -9,7 +9,7 @@ BINFILES = compute-mfcc-feats compute-plp-feats compute-fbank-feats \
            feat-to-len feat-to-dim fmpe-apply-transform fmpe-acc-stats fmpe-init \
            fmpe-est fmpe-copy fmpe-sum-accs append-feats extend-transform-dim \
            get-full-lda-mat compute-spectrogram-feats extract-feature-segments \
-           reverse-feats paste-feats select-feats
+           reverse-feats paste-feats select-feats subsample-feats

 OBJFILES =
@@ -1,6 +1,7 @@
 // featbin/append-feats.cc

-// Copyright 2012 Petr Motlicek; Pawel Swietojanski
+// Copyright 2012  Petr Motlicek  Pawel Swietojanski
+//                 Johns Hopkins University (author: Daniel Povey)

 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -32,15 +33,11 @@ int main(int argc, char *argv[]) {

     ParseOptions po(usage);

-    int32 feats_offset_in1 = 0;
-    int32 feats_offset_in2 = 0;
-    int32 num_feats_in1 = 0;
-    int32 num_feats_in2 = 0;
+    bool truncate_frames = false;

-    po.Register("feats-offset-in1", &feats_offset_in1, "Feats 1 offset");
-    po.Register("num-feats-in1", &num_feats_in1, "Take num-feats from in1-rspeciifier");
-    po.Register("feats-offset-in2", &feats_offset_in2, "Feats 2 offset");
-    po.Register("num-feats-in2", &num_feats_in2, "Take num-feats from in2-rspeciifier");
+    po.Register("truncate-frames", &truncate_frames, "If true, do not treat it "
+                "as an error when files differ in number of frames, but truncate "
+                "the longest one.");

     po.Read(argc, argv);

@@ -53,80 +50,47 @@ int main(int argc, char *argv[]) {
     std::string rspecifier2 = po.GetArg(2);
     std::string wspecifier = po.GetArg(3);

-    KALDI_ASSERT(feats_offset_in1 >= 0 && feats_offset_in2 >= 0);
-
-    BaseFloatMatrixWriter kaldi_writer(wspecifier);
-    SequentialBaseFloatMatrixReader kaldi_reader1(rspecifier1);
-    RandomAccessBaseFloatMatrixReader kaldi_reader2(rspecifier2);
-
-    // Peeking in the archives to get the feature dimensions
-    if (kaldi_reader1.Done()) {
-      KALDI_ERR << "Could not read any features from " << rspecifier1
-                << ". (empty archive?)";
-    }
-    std::string utt = kaldi_reader1.Key();
-    if (!kaldi_reader2.HasKey(utt)) {
-      KALDI_ERR << "Could not read features for key " << utt << " from "
-                << rspecifier2 << ". (empty archive?)";
-    }
-
-    int32 dim_feats_in1 = kaldi_reader1.Value().NumCols();
-    int32 dim_feats_in2 = kaldi_reader2.Value(utt).NumCols();
-    if (num_feats_in1 == 0)
-      num_feats_in1 = dim_feats_in1 - feats_offset_in1;
-    if (num_feats_in2 == 0)
-      num_feats_in2 = dim_feats_in2 - feats_offset_in2;
-
-    KALDI_LOG << "Reading features from " << rspecifier1 << " and " << rspecifier2;
-    KALDI_LOG << "\tdim1 = " << dim_feats_in1 << "; offset1 = " << feats_offset_in1
-              << "; num1 = " << num_feats_in1 << "; dim2 = " << dim_feats_in2
-              << "; offset2 = " << feats_offset_in2 << "; num2 = " << num_feats_in2;
-
-    KALDI_ASSERT((feats_offset_in1 + num_feats_in1) <= dim_feats_in1);
-    KALDI_ASSERT((feats_offset_in2 + num_feats_in2) <= dim_feats_in2);
-
-    for (; !kaldi_reader1.Done(); kaldi_reader1.Next()) {
-      utt = kaldi_reader1.Key();
-      if (!kaldi_reader2.HasKey(utt)) {
+    BaseFloatMatrixWriter feats_writer(wspecifier);
+    SequentialBaseFloatMatrixReader feats_reader1(rspecifier1);
+    RandomAccessBaseFloatMatrixReader feats_reader2(rspecifier2);
+
+    int32 num_done = 0, num_err = 0;
+
+    for (; !feats_reader1.Done(); feats_reader1.Next()) {
+      std::string utt = feats_reader1.Key();
+      if (!feats_reader2.HasKey(utt)) {
         KALDI_WARN << "Could not find features for " << utt << " in "
                    << rspecifier2 << ": producing no output for the utterance";
+        num_err++;
         continue;
       }

-      const Matrix<BaseFloat> &feats1 = kaldi_reader1.Value();
-      const Matrix<BaseFloat> &feats2 = kaldi_reader2.Value(utt);
-      int32 num_frames = feats1.NumRows();
-      KALDI_VLOG(1) << "Utterance : " << utt << ": # of frames = " << num_frames;
-
-      KALDI_ASSERT(feats1.NumCols() == dim_feats_in1 &&
-                   feats2.NumCols() == dim_feats_in2);
-      if (num_frames != feats2.NumRows()) {
-        KALDI_WARN << "Utterance " << utt << ": " << num_frames
-                   << " frames read from " << rspecifier1 << " and "
-                   << feats2.NumRows() << " frames read from " << rspecifier2
-                   << ": producing no output for the utterance";
+      const Matrix<BaseFloat> &feats1 = feats_reader1.Value();
+      const Matrix<BaseFloat> &feats2 = feats_reader2.Value(utt);
+      if (feats1.NumRows() != feats2.NumRows() && !truncate_frames) {
+        KALDI_WARN << "For utterance " << utt << ", features have different "
+                   << "#frames " << feats1.NumRows() << " vs. "
+                   << feats2.NumRows() << ", producing no output (use "
+                   << "--truncate-frames=true if you want output)";
+        num_err++;
         continue;
       }

-      SubMatrix<BaseFloat> new_feats1(feats1, 0, num_frames, feats_offset_in1,
-                                      num_feats_in1);
-      SubMatrix<BaseFloat> new_feats2(feats2, 0, num_frames, feats_offset_in2,
-                                      num_feats_in2);
-      Matrix<BaseFloat> output_feats(num_frames, new_feats1.NumCols() +
-                                     new_feats2.NumCols());
-      output_feats.Range(0, num_frames, 0,
-                         new_feats1.NumCols()).CopyFromMat(new_feats1);
-      output_feats.Range(0, num_frames, new_feats1.NumCols(),
-                         new_feats2.NumCols()).CopyFromMat(new_feats2);
-      kaldi_writer.Write(utt, output_feats);
+      int32 num_frames = std::min(feats1.NumRows(), feats2.NumRows()),
+          dim1 = feats1.NumCols(), dim2 = feats2.NumCols();
+      Matrix<BaseFloat> output(num_frames, dim1 + dim2, kUndefined);
+      output.Range(0, num_frames, 0, dim1).CopyFromMat(
+          feats1.Range(0, num_frames, 0, dim1));
+      output.Range(0, num_frames, dim1, dim2).CopyFromMat(
+          feats2.Range(0, num_frames, 0, dim2));
+
+      feats_writer.Write(utt, output);
+      num_done++;
     }
-    return 0;
-  }
-  catch (const std::exception& e) {
+    KALDI_LOG << "Appended " << num_done << " feats; " << num_err
+              << " with errors.";
+    return (num_done != 0 ? 0 : 1);
+  } catch (const std::exception& e) {
     std::cerr << e.what();
     return -1;
   }
 }
@@ -21,7 +21,6 @@
 #include "feat/feature-mfcc.h"
 #include "feat/wave-reader.h"

 int main(int argc, char *argv[]) {
   try {
     using namespace kaldi;
@@ -0,0 +1,96 @@
+// featbin/subsample-feats.cc
+
+// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sstream>
+#include <algorithm>
+#include <iterator>
+#include <utility>
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "matrix/kaldi-matrix.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace std;
+
+    const char *usage =
+        "Sub-samples features by taking every n'th frame\n"
+        "\n"
+        "Usage: subsample-feats [options] in-rspecifier out-wspecifier\n"
+        " e.g. subsample-feats --n=2 ark:- ark:-\n";
+
+    ParseOptions po(usage);
+
+    int32 n = 1, offset = 0;
+
+    po.Register("n", &n, "Take every n'th feature, for this value of n");
+    po.Register("offset", &offset, "Start with the feature with this offset, "
+                "then take every n'th feature.");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    string rspecifier = po.GetArg(1);
+    string wspecifier = po.GetArg(2);
+
+    SequentialBaseFloatMatrixReader feat_reader(rspecifier);
+    BaseFloatMatrixWriter feat_writer(wspecifier);
+
+    int32 num_done = 0, num_err = 0;
+
+    // process all keys
+    for (; !feat_reader.Done(); feat_reader.Next()) {
+      std::string utt = feat_reader.Key();
+      const Matrix<BaseFloat> feats(feat_reader.Value());
+
+      // This code could, of course, be much more efficient; I'm just
+      // keeping it simple.
+      int32 num_indexes = 0;
+      for (int32 k = offset; k < feats.NumRows(); k += n)
+        num_indexes++;  // k is the index.
+
+      if (num_indexes == 0) {
+        KALDI_WARN << "For utterance " << utt << ", output would have no rows, "
+                   << "producing no output.";
+        num_err++;
+        continue;
+      }
+      Matrix<BaseFloat> output(num_indexes, feats.NumCols());
+      int32 i = 0;
+      for (int32 k = offset; k < feats.NumRows(); k += n, i++) {
+        SubVector<BaseFloat> src(feats, k), dest(output, i);
+        dest.CopyFromVec(src);
+      }
+      KALDI_ASSERT(i == num_indexes);
+      feat_writer.Write(utt, output);
+      num_done++;
+    }
+    KALDI_LOG << "Sub-sampled " << num_done << " feats; " << num_err
+              << " with errors.";
+    return (num_done != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
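From the counting loop above, for an utterance with T input frames (and offset < T), the number of output rows is

    \text{num\_indexes} \;=\; \left\lceil \frac{T - \text{offset}}{n} \right\rceil

so with the example invocation --n=2 and the default offset 0, an utterance of 100 frames yields 50 rows.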
@@ -25,7 +25,7 @@ int main(int argc, char *argv[]) {
     using namespace kaldi;

     const char *usage =
-        "Copy a subset of features\n"
+        "Copy a subset of features (the first n features)\n"
         "Usage: subset-feats [options] in-rspecifier out-wspecifier\n";

     ParseOptions po(usage);
@@ -160,11 +160,9 @@ int main(int argc, char *argv[]) {
     KALDI_LOG << "Applied transform to " << num_done << " utterances; " << num_error
               << " had errors.";
-
-    return 0;
+    return (num_done != 0 ? 0 : 1);
   } catch(const std::exception &e) {
     std::cerr << e.what();
     return -1;
   }
 }
@@ -27,9 +27,24 @@

 namespace kaldi {

-inline void cblas_Xscal(const int N, float *X, const int incX, float *Y,
-                        const int incY, const float c, const float s) {
-  cblas_srot(N, X, incX, Y, incY, c, s);
+inline void cblas_Xcopy(const int N, const float *X, const int incX, float *Y,
+                        const int incY) {
+  cblas_scopy(N, X, incX, Y, incY);
+}
+
+inline void cblas_Xcopy(const int N, const double *X, const int incX, double *Y,
+                        const int incY) {
+  cblas_dcopy(N, X, incX, Y, incY);
+}
+
+inline float cblas_Xasum(const int N, const float *X, const int incX) {
+  return cblas_sasum(N, X, incX);
+}
+
+inline double cblas_Xasum(const int N, const double *X, const int incX) {
+  return cblas_dasum(N, X, incX);
 }

 inline void cblas_Xrot(const int N, float *X, const int incX, float *Y,
@@ -226,6 +241,78 @@ inline void cblas_Xsyrk(
   cblas_dsyrk(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
               dim_c, other_dim_a, alpha, A, a_stride, beta, C, c_stride);
 }

+/// matrix-vector multiply using a banded matrix; we always call this
+/// with b = 1 meaning we're multiplying by a diagonal matrix.  This is used for
+/// elementwise multiplication.  We miss some of the arguments out of this
+/// wrapper.
+inline void cblas_Xsbmv1(
+    const MatrixIndexT dim,
+    const double *A,
+    const double alpha,
+    const double *x,
+    const double beta,
+    double *y) {
+  cblas_dsbmv(CblasRowMajor, CblasLower, dim, 0, alpha, A,
+              1, x, 1, beta, y, 1);
+}
+
+inline void cblas_Xsbmv1(
+    const MatrixIndexT dim,
+    const float *A,
+    const float alpha,
+    const float *x,
+    const float beta,
+    float *y) {
+  cblas_ssbmv(CblasRowMajor, CblasLower, dim, 0, alpha, A,
+              1, x, 1, beta, y, 1);
+}
+
+/// This is not really a wrapper for CBLAS as CBLAS does not have this; in future we could
+/// extend this somehow.
+inline void mul_elements(
+    const MatrixIndexT dim,
+    const double *a,
+    double *b) {  // does b *= a, elementwise.
+  double c1, c2, c3, c4;
+  MatrixIndexT i;
+  for (i = 0; i + 4 <= dim; i += 4) {
+    c1 = a[i] * b[i];
+    c2 = a[i+1] * b[i+1];
+    c3 = a[i+2] * b[i+2];
+    c4 = a[i+3] * b[i+3];
+    b[i] = c1;
+    b[i+1] = c2;
+    b[i+2] = c3;
+    b[i+3] = c4;
+  }
+  for (; i < dim; i++)
+    b[i] *= a[i];
+}
+
+inline void mul_elements(
+    const MatrixIndexT dim,
+    const float *a,
+    float *b) {  // does b *= a, elementwise.
+  float c1, c2, c3, c4;
+  MatrixIndexT i;
+  for (i = 0; i + 4 <= dim; i += 4) {
+    c1 = a[i] * b[i];
+    c2 = a[i+1] * b[i+1];
+    c3 = a[i+2] * b[i+2];
+    c4 = a[i+3] * b[i+3];
+    b[i] = c1;
+    b[i+1] = c2;
+    b[i+2] = c3;
+    b[i+3] = c4;
+  }
+  for (; i < dim; i++)
+    b[i] *= a[i];
+}
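A small sketch of how the new wrappers fit together (assuming the declarations above are in scope). With the band count passed as 0, sbmv treats A as a diagonal matrix, so the call computes y := alpha*(a .* x) + beta*y:

    // Illustrative only; assumes cblas_Xsbmv1 / mul_elements from above.
    void DiagMulAdd(MatrixIndexT dim, const float *a, const float *x,
                    float alpha, float beta, float *y) {
      // y := alpha * diag(a) * x + beta * y, i.e. an elementwise
      // multiply-accumulate done by the BLAS sbmv kernel.
      cblas_Xsbmv1(dim, a, alpha, x, beta, y);
    }

    void InPlaceMul(MatrixIndexT dim, const float *a, float *b) {
      mul_elements(dim, a, b);  // b *= a elementwise, with 4-way unrolling
    }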

 // add clapack here
 #ifndef HAVE_ATLAS
 inline void clapack_Xtptri(KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *result) {
@@ -495,6 +495,41 @@ template
 void MatrixBase<double>::CopyFromMat(const MatrixBase<double> & M,
                                      MatrixTransposeType Trans);

+// Specialize the template for CopyFromSp for float, float.
+template<>
+template<>
+void MatrixBase<float>::CopyFromSp(const SpMatrix<float> & M) {
+  KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_);
+  MatrixIndexT num_rows = num_rows_, stride = stride_;
+  const float *Mdata = M.Data();
+  float *row_data = data_, *col_data = data_;
+  for (MatrixIndexT i = 0; i < num_rows; i++) {
+    cblas_scopy(i+1, Mdata, 1, row_data, 1);     // copy to the row.
+    cblas_scopy(i, Mdata, 1, col_data, stride);  // copy to the column.
+    Mdata += i+1;
+    row_data += stride;
+    col_data += 1;
+  }
+}
+
+// Specialize the template for CopyFromSp for double, double.
+template<>
+template<>
+void MatrixBase<double>::CopyFromSp(const SpMatrix<double> & M) {
+  KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_);
+  MatrixIndexT num_rows = num_rows_, stride = stride_;
+  const double *Mdata = M.Data();
+  double *row_data = data_, *col_data = data_;
+  for (MatrixIndexT i = 0; i < num_rows; i++) {
+    cblas_dcopy(i+1, Mdata, 1, row_data, 1);     // copy to the row.
+    cblas_dcopy(i, Mdata, 1, col_data, stride);  // copy to the column.
+    Mdata += i+1;
+    row_data += stride;
+    col_data += 1;
+  }
+}
+
 template<typename Real>
 template<typename OtherReal>
 void MatrixBase<Real>::CopyFromSp(const SpMatrix<OtherReal> & M) {
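These specializations rely on SpMatrix's packed lower-triangular layout, where row i of the triangle holds i+1 consecutive elements, so element (i, j) with j <= i sits at offset

    \text{offset}(i, j) \;=\; \frac{i(i+1)}{2} + j

Each loop iteration therefore copies one packed row of length i+1 into row i of the full matrix, and mirrors its first i entries (with the scopy/dcopy stride equal to the matrix stride) into column i, filling the upper triangle by symmetry.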
@@ -711,12 +746,16 @@ void Matrix<Real>::Destroy() {
 template<typename Real>
 void MatrixBase<Real>::MulElements(const MatrixBase<Real> &a) {
   KALDI_ASSERT(a.NumRows() == num_rows_ && a.NumCols() == num_cols_);
-  MatrixIndexT i;
-  MatrixIndexT j;
-
-  for (i = 0; i < num_rows_; i++) {
-    for (j = 0; j < num_cols_; j++) {
-      (*this)(i, j) *= a(i, j);
+
+  if (num_cols_ == stride_ && num_cols_ == a.stride_) {
+    mul_elements(num_rows_ * num_cols_, a.data_, data_);
+  } else {
+    MatrixIndexT a_stride = a.stride_, stride = stride_;
+    Real *data = data_, *a_data = a.data_;
+    for (MatrixIndexT i = 0; i < num_rows_; i++) {
+      mul_elements(num_cols_, a_data, data);
+      a_data += a_stride;
+      data += stride;
     }
   }
 }
@@ -1985,6 +2024,13 @@ Real MatrixBase<Real>::ApplySoftMax() {
   return max + log(sum);
 }

+template<typename Real>
+void MatrixBase<Real>::ApplyTanh() {
+  for (MatrixIndexT r = 0; r < num_rows_; r++) {
+    SubVector<Real> v(*this, r);
+    v.ApplyTanh();
+  }
+}
+
 template<class Real>
 template<class OtherReal>
@@ -341,6 +341,9 @@ class MatrixBase {
   /// matrix and return normalizer (log sum of exponentials).
   Real ApplySoftMax();

+  /// Apply the tanh function to each element of the matrix.
+  void ApplyTanh();
+
   /** Uses Svd to compute the eigenvalue decomposition of a symmetric positive
    * semi-definite matrix: (*this) = rP * diag(rS) * rP^T, with rP an
    * orthogonal matrix so rP^{-1} = rP^T.  Throws exception if input was not
@@ -553,7 +556,7 @@ class Matrix : public MatrixBase<Real> {
   /// It is symmetric, so no option for transpose, and NumRows == Cols
   template<typename OtherReal>
   explicit Matrix(const SpMatrix<OtherReal> & M) : MatrixBase<Real>() {
-    Resize(M.NumRows(), M.NumRows());
+    Resize(M.NumRows(), M.NumRows(), kUndefined);
     this->CopyFromSp(M);
   }
@@ -562,10 +565,10 @@ class Matrix : public MatrixBase<Real> {
   explicit Matrix(const TpMatrix<OtherReal> & M,
                   MatrixTransposeType trans = kNoTrans) : MatrixBase<Real>() {
     if (trans == kNoTrans) {
-      Resize(M.NumRows(), M.NumCols());
+      Resize(M.NumRows(), M.NumCols(), kUndefined);
       this->CopyFromTp(M);
     } else {
-      Resize(M.NumCols(), M.NumRows());
+      Resize(M.NumCols(), M.NumRows(), kUndefined);
       this->CopyFromTp(M, kTrans);
     }
   }
@@ -584,9 +587,6 @@ class Matrix : public MatrixBase<Real> {
   /// Destructor to free matrices.
   ~Matrix() { Destroy(); }

-  /// Deallocates memory and sets to empty matrix.
-  void Destroy();
-
   /// Sets matrix to a specified size (zero is OK as long as both r and c are
   /// zero).  The value of the new data depends on resize_type:
   ///   -if kSetZero, the new data will be zero
@@ -601,9 +601,8 @@ class Matrix : public MatrixBase<Real> {
   /// Assignment operator that takes MatrixBase.
   Matrix<Real> &operator = (const MatrixBase<Real> &other) {
     if (MatrixBase<Real>::NumRows() != other.NumRows() ||
-        MatrixBase<Real>::NumCols() != other.NumCols()) {
-      Resize(other.NumRows(), other.NumCols());
-    }
+        MatrixBase<Real>::NumCols() != other.NumCols())
+      Resize(other.NumRows(), other.NumCols(), kUndefined);
     MatrixBase<Real>::CopyFromMat(other);
     return *this;
   }
@@ -611,15 +610,17 @@ class Matrix : public MatrixBase<Real> {
   /// Assignment operator.  Needed for inclusion in std::vector.
   Matrix<Real> &operator = (const Matrix<Real> &other) {
     if (MatrixBase<Real>::NumRows() != other.NumRows() ||
-        MatrixBase<Real>::NumCols() != other.NumCols()) {
-      Resize(other.NumRows(), other.NumCols());
-    }
+        MatrixBase<Real>::NumCols() != other.NumCols())
+      Resize(other.NumRows(), other.NumCols(), kUndefined);
     MatrixBase<Real>::CopyFromMat(other);
     return *this;
   }

  private:
+  /// Deallocates memory and sets to empty matrix (dimension 0, 0).
+  void Destroy();
+
   /// Init assumes the current class contents are invalid (i.e. junk or have
   /// already been freed), and it sets the matrix to newly allocated memory with
   /// the specified number of rows and columns.  r == c == 0 is acceptable.  The data
@@ -375,7 +375,13 @@ template
 void VectorBase<double>::CopyRowFromSp(const SpMatrix<double> &mat, MatrixIndexT row);

-// takes elements to a power. Throws exception if could not (but only for power != 1 ad power != 2).
+#ifdef HAVE_MKL
+template<>
+void VectorBase<float>::ApplyPow(float power) { vsPowx(dim_, data_, power, data_); }
+template<>
+void VectorBase<double>::ApplyPow(double power) { vdPowx(dim_, data_, power, data_); }
+#else
+// takes elements to a power. Throws exception if could not (but only for power != 1 and power != 2).
 template<typename Real>
 void VectorBase<Real>::ApplyPow(Real power) {
   if (power == 1.0) return;
@@ -399,6 +405,7 @@ void VectorBase<Real>::ApplyPow(Real power) {
     }
   }
 }
+#endif

 // Computes the p-th norm. Throws exception if could not.
 template<typename Real>
@@ -534,14 +541,13 @@ template<typename Real>
 void VectorBase<Real>::AddRowSumMat(Real alpha, const MatrixBase<Real> &M, Real beta) {
   // note the double accumulator
   KALDI_ASSERT(dim_ == M.NumCols());
-  MatrixIndexT num_rows = M.NumRows(), stride = M.Stride();
-  for (MatrixIndexT i = 0; i < dim_; i++) {
-    double sum = 0.0;
-    const Real *src = M.Data() + i;
-    for (MatrixIndexT j = 0; j < num_rows; j++)
-      sum += src[j*stride];
-    data_[i] = alpha * sum + beta * data_[i];
-  }
+  MatrixIndexT num_rows = M.NumRows(), stride = M.Stride(), dim = dim_;
+  Real *data = data_;
+  cblas_Xscal(dim, beta, data, 1);
+  const Real *m_data = M.Data();
+  for (MatrixIndexT i = 0; i < num_rows; i++, m_data += stride)
+    cblas_Xaxpy(dim, alpha, m_data, 1, data, 1);
 }
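In effect the rewritten AddRowSumMat computes

    y \;\leftarrow\; \beta\, y \;+\; \alpha \sum_{i=0}^{R-1} M_{i,\cdot}

with one scal over the destination plus one axpy per (contiguous) matrix row, instead of the old strided walk down each column, which is far friendlier to the cache and to BLAS.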

 template<typename Real>
@@ -651,6 +657,25 @@ Real VectorBase<Real>::ApplySoftMax() {
   return max + log(sum);
 }

+#ifdef HAVE_MKL
+template<>
+void VectorBase<float>::ApplyTanh() { vsTanh(dim_, data_, data_); }
+template<>
+void VectorBase<double>::ApplyTanh() { vdTanh(dim_, data_, data_); }
+#else
+template<typename Real>
+void VectorBase<Real>::ApplyTanh() {
+  for (MatrixIndexT i = 0; i < dim_; i++) {
+    Real x = data_[i];
+    if (x > 0.0) {
+      x = -1.0 + 2.0 / (1.0 + exp(-2.0 * x));
+    } else {
+      x = 1.0 - 2.0 / (1.0 + exp(2.0 * x));
+    }
+    data_[i] = x;
+  }
+}
+#endif
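The branch on the sign of x in the generic version uses the identity

    \tanh(x) \;=\; \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}
             \;=\; -1 + \frac{2}{1 + e^{-2x}}
             \;=\; 1 - \frac{2}{1 + e^{2x}}

picking in each case the form whose exponent is non-positive, so exp() can at worst underflow to 0 but never overflow; for large |x| the result then saturates cleanly at +/-1.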

 template<typename Real>
 void VectorBase<Real>::Add(Real c) {
@@ -126,6 +126,10 @@ class VectorBase {
   /// This is the same as: \f$ x(i) = exp(x(i)) / \sum_i exp(x(i)) \f$
   Real ApplySoftMax();

+  /// Apply the tanh function to each element of a vector.  If using MKL, does
+  /// it using the "less accurate" options.
+  void ApplyTanh();
+
   /// Take all elements of vector to a power.
   void ApplyPow(Real power);
@@ -322,20 +326,20 @@ class Vector: public VectorBase<Real> {

   /// Copy constructor. The need for this is controversial.
   Vector(const Vector<Real> &v) : VectorBase<Real>() { // (cannot be explicit)
-    Resize(v.Dim());
+    Resize(v.Dim(), kUndefined);
     this->CopyFromVec(v);
   }

   /// Copy-constructor from base-class, needed to copy from SubVector.
   explicit Vector(const VectorBase<Real> &v) : VectorBase<Real>() {
-    Resize(v.Dim());
+    Resize(v.Dim(), kUndefined);
     this->CopyFromVec(v);
   }

   /// Type conversion constructor.
   template<typename OtherReal>
   explicit Vector(const VectorBase<OtherReal> &v): VectorBase<Real>() {
-    Resize(v.Dim());
+    Resize(v.Dim(), kUndefined);
     this->CopyFromVec(v);
   }
@@ -372,14 +376,14 @@ class Vector: public VectorBase<Real> {

   /// Assignment operator, protected so it can only be used by std::vector
   Vector<Real> &operator = (const Vector<Real> &other) {
-    Resize(other.Dim());
+    Resize(other.Dim(), kUndefined);
     this->CopyFromVec(other);
     return *this;
   }

   /// Assignment operator that takes VectorBase.
   Vector<Real> &operator = (const VectorBase<Real> &other) {
-    Resize(other.Dim());
+    Resize(other.Dim(), kUndefined);
     this->CopyFromVec(other);
     return *this;
   }
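The kUndefined change in these constructors and assignment operators skips zero-filling memory that CopyFromVec immediately overwrites. A rough std::vector analogue of why that saves a pass over the data (illustrative only; kUndefined is the Kaldi mechanism):

    #include <algorithm>
    #include <vector>

    // Since the copy overwrites every element, allocating without
    // value-initialization avoids touching the memory twice, which is
    // what Resize(dim, kUndefined) buys in Kaldi.
    int main() {
      std::vector<float> src(1000, 1.5f);

      // Before: resize() zero-fills, then the copy overwrites everything.
      std::vector<float> a(src.size());            // zero-initialized
      std::copy(src.begin(), src.end(), a.begin());

      // After: allocate raw capacity and write each element exactly once.
      std::vector<float> b;
      b.reserve(src.size());
      b.insert(b.end(), src.begin(), src.end());
      return 0;
    }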
@@ -679,6 +679,28 @@ template<class Real> static void UnitTestAxpy() {
   }
 }

+template<class Real> static void UnitTestCopySp() {
+  // Checking that the various versions of copying
+  // matrix to SpMatrix work the same in the symmetric case.
+  for (MatrixIndexT iter = 0;iter < 5;iter++) {
+    int32 dim = 5 + rand() % 10;
+    SpMatrix<Real> S(dim), T(dim);
+    S.SetRandn();
+    Matrix<Real> M(S);
+    T.CopyFromMat(M, kTakeMeanAndCheck);
+    AssertEqual(S, T);
+    T.SetZero();
+    T.CopyFromMat(M, kTakeMean);
+    AssertEqual(S, T);
+    T.SetZero();
+    T.CopyFromMat(M, kTakeLower);
+    AssertEqual(S, T);
+    T.SetZero();
+    T.CopyFromMat(M, kTakeUpper);
+    AssertEqual(S, T);
+  }
+}
+
+
 template<class Real> static void UnitTestPower() {
   for (MatrixIndexT iter = 0;iter < 5;iter++) {
@@ -1430,6 +1452,7 @@ template<class Real> static void UnitTestMulElements() {
   }
 }

+
 template<class Real> static void UnitTestSpLogExp() {
   for (MatrixIndexT i = 0; i < 5; i++) {
     MatrixIndexT dimM = 10 + rand() % 10;
@@ -1860,6 +1883,27 @@ template<class Real> static void UnitTestLimitCond() {
   }
 }

+template<class Real> static void UnitTestTanh() {
+  for (MatrixIndexT i = 0; i < 10; i++) {
+    MatrixIndexT dimM = 5 + rand() % 10, dimN = 5 + rand() % 10;
+    Matrix<Real> M(dimM, dimN);
+    Matrix<Real> N(M);
+    for(int32 r = 0; r < dimM; r++) {
+      for (int32 c = 0; c < dimN; c++) {
+        Real x = N(r, c);
+        if (x > 0.0) {
+          x = -1.0 + 2.0 / (1.0 + exp(-2.0 * x));
+        } else {
+          x = 1.0 - 2.0 / (1.0 + exp(2.0 * x));
+        }
+        N(r, c) = x;
+      }
+    }
+    M.ApplyTanh();
+    AssertEqual(M, N);
+  }
+}
+
 template<class Real> static void UnitTestSimple() {
   for (MatrixIndexT i = 0;i < 5;i++) {
     MatrixIndexT dimM = 20 + rand()%10, dimN = 20 + rand()%20;
@@ -3541,6 +3585,7 @@ template<class Real> static void MatrixUnitTest(bool full_test) {
   UnitTestDotprod<Real>();
   // UnitTestSvdVariants<Real>();
   UnitTestPower<Real>();
+  UnitTestCopySp<Real>();
   UnitTestDeterminant<Real>();
   KALDI_LOG << " Point F";
   UnitTestDeterminantSign<Real>();
@@ -3566,6 +3611,7 @@ template<class Real> static void MatrixUnitTest(bool full_test) {
   UnitTestRange<Real>();
   UnitTestSimpleForVec<Real>();
   UnitTestSimpleForMat<Real>();
+  UnitTestTanh<Real>();
   UnitTestNorm<Real>();
   UnitTestMul<Real>();
   KALDI_LOG << " Point I";
@@ -169,9 +169,17 @@ void SpMatrix<Real>::CopyFromMat(const MatrixBase<Real> &M,
       break;
     }
     case kTakeLower:
-      for (MatrixIndexT i = 0; i < D; i++)
-        for (MatrixIndexT j = 0; j <= i; j++)
-          (*this)(i, j) = M(i, j);
+      { // making this one a bit more efficient.
+        const Real *src = M.Data();
+        Real *dest = this->data_;
+        MatrixIndexT stride = M.Stride();
+        for (MatrixIndexT i = 0; i < D; i++) {
+          for (MatrixIndexT j = 0; j <= i; j++)
+            dest[j] = src[j];
+          dest += i + 1;
+          src += stride;
+        }
+      }
      break;
     case kTakeUpper:
       for (MatrixIndexT i = 0; i < D; i++)
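The new kTakeLower branch walks the packed triangular storage directly: row i of an SpMatrix occupies i + 1 consecutive elements, so the destination pointer advances by i + 1 per row while the source pointer advances by the full matrix stride. A standalone sketch of that copy on plain arrays (the packed layout matches Kaldi's SpMatrix; the rest is illustrative):

    #include <cassert>
    #include <vector>

    int main() {
      const int D = 3, stride = 3;
      std::vector<double> M = {1, 9, 9,
                               2, 3, 9,
                               4, 5, 6};     // upper entries (9) are ignored
      std::vector<double> packed(D * (D + 1) / 2);
      const double *src = M.data();
      double *dest = packed.data();
      for (int i = 0; i < D; i++) {
        for (int j = 0; j <= i; j++) dest[j] = src[j];
        dest += i + 1;   // packed row i holds i + 1 elements
        src += stride;   // source advances a full matrix row
      }
      std::vector<double> expected = {1, 2, 3, 4, 5, 6};
      assert(packed == expected);
      return 0;
    }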
@@ -10,7 +10,7 @@ OBJFILES = nnet-component.o nnet-nnet.o nnet-update.o train-nnet.o \
            nnet-randomize.o nnet-compute.o am-nnet.o nnet-functions.o \
            nnet-precondition.o shrink-nnet.o combine-nnet.o combine-nnet-a.o \
            mixup-nnet.o nnet-lbfgs.o nnet-update-parallel.o combine-nnet-fast.o \
-           nnet-fix.o
+           nnet-fix.o nnet-stats.o rescale-nnet.o nnet-limit-rank.o

 #nnet-compute.o nnet-train.o
 # nnet-nnet.o nnet-loss.o nnet-rnnlm.o
@@ -183,6 +183,23 @@ void UnitTestSigmoidComponent() {
   }
 }

+void UnitTestReduceComponent() {
+  // We're testing that the gradients are computed correctly:
+  // the input gradients and the model gradients.
+
+  int32 input_dim = 10 + rand() % 50, n = 1 + rand() % 3;
+  {
+    ReduceComponent reduce_component(input_dim, n);
+    UnitTestGenericComponentInternal(reduce_component);
+  }
+  {
+    ReduceComponent reduce_component;
+    reduce_component.InitFromString("dim=15 n=3");
+    UnitTestGenericComponentInternal(reduce_component);
+  }
+}
+
+
 template<class T>
 void UnitTestGenericComponent() { // works if it has an initializer from int,
                                   // e.g. tanh, sigmoid.
@@ -463,6 +480,8 @@ int main() {
   UnitTestGenericComponent<TanhComponent>();
   UnitTestGenericComponent<PermuteComponent>();
   UnitTestGenericComponent<SoftmaxComponent>();
+  UnitTestSigmoidComponent();
+  UnitTestReduceComponent();
   UnitTestAffineComponent();
   UnitTestAffinePreconInputComponent();
   UnitTestBlockAffineComponent();
@@ -47,6 +47,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) {
     ans = new TanhComponent();
   } else if (component_type == "SoftmaxComponent") {
     ans = new SoftmaxComponent();
+  } else if (component_type == "ReduceComponent") {
+    ans = new ReduceComponent();
   } else if (component_type == "AffineComponent") {
     ans = new AffineComponent();
   } else if (component_type == "AffineComponentA") {
@@ -407,20 +409,8 @@ void TanhComponent::Propagate(const MatrixBase<BaseFloat> &in,
   // Apply tanh function to each element of the output...
   // the tanh function may be written as -1 + ( 2 / (1 + e^{-2 x})),
   // which is a scaled and shifted sigmoid.
-  out->Resize(in.NumRows(), in.NumCols());
-  int32 num_rows = in.NumRows(), num_cols = in.NumCols();
-  for(int32 r = 0; r < num_rows; r++) {
-    const BaseFloat *in_data = in.RowData(r),
-        *in_data_end = in_data + num_cols;
-    BaseFloat *out_data = out->RowData(r);
-    for (; in_data != in_data_end; ++in_data, ++out_data) {
-      if (*in_data > 0.0) {
-        *out_data = -1.0 + 2.0 / (1.0 + exp(-2.0 * *in_data));
-      } else {
-        *out_data = 1.0 - 2.0 / (1.0 + exp(2.0 * *in_data));
-      }
-    }
-  }
+  *out = in;
+  out->ApplyTanh();
 }

 void TanhComponent::Backprop(const MatrixBase<BaseFloat> &, // in_value
@@ -502,6 +492,67 @@ void SoftmaxComponent::Backprop(const MatrixBase<BaseFloat> &, // in_value
   }
 }

+void ReduceComponent::InitFromString(std::string args) {
+  std::string orig_args(args);
+  int32 dim, n;
+  bool ok = ParseFromString("dim", &args, &dim) &&
+      ParseFromString("n", &args, &n);
+  if (!args.empty())
+    KALDI_ERR << "Could not process these elements in initializer: "
+              << args;
+  if (!ok)
+    KALDI_ERR << "Bad initializer " << orig_args;
+  Init(dim, n);
+}
+
+void ReduceComponent::Read(std::istream &is, bool binary) {
+  ExpectOneOrTwoTokens(is, binary, "<ReduceComponent>", "<Dim>");
+  ReadBasicType(is, binary, &dim_);
+  ExpectToken(is, binary, "<N>");
+  ReadBasicType(is, binary, &n_);
+  ExpectToken(is, binary, "</ReduceComponent>");
+}
+
+void ReduceComponent::Write(std::ostream &os, bool binary) const {
+  WriteToken(os, binary, "<ReduceComponent>");
+  WriteToken(os, binary, "<Dim>");
+  WriteBasicType(os, binary, dim_);
+  WriteToken(os, binary, "<N>");
+  WriteBasicType(os, binary, n_);
+  WriteToken(os, binary, "</ReduceComponent>");
+}
+
+void ReduceComponent::Propagate(const MatrixBase<BaseFloat> &in,
+                                int32 num_chunks,
+                                Matrix<BaseFloat> *out) const {
+  KALDI_ASSERT(in.NumRows() > 0 && in.NumCols() == InputDim());
+  out->Resize(in.NumRows(), OutputDim());
+  int32 num_frames = in.NumRows(), input_dim = in.NumCols(), n = n_;
+  for (int32 r = 0; r < num_frames; r++) {
+    const BaseFloat *src = in.RowData(r);
+    BaseFloat *dest = out->RowData(r);
+    for (int32 c = 0; c < input_dim; c++)
+      dest[c / n] += src[c];
+  }
+}
+
+void ReduceComponent::Backprop(const MatrixBase<BaseFloat> &, // in_value,
+                               const MatrixBase<BaseFloat> &, // out_value,
+                               const MatrixBase<BaseFloat> &out_deriv,
+                               int32, // num_chunks
+                               Component *, // to_update
+                               Matrix<BaseFloat> *in_deriv) const {
+  int32 num_frames = out_deriv.NumRows(),
+      input_dim = InputDim(), n = n_;
+  in_deriv->Resize(num_frames, input_dim, kUndefined);
+  for (int32 r = 0; r < num_frames; r++) {
+    const BaseFloat *src = out_deriv.RowData(r);
+    BaseFloat *dest = in_deriv->RowData(r);
+    for (int32 c = 0; c < input_dim; c++)
+      dest[c] = src[c / n];
+  }
+}
+
 void AffineComponent::Scale(BaseFloat scale) {
   linear_params_.Scale(scale);
   bias_params_.Scale(scale);
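ReduceComponent's forward and backward maps are pure index arithmetic: output c/n accumulates input c, and each input derivative is a copy of its group's output derivative, which is why the component reports that backprop needs neither the input nor the output values. A standalone single-frame sketch mirroring the loops above (plain C++, not Kaldi code):

    #include <cassert>
    #include <vector>

    int main() {
      const int input_dim = 5, n = 2, output_dim = (input_dim + n - 1) / n;  // = 3
      // Forward: sum each group of n inputs into one output.
      std::vector<double> in = {1, 2, 3, 4, 5}, out(output_dim, 0.0);
      for (int c = 0; c < input_dim; c++) out[c / n] += in[c];
      assert(out[0] == 3 && out[1] == 7 && out[2] == 5);

      // Backward: broadcast each output-derivative to its n inputs.
      std::vector<double> out_deriv = {0.1, 0.2, 0.3}, in_deriv(input_dim);
      for (int c = 0; c < input_dim; c++) in_deriv[c] = out_deriv[c / n];
      assert(in_deriv[0] == 0.1 && in_deriv[1] == 0.1 && in_deriv[4] == 0.3);
      return 0;
    }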
@@ -859,9 +910,9 @@ void AffineComponentPreconditioned::Update(
     in_value_temp(i, in_value.NumCols()) = 1.0;

   Matrix<BaseFloat> in_value_precon(in_value_temp.NumRows(),
-                                    in_value_temp.NumCols()),
+                                    in_value_temp.NumCols(), kUndefined),
       out_deriv_precon(out_deriv.NumRows(),
-                       out_deriv.NumCols());
+                       out_deriv.NumCols(), kUndefined);
   // each row of in_value_precon will be that same row of
   // in_value, but multiplied by the inverse of a Fisher
   // matrix that has been estimated from all the other rows,
@@ -225,6 +225,8 @@ class NonlinearComponent: public Component {
   void Scale(BaseFloat scale);
   void Add(BaseFloat alpha, const NonlinearComponent &other);

+  // The following functions are unique to NonlinearComponent.
+  // They mostly relate to diagnostics.
   const Vector<double> &ValueSum() const { return value_sum_; }
   const Vector<double> &DerivSum() const { return deriv_sum_; }
   double Count() const { return count_; }
@@ -324,6 +326,37 @@ class SoftmaxComponent: public NonlinearComponent {
   SoftmaxComponent &operator = (const SoftmaxComponent &other); // Disallow.
 };

+/// This layer just sums up groups of n inputs to produce one output.
+class ReduceComponent: public Component {
+ public:
+  void Init(int32 dim, int32 n) { KALDI_ASSERT(dim > 0 && n > 0); dim_ = dim; n_ = n; }
+  ReduceComponent(int32 dim, int32 n) { Init(dim, n); }
+  ReduceComponent(): dim_(0), n_(0) { } // e.g. prior to Read()
+  explicit ReduceComponent(const ReduceComponent &other):
+      dim_(other.dim_), n_(other.n_) {}
+  virtual Component* Copy() const { return new ReduceComponent(*this); }
+  virtual std::string Type() const { return "ReduceComponent"; }
+  virtual int32 InputDim() const { return dim_; }
+  virtual int32 OutputDim() const { return (dim_ + n_ - 1) / n_; }
+  virtual void InitFromString(std::string args);
+  virtual void Read(std::istream &is, bool binary);
+  virtual void Write(std::ostream &os, bool binary) const;
+  virtual void Propagate(const MatrixBase<BaseFloat> &in,
+                         int32 num_chunks,
+                         Matrix<BaseFloat> *out) const;
+  virtual void Backprop(const MatrixBase<BaseFloat> &in_value,
+                        const MatrixBase<BaseFloat> &out_value,
+                        const MatrixBase<BaseFloat> &out_deriv,
+                        int32 num_chunks,
+                        Component *to_update, // may be identical to "this".
+                        Matrix<BaseFloat> *in_deriv) const;
+  virtual bool BackpropNeedsInput() const { return false; }
+  virtual bool BackpropNeedsOutput() const { return false; }
+ private:
+  int32 dim_;
+  int32 n_;
+};
+
+
 // Affine means a linear function plus an offset.
 // Note: although this class can be instantiated, it also
@@ -0,0 +1,108 @@
+// nnet/nnet-limit-rank.cc
+
+// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "nnet-cpu/nnet-limit-rank.h"
+#include "thread/kaldi-task-sequence.h"
+
+namespace kaldi {
+
+class LimitRankClass {
+ public:
+  LimitRankClass(const NnetLimitRankOpts &opts,
+                 int32 c,
+                 Nnet *nnet): opts_(opts), c_(c), nnet_(nnet) { }
+  void operator () () {
+    AffineComponent *ac = dynamic_cast<AffineComponent*>(
+        &(nnet_->GetComponent(c_)));
+    KALDI_ASSERT(ac != NULL);
+
+    // We'll limit the rank of just the linear part, keeping the bias vector full.
+    Matrix<BaseFloat> M(ac->LinearParams());
+    int32 rows = M.NumRows(), cols = M.NumCols(), rc_min = std::min(rows, cols);
+    Vector<BaseFloat> s(rc_min);
+    Matrix<BaseFloat> U(rows, rc_min), Vt(rc_min, cols);
+    // Do the destructive svd M = U diag(s) V^T.  It actually outputs the transpose of V.
+    M.DestructiveSvd(&s, &U, &Vt);
+    SortSvd(&s, &U, &Vt); // Sort the singular values from largest to smallest.
+
+    int32 d = GetRetainedDim(rows, cols);
+    BaseFloat old_svd_sum = s.Sum();
+    U.Resize(rows, d, kCopyData);
+    s.Resize(d, kCopyData);
+    Vt.Resize(d, cols, kCopyData);
+    BaseFloat new_svd_sum = s.Sum();
+    KALDI_LOG << "For component " << c_ << " of dimension " << rows
+              << " x " << cols << ", reduced rank from "
+              << rc_min << " to " << d << ", SVD sum reduced from "
+              << old_svd_sum << " to " << new_svd_sum;
+    Vt.MulRowsVec(s); // Vt <-- diag(s) Vt.
+    M.AddMatMat(1.0, U, kNoTrans, Vt, kNoTrans, 0.0); // Reconstruct with reduced
+    // rank.
+    Vector<BaseFloat> bias_params(ac->BiasParams());
+    ac->SetParams(bias_params, M);
+  }
+
+  int32 GetRetainedDim(int32 rows, int32 cols) {
+    if (opts_.parameter_proportion <= 0.0 || opts_.parameter_proportion > 1.0)
+      KALDI_ERR << "bad --parameter-proportion " << opts_.parameter_proportion;
+    // If we do SVD to dimension d, so that it's U diag(s) V^T where
+    // U is rows * d, s is d, and V is cols * d, then the #params is as follows...
+    // the first column of U has free parameters (#rows - 1) [the -1 is due to
+    // the length constraint]; the second has (#rows - 2) [subtract 1 for the
+    // length constraint and one for orthogonality with the previous row], etc.
+    // Total is params(U) = (rows * d) - ((d(d+1))/2),
+    // params(s) = d,
+    // params(V) = (cols * d) - ((d(d+1))/2),
+    // So total is (rows + cols) * d - d * d .
+    // For example, if d = #rows, this equals (#rows * #cols)
+    // We are solving for:
+    // (rows * cols) * parameter_proportion = (rows + cols) * d - d * d, or
+    // d^2 - d * (rows + cols) + (rows*cols)*parameter_proportion
+    // In quadratic equation
+    // a = 1.0,
+    // b = -(rows + cols)
+    // c = rows * cols * parameter_proportion.
+    // Take smaller solution.
+    BaseFloat a = 1.0, b = -(rows + cols),
+        c = rows * cols * opts_.parameter_proportion;
+    BaseFloat x = (-b - sqrt(b * b - 4 * a * c)) / (2.0 * a);
+    int32 ans = static_cast<int32>(x);
+    KALDI_ASSERT(ans > 0 && ans <= std::min(rows, cols));
+    return ans;
+  }
+
+  ~LimitRankClass() { }
+ private:
+  const NnetLimitRankOpts &opts_;
+  int32 c_;
+  Nnet *nnet_;
+};
+
+void LimitRankParallel(const NnetLimitRankOpts &opts,
+                       Nnet *nnet) {
+  TaskSequencerConfig task_config;
+  task_config.num_threads = opts.num_threads;
+  TaskSequencer<LimitRankClass> tc(task_config);
+  for (int32 c = 0; c < nnet->NumComponents(); c++) {
+    if (dynamic_cast<AffineComponent*>(&(nnet->GetComponent(c))) != NULL)
+      tc.Run(new LimitRankClass(opts, c, nnet));
+  }
+}
+
+} // namespace
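GetRetainedDim's comment block is the whole derivation: a rank-d factorization of a rows x cols matrix costs about (rows + cols)*d - d*d free parameters, so the retained dimension is the smaller root of d^2 - (rows + cols)*d + rows*cols*p = 0. A standalone check of that arithmetic (the dimensions and proportion below are made up for illustration):

    #include <cassert>
    #include <cmath>
    #include <cstdio>

    // Smaller root of d^2 - (rows + cols) d + rows * cols * p = 0,
    // mirroring GetRetainedDim above.
    int RetainedDim(int rows, int cols, double p) {
      double b = -(rows + cols), c = static_cast<double>(rows) * cols * p;
      double d = (-b - std::sqrt(b * b - 4.0 * c)) / 2.0;  // smaller root
      return static_cast<int>(d);
    }

    int main() {
      int rows = 1000, cols = 500, d = RetainedDim(rows, cols, 0.75);
      double params = (rows + cols) * static_cast<double>(d)
                      - static_cast<double>(d) * d;
      // The retained parameter count should come out near 75% of rows * cols.
      std::printf("d = %d, param proportion = %.3f\n",
                  d, params / (static_cast<double>(rows) * cols));
      assert(d > 0 && d <= 500);
      return 0;
    }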
@@ -0,0 +1,56 @@
+// nnet-cpu/nnet-limit-rank.h
+
+// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_NNET_CPU_NNET_LIMIT_RANK_H_
+#define KALDI_NNET_CPU_NNET_LIMIT_RANK_H_
+
+#include "nnet-cpu/nnet-nnet.h"
+#include "util/table-types.h"
+#include "thread/kaldi-semaphore.h"
+#include "thread/kaldi-thread.h"
+#include "nnet-cpu/nnet-update.h"
+
+namespace kaldi {
+
+struct NnetLimitRankOpts {
+  int32 num_threads;
+  BaseFloat parameter_proportion;
+
+  NnetLimitRankOpts(): num_threads(1), parameter_proportion(0.75) { }
+
+  void Register(ParseOptions *po) {
+    po->Register("num-threads", &num_threads, "Number of threads used for "
+                 "rank-limiting operation; note, will never use more than "
+                 "#layers.");
+    po->Register("parameter-proportion", &parameter_proportion, "Proportion of "
+                 "dimension of each transform to limit the rank to.");
+  }
+};
+
+/// This function limits the rank of each affine transform in the
+/// neural net, by zeroing out the smallest singular values.  The number of
+/// singular values to zero out is determined on a layer by layer basis, using
+/// "parameter_proportion" to set the proportion of parameters to remove.
+void LimitRankParallel(const NnetLimitRankOpts &opts,
+                       Nnet *nnet);
+
+} // namespace
+
+#endif // KALDI_NNET_CPU_NNET_LIMIT_RANK_H_
@@ -361,6 +361,17 @@ void Nnet::RemoveDropout() {
   KALDI_LOG << "Removed " << removed << " dropout components.";
 }

+void Nnet::RemovePreconditioning() {
+  for (size_t i = 0; i < components_.size(); i++) {
+    if (dynamic_cast<AffineComponentPreconditioned*>(components_[i]) != NULL) {
+      AffineComponent *ac = new AffineComponent(
+          *(dynamic_cast<AffineComponent*>(components_[i])));
+      delete components_[i];
+      components_[i] = ac;
+    }
+  }
+}
+
 void Nnet::AddNnet(const VectorBase<BaseFloat> &scale_params,
                    const Nnet &other) {
   KALDI_ASSERT(scale_params.Dim() == this->NumUpdatableComponents());
@@ -105,6 +105,10 @@ class Nnet {
   /// Excise any components of type DropoutComponent.
   void RemoveDropout();

+  /// Replace any components of type AffineComponentPreconditioned with
+  /// components of type AffineComponent.
+  void RemovePreconditioning();
+
   /// For each updatatable component, adds to it
   /// the corresponding element of "other" times the
   /// appropriate element of "scales" (which has the
@@ -25,7 +25,13 @@ void PreconditionDirections(const MatrixBase<BaseFloat> &R,
                             MatrixBase<BaseFloat> *P) {

   int32 N = R.NumRows(), D = R.NumCols();
-  KALDI_ASSERT(SameDim(R, *P) && N > 1);
+  KALDI_ASSERT(SameDim(R, *P) && N > 0);
+  if (N == 1) {
+    KALDI_WARN << "Trying to precondition set of only one frame: returning "
+               << "unchanged.  Ignore this warning if infrequent.";
+    P->CopyFromMat(R);
+    return;
+  }
   MatrixBase<BaseFloat> &Q = *P;

   if (N >= D) {
@@ -0,0 +1,93 @@
+// nnet-cpu/nnet-stats.h
+
+// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_NNET_CPU_NNET_STATS_H_
+#define KALDI_NNET_CPU_NNET_STATS_H_
+
+#include "nnet-cpu/nnet-nnet.h"
+
+namespace kaldi {
+
+/* This program computes various statistics from a neural net.  These are
+   summaries of certain quantities already present in the network as
+   stored on disk, especially regarding certain average values and
+   derivatives of the sigmoids.
+*/
+
+struct NnetStatsConfig {
+  BaseFloat bucket_width;
+  NnetStatsConfig(): bucket_width(0.025) { }
+
+  void Register(ParseOptions *po) {
+    po->Register("bucket-width", &bucket_width, "Width of bucket in average-derivative "
+                 "stats for analysis.");
+  }
+};
+
+class NnetStats {
+ public:
+  NnetStats(int32 affine_component_index, BaseFloat bucket_width):
+      affine_component_index_(affine_component_index),
+      bucket_width_(bucket_width), global_(0, -1) { }
+
+  // Use default copy constructor and assignment operator.
+
+  void AddStats(BaseFloat avg_deriv, BaseFloat avg_value);
+
+  void AddStatsFromNnet(const Nnet &nnet);
+
+  void PrintStats(std::ostream &os);
+ private:
+
+  struct StatsElement {
+    BaseFloat deriv_begin; // avg-deriv, beginning of bucket.
+    BaseFloat deriv_end; // avg-deriv, end of bucket.
+    BaseFloat deriv_sum; // sum of avg-deriv within bucket.
+    BaseFloat deriv_sumsq; // Sum-squared of avg-deriv within bucket.
+    BaseFloat abs_value_sum; // Sum of abs(avg-value).  Tells us whether it's
+    // saturating at one or both ends.
+    BaseFloat abs_value_sumsq; // Sum-squared of abs(avg-value).
+    int32 count; // Number of nonlinearities in this bucket.
+
+    StatsElement(BaseFloat deriv_begin,
+                 BaseFloat deriv_end):
+        deriv_begin(deriv_begin), deriv_end(deriv_end), deriv_sum(0.0),
+        deriv_sumsq(0.0), abs_value_sum(0.0), abs_value_sumsq(0.0), count(0) { }
+    void AddStats(BaseFloat avg_deriv, BaseFloat avg_value);
+    // Outputs stats for this bucket; no newline
+    void PrintStats(std::ostream &os);
+  };
+  int32 BucketFor(BaseFloat avg_deriv); // returns the bucket
+  // for this avg-derivative value, and makes sure it is allocated.
+
+  int32 affine_component_index_; // Component index of the affine component
+  // associated with this nonlinearity.
+  BaseFloat bucket_width_; // width of buckets of stats we store (in derivative values).
+
+  std::vector<StatsElement> buckets_; // Stats divided into buckets by avg_deriv.
+  StatsElement global_; // All the stats.
+};
+
+void GetNnetStats(const NnetStatsConfig &config,
+                  const Nnet &nnet,
+                  std::vector<NnetStats> *stats);
+
+} // namespace
+
+#endif // KALDI_NNET_CPU_NNET_STATS_H_
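The diff declares BucketFor but never shows its body; one plausible reading, given the fixed bucket_width default of 0.025, is plain fixed-width bucketing of the average-derivative value. A hypothetical sketch under that assumption (the real BucketFor also allocates buckets on demand, which is omitted here):

    #include <cassert>
    #include <cmath>

    // Hypothetical fixed-width bucketing of an average-derivative value;
    // not the actual Kaldi implementation, whose body is not in this diff.
    int BucketFor(double avg_deriv, double bucket_width) {
      assert(avg_deriv >= 0.0);
      return static_cast<int>(std::floor(avg_deriv / bucket_width));
    }

    int main() {
      const double width = 0.025;             // default bucket_width above
      assert(BucketFor(0.0, width) == 0);
      assert(BucketFor(0.26, width) == 10);   // falls in [0.25, 0.275)
      return 0;
    }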
@@ -44,7 +44,6 @@ class NnetUpdater {
   // Possibly splices input together from forward_data_[component].
   // MatrixBase<BaseFloat> &GetSplicedInput(int32 component, Matrix<BaseFloat> *temp_matrix);
-
   void Propagate();

   /// Computes objective function and derivative at output layer.
@@ -156,7 +155,7 @@ void NnetUpdater::Backprop(const std::vector<NnetTrainingExample> &data,

     component.Backprop(input, output, output_deriv, num_chunks,
                        component_to_update, &input_deriv);
-    *deriv = input_deriv;
+    input_deriv.Swap(deriv);
   }
 }
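The one-line change from *deriv = input_deriv to input_deriv.Swap(deriv) turns a full matrix copy into an O(1) exchange of the underlying buffers; input_deriv is about to go out of scope, so its old contents are never needed. The std::vector analogue (a sketch, not Kaldi code):

    #include <vector>

    int main() {
      std::vector<float> input_deriv(1 << 20, 0.5f);  // large scratch result
      std::vector<float> deriv;
      // swap() exchanges internal pointers in constant time;
      // "deriv = input_deriv" would copy every element instead.
      deriv.swap(input_deriv);
      return 0;
    }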
@@ -0,0 +1,212 @@
+// nnet/rescale-nnet.cc
+
+// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "nnet-cpu/rescale-nnet.h"
+
+namespace kaldi {
+
+class NnetRescaler {
+ public:
+  NnetRescaler(const NnetRescaleConfig &config,
+               const std::vector<NnetTrainingExample> &examples,
+               Nnet *nnet):
+      config_(config), examples_(examples), nnet_(nnet) {}
+
+  void Rescale();
+
+ private:
+  /// takes the input and formats as a single matrix, in forward_data_[0].
+  void FormatInput(const std::vector<NnetTrainingExample> &data,
+                   Matrix<BaseFloat> *input);
+  void RescaleComponent(int32 c, int32 num_chunks,
+                        MatrixBase<BaseFloat> *cur_data_in,
+                        Matrix<BaseFloat> *next_data);
+
+  void ComputeRelevantIndexes();
+
+  BaseFloat GetTargetAvgDeriv(int32 c);
+
+  const NnetRescaleConfig &config_;
+  const std::vector<NnetTrainingExample> &examples_;
+  Nnet *nnet_;
+  std::set<int32> relevant_indexes_; // values of c with AffineComponent followed
+  // by (at c+1) NonlinearComponent that is not SoftmaxComponent.
+};
+
+void NnetRescaler::FormatInput(const std::vector<NnetTrainingExample> &data,
+                               Matrix<BaseFloat> *input) {
+  KALDI_ASSERT(data.size() > 0);
+  int32 num_splice = nnet_->LeftContext() + 1 + nnet_->RightContext();
+  KALDI_ASSERT(data[0].input_frames.NumRows() == num_splice);
+
+  int32 feat_dim = data[0].input_frames.NumCols(),
+      spk_dim = data[0].spk_info.Dim(),
+      tot_dim = feat_dim + spk_dim; // we append these at the neural net
+  // input... note, spk_dim might be 0.
+  KALDI_ASSERT(tot_dim == nnet_->InputDim());
+  int32 num_chunks = data.size();
+
+  input->Resize(num_splice * num_chunks,
+                tot_dim);
+  for (int32 chunk = 0; chunk < num_chunks; chunk++) {
+    SubMatrix<BaseFloat> dest(*input,
+                              chunk * num_splice, num_splice,
+                              0, feat_dim);
+    const Matrix<BaseFloat> &src(data[chunk].input_frames);
+    dest.CopyFromMat(src);
+    if (spk_dim != 0) {
+      SubMatrix<BaseFloat> spk_dest(*input,
+                                    chunk * num_splice, num_splice,
+                                    feat_dim, spk_dim);
+      spk_dest.CopyRowsFromVec(data[chunk].spk_info);
+    }
+  }
+}
+
+void NnetRescaler::ComputeRelevantIndexes() {
+  for (int32 c = 0; c + 1 < nnet_->NumComponents(); c++)
+    if (dynamic_cast<AffineComponent*>(&nnet_->GetComponent(c)) != NULL &&
+        (dynamic_cast<NonlinearComponent*>(&nnet_->GetComponent(c+1)) != NULL &&
+         dynamic_cast<SoftmaxComponent*>(&nnet_->GetComponent(c+1)) == NULL))
+      relevant_indexes_.insert(c);
+}
+
+BaseFloat NnetRescaler::GetTargetAvgDeriv(int32 c) {
+  KALDI_ASSERT(relevant_indexes_.count(c) == 1);
+  BaseFloat factor;
+  if (dynamic_cast<SigmoidComponent*>(&(nnet_->GetComponent(c + 1))) != NULL)
+    factor = 0.25;
+  else if (dynamic_cast<TanhComponent*>(&(nnet_->GetComponent(c + 1))) != NULL)
+    factor = 1.0;
+  else
+    KALDI_ERR << "This type of nonlinear component is not handled: index " << c;
+
+  int32 last_c = *std::max_element(relevant_indexes_.begin(), relevant_indexes_.end()),
+      first_c = *std::min_element(relevant_indexes_.begin(), relevant_indexes_.end());
+  if (c == first_c)
+    return factor * config_.target_first_layer_avg_deriv;
+  else if (c == last_c)
+    return factor * config_.target_last_layer_avg_deriv;
+  else
+    return factor * config_.target_avg_deriv;
+}
+
+// Here, c is the index of the affine component, and
+// c + 1 is the index of the nonlinear component; *cur_data is the
+// output of the affine component.
+void NnetRescaler::RescaleComponent(
+    int32 c,
+    int32 num_chunks,
+    MatrixBase<BaseFloat> *cur_data_in,
+    Matrix<BaseFloat> *next_data) {
+  int32 rows = cur_data_in->NumRows(), cols = cur_data_in->NumCols();
+  // Only handle sigmoid or tanh here.
+  if (dynamic_cast<SigmoidComponent*>(&(nnet_->GetComponent(c + 1))) == NULL &&
+      dynamic_cast<TanhComponent*>(&(nnet_->GetComponent(c + 1))) == NULL)
+    KALDI_ERR << "This type of nonlinear component is not handled: index " << c;
+  // the nonlinear component:
+  NonlinearComponent &nc =
+      *(dynamic_cast<NonlinearComponent*>(&(nnet_->GetComponent(c + 1))));
+
+  BaseFloat orig_avg_deriv, target_avg_deriv = GetTargetAvgDeriv(c);
+  BaseFloat cur_scaling = 1.0; // current rescaling factor (on input).
+  int32 num_iters = 10;
+
+  Matrix<BaseFloat> cur_data(*cur_data_in),
+      ones(rows, cols), in_deriv(rows, cols);
+
+  ones.Set(1.0);
+  nc.Propagate(cur_data, num_chunks, next_data);
+  nc.Backprop(cur_data, *next_data, ones, num_chunks, NULL, &in_deriv);
+  BaseFloat cur_avg_deriv;
+  cur_avg_deriv = in_deriv.Sum() / (rows * cols);
+  orig_avg_deriv = cur_avg_deriv;
+  for (int32 iter = 0; iter < num_iters; iter++) {
+    // We already have "cur_avg_deriv"; perturb the scale and compute
+    // the next avg_deriv, so we can see how it changes with the scale.
+    cur_data.CopyFromMat(*cur_data_in);
+    cur_data.Scale(cur_scaling + config_.delta);
+    nc.Propagate(cur_data, num_chunks, next_data);
+    nc.Backprop(cur_data, *next_data, ones, num_chunks, NULL, &in_deriv);
+    BaseFloat next_avg_deriv = in_deriv.Sum() / (rows * cols);
+    KALDI_ASSERT(next_avg_deriv < cur_avg_deriv);
+    // "gradient" is how avg_deriv changes as we change the scale.
+    // should be negative.
+    BaseFloat gradient = (next_avg_deriv - cur_avg_deriv) / config_.delta;
+    KALDI_ASSERT(gradient < 0.0);
+    BaseFloat proposed_change = (target_avg_deriv - cur_avg_deriv) / gradient;
+    KALDI_VLOG(2) << "cur_avg_deriv = " << cur_avg_deriv << ", target_avg_deriv = "
+                  << target_avg_deriv << ", gradient = " << gradient
+                  << ", proposed_change " << proposed_change;
+    // Limit size of proposed change in "cur_scaling", to ensure stability.
+    if (fabs(proposed_change / cur_scaling) > config_.max_change)
+      proposed_change = cur_scaling * config_.max_change *
+          (proposed_change > 0.0 ? 1.0 : -1.0);
+    cur_scaling += proposed_change;
+
+    cur_data.CopyFromMat(*cur_data_in);
+    cur_data.Scale(cur_scaling);
+    nc.Propagate(cur_data, num_chunks, next_data);
+    nc.Backprop(cur_data, *next_data, ones, num_chunks, NULL, &in_deriv);
+    cur_avg_deriv = in_deriv.Sum() / (rows * cols);
+    if (fabs(proposed_change) < config_.min_change) break; // Terminate the
+    // optimization
+  }
+  UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(
+      &nnet_->GetComponent(c));
+  KALDI_ASSERT(uc != NULL);
+  uc->Scale(cur_scaling); // scale the parameters of the previous
+  // AffineComponent.
+
+  KALDI_LOG << "For component " << c << ", scaling parameters by "
+            << cur_scaling << "; average "
+            << "derivative changed from " << orig_avg_deriv << " to "
+            << cur_avg_deriv << "; target was " << target_avg_deriv;
+}
+
+void NnetRescaler::Rescale() {
+  ComputeRelevantIndexes(); // set up relevant_indexes_.
+  Matrix<BaseFloat> cur_data, next_data;
+  FormatInput(examples_, &cur_data);
+  int32 num_chunks = examples_.size();
+  for (int32 c = 0; c < nnet_->NumComponents(); c++) {
+    Component &component = nnet_->GetComponent(c);
+    if (relevant_indexes_.count(c - 1) == 1) {
+      // the following function call also appropriately sets "next_data"
+      // after doing the rescaling
+      RescaleComponent(c - 1, num_chunks, &cur_data, &next_data);
+    } else {
+      component.Propagate(cur_data, num_chunks, &next_data);
+    }
+    cur_data.Swap(&next_data);
+  }
+}
+
+void RescaleNnet(const NnetRescaleConfig &rescale_config,
+                 const std::vector<NnetTrainingExample> &examples,
+                 Nnet *nnet) {
+  NnetRescaler rescaler(rescale_config, examples, nnet);
+  rescaler.Rescale();
+}
+
+} // namespace
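RescaleComponent is essentially a one-dimensional root-finder: treat the measured average derivative as a function of the input scale, estimate its slope with a forward difference of width delta, then take a Newton-style step toward the target, clipped by max_change and terminated once steps fall below min_change. A standalone sketch of that loop on a made-up stand-in for the measured avg-deriv (the real code measures it via Propagate/Backprop on sample data):

    #include <cmath>
    #include <cstdio>

    int main() {
      // Crude stand-in for the measured average derivative: pretend the
      // typical pre-nonlinearity value is 2 * scale and the nonlinearity
      // is tanh, whose derivative is 1 - tanh^2.
      auto f = [](double s) {
        double t = std::tanh(2.0 * s);
        return 1.0 - t * t;
      };
      double target = 0.2, scale = 1.0, delta = 0.01, max_change = 0.2;
      for (int iter = 0; iter < 10; iter++) {
        double cur = f(scale);
        std::printf("iter %d: scale = %.4f, avg-deriv = %.4f\n", iter, scale, cur);
        double grad = (f(scale + delta) - cur) / delta;  // negative: deriv falls as scale grows
        double change = (target - cur) / grad;           // Newton-style proposal
        if (std::fabs(change / scale) > max_change)      // clip, as in the real code
          change = scale * max_change * (change > 0.0 ? 1.0 : -1.0);
        scale += change;
        if (std::fabs(change) < 1.0e-5) break;           // min_change termination
      }
      return 0;
    }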
@@ -0,0 +1,76 @@
+// nnet-cpu/rescale-nnet.h
+
+// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_NNET_CPU_RESCALE_NNET_H_
+#define KALDI_NNET_CPU_RESCALE_NNET_H_
+
+#include "nnet-cpu/nnet-update.h"
+#include "nnet-cpu/nnet-compute.h"
+#include "util/parse-options.h"
+
+// Neural net rescaling is a rescaling of the parameters of the various layers
+// of a neural net, done so as to match certain specified statistics on the
+// average derivative of the sigmoid, measured on sample data.  This relates to
+// how "saturated" the sigmoid is.
+
+namespace kaldi {
+
+struct NnetRescaleConfig {
+  BaseFloat target_avg_deriv;
+  BaseFloat target_first_layer_avg_deriv;
+  BaseFloat target_last_layer_avg_deriv;
+
+  // These are relatively unimportant; for now they have no
+  // command line options.
+  BaseFloat num_iters;
+  BaseFloat delta;
+  BaseFloat max_change; // maximum change on any one iteration (to
+  // ensure stability).
+  BaseFloat min_change; // minimum change on any one iteration (controls
+  // termination)
+
+  NnetRescaleConfig(): target_avg_deriv(0.2),
+                       target_first_layer_avg_deriv(0.3),
+                       target_last_layer_avg_deriv(0.1),
+                       num_iters(10),
+                       delta(0.01),
+                       max_change(0.2), min_change(1.0e-05) { }
+
+  void Register(ParseOptions *po) {
+    po->Register("target-avg-deriv", &target_avg_deriv, "Target average derivative "
+                 "for hidden layers that are not the first or last hidden layer "
+                 "(as fraction of maximum derivative of the nonlinearity)");
+    po->Register("target-first-layer-avg-deriv", &target_first_layer_avg_deriv,
+                 "Target average derivative for the first hidden layer "
+                 "(as fraction of maximum derivative of the nonlinearity)");
+    po->Register("target-last-layer-avg-deriv", &target_last_layer_avg_deriv,
+                 "Target average derivative for the last hidden layer, if "
+                 "#hid-layers > 1 "
+                 "(as fraction of maximum derivative of the nonlinearity)");
+  }
+};
+
+void RescaleNnet(const NnetRescaleConfig &rescale_config,
+                 const std::vector<NnetTrainingExample> &examples,
+                 Nnet *nnet);
+
+} // namespace
+
+#endif
@@ -13,7 +13,8 @@ BINFILES = nnet-randomize-frames nnet-am-info nnet-train nnet-init \
            nnet-train-lbfgs nnet-get-egs nnet-train-parallel nnet-gradient \
            nnet-get-preconditioner nnet-precondition nnet-select-egs nnet-combine-fast \
            nnet-subset-egs nnet-shuffle-egs nnet-am-fix nnet-logprob nnet-logprob2 \
-           nnet-logprob2-parallel nnet-logprob-parallel
+           nnet-logprob2-parallel nnet-logprob-parallel nnet-am-stats nnet-am-rescale \
+           nnet-am-limit-rank

 OBJFILES =
@@ -41,6 +41,7 @@ int main(int argc, char *argv[]) {
     int32 truncate = -1;
     bool binary_write = true;
    bool remove_dropout = false;
+    bool remove_preconditioning = false;
     BaseFloat learning_rate_factor = 1.0, learning_rate = -1;
     std::string learning_rates = "";
     std::string scales = "";
@@ -64,6 +65,8 @@ int main(int argc, char *argv[]) {
                 "to this many components by removing the last components.");
     po.Register("remove-dropout", &remove_dropout, "Set this to true to remove "
                 "any dropout components.");
+    po.Register("remove-preconditioning", &remove_preconditioning, "Set this to true to replace "
+                "components of type AffineComponentPreconditioned with AffineComponent.");
     po.Register("stats-from", &stats_from, "Before copying neural net, copy the "
                 "statistics in any layer of type NonlinearComponent, from this "
                 "neural network: provide the extended filename.");
@@ -133,6 +136,8 @@ int main(int argc, char *argv[]) {

     if (remove_dropout) am_nnet.GetNnet().RemoveDropout();

+    if (remove_preconditioning) am_nnet.GetNnet().RemovePreconditioning();
+
     if (stats_from != "") {
       // Copy the stats associated with the layers descending from
       // NonlinearComponent.
@@ -41,7 +41,7 @@ int main(int argc, char *argv[]) {
         "e.g.:\n"
         " nnet-am-fix 1.mdl 1_fixed.mdl\n"
         "or:\n"
-        " nnet-am-shrink-rows --get-counts-from=1.gradient 1.mdl 1_shrunk.mdl\n";
+        " nnet-am-fix --get-counts-from=1.gradient 1.mdl 1_shrunk.mdl\n";

     bool binary_write = true;
     NnetFixConfig config;
@@ -0,0 +1,81 @@
+// nnet-cpubin/nnet-am-limit-rank.cc
+
+// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet-cpu/nnet-limit-rank.h"
+#include "nnet-cpu/am-nnet.h"
+#include "hmm/transition-model.h"
+#include "tree/context-dep.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    typedef kaldi::int32 int32;
+
+    const char *usage =
+        "Copy a (cpu-based) neural net and its associated transition model,\n"
+        "but modify it to reduce the effective parameter count by limiting\n"
+        "the rank of weight matrices.\n"
+        "\n"
+        "Usage: nnet-am-limit-rank [options] <nnet-in> <nnet-out>\n"
+        "e.g.:\n"
+        " nnet-am-limit-rank 1.mdl 1_limited.mdl\n";
+
+    bool binary_write = true;
+    NnetLimitRankOpts config;
+
+    ParseOptions po(usage);
+    po.Register("binary", &binary_write, "Write output in binary mode");
+    config.Register(&po);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string nnet_rxfilename = po.GetArg(1),
+        nnet_wxfilename = po.GetArg(2);
+
+    TransitionModel trans_model;
+    AmNnet am_nnet;
+    {
+      bool binary;
+      Input ki(nnet_rxfilename, &binary);
+      trans_model.Read(ki.Stream(), binary);
+      am_nnet.Read(ki.Stream(), binary);
+    }
+
+    LimitRankParallel(config, &am_nnet.GetNnet());
+
+    {
+      Output ko(nnet_wxfilename, binary_write);
+      trans_model.Write(ko.Stream(), binary_write);
+      am_nnet.Write(ko.Stream(), binary_write);
+    }
+    KALDI_LOG << "Copied neural net from " << nnet_rxfilename
+              << " to " << nnet_wxfilename;
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
@@ -0,0 +1,92 @@
// nnet-cpubin/nnet-am-rescale.cc

// Copyright 2012  Johns Hopkins University (author: Daniel Povey)

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "nnet-cpu/rescale-nnet.h"
#include "nnet-cpu/am-nnet.h"
#include "tree/context-dep.h"

int main(int argc, char *argv[]) {
  try {
    using namespace kaldi;
    typedef kaldi::int32 int32;

    const char *usage =
        "Rescale the parameters in a neural net to achieve certain target\n"
        "statistics, relating to the average derivative of the sigmoids\n"
        "measured at some supplied data.  This relates to how saturated\n"
        "the sigmoids are (we try to match the statistics of `good' neural\n"
        "nets).\n"
        "\n"
        "Usage: nnet-am-rescale [options] <nnet-in> <examples-in> <nnet-out>\n"
        "e.g.:\n"
        " nnet-am-rescale 1.mdl valid.egs 1_rescaled.mdl\n";

    bool binary_write = true;
    NnetRescaleConfig config;

    ParseOptions po(usage);
    po.Register("binary", &binary_write, "Write output in binary mode");
    config.Register(&po);

    po.Read(argc, argv);

    if (po.NumArgs() != 3) {
      po.PrintUsage();
      exit(1);
    }

    std::string nnet_rxfilename = po.GetArg(1),
        egs_rspecifier = po.GetArg(2),
        nnet_wxfilename = po.GetArg(3);

    TransitionModel trans_model;
    AmNnet am_nnet;
    {
      bool binary;
      Input ki(nnet_rxfilename, &binary);
      trans_model.Read(ki.Stream(), binary);
      am_nnet.Read(ki.Stream(), binary);
    }

    std::vector<NnetTrainingExample> egs;

    // This block adds samples to "egs".
    SequentialNnetTrainingExampleReader example_reader(egs_rspecifier);
    for (; !example_reader.Done(); example_reader.Next())
      egs.push_back(example_reader.Value());
    KALDI_LOG << "Read " << egs.size() << " examples.";
    KALDI_ASSERT(!egs.empty());

    RescaleNnet(config, egs, &am_nnet.GetNnet());

    {
      Output ko(nnet_wxfilename, binary_write);
      trans_model.Write(ko.Stream(), binary_write);
      am_nnet.Write(ko.Stream(), binary_write);
    }
    KALDI_LOG << "Rescaled neural net and wrote it to " << nnet_wxfilename;
    return 0;
  } catch(const std::exception &e) {
    std::cerr << e.what() << '\n';
    return -1;
  }
}
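As background for the usage message above: the "average derivative of the sigmoids" is the mean of s(x)(1 - s(x)) over the supplied data; it sits near 0.25 when units operate in their linear range and falls toward 0 as they saturate, which is the statistic the tool tries to match against "good" nets. A self-contained sketch of the quantity follows; MeanSigmoidDeriv is an illustrative helper, not a Kaldi function.

#include <cmath>
#include <cstdio>
#include <vector>

// Illustrative only: mean sigmoid derivative over a set of pre-activation
// values.  Values near 0.25 mean the units operate in their linear range;
// values near 0 mean they are saturated.
static double MeanSigmoidDeriv(const std::vector<double> &pre_activations) {
  if (pre_activations.empty()) return 0.0;
  double sum = 0.0;
  for (size_t i = 0; i < pre_activations.size(); i++) {
    double s = 1.0 / (1.0 + std::exp(-pre_activations[i]));
    sum += s * (1.0 - s);  // derivative of the sigmoid at this input
  }
  return sum / pre_activations.size();
}

int main() {
  std::vector<double> near_linear = {-0.5, 0.1, 0.3, -0.2};
  std::vector<double> saturated = {8.0, -9.0, 7.5, -10.0};
  std::printf("near-linear: %.3f  saturated: %.3f\n",
              MeanSigmoidDeriv(near_linear), MeanSigmoidDeriv(saturated));
  return 0;
}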
@@ -0,0 +1,72 @@
// nnet-cpubin/nnet-am-stats.cc

// Copyright 2012  Johns Hopkins University (author: Daniel Povey)

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "nnet-cpu/nnet-stats.h"
#include "nnet-cpu/am-nnet.h"
#include "tree/context-dep.h"

int main(int argc, char *argv[]) {
  try {
    using namespace kaldi;
    typedef kaldi::int32 int32;

    const char *usage =
        "Print some statistics about the average derivatives of the sigmoid\n"
        "layers of the neural net, as stored in the net.\n"
        "\n"
        "Usage: nnet-am-stats [options] <nnet-in>\n"
        "e.g.:\n"
        " nnet-am-stats 1.mdl\n";

    NnetStatsConfig config;

    ParseOptions po(usage);
    config.Register(&po);

    po.Read(argc, argv);

    if (po.NumArgs() != 1) {
      po.PrintUsage();
      exit(1);
    }

    std::string nnet_rxfilename = po.GetArg(1);

    TransitionModel trans_model;
    AmNnet am_nnet;
    {
      bool binary;
      Input ki(nnet_rxfilename, &binary);
      trans_model.Read(ki.Stream(), binary);
      am_nnet.Read(ki.Stream(), binary);
    }

    std::vector<NnetStats> stats;
    GetNnetStats(config, am_nnet.GetNnet(), &stats);
    KALDI_ASSERT(!stats.empty());
    for (size_t i = 0; i < stats.size(); i++)
      stats[i].PrintStats(std::cout);
    return 0;
  } catch(const std::exception &e) {
    std::cerr << e.what() << '\n';
    return -1;
  }
}
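The loop above prints one NnetStats object per layer; the actual fields live in nnet-cpu/nnet-stats.h, which this diff does not show. As a rough mental model only, a per-layer accumulator of sigmoid-derivative statistics might look like this hypothetical sketch:

#include <cmath>
#include <ostream>

// Illustrative only -- not the real kaldi::NnetStats layout.
struct SigmoidDerivStats {
  double count, deriv_sum, deriv_sumsq;
  SigmoidDerivStats() : count(0.0), deriv_sum(0.0), deriv_sumsq(0.0) {}

  void Accumulate(double deriv) {  // deriv = s(x) * (1 - s(x)) for one frame
    count += 1.0;
    deriv_sum += deriv;
    deriv_sumsq += deriv * deriv;
  }
  void PrintStats(std::ostream &os) const {
    if (count == 0.0) { os << "no data\n"; return; }
    double mean = deriv_sum / count;
    double var = deriv_sumsq / count - mean * mean;
    os << "mean-deriv=" << mean << " stddev=" << std::sqrt(var) << '\n';
  }
};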
@@ -85,7 +85,7 @@ int main(int argc, char *argv[]) {
     KALDI_LOG << "Selected a subset of " << egs.size() << " out of " << num_read
               << " neural-network training examples ";

-    return (static_cast<size_t>(n) == egs.size() ? 0 : 1);
+    return (num_read != 0 ? 0 : 1);
   } catch(const std::exception &e) {
     std::cerr << e.what() << '\n';
     return -1;
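The return-statement change above fixes the exit-status semantics: the tool now succeeds whenever it read at least one example, instead of only when the selected subset size equalled n (apparently the requested count), which made it fail spuriously when the archive held fewer examples than requested.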
@@ -88,8 +88,8 @@ void CacheTgtMat::AddData(const CuMatrix<BaseFloat> &features, const CuMatrix<Ba
       features_.CopyRowsFromMat(leftover, features_leftover_, 0, 0);
       targets_.CopyRowsFromMat(leftover, targets_leftover_, 0, 0);

-      features_leftover_.Destroy();
-      targets_leftover_.Destroy();
+      features_leftover_.Resize(0, 0);
+      targets_leftover_.Resize(0, 0);
       filling_pos_ += leftover;
     }
   }
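This hunk and the next replace Destroy() with Resize(0, 0): resizing to zero rows and columns releases the storage, so a separate destroy method becomes redundant. A minimal sketch of the idiom, assuming Resize frees the old buffer the way Kaldi's matrix classes do (SimpleMatrix is illustrative, not Kaldi code):

#include <cstdlib>

template <typename Real>
class SimpleMatrix {
 public:
  SimpleMatrix() : data_(NULL), rows_(0), cols_(0) {}
  ~SimpleMatrix() { Resize(0, 0); }  // the destructor reuses the same path
  // Resize frees whatever was held; Resize(0, 0) therefore acts as Destroy().
  void Resize(int rows, int cols) {
    std::free(data_);
    data_ = (rows > 0 && cols > 0)
        ? static_cast<Real*>(std::calloc(rows * cols, sizeof(Real)))
        : NULL;
    rows_ = rows;
    cols_ = cols;
  }
 private:
  SimpleMatrix(const SimpleMatrix&);             // non-copyable in this sketch
  SimpleMatrix &operator=(const SimpleMatrix&);
  Real *data_;
  int rows_, cols_;
};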
@@ -91,7 +91,7 @@ void Cache::AddData(const CuMatrix<BaseFloat> &features, const std::vector<int32
                 targets_leftover_.begin()+leftover,
                 targets_.begin());

-      features_leftover_.Destroy();
+      features_leftover_.Resize(0, 0);
       targets_leftover_.resize(0);
       filling_pos_ += leftover;
     }
@@ -32,7 +32,7 @@ void Xent::Eval(const CuMatrix<BaseFloat> &net_out, const CuMatrix<BaseFloat> &t
   diff->Resize(net_out.NumRows(), net_out.NumCols());

   // compute derivative wrt. activations of last layer of neurons
-  diff->CopyFromMat(net_out);
+  *diff = net_out;
   diff->AddMat(-1.0, target);

   // we'll not produce per-frame classification accuracy for soft labels
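Several hunks in this commit collapse a Resize followed by CopyFromMat into plain assignment, so operator= evidently sizes the destination to match the source before copying. A small sketch of those semantics with a hypothetical Mat class (not the real CuMatrix):

#include <vector>

struct Mat {
  int rows, cols;
  std::vector<float> data;

  Mat() : rows(0), cols(0) {}
  void Resize(int r, int c) { rows = r; cols = c; data.assign(r * c, 0.0f); }
  // Assignment does what the old call sites spelled out by hand:
  // size the destination, then copy the payload.
  Mat &operator=(const Mat &other) {
    if (this != &other) {
      Resize(other.rows, other.cols);
      data = other.data;
    }
    return *this;
  }
  Mat(const Mat &other) : rows(0), cols(0) { *this = other; }
};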
@@ -40,7 +40,8 @@ void Xent::Eval(const CuMatrix<BaseFloat> &net_out, const CuMatrix<BaseFloat> &t

   // :TODO: reimplement when needed
   // compute xentropy (ON CPU)
-  Matrix<BaseFloat> target_host, net_out_host;
+  Matrix<BaseFloat> target_host(target.NumRows(), target.NumCols(), kUndefined),
+      net_out_host(net_out.NumRows(), net_out.NumCols(), kUndefined);
   target.CopyToMat(&target_host);
   net_out.CopyToMat(&net_out_host);
   BaseFloat val;
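The complementary change shows up here: CopyToMat apparently no longer resizes its destination, so the host matrices are constructed at the right dimensions up front, and kUndefined skips the zero-fill because every element is overwritten by the copy that follows. A sketch of why that flag is the cheap choice (illustrative types, not the real Matrix interface):

// kSetZero/kUndefined mirror the Kaldi resize flags in spirit only.
enum ResizeType { kSetZero, kUndefined };

struct HostMat {
  float *data;
  int rows, cols;
  HostMat(int r, int c, ResizeType t) : rows(r), cols(c) {
    data = new float[r * c];
    if (t == kSetZero)                 // skipped for kUndefined: the caller
      for (int i = 0; i < r * c; i++)  // is about to overwrite every element
        data[i] = 0.0f;                // anyway, so zeroing is wasted work
  }
  ~HostMat() { delete[] data; }
 private:
  HostMat(const HostMat&);
  HostMat &operator=(const HostMat&);
};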
@@ -69,7 +70,7 @@ void Xent::EvalVec(const CuMatrix<BaseFloat> &net_out, const std::vector<int32>
   // get the xentropy and global error
   target_device_.CopyFromVec(target);
   if(&net_out != diff) { //<allow no-copy speedup
-    diff->CopyFromMat(net_out);
+    *diff = net_out;
   }
   cu::DiffXent(target_device_, diff, &log_post_tgt_);
   //
@@ -84,6 +85,7 @@ void Xent::EvalVec(const CuMatrix<BaseFloat> &net_out, const std::vector<int32>
   // log(sum_row(net_out.*target_mat)))
   // they now are stored in vector log_post_tgt_
   //
+  log_post_tgt_host_.Resize(log_post_tgt_.Dim());
   log_post_tgt_.CopyToVec(&log_post_tgt_host_);
   loss_ -= log_post_tgt_host_.Sum();
@@ -110,9 +112,10 @@ std::string Xent::Report() {
 void Mse::Eval(const CuMatrix<BaseFloat> &net_out, const CuMatrix<BaseFloat> &target, CuMatrix<BaseFloat> *diff) {
   KALDI_ASSERT(net_out.NumCols() == target.NumCols());
   KALDI_ASSERT(net_out.NumRows() == target.NumRows());
-  diff->Resize(net_out.NumRows(), net_out.NumCols());

   // compute derivative w.r.t. neural network outputs
+  diff->Resize(net_out.NumRows(), net_out.NumCols());
   diff->CopyFromMat(net_out);
   diff->AddMat(-1.0, target);
@@ -147,9 +150,9 @@ std::string Mse::Report() {
 void MseProgress::Eval(const CuMatrix<BaseFloat>& net_out, const CuMatrix<BaseFloat>& target, CuMatrix<BaseFloat>* diff) {
   KALDI_ASSERT(net_out.NumCols() == target.NumCols());
   KALDI_ASSERT(net_out.NumRows() == target.NumRows());
-  diff->Resize(net_out.NumRows(),net_out.NumCols());

   //compute derivative w.r.t. neural network outputs
+  diff->Resize(net_out.NumRows(),net_out.NumCols());
   diff->CopyFromMat(net_out);
   diff->AddMat(-1.0,target);
@@ -94,8 +94,7 @@ int main(int argc, char *argv[]) {
       //the pointer will be given to the nnet, so we don't need to call delete

       //convert Vector to CuVector
-      CuVector<BaseFloat> cu_shift;
-      cu_shift.CopyFromVec(shift);
+      CuVector<BaseFloat> cu_shift(shift);

       //set the weights
       shift_component->SetShiftVec(cu_shift);
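The same simplification for vectors: the declare-then-CopyFromVec pair becomes a single converting construction. Roughly the constructor shape being relied on, with stand-in types rather than the real Vector/CuVector:

#include <vector>

struct HostVec { std::vector<float> data; };

// Hypothetical device vector: the converting constructor does the sizing
// and the upload in one step, like CuVector<BaseFloat> cu_shift(shift).
struct DeviceVec {
  std::vector<float> data;  // stands in for device memory
  explicit DeviceVec(const HostVec &v) : data(v.data) {}
};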
@@ -110,8 +109,7 @@ int main(int argc, char *argv[]) {
       //the pointer will be given to the nnet, so we don't need to call delete

       //convert Vector to CuVector
-      CuVector<BaseFloat> cu_scale;
-      cu_scale.CopyFromVec(scale);
+      CuVector<BaseFloat> cu_scale(scale);

       //set the weights
       scale_component->SetScaleVec(cu_scale);
@@ -129,6 +129,7 @@ int main(int argc, char *argv[]) {
     }

     // push priors to GPU
+    priors.Resize(tmp_priors.Dim());
     priors.CopyFromVec(tmp_priors);
   }
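The added Resize is the same interface tightening seen in the other hunks: CopyFromVec evidently no longer sizes its destination, so the caller must resize the device vector to the source dimension before copying.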
@@ -150,7 +151,7 @@ int main(int argc, char *argv[]) {
         }
       }
       // push it to gpu
-      feats.CopyFromMat(mat);
+      feats = mat;
       // fwd-pass
       nnet_transf.Feedforward(feats, &feats_transf);
       nnet.Feedforward(feats_transf, &nnet_out);
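The change from CopyFromMat(mat) to plain assignment moves the host-to-device upload into operator=; the `// push it to gpu` comment still holds because the copy to the device still happens, just behind the assignment syntax, which presumably also handles the sizing that CopyFromMat used to do.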
@@ -170,6 +171,7 @@ int main(int argc, char *argv[]) {
       }

       //download from GPU
+      nnet_out_host.Resize(nnet_out.NumRows(), nnet_out.NumCols());
       nnet_out.CopyToMat(&nnet_out_host);
       //check for NaN/inf
       for(int32 r=0; r<nnet_out_host.NumRows(); r++) {
@@ -223,12 +223,13 @@ int main(int argc, char *argv[]) {

       //3) propagate the feature to get the log-posteriors (nnet w/o softmax)
       // push features to GPU
-      feats.CopyFromMat(mat);
+      feats = mat;
       // possibly apply transform
       nnet_transf.Feedforward(feats, &feats_transf);
       // propagate through the nnet (assuming w/o softmax)
       nnet.Propagate(feats_transf, &nnet_out);
-      // pop it back to the HOST
+      // transfer it back to the host
+      nnet_out_h.Resize(nnet_out.NumRows(), nnet_out.NumCols(), kUndefined);
       nnet_out.CopyToMat(&nnet_out_h);
       // TODO: possibly divide by priors
@@ -277,7 +278,7 @@ int main(int argc, char *argv[]) {

       //7) backpropagate through the nnet
       if (!crossvalidate) {
-        nnet_diff.CopyFromMat(nnet_diff_h);
+        nnet_diff = nnet_diff_h;
         nnet.Backpropagate(nnet_diff, NULL);
       }
@@ -139,8 +139,8 @@ int main(int argc, char *argv[]) {
         continue;
       }
       // push features/targets to GPU
-      feats.CopyFromMat(fea_mat);
-      targets.CopyFromMat(tgt_mat);
+      feats = fea_mat;
+      targets = tgt_mat;
       // possibly apply feature transform
       nnet_transf.Feedforward(feats, &feats_transf);
       // add to cache
@@ -142,7 +142,7 @@ int main(int argc, char *argv[]) {
        continue;
      }
      // push features to GPU
-      feats.CopyFromMat(mat);
+      feats = mat;
      // possibly apply transform
      nnet_transf.Feedforward(feats, &feats_transf);
      // add to cache
@@ -138,6 +138,7 @@ int main(int argc, char *argv[]) {
         num_other_error++;
       } else { //dimension OK
         // push features to GPU
+        feats.Resize(mat.NumRows(), mat.NumCols(), kUndefined);
         feats.CopyFromMat(mat);
         // possibly apply transform
         nnet_transf.Feedforward(feats, &feats_transf);
@@ -132,7 +132,8 @@ int main(int argc, char *argv[]) {
       rbm_transf.Feedforward(feats, &feats_transf);
       // subsample the feats to get faster epochs
       if(drop_data > 0.0) {
-        Matrix<BaseFloat> mat2;
+        Matrix<BaseFloat> mat2(feats_transf.NumRows(), feats_transf.NumCols(),
+                               kUndefined);
         feats_transf.CopyToMat(&mat2);
         for(int32 r=mat2.NumRows()-1; r >= 0; r--) {
           if(RandUniform() < drop_data) {
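Besides pre-sizing mat2 before the download, note the surrounding frame-dropping loop: it walks the rows from the end backwards, presumably so that removing row r never disturbs the indices of rows not yet examined. A self-contained illustration of that trick (DropRows is hypothetical, not the Kaldi API):

#include <cstdlib>
#include <vector>

// Drop each row independently with probability drop_prob.  Iterating from
// the last row down means an erase never shifts the rows still to visit.
static void DropRows(std::vector<std::vector<float> > *rows, float drop_prob) {
  for (int r = static_cast<int>(rows->size()) - 1; r >= 0; r--) {
    float u = static_cast<float>(std::rand()) / RAND_MAX;  // stand-in for RandUniform()
    if (u < drop_prob)
      rows->erase(rows->begin() + r);
  }
}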
@@ -62,8 +62,7 @@ int main(int argc, char *argv[]) {
       //the pointer will be given to the nnet, so we don't need to call delete

       //convert Matrix to CuMatrix
-      CuMatrix<BaseFloat> cu_transform;
-      cu_transform.CopyFromMat(transform);
+      CuMatrix<BaseFloat> cu_transform(transform);

       //set the weights
       layer->SetLinearity(cu_transform);