Mirror of https://github.com/mozilla/kaldi.git
trunk: modifying recipes for neural net training with the new online preconditioning; adding/modifying example scripts for RM and Fisher; various cosmetic code changes.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4088 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Parent: f67874f274
Commit: 99873c6171

@@ -1,11 +1,13 @@
#!/bin/bash

# this (local/nnet2/run_6c_gpu.sh) trains a p-norm neural network on top of
# the SAT system in 5a.
# It uses the _fast.sh version of the script, which is faster than the old
# one, and also the --first-component-power 0.5 option, which we believe
# improves results (we're waiting for the numbers though).

dir=nnet6c_gpu
dir=nnet6c5_gpu
train_stage=-10

. ./cmd.sh
@@ -28,11 +30,13 @@ parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely

if [ ! -f exp/$dir/final.mdl ]; then

steps/nnet2/train_pnorm.sh --stage $train_stage --num-epochs 10 --get-egs-stage 3 --stage -3 \
steps/nnet2/train_pnorm_fast.sh --stage $train_stage --num-epochs 8 \
--first-component-power 0.5 \
--egs-dir exp/nnet6c3_gpu/egs \
--num-epochs-extra 4 \
--samples-per-iter 400000 \
--io-opts "-tc 10" \
--num-epochs-extra 5 \
--num-jobs-nnet 8 --num-threads 1 --max-change 40.0 \
--num-jobs-nnet 8 --num-threads 1 \
--minibatch-size 512 --parallel-opts "$parallel_opts" \
--mix-up 15000 \
--initial-learning-rate 0.08 --final-learning-rate 0.008 \
@@ -43,7 +47,7 @@ parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely
data/train data/lang exp/tri5a exp/$dir || exit 1;
fi

steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 30 \
steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 25 \
--config conf/decode.config --transform-dir exp/tri5a/decode_dev \
exp/tri5a/graph data/dev exp/$dir/decode_dev &

@@ -105,14 +105,17 @@ exit 0
%WER 1.72 [ 216 / 12533, 25 ins, 38 del, 153 sub ] exp/nnet4b_gpu/decode/wer_4
%WER 8.34 [ 1045 / 12533, 94 ins, 146 del, 805 sub ] exp/nnet4b_gpu/decode_ug/wer_10

%WER 1.75 [ 219 / 12533, 23 ins, 54 del, 142 sub ] exp/nnet4c/decode/wer_4
%WER 8.83 [ 1107 / 12533, 70 ins, 206 del, 831 sub ] exp/nnet4c/decode_ug/wer_10
%WER 1.72 [ 216 / 12533, 24 ins, 51 del, 141 sub ] exp/nnet4c/decode/wer_4
%WER 9.04 [ 1133 / 12533, 110 ins, 170 del, 853 sub ] exp/nnet4c/decode_ug/wer_7

%WER 1.76 [ 220 / 12533, 16 ins, 60 del, 144 sub ] exp/nnet4c_gpu/decode/wer_6
%WER 8.82 [ 1106 / 12533, 90 ins, 173 del, 843 sub ] exp/nnet4c_gpu/decode_ug/wer_11
%WER 1.83 [ 229 / 12533, 28 ins, 57 del, 144 sub ] exp/nnet4c_gpu/decode/wer_4
%WER 9.08 [ 1138 / 12533, 111 ins, 166 del, 861 sub ] exp/nnet4c_gpu/decode_ug/wer_7

%WER 1.61 [ 202 / 12533, 29 ins, 34 del, 139 sub ] exp/nnet4d_gpu/decode/wer_2
%WER 8.48 [ 1063 / 12533, 76 ins, 176 del, 811 sub ] exp/nnet4d_gpu/decode_ug/wer_10
%WER 1.60 [ 201 / 12533, 28 ins, 43 del, 130 sub ] exp/nnet4d/decode/wer_4
%WER 8.24 [ 1033 / 12533, 83 ins, 166 del, 784 sub ] exp/nnet4d/decode_ug/wer_11

%WER 1.69 [ 212 / 12533, 23 ins, 54 del, 135 sub ] exp/nnet4d_gpu/decode/wer_5
%WER 8.39 [ 1052 / 12533, 86 ins, 174 del, 792 sub ] exp/nnet4d_gpu/decode_ug/wer_11

%WER 1.37 [ 172 / 12533, 14 ins, 36 del, 122 sub ] exp/nnet4e_gpu/decode/wer_3
%WER 8.03 [ 1006 / 12533, 61 ins, 179 del, 766 sub ] exp/nnet4e_gpu/decode_ug/wer_8

@@ -1,27 +1,56 @@
#!/bin/bash

# This is neural net training on top of adapted 40-dimensional features.
#
# The same script works for GPUs, and for CPU only (with --use-gpu false).

train_stage=-10
use_gpu=true

. cmd.sh
. ./path.sh
. utils/parse_options.sh
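# (This script is invoked from ../run_nnet2.sh as e.g.
#  local/nnet2/run_4c.sh --use-gpu true   or   --use-gpu false.)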

( steps/nnet2/train_tanh.sh --num-epochs 20 \
--num-epochs-extra 10 --add-layers-period 1 \

if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
dir=exp/nnet4c_gpu
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
parallel_opts="-pe smp $num_threads"
dir=exp/nnet4c
fi

if [ ! -f $dir/final.mdl ]; then
steps/nnet2/train_tanh_fast.sh --stage $train_stage \
--num-epochs 20 \
--add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
--cmd "$decode_cmd" \
--hidden-layer-dim 375 \
data/train data/lang exp/tri3b_ali exp/nnet4c
data/train data/lang exp/tri3b_ali $dir || exit 1;
fi

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test exp/nnet4c/decode

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test exp/nnet4c/decode_ug

)
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test $dir/decode &

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test $dir/decode_ug

wait

@@ -1,42 +0,0 @@
#!/bin/bash

# This is neural net training on top of adapted 40-dimensional features.
# This version of the script uses GPUs. We distinguish it by putting "_gpu"
# at the end of the directory name.
#
# Since we're using one quarter the number of jobs (num-jobs-nnet) as the
# run_4c.sh script, we halve the learning rate (generally speaking, splitting
# the difference like this is probably a good idea.)

parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely have to change it.

. ./cmd.sh
. ./path.sh
! cuda-compiled && cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF

( steps/nnet2/train_tanh.sh --num-epochs 20 \
--num-jobs-nnet 4 --num-threads 1 --parallel-opts "$parallel_opts" \
--num-epochs-extra 10 --add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.01 --final-learning-rate 0.002 \
--cmd "$decode_cmd" \
--hidden-layer-dim 375 \
data/train data/lang exp/tri3b_ali exp/nnet4c_gpu

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test exp/nnet4c_gpu/decode

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test exp/nnet4c_gpu/decode_ug

)

@@ -1,21 +1,42 @@
#!/bin/bash

# This is pnorm neural net training on top of adapted 40-dimensional features.
# This version of the script uses GPUs. We distinguish it by putting "_gpu"
# at the end of the directory name.

# local/nnet2/run_4d.sh is the new, faster version of the p-norm training script.
# The same script works for GPUs, and for CPU only (with --use-gpu false).

# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
parallel_opts="-pe smp $num_threads"

train_stage=-10
use_gpu=true

. cmd.sh
. ./path.sh
. utils/parse_options.sh

dir=exp/nnet4d
( steps/nnet2/train_pnorm.sh --num-epochs 20 \
--num-jobs-nnet 4 --num-threads $num_threads --parallel-opts "$parallel_opts" \

if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
dir=exp/nnet4d_gpu
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
parallel_opts="-pe smp $num_threads"
dir=exp/nnet4d
fi

if [ ! -f $dir/final.mdl ]; then
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--num-threads $num_threads --parallel-opts "$parallel_opts" \
--num-jobs-nnet 4 \
--num-epochs-extra 10 --add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
@@ -23,15 +44,16 @@ dir=exp/nnet4d
--cmd "$decode_cmd" \
--pnorm-input-dim 1000 \
--pnorm-output-dim 200 \
--combine-regularizer 1.0e-12 \
data/train data/lang exp/tri3b_ali $dir
data/train data/lang exp/tri3b_ali $dir || exit 1;
fi

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test $dir/decode
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test $dir/decode &

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test $dir/decode_ug
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test $dir/decode_ug

wait

)

@@ -1,41 +0,0 @@
#!/bin/bash

# This is pnorm neural net training on top of adapted 40-dimensional features.
# This version of the script uses GPUs. We distinguish it by putting "_gpu"
# at the end of the directory name.

parallel_opts="-l gpu=1"

. ./cmd.sh
. ./path.sh
! cuda-compiled && cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF

dir=exp/nnet4d_gpu
( steps/nnet2/train_pnorm.sh --num-epochs 20 \
--num-jobs-nnet 4 --num-threads 1 --parallel-opts "$parallel_opts" \
--num-epochs-extra 10 --add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
--cmd "$decode_cmd" \
--pnorm-input-dim 750 \
--pnorm-output-dim 150 \
--combine-regularizer 1.0e-12 \
data/train data/lang exp/tri3b_ali $dir

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test $dir/decode

steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test $dir/decode_ug

)

@@ -7,7 +7,7 @@
use_gpu=true

if $use_gpu; then
# This example runs on top of "raw-fMLLR" features:
# This example runs on top of "raw-fMLLR" features.
# We don't have a GPU version of this script.
#local/nnet2/run_4a_gpu.sh

@@ -15,11 +15,11 @@ if $use_gpu; then
local/nnet2/run_4b_gpu.sh

# This one is on top of 40-dim + fMLLR features
local/nnet2/run_4c_gpu.sh
local/nnet2/run_4c.sh --use-gpu true

# This one is for training pnorm nnets on top of 40-dim + fMLLR features
# **THIS IS THE PRIMARY RECIPE**
local/nnet2/run_4d_gpu.sh
local/nnet2/run_4d.sh --use-gpu true

# This is discriminative training on top of 4c.
local/nnet2/run_5c_gpu.sh
@@ -34,11 +34,12 @@ else
# This one is on top of filter-bank features, with only CMN.
local/nnet2/run_4b.sh

# This one is on top of 40-dim + fMLLR features
local/nnet2/run_4c.sh
# This one is on top of 40-dim + fMLLR features, it's a fairly
# normal tanh system.
local/nnet2/run_4c.sh --use-gpu false

# **THIS IS THE PRIMARY RECIPE**
local/nnet2/run_4d.sh
# **THIS IS THE PRIMARY RECIPE (40-dim + fMLLR + p-norm neural net)**
local/nnet2/run_4d.sh --use-gpu false

# This is discriminative training on top of 4c.
local/nnet2/run_5c.sh

@@ -1,15 +1,19 @@
#!/bin/bash

# Copyright 2012 Johns Hopkins University (Author: Daniel Povey).
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey).
# 2013 Xiaohui Zhang
# 2013 Guoguo Chen
# Apache 2.0.

# This script trains neural network with pnorm nonlinearities.
# The difference with train_tanh.sh is that, instead of setting
# hidden_layer_size, you should set pnorm_input_dim and pnorm_output_dim.
# Also the P value (the order of the p-norm) should be set.
# train_pnorm_fast.sh is a new, improved version of train_pnorm.sh, which uses
# the 'online' preconditioning method. For GPUs it's about two times faster
# than before (although that's partly due to optimizations that will also help
# the old recipe), and for CPUs it gives better performance than the old method
# (I believe); also, the difference in optimization performance between CPU and
# GPU is almost gone. The old train_pnorm.sh script is now deprecated.
# We made this a separate script because not all of the options that the
# old script accepted, are still accepted.

# Begin configuration section.
cmd=run.pl
@@ -22,18 +26,15 @@ num_iters_final=20 # Maximum number of final iterations to give to the
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.5
softmax_learning_rate_factor=1.0 # In the default setting keep the same learning rate.

softmax_learning_rate_factor=1.0 # In the default setting keep the same learning
# rate for the final layer.
pnorm_input_dim=3000
pnorm_output_dim=300
first_component_power=1.0 # could set this to 0.5, sometimes seems to improve results.
p=2
minibatch_size=128 # by default use a smallish minibatch size for neural net
# training; this controls instability which would otherwise
# be a problem with multi-threaded update. Note: it also
# interacts with the "preconditioned" update which generally
# works better with larger minibatch size, so it's not
# completely cost free.
# be a problem with multi-threaded update.

samples_per_iter=200000 # each iteration of training, see this many samples
# per job. This option is passed to get_egs.sh
@@ -59,14 +60,18 @@ io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one t
splice_width=4 # meaning +- 4 frames on each side for second LDA
randprune=4.0 # speeds up LDA.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to preconditioning: says how often we update the subspace.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.1
max_change_per_sample=0.075
precondition_rank_in=20 # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning

mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
# specified.)
num_threads=16
parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" # by default we use 16 threads; this lets the queue know.
# note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage.
cleanup=true
egs_dir=
lda_opts=
@@ -75,8 +80,6 @@ egs_opts=
transform_dir= # If supplied, overrides alidir
prior_subset_size=10000 # 10k samples per job, for computing priors. Should be
# more than enough.
precondition_rank_in=20
precondition_rank_out=80
# End configuration section.

@@ -262,9 +265,10 @@ echo "$0: Will train for $num_epochs + $num_epochs_extra epochs, equalling "
echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo "$0: (while reducing learning rate) + (with constant learning rate)."

finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
# This is when we decide to mix up from: halfway between when we've finished
# adding the hidden layers and the end of training.
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
mix_up_iter=$[($num_iters + $finish_add_layers_iter)/2]

if [ $num_threads -eq 1 ]; then
@@ -296,6 +300,7 @@ while [ $x -lt $num_iters ]; do
fi

echo "Training neural net (pass $x)"

if [ $x -gt 0 ] && \
[ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
[ $[($x-1) % $add_layers_period] -eq 0 ]; then
@@ -303,18 +308,29 @@ while [ $x -lt $num_iters ]; do
else
mdl=$dir/$x.mdl
fi

$cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
if [ $x -eq 0 ] || [ "$mdl" != "$dir/$x.mdl" ]; then
# on iteration zero or when we just added a layer, use a smaller minibatch
# size and just one job: the model-averaging doesn't seem to be helpful
# when the model is changing too fast (i.e. it worsens the objective
# function), and the smaller minibatch size will help to keep
# the update stable.
this_minibatch_size=$[$minibatch_size/2];
this_num_jobs_nnet=1
else
this_minibatch_size=$minibatch_size
this_num_jobs_nnet=$num_jobs_nnet
fi

$cmd $parallel_opts JOB=1:$this_num_jobs_nnet $dir/log/train.$x.JOB.log \
nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
nnet-train$train_suffix \
--minibatch-size=$minibatch_size --srand=$x "$mdl" \
nnet-train$train_suffix \
--minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
ark:- $dir/$[$x+1].JOB.mdl \
|| exit 1;

nnets_list=
for n in `seq 1 $num_jobs_nnet`; do
for n in `seq 1 $this_num_jobs_nnet`; do
nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
done

@@ -331,7 +347,7 @@ while [ $x -lt $num_iters ]; do
else lr=$learning_rate; fi
lr_string="$lr_string:$lr"
done

$cmd $dir/log/average.$x.log \
nnet-am-average $nnets_list - \| \
nnet-am-copy --learning-rates=$lr_string - $dir/$[$x+1].mdl || exit 1;
@@ -372,10 +388,9 @@ if [ $stage -le $num_iters ]; then
# Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
# if there are many models it can give out-of-memory error; set num-threads to 8
# to speed it up (this isn't ideal...)
this_num_threads=$num_threads
[ $this_num_threads -lt 8 ] && this_num_threads=8
num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
mb=$[($num_egs+$this_num_threads-1)/$this_num_threads]
combine_num_threads=8
mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
[ $mb -gt 512 ] && mb=512
# Setting --initial-model to a large value makes it initialize the combination
# with the average of all the models. It's important not to start with a
@@ -384,15 +399,15 @@ if [ $stage -le $num_iters ]; then
# nnet-combine-fast uses for scaling, which after flooring and inversion, has
# the effect that the initial model chosen gets much higher learning rates
# than the others. This prevents the optimization from working well.
$cmd $parallel_opts $dir/log/combine.log \
$cmd $combine_parallel_opts $dir/log/combine.log \
nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
--num-threads=$this_num_threads \
--num-threads=$combine_num_threads \
--verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
$dir/final.mdl || exit 1;

# Normalize stddev for affine or block affine layers that are followed by a
# pnorm layer and then a normalize layer.
$cmd $parallel_opts $dir/log/normalize.log \
$cmd $dir/log/normalize.log \
nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1;

# Compute the probability of the final, combined model with

@@ -0,0 +1,451 @@
#!/bin/bash

# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.

# This script trains a fairly vanilla network with tanh nonlinearities.

# train_tanh_fast.sh is a new, improved version of train_tanh.sh, which uses
# the 'online' preconditioning method. For GPUs it's about two times faster
# than before (although that's partly due to optimizations that will also help
# the old recipe), and for CPUs it gives better performance than the old method
# (I believe); also, the difference in optimization performance between CPU and
# GPU is almost gone. The old train_tanh.sh script is now deprecated.
# We made this a separate script because not all of the options that the
# old script accepted, are still accepted.

# Begin configuration section.
cmd=run.pl
num_epochs=15 # Number of epochs during which we reduce
# the learning rate; number of iteration is worked out from this.
num_epochs_extra=5 # Number of epochs after we stop reducing
# the learning rate.
num_iters_final=20 # Maximum number of final iterations to give to the
# optimization over the validation set.
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.5
shrink_interval=5 # shrink every $shrink_interval iters except while we are
# still adding layers, when we do it every iter.
shrink=true
num_frames_shrink=2000 # note: must be <= --num-frames-diagnostic option to get_egs.sh, if
# given.
final_learning_rate_factor=0.5 # Train the two last layers of parameters half as
# fast as the other layers, by default.

hidden_layer_dim=300 # You may want this larger, e.g. 1024 or 2048.

minibatch_size=128 # by default use a smallish minibatch size for neural net
# training; this controls instability which would otherwise
# be a problem with multi-threaded update.

samples_per_iter=200000 # each iteration of training, see this many samples
# per job. This option is passed to get_egs.sh.
num_jobs_nnet=16 # Number of neural net jobs to run in parallel. This option
# is passed to get_egs.sh.
get_egs_stage=0
spk_vecs_dir=

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of
# the samples on each iter. You could set it to 0 or to a large
# value for complete randomization, but this would both consume
# memory and cause spikes in disk I/O. Smaller is easier on
# disk and memory but less random. It's not a huge deal though,
# as samples are anyway randomized right at the start.

add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3 # This is an important configuration value that you might
# want to tune.
stage=-5

io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. These don't
splice_width=4 # meaning +- 4 frames on each side for second LDA
randprune=4.0 # speeds up LDA.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.075
# we make the [input, output] ranks less different for the tanh setup than for
# the pnorm setup, as we don't have the difference in dimensions to deal with.
precondition_rank_in=30 # relates to online preconditioning
precondition_rank_out=60 # relates to online preconditioning
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
# specified.)
num_threads=16
parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" # by default we use 16 threads; this lets the queue know.
# note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage.
cleanup=true
egs_dir=
lda_opts=
egs_opts=
transform_dir=
prior_subset_size=10000 # 10k samples per job, for computing priors. Should be
# more than enough.
# End configuration section.

echo "$0 $@" # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config file containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --num-epochs <#epochs|15> # Number of epochs of main training"
echo " # while reducing learning rate (determines #iterations, together"
echo " # with --samples-per-iter and --num-jobs-nnet)"
echo " --num-epochs-extra <#epochs-extra|5> # Number of extra epochs of training"
echo " # after learning rate fully reduced"
echo " --initial-learning-rate <initial-learning-rate|0.02> # Learning rate at start of training, e.g. 0.02 for small"
echo " # data, 0.01 for large data"
echo " --final-learning-rate <final-learning-rate|0.004> # Learning rate at end of training, e.g. 0.004 for small"
echo " # data, 0.001 for large data"
echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
echo " --initial-num-hidden-layers <#hidden-layers|1> # Number of hidden layers to start with."
echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers"
echo " --mix-up <#pseudo-gaussians|0> # Can be used to have multiple targets in final output layer,"
echo " # per context-dependent state. Try a number several times #states."
echo " --num-jobs-nnet <num-jobs|8> # Number of parallel jobs to use for main neural net"
echo " # training (will affect results as well as speed; try 8, 16)"
echo " # Note: if you increase this, you may want to also increase"
echo " # the learning rate."
echo " --num-threads <num-threads|16> # Number of parallel threads per job (will affect results"
echo " # as well as speed; may interact with batch size; if you increase"
echo " # this, you may want to decrease the batch size."
echo " --parallel-opts <opts|\"-pe smp 16 -l ram_free=1G,mem_free=1G\"> # extra options to pass to e.g. queue.pl for processes that"
echo " # use multiple threads... note, you might have to reduce mem_free,ram_free"
echo " # versus your defaults, because it gets multiplied by the -pe smp argument."
echo " --io-opts <opts|\"-tc 10\"> # Options given to e.g. queue.pl for jobs that do a lot of I/O."
echo " --minibatch-size <minibatch-size|128> # Size of minibatch to process (note: product with --num-threads"
echo " # should not get too large, e.g. >2k)."
echo " --samples-per-iter <#samples|200000> # Number of samples of data to process per iteration, per"
echo " # process."
echo " --splice-width <width|4> # Number of frames on each side to append for feature input"
echo " # (note: we splice processed, typically 40-dimensional frames"
echo " --lda-dim <dim|250> # Dimension to reduce spliced features to with LDA"
echo " --num-iters-final <#iters|10> # Number of final iterations to give to nnet-combine-fast to "
echo " # interpolate parameters (the weights are learned with a validation set)"
echo " --num-utts-subset <#utts|300> # Number of utterances in subsets used for validation and diagnostics"
echo " # (the validation subset is held out from training)"
echo " --num-frames-diagnostic <#frames|4000> # Number of frames used in computing (train,valid) diagnostics"
echo " --num-valid-frames-combine <#frames|10000> # Number of frames used in getting combination weights at the"
echo " # very end."
echo " --stage <stage|-9> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."

exit 1;
fi

data=$1
lang=$2
alidir=$3
dir=$4

# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# Set some variables.
num_leaves=`am-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1;

nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj

mkdir -p $dir/log
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
cp $alidir/splice_opts $dir 2>/dev/null
cp $alidir/cmvn_opts $dir 2>/dev/null
cp $alidir/tree $dir

[ -z "$transform_dir" ] && transform_dir=$alidir

if [ $stage -le -4 ]; then
echo "$0: calling get_lda.sh"
steps/nnet2/get_lda.sh $lda_opts --transform-dir $transform_dir --splice-width $splice_width --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi

# these files will have been written by get_lda.sh
feat_dim=`cat $dir/feat_dim` || exit 1;
lda_dim=`cat $dir/lda_dim` || exit 1;

if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
echo "$0: calling get_egs.sh"
[ ! -z $spk_vecs_dir ] && spk_vecs_opt="--spk-vecs-dir $spk_vecs_dir";
steps/nnet2/get_egs.sh $spk_vecs_opt --transform-dir $transform_dir --samples-per-iter $samples_per_iter \
--num-jobs-nnet $num_jobs_nnet --splice-width $splice_width --stage $get_egs_stage \
--cmd "$cmd" $egs_opts --io-opts "$io_opts" \
$data $lang $alidir $dir || exit 1;
fi

if [ -z $egs_dir ]; then
egs_dir=$dir/egs
fi

iters_per_epoch=`cat $egs_dir/iters_per_epoch` || exit 1;
! [ $num_jobs_nnet -eq `cat $egs_dir/num_jobs_nnet` ] && \
echo "$0: Warning: using --num-jobs-nnet=`cat $egs_dir/num_jobs_nnet` from $egs_dir"
num_jobs_nnet=`cat $egs_dir/num_jobs_nnet` || exit 1;

if ! [ $num_hidden_layers -ge 1 ]; then
echo "Invalid num-hidden-layers $num_hidden_layers"
exit 1
fi

if [ $stage -le -2 ]; then
echo "$0: initializing neural net";

# Get spk-vec dim (in case we're using them).
if [ ! -z "$spk_vecs_dir" ]; then
spk_vec_dim=$[$(copy-vector --print-args=false "ark:cat $spk_vecs_dir/vecs.1|" ark,t:- | head -n 1 | wc -w) - 3];
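# (assumes the text-form vector prints as "<utt-id> [ v1 ... vn ]", so 'wc -w'
# also counts the key and the two brackets; hence the '- 3'.)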
! [ $spk_vec_dim -gt 0 ] && echo "Error getting spk-vec dim" && exit 1;
ext_lda_dim=$[$lda_dim + $spk_vec_dim]
extend-transform-dim --new-dimension=$ext_lda_dim $dir/lda.mat $dir/lda_ext.mat || exit 1;
lda_mat=$dir/lda_ext.mat
ext_feat_dim=$[$feat_dim + $spk_vec_dim]
else
spk_vec_dim=0
lda_mat=$dir/lda.mat
ext_lda_dim=$lda_dim
ext_feat_dim=$feat_dim
fi

online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"
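# (these options configure the online preconditioner of the
# AffineComponentPreconditionedOnline layers created in nnet.config and
# hidden.config below.)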

stddev=`perl -e "print 1.0/sqrt($hidden_layer_dim);"`
cat >$dir/nnet.config <<EOF
SpliceComponent input-dim=$ext_feat_dim left-context=$splice_width right-context=$splice_width const-component-dim=$spk_vec_dim
FixedAffineComponent matrix=$lda_mat
AffineComponentPreconditionedOnline input-dim=$ext_lda_dim output-dim=$hidden_layer_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
TanhComponent dim=$hidden_layer_dim
AffineComponentPreconditionedOnline input-dim=$hidden_layer_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF

# to hidden.config it will write the part of the config corresponding to a
# single hidden layer; we need this to add new layers.
cat >$dir/hidden.config <<EOF
AffineComponentPreconditionedOnline input-dim=$hidden_layer_dim output-dim=$hidden_layer_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
TanhComponent dim=$hidden_layer_dim
EOF
$cmd $dir/log/nnet_init.log \
nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
$dir/0.mdl || exit 1;
fi

if [ $stage -le -1 ]; then
echo "Training transition probabilities and setting priors"
$cmd $dir/log/train_trans.log \
nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
|| exit 1;
fi

num_iters_reduce=$[$num_epochs * $iters_per_epoch];
num_iters_extra=$[$num_epochs_extra * $iters_per_epoch];
num_iters=$[$num_iters_reduce+$num_iters_extra]

echo "$0: Will train for $num_epochs + $num_epochs_extra epochs, equalling "
echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo "$0: (while reducing learning rate) + (with constant learning rate)."

# This is when we decide to mix up from: halfway between when we've finished
# adding the hidden layers and the end of training.
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
mix_up_iter=$[($num_iters + $finish_add_layers_iter)/2]
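# (worked example under the defaults above: num_hidden_layers=3 and
# add_layers_period=2 give finish_add_layers_iter=6, so with a hypothetical
# num_iters=60 the mix-up would happen around iteration (60+6)/2 = 33.)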

if [ $num_threads -eq 1 ]; then
train_suffix="-simple" # this enables us to use GPU code if
# we have just one thread.
if ! cuda-compiled; then
echo "$0: WARNING: you are running with one thread but you have not compiled"
echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
fi
else
train_suffix="-parallel --num-threads=$num_threads"
fi

x=0
while [ $x -lt $num_iters ]; do
if [ $x -ge 0 ] && [ $stage -le $x ]; then
# Set off jobs doing some diagnostics, in the background.
$cmd $dir/log/compute_prob_valid.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
$cmd $dir/log/progress.$x.log \
nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
ark:$egs_dir/train_diagnostic.egs '&&' \
nnet-am-info $dir/$x.mdl &
fi

echo "Training neural net (pass $x)"
if [ $x -gt 0 ] && \
[ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
[ $[($x-1) % $add_layers_period] -eq 0 ]; then
mdl="nnet-init --srand=$x $dir/hidden.config - | nnet-insert $dir/$x.mdl - - |"
else
mdl=$dir/$x.mdl
fi

if [ $x -eq 0 ] || [ "$mdl" != "$dir/$x.mdl" ]; then
# on iteration zero or when we just added a layer, use a smaller minibatch
# size and just one job: the model-averaging doesn't seem to be helpful
# when the model is changing too fast (i.e. it worsens the objective
# function), and the smaller minibatch size will help to keep
# the update stable.
this_minibatch_size=$[$minibatch_size/2];
this_num_jobs_nnet=1
else
this_minibatch_size=$minibatch_size
this_num_jobs_nnet=$num_jobs_nnet
fi

$cmd $parallel_opts JOB=1:$this_num_jobs_nnet $dir/log/train.$x.JOB.log \
nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
nnet-train$train_suffix \
--minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
ark:- $dir/$[$x+1].JOB.mdl \
|| exit 1;

nnets_list=
for n in `seq 1 $this_num_jobs_nnet`; do
nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
done

learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters_reduce $initial_learning_rate $final_learning_rate`;
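# (the perl expression above decays the learning rate geometrically from
# $initial_learning_rate to $final_learning_rate over the first
# $num_iters_reduce iterations, then holds it at $final_learning_rate.)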
last_layer_learning_rate=`perl -e "print $learning_rate * $final_learning_rate_factor;"`;
nnet-am-info $dir/$[$x+1].1.mdl > $dir/foo 2>/dev/null || exit 1
nu=`cat $dir/foo | grep num-updatable-components | awk '{print $2}'`
na=`cat $dir/foo | grep -v Fixed | grep AffineComponent | wc -l`
# na is number of last updatable AffineComponent layer [one-based, counting only
# updatable components.]
# The last two layers will get this (usually lower) learning rate.
lr_string="$learning_rate"
for n in `seq 2 $nu`; do
if [ $n -eq $na ] || [ $n -eq $[$na-1] ]; then lr=$last_layer_learning_rate;
else lr=$learning_rate; fi
lr_string="$lr_string:$lr"
done

$cmd $dir/log/average.$x.log \
nnet-am-average $nnets_list - \| \
nnet-am-copy --learning-rates=$lr_string - $dir/$[$x+1].mdl || exit 1;

if $shrink && [ $[$x % $shrink_interval] -eq 0 ]; then
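# ('shrinking' here, as we understand it, rescales the parameters of each
# updatable component: nnet-combine-fast with a single input model optimizes
# per-component scales on the small subset selected below.)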
mb=$[($num_frames_shrink+$num_threads-1)/$num_threads]
$cmd $parallel_opts $dir/log/shrink.$x.log \
nnet-subset-egs --n=$num_frames_shrink --randomize-order=true --srand=$x \
ark:$egs_dir/train_diagnostic.egs ark:- \| \
nnet-combine-fast --use-gpu=no --num-threads=$num_threads --verbose=3 --minibatch-size=$mb \
$dir/$[$x+1].mdl ark:- $dir/$[$x+1].mdl || exit 1;
else
# On other iters, do nnet-am-fix which is much faster and has roughly
# the same effect.
nnet-am-fix $dir/$[$x+1].mdl $dir/$[$x+1].mdl 2>$dir/log/fix.$x.log
fi

if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
# mix up.
echo Mixing up from $num_leaves to $mix_up components
$cmd $dir/log/mix_up.$x.log \
nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
$dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
fi
rm $nnets_list
fi
x=$[$x+1]
done

# Now do combination.
# At the end, final.mdl will be a combination of the last e.g. 10 models.
nnets_list=()
if [ $num_iters_final -gt $num_iters_extra ]; then
echo "Setting num_iters_final=$num_iters_extra"
fi
start=$[$num_iters-$num_iters_final+1]
for x in `seq $start $num_iters`; do
idx=$[$x-$start]
if [ $x -gt $mix_up_iter ]; then
nnets_list[$idx]=$dir/$x.mdl # "nnet-am-copy --remove-dropout=true $dir/$x.mdl - |"
fi
done

if [ $stage -le $num_iters ]; then
echo "Doing final combination to produce final.mdl"
# Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as if
# there are many models it can give out-of-memory error on the GPU; set
# num-threads to 8 to speed it up (this isn't ideal...)
num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
combine_num_threads=8
mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
[ $mb -gt 512 ] && mb=512
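# (mb is a ceiling division that splits the combine examples evenly across the
# 8 combine threads, capped at 512 per minibatch.)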
$cmd $combine_parallel_opts $dir/log/combine.log \
nnet-combine-fast --use-gpu=no --num-threads=$combine_num_threads \
--verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
$dir/final.mdl || exit 1;

# Compute the probability of the final, combined model with
# the same subset we used for the previous compute_probs, as the
# different subsets will lead to different probs.
$cmd $dir/log/compute_prob_valid.final.log \
nnet-compute-prob $dir/final.mdl ark:$egs_dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.final.log \
nnet-compute-prob $dir/final.mdl ark:$egs_dir/train_diagnostic.egs &
fi

if [ $stage -le $[$num_iters+1] ]; then
echo "Getting average posterior for purposes of adjusting the priors."
# Note: this just uses CPUs, using a smallish subset of data.
rm $dir/post.*.vec 2>/dev/null
$cmd JOB=1:$num_jobs_nnet $dir/log/get_post.JOB.log \
nnet-subset-egs --n=$prior_subset_size ark:$egs_dir/egs.JOB.0.ark ark:- \| \
nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.JOB.vec || exit 1;

sleep 3; # make sure there is time for $dir/post.*.vec to appear.

$cmd $dir/log/vector_sum.log \
vector-sum $dir/post.*.vec $dir/post.vec || exit 1;

rm $dir/post.*.vec;

echo "Re-adjusting priors based on computed posteriors"
$cmd $dir/log/adjust_priors.log \
nnet-adjust-priors $dir/final.mdl $dir/post.vec $dir/final.mdl || exit 1;
fi

sleep 2

echo Done

if $cleanup; then
echo Cleaning up data
if [ $egs_dir == "$dir/egs" ]; then
echo Removing training examples
rm $dir/egs/egs*
fi
echo Removing most of the models
for x in `seq 0 $num_iters`; do
if [ $[$x%100] -ne 0 ] && [ $x -lt $[$num_iters-$num_iters_final+1] ]; then
# delete all but every 100th model; don't delete the ones which combine to form the final model.
rm $dir/$x.mdl
fi
done
fi

@@ -207,8 +207,23 @@ if (! $sync) { # We're not submitting with -sync y, so we
}
}
# We will need the sge_job_id, to check that job still exists
$sge_job_id=`grep "Your job" $queue_logfile | awk '{ print \$3 }' | sed 's|\\\..*||'`;
chomp($sge_job_id);
{ # Get the SGE job-id from the log file in q/
open(L, "<$queue_logfile") || die "Error opening log file $queue_logfile";
undef $sge_job_id;
while (<L>) {
if (m/Your job\S* (\d+)[. ].+ has been submitted/) {
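# (the qsub log typically contains a line such as
#  'Your job 123456 ("wrapper.sh") has been submitted', or for array jobs
#  'Your job-array 123456.1-10:1 ...'; the numbers here are illustrative —
#  we only pull out the numeric job-id.)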
if (defined $sge_job_id) {
die "Error: your job was submitted more than once (see $queue_logfile)";
} else {
$sge_job_id = $1;
}
}
}
close(L);
if (!defined $sge_job_id) {
die "Error: log file $queue_logfile does not specify the SGE job-id.";
}
}
$check_sge_job_ctr=1;
#
$wait = 0.1;

@@ -58,6 +58,9 @@ namespace kaldi {
back to where you started from. We don't do this because
in some contexts, the transform is made symmetric by multiplying
by sqrt(N) in both passes. The user can do this by themselves.

See also SplitRadixComplexFft, declared in srfft.h, which is more efficient
but only works if the length of the input is a power of 2.
*/
template<typename Real> void ComplexFft (VectorBase<Real> *v, bool forward, Vector<Real> *tmp_work = NULL);

@@ -43,7 +43,7 @@ SplitRadixComplexFft<Real>::SplitRadixComplexFft(MatrixIndexT N) {
logm_ ++;
}
ComputeTables();
temp_buffer = NULL;
temp_buffer_ = NULL;
}

template<typename Real>
@@ -55,21 +55,21 @@ void SplitRadixComplexFft<Real>::ComputeTables() {

lg2 = logm_ >> 1;
if (logm_ & 1) lg2++;
brseed = new MatrixIndexT[1 << lg2];
brseed[0] = 0;
brseed[1] = 1;
brseed_ = new MatrixIndexT[1 << lg2];
brseed_[0] = 0;
brseed_[1] = 1;
for (j = 2; j <= lg2; j++) {
imax = 1 << (j - 1);
for (i = 0; i < imax; i++) {
brseed[i] <<= 1;
brseed[i + imax] = brseed[i] + 1;
brseed_[i] <<= 1;
brseed_[i + imax] = brseed_[i] + 1;
}
}

if (logm_ < 4) {
tab = NULL;
tab_ = NULL;
} else {
tab = new Real* [logm_-3];
tab_ = new Real* [logm_-3];
for (i = logm_; i>=4 ; i--) {
/* Compute a few constants */
m = 1 << i; m2 = m / 2; m4 = m2 / 2; m8 = m4 /2;
@@ -77,10 +77,10 @@ void SplitRadixComplexFft<Real>::ComputeTables() {
/* Allocate memory for tables */
nel = m4 - 2;

tab[i-4] = new Real[6*nel];
tab_[i-4] = new Real[6*nel];

/* Initialize pointers */
cn = tab[i-4]; spcn = cn + nel; smcn = spcn + nel;
cn = tab_[i-4]; spcn = cn + nel; smcn = spcn + nel;
c3n = smcn + nel; spc3n = c3n + nel; smc3n = spc3n + nel;

/* Compute tables */
@@ -99,14 +99,14 @@ void SplitRadixComplexFft<Real>::ComputeTables() {

template<typename Real>
SplitRadixComplexFft<Real>::~SplitRadixComplexFft() {
delete [] brseed;
if (tab != NULL) {
delete [] brseed_;
if (tab_ != NULL) {
for (MatrixIndexT i = 0; i < logm_-3; i++)
delete [] tab[i];
delete [] tab;
delete [] tab_[i];
delete [] tab_;
}
if (temp_buffer != NULL)
delete [] temp_buffer;
// "delete" only does something if it's a non-NULL pointer.
delete [] temp_buffer_;
}

template<typename Real>
@@ -125,29 +125,29 @@ void SplitRadixComplexFft<Real>::Compute(Real *xr, Real *xi, bool forward) const

template<typename Real>
void SplitRadixComplexFft<Real>::Compute(Real *x, bool forward) {
if (temp_buffer == NULL)
temp_buffer = new Real[N_];
if (temp_buffer_== NULL)
temp_buffer_ = new Real[N_];
for (MatrixIndexT i = 0; i < N_; i++) {
x[i] = x[i*2]; // put the real part in the first half of x.
temp_buffer[i] = x[i*2 + 1]; // put the imaginary part in temp_buffer.
temp_buffer_[i] = x[i*2 + 1]; // put the imaginary part in temp_buffer.
}
// copy the imaginary part back to the second half of x.
memcpy(static_cast<void*>(x+N_),
static_cast<void*>(temp_buffer),
memcpy(static_cast<void*>(x + N_),
static_cast<void*>(temp_buffer_),
sizeof(Real) * N_);

Compute(x, x+N_, forward);
Compute(x, x + N_, forward);
// Now change the format back to interleaved.
memcpy(static_cast<void*>(temp_buffer),
static_cast<void*>(x+N_),
memcpy(static_cast<void*>(temp_buffer_),
static_cast<void*>(x + N_),
sizeof(Real) * N_);
for (MatrixIndexT i = N_-1; i > 0; i--) { // don't include 0,
// in case MatrixIndexT is unsigned, the loop would not terminate.
// Treat it as a special case.
x[i*2] = x[i];
x[i*2 + 1] = temp_buffer[i];
x[i*2 + 1] = temp_buffer_[i];
}
x[1] = temp_buffer[0]; // special case of i = 0.
x[1] = temp_buffer_[0]; // special case of i = 0.
}

template<typename Real>
@@ -162,11 +162,11 @@ void SplitRadixComplexFft<Real>::BitReversePermute(Real *x, MatrixIndexT logm) c

/* Unshuffling loop */
for (off = 1; off < n; off++) {
fj = n * brseed[off]; i = off; j = fj;
fj = n * brseed_[off]; i = off; j = fj;
tmp = x[i]; x[i] = x[j]; x[j] = tmp;
xp = &x[i];
brp = &(brseed[1]);
for (gno = 1; gno < brseed[off]; gno++) {
brp = &(brseed_[1]);
for (gno = 1; gno < brseed_[off]; gno++) {
xp += n;
j = fj + *brp++;
xq = x + j;
@@ -281,7 +281,7 @@ void SplitRadixComplexFft<Real>::ComputeRecursive(Real *xr, Real *xi, MatrixInde
xi1 = xi + m2; xi2 = xi1 + m4;
if (logm >= 4) {
nel = m4 - 2;
cn = tab[logm-4]; spcn = cn + nel; smcn = spcn + nel;
cn = tab_[logm-4]; spcn = cn + nel; smcn = spcn + nel;
c3n = smcn + nel; spc3n = c3n + nel; smc3n = spc3n + nel;
}
xr1++; xr2++; xi1++; xi2++;

@@ -40,11 +40,14 @@ namespace kaldi {
// permission, optimized by Go Vivace Inc., and converted into C++ by
// Microsoft Corporation
// This is a more efficient way of doing the complex FFT than ComplexFft
// above, but it only works for powers of 2.
// (declared in matrix-functions.h), but it only works for powers of 2.
// Note: in multi-threaded code, you would need to have one of these objects per
// thread, because multiple calls to Compute in parallel would not work.
template<typename Real>
class SplitRadixComplexFft {
public:
typedef MatrixIndexT Integer;

// N is the number of complex points (must be a power of two, or this
// will crash). Note that the constructor does some work so it's best to
// initialize the object once and do the computation many times.
@@ -73,12 +76,12 @@ class SplitRadixComplexFft {
Integer logm_; // log(N) [a slight mismatch in notation which we have not
// bothered to fix].

Integer *brseed;
Integer *brseed_;
// brseed is Evans' seed table, ref: (Ref: D. M. W.
// Evans, "An improved digit-reversal permutation algorithm ...",
// IEEE Trans. ASSP, Aug. 1987, pp. 1120-1125).
Real **tab; // Tables of butterfly coefficients.
Real *temp_buffer; // Allocated only if someone calls Compute with only
Real **tab_; // Tables of butterfly coefficients.
Real *temp_buffer_; // Allocated only if someone calls Compute with only
// one argument and we need a temporary buffer while creating interleaved
// data.
};

@@ -241,12 +241,19 @@ void OnlinePreconditioner::PreconditionDirectionsInternal(
locked = false;
}
}

if (!locked) {
// We're not updating the parameters, either because another thread is
// working on updating them, or because another thread already did so from
// the same or later starting point (making our update stale), or because
// update_period_ > 1. We just apply the preconditioning and return.

// note: we don't bother with any locks before incrementing
// num_updates_skipped_ below, because the worst that could happen is that,
// on very rare occasions, we could skip one or two more updates than we
// intended.
num_updates_skipped_++;

BaseFloat tr_Rt_RtT = TraceMatMat(*R_t, *R_t, kTrans);
// P_t = R_t - H_t W_t
R_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0);
@@ -258,11 +265,6 @@ void OnlinePreconditioner::PreconditionDirectionsInternal(
BaseFloat gamma_t = (tr_Pt_PtT == 0.0 ? 1.0 :
sqrt(tr_Rt_RtT / tr_Pt_PtT));
*scale = gamma_t;
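// gamma_t is chosen so that scaling P_t by it restores the Frobenius norm of
// the original R_t (tr_Rt_RtT and tr_Pt_PtT are the squared norms before and
// after preconditioning).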
// note: we don't bother with any locks before incrementing
// num_updates_skipped_ below, because the worst that could happen is that,
// on very rare occasions, we could skip one or two more updates than we
// intended.
num_updates_skipped_++;
return;
}
J_t.AddMatMat(1.0, H_t, kTrans, *R_t, kNoTrans, 0.0); // J_t = H_t^T R_t
@@ -295,6 +297,7 @@ void OnlinePreconditioner::PreconditionDirectionsInternal(
BaseFloat beta_t = rho_t * (1.0 + alpha_) + alpha_ * d_t.Sum() / D;
Vector<BaseFloat> e_t(R), sqrt_e_t(R), inv_sqrt_e_t(R);
ComputeEt(d_t, beta_t, &e_t, &sqrt_e_t, &inv_sqrt_e_t);
KALDI_VLOG(5) << "e_t = " << e_t;

SpMatrix<BaseFloat> Z_t(R);
ComputeZt(N, rho_t, d_t, inv_sqrt_e_t, K_t_cpu, L_t_cpu, &Z_t);
@@ -494,7 +497,6 @@ void OnlinePreconditioner::ComputeEt(const VectorBase<BaseFloat> &d_t,
BaseFloat *e = e_t->Data();
for (int32 i = 0; i < D; i++)
e[i] = 1.0 / (beta_t / d[i] + 1);
KALDI_VLOG(5) << "e_t = " << *e_t;
sqrt_e_t->CopyFromVec(*e_t);
sqrt_e_t->ApplyPow(0.5);
inv_sqrt_e_t->CopyFromVec(*sqrt_e_t);

@@ -67,10 +67,10 @@ void NnetUpdater::Propagate() {
(c>0 && nnet_.GetComponent(c-1).BackpropNeedsOutput()) ||
component.BackpropNeedsInput();
if (g_kaldi_verbose_level >= 3 && num_times_printed < 100) {
KALDI_LOG << "Stddev of data for component " << c
<< " for this minibatch is "
<< (TraceMatMat(forward_data_[c], forward_data_[c], kTrans) /
(forward_data_[c].NumRows() * forward_data_[c].NumCols()));
KALDI_VLOG(3) << "Stddev of data for component " << c
<< " for this minibatch is "
<< (TraceMatMat(forward_data_[c], forward_data_[c], kTrans) /
(forward_data_[c].NumRows() * forward_data_[c].NumCols()));
num_times_printed++;
}
if (!need_last_output)

@@ -26,7 +26,8 @@ namespace nnet2 {
NnetSimpleTrainer::NnetSimpleTrainer(
const NnetSimpleTrainerConfig &config,
Nnet *nnet):
config_(config), nnet_(nnet) {
config_(config), nnet_(nnet), logprob_this_phase_(0.0),
weight_this_phase_(0.0), logprob_total_(0.0), weight_total_(0.0) {
num_phases_ = 0;
bool first_time = true;
BeginNewPhase(first_time);
@@ -44,7 +45,7 @@ void NnetSimpleTrainer::TrainOneMinibatch() {
logprob_this_phase_ += DoBackprop(*nnet_,
buffer_,
nnet_);
count_this_phase_ += buffer_.size();
weight_this_phase_ += TotalNnetTrainingWeight(buffer_);
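// (weight_this_phase_ now accumulates the total frame weight rather than a raw
// example count, so the per-frame averages logged below are weight-normalized.)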
buffer_.clear();
minibatches_seen_this_phase_++;
if (minibatches_seen_this_phase_ == config_.minibatches_per_phase) {
@@ -56,10 +57,12 @@ void NnetSimpleTrainer::TrainOneMinibatch() {
void NnetSimpleTrainer::BeginNewPhase(bool first_time) {
if (!first_time)
KALDI_LOG << "Training objective function (this phase) is "
<< (logprob_this_phase_/count_this_phase_) << " over "
<< count_this_phase_ << " frames.";
<< (logprob_this_phase_/weight_this_phase_) << " over "
<< weight_this_phase_ << " frames.";
logprob_total_ += logprob_this_phase_;
weight_total_ += weight_this_phase_;
logprob_this_phase_ = 0.0;
count_this_phase_ = 0.0;
weight_this_phase_ = 0.0;
minibatches_seen_this_phase_ = 0;
num_phases_++;
}
@@ -75,6 +78,13 @@ NnetSimpleTrainer::~NnetSimpleTrainer() {
BeginNewPhase(first_time);
}
}
if (weight_total_ == 0.0) {
KALDI_WARN << "No data seen.";
} else {
KALDI_LOG << "Did backprop on " << weight_total_
<< " examples, average log-prob per frame is "
<< (logprob_total_ / weight_total_);
}
}

@@ -81,7 +81,10 @@ class NnetSimpleTrainer {
std::vector<NnetExample> buffer_;

double logprob_this_phase_; // Needed for accumulating train log-prob on each phase.
double count_this_phase_; // count corresponding to the above.
double weight_this_phase_; // count corresponding to the above.

double logprob_total_;
double weight_total_;
};