Applied patch from BOLT system.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4673 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Dan Povey 2014-12-04 06:01:24 +00:00
Parent 167e2a676d
Commit de10f1506d
41 changed files with 3045 additions and 178 deletions

View file

@@ -0,0 +1,95 @@
#!/bin/bash
# This is to be run after run_nnet2_multisplice.sh.
# It demonstrates discriminative training for the online-nnet2 models
. cmd.sh
stage=1
train_stage=-10
use_gpu=true
srcdir=exp/nnet2_online/nnet_ms_a_online
criterion=smbr
learning_rate=0.0016
drop_frames=false # only relevant for MMI
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if [ ! -f $srcdir/final.mdl ]; then
echo "$0: expected $srcdir/final.mdl to exist; first run run_nnet2_multisplice.sh."
exit 1;
fi
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slower.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
fi
if [ $stage -le 1 ]; then
# use a wide beam because this is RM; these beam values would be too high for other setups.
nj=30
num_threads=6
steps/online/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G -pe smp $num_threads" \
--nj $nj --sub-split 40 --num-threads "$num_threads" --beam 20.0 --lattice-beam 10.0 \
data/train data/lang $srcdir ${srcdir}_denlats || exit 1;
fi
if [ $stage -le 2 ]; then
# hardcode no-GPU for alignment, although you could use GPU [you wouldn't
# get excellent GPU utilization though.]
nj=100
use_gpu=no
gpu_opts=
steps/online/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
--nj $nj data/train data/lang $srcdir ${srcdir}_ali || exit 1;
fi
if [ $stage -le 3 ]; then
# I tested the following with --max-temp-archives 3
# to test other branches of the code.
steps/online/nnet2/get_egs_discriminative2.sh \
--cmd "$decode_cmd -pe smp 5" \
--criterion $criterion --drop-frames $drop_frames \
data/train data/lang ${srcdir}{_ali,_denlats,,_degs} || exit 1;
fi
if [ $stage -le 4 ]; then
steps/nnet2/train_discriminative2.sh --cmd "$decode_cmd $parallel_opts" \
--learning-rate $learning_rate \
--criterion $criterion --drop-frames $drop_frames \
--num-epochs 6 \
--num-jobs-nnet 2 --num-threads $num_threads \
${srcdir}_degs ${srcdir}_${criterion}_${learning_rate} || exit 1;
fi
if [ $stage -le 5 ]; then
ln -sf $(readlink -f $srcdir/conf) ${srcdir}_${criterion}_${learning_rate}/conf # so it acts like an online-decoding directory
for epoch in 0 1 2 3 4 5 6; do
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--iter epoch$epoch exp/tri3b/graph data/test ${srcdir}_${criterion}_${learning_rate}/decode_epoch$epoch &
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--iter epoch$epoch exp/tri3b/graph_ug data/test ${srcdir}_${criterion}_${learning_rate}/decode_ug_epoch$epoch &
done
wait
for dir in ${srcdir}_${criterion}_${learning_rate}/decode*; do grep WER $dir/wer_* | utils/best_wer.sh; done
fi

View file

@@ -63,7 +63,7 @@ oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir
cp -r $lang $dir/
cp -rH $lang $dir/
# Compute grammar FST which corresponds to unigram decoding graph.
new_lang="$dir/"$(basename "$lang")

View file

@@ -57,7 +57,7 @@ oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir
cp -r $lang $dir/
cp -rH $lang $dir/
# Compute grammar FST which corresponds to unigram decoding graph.
new_lang="$dir/"$(basename "$lang")

View file

@@ -66,7 +66,7 @@ oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir
cp -r $lang $dir/
cp -rH $lang $dir/
# Compute grammar FST which corresponds to unigram decoding graph.
new_lang="$dir/"$(basename "$lang")

View file

@@ -3,12 +3,7 @@
# 2013 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Computes training alignments using MLP model
# If you supply the "--use-graphs true" option, it will use the training
# graphs from the source directory (where the model is). In this
# case the number of jobs must match with the source directory.
# Computes training alignments using DNN
# Begin configuration section.
nj=4

View file

@@ -53,7 +53,6 @@ if [ $# != 4 ]; then
echo " --splice-width <width;4> # Number of frames on each side to append for feature input"
echo " --left-context <width;4> # Number of frames on left side to append for feature input, overrides splice-width"
echo " --right-context <width;4> # Number of frames on right side to append for feature input, overrides splice-width"
echo " # (note: we splice processed, typically 40-dimensional frames"
echo " --num-frames-diagnostic <#frames;4000> # Number of frames used in computing (train,valid) diagnostics"
echo " --num-valid-frames-combine <#frames;10000> # Number of frames used in getting combination weights at the"
echo " # very end."
@@ -106,7 +105,7 @@ if [ -f $data/utt2uniq ]; then
fi
awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \
utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1;
utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1;
[ -z "$transform_dir" ] && transform_dir=$alidir
@@ -210,7 +209,7 @@ if [ $stage -le 2 ]; then
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark:$dir/egs/train_subset_all.egs" || touch $dir/.error &
wait;
[ -f $dir/.error ] && exit 1;
[ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1
echo "Getting subsets of validation examples for diagnostics and combination."
$cmd $dir/log/create_valid_subset_combine.log \
nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/egs/valid_all.egs \

View file

@@ -75,7 +75,6 @@ if [ $# != 3 ]; then
echo " --frames-per-eg <frames;8> # number of frames per eg on disk"
echo " --left-context <width;4> # Number of frames on left side to append for feature input"
echo " --right-context <width;4> # Number of frames on right side to append for feature input"
echo " # (note: we splice processed, typically 40-dimensional frames"
echo " --num-frames-diagnostic <#frames;4000> # Number of frames used in computing (train,valid) diagnostics"
echo " --num-valid-frames-combine <#frames;10000> # Number of frames used in getting combination weights at the"
echo " # very end."
@@ -236,7 +235,6 @@ if [ $stage -le 2 ]; then
gzip -c >$dir/ali_special.gz || exit 1;
set +o pipefail; # unset the pipefail option.
all_ids=$(seq -s, $nj) # e.g. 1,2,...39,40
$cmd $dir/log/create_valid_subset.log \
nnet-get-egs $ivectors_opt $nnet_context_opts "$valid_feats" \
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
@@ -246,7 +244,7 @@ if [ $stage -le 2 ]; then
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark:$dir/train_subset_all.egs" || touch $dir/.error &
wait;
[ -f $dir/.error ] && exit 1;
[ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1
echo "... Getting subsets of validation examples for diagnostics and combination."
$cmd $dir/log/create_valid_subset_combine.log \
nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \

View file

@@ -0,0 +1,300 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# This script dumps examples for MPE, MMI or state-level minimum Bayes risk (sMBR)
# training of neural nets. Note: for "criterion", smbr > mpe > mmi in terms of
# compatibility of the dumped egs, meaning you can use the egs dumped with
# --criterion smbr for MPE or MMI, and egs dumped with --criterion mpe for MMI
# training. The discriminative training program itself doesn't enforce this and
# it would let you mix and match them arbitrarily; we are speaking in terms of
# the correctness of the algorithm that splits the lattices into pieces.
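# For illustration only (these paths are made up): egs dumped with --criterion smbr
# could later be reused for MMI training, e.g.
#   steps/nnet2/get_egs_discriminative2.sh --criterion smbr ... \
#     data/train data/lang exp/tri4_ali exp/tri4_denlats exp/tri4/final.mdl exp/tri4_degs
#   steps/nnet2/train_discriminative2.sh --criterion mmi ... exp/tri4_degs exp/tri4_mmi
# whereas egs dumped with --criterion mmi should not be reused for MPE or sMBR.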
# Begin configuration section.
cmd=run.pl
criterion=smbr
drop_frames=false # option relevant for MMI, affects how we dump examples.
samples_per_iter=400000 # measured in frames, not in "examples"
max_temp_archives=128 # maximum number of temp archives per input job, only
# affects the process of generating archives, not the
# final result.
stage=0
cleanup=true
transform_dir= # If this is a SAT system, directory for transforms
online_ivector_dir=
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 6 ]; then
echo "Usage: $0 [opts] <data> <lang> <ali-dir> <denlat-dir> <src-model-file> <degs-dir>"
echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet_denlats exp/tri4/final.mdl exp/tri4_mpe/degs"
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config file containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs (probably would be good to add -tc 5 or so if using"
echo " # GridEngine (to avoid excessive NFS traffic)."
echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per"
echo " # process."
echo " --stage <stage|-8> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
echo " --criterion <criterion|smbr> # Training criterion: may be smbr, mmi or mpfe"
echo " --online-ivector-dir <dir|""> # Directory for online-estimated iVectors, used in the"
echo " # online-neural-net setup. (but you may want to use"
echo " # steps/online/nnet2/get_egs_discriminative2.sh instead)"
exit 1;
fi
data=$1
lang=$2
alidir=$3
denlatdir=$4
src_model=$5
dir=$6
extra_files=
[ ! -z $online_ivector_dir ] && \
extra_files="$online_ivector_dir/ivector_period $online_ivector_dir/ivector_online.scp"
# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/num_jobs $alidir/tree \
$denlatdir/lat.1.gz $denlatdir/num_jobs $src_model $extra_files; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
mkdir -p $dir/log $dir/info || exit 1;
nj=$(cat $denlatdir/num_jobs) || exit 1; # $nj is the number of
# splits of the denlats and alignments.
nj_ali=$(cat $alidir/num_jobs) || exit 1;
sdata=$data/split$nj
utils/split_data.sh $data $nj
if [ $nj_ali -eq $nj ]; then
ali_rspecifier="ark,s,cs:gunzip -c $alidir/ali.JOB.gz |"
else
ali_rspecifier="scp:$dir/ali.scp"
if [ $stage -le 1 ]; then
echo "$0: number of jobs in den-lats versus alignments differ: dumping them as single archive and index."
all_ids=$(seq -s, $nj_ali)
copy-int-vector --print-args=false \
"ark:gunzip -c $alidir/ali.{$all_ids}.gz|" ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1;
fi
fi
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null`
cp $alidir/splice_opts $dir 2>/dev/null
cp $alidir/cmvn_opts $dir 2>/dev/null
cp $alidir/tree $dir
cp $lang/phones/silence.csl $dir/info/
cp $src_model $dir/final.mdl || exit 1
if [ ! -z "$online_ivector_dir" ]; then
ivector_period=$(cat $online_ivector_dir/ivector_period)
ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
echo $ivector_dim >$dir/info/ivector_dim
# the 'const_dim_opt' allows it to write only one iVector per example,
# rather than one per time-index... it has to average over
const_dim_opt="--const-feat-dim=$ivector_dim"
else
echo 0 > $dir/info/ivector_dim
fi
## We don't support deltas here, only LDA or raw (mainly because deltas are less
## frequently used).
if [ -z $feat_type ]; then
if [ -f $alidir/final.mat ] && [ ! -f $transform_dir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi
fi
echo "$0: feature type is $feat_type"
case $feat_type in
raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
;;
lda)
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
cp $alidir/final.mat $dir
feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac
if [ -z "$transform_dir" ]; then
if [ -f $alidir/trans.1 ] || [ -f $alidir/raw_trans.1 ]; then
transform_dir=$alidir
fi
fi
if [ ! -z "$transform_dir" ]; then
echo "$0: using transforms from $transform_dir"
[ ! -s $transform_dir/num_jobs ] && \
echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
nj_orig=$(cat $transform_dir/num_jobs)
if [ $feat_type == "raw" ]; then trans=raw_trans;
else trans=trans; fi
if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $alidir/final.mat; then
echo "$0: LDA transforms differ between $alidir and $transform_dir"
exit 1;
fi
if [ ! -f $transform_dir/$trans.1 ]; then
echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)"
exit 1;
fi
if [ $nj -ne $nj_orig ]; then
# Copy the transforms into an archive with an index.
for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \
copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1;
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |"
else
# number of jobs matches with alignment dir.
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |"
fi
fi
if [ ! -z $online_ivector_dir ]; then
# add iVectors to the features.
feats="$feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |"
fi
if [ $stage -le 2 ]; then
echo "$0: working out number of frames of training data"
num_frames=$(steps/nnet2/get_num_frames.sh $data)
echo $num_frames > $dir/info/num_frames
# Working out total number of archives. Add one on the assumption the
# num-frames won't divide exactly, and we want to round up.
num_archives=$[$num_frames/$samples_per_iter + 1]
# the next few lines relate to how we may temporarily split each input job
# into fewer than $num_archives pieces, to avoid using an excessive
# number of filehandles.
archive_ratio=$[$num_archives/$max_temp_archives+1]
num_archives_temp=$[$num_archives/$archive_ratio]
# change $num_archives slightly to make it an exact multiple
# of $archive_ratio.
num_archives=$[$num_archives_temp*$archive_ratio]
echo $num_archives >$dir/info/num_archives || exit 1
echo $num_archives_temp >$dir/info/num_archives_temp || exit 1
frames_per_archive=$[$num_frames/$num_archives]
# note, this is the number of frames per archive prior to discarding frames.
echo $frames_per_archive > $dir/info/frames_per_archive
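# Worked example with made-up numbers: num_frames=1000000 and
# samples_per_iter=400000 give num_archives = 1000000/400000 + 1 = 3; with the
# default max_temp_archives=128, archive_ratio=1, num_archives_temp=3, and
# frames_per_archive = 1000000/3 = 333333.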
else
num_archives=$(cat $dir/info/num_archives) || exit 1;
num_archives_temp=$(cat $dir/info/num_archives_temp) || exit 1;
frames_per_archive=$(cat $dir/info/frames_per_archive) || exit 1;
fi
echo "$0: Splitting the data up into $num_archives archives (using $num_archives_temp temporary pieces per input job)"
echo "$0: giving samples-per-iteration of $frames_per_archive (you requested $samples_per_iter)."
# we create these data links regardless of the stage, as there are situations
# where we would want to recreate a data link that had previously been deleted.
if [ -d $dir/storage ]; then
echo "$0: creating data links for distributed storage of degs"
# See utils/create_split_dir.pl for how this 'storage' directory is created.
for x in $(seq $nj); do
for y in $(seq $num_archives_temp); do
utils/create_data_link.pl $dir/degs_orig.$x.$y.ark
done
done
for z in $(seq $num_archives); do
utils/create_data_link.pl $dir/degs.$z.ark
done
if [ $num_archives_temp -ne $num_archives ]; then
for z in $(seq $num_archives); do
utils/create_data_link.pl $dir/degs_temp.$z.ark
done
fi
fi
if [ $stage -le 3 ]; then
echo "$0: getting initial training examples by splitting lattices"
degs_list=$(for n in $(seq $num_archives_temp); do echo ark:$dir/degs_orig.JOB.$n.ark; done)
$cmd JOB=1:$nj $dir/log/get_egs.JOB.log \
nnet-get-egs-discriminative --criterion=$criterion --drop-frames=$drop_frames \
"$src_model" "$feats" "$ali_rspecifier" "ark,s,cs:gunzip -c $denlatdir/lat.JOB.gz|" ark:- \| \
nnet-copy-egs-discriminative $const_dim_opt ark:- $degs_list || exit 1;
sleep 5; # wait a bit so NFS has time to write files.
fi
if [ $stage -le 4 ]; then
degs_list=$(for n in $(seq $nj); do echo $dir/degs_orig.$n.JOB.ark; done)
if [ $num_archives -eq $num_archives_temp ]; then
echo "$0: combining data into final archives and shuffling it"
$cmd JOB=1:$num_archives $dir/log/shuffle.JOB.log \
cat $degs_list \| nnet-shuffle-egs-discriminative --srand=JOB ark:- \
ark:$dir/degs.JOB.ark || exit 1;
else
echo "$0: combining and re-splitting data into un-shuffled versions of final archives."
archive_ratio=$[$num_archives/$num_archives_temp]
! [ $archive_ratio -gt 1 ] && echo "$0: Bad archive_ratio $archive_ratio" && exit 1;
# note: the \$[ .. ] won't be evaluated until the job gets executed. The
# aim is to write to the archives with the final numbering, 1
# ... num_archives, which is more than num_archives_temp. The list with
# \$[... ] expressions in it computes the set of final indexes for each
# temporary index.
degs_list_out=$(for n in $(seq $archive_ratio); do echo "ark:$dir/degs_temp.\$[((JOB-1)*$archive_ratio)+$n].ark"; done)
# e.g. if dir=foo and archive_ratio=2, we'd have
# degs_list_out='foo/degs_temp.$[((JOB-1)*2)+1].ark foo/degs_temp.$[((JOB-1)*2)+2].ark'
$cmd JOB=1:$num_archives_temp $dir/log/resplit.JOB.log \
cat $degs_list \| nnet-copy-egs-discriminative --srand=JOB ark:- \
$degs_list_out || exit 1;
fi
fi
if [ $stage -le 5 ] && [ $num_archives -ne $num_archives_temp ]; then
echo "$0: shuffling final archives."
$cmd JOB=1:$num_archives $dir/log/shuffle.JOB.log \
nnet-shuffle-egs-discriminative --srand=JOB ark:$dir/degs_temp.JOB.ark \
ark:$dir/degs.JOB.ark || exit 1
fi
if $cleanup; then
echo "$0: removing temporary archives."
for x in $(seq $nj); do
for y in $(seq $num_archives_temp); do
file=$dir/degs_orig.$x.$y.ark
[ -L $file ] && rm $(readlink -f $file); rm $file
done
done
if [ $num_archives_temp -ne $num_archives ]; then
for z in $(seq $num_archives); do
file=$dir/degs_temp.$z.ark
[ -L $file ] && rm $(readlink -f $file); rm $file
done
fi
fi
echo "$0: Done."

View file

@@ -10,16 +10,25 @@ if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# -ne 1 ]; then
echo "Usage: $0 <data-dir>"
echo "Prints the number of frames of data in the data-dir, via sampling rather"
echo "than trying to access all the data."
(
echo "Usage: $0 <data-dir>"
echo "Prints the number of frames of data in the data-dir, via sampling rather"
echo "than trying to access all the data."
) 1>&2
fi
data=$1
if [ ! -f $data/feats.scp ]; then
echo "$0: expected $data/feats.scp to exist"
exit 1;
if [ -f $data/segments ]; then
echo "$0: $data/feats.scp does not exist, but $data/segments does exist; using that and assuming 100 frames per second." 1>&2
num_frames=$(cat $data/segments | awk '{x += $4 - $3;} END{print int(x*100);}') || exit 1;
echo $num_frames
exit 0;
else
echo "$0: neither $data/feats.scp nor $data/segments exist." 1>&2
exit 1;
fi
fi

View file

@@ -32,7 +32,7 @@ echo "$0 $@" # Print the command line for logging
if [ $# != 4 ]; then
echo "Usage: steps/make_denlats.sh [options] <data-dir> <lang-dir> <src-dir> <exp-dir>"
echo " e.g.: steps/make_denlats.sh data/train data/lang exp/tri1 exp/tri1_denlats"
echo " e.g.: steps/make_denlats.sh data/train data/lang exp/nnet4 exp/nnet4_denlats"
echo "Works for (delta|lda) features, and (with --transform-dir option) such features"
echo " plus transforms."
echo ""
@@ -68,14 +68,12 @@ thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir
cp -r $lang $dir/
cp -rH $lang $dir/
# Compute grammar FST which corresponds to unigram decoding graph.
new_lang="$dir/"$(basename "$lang")

View file

@@ -2,10 +2,8 @@
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# This script does MPE or fMMI state-level minimum bayes risk (sMBR) training.
# Note: the temporary data is put in <exp-dir>/degs/, so if you want
# to use a different disk for that, just make that a soft link to some other
# volume.
# This script does MPE or MMI or state-level minimum bayes risk (sMBR) training
# of neural nets.
# Begin configuration section.
cmd=run.pl
@@ -45,7 +43,6 @@ transform_dir=
degs_dir=
retroactive=false
online_ivector_dir=
use_preconditioning=false
# End configuration section.
@@ -76,7 +73,7 @@ if [ $# != 6 ]; then
echo " # use multiple threads... note, you might have to reduce mem_free,ram_free"
echo " # versus your defaults, because it gets multiplied by the -pe smp argument."
echo " --io-opts <opts|\"-tc 10\"> # Options given to e.g. queue.pl for jobs that do a lot of I/O."
echo " --samples-per-iter <#samples|200000> # Number of samples of data to process per iteration, per"
echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per"
echo " # process."
echo " --stage <stage|-8> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
@@ -85,6 +82,8 @@ if [ $# != 6 ]; then
echo " --modify-learning-rates <true,false|false> # If true, modify learning rates to try to equalize relative"
echo " # changes across layers."
echo " --degs-dir <dir|""> # Directory for discriminative examples, e.g. exp/foo/degs"
echo " --drop-frames <true,false|false> # Option that affects MMI training: if true, we exclude gradients from frames"
echo " # where the numerator transition-id is not in the denominator lattice."
echo " --online-ivector-dir <dir|""> # Directory for online-estimated iVectors, used in the"
echo " # online-neural-net setup."
exit 1;
@@ -240,19 +239,17 @@ fi
if [ $stage -le -7 ]; then
echo "$0: Copying initial model and modifying preconditioning setup"
# We want online preconditioning with a larger number of samples of history, since
# in this setup the frames are only randomized at the segment level so they are highly
# correlated. It might make sense to tune this a little, later on, although I doubt
# it matters once it's large enough.
if $use_preconditioning; then
$cmd $dir/log/convert.log \
nnet-am-copy --learning-rate=$learning_rate "$src_model" - \| \
nnet-am-switch-preconditioning --num-samples-history=50000 - $dir/0.mdl || exit 1;
else
$cmd $dir/log/convert.log \
nnet-am-copy --learning-rate=$learning_rate "$src_model" $dir/0.mdl || exit 1;
fi
# Note, the baseline model probably had preconditioning, and we'll keep it;
# but we want online preconditioning with a larger number of samples of
# history, since in this setup the frames are only randomized at the segment
# level so they are highly correlated. It might make sense to tune this a
# little, later on, although I doubt it matters once the --num-samples-history
# is large enough.
$cmd $dir/log/convert.log \
nnet-am-copy --learning-rate=$learning_rate "$src_model" - \| \
nnet-am-switch-preconditioning --num-samples-history=50000 - $dir/0.mdl || exit 1;
fi
@@ -344,7 +341,7 @@ fi
x=0
while [ $x -lt $num_iters ]; do
if [ $x -ge 0 ] && [ $stage -le $x ]; then
if [ $stage -le $x ]; then
echo "Training neural net (pass $x)"
@@ -356,10 +353,7 @@ while [ $x -lt $num_iters ]; do
$dir/$[$x+1].JOB.mdl \
|| exit 1;
nnets_list=
for n in `seq 1 $num_jobs_nnet`; do
nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
done
nnets_list=$(for n in $(seq $num_jobs_nnet); do echo $dir/$[$x+1].$n.mdl; done)
$cmd $dir/log/average.$x.log \
nnet-am-average $nnets_list $dir/$[$x+1].mdl || exit 1;

View file

@@ -0,0 +1,219 @@
#!/bin/bash
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# This script does MPE or MMI or state-level minimum bayes risk (sMBR) training.
# This version (2) of the script uses a newer format for the discriminative-training
# egs, as obtained by steps/nnet2/get_egs_discriminative2.sh.
# Begin configuration section.
cmd=run.pl
num_epochs=4 # Number of epochs of training
learning_rate=0.00002
acoustic_scale=0.1 # acoustic scale for MMI/MPFE/SMBR training.
boost=0.0 # option relevant for MMI
criterion=smbr
drop_frames=false # option relevant for MMI
num_jobs_nnet=4 # Number of neural net jobs to run in parallel. Note: this
# will interact with the learning rates (if you decrease
# this, you'll have to decrease the learning rate, and vice
# versa).
modify_learning_rates=true
last_layer_factor=1.0 # relates to modify-learning-rates
first_layer_factor=1.0 # relates to modify-learning-rates
shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
# on each iter. You could set it to 0 or to a large value for complete
# randomization, but this would both consume memory and cause spikes in
# disk I/O. Smaller is easier on disk and memory but less random. It's
# not a huge deal though, as samples are anyway randomized right at the start.
stage=-3
num_threads=16 # this is the default but you may want to change it, e.g. to 1 if
# using GPUs.
cleanup=true
retroactive=false
remove_egs=false
src_model= # will default to $degs_dir/final.mdl
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 2 ]; then
echo "Usage: $0 [opts] <degs-dir> <exp-dir>"
echo " e.g.: $0 exp/tri4_mpe_degs exp/tri4_mpe"
echo ""
echo "You have to first call get_egs_discriminative2.sh to dump the egs."
echo "Caution: the options 'drop_frames' and 'criterion' are taken here"
echo "even though they were required also by get_egs_discriminative2.sh,"
echo "and they should normally match."
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config file containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --num-epochs <#epochs|4> # Number of epochs of training"
echo " --initial-learning-rate <initial-learning-rate|0.0002> # Learning rate at start of training"
echo " --final-learning-rate <final-learning-rate|0.0004> # Learning rate at end of training"
echo " --num-jobs-nnet <num-jobs|8> # Number of parallel jobs to use for main neural net"
echo " # training (will affect results as well as speed; try 8, 16)"
echo " # Note: if you increase this, you may want to also increase"
echo " # the learning rate. Also note: if there are fewer archives"
echo " # of egs than this, it will get reduced automatically."
echo " --num-threads <num-threads|16> # Number of parallel threads per job (will affect results"
echo " # as well as speed; may interact with batch size; if you increase"
echo " # this, you may want to decrease the batch size. With GPU, must be 1."
echo " --parallel-opts <opts|\"-pe smp 16 -l ram_free=1G,mem_free=1G\"> # extra options to pass to e.g. queue.pl for processes that"
echo " # use multiple threads... note, you might have to reduce mem_free,ram_free"
echo " # versus your defaults, because it gets multiplied by the -pe smp argument."
echo " --stage <stage|-3> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
echo " --criterion <criterion|smbr> # Training criterion: may be smbr, mmi or mpfe"
echo " --boost <boost|0.0> # Boosting factor for MMI (e.g., 0.1)"
echo " --drop-frames <true,false|false> # Option that affects MMI training: if true, we exclude gradients from frames"
echo " # where the numerator transition-id is not in the denominator lattice."
echo " --modify-learning-rates <true,false|false> # If true, modify learning rates to try to equalize relative"
echo " # changes across layers."
exit 1;
fi
degs_dir=$1
dir=$2
[ -z "$src_model" ] && src_model=$degs_dir/final.mdl
# Check some files.
for f in $degs_dir/degs.1.ark $degs_dir/info/{num_archives,silence.csl,frames_per_archive} $src_model; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
mkdir -p $dir/log || exit 1;
# copy some things
for f in splice_opts cmvn_opts tree final.mat; do
if [ -f $degs_dir/$f ]; then
cp $degs_dir/$f $dir/ || exit 1;
fi
done
silphonelist=`cat $degs_dir/info/silence.csl` || exit 1;
num_archives=$(cat $degs_dir/info/num_archives) || exit 1;
if [ $num_jobs_nnet -gt $num_archives ]; then
echo "$0: num-jobs-nnet $num_jobs_nnet exceeds number of archives $num_archives,"
echo " ... setting it to $num_archives."
num_jobs_nnet=$num_archives
fi
num_iters=$[($num_epochs*$num_archives)/$num_jobs_nnet]
echo "$0: Will train for $num_epochs epochs = $num_iters iterations"
if [ $stage -le -1 ]; then
echo "$0: Copying initial model and modifying preconditioning setup"
# Note, the baseline model probably had preconditioning, and we'll keep it;
# but we want online preconditioning with a larger number of samples of
# history, since in this setup the frames are only randomized at the segment
# level so they are highly correlated. It might make sense to tune this a
# little, later on, although I doubt it matters once the --num-samples-history
# is large enough.
$cmd $dir/log/convert.log \
nnet-am-copy --learning-rate=$learning_rate "$src_model" - \| \
nnet-am-switch-preconditioning --num-samples-history=50000 - $dir/0.mdl || exit 1;
fi
if [ $num_threads -eq 1 ]; then
train_suffix="-simple" # this enables us to use GPU code if
# we have just one thread.
else
train_suffix="-parallel --num-threads=$num_threads"
fi
x=0
while [ $x -lt $num_iters ]; do
if [ $stage -le $x ]; then
echo "Training neural net (pass $x)"
# The \$ below delays the evaluation of the expression until the script runs (and JOB
# will be replaced by the job-id). That expression in $[..] is responsible for
# choosing the archive indexes to use for each job on each iteration... we cycle through
# all archives.
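# Worked example with made-up numbers: if num_jobs_nnet=2, num_archives=5,
# x=3 and JOB=2, the expression evaluates to ((2-1+(3*2))%5)+1 = (7%5)+1 = 3,
# i.e. that job reads degs.3.ark on this iteration.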
$cmd JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
nnet-combine-egs-discriminative \
"ark:$degs_dir/degs.\$[((JOB-1+($x*$num_jobs_nnet))%$num_archives)+1].ark" ark:- \| \
nnet-train-discriminative$train_suffix --silence-phones=$silphonelist \
--criterion=$criterion --drop-frames=$drop_frames \
--boost=$boost --acoustic-scale=$acoustic_scale \
$dir/$x.mdl ark:- $dir/$[$x+1].JOB.mdl || exit 1;
nnets_list=$(for n in $(seq $num_jobs_nnet); do echo $dir/$[$x+1].$n.mdl; done)
$cmd $dir/log/average.$x.log \
nnet-am-average $nnets_list $dir/$[$x+1].mdl || exit 1;
if $modify_learning_rates; then
$cmd $dir/log/modify_learning_rates.$x.log \
nnet-modify-learning-rates --retroactive=$retroactive \
--last-layer-factor=$last_layer_factor \
--first-layer-factor=$first_layer_factor \
$dir/$x.mdl $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
fi
rm $nnets_list
fi
x=$[$x+1]
done
rm $dir/final.mdl 2>/dev/null
ln -s $x.mdl $dir/final.mdl
echo Done
epoch_final_iters=
for e in $(seq 0 $num_epochs); do
x=$[($e*$num_archives)/$num_jobs_nnet] # gives the iteration number.
ln -sf $x.mdl $dir/epoch$e.mdl
epoch_final_iters="$epoch_final_iters $x"
done
# function to remove egs that might be soft links.
remove () { for x in $*; do [ -L $x ] && rm $(readlink -f $x); rm $x; done }
if $cleanup && $remove_egs; then # note: this is false by default.
echo Removing training examples
for n in $(seq $num_archives); do
remove $degs_dir/degs.$n.ark
done
fi
if $cleanup; then
echo Removing most of the models
for x in `seq 0 $num_iters`; do
if ! echo $epoch_final_iters | grep -w $x >/dev/null; then
# if $x is not an epoch-final iteration..
rm $dir/$x.mdl 2>/dev/null
fi
done
fi

View file

@@ -0,0 +1,304 @@
#!/bin/bash
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# This script does MPE or MMI or state-level minimum bayes risk (sMBR) training,
# in the multi-language or at least multi-model setting where you have multiple "degs" directories.
# The input "degs" directories must be dumped by one of the get_egs_discriminative2.sh scripts.
# Begin configuration section.
cmd=run.pl
num_epochs=4 # Number of epochs of training
learning_rate=0.00002
acoustic_scale=0.1 # acoustic scale for MMI/MPFE/SMBR training.
boost=0.0 # option relevant for MMI
criterion=smbr
drop_frames=false # option relevant for MMI
num_jobs_nnet="4 4" # Number of neural net jobs to run in parallel, one per
# language. Note: this will interact with the learning
# rates (if you decrease this, you'll have to decrease
# the learning rate, and vice versa).
modify_learning_rates=true
last_layer_factor=1.0 # relates to modify-learning-rates
first_layer_factor=1.0 # relates to modify-learning-rates
shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
# on each iter. You could set it to 0 or to a large value for complete
# randomization, but this would both consume memory and cause spikes in
# disk I/O. Smaller is easier on disk and memory but less random. It's
# not a huge deal though, as samples are anyway randomized right at the start.
stage=-3
num_threads=16 # this is the default but you may want to change it, e.g. to 1 if
# using GPUs.
cleanup=true
retroactive=false
remove_egs=false
src_models= # can be used to override the defaults of <degs-dir1>/final.mdl <degs-dir2>/final.mdl .. etc.
# set this to a space-separated list.
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# -lt 3 ]; then
echo "Usage: $0 [opts] <degs-dir1> <degs-dir2> ... <degs-dirN> <exp-dir>"
echo " e.g.: $0 exp/tri4_mpe_degs exp_other_lang/tri4_mpe_degs exp/tri4_mpe_multilang"
echo ""
echo "You have to first call get_egs_discriminative2.sh to dump the egs."
echo "Caution: the options 'drop_frames' and 'criterion' are taken here"
echo "even though they were required also by get_egs_discriminative2.sh,"
echo "and they should normally match."
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config file containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --num-epochs <#epochs|4> # Number of epochs of training"
echo " --initial-learning-rate <initial-learning-rate|0.0002> # Learning rate at start of training"
echo " --final-learning-rate <final-learning-rate|0.0004> # Learning rate at end of training"
echo " --num-jobs-nnet <num-jobs|8> # Number of parallel jobs to use for main neural net"
echo " # training (will affect results as well as speed; try 8, 16)"
echo " # Note: if you increase this, you may want to also increase"
echo " # the learning rate. Also note: if there are fewer archives"
echo " # of egs than this, it will get reduced automatically."
echo " --num-threads <num-threads|16> # Number of parallel threads per job (will affect results"
echo " # as well as speed; may interact with batch size; if you increase"
echo " # this, you may want to decrease the batch size. With GPU, must be 1."
echo " --parallel-opts <opts|\"-pe smp 16 -l ram_free=1G,mem_free=1G\"> # extra options to pass to e.g. queue.pl for processes that"
echo " # use multiple threads... note, you might have to reduce mem_free,ram_free"
echo " # versus your defaults, because it gets multiplied by the -pe smp argument."
echo " --stage <stage|-3> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
echo " --criterion <criterion|smbr> # Training criterion: may be smbr, mmi or mpfe"
echo " --boost <boost|0.0> # Boosting factor for MMI (e.g., 0.1)"
echo " --drop-frames <true,false|false> # Option that affects MMI training: if true, we exclude gradients from frames"
echo " # where the numerator transition-id is not in the denominator lattice."
echo " --modify-learning-rates <true,false|false> # If true, modify learning rates to try to equalize relative"
echo " # changes across layers."
exit 1;
fi
argv=("$@")
num_args=$#
num_lang=$[$num_args-1]
dir=${argv[$num_args-1]}
num_jobs_nnet_array=($num_jobs_nnet)
! [ "${#num_jobs_nnet_array[@]}" -eq "$num_lang" ] && \
echo "$0: --num-jobs-nnet option must have size equal to the number of languages" && exit 1;
for lang in $(seq 0 $[$num_lang-1]); do
degs_dir[$lang]=${argv[$lang]}
done
if [ ! -z "$src_models" ]; then
src_model_array=($src_models)
! [ "${#src_model_array[@]}" -eq "$num_lang" ] && \
echo "$0: --src-models option must have size equal to the number of languages" && exit 1;
else
for lang in $(seq 0 $[$num_lang-1]); do
src_model_array[$lang]=${degs_dir[$lang]}/final.mdl
done
fi
mkdir -p $dir/log || exit 1;
for lang in $(seq 0 $[$num_lang-1]); do
this_degs_dir=${degs_dir[$lang]}
mdl=${src_model_array[$lang]}
this_num_jobs_nnet=${num_jobs_nnet_array[$lang]}
# Check inputs
for f in $this_degs_dir/degs.1.ark $this_degs_dir/info/{num_archives,silence.csl,frames_per_archive} $mdl; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
mkdir -p $dir/$lang/log || exit 1;
# check for valid num-jobs-nnet.
! [ $this_num_jobs_nnet -gt 0 ] && echo "Bad num-jobs-nnet option '$num_jobs_nnet'" && exit 1;
this_num_archives=$(cat $this_degs_dir/info/num_archives) || exit 1;
num_archives_array[$lang]=$this_num_archives
silphonelist_array[$lang]=$(cat $this_degs_dir/info/silence.csl) || exit 1;
if [ $this_num_jobs_nnet -gt $this_num_archives ]; then
echo "$0: num-jobs-nnet $this_num_jobs_nnet exceeds number of archives $this_num_archives"
echo " ... for language $lang; setting it to $this_num_archives."
num_jobs_nnet_array[$lang]=$this_num_archives
fi
# copy some things from the input directories.
for f in splice_opts cmvn_opts tree final.mat; do
if [ -f $this_degs_dir/$f ]; then
cp $this_degs_dir/$f $dir/$lang/ || exit 1;
fi
done
if [ -f $this_degs_dir/conf ]; then
ln -sf $(readlink -f $this_degs_dir/conf) $dir/ || exit 1;
fi
done
# work out number of iterations.
num_archives0=$(cat ${degs_dir[0]}/info/num_archives) || exit 1;
num_jobs_nnet0=${num_jobs_nnet_array[0]}
! [ $num_epochs -gt 0 ] && echo "Error: num-epochs $num_epochs is not valid" && exit 1;
num_iters=$[($num_epochs*$num_archives0)/$num_jobs_nnet0]
echo "$0: Will train for $num_epochs epochs = $num_iters iterations (measured on language 0)"
# Work out the number of epochs we train for on the other languages... this is
# just informational.
for lang in $(seq 1 $[$num_lang-1]); do
this_degs_dir=${degs_dir[$lang]}
this_num_archives=${num_archives_array[$lang]}
this_num_epochs=$[($num_iters*${num_jobs_nnet_array[$lang]})/$this_num_archives]
echo "$0: $num_iters iterations is approximately $this_num_epochs epochs for language $lang"
done
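# Rough illustration with made-up sizes: if language 1 has 12 archives and
# runs 4 jobs while num_iters=18, it sees about (18*4)/12 = 6 epochs of its
# own data over the whole run.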
if [ $stage -le -1 ]; then
echo "$0: Copying initial models and modifying preconditioning setups"
# Note, the baseline model probably had preconditioning, and we'll keep it;
# but we want online preconditioning with a larger number of samples of
# history, since in this setup the frames are only randomized at the segment
# level so they are highly correlated. It might make sense to tune this a
# little, later on, although I doubt it matters once the --num-samples-history
# is large enough.
for lang in $(seq 0 $[$num_lang-1]); do
$cmd $dir/$lang/log/convert.log \
nnet-am-copy --learning-rate=$learning_rate ${src_model_array[$lang]} - \| \
nnet-am-switch-preconditioning --num-samples-history=50000 - $dir/$lang/0.mdl || exit 1;
done
fi
if [ $num_threads -eq 1 ]; then
train_suffix="-simple" # this enables us to use GPU code if
# we have just one thread.
else
train_suffix="-parallel --num-threads=$num_threads"
fi
x=0
while [ $x -lt $num_iters ]; do
if [ $stage -le $x ]; then
echo "Training neural net (pass $x)"
rm $dir/.error 2>/dev/null
for lang in $(seq 0 $[$num_lang-1]); do
this_num_jobs_nnet=${num_jobs_nnet_array[$lang]}
this_num_archives=${num_archives_array[$lang]}
this_degs_dir=${degs_dir[$lang]}
this_silphonelist=${silphonelist_array[$lang]}
# The \$ below delays the evaluation of the expression until the script runs (and JOB
# will be replaced by the job-id). That expression in $[..] is responsible for
# choosing the archive indexes to use for each job on each iteration... we cycle through
# all archives.
(
$cmd JOB=1:$this_num_jobs_nnet $dir/$lang/log/train.$x.JOB.log \
nnet-combine-egs-discriminative \
"ark:$this_degs_dir/degs.\$[((JOB-1+($x*$this_num_jobs_nnet))%$this_num_archives)+1].ark" ark:- \| \
nnet-train-discriminative$train_suffix --silence-phones=$this_silphonelist \
--criterion=$criterion --drop-frames=$drop_frames \
--boost=$boost --acoustic-scale=$acoustic_scale \
$dir/$lang/$x.mdl ark:- $dir/$lang/$[$x+1].JOB.mdl || exit 1;
nnets_list=$(for n in $(seq $this_num_jobs_nnet); do echo $dir/$lang/$[$x+1].$n.mdl; done)
# produce an average just within this language.
$cmd $dir/$lang/log/average.$x.log \
nnet-am-average $nnets_list $dir/$lang/$[$x+1].tmp.mdl || exit 1;
rm $nnets_list
) || touch $dir/.error &
done
wait
[ -f $dir/.error ] && echo "$0: error on pass $x" && exit 1
# apply the modify-learning-rates thing to the model for the zero'th language;
# we'll use the resulting learning rates for the other languages.
if $modify_learning_rates; then
$cmd $dir/log/modify_learning_rates.$x.log \
nnet-modify-learning-rates --retroactive=$retroactive \
--last-layer-factor=$last_layer_factor \
--first-layer-factor=$first_layer_factor \
$dir/0/$x.mdl $dir/0/$[$x+1].tmp.mdl $dir/0/$[$x+1].tmp.mdl || exit 1;
fi
nnets_list=$(for lang in $(seq 0 $[$num_lang-1]); do echo $dir/$lang/$[$x+1].tmp.mdl; done)
weights_csl=$(echo $num_jobs_nnet | sed 's/ /:/g') # get as colon separated list.
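# e.g. num_jobs_nnet="4 4" becomes weights_csl="4:4", so each language's
# averaged model is weighted by the number of jobs it was averaged over.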
# the next command produces the cross-language averaged model containing the
# final layer corresponding to language zero. Note, if we did modify-learning-rates,
# it will also have the modified learning rates.
$cmd $dir/log/average.$x.log \
nnet-am-average --weights=$weights_csl --skip-last-layer=true \
$nnets_list $dir/0/$[$x+1].mdl || exit 1;
# we'll transfer these learning rates to the other models.
learning_rates=$(nnet-am-info --print-learning-rates=true $dir/0/$[$x+1].mdl 2>/dev/null)
for lang in $(seq 1 $[$num_lang-1]); do
# the next command takes the averaged hidden parameters from language zero, and
# the last layer from language $lang. It's not really doing averaging.
# we use nnet-am-copy to transfer the learning rates from model zero.
$cmd $dir/$lang/log/combine_average.$x.log \
nnet-am-average --weights=0.0:1.0 --skip-last-layer=true \
$dir/$lang/$[$x+1].tmp.mdl $dir/0/$[$x+1].mdl - \| \
nnet-am-copy --learning-rates=$learning_rates - $dir/$lang/$[$x+1].mdl || exit 1;
done
$cleanup && rm $dir/*/$[$x+1].tmp.mdl
fi
x=$[$x+1]
done
for lang in $(seq 0 $[$num_lang-1]); do
rm $dir/$lang/final.mdl 2>/dev/null
ln -s $x.mdl $dir/$lang/final.mdl
epoch_final_iters=
for e in $(seq 0 $num_epochs); do
x=$[($e*$num_archives0)/$num_jobs_nnet0] # gives the iteration number.
ln -sf $x.mdl $dir/$lang/epoch$e.mdl
epoch_final_iters="$epoch_final_iters $x"
done
if $cleanup; then
echo "Removing most of the models for language $lang"
for x in `seq 0 $num_iters`; do
if ! echo $epoch_final_iters | grep -w $x >/dev/null; then
# if $x is not an epoch-final iteration..
rm $dir/$lang/$x.mdl 2>/dev/null
fi
done
fi
done
echo Done

View file

@@ -64,8 +64,6 @@ if [ $# != 3 ]; then
echo " --num-epochs <#epochs|15> # Number of epochs of training"
echo " # while reducing learning rate (determines #iterations, together"
echo " # with --samples-per-iter and --num-jobs-nnet)"
echo " --num-epochs-extra <#epochs-extra|5> # Number of extra epochs of training"
echo " # after learning rate fully reduced"
echo " --learning-rate-factor<factor|1.0> # Factor (e.g. 0.2) by which to change learning rate"
echo " # during the course of training"
echo " --num-threads <num-threads|16> # Number of parallel threads per job (will affect results"

View file

@@ -0,0 +1,351 @@
#!/bin/bash
# Copyright 2014 Johns Hopkins University (Author: Daniel Povey).
# Apache 2.0.
# This script further trains an already-existing neural network,
# given an existing model and an examples (egs/) directory.
# This version of the script expects an egs/ directory in the newer
# format, as created by get_egs2.sh.
#
# Begin configuration section.
cmd=run.pl
num_epochs=10 # Number of epochs of training; number of iterations is
# worked out from this.
num_iters_final=20 # Maximum number of final iterations to give to the
# optimization over the validation set.
learning_rate_factor=1.0 # You can use this to gradually decrease the learning
# rate during training (e.g. use 0.2); the initial
# learning rates are as specified in the model, but it
# will decrease slightly on each iteration to achieve
# this ratio.
combine=true # controls whether or not to do the final model combination.
combine_regularizer=1.0e-14 # Small regularizer so that parameters won't go crazy.
max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
# to the final 'combine' stage, but these models will themselves be averages of
# iteration-number ranges.
minibatch_size=128 # by default use a smallish minibatch size for neural net
# training; this controls instability which would otherwise
# be a problem with multi-threaded update. Note: it also
# interacts with the "preconditioned" update which generally
# works better with larger minibatch size, so it's not
# completely cost free.
shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
# on each iter. You could set it to 0 or to a large value for complete
# randomization, but this would both consume memory and cause spikes in
# disk I/O. Smaller is easier on disk and memory but less random. It's
# not a huge deal though, as samples are anyway randomized right at the start.
num_jobs_nnet=4
mix_up=0
stage=-5
num_threads=16
parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" # by default we use 16 threads; this lets the queue know.
# note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
cleanup=true
prior_subset_size=10000 # 10k samples per job, for computing priors. Should be
# more than enough.
num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
remove_egs=false
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: $0 [opts] <input-model> <egs-dir> <exp-dir>"
echo " e.g.: $0 exp/nnet4c/final.mdl exp/nnet4c/egs exp/nnet5c/"
echo "see also the older script update_nnet.sh which creates the egs itself"
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config file containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --num-epochs <#epochs|15> # Number of epochs of training"
echo " # while reducing learning rate (determines #iterations, together"
echo " # with --samples-per-iter and --num-jobs-nnet)"
echo " --num-jobs-nnet <#jobs|4> # Number of neural-net jobs to run in parallel"
echo " --learning-rate-factor<factor|1.0> # Factor (e.g. 0.2) by which to change learning rate"
echo " # during the course of training"
echo " --num-threads <num-threads|16> # Number of parallel threads per job (will affect results"
echo " # as well as speed; may interact with batch size; if you increase"
echo " # this, you may want to decrease the batch size."
echo " --parallel-opts <opts|\"-pe smp 16 -l ram_free=1G,mem_free=1G\"> # extra options to pass to e.g. queue.pl for processes that"
echo " # use multiple threads... note, you might have to reduce mem_free,ram_free"
echo " # versus your defaults, because it gets multiplied by the -pe smp argument."
echo " --minibatch-size <minibatch-size|128> # Size of minibatch to process (note: product with --num-threads"
echo " # should not get too large, e.g. >2k)."
echo " --num-iters-final <#iters|20> # Number of final iterations to give to nnet-combine-fast to "
echo " # interpolate parameters (the weights are learned with a validation set)"
echo " --mix-up <#mix|0> # If specified, add quasi-targets, analogous to a mixture of Gaussians vs."
echo " # single Gaussians. Only do this if not already mixed-up."
echo " --combine <true or false|true> # If true, do the final nnet-combine-fast stage."
echo " --stage <stage|-5> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
exit 1;
fi
input_mdl=$1
egs_dir=$2
dir=$3
# Check some files.
for f in $input_mdl $egs_dir/egs.1.ark; do
[ ! -f $f ] && echo "$0: expected file $f to exist." && exit 1;
done
mkdir -p $dir/log
# Copy some things from the directory where the input model is located, to the
# experimental directory, if they exist. These might be needed for things like
# decoding.
input_dir=$(dirname $input_mdl);
for f in tree splice_opts cmvn_opts final.mat; do
if [ -f $input_dir/$f ]; then
cp $input_dir/$f $dir/
fi
done
frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }
# num_archives_expanded considers each separate label-position from
# 0..frames_per_eg-1 to be a separate archive.
num_archives_expanded=$[$num_archives*$frames_per_eg]
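# For example (hypothetical egs dir): num_archives=10 and frames_per_eg=8 give
# num_archives_expanded = 10*8 = 80.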
if [ $num_jobs_nnet -gt $num_archives_expanded ]; then
echo "$0: --num-jobs-nnet cannot exceed num-archives*frames-per-eg which is $num_archives_expanded"
echo "$0: setting --num-jobs-nnet to $num_archives_expanded"
num_jobs_nnet=$num_archives_expanded
fi
# set num_iters so that as close as possible, we process the data $num_epochs
# times, i.e. $num_iters*$num_jobs_nnet == $num_epochs*$num_archives_expanded
num_iters=$[($num_epochs*$num_archives_expanded)/$num_jobs_nnet]
echo "$0: Will train for $num_epochs epochs = $num_iters iterations"
per_iter_learning_rate_factor=$(perl -e "print ($learning_rate_factor ** (1.0 / $num_iters));")
mix_up_iter=$[$num_iters/2]
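# Continuing the made-up example above: num_epochs=10 and num_jobs_nnet=4 give
# num_iters = (10*80)/4 = 200 and mix_up_iter=100; a learning_rate_factor of
# 0.2 would then give a per-iteration factor of 0.2^(1/200), roughly 0.992.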
if [ $num_threads -eq 1 ]; then
parallel_suffix="-simple" # this enables us to use GPU code if
# we have just one thread.
parallel_train_opts=
if ! cuda-compiled; then
echo "$0: WARNING: you are running with one thread but you have not compiled"
echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
fi
else
parallel_suffix="-parallel"
parallel_train_opts="--num-threads=$num_threads"
fi
approx_iters_per_epoch=$[$num_iters/$num_epochs]
# First work out how many models we want to combine over in the final
# nnet-combine-fast invocation. This equals
# min(max(max_models_combine, iters_per_epoch),
# 2/3 * iters_after_mixup)
num_models_combine=$max_models_combine
if [ $num_models_combine -lt $approx_iters_per_epoch ]; then
num_models_combine=$approx_iters_per_epoch
fi
iters_after_mixup_23=$[(($num_iters-$mix_up_iter-1)*2)/3]
if [ $num_models_combine -gt $iters_after_mixup_23 ]; then
num_models_combine=$iters_after_mixup_23
fi
first_model_combine=$[$num_iters-$num_models_combine+1]
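# Worked example (same made-up numbers): approx_iters_per_epoch = 200/10 = 20,
# iters_after_mixup_23 = ((200-100-1)*2)/3 = 66, so num_models_combine stays at
# the default 20 and first_model_combine = 200-20+1 = 181.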
cp $input_mdl $dir/0.mdl || exit 1;
x=0
while [ $x -lt $num_iters ]; do
if [ $x -ge 0 ] && [ $stage -le $x ]; then
# Set off jobs doing some diagnostics, in the background.
$cmd $dir/log/compute_prob_valid.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
$cmd $dir/log/progress.$x.log \
nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
fi
echo "Training neural net (pass $x)"
rm $dir/.error 2>/dev/null
( # this sub-shell is so that when we "wait" below,
# we only wait for the training jobs that we just spawned,
# not the diagnostic jobs that we spawned above.
# We can't easily use a single parallel SGE job to do the main training,
# because the computation of which archive and which --frame option
# to use for each job is a little complex, so we spawn each one separately.
for n in $(seq $num_jobs_nnet); do
k=$[$x*$num_jobs_nnet + $n - 1]; # k is a zero-based index that we'll derive
# the other indexes from.
archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame
# index; this increases more slowly than the archive index because the
# same archive with different frame indexes will give similar gradients,
# so we want to separate them in time.
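# Worked example (same made-up sizes): num_jobs_nnet=4, num_archives=10,
# frames_per_eg=8, x=3 and n=2 give k = 3*4+2-1 = 13, archive = (13%10)+1 = 4
# and frame = (13/10)%8 = 1.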
$cmd $parallel_opts $dir/log/train.$x.$n.log \
nnet-train$parallel_suffix $parallel_train_opts \
--minibatch-size=$minibatch_size --srand=$x $dir/$x.mdl \
"ark:nnet-copy-egs --frame=$frame ark:$egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \
$dir/$[$x+1].$n.mdl || touch $dir/.error &
done
wait
)
# the error message below is not that informative, but $cmd will
# have printed a more specific one.
[ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;
nnets_list=
for n in `seq 1 $num_jobs_nnet`; do
nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
done
$cmd $dir/log/average.$x.log \
nnet-am-average $nnets_list - \| \
nnet-am-copy --learning-rate-factor=$per_iter_learning_rate_factor - $dir/$[$x+1].mdl || exit 1;
if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
# mix up.
echo Mixing up from $num_leaves to $mix_up components
$cmd $dir/log/mix_up.$x.log \
nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
$dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
fi
rm $nnets_list
fi
x=$[$x+1]
done
if [ $stage -le $num_iters ]; then
echo "Doing final combination to produce final.mdl"
# Now do combination.
nnets_list=()
# the if..else..fi statement below sets 'nnets_list'.
if [ $max_models_combine -lt $num_models_combine ]; then
# The number of models to combine is too large, e.g. > 20. In this case,
# each argument to nnet-combine-fast will be an average of multiple models.
cur_offset=0 # current offset from first_model_combine.
for n in $(seq $max_models_combine); do
next_offset=$[($n*$num_models_combine)/$max_models_combine]
sub_list=""
for o in $(seq $cur_offset $[$next_offset-1]); do
iter=$[$first_model_combine+$o]
mdl=$dir/$iter.mdl
[ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
sub_list="$sub_list $mdl"
done
nnets_list[$[$n-1]]="nnet-am-average $sub_list - |"
cur_offset=$next_offset
done
else
nnets_list=
for n in $(seq 0 $[num_models_combine-1]); do
iter=$[$first_model_combine+$n]
mdl=$dir/$iter.mdl
[ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
nnets_list[$n]=$mdl
done
fi
# Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
# if there are many models it can give out-of-memory error; set num-threads to 8
# to speed it up (this isn't ideal...)
num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
[ $mb -gt 512 ] && mb=512
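# e.g. if combine.egs held 4000 examples and combine_num_threads=8, then
# mb = (4000+8-1)/8 = 500, below the 512 cap.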
# Setting --initial-model to a large value makes it initialize the combination
# with the average of all the models. It's important not to start with a
# single model, or, due to the invariance to scaling that these nonlinearities
# give us, we get zero diagonal entries in the fisher matrix that
# nnet-combine-fast uses for scaling, which after flooring and inversion, has
# the effect that the initial model chosen gets much higher learning rates
# than the others. This prevents the optimization from working well.
$cmd $combine_parallel_opts $dir/log/combine.log \
nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
--num-threads=$combine_num_threads \
--verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
$dir/final.mdl || exit 1;
# Normalize stddev for affine or block affine layers that are followed by a
# pnorm layer and then a normalize layer.
$cmd $dir/log/normalize.log \
nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1;
# Compute the probability of the final, combined model with
# the same subset we used for the previous compute_probs, as the
# different subsets will lead to different probs.
$cmd $dir/log/compute_prob_valid.final.log \
nnet-compute-prob $dir/final.mdl ark:$egs_dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.final.log \
nnet-compute-prob $dir/final.mdl ark:$egs_dir/train_diagnostic.egs &
fi
if [ $stage -le $[$num_iters+1] ]; then
echo "Getting average posterior for purposes of adjusting the priors."
# Note: this just uses CPUs, using a smallish subset of data.
rm $dir/post.$x.*.vec 2>/dev/null
$cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
nnet-copy-egs --frame=random --srand=JOB ark:$egs_dir/egs.1.ark ark:- \| \
nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;
sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear.
$cmd $dir/log/vector_sum.$x.log \
vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;
rm $dir/post.$x.*.vec;
echo "Re-adjusting priors based on computed posteriors"
$cmd $dir/log/adjust_priors.final.log \
nnet-adjust-priors $dir/final.mdl $dir/post.$x.vec $dir/final.mdl || exit 1;
fi
if [ ! -f $dir/final.mdl ]; then
echo "$0: $dir/final.mdl does not exist."
# we don't want to clean up if the training didn't succeed.
exit 1;
fi
sleep 2
echo Done
if $cleanup; then
echo Cleaning up data
if $remove_egs && [[ $egs_dir =~ $dir/egs* ]]; then
steps/nnet2/remove_egs.sh $egs_dir
fi
echo Removing most of the models
for x in `seq 0 $num_iters`; do
if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then
# delete all but every 100th model; don't delete the ones which combine to form the final model.
rm $dir/$x.mdl
fi
done
fi

View file

@ -0,0 +1,543 @@
#!/bin/bash
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey).
# 2013 Xiaohui Zhang
# 2013 Guoguo Chen
# 2014 Vimal Manohar
# 2014 Vijayaditya Peddinti
# Apache 2.0.
# train_multilang2.sh is for multi-language training of neural nets. It
# takes multiple egs directories which must be created by get_egs2.sh, and the
# corresponding alignment directories (only needed for training the transition
# models).
#
# This script requires you to supply a neural net partially trained for the 1st
# language, by one of the regular training scripts, to be used as the initial
# neural net (for use by other languages, we'll discard the last layer); it
# should not have been subject to "mix-up" (since this script does mix-up), or
# combination (since that would push the parameters into a range too large
# to be compatible with our normal learning rate schedules).
# Begin configuration section.
cmd=run.pl
num_epochs=10 # Number of epochs of training (for first language);
# the number of iterations is worked out from this.
initial_learning_rate=0.04
final_learning_rate=0.004
minibatch_size=128 # by default use a smallish minibatch size for neural net
# training; this controls instability which would otherwise
# be a problem with multi-threaded update.
num_jobs_nnet="2 2" # Number of neural net jobs to run in parallel. This option
# is passed to get_egs.sh. Array must be same length
# as number of separate languages.
num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
# to the final 'combine' stage, but these models will themselves be averages of
# iteration-number ranges.
shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
# on each iter. You could set it to 0 or to a large value for complete
# randomization, but this would both consume memory and cause spikes in
# disk I/O. Smaller is easier on disk and memory but less random. It's
# not a huge deal though, as samples are anyway randomized right at the start.
# (the point of this is to get data in different minibatches on different iterations,
# since in the preconditioning method, 2 samples in the same minibatch can
# affect each other's gradients.)
prior_subset_size=10000 # 10k samples per job, for computing priors. Should be
# more than enough.
stage=-4
mix_up="0 0" # Number of components to mix up to (should be > #tree leaves, if
# specified.) An array, one per language.
num_threads=16 # default suitable for CPU-based training
parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" # default suitable for CPU-based training.
# by default we use 16 threads; this lets the queue know.
# note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage.
cleanup=false # while testing, leaving cleanup=false.
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# -lt 6 -o $[$#%2] -ne 0 ]; then
# num-args must be at least 6 and must be even.
echo "Usage: $0 [opts] <ali1> <egs1> <ali2> <egs2> ... <aliN> <egsN> <input-model> <exp-dir>"
echo " e.g.: $0 data/train exp/tri6_ali exp/tri6_egs exp_lang2/tri6_ali exp_lang2/tri6_egs exp/dnn6a/10.mdl exp/tri6_multilang"
echo ""
echo "Note: the first egs/ali should correspond to the language that you really want; this"
echo "only affects how the num-epochs is computed, and which model we link to final.mdl."
echo ""
echo "The --num-jobs-nnet should be an array saying how many jobs to allocate to each language,"
echo "e.g. --num-jobs-nnet '2 4'"
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config file containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --num-epochs <#epochs|15> # Number of epochs of training (figured from 1st corpus)"
echo " --initial-learning-rate <initial-learning-rate|0.02> # Learning rate at start of training, e.g. 0.02 for small"
echo " # data, 0.01 for large data"
echo " --final-learning-rate <final-learning-rate|0.004> # Learning rate at end of training, e.g. 0.004 for small"
echo " # data, 0.001 for large data"
echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers"
echo " --mix-up <#pseudo-gaussians|0> # Can be used to have multiple targets in final output layer,"
echo " # per context-dependent state. Try a number several times #states."
echo " --num-jobs-nnet <num-jobs|8> # Number of parallel jobs to use for main neural net"
echo " # training (will affect results as well as speed; try 8, 16)"
echo " # Note: if you increase this, you may want to also increase"
echo " # the learning rate."
echo " --num-threads <num-threads|16> # Number of parallel threads per job (will affect results"
echo " # as well as speed; may interact with batch size; if you increase"
echo " # this, you may want to decrease the batch size."
echo " --parallel-opts <opts|\"-pe smp 16 -l ram_free=1G,mem_free=1G\"> # extra options to pass to e.g. queue.pl for processes that"
echo " # use multiple threads... note, you might have to reduce mem_free,ram_free"
echo " # versus your defaults, because it gets multiplied by the -pe smp argument."
echo " --minibatch-size <minibatch-size|128> # Size of minibatch to process (note: product with --num-threads"
echo " # should not get too large, e.g. >2k)."
echo " --splice-indexes <string|layer0/-4:-3:-2:-1:0:1:2:3:4> "
echo " # Frame indices used for each splice layer."
echo " # Format : layer<hidden_layer_index>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
echo " # (note: we splice processed, typically 40-dimensional frames"
echo " --lda-dim <dim|''> # Dimension to reduce spliced features to with LDA"
echo " --stage <stage|-4> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
exit 1;
fi
argv=("$@")
num_args=$#
num_lang=$[($num_args-2)/2]
dir=${argv[$num_args-1]}
input_model=${argv[$num_args-2]}
[ ! -f $input_model ] && echo "$0: Input model $input_model does not exist" && exit 1;
mkdir -p $dir/log
num_jobs_nnet_array=($num_jobs_nnet)
! [ "${#num_jobs_nnet_array[@]}" -eq "$num_lang" ] && \
echo "$0: --num-jobs-nnet option must have size equal to the number of languages" && exit 1;
mix_up_array=($mix_up)
! [ "${#mix_up_array[@]}" -eq "$num_lang" ] && \
echo "$0: --mix-up option must have size equal to the number of languages" && exit 1;
# Language index starts from 0.
for lang in $(seq 0 $[$num_lang-1]); do
alidir[$lang]=${argv[$lang*2]}
egs_dir[$lang]=${argv[$lang*2+1]}
for f in ${egs_dir[$lang]}/info/frames_per_eg ${egs_dir[$lang]}/egs.1.ark ${alidir[$lang]}/ali.1.gz ${alidir[$lang]}/tree; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
mkdir -p $dir/$lang/log
cp ${alidir[$lang]}/tree $dir/$lang/ || exit 1;
for f in ${egs_dir[$lang]}/{final.mat,cmvn_opts,splice_opts}; do
# Copy any of these files that exist.
cp $f $dir/$lang/ 2>/dev/null
done
done
for x in final.mat cmvn_opts splice_opts; do
if [ -f $dir/0/$x ]; then
for lang in $(seq 1 $[$num_lang-1]); do
if ! cmp $dir/0/$x $dir/$lang/$x; then
echo "$0: warning: files $dir/0/$x and $dir/$lang/$x are not identical."
fi
done
fi
done
# the input model is supposed to correspond to the first language.
nnet-am-copy --learning-rate=$initial_learning_rate $input_model $dir/0/0.mdl
if nnet-am-info --print-args=false $dir/0/0.mdl | grep SumGroupComponent 2>/dev/null; then
if [ "${mix_up_array[0]}" != "0" ]; then
echo "$0: Your input model already has mixtures, but you are asking to mix it up."
echo " ... best to use a model without mixtures as input. (e.g., earlier iter)."
exit 1;
fi
fi
if [ $stage -le -4 ]; then
echo "$0: initializing models for other languages"
for lang in $(seq 1 $[$num_lang-1]); do
# create the initial models for the other languages.
$cmd $dir/$lang/log/reinitialize.log \
nnet-am-reinitialize $input_model ${alidir[$lang]}/final.mdl $dir/$lang/0.mdl || exit 1;
done
fi
if [ $stage -le -3 ]; then
echo "Training transition probabilities and setting priors"
for lang in $(seq 0 $[$num_lang-1]); do
$cmd $dir/$lang/log/train_trans.log \
nnet-train-transitions $dir/$lang/0.mdl "ark:gunzip -c ${alidir[$lang]}/ali.*.gz|" $dir/$lang/0.mdl \
|| exit 1;
done
fi
# Work out the number of iterations... the number of epochs refers to the
# first language (language zero) and this, together with the num-jobs-nnet for
# that language and details of the egs, determine the number of epochs.
frames_per_eg0=$(cat ${egs_dir[0]}/info/frames_per_eg) || exit 1;
num_archives0=$(cat ${egs_dir[0]}/info/num_archives) || exit 1;
# num_archives_expanded considers each separate label-position from
# 0..frames_per_eg-1 to be a separate archive.
num_archives_expanded0=$[$num_archives0*$frames_per_eg0]
if [ ${num_jobs_nnet_array[0]} -gt $num_archives_expanded0 ]; then
echo "$0: --num-jobs-nnet[0] cannot exceed num-archives*frames-per-eg which is $num_archives_expanded"
exit 1;
fi
# set num_iters so that as close as possible, we process the data $num_epochs
# times, i.e. $num_iters*$num_jobs_nnet == $num_epochs*$num_archives_expanded
num_iters=$[($num_epochs*$num_archives_expanded0)/${num_jobs_nnet_array[0]}]
echo "$0: Will train for $num_epochs epochs (of language 0) = $num_iters iterations"
! [ $num_iters -gt 0 ] && exit 1;
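# Worked example with hypothetical values: if frames_per_eg0=8, num_archives0=4
# and num_jobs_nnet[0]=2, then num_archives_expanded0=32 and with num_epochs=10
# we get num_iters=(10*32)/2=160.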
# Work out the number of epochs we train for on the other languages... this is
# just informational.
for lang in $(seq 1 $[$num_lang-1]); do
frames_per_eg=$(cat ${egs_dir[$lang]}/info/frames_per_eg) || exit 1;
num_archives=$(cat ${egs_dir[$lang]}/info/num_archives) || exit 1;
num_archives_expanded=$[$num_archives*$frames_per_eg]
num_epochs=$[($num_iters*${num_jobs_nnet_array[$lang]})/$num_archives_expanded]
echo "$0: $num_iters iterations is approximately $num_epochs epochs for language $lang"
done
# do any mixing-up after half the iters.
mix_up_iter=$[$num_iters/2]
if [ $num_threads -eq 1 ]; then
parallel_suffix="-simple" # this enables us to use GPU code if
# we have just one thread.
parallel_train_opts=
if ! cuda-compiled; then
echo "$0: WARNING: you are running with one thread but you have not compiled"
echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
fi
else
parallel_suffix="-parallel"
parallel_train_opts="--num-threads=$num_threads"
fi
approx_iters_per_epoch=$[$num_iters/$num_epochs]
# First work out how many models we want to combine over in the final
# nnet-combine-fast invocation. This equals
# min(max(max_models_combine, iters_per_epoch),
# 2/3 * iters_after_mixup).
# We use the same number of iterations for all languages, even though it is
# worked out from the first language only.
num_models_combine=$max_models_combine
if [ $num_models_combine -lt $approx_iters_per_epoch ]; then
num_models_combine=$approx_iters_per_epoch
fi
iters_after_mixup_23=$[(($num_iters-$mix_up_iter-1)*2)/3]
if [ $num_models_combine -gt $iters_after_mixup_23 ]; then
num_models_combine=$iters_after_mixup_23
fi
first_model_combine=$[$num_iters-$num_models_combine+1]
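# Worked example (continuing the hypothetical numbers above): with num_iters=160
# and num_epochs=10, approx_iters_per_epoch=16, so num_models_combine stays at
# max_models_combine=20; mix_up_iter=80 gives iters_after_mixup_23=52, which is
# not a further limit, so we would combine models 141.mdl through 160.mdl
# (first_model_combine=160-20+1=141).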
x=0
while [ $x -lt $num_iters ]; do
if [ $x -ge 0 ] && [ $stage -le $x ]; then
for lang in $(seq 0 $[$num_lang-1]); do
# Set off jobs doing some diagnostics, in the background.
$cmd $dir/$lang/log/compute_prob_valid.$x.log \
nnet-compute-prob $dir/$lang/$x.mdl ark:${egs_dir[$lang]}/valid_diagnostic.egs &
$cmd $dir/$lang/log/compute_prob_train.$x.log \
nnet-compute-prob $dir/$lang/$x.mdl ark:${egs_dir[$lang]}/train_diagnostic.egs &
if [ $x -gt 0 ] && [ ! -f $dir/$lang/log/mix_up.$[$x-1].log ]; then
$cmd $dir/$lang/log/progress.$x.log \
nnet-show-progress --use-gpu=no $dir/$lang/$[$x-1].mdl $dir/$lang/$x.mdl \
ark:${egs_dir[$lang]}/train_diagnostic.egs '&&' \
nnet-am-info $dir/$lang/$x.mdl &
fi
done
echo "Training neural net (pass $x)"
if [ $x -eq 0 ]; then
# on iteration zero, use a smaller minibatch size and only one quarter of the
# normal amount of training data: this will help, respectively, to ensure stability
# and to stop the models from moving so far that averaging hurts.
this_minibatch_size=$[$minibatch_size/2];
this_keep_proportion=0.25
else
this_minibatch_size=$minibatch_size
this_keep_proportion=1.0
# use half the examples on iteration 1, out of a concern that the model-averaging
# might not work if we move too far before getting close to convergence.
[ $x -eq 1 ] && this_keep_proportion=0.5
fi
rm $dir/.error 2>/dev/null
( # this sub-shell is so that when we "wait" below,
# we only wait for the training jobs that we just spawned,
# not the diagnostic jobs that we spawned above.
# We can't easily use a single parallel SGE job to do the main training,
# because the computation of which archive and which --frame option
# to use for each job is a little complex, so we spawn each one separately.
for lang in $(seq 0 $[$num_lang-1]); do
this_num_jobs_nnet=${num_jobs_nnet_array[$lang]}
this_frames_per_eg=$(cat ${egs_dir[$lang]}/info/frames_per_eg) || exit 1;
this_num_archives=$(cat ${egs_dir[$lang]}/info/num_archives) || exit 1;
! [ $this_num_jobs_nnet -gt 0 -a $this_frames_per_eg -gt 0 -a $this_num_archives -gt 0 ] && exit 1
for n in $(seq $this_num_jobs_nnet); do
k=$[$x*$this_num_jobs_nnet + $n - 1]; # k is a zero-based index that we'll derive
# the other indexes from.
archive=$[($k%$this_num_archives)+1]; # work out the 1-based archive index.
frame=$[(($k/$this_num_archives)%$this_frames_per_eg)];
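# Worked example with hypothetical values: if this_num_jobs_nnet=2,
# this_num_archives=4 and this_frames_per_eg=8, then on iteration x=3 the job
# n=2 gets k=3*2+2-1=7, archive=(7%4)+1=4 and frame=(7/4)%8=1, so successive
# iterations cycle through all archive/frame combinations.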
$cmd $parallel_opts $dir/$lang/log/train.$x.$n.log \
nnet-train$parallel_suffix $parallel_train_opts \
--minibatch-size=$this_minibatch_size --srand=$x $dir/$lang/$x.mdl \
"ark:nnet-copy-egs --keep-proportion=$this_keep_proportion --frame=$frame ark:${egs_dir[$lang]}/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \
$dir/$lang/$[$x+1].$n.mdl || touch $dir/.error &
done
done
wait
)
# the error message below is not that informative, but $cmd will
# have printed a more specific one.
[ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;
learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters $initial_learning_rate $final_learning_rate`;
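# The learning rate decays geometrically from initial_learning_rate to
# final_learning_rate over num_iters iterations, i.e.
# lr(i) = initial * (final/initial)^(i/num_iters).
# For illustration, with the defaults 0.04 and 0.004 and a hypothetical
# num_iters=160, the rate at iteration 80 is 0.04*sqrt(0.1) ~= 0.0126.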
(
# First average within each language. Use a sub-shell so "wait" won't
# wait for the diagnostic jobs.
for lang in $(seq 0 $[$num_lang-1]); do
this_num_jobs_nnet=${num_jobs_nnet_array[$lang]}
nnets_list=$(for n in `seq 1 $this_num_jobs_nnet`; do echo $dir/$lang/$[$x+1].$n.mdl; done)
# average the output of the different jobs.
$cmd $dir/$lang/log/average.$x.log \
nnet-am-average $nnets_list - \| \
nnet-am-copy --learning-rate=$learning_rate - $dir/$lang/$[$x+1].tmp.mdl || touch $dir/.error &
done
wait
[ -f $dir/.error ] && echo "$0: error averaging models on iteration $x of training" && exit 1;
# Remove the models we just averaged.
for lang in $(seq 0 $[$num_lang-1]); do
this_num_jobs_nnet=${num_jobs_nnet_array[$lang]}
for n in `seq 1 $this_num_jobs_nnet`; do rm $dir/$lang/$[$x+1].$n.mdl; done
done
)
nnets_list=$(for lang in $(seq 0 $[$num_lang-1]); do echo $dir/$lang/$[$x+1].tmp.mdl; done)
weights_csl=$(echo $num_jobs_nnet | sed 's/ /:/g') # get as colon separated list.
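# e.g. (illustration): --num-jobs-nnet "2 4" gives weights_csl=2:4, so language 0
# and language 1 are weighted by their respective numbers of jobs in the average.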
# the next command produces the cross-language averaged model containing the
# final layer corresponding to language zero.
$cmd $dir/log/average.$x.log \
nnet-am-average --weights=$weights_csl --skip-last-layer=true \
$nnets_list $dir/0/$[$x+1].mdl || exit 1;
for lang in $(seq 1 $[$num_lang-1]); do
# the next command takes the averaged hidden parameters from language zero, and
# the last layer from language $lang. It's not really doing averaging.
$cmd $dir/$lang/log/combine_average.$x.log \
nnet-am-average --weights=0.0:1.0 --skip-last-layer=true \
$dir/$lang/$[$x+1].tmp.mdl $dir/0/$[$x+1].mdl $dir/$lang/$[$x+1].mdl || exit 1;
done
$cleanup && rm $dir/*/$[$x+1].tmp.mdl
if [ $x -eq $mix_up_iter ]; then
for lang in $(seq 0 $[$num_lang-1]); do
this_mix_up=${mix_up_array[$lang]}
if [ $this_mix_up -gt 0 ]; then
echo "$0: for language $lang, mixing up to $this_mix_up components"
$cmd $dir/$lang/log/mix_up.$x.log \
nnet-am-mixup --min-count=10 --num-mixtures=$this_mix_up \
$dir/$lang/$[$x+1].mdl $dir/$lang/$[$x+1].mdl || exit 1;
fi
done
fi
# Now average across languages.
rm $nnets_list
for lang in $(seq 0 $[$num_lang-1]); do # check the new models exist and clean up older ones.
[ ! -f $dir/$lang/$[$x+1].mdl ] && echo "No such file $dir/$lang/$[$x+1].mdl" && exit 1;
if [ -f $dir/$lang/$[$x-1].mdl ] && $cleanup && \
[ $[($x-1)%100] -ne 0 ] && [ $[$x-1] -lt $first_model_combine ]; then
rm $dir/$lang/$[$x-1].mdl
fi
done
fi
x=$[$x+1]
done
if [ $stage -le $num_iters ]; then
echo "Doing combination to produce final models"
rm $dir/.error 2>/dev/null
for lang in $(seq 0 $[$num_lang-1]); do
nnets_list=()
# the if..else..fi statement below sets 'nnets_list'.
if [ $max_models_combine -lt $num_models_combine ]; then
# The number of models to combine is too large, e.g. > 20. In this case,
# each argument to nnet-combine-fast will be an average of multiple models.
cur_offset=0 # current offset from first_model_combine.
for n in $(seq $max_models_combine); do
next_offset=$[($n*$num_models_combine)/$max_models_combine]
sub_list=""
for o in $(seq $cur_offset $[$next_offset-1]); do
iter=$[$first_model_combine+$o]
mdl=$dir/$lang/$iter.mdl
[ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
sub_list="$sub_list $mdl"
done
nnets_list[$[$n-1]]="nnet-am-average $sub_list - |"
cur_offset=$next_offset
done
else
nnets_list=
for n in $(seq 0 $[num_models_combine-1]); do
iter=$[$first_model_combine+$n]
mdl=$dir/$lang/$iter.mdl
[ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
nnets_list[$n]=$mdl
done
fi
# Below, use --use-gpu=no to prevent nnet-combine-fast from using a GPU, since
# with many models it can run out of memory; set num-threads
# to 8 to speed it up (this isn't ideal...)
num_egs=`nnet-copy-egs ark:${egs_dir[$lang]}/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
[ $mb -gt 512 ] && mb=512
# Setting --initial-model to a large value makes it initialize the combination
# with the average of all the models. It's important not to start with a
# single model, or, due to the invariance to scaling that these nonlinearities
# give us, we get zero diagonal entries in the Fisher matrix that
# nnet-combine-fast uses for scaling, which after flooring and inversion, has
# the effect that the initial model chosen gets much higher learning rates
# than the others. This prevents the optimization from working well.
$cmd $combine_parallel_opts $dir/$lang/log/combine.log \
nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
--num-threads=$combine_num_threads \
--verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:${egs_dir[$lang]}/combine.egs \
- \| nnet-normalize-stddev - $dir/$lang/final.mdl || touch $dir/.error &
done
wait
[ -f $dir/.error ] && echo "$0: error doing model combination" && exit 1;
fi
if [ $stage -le $[$num_iters+1] ]; then
for lang in $(seq 0 $[$num_lang-1]); do
# Run the diagnostics for the final models.
$cmd $dir/$lang/log/compute_prob_valid.final.log \
nnet-compute-prob $dir/$lang/final.mdl ark:${egs_dir[$lang]}/valid_diagnostic.egs &
$cmd $dir/$lang/log/compute_prob_train.final.log \
nnet-compute-prob $dir/$lang/final.mdl ark:${egs_dir[$lang]}/train_diagnostic.egs &
done
wait
fi
if [ $stage -le $[$num_iters+2] ]; then
# Note: this just uses CPUs, using a smallish subset of data.
for lang in $(seq 0 $[$num_lang-1]); do
echo "$0: Getting average posterior for purposes of adjusting the priors (language $lang)."
rm $dir/$lang/.error 2>/dev/null
rm $dir/$lang/post.$x.*.vec 2>/dev/null
$cmd JOB=1:$num_jobs_compute_prior $dir/$lang/log/get_post.JOB.log \
nnet-copy-egs --frame=random --srand=JOB ark:${egs_dir[$lang]}/egs.1.ark ark:- \| \
nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
nnet-compute-from-egs "nnet-to-raw-nnet $dir/$lang/final.mdl -|" ark:- ark:- \| \
matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/$lang/post.JOB.vec || touch $dir/$lang/.error &
done
echo "$0: ... waiting for jobs for all languages to complete."
wait
sleep 3; # make sure there is time for $dir/$lang/post.$x.*.vec to appear.
for lang in $(seq 0 $[$num_lang-1]); do
[ -f $dir/$lang/.error ] && \
echo "$0: error getting posteriors for adjusting the priors for language $lang" && exit 1;
$cmd $dir/$lang/log/vector_sum.log \
vector-sum $dir/$lang/post.*.vec $dir/$lang/post.vec || exit 1;
rm $dir/$lang/post.*.vec;
echo "Re-adjusting priors based on computed posteriors for language $lang"
$cmd $dir/$lang/log/adjust_priors.final.log \
nnet-adjust-priors $dir/$lang/final.mdl $dir/$lang/post.vec $dir/$lang/final.mdl || exit 1;
done
fi
for lang in $(seq 0 $[$num_lang-1]); do
if [ ! -f $dir/$lang/final.mdl ]; then
echo "$0: $dir/final.mdl does not exist."
# we don't want to clean up if the training didn't succeed.
exit 1;
fi
done
sleep 2
echo Done
if $cleanup; then
echo Cleaning up data
for lang in $(seq 0 $[$num_lang-1]); do
if [[ ${egs_dir[$lang]} =~ $dir/egs* ]]; then
steps/nnet2/remove_egs.sh ${egs_dir[$lang]}
fi
echo "Removing most of the models for language $lang"
for x in `seq 0 $num_iters`; do
if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$lang/$x.mdl ]; then
# delete all but every 100th model; don't delete the ones which combine to form the final model.
rm $dir/$lang/$x.mdl
fi
done
done
fi
exit 0

View file

@ -140,8 +140,8 @@ if [ $# != 4 ]; then
echo " # Frame indices used for each splice layer."
echo " # Format : layer<hidden_layer_index>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
echo " # (note: we splice processed, typically 40-dimensional frames"
echo " --lda-dim <dim|250> # Dimension to reduce spliced features to with LDA"
echo " --realign-epochs <list-of-epochs|\"\"> # A list of space-separated epoch indices the beginning of which"
echo " --lda-dim <dim|''> # Dimension to reduce spliced features to with LDA"
echo " --realign-epochs <list-of-epochs|''> # A list of space-separated epoch indices the beginning of which"
echo " # realignment is to be done"
echo " --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
echo " --align-use-gpu (yes/no) # specify is gpu is to be used for realignment"

View file

@ -133,8 +133,8 @@ if [ $# != 4 ]; then
echo " # Frame indices used for each splice layer."
echo " # Format : layer<hidden_layer_index>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
echo " # (note: we splice processed, typically 40-dimensional frames"
echo " --lda-dim <dim|250> # Dimension to reduce spliced features to with LDA"
echo " --realign-epochs <list-of-epochs|\"\"> # A list of space-separated epoch indices the beginning of which"
echo " --lda-dim <dim|''> # Dimension to reduce spliced features to with LDA"
echo " --realign-epochs <list-of-epochs|''> # A list of space-separated epoch indices the beginning of which"
echo " # realignment is to be done"
echo " --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
echo " --align-use-gpu (yes/no) # specify is gpu is to be used for realignment"

View file

@ -0,0 +1,80 @@
#!/bin/bash
# Copyright 2012 Brno University of Technology (Author: Karel Vesely)
# 2013-2014 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Computes training alignments using DNN. This takes as input a directory
# prepared as for online-nnet2 decoding (e.g. by
# steps/online/nnet2/prepare_online_decoding.sh), and it computes the features
# directly from the wav.scp instead of relying on features dumped on disk;
# this avoids the hassle of having to dump suitably matched features.
# Begin configuration section.
nj=4
cmd=run.pl
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
iter=final
use_gpu=no
echo "$0 $@" # Print the command line for logging
[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "Usage: $0 <data-dir> <lang-dir> <src-dir> <align-dir>"
echo "e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_ali"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
data=$1
lang=$2
srcdir=$3
dir=$4
oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
for f in $srcdir/tree $srcdir/${iter}.mdl $data/wav.scp $lang/L.fst \
$srcdir/conf/online_nnet2_decoding.conf; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
cp $srcdir/{tree,${iter}.mdl} $dir || exit 1;
grep -v '^--endpoint' $srcdir/conf/online_nnet2_decoding.conf >$dir/feature.conf || exit 1;
if [ -f $data/segments ]; then
# note: in the feature extraction, because the program online2-wav-dump-features is sensitive to the
# previous utterances within a speaker, we do the filtering after extracting the features.
echo "$0 [info]: segments file exists: using that."
feats="ark,s,cs:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt ark,s,cs:- ark:- |"
else
echo "$0 [info]: no segments file exists, using wav.scp."
feats="ark,s,cs:online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt scp:$sdata/JOB/wav.scp ark:- |"
fi
echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir"
tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
$cmd JOB=1:$nj $dir/log/align.JOB.log \
compile-train-graphs $dir/tree $srcdir/${iter}.mdl $lang/L.fst "$tra" ark:- \| \
nnet-align-compiled $scale_opts --use-gpu=$use_gpu --beam=$beam --retry-beam=$retry_beam \
$srcdir/${iter}.mdl ark:- "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
echo "$0: done aligning data."

View file

@ -78,4 +78,8 @@ for f in feats.scp segments wav.scp reco2file_and_channel text stm glm ctm; do
done
echo "$0: copied data from $srcdir to $destdir, with --utts-per-spk-max $utts_per_spk_max"
utils/validate_data_dir.sh $destdir
opts=
[ ! -f $srcdir/feats.scp ] && opts="--no-feats"
[ ! -f $srcdir/text ] && opts="$opts --no-text"
utils/validate_data_dir.sh $opts $destdir

View file

@ -8,7 +8,7 @@
# it uses the program online2-wav-dump-feature to do all parts of feature
# extraction: MFCC/PLP/fbank, possibly plus pitch, plus iVectors. This script
# is intended mostly for cross-system training for online decoding, where you
# initialize the nnet from an existing, larger systme.
# initialize the nnet from an existing, larger system.
# Begin configuration section.
@ -69,7 +69,7 @@ dir=$4
mdl=$online_nnet_dir/final.mdl # only needed for left and right context.
feature_conf=$online_nnet_dir/conf/online_nnet2_decoding.conf
for f in $data/feats.scp $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $feature_conf $mdl; do
for f in $data/wav.scp $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $feature_conf $mdl; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
@ -100,7 +100,7 @@ if [ -f $data/utt2uniq ]; then
fi
awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid/uttlist | \
utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset/uttlist || exit 1;
utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset/uttlist || exit 1;
for subdir in valid train_subset; do

View file

@ -0,0 +1,288 @@
#!/bin/bash
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
#
# This is modified from ../../nnet2/get_egs2.sh. [note: get_egs2.sh is as get_egs.sh,
# but uses the newer, more compact way of writing egs. where we write multiple
# frames of labels in order to share the context.]
# This script combines the
# nnet-example extraction with the feature extraction directly from wave files;
# it uses the program online2-wav-dump-feature to do all parts of feature
# extraction: MFCC/PLP/fbank, possibly plus pitch, plus iVectors. This script
# is intended mostly for cross-system training for online decoding, where you
# initialize the nnet from an existing, larger system.
#
# Begin configuration section.
cmd=run.pl
frames_per_eg=8 # number of frames of labels per example. more->less disk space and
# less time preparing egs, but more I/O during training.
# note: the script may reduce this if reduce_frames_per_eg is true.
reduce_frames_per_eg=true # If true, this script may reduce the frames_per_eg
# if there is only one archive and even with the
# reduced frames_per_eg, the number of
# samples_per_iter that would result is less than or
# equal to the user-specified value.
num_utts_subset=300 # number of utterances in validation and training
# subsets used for shrinkage and diagnostics.
num_valid_frames_combine=0 # #valid frames for combination weights at the very end.
num_train_frames_combine=10000 # # train frames for the above.
num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs
samples_per_iter=400000 # each iteration of training, see this many samples
# per job. This is just a guideline; it will pick a number
# that divides the number of samples in the entire data.
stage=0
io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time.
random_copy=false
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "Usage: $0 [opts] <data> <ali-dir> <online-nnet-dir> <egs-dir>"
echo " e.g.: $0 data/train exp/tri3_ali exp/nnet2_online/nnet_a_gpu_online/ exp/nnet2_online/nnet_b/egs"
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config file containing options"
echo " --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
echo " --samples-per-iter <#samples;400000> # Number of samples of data to process per iteration, per"
echo " # process."
echo " --feat-type <lda|raw> # (by default it tries to guess). The feature type you want"
echo " # to use as input to the neural net."
echo " --frames-per-eg <frames;8> # number of frames per eg on disk"
echo " --num-frames-diagnostic <#frames;4000> # Number of frames used in computing (train,valid) diagnostics"
echo " --num-valid-frames-combine <#frames;10000> # Number of frames used in getting combination weights at the"
echo " # very end."
echo " --stage <stage|0> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
exit 1;
fi
data=$1
alidir=$2
online_nnet_dir=$3
dir=$4
mdl=$online_nnet_dir/final.mdl # only needed for left and right context.
feature_conf=$online_nnet_dir/conf/online_nnet2_decoding.conf
for f in $data/wav.scp $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $mdl $feature_conf; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
sdata=$data/split$nj
utils/split_data.sh $data $nj
mkdir -p $dir/log $dir/info
! cmp $alidir/tree $online_nnet_dir/tree && \
echo "$0: warning, tree from alignment dir does not match tree from online-nnet dir"
cp $alidir/tree $dir
grep -v '^--endpoint' $feature_conf >$dir/feature.conf || exit 1;
mkdir -p $dir/valid $dir/train_subset
# Get list of validation utterances.
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \
> $dir/valid/uttlist || exit 1;
if [ -f $data/utt2uniq ]; then
echo "File $data/utt2uniq exists, so augmenting valid/uttlist to"
echo "include all perturbed versions of the same 'real' utterances."
mv $dir/valid/uttlist $dir/valid/uttlist.tmp
utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
cat $dir/valid/uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \
sort | uniq | utils/apply_map.pl $dir/uniq2utt | \
awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid/uttlist
rm $dir/uniq2utt $dir/valid/uttlist.tmp
fi
awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid/uttlist | \
utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset/uttlist || exit 1;
for subdir in valid train_subset; do
# In order for the iVector extraction to work right, we need to process all
# utterances of the speakers which have utterances in valid/uttlist, and the
# same for train_subset/uttlist. We produce $dir/valid/uttlist_extended which
# will contain all utterances of all speakers which have utterances in
# $dir/valid/uttlist, and the same for $dir/train_subset/.
utils/filter_scp.pl $dir/$subdir/uttlist <$data/utt2spk | awk '{print $2}' > $dir/$subdir/spklist || exit 1;
utils/filter_scp.pl -f 2 $dir/$subdir/spklist <$data/utt2spk >$dir/$subdir/utt2spk || exit 1;
utils/utt2spk_to_spk2utt.pl <$dir/$subdir/utt2spk >$dir/$subdir/spk2utt || exit 1;
awk '{print $1}' <$dir/$subdir/utt2spk >$dir/$subdir/uttlist_extended || exit 1;
rm $dir/$subdir/spklist
done
if [ -f $data/segments ]; then
# note: in the feature extraction, because the program online2-wav-dump-features is sensitive to the
# previous utterances within a speaker, we do the filtering after extracting the features.
echo "$0 [info]: segments file exists: using that."
feats="ark,s,cs:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt ark,s,cs:- ark:- | subset-feats --exclude=$dir/valid/uttlist ark:- ark:- |"
valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid/uttlist_extended $data/segments | extract-segments scp:$data/wav.scp - ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/valid/spk2utt ark,s,cs:- ark:- | subset-feats --include=$dir/valid/uttlist ark:- ark:- |"
train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset/uttlist_extended $data/segments | extract-segments scp:$data/wav.scp - ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/train_subset/spk2utt ark,s,cs:- ark:- | subset-feats --include=$dir/train_subset/uttlist ark:- ark:- |"
else
echo "$0 [info]: no segments file exists, using wav.scp."
feats="ark,s,cs:online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt scp:$sdata/JOB/wav.scp ark:- | subset-feats --exclude=$dir/valid/uttlist ark:- ark:- |"
valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid/uttlist_extended $data/wav.scp | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/valid/spk2utt scp:- ark:- | subset-feats --include=$dir/valid/uttlist ark:- ark:- |"
train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset/uttlist_extended $data/wav.scp | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/train_subset/spk2utt scp:- ark:- | subset-feats --include=$dir/train_subset/uttlist ark:- ark:- |"
fi
ivector_dim=$(online2-wav-dump-features --config=$dir/feature.conf --print-ivector-dim=true) || exit 1;
! [ $ivector_dim -ge 0 ] && echo "$0: error getting iVector dim" && exit 1;
set -o pipefail
left_context=$(nnet-am-info $mdl | grep '^left-context' | awk '{print $2}') || exit 1;
right_context=$(nnet-am-info $mdl | grep '^right-context' | awk '{print $2}') || exit 1;
set +o pipefail
if [ $stage -le 0 ]; then
echo "$0: working out number of frames of training data"
num_frames=$(steps/nnet2/get_num_frames.sh $data)
echo $num_frames > $dir/info/num_frames
else
num_frames=`cat $dir/info/num_frames` || exit 1;
fi
# the + 1 is to round up, not down... we assume it doesn't divide exactly.
num_archives=$[$num_frames/($frames_per_eg*$samples_per_iter)+1]
# (for small data)- while reduce_frames_per_eg == true and the number of
# archives is 1 and would still be 1 if we reduced frames_per_eg by 1, reduce it
# by 1.
reduced=false
while $reduce_frames_per_eg && [ $frames_per_eg -gt 1 ] && \
[ $[$num_frames/(($frames_per_eg-1)*$samples_per_iter)] -eq 0 ]; do
frames_per_eg=$[$frames_per_eg-1]
num_archives=1
reduced=true
done
$reduced && echo "$0: reduced frames_per_eg to $frames_per_eg because amount of data is small."
echo $num_archives >$dir/info/num_archives
echo $frames_per_eg >$dir/info/frames_per_eg
# Working out number of egs per archive
egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)]
! [ $egs_per_archive -le $samples_per_iter ] && \
echo "$0: script error: egs_per_archive=$egs_per_archive not <= samples_per_iter=$samples_per_iter" \
&& exit 1;
echo $egs_per_archive > $dir/info/egs_per_archive
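# Worked example with made-up numbers: for about 10 hours of data,
# num_frames=3600000; with frames_per_eg=8 and samples_per_iter=400000 we get
# num_archives=3600000/(8*400000)+1=2 and egs_per_archive=3600000/(8*2)=225000,
# which satisfies egs_per_archive <= samples_per_iter.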
echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with"
echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)"
# Making soft links to storage directories. This is a no-op unless
# the subdirectory $dir/storage/ exists. See utils/create_split_dir.pl
for x in `seq $num_archives`; do
utils/create_data_link.pl $dir/egs.$x.ark
for y in `seq $nj`; do
utils/create_data_link.pl $dir/egs_orig.$x.$y.ark
done
done
nnet_context_opts="--left-context=$left_context --right-context=$right_context"
if [ $stage -le 2 ]; then
echo "$0: Getting validation and training subset examples."
rm $dir/.error 2>/dev/null
echo "$0: ... extracting validation and training-subset alignments."
set -o pipefail;
for id in $(seq $nj); do gunzip -c $alidir/ali.$id.gz; done | \
copy-int-vector ark:- ark,t:- | \
utils/filter_scp.pl <(cat $dir/valid/uttlist $dir/train_subset/uttlist) | \
gzip -c >$dir/ali_special.gz || exit 1;
set +o pipefail; # unset the pipefail option.
$cmd $dir/log/create_valid_subset.log \
nnet-get-egs $ivectors_opt $nnet_context_opts "$valid_feats" \
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark:$dir/valid_all.egs" || touch $dir/.error &
$cmd $dir/log/create_train_subset.log \
nnet-get-egs $ivectors_opt $nnet_context_opts "$train_subset_feats" \
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark:$dir/train_subset_all.egs" || touch $dir/.error &
wait;
[ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1;
echo "... Getting subsets of validation examples for diagnostics and combination."
$cmd $dir/log/create_valid_subset_combine.log \
nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \
ark:$dir/valid_combine.egs || touch $dir/.error &
$cmd $dir/log/create_valid_subset_diagnostic.log \
nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/valid_all.egs \
ark:$dir/valid_diagnostic.egs || touch $dir/.error &
$cmd $dir/log/create_train_subset_combine.log \
nnet-subset-egs --n=$num_train_frames_combine ark:$dir/train_subset_all.egs \
ark:$dir/train_combine.egs || touch $dir/.error &
$cmd $dir/log/create_train_subset_diagnostic.log \
nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/train_subset_all.egs \
ark:$dir/train_diagnostic.egs || touch $dir/.error &
wait
sleep 5 # wait for file system to sync.
cat $dir/valid_combine.egs $dir/train_combine.egs > $dir/combine.egs
for f in $dir/{combine,train_diagnostic,valid_diagnostic}.egs; do
[ ! -s $f ] && echo "No examples in file $f" && exit 1;
done
rm $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_combine.egs $dir/ali_special.gz
fi
if [ $stage -le 3 ]; then
# create egs_orig.*.*.ark; the first index goes to $num_archives,
# the second to $nj (which is the number of jobs in the original alignment
# dir)
egs_list=
for n in $(seq $num_archives); do
egs_list="$egs_list ark:$dir/egs_orig.$n.JOB.ark"
done
echo "$0: Generating training examples on disk"
# The examples will go round-robin to egs_list.
$cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \
nnet-get-egs $ivectors_opt $nnet_context_opts --num-frames=$frames_per_eg "$feats" \
"ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \
nnet-copy-egs ark:- $egs_list || exit 1;
fi
if [ $stage -le 4 ]; then
echo "$0: recombining and shuffling order of archives on disk"
# combine all the "egs_orig.JOB.*.scp" (over the $nj splits of the data) and
# shuffle the order, writing to the egs.JOB.ark
egs_list=
for n in $(seq $nj); do
egs_list="$egs_list $dir/egs_orig.JOB.$n.ark"
done
$cmd $io_opts $extra_opts JOB=1:$num_archives $dir/log/shuffle.JOB.log \
nnet-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:$dir/egs.JOB.ark || exit 1;
fi
if [ $stage -le 5 ]; then
echo "$0: removing temporary archives"
for x in `seq $num_archives`; do
for y in `seq $nj`; do
file=$dir/egs_orig.$x.$y.ark
[ -L $file ] && rm $(readlink -f $file)
rm $file
done
done
fi
echo "$0: Finished preparing training examples"

View file

@ -0,0 +1,244 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# This script dumps examples for MPE, MMI or state-level minimum Bayes risk (sMBR)
# training of neural nets. Note: for "criterion", smbr > mpe > mmi in terms of
# compatibility of the dumped egs, meaning you can use the egs dumped with
# --criterion smbr for MPE or MMI, and egs dumped with --criterion mpe for MMI
# training. The discriminative training program itself doesn't enforce this and
# it would let you mix and match them arbitrarily; we are speaking in terms of
# the correctness of the algorithm that splits the lattices into pieces.
# Begin configuration section.
cmd=run.pl
criterion=smbr
drop_frames=false # option relevant for MMI, affects how we dump examples.
samples_per_iter=400000 # measured in frames, not in "examples"
max_temp_archives=128 # maximum number of temp archives per input job, only
# affects the process of generating archives, not the
# final result.
stage=0
iter=final
cleanup=true
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 6 ]; then
echo "Usage: $0 [opts] <data> <lang> <ali-dir> <denlat-dir> <src-online-nnet2-dir> <degs-dir>"
echo " e.g.: $0 data/train data/lang exp/nnet2_online/nnet_a_online{_ali,_denlats,_degs}"
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config file containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs (probably would be good to add -tc 5 or so if using"
echo " # GridEngine (to avoid excessive NFS traffic)."
echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per"
echo " # process."
echo " --stage <stage|-8> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
echo " --criterion <criterion|smbr> # Training criterion: may be smbr, mmi or mpfe"
echo " --online-ivector-dir <dir|""> # Directory for online-estimated iVectors, used in the"
echo " # online-neural-net setup. (but you may want to use"
echo " # steps/online/nnet2/get_egs_discriminative2.sh instead)"
exit 1;
fi
data=$1
lang=$2
alidir=$3
denlatdir=$4
srcdir=$5
dir=$6
# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/num_jobs $alidir/tree \
$denlatdir/lat.1.gz $denlatdir/num_jobs $srcdir/$iter.mdl $srcdir/conf/online_nnet2_decoding.conf; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
mkdir -p $dir/log $dir/info || exit 1;
nj=$(cat $denlatdir/num_jobs) || exit 1; # $nj is the number of
# splits of the denlats and alignments.
nj_ali=$(cat $alidir/num_jobs) || exit 1;
sdata=$data/split$nj
utils/split_data.sh $data $nj
if [ $nj_ali -eq $nj ]; then
ali_rspecifier="ark,s,cs:gunzip -c $alidir/ali.JOB.gz |"
else
ali_rspecifier="scp:$dir/ali.scp"
if [ $stage -le 1 ]; then
echo "$0: number of jobs in den-lats versus alignments differ: dumping them as single archive and index."
all_ids=$(seq -s, $nj_ali)
copy-int-vector --print-args=false \
"ark:gunzip -c $alidir/ali.{$all_ids}.gz|" ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1;
fi
fi
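# Illustration of the fallback above (hypothetical numbers): if the denlats were
# made with nj=30 but the alignments with nj_ali=40, the 40 ali.*.gz archives are
# copied into a single $dir/ali.ark with an scp index, and each of the 30 egs
# jobs then looks alignments up by utterance id via scp:$dir/ali.scp.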
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
cp $alidir/tree $dir
cp $lang/phones/silence.csl $dir/info || exit 1;
cp $srcdir/$iter.mdl $dir/final.mdl || exit 1;
grep -v '^--endpoint' $srcdir/conf/online_nnet2_decoding.conf >$dir/feature.conf || exit 1;
ivector_dim=$(online2-wav-dump-features --config=$dir/feature.conf --print-ivector-dim=true) || exit 1;
echo $ivector_dim > $dir/info/ivector_dim
! [ $ivector_dim -ge 0 ] && echo "$0: error getting iVector dim" && exit 1;
if [ -f $data/segments ]; then
# note: in the feature extraction, because the program online2-wav-dump-features is sensitive to the
# previous utterances within a speaker, we do the filtering after extracting the features.
echo "$0 [info]: segments file exists: using that."
feats="ark,s,cs:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt ark,s,cs:- ark:- |"
else
echo "$0 [info]: no segments file exists, using wav.scp."
feats="ark,s,cs:online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt scp:$sdata/JOB/wav.scp ark:- |"
fi
if [ $stage -le 2 ]; then
echo "$0: working out number of frames of training data"
num_frames=$(steps/nnet2/get_num_frames.sh $data)
echo $num_frames > $dir/info/num_frames
# Working out total number of archives. Add one on the assumption the
# num-frames won't divide exactly, and we want to round up.
num_archives=$[$num_frames/$samples_per_iter + 1]
# the next few lines relate to how we may temporarily split each input job
# into fewer than $num_archives pieces, to avoid using an excessive
# number of filehandles.
archive_ratio=$[$num_archives/$max_temp_archives+1]
num_archives_temp=$[$num_archives/$archive_ratio]
# change $num_archives slightly to make it an exact multiple
# of $archive_ratio.
num_archives=$[$num_archives_temp*$archive_ratio]
echo $num_archives >$dir/info/num_archives || exit 1
echo $num_archives_temp >$dir/info/num_archives_temp || exit 1
frames_per_archive=$[$num_frames/$num_archives]
# note, this is the number of frames per archive prior to discarding frames.
echo $frames_per_archive > $dir/info/frames_per_archive
else
num_archives=$(cat $dir/info/num_archives) || exit 1;
num_archives_temp=$(cat $dir/info/num_archives_temp) || exit 1;
frames_per_archive=$(cat $dir/info/frames_per_archive) || exit 1;
fi
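# Worked example with made-up numbers: for ~500 hours, num_frames=180000000 and
# samples_per_iter=400000 give num_archives=451; with max_temp_archives=128,
# archive_ratio=451/128+1=4, num_archives_temp=451/4=112, and num_archives is
# rounded to 112*4=448, so frames_per_archive=180000000/448~=401785.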
echo "$0: Splitting the data up into $num_archives archives (using $num_archives_temp temporary pieces per input job)"
echo "$0: giving samples-per-iteration of $frames_per_archive (you requested $samples_per_iter)."
# we create these data links regardless of the stage, as there are situations
# where we would want to recreate a data link that had previously been deleted.
if [ -d $dir/storage ]; then
echo "$0: creating data links for distributed storage of degs"
# See utils/create_split_dir.pl for how this 'storage' directory is created.
for x in $(seq $nj); do
for y in $(seq $num_archives_temp); do
utils/create_data_link.pl $dir/degs_orig.$x.$y.ark
done
done
for z in $(seq $num_archives); do
utils/create_data_link.pl $dir/degs.$z.ark
done
if [ $num_archives_temp -ne $num_archives ]; then
for z in $(seq $num_archives); do
utils/create_data_link.pl $dir/degs_temp.$z.ark
done
fi
fi
if [ $stage -le 3 ]; then
echo "$0: getting initial training examples by splitting lattices"
degs_list=$(for n in $(seq $num_archives_temp); do echo ark:$dir/degs_orig.JOB.$n.ark; done)
$cmd JOB=1:$nj $dir/log/get_egs.JOB.log \
nnet-get-egs-discriminative --criterion=$criterion --drop-frames=$drop_frames \
"$srcdir/$iter.mdl" "$feats" "$ali_rspecifier" "ark,s,cs:gunzip -c $denlatdir/lat.JOB.gz|" ark:- \| \
nnet-copy-egs-discriminative $const_dim_opt ark:- $degs_list || exit 1;
sleep 5; # wait a bit so NFS has time to write files.
fi
if [ $stage -le 4 ]; then
degs_list=$(for n in $(seq $nj); do echo $dir/degs_orig.$n.JOB.ark; done)
if [ $num_archives -eq $num_archives_temp ]; then
echo "$0: combining data into final archives and shuffling it"
$cmd JOB=1:$num_archives $dir/log/shuffle.JOB.log \
cat $degs_list \| nnet-shuffle-egs-discriminative --srand=JOB ark:- \
ark:$dir/degs.JOB.ark || exit 1;
else
echo "$0: combining and re-splitting data into un-shuffled versions of final archives."
archive_ratio=$[$num_archives/$num_archives_temp]
! [ $archive_ratio -gt 1 ] && echo "$0: Bad archive_ratio $archive_ratio" && exit 1;
# note: the \$[ .. ] won't be evaluated until the job gets executed. The
# aim is to write to the archives with the final numbering, 1
# ... num_archives, which is more than num_archives_temp. The list with
# \$[... ] expressions in it computes the set of final indexes for each
# temporary index.
degs_list_out=$(for n in $(seq $archive_ratio); do echo "ark:$dir/degs_temp.\$[((JOB-1)*$archive_ratio)+$n].ark"; done)
# e.g. if dir=foo and archive_ratio=2, we'd have
# degs_list_out='foo/degs_temp.$[((JOB-1)*2)+1].ark foo/degs_temp.$[((JOB-1)*2)+2].ark'
$cmd JOB=1:$num_archives_temp $dir/log/resplit.JOB.log \
cat $degs_list \| nnet-copy-egs-discriminative --srand=JOB ark:- \
$degs_list_out || exit 1;
fi
fi
if [ $stage -le 5 ] && [ $num_archives -ne $num_archives_temp ]; then
echo "$0: shuffling final archives."
$cmd JOB=1:$num_archives $dir/log/shuffle.JOB.log \
nnet-shuffle-egs-discriminative --srand=JOB ark:$dir/degs_temp.JOB.ark \
ark:$dir/degs.JOB.ark || exit 1
fi
if $cleanup; then
echo "$0: removing temporary archives."
for x in $(seq $nj); do
for y in $(seq $num_archives_temp); do
file=$dir/degs_orig.$x.$y.ark
[ -L $file ] && rm $(readlink -f $file); rm $file
done
done
if [ $num_archives_temp -ne $num_archives ]; then
for z in $(seq $num_archives); do
file=$dir/degs_temp.$z.ark
[ -L $file ] && rm $(readlink -f $file); rm $file
done
fi
fi
echo "$0: Done."

View file

@ -0,0 +1,168 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# Create denominator lattices for MMI/MPE training.
# This version uses the online-nnet2 features.
#
# Creates its output in $dir/lat.*.gz
# Begin configuration section.
stage=0
nj=4
cmd=run.pl
sub_split=1
beam=13.0
lattice_beam=7.0
acwt=0.1
max_active=5000
max_mem=20000000 # This will stop the processes getting too large.
# This is in bytes, but not "real" bytes-- you have to multiply
# by something like 5 or 10 to get real bytes (not sure why so large)
num_threads=1
parallel_opts=
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "Usage: steps/make_denlats.sh [options] <data-dir> <lang-dir> <src-dir> <exp-dir>"
echo " e.g.: steps/make_denlats.sh data/train data/lang exp/nnet2_online/nnet_a_online exp/nnet2_online/nnet_a_denlats"
echo "Works for (delta|lda) features, and (with --transform-dir option) such features"
echo " plus transforms."
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --sub-split <n-split> # e.g. 40; use this for "
echo " # large databases so your jobs will be smaller and"
echo " # will (individually) finish reasonably soon."
echo " --num-threads <n> # number of threads per decoding job"
echo " --parallel-opts <string> # if >1 thread, add this to 'cmd', e.g. -pe smp 6"
exit 1;
fi
data=$1
lang=$2
srcdir=$3
dir=$4
for f in $data/wav.scp $lang/L.fst $srcdir/final.mdl $srcdir/conf/online_nnet2_decoding.conf; do
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done
sdata=$data/split$nj
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
mkdir -p $dir/log
split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
oov=`cat $lang/oov.int` || exit 1;
# Compute grammar FST which corresponds to unigram decoding graph.
new_lang="$dir/"$(basename "$lang")
grep -v '^--endpoint' $srcdir/conf/online_nnet2_decoding.conf >$dir/feature.conf || exit 1;
if [ $stage -le 0 ]; then
# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and
# final.mdl from $srcdir; the output HCLG.fst goes in $dir/graph.
cp -rH $lang $dir/
echo "Compiling decoding graph in $dir/dengraph"
if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then
echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
else
echo "Making unigram grammar FST in $new_lang"
cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \
|| exit 1;
utils/mkgraph.sh $new_lang $srcdir $dir/dengraph || exit 1;
fi
fi
if [ -f $data/segments ]; then
# note: in the feature extraction, because the program online2-wav-dump-features is sensitive to the
# previous utterances within a speaker, we do the filtering after extracting the features.
echo "$0 [info]: segments file exists: using that."
feats="ark,s,cs:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt ark,s,cs:- ark:- |"
else
echo "$0 [info]: no segments file exists, using wav.scp."
feats="ark,s,cs:online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt scp:$sdata/JOB/wav.scp ark:- |"
fi
# if this job is interrupted by the user, we want any background jobs to be
# killed too.
cleanup() {
local pids=$(jobs -pr)
[ -n "$pids" ] && kill $pids
}
trap "cleanup" INT QUIT TERM EXIT
if [ $sub_split -eq 1 ]; then
$cmd $parallel_opts JOB=1:$nj $dir/log/decode_den.JOB.log \
nnet-latgen-faster$thread_string --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
--max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \
$dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
else
# each job from 1 to $nj is split into multiple pieces (sub-split), and we aim
# to have at most two jobs running at any given time. The idea is that if one job
# has stragglers, we can already be processing the next one.
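# Illustration (hypothetical numbers): with nj=30 and --sub-split 40, subset 1 is
# decoded as 40 small jobs; while subset 2 is decoding, the pieces lat.1.*.gz are
# merged into lat.1.gz and .done.1 is written. The loop runs to nj+1 so that the
# last subset also gets merged.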
rm $dir/.error 2>/dev/null
prev_pid=
for n in `seq $[nj+1]`; do
if [ $n -gt $nj ]; then
this_pid=
elif [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then
echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
this_pid=
else
sdata2=$data/split$nj/$n/split$sub_split;
if [ ! -d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then
split_data.sh --per-utt $sdata/$n $sub_split || exit 1;
fi
mkdir -p $dir/log/$n
mkdir -p $dir/part
feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g`
$cmd $parallel_opts JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
nnet-latgen-faster$thread_string --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
--max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \
$dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || touch $dir/.error &
this_pid=$!
fi
if [ ! -z "$prev_pid" ]; then # Wait for the previous job; merge the previous set of lattices.
wait $prev_pid
[ -f $dir/.error ] && echo "$0: error generating denominator lattices" && exit 1;
rm $dir/.merge_error 2>/dev/null
echo Merging archives for data subset $prev_n
for k in `seq $sub_split`; do
gunzip -c $dir/lat.$prev_n.$k.gz || touch $dir/.merge_error;
done | gzip -c > $dir/lat.$prev_n.gz || touch $dir/.merge_error;
[ -f $dir/.merge_error ] && echo "$0: Merging lattices for subset $prev_n failed (or maybe some other error)" && exit 1;
rm $dir/lat.$prev_n.*.gz
touch $dir/.done.$prev_n
fi
prev_n=$n
prev_pid=$this_pid
done
fi
echo "$0: done generating denominator lattices."

View file

@ -27,26 +27,39 @@ echo "$0 $@" # Print the command line for logging
[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: $0 [options] <orig-nnet-online-dir> <new-nnet-dir> <new-nnet-online-dir>"
echo "e.g.: $0 data/lang exp/nnet2_online/extractor exp/nnet2_online/nnet exp/nnet2_online/nnet_online"
echo "main options (for others, see top of script file)"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --config <config-file> # config containing options"
echo " --stage <stage> # stage to do partial re-run from."
exit 1;
if [ $# -ne 3 ] && [ $# -ne 4 ]; then
echo "Usage: $0 [options] <orig-nnet-online-dir> [<new-lang-dir>] <new-nnet-dir> <new-nnet-online-dir>"
echo "e.g.: $0 exp_other/nnet2_online/nnet_a_online data/lang exp/nnet2_online/nnet_a exp/nnet2_online/nnet_a_online"
echo "main options (for others, see top of script file)"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --config <config-file> # config containing options"
echo " --stage <stage> # stage to do partial re-run from."
exit 1;
fi
online_src=$1
nnet_src=$2
dir=$3
if [ $# -eq 3 ]; then
echo "$0: warning: it's better if you add the new <lang> directory as the 2nd argument."
for f in $online_src/conf/online_nnet2_decoding.conf $nnet_src/final.mdl $nnet_src/tree; do
online_src=$1
lang=
nnet_src=$2
dir=$3
else
online_src=$1
lang=$2
nnet_src=$3
dir=$4
extra_files=$lang/words.txt
fi
for f in $online_src/conf/online_nnet2_decoding.conf $nnet_src/final.mdl $nnet_src/tree $extra_files; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
origdir=$dir
dir_as_given=$dir
dir=$(readlink -f $dir) # Convert $dir to an absolute pathname, so that the
# configuration files we write will contain absolute
# pathnames.
@ -89,8 +102,16 @@ $cmd $dir/log/append_nnet.log \
nnet-insert --randomize-next-component=false --insert-at=0 \
$nnet_src/final.mdl $dir/first_nnet.raw $dir/final.mdl || exit 1;
cp $nnet_src/tree $dir/ || exit 1;
$cleanup && rm $dir/first_nnet.raw
echo "$0: formatted neural net for online decoding in $origdir"
if [ ! -z "$lang" ]; then
# if the $lang option was provided, modify the silence-phones in the config;
# these are only used for the endpointing code, but we should get this right.
cp $dir/conf/online_nnet2_decoding.conf{,.tmp}
silphones=$(cat $lang/phones/silence.csl) || exit 1;
cat $dir/conf/online_nnet2_decoding.conf.tmp | \
sed s/silence-phones=.\\+/silence-phones=$silphones/ > $dir/conf/online_nnet2_decoding.conf
rm $dir/conf/online_nnet2_decoding.conf.tmp
fi
echo "$0: formatted neural net for online decoding in $dir_as_given"

View file

@ -0,0 +1,76 @@
#!/bin/bash
# Copyright 2014 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# This is as prepare_online_decoding.sh, but for transfer learning -- the case where
# you have an existing online-decoding directory with all the feature and iVector
# configuration that you don't want to change, but a newly trained model (and
# possibly a new tree/lang) that you want to combine with it.
# Begin configuration.
stage=0 # This allows restarting partway through, when something went wrong.
cmd=run.pl
iter=final
# End configuration.
echo "$0 $@" # Print the command line for logging
[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;
if [ $# -ne 4 ]; then
echo "Usage: $0 [options] <orig-nnet-online-dir> <new-lang-dir> <new-nnet-dir> <new-nnet-online-dir>"
echo "e.g.: $0 exp_other/nnet2_online/nnet_a_online data/lang exp/nnet2_online/nnet_a exp/nnet2_online/nnet_a_online"
echo "main options (for others, see top of script file)"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --config <config-file> # config containing options"
echo " --stage <stage> # stage to do partial re-run from."
exit 1;
fi
online_src=$1
lang=$2
nnet_src=$3
dir=$4
for f in $online_src/conf/online_nnet2_decoding.conf $nnet_src/final.mdl $nnet_src/tree $lang/words.txt; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
dir_as_given=$dir
dir=$(readlink -f $dir) # Convert $dir to an absolute pathname, so that the
# configuration files we write will contain absolute
# pathnames.
mkdir -p $dir/conf $dir/log
cp $nnet_src/tree $dir/ || exit 1;
cp $nnet_src/$iter.mdl $dir/ || exit 1;
# There are a bunch of files that we will need to copy from $online_src, because
# we're aiming to have one self-contained directory that has everything in it.
cp -rT $online_src/ivector_extractor/ $dir/ivector_extractor
[ ! -d $online_src/conf ] && \
echo "Expected directory $online_src/conf to exist" && exit 1;
for x in $online_src/conf/*conf; do
# Replace directory names starting with $online_src with ones starting with $dir.
# We actually replace any directory names ending in /ivector_extractor/ or /conf/
# with $dir/ivector_extractor/ or $dir/conf/
cat $x | perl -ape "s:=(.+)/(ivector_extractor|conf)/:=$dir/\$2/:;" > $dir/conf/$(basename $x)
done
# modify the silence-phones in the config; these are only used for the
# endpointing code.
cp $dir/conf/online_nnet2_decoding.conf{,.tmp}
silphones=$(cat $lang/phones/silence.csl) || exit 1;
cat $dir/conf/online_nnet2_decoding.conf.tmp | \
sed s/silence-phones=.\\+/silence-phones=$silphones/ > $dir/conf/online_nnet2_decoding.conf
rm $dir/conf/online_nnet2_decoding.conf.tmp
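# Illustration of the substitution above (hypothetical values): if silence.csl
# contains "1:2:3", then
#   echo '--endpoint.silence-phones=1:2:15' | sed s/silence-phones=.\\+/silence-phones=1:2:3/
# prints '--endpoint.silence-phones=1:2:3'.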
echo "$0: formatted neural net for online decoding in $dir_as_given"

View file

@ -9,7 +9,7 @@
if (@ARGV > 0 && $ARGV[0] eq "-f") {
shift @ARGV;
shift @ARGV;
$field_spec = shift @ARGV;
if ($field_spec =~ m/^\d+$/) {
$field_begin = $field_spec - 1; $field_end = $field_spec - 1;
@ -36,6 +36,7 @@ if (@ARGV > 0 && $ARGV[0] eq '--permissive') {
}
if(@ARGV != 1) {
print STDERR "Invalid usage: " . join(" ", @ARGV) . "\n";
print STDERR "Usage: apply_map.pl [options] map <input >output\n" .
"options: [-f <field-range> ]\n" .
"Applies the map 'map' to all input text, where each line of the map\n" .

View file

@ -196,7 +196,8 @@ cat $srcdir/nonsilence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
cp $srcdir/optional_silence.txt $dir/phones/optional_silence.txt
cp $dir/phones/silence.txt $dir/phones/context_indep.txt
cat $srcdir/extra_questions.txt | utils/apply_map.pl $tmpdir/phone_map.txt \
# if extra_questions.txt is empty or missing, that's OK.
cat $srcdir/extra_questions.txt 2>/dev/null | utils/apply_map.pl $tmpdir/phone_map.txt \
>$dir/phones/extra_questions.txt
# Want extra questions about the word-start/word-end stuff. Make it separate for

View file

@ -42,14 +42,14 @@ utt2spks=""
texts=""
nu=`cat $data/utt2spk | wc -l`
nf=`cat $data/feats.scp | wc -l`
nf=`cat $data/feats.scp 2>/dev/null | wc -l`
nt=`cat $data/text 2>/dev/null | wc -l` # take it as zero if no such file
if [ $nu -ne $nf ]; then
if [ -f $data/feats.scp ] && [ $nu -ne $nf ]; then
echo "split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu,$nf); this script "
echo " may produce incorrectly split data."
echo "use utils/fix_data_dir.sh to fix this."
fi
if [ $nt -ne 0 -a $nu -ne $nt ]; then
if [ -f $data/text ] && [ $nu -ne $nt ]; then
echo "split_data.sh: warning, #lines is (utt2spk,text) is ($nu,$nt); this script "
echo " may produce incorrectly split data."
echo "use utils/fix_data_dir.sh to fix this."
@ -89,7 +89,7 @@ fi
utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks || exit 1
utils/split_scp.pl $utt2spk_opt $data/feats.scp $feats || exit 1
[ -f $data/feats.scp ] && utils/split_scp.pl $utt2spk_opt $data/feats.scp $feats
[ -f $data/text ] && utils/split_scp.pl $utt2spk_opt $data/text $texts
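# (Sketch, hypothetical directory:) with feats.scp and text now optional, a data
# directory holding only wav.scp, utt2spk and spk2utt -- e.g. one prepared purely
# for online decoding -- can still be split, as in:
#   utils/split_data.sh data/test_wavonly 8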

View file

@ -28,15 +28,16 @@
$quiet = 0;
$first = 0;
$last = 0;
if ($ARGV[0] eq "--quiet") {
if (@ARGV > 0 && $ARGV[0] eq "--quiet") {
shift;
$quiet = 1;
}
if ($ARGV[0] eq "--first") {
if (@ARGV > 0 && $ARGV[0] eq "--first") {
shift;
$first = 1;
}
if ($ARGV[0] eq "--last") {
if (@ARGV > 0 && $ARGV[0] eq "--last") {
shift;
$last = 1;
}
@ -44,7 +45,8 @@ if ($ARGV[0] eq "--last") {
if(@ARGV < 2 ) {
die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" .
" --quiet causes it to not die if N < num lines in scp.\n" .
" --first and --last make it equivalent to head or tail.\n";
" --first and --last make it equivalent to head or tail.\n" .
"See also: filter_scp.pl\n";
}
$N = shift @ARGV;

View file

@ -73,6 +73,11 @@ void AmNnet::Init(const Nnet &nnet) {
}
}
void AmNnet::ResizeOutputLayer(int32 new_num_pdfs) {
nnet_.ResizeOutputLayer(new_num_pdfs);
priors_.Resize(new_num_pdfs);
priors_.Set(1.0 / new_num_pdfs);
}
} // namespace nnet2
} // namespace kaldi

View file

@ -68,6 +68,10 @@ class AmNnet {
std::string Info() const;
/// This function is used when doing transfer learning to a new system.
/// It will set the priors to be all the same.
void ResizeOutputLayer(int32 new_num_pdfs);
private:
const AmNnet &operator = (const AmNnet &other); // Disallow.
Nnet nnet_;

View file

@ -1021,6 +1021,13 @@ void AffineComponent::Scale(BaseFloat scale) {
bias_params_.Scale(scale);
}
// virtual
void AffineComponent::Resize(int32 input_dim, int32 output_dim) {
KALDI_ASSERT(input_dim > 0 && output_dim > 0);
bias_params_.Resize(output_dim);
linear_params_.Resize(output_dim, input_dim);
}
void AffineComponent::Add(BaseFloat alpha, const UpdatableComponent &other_in) {
const AffineComponent *other =
dynamic_cast<const AffineComponent*>(&other_in);
@ -1590,6 +1597,22 @@ void AffineComponentPreconditioned::Update(
in_value_precon_part, kNoTrans, 1.0);
}
// virtual
void AffineComponentPreconditionedOnline::Resize(
int32 input_dim, int32 output_dim) {
KALDI_ASSERT(input_dim > 1 && output_dim > 1);
if (rank_in_ >= input_dim) rank_in_ = input_dim - 1;
if (rank_out_ >= output_dim) rank_out_ = output_dim - 1;
bias_params_.Resize(output_dim);
linear_params_.Resize(output_dim, input_dim);
OnlinePreconditioner temp;
preconditioner_in_ = temp;
preconditioner_out_ = temp;
SetPreconditionerConfigs();
}
void AffineComponentPreconditionedOnline::Read(std::istream &is, bool binary) {
std::ostringstream ostr_beg, ostr_end;
ostr_beg << "<" << Type() << ">";

View file

@ -740,6 +740,10 @@ class AffineComponent: public UpdatableComponent {
void Init(BaseFloat learning_rate,
std::string matrix_filename);
// This function resizes the dimensions of the component, setting the
// parameters to zero, while leaving any other configuration values the same.
virtual void Resize(int32 input_dim, int32 output_dim);
// The following functions are used for collapsing multiple layers
// together. They return a pointer to a new Component equivalent to
// the sequence of two components. We haven't implemented this for
@ -895,6 +899,8 @@ class AffineComponentPreconditionedOnline: public AffineComponent {
BaseFloat alpha, BaseFloat max_change_per_sample,
std::string matrix_filename);
virtual void Resize(int32 input_dim, int32 output_dim);
// This constructor is used when converting neural networks partway through
// training, from AffineComponent or AffineComponentPreconditioned to
// AffineComponentPreconditionedOnline.

View file

@ -339,68 +339,34 @@ void Nnet::SetLearningRates(BaseFloat learning_rate) {
KALDI_LOG << "Set learning rates to " << learning_rate;
}
void Nnet::AdjustLearningRates(
const VectorBase<BaseFloat> &old_model_old_gradient,
const VectorBase<BaseFloat> &new_model_old_gradient,
const VectorBase<BaseFloat> &old_model_new_gradient,
const VectorBase<BaseFloat> &new_model_new_gradient,
BaseFloat measure_at, // where to measure gradient,
// on line between old and new model;
// 0.5 < measure_at <= 1.0.
BaseFloat ratio, // e.g. 1.1; ratio by which we change learning rate.
BaseFloat max_learning_rate) {
std::vector<BaseFloat> new_lrates;
KALDI_ASSERT(old_model_old_gradient.Dim() == NumUpdatableComponents() &&
new_model_old_gradient.Dim() == NumUpdatableComponents() &&
old_model_new_gradient.Dim() == NumUpdatableComponents() &&
new_model_new_gradient.Dim() == NumUpdatableComponents());
KALDI_ASSERT(ratio >= 1.0);
KALDI_ASSERT(measure_at > 0.5 && measure_at <= 1.0);
std::string changes_str;
std::string dotprod_str;
BaseFloat inv_ratio = 1.0 / ratio;
int32 index = 0;
for (int32 c = 0; c < NumComponents(); c++) {
UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(components_[c]);
if (uc == NULL) { // Non-updatable component.
KALDI_ASSERT(old_model_old_gradient(c) == 0.0);
continue;
} else {
BaseFloat grad_dotprod_at_end =
new_model_new_gradient(index) - old_model_new_gradient(index),
grad_dotprod_at_start =
new_model_old_gradient(index) - old_model_old_gradient(index),
grad_dotprod_interp =
measure_at * grad_dotprod_at_end +
(1.0 - measure_at) * grad_dotprod_at_start;
// grad_dotprod_interp will be positive if we
// want more of the gradient term
// -> faster learning rate for this component
BaseFloat lrate = uc->LearningRate();
lrate *= (grad_dotprod_interp > 0 ? ratio : inv_ratio);
changes_str = changes_str +
(grad_dotprod_interp > 0 ? " increase" : " decrease");
dotprod_str = dotprod_str +
(new_model_new_gradient(index) > 0 ? " positive" : " negative");
if (lrate > max_learning_rate) lrate = max_learning_rate;
new_lrates.push_back(lrate);
uc->SetLearningRate(lrate);
index++;
}
void Nnet::ResizeOutputLayer(int32 new_num_pdfs) {
KALDI_ASSERT(new_num_pdfs > 0);
KALDI_ASSERT(NumComponents() > 2);
int32 nc = NumComponents();
SoftmaxComponent *sc;
if ((sc = dynamic_cast<SoftmaxComponent*>(components_[nc - 1])) == NULL)
KALDI_ERR << "Expected last component to be SoftmaxComponent.";
SumGroupComponent *sgc = dynamic_cast<SumGroupComponent*>(components_[nc - 2]);
if (sgc != NULL) {
// Remove it. We'll resize things later.
delete sgc;
components_.erase(components_.begin() + nc - 2,
components_.begin() + nc - 1);
nc--;
}
KALDI_ASSERT(index == NumUpdatableComponents());
KALDI_VLOG(1) << "Changes to learning rates: " << changes_str;
KALDI_VLOG(1) << "Dot product of model with validation gradient is "
<< dotprod_str;
std::ostringstream lrate_str;
for (size_t i = 0; i < new_lrates.size(); i++)
lrate_str << new_lrates[i] << ' ';
KALDI_VLOG(1) << "Learning rates are " << lrate_str.str();
}
// note: it could be a child class of AffineComponent.
AffineComponent *ac = dynamic_cast<AffineComponent*>(components_[nc - 2]);
if (ac == NULL)
KALDI_ERR << "Network doesn't have expected structure (didn't find final "
<< "AffineComponent).";
ac->Resize(ac->InputDim(), new_num_pdfs);
// Remove the softmax component, and replace it with a new one
delete components_[nc - 1];
components_[nc - 1] = new SoftmaxComponent(new_num_pdfs);
this->Check();
}
int32 Nnet::NumUpdatableComponents() const {
int32 ans = 0;

View file

@ -234,21 +234,11 @@ class Nnet {
// with things of type NonlinearComponent.
/// [This function is only used in the binary nnet-train.cc which is currently not
/// being used]. This is used to separately adjust learning rates of each layer,
/// after each "phase" of training. We basically ask (using the validation
/// gradient), do we wish we had gone further in this direction? Yes->
/// increase learning rate, no -> decrease it. The inputs have dimension
/// NumUpdatableComponents().
void AdjustLearningRates(
const VectorBase<BaseFloat> &old_model_old_gradient,
const VectorBase<BaseFloat> &new_model_old_gradient,
const VectorBase<BaseFloat> &old_model_new_gradient,
const VectorBase<BaseFloat> &new_model_new_gradient,
BaseFloat measure_at, // where to measure gradient, on line between old
// and new model; 0.5 < measure_at <= 1.0.
BaseFloat learning_rate_ratio,
BaseFloat max_learning_rate);
/// This function is used when doing transfer learning to a new system. It
/// resizes the final affine and softmax components. If your system has a
/// SumGroupComponent before the final softmax, it will be discarded.
void ResizeOutputLayer(int32 new_num_pdfs);
/// Scale all the learning rates in the neural net by this factor.
void ScaleLearningRates(BaseFloat factor);

View file

@ -27,7 +27,7 @@ BINFILES = nnet-am-info nnet-init \
nnet-perturb-egs-fmllr nnet-get-weighted-egs nnet-adjust-priors \
cuda-compiled nnet-replace-last-layers nnet-am-switch-preconditioning \
nnet-train-simple-perturbed nnet-train-parallel-perturbed \
nnet1-to-raw-nnet raw-nnet-copy nnet-relabel-egs
nnet1-to-raw-nnet raw-nnet-copy nnet-relabel-egs nnet-am-reinitialize
OBJFILES =

View file

@ -23,6 +23,36 @@
#include "nnet2/combine-nnet-a.h"
#include "nnet2/am-nnet.h"
namespace kaldi {
void GetWeights(const std::string &weights_str,
int32 num_inputs,
vector<BaseFloat> *weights) {
KALDI_ASSERT(num_inputs >= 1);
if (!weights_str.empty()) {
SplitStringToFloats(weights_str, ":", true, weights);
if (weights->size() != num_inputs) {
KALDI_ERR << "--weights option must be a colon-separated list "
<< "with " << num_inputs << " elements, got: "
<< weights_str;
}
} else {
for (int32 i = 0; i < num_inputs; i++)
weights->push_back(1.0 / num_inputs);
}
// normalize the weights to sum to one.
float weight_sum = 0.0;
for (int32 i = 0; i < num_inputs; i++)
weight_sum += (*weights)[i];
for (int32 i = 0; i < num_inputs; i++)
(*weights)[i] = (*weights)[i] / weight_sum;
if (fabs(weight_sum - 1.0) > 0.01) {
KALDI_WARN << "Normalizing weights to sum to one, sum was " << weight_sum;
}
}
}
int main(int argc, char *argv[]) {
try {
@ -32,60 +62,109 @@ int main(int argc, char *argv[]) {
typedef kaldi::int64 int64;
const char *usage =
"This program average (or sums, if --sum=true) the parameters over a number of neural nets.\n"
"This program averages (or sums, if --sum=true) the parameters over a\n"
"number of neural nets. If you supply the option --skip-last-layer=true,\n"
"the parameters of the last updatable layer are copied from <model1> instead\n"
"of being averaged (useful in multi-language scenarios).\n"
"The --weights option can be used to weight each model differently.\n"
"\n"
"Usage: nnet-am-average [options] <model1> <model2> ... <modelN> <model-out>\n"
"\n"
"e.g.:\n"
" nnet-am-average 1.1.nnet 1.2.nnet 1.3.nnet 2.nnet\n";
bool binary_write = true;
bool sum = false;
ParseOptions po(usage);
po.Register("sum", &sum, "If true, sums instead of averages.");
po.Register("binary", &binary_write, "Write output in binary mode");
string weights_str;
bool skip_last_layer = false;
po.Register("weights", &weights_str, "Colon-separated list of weights, one "
"for each input model. These will be normalized to sum to one.");
po.Register("skip-last-layer", &skip_last_layer, "If true, averaging of "
"the last updatable layer is skipped (result comes from model1)");
po.Read(argc, argv);
if (po.NumArgs() < 2) {
po.PrintUsage();
exit(1);
}
std::string
nnet1_rxfilename = po.GetArg(1),
nnet_wxfilename = po.GetArg(po.NumArgs());
TransitionModel trans_model;
TransitionModel trans_model1;
AmNnet am_nnet1;
{
bool binary_read;
Input ki(nnet1_rxfilename, &binary_read);
trans_model.Read(ki.Stream(), binary_read);
trans_model1.Read(ki.Stream(), binary_read);
am_nnet1.Read(ki.Stream(), binary_read);
}
int32 num_inputs = po.NumArgs() - 1;
BaseFloat scale = (sum ? 1.0 : 1.0 / num_inputs);
am_nnet1.GetNnet().Scale(scale);
vector<BaseFloat> model_weights;
GetWeights(weights_str, num_inputs, &model_weights);
int32 c_begin = 0,
c_end = (skip_last_layer ?
am_nnet1.GetNnet().LastUpdatableComponent() :
am_nnet1.GetNnet().NumComponents());
KALDI_ASSERT(c_end != -1 && "Network has no updatable components.");
// scale the components - except the last layer, if skip_last_layer == true.
for (int32 c = c_begin; c < c_end; c++) {
UpdatableComponent *uc =
dynamic_cast<UpdatableComponent*>(&(am_nnet1.GetNnet().GetComponent(c)));
if (uc != NULL) uc->Scale(model_weights[0]);
NonlinearComponent *nc =
dynamic_cast<NonlinearComponent*>(&(am_nnet1.GetNnet().GetComponent(c)));
if (nc != NULL)
nc->Scale(model_weights[0]);
}
for (int32 i = 2; i <= num_inputs; i++) {
bool binary_read;
Input ki(po.GetArg(i), &binary_read);
TransitionModel trans_model;
trans_model.Read(ki.Stream(), binary_read);
AmNnet am_nnet;
am_nnet.Read(ki.Stream(), binary_read);
am_nnet1.GetNnet().AddNnet(scale, am_nnet.GetNnet());
for (int32 c = c_begin; c < c_end; c++) {
UpdatableComponent *uc_average =
dynamic_cast<UpdatableComponent*>(&(am_nnet1.GetNnet().GetComponent(c)));
const UpdatableComponent *uc_this =
dynamic_cast<const UpdatableComponent*>(&(am_nnet.GetNnet().GetComponent(c)));
if (uc_average != NULL) {
KALDI_ASSERT(uc_this != NULL &&
"Networks must have the same structure.");
uc_average->Add(model_weights[i-1], *uc_this);
}
NonlinearComponent *nc_average =
dynamic_cast<NonlinearComponent*>(&(am_nnet1.GetNnet().GetComponent(c)));
const NonlinearComponent *nc_this =
dynamic_cast<const NonlinearComponent*>(&(am_nnet.GetNnet().GetComponent(c)));
if (nc_average != NULL) {
KALDI_ASSERT(nc_this != NULL &&
"Networks must have the same structure.");
nc_average->Add(model_weights[i-1], *nc_this);
}
}
}
{
Output ko(nnet_wxfilename, binary_write);
trans_model.Write(ko.Stream(), binary_write);
trans_model1.Write(ko.Stream(), binary_write);
am_nnet1.Write(ko.Stream(), binary_write);
}
KALDI_LOG << "Averaged parameters of " << num_inputs
<< " neural nets, and wrote to " << nnet_wxfilename;
return 0; // it will throw an exception if there are any problems.
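// Example invocation (a sketch; model names hypothetical): average three models
// with explicit weights, keeping the last updatable layer from the first model:
//   nnet-am-average --weights=0.5:0.3:0.2 --skip-last-layer=true \
//     lang1/final.mdl lang2/final.mdl lang3/final.mdl avg.mdl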

View file

@ -36,6 +36,14 @@ int main(int argc, char *argv[]) {
" nnet-am-info 1.nnet\n";
ParseOptions po(usage);
bool print_learning_rates = false;
po.Register("print-learning-rates", &print_learning_rates,
"If true, instead of printing the normal info, print a "
"colon-separated list of the learning rates for each updatable "
"layer, suitable to give to nnet-am-copy as the argument to"
"--learning-rates.");
po.Read(argc, argv);
@ -55,9 +63,19 @@ int main(int argc, char *argv[]) {
am_nnet.Read(ki.Stream(), binary_read);
}
std::cout << am_nnet.Info();
if (print_learning_rates) {
Vector<BaseFloat> learning_rates(am_nnet.GetNnet().NumUpdatableComponents());
am_nnet.GetNnet().GetLearningRates(&learning_rates);
int32 nc = learning_rates.Dim();
for (int32 i = 0; i < nc; i++)
std::cout << learning_rates(i) << (i < nc - 1 ? ":" : "");
std::cout << std::endl;
KALDI_LOG << "Printed learning-rate info for " << nnet_rxfilename;
} else {
std::cout << am_nnet.Info();
KALDI_LOG << "Printed info about " << nnet_rxfilename;
}
KALDI_LOG << "Printed info about " << nnet_rxfilename;
} catch(const std::exception &e) {
std::cerr << e.what() << '\n';
return -1;
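// Example (a sketch; paths hypothetical): copy one model's learning rates onto another:
//   lrates=$(nnet-am-info --print-learning-rates=true exp/nnet2/10.mdl)
//   nnet-am-copy --learning-rates=$lrates exp/nnet2/src.mdl exp/nnet2/src_newlr.mdl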

View file

@ -0,0 +1,88 @@
// nnet2bin/nnet-am-reinitialize.cc
// Copyright 2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "nnet2/am-nnet.h"
#include "hmm/transition-model.h"
#include "tree/context-dep.h"
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
using namespace kaldi::nnet2;
typedef kaldi::int32 int32;
const char *usage =
"This program can used when transferring a neural net from one language\n"
"to another (or one tree to another). It takes a neural net and a\n"
"transition model from a different neural net, resizes the last layer\n"
"to match the new transition model, zeroes it, and writes out the new,\n"
"resized .mdl file. If the original model had been 'mixed-up', the associated\n"
"SumGroupComponent will be removed.\n"
"\n"
"Usage: nnet-am-reinitialize [options] <nnet-in> <new-transition-model> <nnet-out>\n"
"e.g.:\n"
" nnet-am-reinitialize 1.mdl exp/tri6/final.mdl 2.mdl\n";
bool binary_write = true;
ParseOptions po(usage);
po.Register("binary", &binary_write, "Write output in binary mode");
po.Read(argc, argv);
if (po.NumArgs() != 3) {
po.PrintUsage();
exit(1);
}
std::string nnet_rxfilename = po.GetArg(1),
transition_model_rxfilename = po.GetArg(2),
nnet_wxfilename = po.GetArg(3);
TransitionModel orig_trans_model;
AmNnet am_nnet;
{
bool binary;
Input ki(nnet_rxfilename, &binary);
orig_trans_model.Read(ki.Stream(), binary);
am_nnet.Read(ki.Stream(), binary);
}
TransitionModel new_trans_model;
ReadKaldiObject(transition_model_rxfilename, &new_trans_model);
am_nnet.ResizeOutputLayer(new_trans_model.NumPdfs());
{
Output ko(nnet_wxfilename, binary_write);
new_trans_model.Write(ko.Stream(), binary_write);
am_nnet.Write(ko.Stream(), binary_write);
}
KALDI_LOG << "Resized neural net from " << nnet_rxfilename
<< " to " << am_nnet.NumPdfs()
<< " pdfs, and wrote to " << nnet_wxfilename;
return 0;
} catch(const std::exception &e) {
std::cerr << e.what() << '\n';
return -1;
}
}
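// Example workflow (a sketch; directory names hypothetical): resize a model trained
// on one language to another language's tree, then inspect the result:
//   nnet-am-reinitialize exp_lang1/nnet_a/final.mdl exp_lang2/tri5/final.mdl exp_lang2/nnet_a/0.mdl
//   nnet-am-info exp_lang2/nnet_a/0.mdl   # number of pdfs should now match the tri5 tree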

View file

@ -33,10 +33,10 @@ int main(int argc, char *argv[]) {
const char *usage =
"Get examples of data for discriminative neural network training;\n"
"each one corresponds to part of a file, of variable (and configurable\n"
"each one corresponds to part of a file, of variable (and configurable)\n"
"length.\n"
"\n"
"Usage: nnet-get-egs-discriminative [options] <model|transition-model> "
"Usage: nnet-get-egs-discriminative [options] <model> "
"<features-rspecifier> <ali-rspecifier> <den-lat-rspecifier> "
"<training-examples-out>\n"
"\n"