Several nnet2-online changes: make it easier to get the feature-extraction options right in cross-system training; add the train_pnorm_simple.sh script (simplified learning-rate schedule and improved combination at the end; supersedes train_pnorm_fast.sh); modify the big-data online-nnet2 recipes to use 40-dimensional rather than 13-dimensional MFCCs as input (results to be added soon, but they are improved). Also modify filter_scp.pl to use a one-based, not zero-based, field index.
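An illustrative before/after for the filter_scp.pl change (the filenames below are just examples, not part of this commit): with one-based indexing, -f 1 now refers to the first field (formerly -f 0, which is also the new default), and callers that filtered on the second field move from -f 1 to -f 2:

    # old (zero-based field index), filtering on the second field:
    utils/filter_scp.pl -f 1 filter_list some.scp > filtered.scp
    # new (one-based field index), same behaviour:
    utils/filter_scp.pl -f 2 filter_list some.scp > filtered.scp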

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4493 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Dan Povey 2014-09-30 19:18:36 +00:00
Parent 011808dcae
Commit 6f598676cc
34 changed files with 1353 additions and 263 deletions

View file

@ -96,7 +96,7 @@ while (( "$#" )); do
$cmd LMWT=$min_lmwt:$max_lmwt $targetdir/$kws/kws_filter.LMWT.log \
set -e';' set -o pipefail';' \
mkdir -p $targetdir/${kws}_LMWT';'\
cat $resultdir/${kws}_LMWT/'result.*' \| utils/filter_scp.pl -f 1 $filter \> $targetdir/${kws}_LMWT/result || exit -1
cat $resultdir/${kws}_LMWT/'result.*' \| utils/filter_scp.pl -f 2 $filter \> $targetdir/${kws}_LMWT/result || exit -1
echo -e "\tWrite normalized..."

View file

@ -1,10 +0,0 @@
--window-type=hamming # disable Dan's window, use the standard one
--use-energy=false # only fbank outputs
--sample-frequency=8000 # Cantonese is sampled at 8kHz
--low-freq=64 # typical setup from Frantisek Grezl
--high-freq=3800
--dither=1
--num-mel-bins=15 # 8kHz so we use 15 bins
--htk-compat=true # try to make it compatible with HTK

View file

@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false # use average of log energy, not energy.
--sample-frequency=8000 # Switchboard is sampled at 8kHz
--num-mel-bins=40 # similar to Google's setup.
--num-ceps=40 # there is no dimensionality reduction.
--low-freq=40 # low cutoff frequency for mel bins
--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800)
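# Usage sketch (paths are illustrative, not part of this config): this file is
# normally passed to the feature-extraction script, which in turn hands it to
# compute-mfcc-feats via its --config option, e.g.
#   steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 70 \
#     data/train_hires exp/make_hires/train mfcc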

View file

@ -1,6 +1,5 @@
#!/bin/bash
. cmd.sh
@ -12,74 +11,99 @@ set -e
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
# assume use_gpu=true since it would be way too slow otherwise.
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
where "nvcc" is installed.
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
# the _a is in case I want to change the parameters.
dir=exp/nnet2_online/nnet_a_gpu
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet2_online/nnet_a
fi
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet2_online/nnet_a_gpu
mkdir -p exp/nnet2_online
if [ $stage -le 1 ]; then
mkdir -p exp/nnet2_online
# To train a diagonal UBM we don't need very much data, so use the smallest subset.
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 400000 \
data/train_30k 512 exp/tri5a exp/nnet2_online/diag_ubm
# this shows how you can split across multiple file-systems. we'll split the
# MFCC dir across multiple locations. You might want to be careful here, if you
# have multiple copies of Kaldi checked out and run the same recipe, not to let
# them overwrite each other.
mfccdir=mfcc
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
date=$(date +'%m_%d_%H_%M')
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english-$date/s5/$mfccdir/storage $mfccdir/storage
fi
utils/copy_data_dir.sh data/train data/train_hires
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/train_hires exp/make_hires/train $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/train_hires exp/make_hires/train $mfccdir || exit 1;
utils/subset_data_dir.sh data/train_hires 30000 data/train_hires_30k
# want the 100k subset to exactly match train_100k, since we'll use its alignments.
awk '{print $1}' data/train_100k/utt2spk > uttlist
utils/subset_data_dir.sh --utt-list uttlist data/train_hires data/train_hires_100k
rm uttlist
fi
if [ $stage -le 2 ]; then
# We need to build a small system just because we need the LDA+MLLT transform
# to train the diag-UBM on top of. We use --num-iters 13 because after we get
# the transform (12th iter is the last), any further training is pointless.
steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
--splice-opts "--left-context=3 --right-context=3" \
5000 10000 data/train_hires_100k data/lang exp/tri4a exp/nnet2_online/tri5a
fi
if [ $stage -le 3 ]; then
# To train a diagonal UBM we don't need very much data, so use the smallest
# subset. the input directory exp/nnet2_online/tri5a is only needed for
# the splice-opts and the LDA transform.
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 400000 \
data/train_hires_30k 512 exp/nnet2_online/tri5a exp/nnet2_online/diag_ubm
fi
if [ $stage -le 4 ]; then
# iVector extractors can in general be sensitive to the amount of data, but
# this one has a fairly small dim (defaults to 100) so we don't use all of it,
# we use just the 100k subset (about one sixteenth of the data).
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
data/train_100k exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
data/train_hires_100k exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
fi
if [ $stage -le 3 ]; then
if [ $stage -le 5 ]; then
ivectordir=exp/nnet2_online/ivectors_train
if [ $USER == dpovey ]; then # this shows how you can split across multiple file-systems.
utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/fisher_english/s5/$ivectordir $ivectordir/storage
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then # this shows how you can split across multiple file-systems.
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english/s5/$ivectordir/storage $ivectordir/storage
fi
# We extract iVectors on all the train data, which will be what we
# train the system on. This version of the iVector-extraction script
# pairs the utterances into twos (by default, see --utts-per-spk-max option)
# and treats each of these pairs as one speaker.
# Note that these are extracted 'online'.
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
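# (Illustrative example: with --utts-per-spk-max 2, a speaker with 5 utterances
# would be split into pseudo-speakers holding about 2 utterances each, e.g.
# 2 + 2 + 1.)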
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_hires data/train_hires_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
--utts-per-spk-max 2 \
data/train exp/nnet2_online/extractor $ivectordir || exit 1;
data/train_hires_max2 exp/nnet2_online/extractor $ivectordir || exit 1;
fi
if [ $stage -le 4 ]; then
if [ $stage -le 6 ]; then
if [ $USER == dpovey ]; then # this shows how you can split across multiple file-systems.
utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/fisher_english/s5/$dir/egs $dir/egs/storage
fi
# Because we have a lot of data here and we don't want the training to take
# too long, we reduce the number of epochs from the defaults (15 + 5) to (1 +
# too long, we reduce the number of epochs from the defaults (15 + 5) to (3 +
# 1). The option "--io-opts '-tc 12'" is to have more than the default number
# (5) of jobs dumping the egs to disk; this is OK since we're splitting our
# data across four filesystems for speed.
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--num-epochs 3 --num-epochs-extra 1 \
--num-epochs 4 --num-epochs-extra 1 \
--samples-per-iter 400000 \
--splice-width 7 --feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors_train \
--cmvn-opts "--norm-means=false --norm-vars=false" \
@ -94,30 +118,12 @@ if [ $stage -le 4 ]; then
--cmd "$decode_cmd" \
--pnorm-input-dim 3500 \
--pnorm-output-dim 350 \
data/train data/lang exp/tri5a $dir || exit 1;
data/train_hires data/lang exp/tri5a $dir || exit 1;
fi
if [ $stage -le 5 ]; then
# dump iVectors for the testing data.
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \
data/dev exp/nnet2_online/extractor exp/nnet2_online/ivectors_dev || exit 1;
fi
if [ $stage -le 6 ]; then
# this does offline decoding that should give about the same results as the
# real online decoding (the one with --per-utt true)
steps/nnet2/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
--online-ivector-dir exp/nnet2_online/ivectors_dev \
exp/tri5a/graph data/dev $dir/decode_dev || exit 1;
fi
if [ $stage -le 7 ]; then
# If this setup used PLP features, we'd have to give the option --feature-type plp
# to the script below.
steps/online/nnet2/prepare_online_decoding.sh data/lang exp/nnet2_online/extractor \
"$dir" ${dir}_online || exit 1;
steps/online/nnet2/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \
data/lang exp/nnet2_online/extractor "$dir" ${dir}_online || exit 1;
fi
if [ $stage -le 8 ]; then
@ -146,30 +152,3 @@ fi
exit 0;
#Baseline: GMM+SAT system.
#%WER 31.07 [ 12163 / 39141, 1869 ins, 2705 del, 7589 sub ] exp/tri5a/decode_dev/wer_13
# Baseline: p-norm system on top of fMLLR features.
#%WER 23.66 [ 9259 / 39141, 1495 ins, 2432 del, 5332 sub ] exp/nnet6c4_gpu/decode_dev/wer_11
# Our experiment, carrying forward the adaptation state between
# utterances of each speaker.
#%WER 23.79 [ 9311 / 39141, 1499 ins, 2277 del, 5535 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev/wer_11
# Our experiment, with per-utterance decoding:
%WER 24.84 [ 9721 / 39141, 1445 ins, 2410 del, 5866 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_utt/wer_11
# below, with --max-chunks-at-once 3. The WER is slightly worse but I expect in general it will
# be slightly better, due to more iVector right context; this is likely just noise. The average
# latency was reduced vs. the baseline.
#%WER 24.92 [ 9753 / 39141, 1423 ins, 2429 del, 5901 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_utt_mc3/wer_11
# The following results (obtained after ./run_nnet2_discriminative.sh was run) show
# the effect of discriminative training. After 2 epochs, we reduce the WER from 23.58 to 22.07.
%WER 23.58 [ 9229 / 39141, 1382 ins, 2400 del, 5447 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev/wer_12
%WER 22.16 [ 8675 / 39141, 1522 ins, 1886 del, 5267 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_smbr_epoch1/wer_13
%WER 22.07 [ 8637 / 39141, 1540 ins, 1873 del, 5224 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_smbr_epoch2/wer_13

View file

@ -60,14 +60,12 @@ if [ $stage -le 3 ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/fisher_english/s5/$ivectordir $ivectordir/storage
fi
# We extract iVectors on all the train data, which will be what we
# train the system on. This version of the iVector-extraction script
# pairs the utterances into twos (by default, see --utts-per-spk-max option)
# and treats each of these pairs as one speaker.
# Note that these are extracted 'online'.
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train data/train_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
--utts-per-spk-max 2 \
data/train exp/nnet2_online/extractor $ivectordir || exit 1;
data/train_max2 exp/nnet2_online/extractor $ivectordir || exit 1;
fi
@ -83,7 +81,8 @@ if [ $stage -le 4 ]; then
# data across four filesystems for speed.
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--num-epochs 3 --num-epochs-extra 1 \
--num-epochs 4 --num-epochs-extra 1 \
--samples-per-iter 400000 \
--splice-width 7 --feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors_train \
--cmvn-opts "--norm-means=false --norm-vars=false" \

View file

@ -1,8 +1,6 @@
#!/bin/bash
# This is to be run after run_nnet2.sh
# THIS IS NOT TESTED YET.
. cmd.sh
@ -43,7 +41,6 @@ set -e
nj=40
if [ $stage -le 1 ]; then
# the make_denlats job is always done on CPU not GPU, since in any case
# the graph search and lattice determinization take quite a bit of CPU.
# note: it's the sub-split option that determines how many jobs actually
@ -51,7 +48,7 @@ if [ $stage -le 1 ]; then
steps/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G" \
--nj $nj --sub-split 40 --num-threads 6 --parallel-opts "-pe smp 6" \
--online-ivector-dir exp/nnet2_online/ivectors_train \
data/train data/lang $srcdir ${srcdir}_denlats
data/train_hires data/lang $srcdir ${srcdir}_denlats
fi
if [ $stage -le 2 ]; then
@ -59,7 +56,7 @@ if [ $stage -le 2 ]; then
steps/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" \
--online-ivector-dir exp/nnet2_online/ivectors_train \
--use-gpu $use_gpu_opt \
--nj $nj data/train data/lang ${srcdir} ${srcdir}_ali
--nj $nj data/train_hires data/lang ${srcdir} ${srcdir}_ali
fi
if [ $stage -le 3 ]; then
@ -72,22 +69,22 @@ if [ $stage -le 3 ]; then
# since we're using 4 disks.
steps/nnet2/train_discriminative.sh --cmd "$decode_cmd" --learning-rate 0.00001 \
--io-opts "-pe smp 10" \
--num-epochs 2 \
--num-epochs 4 \
--use-preconditioning $use_preconditioning \
--online-ivector-dir exp/nnet2_online/ivectors_train \
--num-jobs-nnet 4 --num-threads $num_threads --parallel-opts "$gpu_opts" \
data/train data/lang \
data/train_hires data/lang \
${srcdir}_ali ${srcdir}_denlats ${srcdir}/final.mdl ${srcdir}_smbr
fi
if [ $stage -le 4 ]; then
# we'll do the decoding as 'online' decoding by using the existing
# _online directory but with extra models copied to it.
for epoch in 1 2; do
for epoch in 1 2 3 4; do
cp ${srcdir}_smbr/epoch${epoch}.mdl ${srcdir}_online/smbr_epoch${epoch}.mdl
done
for epoch in 1 2; do
for epoch in 1 2 3 4; do
# do the actual online decoding with iVectors, carrying info forward from
# previous utterances of the same speaker.
steps/online/nnet2/decode.sh --cmd "$decode_cmd" --nj 30 --iter smbr_epoch${epoch} \
@ -95,5 +92,6 @@ if [ $stage -le 4 ]; then
done
fi
wait
# for results, see the end of run_nnet2.sh

View file

@ -45,7 +45,7 @@ if [ $stage -le 2 ]; then
local/vad_split_utts_fix_data.pl $in_dir $dir;
fi
utils/filter_scp.pl -f 0 \
utils/filter_scp.pl \
<(echo "`awk < "$dir/segments" '{ print $2 }'`") $in_dir/wav.scp \
> $dir/wav.scp

View file

@ -31,7 +31,7 @@ classes="ark:lid/remove_dialect.pl data/train/utt2lang \
# Create priors to rebalance the model. The following script rebalances
# the languages as count(lang_test) / (count(lang_test) + count(lang_train)).
lid/balance_priors_to_test.pl \
<(lid/remove_dialect.pl <(utils/filter_scp.pl -f 0 \
<(lid/remove_dialect.pl <(utils/filter_scp.pl \
exp/ivectors_train/ivector.scp data/train/utt2lang)) \
<(lid/remove_dialect.pl data/lre07/utt2lang) \
exp/ivectors_train/languages.txt \

View file

@ -6,6 +6,9 @@
stage=1
train_stage=-10
use_gpu=true
dir=exp/nnet2_online/nnet_a
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
@ -21,7 +24,6 @@ EOF
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet2_online/nnet_gpu
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
@ -47,14 +49,17 @@ if [ $stage -le 2 ]; then
fi
if [ $stage -le 3 ]; then
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train data/train_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \
--utts-per-spk-max 2 \
data/train exp/nnet2_online/extractor exp/nnet2_online/ivectors || exit 1;
data/train_max2 exp/nnet2_online/extractor exp/nnet2_online/ivectors || exit 1;
fi
if [ $stage -le 4 ]; then
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
steps/nnet2/train_pnorm_simple.sh --stage $train_stage \
--splice-width 7 \
--feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors \
@ -63,7 +68,8 @@ if [ $stage -le 4 ]; then
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--num-jobs-nnet 4 \
--num-epochs-extra 10 --add-layers-period 1 \
--num-epochs 25 \
--add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \

View file

@ -1,7 +1,7 @@
#!/bin/bash
# this is a baseline for run_online_decoding_nnet2.sh, without
# this is a baseline for ./run_nnet2.sh, without
# the iVectors, to see whether they make a difference.
. cmd.sh
@ -10,10 +10,14 @@
stage=1
train_stage=-10
use_gpu=true
dir=exp/nnet2_online/nnet_a_baseline
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
@ -25,19 +29,17 @@ EOF
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet2_online/nnet_gpu_baseline
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet2_online/nnet_baseline
fi
if [ $stage -le 1 ]; then
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
steps/nnet2/train_pnorm_simple.sh --stage $train_stage \
--splice-width 7 \
--feat-type raw \
--cmvn-opts "--norm-means=false --norm-vars=false" \
@ -45,7 +47,8 @@ if [ $stage -le 1 ]; then
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--num-jobs-nnet 4 \
--num-epochs-extra 10 --add-layers-period 1 \
--num-epochs 25 \
--add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
@ -82,4 +85,4 @@ if [ $stage -le 4 ]; then
wait
fi
# for results, see the end of ./run_online_decoding_nnet2.sh
# for results, see the end of ./run_nnet2.sh

View file

@ -77,11 +77,13 @@ if [ $stage -le 5 ]; then
fi
# Below, setting --utts-per-spk-max to a noninteger helps to randomize the division
# of speakers into "fake-speakers" with about 2 utterances each, by randomly making
# some have 2 and some 3 utterances... this randomnes will be different in different
# some have 2 and some 3 utterances... this randomness will be different in different
# copies of the data.
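# (Illustrative sketch: with --utts-per-spk-max 2.5, a speaker with 5 utterances
# might end up split as 2 + 3 or 3 + 2, the choice being made randomly per copy
# of the data.)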
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2.5 data/train_perturbed_mfcc \
data/train_perturbed_mfcc_max2.5
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
--utts-per-spk-max 2.5 \
data/train_perturbed_mfcc exp/nnet2_online/extractor $ivectordir || exit 1;
data/train_perturbed_mfcc_max2.5 exp/nnet2_online/extractor $ivectordir || exit 1;
fi

View file

@ -4,7 +4,7 @@
# the optional part local/online/run_online_decoding_nnet2.sh. It builds a
# neural net for online decoding on top of the network we previously trained on
# WSJ, by keeping everything but the last layer of that network and then
# training just the last layer on our data.
# training just the last layer on our data. We then train the whole thing.
stage=0
set -e
@ -26,35 +26,40 @@ EOF
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet2_online_wsj/nnet_gpu
trainfeats=exp/nnet2_online_wsj/wsj_activations_train_gpu
dir=exp/nnet2_online_wsj/nnet_a
trainfeats=exp/nnet2_online_wsj/wsj_activations_train
# later we'll change the script to download the trained model from kaldi-asr.org.
srcdir=../../wsj/s5/exp/nnet2_online/nnet_a_gpu_online
# the following things are needed while training the combined model.
srcdir_orig=../../wsj/s5/exp/nnet2_online/nnet_a_gpu
ivector_src=../../wsj/s5/exp/nnet2_online/extractor
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet2_online_wsj/nnet
dir=exp/nnet2_online_wsj/nnet_a
trainfeats=exp/nnet2_online_wsj/wsj_activations_train
srcdir=../../wsj/s5/exp/nnet2_online/nnet_a_online
# the following things are needed while training the combined model.
srcdir_orig=../../wsj/s5/exp/nnet2_online/nnet_a
ivector_src=../../wsj/s5/exp/nnet2_online/extractor
fi
if [ $stage -le 0 ]; then
echo "$0: dumping activations from WSJ model"
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $trainfeats/feats/storage ]; then
# this shows how you can split the data across multiple file-systems; it's optional.
date=$(date +'%m_%d_%H_%M')
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/rm-$date/s5/$trainfeats/feats/storage \
$trainfeats/feats/storage
fi
steps/online/nnet2/dump_nnet_activations.sh --cmd "$train_cmd" --nj 30 \
data/train $srcdir $trainfeats
fi
if [ $stage -le 1 ]; then
echo "$0: training 0-hidden-layer model on top of WSJ activations"
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/rm-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
fi
steps/nnet2/retrain_fast.sh --stage $train_stage \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
@ -71,9 +76,6 @@ if [ $stage -le 2 ]; then
steps/online/nnet2/prepare_online_decoding_retrain.sh $srcdir $dir ${dir}_online
fi
# Note: at this point it might be possible to further train the combined model
# by doing backprop through all of it. We haven't implemented this yet.
if [ $stage -le 3 ]; then
# do online decoding with the combined model.
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
@ -98,7 +100,7 @@ fi
## the model on this dataset. First we need to create a combined version of the
## model.
if [ $stage -le 5 ]; then
steps/nnet2/create_appended_model.sh $srcdir_orig $dir ${dir}_combined_init
steps/nnet2/create_appended_model.sh $srcdir $dir ${dir}_combined_init
# Set the learning rate in this initial value to our guess of a suitable value.
# note: we initially tried 0.005, and this gave us WERs of (1.40, 1.48, 7.24, 7.70) vs.
@ -107,31 +109,20 @@ if [ $stage -le 5 ]; then
nnet-am-copy --learning-rate=$initial_learning_rate ${dir}_combined_init/final.mdl ${dir}_combined_init/final.mdl
fi
# In order to train the combined model, we'll need to dump iVectors.
if [ $stage -le 6 ]; then
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 10 \
--utts-per-spk-max 2 \
data/train $ivector_src exp/nnet2_online_wsj/ivectors || exit 1;
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{1,2,3,4}/$USER/kaldi-data/rm-$(date +'%m_%d_%H_%M')/s5/${dir}_combined/egs/storage \
${dir}_combined/egs/storage
fi
# This version of the get_egs.sh script does the feature extraction and iVector
# extraction in a single binary, reading the config, as part of the script.
steps/online/nnet2/get_egs.sh --cmd "$train_cmd" --num-jobs-nnet 4 \
data/train exp/tri3b_ali ${dir}_online ${dir}_combined
fi
if [ $stage -le 7 ]; then
# assume left and right context of model are identical.
splice_width=$(nnet-am-info exp/nnet2_online_wsj/nnet_gpu_combined_init/final.mdl | grep '^left-context' | awk '{print $2}') || exit 1;
# Note: in general the get_egs.sh script would get things like the LDA matrix
# from exp/tri3b_ali, which would be the wrong thing to do as we want to get
# them from the original model dir. In this case we're using raw MFCC
# features so it's not an issue. But in general we'd probably have to create
# a temporary dir and copy or link both the alignments and feature-related
# things to it.
steps/nnet2/get_egs.sh --cmd "$train_cmd" \
--feat-type raw --cmvn-opts "--norm-means=false --norm-vars=false" \
--online-ivector-dir exp/nnet2_online_wsj/ivectors \
--num-jobs-nnet 4 --splice-width $splice_width \
data/train data/lang exp/tri3b_ali ${dir}_combined
fi
if [ $stage -le 8 ]; then
steps/nnet2/train_more.sh --learning-rate-factor 0.1 --cmd "$train_cmd" \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
@ -139,15 +130,15 @@ if [ $stage -le 8 ]; then
${dir}_combined_init/final.mdl ${dir}_combined/egs ${dir}_combined
fi
if [ $stage -le 9 ]; then
if [ $stage -le 8 ]; then
# Create an online-decoding dir corresponding to what we just trained above.
# If this setup used PLP features, we'd have to give the option --feature-type plp
# to the script below.
steps/online/nnet2/prepare_online_decoding.sh data/lang $ivector_src \
steps/online/nnet2/prepare_online_decoding.sh data/lang $srcdir/ivector_extractor \
${dir}_combined ${dir}_combined_online || exit 1;
fi
if [ $stage -le 10 ]; then
if [ $stage -le 9 ]; then
# do the online decoding on top of the retrained _combined_online model, and
# also the per-utterance version of the online decoding.
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
@ -166,25 +157,27 @@ fi
exit 0;
# Here are the results when we just retrain the last layer:
# grep WER exp/nnet2_online_wsj/nnet_gpu_online/decode/wer_* | utils/best_wer.sh
#%WER 1.61 [ 202 / 12533, 22 ins, 46 del, 134 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode/wer_3
#a11:s5: grep WER exp/nnet2_online_wsj/nnet_gpu_online/decode_ug/wer_* | utils/best_wer.sh
#%WER 7.99 [ 1002 / 12533, 74 ins, 153 del, 775 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode_ug/wer_6
# grep WER exp/nnet2_online_wsj/nnet_a_online/decode/wer_* | utils/best_wer.sh
#%WER 1.60 [ 201 / 12533, 22 ins, 46 del, 133 sub ] exp/nnet2_online_wsj/nnet_a_online/decode/wer_3
#a11:s5: grep WER exp/nnet2_online_wsj/nnet_a_online/decode_ug/wer_* | utils/best_wer.sh
#%WER 8.02 [ 1005 / 12533, 74 ins, 155 del, 776 sub ] exp/nnet2_online_wsj/nnet_a_online/decode_ug/wer_6
# and with per-utterance decoding:
# %WER 1.72 [ 216 / 12533, 26 ins, 45 del, 145 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode_utt/wer_3
# %WER 8.40 [ 1053 / 12533, 85 ins, 158 del, 810 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode_ug_utt/wer_6
# %WER 8.47 [ 1061 / 12533, 88 ins, 157 del, 816 sub ] exp/nnet2_online_wsj/nnet_a_online/decode_ug_utt/wer_6
# %WER 1.70 [ 213 / 12533, 24 ins, 46 del, 143 sub ] exp/nnet2_online_wsj/nnet_a_online/decode_utt/wer_3
# and here when we retrain the whole thing:
# %WER 1.32 [ 165 / 12533, 14 ins, 34 del, 117 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode/wer_3
# %WER 7.20 [ 902 / 12533, 78 ins, 127 del, 697 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode_ug/wer_6
#%WER 1.42 [ 178 / 12533, 16 ins, 44 del, 118 sub ] exp/nnet2_online_wsj/nnet_a_combined_online/decode/wer_4
#%WER 7.08 [ 887 / 12533, 74 ins, 133 del, 680 sub ] exp/nnet2_online_wsj/nnet_a_combined_online/decode_ug/wer_6
# and with per-utterance decoding:
# %WER 1.38 [ 173 / 12533, 19 ins, 32 del, 122 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode_per_utt/wer_3
# %WER 7.44 [ 932 / 12533, 57 ins, 163 del, 712 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode_ug_per_utt/wer_8
# and the same with per-utterance decoding:
# %WER 1.56 [ 196 / 12533, 31 ins, 26 del, 139 sub ] exp/nnet2_online_wsj/nnet_a_combined_online/decode_per_utt/wer_2
# %WER 7.86 [ 985 / 12533, 59 ins, 171 del, 755 sub ] exp/nnet2_online_wsj/nnet_a_combined_online/decode_ug_per_utt/wer_8
# And this is a suitable baseline: a system trained on RM only.
#a11:s5: grep WER exp/nnet2_online/nnet_gpu_online/decode/wer_* | utils/best_wer.sh
#%WER 2.20 [ 276 / 12533, 25 ins, 69 del, 182 sub ] exp/nnet2_online/nnet_gpu_online/decode/wer_8
#a11:s5: grep WER exp/nnet2_online/nnet_gpu_online/decode_ug/wer_* | utils/best_wer.sh
#%WER 10.14 [ 1271 / 12533, 127 ins, 198 del, 946 sub ] exp/nnet2_online/nnet_gpu_online/decode_ug/wer_11
#a11:s5: grep WER exp/nnet2_online/nnet_a_online/decode/wer_* | utils/best_wer.sh
#%WER 2.20 [ 276 / 12533, 25 ins, 69 del, 182 sub ] exp/nnet2_online/nnet_a_online/decode/wer_8
#a11:s5: grep WER exp/nnet2_online/nnet_a_online/decode_ug/wer_* | utils/best_wer.sh
#%WER 10.14 [ 1271 / 12533, 127 ins, 198 del, 946 sub ] exp/nnet2_online/nnet_a_online/decode_ug/wer_11

View file

@ -50,12 +50,14 @@ fi
if [ $stage -le 3 ]; then
# We extract iVectors on all the train_nodup data, which will be what we
# train the system on. This version of the iVector-extraction script
# pairs the utterances into twos (by default, see --utts-per-spk-max option)
# and treats each as one speaker.
# train the system on.
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_nodup data/train_nodup_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
--utts-per-spk-max 2 \
data/train_nodup exp/nnet2_online/extractor exp/nnet2_online/ivectors_train_nodup2 || exit 1;
data/train_nodup_max2 exp/nnet2_online/extractor exp/nnet2_online/ivectors_train_nodup2 || exit 1;
fi

View file

@ -0,0 +1,194 @@
#!/bin/bash
# This script trains a Switchboard system starting from a neural net trained for
# Fisher English. It builds a
# neural net for online decoding on top of the network we previously trained on
# WSJ, by keeping everything but the last layer of that network and then
# training just the last layer on our data.
stage=0
set -e
train_stage=-10
use_gpu=true
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet2_online_wsj/nnet_gpu
trainfeats=exp/nnet2_online_wsj/wsj_activations_train_gpu
srcdir=../../wsj/s5/exp/nnet2_online/nnet_a_gpu_online
# the following things are needed while training the combined model.
srcdir_orig=../../wsj/s5/exp/nnet2_online/nnet_a_gpu
ivector_src=../../wsj/s5/exp/nnet2_online/extractor
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet2_online_wsj/nnet
trainfeats=exp/nnet2_online_wsj/wsj_activations_train
srcdir=../../wsj/s5/exp/nnet2_online/nnet_a_online
# the following things are needed while training the combined model.
srcdir_orig=../../wsj/s5/exp/nnet2_online/nnet_a
ivector_src=../../wsj/s5/exp/nnet2_online/extractor
fi
if [ $stage -le 0 ]; then
echo "$0: dumping activations from WSJ model"
steps/online/nnet2/dump_nnet_activations.sh --cmd "$train_cmd" --nj 30 \
data/train $srcdir $trainfeats
fi
if [ $stage -le 1 ]; then
echo "$0: training 0-hidden-layer model on top of WSJ activations"
steps/nnet2/retrain_fast.sh --stage $train_stage \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--cmd "$decode_cmd" \
--num-jobs-nnet 4 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
$trainfeats/data data/lang exp/tri3b_ali $dir
fi
if [ $stage -le 2 ]; then
echo "$0: formatting combined model for online decoding."
steps/online/nnet2/prepare_online_decoding_retrain.sh $srcdir $dir ${dir}_online
fi
# Note: at this point it might be possible to further train the combined model
# by doing backprop through all of it. We haven't implemented this yet.
if [ $stage -le 3 ]; then
# do online decoding with the combined model.
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
exp/tri3b/graph data/test ${dir}_online/decode &
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
exp/tri3b/graph_ug data/test ${dir}_online/decode_ug || exit 1;
wait
fi
if [ $stage -le 4 ]; then
# do online per-utterance decoding with the combined model.
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--per-utt true \
exp/tri3b/graph data/test ${dir}_online/decode_utt &
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--per-utt true \
exp/tri3b/graph_ug data/test ${dir}_online/decode_ug_utt || exit 1;
wait
fi
## From this point on we try something else: we try training all the layers of
## the model on this dataset. First we need to create a combined version of the
## model.
if [ $stage -le 5 ]; then
steps/nnet2/create_appended_model.sh $srcdir_orig $dir ${dir}_combined_init
# Set the learning rate in this initial value to our guess of a suitable value.
# note: we initially tried 0.005, and this gave us WERs of (1.40, 1.48, 7.24, 7.70) vs.
# (1.32, 1.38, 7.20, 7.44) with a learning rate of 0.01.
initial_learning_rate=0.01
nnet-am-copy --learning-rate=$initial_learning_rate ${dir}_combined_init/final.mdl ${dir}_combined_init/final.mdl
fi
# In order to train the combined model, we'll need to dump iVectors.
if [ $stage -le 6 ]; then
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train data/train_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 10 \
data/train_max2 $ivector_src exp/nnet2_online_wsj/ivectors || exit 1;
fi
if [ $stage -le 7 ]; then
# assume left and right context of model are identical.
splice_width=$(nnet-am-info exp/nnet2_online_wsj/nnet_gpu_combined_init/final.mdl | grep '^left-context' | awk '{print $2}') || exit 1;
# Note: in general the get_egs.sh script would get things like the LDA matrix
# from exp/tri3b_ali, which would be the wrong thing to do as we want to get
# them from the original model dir. In this case we're using raw MFCC
# features so it's not an issue. But in general we'd probably have to create
# a temporary dir and copy or link both the alignments and feature-related
# things to it.
steps/nnet2/get_egs.sh --cmd "$train_cmd" \
--feat-type raw --cmvn-opts "--norm-means=false --norm-vars=false" \
--online-ivector-dir exp/nnet2_online_wsj/ivectors \
--num-jobs-nnet 4 --splice-width $splice_width \
data/train data/lang exp/tri3b_ali ${dir}_combined
fi
if [ $stage -le 8 ]; then
steps/nnet2/train_more.sh --learning-rate-factor 0.1 --cmd "$train_cmd" \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
${dir}_combined_init/final.mdl ${dir}_combined/egs ${dir}_combined
fi
if [ $stage -le 9 ]; then
# Create an online-decoding dir corresponding to what we just trained above.
# If this setup used PLP features, we'd have to give the option --feature-type plp
# to the script below.
steps/online/nnet2/prepare_online_decoding.sh data/lang $ivector_src \
${dir}_combined ${dir}_combined_online || exit 1;
fi
if [ $stage -le 10 ]; then
# do the online decoding on top of the retrained _combined_online model, and
# also the per-utterance version of the online decoding.
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
exp/tri3b/graph data/test ${dir}_combined_online/decode &
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
exp/tri3b/graph_ug data/test ${dir}_combined_online/decode_ug &
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--per-utt true exp/tri3b/graph data/test ${dir}_combined_online/decode_per_utt &
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--per-utt true exp/tri3b/graph_ug data/test ${dir}_combined_online/decode_ug_per_utt || exit 1;
wait
fi
exit 0;
# Here are the results when we just retrain the last layer:
# grep WER exp/nnet2_online_wsj/nnet_gpu_online/decode/wer_* | utils/best_wer.sh
#%WER 1.61 [ 202 / 12533, 22 ins, 46 del, 134 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode/wer_3
#a11:s5: grep WER exp/nnet2_online_wsj/nnet_gpu_online/decode_ug/wer_* | utils/best_wer.sh
#%WER 7.99 [ 1002 / 12533, 74 ins, 153 del, 775 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode_ug/wer_6
# and with per-utterance decoding:
# %WER 1.72 [ 216 / 12533, 26 ins, 45 del, 145 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode_utt/wer_3
# %WER 8.40 [ 1053 / 12533, 85 ins, 158 del, 810 sub ] exp/nnet2_online_wsj/nnet_gpu_online/decode_ug_utt/wer_6
# and here when we retrain the whole thing:
# %WER 1.32 [ 165 / 12533, 14 ins, 34 del, 117 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode/wer_3
# %WER 7.20 [ 902 / 12533, 78 ins, 127 del, 697 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode_ug/wer_6
# and with per-utterance decoding:
# %WER 1.38 [ 173 / 12533, 19 ins, 32 del, 122 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode_per_utt/wer_3
# %WER 7.44 [ 932 / 12533, 57 ins, 163 del, 712 sub ] exp/nnet2_online_wsj/nnet_gpu_combined_online/decode_ug_per_utt/wer_8
# And this is a suitable baseline: a system trained on RM only.
#a11:s5: grep WER exp/nnet2_online/nnet_gpu_online/decode/wer_* | utils/best_wer.sh
#%WER 2.20 [ 276 / 12533, 25 ins, 69 del, 182 sub ] exp/nnet2_online/nnet_gpu_online/decode/wer_8
#a11:s5: grep WER exp/nnet2_online/nnet_gpu_online/decode_ug/wer_* | utils/best_wer.sh
#%WER 10.14 [ 1271 / 12533, 127 ins, 198 del, 946 sub ] exp/nnet2_online/nnet_gpu_online/decode_ug/wer_11

View file

@ -55,9 +55,13 @@ fi
if [ $stage -le 3 ]; then
# We extract iVectors on all the train_si284 data, which will be what we
# train the system on.
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_si284 data/train_si284_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
--utts-per-spk-max 2
data/train_si284 exp/nnet2_online/extractor exp/nnet2_online/ivectors_train_si284 || exit 1;
data/train_si284_max2 exp/nnet2_online/extractor exp/nnet2_online/ivectors_train_si284 || exit 1;
fi
@ -78,8 +82,8 @@ if [ $stage -le 4 ]; then
# wouldn't be able to decode in real-time using a CPU.
#
# I copied the learning rates from ../nnet2/run_5d.sh
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--num-epochs 8 --num-epochs-extra 4 \
steps/nnet2/train_pnorm_simple.sh --stage $train_stage \
--num-epochs 12 \
--splice-width 7 --feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors_train_si284 \
--cmvn-opts "--norm-means=false --norm-vars=false" \

View file

@ -32,7 +32,7 @@ src1=$1
src2=$2
dir=$3
for f in $src1/final.mdl $src1/cmvn_opts $src2/tree $src2/final.mdl; do
for f in $src1/final.mdl $src2/tree $src2/final.mdl; do
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

View file

@ -60,10 +60,12 @@ if [ $# != 4 ]; then
fi
data=$1
lang=$2
lang=$2 # kept for historical reasons, but never used.
alidir=$3
dir=$4
# Check some files.
[ ! -z "$online_ivector_dir" ] && \
extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
@ -73,13 +75,8 @@ for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/
done
# Set some variables.
oov=`cat $lang/oov.int`
num_leaves=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj
@ -189,14 +186,22 @@ mkdir -p $dir/egs
if [ $stage -le 2 ]; then
echo "Getting validation and training subset examples."
rm $dir/.error 2>/dev/null
echo "$0: extracting validation and training-subset alignments."
set -o pipefail;
for id in $(seq $nj); do gunzip -c $alidir/ali.$id.gz; done | \
copy-int-vector ark:- ark,t:- | \
utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) | \
gzip -c >$dir/ali_special.gz || exit 1;
set +o pipefail; # unset the pipefail option.
all_ids=$(seq -s, $nj) # e.g. 1,2,...39,40
$cmd $dir/log/create_valid_subset.log \
nnet-get-egs $ivectors_opt $nnet_context_opts "$valid_feats" \
"ark,s,cs:gunzip -c $alidir/ali.{$all_ids}.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark:$dir/egs/valid_all.egs" || touch $dir/.error &
$cmd $dir/log/create_train_subset.log \
nnet-get-egs $ivectors_opt $nnet_context_opts "$train_subset_feats" \
"ark,s,cs:gunzip -c $alidir/ali.{$all_ids}.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark:$dir/egs/train_subset_all.egs" || touch $dir/.error &
wait;
[ -f $dir/.error ] && exit 1;
@ -220,12 +225,10 @@ if [ $stage -le 2 ]; then
for f in $dir/egs/{combine,train_diagnostic,valid_diagnostic}.egs; do
[ ! -s $f ] && echo "No examples in file $f" && exit 1;
done
rm $dir/egs/valid_all.egs $dir/egs/train_subset_all.egs $dir/egs/{train,valid}_combine.egs
rm $dir/egs/valid_all.egs $dir/egs/train_subset_all.egs $dir/egs/{train,valid}_combine.egs $dir/ali_special.gz
fi
if [ $stage -le 3 ]; then
mkdir -p $dir/temp
# Other scripts might need to know the following info:
echo $num_jobs_nnet >$dir/egs/num_jobs_nnet
echo $iters_per_epoch >$dir/egs/iters_per_epoch
@ -279,9 +282,6 @@ if [ $stage -le 5 ]; then
echo "Shuffling the order of training examples"
echo "(in order to avoid stressing the disk, these won't all run at once)."
# note, the "|| true" below is a workaround for NFS bugs
# we encountered running this script with Debian-7, NFS-v4.
for n in `seq 0 $[$iters_per_epoch-1]`; do
$cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$n.JOB.log \
nnet-shuffle-egs "--srand=\$[JOB+($num_jobs_nnet*$n)]" \

View file

@ -392,7 +392,6 @@ echo Done
if $cleanup; then
echo Cleaning up data
if [ $egs_dir == "$dir/egs" ]; then
echo Removing training examples
rm $dir/egs/egs*
steps/nnet2/remove_egs.sh $dir/egs
fi
fi

View file

@ -121,8 +121,6 @@ if [ $# != 4 ]; then
echo " --lda-dim <dim|250> # Dimension to reduce spliced features to with LDA"
echo " --num-iters-final <#iters|20> # Number of final iterations to give to nnet-combine-fast to "
echo " # interpolate parameters (the weights are learned with a validation set)"
echo " --first-component-power <power|1.0> # Power applied to output of first p-norm layer... setting this to"
echo " # 0.5 seems to help under some circumstances."
echo " --egs-opts <opts> # Extra options to pass to get_egs.sh"
echo " --lda-opts <opts> # Extra options to pass to get_lda.sh"
echo " --stage <stage|-9> # Used to run a partially-completed training process from somewhere in"

View file

@ -0,0 +1,478 @@
#!/bin/bash
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey).
# 2013 Xiaohui Zhang
# 2013 Guoguo Chen
# Apache 2.0.
# train_pnorm_simple.sh is a modified version of train_pnorm_fast.sh. Like
# train_pnorm_fast.sh, it uses the `online' preconditioning, which is faster
# (especially on GPUs). The difference is that the learning-rate schedule is
# simpler, with the learning rate exponentially decreasing during training,
# and no phase where the learning rate is constant.
#
# Also, the final model-combination is done a bit differently: we combine models
# from (typically) a whole epoch, and because that would be too many iterations to
# combine over easily, we arrange the iterations into groups (20 groups by
# default) and average over each group.
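#
# A rough sketch of the learning-rate schedule implemented below (see the perl
# one-liner in the main training loop): on iteration x out of num_iters,
#   lr(x) = initial_learning_rate * (final_learning_rate / initial_learning_rate) ^ (x / num_iters)
# i.e. a geometric interpolation from the initial to the final learning rate,
# clamped at final_learning_rate once x reaches num_iters.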
# Begin configuration section.
cmd=run.pl
num_epochs=15 # Number of epochs of training;
# the number of iterations is worked out from this.
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.5
pnorm_input_dim=3000
pnorm_output_dim=300
p=2
minibatch_size=128 # by default use a smallish minibatch size for neural net
# training; this controls instability which would otherwise
# be a problem with multi-threaded update.
samples_per_iter=400000 # each iteration of training, see this many samples
# per job. This option is passed to get_egs.sh
num_jobs_nnet=16 # Number of neural net jobs to run in parallel. This option
# is passed to get_egs.sh.
get_egs_stage=0
online_ivector_dir=
max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
# to the final 'combine' stage, but these models will themselves be averages of
# iteration-number ranges.
shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
# on each iter. You could set it to 0 or to a large value for complete
# randomization, but this would both consume memory and cause spikes in
# disk I/O. Smaller is easier on disk and memory but less random. It's
# not a huge deal though, as samples are anyway randomized right at the start.
# (the point of this is to get data in different minibatches on different iterations,
# since in the preconditioning method, 2 samples in the same minibatch can
# affect each other's gradients.)
add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3
stage=-4
io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. These don't
splice_width=4 # meaning +- 4 frames on each side for second LDA
randprune=4.0 # speeds up LDA.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.075
precondition_rank_in=20 # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
# specified.)
num_threads=16
parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G"
# by default we use 16 threads; this lets the queue know.
# note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage.
cleanup=true
egs_dir=
lda_opts=
lda_dim=
egs_opts=
transform_dir= # If supplied, overrides alidir
cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied.
# only relevant for "raw" features, not lda.
feat_type= # Can be used to force "raw" features.
prior_subset_size=10000 # 10k samples per job, for computing priors. Should be
# more than enough.
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config file containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --num-epochs <#epochs|15> # Number of epochs of training"
echo " --initial-learning-rate <initial-learning-rate|0.02> # Learning rate at start of training, e.g. 0.02 for small"
echo " # data, 0.01 for large data"
echo " --final-learning-rate <final-learning-rate|0.004> # Learning rate at end of training, e.g. 0.004 for small"
echo " # data, 0.001 for large data"
echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers"
echo " --mix-up <#pseudo-gaussians|0> # Can be used to have multiple targets in final output layer,"
echo " # per context-dependent state. Try a number several times #states."
echo " --num-jobs-nnet <num-jobs|8> # Number of parallel jobs to use for main neural net"
echo " # training (will affect results as well as speed; try 8, 16)"
echo " # Note: if you increase this, you may want to also increase"
echo " # the learning rate."
echo " --num-threads <num-threads|16> # Number of parallel threads per job (will affect results"
echo " # as well as speed; may interact with batch size; if you increase"
echo " # this, you may want to decrease the batch size."
echo " --parallel-opts <opts|\"-pe smp 16 -l ram_free=1G,mem_free=1G\"> # extra options to pass to e.g. queue.pl for processes that"
echo " # use multiple threads... note, you might have to reduce mem_free,ram_free"
echo " # versus your defaults, because it gets multiplied by the -pe smp argument."
echo " --io-opts <opts|\"-tc 10\"> # Options given to e.g. queue.pl for jobs that do a lot of I/O."
echo " --minibatch-size <minibatch-size|128> # Size of minibatch to process (note: product with --num-threads"
echo " # should not get too large, e.g. >2k)."
echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per"
echo " # process."
echo " --splice-width <width|4> # Number of frames on each side to append for feature input"
echo " # (note: we splice processed, typically 40-dimensional frames"
echo " --lda-dim <dim|250> # Dimension to reduce spliced features to with LDA"
echo " --stage <stage|-4> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
exit 1;
fi
data=$1
lang=$2
alidir=$3
dir=$4
# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
# Set some variables.
num_leaves=`tree-info $alidir/tree 2>/dev/null | awk '{print $2}'` || exit 1
[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1
[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1
nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj
mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir
extra_opts=()
[ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
[ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type)
[ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
[ -z "$transform_dir" ] && transform_dir=$alidir
extra_opts+=(--transform-dir $transform_dir)
extra_opts+=(--splice-width $splice_width)
if [ $stage -le -4 ]; then
echo "$0: calling get_lda.sh"
steps/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi
# these files will have been written by get_lda.sh
feat_dim=$(cat $dir/feat_dim) || exit 1;
ivector_dim=$(cat $dir/ivector_dim) || exit 1;
lda_dim=$(cat $dir/lda_dim) || exit 1;
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
echo "$0: calling get_egs.sh"
steps/nnet2/get_egs.sh $egs_opts "${extra_opts[@]}" \
--samples-per-iter $samples_per_iter \
--num-jobs-nnet $num_jobs_nnet --stage $get_egs_stage \
--cmd "$cmd" $egs_opts --io-opts "$io_opts" \
$data $lang $alidir $dir || exit 1;
fi
if [ -z $egs_dir ]; then
egs_dir=$dir/egs
fi
iters_per_epoch=`cat $egs_dir/iters_per_epoch` || exit 1;
! [ $num_jobs_nnet -eq `cat $egs_dir/num_jobs_nnet` ] && \
echo "$0: Warning: using --num-jobs-nnet=`cat $egs_dir/num_jobs_nnet` from $egs_dir"
num_jobs_nnet=`cat $egs_dir/num_jobs_nnet` || exit 1;
if ! [ $num_hidden_layers -ge 1 ]; then
echo "Invalid num-hidden-layers $num_hidden_layers"
exit 1
fi
if [ $stage -le -2 ]; then
echo "$0: initializing neural net";
lda_mat=$dir/lda.mat
tot_input_dim=$[$feat_dim+$ivector_dim]
online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"
stddev=`perl -e "print 1.0/sqrt($pnorm_input_dim);"`
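# The initial network below has one hidden layer: the splicing and LDA-like
# fixed transform, then a single affine + pnorm + normalize group, followed by
# the final affine layer and softmax. Further hidden layers are inserted later
# from hidden.config (see below).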
cat >$dir/nnet.config <<EOF
SpliceComponent input-dim=$tot_input_dim left-context=$splice_width right-context=$splice_width const-component-dim=$ivector_dim
FixedAffineComponent matrix=$lda_mat
AffineComponentPreconditionedOnline input-dim=$lda_dim output-dim=$pnorm_input_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
NormalizeComponent dim=$pnorm_output_dim
AffineComponentPreconditionedOnline input-dim=$pnorm_output_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF
# hidden.config will contain the part of the config corresponding to a
# single hidden layer; we need this to add new layers.
cat >$dir/hidden.config <<EOF
AffineComponentPreconditionedOnline input-dim=$pnorm_output_dim output-dim=$pnorm_input_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
NormalizeComponent dim=$pnorm_output_dim
EOF
$cmd $dir/log/nnet_init.log \
nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
$dir/0.mdl || exit 1;
fi
if [ $stage -le -1 ]; then
echo "Training transition probabilities and setting priors"
$cmd $dir/log/train_trans.log \
nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
|| exit 1;
fi
num_iters=$[$num_epochs * $iters_per_epoch];
echo "$0: Will train for $num_epochs epochs = $num_iters iterations"
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
# This is when we decide to mix up from: halfway between when we've finished
# adding the hidden layers and the end of training.
mix_up_iter=$[($num_iters + $finish_add_layers_iter)/2]
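# (e.g., illustrative numbers: with num_iters=200 and finish_add_layers_iter=6,
# mix_up_iter would be 103.)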
if [ $num_threads -eq 1 ]; then
parallel_suffix="-simple" # this enables us to use GPU code if
# we have just one thread.
parallel_train_opts=
if ! cuda-compiled; then
echo "$0: WARNING: you are running with one thread but you have not compiled"
echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
fi
else
parallel_suffix="-parallel"
parallel_train_opts="--num-threads=$num_threads"
fi
# First work out how many models we want to combine over in the final
# nnet-combine-fast invocation. This equals
# min(max(max_models_combine, iters_per_epoch),
# 2/3 * iters_after_mixup)
num_models_combine=$max_models_combine
if [ $num_models_combine -lt $iters_per_epoch ]; then
num_models_combine=$iters_per_epoch
fi
iters_after_mixup_23=$[(($num_iters-$mix_up_iter-1)*2)/3]
if [ $num_models_combine -gt $iters_after_mixup_23 ]; then
num_models_combine=$iters_after_mixup_23
fi
first_model_combine=$[$num_iters-$num_models_combine+1]
x=0
while [ $x -lt $num_iters ]; do
if [ $x -ge 0 ] && [ $stage -le $x ]; then
# Set off jobs doing some diagnostics, in the background.
$cmd $dir/log/compute_prob_valid.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
$cmd $dir/log/progress.$x.log \
nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
ark:$egs_dir/train_diagnostic.egs '&&' \
nnet-am-info $dir/$x.mdl &
fi
echo "Training neural net (pass $x)"
if [ $x -gt 0 ] && \
[ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
[ $[($x-1) % $add_layers_period] -eq 0 ]; then
mdl="nnet-init --srand=$x $dir/hidden.config - | nnet-insert $dir/$x.mdl - - |"
else
mdl=$dir/$x.mdl
fi
if [ $x -eq 0 ] || [ "$mdl" != "$dir/$x.mdl" ]; then
# on iteration zero or when we just added a layer, use a smaller minibatch
# size and just one job: the model-averaging doesn't seem to be helpful
# when the model is changing too fast (i.e. it worsens the objective
# function), and the smaller minibatch size will help to keep
# the update stable.
this_minibatch_size=$[$minibatch_size/2];
do_average=false
else
this_minibatch_size=$minibatch_size
do_average=true
fi
$cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
nnet-train$parallel_suffix $parallel_train_opts \
--minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
ark:- $dir/$[$x+1].JOB.mdl \
|| exit 1;
nnets_list=
for n in `seq 1 $num_jobs_nnet`; do
nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
done
learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters $initial_learning_rate $final_learning_rate`;
if $do_average; then
# average the output of the different jobs.
$cmd $dir/log/average.$x.log \
nnet-am-average $nnets_list - \| \
nnet-am-copy --learning-rate=$learning_rate - $dir/$[$x+1].mdl || exit 1;
else
# choose the best from the different jobs.
n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
$fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
$best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1;
[ -z "$n" ] && echo "Error getting best model" && exit 1;
$cmd $dir/log/select.$x.log \
nnet-am-copy --learning-rate=$learning_rate $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1;
fi
if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
# mix up.
echo Mixing up from $num_leaves to $mix_up components
$cmd $dir/log/mix_up.$x.log \
nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
$dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
fi
rm $nnets_list
[ ! -f $dir/$[$x+1].mdl ] && exit 1;
if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
[ $[($x-1)%100] -ne 0 ] && [ $[$x-1] -lt $first_model_combine ]; then
rm $dir/$[$x-1].mdl
fi
fi
x=$[$x+1]
done
if [ $stage -le $num_iters ]; then
echo "Doing final combination to produce final.mdl"
# Now do combination.
nnets_list=()
# the if..else..fi statement below sets 'nnets_list'.
if [ $max_models_combine -lt $num_models_combine ]; then
# The number of models to combine is too large, e.g. > 20. In this case,
# each argument to nnet-combine-fast will be an average of multiple models.
cur_offset=0 # current offset from first_model_combine.
for n in $(seq $max_models_combine); do
next_offset=$[($n*$num_models_combine)/$max_models_combine]
sub_list=""
for o in $(seq $cur_offset $[$next_offset-1]); do
iter=$[$first_model_combine+$o]
mdl=$dir/$iter.mdl
[ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
sub_list="$sub_list $mdl"
done
nnets_list[$[$n-1]]="nnet-am-average $sub_list - |"
cur_offset=$next_offset
done
else
nnets_list=
for n in $(seq 0 $[num_models_combine-1]); do
iter=$[$first_model_combine+$n]
mdl=$dir/$iter.mdl
[ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
nnets_list[$n]=$mdl
done
fi
# Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
# if there are many models it can give out-of-memory error; set num-threads to 8
# to speed it up (this isn't ideal...)
num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
[ $mb -gt 512 ] && mb=512
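# For example (illustrative numbers): with 20000 examples in combine.egs and
# combine_num_threads=8, mb = ceil(20000/8) = 2500, which the line above then
# caps at 512.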
# Setting --initial-model to a large value makes it initialize the combination
# with the average of all the models. It's important not to start with a
# single model, or, due to the invariance to scaling that these nonlinearities
# give us, we get zero diagonal entries in the fisher matrix that
# nnet-combine-fast uses for scaling, which after flooring and inversion, has
# the effect that the initial model chosen gets much higher learning rates
# than the others. This prevents the optimization from working well.
$cmd $combine_parallel_opts $dir/log/combine.log \
nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
--num-threads=$combine_num_threads \
--verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
$dir/final.mdl || exit 1;
# Normalize stddev for affine or block affine layers that are followed by a
# pnorm layer and then a normalize layer.
$cmd $dir/log/normalize.log \
nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1;
# Compute the probability of the final, combined model with
# the same subset we used for the previous compute_probs, as the
# different subsets will lead to different probs.
$cmd $dir/log/compute_prob_valid.final.log \
nnet-compute-prob $dir/final.mdl ark:$egs_dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.final.log \
nnet-compute-prob $dir/final.mdl ark:$egs_dir/train_diagnostic.egs &
fi
if [ $stage -le $[$num_iters+1] ]; then
echo "Getting average posterior for purposes of adjusting the priors."
# Note: this just uses CPUs, using a smallish subset of data.
rm $dir/post.*.vec 2>/dev/null
$cmd JOB=1:$num_jobs_nnet $dir/log/get_post.JOB.log \
nnet-subset-egs --n=$prior_subset_size ark:$egs_dir/egs.JOB.0.ark ark:- \| \
nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.JOB.vec || exit 1;
sleep 3; # make sure there is time for $dir/post.*.vec to appear.
$cmd $dir/log/vector_sum.log \
vector-sum $dir/post.*.vec $dir/post.vec || exit 1;
rm $dir/post.*.vec;
echo "Re-adjusting priors based on computed posteriors"
$cmd $dir/log/adjust_priors.log \
nnet-adjust-priors $dir/final.mdl $dir/post.vec $dir/final.mdl || exit 1;
fi
if [ ! -f $dir/final.mdl ]; then
echo "$0: $dir/final.mdl does not exist."
# we don't want to clean up if the training didn't succeed.
exit 1;
fi
sleep 2
echo Done
if $cleanup; then
echo Cleaning up data
if [ $egs_dir == "$dir/egs" ]; then
steps/nnet2/remove_egs.sh $dir/egs
fi
echo Removing most of the models
for x in `seq 0 $num_iters`; do
if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ]; then
# delete all but every 100th model; don't delete the ones which combine to form the final model.
rm $dir/$x.mdl
fi
done
fi


@ -0,0 +1,81 @@
#!/bin/bash
# Copyright 2013-2014 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
# This script is like utils/copy_data_dir.sh in that it copies a data-dir,
# but it supports the --utts-per-spk-max option. If nonzero, it modifies
# the utt2spk and spk2utt files by splitting each speaker into multiple
# versions, so that each speaker has no more than --utts-per-spk-max
# utterances.
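# For example (hypothetical data), with --utts-per-spk-max 2 a spk2utt line
#   spk1 utt1 utt2 utt3
# would be split into two "fake speakers":
#   spk1-000001 utt1 utt2
#   spk1-000002 utt3
# and utt2spk (plus cmvn.scp and spk2gender, if present) are remapped to the
# new speaker names.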
# begin configuration section
utts_per_spk_max=-1
# end configuration section
. utils/parse_options.sh
if [ $# != 2 ]; then
echo "Usage: "
echo " $0 [options] <srcdir> <destdir>"
echo "e.g.:"
echo " $0 --utts-per-spk-max 2 data/train data/train-max2"
echo "Options"
echo " --utts-per-spk-max=n # number of utterances per speaker maximum,"
echo " # default -1 (meaning no maximum). E.g. 2."
exit 1;
fi
export LC_ALL=C
srcdir=$1
destdir=$2
if [ ! -f $srcdir/utt2spk ]; then
echo "$0: no such file $srcdir/utt2spk"
exit 1;
fi
set -e;
set -o pipefail
mkdir -p $destdir
if [ "$utts_per_spk_max" != -1 ]; then
# create spk2utt file with reduced number of utterances per speaker.
awk -v max=$utts_per_spk_max '{ n=2; count=0;
while(n<=NF) {
int_max=int(max)+ (rand() < (max-int(max))?1:0);
nmax=n+int_max; count++; printf("%s-%06x", $1, count);
for (;n<nmax&&n<=NF; n++) printf(" %s", $n); print "";} }' \
<$srcdir/spk2utt >$destdir/spk2utt
utils/spk2utt_to_utt2spk.pl <$destdir/spk2utt >$destdir/utt2spk
if [ -f $srcdir/cmvn.scp ]; then
# below, the first apply_map command outputs a cmvn.scp indexed by utt;
# the second one outputs a cmvn.scp indexed by new speaker-id.
utils/apply_map.pl -f 2 $srcdir/cmvn.scp <$srcdir/utt2spk | \
utils/apply_map.pl -f 1 $destdir/utt2spk | sort | uniq > $destdir/cmvn.scp
echo "$0: mapping cmvn.scp, but you may want to recompute it if it's needed,"
echo " as it would probably change."
fi
if [ -f $srcdir/spk2gender ]; then
utils/apply_map.pl -f 2 $srcdir/spk2gender <$srcdir/utt2spk | \
utils/apply_map.pl -f 1 $destdir/utt2spk | sort | uniq >$destdir/spk2gender
fi
else
cp $srcdir/spk2utt $srcdir/utt2spk $destdir/
[ -f $srcdir/spk2gender ] && cp $srcdir/spk2gender $destdir/
[ -f $srcdir/cmvn.scp ] && cp $srcdir/cmvn.scp $destdir/
fi
for f in feats.scp segments wav.scp reco2file_and_channel text stm glm ctm; do
[ -f $srcdir/$f ] && cp $srcdir/$f $destdir/
done
echo "$0: copied data from $srcdir to $destdir, with --utts-per-spk-max $utts_per_spk_max"
utils/validate_data_dir.sh $destdir


@ -98,6 +98,9 @@ fi
if [ $stage -le 2 ]; then
echo "$0: dumping neural net activations"
# The next line is a no-op unless $dir/feats/storage/ exists; see utils/create_split_dir.pl.
for j in $(seq $nj); do utils/create_data_link.pl $dir/feats/feats.$j.ark; done
if [ -f $data/segments ]; then
wav_rspecifier="ark,s,cs:extract-segments scp,p:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |"
else


@ -14,12 +14,8 @@
# for online decoding.
# Rather than treating each utterance separately, it carries forward
# information from one utterance to the next, within the speaker. However,
# take note of the option "utts-per-spk-max", which splits speakers up into
# "fake speakers" with at most two utterances in them. This means that more
# iVectors are estimated starting from an uninformative starting point, than
# if we used the real speaker labels (which may have many utterances each);
# it's a compromise between per-utterance and per-speaker iVector estimation.
# information from one utterance to the next, within the speaker.
# Begin configuration section.
nj=30
@ -36,13 +32,9 @@ posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for
# used when training the iVector extractor, but more important
# that this match the value used when you do real online decoding
# with the neural nets trained with these iVectors.
utts_per_spk_max=-1 # Maximum utterances per "fake-speaker." With the default
# of -1 no fake-speakers are used. Note: this does not have to
# be an integer; if it's noninteger, it will be rounded in a
# randomized way to one of the two integers it's close to.
# This is useful in the "perturbed-feature" recipe to encourage
# that different perturbed versions of the same speaker get
# split into fake-speakers differently.
#utts_per_spk_max=-1 # This option is no longer supported, you should use
# steps/online/nnet2/copy_data_dir.sh with the --utts-per-spk-max
# option to make a copy of the data dir.
compress=true # If true, compress the iVectors stored on disk (it's lossy
# compression, as used for feature matrices).
@ -112,7 +104,6 @@ echo "--posterior-scale=$posterior_scale" >>$ieconf
echo "--max-remembered-frames=1000" >>$ieconf # the default
ns=$(wc -l <$data/spk2utt)
if [ "$ns" == 1 -a "$utts_per_spk_max" != 1 -a "$utts_per_spk_max" != -1 ]; then
echo "$0: you seem to have just one speaker in your database. This is probably not a good idea."
@ -121,29 +112,10 @@ if [ "$ns" == 1 -a "$utts_per_spk_max" != 1 -a "$utts_per_spk_max" != -1 ]; then
utts_per_spk_max=1
fi
spk2utt=""
if [ "$utts_per_spk_max" != -1 ]; then
mkdir -p $dir/spk2utt_fake
for job in $(seq $nj); do
# create fake spk2utt files with reduced number of utterances per speaker,
# so the network is well adapted to using iVectors from small amounts of
# training data.
# the if (rand() % 2 == 0)
awk -v max=$utts_per_spk_max '{ n=2; count=0;
while(n<=NF) {
int_max=int(max)+ (rand() < (max-int(max))?1:0);
nmax=n+int_max; count++; printf("%s-%06x", $1, count);
for (;n<nmax&&n<=NF; n++) printf(" %s", $n); print "";} }' \
<$sdata/$job/spk2utt >$dir/spk2utt_fake/spk2utt.$job
done
spk2utt="ark:$dir/spk2utt_fake/spk2utt.JOB"
else
spk2utt="ark:$sdata/JOB/spk2utt"
fi
for n in $(seq $nj); do
# This will do nothing unless the directorys $dir/storage exists;
# This will do nothing unless the directory $dir/storage exists;
# it can be used to distribute the data among multiple machines.
utils/create_data_link.pl $dir/ivector_online.$n.ark
done
@ -151,7 +123,7 @@ done
if [ $stage -le 0 ]; then
echo "$0: extracting iVectors"
$cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
ivector-extract-online2 --config=$ieconf "$spk2utt" scp:$sdata/JOB/feats.scp ark:- \| \
ivector-extract-online2 --config=$ieconf ark:$sdata/JOB/spk2utt scp:$sdata/JOB/feats.scp ark:- \| \
copy-feats --compress=$compress ark:- \
ark,scp:$dir/ivector_online.JOB.ark,$dir/ivector_online.JOB.scp || exit 1;
fi


@ -0,0 +1,285 @@
#!/bin/bash
# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# This is modified from ../../nnet2/get_egs.sh.
# This script combines the
# nnet-example extraction with the feature extraction directly from wave files;
# it uses the program online2-wav-dump-features to do all parts of feature
# extraction: MFCC/PLP/fbank, possibly plus pitch, plus iVectors. This script
# is intended mostly for cross-system training for online decoding, where you
# initialize the nnet from an existing, larger system.
# Begin configuration section.
cmd=run.pl
num_utts_subset=300 # number of utterances in validation and training
# subsets used for shrinkage and diagnostics
num_valid_frames_combine=0 # #valid frames for combination weights at the very end.
num_train_frames_combine=10000 # # train frames for the above.
num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs
samples_per_iter=400000 # each iteration of training, see this many samples
# per job. This is just a guideline; it will pick a number
# that divides the number of samples in the entire data.
transform_dir= # If supplied, overrides alidir
num_jobs_nnet=16 # Number of neural net jobs to run in parallel
stage=0
io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time.
random_copy=false
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "Usage: steps/online/nnet2/get_egs.sh [opts] <data> <ali-dir> <online-nnet-dir> <exp-dir>"
echo " e.g.: steps/online/nnet2/get_egs.sh data/train exp/tri3_ali exp/nnet2_online/nnet_a_gpu_online/ exp/tri4_nnet"
echo "In <online-nnet-dir>, it looks for final.mdl (need to compute required left and right context),"
echo "and a configuration file conf/online_nnet2_decoding.conf which describes the features."
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config file containing options"
echo " --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
echo " --num-jobs-nnet <num-jobs;16> # Number of parallel jobs to use for main neural net"
echo " # training (will affect results as well as speed; try 8, 16)"
echo " # Note: if you increase this, you may want to also increase"
echo " # the learning rate."
echo " --samples-per-iter <#samples;400000> # Number of samples of data to process per iteration, per"
echo " # process."
echo " --feat-type <lda|raw> # (by default it tries to guess). The feature type you want"
echo " # to use as input to the neural net."
echo " --splice-width <width;4> # Number of frames on each side to append for feature input"
echo " # (note: we splice processed, typically 40-dimensional frames"
echo " --num-frames-diagnostic <#frames;4000> # Number of frames used in computing (train,valid) diagnostics"
echo " --num-valid-frames-combine <#frames;10000> # Number of frames used in getting combination weights at the"
echo " # very end."
echo " --stage <stage|0> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
exit 1;
fi
data=$1
alidir=$2
online_nnet_dir=$3
dir=$4
mdl=$online_nnet_dir/final.mdl # only needed for left and right context.
feature_conf=$online_nnet_dir/conf/online_nnet2_decoding.conf
for f in $data/feats.scp $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $feature_conf $mdl; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
sdata=$data/split$nj
utils/split_data.sh $data $nj
mkdir -p $dir/log
cp $alidir/tree $dir
grep -v '^--endpoint' $feature_conf >$dir/feature.conf || exit 1;
# Get list of validation utterances.
mkdir -p $dir/valid $dir/train_subset
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \
> $dir/valid/uttlist || exit 1;
if [ -f $data/utt2uniq ]; then
echo "File $data/utt2uniq exists, so augmenting valid/uttlist to"
echo "include all perturbed versions of the same 'real' utterances."
mv $dir/valid/uttlist $dir/valid/uttlist.tmp
utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
cat $dir/valid/uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \
sort | uniq | utils/apply_map.pl $dir/uniq2utt | \
awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid/uttlist
rm $dir/uniq2utt $dir/valid/uttlist.tmp
fi
awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid/uttlist | \
head -$num_utts_subset > $dir/train_subset/uttlist || exit 1;
for subdir in valid train_subset; do
# In order for the iVector extraction to work right, we need to process all
# utterances of the speakers which have utterances in valid/uttlist, and the
# same for train_subset/uttlist. We produce $dir/valid/uttlist_extended which
# will contain all utterances of all speakers which have utterances in
# $dir/valid/uttlist, and the same for $dir/train_subset/.
utils/filter_scp.pl $dir/$subdir/uttlist <$data/utt2spk | awk '{print $2}' > $dir/$subdir/spklist || exit 1;
utils/filter_scp.pl -f 2 $dir/$subdir/spklist <$data/utt2spk >$dir/$subdir/utt2spk || exit 1;
utils/utt2spk_to_spk2utt.pl <$dir/$subdir/utt2spk >$dir/$subdir/spk2utt || exit 1;
awk '{print $1}' <$dir/$subdir/utt2spk >$dir/$subdir/uttlist_extended || exit 1;
rm $dir/$subdir/spklist
done
if [ -f $data/segments ]; then
# note: in the feature extraction, because the program online2-wav-dump-features is sensitive to the
# previous utterances within a speaker, we do the filtering after extracting the features.
echo "$0 [info]: segments file exists: using that."
feats="ark,s,cs:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt ark,s,cs:- ark:- | subset-feats --exclude=$dir/valid/uttlist ark:- ark:- |"
valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid/uttlist_extended $data/segments | extract-segments scp:$data/wav.scp - ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/valid/spk2utt ark,s,cs:- ark:- | subset-feats --include=$dir/valid/uttlist ark:- ark:- |"
train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset/uttlist_extended $data/segments | extract-segments scp:$data/wav.scp - ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/train_subset/spk2utt ark,s,cs:- ark:- | subset-feats --include=$dir/train_subset/uttlist ark:- ark:- |"
else
echo "$0 [info]: no segments file exists, using wav.scp."
feats="ark,s,cs:online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt scp:$sdata/JOB/wav.scp ark:- | subset-feats --exclude=$dir/valid/uttlist ark:- ark:- |"
valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid/uttlist_extended $data/wav.scp | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/valid/spk2utt scp:- ark:- | subset-feats --include=$dir/valid/uttlist ark:- ark:- |"
train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset/uttlist_extended $data/wav.scp | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/train_subset/spk2utt scp:- ark:- | subset-feats --include=$dir/train_subset/uttlist ark:- ark:- |"
fi
ivector_dim=$(online2-wav-dump-features --config=$dir/feature.conf --print-ivector-dim=true) || exit 1;
! [ $ivector_dim -ge 0 ] && echo "$0: error getting iVector dim" && exit 1;
if [ $stage -le 0 ]; then
echo "$0: working out number of frames of training data"
num_frames=$(steps/nnet2/get_num_frames.sh $data)
echo $num_frames > $dir/num_frames
else
num_frames=`cat $dir/num_frames` || exit 1;
fi
# Working out number of iterations per epoch.
iters_per_epoch=`perl -e "print int($num_frames/($samples_per_iter * $num_jobs_nnet) + 0.5);"` || exit 1;
[ $iters_per_epoch -eq 0 ] && iters_per_epoch=1
samples_per_iter_real=$[$num_frames/($num_jobs_nnet*$iters_per_epoch)]
echo "$0: Every epoch, splitting the data up into $iters_per_epoch iterations,"
echo "$0: giving samples-per-iteration of $samples_per_iter_real (you requested $samples_per_iter)."
# Making soft links to storage directories. This is a no-op unless
# the subdirectory $dir/egs/storage/ exists. See utils/create_split_dir.pl
for x in `seq 1 $num_jobs_nnet`; do
for y in `seq 0 $[$iters_per_epoch-1]`; do
utils/create_data_link.pl $dir/egs/egs.$x.$y.ark
utils/create_data_link.pl $dir/egs/egs_tmp.$x.$y.ark
done
for y in `seq 1 $nj`; do
utils/create_data_link.pl $dir/egs/egs_orig.$x.$y.ark
done
done
remove () { for x in $*; do [ -L $x ] && rm $(readlink -f $x); rm $x; done }
set -o pipefail
left_context=$(nnet-am-info $mdl | grep '^left-context' | awk '{print $2}') || exit 1;
right_context=$(nnet-am-info $mdl | grep '^right-context' | awk '{print $2}') || exit 1;
nnet_context_opts="--left-context=$left_context --right-context=$right_context"
set +o pipefail
mkdir -p $dir/egs
if [ $stage -le 2 ]; then
rm $dir/.error 2>/dev/null
echo "$0: extracting validation and training-subset alignments."
set -o pipefail;
for id in $(seq $nj); do gunzip -c $alidir/ali.$id.gz; done | \
copy-int-vector ark:- ark,t:- | \
utils/filter_scp.pl <(cat $dir/valid/uttlist $dir/train_subset/uttlist) | \
gzip -c >$dir/ali_special.gz || exit 1;
set +o pipefail; # unset the pipefail option.
echo "Getting validation and training subset examples."
$cmd $dir/log/create_valid_subset.log \
nnet-get-egs $ivectors_opt $nnet_context_opts "$valid_feats" \
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark:$dir/egs/valid_all.egs" || touch $dir/.error &
$cmd $dir/log/create_train_subset.log \
nnet-get-egs $ivectors_opt $nnet_context_opts "$train_subset_feats" \
"ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
"ark:$dir/egs/train_subset_all.egs" || touch $dir/.error &
wait;
[ -f $dir/.error ] && exit 1;
echo "Getting subsets of validation examples for diagnostics and combination."
$cmd $dir/log/create_valid_subset_combine.log \
nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/egs/valid_all.egs \
ark:$dir/egs/valid_combine.egs || touch $dir/.error &
$cmd $dir/log/create_valid_subset_diagnostic.log \
nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/egs/valid_all.egs \
ark:$dir/egs/valid_diagnostic.egs || touch $dir/.error &
$cmd $dir/log/create_train_subset_combine.log \
nnet-subset-egs --n=$num_train_frames_combine ark:$dir/egs/train_subset_all.egs \
ark:$dir/egs/train_combine.egs || touch $dir/.error &
$cmd $dir/log/create_train_subset_diagnostic.log \
nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/egs/train_subset_all.egs \
ark:$dir/egs/train_diagnostic.egs || touch $dir/.error &
wait
[ -f $dir/.error ] && echo "Error detected while creating egs" && exit 1;
cat $dir/egs/valid_combine.egs $dir/egs/train_combine.egs > $dir/egs/combine.egs
for f in $dir/egs/{combine,train_diagnostic,valid_diagnostic}.egs; do
[ ! -s $f ] && echo "No examples in file $f" && exit 1;
done
rm $dir/egs/valid_all.egs $dir/egs/train_subset_all.egs $dir/egs/{train,valid}_combine.egs $dir/ali_special.gz
fi
if [ $stage -le 3 ]; then
# Other scripts might need to know the following info:
echo $num_jobs_nnet >$dir/egs/num_jobs_nnet
echo $iters_per_epoch >$dir/egs/iters_per_epoch
echo $samples_per_iter_real >$dir/egs/samples_per_iter
echo "Creating training examples";
# in $dir/egs, create $num_jobs_nnet separate files with training examples.
# The order is not randomized at this point.
egs_list=
for n in `seq 1 $num_jobs_nnet`; do
egs_list="$egs_list ark:$dir/egs/egs_orig.$n.JOB.ark"
done
echo "Generating training examples on disk"
# The examples will go round-robin to egs_list.
$cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \
nnet-get-egs $ivectors_opt $nnet_context_opts "$feats" \
"ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \
nnet-copy-egs ark:- $egs_list || exit 1;
fi
if [ $stage -le 4 ]; then
echo "$0: rearranging examples into parts for different parallel jobs"
# combine all the "egs_orig.JOB.*.scp" (over the $nj splits of the data) and
# then split into multiple parts egs.JOB.*.scp for different parts of the
# data, 0 .. $iters_per_epoch-1.
if [ $iters_per_epoch -eq 1 ]; then
echo "$0: Since iters-per-epoch == 1, just concatenating the data."
for n in `seq 1 $num_jobs_nnet`; do
cat $dir/egs/egs_orig.$n.*.ark > $dir/egs/egs_tmp.$n.0.ark || exit 1;
remove $dir/egs/egs_orig.$n.*.ark
done
else # We'll have to split it up using nnet-copy-egs.
egs_list=
for n in `seq 0 $[$iters_per_epoch-1]`; do
egs_list="$egs_list ark:$dir/egs/egs_tmp.JOB.$n.ark"
done
# note, the "|| true" below is a workaround for NFS bugs
# we encountered running this script with Debian-7, NFS-v4.
$cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/split_egs.JOB.log \
nnet-copy-egs --random=$random_copy --srand=JOB \
"ark:cat $dir/egs/egs_orig.JOB.*.ark|" $egs_list || exit 1;
remove $dir/egs/egs_orig.*.*.ark 2>/dev/null
fi
fi
if [ $stage -le 5 ]; then
# Next, shuffle the order of the examples in each of those files.
# Each one should not be too large, so we can do this in memory.
echo "Shuffling the order of training examples"
echo "(in order to avoid stressing the disk, these won't all run at once)."
for n in `seq 0 $[$iters_per_epoch-1]`; do
$cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$n.JOB.log \
nnet-shuffle-egs "--srand=\$[JOB+($num_jobs_nnet*$n)]" \
ark:$dir/egs/egs_tmp.JOB.$n.ark ark:$dir/egs/egs.JOB.$n.ark
remove $dir/egs/egs_tmp.*.$n.ark
done
fi
echo "$0: Finished preparing training examples"


@ -79,6 +79,12 @@ cp $srcdir/final.mdl $dir/ || exit 1;
if [ ! -z "$iedir" ]; then
mkdir -p $dir/ivector_extractor/
cp $iedir/final.{mat,ie,dubm} $iedir/global_cmvn.stats $dir/ivector_extractor/ || exit 1;
# The following things won't be needed directly by the online decoding, but
# will allow us to run prepare_online_decoding.sh again with
# $dir/ivector_extractor/ as the input directory (useful in certain
# cross-system training scenarios).
cp $iedir/splice_opts $iedir/online_cmvn.conf $dir/ivector_extractor/ || exit 1;
fi


@ -46,6 +46,7 @@ creates a link such as
Usage: utils/create_data_link.pl <data-archive>
e.g.: utils/create_data_link.pl foo/bar/egs.3.4.ark
See also utils/remove_data_links.sh
EOU
GetOptions();


@ -30,7 +30,7 @@ Allowed options:
--suffix : Common suffix to <actual_storage_dirs> (string, default = "")
See also create_data_link.pl, which is intended to work with the resulting
directory structure.
directory structure, and remove_data_links.sh
EOU
my $suffix="";


@ -19,12 +19,12 @@
# This script takes a list of utterance-ids or any file whose first field
# of each line is an utterance-id, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is zero, by default, but can be changed by using \
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is 1, by default, but can be changed by using
# the -f <n> switch
$exclude = 0;
$field = 0;
$field = 1;
$shifted = 0;
do {
@ -42,7 +42,13 @@ do {
} while ($shifted);
if(@ARGV < 1 || @ARGV > 2) {
die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp ";
die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
"Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
"Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
"only the lines that were *not* in id_list.\n" .
"Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
"If your older scripts (written before Oct 2014) stopped working and you used the\n" .
"-f option, add 1 to the argument.\n";
}
@ -54,12 +60,27 @@ while(<F>) {
$seen{$A[0]} = 1;
}
while(<>) {
@A = split;
@A > 0 || die "Invalid scp file line $_";
@A >= $field || die "Invalid scp file line $_";
if((!$exclude && $seen{$A[$field]}) || ($exclude && !defined $seen{$A[$field]})) {
print $_;
if ($field == 1) { # Treat this as special case, since it is common.
while(<>) {
$_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
# $1 is what we filter on.
if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
print $_;
}
}
} else {
while(<>) {
@A = split;
@A > 0 || die "Invalid scp file line $_";
@A >= $field || die "Invalid scp file line $_";
if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
print $_;
}
}
}
# tests:
# the following should print "foo 1"
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
# the following should print "bar 2".
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)


@ -0,0 +1,53 @@
#!/bin/bash
# This program searches within a directory for soft links that
# appear to be created by 'create_data_link.pl' to a 'storage/' subdirectory,
# and it removes both the soft links and the things they point to.
# for instance, if you have a soft link
# foo/egs/1.1.egs -> storage/2/1.1.egs
# it will remove both foo/egs/storage/2/1.1.egs, and foo/egs/1.1.egs.
ret=0
dry_run=false
if [ "$1" == "--dry-run" ]; then
dry_run=true
shift
fi
if [ $# == 0 ]; then
echo "Usage: $0 [--dry-run] <list-of-directories>"
echo "e.g.: $0 exp/nnet4a/egs/"
echo " Removes from any subdirectories of the command-line arguments, soft links that "
echo " appear to have been created by utils/create_data_link.pl, as well as the things"
echo " that those soft links point to. Will typically be called on a directory prior"
echo " to 'rm -r' on that directory, to ensure that data that was distributed on other"
echo " volumes also gets deleted."
echo " With --dry-run, just prints what it would do."
exit 1;
fi
for dir in $*; do
if [ ! -d $dir ]; then
echo "$0: not a directory: $dir"
ret=1
else
for subdir in $(find $dir -type d); do
if [ -d $subdir/storage ]; then
for x in $(ls $subdir); do
f=$subdir/$x
if [ -L $f ] && [[ $(readlink $f) == storage/* ]]; then
target=$subdir/$(readlink $f)
if $dry_run; then
echo rm $f $target
else
rm $f $target
fi
fi
done
fi
done
fi
done
exit $ret


@ -39,7 +39,7 @@ int main(int argc, char *argv[]) {
const char *usage =
"Extract segments from a large audio file in WAV format.\n"
"Usage: extract-segments [options] <wav-rspecifier> <segments-file> <wav-wspecifier>\n"
"e.g. extract-segments wav.scp segments ark:- | <some other program>\n"
"e.g. extract-segments scp:wav.scp segments ark:- | <some other program>\n"
" segments-file format: segment_id wav_file_name start_time end_time [channel]\n"
" e.g.: spkabc_seg1 spkabc_recording1 1.10 2.36 1\n"
" If channel is not provided as last element, expects mono.\n"


@ -235,6 +235,7 @@ class LmExampleDeterministicOnDemandFst: public DeterministicOnDemandFst<Arc> {
Label bos_symbol,
Label eos_symbol);
virtual StateId Start() { return start_state_; }
/// We don't bother caching the final-probs, just the arcs.


@ -141,6 +141,7 @@ struct OnlineNnet2FeaturePipelineInfo {
bool use_ivectors;
OnlineIvectorExtractionInfo ivector_extractor_info;
int32 IvectorDim() { return ivector_extractor_info.extractor.IvectorDim(); }
private:
KALDI_DISALLOW_COPY_AND_ASSIGN(OnlineNnet2FeaturePipelineInfo);
};


@ -42,7 +42,8 @@ int main(int argc, char *argv[]) {
"Usage: online2-wav-dump-features [options] <spk2utt-rspecifier> <wav-rspecifier> <feature-wspecifier>\n"
"The spk2utt-rspecifier can just be <utterance-id> <utterance-id> if\n"
"you want to generate features utterance by utterance.\n"
"See steps/online/nnet2/dump_nnet_activations.sh for an example.\n";
"Alternate usage: online2-wav-dump-features [options] --print-ivector-dim=true\n"
"See steps/online/nnet2/{dump_nnet_activations,get_egs.sh} for examples.\n";
ParseOptions po(usage);
@ -50,24 +51,34 @@ int main(int argc, char *argv[]) {
// as well as the basic features.
OnlineNnet2FeaturePipelineConfig feature_config;
BaseFloat chunk_length_secs = 0.05;
bool print_ivector_dim = false;
po.Register("chunk-length", &chunk_length_secs,
"Length of chunk size in seconds, that we process.");
po.Register("print-ivector-dim", &print_ivector_dim,
"If true, print iVector dimension (possibly zero) and exit. This "
"version requires no arguments.");
feature_config.Register(&po);
po.Read(argc, argv);
if (po.NumArgs() != 3) {
if (!print_ivector_dim && po.NumArgs() != 3) {
po.PrintUsage();
return 1;
}
OnlineNnet2FeaturePipelineInfo feature_info(feature_config);
if (print_ivector_dim) {
std::cout << feature_info.IvectorDim() << std::endl;
exit(0);
}
std::string spk2utt_rspecifier = po.GetArg(1),
wav_rspecifier = po.GetArg(2),
feats_wspecifier = po.GetArg(3);
OnlineNnet2FeaturePipelineInfo feature_info(feature_config);
int32 num_done = 0, num_err = 0;
int64 num_frames_tot = 0;


@ -1,7 +1,7 @@
// online2bin/online2-wav-nnet2-am-compute.cc
// Copyright 2014 Johns Hopkins University (author: Daniel Povey)
// David Snyder
// 2014 David Snyder
// See ../../COPYING for clarification regarding multiple authors
//