trunk: updates to librispeech recipe, with better 'multi-splice' version of pnorm online-nnet2 recipe.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4765 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Dan Povey 2015-01-10 22:02:55 +00:00
Parent 2bbbea9efe
Commit b7c7005024
8 changed files with 667 additions and 84 deletions

View File

@@ -140,6 +140,8 @@
### online-nnet2 results with a model trained on all(960h) of the training data
### Note: these results are now superseded by the multi-splice (_ms_) results below.
### Be careful when comparing, as the _ms_ results don't yet have the _fglarge tests.
%WER 4.90 [ 2663 / 54402, 388 ins, 273 del, 2002 sub ] exp/nnet2_online/nnet_a_online/decode_dev_clean_fglarge/wer_13
%WER 5.19 [ 2822 / 54402, 406 ins, 311 del, 2105 sub ] exp/nnet2_online/nnet_a_online/decode_dev_clean_tglarge/wer_13
%WER 6.60 [ 3593 / 54402, 457 ins, 426 del, 2710 sub ] exp/nnet2_online/nnet_a_online/decode_dev_clean_tgmed/wer_11
@@ -192,3 +194,65 @@
%WER 24.18 [ 12317 / 50948, 1284 ins, 1732 del, 9301 sub ] exp/tri6b/decode_tgsmall_dev_other_rnnlm_h150_me5-1000_L0.5/wer_15
%WER 24.19 [ 12323 / 50948, 1327 ins, 1686 del, 9310 sub ] exp/tri6b/decode_tgsmall_dev_other_rnnlm_h150_me5-1000_L0.75/wer_15
## Multi-splice version of online recipe.
# for x in exp/nnet2_online/nnet_ms_i2/decode_*; do grep WER $x/wer_* | utils/best_wer.sh ; done
%WER 4.72 [ 2568 / 54402, 390 ins, 258 del, 1920 sub ] exp/nnet2_online/nnet_ms_i2/decode_dev_clean_tglarge/wer_12
%WER 5.90 [ 3212 / 54402, 345 ins, 441 del, 2426 sub ] exp/nnet2_online/nnet_ms_i2/decode_dev_clean_tgmed/wer_14
%WER 6.64 [ 3612 / 54402, 401 ins, 479 del, 2732 sub ] exp/nnet2_online/nnet_ms_i2/decode_dev_clean_tgsmall/wer_12
%WER 13.11 [ 6680 / 50948, 797 ins, 866 del, 5017 sub ] exp/nnet2_online/nnet_ms_i2/decode_dev_other_tglarge/wer_15
%WER 15.56 [ 7925 / 50948, 727 ins, 1261 del, 5937 sub ] exp/nnet2_online/nnet_ms_i2/decode_dev_other_tgmed/wer_15
%WER 17.10 [ 8714 / 50948, 733 ins, 1510 del, 6471 sub ] exp/nnet2_online/nnet_ms_i2/decode_dev_other_tgsmall/wer_15
# for x in exp/nnet2_online/nnet_ms_a_online/decode_*; do grep WER $x/wer_* | utils/best_wer.sh ; done
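# Key to the decode-directory suffixes (added for clarity; based on the stages in
# local/online/run_nnet2_ms.sh): no suffix = online decoding that carries iVector
# state forward across a speaker's utterances; _utt = each utterance decoded
# separately (--per-utt true); _utt_offline = per-utterance, but with the iVector
# estimated from the whole utterance (--online false).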
%WER 4.83 [ 2629 / 54402, 393 ins, 264 del, 1972 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tglarge/wer_13
%WER 5.01 [ 2726 / 54402, 402 ins, 270 del, 2054 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tglarge_utt/wer_13
%WER 4.87 [ 2647 / 54402, 386 ins, 290 del, 1971 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tglarge_utt_offline/wer_14
%WER 6.05 [ 3294 / 54402, 409 ins, 392 del, 2493 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgmed/wer_12
%WER 6.30 [ 3428 / 54402, 389 ins, 434 del, 2605 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgmed_utt/wer_13
%WER 6.09 [ 3311 / 54402, 393 ins, 417 del, 2501 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgmed_utt_offline/wer_13
%WER 6.87 [ 3740 / 54402, 390 ins, 547 del, 2803 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgsmall/wer_13
%WER 7.21 [ 3921 / 54402, 440 ins, 535 del, 2946 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgsmall_utt/wer_12
%WER 6.95 [ 3783 / 54402, 415 ins, 543 del, 2825 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgsmall_utt_offline/wer_13
%WER 13.21 [ 6732 / 50948, 812 ins, 852 del, 5068 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tglarge/wer_14
%WER 14.24 [ 7254 / 50948, 884 ins, 959 del, 5411 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tglarge_utt/wer_15
%WER 13.63 [ 6945 / 50948, 890 ins, 856 del, 5199 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tglarge_utt_offline/wer_14
%WER 15.69 [ 7996 / 50948, 800 ins, 1189 del, 6007 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgmed/wer_14
%WER 16.63 [ 8473 / 50948, 809 ins, 1317 del, 6347 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgmed_utt/wer_15
%WER 16.09 [ 8197 / 50948, 872 ins, 1130 del, 6195 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgmed_utt_offline/wer_13
%WER 17.15 [ 8736 / 50948, 756 ins, 1424 del, 6556 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall/wer_14
%WER 18.23 [ 9288 / 50948, 782 ins, 1585 del, 6921 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall_utt/wer_15
%WER 17.54 [ 8936 / 50948, 813 ins, 1425 del, 6698 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall_utt_offline/wer_14
## Note: this learning rate is the effective learning rate; it gets multiplied by the num-jobs.
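# (Worked example of that note, added for clarity: with --num-jobs-nnet 6 as in
# local/online/run_nnet2_ms_disc.sh, the effective rate 0.000005 used below
# corresponds to a per-job learning rate of about 6 * 0.000005 = 0.00003.)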
# for x in exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch*{clean,other}*; do grep WER $x/wer_* | utils/best_wer.sh ; done
%WER 5.92 [ 3221 / 54402, 352 ins, 439 del, 2430 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_clean_tgmed/wer_14
%WER 6.63 [ 3605 / 54402, 399 ins, 481 del, 2725 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_clean_tgsmall/wer_12
%WER 4.44 [ 2416 / 54402, 385 ins, 204 del, 1827 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch1_dev_clean_tglarge/wer_14
%WER 5.52 [ 3001 / 54402, 360 ins, 340 del, 2301 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch1_dev_clean_tgmed/wer_15
%WER 6.22 [ 3384 / 54402, 388 ins, 411 del, 2585 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch1_dev_clean_tgsmall/wer_14
%WER 4.39 [ 2386 / 54402, 368 ins, 208 del, 1810 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch2_dev_clean_tglarge/wer_15 **
%WER 5.41 [ 2945 / 54402, 338 ins, 339 del, 2268 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch2_dev_clean_tgmed/wer_16
%WER 6.13 [ 3333 / 54402, 371 ins, 410 del, 2552 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch2_dev_clean_tgsmall/wer_15
%WER 4.39 [ 2387 / 54402, 377 ins, 199 del, 1811 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tglarge/wer_14
%WER 5.36 [ 2918 / 54402, 328 ins, 338 del, 2252 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tgmed/wer_17
%WER 6.08 [ 3305 / 54402, 369 ins, 396 del, 2540 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tgsmall/wer_15
%WER 4.40 [ 2395 / 54402, 375 ins, 200 del, 1820 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tglarge/wer_14
%WER 5.35 [ 2909 / 54402, 328 ins, 339 del, 2242 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tgmed/wer_17
%WER 6.05 [ 3291 / 54402, 384 ins, 381 del, 2526 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tgsmall/wer_14
%WER 13.45 [ 6850 / 50948, 808 ins, 876 del, 5166 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_other_tglarge/wer_15
%WER 15.65 [ 7975 / 50948, 714 ins, 1311 del, 5950 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_other_tgmed/wer_16
%WER 17.12 [ 8722 / 50948, 739 ins, 1489 del, 6494 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_other_tgsmall/wer_15
%WER 12.84 [ 6544 / 50948, 877 ins, 703 del, 4964 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch1_dev_other_tglarge/wer_16
%WER 14.87 [ 7578 / 50948, 742 ins, 1102 del, 5734 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch1_dev_other_tgmed/wer_18
%WER 16.25 [ 8277 / 50948, 823 ins, 1171 del, 6283 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch1_dev_other_tgsmall/wer_15
%WER 12.80 [ 6522 / 50948, 869 ins, 698 del, 4955 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch2_dev_other_tglarge/wer_17 **
%WER 14.80 [ 7542 / 50948, 774 ins, 1034 del, 5734 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch2_dev_other_tgmed/wer_17
%WER 16.14 [ 8225 / 50948, 763 ins, 1242 del, 6220 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch2_dev_other_tgsmall/wer_17
%WER 12.82 [ 6531 / 50948, 871 ins, 710 del, 4950 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_other_tglarge/wer_18
%WER 14.82 [ 7549 / 50948, 818 ins, 958 del, 5773 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_other_tgmed/wer_16
%WER 16.10 [ 8204 / 50948, 795 ins, 1165 del, 6244 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_other_tgsmall/wer_16
%WER 12.85 [ 6549 / 50948, 902 ins, 672 del, 4975 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_other_tglarge/wer_17
%WER 14.80 [ 7540 / 50948, 800 ins, 1025 del, 5715 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_other_tgmed/wer_18
%WER 16.10 [ 8201 / 50948, 789 ins, 1240 del, 6172 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_other_tgsmall/wer_18

View File

@@ -0,0 +1,13 @@
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0 -q all.q
option gpu=* -l gpu=$0 -q g.q
default allow_k20=true
option allow_k20=true
option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*'
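# Usage sketch (ours, not part of the original file; "exp/foo" and "some-kaldi-program"
# are placeholders): queue.pl maps generic switches onto the qsub flags defined above, e.g.
#   queue.pl --config conf/queue_no_k20.conf --gpu 1 --allow-k20 false \
#     exp/foo/log/train.1.log some-kaldi-program ...
# adds roughly "-l gpu=1 -q g.q -l 'hostname=!g01*&!g02*&!b06*'" to the base
# "qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*" command line.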

View File

@@ -3,6 +3,8 @@
# example script for online-nnet2 system training and decoding,
# based on the one for fisher-English.
# note: run_nnet2_ms.sh gives better results, but it's slower to train.
. cmd.sh
@@ -37,85 +39,7 @@ else
parallel_opts="-pe smp $num_threads"
fi
if [ $stage -le 1 ]; then
# Create high-resolution MFCC features (with 40 cepstra instead of 13).
# this shows how you can split across multiple file-systems. we'll split the
# MFCC dir across multiple locations. You might want to be careful here, if you
# have multiple copies of Kaldi checked out and run the same recipe, not to let
# them overwrite each other.
mfccdir=mfcc
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
fi
for datadir in train_960 dev_clean dev_other; do
utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
done
# now create some data subsets.
# mixed is the clean+other data.
# 30k is 1/10 of the data (around 100 hours), 60k is 1/5th of it (around 200 hours).
utils/subset_data_dir.sh data/train_960_hires 30000 data/train_mixed_hires_30k
utils/subset_data_dir.sh data/train_960_hires 60000 data/train_mixed_hires_60k
fi
if [ $stage -le 2 ]; then
# We need to build a small system just because we need the LDA+MLLT transform
# to train the diag-UBM on top of. We align a subset of training data for
# this purpose.
utils/subset_data_dir.sh --utt-list <(awk '{print $1}' data/train_mixed_hires_30k/utt2spk) \
data/train_960 data/train_960_30k
steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
data/train_960_30k data/lang exp/tri6b exp/nnet2_online/tri6b_ali_30k
fi
if [ $stage -le 3 ]; then
# Train a small system just for its LDA+MLLT transform. We use --num-iters 13
# because after we get the transform (12th iter is the last), any further
# training is pointless.
steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
--realign-iters "" \
--splice-opts "--left-context=3 --right-context=3" \
5000 10000 data/train_mixed_hires_30k data/lang \
exp/nnet2_online/tri6b_ali_30k exp/nnet2_online/tri7b
fi
if [ $stage -le 4 ]; then
mkdir -p exp/nnet2_online
# To train a diagonal UBM we don't need very much data, so use a small subset
# (actually, it's not that small: still around 100 hours).
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 700000 \
data/train_mixed_hires_30k 512 exp/nnet2_online/tri7b exp/nnet2_online/diag_ubm
fi
if [ $stage -le 5 ]; then
# iVector extractors can in general be sensitive to the amount of data, but
# this one has a fairly small dim (defaults to 100) so we don't use all of it,
# we use just the 60k subset (about one fifth of the data, or 200 hours).
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
data/train_mixed_hires_60k exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
fi
if [ $stage -le 6 ]; then
ivectordir=exp/nnet2_online/ivectors_train_960_hires
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage
fi
# We extract iVectors on all the train data, which will be what we train the
# system on. With --utts-per-spk-max 2, the script pairs the utterances
# into twos, and treats each of these pairs as one speaker. Note that these
# are extracted 'online'.
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
--utts-per-spk-max 2 data/train_960_hires exp/nnet2_online/extractor $ivectordir || exit 1;
fi
local/online/run_nnet2_common.sh --stage $stage
if [ $stage -le 7 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
@@ -126,9 +50,8 @@ if [ $stage -le 7 ]; then
# The size of the system is kept rather smaller than the run_7a_960.sh system:
# this is because we want it to be small enough that we could plausibly run it
# in real-time.
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--samples-per-iter 400000 \
--num-epochs 6 --num-epochs-extra 2 \
steps/nnet2/train_pnorm_simple2.sh --stage $train_stage \
--num-epochs 8 --num-jobs-nnet 6 \
--splice-width 7 --feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
--cmvn-opts "--norm-means=false --norm-vars=false" \
@@ -136,7 +59,6 @@ if [ $stage -le 7 ]; then
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--io-opts "-tc 12" \
--num-jobs-nnet 6 \
--initial-learning-rate 0.01 --final-learning-rate 0.001 \
--cmd "$decode_cmd" \
--pnorm-input-dim 3500 \

View File

@@ -0,0 +1,94 @@
#!/bin/bash
# this script contains some common (shared) parts of the run_nnet*.sh scripts.
. cmd.sh
stage=0
set -e
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if [ $stage -le 1 ]; then
# Create high-resolution MFCC features (with 40 cepstra instead of 13).
# this shows how you can split across multiple file-systems. we'll split the
# MFCC dir across multiple locations. You might want to be careful here, if you
# have multiple copies of Kaldi checked out and run the same recipe, not to let
# them overwrite each other.
mfccdir=mfcc
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
fi
for datadir in train_960 dev_clean dev_other; do
utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
done
# now create some data subsets.
# mixed is the clean+other data.
# 30k is 1/10 of the data (around 100 hours), 60k is 1/5th of it (around 200 hours).
utils/subset_data_dir.sh data/train_960_hires 30000 data/train_mixed_hires_30k
utils/subset_data_dir.sh data/train_960_hires 60000 data/train_mixed_hires_60k
fi
if [ $stage -le 2 ]; then
# We need to build a small system just because we need the LDA+MLLT transform
# to train the diag-UBM on top of. We align a subset of training data for
# this purpose.
utils/subset_data_dir.sh --utt-list <(awk '{print $1}' data/train_mixed_hires_30k/utt2spk) \
data/train_960 data/train_960_30k
steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
data/train_960_30k data/lang exp/tri6b exp/nnet2_online/tri6b_ali_30k
fi
if [ $stage -le 3 ]; then
# Train a small system just for its LDA+MLLT transform. We use --num-iters 13
# because after we get the transform (12th iter is the last), any further
# training is pointless.
steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
--realign-iters "" \
--splice-opts "--left-context=3 --right-context=3" \
5000 10000 data/train_mixed_hires_30k data/lang \
exp/nnet2_online/tri6b_ali_30k exp/nnet2_online/tri7b
fi
if [ $stage -le 4 ]; then
mkdir -p exp/nnet2_online
# To train a diagonal UBM we don't need very much data, so use a small subset
# (actually, it's not that small: still around 100 hours).
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 700000 \
data/train_mixed_hires_30k 512 exp/nnet2_online/tri7b exp/nnet2_online/diag_ubm
fi
if [ $stage -le 5 ]; then
# iVector extractors can in general be sensitive to the amount of data, but
# this one has a fairly small dim (defaults to 100) so we don't use all of it,
# we use just the 60k subset (about one fifth of the data, or 200 hours).
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
data/train_mixed_hires_60k exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
fi
if [ $stage -le 6 ]; then
ivectordir=exp/nnet2_online/ivectors_train_960_hires
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage
fi
# We extract iVectors on all the train data, which will be what we train the
# system on. With --utts-per-spk-max 2, the script pairs the utterances
# into twos, and treats each of these pairs as one speaker. Note that these
# are extracted 'online'.
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
--utts-per-spk-max 2 data/train_960_hires exp/nnet2_online/extractor $ivectordir || exit 1;
fi
exit 0;

View File

@@ -0,0 +1,159 @@
#!/bin/bash
# This script does discriminative training on top of the online
# system trained in run_nnet2.sh.
# note: this relies on having a cluster that has plenty of CPUs as well as GPUs,
# since the lattice generation runs at about real time and so takes on the order of
# 1000 hours of CPU time.
#
# Note: rather than using any features we have dumped on disk, this script
# regenerates them from the wav data three times-- when we do lattice
# generation, numerator alignment and discriminative training. This made the
# script easier to write and more generic, because we don't have to know where
# the features and the iVectors are, but of course it's a little inefficient.
# The time taken is dominated by the lattice generation anyway, so this isn't
# a huge deal.
. cmd.sh
stage=0
train_stage=-10
use_gpu=true
srcdir=exp/nnet2_online/nnet_a
criterion=smbr
drop_frames=false # only relevant for MMI actually.
learning_rate=0.0001
train_stage=-10 # can be used to start training in the middle.
decode_start_epoch=0 # can be used to avoid decoding all epochs, e.g. if we decided to run more.
num_epochs=4
cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats,
# alignments and degs).
set -e
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
parallel_opts="-pe smp $num_threads"
fi
if [ ! -f ${srcdir}_online/final.mdl ]; then
echo "$0: expected ${srcdir}_online/final.mdl to exist; first run run_nnet2_ms.sh."
exit 1;
fi
if [ $stage -le 1 ]; then
nj=50 # this doesn't really affect anything strongly, except the num-jobs for one of
# the phases of get_egs_discriminative2.sh below.
num_threads_denlats=6
subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving
# total slots = 80 * 6 = 480).
steps/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G -pe smp $num_threads_denlats" \
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
--nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
data/train_960_hires data/lang $srcdir ${srcdir}_denlats || exit 1;
# the command below is a more generic, but slower, way to do it.
#steps/online/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G -pe smp $num_threads_denlats" \
# --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
# data/train_960 data/lang ${srcdir}_online ${srcdir}_denlats || exit 1;
fi
if [ $stage -le 2 ]; then
# hardcode no-GPU for alignment, although you could use GPU [you wouldn't
# get excellent GPU utilization though.]
nj=350 # have a high number of jobs because this could take a while, and we might
# have some stragglers.
use_gpu=no
gpu_opts=
steps/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
--nj $nj data/train_960_hires data/lang $srcdir ${srcdir}_ali || exit 1;
# the command below is a more generic, but slower, way to do it.
# steps/online/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
# --nj $nj data/train_960 data/lang ${srcdir}_online ${srcdir}_ali || exit 1;
fi
if [ $stage -le 3 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${srcdir}_degs/storage ]; then
utils/create_split_dir.pl \
/export/b0{1,2,5,6}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/${srcdir}_degs/storage ${srcdir}_degs/storage
fi
# have a higher maximum num-jobs if the degs are spread across multiple filesystems (i.e. if ${srcdir}_degs/storage exists).
if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi
steps/nnet2/get_egs_discriminative2.sh \
--cmd "$decode_cmd -tc $max_jobs" \
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
--criterion $criterion --drop-frames $drop_frames \
data/train_960_hires data/lang ${srcdir}{_ali,_denlats,/final.mdl,_degs} || exit 1;
# the command below is a more generic, but slower, way to do it.
#steps/online/nnet2/get_egs_discriminative2.sh \
# --cmd "$decode_cmd -tc $max_jobs" \
# --criterion $criterion --drop-frames $drop_frames \
# data/train_960 data/lang ${srcdir}{_ali,_denlats,_online,_degs} || exit 1;
fi
if [ $stage -le 4 ]; then
steps/nnet2/train_discriminative2.sh --cmd "$decode_cmd $parallel_opts" \
--stage $train_stage \
--learning-rate $learning_rate \
--criterion $criterion --drop-frames $drop_frames \
--num-epochs $num_epochs \
--num-jobs-nnet 2 --num-threads $num_threads \
${srcdir}_degs ${srcdir}_${criterion}_${learning_rate} || exit 1;
fi
if [ $stage -le 5 ]; then
dir=${srcdir}_${criterion}_${learning_rate}
ln -sf $(readlink -f ${srcdir}_online/conf) $dir/conf # so it acts like an online-decoding directory
for epoch in $(seq $decode_start_epoch $num_epochs); do
for test in dev_clean dev_other; do
(
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 50 \
--iter epoch$epoch exp/tri6b/graph_tgsmall data/${test} $dir/decode_epoch${epoch}_${test}_tgsmall || exit 1
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/${test} $dir/decode_epoch${epoch}_${test}_{tgsmall,tgmed} || exit 1;
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test $dir/decode_epoch${epoch}_${test}_{tgsmall,tglarge} || exit 1;
) &
done
done
wait
for dir in $dir/decode*; do grep WER $dir/wer_* | utils/best_wer.sh; done
fi
if [ $stage -le 6 ] && $cleanup; then
# if you run with "--cleanup true --stage 6" you can clean up.
rm ${srcdir}_denlats/lat.*.gz || true
rm ${srcdir}_ali/ali.*.gz || true
steps/nnet2/remove_egs.sh ${srcdir}_degs || true
fi
exit 0;

View File

@@ -0,0 +1,159 @@
#!/bin/bash
# This is the "multi-splice" version of the online-nnet2 training script.
# It's currently the best recipe.
# You'll notice that we splice over successively larger windows as we go deeper
# into the network.
. cmd.sh
stage=7
train_stage=-10
use_gpu=true
dir=exp/nnet2_online/nnet_ms_a
set -e
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
parallel_opts="$parallel_opts --config conf/queue_no_k20.conf --allow-k20 false"
# That config is like the default config embedded in queue.pl, but with the following lines added:
# default allow_k20=true
# option allow_k20=true
# option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*'
# It's a workaround for an NVidia CUDA library bug in our currently installed version
# of the CUDA toolkit, which only shows up on K20s.
fi
# the _a is in case I want to change the parameters.
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
fi
# do the common parts of the script.
local/online/run_nnet2_common.sh --stage $stage
if [ $stage -le 7 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{3,4,5,6}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
fi
# The size of the system is kept rather smaller than the run_7a_960.sh system:
# this is because we want it to be small enough that we could plausibly run it
# in real-time.
steps/nnet2/train_multisplice_accel2.sh --stage $train_stage \
--num-epochs 8 --num-jobs-initial 3 --num-jobs-final 18 \
--num-hidden-layers 6 --splice-indexes "layer0/-2:-1:0:1:2 layer1/-1:2 layer3/-3:3 layer4/-7:2" \
--feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
--cmvn-opts "--norm-means=false --norm-vars=false" \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--io-opts "--max-jobs-run 12" \
--initial-effective-lrate 0.0015 --final-effective-lrate 0.00015 \
--cmd "$decode_cmd" \
--pnorm-input-dim 3500 \
--pnorm-output-dim 350 \
--mix-up 12000 \
data/train_960_hires data/lang exp/tri6b $dir || exit 1;
fi
if [ $stage -le 8 ]; then
# dump iVectors for the testing data.
for test in dev_clean dev_other; do
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \
data/${test}_hires exp/nnet2_online/extractor exp/nnet2_online/ivectors_$test || exit 1;
done
fi
if [ $stage -le 9 ]; then
# this does offline decoding that should give about the same results as the
# real online decoding (the one with --per-utt true)
for test in dev_clean dev_other; do
steps/nnet2/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
--online-ivector-dir exp/nnet2_online/ivectors_${test} \
exp/tri6b/graph_tgsmall data/${test}_hires $dir/decode_${test}_tgsmall || exit 1;
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/${test}_hires $dir/decode_${test}_{tgsmall,tgmed} || exit 1;
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test $dir/decode_${test}_{tgsmall,tglarge} || exit 1;
done
fi
if [ $stage -le 10 ]; then
# If this setup used PLP features, we'd have to give the option --feature-type plp
# to the script below.
steps/online/nnet2/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \
data/lang exp/nnet2_online/extractor "$dir" ${dir}_online || exit 1;
fi
if [ $stage -le 11 ]; then
# do the actual online decoding with iVectors, carrying info forward from
# previous utterances of the same speaker.
for test in dev_clean dev_other; do
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 30 \
exp/tri6b/graph_tgsmall data/$test ${dir}_online/decode_${test}_tgsmall || exit 1;
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/$test ${dir}_online/decode_${test}_{tgsmall,tgmed} || exit 1;
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test ${dir}_online/decode_${test}_{tgsmall,tglarge} || exit 1;
done
fi
if [ $stage -le 12 ]; then
# this version of the decoding treats each utterance separately
# without carrying forward speaker information.
for test in dev_clean dev_other; do
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 30 \
--per-utt true exp/tri6b/graph_tgsmall data/$test ${dir}_online/decode_${test}_tgsmall_utt || exit 1;
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/$test ${dir}_online/decode_${test}_{tgsmall,tgmed}_utt || exit 1;
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test ${dir}_online/decode_${test}_{tgsmall,tglarge}_utt || exit 1;
done
fi
if [ $stage -le 13 ]; then
# this version of the decoding treats each utterance separately
# without carrying forward speaker information, but looks to the end
# of the utterance while computing the iVector (--online false)
for test in dev_clean dev_other; do
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 30 \
--per-utt true --online false exp/tri6b/graph_tgsmall data/$test \
${dir}_online/decode_${test}_tgsmall_utt_offline || exit 1;
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/$test ${dir}_online/decode_${test}_{tgsmall,tgmed}_utt_offline || exit 1;
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test ${dir}_online/decode_${test}_{tgsmall,tglarge}_utt_offline || exit 1;
done
fi
exit 0;

View File

@@ -0,0 +1,160 @@
#!/bin/bash
# This script does discriminative training on top of the online, multi-splice
# system trained in run_nnet2_ms.sh.
# note: this relies on having a cluster that has plenty of CPUs as well as GPUs,
# since the lattice generation runs at about real time and so takes on the order of
# 1000 hours of CPU time.
#
# Note: rather than using any features we have dumped on disk, this script
# regenerates them from the wav data three times-- when we do lattice
# generation, numerator alignment and discriminative training. This made the
# script easier to write and more generic, because we don't have to know where
# the features and the iVectors are, but of course it's a little inefficient.
# The time taken is dominated by the lattice generation anyway, so this isn't
# a huge deal.
. cmd.sh
stage=0
train_stage=-10
use_gpu=true
srcdir=exp/nnet2_online/nnet_ms_a
criterion=smbr
drop_frames=false # only matters for MMI anyway.
effective_lrate=0.000005
num_jobs_nnet=6
train_stage=-10 # can be used to start training in the middle.
decode_start_epoch=0 # can be used to avoid decoding all epochs, e.g. if we decided to run more.
num_epochs=4
cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats,
# alignments and degs).
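# Example invocation (ours, for illustration; utils/parse_options.sh below turns the
# variables above into command-line flags):
#   local/online/run_nnet2_ms_disc.sh --criterion smbr --effective-lrate 0.000005 \
#     --num-epochs 4 --stage 0
# With those (default) values the output goes to exp/nnet2_online/nnet_ms_a_smbr_0.000005,
# which is the directory the sMBR numbers in the RESULTS file refer to.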
set -e
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
parallel_opts="-pe smp $num_threads"
fi
if [ ! -f ${srcdir}_online/final.mdl ]; then
echo "$0: expected ${srcdir}_online/final.mdl to exist; first run run_nnet2_ms.sh."
exit 1;
fi
if [ $stage -le 1 ]; then
nj=50 # this doesn't really affect anything strongly, except the num-jobs for one of
# the phases of get_egs_discriminative2.sh below.
num_threads_denlats=6
subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving
# max total slots = 80 * 6 = 480).
steps/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G -pe smp $num_threads_denlats" \
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
--nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
data/train_960_hires data/lang $srcdir ${srcdir}_denlats || exit 1;
# the command below is a more generic, but slower, way to do it.
#steps/online/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G -pe smp $num_threads_denlats" \
# --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
# data/train_960 data/lang ${srcdir}_online ${srcdir}_denlats || exit 1;
fi
if [ $stage -le 2 ]; then
# hardcode no-GPU for alignment, although you could use GPU [you wouldn't
# get excellent GPU utilization though.]
nj=350 # have a high number of jobs because this could take a while, and we might
# have some stragglers.
use_gpu=no
gpu_opts=
steps/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
--nj $nj data/train_960_hires data/lang $srcdir ${srcdir}_ali || exit 1;
# the command below is a more generic, but slower, way to do it.
# steps/online/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
# --nj $nj data/train_960 data/lang ${srcdir}_online ${srcdir}_ali || exit 1;
fi
if [ $stage -le 3 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${srcdir}_degs/storage ]; then
utils/create_split_dir.pl \
/export/b0{1,2,5,6}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/${srcdir}_degs/storage ${srcdir}_degs/storage
fi
# have a higher maximum num-jobs if the degs are spread across multiple filesystems (i.e. if ${srcdir}_degs/storage exists).
if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi
steps/nnet2/get_egs_discriminative2.sh \
--cmd "$decode_cmd -tc $max_jobs" \
--online-ivector-dir exp/nnet2_online/ivectors_train_960_hires \
--criterion $criterion --drop-frames $drop_frames \
data/train_960_hires data/lang ${srcdir}{_ali,_denlats,/final.mdl,_degs} || exit 1;
# the command below is a more generic, but slower, way to do it.
#steps/online/nnet2/get_egs_discriminative2.sh \
# --cmd "$decode_cmd -tc $max_jobs" \
# --criterion $criterion --drop-frames $drop_frames \
# data/train_960 data/lang ${srcdir}{_ali,_denlats,_online,_degs} || exit 1;
fi
if [ $stage -le 4 ]; then
steps/nnet2/train_discriminative2.sh --cmd "$decode_cmd $parallel_opts" \
--stage $train_stage \
--effective-lrate $effective_lrate \
--criterion $criterion --drop-frames $drop_frames \
--num-epochs $num_epochs \
--num-jobs-nnet 6 --num-threads $num_threads \
${srcdir}_degs ${srcdir}_${criterion}_${effective_lrate} || exit 1;
fi
if [ $stage -le 5 ]; then
dir=${srcdir}_${criterion}_${effective_lrate}
ln -sf $(readlink -f ${srcdir}_online/conf) $dir/conf # so it acts like an online-decoding directory
for epoch in $(seq $decode_start_epoch $num_epochs); do
for test in dev_clean dev_other; do
(
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 50 \
--iter epoch$epoch exp/tri6b/graph_tgsmall data/${test} $dir/decode_epoch${epoch}_${test}_tgsmall || exit 1
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/${test} $dir/decode_epoch${epoch}_${test}_{tgsmall,tgmed} || exit 1;
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test $dir/decode_epoch${epoch}_${test}_{tgsmall,tglarge} || exit 1;
) &
done
done
wait
for dir in $dir/decode*; do grep WER $dir/wer_* | utils/best_wer.sh; done
fi
if [ $stage -le 6 ] && $cleanup; then
# if you run with "--cleanup true --stage 6" you can clean up.
rm ${srcdir}_denlats/lat.*.gz || true
rm ${srcdir}_ali/ali.*.gz || true
steps/nnet2/remove_egs.sh ${srcdir}_degs || true
fi
exit 0;

View File

@@ -260,6 +260,9 @@ steps/train_quick.sh --cmd "$train_cmd" \
done
)&
# steps/cleanup/debug_lexicon.sh --remove-stress true --nj 200 --cmd "$train_cmd" data/train_clean_100 \
# data/lang exp/tri6b data/local/dict/lexicon.txt exp/debug_lexicon_100h
# Perform RNNLM rescoring of tri6b
# Attention: with default settings requires 4 GB of memory per rescoring job, so commenting this out by default
# local/run_rnnlm.sh $data data/local/lm
@@ -271,4 +274,13 @@ local/nnet2/run_7a_960.sh || exit 1
## we've found that this isn't helpful-- see the comments in local/run_data_cleaning.sh
#local/run_data_cleaning.sh
# local/online/run_nnet2.sh
# # The following is the current online-nnet2 recipe, with "multi-splice".
# local/online/run_nnet2_ms.sh
# # The following is the discriminative-training continuation of the above.
# local/online/run_nnet2_ms_disc.sh
# ## The following is an older version of the online-nnet2 recipe, without "multi-splice". It's faster
# ## to train but slightly worse.
# # local/online/run_nnet2.sh