add fisher_swbd nnet3 and chain recipe

Xingyu Na 2016-04-05 16:17:04 +08:00
Parent c823bd87c1
Commit 02cf52a48e
7 changed files: 1052 additions and 0 deletions

View File

@@ -42,8 +42,71 @@ for x in exp/nnet2_online/nnet_ms_a_online/decode_eval2000*_fg; do grep Sum $x/
%WER 12.3 | 1831 21395 | 89.2 7.2 3.5 1.5 12.3 50.8 | exp/nnet2_online/nnet_ms_a_online/decode_eval2000_utt_fsh_sw1_fg/score_13/eval2000.ctm.swbd.filt.sys
%WER 11.8 | 1831 21395 | 89.6 7.2 3.2 1.4 11.8 49.0 | exp/nnet2_online/nnet_ms_a_online/decode_eval2000_utt_offline_fsh_sw1_fg/score_11/eval2000.ctm.swbd.filt.sys
# nnet3 result on eval2000
# BLSTM ran for about 760 hours, command:
# local/nnet3/run_lstm.sh --affix bidirectional --lstm-delay " [-1,1] [-2,2] [-3,3] " --label-delay 0 \
# --cell-dim 1024 --recurrent-projection-dim 128 --non-recurrent-projection-dim 128 \
# --chunk-left-context 40 --chunk-right-context 40 \
# --extra-left-context 50 --extra-right-context 50
# use tri-gram
for x in exp/nnet3/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done
%WER 15.8 | 4459 42989 | 86.1 9.7 4.1 1.9 15.8 52.6 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
%WER 14.8 | 4459 42989 | 86.6 9.2 4.3 1.4 14.8 54.3 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
# rescore with four-gram
for x in exp/nnet3/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done
%WER 15.4 | 4459 42989 | 86.4 9.5 4.0 1.8 15.4 51.6 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
%WER 14.5 | 4459 42989 | 87.0 9.0 4.0 1.5 14.5 53.7 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys
# nnet3 result on eval2000 for swbd subset
# use tri-gram
for x in exp/nnet3/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done
%WER 11.6 | 1831 21395 | 89.7 7.3 3.0 1.3 11.6 47.7 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys
%WER 10.7 | 1831 21395 | 90.3 6.7 3.0 1.0 10.7 45.9 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys
# rescore with four-gram
for x in exp/nnet3/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done
%WER 11.1 | 1831 21395 | 90.2 7.0 2.8 1.3 11.1 46.2 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys
%WER 10.4 | 1831 21395 | 90.6 6.5 2.9 1.0 10.4 45.3 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys
# nnet3 result on eval2000 for callhm subset
# use tri-gram
for x in exp/nnet3/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.callhm.filt.sys | utils/best_wer.sh ; done
%WER 19.9 | 2628 21594 | 82.6 12.1 5.3 2.6 19.9 56.0 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys
%WER 18.8 | 2628 21594 | 83.1 11.7 5.2 1.9 18.8 60.2 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys
# rescore with four-gram
for x in exp/nnet3/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.callhm.filt.sys | utils/best_wer.sh ; done
%WER 19.7 | 2628 21594 | 82.7 12.1 5.2 2.4 19.7 55.3 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys
%WER 18.6 | 2628 21594 | 83.3 11.5 5.2 1.9 18.6 59.6 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys
# chain result on eval2000
# BLSTM ran for about 380 hours
# use tri-gram
for x in exp/chain/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done
%WER 13.6 | 4459 42989 | 88.2 7.9 3.9 1.8 13.6 51.0 | exp/chain/tdnn_7b_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys
%WER 12.1 | 4459 42989 | 89.7 6.8 3.5 1.8 12.1 50.2 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.filt.sys
# rescore with four-gram
for x in exp/chain/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done
%WER 13.3 | 4459 42989 | 88.4 7.8 3.8 1.8 13.3 50.1 | exp/chain/tdnn_7b_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys
%WER 12.0 | 4459 42989 | 89.6 6.5 3.8 1.7 12.0 49.3 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_fg/score_8_0.5/eval2000_hires.ctm.filt.sys
# chain result on eval2000 for swbd subset
# use tri-gram
for x in exp/chain/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done
%WER 9.4 | 1831 21395 | 91.7 5.4 2.9 1.2 9.4 43.9 | exp/chain/tdnn_7b_sp/decode_eval2000_fsh_sw1_tg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys
%WER 8.8 | 1831 21395 | 92.5 5.3 2.2 1.4 8.8 46.9 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_tg/score_7_1.0/eval2000_hires.ctm.swbd.filt.sys
# rescore with four-gram
for x in exp/chain/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done
%WER 9.2 | 1831 21395 | 92.1 5.6 2.3 1.3 9.2 42.4 | exp/chain/tdnn_7b_relu_sp/decode_eval2000_fsh_sw1_fg/score_9_0.0/eval2000_hires.ctm.swbd.filt.sys
%WER 8.5 | 1831 21395 | 92.6 4.9 2.4 1.2 8.5 44.1 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_fg/score_9_1.0/eval2000_hires.ctm.swbd.filt.sys
# chain result on eval2000 for callhm subset
# use tri-gram
for x in exp/chain/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.callhm.filt.sys | utils/best_wer.sh ; done
%WER 17.4 | 2628 21594 | 84.7 9.8 5.5 2.1 17.4 55.3 | exp/chain/tdnn_7b_relu_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys
%WER 15.3 | 2628 21594 | 86.9 8.3 4.8 2.2 15.3 52.4 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys
# rescore with four-gram
for x in exp/chain/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.callhm.filt.sys | utils/best_wer.sh ; done
%WER 17.3 | 2628 21594 | 84.9 9.7 5.5 2.1 17.3 55.0 | exp/chain/tdnn_7b_relu_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys
%WER 15.3 | 2628 21594 | 87.0 8.6 4.4 2.4 15.3 52.1 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_fg/score_6_0.5/eval2000_hires.ctm.callhm.filt.sys
# GMM and SGMM numbers reported on rt03
for x in exp/*/decode_rt03*; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done
@@ -89,3 +152,63 @@ for x in exp/nnet2_online/nnet_ms_a_online/decode_rt03*_fg; do grep Sum $x/scor
%WER 20.2 | 3970 36721 | 88.3 8.1 3.6 8.5 20.2 74.3 | exp/nnet2_online/nnet_ms_a_online/decode_rt03_utt_fsh_sw1_fg/score_11/rt03.ctm.swbd.filt.sys
%WER 19.1 | 3970 36721 | 88.8 7.8 3.4 7.9 19.1 72.2 | exp/nnet2_online/nnet_ms_a_online/decode_rt03_utt_offline_fsh_sw1_fg/score_11/rt03.ctm.swbd.filt.sys
# nnet3 result on rt03
# use tri-gram
for x in exp/nnet3/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done
%WER 14.7 | 8420 76157 | 86.8 8.9 4.3 1.5 14.7 45.9 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_tg/score_11_0.0/rt03_hires.ctm.filt.sys
%WER 14.2 | 8420 76157 | 87.0 8.7 4.3 1.2 14.2 46.9 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.filt.sys
# rescore with four-gram
for x in exp/nnet3/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done
%WER 14.4 | 8420 76157 | 87.1 8.8 4.2 1.5 14.4 45.2 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_fg/score_11_0.0/rt03_hires.ctm.filt.sys
%WER 13.9 | 8420 76157 | 87.2 8.4 4.3 1.2 13.9 46.0 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_fg/score_9_0.0/rt03_hires.ctm.filt.sys
# nnet3 result on rt03 for swbd subset
# use tri-gram
for x in exp/nnet3/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done
%WER 17.4 | 4450 39436 | 84.3 10.6 5.1 1.8 17.4 48.9 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_tg/score_11_0.5/rt03_hires.ctm.swbd.filt.sys
%WER 16.6 | 4450 39436 | 84.7 10.0 5.3 1.3 16.6 49.6 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_tg/score_10_0.5/rt03_hires.ctm.swbd.filt.sys
# rescore with four-gram
for x in exp/nnet3/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done
%WER 17.1 | 4450 39436 | 84.6 10.3 5.1 1.8 17.1 48.2 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_fg/score_12_0.0/rt03_hires.ctm.swbd.filt.sys
%WER 16.3 | 4450 39436 | 85.0 9.8 5.1 1.3 16.3 49.0 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_fg/score_10_0.0/rt03_hires.ctm.swbd.filt.sys
# nnet3 result on rt03 for fsh subset
# use tri-gram
for x in exp/nnet3/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.fsh.filt.sys | utils/best_wer.sh ; done
%WER 11.8 | 3970 36721 | 89.4 7.2 3.5 1.2 11.8 42.5 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_tg/score_11_0.0/rt03_hires.ctm.fsh.filt.sys
%WER 11.6 | 3970 36721 | 89.4 7.1 3.5 1.0 11.6 43.6 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys
# rescore with four-gram
for x in exp/nnet3/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.fsh.filt.sys | utils/best_wer.sh ; done
%WER 11.4 | 3970 36721 | 89.7 6.9 3.4 1.1 11.4 41.5 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_fg/score_11_0.0/rt03_hires.ctm.fsh.filt.sys
%WER 11.4 | 3970 36721 | 89.5 6.7 3.8 1.0 11.4 42.6 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_fg/score_10_0.0/rt03_hires.ctm.fsh.filt.sys
# chain result on rt03
# BLSTM ran for about 380 hours
# use tri-gram
for x in exp/chain/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done
%WER 12.7 | 8420 76157 | 88.5 7.2 4.2 1.3 12.7 43.2 | exp/chain/tdnn_7b_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.filt.sys
%WER 11.7 | 8420 76157 | 89.8 6.6 3.6 1.5 11.7 43.7 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.filt.sys
# rescore with four-gram
for x in exp/chain/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done
%WER 12.4 | 8420 76157 | 88.9 7.0 4.1 1.3 12.4 42.7 | exp/chain/tdnn_7b_sp/decode_rt03_fsh_sw1_fg/score_9_0.0/rt03_hires.ctm.filt.sys
%WER 11.4 | 8420 76157 | 89.9 6.1 3.9 1.3 11.4 43.4 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.filt.sys
# chain result on rt03 for swbd subset
# use tri-gram
for x in exp/chain/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done
%WER 15.0 | 4450 39436 | 86.4 8.6 5.0 1.4 15.0 45.8 | exp/chain/tdnn_7b_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys
%WER 13.3 | 4450 39436 | 88.3 7.5 4.2 1.6 13.3 45.2 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys
# rescore with four-gram
for x in exp/chain/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done
%WER 14.8 | 4450 39436 | 86.5 8.0 5.5 1.3 14.8 45.5 | exp/chain/tdnn_7b_sp/decode_rt03_fsh_sw1_fg/score_10_0.0/rt03_hires.ctm.swbd.filt.sys
%WER 13.0 | 4450 39436 | 88.5 7.3 4.2 1.6 13.0 44.8 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys
# chain result on rt03 for fsh subset
# use tri-gram
for x in exp/chain/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.fsh.filt.sys | utils/best_wer.sh ; done
%WER 10.2 | 3970 36721 | 91.1 6.0 3.0 1.2 10.2 40.2 | exp/chain/tdnn_7b_relu_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys
%WER 9.8 | 3970 36721 | 91.4 5.3 3.3 1.2 9.8 42.0 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys
# rescore with four-gram
for x in exp/chain/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.fsh.filt.sys | utils/best_wer.sh ; done
%WER 9.8 | 3970 36721 | 91.4 5.8 2.8 1.2 9.8 39.6 | exp/chain/tdnn_7b_relu_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys
%WER 9.6 | 3970 36721 | 91.6 5.2 3.3 1.2 9.6 41.4 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_fg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys

View File

@@ -0,0 +1,181 @@
#!/bin/bash
# based on run_tdnn_6h.sh
set -e
# configs for 'chain'
stage=12
train_stage=-10
get_egs_stage=-10
dir=exp/chain/blstm_6h
decode_iter=
decode_dir_affix=
# training options
num_epochs=4
remove_egs=false
common_egs_dir=
affix=
chunk_width=150
chunk_left_context=40
chunk_right_context=40
# End configuration section.
echo "$0 $@" # Print the command line for logging
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
dir=$dir${affix:+_$affix}
train_set=train_nodup_sp
ali_dir=exp/tri5a_ali_nodup
treedir=exp/chain/tri6_tree_11000
lang=data/lang_chain
# The iVector-extraction and feature-dumping parts are the same as the standard
# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
# run those things.
local/nnet3/run_ivector_common.sh --stage $stage \
--speed-perturb true \
--generate-alignments false || exit 1;
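# For example, if the iVector/feature stages and the lattice/lang/tree stages
# below have already been run (e.g. by the TDNN chain script), one would
# typically re-run only from the network-config stage onwards, roughly like
# this (illustrative invocation; the actual script path depends on where this
# file is saved):
#   local/chain/run_blstm_6h.sh --stage 12 --train-stage -10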
if [ $stage -le 9 ]; then
# Get the alignments as lattices (gives the chain training more freedom).
# use the same num-jobs as the alignments
nj=$(cat $ali_dir/num_jobs) || exit 1;
steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
data/lang exp/tri5a exp/tri5a_lats_nodup_sp
rm exp/tri5a_lats_nodup_sp/fsts.*.gz # save space
fi
if [ $stage -le 10 ]; then
# Create a version of the lang/ directory that has one state per phone in the
# topo file. [note, it really has two states.. the first one is only repeated
# once, the second one has zero or more repeats.]
rm -rf $lang
cp -r data/lang $lang
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
# Use our special topology... note that later on we may have to tune this
# topology.
steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
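# As an illustrative sketch (not necessarily the exact gen_topo.py output),
# the resulting $lang/topo has one entry of roughly this shape covering all
# the phones:
#   <State> 0 <PdfClass> 0 <Transition> 1 0.5 <Transition> 2 0.5 </State>
#   <State> 1 <PdfClass> 1 <Transition> 1 0.5 <Transition> 2 0.5 </State>
#   <State> 2 </State>
# i.e. state 0 is traversed exactly once and state 1 can repeat zero or more
# times, which is what the comment above refers to.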
fi
if [ $stage -le 11 ]; then
# Build a tree using our new topology.
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
--leftmost-questions-truncate -1 \
--cmd "$train_cmd" 11000 data/$train_set $lang $ali_dir $treedir
fi
if [ $stage -le 12 ]; then
echo "$0: creating neural net configs";
steps/nnet3/lstm/make_configs.py \
--feat-dir data/${train_set}_hires \
--ivector-dir exp/nnet3/ivectors_${train_set} \
--tree-dir $treedir \
--splice-indexes="-2,-1,0,1,2 0 0" \
--lstm-delay=" [-3,3] [-3,3] [-3,3] " \
--xent-regularize 0.1 \
--include-log-softmax false \
--num-lstm-layers 3 \
--cell-dim 1024 \
--hidden-dim 1024 \
--recurrent-projection-dim 256 \
--non-recurrent-projection-dim 256 \
--label-delay 0 \
--self-repair-scale 0.00001 \
$dir/configs || exit 1;
fi
if [ $stage -le 13 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
fi
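# (utils/create_split_dir.pl makes the egs/storage directory a set of
# subdirectories spread over the filesystems listed above, linked back into
# $dir/egs/storage, so the large egs archives get distributed across disks;
# this only applies on the CLSP grid matched by the hostname check.)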
mkdir -p $dir/egs
touch $dir/egs/.nodelete # keep egs around when that run dies.
steps/nnet3/chain/train.py --stage $train_stage \
--cmd "$decode_cmd" \
--feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
--feat.cmvn-opts "--norm-means=false --norm-vars=false" \
--chain.xent-regularize 0.1 \
--chain.leaky-hmm-coefficient 0.1 \
--chain.l2-regularize 0.00005 \
--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--chain.left-deriv-truncate 0 \
--trainer.num-chunk-per-minibatch 64 \
--trainer.frames-per-iter 1200000 \
--trainer.max-param-change 1.414 \
--trainer.num-epochs $num_epochs \
--trainer.optimization.shrink-value 0.99 \
--trainer.optimization.num-jobs-initial 3 \
--trainer.optimization.num-jobs-final 16 \
--trainer.optimization.initial-effective-lrate 0.001 \
--trainer.optimization.final-effective-lrate 0.0001 \
--trainer.optimization.momentum 0.0 \
--egs.stage $get_egs_stage \
--egs.opts "--frames-overlap-per-eg 0" \
--egs.chunk-width $chunk_width \
--egs.chunk-left-context $chunk_left_context \
--egs.chunk-right-context $chunk_right_context \
--egs.dir "$common_egs_dir" \
--cleanup.remove-egs $remove_egs \
--feat-dir data/${train_set}_hires \
--tree-dir $treedir \
--lat-dir exp/tri5a_lats_nodup_sp \
--dir $dir || exit 1;
fi
if [ $stage -le 14 ]; then
# Note: it might appear that this $lang directory is mismatched, and it is as
# far as the 'topo' is concerned, but this script doesn't read the 'topo' from
# the lang directory.
utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
fi
decode_suff=fsh_sw1_tg
graph_dir=$dir/graph_fsh_sw1_tg
if [ $stage -le 15 ]; then
iter_opts=
if [ ! -z $decode_iter ]; then
iter_opts=" --iter $decode_iter "
fi
# decoding options
extra_left_context=$[$chunk_left_context+10]
extra_right_context=$[$chunk_right_context+10]
for decode_set in eval2000 rt03; do
(
num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l`
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--nj $num_jobs --cmd "$decode_cmd" $iter_opts \
--extra-left-context $extra_left_context \
--extra-right-context $extra_right_context \
--frames-per-chunk $chunk_width \
--online-ivector-dir exp/nnet3/ivectors_${decode_set} \
$graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1;
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \
$dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1;
) &
done
fi
wait;
exit 0;

View File

@@ -0,0 +1,162 @@
#!/bin/bash
set -e
# based on run_tdnn_7b.sh in the swbd recipe
# configs for 'chain'
affix=
stage=12
train_stage=-10
get_egs_stage=-10
dir=exp/chain/tdnn_7b
decode_iter=
# training options
num_epochs=4
remove_egs=false
common_egs_dir=
# End configuration section.
echo "$0 $@" # Print the command line for logging
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
dir=${dir}${affix:+_$affix}
train_set=train_nodup_sp
ali_dir=exp/tri5a_ali_nodup
treedir=exp/chain/tri6_tree_11000
lang=data/lang_chain
# The iVector-extraction and feature-dumping parts are the same as the standard
# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
# run those things.
local/nnet3/run_ivector_common.sh --stage $stage \
--speed-perturb true \
--generate-alignments false || exit 1;
if [ $stage -le 9 ]; then
# Get the alignments as lattices (gives the chain training more freedom).
# use the same num-jobs as the alignments
nj=$(cat $ali_dir/num_jobs) || exit 1;
steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
data/lang exp/tri5a exp/tri5a_lats_nodup_sp
rm exp/tri5a_lats_nodup_sp/fsts.*.gz # save space
fi
if [ $stage -le 10 ]; then
# Create a version of the lang/ directory that has one state per phone in the
# topo file. [note, it really has two states.. the first one is only repeated
# once, the second one has zero or more repeats.]
rm -rf $lang
cp -r data/lang $lang
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
# Use our special topology... note that later on we may have to tune this
# topology.
steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
fi
if [ $stage -le 11 ]; then
# Build a tree using our new topology.
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
--leftmost-questions-truncate -1 \
--cmd "$train_cmd" 11000 data/$train_set $lang $ali_dir $treedir
fi
if [ $stage -le 12 ]; then
echo "$0: creating neural net configs";
# create the config files for nnet initialization
steps/nnet3/tdnn/make_configs.py \
--self-repair-scale 0.00001 \
--feat-dir data/${train_set}_hires \
--ivector-dir exp/nnet3/ivectors_${train_set} \
--tree-dir $treedir \
--relu-dim 725 \
--splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \
--use-presoftmax-prior-scale false \
--xent-regularize 0.1 \
--xent-separate-forward-affine true \
--include-log-softmax false \
--final-layer-normalize-target 0.5 \
$dir/configs || exit 1;
fi
if [ $stage -le 13 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
fi
mkdir -p $dir/egs
touch $dir/egs/.nodelete # keep egs around when that run dies.
steps/nnet3/chain/train.py --stage $train_stage \
--egs.dir "$common_egs_dir" \
--cmd "$decode_cmd" \
--feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
--feat.cmvn-opts "--norm-means=false --norm-vars=false" \
--chain.xent-regularize 0.1 \
--chain.leaky-hmm-coefficient 0.1 \
--chain.l2-regularize 0.00005 \
--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--egs.stage $get_egs_stage \
--egs.opts "--frames-overlap-per-eg 0" \
--egs.chunk-width 150 \
--trainer.num-chunk-per-minibatch 128 \
--trainer.frames-per-iter 1500000 \
--trainer.num-epochs $num_epochs \
--trainer.optimization.num-jobs-initial 3 \
--trainer.optimization.num-jobs-final 16 \
--trainer.optimization.initial-effective-lrate 0.001 \
--trainer.optimization.final-effective-lrate 0.0001 \
--trainer.max-param-change 2.0 \
--cleanup.remove-egs $remove_egs \
--feat-dir data/${train_set}_hires \
--tree-dir $treedir \
--lat-dir exp/tri5a_lats_nodup_sp \
--dir $dir || exit 1;
fi
if [ $stage -le 14 ]; then
# Note: it might appear that this $lang directory is mismatched, and it is as
# far as the 'topo' is concerned, but this script doesn't read the 'topo' from
# the lang directory.
utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg
fi
decode_suff=fsh_sw1_tg
graph_dir=$dir/graph_fsh_sw1_tg
if [ $stage -le 15 ]; then
iter_opts=
if [ ! -z $decode_iter ]; then
iter_opts=" --iter $decode_iter "
fi
for decode_set in eval2000 rt03; do
(
num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l`
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--nj $num_jobs --cmd "$decode_cmd" $iter_opts \
--online-ivector-dir exp/nnet3/ivectors_${decode_set} \
$graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1;
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \
$dir/decode_${decode_set}${decode_iter:+_$decode_iter}_fsh_sw1_{tg,fg} || exit 1;
) &
done
fi
wait;
exit 0;

View File

@@ -0,0 +1,141 @@
#!/bin/bash
. ./cmd.sh
set -e
stage=1
train_stage=-10
generate_alignments=true # false if doing chain training
speed_perturb=true
. ./path.sh
. ./utils/parse_options.sh
# perturbed data preparation
train_set=train_nodup
if [ "$speed_perturb" == "true" ]; then
if [ $stage -le 1 ]; then
# Although the nnet will be trained on high-resolution data, we still have to perturb the normal-resolution data to get the alignments
# _sp stands for speed-perturbed
for datadir in train_nodup; do
utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1
utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2
utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2
utils/validate_data_dir.sh --no-feats data/${datadir}_tmp
rm -r data/temp1 data/temp2
mfccdir=mfcc_perturbed
steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \
data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1;
utils/fix_data_dir.sh data/${datadir}_tmp
utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0
utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0
utils/fix_data_dir.sh data/${datadir}_sp
rm -r data/temp0 data/${datadir}_tmp
done
fi
if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then
#obtain the alignment of the perturbed data
steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
data/train_nodup_sp data/lang_nosp exp/tri5a exp/tri5a_ali_nodup_sp || exit 1
fi
train_set=train_nodup_sp
fi
if [ $stage -le 3 ]; then
mfccdir=mfcc_hires
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
date=$(date +'%m_%d_%H_%M')
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_swbd-$date/s5b/$mfccdir/storage $mfccdir/storage
fi
# the 100k_nodup directory is copied separately, as
# we want to use exp/tri1b_ali_100k_nodup for LDA+MLLT training;
# the main train directory might be speed-perturbed
for dataset in $train_set train_100k_nodup; do
utils/copy_data_dir.sh data/$dataset data/${dataset}_hires
# scale the waveforms, this is useful as we don't use CMVN
data_dir=data/${dataset}_hires
cat $data_dir/wav.scp | python -c "
import sys, os, subprocess, re, random
scale_low = 1.0/8
scale_high = 2.0
for line in sys.stdin.readlines():
  if len(line.strip()) == 0:
    continue
  print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high))
"| sort -k1,1 -u > $data_dir/wav.scp_scaled || exit 1;
mv $data_dir/wav.scp_scaled $data_dir/wav.scp
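# (Each resulting wav.scp entry is the original entry -- a piped command in
# this setup -- with "sox --vol <factor> -t wav - -t wav - |" appended, so sox
# reads the original pipe's output on stdin and writes a volume-scaled wav to
# stdout; the factor is drawn uniformly from [1/8, 2].)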
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir;
steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir;
# Remove the small number of utterances that couldn't be extracted for some
# reason (e.g. too short; no such file).
utils/fix_data_dir.sh data/${dataset}_hires;
done
for dataset in eval2000 rt03; do
# Create MFCCs for the eval set
utils/copy_data_dir.sh data/$dataset data/${dataset}_hires
steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \
data/${dataset}_hires exp/make_hires/$dataset $mfccdir;
steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir;
utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems
done
# Take the first 30k utterances (about 1/8th of the data); this will be used
# for the diag-UBM training
utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires
local/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires # 33hr
fi
# ivector extractor training
if [ $stage -le 5 ]; then
# We need to build a small system just because we need the LDA+MLLT transform
# to train the diag-UBM on top of. We use --num-iters 13 because after we get
# the transform (12th iter is the last), any further training is pointless.
# this decision is based on fisher_english
steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
--splice-opts "--left-context=3 --right-context=3" \
5500 90000 data/train_100k_nodup_hires \
data/lang_nosp exp/tri1b_ali exp/nnet3/tri2b
fi
if [ $stage -le 6 ]; then
# To train a diagonal UBM we don't need very much data, so use the smallest subset.
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \
data/${train_set}_30k_nodup_hires 512 exp/nnet3/tri2b exp/nnet3/diag_ubm
fi
if [ $stage -le 7 ]; then
# iVector extractors can be sensitive to the amount of data, but this one has a
# fairly small dim (defaults to 100) so we don't use all of it; we use just the
# 100k subset (just under half the data).
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
data/train_100k_nodup_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1;
fi
if [ $stage -le 8 ]; then
# We extract iVectors on all the train_nodup data, which will be what we
# train the system on.
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
data/${train_set}_max2_hires exp/nnet3/extractor exp/nnet3/ivectors_$train_set || exit 1;
for data_set in eval2000 rt03; do
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
data/${data_set}_hires exp/nnet3/extractor exp/nnet3/ivectors_$data_set || exit 1;
done
fi
exit 0;

View File

@@ -0,0 +1,158 @@
#!/bin/bash
# Copyright 2015 Johns Hopkins University (Author: Daniel Povey).
# 2015 Vijayaditya Peddinti
# 2015 Xingyu Na
# 2015 Pegah Ghahrmani
# Apache 2.0.
# this is a basic lstm script
# LSTM script runs for more epochs than the TDNN script
# and each epoch takes twice the time
# At this script level we don't support not running on GPU, as it would be painfully slow.
# If you want to run without GPU you'd have to pass --use-gpu false to the steps/nnet3/train_rnn.py call below.
stage=0
train_stage=-10
affix=
common_egs_dir=
reporting_email=
# LSTM options
splice_indexes="-2,-1,0,1,2 0 0"
lstm_delay=" -1 -2 -3 "
label_delay=5
num_lstm_layers=3
cell_dim=1024
hidden_dim=1024
recurrent_projection_dim=256
non_recurrent_projection_dim=256
chunk_width=20
chunk_left_context=40
chunk_right_context=0
# training options
num_epochs=8
initial_effective_lrate=0.0003
final_effective_lrate=0.00003
num_jobs_initial=3
num_jobs_final=15
momentum=0.5
num_chunk_per_minibatch=100
samples_per_iter=20000
remove_egs=true
#decode options
extra_left_context=
extra_right_context=
frames_per_chunk=
# End configuration section.
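# Example (from the RESULTS file added in this commit): the bidirectional
# LSTM result was obtained by overriding the defaults above roughly as follows:
#   local/nnet3/run_lstm.sh --affix bidirectional --lstm-delay " [-1,1] [-2,2] [-3,3] " \
#     --label-delay 0 --cell-dim 1024 --recurrent-projection-dim 128 \
#     --non-recurrent-projection-dim 128 --chunk-left-context 40 --chunk-right-context 40 \
#     --extra-left-context 50 --extra-right-context 50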
echo "$0 $@" # Print the command line for logging
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
dir=exp/nnet3/lstm
dir=$dir${affix:+_$affix}
if [ $label_delay -gt 0 ]; then dir=${dir}_ld$label_delay; fi
train_set=train_nodup_sp
ali_dir=exp/tri5a_ali_nodup_sp
local/nnet3/run_ivector_common.sh --stage $stage \
--speed-perturb true || exit 1;
if [ $stage -le 9 ]; then
echo "$0: creating neural net configs";
config_extra_opts=()
[ ! -z "$lstm_delay" ] && config_extra_opts+=(--lstm-delay "$lstm_delay")
steps/nnet3/lstm/make_configs.py "${config_extra_opts[@]}" \
--feat-dir data/${train_set}_hires \
--ivector-dir exp/nnet3/ivectors_${train_set} \
--ali-dir $ali_dir \
--num-lstm-layers $num_lstm_layers \
--splice-indexes "$splice_indexes " \
--cell-dim $cell_dim \
--hidden-dim $hidden_dim \
--recurrent-projection-dim $recurrent_projection_dim \
--non-recurrent-projection-dim $non_recurrent_projection_dim \
--label-delay $label_delay \
--self-repair-scale 0.00001 \
$dir/configs || exit 1;
fi
if [ $stage -le 10 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{3,4,5,6}/$USER/kaldi-data/egs/fisher_swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
fi
steps/nnet3/train_rnn.py --stage=$train_stage \
--cmd="$decode_cmd" \
--feat.online-ivector-dir=exp/nnet3/ivectors_${train_set} \
--feat.cmvn-opts="--norm-means=false --norm-vars=false" \
--trainer.num-epochs=$num_epochs \
--trainer.samples-per-iter=$samples_per_iter \
--trainer.optimization.num-jobs-initial=$num_jobs_initial \
--trainer.optimization.num-jobs-final=$num_jobs_final \
--trainer.optimization.initial-effective-lrate=$initial_effective_lrate \
--trainer.optimization.final-effective-lrate=$final_effective_lrate \
--trainer.optimization.shrink-value 0.99 \
--trainer.rnn.num-chunk-per-minibatch=$num_chunk_per_minibatch \
--trainer.optimization.momentum=$momentum \
--egs.chunk-width=$chunk_width \
--egs.chunk-left-context=$chunk_left_context \
--egs.chunk-right-context=$chunk_right_context \
--egs.dir="$common_egs_dir" \
--cleanup.remove-egs=$remove_egs \
--cleanup.preserve-model-interval=500 \
--use-gpu=true \
--feat-dir=data/${train_set}_hires \
--ali-dir=$ali_dir \
--lang=data/lang \
--reporting.email="$reporting_email" \
--dir=$dir || exit 1;
fi
graph_dir=exp/tri5a/graph_sw1_tg
if [ $stage -le 11 ]; then
if [ -z $extra_left_context ]; then
extra_left_context=$chunk_left_context
fi
if [ -z $extra_right_context ]; then
extra_right_context=$chunk_right_context
fi
if [ -z $frames_per_chunk ]; then
frames_per_chunk=$chunk_width
fi
for decode_set in eval2000 rt03; do
(
num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l`
steps/nnet3/lstm/decode.sh --nj $num_jobs --cmd "$decode_cmd" \
--extra-left-context $extra_left_context \
--extra-right-context $extra_right_context \
--frames-per-chunk "$frames_per_chunk" \
--online-ivector-dir exp/nnet3/ivectors_${decode_set} \
$graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_fsh_sw1_tg || exit 1;
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \
$dir/decode_${decode_set}_fsh_sw1_{tg,fg} || exit 1;
) &
done
fi
wait;
exit 0;

View File

@@ -0,0 +1,99 @@
#!/bin/bash
# this is the standard "tdnn" system, built in nnet3; it's what we used to
# call multi-splice.
. ./cmd.sh
# At this script level we don't support not running on GPU, as it would be painfully slow.
# If you want to run without GPU you'd have to pass --use-gpu false to the
# steps/nnet3/train_dnn.py call below.
stage=0
affix=
train_stage=-10
common_egs_dir=
reporting_email=
remove_egs=true
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
dir=exp/nnet3/tdnn
dir=$dir${affix:+_$affix}
train_set=train_nodup_sp
ali_dir=exp/tri5a_ali_nodup_sp
local/nnet3/run_ivector_common.sh --stage $stage \
--speed-perturb true || exit 1;
if [ $stage -le 9 ]; then
echo "$0: creating neural net configs";
# create the config files for nnet initialization
python steps/nnet3/tdnn/make_configs.py \
--feat-dir data/${train_set}_hires \
--ivector-dir exp/nnet3/ivectors_${train_set} \
--ali-dir $ali_dir \
--relu-dim 1024 \
--splice-indexes "-2,-1,0,1,2 -1,2 -3,3 -3,3 -7,2 0" \
--use-presoftmax-prior-scale true \
$dir/configs || exit 1;
fi
if [ $stage -le 10 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{3,4,5,6}/$USER/kaldi-data/egs/fisher_swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
fi
steps/nnet3/train_dnn.py --stage=$train_stage \
--cmd="$decode_cmd" \
--feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
--feat.cmvn-opts="--norm-means=false --norm-vars=false" \
--trainer.num-epochs 4 \
--trainer.optimization.num-jobs-initial 3 \
--trainer.optimization.num-jobs-final 16 \
--trainer.optimization.initial-effective-lrate 0.0017 \
--trainer.optimization.final-effective-lrate 0.00017 \
--egs.dir "$common_egs_dir" \
--cleanup.remove-egs $remove_egs \
--cleanup.preserve-model-interval 500 \
--use-gpu true \
--feat-dir=data/${train_set}_hires \
--ali-dir $ali_dir \
--lang data/lang \
--reporting.email="$reporting_email" \
--dir=$dir || exit 1;
fi
graph_dir=exp/tri5a/graph_fsh_sw1_tg
if [ $stage -le 11 ]; then
for decode_set in eval2000 rt03; do
(
num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l`
steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \
--online-ivector-dir exp/nnet3/ivectors_${decode_set} \
$graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_fsh_sw1_tg || exit 1;
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \
$dir/decode_${decode_set}_fsh_sw1_{tg,fg} || exit 1;
) &
done
fi
wait;
exit 0;

View File

@@ -0,0 +1,188 @@
#!/bin/bash
set -o pipefail
# this is run_tdnn_discriminative.sh
# This script does discriminative training on top of the cross-entropy (CE) nnet3 system.
# note: this relies on having a cluster that has plenty of CPUs as well as GPUs,
# since the lattice generation runs in about real time and takes on the order of
# 1000 hours of CPU time.
#
. ./cmd.sh
stage=0
train_stage=-10 # can be used to start training in the middle.
get_egs_stage=-10
use_gpu=true # for training
cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats,
# alignments and degs).
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
srcdir=exp/nnet3/tdnn
train_data_dir=data/train_nodup_sp_hires
online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp
degs_dir= # If provided, will skip the degs directory creation
lats_dir= # If provided, will skip denlats creation
## Objective options
criterion=smbr
one_silence_class=true
dir=${srcdir}_${criterion}
## Egs options
frames_per_eg=150
frames_overlap_per_eg=30
truncate_deriv_weights=10
## Nnet training options
effective_learning_rate=0.0000125
max_param_change=1
num_jobs_nnet=4
num_epochs=4
regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options
minibatch_size=64
adjust_priors=true # May need to be set to false
# because it does not help in some setups
modify_learning_rates=true
last_layer_factor=0.1
## Decode options
decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more.
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
EOF
fi
num_threads=1
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
fi
if [ ! -f ${srcdir}/final.mdl ]; then
echo "$0: expected ${srcdir}/final.mdl to exist; first run run_tdnn.sh or run_lstm.sh"
exit 1;
fi
if [ $stage -le 1 ]; then
# hardcode no-GPU for alignment, although you could use GPU [you wouldn't
# get excellent GPU utilization though.]
nj=100 # have a high number of jobs because this could take a while, and we might
# have some stragglers.
steps/nnet3/align.sh --cmd "$decode_cmd" --use-gpu false \
--online-ivector-dir $online_ivector_dir \
--nj $nj $train_data_dir data/lang $srcdir ${srcdir}_ali ;
fi
if [ -z "$lats_dir" ]; then
lats_dir=${srcdir}_denlats
if [ $stage -le 2 ]; then
nj=100
# this doesn't really affect anything strongly, except the num-jobs for one of
# the phases of get_egs_discriminative.sh below.
num_threads_denlats=6
subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving
# total slots = 80 * 6 = 480).
steps/nnet3/make_denlats.sh --cmd "$decode_cmd" --determinize true \
--online-ivector-dir $online_ivector_dir \
--nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
$train_data_dir data/lang $srcdir ${lats_dir} ;
fi
fi
model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'`
model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'`
left_context=$[model_left_context + extra_left_context]
right_context=$[model_right_context + extra_right_context]
valid_left_context=$[valid_left_context + frames_per_eg]
valid_right_context=$[valid_right_context + frames_per_eg]
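# Note: extra_left_context/extra_right_context and valid_left_context/
# valid_right_context are not set earlier in this script, so the bash
# arithmetic above treats them as 0, i.e. left/right_context default to the
# model context and valid_left/right_context default to frames_per_eg; set
# them explicitly above this point if other values are wanted.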
frame_subsampling_opt=
if [ -f $srcdir/frame_subsampling_factor ]; then
frame_subsampling_opt="--frame-subsampling-factor $(cat $srcdir/frame_subsampling_factor)"
fi
cmvn_opts=`cat $srcdir/cmvn_opts`
if [ -z "$degs_dir" ]; then
degs_dir=${srcdir}_degs
if [ $stage -le 3 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${srcdir}_degs/storage ]; then
utils/create_split_dir.pl \
/export/b0{1,2,12,13}/$USER/kaldi-data/egs/fisher_swbd-$(date +'%m_%d_%H_%M')/s5/${srcdir}_degs/storage ${srcdir}_degs/storage
fi
# have a higher maximum num-jobs if the degs directory is spread over multiple filesystems (i.e. if ${srcdir}_degs/storage exists)
if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi
degs_opts="--determinize true --minimize true --remove-output-symbols true --remove-epsilons true --collapse-transition-ids true"
steps/nnet3/get_egs_discriminative.sh \
--cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \
--adjust-priors $adjust_priors \
--online-ivector-dir $online_ivector_dir \
--left-context $left_context --right-context $right_context \
--valid-left-context $valid_left_context --valid-right-context $valid_right_context \
--priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \
--frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \
$train_data_dir data/lang ${srcdir}_ali $lats_dir $srcdir/final.mdl $degs_dir ;
fi
fi
if [ $stage -le 4 ]; then
steps/nnet3/train_discriminative.sh --cmd "$decode_cmd" \
--stage $train_stage \
--effective-lrate $effective_learning_rate --max-param-change $max_param_change \
--criterion $criterion --drop-frames true \
--num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \
--num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \
--regularization-opts "$regularization_opts" \
--truncate-deriv-weights $truncate_deriv_weights --adjust-priors $adjust_priors \
--modify-learning-rates $modify_learning_rates --last-layer-factor $last_layer_factor \
${degs_dir} $dir
fi
graph_dir=exp/tri5a/graph_fsh_sw1_tg
if [ $stage -le 5 ]; then
for x in `seq $decode_start_epoch $num_epochs`; do
for decode_set in eval2000 rt03; do
(
num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l`
iter=epoch$x.adj
steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \
--online-ivector-dir exp/nnet3/ivectors_${decode_set} \
$graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_fsh_sw1_tg_$iter ;
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \
$dir/decode_${decode_set}_fsh_sw1_{tg,fg}_$iter ;
) &
done
done
fi
wait;
if [ $stage -le 6 ] && $cleanup; then
# if you run with "--cleanup true --stage 6" you can clean up.
rm ${lats_dir}/lat.*.gz || true
rm ${srcdir}_ali/ali.*.gz || true
steps/nnet2/remove_egs.sh ${srcdir}_degs || true
fi
exit 0;