some new tuning experiments on chain+swbd setup; add --xent-separate-forward-affine option to make_jesus_configs.py; some cleanup in librispeech/s5/run.sh

Daniel Povey 2016-02-22 16:47:19 -05:00
Parent 71b30095e2
Commit 3e73f67d29
9 changed files with 622 additions and 108 deletions

View file

@@ -2,7 +2,7 @@
# Set this to somewhere where you want to put your data, or where
# someone else has already put it. You'll want to change this
# if you're not on the CLSP grid.
data=/export/a15/vpanayotov/data
@@ -10,8 +10,8 @@ data=/export/a15/vpanayotov/data
data_url=www.openslr.org/resources/12
lm_url=www.openslr.org/resources/11
. cmd.sh
. path.sh
. ./cmd.sh
. ./path.sh
# you might not want to do this for interactive shells.
set -e
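# Note on the cleanup in this commit: with 'set -e' in effect, any command
# that exits with nonzero status already aborts the script, so the explicit
# '|| exit 1' suffixes being removed below were largely redundant.  A minimal
# illustration (hypothetical commands, not part of run.sh):
#   set -e
#   false              # the script stops here with status 1
#   echo "not reached"
# One caveat: a failure inside the backgrounded '( ... )&' decoding subshells
# further down stops that subshell, but not the main script.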
@@ -24,12 +24,12 @@ for part in dev-clean test-clean dev-other test-other train-clean-100; do
done
# download the LM resources
local/download_lm.sh $lm_url data/local/lm || exit 1
local/download_lm.sh $lm_url data/local/lm
# format the data as Kaldi data directories
for part in dev-clean test-clean dev-other test-other train-clean-100; do
# use underscore-separated names in data directories.
local/data_prep.sh $data/LibriSpeech/$part data/$(echo $part | sed s/-/_/g) || exit 1
local/data_prep.sh $data/LibriSpeech/$part data/$(echo $part | sed s/-/_/g)
done
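# (so, e.g., the "dev-clean" download ends up in data/dev_clean, "test-other"
# in data/test_other, and so on)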
## Optional text corpus normalization and LM training
@@ -39,7 +39,7 @@ done
## well as some intermediate data(e.g. the normalized text used for LM training),
## are available for download at http://www.openslr.org/11/
#local/lm/train_lm.sh $LM_CORPUS_ROOT \
# data/local/lm/norm/tmp data/local/lm/norm/norm_texts data/local/lm || exit 1
# data/local/lm/norm/tmp data/local/lm/norm/norm_texts data/local/lm
## Optional G2P training scripts.
## As the LM training scripts above, this script is intended primarily to
@@ -49,24 +49,24 @@ done
# when "--stage 3" option is used below we skip the G2P steps, and use the
# lexicon we have already downloaded from openslr.org/11/
local/prepare_dict.sh --stage 3 --nj 30 --cmd "$train_cmd" \
data/local/lm data/local/lm data/local/dict_nosp || exit 1
data/local/lm data/local/lm data/local/dict_nosp
utils/prepare_lang.sh data/local/dict_nosp \
"<SPOKEN_NOISE>" data/local/lang_tmp_nosp data/lang_nosp || exit 1;
"<SPOKEN_NOISE>" data/local/lang_tmp_nosp data/lang_nosp
local/format_lms.sh --src-dir data/lang_nosp data/local/lm || exit 1
local/format_lms.sh --src-dir data/lang_nosp data/local/lm
# Create ConstArpaLm format language model for full 3-gram and 4-gram LMs
utils/build_const_arpa_lm.sh data/local/lm/lm_tglarge.arpa.gz \
data/lang_nosp data/lang_nosp_test_tglarge || exit 1;
data/lang_nosp data/lang_nosp_test_tglarge
utils/build_const_arpa_lm.sh data/local/lm/lm_fglarge.arpa.gz \
data/lang_nosp data/lang_nosp_test_fglarge || exit 1;
data/lang_nosp data/lang_nosp_test_fglarge
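# (these ConstArpaLm directories are what steps/lmrescore_const_arpa.sh uses
# further down to rescore lattices with the large 3-gram and 4-gram LMs,
# which are too big to compose into decoding graphs directly)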
mfccdir=mfcc
# spread the mfccs over various machines, as this data-set is quite large.
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
mfcc=$(basename $mfccdir) # in case was absolute pathname (unlikely), get basename.
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \
utils/create_split_dir.pl /export/b{02,11,12,13}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \
$mfccdir/storage
fi
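# A rough sketch of what the storage-spreading above does (the paths are
# CLSP-grid-specific): utils/create_split_dir.pl creates real directories on
# the listed filesystems and puts numbered symlinks to them under
# $mfccdir/storage, e.g. something like
#   mfcc/storage/1 -> /export/b02/$USER/kaldi-data/egs/librispeech/s5/mfcc/storage/1
#   mfcc/storage/2 -> /export/b11/$USER/kaldi-data/egs/librispeech/s5/mfcc/storage/2
# (illustrative layout only); steps/make_mfcc.sh then spreads its feature
# archives across those links instead of writing everything to one disk.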
@@ -87,15 +87,15 @@ utils/subset_data_dir.sh data/train_clean_100 10000 data/train_10k
# train a monophone system
steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \
data/train_2kshort data/lang_nosp exp/mono || exit 1;
data/train_2kshort data/lang_nosp exp/mono
# decode using the monophone model
(
utils/mkgraph.sh --mono data/lang_nosp_test_tgsmall \
exp/mono exp/mono/graph_nosp_tgsmall || exit 1
exp/mono exp/mono/graph_nosp_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/mono/graph_nosp_tgsmall \
data/$test exp/mono/decode_nosp_tgsmall_$test || exit 1
data/$test exp/mono/decode_nosp_tgsmall_$test
done
)&
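# The decoding above runs in a backgrounded subshell, '( ... )&', so that the
# next alignment/training stage can start right away; its results show up
# later under exp/mono/decode_nosp_tgsmall_{test_clean,test_other,dev_clean,dev_other}.
# The same pattern is repeated after each model trained below.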
@@ -104,97 +104,97 @@ steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
# train a first delta + delta-delta triphone system on a subset of 5000 utterances
steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1 || exit 1;
2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1
# decode using the tri1 model
(
utils/mkgraph.sh data/lang_nosp_test_tgsmall \
exp/tri1 exp/tri1/graph_nosp_tgsmall || exit 1;
exp/tri1 exp/tri1/graph_nosp_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri1/graph_nosp_tgsmall \
data/$test exp/tri1/decode_nosp_tgsmall_$test || exit 1;
data/$test exp/tri1/decode_nosp_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
data/$test exp/tri1/decode_nosp_{tgsmall,tgmed}_$test || exit 1;
data/$test exp/tri1/decode_nosp_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
data/$test exp/tri1/decode_nosp_{tgsmall,tglarge}_$test || exit 1;
data/$test exp/tri1/decode_nosp_{tgsmall,tglarge}_$test
done
)&
steps/align_si.sh --nj 10 --cmd "$train_cmd" \
data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k || exit 1;
data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k
# train an LDA+MLLT system.
steps/train_lda_mllt.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" 2500 15000 \
data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b || exit 1;
data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b
# decode using the LDA+MLLT model
(
utils/mkgraph.sh data/lang_nosp_test_tgsmall \
exp/tri2b exp/tri2b/graph_nosp_tgsmall || exit 1;
exp/tri2b exp/tri2b/graph_nosp_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgsmall \
data/$test exp/tri2b/decode_nosp_tgsmall_$test || exit 1;
data/$test exp/tri2b/decode_nosp_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test || exit 1;
data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test || exit 1;
data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test
done
)&
# Align a 10k utts subset using the tri2b model
steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \
data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k || exit 1;
data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k
# Train tri3b, which is LDA+MLLT+SAT on 10k utts
steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \
data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b || exit 1;
data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b
# decode using the tri3b model
(
utils/mkgraph.sh data/lang_nosp_test_tgsmall \
exp/tri3b exp/tri3b/graph_nosp_tgsmall || exit 1;
exp/tri3b exp/tri3b/graph_nosp_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
exp/tri3b/graph_nosp_tgsmall data/$test \
exp/tri3b/decode_nosp_tgsmall_$test || exit 1;
exp/tri3b/decode_nosp_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test || exit 1;
data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test || exit 1;
data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test
done
)&
# align the entire train_clean_100 subset using the tri3b model
steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \
data/train_clean_100 data/lang_nosp \
exp/tri3b exp/tri3b_ali_clean_100 || exit 1;
exp/tri3b exp/tri3b_ali_clean_100
# train another LDA+MLLT+SAT system on the entire 100 hour subset
steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
data/train_clean_100 data/lang_nosp \
exp/tri3b_ali_clean_100 exp/tri4b || exit 1;
exp/tri3b_ali_clean_100 exp/tri4b
# decode using the tri4b model
(
utils/mkgraph.sh data/lang_nosp_test_tgsmall \
exp/tri4b exp/tri4b/graph_nosp_tgsmall || exit 1;
exp/tri4b exp/tri4b/graph_nosp_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
exp/tri4b/graph_nosp_tgsmall data/$test \
exp/tri4b/decode_nosp_tgsmall_$test || exit 1;
exp/tri4b/decode_nosp_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
data/$test exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test || exit 1;
data/$test exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test || exit 1;
data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,fglarge} \
data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test || exit 1;
data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test
done
)&
@@ -205,125 +205,125 @@ steps/get_prons.sh --cmd "$train_cmd" \
utils/dict_dir_add_pronprobs.sh --max-normalize true \
data/local/dict_nosp \
exp/tri4b/pron_counts_nowb.txt exp/tri4b/sil_counts_nowb.txt \
exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict || exit 1
exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict
utils/prepare_lang.sh data/local/dict \
"<SPOKEN_NOISE>" data/local/lang_tmp data/lang
local/format_lms.sh --src-dir data/lang data/local/lm
utils/build_const_arpa_lm.sh \
data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge || exit 1;
data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge
utils/build_const_arpa_lm.sh \
data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge || exit 1;
data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge
# decode using the tri4b model with pronunciation and silence probabilities
(
utils/mkgraph.sh \
data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall || exit 1;
data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
exp/tri4b/graph_tgsmall data/$test \
exp/tri4b/decode_tgsmall_$test || exit 1;
exp/tri4b/decode_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test || exit 1;
data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test || exit 1;
data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test || exit 1;
data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test
done
)&
# align train_clean_100 using the tri4b model
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100 || exit 1;
data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100
# if you want at this point you can train and test NN model(s) on the 100 hour
# subset
local/nnet2/run_5a_clean_100.sh || exit 1
local/nnet2/run_5a_clean_100.sh
local/download_and_untar.sh $data $data_url train-clean-360 || exit 1;
local/download_and_untar.sh $data $data_url train-clean-360
# now add the "clean-360" subset to the mix ...
local/data_prep.sh \
$data/LibriSpeech/train-clean-360 data/train_clean_360 || exit 1
$data/LibriSpeech/train-clean-360 data/train_clean_360
steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_clean_360 \
exp/make_mfcc/train_clean_360 $mfccdir || exit 1
exp/make_mfcc/train_clean_360 $mfccdir
steps/compute_cmvn_stats.sh \
data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir || exit 1
data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir
# ... and then combine the two sets into a 460 hour one
utils/combine_data.sh \
data/train_clean_460 data/train_clean_100 data/train_clean_360 || exit 1
data/train_clean_460 data/train_clean_100 data/train_clean_360
# align the new, combined set, using the tri4b model
steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460 || exit 1;
data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460
# create a larger SAT model, trained on the 460 hours of data.
steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \
data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b || exit 1;
data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b
# decode using the tri5b model
(
utils/mkgraph.sh data/lang_test_tgsmall \
exp/tri5b exp/tri5b/graph_tgsmall || exit 1;
exp/tri5b exp/tri5b/graph_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
exp/tri5b/graph_tgsmall data/$test \
exp/tri5b/decode_tgsmall_$test || exit 1;
exp/tri5b/decode_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test || exit 1;
data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test || exit 1;
data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test || exit 1;
data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test
done
)&
# train a NN model on the 460 hour set
local/nnet2/run_6a_clean_460.sh || exit 1
local/nnet2/run_6a_clean_460.sh
local/download_and_untar.sh $data $data_url train-other-500 || exit 1;
local/download_and_untar.sh $data $data_url train-other-500
# prepare the 500 hour subset.
local/data_prep.sh \
$data/LibriSpeech/train-other-500 data/train_other_500 || exit 1
$data/LibriSpeech/train-other-500 data/train_other_500
steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_other_500 \
exp/make_mfcc/train_other_500 $mfccdir || exit 1
exp/make_mfcc/train_other_500 $mfccdir
steps/compute_cmvn_stats.sh \
data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir || exit 1
data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir
# combine all the data
utils/combine_data.sh \
data/train_960 data/train_clean_460 data/train_other_500 || exit 1
data/train_960 data/train_clean_460 data/train_other_500
steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
data/train_960 data/lang exp/tri5b exp/tri5b_ali_960 || exit 1;
data/train_960 data/lang exp/tri5b exp/tri5b_ali_960
# train a SAT model on the 960 hour mixed data. Use the train_quick.sh script
# as it is faster.
steps/train_quick.sh --cmd "$train_cmd" \
7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b || exit 1;
7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b
# decode using the tri6b model
(
utils/mkgraph.sh data/lang_test_tgsmall \
exp/tri6b exp/tri6b/graph_tgsmall || exit 1;
exp/tri6b exp/tri6b/graph_tgsmall
for test in test_clean test_other dev_clean dev_other; do
steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test || exit 1;
exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test || exit 1;
data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test || exit 1;
data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test
steps/lmrescore_const_arpa.sh \
--cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test || exit 1;
data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test
done
)&
@@ -349,7 +349,7 @@ steps/train_quick.sh --cmd "$train_cmd" \
# train NN models on the entire dataset
local/nnet2/run_7a_960.sh || exit 1
local/nnet2/run_7a_960.sh
# # train models on cleaned-up data
# # we've found that this isn't helpful-- see the comments in local/run_data_cleaning.sh

View file

@@ -17,5 +17,6 @@ ones to look at right now:
5v is what I am currently using as a baseline - it has an even smaller
--jesus-hidden-dim than 5t (hence faster to train), but gives the same
performance.
6g is a setup with a 'thinner' jesus-layer (with only one repeated-affine component)
and slightly more parameters, which is quicker to train than 5v but gives
about the same results. I'm hoping to use this setup, going forward.

View file

@@ -1,6 +1,6 @@
#!/bin/bash
# _5w is as _5x but decreasing the context of the averaging layer from +-0.99
# _5x is as _5w but decreasing the context of the averaging layer from +-0.99
# seconds to +-0.66 seconds. I would not have expected this to work a priori,
# but the change from 5k -> 5l, which made the context wider, made WERs slightly
# worse, so I'd like to see what happens when we decrease the context.

View file

@@ -1,15 +1,17 @@
#!/bin/bash
# _5z is as _5v, but adding skip-splicing (a new configuration option)
# It seems definitely not helpful. I'll remove the option soon.
#local/chain/compare_wer.sh 5v 5z
#System 5v 5z
#WER on train_dev(tg) 15.38 15.60
#WER on train_dev(fg) 14.39 14.50
#WER on eval2000(tg) 17.4 17.6
#WER on eval2000(fg) 15.7 15.9
#Final train prob -0.11156 -0.113823
#Final valid prob -0.131797 -0.131356
# It seems not helpful. I'll remove the option soon.
# note: 5v2 is a rerun of 5v.
# local/chain/compare_wer.sh 5v 5v2 5z
# System 5v 5v2 5z
# WER on train_dev(tg) 15.38 15.74 15.60
# WER on train_dev(fg) 14.39 14.50 14.50
# WER on eval2000(tg) 17.4 17.5 17.6
# WER on eval2000(fg) 15.7 15.9 15.9
# Final train prob -0.11156 -0.112155 -0.113823
# Final valid prob -0.131797 -0.129516 -0.131356
# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500.

View file

@@ -3,15 +3,15 @@
# _6c is as _5v but adding "--thick-jesus-layer true" (new option): extra hidden
# layer inside jesus layer.
# Doesn't seem to be helpful.
#local/chain/compare_wer.sh 5v 6c
#System 5v 6c
#WER on train_dev(tg) 15.38 15.54
#WER on train_dev(fg) 14.39 14.55
#WER on eval2000(tg) 17.4 17.5
#WER on eval2000(fg) 15.7 15.8
#Final train prob -0.11156 -0.114084
#Final valid prob -0.131797 -0.129589
# Note: 5v2 is a rerun of 5v.
#local/chain/compare_wer.sh 5v 5v2 6c
#System 5v 5v2 6c
#WER on train_dev(tg) 15.38 15.74 15.54
#WER on train_dev(fg) 14.39 14.50 14.55
#WER on eval2000(tg) 17.4 17.5 17.5
#WER on eval2000(fg) 15.7 15.9 15.8
#Final train prob -0.11156 -0.112155 -0.114084
#Final valid prob -0.131797 -0.129516 -0.129589
# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500.

View file

@@ -4,15 +4,16 @@
# this means (after rounding) that we have 6, not 5, as
# --jesus-forward-input-dim / --num-jesus-blocks.
#a bit worse.
#a03:s5c: local/chain/compare_wer.sh 5v 6d
#System 5v 6d
#WER on train_dev(tg) 15.38 15.66
#WER on train_dev(fg) 14.39 14.54
#WER on eval2000(tg) 17.4 17.5
#WER on eval2000(fg) 15.7 15.8
#Final train prob -0.11156 -0.112034
#Final valid prob -0.131797 -0.131714
# no clear difference.
#[note, 5v2 is a rerun of 5v].
# local/chain/compare_wer.sh 5v 5v2 6d
# System 5v 5v2 6d
# WER on train_dev(tg) 15.38 15.74 15.66
# WER on train_dev(fg) 14.39 14.50 14.54
# WER on eval2000(tg) 17.4 17.5 17.5
# WER on eval2000(fg) 15.7 15.9 15.8
# Final train prob -0.11156 -0.112155 -0.112034
# Final valid prob -0.131797 -0.129516 -0.131714
# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500.

View file

@@ -3,6 +3,17 @@
# _6g is as _6f but increasing the parameters (increasing
# jesus-forward-input-dim from 500 to 600).
# seems better than 6f, and about the same as (5v,5v2). encouraging.
# note, 5v2 is a rerun of 5v.
#local/chain/compare_wer.sh 5v 5v2 6f 6g
#System 5v 5v2 6f 6g
#WER on train_dev(tg) 15.38 15.74 15.71 15.50
#WER on train_dev(fg) 14.39 14.50 14.50 14.31
#WER on eval2000(tg) 17.4 17.5 17.5 17.5
#WER on eval2000(fg) 15.7 15.9 15.9 15.8
#Final train prob -0.11156 -0.112155 -0.111305 -0.105853
#Final valid prob -0.131797 -0.129516 -0.131487 -0.129997
# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change
# means there is no hidden part in the jesus layer (it's just repeated affine and relu).

View file

@@ -0,0 +1,483 @@
#!/bin/bash
# _6h is as _6g but adding --xent-separate-forward-affine=true, which
# gives a separate last-but-one weight matrix to the xent output.
# _6g is as _6f but increasing the parameters (increasing
# jesus-forward-input-dim from 500 to 600).
# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change
# means there is no hidden part in the jesus layer (it's just repeated affine and relu).
# slightly worse, but encouragingly small difference.
#local/chain/compare_wer.sh 5v 6f
#System 5v 6f
#WER on train_dev(tg) 15.38 15.71
#WER on train_dev(fg) 14.39 14.50
#WER on eval2000(tg) 17.4 17.5
#WER on eval2000(fg) 15.7 15.9
#Final train prob -0.11156 -0.111305
#Final valid prob -0.131797 -0.131487
# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500.
# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse.
#
#local/chain/compare_wer.sh 5e 5s 5t 5v
#System 5e 5s 5t 5v
#WER on train_dev(tg) 15.43 15.47 15.43 15.38
#WER on train_dev(fg) 14.32 14.31 14.34 14.39
#WER on eval2000(tg) 17.3 17.4 17.4 17.4
#WER on eval2000(fg) 15.5 15.6 15.6 15.7
#Final train prob -0.110056 -0.110928 -0.110752 -0.11156
#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797
# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it
# up), from 5000 to 3500.
# about 5s: comparing with 5e which is the most recent baseline we actually
# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700,
# jesus-hidden-dim reduced 7500 to 5000, and the new option
# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even
# smaller jesus-hidden-dims.
# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate
# value of 1700 (between 1500 and 1800), and also fixing a bug in the self-repair
# code which was doubling the thresholds so there was, in effect,
# no upper threshold. I stopped the p,q,r runs after I found this, but in
# configuring this run I'm bearing in mind the train and valid probs from the
# p,q,r runs.
# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000.
# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try
# to compensate for the fact that more of the output dimensions are now being
# usefully used.
# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair
# ReLUs that are over or under-saturated.
# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on
# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05).
# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen
# in the train and valid probs.
#System 5b 5e
#WER on train_dev(tg) 15.51 15.43
#WER on train_dev(fg) 14.39 14.32
#WER on eval2000(tg) 17.3 17.3
#WER on eval2000(fg) 15.6 15.5
#Final train prob -0.112013 -0.110056
#Final valid prob -0.130879 -0.129184
# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1.
# It does seem helpful on average: (-0.35, -0.35, -0.1, 0).
#./compare_wer.sh 5a 5b
#System 5a 5b
#WER on train_dev(tg) 15.86 15.51
#WER on train_dev(fg) 14.74 14.39
#WER on eval2000(tg) 17.4 17.3
#WER on eval2000(fg) 15.6 15.6
#Final train prob -0.0998359 -0.112013
#Final valid prob -0.115884 -0.130879
# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and
# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization
# will mean that the increased parameters are now helpful.
# quite helpful:
#local/chain/compare_wer.sh 4w 5a
#System 4w 5a
#WER on train_dev(tg) 16.05 15.86
#WER on train_dev(fg) 14.92 14.74
#WER on eval2000(tg) 18.0 17.4
#WER on eval2000(fg) 16.2 15.6
#Final train prob -0.108816 -0.0998359
#Final valid prob -0.118254 -0.115884
# _4w is as _4v, but doubling --xent-regularize to 0.2
# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change
# from 1.0 to 2.0 because there is a lot of parameter change in the final xent
# layer, and this limits the rate of change of the other layers.
# _4r is as _4f, but one more hidden layer, and reducing context of existing
# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly
# from 1500 to 1400.
# This is better than 4f by almost all metrics.
# ./compare_wer.sh 4f 4r
# System 4f 4r
# WER on train_dev(tg) 16.83 16.50
# WER on train_dev(fg) 15.73 15.45
# WER on eval2000(tg) 18.4 18.3
# WER on eval2000(fg) 16.6 16.7
# Final train prob -0.105832 -0.103652
# Final valid prob -0.123021 -0.121105
# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
# It's even better than 4e, by about 0.3% abs.
# 4c 4e 4f
# Final valid prob: -0.1241 -0.1267 -0.1230
# Final train prob: -0.08820 -0.1149 -0.1058
# ./show_wer.sh 4f
# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
# a03:s5c: ./show_wer.sh 4e
# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
# _4e is as _4c, but adding the option --l2-regularize 0.0001.
# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
# _4a is as _3s, but using narrower splice-indexes in the first layer.
# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
# This of course reduces overtraining. Results are a bit better than 3p but still
# not as good as 2y
# ./show_wer.sh 3s
# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
# a03:s5c: ./show_wer.sh 3p
# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
# a03:s5c: ./show_wer.sh 2y
# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
# _3r is as _3p but reducing the number of parameters as it seemed to be
# overtraining (despite already being quite a small model): [600,1800 ->
# 500,1500]. Also in the interim there was a script change to
# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
# with the halving of the minibatch size.]
# _3p is the same as 3o, but after a code and script change so we can use
# natural gradient for the RepeatedAffineComponent.
# [natural gradient was helpful, based on logs;
# also made a change to use positive bias for the jesus-component affine parts.]
# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
# recurrence, with improvements to the learning of the jesus layers.
# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
# to be worse.
# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
# is helpful.]
#./show_wer.sh 3g
#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
#a03:s5c: ./show_wer.sh 2y
#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
#a03:s5c: ./show_wer.sh 3d
#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
# Therefore it's
# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra
# context, and this isn't really ideal - I want to see if this seems promising first.
# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
# to 200 in order to reduce computation in the Jesus layer.
# _3d is as _2y, and re-using the egs, but using --jesus-opts and
# configs from make_jesus_configs.py.
# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
# 800k to 1.2 million. The aim is to avoid some of the per-job overhead
# (model-averaging, etc.), since each iteration takes only a minute or so.
# I added the results to the table below. It seems the same on average-
# which is good. We'll probably keep this configuration.
# _2o is as _2m, but going back to our original 2-state topology, which it turns
# out that I never tested to WER.
# hm--- it's about the same, or maybe slightly better!
# caution: accidentally overwrote most of this dir, but kept the key stuff.
# note: when I compare with the rerun of 2o (not shown), this run is actually
# better.
# WER on 2m 2o 2y [ now comparing 2o->2y:]
# train_dev,tg 17.22 17.24 16.99 0.2% better
# train_dev,fg 15.87 15.93 15.86 0.1% better
# eval2000,tg 18.7 18.7 18.9 0.2% worse
# eval2000,fg 17.0 16.9 17.0 0.1% worse
# train-prob,final -0.0803 -0.0835
# valid-prob,final -0.0116 -0.0122
# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
# that mechanism.
# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
# the log-like change when deciding which states to back off. The code is not the same
# as the one in 2{f,g,h}. We have only the option --num-extra-lm-states=2000. By
# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
# is quite similar to 2d, except new/more-exact code is used.
# _2d is as _2c but with different LM options:
# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
# provided from the tree-building, and effectively puts the leftmost context position as a single
# set.
# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
# _2c is as _2a but after a code change in which we start using transition-scale
# and self-loop-scale of 1 instead of zero in training; we change the options to
# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
# results at all; it is mainly for convenience in pushing weights in graphs,
# and checking that graphs are stochastic.
# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
# _z is as _x but setting --lm-opts "--num-extra-states=2000".
# (see also y, which has --num-extra-states=500).
# _x is as _s but setting --lm-opts "--num-extra-states=0".
# this is a kind of repeat of the u->v experiment, where it seemed to make things
# worse, but there were other factors involved in that so I want to be sure.
# _s is as _q but setting pdf-boundary-penalty to 0.0
# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
# and 18.07 -> 16.96 on train_dev, after fg rescoring.
# _q is as _p except making the same change as from n->o, which
# reduces the parameters to try to reduce over-training. We reduce
# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
# and modify the splicing setup.
# note: I don't rerun the tree-building, I just use the '5o' treedir.
# _p is as _m except with a code change in which we switch to a different, more
# exact mechanism to deal with the edges of the egs, and correspondingly
# different script options... we now dump weights with the egs, and apply the
# weights to the derivative w.r.t. the output instead of using the
# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
# to 30 also. This will give 10 frames on each side with zero derivs, then
# ramping up to a weight of 1.0 over 10 frames.
# _m is as _k but after a code change that makes the denominator FST more
# compact. I am rerunning in order to verify that the WER is not changed (since
# it's possible in principle that due to edge effects related to weight-pushing,
# the results could be a bit different).
# The results are inconsistently different but broadly the same. On all of eval2000,
# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
# option and setting max-param-change to 1.. Using the same egs.
# _i is as _h but longer egs: 150 frames instead of 75, and
# 128 elements per minibatch instead of 256.
# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
# _g is as _f but more splicing at last layer.
# _f is as _e but with 30 as the number of left phone classes instead
# of 10.
# _e is as _d but making it more similar in configuration to _b.
# (turns out b was better than a after all-- the egs' likelihoods had to
# be corrected before comparing them).
# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
# _d is as _c but with a modified topology (with 4 distinct states per phone
# instead of 2), and a slightly larger num-states (8000) to compensate for the
# different topology, which has more states.
# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
# as the default) as it's not clear that it was helpful; using the old learning-rates;
# and modifying the target-num-states to 7000.
# _b is as as _a except for configuration changes: using 12k num-leaves instead of
# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
# which will make the final layer learn less fast compared with other layers.
set -e
# configs for 'chain'
stage=12
train_stage=-10
get_egs_stage=-10
speed_perturb=true
dir=exp/chain/tdnn_6h # Note: _sp will get added to this if $speed_perturb == true.
# training options
num_epochs=4
initial_effective_lrate=0.001
final_effective_lrate=0.0001
leftmost_questions_truncate=-1
max_param_change=2.0
final_layer_normalize_target=0.5
num_jobs_initial=3
num_jobs_final=16
minibatch_size=128
frames_per_eg=150
remove_egs=false
# End configuration section.
echo "$0 $@" # Print the command line for logging
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
# The iVector-extraction and feature-dumping parts are the same as the standard
# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
# run those things.
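# For example, if the iVectors and hi-res features already exist from an
# earlier nnet3 run (the script name below is illustrative, since its path is
# not shown in this diff view):
#   local/chain/run_tdnn_6h.sh --stage 8 --train-stage -10
# Any of the variables in the configuration section above can be overridden
# the same way, via utils/parse_options.sh sourced below.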
suffix=
if [ "$speed_perturb" == "true" ]; then
suffix=_sp
fi
dir=${dir}$suffix
train_set=train_nodup$suffix
ali_dir=exp/tri4_ali_nodup$suffix
treedir=exp/chain/tri5_2y_tree$suffix
lang=data/lang_chain_2y
# if we are using the speed-perturbed data we need to generate
# alignments for it.
local/nnet3/run_ivector_common.sh --stage $stage \
--speed-perturb $speed_perturb \
--generate-alignments $speed_perturb || exit 1;
if [ $stage -le 9 ]; then
# Get the alignments as lattices (gives the CTC training more freedom).
# use the same num-jobs as the alignments
nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
data/lang exp/tri4 exp/tri4_lats_nodup$suffix
rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
fi
if [ $stage -le 10 ]; then
# Create a version of the lang/ directory that has one state per phone in the
# topo file. [note, it really has two states.. the first one is only repeated
# once, the second one has zero or more repeats.]
rm -rf $lang
cp -r data/lang $lang
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
# Use our special topology... note that later on we may have to tune this
# topology.
steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
fi
if [ $stage -le 11 ]; then
# Build a tree using our new topology.
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
--leftmost-questions-truncate $leftmost_questions_truncate \
--cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
fi
if [ $stage -le 12 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
fi
touch $dir/egs/.nodelete # keep egs around when that run dies.
steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
--xent-regularize 0.1 \
--leaky-hmm-coefficient 0.1 \
--l2-regularize 0.00005 \
--egs-dir exp/chain/tdnn_2y_sp/egs \
--jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \
--splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \
--apply-deriv-weights false \
--frames-per-iter 1200000 \
--lm-opts "--num-extra-lm-states=2000" \
--get-egs-stage $get_egs_stage \
--minibatch-size $minibatch_size \
--egs-opts "--frames-overlap-per-eg 0" \
--frames-per-eg $frames_per_eg \
--num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
--feat-type raw \
--online-ivector-dir exp/nnet3/ivectors_${train_set} \
--cmvn-opts "--norm-means=false --norm-vars=false" \
--initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
--max-param-change $max_param_change \
--cmd "$decode_cmd" \
--remove-egs $remove_egs \
data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1;
fi
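# (Relative to the 6g run this is based on, the only intended change in the
# training call above is the --xent-separate-forward-affine=true option added
# inside --jesus-opts; see the header comments at the top of this script.)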
if [ $stage -le 13 ]; then
# Note: it might appear that this $lang directory is mismatched, and it is as
# far as the 'topo' is concerned, but this script doesn't read the 'topo' from
# the lang directory.
utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
fi
decode_suff=sw1_tg
graph_dir=$dir/graph_sw1_tg
if [ $stage -le 14 ]; then
for decode_set in train_dev eval2000; do
(
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--extra-left-context 20 \
--nj 50 --cmd "$decode_cmd" \
--online-ivector-dir exp/nnet3/ivectors_${decode_set} \
$graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
if $has_fisher; then
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
$dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
fi
) &
done
fi
wait;
exit 0;
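# Once the decodes above have finished, this run can be compared with the
# earlier tuning experiments using the helper referenced in the header
# comments, e.g. (assuming the usual exp/chain/tdnn_* suffixes):
#   local/chain/compare_wer.sh 5v2 6g 6h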

View file

@@ -32,6 +32,9 @@ parser.add_argument("--xent-regularize", type=float,
help="For chain models, if nonzero, add a separate output for cross-entropy "
"regularization (with learning-rate-factor equal to the inverse of this)",
default=0.0)
parser.add_argument("--xent-separate-forward-affine", type=str,
help="if using --xent-regularize, gives it separate last-but-one weight matrix",
default="false", choices = ["false", "true"])
parser.add_argument("--use-repeated-affine", type=str,
help="if true use RepeatedAffineComponent, else BlockAffineComponent (i.e. no sharing)",
default="true", choices = ["false", "true"])
@@ -462,6 +465,19 @@ for l in range(1, num_hidden_layers + 1):
print('output-node name=output input=final-affine', file=f)
if args.xent_regularize != 0.0:
xent_input = 'final-relu'
if l == num_hidden_layers and args.xent_separate_forward_affine == "true":
print('component name=forward-affine{0}-xent type=NaturalGradientAffineComponent '
'input-dim={1} output-dim={2} bias-stddev=0'.
format(l, args.jesus_forward_output_dim, args.final_hidden_dim), file=f)
print('component-node name=jesus{0}-forward-output-affine-xent component=forward-affine{0}-xent input=post-jesus{0}'.format(
l), file=f)
print('component name=final-relu-xent type=RectifiedLinearComponent dim={0} self-repair-scale={1}'.format(
args.final_hidden_dim, args.self_repair_scale), file=f)
print('component-node name=final-relu-xent component=final-relu-xent '
'input=jesus{0}-forward-output-affine-xent'.format(l), file=f)
xent_input = 'final-relu-xent'
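# In other words, when --xent-separate-forward-affine=true, the cross-entropy
# branch gets its own affine layer (forward-affine<l>-xent, taking
# post-jesus<l> as input and producing final_hidden_dim outputs) followed by
# its own ReLU (final-relu-xent), instead of sharing 'final-relu' with the
# chain output; xent_input is redirected so that the final-affine-xent layer
# emitted below reads from this separate branch.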
# This block prints the configs for a separate output that will be
# trained with a cross-entropy objective in the 'chain' models... this
# has the effect of regularizing the hidden parts of the model. we use
@@ -473,8 +489,8 @@ for l in range(1, num_hidden_layers + 1):
print('component name=final-affine-xent type=NaturalGradientAffineComponent '
'input-dim={0} output-dim={1} param-stddev=0.0 bias-stddev=0 learning-rate-factor={2}'.format(
cur_affine_output_dim, args.num_targets, 0.5 / args.xent_regularize), file=f)
print('component-node name=final-affine-xent component=final-affine-xent input=final-relu',
file=f)
print('component-node name=final-affine-xent component=final-affine-xent input={0}'.format(
xent_input), file=f)
print('component name=final-log-softmax-xent type=LogSoftmaxComponent dim={0}'.format(
args.num_targets), file=f)
print('component-node name=final-log-softmax-xent component=final-log-softmax-xent '