From 94c4646abac227ed093c3534f9587e7deed9f489 Mon Sep 17 00:00:00 2001 From: Chao Weng Date: Tue, 10 Dec 2013 19:29:00 +0000 Subject: [PATCH] adding chime wsj eg git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@3291 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8 --- egs/chime_wsj0/s5/cmd.sh | 29 + egs/chime_wsj0/s5/conf/decode_dnn.config | 2 + egs/chime_wsj0/s5/conf/fbank.conf | 11 + egs/chime_wsj0/s5/conf/mfcc.conf | 1 + .../s5/local/binmask_wsj0_data_prep.sh | 117 +++ egs/chime_wsj0/s5/local/chime_format_data.sh | 86 +++ .../s5/local/clean_wsj0_data_prep.sh | 190 +++++ egs/chime_wsj0/s5/local/copy_clean_ali.sh | 13 + egs/chime_wsj0/s5/local/cstr_ndx2flist.pl | 54 ++ egs/chime_wsj0/s5/local/cstr_wsj_data_prep.sh | 187 +++++ .../s5/local/cstr_wsj_extend_dict.sh | 172 +++++ egs/chime_wsj0/s5/local/dict/add_counts.pl | 31 + egs/chime_wsj0/s5/local/dict/count_rules.pl | 44 ++ egs/chime_wsj0/s5/local/dict/filter_dict.pl | 19 + egs/chime_wsj0/s5/local/dict/find_acronyms.pl | 95 +++ .../s5/local/dict/get_acronym_prons.pl | 123 ++++ .../s5/local/dict/get_candidate_prons.pl | 187 +++++ .../s5/local/dict/get_rule_hierarchy.pl | 73 ++ egs/chime_wsj0/s5/local/dict/get_rules.pl | 204 ++++++ .../s5/local/dict/limit_candidate_prons.pl | 103 +++ .../s5/local/dict/reverse_candidates.pl | 50 ++ egs/chime_wsj0/s5/local/dict/reverse_dict.pl | 14 + egs/chime_wsj0/s5/local/dict/score_prons.pl | 50 ++ egs/chime_wsj0/s5/local/dict/score_rules.pl | 52 ++ .../s5/local/dict/select_candidate_prons.pl | 84 +++ .../s5/local/find_noisy_transcripts.pl | 65 ++ egs/chime_wsj0/s5/local/find_transcripts.pl | 64 ++ egs/chime_wsj0/s5/local/flist2scp.pl | 31 + .../s5/local/generate_example_kws.sh | 110 +++ egs/chime_wsj0/s5/local/kws_data_prep.sh | 60 ++ egs/chime_wsj0/s5/local/ndx2flist.pl | 62 ++ egs/chime_wsj0/s5/local/nnet2/run_5b.sh | 69 ++ egs/chime_wsj0/s5/local/nnet2/run_5c.sh | 24 + .../s5/local/noisy_wsj0_data_prep.sh | 119 +++ .../s5/local/normalize_transcript.pl | 59 ++ .../s5/local/reverb_wsj0_data_prep.sh | 100 +++ egs/chime_wsj0/s5/local/run_basis_fmllr.sh | 42 ++ egs/chime_wsj0/s5/local/run_dnn.sh | 181 +++++ egs/chime_wsj0/s5/local/run_fwdbwd.sh | 41 ++ egs/chime_wsj0/s5/local/run_mmi_tri2b.sh | 60 ++ egs/chime_wsj0/s5/local/run_mmi_tri4b.sh | 50 ++ egs/chime_wsj0/s5/local/run_nnet_cpu.sh | 9 + egs/chime_wsj0/s5/local/run_raw_fmllr.sh | 66 ++ egs/chime_wsj0/s5/local/run_rnnlms_sgmm5b.sh | 42 ++ egs/chime_wsj0/s5/local/run_rnnlms_tri3b.sh | 64 ++ egs/chime_wsj0/s5/local/run_sgmm.sh | 113 +++ egs/chime_wsj0/s5/local/run_sgmm2.sh | 148 ++++ egs/chime_wsj0/s5/local/score.sh | 67 ++ egs/chime_wsj0/s5/local/score_combine.sh | 95 +++ egs/chime_wsj0/s5/local/score_mbr.sh | 58 ++ egs/chime_wsj0/s5/local/wsj_data_prep.sh | 201 +++++ egs/chime_wsj0/s5/local/wsj_extend_dict.sh | 173 +++++ egs/chime_wsj0/s5/local/wsj_format_data.sh | 86 +++ .../s5/local/wsj_format_local_lms.sh | 52 ++ egs/chime_wsj0/s5/local/wsj_prepare_dict.sh | 83 +++ egs/chime_wsj0/s5/local/wsj_train_lms.sh | 202 +++++ egs/chime_wsj0/s5/local/wsj_train_rnnlms.sh | 153 ++++ egs/chime_wsj0/s5/path.sh | 3 + egs/chime_wsj0/s5/run.sh | 261 +++++++ egs/chime_wsj0/s5/steps/align_basis_fmllr.sh | 150 ++++ egs/chime_wsj0/s5/steps/align_fmllr.sh | 148 ++++ egs/chime_wsj0/s5/steps/align_nnet.sh | 99 +++ egs/chime_wsj0/s5/steps/align_raw_fmllr.sh | 142 ++++ egs/chime_wsj0/s5/steps/align_sgmm.sh | 193 +++++ egs/chime_wsj0/s5/steps/align_sgmm2.sh | 193 +++++ egs/chime_wsj0/s5/steps/align_si.sh | 89 +++ egs/chime_wsj0/s5/steps/append_feats.sh | 67 ++ 
egs/chime_wsj0/s5/steps/compute_cmvn_stats.sh | 80 ++ egs/chime_wsj0/s5/steps/decode.sh | 108 +++ egs/chime_wsj0/s5/steps/decode_basis_fmllr.sh | 206 ++++++ egs/chime_wsj0/s5/steps/decode_biglm.sh | 86 +++ egs/chime_wsj0/s5/steps/decode_combine.sh | 59 ++ egs/chime_wsj0/s5/steps/decode_fmllr.sh | 217 ++++++ egs/chime_wsj0/s5/steps/decode_fmllr_extra.sh | 250 +++++++ egs/chime_wsj0/s5/steps/decode_fmmi.sh | 111 +++ egs/chime_wsj0/s5/steps/decode_fromlats.sh | 90 +++ egs/chime_wsj0/s5/steps/decode_fwdbwd.sh | 122 +++ egs/chime_wsj0/s5/steps/decode_nnet.sh | 128 ++++ egs/chime_wsj0/s5/steps/decode_nnet_cpu.sh | 127 ++++ egs/chime_wsj0/s5/steps/decode_raw_fmllr.sh | 235 ++++++ egs/chime_wsj0/s5/steps/decode_sgmm.sh | 257 +++++++ egs/chime_wsj0/s5/steps/decode_sgmm2.sh | 211 ++++++ .../s5/steps/decode_sgmm2_fromlats.sh | 270 +++++++ .../s5/steps/decode_sgmm2_rescore.sh | 111 +++ .../s5/steps/decode_sgmm2_rescore_project.sh | 172 +++++ .../s5/steps/decode_sgmm_fromlats.sh | 273 +++++++ .../s5/steps/decode_sgmm_rescore.sh | 107 +++ egs/chime_wsj0/s5/steps/decode_si.sh | 108 +++ egs/chime_wsj0/s5/steps/decode_with_map.sh | 113 +++ egs/chime_wsj0/s5/steps/get_ctm.sh | 66 ++ egs/chime_wsj0/s5/steps/get_fmllr_basis.sh | 95 +++ egs/chime_wsj0/s5/steps/get_lexicon_probs.sh | 225 ++++++ egs/chime_wsj0/s5/steps/get_train_ctm.sh | 66 ++ egs/chime_wsj0/s5/steps/lmrescore.sh | 122 +++ egs/chime_wsj0/s5/steps/make_bn_feats.sh | 117 +++ egs/chime_wsj0/s5/steps/make_denlats.sh | 146 ++++ egs/chime_wsj0/s5/steps/make_denlats_nnet.sh | 177 +++++ .../s5/steps/make_denlats_nnet_cpu.sh | 146 ++++ egs/chime_wsj0/s5/steps/make_denlats_sgmm.sh | 159 ++++ egs/chime_wsj0/s5/steps/make_denlats_sgmm2.sh | 170 +++++ egs/chime_wsj0/s5/steps/make_fbank.sh | 111 +++ egs/chime_wsj0/s5/steps/make_fmllr_feats.sh | 103 +++ egs/chime_wsj0/s5/steps/make_fmmi_feats.sh | 102 +++ egs/chime_wsj0/s5/steps/make_index.sh | 83 +++ egs/chime_wsj0/s5/steps/make_mfcc.sh | 111 +++ egs/chime_wsj0/s5/steps/make_plp.sh | 111 +++ egs/chime_wsj0/s5/steps/mixup.sh | 148 ++++ egs/chime_wsj0/s5/steps/nnet2/align.sh | 104 +++ egs/chime_wsj0/s5/steps/nnet2/decode.sh | 131 ++++ egs/chime_wsj0/s5/steps/nnet2/get_egs.sh | 276 +++++++ egs/chime_wsj0/s5/steps/nnet2/get_lda.sh | 120 +++ .../s5/steps/nnet2/get_lda_block.sh | 120 +++ .../s5/steps/nnet2/get_perturbed_feats.sh | 89 +++ egs/chime_wsj0/s5/steps/nnet2/retrain_tanh.sh | 216 ++++++ egs/chime_wsj0/s5/steps/nnet2/train_block.sh | 376 ++++++++++ egs/chime_wsj0/s5/steps/nnet2/train_tanh.sh | 377 ++++++++++ egs/chime_wsj0/s5/steps/pretrain_dbn.sh | 257 +++++++ egs/chime_wsj0/s5/steps/rnnlmrescore.sh | 180 +++++ egs/chime_wsj0/s5/steps/search_index.sh | 50 ++ egs/chime_wsj0/s5/steps/tandem/align_fmllr.sh | 185 +++++ egs/chime_wsj0/s5/steps/tandem/align_sgmm.sh | 233 ++++++ egs/chime_wsj0/s5/steps/tandem/align_sgmm2.sh | 232 ++++++ egs/chime_wsj0/s5/steps/tandem/align_si.sh | 130 ++++ egs/chime_wsj0/s5/steps/tandem/decode.sh | 143 ++++ .../s5/steps/tandem/decode_fmllr.sh | 242 ++++++ egs/chime_wsj0/s5/steps/tandem/decode_sgmm.sh | 300 ++++++++ .../s5/steps/tandem/decode_sgmm2.sh | 236 ++++++ egs/chime_wsj0/s5/steps/tandem/decode_si.sh | 143 ++++ .../s5/steps/tandem/make_denlats.sh | 184 +++++ .../s5/steps/tandem/make_denlats_sgmm.sh | 201 +++++ .../s5/steps/tandem/make_denlats_sgmm2.sh | 201 +++++ .../s5/steps/tandem/mk_aslf_lda_mllt.sh | 177 +++++ .../s5/steps/tandem/mk_aslf_sgmm2.sh | 178 +++++ .../s5/steps/tandem/train_deltas.sh | 163 +++++ .../s5/steps/tandem/train_lda_mllt.sh | 257 +++++++ 
egs/chime_wsj0/s5/steps/tandem/train_mllt.sh | 236 ++++++ egs/chime_wsj0/s5/steps/tandem/train_mmi.sh | 184 +++++ .../s5/steps/tandem/train_mmi_sgmm.sh | 190 +++++ .../s5/steps/tandem/train_mmi_sgmm2.sh | 190 +++++ egs/chime_wsj0/s5/steps/tandem/train_mono.sh | 161 ++++ egs/chime_wsj0/s5/steps/tandem/train_sat.sh | 278 +++++++ egs/chime_wsj0/s5/steps/tandem/train_sgmm.sh | 312 ++++++++ egs/chime_wsj0/s5/steps/tandem/train_sgmm2.sh | 334 +++++++++ egs/chime_wsj0/s5/steps/tandem/train_ubm.sh | 168 +++++ egs/chime_wsj0/s5/steps/train_deltas.sh | 142 ++++ egs/chime_wsj0/s5/steps/train_diag_ubm.sh | 125 ++++ egs/chime_wsj0/s5/steps/train_lda_mllt.sh | 209 ++++++ egs/chime_wsj0/s5/steps/train_mmi.sh | 145 ++++ egs/chime_wsj0/s5/steps/train_mmi_fmmi.sh | 223 ++++++ .../s5/steps/train_mmi_fmmi_indirect.sh | 246 +++++++ egs/chime_wsj0/s5/steps/train_mmi_sgmm.sh | 150 ++++ egs/chime_wsj0/s5/steps/train_mmi_sgmm2.sh | 153 ++++ egs/chime_wsj0/s5/steps/train_mono.sh | 138 ++++ egs/chime_wsj0/s5/steps/train_mpe.sh | 158 ++++ egs/chime_wsj0/s5/steps/train_nnet.sh | 329 +++++++++ egs/chime_wsj0/s5/steps/train_nnet_cpu.sh | 535 ++++++++++++++ .../s5/steps/train_nnet_cpu_conv.sh | 692 ++++++++++++++++++ egs/chime_wsj0/s5/steps/train_nnet_cpu_mmi.sh | 293 ++++++++ .../s5/steps/train_nnet_cpu_tanh.sh | 496 +++++++++++++ egs/chime_wsj0/s5/steps/train_nnet_mmi.sh | 190 +++++ egs/chime_wsj0/s5/steps/train_nnet_mpe.sh | 173 +++++ .../s5/steps/train_nnet_scheduler.sh | 178 +++++ egs/chime_wsj0/s5/steps/train_quick.sh | 191 +++++ egs/chime_wsj0/s5/steps/train_raw_sat.sh | 295 ++++++++ egs/chime_wsj0/s5/steps/train_sat.sh | 255 +++++++ egs/chime_wsj0/s5/steps/train_sat_basis.sh | 277 +++++++ egs/chime_wsj0/s5/steps/train_sgmm.sh | 274 +++++++ egs/chime_wsj0/s5/steps/train_sgmm2.sh | 296 ++++++++ egs/chime_wsj0/s5/steps/train_sgmm2_group.sh | 343 +++++++++ egs/chime_wsj0/s5/steps/train_smbr.sh | 152 ++++ egs/chime_wsj0/s5/steps/train_ubm.sh | 139 ++++ .../s5/steps/word_align_lattices.sh | 48 ++ egs/chime_wsj0/s5/utils/add_disambig.pl | 58 ++ egs/chime_wsj0/s5/utils/add_lex_disambig.pl | 118 +++ egs/chime_wsj0/s5/utils/apply_map.pl | 83 +++ egs/chime_wsj0/s5/utils/best_wer.sh | 32 + egs/chime_wsj0/s5/utils/combine_data.sh | 37 + egs/chime_wsj0/s5/utils/convert_ctm.pl | 92 +++ egs/chime_wsj0/s5/utils/convert_slf.pl | 138 ++++ egs/chime_wsj0/s5/utils/copy_data_dir.sh | 99 +++ egs/chime_wsj0/s5/utils/eps2disambig.pl | 23 + egs/chime_wsj0/s5/utils/filter_scp.pl | 50 ++ egs/chime_wsj0/s5/utils/find_arpa_oovs.pl | 64 ++ egs/chime_wsj0/s5/utils/fix_ctm.sh | 32 + egs/chime_wsj0/s5/utils/fix_data_dir.sh | 169 +++++ egs/chime_wsj0/s5/utils/format_lm.sh | 84 +++ egs/chime_wsj0/s5/utils/format_lm_sri.sh | 124 ++++ egs/chime_wsj0/s5/utils/gen_topo.pl | 63 ++ egs/chime_wsj0/s5/utils/int2sym.pl | 71 ++ .../s5/utils/kwslist_post_process.pl | 291 ++++++++ egs/chime_wsj0/s5/utils/ln.pl | 58 ++ egs/chime_wsj0/s5/utils/make_lexicon_fst.pl | 161 ++++ .../s5/utils/make_phone_bigram_lang.sh | 98 +++ .../s5/utils/make_unigram_grammar.pl | 54 ++ egs/chime_wsj0/s5/utils/mkgraph.sh | 124 ++++ .../s5/utils/nnet-cpu/make_nnet_config.pl | 159 ++++ .../utils/nnet-cpu/make_nnet_config_block.pl | 156 ++++ .../make_nnet_config_preconditioned.pl | 277 +++++++ .../utils/nnet-cpu/update_learning_rates.pl | 141 ++++ .../s5/utils/nnet/analyze_alignments.sh | 71 ++ egs/chime_wsj0/s5/utils/nnet/copy_feats.sh | 62 ++ egs/chime_wsj0/s5/utils/nnet/gen_dct_mat.py | 53 ++ egs/chime_wsj0/s5/utils/nnet/gen_hamm_mat.py | 45 ++ 
egs/chime_wsj0/s5/utils/nnet/gen_mlp_init.py | 92 +++ egs/chime_wsj0/s5/utils/nnet/gen_rbm_init.py | 110 +++ .../s5/utils/nnet/gen_recurrent_dnn.py | 46 ++ egs/chime_wsj0/s5/utils/nnet/gen_splice.py | 40 + egs/chime_wsj0/s5/utils/nnet/init_nnet.sh | 136 ++++ egs/chime_wsj0/s5/utils/parse_options.sh | 94 +++ egs/chime_wsj0/s5/utils/pinyin_map.pl | 78 ++ egs/chime_wsj0/s5/utils/prepare_lang.sh | 332 +++++++++ egs/chime_wsj0/s5/utils/queue.pl | 313 ++++++++ egs/chime_wsj0/s5/utils/reduce_data_dir.sh | 52 ++ .../s5/utils/reduce_data_dir_by_reclist.sh | 53 ++ egs/chime_wsj0/s5/utils/remove_oovs.pl | 43 ++ egs/chime_wsj0/s5/utils/reverse_arpa.py | 187 +++++ egs/chime_wsj0/s5/utils/reverse_lm.sh | 91 +++ egs/chime_wsj0/s5/utils/reverse_lm_test.sh | 90 +++ .../s5/utils/rnnlm_compute_scores.sh | 69 ++ egs/chime_wsj0/s5/utils/run.pl | 148 ++++ egs/chime_wsj0/s5/utils/s2eps.pl | 27 + egs/chime_wsj0/s5/utils/shuffle_list.pl | 38 + egs/chime_wsj0/s5/utils/slurm.pl | 131 ++++ egs/chime_wsj0/s5/utils/spk2utt_to_utt2spk.pl | 27 + egs/chime_wsj0/s5/utils/split_data.sh | 120 +++ egs/chime_wsj0/s5/utils/split_scp.pl | 221 ++++++ egs/chime_wsj0/s5/utils/subset_data_dir.sh | 159 ++++ .../s5/utils/subset_data_dir_tr_cv.sh | 104 +++ egs/chime_wsj0/s5/utils/subset_scp.pl | 87 +++ egs/chime_wsj0/s5/utils/summarize_warnings.pl | 46 ++ egs/chime_wsj0/s5/utils/sym2int.pl | 98 +++ egs/chime_wsj0/s5/utils/utt2spk_to_spk2utt.pl | 39 + egs/chime_wsj0/s5/utils/validate_data_dir.sh | 218 ++++++ egs/chime_wsj0/s5/utils/validate_dict_dir.pl | 218 ++++++ egs/chime_wsj0/s5/utils/validate_lang.pl | 534 ++++++++++++++ egs/chime_wsj0/s5/utils/write_kwslist.pl | 333 +++++++++ 236 files changed, 33644 insertions(+) create mode 100644 egs/chime_wsj0/s5/cmd.sh create mode 100644 egs/chime_wsj0/s5/conf/decode_dnn.config create mode 100644 egs/chime_wsj0/s5/conf/fbank.conf create mode 100644 egs/chime_wsj0/s5/conf/mfcc.conf create mode 100755 egs/chime_wsj0/s5/local/binmask_wsj0_data_prep.sh create mode 100755 egs/chime_wsj0/s5/local/chime_format_data.sh create mode 100755 egs/chime_wsj0/s5/local/clean_wsj0_data_prep.sh create mode 100755 egs/chime_wsj0/s5/local/copy_clean_ali.sh create mode 100755 egs/chime_wsj0/s5/local/cstr_ndx2flist.pl create mode 100755 egs/chime_wsj0/s5/local/cstr_wsj_data_prep.sh create mode 100755 egs/chime_wsj0/s5/local/cstr_wsj_extend_dict.sh create mode 100755 egs/chime_wsj0/s5/local/dict/add_counts.pl create mode 100755 egs/chime_wsj0/s5/local/dict/count_rules.pl create mode 100755 egs/chime_wsj0/s5/local/dict/filter_dict.pl create mode 100755 egs/chime_wsj0/s5/local/dict/find_acronyms.pl create mode 100755 egs/chime_wsj0/s5/local/dict/get_acronym_prons.pl create mode 100755 egs/chime_wsj0/s5/local/dict/get_candidate_prons.pl create mode 100755 egs/chime_wsj0/s5/local/dict/get_rule_hierarchy.pl create mode 100755 egs/chime_wsj0/s5/local/dict/get_rules.pl create mode 100755 egs/chime_wsj0/s5/local/dict/limit_candidate_prons.pl create mode 100755 egs/chime_wsj0/s5/local/dict/reverse_candidates.pl create mode 100755 egs/chime_wsj0/s5/local/dict/reverse_dict.pl create mode 100755 egs/chime_wsj0/s5/local/dict/score_prons.pl create mode 100755 egs/chime_wsj0/s5/local/dict/score_rules.pl create mode 100755 egs/chime_wsj0/s5/local/dict/select_candidate_prons.pl create mode 100755 egs/chime_wsj0/s5/local/find_noisy_transcripts.pl create mode 100755 egs/chime_wsj0/s5/local/find_transcripts.pl create mode 100755 egs/chime_wsj0/s5/local/flist2scp.pl create mode 100755 egs/chime_wsj0/s5/local/generate_example_kws.sh 
create mode 100755 egs/chime_wsj0/s5/local/kws_data_prep.sh create mode 100755 egs/chime_wsj0/s5/local/ndx2flist.pl create mode 100755 egs/chime_wsj0/s5/local/nnet2/run_5b.sh create mode 100755 egs/chime_wsj0/s5/local/nnet2/run_5c.sh create mode 100755 egs/chime_wsj0/s5/local/noisy_wsj0_data_prep.sh create mode 100755 egs/chime_wsj0/s5/local/normalize_transcript.pl create mode 100755 egs/chime_wsj0/s5/local/reverb_wsj0_data_prep.sh create mode 100755 egs/chime_wsj0/s5/local/run_basis_fmllr.sh create mode 100755 egs/chime_wsj0/s5/local/run_dnn.sh create mode 100755 egs/chime_wsj0/s5/local/run_fwdbwd.sh create mode 100755 egs/chime_wsj0/s5/local/run_mmi_tri2b.sh create mode 100755 egs/chime_wsj0/s5/local/run_mmi_tri4b.sh create mode 100755 egs/chime_wsj0/s5/local/run_nnet_cpu.sh create mode 100644 egs/chime_wsj0/s5/local/run_raw_fmllr.sh create mode 100755 egs/chime_wsj0/s5/local/run_rnnlms_sgmm5b.sh create mode 100755 egs/chime_wsj0/s5/local/run_rnnlms_tri3b.sh create mode 100755 egs/chime_wsj0/s5/local/run_sgmm.sh create mode 100755 egs/chime_wsj0/s5/local/run_sgmm2.sh create mode 100755 egs/chime_wsj0/s5/local/score.sh create mode 100755 egs/chime_wsj0/s5/local/score_combine.sh create mode 100755 egs/chime_wsj0/s5/local/score_mbr.sh create mode 100755 egs/chime_wsj0/s5/local/wsj_data_prep.sh create mode 100755 egs/chime_wsj0/s5/local/wsj_extend_dict.sh create mode 100755 egs/chime_wsj0/s5/local/wsj_format_data.sh create mode 100755 egs/chime_wsj0/s5/local/wsj_format_local_lms.sh create mode 100755 egs/chime_wsj0/s5/local/wsj_prepare_dict.sh create mode 100755 egs/chime_wsj0/s5/local/wsj_train_lms.sh create mode 100755 egs/chime_wsj0/s5/local/wsj_train_rnnlms.sh create mode 100755 egs/chime_wsj0/s5/path.sh create mode 100755 egs/chime_wsj0/s5/run.sh create mode 100755 egs/chime_wsj0/s5/steps/align_basis_fmllr.sh create mode 100755 egs/chime_wsj0/s5/steps/align_fmllr.sh create mode 100755 egs/chime_wsj0/s5/steps/align_nnet.sh create mode 100755 egs/chime_wsj0/s5/steps/align_raw_fmllr.sh create mode 100755 egs/chime_wsj0/s5/steps/align_sgmm.sh create mode 100755 egs/chime_wsj0/s5/steps/align_sgmm2.sh create mode 100755 egs/chime_wsj0/s5/steps/align_si.sh create mode 100755 egs/chime_wsj0/s5/steps/append_feats.sh create mode 100755 egs/chime_wsj0/s5/steps/compute_cmvn_stats.sh create mode 100755 egs/chime_wsj0/s5/steps/decode.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_basis_fmllr.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_biglm.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_combine.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_fmllr.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_fmllr_extra.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_fmmi.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_fromlats.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_fwdbwd.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_nnet.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_nnet_cpu.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_raw_fmllr.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_sgmm.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_sgmm2.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_sgmm2_fromlats.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_sgmm2_rescore.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_sgmm2_rescore_project.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_sgmm_fromlats.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_sgmm_rescore.sh create mode 100755 
egs/chime_wsj0/s5/steps/decode_si.sh create mode 100755 egs/chime_wsj0/s5/steps/decode_with_map.sh create mode 100755 egs/chime_wsj0/s5/steps/get_ctm.sh create mode 100755 egs/chime_wsj0/s5/steps/get_fmllr_basis.sh create mode 100755 egs/chime_wsj0/s5/steps/get_lexicon_probs.sh create mode 100755 egs/chime_wsj0/s5/steps/get_train_ctm.sh create mode 100755 egs/chime_wsj0/s5/steps/lmrescore.sh create mode 100755 egs/chime_wsj0/s5/steps/make_bn_feats.sh create mode 100755 egs/chime_wsj0/s5/steps/make_denlats.sh create mode 100755 egs/chime_wsj0/s5/steps/make_denlats_nnet.sh create mode 100755 egs/chime_wsj0/s5/steps/make_denlats_nnet_cpu.sh create mode 100755 egs/chime_wsj0/s5/steps/make_denlats_sgmm.sh create mode 100755 egs/chime_wsj0/s5/steps/make_denlats_sgmm2.sh create mode 100755 egs/chime_wsj0/s5/steps/make_fbank.sh create mode 100755 egs/chime_wsj0/s5/steps/make_fmllr_feats.sh create mode 100755 egs/chime_wsj0/s5/steps/make_fmmi_feats.sh create mode 100755 egs/chime_wsj0/s5/steps/make_index.sh create mode 100755 egs/chime_wsj0/s5/steps/make_mfcc.sh create mode 100755 egs/chime_wsj0/s5/steps/make_plp.sh create mode 100755 egs/chime_wsj0/s5/steps/mixup.sh create mode 100755 egs/chime_wsj0/s5/steps/nnet2/align.sh create mode 100755 egs/chime_wsj0/s5/steps/nnet2/decode.sh create mode 100755 egs/chime_wsj0/s5/steps/nnet2/get_egs.sh create mode 100755 egs/chime_wsj0/s5/steps/nnet2/get_lda.sh create mode 100755 egs/chime_wsj0/s5/steps/nnet2/get_lda_block.sh create mode 100755 egs/chime_wsj0/s5/steps/nnet2/get_perturbed_feats.sh create mode 100755 egs/chime_wsj0/s5/steps/nnet2/retrain_tanh.sh create mode 100755 egs/chime_wsj0/s5/steps/nnet2/train_block.sh create mode 100755 egs/chime_wsj0/s5/steps/nnet2/train_tanh.sh create mode 100755 egs/chime_wsj0/s5/steps/pretrain_dbn.sh create mode 100755 egs/chime_wsj0/s5/steps/rnnlmrescore.sh create mode 100755 egs/chime_wsj0/s5/steps/search_index.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/align_fmllr.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/align_sgmm.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/align_sgmm2.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/align_si.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/decode.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/decode_fmllr.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/decode_sgmm.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/decode_sgmm2.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/decode_si.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/make_denlats.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/make_denlats_sgmm.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/make_denlats_sgmm2.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/mk_aslf_lda_mllt.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/mk_aslf_sgmm2.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/train_deltas.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/train_lda_mllt.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/train_mllt.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/train_mmi.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/train_mmi_sgmm.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/train_mmi_sgmm2.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/train_mono.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/train_sat.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/train_sgmm.sh create mode 100755 egs/chime_wsj0/s5/steps/tandem/train_sgmm2.sh create mode 100755 
egs/chime_wsj0/s5/steps/tandem/train_ubm.sh create mode 100755 egs/chime_wsj0/s5/steps/train_deltas.sh create mode 100755 egs/chime_wsj0/s5/steps/train_diag_ubm.sh create mode 100755 egs/chime_wsj0/s5/steps/train_lda_mllt.sh create mode 100755 egs/chime_wsj0/s5/steps/train_mmi.sh create mode 100755 egs/chime_wsj0/s5/steps/train_mmi_fmmi.sh create mode 100755 egs/chime_wsj0/s5/steps/train_mmi_fmmi_indirect.sh create mode 100755 egs/chime_wsj0/s5/steps/train_mmi_sgmm.sh create mode 100755 egs/chime_wsj0/s5/steps/train_mmi_sgmm2.sh create mode 100755 egs/chime_wsj0/s5/steps/train_mono.sh create mode 100755 egs/chime_wsj0/s5/steps/train_mpe.sh create mode 100755 egs/chime_wsj0/s5/steps/train_nnet.sh create mode 100755 egs/chime_wsj0/s5/steps/train_nnet_cpu.sh create mode 100755 egs/chime_wsj0/s5/steps/train_nnet_cpu_conv.sh create mode 100755 egs/chime_wsj0/s5/steps/train_nnet_cpu_mmi.sh create mode 100755 egs/chime_wsj0/s5/steps/train_nnet_cpu_tanh.sh create mode 100755 egs/chime_wsj0/s5/steps/train_nnet_mmi.sh create mode 100755 egs/chime_wsj0/s5/steps/train_nnet_mpe.sh create mode 100755 egs/chime_wsj0/s5/steps/train_nnet_scheduler.sh create mode 100755 egs/chime_wsj0/s5/steps/train_quick.sh create mode 100755 egs/chime_wsj0/s5/steps/train_raw_sat.sh create mode 100755 egs/chime_wsj0/s5/steps/train_sat.sh create mode 100755 egs/chime_wsj0/s5/steps/train_sat_basis.sh create mode 100755 egs/chime_wsj0/s5/steps/train_sgmm.sh create mode 100755 egs/chime_wsj0/s5/steps/train_sgmm2.sh create mode 100755 egs/chime_wsj0/s5/steps/train_sgmm2_group.sh create mode 100755 egs/chime_wsj0/s5/steps/train_smbr.sh create mode 100755 egs/chime_wsj0/s5/steps/train_ubm.sh create mode 100755 egs/chime_wsj0/s5/steps/word_align_lattices.sh create mode 100755 egs/chime_wsj0/s5/utils/add_disambig.pl create mode 100755 egs/chime_wsj0/s5/utils/add_lex_disambig.pl create mode 100755 egs/chime_wsj0/s5/utils/apply_map.pl create mode 100755 egs/chime_wsj0/s5/utils/best_wer.sh create mode 100755 egs/chime_wsj0/s5/utils/combine_data.sh create mode 100755 egs/chime_wsj0/s5/utils/convert_ctm.pl create mode 100755 egs/chime_wsj0/s5/utils/convert_slf.pl create mode 100755 egs/chime_wsj0/s5/utils/copy_data_dir.sh create mode 100755 egs/chime_wsj0/s5/utils/eps2disambig.pl create mode 100755 egs/chime_wsj0/s5/utils/filter_scp.pl create mode 100755 egs/chime_wsj0/s5/utils/find_arpa_oovs.pl create mode 100755 egs/chime_wsj0/s5/utils/fix_ctm.sh create mode 100755 egs/chime_wsj0/s5/utils/fix_data_dir.sh create mode 100755 egs/chime_wsj0/s5/utils/format_lm.sh create mode 100755 egs/chime_wsj0/s5/utils/format_lm_sri.sh create mode 100755 egs/chime_wsj0/s5/utils/gen_topo.pl create mode 100755 egs/chime_wsj0/s5/utils/int2sym.pl create mode 100755 egs/chime_wsj0/s5/utils/kwslist_post_process.pl create mode 100755 egs/chime_wsj0/s5/utils/ln.pl create mode 100755 egs/chime_wsj0/s5/utils/make_lexicon_fst.pl create mode 100755 egs/chime_wsj0/s5/utils/make_phone_bigram_lang.sh create mode 100755 egs/chime_wsj0/s5/utils/make_unigram_grammar.pl create mode 100755 egs/chime_wsj0/s5/utils/mkgraph.sh create mode 100755 egs/chime_wsj0/s5/utils/nnet-cpu/make_nnet_config.pl create mode 100755 egs/chime_wsj0/s5/utils/nnet-cpu/make_nnet_config_block.pl create mode 100755 egs/chime_wsj0/s5/utils/nnet-cpu/make_nnet_config_preconditioned.pl create mode 100755 egs/chime_wsj0/s5/utils/nnet-cpu/update_learning_rates.pl create mode 100755 egs/chime_wsj0/s5/utils/nnet/analyze_alignments.sh create mode 100755 egs/chime_wsj0/s5/utils/nnet/copy_feats.sh create 
mode 100755 egs/chime_wsj0/s5/utils/nnet/gen_dct_mat.py create mode 100755 egs/chime_wsj0/s5/utils/nnet/gen_hamm_mat.py create mode 100755 egs/chime_wsj0/s5/utils/nnet/gen_mlp_init.py create mode 100755 egs/chime_wsj0/s5/utils/nnet/gen_rbm_init.py create mode 100755 egs/chime_wsj0/s5/utils/nnet/gen_recurrent_dnn.py create mode 100755 egs/chime_wsj0/s5/utils/nnet/gen_splice.py create mode 100755 egs/chime_wsj0/s5/utils/nnet/init_nnet.sh create mode 100755 egs/chime_wsj0/s5/utils/parse_options.sh create mode 100755 egs/chime_wsj0/s5/utils/pinyin_map.pl create mode 100755 egs/chime_wsj0/s5/utils/prepare_lang.sh create mode 100755 egs/chime_wsj0/s5/utils/queue.pl create mode 100755 egs/chime_wsj0/s5/utils/reduce_data_dir.sh create mode 100755 egs/chime_wsj0/s5/utils/reduce_data_dir_by_reclist.sh create mode 100755 egs/chime_wsj0/s5/utils/remove_oovs.pl create mode 100755 egs/chime_wsj0/s5/utils/reverse_arpa.py create mode 100755 egs/chime_wsj0/s5/utils/reverse_lm.sh create mode 100755 egs/chime_wsj0/s5/utils/reverse_lm_test.sh create mode 100755 egs/chime_wsj0/s5/utils/rnnlm_compute_scores.sh create mode 100755 egs/chime_wsj0/s5/utils/run.pl create mode 100755 egs/chime_wsj0/s5/utils/s2eps.pl create mode 100755 egs/chime_wsj0/s5/utils/shuffle_list.pl create mode 100755 egs/chime_wsj0/s5/utils/slurm.pl create mode 100755 egs/chime_wsj0/s5/utils/spk2utt_to_utt2spk.pl create mode 100755 egs/chime_wsj0/s5/utils/split_data.sh create mode 100755 egs/chime_wsj0/s5/utils/split_scp.pl create mode 100755 egs/chime_wsj0/s5/utils/subset_data_dir.sh create mode 100755 egs/chime_wsj0/s5/utils/subset_data_dir_tr_cv.sh create mode 100755 egs/chime_wsj0/s5/utils/subset_scp.pl create mode 100755 egs/chime_wsj0/s5/utils/summarize_warnings.pl create mode 100755 egs/chime_wsj0/s5/utils/sym2int.pl create mode 100755 egs/chime_wsj0/s5/utils/utt2spk_to_spk2utt.pl create mode 100755 egs/chime_wsj0/s5/utils/validate_data_dir.sh create mode 100755 egs/chime_wsj0/s5/utils/validate_dict_dir.pl create mode 100755 egs/chime_wsj0/s5/utils/validate_lang.pl create mode 100755 egs/chime_wsj0/s5/utils/write_kwslist.pl diff --git a/egs/chime_wsj0/s5/cmd.sh b/egs/chime_wsj0/s5/cmd.sh new file mode 100644 index 000000000..072aa3819 --- /dev/null +++ b/egs/chime_wsj0/s5/cmd.sh @@ -0,0 +1,29 @@ +# "queue.pl" uses qsub. The options to it are +# options to qsub. If you have GridEngine installed, +# change this to a queue you have access to. +# Otherwise, use "run.pl", which will run jobs locally +# (make sure your --num-jobs options are no more than +# the number of cpus on your machine. + +#a) JHU cluster options +export train_cmd="queue.pl -l arch=*64" +export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" +export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" + +#export cuda_cmd="..." + + +#b) BUT cluster options +#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" +#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" +#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" + +#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" +#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" +#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" + +#c) run it locally... 
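For readers new to Kaldi's job-dispatch wrappers, a brief sketch (not taken from this patch; the log path and the echo command are invented for illustration) of how the exported *_cmd variables above are consumed by the steps/ and local/ scripts. run.pl and queue.pl share this calling convention, so option (c), whose commented-out exports follow just below, only swaps the wrapper:

  # Launches 4 parallel jobs; the wrapper substitutes each job's number for JOB
  # in the log path and in the command, and waits for all jobs to finish.
  $train_cmd JOB=1:4 exp/mono/log/demo.JOB.log \
    echo "running parallel job JOB"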
+#export train_cmd=run.pl +#export decode_cmd=run.pl +export cuda_cmd=run.pl +#export mkgraph_cmd=run.pl diff --git a/egs/chime_wsj0/s5/conf/decode_dnn.config b/egs/chime_wsj0/s5/conf/decode_dnn.config new file mode 100644 index 000000000..bfaae8670 --- /dev/null +++ b/egs/chime_wsj0/s5/conf/decode_dnn.config @@ -0,0 +1,2 @@ +beam=18.0 # beam for decoding. Was 13.0 in the scripts. +latbeam=10.0 # this has most effect on size of the lattices. diff --git a/egs/chime_wsj0/s5/conf/fbank.conf b/egs/chime_wsj0/s5/conf/fbank.conf new file mode 100644 index 000000000..5fc7774b3 --- /dev/null +++ b/egs/chime_wsj0/s5/conf/fbank.conf @@ -0,0 +1,11 @@ +# Filterbank feature options. +--window-type=hamming # disable Dan's window, use the standard Hamming window +--use-energy=false # only fbank outputs +--sample-frequency=16000 # WSJ data is sampled at 16kHz + +--low-freq=64 # typical setup from Frantisek Grezl +--high-freq=8000 +--dither=1 + +--num-mel-bins=40 # 40 Mel bins for 16kHz data +--htk-compat=true # try to make it compatible with HTK diff --git a/egs/chime_wsj0/s5/conf/mfcc.conf b/egs/chime_wsj0/s5/conf/mfcc.conf new file mode 100644 index 000000000..736150909 --- /dev/null +++ b/egs/chime_wsj0/s5/conf/mfcc.conf @@ -0,0 +1 @@ +--use-energy=false # only non-default option. diff --git a/egs/chime_wsj0/s5/local/binmask_wsj0_data_prep.sh b/egs/chime_wsj0/s5/local/binmask_wsj0_data_prep.sh new file mode 100755 index 000000000..1914b3695 --- /dev/null +++ b/egs/chime_wsj0/s5/local/binmask_wsj0_data_prep.sh @@ -0,0 +1,117 @@ +#!/bin/bash +set -e + +# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. + +# This is modified from the script in standard Kaldi recipe to account +# for the way the WSJ data is structured on the Edinburgh systems. +# - Arnab Ghoshal, 29/05/12 + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s <corpus-directory>\n\n" `basename $0` + echo "The argument should be the top-level WSJ corpus directory." + echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory" + echo "within the top-level corpus directory." + exit 1; +fi + +CORPUS=$1 + +dir=`pwd`/data/local/data +lmdir=`pwd`/data/local/nist_lm +mkdir -p $dir $lmdir +local=`pwd`/local +utils=`pwd`/utils + +. ./path.sh # Needed for KALDI_ROOT +export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +cd $dir + +# binary-mask (binmask) list for SI-84 + +find $1/si_tr_s -name '*.wav' | sort -u > train_si84_binmask.flist + + + +# Dev-set Hub 1,2 (503, 913 utterances) + +# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt. +# Sometimes this gets copied from the CD's with upcasing, don't know +# why (could be older versions of the disks). +find $1/si_dt_05 -name '*.wav' | sort -u > dev_dt_05_binmask.flist + +find $1/si_et_05 -name '*.wav' | sort -u > test_eval92_5k_binmask.flist + + +# Finding the transcript files: +#find -L $CORPUS -iname '*.dot' > dot_files.flist +if [ !
-e $dir/dot_files.flist ]; then + echo "Could not find $dir/dot_files.flist files, first run clean_wsj0_data_prep.sh"; + exit 1; +fi + +# Convert the transcripts into our format (no normalization yet), +# adding a suffix to each utt_id: 8/9/a/b/c/d encodes the SNR +# condition (9dB/6dB/3dB/0dB/m3dB/m6dB) of the utterance. +for x in train_si84_binmask dev_dt_05_binmask test_eval92_5k_binmask; do + cat $x.flist | perl -e ' + while(<>) { + m:^\S+/(\w+)\.wav$: || die "Bad line $_"; + $id = $1; + $id =~ tr/A-Z/a-z/; + print "$id $_"; + } + ' | sort > ${x}_wav_tmp.scp + #cat ${x}_wav_tmp.scp | awk '{print $1}' \ + # | $local/find_transcripts.pl dot_files.flist > ${x}_tmp.trans1 + cat ${x}_wav_tmp.scp | perl -e ' + while(<STDIN>) { + @A=split(" ", $_); + @B=split("/", $_); + $abs_path_len=@B; + $condition=$B[$abs_path_len-3]; + if ($condition eq "9dB") {$key_suffix=8;} + elsif ($condition eq "6dB") {$key_suffix=9;} + elsif ($condition eq "3dB") {$key_suffix=a;} + elsif ($condition eq "0dB") {$key_suffix=b;} + elsif ($condition eq "m3dB") {$key_suffix=c;} + elsif ($condition eq "m6dB") {$key_suffix=d;} + else {print STDERR "error condition $condition";} + print $A[0].$key_suffix." ".$A[1]."\n"; + } + ' | sort -k1 > ${x}_wav.scp + cat ${x}_wav.scp | awk '{print $1}' \ + | $local/find_noisy_transcripts.pl dot_files.flist > ${x}.trans1 +done + + +# Do some basic normalization steps. At this point we don't remove OOVs-- +# that will be done inside the training scripts, as we'd like to make the +# data-preparation stage independent of the specific lexicon used. +noiseword="<NOISE>"; +for x in train_si84_binmask dev_dt_05_binmask test_eval92_5k_binmask; do + cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ + | sort > $x.txt || exit 1; +done + +# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.) +#for x in train_si84_clean test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean; do +# awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp \ +# > ${x}_wav.scp +#done + +# Make the utt2spk and spk2utt files. +for x in train_si84_binmask dev_dt_05_binmask test_eval92_5k_binmask; do + cat ${x}_wav.scp | awk '{print $1}' \ + | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk + cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; +done + +echo "Data preparation succeeded" diff --git a/egs/chime_wsj0/s5/local/chime_format_data.sh b/egs/chime_wsj0/s5/local/chime_format_data.sh new file mode 100755 index 000000000..47bec0b04 --- /dev/null +++ b/egs/chime_wsj0/s5/local/chime_format_data.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# This script takes data prepared in a corpus-dependent way +# in data/local/, and converts it into the "canonical" form, +# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug, +# data/train_si284, data/train_si84, etc. + +# Don't bother doing train_si84 separately (although we have the file lists +# in data/local/) because it's just the first 7138 utterances in train_si284. +# We'll create train_si84 after doing the feature extraction. + +. 
./path.sh || exit 1; + +echo "Preparing train and test data" +srcdir=data/local/data +lmdir=data/local/nist_lm +tmpdir=data/local/lm_tmp +lexicon=data/local/lang_tmp/lexiconp.txt +mkdir -p $tmpdir + +for x in test_eval92_clean test_eval92_noisy test_eval92_5k_clean test_eval92_5k_noisy dev_dt_05_clean dev_dt_05_reverb dev_dt_05_noisy dev_dt_20_clean dev_dt_20_reverb dev_dt_20_noisy train_si84_clean train_si84_reverb train_si84_noisy; do + mkdir -p data/$x + cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; + cp $srcdir/$x.txt data/$x/text || exit 1; + cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1; + cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1; + utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1; +done + + +# Next, for each type of language model, create the corresponding FST +# and the corresponding lang_test_* directory. + +echo Preparing language models for test + +for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do + test=data/lang_test_${lm_suffix} + mkdir -p $test + for f in phones.txt words.txt L.fst L_disambig.fst \ + phones/; do + cp -r data/lang/$f $test + done + gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ + utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt + + # grep -v '<s> <s>' because the LM seems to have some strange and useless + # stuff in it with multiple <s>'s in the history. Encountered some other similar + # things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>, + # which are supposed to occur only at begin/end of utt. These can cause + # determinization failures of CLG [ends up being epsilon cycles]. + gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ + grep -v '<s> <s>' | \ + grep -v '</s> <s>' | \ + grep -v '</s> </s>' | \ + arpa2fst - | fstprint | \ + utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \ + utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ + --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ + fstrmepsilon > $test/G.fst + fstisstochastic $test/G.fst + # The output is like: + # 9.14233e-05 -0.259833 + # we do expect the first of these 2 numbers to be close to zero (the second is + # nonzero because the backoff weights make the states sum to >1). + # Because of the <s> fiasco for these particular LMs, the first number is not + # as close to zero as it could be. + + # Everything below is only for diagnostic purposes. + # Checking that G has no cycles with empty words on them (e.g. <s>, </s>); + # this might cause determinization failure of CLG. + # #0 is treated as an empty word. + mkdir -p $tmpdir/g + awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \ + < "$lexicon" >$tmpdir/g/select_empty.fst.txt + fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ + fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst + fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && + echo "Language model has cycles with empty words" && exit 1 + rm -r $tmpdir/g +done + +echo "Succeeded in formatting data." +rm -r $tmpdir diff --git a/egs/chime_wsj0/s5/local/clean_wsj0_data_prep.sh b/egs/chime_wsj0/s5/local/clean_wsj0_data_prep.sh new file mode 100755 index 000000000..45226cc00 --- /dev/null +++ b/egs/chime_wsj0/s5/local/clean_wsj0_data_prep.sh @@ -0,0 +1,190 @@ +#!/bin/bash +set -e + +# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0.
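A worked illustration of the LM filtering in chime_format_data.sh above (the probabilities are made up for the example; the layout is standard ARPA): the three grep -v filters drop n-gram entries whose history contains sentence-boundary tokens in positions where they cannot legally occur, e.g. lines such as

  -0.8409 <s> <s> THE   -0.3010
  -1.2041 </s> <s> A

Since utils/s2eps.pl later maps <s> and </s> to epsilon, entries like these would become epsilon cycles in G.fst, which is what the comment about determinization failures of CLG refers to.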
+ +# This is modified from the script in standard Kaldi recipe to account +# for the way the WSJ data is structured on the Edinburgh systems. +# - Arnab Ghoshal, 29/05/12 + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "The argument should be a the top-level WSJ corpus directory." + echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory" + echo "within the top-level corpus directory." + exit 1; +fi + +CORPUS=$1 + +dir=`pwd`/data/local/data +lmdir=`pwd`/data/local/nist_lm +mkdir -p $dir $lmdir +local=`pwd`/local +utils=`pwd`/utils + +. ./path.sh # Needed for KALDI_ROOT +export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +cd $dir + +# This version for SI-84 +cat $CORPUS/wsj0/doc/indices/train/tr_s_wv1.ndx \ + | $local/cstr_ndx2flist.pl $CORPUS | sort -u > train_si84_clean.flist + +# This version for SI-284 +#cat $CORPUS/wsj1/doc/indices/si_tr_s.ndx \ +# $CORPUS/wsj0/doc/indices/train/tr_s_wv1.ndx \ +# | $local/cstr_ndx2flist.pl $CORPUS | sort \ +# | grep -v wsj0/si_tr_s/401 > train_si284.flist + +# Now for the test sets. +# $CORPUS/wsj1/doc/indices/readme.doc +# describes all the different test sets. +# Note: each test-set seems to come in multiple versions depending +# on different vocabulary sizes, verbalized vs. non-verbalized +# pronunciations, etc. We use the largest vocab and non-verbalized +# pronunciations. +# The most normal one seems to be the "baseline 60k test set", which +# is h1_p0. + +# Nov'92 (333 utts) +# These index files have a slightly different format; +# have to add .wv1, which is done in cstr_ndx2flist.pl +cat $CORPUS/wsj0/doc/indices/test/nvp/si_et_20.ndx | \ + $local/cstr_ndx2flist.pl $CORPUS | sort > test_eval92_clean.flist + +# Nov'92 (330 utts, 5k vocab) +cat $CORPUS/wsj0/doc/indices/test/nvp/si_et_05.ndx | \ + $local/cstr_ndx2flist.pl $CORPUS | sort > test_eval92_5k_clean.flist + +# Nov'93: (213 utts) +# Have to replace a wrong disk-id. +#cat $CORPUS/wsj1/doc/indices/wsj1/eval/h1_p0.ndx | \ +# $local/cstr_ndx2flist.pl $CORPUS | sort > test_eval93.flist + +# Nov'93: (215 utts, 5k) +#cat $CORPUS/wsj1/doc/indices/wsj1/eval/h2_p0.ndx | \ +# $local/cstr_ndx2flist.pl $CORPUS | sort > test_eval93_5k.flist + +# Dev-set for Nov'93 (503 utts) +#cat $CORPUS/wsj1/doc/indices/h1_p0.ndx | \ +# $local/cstr_ndx2flist.pl $CORPUS | sort > test_dev93.flist + +# Dev-set for Nov'93 (513 utts, 5k vocab) +#cat $CORPUS/wsj1/doc/indices/h2_p0.ndx | \ +# $local/cstr_ndx2flist.pl $CORPUS | sort > test_dev93_5k.flist + + +# Dev-set Hub 1,2 (503, 913 utterances) + +# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt. +# Sometimes this gets copied from the CD's with upcasing, don't know +# why (could be older versions of the disks). 
+find $CORPUS/wsj0/si_dt_20 -print | grep -i ".wv1" | sort > dev_dt_20_clean.flist +find $CORPUS/wsj0/si_dt_05 -print | grep -i ".wv1" | sort > dev_dt_05_clean.flist + + +# Finding the transcript files: +find -L $CORPUS -iname '*.dot' > dot_files.flist + +# Convert the transcripts into our format (no normalization yet) +# adding suffix to utt_id +# 0 for clean condition +for x in train_si84_clean test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean; do + $local/flist2scp.pl $x.flist | sort > ${x}_sph_tmp.scp + cat ${x}_sph_tmp.scp | awk '{print $1}' \ + | $local/find_transcripts.pl dot_files.flist > ${x}_tmp.trans1 + cat ${x}_sph_tmp.scp | awk '{printf("%s0 %s\n", $1, $2);}' > ${x}_sph.scp + cat ${x}_tmp.trans1 | awk '{printf("%s0 ", $1); for(i=2;i<=NF;i++) printf("%s ", $i); printf("\n");}' > ${x}.trans1 +done + + + + +# Do some basic normalization steps. At this point we don't remove OOVs-- +# that will be done inside the training scripts, as we'd like to make the +# data-preparation stage independent of the specific lexicon used. +noiseword="<NOISE>"; +for x in train_si84_clean test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean; do + cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ + | sort > $x.txt || exit 1; +done + +# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.) +for x in train_si84_clean test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean; do + awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp \ + > ${x}_wav.scp +done + +# Make the utt2spk and spk2utt files. +for x in train_si84_clean test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean; do + cat ${x}_sph.scp | awk '{print $1}' \ + | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk + cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; +done + +# in case we want to limit lm's on most frequent words, copy lm training word frequency list +cp $CORPUS/wsj0/doc/lng_modl/vocab/wfl_64.lst $lmdir +chmod u+w $lmdir/*.lst # had weird permissions on source. + +# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without +# verbalized pronunciations. This is the most common test setup, I understand. + +cp $CORPUS/wsj0/doc/lng_modl/base_lm/bcb20onp.z $lmdir/lm_bg.arpa.gz || exit 1; +chmod u+w $lmdir/lm_bg.arpa.gz + +# trigram would be: +cat $CORPUS/wsj0/doc/lng_modl/base_lm/tcb20onp.z | \ + perl -e 'while(<>){ if(m/^\\data\\/){ print; last; } } while(<>){ print; }' \ + | gzip -c -f > $lmdir/lm_tg.arpa.gz || exit 1; + +prune-lm --threshold=1e-7 $lmdir/lm_tg.arpa.gz $lmdir/lm_tgpr.arpa || exit 1; +gzip -f $lmdir/lm_tgpr.arpa || exit 1; + +# repeat for 5k language models +cp $CORPUS/wsj0/doc/lng_modl/base_lm/bcb05onp.z $lmdir/lm_bg_5k.arpa.gz || exit 1; +chmod u+w $lmdir/lm_bg_5k.arpa.gz + +# trigram would be: !only closed vocabulary here! +cp $CORPUS/wsj0/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1; +chmod u+w $lmdir/lm_tg_5k.arpa.gz +gunzip $lmdir/lm_tg_5k.arpa.gz +tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz +rm $lmdir/lm_tg_5k.arpa + +prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1; +gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1; + + +if [ !
-f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then + rm -f wsj0-train-spkrinfo.txt + wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt \ + || ( echo "Getting wsj0-train-spkrinfo.txt from backup location" && \ + wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt ); +fi + +if [ ! -f wsj0-train-spkrinfo.txt ]; then + echo "Could not get the spkrinfo.txt file from LDC website (moved)?" + echo "This is possibly omitted from the training disks; couldn't find it." + echo "Everything else may have worked; we just may be missing gender info" + echo "which is only needed for VTLN-related diagnostics anyway." + exit 1 +fi +# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the +# LDC put it on the web. Perhaps it was accidentally omitted from the +# disks. + +cat $CORPUS/wsj0/doc/spkrinfo.txt \ + ./wsj0-train-spkrinfo.txt | \ + perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \ + awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender + + +echo "Data preparation succeeded" diff --git a/egs/chime_wsj0/s5/local/copy_clean_ali.sh b/egs/chime_wsj0/s5/local/copy_clean_ali.sh new file mode 100755 index 000000000..367a57b48 --- /dev/null +++ b/egs/chime_wsj0/s5/local/copy_clean_ali.sh @@ -0,0 +1,13 @@ +#!/bin/bash +. path.sh +data=$1 +old_ali_dir=$2 +mix_ali_dir=$3 +mkdir -p $mix_ali_dir + +cp $old_ali_dir/{final.mdl,num_jobs,tree} $mix_ali_dir/ + +gunzip -c $old_ali_dir/ali.*.gz | gzip -c > $old_ali_dir/ali.gz + +feats="ark,s,cs:copy-feats scp:$data/feats.scp ark:- |" +copy-clean-ali "$feats" "ark:gunzip -c $old_ali_dir/ali.gz |" "ark:| gzip -c > $mix_ali_dir/ali.1.gz" diff --git a/egs/chime_wsj0/s5/local/cstr_ndx2flist.pl b/egs/chime_wsj0/s5/local/cstr_ndx2flist.pl new file mode 100755 index 000000000..101834e86 --- /dev/null +++ b/egs/chime_wsj0/s5/local/cstr_ndx2flist.pl @@ -0,0 +1,54 @@ +#!/usr/bin/perl + +# Copyright 2010-2011 Microsoft Corporation + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This is modified from the script in standard Kaldi recipe to account +# for the way the WSJ data is structured on the Edinburgh systems. +# - Arnab Ghoshal, 12/1/12 + +# This program takes as its standard input an .ndx file from the WSJ corpus that looks +# like this: +#;; File: tr_s_wv1.ndx, updated 04/26/94 +#;; +#;; Index for WSJ0 SI-short Sennheiser training data +#;; Data is read WSJ sentences, Sennheiser mic. +#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts +#;; per speaker TI) = 7236 utts +#;; +#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1 +#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1 +#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1 + +# and as command-line argument it takes the names of the WSJ disk locations, e.g.: +# /group/corpora/public/wsjcam0/data on DICE machines. +# It outputs a list of absolute pathnames. + +$wsj_dir = $ARGV[0]; + +while(<STDIN>){ + if(m/^;/){ next; } # Comment. Ignore it.
+ else { + m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_"; + $filename = $2; # as a subdirectory of the distributed disk. + if ($filename !~ m/\.wv1$/) { $filename .= ".wv1"; } + $filename = "$wsj_dir/$filename"; + if (-e $filename) { + print "$filename\n"; + } else { + print STDERR "File $filename found in the index but not on disk\n"; + } + } +} diff --git a/egs/chime_wsj0/s5/local/cstr_wsj_data_prep.sh b/egs/chime_wsj0/s5/local/cstr_wsj_data_prep.sh new file mode 100755 index 000000000..3a447cdc2 --- /dev/null +++ b/egs/chime_wsj0/s5/local/cstr_wsj_data_prep.sh @@ -0,0 +1,187 @@ +#!/bin/bash +set -e + +# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. + +# This is modified from the script in standard Kaldi recipe to account +# for the way the WSJ data is structured on the Edinburgh systems. +# - Arnab Ghoshal, 29/05/12 + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "The argument should be a the top-level WSJ corpus directory." + echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory" + echo "within the top-level corpus directory." + exit 1; +fi + +CORPUS=$1 + +dir=`pwd`/data/local/data +lmdir=`pwd`/data/local/nist_lm +mkdir -p $dir $lmdir +local=`pwd`/local +utils=`pwd`/utils + +. ./path.sh # Needed for KALDI_ROOT +export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +cd $dir + +# This version for SI-84 +cat $CORPUS/wsj0/doc/indices/train/tr_s_wv1.ndx \ + | $local/cstr_ndx2flist.pl $CORPUS | sort \ + | grep -v wsj0/si_tr_s/401 > train_si84.flist + +# This version for SI-284 +cat $CORPUS/wsj1/doc/indices/si_tr_s.ndx \ + $CORPUS/wsj0/doc/indices/train/tr_s_wv1.ndx \ + | $local/cstr_ndx2flist.pl $CORPUS | sort \ + | grep -v wsj0/si_tr_s/401 > train_si284.flist + +# Now for the test sets. +# $CORPUS/wsj1/doc/indices/readme.doc +# describes all the different test sets. +# Note: each test-set seems to come in multiple versions depending +# on different vocabulary sizes, verbalized vs. non-verbalized +# pronunciations, etc. We use the largest vocab and non-verbalized +# pronunciations. +# The most normal one seems to be the "baseline 60k test set", which +# is h1_p0. + +# Nov'92 (333 utts) +# These index files have a slightly different format; +# have to add .wv1, which is done in cstr_ndx2flist.pl +cat $CORPUS/wsj0/doc/indices/test/nvp/si_et_20.ndx | \ + $local/cstr_ndx2flist.pl $CORPUS | sort > test_eval92.flist + +# Nov'92 (330 utts, 5k vocab) +cat $CORPUS/wsj0/doc/indices/test/nvp/si_et_05.ndx | \ + $local/cstr_ndx2flist.pl $CORPUS | sort > test_eval92_5k.flist + +# Nov'93: (213 utts) +# Have to replace a wrong disk-id. +cat $CORPUS/wsj1/doc/indices/wsj1/eval/h1_p0.ndx | \ + $local/cstr_ndx2flist.pl $CORPUS | sort > test_eval93.flist + +# Nov'93: (215 utts, 5k) +cat $CORPUS/wsj1/doc/indices/wsj1/eval/h2_p0.ndx | \ + $local/cstr_ndx2flist.pl $CORPUS | sort > test_eval93_5k.flist + +# Dev-set for Nov'93 (503 utts) +cat $CORPUS/wsj1/doc/indices/h1_p0.ndx | \ + $local/cstr_ndx2flist.pl $CORPUS | sort > test_dev93.flist + +# Dev-set for Nov'93 (513 utts, 5k vocab) +cat $CORPUS/wsj1/doc/indices/h2_p0.ndx | \ + $local/cstr_ndx2flist.pl $CORPUS | sort > test_dev93_5k.flist + + +# Dev-set Hub 1,2 (503, 913 utterances) + +# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt. 
+# Sometimes this gets copied from the CD's with upcasing, don't know +# why (could be older versions of the disks). +find $CORPUS/???1/??_??_20 -print | grep -i ".wv1" | sort > dev_dt_20.flist +find $CORPUS/???1/??_??_05 -print | grep -i ".wv1" | sort > dev_dt_05.flist + + +# Finding the transcript files: +find -L $CORPUS -iname '*.dot' > dot_files.flist + +# Convert the transcripts into our format (no normalization yet) +for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do + $local/flist2scp.pl $x.flist | sort > ${x}_sph.scp + cat ${x}_sph.scp | awk '{print $1}' \ + | $local/find_transcripts.pl dot_files.flist > $x.trans1 +done + +# Do some basic normalization steps. At this point we don't remove OOVs-- +# that will be done inside the training scripts, as we'd like to make the +# data-preparation stage independent of the specific lexicon used. +noiseword="<NOISE>"; +for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do + cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ + | sort > $x.txt || exit 1; +done + +# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.) +for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do + awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp \ + > ${x}_wav.scp +done + +# Make the utt2spk and spk2utt files. +for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do + cat ${x}_sph.scp | awk '{print $1}' \ + | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk + cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; +done + +# in case we want to limit lm's on most frequent words, copy lm training word frequency list +cp $CORPUS/wsj1/doc/lng_modl/vocab/wfl_64.lst $lmdir +chmod u+w $lmdir/*.lst # had weird permissions on source. + +# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without +# verbalized pronunciations. This is the most common test setup, I understand. + +cp $CORPUS/wsj1/doc/lng_modl/base_lm/bcb20onp.z $lmdir/lm_bg.arpa.gz || exit 1; +chmod u+w $lmdir/lm_bg.arpa.gz + +# trigram would be: +cat $CORPUS/wsj1/doc/lng_modl/base_lm/tcb20onp.z | \ + perl -e 'while(<>){ if(m/^\\data\\/){ print; last; } } while(<>){ print; }' \ + | gzip -c -f > $lmdir/lm_tg.arpa.gz || exit 1; + +prune-lm --threshold=1e-7 $lmdir/lm_tg.arpa.gz $lmdir/lm_tgpr.arpa || exit 1; +gzip -f $lmdir/lm_tgpr.arpa || exit 1; + +# repeat for 5k language models +cp $CORPUS/wsj1/doc/lng_modl/base_lm/bcb05onp.z $lmdir/lm_bg_5k.arpa.gz || exit 1; +chmod u+w $lmdir/lm_bg_5k.arpa.gz + +# trigram would be: !only closed vocabulary here! +cp $CORPUS/wsj1/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1; +chmod u+w $lmdir/lm_tg_5k.arpa.gz +gunzip $lmdir/lm_tg_5k.arpa.gz +tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz +rm $lmdir/lm_tg_5k.arpa + +prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1; +gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1; + + +if [ !
-f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then + rm -f wsj0-train-spkrinfo.txt + wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt \ + || ( echo "Getting wsj0-train-spkrinfo.txt from backup location" && \ + wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt ); +fi + +if [ ! -f wsj0-train-spkrinfo.txt ]; then + echo "Could not get the spkrinfo.txt file from LDC website (moved)?" + echo "This is possibly omitted from the training disks; couldn't find it." + echo "Everything else may have worked; we just may be missing gender info" + echo "which is only needed for VTLN-related diagnostics anyway." + exit 1 +fi +# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the +# LDC put it on the web. Perhaps it was accidentally omitted from the +# disks. + +cat $CORPUS/wsj0/doc/spkrinfo.txt \ + $CORPUS/wsj1/doc/evl_spok/spkrinfo.txt \ + $CORPUS/wsj1/doc/dev_spok/spkrinfo.txt \ + $CORPUS/wsj1/doc/train/spkrinfo.txt \ + ./wsj0-train-spkrinfo.txt | \ + perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \ + awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender + + +echo "Data preparation succeeded" diff --git a/egs/chime_wsj0/s5/local/cstr_wsj_extend_dict.sh b/egs/chime_wsj0/s5/local/cstr_wsj_extend_dict.sh new file mode 100755 index 000000000..b2a9faad7 --- /dev/null +++ b/egs/chime_wsj0/s5/local/cstr_wsj_extend_dict.sh @@ -0,0 +1,172 @@ +#!/bin/bash + +# This script builds a larger word-list and dictionary +# than used for the LMs supplied with the WSJ corpus. +# It uses a couple of strategies to fill-in words in +# the LM training data but not in CMUdict. One is +# to generate special prons for possible acronyms, that +# just consist of the constituent letters. The other +# is designed to handle derivatives of known words +# (e.g. deriving the pron of a plural from the pron of +# the base-word), but in a more general, learned-from-data +# way. +# It makes use of scripts in local/dict/ + +if [ $# -ne 1 ]; then + echo "Usage: local/cstr_wsj_train_lms.sh WSJ1_doc_dir" + exit 1 +fi + +export PATH=$PATH:`pwd`/local/dict/ +srcdir=$1 + +if [ ! -d $srcdir/lng_modl ]; then + echo "Expecting 'lng_modl' under WSJ doc directory '$srcdir'" + exit 1 +fi + +mkdir -p data/local/dict_larger +dir=data/local/dict_larger +cp data/local/dict/* data/local/dict_larger # Various files describing phones etc. + # are there; we just want to copy them as the phoneset is the same. +rm data/local/dict_larger/lexicon.txt # we don't want this. +mincount=2 # Minimum count of an OOV we will try to generate a pron for. + +[ ! -f data/local/dict/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1; + +# Remove comments from cmudict; print first field; remove +# words like FOO(1) which are alternate prons: our dict format won't +# include these markers. +grep -v ';;;' data/local/dict/cmudict/cmudict.0.7a | + perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu + +cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu + +echo "Getting training data [this should take at least a few seconds; if not, there's a problem]" + +# Convert to uppercase, remove XML-like markings. +# For words ending in "." that are not in CMUdict, we assume that these +# are periods that somehow remained in the data during data preparation, +# and we we replace the "." with "\n". Note: we found this by looking at +# oov.counts below (before adding this rule). 
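To make the two strategies described in the cstr_wsj_extend_dict.sh header concrete, a hypothetical illustration (the words, pronunciations and rule are invented for the example and are not taken from the generated files): an OOV that looks like an acronym, say ABC, would receive a letter-by-letter pronunciation assembled from the letter prons in CMUdict, e.g.

  ABC  EY1 B IY1 S IY1

while an OOV derivative such as REPORTS would get a candidate pronunciation by applying a suffix rule (append "S" to the spelling, append the S phone to the pron) learned from word pairs already in the dictionary, applied to a base entry like REPORT  R IH0 P AO1 R T to give

  REPORTS  R IH0 P AO1 R T S

The forward/backward (suffix/prefix) rule finding and scoring that produces such candidates is implemented by the local/dict/*.pl scripts added later in this patch.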
+ +touch $dir/cleaned.gz +if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then + echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]"; +else + gunzip -c $srcdir/lng_modl/lm_train/np_data/{87,88,89}/*.z \ + | awk '/^){ chop; $isword{$_} = 1; } + while() { + @A = split(" ", $_); + for ($n = 0; $n < @A; $n++) { + $a = $A[$n]; + if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "." + # and have no other "." in them: treat as period. + print "$a"; + if ($n+1 < @A) { print "\n"; } + } else { print "$a "; } + } + print "\n"; + } + ' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz +fi + +# get unigram counts +echo "Getting unigram counts" +gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \ + awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams + +cat $dir/unigrams | awk -v dict=$dir/dict.cmu \ + 'BEGIN{while(getline $dir/oov.counts + +echo "Most frequent unseen unigrams are: " +head $dir/oov.counts + +# Prune away singleton counts, and remove things with numbers in +# (which should have been normalized) and with no letters at all. + + +cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \ + | awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist + +# Automatic rule-finding... + +# First make some prons for possible acronyms. +# Note: we don't do this for things like U.K or U.N, +# or A.B. (which doesn't exist anyway), +# as we consider this normalization/spelling errors. + +cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms + +mkdir $dir/f $dir/b # forward, backward directions of rules... + # forward is normal suffix + # rules, backward is reversed (prefix rules). These + # dirs contain stuff we create while making the rule-based + # extensions to the dictionary. + +# Remove ; and , from words, if they are present; these +# might crash our scripts, as they are used as separators there. +filter_dict.pl $dir/dict.cmu > $dir/f/dict +cat $dir/oovlist | filter_dict.pl > $dir/f/oovs +reverse_dict.pl $dir/f/dict > $dir/b/dict +reverse_dict.pl $dir/f/oovs > $dir/b/oovs + +# The next stage takes a few minutes. +# Note: the forward stage takes longer, as English is +# mostly a suffix-based language, and there are more rules +# that it finds. +for d in $dir/f $dir/b; do + ( + cd $d + cat dict | get_rules.pl 2>get_rules.log >rules + get_rule_hierarchy.pl rules >hierarchy + awk '{print $1}' dict | get_candidate_prons.pl rules dict | \ + limit_candidate_prons.pl hierarchy | \ + score_prons.pl dict | \ + count_rules.pl >rule.counts + # the sort command below is just for convenience of reading. + score_rules.pl rules.with_scores + get_candidate_prons.pl rules.with_scores dict oovs | \ + limit_candidate_prons.pl hierarchy > oovs.candidates + ) & +done +wait + +# Merge the candidates. 
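Each line being merged here is a ";"-separated candidate tuple in the format produced by get_candidate_prons.pl (word;pron;base-word;base-pron;rule-name;destress;rule-score), for example (rule score hypothetical):

WASTED;W EY1 S T AH0 D;WASTING;W EY1 S T IH0 NG;STED,STING,D,NG;no;0.9

The candidates under $dir/b were generated on the reversed dictionary, so they are un-reversed by reverse_candidates.pl before merging.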
+reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates +select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \ + > $dir/dict.oovs + +cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged + +awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled +sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled + + +# add_counts.pl attaches to original counts to the list of handled/not-handled OOVs +add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts +add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts + +echo "**Top OOVs we handled are:**"; +head $dir/oovlist.handled.counts +echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**"; +head $dir/oovlist.not_handled.counts + + +echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`" +echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`" +echo "Count of OOVs we didn't handle due to low count is" \ + `awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts` +# The two files created above are for humans to look at, as diagnostics. + +cat < $dir/lexicon.txt +!SIL SIL + SPN + SPN + NSN +EOF + +echo "Created $dir/lexicon.txt" diff --git a/egs/chime_wsj0/s5/local/dict/add_counts.pl b/egs/chime_wsj0/s5/local/dict/add_counts.pl new file mode 100755 index 000000000..409277c72 --- /dev/null +++ b/egs/chime_wsj0/s5/local/dict/add_counts.pl @@ -0,0 +1,31 @@ +#!/usr/bin/perl + + +# Add counts to an oovlist. +# Reads in counts as output by uniq -c, and +# an oovlist, and prints out the counts of the oovlist. + +(@ARGV == 1 || @ARGV == 2) || die "Usage: add_counts.pl count_file [oovlist]\n"; + +$counts = shift @ARGV; + +open(C, "<$counts") || die "Opening counts file $counts"; + +while() { + @A = split(" ", $_); + @A == 2 || die "Bad line in counts file: $_"; + ($count, $word) = @A; + $count =~ m:^\d+$: || die "Bad count $A[0]\n"; + $counts{$word} = $count; +} + +while(<>) { + chop; + $w = $_; + $w =~ m:\S+: || die "Bad word $w"; + defined $counts{$w} || die "Word $w not present in counts file"; + print "\t$counts{$w}\t$w\n"; +} + + + diff --git a/egs/chime_wsj0/s5/local/dict/count_rules.pl b/egs/chime_wsj0/s5/local/dict/count_rules.pl new file mode 100755 index 000000000..2805e98c3 --- /dev/null +++ b/egs/chime_wsj0/s5/local/dict/count_rules.pl @@ -0,0 +1,44 @@ +#!/usr/bin/perl + +# This program takes the output of score_prons.pl and collates +# it for each (rule, destress) pair so that we get the +# counts of right/partial/wrong for each pair. + +# The input is a 7-tuple on each line, like: +# word;pron;base-word;base-pron;rule-name;de-stress;right|partial|wrong +# +# The output format is a 5-tuple like: +# +# rule;destress;right-count;partial-count;wrong-count +# + +if (@ARGV != 0 && @ARGV != 1) { + die "Usage: count_rules.pl < scored_candidate_prons > rule_counts"; +} + + +while(<>) { + chop; + $line = $_; + my ($word, $pron, $baseword, $basepron, $rulename, $destress, $score) = split(";", $line); + + my $key = $rulename . ";" . $destress; + + if (!defined $counts{$key}) { + $counts{$key} = [ 0, 0, 0 ]; # new anonymous array. 
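    # The three slots are the right/partial/wrong counts for this (rule, destress) key;
    # they are incremented below as the scored candidates are read.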
+ } + $ref = $counts{$key}; + if ($score eq "right") { + $$ref[0]++; + } elsif ($score eq "partial") { + $$ref[1]++; + } elsif ($score eq "wrong") { + $$ref[2]++; + } else { + die "Bad score $score\n"; + } +} + +while ( my ($key, $value) = each(%counts)) { + print $key . ";" . join(";", @$value) . "\n"; +} diff --git a/egs/chime_wsj0/s5/local/dict/filter_dict.pl b/egs/chime_wsj0/s5/local/dict/filter_dict.pl new file mode 100755 index 000000000..1210bb5e6 --- /dev/null +++ b/egs/chime_wsj0/s5/local/dict/filter_dict.pl @@ -0,0 +1,19 @@ +#!/usr/bin/perl + + +# This program reads and writes either a dictionary or just a list +# of words, and it removes any words containing ";" or "," as these +# are used in these programs. It will warn about these. +# It will die if the pronunciations have these symbols in. +while(<>) { + chop; + @A = split(" ", $_); + $word = shift @A; + + if ($word =~ m:[;,]:) { + print STDERR "Omitting line $_ since it has one of the banned characters ; or ,\n" ; + } else { + $_ =~ m:[;,]: && die "Phones cannot have ; or , in them."; + print $_ . "\n"; + } +} diff --git a/egs/chime_wsj0/s5/local/dict/find_acronyms.pl b/egs/chime_wsj0/s5/local/dict/find_acronyms.pl new file mode 100755 index 000000000..ed4655afa --- /dev/null +++ b/egs/chime_wsj0/s5/local/dict/find_acronyms.pl @@ -0,0 +1,95 @@ +#!/usr/bin/perl + +# Reads a dictionary, and prints out a list of words that seem to be pronounced +# as acronyms (not including plurals of acronyms, just acronyms). Uses +# the prons of the individual letters (A., B. and so on) to judge this. +# Note: this is somewhat dependent on the convention used in CMUduct, that +# the individual letters are spelled this way (e.g. "A."). + +$max_length = 6; # Max length of words that might be + # acronyms. + +while(<>) { # Read the dict. + chop; + @A = split(" ", $_); + $word = shift @A; + $pron = join(" ", @A); + if ($word =~ m/^([A-Z])\.$/ ) { + chop $word; # Remove trailing "." to get just the letter + $letter = $1; + if (!defined $letter_prons{$letter} ) { + $letter_prons{$letter} = [ ]; # new anonymous array + } + $arrayref = $letter_prons{$letter}; + push @$arrayref, $pron; + } elsif( length($word) <= $max_length ) { + $pronof{$word . "," . $pron} = 1; + $isword{$word} = 1; + #if (!defined $prons{$word} ) { + # $prons{$word} = [ ]; + #} + # push @{$prons{$word}}, $pron; + } +} + +sub get_letter_prons; + +foreach $word (keys %isword) { + my @letter_prons = get_letter_prons($word); + foreach $pron (@letter_prons) { + if (defined $pronof{$word.",".$pron}) { + print "$word $pron\n"; + } + } +} + + +sub get_letter_prons { + @acronym = split("", shift); # The letters in the word. + my @prons = ( "" ); + + while (@acronym > 0) { + $l = shift @acronym; + $n = 1; # num-repeats of letter $l. + while (@acronym > 0 && $acronym[0] eq $l) { + $n++; + shift @acronym; + } + my $arrayref = $letter_prons{$l}; + my @prons_of_block = (); + if ($n == 1) { # Just one repeat. + foreach $lpron ( @$arrayref ) { + push @prons_of_block, $lpron; # typically (always?) just one pron of a letter. + } + } elsif ($n == 2) { # Two repeats. Can be "double a" or "a a" + foreach $lpron ( @$arrayref ) { + push @prons_of_block, "D AH1 B AH0 L " . $lpron; + push @prons_of_block, $lpron . $lpron; + } + } elsif ($n == 3) { # can be "triple a" or "a a a" + foreach $lpron ( @$arrayref ) { + push @prons_of_block, "T R IH1 P AH0 L " . $lpron; + push @prons_of_block, $lpron . $lpron . $lpron; + } + } elsif ($n >= 4) { # let's say it can only be that letter repeated $n times.. 
+ # not sure really. + foreach $lpron ( @$arrayref ) { + $nlpron = ""; + for ($m = 0; $m < $n; $m++) { $nlpron = $nlpron . $lpron; } + push @prons_of_block, $nlpron; + } + } + my @new_prons = (); + foreach $pron (@prons) { + foreach $pron_of_block(@prons_of_block) { + if ($pron eq "") { + push @new_prons, $pron_of_block; + } else { + push @new_prons, $pron . " " . $pron_of_block; + } + } + } + @prons = @new_prons; + } + return @prons; +} diff --git a/egs/chime_wsj0/s5/local/dict/get_acronym_prons.pl b/egs/chime_wsj0/s5/local/dict/get_acronym_prons.pl new file mode 100755 index 000000000..3f9936818 --- /dev/null +++ b/egs/chime_wsj0/s5/local/dict/get_acronym_prons.pl @@ -0,0 +1,123 @@ +#!/usr/bin/perl + +# Reads a dictionary (for prons of letters), and an OOV list, +# and puts out candidate pronunciations of words in that list +# that could plausibly be acronyms. +# We judge that a word can plausibly be an acronym if it is +# a sequence of just letters (no non-letter characters such +# as "'"), or something like U.K., +# and the number of letters is four or less. +# +# If the text were not already pre-normalized, there would +# be other hints such as capitalization. + +# This program appends +# the prons of the individual letters (A., B. and so on) to work out +# the pron of the acronym. +# Note: this is somewhat dependent on the convention used in CMUduct, that +# the individual letters are spelled this way (e.g. "A."). [it seems +# to also have the separated versions. + +if (!(@ARGV == 1 || @ARGV == 2)) { + print "Usage: get_acronym_prons.pl dict [oovlist]"; +} + +$max_length = 4; # Max #letters in an acronym. (Longer + # acronyms tend to have "pseudo-pronunciations", e.g. think about UNICEF. + +$dict = shift @ARGV; +open(D, "<$dict") || die "Opening dictionary $dict"; + +while() { # Read the dict, to get the prons of the letters. + chop; + @A = split(" ", $_); + $word = shift @A; + $pron = join(" ", @A); + if ($word =~ m/^([A-Z])\.$/ ) { + chop $word; # Remove trailing "." to get just the letter + $letter = $1; + if (!defined $letter_prons{$letter} ) { + $letter_prons{$letter} = [ ]; # new anonymous array + } + $arrayref = $letter_prons{$letter}; + push @$arrayref, $pron; + } elsif( length($word) <= $max_length ) { + $pronof{$word . "," . $pron} = 1; + $isword{$word} = 1; + #if (!defined $prons{$word} ) { + # $prons{$word} = [ ]; + #} + # push @{$prons{$word}}, $pron; + } +} + +sub get_letter_prons; + +while(<>) { # Read OOVs. + # For now, just do the simple cases without "." in + # between... things with "." in the OOV list seem to + # be mostly errors. + chop; + $word = $_; + if ($word =~ m/^[A-Z]{1,5}$/) { + foreach $pron ( get_letter_prons($word) ) { # E.g. UNPO + print "$word $pron\n"; + } + } elsif ($word =~ m:^(\w\.){1,4}\w\.?$:) { # E.g. U.K. Make the final "." optional. + $letters = $word; + $letters =~ s:\.::g; + foreach $pron ( get_letter_prons($letters) ) { + print "$word $pron\n"; + } + } +} + +sub get_letter_prons { + @acronym = split("", shift); # The letters in the word. + my @prons = ( "" ); + + while (@acronym > 0) { + $l = shift @acronym; + $n = 1; # num-repeats of letter $l. + while (@acronym > 0 && $acronym[0] eq $l) { + $n++; + shift @acronym; + } + my $arrayref = $letter_prons{$l}; + my @prons_of_block = (); + if ($n == 1) { # Just one repeat. + foreach $lpron ( @$arrayref ) { + push @prons_of_block, $lpron; # typically (always?) just one pron of a letter. + } + } elsif ($n == 2) { # Two repeats. 
Can be "double a" or "a a" + foreach $lpron ( @$arrayref ) { + push @prons_of_block, "D AH1 B AH0 L " . $lpron; + push @prons_of_block, $lpron . " " . $lpron; + } + } elsif ($n == 3) { # can be "triple a" or "a a a" + foreach $lpron ( @$arrayref ) { + push @prons_of_block, "T R IH1 P AH0 L " . $lpron; + push @prons_of_block, "$lpron $lpron $lpron"; + } + } elsif ($n >= 4) { # let's say it can only be that letter repeated $n times.. + # not sure really. + foreach $lpron ( @$arrayref ) { + $nlpron = $lpron; + for ($m = 1; $m < $n; $m++) { $nlpron = $nlpron . " " . $lpron; } + push @prons_of_block, $nlpron; + } + } + my @new_prons = (); + foreach $pron (@prons) { + foreach $pron_of_block(@prons_of_block) { + if ($pron eq "") { + push @new_prons, $pron_of_block; + } else { + push @new_prons, $pron . " " . $pron_of_block; + } + } + } + @prons = @new_prons; + } + return @prons; +} diff --git a/egs/chime_wsj0/s5/local/dict/get_candidate_prons.pl b/egs/chime_wsj0/s5/local/dict/get_candidate_prons.pl new file mode 100755 index 000000000..b13efd203 --- /dev/null +++ b/egs/chime_wsj0/s5/local/dict/get_candidate_prons.pl @@ -0,0 +1,187 @@ +#!/usr/bin/perl + +# This script takes three command-line arguments (typically files, or "-"): +# the suffix rules (as output by get_rules.pl), the rule-hierarchy +# (from get_rule_hierarchy.pl), and the words that we want prons to be +# generated for (one per line). + +# The output consists of candidate generated pronunciations for those words, +# together with information about how we generated those pronunciations. +# This does not do pruning of the candidates using the restriction +# "you can't use a more general rule when a more specific one is applicable". +# That is done by limit_candidate_prons.pl. + +# Each line of the output consists of a 4-tuple, separated by ";", of the +# form: +# word;pron;base-word;base-pron;rule-name;destress[;rule-score] +# [the last field is only present if you supplied rules with score information]. +# where: +# - "word" is the input word that we queried for, e.g. WASTED +# - "pron" is the generated pronunciation, e.g. "W EY1 S T AH0 D" +# - rule-name is a 4-tuple separated by commas that describes the rule, e.g. +# "STED,STING,D,NG", +# - "base-word" is the base-word we're getting the pron from, +# e.g. WASTING +# - "base-pron" is the pron of the base-word, e.g. "W EY1 S T IH0 NG" +# - "destress" is either "yes" or "no" and corresponds to whether we destressed the +# base-word or not [de-stressing just corresponds to just taking any 2's down to 1's, +# although we may extend this in future]... +# - "rule-score" is a numeric score of the rule (this field is only present +# if there was score information in your rules. + + +(@ARGV == 2 || @ARGV == 3) || die "Usage: get_candidate_prons.pl rules base-dict [ words ]"; + +$min_prefix_len = 3; # this should probably match with get_rules.pl + +$rules = shift @ARGV; # Note: rules may be with destress "yes/no" indicators or without... + # if without, it's treated as if both "yes" and "no" are present. +$dict = shift @ARGV; + +open(R, "<$rules") || die "Opening rules file: $rules"; + +sub process_word; + +while() { + chop $_; + my ($rule, $destress, $rule_score) = split(";", $_); # We may have "destress" markings (yes|no), + # and scores, or we may have just rule, in which case + # $destress and $rule_score will be undefined. + + my @R = split(",", $rule, 4); # "my" means new instance of @R each + # time we do this loop -> important because we'll be creating + # a reference to @R below. 
+ # Note: the last arg to SPLIT tells it how many fields max to get. + # This stops it from omitting empty trailing fields. + @R == 4 || die "Bad rule $_"; + $suffix = $R[0]; # Suffix of word we want pron for. + if (!defined $isrule{$rule}) { + $isrule{$rule} = 1; # make sure we do this only once for each rule + # (don't repeate for different stresses). + if (!defined $suffix2rule{$suffix}) { + # The syntax [ $x, $y, ... ] means a reference to a newly created array + # containing $x, $y, etc. \@R creates an array reference to R. + # so suffix2rule is a hash from suffix to ref to array of refs to + # 4-dimensional arrays. + $suffix2rule{$suffix} = [ \@R ]; + } else { + # Below, the syntax @{$suffix2rule{$suffix}} dereferences the array + # reference inside the hash; \@R pushes onto that array a new array + # reference pointing to @R. + push @{$suffix2rule{$suffix}}, \@R; + } + } + if (!defined $rule_score) { $rule_score = -1; } # -1 means we don't have the score info. + + # Now store information on which destress markings (yes|no) this rule + # is valid for, and the associated scores (if supplied) + # If just the rule is given (i.e. no destress marking specified), + # assume valid for both. + if (!defined $destress) { # treat as if both "yes" and "no" are valid. + $rule_and_destress_to_rule_score{$rule.";yes"} = $rule_score; + $rule_and_destress_to_rule_score{$rule.";no"} = $rule_score; + } else { + $rule_and_destress_to_rule_score{$rule.";".$destress} = $rule_score; + } + +} + +open(D, "<$dict") || die "Opening base dictionary: $dict"; +while() { + @A = split(" ", $_); + $word = shift @A; + $pron = join(" ", @A); + if (!defined $word2prons{$word}) { + $word2prons{$word} = [ $pron ]; # Ref to new anonymous array containing just "pron". + } else { + push @{$word2prons{$word}}, $pron; # Push $pron onto array referred to (@$ref derefs array). + } +} +foreach $word (%word2prons) { + # Set up the hash "prefixcount", which says how many times a char-sequence + # is a prefix (not necessarily a strict prefix) of a word in the dict. + $len = length($word); + for ($l = 0; $l <= $len; $l++) { + $prefixcount{substr($word, 0, $l)}++; + } +} + +open(R, "<$rules") || die "Opening rules file: $rules"; + + +while(<>) { + chop; + m/^\S+$/ || die; + process_word($_); +} + +sub process_word { + my $word = shift @_; + $len = length($word); + # $owncount is used in evaluating whether a particular prefix is a prefix + # of some other word in the dict... if a word itself may be in the dict + # (usually because we're running this on the dict itself), we need to + # correct for this. + if (defined $word2prons{$word}) { $owncount = 1; } else { $owncount = 0; } + + for ($prefix_len = $min_prefix_len; $prefix_len <= $len; $prefix_len++) { + my $prefix = substr($word, 0, $prefix_len); + my $suffix = substr($word, $prefix_len); + if ($prefixcount{$prefix} - $owncount == 0) { + # This prefix is not a prefix of any word in the dict, so no point + # checking the rules below-- none of them can match. + next; + } + $rules_array_ref = $suffix2rule{$suffix}; + if (defined $rules_array_ref) { + foreach $R (@$rules_array_ref) { # @$rules_array_ref dereferences the array. + # $R is a refernce to a 4-dimensional array, whose elements we access with + # $$R[0], etc. + my $base_suffix = $$R[1]; + my $base_word = $prefix . $base_suffix; + my $base_prons_ref = $word2prons{$base_word}; + if (defined $base_prons_ref) { + my $psuffix = $$R[2]; + my $base_psuffix = $$R[3]; + if ($base_psuffix ne "") { + $base_psuffix = " " . 
$base_psuffix; + # Include " ", the space between phones, to prevent + # matching partial phones below. + } + my $base_psuffix_len = length($base_psuffix); + foreach $base_pron (@$base_prons_ref) { # @$base_prons_ref derefs + # that reference to an array. + my $base_pron_prefix_len = length($base_pron) - $base_psuffix_len; + # Note: these lengths are in characters, not phones. + if ($base_pron_prefix_len >= 0 && + substr($base_pron, $base_pron_prefix_len) eq $base_psuffix) { + # The suffix of the base_pron is what it should be. + my $pron_prefix = substr($base_pron, 0, $base_pron_prefix_len); + my $rule = join(",", @$R); # we'll output this.. + my $len = @R; + for ($destress = 0; $destress <= 1; $destress++) { # Two versions + # of each rule: with destressing and without. + # pron is the generated pron. + if ($destress) { $pron_prefix =~ s/2/1/g; } + my $pron; + if ($psuffix ne "") { $pron = $pron_prefix . " " . $psuffix; } + else { $pron = $pron_prefix; } + # Now print out the info about the generated pron. + my $destress_mark = ($destress ? "yes" : "no"); + my $rule_score = $rule_and_destress_to_rule_score{$rule.";".$destress_mark}; + if (defined $rule_score) { # Means that the (rule,destress) combination was + # seen [note: this if-statement may be pointless, as currently we don't + # do any pruning of rules]. + my @output = ($word, $pron, $base_word, $base_pron, $rule, $destress_mark); + if ($rule_score != -1) { push @output, $rule_score; } # If scores were supplied, + # we also output the score info. + print join(";", @output) . "\n"; + } + } + } + } + } + } + } + } +} diff --git a/egs/chime_wsj0/s5/local/dict/get_rule_hierarchy.pl b/egs/chime_wsj0/s5/local/dict/get_rule_hierarchy.pl new file mode 100755 index 000000000..35805b46b --- /dev/null +++ b/egs/chime_wsj0/s5/local/dict/get_rule_hierarchy.pl @@ -0,0 +1,73 @@ +#!/usr/bin/perl + +#This reads in rules, of the form put out by get_rules.pl, e.g.: +# ERT,,ER0 T, +# MENT,ING,M AH0 N T,IH0 NG +# S,TON,Z,T AH0 N +# ,ER,IH0 NG,IH0 NG ER0 +# ,'S,M AH0 N,M AH0 N Z +#TIONS,TIVE,SH AH0 N Z,T IH0 V + +# and it works out a hierarchy that says which rules are sub-cases +# of which rules: it outputs on each line a pair separated by ";", where +# each member of the pair is a rule, first one is the specialization, the +# second one being more general. +# E.g.: +# RED,RE,D,/ED,E,D, +# RED,RE,D,/D,,D, +# GING,GE,IH0 NG,/ING,I,IH0 NG, +# TOR,TING,T ER0,T IH0 NG/OR,OR,T ER0,T ER0 +# ERED,ER,D,/RED,R,D, +# ERED,ER,D,/ED,,D, + + + + +while(<>) { + chop; + $rule = $_; + $isrule{$rule} = 1; + push @rules, $rule; +} + +foreach my $rule (@rules) { + # Truncate the letters and phones in the rule, while we + # can, to get more general rules; if the more general rule + # exists, put out the pair. 
+ @A = split(",", $rule); + @suffixa = split("", $A[0]); + @suffixb = split("", $A[1]); + @psuffixa = split(" ", $A[2]); + @psuffixb = split(" ", $A[3]); + for ($common_suffix_len = 0; $common_suffix_len < @suffixa && $common_suffix_len < @suffixb;) { + if ($suffixa[$common_suffix_len] eq $suffixb[$common_suffix_len]) { + $common_suffix_len++; + } else { + last; + } + } + for ($common_psuffix_len = 0; $common_psuffix_len < @psuffixa && $common_psuffix_len < @psuffixb;) { + if ($psuffixa[$common_psuffix_len] eq $psuffixb[$common_psuffix_len]) { + $common_psuffix_len++; + } else { + last; + } + } + # Get all combinations of pairs of integers <= (common_suffix_len, common_psuffix_len), + # except (0,0), and print out this rule together with the corresponding rule (if it exists). + for ($m = 0; $m <= $common_suffix_len; $m++) { + $sa = join("", @suffixa[$m...$#suffixa]); # @x[a..b] is array slice notation. + $sb = join("", @suffixb[$m...$#suffixb]); + for ($n = 0; $n <= $common_psuffix_len; $n++) { + if (!($m == 0 && $n == 0)) { + $psa = join(" ", @psuffixa[$n...$#psuffixa]); + $psb = join(" ", @psuffixb[$n...$#psuffixb]); + $more_general_rule = join(",", ($sa, $sb, $psa, $psb)); + if (defined $isrule{$more_general_rule}) { + print $rule . ";" . $more_general_rule . "\n"; + } + } + } + } +} + diff --git a/egs/chime_wsj0/s5/local/dict/get_rules.pl b/egs/chime_wsj0/s5/local/dict/get_rules.pl new file mode 100755 index 000000000..a5b57b088 --- /dev/null +++ b/egs/chime_wsj0/s5/local/dict/get_rules.pl @@ -0,0 +1,204 @@ +#!/usr/bin/perl + +# This program creates suggested suffix rules from a dictionary. +# It outputs quadruples of the form: +# suffix,base-suffix,psuffix,base-psuffix +# where "suffix" is the suffix of the letters of a word, "base-suffix" is +# the suffix of the letters of the base-word, "psuffix" is the suffix of the +# pronunciation of the word (a space-separated list of phonemes), and +# "base-psuffix" is the suffix of the pronunciation of the baseword. +# As far as this program is concerned, there is no distinction between +# "word" and "base-word". To simplify things slightly, what it does +# is return all tuples (a,b,c,d) [with a != b] such that there are +# at least $min_suffix_count instances in the dictionary of +# a (word-prefix, pron-prefix) pair where there exists (word,pron) +# pairs of the form +# ( word-prefix . a, pron-prefix . c) +# and +# ( word-prefix . b, pron-prefix . d) +# For example if (a,b,c,d) equals (USLY,US,S L IY0,S) +# then this quadruple will be output as long as there at least +# e.g. 30 instances of prefixes like (FAM, F EY1 M AH0) +# where there exist (word, pron) pairs like: +# FAMOUS, F EY1 M AH0 S +# FAMOUSLY F EY1 M AH0 S L IY0 +# +# There are some modifications to the picture above, for efficiency. +# If $disallow_empty_suffix != 0, this program will not output 4-tuples where +# the first element (the own-word suffix) is empty, as this would cause +# efficiency problems in get_candidate_prons.pl. If +# $ignore_prefix_stress != 0, this program will ignore stress markings +# while evaluating whether prefixes are the same. +# The minimum count for a quadruple to be output is $min_suffix_count +# (e.g. 30). +# +# The function of this program is not to evaluate the accuracy of these rules; +# it is mostly a pruning step, where we suggest rules that have large enough +# counts to be suitable for our later procedure where we evaluate their +# accuracy in predicting prons. 
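A sketch of the prefix/suffix enumeration the counting below relies on, using the FAMOUS/FAMOUSLY example from the header and $min_prefix_len = 3:

my $min_prefix_len = 3;
my $word = "FAMOUSLY";
for (my $x = $min_prefix_len; $x <= length($word); $x++) {
  printf "%-8s %s\n", substr($word, 0, $x), substr($word, $x);
}
# FAM      OUSLY
# FAMO     USLY
#   ...
# FAMOUSLY          (empty suffix; such splits feed the suffix and suffix-pair counts)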
+ +$disallow_empty_suffix = 1; # Disallow rules where the suffix of the "own-word" is + # empty. This is for efficiency in later stages (e.g. get_candidate_prons.pl). +$min_prefix_len = 3; # this must match with get_candidate_prons.pl +$ignore_prefix_stress = 1; # or 0 to take account of stress in prefix. +$min_suffix_count = 20; + +# Takes in dictionary. + +print STDERR "Reading dict\n"; +while(<>) { + @A = split(" ", $_); + my $word = shift @A; + my $pron = join(" ", @A); + if (!defined $prons{$word}) { + $prons{$word} = $pron; + push @words, $word; + } else { + $prons{$word} = $prons{$word} . ";" . $pron; + } +} + +# Get common suffixes (e.g., count >100). Include empty suffix. + +print STDERR "Getting common suffix counts.\n"; +{ + foreach $word (@words) { + $len = length($word); + for ($x = $min_prefix_len; $x <= $len; $x++) { + $suffix_count{substr($word, $x)}++; + } + } + + foreach $suffix (keys %suffix_count) { + if ($suffix_count{$suffix} >= $min_suffix_count) { + $newsuffix_count{$suffix} = $suffix_count{$suffix}; + } + } + %suffix_count = %newsuffix_count; + undef %newsuffix_count; + + foreach $suffix ( sort { $suffix_count{$b} <=> $suffix_count{$a} } keys %suffix_count ) { + print STDERR "$suffix_count{$suffix} $suffix\n"; + } +} + +print STDERR "Getting common suffix pairs.\n"; + +{ + print STDERR " Getting map from prefix -> suffix-set.\n"; + + # Create map from prefix -> suffix-set. + foreach $word (@words) { + $len = length($word); + for ($x = $min_prefix_len; $x <= $len; $x++) { + $prefix = substr($word, 0, $x); + $suffix = substr($word, $x); + if (defined $suffix_count{$suffix}) { # Suffix is common... + if (!defined $suffixes_of{$prefix}) { + $suffixes_of{$prefix} = [ $suffix ]; # Create a reference to a new array with + # one element. + } else { + push @{$suffixes_of{$prefix}}, $suffix; # Push $suffix onto array that the + # hash member is a reference . + } + } + } + } + my %suffix_set_count; + print STDERR " Getting map from suffix-set -> count.\n"; + while ( my ($key, $value) = each(%suffixes_of) ) { + my @suffixes = sort ( @$value ); + $suffix_set_count{join(";", @suffixes)}++; + } + print STDERR " Getting counts for suffix pairs.\n"; + while ( my ($suffix_set, $count) = each (%suffix_set_count) ) { + my @suffixes = split(";", $suffix_set); + # Consider pairs to be ordered. This is more convenient + # later on. + foreach $suffix_a (@suffixes) { + foreach $suffix_b (@suffixes) { + if ($suffix_a ne $suffix_b) { + $suffix_pair = $suffix_a . "," . $suffix_b; + $suffix_pair_count{$suffix_pair} += $count; + } + } + } + } + + # To save memory, only keep pairs above threshold in the hash. + while ( my ($suffix_pair, $count) = each (%suffix_pair_count) ) { + if ($count >= $min_suffix_count) { + $new_hash{$suffix_pair} = $count; + } + } + %suffix_pair_count = %new_hash; + undef %new_hash; + + # Print out the suffix pairs so the user can see. + foreach $suffix_pair ( + sort { $suffix_pair_count{$b} <=> $suffix_pair_count{$a} } keys %suffix_pair_count ) { + print STDERR "$suffix_pair_count{$suffix_pair} $suffix_pair\n"; + } +} + +print STDERR "Getting common suffix/suffix/psuffix/psuffix quadruples\n"; + +{ + while ( my ($prefix, $suffixes_ref) = each(%suffixes_of) ) { + # Note: suffixes_ref is a reference to an array. We dereference with + # @$suffixes_ref. + # Consider each pair of suffixes (in each order). + foreach my $suffix_a ( @$suffixes_ref ) { + foreach my $suffix_b ( @$suffixes_ref ) { + # could just used "defined" in next line, but this is for clarity. 
+ $suffix_pair = $suffix_a.",".$suffix_b; + if ( $suffix_pair_count{$suffix_pair} >= $min_suffix_count ) { + foreach $pron_a_str (split(";", $prons{$prefix.$suffix_a})) { + @pron_a = split(" ", $pron_a_str); + foreach $pron_b_str (split(";", $prons{$prefix.$suffix_b})) { + @pron_b = split(" ", $pron_b_str); + $len_a = @pron_a; # evaluating array as scalar automatically gives length. + $len_b = @pron_b; + for (my $pos = 0; $pos <= $len_a && $pos <= $len_b; $pos++) { + # $pos is starting-pos of psuffix-pair. + $psuffix_a = join(" ", @pron_a[$pos...$#pron_a]); + $psuffix_b = join(" ", @pron_b[$pos...$#pron_b]); + $quadruple = $suffix_pair . "," . $psuffix_a . "," . $psuffix_b; + $quadruple_count{$quadruple}++; + + my $pron_a_pos = $pron_a[$pos], $pron_b_pos = $pron_b[$pos]; + if ($ignore_prefix_stress) { + $pron_a_pos =~ s/\d//; # e.g convert IH0 to IH. Only affects + $pron_b_pos =~ s/\d//; # whether we exit the loop below. + } + if ($pron_a_pos ne $pron_b_pos) { + # This is important: we don't consider a pron suffix-pair to be + # valid unless the pron prefix is the same. + last; + } + } + } + } + } + } + } + } + # To save memory, only keep pairs above threshold in the hash. + while ( my ($quadruple, $count) = each (%quadruple_count) ) { + if ($count >= $min_suffix_count) { + $new_hash{$quadruple} = $count; + } + } + %quadruple_count = %new_hash; + undef %new_hash; + + # Print out the quadruples for diagnostics. + foreach $quadruple ( + sort { $quadruple_count{$b} <=> $quadruple_count{$a} } keys %quadruple_count ) { + print STDERR "$quadruple_count{$quadruple} $quadruple\n"; + } +} +# Now print out the quadruples; these are the output of this program. +foreach $quadruple (keys %quadruple_count) { + print $quadruple."\n"; +} diff --git a/egs/chime_wsj0/s5/local/dict/limit_candidate_prons.pl b/egs/chime_wsj0/s5/local/dict/limit_candidate_prons.pl new file mode 100755 index 000000000..ceff9fbad --- /dev/null +++ b/egs/chime_wsj0/s5/local/dict/limit_candidate_prons.pl @@ -0,0 +1,103 @@ +#!/usr/bin/perl + +# This program enforces the rule that +# if a "more specific" rule applies, we cannot use the more general rule. +# It takes in tuples generated by get_candidate_prons (one per line, separated +# by ";"), of the form: +# word;pron;base-word;base-pron;rule-name;de-stress[;rule-score] +# [note: we mean that the last element, the numeric score of the rule, is optional] +# and it outputs a (generally shorter) list +# of the same form. + + +# For each word: + # For each (base-word,base-pron): + # Eliminate "more-general" rules as follows: + # For each pair of rules applying to this (base-word, base-pron): + # If pair is in more-general hash, disallow more general one. + # Let the output be: for each (base-word, base-pron, rule): + # for (destress-prefix) in [yes, no], do: + # print out the word input, the rule-name, [destressed:yes|no], and the new pron. + + +if (@ARGV != 1 && @ARGV != 2) { + die "Usage: limit_candidate_prons.pl rule_hierarchy [candidate_prons] > limited_candidate_prons"; +} + +$hierarchy = shift @ARGV; +open(H, "<$hierarchy") || die "Opening rule hierarchy $hierarchy"; + +while() { + chop; + m:.+;.+: || die "Bad rule-hierarchy line $_"; + $hierarchy{$_} = 1; # Format is: if $rule1 is the string form of the more specific rule + # and $rule21 is that string form of the more general rule, then $hierarchy{$rule1.";".$rule2} + # is defined, else undefined. 
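  # e.g. after reading the line "RED,RE,D,;ED,E,D," (one of the example pairs listed in
  # get_rule_hierarchy.pl), $hierarchy{"RED,RE,D,;ED,E,D,"} is set, recording that
  # RED,RE,D, is a more specific version of ED,E,D, .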
+} + + +sub process_word; + +undef $cur_word; +@cur_lines = (); + +while(<>) { + # input, output is: + # word;pron;base-word;base-pron;rule-name;destress;score + chop; + m:^([^;]+);: || die "Unexpected input: $_"; + $word = $1; + if (!defined $cur_word || $word eq $cur_word) { + if (!defined $cur_word) { $cur_word = $word; } + push @cur_lines, $_; + } else { + process_word(@cur_lines); # Process a series of suggested prons + # for a particular word. + $cur_word = $word; + @cur_lines = ( $_ ); + } +} +process_word(@cur_lines); + +sub process_word { + my %pair2rule_list; # hash from $baseword.";".$baseword to ref + # to array of [ line1, line2, ... ]. + my @cur_lines = @_; + foreach my $line (@cur_lines) { + my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line); + my $key = $baseword.";".$basepron; + if (defined $pair2rule_list{$key}) { + push @{$pair2rule_list{$key}}, $line; # @{...} derefs the array pointed to + # by the array ref inside {}. + } else { + $pair2rule_list{$key} = [ $line ]; # [ $x ] is new anonymous array with 1 elem ($x) + } + } + while ( my ($key, $value) = each(%pair2rule_list) ) { + my @lines = @$value; # array of lines that are for this (baseword,basepron). + my @stress, @rules; # Arrays of stress markers and rule names, indexed by + # same index that indexes @lines. + for (my $n = 0; $n < @lines; $n++) { + my $line = $lines[$n]; + my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line); + $stress[$n] = $destress; + $rules[$n] = $rulename; + } + for (my $m = 0; $m < @lines; $m++) { + my $ok = 1; # if stays 1, this line is OK. + for (my $n = 0; $n < @lines; $n++) { + if ($m != $n && $stress[$m] eq $stress[$n]) { + if (defined $hierarchy{$rules[$n].";".$rules[$m]}) { + # Note: this "hierarchy" variable is defined if $rules[$n] is a more + # specific instances of $rules[$m], thus invalidating $rules[$m]. + $ok = 0; + last; # no point iterating further. + } + } + } + if ($ok != 0) { + print $lines[$m] . "\n"; + } + } + } +} diff --git a/egs/chime_wsj0/s5/local/dict/reverse_candidates.pl b/egs/chime_wsj0/s5/local/dict/reverse_candidates.pl new file mode 100755 index 000000000..d5c5effc2 --- /dev/null +++ b/egs/chime_wsj0/s5/local/dict/reverse_candidates.pl @@ -0,0 +1,50 @@ +#!/usr/bin/perl + +# This takes the output of e.g. get_candidate_prons.pl or limit_candidate_prons.pl, +# which is 7-tuples, one per line, of the form: + +# word;pron;base-word;base-pron;rule-name;de-stress;rule-score +# (where rule-score is somtimes listed as optional, but this +# program does expect it, since we don't anticipate it being used +# without it). +# This program assumes that all the words and prons and rules have +# come from a reversed dictionary (reverse_dict.pl) where the order +# of the characters in the words, and the phones in the prons, have +# been reversed, and it un-reverses them. That it, the characters +# in "word" and "base-word", and the phones in "pron" and "base-pron" +# are reversed; and the rule ("rule-name") is parsed as a 4-tuple, +# like: +# suffix,base-suffix,psuffix,base-psuffix +# so this program reverses the characters in "suffix" and "base-suffix" +# and the phones (separated by spaces) in "psuffix" and "base-psuffix". 
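For example, a candidate generated from the reversed dictionary (hypothetical rule score; word and prons borrowed from the example in get_candidate_prons.pl):

DETSAW;D AH0 T S EY1 W;GNITSAW;NG IH0 T S EY1 W;DETS,GNITS,D,NG;no;0.9

is printed by this script as:

WASTED;W EY1 S T AH0 D;WASTING;W EY1 S T IH0 NG;STED,STING,D,NG;no;0.9

Note that prons are reversed as sequences of whole phone symbols, so "NG" stays "NG".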
+ +sub reverse_str { + $str = shift; + return join("", reverse(split("", $str))); +} +sub reverse_pron { + $str = shift; + return join(" ", reverse(split(" ", $str))); +} + +while(<>){ + chop; + @A = split(";", $_); + @A == 7 || die "Bad input line $_: found $len fields, expected 7."; + + ($word,$pron,$baseword,$basepron,$rule,$destress,$score) = @A; + $word = reverse_str($word); + $pron = reverse_pron($pron); + $baseword = reverse_str($baseword); + $basepron = reverse_pron($basepron); + @R = split(",", $rule, 4); + @R == 4 || die "Bad rule $rule"; + + $R[0] = reverse_str($R[0]); # suffix. + $R[1] = reverse_str($R[1]); # base-suffix. + $R[2] = reverse_pron($R[2]); # pron. + $R[3] = reverse_pron($R[3]); # base-pron. + $rule = join(",", @R); + @A = ($word,$pron,$baseword,$basepron,$rule,$destress,$score); + print join(";", @A) . "\n"; +} diff --git a/egs/chime_wsj0/s5/local/dict/reverse_dict.pl b/egs/chime_wsj0/s5/local/dict/reverse_dict.pl new file mode 100755 index 000000000..75681711b --- /dev/null +++ b/egs/chime_wsj0/s5/local/dict/reverse_dict.pl @@ -0,0 +1,14 @@ +#!/usr/bin/perl + +# Used in conjunction with get_rules.pl +# example input line: XANTHE Z AE1 N DH +# example output line: EHTNAX DH N AE1 Z + +while(<>){ + @A = split(" ", $_); + $word = shift @A; + $word = join("", reverse(split("", $word))); # Reverse letters of word. + @A = reverse(@A); # Reverse phones in pron. + unshift @A, $word; + print join(" ", @A) . "\n"; +} diff --git a/egs/chime_wsj0/s5/local/dict/score_prons.pl b/egs/chime_wsj0/s5/local/dict/score_prons.pl new file mode 100755 index 000000000..fd5a004d8 --- /dev/null +++ b/egs/chime_wsj0/s5/local/dict/score_prons.pl @@ -0,0 +1,50 @@ +#!/usr/bin/perl + +# This program takes candidate prons from "get_candidate_prons.pl" or +# "limit_candidate_prons.pl", and a reference dictionary covering those words, +# and outputs the same format but with scoring information added (so we go from +# 6 to 7 fields). The scoring information says, for each generated pron, +# whether we have a match, a partial match, or no match, to some word in the +# dictionary. A partial match means it's correct except for stress. + +# The input is a 6-tuple on each line, like: +# word;pron;base-word;base-pron;rule-name;de-stress +# +# The output is the same except with one more field, the score, +# which may be "right", "wrong", "partial". + +if (@ARGV != 1 && @ARGV != 2) { + die "Usage: score_prons.pl reference_dict [candidate_prons] > scored_candidate_prons"; +} + +$dict = shift @ARGV; +open(D, "<$dict") || die "Opening dictionary $dict"; + +while() { # Set up some hashes that tell us when + # a (word,pron) pair is correct (and the same for + # prons with stress information removed). 
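  # e.g. the dictionary line "WASTED  W EY1 S T AH0 D" produces the keys
  # "WASTED;W EY1 S T AH0 D" and, with stress digits stripped, "WASTED;W EY S T AH D".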
+ chop; + @A = split(" ", $_); + $word = shift @A; + $pron = join(" ", @A); + $pron_nostress = $pron; + $pron_nostress =~ s:\d::g; + $word_and_pron{$word.";".$pron} = 1; + $word_and_pron_nostress{$word.";".$pron_nostress} = 1; +} + +while(<>) { + chop; + $line = $_; + my ($word, $pron, $baseword, $basepron, $rulename, $destress) = split(";", $line); + $pron_nostress = $pron; + $pron_nostress =~ s:\d::g; + if (defined $word_and_pron{$word.";".$pron}) { + $score = "right"; + } elsif (defined $word_and_pron_nostress{$word.";".$pron_nostress}) { + $score = "partial"; + } else { + $score = "wrong"; + } + print $line.";".$score."\n"; +} diff --git a/egs/chime_wsj0/s5/local/dict/score_rules.pl b/egs/chime_wsj0/s5/local/dict/score_rules.pl new file mode 100755 index 000000000..8d165f7f1 --- /dev/null +++ b/egs/chime_wsj0/s5/local/dict/score_rules.pl @@ -0,0 +1,52 @@ +#!/usr/bin/perl + +# This program takes the output of count_rules.pl, which is tuples +# of the form +# +# rule;destress;right-count;partial-count;wrong-count +# +# and outputs lines of the form +# +# rule;de-stress;score +# +# where the score, between 0 and 1 (1 better), is +# equal to: +# +# It forms a score between 0 and 1, of the form: +# ((#correct) + $partial_score * (#partial)) / (#correct + #partial + #wrong + $ballast) +# +# where $partial_score (e.g. 0.8) is the score we assign to a "partial" match, +# and $ballast is a small number, e.g. 1, that is treated like "extra" wrong scores, to penalize +# rules with few observations. +# +# It outputs all rules that at are at least the + +$ballast = 1; +$partial_score = 0.8; +$destress_penalty = 1.0e-05; # Give destressed rules a small +# penalty vs. their no-destress counterparts, so if we +# have to choose arbitrarily we won't destress (seems safer)> + +for ($n = 1; $n <= 4; $n++) { + if ($ARGV[0] eq "--ballast") { + shift @ARGV; + $ballast = shift @ARGV; + } + if ($ARGV[0] eq "--partial-score") { + shift @ARGV; + $partial_score = shift @ARGV; + ($partial_score >= 0.0 && $partial_score <= 1.0) || die "Invalid partial_score: $partial_score"; + } +} + +(@ARGV == 0 || @ARGV == 1) || die "Usage: score_rules.pl [--ballast ballast-count] [--partial-score partial-score] [input from count_rules.pl]"; + +while(<>) { + @A = split(";", $_); + @A == 5 || die "Bad input line; $_"; + ($rule,$destress,$right_count,$partial_count,$wrong_count) = @A; + $rule_score = ($right_count + $partial_score*$partial_count) / + ($right_count+$partial_count+$wrong_count+$ballast); + if ($destress eq "yes") { $rule_score -= $destress_penalty; } + print join(";", $rule, $destress, sprintf("%.5f", $rule_score)) . "\n"; +} diff --git a/egs/chime_wsj0/s5/local/dict/select_candidate_prons.pl b/egs/chime_wsj0/s5/local/dict/select_candidate_prons.pl new file mode 100755 index 000000000..d0018c98a --- /dev/null +++ b/egs/chime_wsj0/s5/local/dict/select_candidate_prons.pl @@ -0,0 +1,84 @@ +#!/usr/bin/perl + +# This takes the output of e.g. get_candidate_prons.pl or limit_candidate_prons.pl +# or reverse_candidates.pl, which is 7-tuples, one per line, of the form: +# +# word;pron;base-word;base-pron;rule-name;de-stress;rule-score +# +# and selects the most likely prons for the words based on rule +# score. It outputs in the same format as the input (thus, it is +# similar to limit_candidates.pl in its input and output format, +# except it has a different way of selecting the prons to put out). 
+# +# This script will select the $max_prons best pronunciations for +# each candidate word, subject to the constraint that no pron should +# have a rule score worse than $min_rule_score. +# It first merges the candidates by, if there are multiple candidates +# generating the same pron, selecting the candidate that had the +# best associated score. It then sorts the prons on score and +# selects the n best prons (but doesn't print out candidates with +# score beneath the threshold). + + +$max_prons = 4; +$min_rule_score = 0.35; + + +for ($n = 1; $n <= 3; $n++) { + if ($ARGV[0] eq "--max-prons") { + shift @ARGV; + $max_prons = shift @ARGV; + } + if ($ARGV[0] eq "--min-rule-score") { + shift @ARGV; + $min_rule_score = shift @ARGV; + } +} + +if (@ARGV != 0 && @ARGV != 1) { + die "Usage: select_candidates_prons.pl [candidate_prons] > selected_candidate_prons"; +} + +sub process_word; + +undef $cur_word; +@cur_lines = (); + +while(<>) { + # input, output is: + # word;pron;base-word;base-pron;rule-name;destress;score + chop; + m:^([^;]+);: || die "Unexpected input: $_"; + $word = $1; + if (!defined $cur_word || $word eq $cur_word) { + if (!defined $cur_word) { $cur_word = $word; } + push @cur_lines, $_; + } else { + process_word(@cur_lines); # Process a series of suggested prons + # for a particular word. + $cur_word = $word; + @cur_lines = ( $_ ); + } +} +process_word(@cur_lines); + + +sub process_word { + my %pron2rule_score; # hash from generated pron to rule score for that pron. + my %pron2line; # hash from generated pron to best line for that pron. + my @cur_lines = @_; + foreach my $line (@cur_lines) { + my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line); + if (!defined $pron2rule_score{$pron} || + $rule_score > $pron2rule_score{$pron}) { + $pron2rule_score{$pron} = $rule_score; + $pron2line{$pron} = $line; + } + } + my @prons = sort { $pron2rule_score{$b} <=> $pron2rule_score{$a} } keys %pron2rule_score; + for (my $n = 0; $n < @prons && $n < $max_prons && + $pron2rule_score{$prons[$n]} >= $min_rule_score; $n++) { + print $pron2line{$prons[$n]} . "\n"; + } +} + diff --git a/egs/chime_wsj0/s5/local/find_noisy_transcripts.pl b/egs/chime_wsj0/s5/local/find_noisy_transcripts.pl new file mode 100755 index 000000000..720c320c0 --- /dev/null +++ b/egs/chime_wsj0/s5/local/find_noisy_transcripts.pl @@ -0,0 +1,65 @@ +#!/usr/bin/perl +# Copyright 2010-2011 Microsoft Corporation + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + + +# This program takes on its standard input a list of utterance +# id's, one for each line. (e.g. 4k0c030a is a an utterance id). +# It takes as +# Extracts from the dot files the transcripts for a given +# dataset (represented by a file list). 
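The dot files are expected to be named <speaker>00.dot, with one transcript per line and
the eight-character utterance id in parentheses at the end, e.g. (hypothetical path and text):

/your/path/wsj0/si_tr_s/4k0/4k0c0300.dot     -> transcripts for speaker "4k0c03"
SOME NORMALIZED TRANSCRIPT TEXT (4k0c030a)   <- a line inside that dot file

The utterance ids read on stdin carry an extra noise-condition suffix (see
noisy_wsj0_data_prep.sh), so the first eight characters are taken before the speaker lookup.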
+# + +@ARGV == 1 || die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts"; +$dot_flist = shift @ARGV; + +open(L, "<$dot_flist") || die "Opening file list of dot files: $dot_flist\n"; +while(){ + chop; + m:\S+/(\w{6})00.dot: || die "Bad line in dot file list: $_"; + $spk = $1; + $spk2dot{$spk} = $_; +} + + + +while(){ + chop; + $uttid_orig = $_; + $uttid = substr $uttid_orig, 0, 8; + $uttid =~ m:(\w{6})\w\w: || die "Bad utterance id $_"; + $spk = $1; + if($spk ne $curspk) { + %utt2trans = { }; # Don't keep all the transcripts in memory... + $curspk = $spk; + $dotfile = $spk2dot{$spk}; + defined $dotfile || die "No dot file for speaker $spk\n"; + open(F, "<$dotfile") || die "Error opening dot file $dotfile\n"; + while() { + $_ =~ m:(.+)\((\w{8})\)\s*$: || die "Bad line $_ in dot file $dotfile (line $.)\n"; + $trans = $1; + $utt = $2; + $utt2trans{$utt} = $trans; + } + } + if(!defined $utt2trans{$uttid}) { + print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n"; + } else { + print "$uttid_orig $utt2trans{$uttid}\n"; + } +} + + diff --git a/egs/chime_wsj0/s5/local/find_transcripts.pl b/egs/chime_wsj0/s5/local/find_transcripts.pl new file mode 100755 index 000000000..0e5d71f79 --- /dev/null +++ b/egs/chime_wsj0/s5/local/find_transcripts.pl @@ -0,0 +1,64 @@ +#!/usr/bin/perl +# Copyright 2010-2011 Microsoft Corporation + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + + +# This program takes on its standard input a list of utterance +# id's, one for each line. (e.g. 4k0c030a is a an utterance id). +# It takes as +# Extracts from the dot files the transcripts for a given +# dataset (represented by a file list). +# + +@ARGV == 1 || die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts"; +$dot_flist = shift @ARGV; + +open(L, "<$dot_flist") || die "Opening file list of dot files: $dot_flist\n"; +while(){ + chop; + m:\S+/(\w{6})00.dot: || die "Bad line in dot file list: $_"; + $spk = $1; + $spk2dot{$spk} = $_; +} + + + +while(){ + chop; + $uttid = $_; + $uttid =~ m:(\w{6})\w\w: || die "Bad utterance id $_"; + $spk = $1; + if($spk ne $curspk) { + %utt2trans = { }; # Don't keep all the transcripts in memory... 
+ $curspk = $spk; + $dotfile = $spk2dot{$spk}; + defined $dotfile || die "No dot file for speaker $spk\n"; + open(F, "<$dotfile") || die "Error opening dot file $dotfile\n"; + while() { + $_ =~ m:(.+)\((\w{8})\)\s*$: || die "Bad line $_ in dot file $dotfile (line $.)\n"; + $trans = $1; + $utt = $2; + $utt2trans{$utt} = $trans; + } + } + if(!defined $utt2trans{$uttid}) { + print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n"; + } else { + print "$uttid $utt2trans{$uttid}\n"; + } +} + + diff --git a/egs/chime_wsj0/s5/local/flist2scp.pl b/egs/chime_wsj0/s5/local/flist2scp.pl new file mode 100755 index 000000000..6831d2d7b --- /dev/null +++ b/egs/chime_wsj0/s5/local/flist2scp.pl @@ -0,0 +1,31 @@ +#!/usr/bin/perl +# Copyright 2010-2011 Microsoft Corporation + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# takes in a file list with lines like +# /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1 +# and outputs an scp in kaldi format with lines like +# 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1 +# (the first thing is the utterance-id, which is the same as the basename of the file. + + +while(<>){ + m:^\S+/(\w+)\.[wW][vV]1$: || die "Bad line $_"; + $id = $1; + $id =~ tr/A-Z/a-z/; # Necessary because of weirdness on disk 13-16.1 (uppercase filenames) + print "$id $_"; +} + diff --git a/egs/chime_wsj0/s5/local/generate_example_kws.sh b/egs/chime_wsj0/s5/local/generate_example_kws.sh new file mode 100755 index 000000000..2c8494381 --- /dev/null +++ b/egs/chime_wsj0/s5/local/generate_example_kws.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen) +# Apache 2.0. + + +if [ $# -ne 2 ]; then + echo "Usage: local/generate_example_kws.sh " + echo " e.g.: local/generate_example_kws.sh data/test_eval92/ " + exit 1; +fi + +datadir=$1; +kwsdatadir=$2; +text=$datadir/text; + +mkdir -p $kwsdatadir; + +# Generate keywords; we generate 20 unigram keywords with at least 20 counts, +# 20 bigram keywords with at least 10 counts and 10 trigram keywords with at +# least 5 counts. +cat $text | perl -e ' + %unigram = (); + %bigram = (); + %trigram = (); + while(<>) { + chomp; + @col=split(" ", $_); + shift @col; + for($i = 0; $i < @col; $i++) { + # unigram case + if (!defined($unigram{$col[$i]})) { + $unigram{$col[$i]} = 0; + } + $unigram{$col[$i]}++; + + # bigram case + if ($i < @col-1) { + $word = $col[$i] . " " . $col[$i+1]; + if (!defined($bigram{$word})) { + $bigram{$word} = 0; + } + $bigram{$word}++; + } + + # trigram case + if ($i < @col-2) { + $word = $col[$i] . " " . $col[$i+1] . " " . 
$col[$i+2]; + if (!defined($trigram{$word})) { + $trigram{$word} = 0; + } + $trigram{$word}++; + } + } + } + + $max_count = 100; + $total = 20; + $current = 0; + $min_count = 20; + while ($current < $total && $min_count <= $max_count) { + foreach $x (keys %unigram) { + if ($unigram{$x} == $min_count) { + print "$x\n"; + $unigram{$x} = 0; + $current++; + } + if ($current == $total) { + last; + } + } + $min_count++; + } + + $total = 20; + $current = 0; + $min_count = 4; + while ($current < $total && $min_count <= $max_count) { + foreach $x (keys %bigram) { + if ($bigram{$x} == $min_count) { + print "$x\n"; + $bigram{$x} = 0; + $current++; + } + if ($current == $total) { + last; + } + } + $min_count++; + } + + $total = 10; + $current = 0; + $min_count = 3; + while ($current < $total && $min_count <= $max_count) { + foreach $x (keys %trigram) { + if ($trigram{$x} == $min_count) { + print "$x\n"; + $trigram{$x} = 0; + $current++; + } + if ($current == $total) { + last; + } + } + $min_count++; + } + ' > $kwsdatadir/raw_keywords.txt + +echo "Keywords generation succeeded" diff --git a/egs/chime_wsj0/s5/local/kws_data_prep.sh b/egs/chime_wsj0/s5/local/kws_data_prep.sh new file mode 100755 index 000000000..5222a88c9 --- /dev/null +++ b/egs/chime_wsj0/s5/local/kws_data_prep.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen) +# Apache 2.0. + + +if [ $# -ne 3 ]; then + echo "Usage: local/kws_data_prep.sh " + echo " e.g.: local/kws_data_prep.sh data/lang_test_bd_tgpr/ data/test_eval92/ data/kws/" + exit 1; +fi + +langdir=$1; +datadir=$2; +kwsdatadir=$3; + +mkdir -p $kwsdatadir; + +# Create keyword id for each keyword +cat $kwsdatadir/raw_keywords.txt | perl -e ' + $idx=1; + while(<>) { + chomp; + printf "WSJ-%04d $_\n", $idx; + $idx++; + }' > $kwsdatadir/keywords.txt + +# Map the keywords to integers; note that we remove the keywords that +# are not in our $langdir/words.txt, as we won't find them anyway... +cat $kwsdatadir/keywords.txt | \ + sym2int.pl --map-oov 0 -f 2- $langdir/words.txt | \ + grep -v " 0 " | grep -v " 0$" > $kwsdatadir/keywords.int + +# Compile keywords into FSTs +transcripts-to-fsts ark:$kwsdatadir/keywords.int ark:$kwsdatadir/keywords.fsts + +# Create utterance id for each utterance; Note that by "utterance" here I mean +# the keys that will appear in the lattice archive. You may have to modify here +cat $datadir/wav.scp | \ + awk '{print $1}' | \ + sort | uniq | perl -e ' + $idx=1; + while(<>) { + chomp; + print "$_ $idx\n"; + $idx++; + }' > $kwsdatadir/utter_id + +# Map utterance to the names that will appear in the rttm file. You have +# to modify the commands below accoring to your rttm file. In the WSJ case +# since each file is an utterance, we assume that the actual file names will +# be the "names" in the rttm, so the utterance names map to themselves. +cat $datadir/wav.scp | \ + awk '{print $1}' | \ + sort | uniq | perl -e ' + while(<>) { + chomp; + print "$_ $_\n"; + }' > $kwsdatadir/utter_map; +echo "Kws data preparation succeeded" diff --git a/egs/chime_wsj0/s5/local/ndx2flist.pl b/egs/chime_wsj0/s5/local/ndx2flist.pl new file mode 100755 index 000000000..b05704293 --- /dev/null +++ b/egs/chime_wsj0/s5/local/ndx2flist.pl @@ -0,0 +1,62 @@ +#!/usr/bin/perl +# Copyright 2010-2011 Microsoft Corporation + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# This program takes as its standard input an .ndx file from the WSJ corpus that looks +# like this: +#;; File: tr_s_wv1.ndx, updated 04/26/94 +#;; +#;; Index for WSJ0 SI-short Sennheiser training data +#;; Data is read WSJ sentences, Sennheiser mic. +#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts +#;; per speaker TI) = 7236 utts +#;; +#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1 +#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1 +#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1 + +#and as command-line arguments it takes the names of the WSJ disk locations, e.g.: +#/mnt/matylda2/data/WSJ0/11-1.1 /mnt/matylda2/data/WSJ0/11-10.1 ... etc. +# It outputs a list of absolute pathnames (it does this by replacing e.g. 11_1_1 with +# /mnt/matylda2/data/WSJ0/11-1.1. +# It also does a slight fix because one of the WSJ disks (WSJ1/13-16.1) was distributed with +# uppercase rather than lower case filenames. + +foreach $fn (@ARGV) { + $fn =~ m:.+/([0-9\.\-]+)/?$: || die "Bad command-line argument $fn\n"; + $disk_id=$1; + $disk_id =~ tr/-\./__/; # replace - and . with - so 11-10.1 becomes 11_10_1 + $fn =~ s:/$::; # Remove final slash, just in case it is present. + $disk2fn{$disk_id} = $fn; +} + +while(){ + if(m/^;/){ next; } # Comment. Ignore it. + else { + m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_"; + $disk=$1; + if(!defined $disk2fn{$disk}) { + die "Disk id $disk not found"; + } + $filename = $2; # as a subdirectory of the distributed disk. + if($disk eq "13_16_1" && `hostname` =~ m/fit.vutbr.cz/) { + # The disk 13-16.1 has been uppercased for some reason, on the + # BUT system. This is a fix specifically for that case. + $filename =~ tr/a-z/A-Z/; # This disk contains all uppercase filenames. Why? + } + print "$disk2fn{$disk}/$filename\n"; + } +} diff --git a/egs/chime_wsj0/s5/local/nnet2/run_5b.sh b/egs/chime_wsj0/s5/local/nnet2/run_5b.sh new file mode 100755 index 000000000..1e9adfc25 --- /dev/null +++ b/egs/chime_wsj0/s5/local/nnet2/run_5b.sh @@ -0,0 +1,69 @@ +#!/bin/bash + + +stage=0 +train_stage=-100 +# This trains only unadapted (just cepstral mean normalized) features, +# and uses various combinations of VTLN warping factor and time-warping +# factor to artificially expand the amount of data. + +. cmd.sh + +. utils/parse_options.sh # to parse the --stage option, if given + +[ $# != 0 ] && echo "Usage: local/run_4b.sh [--stage --train-stage ]" && exit 1; + +set -e + +if [ $stage -le 0 ]; then + # Create the training data. 
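  # (The two jobs below create VTLN/time-warp perturbed fbank and MFCC copies of
  #  train_si284 in parallel; the MFCC copy is used in stage 1 to get alignments,
  #  while the fbank copy is what the network is trained on in stage 2.)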
+ featdir=`pwd`/mfcc/nnet5b; mkdir -p $featdir + fbank_conf=conf/fbank_40.conf + echo "--num-mel-bins=40" > $fbank_conf + steps/nnet2/get_perturbed_feats.sh --cmd "$train_cmd" \ + $fbank_conf $featdir exp/perturbed_fbanks_si284 data/train_si284 data/train_si284_perturbed_fbank & + steps/nnet2/get_perturbed_feats.sh --cmd "$train_cmd" --feature-type mfcc \ + conf/mfcc.conf $featdir exp/perturbed_mfcc_si284 data/train_si284 data/train_si284_perturbed_mfcc & + wait +fi + +if [ $stage -le 1 ]; then + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train_si284_perturbed_mfcc data/lang exp/tri4b exp/tri4b_ali_si284_perturbed_mfcc +fi + +if [ $stage -le 2 ]; then + steps/nnet2/train_block.sh --stage "$train_stage" \ + --cleanup false \ + --initial-learning-rate 0.01 --final-learning-rate 0.001 \ + --num-epochs 10 --num-epochs-extra 5 \ + --cmd "$decode_cmd" \ + --hidden-layer-dim 1536 \ + --num-block-layers 3 --num-normal-layers 3 \ + data/train_si284_perturbed_fbank data/lang exp/tri4b_ali_si284_perturbed_mfcc exp/nnet5b || exit 1 +fi + +if [ $stage -le 3 ]; then # create testing fbank data. + featdir=`pwd`/mfcc + fbank_conf=conf/fbank_40.conf + for x in test_eval92 test_eval93 test_dev93; do + cp -rT data/$x data/${x}_fbank + rm -r ${x}_fbank/split* || true + steps/make_fbank.sh --fbank-config "$fbank_conf" --nj 8 \ + --cmd "$train_cmd" data/${x}_fbank exp/make_fbank/$x $featdir || exit 1; + steps/compute_cmvn_stats.sh data/${x}_fbank exp/make_fbank/$x $featdir || exit 1; + done +fi + +if [ $stage -le 4 ]; then + steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 10 \ + exp/tri4b/graph_bd_tgpr data/test_dev93_fbank exp/nnet5b/decode_bd_tgpr_dev93 + + steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 8 \ + exp/tri4b/graph_bd_tgpr data/test_eval92_fbank exp/nnet5b/decode_bd_tgpr_eval92 +fi + + + +exit 0; + diff --git a/egs/chime_wsj0/s5/local/nnet2/run_5c.sh b/egs/chime_wsj0/s5/local/nnet2/run_5c.sh new file mode 100755 index 000000000..288b56996 --- /dev/null +++ b/egs/chime_wsj0/s5/local/nnet2/run_5c.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# This is neural net training on top of adapted 40-dimensional features. +# + +. ./cmd.sh + +( + steps/nnet2/train_tanh.sh \ + --mix-up 8000 \ + --initial-learning-rate 0.01 --final-learning-rate 0.001 \ + --num-hidden-layers 4 --hidden-layer-dim 1024 \ + --cmd "$decode_cmd" \ + data/train_si284 data/lang exp/tri4b_ali_si284 exp/nnet5c || exit 1 + + steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 10 \ + --transform-dir exp/tri4b/decode_bd_tgpr_dev93 \ + exp/tri4b/graph_bd_tgpr data/test_dev93 exp/nnet5c/decode_bd_tgpr_dev93 + + steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 8 \ + --transform-dir exp/tri4b/decode_bd_tgpr_eval92 \ + exp/tri4b/graph_bd_tgpr data/test_eval92 exp/nnet5c/decode_bd_tgpr_eval92 +) + diff --git a/egs/chime_wsj0/s5/local/noisy_wsj0_data_prep.sh b/egs/chime_wsj0/s5/local/noisy_wsj0_data_prep.sh new file mode 100755 index 000000000..8744f25d6 --- /dev/null +++ b/egs/chime_wsj0/s5/local/noisy_wsj0_data_prep.sh @@ -0,0 +1,119 @@ +#!/bin/bash +set -e + +# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. + +# This is modified from the script in standard Kaldi recipe to account +# for the way the WSJ data is structured on the Edinburgh systems. +# - Arnab Ghoshal, 29/05/12 + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "The argument should be a the top-level WSJ corpus directory." 
+ echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory" + echo "within the top-level corpus directory." + exit 1; +fi + +CORPUS=$1 + +dir=`pwd`/data/local/data +lmdir=`pwd`/data/local/nist_lm +mkdir -p $dir $lmdir +local=`pwd`/local +utils=`pwd`/utils + +. ./path.sh # Needed for KALDI_ROOT +export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +cd $dir + +# reverb list for SI-84 + +find $1/si_tr_s -name '*.wav' | sort -u > train_si84_noisy.flist + + + +# Dev-set Hub 1,2 (503, 913 utterances) + +# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt. +# Sometimes this gets copied from the CD's with upcasing, don't know +# why (could be older versions of the disks). +find $1/si_dt_20 -name '*.wav' | sort -u > dev_dt_20_noisy.flist +find $1/si_dt_05 -name '*.wav' | sort -u > dev_dt_05_noisy.flist + +find $1/si_et_20 -name '*.wav' | sort -u > test_eval92_noisy.flist +find $1/si_et_05 -name '*.wav' | sort -u > test_eval92_5k_noisy.flist + + +# Finding the transcript files: +#find -L $CORPUS -iname '*.dot' > dot_files.flist +if [ ! -e $dir/dot_files.flist ]; then + echo "Could not find $dir/dot_files.flist files, first run clean_data_prep.sh"; + exit 1; +fi + +# Convert the transcripts into our format (no normalization yet) +# adding suffix to utt_id +# 1 for reverb condition +for x in train_si84_noisy dev_dt_05_noisy dev_dt_20_noisy test_eval92_noisy test_eval92_5k_noisy; do + cat $x.flist | perl -e ' + while(<>) { + m:^\S+/(\w+)\.wav$: || die "Bad line $_"; + $id = $1; + $id =~ tr/A-Z/a-z/; + print "$id $_"; + } + ' | sort > ${x}_wav_tmp.scp + #cat ${x}_wav_tmp.scp | awk '{print $1}' \ + # | $local/find_transcripts.pl dot_files.flist > ${x}_tmp.trans1 + cat ${x}_wav_tmp.scp | perl -e ' + while() { + @A=split(" ", $_); + @B=split("/", $_); + $abs_path_len=@B; + $condition=$B[$abs_path_len-5]; + if ($condition eq "9dB") {$key_suffix=2;} + elsif ($condition eq "6dB") {$key_suffix=3;} + elsif ($condition eq "3dB") {$key_suffix=4;} + elsif ($condition eq "0dB") {$key_suffix=5;} + elsif ($condition eq "m3dB") {$key_suffix=6;} + elsif ($condition eq "m6dB") {$key_suffix=7;} + else {print STDERR "error condition $condition";} + print $A[0].$key_suffix." ".$A[1]."\n"; + } + ' | sort -k1 > ${x}_wav.scp + cat ${x}_wav.scp | awk '{print $1}' \ + | $local/find_noisy_transcripts.pl dot_files.flist > ${x}.trans1 +done + + +# Do some basic normalization steps. At this point we don't remove OOVs-- +# that will be done inside the training scripts, as we'd like to make the +# data-preparation stage independent of the specific lexicon used. +noiseword=""; +for x in train_si84_noisy dev_dt_05_noisy dev_dt_20_noisy test_eval92_noisy test_eval92_5k_noisy; do + cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ + | sort > $x.txt || exit 1; +done + +# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.) +#for x in train_si84_clean test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean; do +# awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp \ +# > ${x}_wav.scp +#done + +# Make the utt2spk and spk2utt files. 
+for x in train_si84_noisy dev_dt_05_noisy dev_dt_20_noisy test_eval92_noisy test_eval92_5k_noisy; do + cat ${x}_wav.scp | awk '{print $1}' \ + | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk + cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; +done + +echo "Data preparation succeeded" diff --git a/egs/chime_wsj0/s5/local/normalize_transcript.pl b/egs/chime_wsj0/s5/local/normalize_transcript.pl new file mode 100755 index 000000000..9dd67af3d --- /dev/null +++ b/egs/chime_wsj0/s5/local/normalize_transcript.pl @@ -0,0 +1,59 @@ +#!/usr/bin/perl +# Copyright 2010-2011 Microsoft Corporation + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# This takes data from the standard input that's unnormalized transcripts in the format +# 4k2c0308 Of course there isn\'t any guarantee the company will keep its hot hand [misc_noise] +# 4k2c030a [loud_breath] And new hardware such as the set of personal computers I\. B\. M\. introduced last week can lead to unexpected changes in the software business [door_slam] +# and outputs normalized transcripts. +# c.f. /mnt/matylda2/data/WSJ0/11-10.1/wsj0/transcrp/doc/dot_spec.doc + +@ARGV == 1 || die "usage: normalize_transcript.pl noise_word < transcript > transcript2"; +$noise_word = shift @ARGV; + +while() { + $_ =~ m:^(\S+) (.+): || die "bad line $_"; + $utt = $1; + $trans = $2; + print "$utt"; + foreach $w (split (" ",$trans)) { + $w =~ tr:a-z:A-Z:; # Upcase everything to match the CMU dictionary. . + $w =~ s:\\::g; # Remove backslashes. We don't need the quoting. + $w =~ s:^\%PERCENT$:PERCENT:; # Normalization for Nov'93 test transcripts. + $w =~ s:^\.POINT$:POINT:; # Normalization for Nov'93 test transcripts. + if($w =~ m:^\[\<\w+\]$: || # E.g. [\]$: || # E.g. [door_slam>], this means a door slammed in the next word. Delete. + $w =~ m:\[\w+/\]$: || # E.g. [phone_ring/], which indicates the start of this phenomenon. + $w =~ m:\[\/\w+]$: || # E.g. [/phone_ring], which indicates the end of this phenomenon. + $w eq "~" || # This is used to indicate truncation of an utterance. Not a word. + $w eq ".") { # "." is used to indicate a pause. Silence is optional anyway so not much + # point including this in the transcript. + next; # we won't print this word. + } elsif($w =~ m:\[\w+\]:) { # Other noises, e.g. [loud_breath]. + print " $noise_word"; + } elsif($w =~ m:^\<([\w\']+)\>$:) { + # e.g. replace with and. (the <> means verbal deletion of a word).. but it's pronounced. + print " $1"; + } elsif($w eq "--DASH") { + print " -DASH"; # This is a common issue; the CMU dictionary has it as -DASH. +# } elsif($w =~ m:(.+)\-DASH$:) { # E.g. INCORPORATED-DASH... 
seems the DASH gets combined with previous word +# print " $1 -DASH"; + } else { + print " $w"; + } + } + print "\n"; +} diff --git a/egs/chime_wsj0/s5/local/reverb_wsj0_data_prep.sh b/egs/chime_wsj0/s5/local/reverb_wsj0_data_prep.sh new file mode 100755 index 000000000..c6903f21c --- /dev/null +++ b/egs/chime_wsj0/s5/local/reverb_wsj0_data_prep.sh @@ -0,0 +1,100 @@ +#!/bin/bash +set -e + +# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. + +# This is modified from the script in standard Kaldi recipe to account +# for the way the WSJ data is structured on the Edinburgh systems. +# - Arnab Ghoshal, 29/05/12 + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "The argument should be a the top-level WSJ corpus directory." + echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory" + echo "within the top-level corpus directory." + exit 1; +fi + +CORPUS=$1 + +dir=`pwd`/data/local/data +lmdir=`pwd`/data/local/nist_lm +mkdir -p $dir $lmdir +local=`pwd`/local +utils=`pwd`/utils + +. ./path.sh # Needed for KALDI_ROOT +export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +cd $dir + +# reverb list for SI-84 + +find $1/si_tr_s -name '*.wav' | sort -u > train_si84_reverb.flist + + + +# Dev-set Hub 1,2 (503, 913 utterances) + +# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt. +# Sometimes this gets copied from the CD's with upcasing, don't know +# why (could be older versions of the disks). +find $1/si_dt_20 -name '*.wav' | sort -u > dev_dt_20_reverb.flist +find $1/si_dt_05 -name '*.wav' | sort -u > dev_dt_05_reverb.flist + + +# Finding the transcript files: +#find -L $CORPUS -iname '*.dot' > dot_files.flist +if [ ! -e $dir/dot_files.flist ]; then + echo "Could not find $dir/dot_files.flist files, first run clean_data_prep.sh"; + exit 1; +fi + +# Convert the transcripts into our format (no normalization yet) +# adding suffix to utt_id +# 1 for reverb condition +for x in train_si84_reverb dev_dt_05_reverb dev_dt_20_reverb; do + cat $x.flist | perl -e ' + while(<>) { + m:^\S+/(\w+)\.wav$: || die "Bad line $_"; + $id = $1; + $id =~ tr/A-Z/a-z/; + print "$id $_"; + } + ' | sort > ${x}_wav_tmp.scp + cat ${x}_wav_tmp.scp | awk '{print $1}' \ + | $local/find_transcripts.pl dot_files.flist > ${x}_tmp.trans1 + cat ${x}_wav_tmp.scp | awk '{printf("%s1 %s\n", $1, $2);}' > ${x}_wav.scp + cat ${x}_tmp.trans1 | awk '{printf("%s1 ", $1); for(i=2;i<=NF;i++) printf("%s ", $i); printf("\n");}' > ${x}.trans1 +done + + +# Do some basic normalization steps. At this point we don't remove OOVs-- +# that will be done inside the training scripts, as we'd like to make the +# data-preparation stage independent of the specific lexicon used. +noiseword=""; +for x in train_si84_reverb dev_dt_05_reverb dev_dt_20_reverb; do + cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ + | sort > $x.txt || exit 1; +done + +# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.) +#for x in train_si84_clean test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean; do +# awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp \ +# > ${x}_wav.scp +#done + +# Make the utt2spk and spk2utt files. 
+for x in train_si84_reverb dev_dt_05_reverb dev_dt_20_reverb; do + cat ${x}_wav.scp | awk '{print $1}' \ + | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk + cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; +done + +echo "Data preparation succeeded" diff --git a/egs/chime_wsj0/s5/local/run_basis_fmllr.sh b/egs/chime_wsj0/s5/local/run_basis_fmllr.sh new file mode 100755 index 000000000..3c04e480a --- /dev/null +++ b/egs/chime_wsj0/s5/local/run_basis_fmllr.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +. cmd.sh + +mfccdir=mfcc + +# Make "per-utterance" versions of the test sets where the speaker +# information corresponds to utterances-- to demonstrate adaptation on +# short utterances, particularly for basis fMLLR +for x in test_eval92 test_eval93 test_dev93 ; do + y=${x}_utt + rm -r data/$y + cp -r data/$x data/$y + cat data/$x/utt2spk | awk '{print $1, $1;}' > data/$y/utt2spk; + cp data/$y/utt2spk data/$y/spk2utt; + steps/compute_cmvn_stats.sh data/$y exp/make_mfcc/$y $mfccdir || exit 1; +done + + + # basis fMLLR experiments. + # First a baseline: decode per-utterance with normal fMLLR. +steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \ + exp/tri3b/graph_tgpr data/test_dev93_utt exp/tri3b/decode_tgpr_dev93_utt || exit 1; +steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \ + exp/tri3b/graph_tgpr data/test_eval92_utt exp/tri3b/decode_tgpr_eval92_utt || exit 1; + + # get the fMLLR basis. +steps/get_fmllr_basis.sh --cmd "$train_cmd" data/train_si84 data/lang exp/tri3b + + # decoding tri3b with basis fMLLR +steps/decode_basis_fmllr.sh --nj 10 --cmd "$decode_cmd" \ + exp/tri3b/graph_tgpr data/test_dev93 exp/tri3b/decode_tgpr_dev93_basis || exit 1; +steps/decode_basis_fmllr.sh --nj 8 --cmd "$decode_cmd" \ + exp/tri3b/graph_tgpr data/test_eval92 exp/tri3b/decode_tgpr_eval92_basis || exit 1; + + # The same, per-utterance. +steps/decode_basis_fmllr.sh --nj 10 --cmd "$decode_cmd" \ + exp/tri3b/graph_tgpr data/test_dev93_utt exp/tri3b/decode_tgpr_dev93_basis_utt || exit 1; +steps/decode_basis_fmllr.sh --nj 8 --cmd "$decode_cmd" \ + exp/tri3b/graph_tgpr data/test_eval92_utt exp/tri3b/decode_tgpr_eval92_basis_utt || exit 1; + + diff --git a/egs/chime_wsj0/s5/local/run_dnn.sh b/egs/chime_wsj0/s5/local/run_dnn.sh new file mode 100755 index 000000000..680a6ca31 --- /dev/null +++ b/egs/chime_wsj0/s5/local/run_dnn.sh @@ -0,0 +1,181 @@ +#!/bin/bash + +# Copyright 2012-2013 Brno University of Technology (Author: Karel Vesely) +# Apache 2.0 + +# In this recipe we build DNN in four stages: +# 1) Data preparations : the fMLLR features are stored to disk +# 2) RBM pre-training : in this unsupervised stage we train stack of RBMs, a good starting point for Cross-entropy trainig +# 3) Frame-level cross-entropy training : in this stage the objective is to classify frames correctly. +# 4) Sequence-criterion training : in this stage the objective is to classify the whole sequence correctly, +# the idea is similar to the 'Discriminative training' in context of GMM-HMMs. + + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +. 
./path.sh ## Source the tools/utils (import the queue.pl) + + + +#false && \ +{ +gmmdir=exp/tri4b + +### +### Generate the alignments of dev93 +### (held-out set for Cross-entropy training) +### +steps/align_fmllr.sh --nj 10 --cmd "$train_cmd" \ + data/test_dev93 data/lang $gmmdir exp/tri4b_ali_dev93 || exit 1 + +### +### Store the fMLLR features, so we can train on them easily +### + +# train si284 +# generate the features +dir=data-fmllr-tri4b/train_si284 +steps/make_fmllr_feats.sh --nj 20 --cmd "$train_cmd" \ + --transform-dir exp/tri4b_ali_si284 \ + $dir data/train_si284 $gmmdir $dir/_log $dir/_data || exit 1 + +# eval92 +dir=data-fmllr-tri4b/test_eval92 +steps/make_fmllr_feats.sh --nj 8 --cmd "$train_cmd" \ + --transform-dir exp/tri4b/decode_tgpr_eval92 \ + $dir data/test_eval92 $gmmdir $dir/_log $dir/_data || exit 1 + +# dev93 (unsupervised fMLLR) +# held-out set of Cross-entropy training +dir=data-fmllr-tri4b/test_dev93 +steps/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \ + --transform-dir exp/tri4b/decode_tgpr_dev93 \ + $dir data/test_dev93 $gmmdir $dir/_log $dir/_data || exit 1 +} + + + +### +### Now we can pre-train stack of RBMs +### +#false && \ +{ # Pre-train the DBN +dir=exp/tri4b_pretrain-dbn +(tail --pid=$$ -F $dir/_pretrain_dbn.log 2>/dev/null)& +$cuda_cmd $dir/_pretrain_dbn.log \ + steps/pretrain_dbn.sh --rbm-iter 3 data-fmllr-tri4b/train_si284 $dir +} + + + +### +### Now we train the DNN optimizing cross-entropy. +### This will take quite some time. +### + +#false && \ +{ # Train the MLP +dir=exp/tri4b_pretrain-dbn_dnn +ali=exp/tri4b_ali +feature_transform=exp/tri4b_pretrain-dbn/final.feature_transform +dbn=exp/tri4b_pretrain-dbn/6.dbn +(tail --pid=$$ -F $dir/_train_nnet.log 2>/dev/null)& +$cuda_cmd $dir/_train_nnet.log \ + steps/train_nnet.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ + data-fmllr-tri4b/train_si284 data-fmllr-tri4b/test_dev93 data/lang ${ali}_si284 ${ali}_dev93 $dir || exit 1; +# decode with 'big-dictionary' (reuse HCLG graph) +steps/decode_nnet.sh --nj 10 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ + exp/tri4b/graph_bd_tgpr data-fmllr-tri4b/test_dev93 $dir/decode_bd_tgpr_dev93 || exit 1; +steps/decode_nnet.sh --nj 8 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ + exp/tri4b/graph_bd_tgpr data-fmllr-tri4b/test_eval92 $dir/decode_bd_tgpr_eval92 || exit 1; +} + + + +### +### Finally we train using sMBR criterion. +### We do Stochastic-GD with per-utterance updates. +### +### To get faster convergence, we will re-generate +### the lattices after 1st epoch of sMBR. 
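+### (i.e. re-align and re-generate the denominator lattices with the sMBR-trained
+### network, then run several more sMBR iterations on the new lattices.)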
+### + +dir=exp/tri4b_pretrain-dbn_dnn_smbr +srcdir=exp/tri4b_pretrain-dbn_dnn +acwt=0.10 + +# First we need to generate lattices and alignments: +#false && \ +{ +steps/align_nnet.sh --nj 100 --cmd "$train_cmd" \ + data-fmllr-tri4b/train_si284 data/lang $srcdir ${srcdir}_ali_si284 || exit 1; +steps/make_denlats_nnet.sh --nj 100 --cmd "$decode_cmd" \ + --config conf/decode_dnn.config --acwt $acwt \ + data-fmllr-tri4b/train_si284 data/lang $srcdir ${srcdir}_denlats_si284 || exit 1; +} +# Now we re-train the hybrid by single iteration of sMBR +#false && \ +{ +steps/train_nnet_mpe.sh --cmd "$cuda_cmd" --num-iters 1 --acwt $acwt --do-smbr true \ + data-fmllr-tri4b/train_si284 data/lang $srcdir \ + ${srcdir}_ali_si284 ${srcdir}_denlats_si284 $dir || exit 1 +} +# Decode +#false && \ +{ +for ITER in 1; do + # decode dev93 with big dict graph_bd_tgpr + steps/decode_nnet.sh --nj 10 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4b/graph_bd_tgpr data-fmllr-tri4b/test_dev93 $dir/decode_dev93_bd_tgpr_it${ITER} || exit 1 + # decode eval92 with big dict graph_bd_tgpr + steps/decode_nnet.sh --nj 8 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4b/graph_bd_tgpr data-fmllr-tri4b/test_eval92 $dir/decode_eval92_bd_tgpr_it${ITER} || exit 1 +done +} + + +### +### Re-generate lattices and run several more iterations of sMBR +### + +dir=exp/tri4b_pretrain-dbn_dnn_smbr_iter1-lats +srcdir=exp/tri4b_pretrain-dbn_dnn_smbr +acwt=0.10 + +# First we need to generate lattices and alignments: +#false && \ +{ +steps/align_nnet.sh --nj 100 --cmd "$train_cmd" \ + data-fmllr-tri4b/train_si284 data/lang $srcdir ${srcdir}_ali_si284 || exit 1; +steps/make_denlats_nnet.sh --nj 100 --cmd "$decode_cmd" \ + --config conf/decode_dnn.config --acwt $acwt \ + data-fmllr-tri4b/train_si284 data/lang $srcdir ${srcdir}_denlats_si284 || exit 1; +} +# Now we re-train the hybrid by several iterations of sMBR +#false && \ +{ +steps/train_nnet_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ + data-fmllr-tri4b/train_si284 data/lang $srcdir \ + ${srcdir}_ali_si284 ${srcdir}_denlats_si284 $dir +} +# Decode +#false && \ +{ +for ITER in 1 2 3 4; do + # decode dev93 with big dict graph_bd_tgpr + steps/decode_nnet.sh --nj 10 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4b/graph_bd_tgpr data-fmllr-tri4b/test_dev93 $dir/decode_dev93_bd_tgpr_it${ITER} || exit 1 + # decode eval92 with big dict graph_bd_tgpr + steps/decode_nnet.sh --nj 8 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4b/graph_bd_tgpr data-fmllr-tri4b/test_eval92 $dir/decode_eval92_bd_tgpr_it${ITER} || exit 1 +done +} + + +# Getting results [see RESULTS file] +# for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done diff --git a/egs/chime_wsj0/s5/local/run_fwdbwd.sh b/egs/chime_wsj0/s5/local/run_fwdbwd.sh new file mode 100755 index 000000000..c84f2f1e0 --- /dev/null +++ b/egs/chime_wsj0/s5/local/run_fwdbwd.sh @@ -0,0 +1,41 @@ +#prepare reverse lexicon and language model for backwards decoding +utils/prepare_lang.sh --reverse true data/local/dict "" data/local/lang_tmp.reverse data/lang.reverse || exit 1; +utils/reverse_lm.sh data/local/nist_lm/lm_bg_5k.arpa.gz data/lang.reverse data/lang_test_bg_5k.reverse || exit 1; +utils/reverse_lm_test.sh data/lang_test_bg_5k data/lang_test_bg_5k.reverse || exit 1; + +# 
normal forward decoding +utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg5k +steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --nj 8 --cmd "$decode_cmd" \ + exp/tri2a/graph_bg5k data/test_eval92 exp/tri2a/decode_eval92_bg5k_10 || exit 1; + +# backward decoding +utils/mkgraph.sh --reverse data/lang_test_bg_5k.reverse exp/tri2a exp/tri2a/graph_bg5k_r +steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --reverse true --nj 8 --cmd "$decode_cmd" \ + exp/tri2a/graph_bg5k_r data/test_eval92 exp/tri2a/decode_eval92_bg5k_reverse10 || exit 1; + +# pingpong decoding +steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --reverse true --nj 8 --cmd "$decode_cmd" \ + --first_pass exp/tri2a/decode_eval92_bg5k_10 exp/tri2a/graph_bg5k_r data/test_eval92 exp/tri2a/decode_eval92_bg5k_pingpong10 || exit 1; +steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --nj 8 --cmd "$decode_cmd" \ + --first_pass exp/tri2a/decode_eval92_bg5k_reverse10 exp/tri2a/graph_bg5k data/test_eval92 exp/tri2a/decode_eval92_bg5k_pongping10 || exit 1; + +# same for bigger language models (on machine with 8GB RAM, you can run the whole decoding in 3-4 min without SGE) +utils/prepare_lang.sh --reverse true data/local/dict_larger "" data/local/lang_larger.reverse data/lang_bd.reverse || exit; +utils/reverse_lm.sh --lexicon data/local/dict_larger/lexicon.txt data/local/local_lm/3gram-mincount/lm_pr6.0.gz data/lang_bd.reverse data/lang_test_bd_tgpr.reverse || exit 1; +utils/reverse_lm_test.sh data/lang_test_bd_tgpr data/lang_test_bd_tgpr.reverse || exit 1; + +utils/mkgraph.sh data/lang_test_bd_tgpr exp/tri2a exp/tri2a/graph_bd_tgpr +steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --nj 4 --cmd run.pl \ + exp/tri2a/graph_bd_tgpr data/test_eval92 exp/tri2a/decode_eval92_bdtgpr4_10 || exit 1; + +utils/mkgraph.sh --reverse data/lang_test_bd_tgpr.reverse exp/tri2a exp/tri2a/graph_bd_tgpr_r +steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --reverse true --nj 4 --cmd run.pl \ + exp/tri2a/graph_bd_tgpr_r data/test_eval92 exp/tri2a/decode_eval92_bdtgpr4_reverse10 || exit 1; + +steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --reverse true --nj 4 --cmd run.pl \ + --first_pass exp/tri2a/decode_eval92_bdtgpr4_10 exp/tri2a/graph_bd_tgpr_r data/test_eval92 \ + exp/tri2a/decode_eval92_bdtgpr4_pingpong10 || exit 1; + +steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --nj 4 --cmd run.pl \ + --first_pass exp/tri2a/decode_eval92_bdtgpr4_reverse10 exp/tri2a/graph_bd_tgpr data/test_eval92 \ + exp/tri2a/decode_eval92_bdtgpr4_pongping10 || exit 1; diff --git a/egs/chime_wsj0/s5/local/run_mmi_tri2b.sh b/egs/chime_wsj0/s5/local/run_mmi_tri2b.sh new file mode 100755 index 000000000..6517e46a1 --- /dev/null +++ b/egs/chime_wsj0/s5/local/run_mmi_tri2b.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +. ./cmd.sh + +# Train and test MMI (and boosted MMI) on tri2b system. +steps/make_denlats.sh --sub-split 20 --nj 10 --cmd "$train_cmd" \ + data/train_si84 data/lang exp/tri2b exp/tri2b_denlats_si84 || exit 1; + +# train the basic MMI system. +steps/train_mmi.sh --cmd "$train_cmd" \ + data/train_si84 data/lang exp/tri2b_ali_si84 \ + exp/tri2b_denlats_si84 exp/tri2b_mmi || exit 1; +for iter in 3 4; do + steps/decode_si.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ + exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b_mmi/decode_tgpr_dev93_it$iter & + steps/decode_si.sh --nj 8 --cmd "$decode_cmd" --iter $iter \ + exp/tri2b/graph_tgpr data/test_eval92 exp/tri2b_mmi/decode_tgpr_eval92_it$iter & +done + +# MMI with 0.1 boosting factor. 
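+# (Boosted MMI weights denominator paths up in proportion to the number of errors
+# they contain, controlled by the --boost factor; 0.1 here.)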
+steps/train_mmi.sh --cmd "$train_cmd" --boost 0.1 \ + data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 \ + exp/tri2b_mmi_b0.1 || exit 1; + +for iter in 3 4; do + steps/decode_si.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ + exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b_mmi_b0.1/decode_tgpr_dev93_it$iter & + steps/decode_si.sh --nj 8 --cmd "$decode_cmd" --iter $iter \ + exp/tri2b/graph_tgpr data/test_eval92 exp/tri2b_mmi_b0.1/decode_tgpr_eval92_it$iter & +done + + +# Train a UBM with 400 components, for fMMI. +steps/train_diag_ubm.sh --silence-weight 0.5 --nj 10 --cmd "$train_cmd" \ + 400 data/train_si84 data/lang exp/tri2b_ali_si84 exp/dubm2b + + steps/train_mmi_fmmi.sh --boost 0.1 --cmd "$train_cmd" \ + data/train_si84 data/lang exp/tri2b_ali_si84 exp/dubm2b exp/tri2b_denlats_si84 \ + exp/tri2b_fmmi_b0.1 + + for iter in `seq 3 8`; do + steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ + exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it$iter & + done + + steps/train_mmi_fmmi.sh --learning-rate 0.005 --boost 0.1 --cmd "$train_cmd" \ + data/train_si84 data/lang exp/tri2b_ali_si84 exp/dubm2b exp/tri2b_denlats_si84 \ + exp/tri2b_fmmi_b0.1_lr0.005 || exit 1; + for iter in `seq 3 8`; do + steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ + exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it$iter & + done + + steps/train_mmi_fmmi_indirect.sh --boost 0.1 --cmd "$train_cmd" \ + data/train_si84 data/lang exp/tri2b_ali_si84 exp/dubm2b exp/tri2b_denlats_si84 \ + exp/tri2b_fmmi_indirect_b0.1 + for iter in `seq 3 8`; do + steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ + exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it$iter & + done diff --git a/egs/chime_wsj0/s5/local/run_mmi_tri4b.sh b/egs/chime_wsj0/s5/local/run_mmi_tri4b.sh new file mode 100755 index 000000000..db34f8e1d --- /dev/null +++ b/egs/chime_wsj0/s5/local/run_mmi_tri4b.sh @@ -0,0 +1,50 @@ +#!/bin/bash +. ./cmd.sh + +steps/make_denlats.sh --nj 30 --sub-split 30 --cmd "$train_cmd" \ + --transform-dir exp/tri4b_ali_si284 \ + data/train_si284 data/lang exp/tri4b exp/tri4b_denlats_si284 || exit 1; + +steps/train_mmi.sh --cmd "$train_cmd" --boost 0.1 \ + data/train_si284 data/lang exp/tri4b_ali_si284 exp/tri4b_denlats_si284 \ + exp/tri4b_mmi_b0.1 || exit 1; + +steps/decode.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri3b/decode_tgpr_dev93 \ + exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b_mmi_b0.1/decode_tgpr_dev93 + +#first, train UBM for fMMI experiments. +steps/train_diag_ubm.sh --silence-weight 0.5 --nj 30 --cmd "$train_cmd" \ + 600 data/train_si284 data/lang exp/tri4b_ali_si284 exp/dubm4b + +# Next, fMMI+MMI. +steps/train_mmi_fmmi.sh \ + --boost 0.1 --cmd "$train_cmd" data/train_si284 data/lang exp/tri4b_ali_si284 exp/dubm4b exp/tri4b_denlats_si284 \ + exp/tri4b_fmmi_a || exit 1; + +for iter in 3 4 5 6 7 8; do + steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri3b/decode_tgpr_dev93 exp/tri4b/graph_tgpr data/test_dev93 \ + exp/tri4b_fmmi_a/decode_tgpr_dev93_it$iter & +done +# decode the last iter with the bd model. 
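+# ('bd' refers to the big-dictionary setup, i.e. the graph_bd_tgpr decoding graph.)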
+for iter in 8; do + steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri3b/decode_bd_tgpr_dev93 exp/tri4b/graph_bd_tgpr data/test_dev93 \ + exp/tri4b_fmmi_a/decode_bd_tgpr_dev93_it$iter & + steps/decode_fmmi.sh --nj 8 --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri3b/decode_bd_tgpr_eval92 exp/tri4b/graph_bd_tgpr data/test_eval92 \ + exp/tri4b_fmmi_a/decode_tgpr_eval92_it$iter & +done + + +# fMMI + mmi with indirect differential. +steps/train_mmi_fmmi_indirect.sh \ + --boost 0.1 --cmd "$train_cmd" data/train_si284 data/lang exp/tri4b_ali_si284 exp/dubm4b exp/tri4b_denlats_si284 \ + exp/tri4b_fmmi_indirect || exit 1; + +for iter in 3 4 5 6 7 8; do + steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri3b/decode_tgpr_dev93 exp/tri4b/graph_tgpr data/test_dev93 \ + exp/tri4b_fmmi_indirect/decode_tgpr_dev93_it$iter & +done + diff --git a/egs/chime_wsj0/s5/local/run_nnet_cpu.sh b/egs/chime_wsj0/s5/local/run_nnet_cpu.sh new file mode 100755 index 000000000..c72e521f1 --- /dev/null +++ b/egs/chime_wsj0/s5/local/run_nnet_cpu.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +. ./cmd.sh + + +# ... + +local/nnet2/run_5c.sh + diff --git a/egs/chime_wsj0/s5/local/run_raw_fmllr.sh b/egs/chime_wsj0/s5/local/run_raw_fmllr.sh new file mode 100644 index 000000000..be7d52e1c --- /dev/null +++ b/egs/chime_wsj0/s5/local/run_raw_fmllr.sh @@ -0,0 +1,66 @@ +#!/bin/bash + + +steps/align_raw_fmllr.sh --nj 10 --cmd "$train_cmd" --use-graphs true \ + data/train_si84 data/lang exp/tri2b exp/tri2b_ali_si84_raw + +steps/train_raw_sat.sh --cmd "$train_cmd" \ + 2500 15000 data/train_si84 data/lang exp/tri2b_ali_si84_raw exp/tri3c || exit 1; + + +mfccdir=mfcc +for x in test_eval92 test_eval93 test_dev93 ; do + y=${x}_utt + cp -rT data/$x data/$y + cat data/$x/utt2spk | awk '{print $1, $1;}' > data/$y/utt2spk; + cp data/$y/utt2spk data/$y/spk2utt; + steps/compute_cmvn_stats.sh data/$y exp/make_mfcc/$y $mfccdir || exit 1; +done + +( +utils/mkgraph.sh data/lang_test_tgpr exp/tri3c exp/tri3c/graph_tgpr || exit 1; +steps/decode_raw_fmllr.sh --nj 10 --cmd "$decode_cmd" \ + exp/tri3c/graph_tgpr data/test_dev93 exp/tri3c/decode_tgpr_dev93 || exit 1; +steps/decode_raw_fmllr.sh --nj 8 --cmd "$decode_cmd" \ + exp/tri3c/graph_tgpr data/test_eval92 exp/tri3c/decode_tgpr_eval92 || exit 1; + +steps/decode_raw_fmllr.sh --nj 30 --cmd "$decode_cmd" \ + exp/tri3c/graph_tgpr data/test_dev93_utt exp/tri3c/decode_tgpr_dev93_utt || exit 1; +steps/decode_raw_fmllr.sh --nj 30 --cmd "$decode_cmd" \ + exp/tri3c/graph_tgpr data/test_eval92_utt exp/tri3c/decode_tgpr_eval92_utt || exit 1; + +steps/decode_raw_fmllr.sh --use-normal-fmllr true --nj 10 --cmd "$decode_cmd" \ + exp/tri3c/graph_tgpr data/test_dev93 exp/tri3c/decode_tgpr_dev93_2fmllr || exit 1; +steps/decode_raw_fmllr.sh --use-normal-fmllr true --nj 8 --cmd "$decode_cmd" \ + exp/tri3c/graph_tgpr data/test_eval92 exp/tri3c/decode_tgpr_eval92_2fmllr || exit 1; +)& + +( +utils/mkgraph.sh data/lang_test_bd_tgpr exp/tri3c exp/tri3c/graph_bd_tgpr || exit 1; + +steps/decode_raw_fmllr.sh --cmd "$decode_cmd" --nj 8 exp/tri3c/graph_bd_tgpr \ + data/test_eval92 exp/tri3c/decode_bd_tgpr_eval92 + steps/decode_raw_fmllr.sh --cmd "$decode_cmd" --nj 10 exp/tri3c/graph_bd_tgpr \ + data/test_dev93 exp/tri3c/decode_bd_tgpr_dev93 +)& + +steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ + data/train_si284 data/lang exp/tri3c exp/tri3c_ali_si284 || exit 1; + + +steps/train_raw_sat.sh --cmd "$train_cmd" \ + 4200 40000 data/train_si284 data/lang 
exp/tri3c_ali_si284 exp/tri4d || exit 1; +( + utils/mkgraph.sh data/lang_test_tgpr exp/tri4d exp/tri4d/graph_tgpr || exit 1; + steps/decode_raw_fmllr.sh --nj 10 --cmd "$decode_cmd" \ + exp/tri4d/graph_tgpr data/test_dev93 exp/tri4d/decode_tgpr_dev93 || exit 1; + steps/decode_raw_fmllr.sh --nj 8 --cmd "$decode_cmd" \ + exp/tri4d/graph_tgpr data/test_eval92 exp/tri4d/decode_tgpr_eval92 || exit 1; +) & + + +wait + + +#for x in exp/tri3{b,c}/decode_tgpr*; do grep WER $x/wer_* | utils/best_wer.sh ; done + diff --git a/egs/chime_wsj0/s5/local/run_rnnlms_sgmm5b.sh b/egs/chime_wsj0/s5/local/run_rnnlms_sgmm5b.sh new file mode 100755 index 000000000..67fcee50a --- /dev/null +++ b/egs/chime_wsj0/s5/local/run_rnnlms_sgmm5b.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +for test in dev93 eval92; do + + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_bd_tgpr data/lang_test_bd_fg \ + data/test_${test} exp/sgmm5b_mmi_b0.1/decode_bd_tgpr_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 || exit 1; + + +# Note: for N-best-list generation, choosing the acoustic scale (12) that gave +# the best WER on this test set. Ideally we should do this on a dev set. + + # This step interpolates a small RNNLM (with weight 0.25) with the 4-gram LM. + steps/rnnlmrescore.sh \ + --N 100 --cmd "$decode_cmd" --inv-acwt 12 \ + 0.25 data/lang_test_bd_fg data/local/rnnlm.h30.voc10k data/test_${test} \ + exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm30_0.25 \ + || exit 1; + + steps/rnnlmrescore.sh \ + --N 100 --cmd "$decode_cmd" --inv-acwt 12 \ + 0.5 data/lang_test_bd_fg data/local/rnnlm.h100.voc20k data/test_${test} \ + exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm100_0.5 \ + || exit 1; + + steps/rnnlmrescore.sh \ + --N 100 --cmd "$decode_cmd" --inv-acwt 12 \ + 0.5 data/lang_test_bd_fg data/local/rnnlm.h200.voc30k data/test_${test} \ + exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm200_0.5 \ + || exit 1; + + steps/rnnlmrescore.sh \ + --N 100 --cmd "$decode_cmd" --inv-acwt 12 \ + 0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_${test} \ + exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm300_0.5 \ + || exit 1; + + steps/rnnlmrescore.sh \ + --N 100 --cmd "$decode_cmd" --inv-acwt 12 \ + 0.75 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_${test} \ + exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm300_0.75 \ + || exit 1; +done diff --git a/egs/chime_wsj0/s5/local/run_rnnlms_tri3b.sh b/egs/chime_wsj0/s5/local/run_rnnlms_tri3b.sh new file mode 100755 index 000000000..b98446e7b --- /dev/null +++ b/egs/chime_wsj0/s5/local/run_rnnlms_tri3b.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +. cmd.sh + + # This step interpolates a small RNNLM (with weight 0.25) with the 4-gram LM. 
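+ # (rnnlmrescore.sh works on N-best lists: the N best hypotheses are extracted from
+ # each lattice, rescored with the RNNLM, and the RNNLM score is interpolated with the
+ # existing LM score using the weight given as the first non-option argument.)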
+steps/rnnlmrescore.sh \ + --N 100 --cmd "$decode_cmd" --inv-acwt 17 \ + 0.25 data/lang_test_bd_fg data/local/rnnlm.h30.voc10k data/test_eval92 \ + exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm30_0.25 \ + || exit 1; + +steps/rnnlmrescore.sh \ + --N 100 --cmd "$decode_cmd" --inv-acwt 17 \ + 0.5 data/lang_test_bd_fg data/local/rnnlm.h100.voc20k data/test_eval92 \ + exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm100_0.5 \ + || exit 1; + +steps/rnnlmrescore.sh \ + --N 100 --cmd "$decode_cmd" --inv-acwt 17 \ + 0.5 data/lang_test_bd_fg data/local/rnnlm.h200.voc30k data/test_eval92 \ + exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm200_0.5 \ + || exit 1; + +steps/rnnlmrescore.sh \ + --N 100 --cmd "$decode_cmd" --inv-acwt 17 \ + 0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \ + exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5 \ + || exit 1; + +steps/rnnlmrescore.sh \ + --N 1000 --cmd "$decode_cmd" --inv-acwt 17 \ + 0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \ + exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5_N1000 + +dir=exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.75_N1000 +rm -rf $dir +cp -r exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5_N1000 $dir +steps/rnnlmrescore.sh \ + --stage 7 --N 1000 --cmd "$decode_cmd" --inv-acwt 17 \ + 0.75 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \ + exp/tri3b/decode_bd_tgpr_eval92_fg $dir + +dir=exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.75 +rm -rf $dir +cp -r exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5 $dir +steps/rnnlmrescore.sh \ + --stage 7 --N 100 --cmd "$decode_cmd" --inv-acwt 17 \ + 0.75 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \ + exp/tri3b/decode_bd_tgpr_eval92_fg $dir + +dir=exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.25 +rm -rf $dir +cp -r exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5 $dir +steps/rnnlmrescore.sh \ + --stage 7 --N 100 --cmd "$decode_cmd" --inv-acwt 17 \ + 0.25 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \ + exp/tri3b/decode_bd_tgpr_eval92_fg $dir + +steps/rnnlmrescore.sh \ + --N 10 --cmd "$decode_cmd" --inv-acwt 17 \ + 0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_eval92 \ + exp/tri3b/decode_bd_tgpr_eval92_fg exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm300_0.5_N10 \ + || exit 1; + diff --git a/egs/chime_wsj0/s5/local/run_sgmm.sh b/egs/chime_wsj0/s5/local/run_sgmm.sh new file mode 100755 index 000000000..62be4d837 --- /dev/null +++ b/egs/chime_wsj0/s5/local/run_sgmm.sh @@ -0,0 +1,113 @@ +#!/bin/bash + +# This script is invoked from ../run.sh +# It contains some SGMM-related scripts that I am breaking out of the main run.sh for clarity. + +. cmd.sh + +# SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for +# training, but this shouldn't have much effect. 
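+# The si84 block below: fMLLR alignments from tri4b, a 400-Gaussian UBM, SGMM training
+# (3500/10000), tgpr decoding, then SGMM alignments, denominator lattices and
+# boosted-MMI (b=0.1) training, with the discriminative models decoded at iterations 1-4.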
+ +( + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train_si84 data/lang exp/tri4b exp/tri4b_ali_si84 || exit 1; + + steps/train_ubm.sh --cmd "$train_cmd" \ + 400 data/train_si84 data/lang exp/tri4b_ali_si84 exp/ubm5a || exit 1; + + steps/train_sgmm.sh --cmd "$train_cmd" \ + 3500 10000 data/train_si84 data/lang exp/tri4b_ali_si84 \ + exp/ubm5b/final.ubm exp/sgmm5a || exit 1; + + ( + utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5a exp/sgmm5a/graph_tgpr + steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \ + exp/sgmm5a/graph_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 + ) & + + steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si84 \ + --use-graphs true --use-gselect true data/train_si84 data/lang exp/sgmm5a exp/sgmm5a_ali_si84 || exit 1; + steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 \ + data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 + + steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \ + data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1 + + for iter in 1 2 3 4; do + steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \ + exp/sgmm5a_mmi_b0.1/decode_tgpr_dev93_it$iter & + done + + steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \ + --update-opts "--cov-min-value=0.9" data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1_m0.9 + + for iter in 1 2 3 4; do + steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \ + exp/sgmm5a_mmi_b0.1_m0.9/decode_tgpr_dev93_it$iter & + done + +) & + + +( +# The next commands are the same thing on all the si284 data. 
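+# (mainly with larger models: a 600-Gaussian UBM and a 5500/25000 SGMM instead of
+# 400 and 3500/10000, and decoding with the big-dictionary graph as well.)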
+ +# SGMM system on the si284 data [sgmm5b] + steps/train_ubm.sh --cmd "$train_cmd" \ + 600 data/train_si284 data/lang exp/tri4b_ali_si284 exp/ubm5b || exit 1; + + steps/train_sgmm.sh --cmd "$train_cmd" \ + 5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \ + exp/ubm5b/final.ubm exp/sgmm5b || exit 1; + + ( + utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5b exp/sgmm5b/graph_tgpr + steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \ + exp/sgmm5b/graph_tgpr data/test_dev93 exp/sgmm5b/decode_tgpr_dev93 + steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_eval92 \ + exp/sgmm5b/graph_tgpr data/test_eval92 exp/sgmm5b/decode_tgpr_eval92 + + utils/mkgraph.sh data/lang_test_bd_tgpr exp/sgmm5b exp/sgmm5b/graph_bd_tgpr || exit 1; + steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_dev93 \ + exp/sgmm5b/graph_bd_tgpr data/test_dev93 exp/sgmm5b/decode_bd_tgpr_dev93 + steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_eval92 \ + exp/sgmm5b/graph_bd_tgpr data/test_eval92 exp/sgmm5b/decode_bd_tgpr_eval92 + ) & + + steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si284 \ + --use-graphs true --use-gselect true data/train_si284 data/lang exp/sgmm5b exp/sgmm5b_ali_si284 + + steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 \ + data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284 + + steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \ + data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284 exp/sgmm5b_mmi_b0.1 + + for iter in 1 2 3 4; do + for test in dev93 eval92; do + steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri4b/decode_tgpr_${test} data/lang_test_tgpr data/test_${test} exp/sgmm5b/decode_tgpr_${test} \ + exp/sgmm5b_mmi_b0.1/decode_tgpr_${test}_it$iter & + + steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_tgpr data/test_${test} exp/sgmm5b/decode_bd_tgpr_${test} \ + exp/sgmm5b_mmi_b0.1/decode_bd_tgpr_${test}_it$iter & + done + done +) & + + + +# Train quinphone SGMM system. + +steps/train_sgmm.sh --cmd "$train_cmd" \ + --context-opts "--context-width=5 --central-position=2" \ + 5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \ + exp/ubm5b/final.ubm exp/sgmm5c || exit 1; + +# Decode from lattices in exp/sgmm5a/decode_tgpr_dev93. +steps/decode_sgmm_fromlats.sh --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \ + data/test_dev93 data/lang_test_tgpr exp/sgmm5a/decode_tgpr_dev93 exp/sgmm5c/decode_tgpr_dev93 + diff --git a/egs/chime_wsj0/s5/local/run_sgmm2.sh b/egs/chime_wsj0/s5/local/run_sgmm2.sh new file mode 100755 index 000000000..2e9f5d8e1 --- /dev/null +++ b/egs/chime_wsj0/s5/local/run_sgmm2.sh @@ -0,0 +1,148 @@ +#!/bin/bash + +# This script is invoked from ../run.sh +# It contains some SGMM-related scripts that I am breaking out of the main run.sh for clarity. + +. cmd.sh + +# Note: you might want to try to give the option --spk-dep-weights=false to train_sgmm2.sh; +# this takes out the "symmetric SGMM" part which is not always helpful. + +# SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for +# training, but this shouldn't have much effect. 
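+# The structure mirrors run_sgmm.sh, but uses the SGMM2 tools (train_sgmm2.sh,
+# decode_sgmm2.sh, etc.) and different model sizes (7000/9000 for the si84 system below).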
+ +( + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train_si84 data/lang exp/tri4b exp/tri4b_ali_si84 || exit 1; + + steps/train_ubm.sh --cmd "$train_cmd" \ + 400 data/train_si84 data/lang exp/tri4b_ali_si84 exp/ubm5a || exit 1; + + steps/train_sgmm2.sh --cmd "$train_cmd" \ + 7000 9000 data/train_si84 data/lang exp/tri4b_ali_si84 \ + exp/ubm5a/final.ubm exp/sgmm2_5a || exit 1; + + ( + utils/mkgraph.sh data/lang_test_tgpr exp/sgmm2_5a exp/sgmm2_5a/graph_tgpr + steps/decode_sgmm2.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \ + exp/sgmm2_5a/graph_tgpr data/test_dev93 exp/sgmm2_5a/decode_tgpr_dev93 + ) & + + steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si84 \ + --use-graphs true --use-gselect true data/train_si84 data/lang exp/sgmm2_5a exp/sgmm2_5a_ali_si84 || exit 1; + steps/make_denlats_sgmm2.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 \ + data/train_si84 data/lang exp/sgmm2_5a_ali_si84 exp/sgmm2_5a_denlats_si84 + + steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \ + data/train_si84 data/lang exp/sgmm2_5a_ali_si84 exp/sgmm2_5a_denlats_si84 exp/sgmm2_5a_mmi_b0.1 + + for iter in 1 2 3 4; do + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm2_5a/decode_tgpr_dev93 \ + exp/sgmm2_5a_mmi_b0.1/decode_tgpr_dev93_it$iter & + done + + steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \ + --update-opts "--cov-min-value=0.9" data/train_si84 data/lang exp/sgmm2_5a_ali_si84 exp/sgmm2_5a_denlats_si84 exp/sgmm2_5a_mmi_b0.1_m0.9 + + for iter in 1 2 3 4; do + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm2_5a/decode_tgpr_dev93 \ + exp/sgmm2_5a_mmi_b0.1_m0.9/decode_tgpr_dev93_it$iter & + done + +) & + + +( +# The next commands are the same thing on all the si284 data. 
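+# (here an 11000/25000 SGMM2 on top of a 600-Gaussian UBM; a quinphone variant and
+# boosted-MMI training follow further down.)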
+ +# SGMM system on the si284 data [sgmm5b] + steps/train_ubm.sh --cmd "$train_cmd" \ + 600 data/train_si284 data/lang exp/tri4b_ali_si284 exp/ubm5b || exit 1; + + steps/train_sgmm2.sh --cmd "$train_cmd" \ + 11000 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \ + exp/ubm5b/final.ubm exp/sgmm2_5b || exit 1; + + ( + utils/mkgraph.sh data/lang_test_tgpr exp/sgmm2_5b exp/sgmm2_5b/graph_tgpr + steps/decode_sgmm2.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \ + exp/sgmm2_5b/graph_tgpr data/test_dev93 exp/sgmm2_5b/decode_tgpr_dev93 + steps/decode_sgmm2.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_eval92 \ + exp/sgmm2_5b/graph_tgpr data/test_eval92 exp/sgmm2_5b/decode_tgpr_eval92 + + utils/mkgraph.sh data/lang_test_bd_tgpr exp/sgmm2_5b exp/sgmm2_5b/graph_bd_tgpr || exit 1; + steps/decode_sgmm2.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_dev93 \ + exp/sgmm2_5b/graph_bd_tgpr data/test_dev93 exp/sgmm2_5b/decode_bd_tgpr_dev93 + steps/decode_sgmm2.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_eval92 \ + exp/sgmm2_5b/graph_bd_tgpr data/test_eval92 exp/sgmm2_5b/decode_bd_tgpr_eval92 + ) & + + + # This shows how you would build and test a quinphone SGMM2 system, but + ( + steps/train_sgmm2.sh --cmd "$train_cmd" \ + --context-opts "--context-width=5 --central-position=2" \ + 11000 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \ + exp/ubm5b/final.ubm exp/sgmm2_5c || exit 1; + # Decode from lattices in exp/sgmm2_5b + steps/decode_sgmm2_fromlats.sh --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \ + data/test_dev93 data/lang_test_tgpr exp/sgmm2_5b/decode_tgpr_dev93 exp/sgmm2_5c/decode_tgpr_dev93 + steps/decode_sgmm2_fromlats.sh --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_eval92 \ + data/test_eval92 data/lang_test_tgpr exp/sgmm2_5b/decode_tgpr_eval92 exp/sgmm2_5c/decode_tgpr_eval92 + ) & + + + steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si284 \ + --use-graphs true --use-gselect true data/train_si284 data/lang exp/sgmm2_5b exp/sgmm2_5b_ali_si284 + + steps/make_denlats_sgmm2.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 \ + data/train_si284 data/lang exp/sgmm2_5b_ali_si284 exp/sgmm2_5b_denlats_si284 + + steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \ + data/train_si284 data/lang exp/sgmm2_5b_ali_si284 exp/sgmm2_5b_denlats_si284 exp/sgmm2_5b_mmi_b0.1 + + for iter in 1 2 3 4; do + for test in eval92; do # dev93 + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_fg data/test_${test} exp/sgmm2_5b/decode_bd_tgpr_${test} \ + exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_${test}_it$iter & + done + done + + steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \ + --zero-if-disjoint true data/train_si284 data/lang exp/sgmm2_5b_ali_si284 exp/sgmm2_5b_denlats_si284 exp/sgmm2_5b_mmi_b0.1_z + + for iter in 1 2 3 4; do + for test in eval92 dev93; do + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_fg data/test_${test} exp/sgmm2_5b/decode_bd_tgpr_${test} \ + exp/sgmm2_5b_mmi_b0.1_z/decode_bd_tgpr_${test}_it$iter & + done + done + +) & + +wait + +# Examples of combining some of the best decodings: SGMM+MMI with +# MMI+fMMI on a conventional system. 
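+# (score_combine.sh builds a union of the per-system lattices, normalized by removing
+# the total forward cost from each, and runs MBR decoding on the combined lattice.)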
+ +local/score_combine.sh data/test_eval92 \ + data/lang_test_bd_tgpr \ + exp/tri4b_fmmi_a/decode_tgpr_eval92_it8 \ + exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3 \ + exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it8_3 + + +# %WER 4.43 [ 250 / 5643, 41 ins, 12 del, 197 sub ] exp/tri4b_fmmi_a/decode_tgpr_eval92_it8/wer_11 +# %WER 3.85 [ 217 / 5643, 35 ins, 11 del, 171 sub ] exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3/wer_10 +# combined to: +# %WER 3.76 [ 212 / 5643, 32 ins, 12 del, 168 sub ] exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it8_3/wer_12 + +# Checking MBR decode of baseline: +cp -r -T exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3{,.mbr} +local/score_mbr.sh data/test_eval92 data/lang_test_bd_tgpr exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr +# MBR decoding did not seem to help (baseline was 3.85). I think this is normal at such low WERs. +%WER 3.86 [ 218 / 5643, 35 ins, 11 del, 172 sub ] exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr/wer_10 diff --git a/egs/chime_wsj0/s5/local/score.sh b/egs/chime_wsj0/s5/local/score.sh new file mode 100755 index 000000000..b18f35041 --- /dev/null +++ b/egs/chime_wsj0/s5/local/score.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=true +reverse=false +word_ins_penalty=0.0 +min_lmwt=5 +max_lmwt=20 +#end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + echo " --reverse (true/false) # score with time reversed features " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 + +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! 
-f $f ] && echo "score.sh: no such file $f" && exit 1; +done + +mkdir -p $dir/scoring/log + +cat $data/text | sed 's:::g' | sed 's:::g' > $dir/scoring/test_filt.txt + +$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$word_ins_penalty ark:- ark:- \| \ + lattice-best-path --word-symbol-table=$symtab \ + ark:- ark,t:$dir/scoring/LMWT.tra || exit 1; + +if $reverse; then + for lmwt in `seq $min_lmwt $max_lmwt`; do + mv $dir/scoring/$lmwt.tra $dir/scoring/$lmwt.tra.orig + awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \ + <$dir/scoring/$lmwt.tra.orig >$dir/scoring/$lmwt.tra + done +fi + +# Note: the double level of quoting for the sed command +$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ + cat $dir/scoring/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1; + +exit 0; diff --git a/egs/chime_wsj0/s5/local/score_combine.sh b/egs/chime_wsj0/s5/local/score_combine.sh new file mode 100755 index 000000000..576962c74 --- /dev/null +++ b/egs/chime_wsj0/s5/local/score_combine.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +# Copyright 2013 Arnab Ghoshal + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# Script for system combination using minimum Bayes risk decoding. +# This calls lattice-combine to create a union of lattices that have been +# normalized by removing the total forward cost from them. The resulting lattice +# is used as input to lattice-mbr-decode. This should not be put in steps/ or +# utils/ since the scores on the combined lattice must not be scaled. + +# begin configuration section. +cmd=run.pl +min_lmwt=9 +max_lmwt=20 +lat_weights= +#end configuration section. + +help_message="Usage: "$(basename $0)" [options] [decode-dir3 ... ] +Options: + --cmd (run.pl|queue.pl...) # specify how to run the sub-processes. + --min-lmwt INT # minumum LM-weight for lattice rescoring + --max-lmwt INT # maximum LM-weight for lattice rescoring + --lat-weights STR # colon-separated string of lattice weights +"; + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -lt 5 ]; then + printf "$help_message\n"; + exit 1; +fi + +data=$1 +graphdir=$2 +odir=${@: -1} # last argument to the script +shift 2; +decode_dirs=( $@ ) # read the remaining arguments into an array +unset decode_dirs[${#decode_dirs[@]}-1] # 'pop' the last argument which is odir +num_sys=${#decode_dirs[@]} # number of systems to combine + +symtab=$graphdir/words.txt +[ ! -f $symtab ] && echo "$0: missing word symbol table '$symtab'" && exit 1; +[ ! 
-f $data/text ] && echo "$0: missing reference '$data/text'" && exit 1; + + +mkdir -p $odir/log + +for i in `seq 0 $[num_sys-1]`; do + model=${decode_dirs[$i]}/../final.mdl # model one level up from decode dir + for f in $model ${decode_dirs[$i]}/lat.1.gz ; do + [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; + done + lats[$i]="\"ark:gunzip -c ${decode_dirs[$i]}/lat.*.gz |\"" +done + +mkdir -p $odir/scoring/log + +cat $data/text | sed 's:::g' | sed 's:::g' \ + > $odir/scoring/test_filt.txt + +if [ -z "$lat_weights" ]; then + $cmd LMWT=$min_lmwt:$max_lmwt $odir/log/combine_lats.LMWT.log \ + lattice-combine --inv-acoustic-scale=LMWT ${lats[@]} ark:- \| \ + lattice-mbr-decode --word-symbol-table=$symtab ark:- \ + ark,t:$odir/scoring/LMWT.tra || exit 1; +else + $cmd LMWT=$min_lmwt:$max_lmwt $odir/log/combine_lats.LMWT.log \ + lattice-combine --inv-acoustic-scale=LMWT --lat-weights=$lat_weights \ + ${lats[@]} ark:- \| \ + lattice-mbr-decode --word-symbol-table=$symtab ark:- \ + ark,t:$odir/scoring/LMWT.tra || exit 1; +fi + +$cmd LMWT=$min_lmwt:$max_lmwt $odir/scoring/log/score.LMWT.log \ + cat $odir/scoring/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ + compute-wer --text --mode=present \ + ark:$odir/scoring/test_filt.txt ark,p:- ">&" $odir/wer_LMWT || exit 1; + +exit 0 diff --git a/egs/chime_wsj0/s5/local/score_mbr.sh b/egs/chime_wsj0/s5/local/score_mbr.sh new file mode 100755 index 000000000..4052512f7 --- /dev/null +++ b/egs/chime_wsj0/s5/local/score_mbr.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Script for minimum bayes risk decoding. + +[ -f ./path.sh ] && . ./path.sh; + +# begin configuration section. +cmd=run.pl +min_lmwt=9 +max_lmwt=20 +#end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score_mbr.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 + +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! -f $f ] && echo "score_mbr.sh: no such file $f" && exit 1; +done + +mkdir -p $dir/scoring/log + +cat $data/text | sed 's:::g' | sed 's:::g' > $dir/scoring/test_filt.txt + +# We submit the jobs separately, not as an array, because it's hard +# to get the inverse of the LM scales. 
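+# Each decode needs --acoustic-scale=1/LMWT, so the loop below computes that per
+# LM weight and runs the jobs in the background, using $dir/.error to flag failures.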
+rm $dir/.error 2>/dev/null +for inv_acwt in `seq $min_lmwt $max_lmwt`; do + acwt=`perl -e "print (1.0/$inv_acwt);"` + $cmd $dir/scoring/rescore_mbr.${inv_acwt}.log \ + lattice-mbr-decode --acoustic-scale=$acwt --word-symbol-table=$symtab \ + "ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/${inv_acwt}.tra \ + || touch $dir/.error & +done +wait; +[ -f $dir/.error ] && echo "score_mbr.sh: errror getting MBR outout."; + + +$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ + cat $dir/scoring/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring/test_filt.txt ark,p:- ">" $dir/wer_LMWT || exit 1; + diff --git a/egs/chime_wsj0/s5/local/wsj_data_prep.sh b/egs/chime_wsj0/s5/local/wsj_data_prep.sh new file mode 100755 index 000000000..685b57aa7 --- /dev/null +++ b/egs/chime_wsj0/s5/local/wsj_data_prep.sh @@ -0,0 +1,201 @@ +#!/bin/bash + +# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. + + +if [ $# -le 3 ]; then + echo "Arguments should be a list of WSJ directories, see ../run.sh for example." + exit 1; +fi + + +dir=`pwd`/data/local/data +lmdir=`pwd`/data/local/nist_lm +mkdir -p $dir $lmdir +local=`pwd`/local +utils=`pwd`/utils + +. ./path.sh # Needed for KALDI_ROOT +export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +cd $dir + +# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command +# line arguments being absolute pathnames. +rm -r links/ 2>/dev/null +mkdir links/ +ln -s $* links + +# Do some basic checks that we have what we expected. +if [ ! -d links/11-13.1 -o ! -d links/13-34.1 -o ! -d links/11-2.1 ]; then + echo "wsj_data_prep.sh: Spot check of command line arguments failed" + echo "Command line arguments must be absolute pathnames to WSJ directories" + echo "with names like 11-13.1." + exit 1; +fi + +# This version for SI-84 + +cat links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \ + $local/ndx2flist.pl $* | sort | \ + grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si84.flist + +nl=`cat train_si84.flist | wc -l` +[ "$nl" -eq 7138 ] || echo "Warning: expected 7138 lines in train_si84.flist, got $nl" + +# This version for SI-284 +cat links/13-34.1/wsj1/doc/indices/si_tr_s.ndx \ + links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \ + $local/ndx2flist.pl $* | sort | \ + grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si284.flist + +nl=`cat train_si284.flist | wc -l` +[ "$nl" -eq 37416 ] || echo "Warning: expected 37416 lines in train_si284.flist, got $nl" + +# Now for the test sets. +# links/13-34.1/wsj1/doc/indices/readme.doc +# describes all the different test sets. +# Note: each test-set seems to come in multiple versions depending +# on different vocabulary sizes, verbalized vs. non-verbalized +# pronunciations, etc. We use the largest vocab and non-verbalized +# pronunciations. +# The most normal one seems to be the "baseline 60k test set", which +# is h1_p0. 
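# Each test-set block below follows the same pattern: take the corpus-supplied
# .ndx index, map its disk ids to real paths under links/ with ndx2flist.pl,
# and sort the result into a .flist of sphere files. A sketch of the pattern
# (the output path shown is hypothetical, only to illustrate the format):
#   cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_20.ndx | \
#     $local/ndx2flist.pl $* | sort > some_set.flist
#   # resulting lines look roughly like .../links/11-14.1/wsj0/si_et_20/440/440c0401.wv1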
+ +# Nov'92 (333 utts) +# These index files have a slightly different format; +# have to add .wv1 +cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_20.ndx | \ + $local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \ + sort > test_eval92.flist + +# Nov'92 (330 utts, 5k vocab) +cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_05.ndx | \ + $local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \ + sort > test_eval92_5k.flist + +# Nov'93: (213 utts) +# Have to replace a wrong disk-id. +cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h1_p0.ndx | \ + sed s/13_32_1/13_33_1/ | \ + $local/ndx2flist.pl $* | sort > test_eval93.flist + +# Nov'93: (213 utts, 5k) +cat links/13-32.1/wsj1/doc/indices/wsj1/eval/h2_p0.ndx | \ + sed s/13_32_1/13_33_1/ | \ + $local/ndx2flist.pl $* | sort > test_eval93_5k.flist + +# Dev-set for Nov'93 (503 utts) +cat links/13-34.1/wsj1/doc/indices/h1_p0.ndx | \ + $local/ndx2flist.pl $* | sort > test_dev93.flist + +# Dev-set for Nov'93 (513 utts, 5k vocab) +cat links/13-34.1/wsj1/doc/indices/h2_p0.ndx | \ + $local/ndx2flist.pl $* | sort > test_dev93_5k.flist + + +# Dev-set Hub 1,2 (503, 913 utterances) + +# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt. +# Sometimes this gets copied from the CD's with upcasing, don't know +# why (could be older versions of the disks). +find `readlink links/13-16.1`/???1/??_??_20 -print | grep -i ".wv1" | sort > dev_dt_20.flist +find `readlink links/13-16.1`/???1/??_??_05 -print | grep -i ".wv1" | sort > dev_dt_05.flist + + +# Finding the transcript files: +for x in $*; do find -L $x -iname '*.dot'; done > dot_files.flist + +# Convert the transcripts into our format (no normalization yet) +for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do + $local/flist2scp.pl $x.flist | sort > ${x}_sph.scp + cat ${x}_sph.scp | awk '{print $1}' | $local/find_transcripts.pl dot_files.flist > $x.trans1 +done + +# Do some basic normalization steps. At this point we don't remove OOVs-- +# that will be done inside the training scripts, as we'd like to make the +# data-preparation stage independent of the specific lexicon used. +noiseword=""; +for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do + cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1; +done + +# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.) +for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do + awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp > ${x}_wav.scp +done + +# Make the utt2spk and spk2utt files. +for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do + cat ${x}_sph.scp | awk '{print $1}' | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk + cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; +done + + +#in case we want to limit lm's on most frequent words, copy lm training word frequency list +cp links/13-32.1/wsj1/doc/lng_modl/vocab/wfl_64.lst $lmdir +chmod u+w $lmdir/*.lst # had weird permissions on source. + +# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without +# verbalized pronunciations. This is the most common test setup, I understand. 
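# The base LMs below are taken directly from the corpus discs. The trigram
# source has extra material before the ARPA "\data\" marker, so the perl
# one-liner used below keeps only the part from that marker onwards. A
# conceptually equivalent sketch of that idiom (illustration only, not a
# replacement for the line actually run):
#   sed -n '/^\\data\\/,$p' < arpa_with_header > arpa_clean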
+ +cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb20onp.z $lmdir/lm_bg.arpa.gz || exit 1; +chmod u+w $lmdir/lm_bg.arpa.gz + +# trigram would be: +cat links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb20onp.z | \ + perl -e 'while(<>){ if(m/^\\data\\/){ print; last; } } while(<>){ print; }' | \ + gzip -c -f > $lmdir/lm_tg.arpa.gz || exit 1; + +prune-lm --threshold=1e-7 $lmdir/lm_tg.arpa.gz $lmdir/lm_tgpr.arpa || exit 1; +gzip -f $lmdir/lm_tgpr.arpa || exit 1; + +# repeat for 5k language models +cp links/13-32.1/wsj1/doc/lng_modl/base_lm/bcb05onp.z $lmdir/lm_bg_5k.arpa.gz || exit 1; +chmod u+w $lmdir/lm_bg_5k.arpa.gz + +# trigram would be: !only closed vocabulary here! +cp links/13-32.1/wsj1/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1; +chmod u+w $lmdir/lm_tg_5k.arpa.gz +gunzip $lmdir/lm_tg_5k.arpa.gz +tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz +rm $lmdir/lm_tg_5k.arpa + +prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1; +gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1; + + +if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then + rm wsj0-train-spkrinfo.txt + ! wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt && \ + echo "Getting wsj0-train-spkrinfo.txt from backup location" && \ + wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt +fi + +if [ ! -f wsj0-train-spkrinfo.txt ]; then + echo "Could not get the spkrinfo.txt file from LDC website (moved)?" + echo "This is possibly omitted from the training disks; couldn't find it." + echo "Everything else may have worked; we just may be missing gender info" + echo "which is only needed for VTLN-related diagnostics anyway." + exit 1 +fi +# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the +# LDC put it on the web. Perhaps it was accidentally omitted from the +# disks. + +cat links/11-13.1/wsj0/doc/spkrinfo.txt \ + links/13-32.1/wsj1/doc/evl_spok/spkrinfo.txt \ + links/13-34.1/wsj1/doc/dev_spok/spkrinfo.txt \ + links/13-34.1/wsj1/doc/train/spkrinfo.txt \ + ./wsj0-train-spkrinfo.txt | \ + perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \ + awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender + + +echo "Data preparation succeeded" diff --git a/egs/chime_wsj0/s5/local/wsj_extend_dict.sh b/egs/chime_wsj0/s5/local/wsj_extend_dict.sh new file mode 100755 index 000000000..38a06bb48 --- /dev/null +++ b/egs/chime_wsj0/s5/local/wsj_extend_dict.sh @@ -0,0 +1,173 @@ +#!/bin/bash + +# This script builds a larger word-list and dictionary +# than used for the LMs supplied with the WSJ corpus. +# It uses a couple of strategies to fill-in words in +# the LM training data but not in CMUdict. One is +# to generate special prons for possible acronyms, that +# just consist of the constituent letters. The other +# is designed to handle derivatives of known words +# (e.g. deriving the pron of a plural from the pron of +# the base-word), but in a more general, learned-from-data +# way. +# It makes use of scripts in local/dict/ + +if [ $# -ne 1 ]; then + echo "Usage: local/wsj_train_lms.sh /foo/bar/WSJ/13-32.1/" + exit 1 +fi +if [ "`basename $1`" != 13-32.1 ]; then + echo "Expecting the argument to this script to end in 13-32.1" + exit 1 +fi + +# e.g. 
+#srcdir=/mnt/matylda2/data/WSJ1/13-32.1 +export PATH=$PATH:`pwd`/local/dict/ +srcdir=$1 +mkdir -p data/local/dict_larger +dir=data/local/dict_larger +cp data/local/dict/* data/local/dict_larger # Various files describing phones etc. + # are there; we just want to copy them as the phoneset is the same. +rm data/local/dict_larger/lexicon.txt # we don't want this. +rm data/local/dict_larger/lexiconp.txt # we don't want this either. +mincount=2 # Minimum count of an OOV we will try to generate a pron for. + +[ ! -f data/local/dict/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1; + +# Remove comments from cmudict; print first field; remove +# words like FOO(1) which are alternate prons: our dict format won't +# include these markers. +grep -v ';;;' data/local/dict/cmudict/cmudict.0.7a | + perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu + +cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu + +echo "Getting training data [this should take at least a few seconds; if not, there's a problem]" + +# Convert to uppercase, remove XML-like markings. +# For words ending in "." that are not in CMUdict, we assume that these +# are periods that somehow remained in the data during data preparation, +# and we we replace the "." with "\n". Note: we found this by looking at +# oov.counts below (before adding this rule). + +touch $dir/cleaned.gz +if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then + echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]"; +else + gunzip -c $srcdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \ + | awk '/^){ chop; $isword{$_} = 1; } + while() { + @A = split(" ", $_); + for ($n = 0; $n < @A; $n++) { + $a = $A[$n]; + if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "." + # and have no other "." in them: treat as period. + print "$a"; + if ($n+1 < @A) { print "\n"; } + } else { print "$a "; } + } + print "\n"; + } + ' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz +fi + +# get unigram counts +echo "Getting unigram counts" +gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \ + awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams + +cat $dir/unigrams | awk -v dict=$dir/dict.cmu \ + 'BEGIN{while(getline $dir/oov.counts + +echo "Most frequent unseen unigrams are: " +head $dir/oov.counts + +# Prune away singleton counts, and remove things with numbers in +# (which should have been normalized) and with no letters at all. + + +cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \ + | awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist + +# Automatic rule-finding... + +# First make some prons for possible acronyms. +# Note: we don't do this for things like U.K or U.N, +# or A.B. (which doesn't exist anyway), +# as we consider this normalization/spelling errors. + +cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms + +mkdir $dir/f $dir/b # forward, backward directions of rules... + # forward is normal suffix + # rules, backward is reversed (prefix rules). These + # dirs contain stuff we create while making the rule-based + # extensions to the dictionary. + +# Remove ; and , from words, if they are present; these +# might crash our scripts, as they are used as separators there. 
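# filter_dict.pl (used below) implements that clean-up; conceptually it is
# close to this sketch (hedged: see the actual script for the precise
# behaviour applied to each entry):
#   sed 's/[;,]//g' dict.in > dict.out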
+filter_dict.pl $dir/dict.cmu > $dir/f/dict +cat $dir/oovlist | filter_dict.pl > $dir/f/oovs +reverse_dict.pl $dir/f/dict > $dir/b/dict +reverse_dict.pl $dir/f/oovs > $dir/b/oovs + +# The next stage takes a few minutes. +# Note: the forward stage takes longer, as English is +# mostly a suffix-based language, and there are more rules +# that it finds. +for d in $dir/f $dir/b; do + ( + cd $d + cat dict | get_rules.pl 2>get_rules.log >rules + get_rule_hierarchy.pl rules >hierarchy + awk '{print $1}' dict | get_candidate_prons.pl rules dict | \ + limit_candidate_prons.pl hierarchy | \ + score_prons.pl dict | \ + count_rules.pl >rule.counts + # the sort command below is just for convenience of reading. + score_rules.pl rules.with_scores + get_candidate_prons.pl rules.with_scores dict oovs | \ + limit_candidate_prons.pl hierarchy > oovs.candidates + ) & +done +wait + +# Merge the candidates. +reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates +select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \ + > $dir/dict.oovs + +cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged + +awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled +sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled + + +# add_counts.pl attaches to original counts to the list of handled/not-handled OOVs +add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts +add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts + +echo "**Top OOVs we handled are:**"; +head $dir/oovlist.handled.counts +echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**"; +head $dir/oovlist.not_handled.counts + + +echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`" +echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`" +echo "Count of OOVs we didn't handle due to low count is" \ + `awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts` +# The two files created above are for humans to look at, as diagnostics. + +cat < $dir/lexicon.txt +!SIL SIL + SPN + SPN + NSN +EOF + +echo "Created $dir/lexicon.txt" diff --git a/egs/chime_wsj0/s5/local/wsj_format_data.sh b/egs/chime_wsj0/s5/local/wsj_format_data.sh new file mode 100755 index 000000000..ee1450f59 --- /dev/null +++ b/egs/chime_wsj0/s5/local/wsj_format_data.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# This script takes data prepared in a corpus-dependent way +# in data/local/, and converts it into the "canonical" form, +# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug, +# data/train_si284, data/train_si84, etc. + +# Don't bother doing train_si84 separately (although we have the file lists +# in data/local/) because it's just the first 7138 utterances in train_si284. +# We'll create train_si84 after doing the feature extraction. + +. 
./path.sh || exit 1; + +echo "Preparing train and test data" +srcdir=data/local/data +lmdir=data/local/nist_lm +tmpdir=data/local/lm_tmp +lexicon=data/local/lang_tmp/lexiconp.txt +mkdir -p $tmpdir + +for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do + mkdir -p data/$x + cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; + cp $srcdir/$x.txt data/$x/text || exit 1; + cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1; + cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1; + utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1; +done + + +# Next, for each type of language model, create the corresponding FST +# and the corresponding lang_test_* directory. + +echo Preparing language models for test + +for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do + test=data/lang_test_${lm_suffix} + mkdir -p $test + for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \ + phones/; do + cp -r data/lang/$f $test + done + gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ + utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt + + # grep -v ' ' because the LM seems to have some strange and useless + # stuff in it with multiple 's in the history. Encountered some other similar + # things in a LM from Geoff. Removing all "illegal" combinations of and , + # which are supposed to occur only at being/end of utt. These can cause + # determinization failures of CLG [ends up being epsilon cycles]. + gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ + grep -v ' ' | \ + grep -v ' ' | \ + grep -v ' ' | \ + arpa2fst - | fstprint | \ + utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \ + utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ + --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ + fstrmepsilon > $test/G.fst + fstisstochastic $test/G.fst + # The output is like: + # 9.14233e-05 -0.259833 + # we do expect the first of these 2 numbers to be close to zero (the second is + # nonzero because the backoff weights make the states sum to >1). + # Because of the fiasco for these particular LMs, the first number is not + # as close to zero as it could be. + + # Everything below is only for diagnostic. + # Checking that G has no cycles with empty words on them (e.g. , ); + # this might cause determinization failure of CLG. + # #0 is treated as an empty word. + mkdir -p $tmpdir/g + awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \ + < "$lexicon" >$tmpdir/g/select_empty.fst.txt + fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ + fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst + fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && + echo "Language model has cycles with empty words" && exit 1 + rm -r $tmpdir/g +done + +echo "Succeeded in formatting data." +rm -r $tmpdir diff --git a/egs/chime_wsj0/s5/local/wsj_format_local_lms.sh b/egs/chime_wsj0/s5/local/wsj_format_local_lms.sh new file mode 100755 index 000000000..31b1a8662 --- /dev/null +++ b/egs/chime_wsj0/s5/local/wsj_format_local_lms.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +# Copyright Johns Hopkins University (Author: Daniel Povey) 2012 + +. ./path.sh + +[ ! -d data/lang_bd ] && echo "Expect data/local/lang_bd to exist" && exit 1; + +lm_srcdir_3g=data/local/local_lm/3gram-mincount +lm_srcdir_4g=data/local/local_lm/4gram-mincount + +[ ! 
-d "$lm_srcdir_3g" ] && echo "No such dir $lm_srcdir_3g" && exit 1; +[ ! -d "$lm_srcdir_4g" ] && echo "No such dir $lm_srcdir_4g" && exit 1; + +for d in data/lang_test_bd_{tg,tgpr,fg,fgpr}; do + rm -r $d 2>/dev/null + cp -r data/lang_bd $d +done + +lang=data/lang_bd + +# Be careful: this time we dispense with the grep -v ' ' so this might +# not work for LMs generated from all toolkits. +gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \ + arpa2fst - | fstprint | \ + utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ + --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ + fstrmepsilon > data/lang_test_bd_tgpr/G.fst || exit 1; + fstisstochastic data/lang_test_bd_tgpr/G.fst + +gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \ + arpa2fst - | fstprint | \ + utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ + --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ + fstrmepsilon > data/lang_test_bd_tg/G.fst || exit 1; + fstisstochastic data/lang_test_bd_tg/G.fst + +gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \ + arpa2fst - | fstprint | \ + utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ + --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ + fstrmepsilon > data/lang_test_bd_fg/G.fst || exit 1; + fstisstochastic data/lang_test_bd_fg/G.fst + +gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \ + arpa2fst - | fstprint | \ + utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ + --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ + fstrmepsilon > data/lang_test_bd_fgpr/G.fst || exit 1; + fstisstochastic data/lang_test_bd_fgpr/G.fst + +exit 0; diff --git a/egs/chime_wsj0/s5/local/wsj_prepare_dict.sh b/egs/chime_wsj0/s5/local/wsj_prepare_dict.sh new file mode 100755 index 000000000..82ba8ad94 --- /dev/null +++ b/egs/chime_wsj0/s5/local/wsj_prepare_dict.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# Call this script from one level above, e.g. from the s3/ directory. It puts +# its output in data/local/. + +# The parts of the output of this that will be needed are +# [in data/local/dict/ ] +# lexicon.txt +# extra_questions.txt +# nonsilence_phones.txt +# optional_silence.txt +# silence_phones.txt + +# run this from ../ +dir=data/local/dict +mkdir -p $dir + + +# (1) Get the CMU dictionary +svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \ + $dir/cmudict || exit 1; + +# can add -r 10966 for strict compatibility. + + +#(2) Dictionary preparation: + + +# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point). +# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones. + +# silence phones, one per line. 
+(echo SIL; echo SPN; echo NSN) > $dir/silence_phones.txt +echo SIL > $dir/optional_silence.txt + +# nonsilence phones; on each line is a list of phones that correspond +# really to the same base phone. +cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \ + perl -e 'while(<>){ + chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; + $phones_of{$1} .= "$_ "; } + foreach $list (values %phones_of) {print $list . "\n"; } ' \ + > $dir/nonsilence_phones.txt || exit 1; + +# A few extra questions that will be added to those obtained by automatically clustering +# the "real" phones. These ask about stress; there's also one for silence. +cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1; +cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) { + $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ + >> $dir/extra_questions.txt || exit 1; + +grep -v ';;;' $dir/cmudict/cmudict.0.7a | \ + perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \ + > $dir/lexicon1_raw_nosil.txt || exit 1; + +# Add to cmudict the silences, noises etc. + +(echo '!SIL SIL'; echo ' SPN'; echo ' SPN'; echo ' NSN'; ) | \ + cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1; + + +# lexicon.txt is without the _B, _E, _S, _I markers. +# This is the input to wsj_format_data.sh +cp $dir/lexicon2_raw.txt $dir/lexicon.txt + + +echo "Dictionary preparation succeeded" + diff --git a/egs/chime_wsj0/s5/local/wsj_train_lms.sh b/egs/chime_wsj0/s5/local/wsj_train_lms.sh new file mode 100755 index 000000000..060f387f2 --- /dev/null +++ b/egs/chime_wsj0/s5/local/wsj_train_lms.sh @@ -0,0 +1,202 @@ +#!/bin/bash + +# This script trains LMs on the WSJ LM-training data. +# It requires that you have already run wsj_extend_dict.sh, +# to get the larger-size dictionary including all of CMUdict +# plus any OOVs and possible acronyms that we could easily +# derive pronunciations for. + +# This script takes no command-line arguments + +dir=data/local/local_lm +srcdir=data/local/dict_larger +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/kaldi_lm:$PATH +( # First make sure the kaldi_lm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d kaldi_lm ]; then + echo Not installing the kaldi_lm toolkit since it is already there. + else + echo Downloading and installing the kaldi_lm tools + if [ ! -f kaldi_lm.tar.gz ]; then + wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1; + fi + tar -xvzf kaldi_lm.tar.gz || exit 1; + cd kaldi_lm + make || exit 1; + echo Done making the kaldi_lm tools + fi +) || exit 1; + + + +if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then + echo "Expecting files $srcdir/cleaned.gz and $srcdir/lexicon.txt to exist"; + echo "You need to run local/wsj_extend_dict.sh before running this script." + exit 1; +fi + +# Get a wordlist-- keep everything but silence, which should not appear in +# the LM. +awk '{print $1}' $srcdir/lexicon.txt | grep -v -w '!SIL' > $dir/wordlist.txt + +# Get training data with OOV words (w.r.t. our current vocab) replaced with . 
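# In this recipe the OOV token is <UNK>; the same token is also used when the
# word map is built below. A minimal sketch of the replacement idiom, with the
# redirection spelled out (wordlist.txt holds the in-vocabulary words):
#   awk -v w=wordlist.txt 'BEGIN{ while((getline line < w) > 0) v[line]=1; }
#        { for(i=1;i<=NF;i++) printf("%s ", ($i in v) ? $i : "<UNK>"); print ""; }' \
#     < cleaned.txt > train_nounk.txt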
+echo "Getting training data with OOV words replaced with (train_nounk.gz)" +gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.txt \ + 'BEGIN{while((getline0) v[$1]=1;} + {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}'|sed 's/ $//g' \ + | gzip -c > $dir/train_nounk.gz + +# Get unigram counts (without bos/eos, but this doens't matter here, it's +# only to get the word-map, which treats them specially & doesn't need their +# counts). +# Add a 1-count for each word in word-list by including that in the data, +# so all words appear. +gunzip -c $dir/train_nounk.gz | cat - $dir/wordlist.txt | \ + awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \ + sort -nr > $dir/unigram.counts + +# Get "mapped" words-- a character encoding of the words that makes the common words very short. +cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map + +gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} + { for(n=1;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz + +# To save disk space, remove the un-mapped training data. We could +# easily generate it again if needed. +rm $dir/train_nounk.gz + +train_lm.sh --arpa --lmtype 3gram-mincount $dir +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826 +# 7.8 million N-grams. + +prune_lm.sh --arpa 6.0 $dir/3gram-mincount/ +# 1.45 million N-grams. +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139 + +train_lm.sh --arpa --lmtype 4gram-mincount $dir +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180 +# 10.3 million N-grams. + +prune_lm.sh --arpa 7.0 $dir/4gram-mincount +# 1.50 million N-grams +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757 + + +exit 0 + +### Below here, this script is showing various commands that +## were run during LM tuning. + +train_lm.sh --arpa --lmtype 3gram-mincount $dir +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826 +# 7.8 million N-grams. + +prune_lm.sh --arpa 3.0 $dir/3gram-mincount/ +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 156.408740 +# 2.5 million N-grams. + +prune_lm.sh --arpa 6.0 $dir/3gram-mincount/ +# 1.45 million N-grams. +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139 + +train_lm.sh --arpa --lmtype 4gram-mincount $dir +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180 +# 10.3 million N-grams. + +prune_lm.sh --arpa 3.0 $dir/4gram-mincount +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 143.206294 +# 2.6 million N-grams. + +prune_lm.sh --arpa 4.0 $dir/4gram-mincount +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 146.927717 +# 2.15 million N-grams. + +prune_lm.sh --arpa 5.0 $dir/4gram-mincount +# 1.86 million N-grams +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 150.162023 + +prune_lm.sh --arpa 7.0 $dir/4gram-mincount +# 1.50 million N-grams +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757 + +train_lm.sh --arpa --lmtype 3gram $dir +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 135.692866 +# 20.0 million N-grams + +! which ngram-count \ + && echo "SRILM tools not installed so not doing the comparison" && exit 1; + +################# +# You could finish the script here if you wanted. +# Below is to show how to do baselines with SRILM. 
+# You'd have to install the SRILM toolkit first. + +heldout_sent=10000 # Don't change this if you want result to be comparable with + # kaldi_lm results +sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities. +mkdir -p $sdir +gunzip -c $srcdir/cleaned.gz | head -$heldout_sent > $sdir/cleaned.heldout +gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent > $sdir/cleaned.train +(echo ""; echo "" ) | cat - $dir/wordlist.txt > $sdir/wordlist.final.s + +# 3-gram: +ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \ + -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz +ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/cleaned.heldout # consider -debug 2 +#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs +#0 zeroprobs, logprob= -491456 ppl= 141.457 ppl1= 177.437 + +# Trying 4-gram: +ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \ + -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o4g.kn.gz +ngram -order 4 -lm $sdir/srilm.o4g.kn.gz -ppl $sdir/cleaned.heldout +#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs +#0 zeroprobs, logprob= -480939 ppl= 127.233 ppl1= 158.822 + +#3-gram with pruning: +ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \ + -prune 0.0000001 -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.pr7.kn.gz +ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout +#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs +#0 zeroprobs, logprob= -510828 ppl= 171.947 ppl1= 217.616 +# Around 2.25M N-grams. +# Note: this is closest to the experiment done with "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/" +# above, which gave 2.5 million N-grams and a perplexity of 156. + +# Note: all SRILM experiments above fully discount all singleton 3 and 4-grams. +# You can use -gt3min=0 and -gt4min=0 to stop this (this will be comparable to +# the kaldi_lm experiments above without "-mincount". + +## From here is how to train with +# IRSTLM. This is not really working at the moment. +export IRSTLM=$KALDI_ROOT/tools/irstlm/ + +idir=$dir/irstlm +mkdir $idir +gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | $IRSTLM/scripts/add-start-end.sh | \ + gzip -c > $idir/train.gz + +$IRSTLM/bin/dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no + cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}FNR>1{if ($1 in v)\ +{print $0;}}' > vocab.irstlm.20k + + +$IRSTLM/bin/build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz -p yes \ + -n 3 -s improved-kneser-ney -b yes +# Testing perplexity with SRILM tools: +ngram -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout +#data/local/local_lm/irstlm/lm_3gram.gz: line 162049: warning: non-zero probability for in closed-vocabulary LM +#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 0 OOVs +#0 zeroprobs, logprob= -513670 ppl= 175.041 ppl1= 221.599 + +# Perplexity is very bad (should be ~141, since we used -p option, +# not 175), +# but adding -debug 3 to the command line shows that +# the IRSTLM LM does not seem to sum to one properly, so it seems that +# it produces an LM that isn't interpretable in the normal way as an ARPA +# LM. 
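# For reference, the perplexities SRILM reports can be reproduced (up to
# rounding) from the logprob line as ppl = 10^(-logprob / (words - OOVs +
# sentences)). Using the 3-gram numbers above (logprob = -491456, 218996
# words, 478 OOVs, 10000 sentences):
#   awk 'BEGIN{ n = 218996 - 478 + 10000;           # 228518 scored tokens
#               print exp(491456 / n * log(10)) }'  # ~141.5, matching ppl= 141.457
# which is also why the kaldi_lm results above are quoted "over 228518 words".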
+ + + diff --git a/egs/chime_wsj0/s5/local/wsj_train_rnnlms.sh b/egs/chime_wsj0/s5/local/wsj_train_rnnlms.sh new file mode 100755 index 000000000..c0d1afaf6 --- /dev/null +++ b/egs/chime_wsj0/s5/local/wsj_train_rnnlms.sh @@ -0,0 +1,153 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson + +# This script trains LMs on the WSJ LM-training data. +# It requires that you have already run wsj_extend_dict.sh, +# to get the larger-size dictionary including all of CMUdict +# plus any OOVs and possible acronyms that we could easily +# derive pronunciations for. + +# This script takes no command-line arguments but takes the --cmd option. + +# Begin configuration section. +rand_seed=0 +cmd=run.pl +nwords=10000 # This is how many words we're putting in the vocab of the RNNLM. +hidden=30 +class=200 # Num-classes... should be somewhat larger than sqrt of nwords. +direct=1000 # Probably number of megabytes to allocate for hash-table for "direct" connections. +rnnlm_ver=rnnlm-0.3e # version of RNNLM to use +# End configuration section. + +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh + +if [ $# != 1 ]; then + echo "Usage: local/wsj_train_rnnlms.sh [options] " + echo "For options, see top of script file" + exit 1; +fi + +dir=$1 +srcdir=data/local/dict_larger +mkdir -p $dir + +export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH + + +( # First make sure the kaldi_lm toolkit is installed. + # Note: this didn't work out of the box for me, I had to + # change the g++ version to just "g++" (no cross-compilation + # needed for me as I ran on a machine that had been setup + # as 64 bit by default. + cd $KALDI_ROOT/tools || exit 1; + if [ -d $rnnlm_ver ]; then + echo Not installing the rnnlm toolkit since it is already there. + else + echo Downloading and installing the rnnlm tools + # http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz + if [ ! -f $rnnlm_ver.tgz ]; then + wget http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz || exit 1; + fi + mkdir $rnnlm_ver + cd $rnnlm_ver + tar -xvzf ../$rnnlm_ver.tgz || exit 1; + make CC=g++ || exit 1; + echo Done making the rnnlm tools + fi +) || exit 1; + + +if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then + echo "Expecting files $srcdir/cleaned.gz and $srcdir/wordlist.final to exist"; + echo "You need to run local/wsj_extend_dict.sh before running this script." + exit 1; +fi + +cat $srcdir/lexicon.txt | awk '{print $1}' | grep -v -w '!SIL' > $dir/wordlist.all + +# Get training data with OOV words (w.r.t. our current vocab) replaced with . +echo "Getting training data with OOV words replaced with (train_nounk.gz)" +gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.all \ + 'BEGIN{while((getline0) v[$1]=1;} + {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}'|sed 's/ $//g' \ + | gzip -c > $dir/all.gz + +echo "Splitting data into train and validation sets." +heldout_sent=10000 +gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data +gunzip -c $dir/all.gz | tail -n +$heldout_sent | \ + perl -e ' use List::Util qw(shuffle); @A=<>; print join("", shuffle(@A)); ' \ + > $dir/train.in # training data + + + # The rest will consist of a word-class represented by , that + # maps (with probabilities) to a whole class of words. + +# Get unigram counts from our training data, and use this to select word-list +# for RNNLM training; e.g. 10k most frequent words. 
Rest will go in a class +# that we (manually, at the shell level) assign probabilities for words that +# are in that class. Note: this word-list doesn't need to include ; this +# automatically gets added inside the rnnlm program. +# Note: by concatenating with $dir/wordlist.all, we are doing add-one +# smoothing of the counts. + +cat $dir/train.in $dir/wordlist.all | grep -v '' | grep -v '' | \ + awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \ + sort -nr > $dir/unigram.counts + +head -$nwords $dir/unigram.counts | awk '{print $2}' > $dir/wordlist.rnn + +tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts + +tot=`awk '{x=x+$1} END{print x}' $dir/unk_class.counts` +awk -v tot=$tot '{print $2, ($1*1.0/tot);}' <$dir/unk_class.counts >$dir/unk.probs + + +for type in train valid; do + cat $dir/$type.in | awk -v w=$dir/wordlist.rnn \ + 'BEGIN{while((getline0) v[$1]=1;} + {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}'|sed 's/ $//g' \ + > $dir/$type +done +rm $dir/train.in # no longer needed-- and big. + +# Now randomize the order of the training data. +cat $dir/train | awk -v rand_seed=$rand_seed 'BEGIN{srand(rand_seed);} {printf("%f\t%s\n", rand(), $0);}' | \ + sort | cut -f 2 > $dir/foo +mv $dir/foo $dir/train + +# OK we'll train the RNNLM on this data. + +# todo: change 100 to 320. +# using 100 classes as square root of 10k. +echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)" +#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/100.rnnlm \ +# -hidden 100 -rand-seed 1 -debug 2 -class 100 -bptt 2 -bptt-block 20 \ +# -direct-order 4 -direct 1000 -binary >& $dir/rnnlm1.log & + +$cmd $dir/rnnlm.log \ + $KALDI_ROOT/tools/$rnnlm_ver/rnnlm -independent -train $dir/train -valid $dir/valid \ + -rnnlm $dir/rnnlm -hidden $hidden -rand-seed 1 -debug 2 -class $class -bptt 2 -bptt-block 20 \ + -direct-order 4 -direct $direct -binary || exit 1; + + +# make it like a Kaldi table format, with fake utterance-ids. +cat $dir/valid.in | awk '{ printf("uttid-%d ", NR); print; }' > $dir/valid.with_ids + +utils/rnnlm_compute_scores.sh $dir $dir/tmp.valid $dir/valid.with_ids \ + $dir/valid.scores +nw=`wc -w < $dir/valid.with_ids` # Note: valid.with_ids includes utterance-ids which + # is one per word, to account for the at the end of each sentence; this is the + # correct number to normalize buy. +p=`awk -v nw=$nw '{x=x+$2} END{print exp(x/nw);}' <$dir/valid.scores` +echo Perplexity is $p | tee $dir/perplexity.log + +rm $dir/train $dir/all.gz + +# This is a better setup, but takes a long time to train: +#echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)" +#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/320.rnnlm \ +# -hidden 320 -rand-seed 1 -debug 2 -class 300 -bptt 2 -bptt-block 20 \ +# -direct-order 4 -direct 2000 -binary diff --git a/egs/chime_wsj0/s5/path.sh b/egs/chime_wsj0/s5/path.sh new file mode 100755 index 000000000..11fb0b17d --- /dev/null +++ b/egs/chime_wsj0/s5/path.sh @@ -0,0 +1,3 @@ +export KALDI_ROOT=`pwd`/../../.. 
+export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet-cpubin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH +export LC_ALL=C diff --git a/egs/chime_wsj0/s5/run.sh b/egs/chime_wsj0/s5/run.sh new file mode 100755 index 000000000..0c3d5c906 --- /dev/null +++ b/egs/chime_wsj0/s5/run.sh @@ -0,0 +1,261 @@ +#!/bin/bash + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. + +case 0 in #goto here + 1) +;; #here: +esac + +#exit 1; +#need wsj0 for the clean version and LMs +wsj0=/mnt/spdb/wall_street_journal +local/clean_wsj0_data_prep.sh $wsj0 + +reverb=/mnt/spdb/CHiME/chime2-wsj0/reverberated +local/reverb_wsj0_data_prep.sh $reverb + +noisy=/mnt/spdb/CHiME/chime2-wsj0/isolated +local/noisy_wsj0_data_prep.sh $noisy + +local/wsj_prepare_dict.sh || exit 1; + +utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang || exit 1; + +local/chime_format_data.sh || exit 1; + +# Now make MFCC features. +# mfccdir should be some place with a largish disk where you +# want to store MFCC features. + +mfccdir=mfcc +for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do + steps/make_mfcc.sh --nj 10 \ + data/$x exp/make_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; +done + +# Note: the --boost-silence option should probably be omitted by default +# for normal setups. It doesn't always help. [it's to discourage non-silence +# models from modeling silence.] 
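# "Boosting" silence means scaling the silence-model likelihoods before
# alignment, so that non-silence models are less tempted to absorb silence
# frames. Inside the train/align scripts this is done roughly as in the sketch
# below (1.25 is the value passed to train_mono.sh further down):
#   gmm-boost-silence --boost=1.25 `cat data/lang/phones/optional_silence.csl` \
#     exp/mono0a/final.mdl - | ...   # boosted model is piped into gmm-align-compiled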
+mfccdir=mfcc +for x in test_eval92_5k_noisy dev_dt_05_noisy train_si84_noisy; do + steps/make_mfcc.sh --nj 10 \ + data/$x exp/make_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; +done + +mfccdir=mfcc +for x in dev_dt_05_reverb train_si84_reverb; do + steps/make_mfcc.sh --nj 10 \ + data/$x exp/make_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; +done + +#begin train gmm systems using multi condition data +#train_si84 = clean+reverb+noisy, +for s in train_si84 ; do + mkdir -p data/$s + cp data/${s}_clean/spk2gender data/$s/ + for x in text wav.scp; do + cat data/${s}_clean/$x data/${s}_reverb/$x data/${s}_noisy/$x | sort -k1 > data/$s/$x + done + cat data/$s/wav.scp | awk '{print $1}' | perl -ane 'chop; m:^...:; print "$_ $&\n";' > data/$s/utt2spk + cat data/$s/utt2spk | utils/utt2spk_to_spk2utt.pl > data/$s/spk2utt +done + +mfccdir=mfcc +for x in train_si84; do + steps/make_mfcc.sh --nj 10 \ + data/$x exp/make_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; +done + +steps/train_mono.sh --boost-silence 1.25 --nj 10 \ + data/train_si84 data/lang exp/mono0a || exit 1; + + + +utils/mkgraph.sh --mono data/lang_test_tgpr_5k exp/mono0a exp/mono0a/graph_tgpr_5k +#steps/decode.sh --nj 8 \ +# exp/mono0a/graph_tgpr_5k data/test_eval92_5k_clean exp/mono0a/decode_tgpr_eval92_5k_clean +steps/decode.sh --nj 8 \ + exp/mono0a/graph_tgpr_5k data/test_eval92_5k_noisy exp/mono0a/decode_tgpr_eval92_5k_noisy + + +steps/align_si.sh --boost-silence 1.25 --nj 10 \ + data/train_si84 data/lang exp/mono0a exp/mono0a_ali || exit 1; + +steps/train_deltas.sh --boost-silence 1.25 \ + 2000 10000 data/train_si84 data/lang exp/mono0a_ali exp/tri1 || exit 1; + +while [ ! -f data/lang_test_tgpr/tmp/LG.fst ] || \ + [ -z data/lang_test_tgpr/tmp/LG.fst ]; do + sleep 20; +done +sleep 30; +# or the mono mkgraph.sh might be writing +# data/lang_test_tgpr/tmp/LG.fst which will cause this to fail. + +utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri1 exp/tri1/graph_tgpr_5k || exit 1; + +#steps/decode.sh --nj 8 \ +# exp/tri1/graph_tgpr data/test_eval92_5k_clean exp/tri1/decode_tgpr_eval92_5k_clean || exit 1; +steps/decode.sh --nj 8 \ + exp/tri1/graph_tgpr_5k data/test_eval92_5k_noisy exp/tri1/decode_tgpr_eval92_5k_noisy || exit 1; + + +# test various modes of LM rescoring (4 is the default one). +# This is just confirming they're equivalent. +#for mode in 1 2 3 4; do +#steps/lmrescore.sh --mode $mode --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ +# data/test_dev93 exp/tri1/decode_tgpr_dev93 exp/tri1/decode_tgpr_dev93_tg$mode || exit 1; +#done + +# demonstrate how to get lattices that are "word-aligned" (arcs coincide with +# words, with boundaries in the right place). +#sil_label=`grep '!SIL' data/lang_test_tgpr/words.txt | awk '{print $2}'` +#steps/word_align_lattices.sh --cmd "$train_cmd" --silence-label $sil_label \ +# data/lang_test_tgpr exp/tri1/decode_tgpr_dev93 exp/tri1/decode_tgpr_dev93_aligned || exit 1; + +steps/align_si.sh --nj 10 \ + data/train_si84 data/lang exp/tri1 exp/tri1_ali_si84 || exit 1; + +# Train tri2a, which is deltas + delta-deltas, on si84 data. 
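# "Deltas + delta-deltas" means the base MFCCs are extended with their first
# and second time derivatives (13 -> 39 dimensions with the default config);
# inside train_deltas.sh the per-job feature pipeline is roughly (sketch):
#   apply-cmvn --utt2spk=ark:utt2spk scp:cmvn.scp scp:feats.scp ark:- | \
#     add-deltas ark:- ark:-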
+steps/train_deltas.sh \ + 2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2a || exit 1; + +utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri2a exp/tri2a/graph_tgpr_5k || exit 1; + +#steps/decode.sh --nj 8 \ +# exp/tri2a/graph_tgpr_5k data/test_eval92_5k_clean exp/tri2a/decode_tgpr_eval92_5k_clean || exit 1; +steps/decode.sh --nj 8 \ + exp/tri2a/graph_tgpr_5k data/test_eval92_5k_noisy exp/tri2a/decode_tgpr_eval92_5k_noisy|| exit 1; + +#utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg5k +#steps/decode.sh --nj 8 \ +# exp/tri2a/graph_bg5k data/test_eval92_5k_clean exp/tri2a/decode_bg_eval92_5k_clean || exit 1; + +steps/train_lda_mllt.sh \ + --splice-opts "--left-context=3 --right-context=3" \ + 2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2b || exit 1; + +utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri2b exp/tri2b/graph_tgpr_5k || exit 1; +steps/decode.sh --nj 8 \ + exp/tri2b/graph_tgpr_5k data/test_eval92_5k_noisy exp/tri2b/decode_tgpr_eval92_5k_noisy || exit 1; +#steps/decode.sh --nj 8 \ +# exp/tri2b/graph_tgpr data/test_eval92_clean exp/tri2b/decode_tgpr_eval92_clean || exit 1; + + +# Align tri2b system with si84 data. +steps/align_si.sh --nj 10 \ + --use-graphs true data/train_si84 data/lang exp/tri2b exp/tri2b_ali_si84 || exit 1; + + +# From 2b system, train 3b which is LDA + MLLT + SAT. +steps/train_sat.sh \ + 2500 15000 data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri3b || exit 1; +utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri3b exp/tri3b/graph_tgpr_5k || exit 1; +steps/decode_fmllr.sh --nj 8 \ + exp/tri3b/graph_tgpr_5k data/test_eval92_5k_noisy exp/tri3b/decode_tgpr_eval92_5k_noisy || exit 1; + + +# From 3b multi-condition system, align noisy si84 data. +steps/align_fmllr.sh --nj 10 \ + data/train_si84_noisy data/lang exp/tri3b exp/tri3b_ali_si84_noisy || exit 1; + +steps/align_fmllr.sh --nj 10 \ + data/dev_dt_05_noisy data/lang exp/tri3b exp/tri3b_ali_dev_dt_05 || exit 1; + +#begin training DNN-HMM system +#only on noisy si84 + +. 
./path.sh +#RBM pretraining +dir=exp/tri4a_dnn_pretrain +$cuda_cmd $dir/_pretrain_dbn.log \ + steps/pretrain_dbn.sh --use-gpu-id 0 --nn-depth 7 --rbm-iter 3 data-fbank/train_si84_noisy $dir +#BP +dir=exp/tri4a_dnn +ali=exp/tri3b_ali_si84_noisy +ali_dev=exp/tri3b_ali_dev_dt_05 +feature_transform=exp/tri4a_dnn_pretrain/final.feature_transform +dbn=exp/tri4a_dnn_pretrain/7.dbn +$cuda_cmd $dir/_train_nnet.log \ + steps/train_nnet.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 --use-gpu-id 0 \ + data-fbank/train_si84_noisy data-fbank/dev_dt_05_noisy data/lang $ali $ali_dev $dir || exit 1; + +utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri4a_dnn exp/tri4a_dnn/graph_tgpr_5k || exit 1; +steps/decode_nnet.sh --nj 8 --acwt 0.10 --config conf/decode_dnn.config \ + exp/tri4a_dnn/graph_tgpr_5k data-fbank/test_eval92_5k_noisy $dir/decode_tgpr_5k_eval92_5k_noisy || exit 1; + +#Retrain system using new ali, +#this is essential +#repeat this process for 3 times +srcdir=exp/tri4a_dnn +steps/align_nnet.sh --nj 10 \ + data-fbank/train_si84_noisy data/lang $srcdir ${srcdir}_ali_si84_noisy || exit 1; +steps/align_nnet.sh --nj 10 \ + data-fbank/dev_dt_05_noisy data/lang $srcdir ${srcdir}_ali_dt_05_noisy || exit 1; + +#no need to do pretraining again +dir=exp/tri5a_dnn +ali=exp/tri4a_dnn_ali_si84_noisy +ali_dev=exp/tri4a_dnn_ali_dt_05_noisy +feature_transform=exp/tri4a_dnn_pretrain/final.feature_transform +dbn=exp/tri4a_dnn_pretrain/7.dbn +$cuda_cmd $dir/_train_nnet.log \ + steps/train_nnet.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 --use-gpu-id 0 \ + data-fbank/train_si84_noisy data-fbank/dev_dt_05_noisy data/lang $ali $ali_dev $dir || exit 1; + +utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri5a_dnn exp/tri5a_dnn/graph_tgpr_5k || exit 1; +steps/decode_nnet.sh --nj 8 --acwt 0.10 --config conf/decode_dnn.config \ + exp/tri5a_dnn/graph_tgpr_5k data-fbank/test_eval92_5k_noisy $dir/decode_tgpr_5k_eval92_5k_noisy || exit 1; + + +srcdir=exp/tri5a_dnn +steps/align_nnet.sh --nj 10 \ + data-fbank/train_si84_noisy data/lang $srcdir ${srcdir}_ali_si84_noisy || exit 1; +steps/align_nnet.sh --nj 10 \ + data-fbank/dev_dt_05_noisy data/lang $srcdir ${srcdir}_ali_dt_05_noisy || exit 1; + +. ./path.sh +dir=exp/tri6a_dnn +ali=exp/tri5a_dnn_ali_si84_noisy +ali_dev=exp/tri5a_dnn_ali_dt_05_noisy +feature_transform=exp/tri4a_dnn_pretrain/final.feature_transform +dbn=exp/tri4a_dnn_pretrain/7.dbn +$cuda_cmd $dir/_train_nnet.log \ + steps/train_nnet.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 --use-gpu-id 0 \ + data-fbank/train_si84_noisy data-fbank/dev_dt_05_noisy data/lang $ali $ali_dev $dir || exit 1; + +utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri6a_dnn exp/tri6a_dnn/graph_tgpr_5k || exit 1; +steps/decode_nnet.sh --nj 8 --acwt 0.10 --config conf/decode_dnn.config \ + exp/tri6a_dnn/graph_tgpr_5k data-fbank/test_eval92_5k_noisy $dir/decode_tgpr_5k_eval92_5k_noisy || exit 1; + +srcdir=exp/tri6a_dnn +steps/align_nnet.sh --nj 10 \ + data-fbank/train_si84_noisy data/lang $srcdir ${srcdir}_ali_si84_noisy || exit 1; +steps/align_nnet.sh --nj 10 \ + data-fbank/dev_dt_05_noisy data/lang $srcdir ${srcdir}_ali_dt_05_noisy || exit 1; + +. 
./path.sh +dir=exp/tri7a_dnn +ali=exp/tri6a_dnn_ali_si84_noisy +ali_dev=exp/tri6a_dnn_ali_dt_05_noisy +feature_transform=exp/tri4a_dnn_pretrain/final.feature_transform +dbn=exp/tri4a_dnn_pretrain/7.dbn +$cuda_cmd $dir/_train_nnet.log \ + steps/train_nnet.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 --use-gpu-id 0 \ + data-fbank/train_si84_noisy data-fbank/dev_dt_05_noisy data/lang $ali $ali_dev $dir || exit 1; + +utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri7a_dnn exp/tri7a_dnn/graph_tgpr_5k || exit 1; +steps/decode_nnet.sh --nj 8 --acwt 0.10 --config conf/decode_dnn.config \ + exp/tri7a_dnn/graph_tgpr_5k data-fbank/test_eval92_5k_noisy $dir/decode_tgpr_5k_eval92_5k_noisy || exit 1; + + + diff --git a/egs/chime_wsj0/s5/steps/align_basis_fmllr.sh b/egs/chime_wsj0/s5/steps/align_basis_fmllr.sh new file mode 100755 index 000000000..54f35b36a --- /dev/null +++ b/egs/chime_wsj0/s5/steps/align_basis_fmllr.sh @@ -0,0 +1,150 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Copyright 2013 GoVivace Inc (Author: Nagendra Goel) +# Apache 2.0 + +# Computes training alignments; assumes features are (LDA+MLLT or delta+delta-delta) +# + fMLLR (probably with SAT models). +# It first computes an alignment with the final.alimdl (or the final.mdl if final.alimdl +# is not present), then does 2 iterations of fMLLR estimation. + +# If you supply the --use-graphs option, it will use the training +# graphs from the source directory (where the model is). In this +# case the number of jobs must match the source directory. + + +# Begin configuration section. +stage=0 +nj=4 +cmd=run.pl +use_graphs=false +# Begin configuration. +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +beam=10 +retry_beam=40 +boost_silence=1.5 # factor by which to boost silence during alignment. +fmllr_update_type=full +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: steps/align_fmllr.sh " + echo "e.g.: steps/align_fmllr.sh data/train data/lang exp/tri1 exp/tri1_ali" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --use-graphs true # use graphs in src-dir" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --fmllr-update-type (full|diag|offset|none) # default full." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 +graphdir=$dir + +oov=`cat $lang/oov.int` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` || exit 1; +sdata=$data/split$nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; +cp $srcdir/final.occs $dir; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. 
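# splice_opts holds the frame-splicing context the source model was trained
# with; in this recipe it is "--left-context=3 --right-context=3", i.e. a
# 7-frame window stacked before applying the LDA+MLLT matrix final.mat. The
# lda branch of the case statement below uses it as (sketch with the options
# expanded):
#   splice-feats --left-context=3 --right-context=3 ark:- ark:- | \
#     transform-feats final.mat ark:- ark:-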
+ + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +## Set up model and alignment model. +mdl=$srcdir/final.mdl +if [ -f $srcdir/final.alimdl ]; then + alimdl=$srcdir/final.alimdl +else + alimdl=$srcdir/final.mdl +fi +[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1; +alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/boost_phones.csl` $alimdl - |" +mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |" + +## Work out where we're getting the graphs from. +if $use_graphs; then + [ "$nj" != "`cat $srcdir/num_jobs`" ] && \ + echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1; + [ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1; + graphdir=$srcdir +else + graphdir=$dir + if [ $stage -le 0 ]; then + echo "$0: compiling training graphs" + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; + fi +fi + + +if [ $stage -le 1 ]; then + echo "$0: aligning data in $data using $alimdl and speaker-independent features." + $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \ + "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0: computing fMLLR transforms" + if [ "$alimdl" != "$mdl" ]; then + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \ + gmm-est-basis-fmllr-gpost --fmllr-min-count=22 --num-iters=10 \ + --size-scale=0.2 --step-size-iters=3 \ + --write-weights=ark:$dir/pre_wgt.JOB \ + $mdl $srcdir/fmllr.basis "$sifeats" ark,s,cs:- \ + ark:$dir/trans.JOB || exit 1; +# else +# $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ +# ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ +# weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ +# gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \ +# --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \ +# ark,s,cs:- ark:$dir/trans.JOB || exit 1; + fi +fi + +feats="$sifeats transform-feats ark:$dir/pre_trans.JOB ark:- ark:- |" + +if [ $stage -le 3 ]; then + echo "$0: doing final alignment." + $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl_cmd" \ + "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +#rm $dir/pre_ali.*.gz + +echo "$0: done aligning data." 
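# A quick sanity check on the output (sketch; the paths are illustrative):
#   gunzip -c exp/tri3b_ali/ali.*.gz | copy-int-vector ark:- ark,t:- | wc -l
#   wc -l < data/train_si84/feats.scp    # the two counts should match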
+ +utils/summarize_warnings.pl $dir/log + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/align_fmllr.sh b/egs/chime_wsj0/s5/steps/align_fmllr.sh new file mode 100755 index 000000000..3052eb409 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/align_fmllr.sh @@ -0,0 +1,148 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Computes training alignments; assumes features are (LDA+MLLT or delta+delta-delta) +# + fMLLR (probably with SAT models). +# It first computes an alignment with the final.alimdl (or the final.mdl if final.alimdl +# is not present), then does 2 iterations of fMLLR estimation. + +# If you supply the --use-graphs option, it will use the training +# graphs from the source directory (where the model is). In this +# case the number of jobs must match the source directory. + + +# Begin configuration section. +stage=0 +nj=4 +cmd=run.pl +use_graphs=false +# Begin configuration. +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +beam=10 +retry_beam=40 +boost_silence=1.0 # factor by which to boost silence during alignment. +fmllr_update_type=full +norm_vars=false +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: steps/align_fmllr.sh " + echo "e.g.: steps/align_fmllr.sh data/train data/lang exp/tri1 exp/tri1_ali" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --use-graphs true # use graphs in src-dir" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --fmllr-update-type (full|diag|offset|none) # default full." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` || exit 1; +sdata=$data/split$nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; +cp $srcdir/final.occs $dir; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. + + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $srcdir/full.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +## Set up model and alignment model. +mdl=$srcdir/final.mdl +if [ -f $srcdir/final.alimdl ]; then + alimdl=$srcdir/final.alimdl +else + alimdl=$srcdir/final.mdl +fi +[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1; +alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |" +mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |" + + +## Work out where we're getting the graphs from. 
+if $use_graphs; then + [ "$nj" != "`cat $srcdir/num_jobs`" ] && \ + echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1; + [ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1; + graphdir=$srcdir +else + graphdir=$dir + if [ $stage -le 0 ]; then + echo "$0: compiling training graphs" + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; + fi +fi + + +if [ $stage -le 1 ]; then + echo "$0: aligning data in $data using $alimdl and speaker-independent features." + $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \ + "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0: computing fMLLR transforms" + if [ "$alimdl" != "$mdl" ]; then + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \ + gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \ + ark,s,cs:- ark:$dir/trans.JOB || exit 1; + else + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \ + ark,s,cs:- ark:$dir/trans.JOB || exit 1; + fi +fi + +feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" + +if [ $stage -le 3 ]; then + echo "$0: doing final alignment." + $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl_cmd" \ + "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +rm $dir/pre_ali.*.gz + +echo "$0: done aligning data." + +utils/summarize_warnings.pl $dir/log + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/align_nnet.sh b/egs/chime_wsj0/s5/steps/align_nnet.sh new file mode 100755 index 000000000..fe70416e6 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/align_nnet.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# Copyright 2012 Brno University of Technology (Author: Karel Vesely) +# Apache 2.0 + +# Computes training alignments using MLP model + +# If you supply the "--use-graphs true" option, it will use the training +# graphs from the source directory (where the model is). In this +# case the number of jobs must match with the source directory. + + +# Begin configuration section. +nj=4 +cmd=run.pl +# Begin configuration. +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +beam=10 +retry_beam=40 +use_gpu_id=-1 # disable gpu +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: $0 " + echo "e.g.: $0 data/train data/lang exp/tri1 exp/tri1_ali" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 
+ exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +mkdir -p $dir/log +echo $nj > $dir/num_jobs +sdata=$data/split$nj +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; + +#Get the files we will need +nnet=$srcdir/final.nnet; +[ -z "$nnet" ] && echo "Error nnet '$nnet' does not exist!" && exit 1; + +class_frame_counts=$srcdir/ali_train_pdf.counts +[ -z "$class_frame_counts" ] && echo "Error class_frame_counts '$class_frame_counts' does not exist!" && exit 1; + +feature_transform=$srcdir/final.feature_transform +if [ ! -f $feature_transform ]; then + echo "Missing feature_transform '$feature_transform'" + exit 1 +fi + +model=$dir/final.mdl +[ -z "$model" ] && echo "Error transition model '$model' does not exist!" && exit 1; + +### +### Prepare feature pipeline (same as for decoding) +### +# Create the feature stream: +feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |" +# Optionally add cmvn +if [ -f $srcdir/norm_vars ]; then + norm_vars=$(cat $srcdir/norm_vars 2>/dev/null) + [ ! -f $sdata/1/cmvn.scp ] && echo "$0: cannot find cmvn stats $sdata/1/cmvn.scp" && exit 1 + feats="$feats apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |" +fi +# Optionally add deltas +if [ -f $srcdir/delta_order ]; then + delta_order=$(cat $srcdir/delta_order) + feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |" +fi + +# Finally add feature_transform and the MLP +feats="$feats nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet ark:- ark:- |" +### +### +### + +echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir" + +tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; +# We could just use gmm-align-mapped in the next line, but it's less efficient as it compiles the +# training graphs one by one. +$cmd JOB=1:$nj $dir/log/align.JOB.log \ + compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" ark:- \| \ + align-compiled-mapped $scale_opts --beam=$beam --retry-beam=$retry_beam $dir/final.mdl ark:- \ + "$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" || exit 1; + +echo "$0: done aligning data." diff --git a/egs/chime_wsj0/s5/steps/align_raw_fmllr.sh b/egs/chime_wsj0/s5/steps/align_raw_fmllr.sh new file mode 100755 index 000000000..4e33a8a14 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/align_raw_fmllr.sh @@ -0,0 +1,142 @@ +#!/bin/bash +# Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Computes training alignments; assumes features are (LDA+MLLT or delta+delta-delta) +# + fMLLR (probably with SAT models). +# It first computes an alignment with the final.alimdl (or the final.mdl if final.alimdl +# is not present), then does 2 iterations of fMLLR estimation. + +# If you supply the --use-graphs option, it will use the training +# graphs from the source directory (where the model is). In this +# case the number of jobs must match the source directory. + + +# Begin configuration section. +stage=0 +nj=4 +cmd=run.pl +use_graphs=false +# Begin configuration. +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +beam=10 +retry_beam=40 +boost_silence=1.0 # factor by which to boost silence during alignment. +norm_vars=false +# End configuration options. 
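+# (Example invocation, for illustration only -- the directory names are assumptions:
+#    steps/align_raw_fmllr.sh --nj 8 --cmd run.pl \
+#      data/train_si84 data/lang exp/tri3b exp/tri3b_ali_si84
+#  Any variable in the configuration block above can be overridden on the command
+#  line, e.g. "--norm-vars true", since parse_options.sh maps --foo-bar to foo_bar.)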
+ +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: steps/align_fmllr.sh " + echo "e.g.: steps/align_fmllr.sh data/train data/lang exp/tri1 exp/tri1_ali" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --use-graphs true # use graphs in src-dir" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` || exit 1; +sdata=$data/split$nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; +cp $srcdir/final.occs $dir; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. + +if [[ ! -f $srcdir/final.mat || ! -f $srcdir/full.mat ]]; then + echo "$0: we require final.mat and full.mat in the source directory $srcdir" +fi + +full_lda_mat="get-full-lda-mat --print-args=false $srcdir/final.mat $srcdir/full.mat -|" +cp $srcdir/full.mat $srcdir/final.mat $dir + +splicedfeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |" +sifeats="$splicedfeats transform-feats $srcdir/final.mat ark:- ark:- |" + +## Set up model and alignment model. +mdl=$srcdir/final.mdl +if [ -f $srcdir/final.alimdl ]; then + alimdl=$srcdir/final.alimdl +else + alimdl=$srcdir/final.mdl +fi +[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1; +alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |" +mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |" + + +## Work out where we're getting the graphs from. +if $use_graphs; then + [ "$nj" != "`cat $srcdir/num_jobs`" ] && \ + echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1; + [ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1; + graphdir=$srcdir +else + graphdir=$dir + if [ $stage -le 0 ]; then + echo "$0: compiling training graphs" + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; + fi +fi + + +if [ $stage -le 1 ]; then + echo "$0: aligning data in $data using $alimdl and speaker-independent features." 
+ $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \ + "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0: computing fMLLR transforms" + if [ "$alimdl" != "$mdl" ]; then + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \ + gmm-est-fmllr-raw-gpost --spk2utt=ark:$sdata/JOB/spk2utt \ + $mdl "$full_lda_mat" "$splicedfeats" ark,s,cs:- ark:$dir/raw_trans.JOB || exit 1; + else + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-est-fmllr-raw --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$full_lda_mat" \ + "$splicedfeats" ark,s,cs:- ark:$dir/raw_trans.JOB || exit 1; + fi +fi + +feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + +if [ $stage -le 3 ]; then + echo "$0: doing final alignment." + $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl_cmd" \ + "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +rm $dir/pre_ali.*.gz + +echo "$0: done aligning data." + +utils/summarize_warnings.pl $dir/log + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/align_sgmm.sh b/egs/chime_wsj0/s5/steps/align_sgmm.sh new file mode 100755 index 000000000..833afa539 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/align_sgmm.sh @@ -0,0 +1,193 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Computes training alignments and (if needed) speaker-vectors, given an +# SGMM system. If the system is built on top of SAT, you should supply +# transforms with the --transform-dir option. + +# If you supply the --use-graphs option, it will use the training +# graphs from the source directory. + +# Begin configuration section. +stage=0 +nj=4 +cmd=run.pl +use_graphs=false # use graphs from srcdir +use_gselect=false # use gselect info from srcdir [regardless, we use + # Gaussian-selection info, we might have to compute it though.] +gselect=15 # Number of Gaussian-selection indices for SGMMs. +# Begin configuration. +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +beam=10 +retry_beam=40 +transform_dir= # directory to find fMLLR transforms in. +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: steps/align_sgmm.sh " + echo "e.g.: steps/align_sgmm.sh --transform-dir exp/tri3b data/train data/lang \\" + echo " exp/sgmm4a exp/sgmm5a_ali" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --use-graphs true # use graphs in src-dir" + echo " --transform-dir # directory to find fMLLR transforms" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 
+ exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` || exit 1; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +sdata=$data/split$nj + +mkdir -p $dir/log +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. +echo $nj > $dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; +[ -f $srcdir/final.alimdl ] && cp $srcdir/final.alimdl $dir +cp $srcdir/final.occs $dir; + +## Set up features. +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; + [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ + && echo "$0: #jobs mismatch with transform-dir." && exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" +elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then + echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," + echo " but you are not providing the --transform-dir option during alignment." +fi +## + +## Set up model and alignment model. +mdl=$srcdir/final.mdl +if [ -f $srcdir/final.alimdl ]; then + alimdl=$srcdir/final.alimdl +else + alimdl=$srcdir/final.mdl +fi +[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1; + +## Work out where we're getting the graphs from. +if $use_graphs; then + [ "$nj" != "`cat $srcdir/num_jobs`" ] && \ + echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1; + [ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1; + graphdir=$srcdir + ln.pl $srcdir/fsts.*.gz $dir +else + graphdir=$dir + if [ $stage -le 0 ]; then + echo "$0: compiling training graphs" + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; + fi +fi + +## Work out where we're getting the Gaussian-selection info from +if $use_gselect; then + [ "$nj" != "`cat $srcdir/num_jobs`" ] && \ + echo "$0: you specified --use-gselect true, but #jobs mismatch." && exit 1; + [ ! -f $srcdir/gselect.1.gz ] && echo "No gselect info in $srcdir" && exit 1; + graphdir=$srcdir + gselect_opt="--gselect=ark,s,cs:gunzip -c $srcdir/gselect.JOB.gz|" + ln.pl $srcdir/gselect.*.gz $dir +else + graphdir=$dir + if [ $stage -le 1 ]; then + echo "$0: computing Gaussian-selection info" + # Note: doesn't matter whether we use $alimdl or $mdl, they will + # have the same gselect info. 
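+    # (The gselect archives written below store, per frame, the indices of the $gselect
+    #  best-scoring full-covariance Gaussians. Purely as a sketch, assuming job 1 has
+    #  already been produced, they can be inspected with:
+    #    copy-gselect "ark:gunzip -c $dir/gselect.1.gz|" ark,t:- | head )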
+ $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ + sgmm-gselect --full-gmm-nbest=$gselect $alimdl \ + "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; + fi + gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" +fi + + +if [ $alimdl == $mdl ]; then + # Speaker-independent decoding-- just one pass. Not normal. + T=`sgmm-info $mdl | grep 'speaker vector space' | awk '{print $NF}'` || exit 1; + [ "$T" -ne 0 ] && echo "No alignment model, yet speaker vector space nonempty" && exit 1; + + if [ $stage -le 2 ]; then + echo "$0: aligning data in $data using model $mdl (no speaker-vectors)" + $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ + sgmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $alimdl \ + "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; + fi + echo "$0: done aligning data." + exit 0; +fi + +# Continue with system with speaker vectors. +if [ $stage -le 2 ]; then + echo "$0: aligning data in $data using model $alimdl" + $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ + sgmm-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam $alimdl \ + "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1; +fi + +if [ $stage -le 3 ]; then + echo "$0: computing speaker vectors (1st pass)" + $cmd JOB=1:$nj $dir/log/spk_vecs1.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + sgmm-post-to-gpost "$gselect_opt" $alimdl "$feats" ark:- ark:- \| \ + sgmm-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \ + $mdl "$feats" ark,s,cs:- ark:$dir/pre_vecs.JOB || exit 1; +fi + +if [ $stage -le 4 ]; then + echo "$0: computing speaker vectors (2nd pass)" + $cmd JOB=1:$nj $dir/log/spk_vecs2.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + sgmm-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \ + --spk-vecs=ark:$dir/pre_vecs.JOB $mdl "$feats" ark,s,cs:- ark:$dir/vecs.JOB || exit 1; + rm $dir/pre_vecs.* +fi + +if [ $stage -le 5 ]; then + echo "$0: doing final alignment." + $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \ + sgmm-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam \ + --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \ + $mdl "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +rm $dir/pre_ali.*.gz + +echo "$0: done aligning data." + +utils/summarize_warnings.pl $dir/log + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/align_sgmm2.sh b/egs/chime_wsj0/s5/steps/align_sgmm2.sh new file mode 100755 index 000000000..38ff02ddc --- /dev/null +++ b/egs/chime_wsj0/s5/steps/align_sgmm2.sh @@ -0,0 +1,193 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Computes training alignments and (if needed) speaker-vectors, given an +# SGMM system. If the system is built on top of SAT, you should supply +# transforms with the --transform-dir option. + +# If you supply the --use-graphs option, it will use the training +# graphs from the source directory. + +# Begin configuration section. +stage=0 +nj=4 +cmd=run.pl +use_graphs=false # use graphs from srcdir +use_gselect=false # use gselect info from srcdir [regardless, we use + # Gaussian-selection info, we might have to compute it though.] +gselect=15 # Number of Gaussian-selection indices for SGMMs. +# Begin configuration. 
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +beam=10 +retry_beam=40 +transform_dir= # directory to find fMLLR transforms in. +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: steps/align_sgmm.sh " + echo "e.g.: steps/align_sgmm.sh --transform-dir exp/tri3b data/train data/lang \\" + echo " exp/sgmm4a exp/sgmm5a_ali" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --use-graphs true # use graphs in src-dir" + echo " --transform-dir # directory to find fMLLR transforms" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` || exit 1; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +sdata=$data/split$nj + +mkdir -p $dir/log +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. +echo $nj > $dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; +[ -f $srcdir/final.alimdl ] && cp $srcdir/final.alimdl $dir +cp $srcdir/final.occs $dir; + +## Set up features. +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; + [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ + && echo "$0: #jobs mismatch with transform-dir." && exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" +elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then + echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," + echo " but you are not providing the --transform-dir option during alignment." +fi +## + +## Set up model and alignment model. +mdl=$srcdir/final.mdl +if [ -f $srcdir/final.alimdl ]; then + alimdl=$srcdir/final.alimdl +else + alimdl=$srcdir/final.mdl +fi +[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1; + +## Work out where we're getting the graphs from. +if $use_graphs; then + [ "$nj" != "`cat $srcdir/num_jobs`" ] && \ + echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1; + [ ! 
-f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1; + graphdir=$srcdir + ln.pl $srcdir/fsts.*.gz $dir +else + graphdir=$dir + if [ $stage -le 0 ]; then + echo "$0: compiling training graphs" + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; + fi +fi + +## Work out where we're getting the Gaussian-selection info from +if $use_gselect; then + [ "$nj" != "`cat $srcdir/num_jobs`" ] && \ + echo "$0: you specified --use-gselect true, but #jobs mismatch." && exit 1; + [ ! -f $srcdir/gselect.1.gz ] && echo "No gselect info in $srcdir" && exit 1; + graphdir=$srcdir + gselect_opt="--gselect=ark,s,cs:gunzip -c $srcdir/gselect.JOB.gz|" + ln.pl $srcdir/gselect.*.gz $dir +else + graphdir=$dir + if [ $stage -le 1 ]; then + echo "$0: computing Gaussian-selection info" + # Note: doesn't matter whether we use $alimdl or $mdl, they will + # have the same gselect info. + $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ + sgmm2-gselect --full-gmm-nbest=$gselect $alimdl \ + "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; + fi + gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" +fi + + +if [ $alimdl == $mdl ]; then + # Speaker-independent decoding-- just one pass. Not normal. + T=`sgmm2-info $mdl | grep 'speaker vector space' | awk '{print $NF}'` || exit 1; + [ "$T" -ne 0 ] && echo "No alignment model, yet speaker vector space nonempty" && exit 1; + + if [ $stage -le 2 ]; then + echo "$0: aligning data in $data using model $mdl (no speaker-vectors)" + $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ + sgmm2-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $alimdl \ + "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; + fi + echo "$0: done aligning data." + exit 0; +fi + +# Continue with system with speaker vectors. +if [ $stage -le 2 ]; then + echo "$0: aligning data in $data using model $alimdl" + $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ + sgmm2-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam $alimdl \ + "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1; +fi + +if [ $stage -le 3 ]; then + echo "$0: computing speaker vectors (1st pass)" + $cmd JOB=1:$nj $dir/log/spk_vecs1.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + sgmm2-post-to-gpost "$gselect_opt" $alimdl "$feats" ark:- ark:- \| \ + sgmm2-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \ + $mdl "$feats" ark,s,cs:- ark:$dir/pre_vecs.JOB || exit 1; +fi + +if [ $stage -le 4 ]; then + echo "$0: computing speaker vectors (2nd pass)" + $cmd JOB=1:$nj $dir/log/spk_vecs2.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + sgmm2-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \ + --spk-vecs=ark:$dir/pre_vecs.JOB $mdl "$feats" ark,s,cs:- ark:$dir/vecs.JOB || exit 1; + rm $dir/pre_vecs.* +fi + +if [ $stage -le 5 ]; then + echo "$0: doing final alignment." 
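+  # (The speaker vectors estimated in the previous stages are passed in below via
+  #  --spk-vecs/--utt2spk, so every utterance is aligned under its speaker-adapted
+  #  SGMM. As an illustrative check only, assuming vecs.1 exists:
+  #    copy-vector ark:$dir/vecs.1 ark,t:- | head -n 2 )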
+ $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \ + sgmm2-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam \ + --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \ + $mdl "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +rm $dir/pre_ali.*.gz + +echo "$0: done aligning data." + +utils/summarize_warnings.pl $dir/log + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/align_si.sh b/egs/chime_wsj0/s5/steps/align_si.sh new file mode 100755 index 000000000..d525550f1 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/align_si.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Computes training alignments using a model with delta or +# LDA+MLLT features. + +# If you supply the "--use-graphs true" option, it will use the training +# graphs from the source directory (where the model is). In this +# case the number of jobs must match with the source directory. + + +# Begin configuration section. +nj=4 +cmd=run.pl +use_graphs=false +# Begin configuration. +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +beam=10 +retry_beam=40 +boost_silence=1.0 # Factor by which to boost silence during alignment. +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: steps/align_si.sh " + echo "e.g.: steps/align_si.sh data/train data/lang exp/tri1 exp/tri1_ali" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --use-graphs true # use graphs in src-dir" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +mkdir -p $dir/log +echo $nj > $dir/num_jobs +sdata=$data/split$nj +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; +cp $srcdir/final.occs $dir; + + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac + +echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir" + +mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/final.mdl - |" + +if $use_graphs; then + [ $nj != "`cat $srcdir/num_jobs`" ] && echo "$0: mismatch in num-jobs" && exit 1; + [ ! 
-f $srcdir/fsts.1.gz ] && echo "$0: no such file $srcdir/fsts.1.gz" && exit 1; + + $cmd JOB=1:$nj $dir/log/align.JOB.log \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \ + "ark:gunzip -c $srcdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +else + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + # We could just use gmm-align in the next line, but it's less efficient as it compiles the + # training graphs one by one. + $cmd JOB=1:$nj $dir/log/align.JOB.log \ + compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" ark:- \| \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" ark:- \ + "$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +echo "$0: done aligning data." diff --git a/egs/chime_wsj0/s5/steps/append_feats.sh b/egs/chime_wsj0/s5/steps/append_feats.sh new file mode 100755 index 000000000..9ae6e8279 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/append_feats.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 +# This script appends the features in two data directories. + +# To be run from .. (one directory up from here) +# see ../run.sh for example + +# Begin configuration section. +cmd=run.pl +nj=4 +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 5 ]; then + echo "usage: append_feats.sh [options] "; + echo "options: " + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data_src1=$1 +data_src2=$2 +data=$3 +logdir=$4 +mfccdir=$5 + +# make $mfccdir an absolute pathname. +mfccdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $mfccdir ${PWD}` + +utils/split_data.sh $data_src1 $nj || exit 1; +utils/split_data.sh $data_src2 $nj || exit 1; + +mkdir -p $mfccdir $logdir + +mkdir -p $data +cp $data_src1/* $data/ 2>/dev/null # so we get the other files, such as utt2spk. +rm $data/cmvn.scp 2>/dev/null +rm -r $data/split* 2>/dev/null + +# use "name" as part of name of the archive. +name=`basename $data` + +$cmd JOB=1:$nj $logdir/append.JOB.log \ + append-feats --truncate-frames=true \ + scp:$data_src1/split$nj/JOB/feats.scp scp:$data_src2/split$nj/JOB/feats.scp \ + ark,scp:$mfccdir/appended_$name.JOB.ark,$mfccdir/appended_$name.JOB.scp || exit 1; + +# concatenate the .scp files together. +for ((n=1; n<=nj; n++)); do + cat $mfccdir/appended_$name.$n.scp >> $data/feats.scp || exit 1; +done > $data/feats.scp || exit 1; + + +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` +if [ $nf -ne $nu ]; then + echo "It seems not all of the feature files were successfully ($nf != $nu);" + echo "consider using utils/fix_data_dir.sh $data" +fi + +echo "Succeeded creating MFCC features for $name" diff --git a/egs/chime_wsj0/s5/steps/compute_cmvn_stats.sh b/egs/chime_wsj0/s5/steps/compute_cmvn_stats.sh new file mode 100755 index 000000000..17eb62e83 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/compute_cmvn_stats.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 +# To be run from .. (one directory up from here) +# see ../run.sh for example + +# Compute cepstral mean and variance statistics per speaker. +# We do this in just one job; it's fast. +# This script takes no options. +# +# Note: there is no option to do CMVN per utterance. 
The idea is +# that if you did it per utterance it would not make sense to do +# per-speaker fMLLR on top of that (since you'd be doing fMLLR on +# top of different offsets). Therefore what would be the use +# of the speaker information? In this case you should probably +# make the speaker-ids identical to the utterance-ids. The +# speaker information does not have to correspond to actual +# speakers, it's just the level you want to adapt at. + +echo "$0 $@" # Print the command line for logging + +fake=false +if [ $1 == "--fake" ]; then + fake=true + shift +fi + +if [ $# != 3 ]; then + echo "usage: compute_cmvn_stats.sh [--fake] "; + echo "(note: --fake gives you fake cmvn stats that do no normalization.)" + exit 1; +fi + +if [ -f path.sh ]; then . ./path.sh; fi + +data=$1 +logdir=$2 +cmvndir=$3 + +# make $cmvndir an absolute pathname. +cmvndir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $cmvndir ${PWD}` + +# use "name" as part of name of the archive. +name=`basename $data` + +mkdir -p $cmvndir || exit 1; +mkdir -p $logdir || exit 1; + + +required="$data/feats.scp $data/spk2utt" + +for f in $required; do + if [ ! -f $f ]; then + echo "make_cmvn.sh: no such file $f" + exit 1; + fi +done + +if $fake; then + dim=`feat-to-dim scp:$data/feats.scp -` + ! cat $data/spk2utt | awk -v dim=$dim '{print $1, "["; for (n=0; n < dim; n++) { printf("0 "); } print "1"; + for (n=0; n < dim; n++) { printf("1 "); } print "0 ]";}' | \ + copy-matrix ark:- ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp && \ + echo "Error creating fake CMVN stats" && exit 1; +else + ! compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp \ + 2> $logdir/cmvn_$name.log && echo "Error computing CMVN stats" && exit 1; +fi + +cp $cmvndir/cmvn_$name.scp $data/cmvn.scp || exit 1; + +nc=`cat $data/cmvn.scp | wc -l` +nu=`cat $data/spk2utt | wc -l` +if [ $nc -ne $nu ]; then + echo "Error: it seems not all of the speakers got cmvn stats ($nc != $nu);" + exit 1; +fi + +echo "Succeeded creating CMVN stats for $name" diff --git a/egs/chime_wsj0/s5/steps/decode.sh b/egs/chime_wsj0/s5/steps/decode.sh new file mode 100755 index 000000000..f41ba6349 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Begin configuration section. +transform_dir= +iter= +model= # You can specify the model to use (e.g. if you want to use the .alimdl) +stage=0 +nj=4 +cmd=run.pl +max_active=7000 +max_arcs=-1 +beam=13.0 +latbeam=6.0 +acwt=0.083333 # note: only really affects pruning (scoring is on lattices). +num_threads=1 # if >1, will use gmm-latgen-faster-parallel +parallel_opts= # If you supply num-threads, you should supply this too. +scoring_opts= +# note: there are no more min-lmwt and max-lmwt options, instead use +# e.g. --scoring-opts "--min-lmwt 1 --max-lmwt 20" +skip_scoring=false +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: steps/decode.sh [options] " + echo "... where is assumed to be a sub-directory of the directory" + echo " where the model is." 
+ echo "e.g.: steps/decode.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr" + echo "" + echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out" + echo "what type of features you used (assuming it's one of these two)" + echo "" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --iter # Iteration of model to test." + echo " --model # which model to use (e.g. to" + echo " # specify the final.alimdl)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --transform-dir # dir to find fMLLR transforms " + echo " --acwt # acoustic scale used for lattice generation " + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --parallel-opts # e.g. '-pe smp 4' if you supply --num-threads 4" + exit 1; +fi + + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # The model directory is one level up from decoding directory. +sdata=$data/split$nj; + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +if [ -z "$model" ]; then # if --model was not specified on the command line... + if [ -z $iter ]; then model=$srcdir/final.mdl; + else model=$srcdir/$iter.mdl; fi +fi + +for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fst; do + [ ! -f $f ] && echo "decode.sh: no such file $f" && exit 1; +done + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "decode.sh: feature type is $feat_type"; + +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then # add transforms to features... + echo "Using fMLLR transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." + [ "`cat $transform_dir/num_jobs`" -ne $nj ] && \ + echo "Mismatch in number of jobs with $transform_dir"; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" +fi + +if [ $stage -le 0 ]; then + $cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \ + gmm-latgen-faster$thread_string --max-arcs=$max_arcs --max-active=$max_active --beam=$beam --lattice-beam=$latbeam \ + --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \ + $model $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; +fi + +if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." 
&& exit 1; + local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir +fi + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/decode_basis_fmllr.sh b/egs/chime_wsj0/s5/steps/decode_basis_fmllr.sh new file mode 100755 index 000000000..b0521aa59 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_basis_fmllr.sh @@ -0,0 +1,206 @@ +#!/bin/bash + +# Copyright 2012 Carnegie Mellon University (Author: Yajie Miao) +# Johns Hopkins University (Author: Daniel Povey) + +# Decoding script that does basis fMLLR. This can be on top of delta+delta-delta, +# or LDA+MLLT features. + +# There are 3 models involved potentially in this script, +# and for a standard, speaker-independent system they will all be the same. +# The "alignment model" is for the 1st-pass decoding and to get the +# Gaussian-level alignments for the "adaptation model" the first time we +# do fMLLR. The "adaptation model" is used to estimate fMLLR transforms +# and to generate state-level lattices. The lattices are then rescored +# with the "final model". +# +# The following table explains where we get these 3 models from. +# Note: $srcdir is one level up from the decoding directory. +# +# Model Default source: +# +# "alignment model" $srcdir/final.alimdl --alignment-model +# (or $srcdir/final.mdl if alimdl absent) +# "adaptation model" $srcdir/final.mdl --adapt-model +# "final model" $srcdir/final.mdl --final-model + + +# Begin configuration section +first_beam=10.0 # Beam used in initial, speaker-indep. pass +first_max_active=2000 # max-active used in initial pass. +alignment_model= +adapt_model= +final_model= +stage=0 +acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in + # lattice generation. + +# Parameters in alignment of training data +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +align_beam=10 +retry_beam=40 + +max_active=7000 +beam=13.0 +lattice_beam=6.0 +nj=4 +silence_weight=0.01 +cmd=run.pl +si_dir= +# End configuration section + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: steps/decode_basis_fmllr.sh [options] " + echo " e.g.: steps/decode_basis_fmllr.sh exp/tri2b/graph_tgpr data/train_si84 data/test_dev93 exp/tri2b/decode_dev93_tgpr" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --adapt-model # Model to compute transforms with" + echo " --alignment-model # Model to get Gaussian-level alignments for" + echo " # 1st pass of transform computation." + echo " --final-model # Model to finally decode with" + echo " --si-dir # use this to skip 1st pass of decoding" + echo " # Caution-- must be with same tree" + echo " --acwt # default 0.08333 ... used to get posteriors" + + exit 1; +fi + + +graphdir=$1 +data=$2 +dir=`echo $3 | sed 's:/$::g'` # remove any trailing slash. + +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. +sdata=$data/split$nj; + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. + +silphonelist=`cat $graphdir/phones/silence.csl` || exit 1; + +# Some checks. Note: we don't need $srcdir/tree but we expect +# it should exist, given the current structure of the scripts. 
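+# (The fmllr.basis checked below must have been estimated beforehand on training data,
+#  e.g. with steps/get_fmllr_basis.sh if that script is available; a sketch, with an
+#  assumed <data> <lang> <exp-dir> interface and illustrative directory names:
+#    steps/get_fmllr_basis.sh --cmd run.pl data/train_si84 data/lang exp/tri4b )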
+for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/tree $srcdir/fmllr.basis; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +## Work out name of alignment model. ## +if [ -z "$alignment_model" ]; then + if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl; + else alignment_model=$srcdir/final.mdl; fi +fi +[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1; +## + +## Do the speaker-independent decoding, if --si-dir option not present. ## +if [ -z "$si_dir" ]; then # we need to do the speaker-independent decoding pass. + si_dir=${dir}.si # Name it as our decoding dir, but with suffix ".si". + if [ $stage -le 0 ]; then + steps/decode.sh --acwt $acwt --nj $nj --cmd "$cmd" --beam $first_beam --model $alignment_model --max-active $first_max_active $graphdir $data $si_dir || exit 1; + fi +fi +## + +## Some checks, and setting of defaults for variables. +[ "$nj" -ne "`cat $si_dir/num_jobs`" ] && echo "Mismatch in #jobs with si-dir" && exit 1; +[ ! -f "$si_dir/lat.1.gz" ] && echo "No such file $si_dir/lat.1.gz" && exit 1; +[ -z "$adapt_model" ] && adapt_model=$srcdir/final.mdl +[ -z "$final_model" ] && final_model=$srcdir/final.mdl +for f in $adapt_model $final_model; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done +## + +## Set up the unadapted features "$sifeats" for testing set +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type"; +case $feat_type in + delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac +## + +## Now get the first-pass fMLLR transforms. +## We give all the default parameters in gmm-est-basis-fmllr +if [ $stage -le 1 ]; then + echo "$0: getting first-pass fMLLR transforms." + $cmd JOB=1:$nj $dir/log/fmllr_pass1.JOB.log \ + gunzip -c $si_dir/lat.JOB.gz \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $alignment_model ark:- ark:- \| \ + gmm-post-to-gpost $alignment_model "$sifeats" ark:- ark:- \| \ + gmm-est-basis-fmllr-gpost --spk2utt=ark:$sdata/JOB/spk2utt \ + --fmllr-min-count=200 --num-iters=10 --size-scale=0.2 \ + --step-size-iters=3 --write-weights=ark:$dir/pre_wgt.JOB \ + $adapt_model $srcdir/fmllr.basis "$sifeats" ark,s,cs:- \ + ark:$dir/pre_trans.JOB || exit 1; +fi +## + +pass1feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/pre_trans.JOB ark:- ark:- |" + +## Do the main lattice generation pass. Note: we don't determinize the lattices at +## this stage, as we're going to use them in acoustic rescoring with the larger +## model, and it's more correct to store the full state-level lattice for this purpose. 
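+# (Sketch only, not part of the recipe: should determinized first-pass lattices be
+#  needed for some other purpose, they can be derived afterwards without re-decoding:
+#    lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam \
+#      "ark:gunzip -c $dir/lat.tmp.1.gz|" "ark:|gzip -c >$dir/det_lat.1.gz"
+#  here the undeterminized lattices are kept because they get rescored below.)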
+if [ $stage -le 2 ]; then + echo "$0: doing main lattice generation phase" + $cmd JOB=1:$nj $dir/log/decode.JOB.log \ + gmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \ + --acoustic-scale=$acwt \ + --determinize-lattice=false --allow-partial=true --word-symbol-table=$graphdir/words.txt \ + $adapt_model $graphdir/HCLG.fst "$pass1feats" "ark:|gzip -c > $dir/lat.tmp.JOB.gz" \ + || exit 1; +fi +## + +## Do a second pass of estimating the transform-- this time with the lattices +## generated from the alignment model. Compose the transforms to get +## $dir/trans.1, etc. +if [ $stage -le 3 ]; then + echo "$0: estimating fMLLR transforms a second time." + $cmd JOB=1:$nj $dir/log/fmllr_pass2.JOB.log \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=4.0 \ + "ark:gunzip -c $dir/lat.tmp.JOB.gz|" ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $adapt_model ark:- ark:- \| \ + gmm-est-basis-fmllr --fmllr-min-count=200 \ + --spk2utt=ark:$sdata/JOB/spk2utt --write-weights=ark:$dir/trans_tmp_wgt.JOB \ + $adapt_model $srcdir/fmllr.basis "$pass1feats" ark,s,cs:- ark:$dir/trans_tmp.JOB '&&' \ + compose-transforms --b-is-affine=true ark:$dir/trans_tmp.JOB ark:$dir/pre_trans.JOB \ + ark:$dir/trans.JOB || exit 1; +fi +## + +feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" + +# Rescore the state-level lattices with the final adapted features, and the final model +# (which by default is $srcdir/final.mdl, but which may be specified on the command line, +# useful in case of discriminatively trained systems). +# At this point we prune and determinize the lattices and write them out, ready for +# language model rescoring. + +if [ $stage -le 4 ]; then + echo "$0: doing a final pass of acoustic rescoring." + $cmd JOB=1:$nj $dir/log/acoustic_rescore.JOB.log \ + gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat.tmp.JOB.gz|" "$feats" ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \ + "ark:|gzip -c > $dir/lat.JOB.gz" '&&' rm $dir/lat.tmp.JOB.gz || exit 1; +fi + +[ ! -x local/score.sh ] && \ + echo "$0: not scoring because local/score.sh does not exist or not executable." && exit 1; +local/score.sh --cmd "$cmd" $data $graphdir $dir + +rm $dir/{trans_tmp,pre_trans}.* + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/decode_biglm.sh b/egs/chime_wsj0/s5/steps/decode_biglm.sh new file mode 100755 index 000000000..ec2d0667c --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_biglm.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Begin configuration. +nj=4 +cmd=run.pl +maxactive=7000 +beam=13.0 +latbeam=6.0 +acwt=0.083333 +# End configuration. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 5 ]; then + echo "Usage: steps/decode_si_biglm.sh [options] " + echo "... where is assumed to be a sub-directory of the directory" + echo " where the model is." 
+ echo "e.g.: steps/decode_si.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr" + echo "" + echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out" + echo "what type of features you used (assuming it's one of these two)" + echo "" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + + +graphdir=$1 +oldlm_fst=$2 +newlm_fst=$3 +data=$4 +dir=$5 + +srcdir=`dirname $dir`; # The model directory is one level up from decoding directory. +sdata=$data/split$nj; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + + +for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $srcdir/final.mdl $graphdir/HCLG.fst $oldlm_fst $newlm_fst; do + [ ! -f $f ] && echo "decode_si.sh: no such file $f" && exit 1; +done + + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "decode_si.sh: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +[ -f `dirname $oldlm_fst`/words.txt ] && ! cmp `dirname $oldlm_fst`/words.txt $graphdir/words.txt && \ + echo "Warning: old LM words.txt does not match with that in $graphdir .. probably will not work."; +[ -f `dirname $newlm_fst`/words.txt ] && ! cmp `dirname $oldlm_fst`/words.txt $graphdir/words.txt && \ + echo "Warning: new LM words.txt does not match with that in $graphdir .. probably will not work."; + +# fstproject replaces the disambiguation symbol #0, which only appears on the +# input side, with the that appears in the corresponding arcs on the output side. +oldlm_cmd="fstproject --project_output=true $oldlm_fst | fstarcsort --sort_type=ilabel |" +newlm_cmd="fstproject --project_output=true $newlm_fst | fstarcsort --sort_type=ilabel |" + +$cmd JOB=1:$nj $dir/log/decode.JOB.log \ + gmm-latgen-biglm-faster --max-active=$maxactive --beam=$beam --lattice-beam=$latbeam \ + --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \ + $srcdir/final.mdl $graphdir/HCLG.fst "$oldlm_cmd" "$newlm_cmd" "$feats" \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + +[ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; +local/score.sh --cmd "$cmd" $data $graphdir $dir + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/decode_combine.sh b/egs/chime_wsj0/s5/steps/decode_combine.sh new file mode 100755 index 000000000..b8ac5ede1 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_combine.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# Combine two decoding directories by composing the lattices (we +# apply a weight to each of the original weights, by default 0.5 each). + +# Begin configuration section. +weight1=0.5 # Weight on 1st set of lattices. +cmd=run.pl +# End configuration section. 
+ +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# -ne 5 ]; then + echo "Usage: steps/decode_combine.sh [options] " + echo " e.g.: steps/decode_combine.sh data/lang data/test exp/dir1/decode exp/dir2/decode exp/combine_1_2/decode" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd # Command to run in parallel with" + echo " --weight1 # Weight on 1st set of lattices (default 0.5)" + exit 1; +fi + +data=$1 +lang_or_graphdir=$2 +srcdir1=$3 +srcdir2=$4 +dir=$5 + +for f in $data/utt2spk $lang_or_graphdir/phones.txt $srcdir1/lat.1.gz $srcdir2/lat.1.gz; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +nj1=`cat $srcdir1/num_jobs` || exit 1; +nj2=`cat $srcdir2/num_jobs` || exit 1; +[ $nj1 -ne $nj2 ] && echo "$0: mismatch in number of jobs $nj1 versus $nj2" && exit 1; +nj=$nj1 + +mkdir -p $dir/log +echo $nj > $dir/num_jobs + +# The lattice-interp command does the score interpolation (with composition), +# and the lattice-copy-backoff replaces the result with the 1st lattice, in +# cases where the composed result was empty. +$cmd JOB=1:$nj $dir/log/interp.JOB.log \ + lattice-interp --alpha=$weight1 "ark:gunzip -c $srcdir1/lat.JOB.gz|" \ + "ark,s,cs:gunzip -c $srcdir2/lat.JOB.gz|" ark:- \| \ + lattice-copy-backoff "ark,s,cs:gunzip -c $srcdir1/lat.JOB.gz|" ark,s,cs:- \ + "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; + +[ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; +local/score.sh --cmd "$cmd" $data $lang_or_graphdir $dir + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/decode_fmllr.sh b/egs/chime_wsj0/s5/steps/decode_fmllr.sh new file mode 100755 index 000000000..4d171a2a4 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_fmllr.sh @@ -0,0 +1,217 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) + +# Decoding script that does fMLLR. This can be on top of delta+delta-delta, or +# LDA+MLLT features. + +# There are 3 models involved potentially in this script, +# and for a standard, speaker-independent system they will all be the same. +# The "alignment model" is for the 1st-pass decoding and to get the +# Gaussian-level alignments for the "adaptation model" the first time we +# do fMLLR. The "adaptation model" is used to estimate fMLLR transforms +# and to generate state-level lattices. The lattices are then rescored +# with the "final model". +# +# The following table explains where we get these 3 models from. +# Note: $srcdir is one level up from the decoding directory. +# +# Model Default source: +# +# "alignment model" $srcdir/final.alimdl --alignment-model +# (or $srcdir/final.mdl if alimdl absent) +# "adaptation model" $srcdir/final.mdl --adapt-model +# "final model" $srcdir/final.mdl --final-model + + +# Begin configuration section +first_beam=10.0 # Beam used in initial, speaker-indep. pass +first_max_active=2000 # max-active used in initial pass. +first_max_arcs=-1 +alignment_model= +adapt_model= +final_model= +stage=0 +acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in + # lattice generation. +max_active=7000 +max_arcs=-1 +beam=13.0 +lattice_beam=6.0 +nj=4 +silence_weight=0.01 +cmd=run.pl +si_dir= +fmllr_update_type=full +num_threads=1 # if >1, will use gmm-latgen-faster-parallel +parallel_opts= # If you supply num-threads, you should supply this too. 
+skip_scoring=false +scoring_opts= +norm_vars=false +# End configuration section +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: steps/decode_fmllr.sh [options] " + echo " e.g.: steps/decode_fmllr.sh exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b/decode_dev93_tgpr" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --adapt-model # Model to compute transforms with" + echo " --alignment-model # Model to get Gaussian-level alignments for" + echo " # 1st pass of transform computation." + echo " --final-model # Model to finally decode with" + echo " --si-dir # use this to skip 1st pass of decoding" + echo " # Caution-- must be with same tree" + echo " --acwt # default 0.08333 ... used to get posteriors" + echo " --num-threads # number of threads to use, default 1." + echo " --parallel-opts # e.g. '-pe smp 4' if you supply --num-threads 4" + echo " --scoring-opts # options to local/score.sh" + exit 1; +fi + + +graphdir=$1 +data=$2 +dir=`echo $3 | sed 's:/$::g'` # remove any trailing slash. + +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. +sdata=$data/split$nj; + +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + + +mkdir -p $dir/log +split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. + +silphonelist=`cat $graphdir/phones/silence.csl` || exit 1; + +# Some checks. Note: we don't need $srcdir/tree but we expect +# it should exist, given the current structure of the scripts. +for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/tree; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +## Work out name of alignment model. ## +if [ -z "$alignment_model" ]; then + if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl; + else alignment_model=$srcdir/final.mdl; fi +fi +[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1; +## + +## Do the speaker-independent decoding, if --si-dir option not present. ## +if [ -z "$si_dir" ]; then # we need to do the speaker-independent decoding pass. + si_dir=${dir}.si # Name it as our decoding dir, but with suffix ".si". + if [ $stage -le 0 ]; then + steps/decode.sh --parallel-opts "$parallel_opts" --scoring-opts "$scoring_opts" \ + --num-threads $num_threads --skip-scoring $skip_scoring \ + --acwt $acwt --nj $nj --cmd "$cmd" --beam $first_beam \ + --model $alignment_model --max-arcs $max_arcs --max-active \ + $first_max_active $graphdir $data $si_dir || exit 1; + fi +fi +## + +## Some checks, and setting of defaults for variables. +[ "$nj" -ne "`cat $si_dir/num_jobs`" ] && echo "Mismatch in #jobs with si-dir" && exit 1; +[ ! -f "$si_dir/lat.1.gz" ] && echo "No such file $si_dir/lat.1.gz" && exit 1; +[ -z "$adapt_model" ] && adapt_model=$srcdir/final.mdl +[ -z "$final_model" ] && final_model=$srcdir/final.mdl +for f in $adapt_model $final_model; do + [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; +done +## + +## Set up the unadapted features "$sifeats" +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type"; +case $feat_type in + delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac +## + +## Now get the first-pass fMLLR transforms. +if [ $stage -le 1 ]; then + echo "$0: getting first-pass fMLLR transforms." + $cmd JOB=1:$nj $dir/log/fmllr_pass1.JOB.log \ + gunzip -c $si_dir/lat.JOB.gz \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $alignment_model ark:- ark:- \| \ + gmm-post-to-gpost $alignment_model "$sifeats" ark:- ark:- \| \ + gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$sifeats" ark,s,cs:- \ + ark:$dir/pre_trans.JOB || exit 1; +fi +## + +pass1feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/pre_trans.JOB ark:- ark:- |" + +## Do the main lattice generation pass. Note: we don't determinize the lattices at +## this stage, as we're going to use them in acoustic rescoring with the larger +## model, and it's more correct to store the full state-level lattice for this purpose. +if [ $stage -le 2 ]; then + echo "$0: doing main lattice generation phase" + $cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \ + gmm-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \ + --acoustic-scale=$acwt --max-arcs=$max_arcs \ + --determinize-lattice=false --allow-partial=true --word-symbol-table=$graphdir/words.txt \ + $adapt_model $graphdir/HCLG.fst "$pass1feats" "ark:|gzip -c > $dir/lat.tmp.JOB.gz" \ + || exit 1; +fi +## + +## Do a second pass of estimating the transform-- this time with the lattices +## generated from the alignment model. Compose the transforms to get +## $dir/trans.1, etc. +if [ $stage -le 3 ]; then + echo "$0: estimating fMLLR transforms a second time." + $cmd JOB=1:$nj $dir/log/fmllr_pass2.JOB.log \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=4.0 \ + "ark:gunzip -c $dir/lat.tmp.JOB.gz|" ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $adapt_model ark:- ark:- \| \ + gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$pass1feats" \ + ark,s,cs:- ark:$dir/trans_tmp.JOB '&&' \ + compose-transforms --b-is-affine=true ark:$dir/trans_tmp.JOB ark:$dir/pre_trans.JOB \ + ark:$dir/trans.JOB || exit 1; +fi +## + +feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" + +# Rescore the state-level lattices with the final adapted features, and the final model +# (which by default is $srcdir/final.mdl, but which may be specified on the command line, +# useful in case of discriminatively trained systems). +# At this point we prune and determinize the lattices and write them out, ready for +# language model rescoring. 
+ +if [ $stage -le 4 ]; then + echo "$0: doing a final pass of acoustic rescoring." + $cmd $parallel_opts JOB=1:$nj $dir/log/acoustic_rescore.JOB.log \ + gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat.tmp.JOB.gz|" "$feats" ark:- \| \ + lattice-determinize-pruned$thread_string --acoustic-scale=$acwt --beam=$lattice_beam ark:- \ + "ark:|gzip -c > $dir/lat.JOB.gz" '&&' rm $dir/lat.tmp.JOB.gz || exit 1; +fi + +if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "$0: not scoring because local/score.sh does not exist or not executable." && exit 1; + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir +fi + +rm $dir/{trans_tmp,pre_trans}.* + +exit 0; + diff --git a/egs/chime_wsj0/s5/steps/decode_fmllr_extra.sh b/egs/chime_wsj0/s5/steps/decode_fmllr_extra.sh new file mode 100755 index 000000000..51cc06057 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_fmllr_extra.sh @@ -0,0 +1,250 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) + +# Decoding script that does fMLLR. This can be on top of delta+delta-delta, or +# LDA+MLLT features. +# This script does an extra pass of lattice generation over and above what the original +# script did-- it's for robustness in the case where your original cepstral mean +# normalization was way off. +# We also added a new option --distribute=true (by default) to +# weight-silence-post. This weights the silence frames in a different way, +# weighting all posteriors on the frame rather than just the silence ones, which +# removes a particular kind of bias that the old approach suffered from. + +# There are 3 models involved potentially in this script, +# and for a standard, speaker-independent system they will all be the same. +# The "alignment model" is for the 1st-pass decoding and to get the +# Gaussian-level alignments for the "adaptation model" the first time we +# do fMLLR. The "adaptation model" is used to estimate fMLLR transforms +# and to generate state-level lattices. The lattices are then rescored +# with the "final model". +# +# The following table explains where we get these 3 models from. +# Note: $srcdir is one level up from the decoding directory. +# +# Model Default source: +# +# "alignment model" $srcdir/final.alimdl --alignment-model +# (or $srcdir/final.mdl if alimdl absent) +# "adaptation model" $srcdir/final.mdl --adapt-model +# "final model" $srcdir/final.mdl --final-model + + +# Begin configuration section +first_beam=10.0 # Beam used in initial, speaker-indep. pass +first_max_active=2000 # max-active used in first two passes. +first_latbeam=4.0 # lattice pruning beam for si decode and first-pass fMLLR decode. + # the different spelling from lattice_beam is unfortunate; these scripts + # have a history. +alignment_model= +adapt_model= +final_model= +cleanup=true +stage=0 +acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in + # lattice generation. +max_active=7000 +beam=13.0 +lattice_beam=6.0 +nj=4 +silence_weight=0.01 +distribute=true # option to weight-silence-post. +cmd=run.pl +si_dir= +fmllr_update_type=full +skip_scoring=false +num_threads=1 # if >1, will use gmm-latgen-faster-parallel +parallel_opts= # If you supply num-threads, you should supply this too. +scoring_opts= + +# End configuration section + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. 
parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: steps/decode_fmllr.sh [options] " + echo " e.g.: steps/decode_fmllr.sh exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b/decode_dev93_tgpr" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --adapt-model # Model to compute transforms with" + echo " --alignment-model # Model to get Gaussian-level alignments for" + echo " # 1st pass of transform computation." + echo " --final-model # Model to finally decode with" + echo " --si-dir # use this to skip 1st pass of decoding" + echo " # Caution-- must be with same tree" + echo " --acwt # default 0.08333 ... used to get posteriors" + echo " --num-threads # number of threads to use, default 1." + echo " --parallel-opts # e.g. '-pe smp 4' if you supply --num-threads 4" + echo " --scoring-opts # options to local/score.sh" + exit 1; +fi + + +graphdir=$1 +data=$2 +dir=`echo $3 | sed 's:/$::g'` # remove any trailing slash. + +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. +sdata=$data/split$nj; + +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. + +silphonelist=`cat $graphdir/phones/silence.csl` || exit 1; + +# Some checks. Note: we don't need $srcdir/tree but we expect +# it should exist, given the current structure of the scripts. +for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/tree; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +## Work out name of alignment model. ## +if [ -z "$alignment_model" ]; then + if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl; + else alignment_model=$srcdir/final.mdl; fi +fi +[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1; +## + +## Do the speaker-independent decoding, if --si-dir option not present. ## +if [ -z "$si_dir" ]; then # we need to do the speaker-independent decoding pass. + si_dir=${dir}.si # Name it as our decoding dir, but with suffix ".si". + if [ $stage -le 0 ]; then + steps/decode.sh --acwt $acwt --nj $nj --cmd "$cmd" --beam $first_beam --model $alignment_model\ + --max-active $first_max_active --parallel-opts "${parallel_opts}" --num-threads $num_threads\ + --skip-scoring true $graphdir $data $si_dir || exit 1; + fi +fi +## + +## Some checks, and setting of defaults for variables. +[ "$nj" -ne "`cat $si_dir/num_jobs`" ] && echo "Mismatch in #jobs with si-dir" && exit 1; +[ ! -f "$si_dir/lat.1.gz" ] && echo "No such file $si_dir/lat.1.gz" && exit 1; +[ -z "$adapt_model" ] && adapt_model=$srcdir/final.mdl +[ -z "$final_model" ] && final_model=$srcdir/final.mdl +for f in $adapt_model $final_model; do + [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; +done +## + +## Set up the unadapted features "$sifeats" +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type"; +case $feat_type in + delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac +## + +## Now get the first-pass fMLLR transforms. +if [ $stage -le 1 ]; then + echo "$0: getting first-pass fMLLR transforms." + $cmd JOB=1:$nj $dir/log/fmllr_pass1.JOB.log \ + gunzip -c $si_dir/lat.JOB.gz \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post --distribute=$distribute $silence_weight $silphonelist $alignment_model ark:- ark:- \| \ + gmm-post-to-gpost $alignment_model "$sifeats" ark:- ark:- \| \ + gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$sifeats" ark,s,cs:- \ + ark:$dir/trans1.JOB || exit 1; +fi +## + +pass1feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans1.JOB ark:- ark:- |" + +## Do the first adapted lattice generation pass. +if [ $stage -le 2 ]; then + echo "$0: doing first adapted lattice generation phase" + $cmd $parallel_opts JOB=1:$nj $dir/log/decode1.JOB.log\ + gmm-latgen-faster$thread_string --max-active=$first_max_active --beam=$first_beam --lattice-beam=$first_latbeam \ + --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \ + $adapt_model $graphdir/HCLG.fst "$pass1feats" "ark:|gzip -c > $dir/lat1.JOB.gz" \ + || exit 1; +fi + + +## Do a second pass of estimating the transform. Compose the transforms to get +## $dir/trans2.*. +if [ $stage -le 3 ]; then + echo "$0: estimating fMLLR transforms a second time." + $cmd JOB=1:$nj $dir/log/fmllr_pass2.JOB.log \ + lattice-to-post --acoustic-scale=$acwt "ark:gunzip -c $dir/lat1.JOB.gz|" ark:- \| \ + weight-silence-post --distribute=$distribute $silence_weight $silphonelist $adapt_model ark:- ark:- \| \ + gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$pass1feats" \ + ark,s,cs:- ark:$dir/trans1b.JOB '&&' \ + compose-transforms --b-is-affine=true ark:$dir/trans1b.JOB ark:$dir/trans1.JOB \ + ark:$dir/trans2.JOB || exit 1; + if $cleanup; then + rm $dir/trans1b.* $dir/trans1.* $dir/lat1.*.gz + fi +fi +## + +pass2feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans2.JOB ark:- ark:- |" + +# Generate a 3rd set of lattices, with the "adaptation model"; we'll use these +# to adapt a 3rd time, and we'll rescore them. Since we should be close to the final +# fMLLR, we don't bother dumping un-determinized lattices to disk. + +## Do the final lattice generation pass (but we'll rescore these lattices +## after another stage of adaptation.) 
+if [ $stage -le 4 ]; then + echo "$0: doing final lattice generation phase" + $cmd $parallel_opts JOB=1:$nj $dir/log/decode2.JOB.log\ + gmm-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \ + --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \ + $adapt_model $graphdir/HCLG.fst "$pass2feats" "ark:|gzip -c > $dir/lat2.JOB.gz" \ + || exit 1; +fi + + +## Do a third pass of estimating the transform. Compose the transforms to get +## $dir/trans.*. +if [ $stage -le 5 ]; then + echo "$0: estimating fMLLR transforms a third time." + $cmd JOB=1:$nj $dir/log/fmllr_pass3.JOB.log \ + lattice-to-post --acoustic-scale=$acwt "ark:gunzip -c $dir/lat2.JOB.gz|" ark:- \| \ + weight-silence-post --distribute=$distribute $silence_weight $silphonelist $adapt_model ark:- ark:- \| \ + gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$pass2feats" \ + ark,s,cs:- ark:$dir/trans2b.JOB '&&' \ + compose-transforms --b-is-affine=true ark:$dir/trans2b.JOB ark:$dir/trans2.JOB \ + ark:$dir/trans.JOB || exit 1; + if $cleanup; then + rm $dir/trans2b.* $dir/trans2.* + fi +fi +## + +feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" + +if [ $stage -le 6 ]; then + echo "$0: doing a final pass of acoustic rescoring." + $cmd JOB=1:$nj $dir/log/acoustic_rescore.JOB.log \ + gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat2.JOB.gz|" "$feats" \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + if $cleanup; then + rm $dir/lat2.*.gz + fi +fi + +if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "$0: not scoring because local/score.sh does not exist or not executable." && exit 1; + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir +fi + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/decode_fmmi.sh b/egs/chime_wsj0/s5/steps/decode_fmmi.sh new file mode 100755 index 000000000..1e7ab532f --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_fmmi.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 +# Decoding of fMMI or fMPE models (feature-space discriminative training). +# If transform-dir supplied, expects e.g. fMLLR transforms in that dir. + +# Begin configuration section. +stage=1 +iter=final +nj=4 +cmd=run.pl +maxactive=7000 +beam=13.0 +latbeam=6.0 +acwt=0.083333 # note: only really affects pruning (scoring is on lattices). +ngselect=2; # Just use the 2 top Gaussians for fMMI/fMPE. Should match train. +transform_dir= +num_threads=1 # if >1, will use gmm-latgen-faster-parallel +parallel_opts= # If you supply num-threads, you should supply this too. +scoring_opts= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: steps/decode_fmmi.sh [options] " + echo "... where is assumed to be a sub-directory of the directory" + echo " where the model is." + echo "e.g.: steps/decode_fmmi.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr" + echo "" + echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out" + echo "what type of features you used (assuming it's one of these two)" + echo "You can also use fMLLR features-- you have to supply --transform-dir option." 
+ echo "" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --iter # Iteration of model to test." + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --acwt # acoustic scale used for lattice generation " + echo " --transform-dir # where to find fMLLR transforms." + echo " --scoring-opts # options to local/score.sh" + echo " # speaker-adapted decoding" + echo " --num-threads # number of threads to use, default 1." + echo " --parallel-opts # e.g. '-pe smp 4' if you supply --num-threads 4" + exit 1; +fi + + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # The model directory is one level up from decoding directory. +sdata=$data/split$nj; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +model=$srcdir/$iter.mdl + +for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fst; do + [ ! -f $f ] && echo "decode_fmmi.sh: no such file $f" && exit 1; +done + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "decode_fmmi.sh: feature type is $feat_type"; + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +if [ ! -z "$transform_dir" ]; then # add transforms to features... + echo "Using fMLLR transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." + [ "`cat $transform_dir/num_jobs`" -ne $nj ] && \ + echo "Mismatch in number of jobs with $transform_dir"; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" +fi + +fmpefeats="$feats fmpe-apply-transform $srcdir/$iter.fmpe ark:- 'ark,s,cs:gunzip -c $dir/gselect.JOB.gz|' ark:- |" + +if [ $stage -le 1 ]; then + # Get Gaussian selection info. + $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ + gmm-gselect --n=$ngselect $srcdir/$iter.fmpe "$feats" \ + "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; +fi + +if [ $stage -le 2 ]; then + $cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \ + gmm-latgen-faster$thread_string --max-active=$maxactive --beam=$beam --lattice-beam=$latbeam \ + --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \ + $model $graphdir/HCLG.fst "$fmpefeats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; +fi + +if [ $stage -le 3 ]; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." 
&& exit 1; + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir +fi +exit 0; diff --git a/egs/chime_wsj0/s5/steps/decode_fromlats.sh b/egs/chime_wsj0/s5/steps/decode_fromlats.sh new file mode 100755 index 000000000..5b8f41a86 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_fromlats.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Decode, limited to the word-sequences that were present in a set +# of lattices on disk. The other lattices do not have to be built +# with the same tree or the same context size-- however, you do +# have to be using the same vocabulary (words.txt)-- if not you'd +# have to map the vocabulary somehow. + +# Note: if the trees are identical, you can use gmm-rescore-lattice. + +# Mechanism: create an unweighted acceptor (on words) for each utterance, +# compose that with G, determinize, and then use compile-train-graphs-fsts +# to compile a graph for each utterance, to decode with. + +# Begin configuration. +cmd=run.pl +maxactive=7000 +beam=20.0 +latbeam=7.0 +acwt=0.083333 +batch_size=75 # Limits memory blowup in compile-train-graphs-fsts +scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" +# End configuration. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + + + +if [ $# != 4 ]; then + echo "Usage: steps/decode_si_fromlats.sh [options] " + echo "e.g.: steps/decode_si_fromlats.sh data/test_dev93 data/lang_test_tg exp/tri2b/decode_tgpr_dev93 exp/tri2a/decode_tgpr_dev93_fromlats" + echo "" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + + +data=$1 +lang=$2 +olddir=$3 +dir=$4 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. + +mkdir -p $dir/log + +nj=`cat $olddir/num_jobs` || exit 1; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` +sdata=$data/split$nj +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj >$dir/num_jobs + +for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $srcdir/final.mdl $olddir/lat.1.gz \ + $srcdir/tree $lang/L_disambig.fst $lang/phones.txt; do + [ ! 
-f $f ] && echo "decode_si_fromlats.sh: no such file $f" && exit 1; +done + + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "decode_si.sh: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + + +$cmd JOB=1:$nj $dir/log/decode_lats.JOB.log \ + lattice-to-fst "ark:gunzip -c $olddir/lat.JOB.gz|" ark:- \| \ + fsttablecompose "fstproject --project_output=true $lang/G.fst | fstarcsort |" ark:- ark:- \| \ + fstdeterminizestar ark:- ark:- \| \ + compile-train-graphs-fsts --read-disambig-syms=$lang/phones/disambig.int \ + --batch-size=$batch_size $scale_opts $srcdir/tree $srcdir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \ + gmm-latgen-faster --max-active=$maxactive --beam=$beam --lattice-beam=$latbeam --acoustic-scale=$acwt \ + --allow-partial=true --word-symbol-table=$lang/words.txt \ + $srcdir/final.mdl ark:- "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + +[ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; +local/score.sh --cmd "$cmd" $data $lang $dir + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/decode_fwdbwd.sh b/egs/chime_wsj0/s5/steps/decode_fwdbwd.sh new file mode 100755 index 000000000..b12f0270a --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_fwdbwd.sh @@ -0,0 +1,122 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey), BUT (Author: Mirko Hannemann) +# Apache 2.0 + +# Begin configuration section. +transform_dir= +first_pass= +iter= +model= # You can specify the model to use (e.g. if you want to use the .alimdl) +nj=4 +reverse=false +cmd=run.pl +max_active=7000 +beam=13.0 +latbeam=6.0 +acwt=0.083333 # note: only really affects pruning (scoring is on lattices). +extra_beam=0.0 # small additional beam over varying beam +max_beam=100.0 # maximum of varying beam +scoring_opts= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: steps/decode_fwdbwd.sh [options] " + echo "... where is assumed to be a sub-directory of the directory" + echo " where the model is." + echo "e.g.: steps/decode.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr" + echo "" + echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out" + echo "what type of features you used (assuming it's one of these two)" + echo "" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --first_pass # decoding dir of first pass" + echo " --nj # number of parallel jobs" + echo " --iter # Iteration of model to test." + echo " --model # which model to use (e.g. to" + echo " # specify the final.alimdl)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 
+ echo " --transform_dir # dir to find fMLLR transforms " + echo " # speaker-adapted decoding" + echo " --scoring-opts # options to local/score.sh" + echo " --reverse [true/false] # time reversal of features" + exit 1; +fi + + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # The model directory is one level up from decoding directory. +sdata=$data/split$nj; + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +if [ -z "$model" ]; then # if --model was not specified on the command line... + if [ -z $iter ]; then model=$srcdir/final.mdl; + else model=$srcdir/$iter.mdl; fi +fi + +for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fst $graphdir/words.txt; do + [ ! -f $f ] && echo "decode_fwdbwd.sh: no such file $f" && exit 1; +done + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "decode_fwdbwd.sh: feature type is $feat_type"; + +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then # add transforms to features... + echo "Using fMLLR transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." + [ "`cat $transform_dir/num_jobs`" -ne $nj ] && \ + echo "Mismatch in number of jobs with $transform_dir"; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" +fi +if $reverse; then + feats="$feats reverse-feats ark:- ark:- |" +fi + +if [ -f $first_pass/lat.1.gz ]; then + echo "converting first pass lattice to graph arc acceptor" + $cmd JOB=1:$nj $dir/log/arc_graph.JOB.log \ + time lattice-arcgraph $model $graphdir/HCLG.fst \ + "ark:gunzip -c $first_pass/lat.JOB.gz|" ark,t:$dir/lat.JOB.arcs || exit 1; + # --write-lattices=ark,t:$dir/lat.det + # --acoustic-scale=$acwt --lattice-beam=$latbeam --prune=false \ + + echo "decode with tracking first pass lattice" + $cmd JOB=1:$nj $dir/log/decode_fwdbwd.JOB.log \ + gmm-latgen-tracking --max-active=$max_active --beam=$beam --lattice-beam=$latbeam \ + --acoustic-scale=$acwt --allow-partial=true \ + --extra-beam=$extra_beam --max-beam=$max_beam \ + --word-symbol-table=$graphdir/words.txt --verbose=2 \ + $model $graphdir/HCLG.fst "$feats" ark:$dir/lat.JOB.arcs "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + +else + $cmd JOB=1:$nj $dir/log/decode.JOB.log \ + gmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$latbeam \ + --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt \ + $model $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; +fi + +[ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; +local/score.sh $scoring_opts --cmd "$cmd" --reverse $reverse $scoring_opts $data $graphdir $dir + +echo "Decoding done." 
+exit 0; diff --git a/egs/chime_wsj0/s5/steps/decode_nnet.sh b/egs/chime_wsj0/s5/steps/decode_nnet.sh new file mode 100755 index 000000000..e8f0d2865 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_nnet.sh @@ -0,0 +1,128 @@ +#!/bin/bash + +# Copyright 2012-2013 Karel Vesely, Daniel Povey +# Apache 2.0 + +# Begin configuration section. +nnet= # Optionally pre-select network to use for getting state-likelihoods +feature_transform= # Optionally pre-select feature transform (in front of nnet) +model= # Optionally pre-select transition model +class_frame_counts= # Optionally pre-select class-counts used to compute PDF priors + +stage=0 # stage=1 skips lattice generation +nj=4 +cmd=run.pl +max_active=7000 # maximum of active tokens +max_mem=50000000 # limit the fst-size to 50MB (larger fsts are minimized) +beam=13.0 # GMM:13.0 +latbeam=8.0 # GMM:6.0 +acwt=0.10 # GMM:0.0833, note: only really affects pruning (scoring is on lattices). +scoring_opts="--min-lmwt 4 --max-lmwt 15" +skip_scoring=false +use_gpu_id=-1 # disable gpu +parallel_opts="-pe smp 2" # use 2 CPUs (1 DNN-forward, 1 decoder) +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 [options] " + echo "... where is assumed to be a sub-directory of the directory" + echo " where the DNN + transition model is." + echo "e.g.: $0 exp/dnn1/graph_tgpr data/test exp/dnn1/decode_tgpr" + echo "" + echo "This script works on plain or modified features (CMN,delta+delta-delta)," + echo "which are then sent through feature-transform. It works out what type" + echo "of features you used from content of srcdir." + echo "" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo "" + echo " --nnet # which nnet to use (opt.)" + echo " --feature-transform # select transform in front of nnet (opt.)" + echo " --class-frame-counts # file with frame counts (used to compute priors) (opt.)" + echo " --model # which transition model to use (opt.)" + echo "" + echo " --acwt # select acoustic scale for decoding" + echo " --scoring-opts # options forwarded to local/score.sh" + exit 1; +fi + + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # The model directory is one level up from decoding directory. +sdata=$data/split$nj; + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +if [ -z "$nnet" ]; then # if --nnet was not specified on the command line... + nnet=$srcdir/final.nnet; +fi +[ -z "$nnet" ] && echo "Error nnet '$nnet' does not exist!" && exit 1; + +if [ -z "$model" ]; then # if --model was not specified on the command line... + model=$srcdir/final.mdl; +fi + +# find the feature_transform to use +if [ -z "$feature_transform" ]; then + feature_transform=$srcdir/final.feature_transform +fi +if [ ! -f $feature_transform ]; then + echo "Missing feature_transform '$feature_transform'" + exit 1 +fi + +# check that files exist +for f in $sdata/1/feats.scp $nnet_i $nnet $model $graphdir/HCLG.fst; do + [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; +done + +# PREPARE THE LOG-POSTERIOR COMPUTATION PIPELINE +if [ -z "$class_frame_counts" ]; then + class_frame_counts=$srcdir/ali_train_pdf.counts +else + echo "Overriding class_frame_counts by $class_frame_counts" +fi + +# Create the feature stream: +feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |" +# Optionally add cmvn +if [ -f $srcdir/norm_vars ]; then + norm_vars=$(cat $srcdir/norm_vars 2>/dev/null) + [ ! -f $sdata/1/cmvn.scp ] && echo "$0: cannot find cmvn stats $sdata/1/cmvn.scp" && exit 1 + feats="$feats apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |" +fi +# Optionally add deltas +if [ -f $srcdir/delta_order ]; then + delta_order=$(cat $srcdir/delta_order) + feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |" +fi + + +# Run the decoding in the queue +if [ $stage -le 0 ]; then + $cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \ + nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet "$feats" ark:- \| \ + latgen-faster-mapped --max-active=$max_active --max-mem=$max_mem --beam=$beam --lattice-beam=$latbeam \ + --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \ + $model $graphdir/HCLG.fst ark:- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; +fi + +# Run the scoring +if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir || exit 1; +fi + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/decode_nnet_cpu.sh b/egs/chime_wsj0/s5/steps/decode_nnet_cpu.sh new file mode 100755 index 000000000..8d2851608 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_nnet_cpu.sh @@ -0,0 +1,127 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does decoding with a neural-net. If the neural net was built on +# top of fMLLR transforms from a conventional system, you should provide the +# --transform-dir option. + +# Begin configuration section. +stage=1 +transform_dir= # dir to find fMLLR transforms. +nj=4 # number of decoding jobs. If --transform-dir set, must match that number! +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +cmd=run.pl +beam=15.0 +max_active=7000 + +#WARNING: This option is renamed lat_beam (it was renamed to follow the naming +# in the other scripts +lattice_beam=8.0 # Beam we use in lattice generation. +iter=final +num_threads=1 # if >1, will use gmm-latgen-faster-parallel +parallel_opts= # If you supply num-threads, you should supply this too. +scoring_opts= +skip_scoring=false +feat_type= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: steps/decode_nnet_cpu.sh [options] " + echo " e.g.: steps/decode_nnet_cpu.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\" + echo " exp/tri3b/graph_tgpr data/test_dev93 exp/tri4a_nnet/decode_dev93_tgpr" + echo "main options (for others, see top of script file)" + echo " --transform-dir # directory of previous decoding" + echo " # where we can find transforms for SAT systems." 
+ echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --parallel-opts # e.g. '-pe smp 4' if you supply --num-threads 4" + exit 1; +fi + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. +model=$srcdir/$iter.mdl + +for f in $graphdir/HCLG.fst $data/feats.scp $model; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + + +## Set up features. +if [ -z "$feat_type" ]; then + if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi + echo "$0: feature type is $feat_type" +fi + +case $feat_type in + raw) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";; + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + if [ "$feat_type" == "raw" ]; then + [ ! -f $transform_dir/raw_trans.1 ] && echo "$0: no such file $transform_dir/raw_trans.1" && exit 1; + [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ + && echo "$0: #jobs mismatch with transform-dir." && exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/raw_trans.JOB ark:- ark:- |" + else + [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; + [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ + && echo "$0: #jobs mismatch with transform-dir." && exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" + fi +elif grep 'transform-feats --utt2spk' $srcdir/log/train.1.log >&/dev/null; then + echo "$0: **WARNING**: you seem to be using a neural net system trained with transforms," + echo " but you are not providing the --transform-dir option in test time." +fi +## + + +if [ $stage -le 1 ]; then + $cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \ + nnet-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \ + --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt "$model" \ + $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; +fi + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. + + +if [ $stage -le 2 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." 
&& exit 1; + echo "score best paths" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/egs/chime_wsj0/s5/steps/decode_raw_fmllr.sh b/egs/chime_wsj0/s5/steps/decode_raw_fmllr.sh new file mode 100755 index 000000000..7cd929ed1 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_raw_fmllr.sh @@ -0,0 +1,235 @@ +#!/bin/bash + +# Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey) + +# This decoding script is like decode_fmllr.sh, but it does the fMLLR on +# the raw cepstra, using the model in the LDA+MLLT space +# +# Decoding script that does fMLLR. This can be on top of delta+delta-delta, or +# LDA+MLLT features. + +# There are 3 models involved potentially in this script, +# and for a standard, speaker-independent system they will all be the same. +# The "alignment model" is for the 1st-pass decoding and to get the +# Gaussian-level alignments for the "adaptation model" the first time we +# do fMLLR. The "adaptation model" is used to estimate fMLLR transforms +# and to generate state-level lattices. The lattices are then rescored +# with the "final model". +# +# The following table explains where we get these 3 models from. +# Note: $srcdir is one level up from the decoding directory. +# +# Model Default source: +# +# "alignment model" $srcdir/final.alimdl --alignment-model +# (or $srcdir/final.mdl if alimdl absent) +# "adaptation model" $srcdir/final.mdl --adapt-model +# "final model" $srcdir/final.mdl --final-model + + +# Begin configuration section +first_beam=10.0 # Beam used in initial, speaker-indep. pass +first_max_active=2000 # max-active used in initial pass. +first_max_arcs=-1 +alignment_model= +adapt_model= +final_model= +stage=0 +acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in + # lattice generation. +max_active=7000 +use_normal_fmllr=false +max_arcs=-1 +beam=13.0 +lattice_beam=6.0 +nj=4 +silence_weight=0.01 +cmd=run.pl +si_dir= +num_threads=1 # if >1, will use gmm-latgen-faster-parallel +parallel_opts= # If you supply num-threads, you should supply this too. +skip_scoring=false +scoring_opts= +norm_vars=false +# End configuration section +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: steps/decode_fmllr.sh [options] " + echo " e.g.: steps/decode_fmllr.sh exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b/decode_dev93_tgpr" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --adapt-model # Model to compute transforms with" + echo " --alignment-model # Model to get Gaussian-level alignments for" + echo " # 1st pass of transform computation." + echo " --final-model # Model to finally decode with" + echo " --si-dir # use this to skip 1st pass of decoding" + echo " # Caution-- must be with same tree" + echo " --acwt # default 0.08333 ... used to get posteriors" + echo " --num-threads # number of threads to use, default 1." + echo " --parallel-opts # e.g. '-pe smp 4' if you supply --num-threads 4" + echo " --scoring-opts # options to local/score.sh" + exit 1; +fi + + +graphdir=$1 +data=$2 +dir=`echo $3 | sed 's:/$::g'` # remove any trailing slash. 
+ +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. +sdata=$data/split$nj; + +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + + +mkdir -p $dir/log +split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. + +silphonelist=`cat $graphdir/phones/silence.csl` || exit 1; + +# Some checks. Note: we don't need $srcdir/tree but we expect +# it should exist, given the current structure of the scripts. +for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/tree; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +## Work out name of alignment model. ## +if [ -z "$alignment_model" ]; then + if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl; + else alignment_model=$srcdir/final.mdl; fi +fi +[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1; +## + +## Do the speaker-independent decoding, if --si-dir option not present. ## +if [ -z "$si_dir" ]; then # we need to do the speaker-independent decoding pass. + si_dir=${dir}.si # Name it as our decoding dir, but with suffix ".si". + if [ $stage -le 0 ]; then + steps/decode.sh --parallel-opts "$parallel_opts" --scoring-opts "$scoring_opts" \ + --num-threads $num_threads --skip-scoring $skip_scoring \ + --acwt $acwt --nj $nj --cmd "$cmd" --beam $first_beam \ + --model $alignment_model --max-arcs $max_arcs --max-active \ + $first_max_active $graphdir $data $si_dir || exit 1; + fi +fi +## + +## Some checks, and setting of defaults for variables. +[ "$nj" -ne "`cat $si_dir/num_jobs`" ] && echo "Mismatch in #jobs with si-dir" && exit 1; +[ ! -f "$si_dir/lat.1.gz" ] && echo "No such file $si_dir/lat.1.gz" && exit 1; +[ -z "$adapt_model" ] && adapt_model=$srcdir/final.mdl +[ -z "$final_model" ] && final_model=$srcdir/final.mdl +for f in $adapt_model $final_model; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done +## + +if [[ ! -f $srcdir/final.mat || ! -f $srcdir/full.mat ]]; then + echo "$0: we require final.mat and full.mat in the source directory $srcdir" +fi + +splicedfeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |" +sifeats="$splicedfeats transform-feats $srcdir/final.mat ark:- ark:- |" + +full_lda_mat="get-full-lda-mat --print-args=false $srcdir/final.mat $srcdir/full.mat -|" + +## + +## Now get the first-pass fMLLR transforms. +if [ $stage -le 1 ]; then + echo "$0: getting first-pass raw-fMLLR transforms." + $cmd JOB=1:$nj $dir/log/fmllr_pass1.JOB.log \ + gunzip -c $si_dir/lat.JOB.gz \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $alignment_model ark:- ark:- \| \ + gmm-post-to-gpost $alignment_model "$sifeats" ark:- ark:- \| \ + gmm-est-fmllr-raw-gpost --spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$full_lda_mat" \ + "$splicedfeats" ark,s,cs:- ark:$dir/pre_trans.JOB || exit 1; +fi +## + +pass1splicedfeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/pre_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- |" +pass1feats="$pass1splicedfeats transform-feats $srcdir/final.mat ark:- ark:- |" + +## Do the main lattice generation pass. 
Note: we don't determinize the lattices at +## this stage, as we're going to use them in acoustic rescoring with the larger +## model, and it's more correct to store the full state-level lattice for this purpose. +if [ $stage -le 2 ]; then + echo "$0: doing main lattice generation phase" + $cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \ + gmm-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \ + --acoustic-scale=$acwt --max-arcs=$max_arcs \ + --determinize-lattice=false --allow-partial=true --word-symbol-table=$graphdir/words.txt \ + $adapt_model $graphdir/HCLG.fst "$pass1feats" "ark:|gzip -c > $dir/lat.tmp.JOB.gz" \ + || exit 1; +fi +## + +## Do a second pass of estimating the transform-- this time with the lattices +## generated from the alignment model. Compose the transforms to get +## $dir/trans.1, etc. +if [ $stage -le 3 ]; then + echo "$0: estimating raw-fMLLR transforms a second time." + $cmd JOB=1:$nj $dir/log/fmllr_pass2.JOB.log \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=4.0 \ + "ark:gunzip -c $dir/lat.tmp.JOB.gz|" ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $adapt_model ark:- ark:- \| \ + gmm-est-fmllr-raw --spk2utt=ark:$sdata/JOB/spk2utt \ + $adapt_model "$full_lda_mat" "$pass1splicedfeats" ark,s,cs:- ark:$dir/trans_tmp.JOB '&&' \ + compose-transforms --b-is-affine=true ark:$dir/trans_tmp.JOB ark:$dir/pre_trans.JOB \ + ark:$dir/raw_trans.JOB || exit 1; +fi +## + +feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + +if [ $stage -le 4 ] && $use_normal_fmllr; then + echo "$0: estimating normal fMLLR transforms" + $cmd JOB=1:$nj $dir/log/fmllr_pass3.JOB.log \ + gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat.tmp.JOB.gz|" "$feats" ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=4.0 ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $adapt_model ark:- ark:- \| \ + gmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt \ + $adapt_model "$feats" ark,s,cs:- ark:$dir/trans.JOB || exit 1; +fi + +if $use_normal_fmllr; then + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" +fi + +# Rescore the state-level lattices with the final adapted features, and the final model +# (which by default is $srcdir/final.mdl, but which may be specified on the command line, +# useful in case of discriminatively trained systems). +# At this point we prune and determinize the lattices and write them out, ready for +# language model rescoring. + +if [ $stage -le 5 ]; then + echo "$0: doing a final pass of acoustic rescoring." + $cmd $parallel_opts JOB=1:$nj $dir/log/acoustic_rescore.JOB.log \ + gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat.tmp.JOB.gz|" "$feats" ark:- \| \ + lattice-determinize-pruned$thread_string --acoustic-scale=$acwt --beam=$lattice_beam ark:- \ + "ark:|gzip -c > $dir/lat.JOB.gz" '&&' rm $dir/lat.tmp.JOB.gz || exit 1; +fi + +if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "$0: not scoring because local/score.sh does not exist or not executable." 
&& exit 1; + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir +fi + +#rm $dir/{trans_tmp,pre_trans}.* + +exit 0; + diff --git a/egs/chime_wsj0/s5/steps/decode_sgmm.sh b/egs/chime_wsj0/s5/steps/decode_sgmm.sh new file mode 100755 index 000000000..ddb6a67e9 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_sgmm.sh @@ -0,0 +1,257 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# This script does decoding with an SGMM system, with speaker vectors. +# If the SGMM system was +# built on top of fMLLR transforms from a conventional system, you should +# provide the --transform-dir option. + +# Begin configuration section. +stage=1 +alignment_model= +transform_dir= # dir to find fMLLR transforms. +nj=4 # number of decoding jobs. +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +cmd=run.pl +beam=15.0 +gselect=15 # Number of Gaussian-selection indices for SGMMs. [Note: + # the first_pass_gselect variable is used for the 1st pass of + # decoding and can be tighter. +first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in + # the 1st pass of decoding (lattice generation). +max_active=7000 + +#WARNING: This option is renamed lat_beam (it was renamed to follow the naming +# in the other scripts +lattice_beam=8.0 # Beam we use in lattice generation. +vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for + # speaker-vector computation. Can be quite tight (actually we could + # probably just do best-path. +use_fmllr=false +fmllr_iters=10 +fmllr_min_count=1000 + +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: steps/decode_sgmm.sh [options] " + echo " e.g.: steps/decode_sgmm.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\" + echo " exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr" + echo "main options (for others, see top of script file)" + echo " --transform-dir # directory of previous decoding" + echo " # where we can find transforms for SAT systems." + echo " --alignment-model # Model for the first-pass decoding." + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 13.0" + exit 1; +fi + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. + +for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/final.mdl; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +silphonelist=`cat $graphdir/phones/silence.csl` || exit 1 +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` +gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" +gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |" + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + + +## Set up features. 
+if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; + [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ + && echo "$0: #jobs mismatch with transform-dir." && exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" +elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then + echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," + echo " but you are not providing the --transform-dir option in test time." +fi +## + +## Calculate FMLLR pre-transforms if needed. We are doing this here since this +## step is requried by models both with and without speaker vectors +if $use_fmllr; then + if [ ! -f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then + echo "$0: computing pre-transform for fMLLR computation." + sgmm-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1; + fi +fi + +## Save Gaussian-selection info to disk. +# Note: we can use final.mdl regardless of whether there is an alignment model-- +# they use the same UBM. +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ + sgmm-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \ + "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; +fi + +## Work out name of alignment model. ## +if [ -z "$alignment_model" ]; then + if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl; + else alignment_model=$srcdir/final.mdl; fi +fi +[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1; + +# Generate state-level lattice which we can rescore. This is done with the +# alignment model and no speaker-vectors. +if [ $stage -le 2 ]; then + $cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \ + sgmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \ + --acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt "$gselect_opt_1stpass" $alignment_model \ + $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1; +fi + +## Check if the model has speaker vectors +spkdim=`sgmm-info $srcdir/final.mdl | grep 'speaker vector' | awk '{print $NF}'` + +if [ $spkdim -gt 0 ]; then ### For models with speaker vectors: + +# Estimate speaker vectors (1st pass). Prune before determinizing +# because determinization can take a while on un-pruned lattices. +# Note: the sgmm-post-to-gpost stage is necessary because we have +# a separate alignment-model and final model, otherwise we'd skip it +# and use sgmm-est-spkvecs. 
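+# For comparison (an illustrative sketch only, not executed): if there were no
+# separate alignment model, the gpost conversion could be dropped and the
+# speaker vectors estimated directly from posteriors, roughly:
+#   ... | lattice-to-post --acoustic-scale=$acwt ark:- ark:- | \
+#   weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- | \
+#   sgmm-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \
+#     $srcdir/final.mdl "$feats" ark,s,cs:- ark:$dir/pre_vecs.JOB
+# Because final.alimdl and final.mdl differ here, we go through sgmm-post-to-gpost
+# so the Gaussian-level posteriors come from the alignment model.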
+ if [ $stage -le 3 ]; then + $cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $alignment_model ark:- ark:- \| \ + sgmm-post-to-gpost "$gselect_opt" $alignment_model "$feats" ark:- ark:- \| \ + sgmm-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \ + $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1; + fi + +# Estimate speaker vectors (2nd pass). Since we already have spk vectors, +# at this point we need to rescore the lattice to get the correct posteriors. + if [ $stage -le 4 ]; then + $cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + sgmm-rescore-lattice --spk-vecs=ark:$dir/pre_vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \ + "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ + sgmm-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \ + $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1; + fi + rm $dir/pre_vecs.* + + if $use_fmllr; then + # Estimate fMLLR transforms (note: these may be on top of any + # fMLLR transforms estimated with the baseline GMM system. + if [ $stage -le 5 ]; then # compute fMLLR transforms. + echo "$0: computing fMLLR transforms." + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + sgmm-rescore-lattice --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \ + "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ + sgmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \ + --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \ + $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1; + fi + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" + fi + +# Now rescore the state-level lattices with the adapted features and the +# corresponding model. Prune and determinize the lattices to limit +# their size. + if [ $stage -le 6 ]; then + $cmd JOB=1:$nj $dir/log/rescore.JOB.log \ + sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \ + $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + fi + rm $dir/pre_lat.*.gz + +else ### For models without speaker vectors: + + if $use_fmllr; then + # Estimate fMLLR transforms (note: these may be on top of any + # fMLLR transforms estimated with the baseline GMM system. + if [ $stage -le 5 ]; then # compute fMLLR transforms. + echo "$0: computing fMLLR transforms." 
+ $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + sgmm-rescore-lattice --utt2spk=ark:$sdata/JOB/utt2spk \ + "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ + sgmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \ + --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \ + $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1; + fi + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" + fi + +# Now rescore the state-level lattices with the adapted features and the +# corresponding model. Prune and determinize the lattices to limit +# their size. + if [ $stage -le 6 ] && $use_fmllr; then + $cmd JOB=1:$nj $dir/log/rescore.JOB.log \ + sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk \ + $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + rm $dir/pre_lat.*.gz + else # Already done with decoding if no adaptation needed. + for n in `seq 1 $nj`; do + mv $dir/pre_lat.${n}.gz $dir/lat.${n}.gz + done + fi + +fi + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. + + +if [ $stage -le 7 ]; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + local/score.sh --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + #local/score_sclite_conf.sh --cmd "$cmd" --language turkish $data $graphdir $dir +fi +echo "Decoding done." +exit 0; diff --git a/egs/chime_wsj0/s5/steps/decode_sgmm2.sh b/egs/chime_wsj0/s5/steps/decode_sgmm2.sh new file mode 100755 index 000000000..490b582d2 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_sgmm2.sh @@ -0,0 +1,211 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# This script does decoding with an SGMM system, with speaker vectors. +# If the SGMM system was +# built on top of fMLLR transforms from a conventional system, you should +# provide the --transform-dir option. + +# Begin configuration section. +stage=1 +transform_dir= # dir to find fMLLR transforms. +nj=4 # number of decoding jobs. +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +cmd=run.pl +beam=13.0 +gselect=15 # Number of Gaussian-selection indices for SGMMs. [Note: + # the first_pass_gselect variable is used for the 1st pass of + # decoding and can be tighter. +first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in + # the 1st pass of decoding (lattice generation). +max_active=7000 +max_arcs=-1 + +#WARNING: This option is renamed lat_beam (it was renamed to follow the naming +# in the other scripts +lattice_beam=6.0 # Beam we use in lattice generation. +vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for + # speaker-vector computation. Can be quite tight (actually we could + # probably just do best-path. 
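+# (For orientation: "beam" is the search beam used during first-pass lattice
+# generation; "lattice_beam" bounds what is kept in the state-level lattices and
+# is reused when the final lattices are determinized; "vecs_beam" only prunes the
+# lattice copies used to gather posteriors for speaker-vector and fMLLR
+# estimation, which is why it can be much tighter.)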
+use_fmllr=false +fmllr_iters=10 +fmllr_min_count=1000 +num_threads=1 # if >1, will use gmm-latgen-faster-parallel +parallel_opts= # If you supply num-threads, you should supply this too. +skip_scoring=false +scoring_opts= +# note: there are no more min-lmwt and max-lmwt options, instead use +# e.g. --scoring-opts "--min-lmwt 1 --max-lmwt 20" +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: steps/decode_sgmm2.sh [options] " + echo " e.g.: steps/decode_sgmm2.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\" + echo " exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr" + echo "main options (for others, see top of script file)" + echo " --transform-dir # directory of previous decoding" + echo " # where we can find transforms for SAT systems." + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 13.0" + exit 1; +fi + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. + +for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/final.mdl; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +silphonelist=`cat $graphdir/phones/silence.csl` || exit 1 +gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" +gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |" + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + +## Set up features. +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then + [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ + && echo "$0: #jobs mismatch with transform-dir." && exit 1; + if [ -f $transform_dir/trans.1 ]; then + echo "$0: using transforms from $transform_dir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" + elif [ -f $transform_dir/raw_trans.1 ]; then + feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + else + echo "$0: no such file $transform_dir/trans.1 or $transform_dir/raw_trans.1, invalid --transform-dir option?" 
+ exit 1; + fi +elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then + echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," + echo " but you are not providing the --transform-dir option in test time." +fi +## + +## Save Gaussian-selection info to disk. +# Note: we can use final.mdl regardless of whether there is an alignment model-- +# they use the same UBM. + +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ + sgmm2-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \ + "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; +fi + +# Generate state-level lattice which we can rescore. This is done with the alignment +# model and no speaker-vectors. +if [ $stage -le 2 ]; then + $cmd $parallel_opts JOB=1:$nj $dir/log/decode_pass1.JOB.log \ + sgmm2-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \ + --max-arcs=$max_arcs --acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt "$gselect_opt_1stpass" $srcdir/final.alimdl \ + $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1; +fi + +# Estimate speaker vectors (1st pass). Prune before determinizing +# because determinization can take a while on un-pruned lattices. +# Note: the sgmm2-post-to-gpost stage is necessary because we have +# a separate alignment-model and final model, otherwise we'd skip it +# and use sgmm2-est-spkvecs. +if [ $stage -le 3 ]; then + $cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $srcdir/final.alimdl ark:- ark:- \| \ + sgmm2-post-to-gpost "$gselect_opt" $srcdir/final.alimdl "$feats" ark:- ark:- \| \ + sgmm2-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \ + $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1; +fi + +# Estimate speaker vectors (2nd pass). Since we already have spk vectors, +# at this point we need to rescore the lattice to get the correct posteriors. +if [ $stage -le 4 ]; then + $cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + sgmm2-rescore-lattice --spk-vecs=ark:$dir/pre_vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \ + "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ + sgmm2-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \ + $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1; +fi +rm $dir/pre_vecs.* + +if $use_fmllr; then + # Estimate fMLLR transforms (note: these may be on top of any + # fMLLR transforms estimated with the baseline GMM system. + if [ $stage -le 5 ]; then # compute fMLLR transforms. + echo "$0: computing fMLLR transforms." + if [ ! -f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then + echo "$0: computing pre-transform for fMLLR computation." 
+ sgmm2-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1; + fi + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + sgmm2-rescore-lattice --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \ + "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ + sgmm2-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \ + --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \ + $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1; + fi + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" +fi + +# Now rescore the state-level lattices with the adapted features and the +# corresponding model. Prune and determinize the lattices to limit +# their size. +if [ $stage -le 6 ]; then + $cmd $parallel_opts JOB=1:$nj $dir/log/rescore.JOB.log \ + sgmm2-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \ + $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \ + lattice-determinize-pruned$thread_string --acoustic-scale=$acwt --beam=$lattice_beam ark:- \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; +fi +rm $dir/pre_lat.*.gz + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at different +# acoustic scales to get the final output. + + +if [ $stage -le 7 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + fi +fi + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/decode_sgmm2_fromlats.sh b/egs/chime_wsj0/s5/steps/decode_sgmm2_fromlats.sh new file mode 100755 index 000000000..8db01d4a0 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_sgmm2_fromlats.sh @@ -0,0 +1,270 @@ +#!/bin/bash + +# Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# This script does decoding with an SGMM2 system, with speaker vectors. If the +# SGMM2 system was built on top of fMLLR transforms from a conventional system, +# you should provide the --transform-dir option. + +# This script does not use a decoding graph, but instead you provide +# a previous decoding directory with lattices in it. This script will only +# make use of the word sequences in the lattices; it limits the decoding +# to those sequences. You should also provide a "lang" directory from +# which this script will use the G.fst and L.fst. + +# Begin configuration section. +stage=1 +alignment_model= +transform_dir= # dir to find fMLLR transforms. +acwt=0.08333 # Just a default value, used for adaptation and beam-pruning.. +batch_size=75 # Limits memory blowup in compile-train-graphs-fsts +cmd=run.pl +beam=20.0 +gselect=15 # Number of Gaussian-selection indices for SGMMs. [Note: + # the first_pass_gselect variable is used for the 1st pass of + # decoding and can be tighter. +first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in + # the 1st pass of decoding (lattice generation). +max_active=7000 +lattice_beam=8.0 # Beam we use in lattice generation. 
+vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for + # speaker-vector computation. Can be quite tight (actually we could + # probably just do best-path. +use_fmllr=false +fmllr_iters=10 +fmllr_min_count=1000 +scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" + +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# -ne 4 ]; then + echo "Usage: steps/decode_sgmm_fromlats.sh [options] " + echo "" + echo "main options (for others, see top of script file)" + echo " --transform-dir # directory of previous decoding" + echo " # where we can find transforms for SAT systems." + echo " --alignment-model # Model for the first-pass decoding." + echo " --config # config containing options" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 13.0" + exit 1; +fi + +data=$1 +lang=$2 +olddir=$3 +dir=$4 +srcdir=`dirname $dir` + +for f in $data/feats.scp $lang/G.fst $lang/L_disambig.fst $lang/phones/disambig.int \ + $srcdir/final.mdl $srcdir/tree $olddir/lat.1.gz; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +nj=`cat $olddir/num_jobs` || exit 1; +sdata=$data/split$nj; +silphonelist=`cat $lang/phones/silence.csl` || exit 1 +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` +gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" +gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |" + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + + +## Set up features + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" +if [ -z "$transform_dir" ] && [ -f $olddir/trans.1 ]; then + transform_dir=$olddir +fi + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; + [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ + && echo "$0: #jobs mismatch with transform-dir." && exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" +elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then + echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," + echo " but you are not providing the --transform-dir option in test time." +fi + +## Calculate FMLLR pre-transforms if needed. We are doing this here since this +## step is requried by models both with and without speaker vectors +if $use_fmllr; then + if [ ! -f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then + echo "$0: computing pre-transform for fMLLR computation." + sgmm2-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1; + fi +fi + +## Save Gaussian-selection info to disk. 
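+# (Background note: Gaussian selection precomputes, per frame, the indices of the
+# $gselect best-scoring components of the shared background GMM; later SGMM2
+# stages then evaluate only those components, which keeps lattice generation and
+# rescoring tractable.  The gzipped archives written below are re-read through
+# $gselect_opt in every subsequent stage.)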
+# Note: we can use final.mdl regardless of whether there is an alignment model-- +# they use the same UBM. +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ + sgmm2-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \ + "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; +fi + +## Work out name of alignment model. ## +if [ -z "$alignment_model" ]; then + if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl; + else alignment_model=$srcdir/final.mdl; fi +fi +[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1; + +# Generate state-level lattice which we can rescore. This is done with the +# alignment model and no speaker-vectors. +if [ $stage -le 2 ]; then + $cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \ + lattice-to-fst "ark:gunzip -c $olddir/lat.JOB.gz|" ark:- \| \ + fsttablecompose "fstproject --project_output=true $lang/G.fst | fstarcsort |" ark:- ark:- \| \ + fstdeterminizestar ark:- ark:- \| \ + compile-train-graphs-fsts --read-disambig-syms=$lang/phones/disambig.int \ + --batch-size=$batch_size $scale_opts \ + $srcdir/tree $srcdir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \ + sgmm2-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \ + --acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \ + --word-symbol-table=$lang/words.txt "$gselect_opt_1stpass" $alignment_model \ + "ark:-" "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1; +fi + +## Check if the model has speaker vectors +spkdim=`sgmm2-info $srcdir/final.mdl | grep 'speaker vector' | awk '{print $NF}'` + +if [ $spkdim -gt 0 ]; then ### For models with speaker vectors: + +# Estimate speaker vectors (1st pass). Prune before determinizing +# because determinization can take a while on un-pruned lattices. +# Note: the sgmm2-post-to-gpost stage is necessary because we have +# a separate alignment-model and final model, otherwise we'd skip it +# and use sgmm2-est-spkvecs. + if [ $stage -le 3 ]; then + $cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $alignment_model ark:- ark:- \| \ + sgmm2-post-to-gpost "$gselect_opt" $alignment_model "$feats" ark:- ark:- \| \ + sgmm2-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \ + $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1; + fi + +# Estimate speaker vectors (2nd pass). Since we already have spk vectors, +# at this point we need to rescore the lattice to get the correct posteriors. 
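+# Sketch of the flow (comments only): pre_lat.JOB.gz still carries acoustic
+# scores computed without speaker vectors, so we first rescore it with
+# sgmm2-rescore-lattice using pre_vecs.JOB and only then convert to posteriors
+# and re-estimate the vectors; skipping the rescoring would leave the posteriors
+# based on the unadapted acoustic scores.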
+ if [ $stage -le 4 ]; then + $cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + sgmm2-rescore-lattice --spk-vecs=ark:$dir/pre_vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \ + "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ + sgmm2-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \ + $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1; + fi + rm $dir/pre_vecs.* + + if $use_fmllr; then + # Estimate fMLLR transforms (note: these may be on top of any + # fMLLR transforms estimated with the baseline GMM system. + if [ $stage -le 5 ]; then # compute fMLLR transforms. + echo "$0: computing fMLLR transforms." + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + sgmm2-rescore-lattice --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \ + "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ + sgmm2-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \ + --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \ + $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1; + fi + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" + fi + +# Now rescore the state-level lattices with the adapted features and the +# corresponding model. Prune and determinize the lattices to limit +# their size. + if [ $stage -le 6 ]; then + $cmd JOB=1:$nj $dir/log/rescore.JOB.log \ + sgmm2-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \ + $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + fi + rm $dir/pre_lat.*.gz + +else ### For models without speaker vectors: + + if $use_fmllr; then + # Estimate fMLLR transforms (note: these may be on top of any + # fMLLR transforms estimated with the baseline GMM system. + if [ $stage -le 5 ]; then # compute fMLLR transforms. + echo "$0: computing fMLLR transforms." 
+ $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + sgmm2-rescore-lattice --utt2spk=ark:$sdata/JOB/utt2spk \ + "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ + sgmm2-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \ + --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \ + $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1; + fi + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" + fi + +# Now rescore the state-level lattices with the adapted features and the +# corresponding model. Prune and determinize the lattices to limit +# their size. + if [ $stage -le 6 ] && $use_fmllr; then + $cmd JOB=1:$nj $dir/log/rescore.JOB.log \ + sgmm2-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk \ + $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + rm $dir/pre_lat.*.gz + else # Already done with decoding if no adaptation needed. + for n in `seq 1 $nj`; do + mv $dir/pre_lat.${n}.gz $dir/lat.${n}.gz + done + fi + +fi + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. + + +if [ $stage -le 7 ]; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + local/score.sh --cmd "$cmd" $data $lang $dir + echo "score confidence and timing with sclite" + #local/score_sclite_conf.sh --cmd "$cmd" --language turkish $data $lang $dir +fi +echo "Decoding done." +exit 0; diff --git a/egs/chime_wsj0/s5/steps/decode_sgmm2_rescore.sh b/egs/chime_wsj0/s5/steps/decode_sgmm2_rescore.sh new file mode 100755 index 000000000..4a752fd06 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_sgmm2_rescore.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# This script does decoding with an SGMM system, by rescoring lattices +# generated from a previous SGMM system. The directory with the lattices +# is assumed to contain speaker vectors, if used. Basically it rescores +# the lattices one final time, using the same setup as the final decoding +# pass of the source dir. The assumption is that the model may have +# been discriminatively trained. + +# If the system was built on top of fMLLR transforms from a conventional system, +# you should provide the --transform-dir option. + +# Begin configuration section. +transform_dir= # dir to find fMLLR transforms. +cmd=run.pl +iter=final +skip_scoring=false +scoring_opts= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. 
parse_options.sh || exit 1; + +if [ $# -ne 4 ]; then + echo "Usage: steps/decode_sgmm_rescore.sh [options] " + echo " e.g.: steps/decode_sgmm_rescore.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\" + echo " exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr exp/sgmm3a_mmi/decode_dev93_tgpr" + echo "main options (for others, see top of script file)" + echo " --transform-dir # directory of previous decoding" + echo " # where we can find transforms for SAT systems." + echo " --config # config containing options" + echo " --cmd # Command to run in parallel with" + echo " --iter # iteration of model to use (default: final)" + exit 1; +fi + +graphdir=$1 +data=$2 +olddir=$3 +dir=$4 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. + +for f in $graphdir/words.txt $data/feats.scp $olddir/lat.1.gz $olddir/gselect.1.gz \ + $srcdir/$iter.mdl; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +nj=`cat $olddir/num_jobs` || exit 1; +sdata=$data/split$nj; +gselect_opt="--gselect=ark,s,cs:gunzip -c $olddir/gselect.JOB.gz|" +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +if [ -f $olddir/vecs.1 ]; then + echo "$0: using speaker vectors from $olddir" + spkvecs_opt="--spk-vecs=ark:$olddir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk" +else + echo "$0: no speaker vectors found." + spkvecs_opt= +fi + + +## Set up features. +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; + [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ + && echo "$0: #jobs mismatch with transform-dir." && exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" +elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then + echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," + echo " but you are not providing the --transform-dir option in test time." +fi + +if [ -f $olddir/trans.1 ]; then + echo "$0: using (in addition to any previous transforms) transforms from $olddir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$olddir/trans.JOB ark:- ark:- |" +fi +## + +# Rescore the state-level lattices with the model provided. Just +# one command in this script. +echo "$0: rescoring lattices with SGMM model in $srcdir/$iter.mdl" +$cmd JOB=1:$nj $dir/log/rescore.JOB.log \ + sgmm2-rescore-lattice "$gselect_opt" $spkvecs_opt \ + $srcdir/$iter.mdl "ark:gunzip -c $olddir/lat.JOB.gz|" "$feats" \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + +if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." 
&& exit 1; + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir +fi + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/decode_sgmm2_rescore_project.sh b/egs/chime_wsj0/s5/steps/decode_sgmm2_rescore_project.sh new file mode 100755 index 000000000..eb8347f75 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_sgmm2_rescore_project.sh @@ -0,0 +1,172 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# This script does decoding with an SGMM system, by rescoring lattices +# generated from a previous SGMM system. This version does the "predictive" +# SGMM, where we subtract some constant times the log-prob of the left +# few spliced frames, and the same for the right few. +# The directory with the lattices +# is assumed to contain any speaker vectors, if used. This script just +# adds into the acoustic scores, (some constant, default -0.25) times +# the acoustic score of the left model, and the same for the right model. + +# the lattices one final time, using the same setup as the final decoding +# pass of the source dir. The assumption is that the model may have +# been discriminatively trained. + +# If the system was built on top of fMLLR transforms from a conventional system, +# you should provide the --transform-dir option. + +# Begin configuration section. +stage=0 +transform_dir= # dir to find fMLLR transforms. +cmd=run.pl +iter=final +prob_scale=-0.25 +dimensions=0:13:104:117 +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# -ne 5 ]; then + echo "Usage: steps/decode_sgmm_rescore_project.sh [options] " + echo " e.g.: steps/decode_sgmm_rescore_project.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\" + echo " exp/tri2b/full.mat exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr exp/sgmm3a/decode_dev93_tgpr_predict" + echo "main options (for others, see top of script file)" + echo " --transform-dir # directory of previous decoding" + echo " # where we can find transforms for SAT systems." + echo " --config # config containing options" + echo " --cmd # Command to run in parallel with" + echo " --prob-scale # Default -0.25, scale on left and right models." + exit 1; +fi + +full_lda_mat=$1 +graphdir=$2 +data=$3 +olddir=$4 +dir=$5 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. + +for f in $full_lda_mat $graphdir/words.txt $data/feats.scp $olddir/lat.1.gz \ + $olddir/gselect.1.gz $srcdir/$iter.mdl; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +nj=`cat $olddir/num_jobs` || exit 1; +sdata=$data/split$nj; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +if [ -f $olddir/vecs.1 ]; then + echo "$0: using speaker vectors from $olddir" + spkvecs_opt="--spk-vecs=ark:$olddir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk" +else + echo "$0: no speaker vectors found." + spkvecs_opt= +fi + +if [ $stage -le 0 ]; then + # Get full LDA+MLLT mat and its inverse. Note: the full LDA+MLLT mat is + # the LDA+MLLT mat, plus the "rejected" rows of the LDA matrix. 
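+  # Worked example (assuming the default front end mentioned below: 13-dim MFCCs
+  # spliced across 9 frames, i.e. 13*9=117 spliced dims, the "117 in normal case"
+  # noted further down): final.mat keeps only the LDA+MLLT rows that were
+  # retained, get-full-lda-mat appends the "rejected" rows so the transform
+  # covers the full 117-dim spliced space, and full_inv.mat is used later to map
+  # back into that spliced space before applying left.mat / right.mat.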
+ $cmd $dir/log/get_full_lda.log \ + get-full-lda-mat $srcdir/final.mat $full_lda_mat $dir/full.mat $dir/full_inv.mat || exit 1; +fi + +if [ $stage -le 1 ]; then + left_start=`echo $dimensions | cut '-d:' -f 1`; + left_end=`echo $dimensions | cut '-d:' -f 2`; + right_start=`echo $dimensions | cut '-d:' -f 3`; + right_end=`echo $dimensions | cut '-d:' -f 4`; + + # Prepare left and right models. For now, the dimensions are hardwired (e.g., 13 MFCCs and splice 9 frames). + # Note: the choice of dividing by the prob of the left 4 and the right 4 frames is a bit arbitrary and + # we could investigate different configurations. + $cmd $dir/log/left.log \ + sgmm2-project --start-dim=$left_start --end-dim=$left_end $srcdir/final.mdl $dir/full.mat $dir/left.mdl $dir/left.mat || exit 1; + $cmd $dir/log/right.log \ + sgmm2-project --start-dim=$right_start --end-dim=$right_end $srcdir/final.mdl $dir/full.mat $dir/right.mdl $dir/right.mat || exit 1; +fi + + +# we apply the scaling on the new acoustic probs by adding the inverse +# of that to the old acoustic probs, and then later inverting again. +# this has to do with limitations in sgmm2-rescore-lattice: we can only +# scale the *old* acoustic probs, not the new ones. +inverse_prob_scale=`perl -e "print (1.0 / $prob_scale);"` +cur_lats="ark:gunzip -c $olddir/lat.JOB.gz | lattice-scale --acoustic-scale=$inverse_prob_scale ark:- ark:- |" + +## Set up features. Note: we only support LDA+MLLT features, this +## is inherent in the method, we could not support deltas. + +for model_type in left right; do + + feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |" # spliced features. + if [ ! -z "$transform_dir" ]; then # using speaker-specific transforms. + # we want to transform in the sequence: $dir/full.mat, then the result of + # (extend-transform-dim $transform_dir/trans.JOB), then $dir/full_inv.mat to + # get back to the spliced space, then the left.mat or right.mat. But + # note that compose-transforms operates in matrix-multiplication order, + # which is opposite from the "order of applying the transforms" order. + new_dim=$[`copy-matrix --binary=false $dir/full.mat - | wc -l` - 1]; # 117 in normal case. + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk 'ark:extend-transform-dim --new-dimension=$new_dim ark:$transform_dir/trans.JOB ark:- | compose-transforms ark:- $dir/full.mat ark:- | compose-transforms $dir/full_inv.mat ark:- ark:- | compose-transforms $dir/${model_type}.mat ark:- ark:- |' ark:- ark:- |" + else # else, we transform with the "left" or "right" matrix; these transform from the + # spliced space. + feats="$feats transform-feats $dir/${model_type}.mat |" + # If we don't have the --transform-dir option, make sure the model was + # trained in the same way. + if grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then + echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," + echo " but you are not providing the --transform-dir option in test time." + fi + fi + if [ -f $olddir/trans.1 ]; then + echo "$0: warning: not using transforms in $olddir (this is just a " + echo " limitation of the script right now, and could be fixed)." + fi + + if [ $stage -le 2 ]; then + echo "Getting gselect info for $model_type model." 
+ $cmd JOB=1:$nj $dir/log/gselect.$model_type.JOB.log \ + sgmm2-gselect $dir/$model_type.mdl "$feats" \ + "ark,t:|gzip -c >$dir/gselect.$model_type.JOB.gz" || exit 1; + fi + gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.$model_type.JOB.gz|" + + + # Rescore the state-level lattices with the model provided. Just + # one command in this script. + # The --old-acoustic-scale=1.0 option means we just add the scores + # to the old scores. + if [ $stage -le 3 ]; then + echo "$0: rescoring lattices with $model_type model" + $cmd JOB=1:$nj $dir/log/rescore.${model_type}.JOB.log \ + sgmm2-rescore-lattice --old-acoustic-scale=1.0 "$gselect_opt" $spkvecs_opt \ + $dir/$model_type.mdl "$cur_lats" "$feats" \ + "ark:|gzip -c > $dir/lat.${model_type}.JOB.gz" || exit 1; + fi + cur_lats="ark:gunzip -c $dir/lat.${model_type}.JOB.gz |" +done + +if [ $stage -le 4 ]; then + echo "$0: getting final lattices." + $cmd JOB=1:$nj $dir/log/scale_lats.JOB.log \ + lattice-scale --acoustic-scale=$prob_scale "$cur_lats" "ark:|gzip -c >$dir/lat.JOB.gz" \ + || exit 1; +fi + +rm $dir/lat.{left,right}.*.gz 2>/dev/null # note: if these still exist, it will + # confuse the scoring script. + +[ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; +local/score.sh --cmd "$cmd" $data $graphdir $dir + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/decode_sgmm_fromlats.sh b/egs/chime_wsj0/s5/steps/decode_sgmm_fromlats.sh new file mode 100755 index 000000000..a926ed618 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_sgmm_fromlats.sh @@ -0,0 +1,273 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# This script does decoding with an SGMM system, with speaker vectors. +# If the SGMM system was +# built on top of fMLLR transforms from a conventional system, you should +# provide the --transform-dir option. +# This script does not use a decoding graph, but instead you provide +# a previous decoding directory with lattices in it. This script will only +# make use of the word sequences in the lattices; it limits the decoding +# to those sequences. You should also provide a "lang" directory from +# which this script will use the G.fst and L.fst. + +# Begin configuration section. +stage=1 +alignment_model= +transform_dir= # dir to find fMLLR transforms. +acwt=0.08333 # Just a default value, used for adaptation and beam-pruning.. +batch_size=75 # Limits memory blowup in compile-train-graphs-fsts +cmd=run.pl +beam=20.0 +gselect=15 # Number of Gaussian-selection indices for SGMMs. [Note: + # the first_pass_gselect variable is used for the 1st pass of + # decoding and can be tighter. +first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in + # the 1st pass of decoding (lattice generation). +max_active=7000 + +#WARNING: This option is renamed lat_beam (it was renamed to follow the naming +# in the other scripts +lattice_beam=8.0 # Beam we use in lattice generation. +vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for + # speaker-vector computation. Can be quite tight (actually we could + # probably just do best-path. +use_fmllr=false +fmllr_iters=10 +fmllr_min_count=1000 +scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" + +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. 
parse_options.sh || exit 1; + +if [ $# -ne 4 ]; then + echo "Usage: steps/decode_sgmm_fromlats.sh [options] " + echo "" + echo "main options (for others, see top of script file)" + echo " --transform-dir # directory of previous decoding" + echo " # where we can find transforms for SAT systems." + echo " --alignment-model # Model for the first-pass decoding." + echo " --config # config containing options" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 13.0" + exit 1; +fi + +data=$1 +lang=$2 +olddir=$3 +dir=$4 +srcdir=`dirname $dir` + +for f in $data/feats.scp $lang/G.fst $lang/L_disambig.fst $lang/phones/disambig.int \ + $srcdir/final.mdl $srcdir/tree $olddir/lat.1.gz; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +nj=`cat $olddir/num_jobs` || exit 1; +sdata=$data/split$nj; +silphonelist=`cat $lang/phones/silence.csl` || exit 1 +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` +gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" +gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |" + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + + +## Set up features + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" +if [ -z "$transform_dir" ] && [ -f $olddir/trans.1 ]; then + transform_dir=$olddir +fi + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; + [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ + && echo "$0: #jobs mismatch with transform-dir." && exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" +elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then + echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," + echo " but you are not providing the --transform-dir option in test time." +fi + +## Calculate FMLLR pre-transforms if needed. We are doing this here since this +## step is requried by models both with and without speaker vectors +if $use_fmllr; then + if [ ! -f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then + echo "$0: computing pre-transform for fMLLR computation." + sgmm-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1; + fi +fi + +## Save Gaussian-selection info to disk. +# Note: we can use final.mdl regardless of whether there is an alignment model-- +# they use the same UBM. +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ + sgmm-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \ + "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; +fi + +## Work out name of alignment model. 
## +if [ -z "$alignment_model" ]; then + if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl; + else alignment_model=$srcdir/final.mdl; fi +fi +[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1; + +# Generate state-level lattice which we can rescore. This is done with the +# alignment model and no speaker-vectors. +if [ $stage -le 2 ]; then + $cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \ + lattice-to-fst "ark:gunzip -c $olddir/lat.JOB.gz|" ark:- \| \ + fsttablecompose "fstproject --project_output=true $lang/G.fst | fstarcsort |" ark:- ark:- \| \ + fstdeterminizestar ark:- ark:- \| \ + compile-train-graphs-fsts --read-disambig-syms=$lang/phones/disambig.int \ + --batch-size=$batch_size $scale_opts \ + $srcdir/tree $srcdir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \ + sgmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \ + --acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \ + --word-symbol-table=$lang/words.txt "$gselect_opt_1stpass" $alignment_model \ + "ark:-" "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1; +fi + +## Check if the model has speaker vectors +spkdim=`sgmm-info $srcdir/final.mdl | grep 'speaker vector' | awk '{print $NF}'` + +if [ $spkdim -gt 0 ]; then ### For models with speaker vectors: + +# Estimate speaker vectors (1st pass). Prune before determinizing +# because determinization can take a while on un-pruned lattices. +# Note: the sgmm-post-to-gpost stage is necessary because we have +# a separate alignment-model and final model, otherwise we'd skip it +# and use sgmm-est-spkvecs. + if [ $stage -le 3 ]; then + $cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $alignment_model ark:- ark:- \| \ + sgmm-post-to-gpost "$gselect_opt" $alignment_model "$feats" ark:- ark:- \| \ + sgmm-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \ + $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1; + fi + +# Estimate speaker vectors (2nd pass). Since we already have spk vectors, +# at this point we need to rescore the lattice to get the correct posteriors. + if [ $stage -le 4 ]; then + $cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + sgmm-rescore-lattice --spk-vecs=ark:$dir/pre_vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \ + "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ + sgmm-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \ + $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1; + fi + rm $dir/pre_vecs.* + + if $use_fmllr; then + # Estimate fMLLR transforms (note: these may be on top of any + # fMLLR transforms estimated with the baseline GMM system. + if [ $stage -le 5 ]; then # compute fMLLR transforms. + echo "$0: computing fMLLR transforms." 
+ $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + sgmm-rescore-lattice --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \ + "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ + sgmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \ + --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \ + $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1; + fi + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" + fi + +# Now rescore the state-level lattices with the adapted features and the +# corresponding model. Prune and determinize the lattices to limit +# their size. + if [ $stage -le 6 ]; then + $cmd JOB=1:$nj $dir/log/rescore.JOB.log \ + sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \ + $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + fi + rm $dir/pre_lat.*.gz + +else ### For models without speaker vectors: + + if $use_fmllr; then + # Estimate fMLLR transforms (note: these may be on top of any + # fMLLR transforms estimated with the baseline GMM system. + if [ $stage -le 5 ]; then # compute fMLLR transforms. + echo "$0: computing fMLLR transforms." + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + gunzip -c $dir/pre_lat.JOB.gz \| \ + sgmm-rescore-lattice --utt2spk=ark:$sdata/JOB/utt2spk \ + "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ + lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ + lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ + weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ + sgmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \ + --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \ + $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1; + fi + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" + fi + +# Now rescore the state-level lattices with the adapted features and the +# corresponding model. Prune and determinize the lattices to limit +# their size. + if [ $stage -le 6 ] && $use_fmllr; then + $cmd JOB=1:$nj $dir/log/rescore.JOB.log \ + sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk \ + $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + rm $dir/pre_lat.*.gz + else # Already done with decoding if no adaptation needed. + for n in `seq 1 $nj`; do + mv $dir/pre_lat.${n}.gz $dir/lat.${n}.gz + done + fi + +fi + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. + + +if [ $stage -le 7 ]; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." 
&& exit 1; + echo "score best paths" + local/score.sh --cmd "$cmd" $data $lang $dir + echo "score confidence and timing with sclite" + #local/score_sclite_conf.sh --cmd "$cmd" --language turkish $data $lang $dir +fi +echo "Decoding done." +exit 0; diff --git a/egs/chime_wsj0/s5/steps/decode_sgmm_rescore.sh b/egs/chime_wsj0/s5/steps/decode_sgmm_rescore.sh new file mode 100755 index 000000000..9b23e8ece --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_sgmm_rescore.sh @@ -0,0 +1,107 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# This script does decoding with an SGMM system, by rescoring lattices +# generated from a previous SGMM system. The directory with the lattices +# is assumed to contain speaker vectors, if used. Basically it rescores +# the lattices one final time, using the same setup as the final decoding +# pass of the source dir. The assumption is that the model may have +# been discriminatively trained. + +# If the system was built on top of fMLLR transforms from a conventional system, +# you should provide the --transform-dir option. + +# Begin configuration section. +transform_dir= # dir to find fMLLR transforms. +cmd=run.pl +iter=final +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# -ne 4 ]; then + echo "Usage: steps/decode_sgmm_rescore.sh [options] " + echo " e.g.: steps/decode_sgmm_rescore.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\" + echo " exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr exp/sgmm3a_mmi/decode_dev93_tgpr" + echo "main options (for others, see top of script file)" + echo " --transform-dir # directory of previous decoding" + echo " # where we can find transforms for SAT systems." + echo " --config # config containing options" + echo " --cmd # Command to run in parallel with" + echo " --iter # iteration of model to use (default: final)" + exit 1; +fi + +graphdir=$1 +data=$2 +olddir=$3 +dir=$4 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. + +for f in $graphdir/words.txt $data/feats.scp $olddir/lat.1.gz $olddir/gselect.1.gz \ + $srcdir/$iter.mdl; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +nj=`cat $olddir/num_jobs` || exit 1; +sdata=$data/split$nj; +gselect_opt="--gselect=ark,s,cs:gunzip -c $olddir/gselect.JOB.gz|" +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +if [ -f $olddir/vecs.1 ]; then + echo "$0: using speaker vectors from $olddir" + spkvecs_opt="--spk-vecs=ark:$olddir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk" +else + echo "$0: no speaker vectors found." + spkvecs_opt= +fi + + +## Set up features. +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac +if [ ! 
-z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; + [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ + && echo "$0: #jobs mismatch with transform-dir." && exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" +elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then + echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," + echo " but you are not providing the --transform-dir option in test time." +fi + +if [ -f $olddir/trans.1 ]; then + echo "$0: using (in addition to any previous transforms) transforms from $olddir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$olddir/trans.JOB ark:- ark:- |" +fi +## + +# Rescore the state-level lattices with the model provided. Just +# one command in this script. +echo "$0: rescoring lattices with SGMM model in $srcdir/$iter.mdl" +$cmd JOB=1:$nj $dir/log/rescore.JOB.log \ + sgmm-rescore-lattice "$gselect_opt" $spkvecs_opt \ + $srcdir/$iter.mdl "ark:gunzip -c $olddir/lat.JOB.gz|" "$feats" \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + +[ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; +local/score.sh --cmd "$cmd" $data $graphdir $dir + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/decode_si.sh b/egs/chime_wsj0/s5/steps/decode_si.sh new file mode 100755 index 000000000..f41ba6349 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_si.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Begin configuration section. +transform_dir= +iter= +model= # You can specify the model to use (e.g. if you want to use the .alimdl) +stage=0 +nj=4 +cmd=run.pl +max_active=7000 +max_arcs=-1 +beam=13.0 +latbeam=6.0 +acwt=0.083333 # note: only really affects pruning (scoring is on lattices). +num_threads=1 # if >1, will use gmm-latgen-faster-parallel +parallel_opts= # If you supply num-threads, you should supply this too. +scoring_opts= +# note: there are no more min-lmwt and max-lmwt options, instead use +# e.g. --scoring-opts "--min-lmwt 1 --max-lmwt 20" +skip_scoring=false +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: steps/decode.sh [options] " + echo "... where is assumed to be a sub-directory of the directory" + echo " where the model is." + echo "e.g.: steps/decode.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr" + echo "" + echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out" + echo "what type of features you used (assuming it's one of these two)" + echo "" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --iter # Iteration of model to test." + echo " --model # which model to use (e.g. to" + echo " # specify the final.alimdl)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --transform-dir # dir to find fMLLR transforms " + echo " --acwt # acoustic scale used for lattice generation " + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." 
+ echo " --parallel-opts # e.g. '-pe smp 4' if you supply --num-threads 4" + exit 1; +fi + + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # The model directory is one level up from decoding directory. +sdata=$data/split$nj; + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +if [ -z "$model" ]; then # if --model was not specified on the command line... + if [ -z $iter ]; then model=$srcdir/final.mdl; + else model=$srcdir/$iter.mdl; fi +fi + +for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fst; do + [ ! -f $f ] && echo "decode.sh: no such file $f" && exit 1; +done + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "decode.sh: feature type is $feat_type"; + +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then # add transforms to features... + echo "Using fMLLR transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." + [ "`cat $transform_dir/num_jobs`" -ne $nj ] && \ + echo "Mismatch in number of jobs with $transform_dir"; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" +fi + +if [ $stage -le 0 ]; then + $cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \ + gmm-latgen-faster$thread_string --max-arcs=$max_arcs --max-active=$max_active --beam=$beam --lattice-beam=$latbeam \ + --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \ + $model $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; +fi + +if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir +fi + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/decode_with_map.sh b/egs/chime_wsj0/s5/steps/decode_with_map.sh new file mode 100755 index 000000000..4af3b9987 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/decode_with_map.sh @@ -0,0 +1,113 @@ +#!/bin/bash + +# Copyright 2012 Neha Agrawal, Cisco Systems; +# Johns Hopkins University (Author: Daniel Povey); +# +# Apache 2.0 + +# Begin configuration section. +transform_dir= +iter= +model= # You can specify the model to use (e.g. if you want to use the .alimdl) +nj=4 +cmd=run.pl +max_active=7000 +beam=13.0 +latbeam=6.0 +acwt=0.083333 # note: only really affects pruning (scoring is on lattices). +mean_tau=20 +weight_tau=10 +flags=mw # could also contain "v" for variance; the default + # tau for that is 50. +stage=1 +# End configuration section. + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: steps/decode.sh [options] " + echo "... where is assumed to be a sub-directory of the directory" + echo " where the model is." 
+ echo "e.g.: steps/decode.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr" + echo "" + echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out" + echo "what type of features you used (assuming it's one of these two)" + echo "" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --iter # Iteration of model to test." + echo " --model # which model to use (e.g. to" + echo " # specify the final.alimdl)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --transform-dir # dir to find fMLLR transforms " + echo " # speaker-adapted decoding" + exit 1; +fi + + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # The model directory is one level up from decoding directory. +sdata=$data/split$nj; + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +if [ -z "$model" ]; then # if --model was not specified on the command line... + if [ -z $iter ]; then model=$srcdir/final.mdl; + else model=$srcdir/$iter.mdl; fi +fi + +for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fst; do + [ ! -f $f ] && echo "decode.sh: no such file $f" && exit 1; +done + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "decode.sh: feature type is $feat_type"; + +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then # add transforms to features... + echo "Using fMLLR transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." + [ "`cat $transform_dir/num_jobs`" -ne $nj ] && \ + echo "Mismatch in number of jobs with $transform_dir"; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" +fi + +if [ $stage -le 1 ]; then + echo "Doing first-pass decoding before MAP decoding." + $cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \ + gmm-decode-faster --max-active=$max_active --beam=$beam \ + --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \ + $model $graphdir/HCLG.fst "$feats" ark:$dir/tmp.JOB.tra ark:$dir/pass1_decode.JOB.ali || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "Computing MAP stats and doing MAP-adapted decoding" + $cmd JOB=1:$nj $dir/log/decode_pass2.JOB.log \ + ali-to-post ark:$dir/pass1_decode.JOB.ali ark:- \| \ + gmm-adapt-map --mean-tau=$mean_tau --weight-tau=$weight_tau \ + --update-flags=$flags --spk2utt=ark:$sdata/JOB/spk2utt \ + $model "$feats" ark:- ark:- \| \ + gmm-latgen-map --lattice-beam=$latbeam --acoustic-scale=$acwt \ + --utt2spk=ark:$sdata/JOB/utt2spk --max-active=$max_active --beam=$beam \ + --allow-partial=true --word-symbol-table=$graphdir/words.txt \ + $model ark,s,cs:- $graphdir/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" +fi +#rm -f $dir/pass1_decode.*.ali +#rm -f $dir/tmp.*.tra + +[ ! 
-x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; +local/score.sh --cmd "$cmd" $data $graphdir $dir + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/get_ctm.sh b/egs/chime_wsj0/s5/steps/get_ctm.sh new file mode 100755 index 000000000..866fa2ab2 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/get_ctm.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright Johns Hopkins University (Author: Daniel Povey) 2012. Apache 2.0. + +# This script produces CTM files from a decoding directory that has lattices +# present. + + +# begin configuration section. +cmd=run.pl +stage=0 +use_segments=true # if we have a segments file, use it to convert + # the segments to be relative to the original files. +#end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/get_ctm.sh [options] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --use-segments (true|false) # use segments and reco2file_and_channel files " + echo " # to produce a ctm relative to the original audio" + echo " # files, with channel information (typically needed" + echo " # for NIST scoring)." + echo "e.g.:" + echo "local/get_ctm.sh data/train data/lang exp/tri4a/decode/" + exit 1; +fi + +data=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. +dir=$3 + +model=$dir/../final.mdl # assume model one level up from decoding dir. + + +for f in $lang/words.txt $lang/phones/word_boundary.int \ + $model $dir/lat.1.gz; do + [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; +done + +name=`basename $data`; # e.g. eval2000 + +mkdir -p $dir/scoring/log + +if [ $stage -le 0 ]; then + if [ -f $data/segments ]; then + f=$data/reco2file_and_channel + [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; + filter_cmd="utils/convert_ctm.pl $data/segments $data/reco2file_and_channel" + else + filter_cmd=cat + fi + + $cmd LMWT=5:20 $dir/scoring/log/get_ctm.LMWT.log \ + mkdir -p $dir/score_LMWT/ '&&' \ + lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ + nbest-to-ctm ark:- - \| \ + utils/int2sym.pl -f 5 $lang/words.txt \| \ + $filter_cmd '>' $dir/score_LMWT/$name.ctm || exit 1; +fi + + diff --git a/egs/chime_wsj0/s5/steps/get_fmllr_basis.sh b/egs/chime_wsj0/s5/steps/get_fmllr_basis.sh new file mode 100755 index 000000000..9ae46bc24 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/get_fmllr_basis.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +# Copyright 2012 Carnegie Mellon University (Author: Yajie Miao) +# Johns Hopkins University (Author: Daniel Povey) + +# Decoding script that computes basis for basis-fMLLR (see decode_fmllr_basis.sh). +# This can be on top of delta+delta-delta, or LDA+MLLT features. + +stage=0 +# Parameters in alignment of training data +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +per_utt=true # If true, then treat each utterance as a separate speaker for purposes of + # basis training... this is recommended if the number of actual speakers in your + # training set is less than (feature-dim) * (feature-dim+1). +align_beam=10 +retry_beam=40 +silence_weight=0.01 +cmd=run.pl +# End configuration section + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . 
./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: steps/get_fmllr_basis.sh [options] " + echo " e.g.: steps/decode_basis_fmllr.sh data/train_si84 data/lang exp/tri3b/" + echo "Note: we currently assume that this is the same data you trained the model with." + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd # Command to run in parallel with" + exit 1; +fi + +data=$1 +lang=$2 +dir=$3 + +nj=`cat $dir/num_jobs` || exit 1; +sdata=$data/split$nj; +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +splice_opts=`cat $dir/splice_opts 2>/dev/null` # frame-splicing options. + +silphonelist=`cat $lang/phones/silence.csl` || exit 1; + +for f in $data/feats.scp $dir/final.alimdl $dir/final.mdl $dir/ali.1.gz; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + + +# Set up the unadapted features "$sifeats". +if [ -f $dir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type"; +case $feat_type in + delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + + # Set up the adapted features "$feats" for training set. +if [ -f $srcdir/trans.1 ]; then + feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$sdata/trans.JOB ark:- ark:- |"; +else + feats="$sifeats"; +fi + + +if $per_utt; then + spk2utt_opt= # treat each utterance as separate speaker when computing basis. + echo "Doing per-utterance adaptation for purposes of computing the basis." +else + echo "Doing per-speaker adaptation for purposes of computing the basis." + [ `cat $sdata/spk2utt | wc -l` -lt $[41*40] ] && \ + echo "Warning: number of speakers is small, might be better to use --per-utt=true." + spk2utt_opt="--spk2utt=ark:$sdata/JOB/spk2utt" +fi + +# Note: we get Gaussian level alignments with the "final.mdl" and the +# speaker adapted features. +$cmd JOB=1:$nj $dir/log/basis_acc.JOB.log \ + ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ + weight-silence-post $silence_weight $silphonelist $dir/final.mdl ark:- ark:- \| \ + gmm-post-to-gpost $dir/final.mdl "$feats" ark:- ark:- \| \ + gmm-basis-fmllr-accs-gpost $spk2utt_opt \ + $dir/final.mdl "$sifeats" ark,s,cs:- $dir/basis.acc.JOB || exit 1; + +# Compute the basis matrices. +$cmd $dir/log/basis_training.log \ + gmm-basis-fmllr-training $dir/final.mdl $dir/fmllr.basis $dir/basis.acc.* || exit 1; +rm $dir/basis.acc.* 2>/dev/null + +exit 0; + diff --git a/egs/chime_wsj0/s5/steps/get_lexicon_probs.sh b/egs/chime_wsj0/s5/steps/get_lexicon_probs.sh new file mode 100755 index 000000000..22053e2f3 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/get_lexicon_probs.sh @@ -0,0 +1,225 @@ +#!/bin/bash +# Copyright 2013 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + + +# From a training or alignment directory, and an original lexicon.txt and lang/ +# directory, obtain a new lexicon with pronunciation probabilities. + + +# Begin configuration section. 
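+# Sketch of the estimate this script computes (see the perl block further
+# down): for each (word, pron) pair,
+#   count(word, pron) = smooth_count + #times the pron is aligned in the data
+#   p(pron | word)    = count(word, pron) / norm(word)
+# where norm(word) is the total count over all prons of the word, or, with
+# --max-one true, the largest single pron count (so the most likely pron of
+# each word gets probability 1.0).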
+stage=0 +smooth_count=1.0 # Amount of count to add corresponding to each original lexicon entry; + # this corresponds to add-one smoothing of the pron-probs. +max_one=true # If true, normalize the pron-probs so the maximum value for each word is 1.0, + # rather than summing to one. This is quite standard. + +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 6 ]; then + echo "Usage: steps/get_lexicon_probs.sh " + echo "e.g.: steps/get_lexicon_probs.sh data/train data/lang exp/tri5 data/local/lexicon.txt \\" + echo " exp/tri5_lexprobs data/local_withprob/lexicon.txt" + echo "Note: we assume you ran using word-position-dependent phones but both the old and new lexicon will not have" + echo "these markings. We also assume the new lexicon will have pron-probs but the old one does not; this limitation" + echo "of the script can be removed later." + echo "Main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --stage # used to control partial re-running." + echo " --max-one # If true, normalize so max prob of each" + echo " # word is one. Default: true" + echo " --smooth # Amount to smooth each count by (default: 1.0)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +old_lexicon=$4 +dir=$5 +new_lexicon=$6 + +oov=`cat $lang/oov.int` || exit 1; +nj=`cat $srcdir/num_jobs` || exit 1; + +for f in $data/text $lang/L.fst $lang/phones/word_boundary.int $srcdir/ali.1.gz $old_lexicon; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +mkdir -p $dir/log +utils/split_data.sh $data $nj # Make sure split data-dir exists. +sdata=$data/split$nj + + +mkdir -p $dir/log + +if [ $stage -le 0 ]; then + + ( ( for n in `seq $nj`; do gunzip -c $srcdir/ali.$n.gz; done ) | \ + linear-to-nbest ark:- "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $data/text |" '' '' ark:- | \ + lattice-align-words $lang/phones/word_boundary.int $srcdir/final.mdl ark:- ark:- | \ + lattice-to-phone-lattice --replace-words=false $srcdir/final.mdl ark:- ark,t:- | \ + awk '{ if (NF == 4) { word_phones = sprintf("%s %s", $3, $4); count[word_phones]++; } } + END { for(key in count) { print count[key], key; } }' | \ + sed s:0,0,:: | awk '{print $2, $1, $3;}' | sed 's/_/ /g' | \ + utils/int2sym.pl -f 3- $lang/phones.txt | \ + sed -E 's/_I( |$)/ /g' | sed -E 's/_E( |$)/ /g' | sed -E 's/_B( |$)/ /g' | sed -E 's/_S( |$)/ /g' | \ + utils/int2sym.pl -f 1 $lang/words.txt > $dir/lexicon_counts.txt + ) 2>&1 | tee $dir/log/get_fsts.log + +fi + +cat $old_lexicon | awk '{if (!($2 > 0.0 && $2 < 1.0)) { exit(1); }}' && \ + echo "Error: old lexicon $old_lexicon appears to have pron-probs; we don't expect this." 
&& \ + exit 1; + +mkdir -p `dirname $new_lexicon` || exit 1; + +if [ $stage -le 1 ]; then + grep -v -w '^' $dir/lexicon_counts.txt | \ + perl -e ' ($old_lexicon, $smooth_count, $max_one) = @ARGV; + ($smooth_count >= 0) || die "Invalid smooth_count $smooth_count"; + ($max_one eq "true" || $max_one eq "false") || die "Invalid max_one variable $max_one"; + open(O, "<$old_lexicon")||die "Opening old-lexicon file $old_lexicon"; + while() { + $_ =~ m/(\S+)\s+(.+)/ || die "Bad old-lexicon line $_"; + $word = $1; + $orig_pron = $2; + # Remember the mapping from canonical prons to original prons: in the case of + # syllable based systems we want to remember the locations of tabs in + # the original lexicon. + $pron = join(" ", split(" ", $orig_pron)); + $orig_pron{$word,$pron} = $orig_pron; + $count{$word,$pron} += $smooth_count; + $tot_count{$word} += $smooth_count; + } + while () { + $_ =~ m/(\S+)\s+(\S+)\s+(.+)/ || die "Bad new-lexicon line $_"; + $word = $1; + $this_count = $2; + $pron = join(" ", split(" ", $3)); + $count{$word,$pron} += $this_count; + $tot_count{$word} += $this_count; + } + if ($max_one eq "true") { # replace $tot_count{$word} with max count + # of any pron. + %tot_count = {}; # set to empty assoc array. + foreach $key (keys %count) { + ($word, $pron) = split($; , $key); # $; is separator for strings that index assoc. arrays. + $this_count = $count{$key}; + if (!defined $tot_count{$word} || $this_count > $tot_count{$word}) { + $tot_count{$word} = $this_count; + } + } + } + foreach $key (keys %count) { + ($word, $pron) = split($; , $key); # $; is separator for strings that index assoc. arrays. + $this_orig_pron = $orig_pron{$key}; + if (!defined $this_orig_pron) { die "Word $word and pron $pron did not appear in original lexicon."; } + if (!defined $tot_count{$word}) { die "Tot-count not defined for word $word."; } + $prob = $count{$key} / $tot_count{$word}; + print "$word\t$prob\t$this_orig_pron\n"; # Output happens here. + } ' $old_lexicon $smooth_count $max_one > $new_lexicon || exit 1; +fi + +exit 0; + +echo $nj > $dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; +cp $srcdir/final.occs $dir; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. + + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +## Set up model and alignment model. +mdl=$srcdir/final.mdl +if [ -f $srcdir/final.alimdl ]; then + alimdl=$srcdir/final.alimdl +else + alimdl=$srcdir/final.mdl +fi +[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1; +alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |" +mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |" + + +## Work out where we're getting the graphs from. 
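+# The remainder of this file follows the usual alignment recipe (sketched here
+# for reference): reuse the training graphs fsts.JOB.gz from $srcdir when
+# --use-graphs true is given (the number of jobs must match), otherwise
+# compile new graphs with compile-train-graphs; then align with the alignment
+# model, estimate fMLLR transforms, and re-align with the speaker-adapted
+# features.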
+if $use_graphs; then + [ "$nj" != "`cat $srcdir/num_jobs`" ] && \ + echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1; + [ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1; + graphdir=$srcdir +else + graphdir=$dir + if [ $stage -le 0 ]; then + echo "$0: compiling training graphs" + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; + fi +fi + + +if [ $stage -le 1 ]; then + echo "$0: aligning data in $data using $alimdl and speaker-independent features." + $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \ + "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0: computing fMLLR transforms" + if [ "$alimdl" != "$mdl" ]; then + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \ + gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \ + ark,s,cs:- ark:$dir/trans.JOB || exit 1; + else + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \ + ark,s,cs:- ark:$dir/trans.JOB || exit 1; + fi +fi + +feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" + +if [ $stage -le 3 ]; then + echo "$0: doing final alignment." + $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl_cmd" \ + "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +rm $dir/pre_ali.*.gz + +echo "$0: done aligning data." + +utils/summarize_warnings.pl $dir/log + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/get_train_ctm.sh b/egs/chime_wsj0/s5/steps/get_train_ctm.sh new file mode 100755 index 000000000..e81a20e82 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/get_train_ctm.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright Johns Hopkins University (Author: Daniel Povey) 2012. Apache 2.0. + +# This script produces CTM files from a training directory that has alignments +# present. + + +# begin configuration section. +cmd=run.pl +stage=0 +use_segments=true # if we have a segments file, use it to convert + # the segments to be relative to the original files. +#end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/get_train_ctm.sh [options] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --use-segments (true|false) # use segments and reco2file_and_channel files " + echo " # to produce a ctm relative to the original audio" + echo " # files, with channel information (typically needed" + echo " # for NIST scoring)." 
+ echo "e.g.:" + echo "local/get_train_ctm.sh data/train data/lang exp/tri3a_ali" + echo "Produces ctm in: exp/tri3a_ali/ctm" + exit 1; +fi + +data=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. +dir=$3 + +model=$dir/final.mdl # assume model one level up from decoding dir. + + +for f in $lang/words.txt $lang/phones/word_boundary.int \ + $model $dir/ali.1.gz $lang/oov.int; do + [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; +done + +oov=`cat $lang/oov.int` || exit 1; + +mkdir -p $dir/scoring/log + +if [ $stage -le 0 ]; then + if [ -f $data/segments ]; then + f=$data/reco2file_and_channel + [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; + filter_cmd="utils/convert_ctm.pl $data/segments $data/reco2file_and_channel" + else + filter_cmd=cat + fi + + $cmd $dir/log/get_ctm.log \ + linear-to-nbest "ark:gunzip -c $dir/ali.*.gz|" \ + "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data/text |" \ + '' '' ark:- \| \ + lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ + nbest-to-ctm ark:- - \| \ + utils/int2sym.pl -f 5 $lang/words.txt \| \ + $filter_cmd '>' $dir/ctm || exit 1; +fi diff --git a/egs/chime_wsj0/s5/steps/lmrescore.sh b/egs/chime_wsj0/s5/steps/lmrescore.sh new file mode 100755 index 000000000..e6150ada9 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/lmrescore.sh @@ -0,0 +1,122 @@ +#!/bin/bash + +# Begin configuration section. +mode=4 +cmd=run.pl +skip_scoring=false +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +for x in `seq 2`; do + [ "$1" == "--cmd" ] && cmd=$2 && shift 2; + [ "$1" == "--mode" ] && mode=$2 && shift 2; +done + +if [ $# != 5 ]; then + echo "Do language model rescoring of lattices (remove old LM, add new LM)" + echo "Usage: steps/lmrescore.sh [options] " + echo "options: [--cmd (run.pl|queue.pl [queue opts])] [--mode (1|2|3|4)]" + exit 1; +fi + +[ -f path.sh ] && . ./path.sh; + +oldlang=$1 +newlang=$2 +data=$3 +indir=$4 +outdir=$5 + +oldlm=$oldlang/G.fst +newlm=$newlang/G.fst +! cmp $oldlang/words.txt $newlang/words.txt && echo "Warning: vocabularies may be incompatible." +[ ! -f $oldlm ] && echo Missing file $oldlm && exit 1; +[ ! -f $newlm ] && echo Missing file $newlm && exit 1; +! ls $indir/lat.*.gz >/dev/null && echo "No lattices input directory $indir" && exit 1; + +oldlmcommand="fstproject --project_output=true $oldlm |" +newlmcommand="fstproject --project_output=true $newlm |" + +mkdir -p $outdir/log + +phi=`grep -w '#0' $newlang/words.txt | awk '{print $2}'` + +if [ "$mode" == 4 ]; then + # we have to prepare $outdir/Ldet.fst in this case: determinized + # lexicon (determinized on phones), with disambig syms removed. + # take L_disambig.fst; get rid of transition with "#0 #0" on it; determinize + # with epsilon removal; remove disambiguation symbols. + fstprint $newlang/L_disambig.fst | awk '{if($4 != '$phi'){print;}}' | fstcompile | \ + fstdeterminizestar | fstrmsymbols $newlang/phones/disambig.int >$outdir/Ldet.fst || exit 1; +fi + +nj=`cat $indir/num_jobs` || exit 1; +cp $indir/num_jobs $outdir + + +#for lat in $indir/lat.*.gz; do +# number=`basename $lat | cut -d. -f2`; +# newlat=$outdir/`basename $lat` + +case "$mode" in + 1) # 1 is inexact, it's the original way of doing it. 
+ $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-lmrescore --lm-scale=-1.0 "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlmcommand" ark:- \| \ + lattice-lmrescore --lm-scale=1.0 ark:- "$newlmcommand" "ark,t:|gzip -c>$outdir/lat.JOB.gz" \ + || exit 1; + ;; + 2) # 2 is equivalent to 1, but using more basic operations, combined. + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + gunzip -c $indir/lat.JOB.gz \| \ + lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \ + lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \| \ + lattice-determinize ark:- ark:- \| \ + lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \ + lattice-compose ark:- "fstproject --project_output=true $newlm |" ark:- \| \ + lattice-determinize ark:- ark:- \| \ + gzip -c \>$outdir/lat.JOB.gz || exit 1; + ;; + 3) # 3 is "exact" in that we remove the old LM scores accepting any path + # through G.fst (which is what we want as that happened in lattice + # generation), but we add the new one with "phi matcher", only taking + # backoff arcs if an explicit arc did not exist. + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + gunzip -c $indir/lat.JOB.gz \| \ + lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \ + lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \| \ + lattice-determinize ark:- ark:- \| \ + lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \ + lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \ + lattice-determinize ark:- ark:- \| \ + gzip -c \>$outdir/lat.JOB.gz || exit 1; + ;; + 4) # 4 is also exact (like 3), but instead of subtracting the old LM-scores, + # it removes the old graph scores entirely and adds in the lexicon, + # grammar and transition weights. + mdl=`dirname $indir`/final.mdl + [ ! -f $mdl ] && echo No such model $mdl && exit 1; + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + gunzip -c $indir/lat.JOB.gz \| \ + lattice-scale --lm-scale=0.0 ark:- ark:- \| \ + lattice-to-phone-lattice $mdl ark:- ark:- \| \ + lattice-compose ark:- $outdir/Ldet.fst ark:- \| \ + lattice-determinize ark:- ark:- \| \ + lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \ + lattice-add-trans-probs --transition-scale=1.0 --self-loop-scale=0.1 \ + $mdl ark:- ark:- \| \ + gzip -c \>$outdir/lat.JOB.gz || exit 1; + ;; +esac + +rm $outdir/Ldet.fst 2>/dev/null + +if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + local/score.sh --cmd "$cmd" $data $newlang $outdir +else + echo "Not scoring because requested so..." +fi + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/make_bn_feats.sh b/egs/chime_wsj0/s5/steps/make_bn_feats.sh new file mode 100755 index 000000000..53bf57778 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/make_bn_feats.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +# Copyright 2012 Karel Vesely, Daniel Povey +# Apache 2.0 +# To be run from .. (one directory up from here) +# see ../run.sh for example + +# Begin configuration section. +nj=4 +cmd=run.pl +remove_last_layers=4 # remove N last components from the nnet +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 5 ]; then + echo "usage: $0 [options] "; + echo "options: " + echo " --trim-transforms # number of NNet Components to remove from the end" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 
+ exit 1; +fi + +if [ -f path.sh ]; then . path.sh; fi + +data=$1 +srcdata=$2 +nndir=$3 +logdir=$4 +bnfeadir=$5 + +######## CONFIGURATION + +# copy the dataset metadata from srcdata. +mkdir -p $data || exit 1; +cp $srcdata/* $data 2>/dev/null; rm $data/feats.scp $data/cmvn.scp; + +# make $bnfeadir an absolute pathname. +bnfeadir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $bnfeadir ${PWD}` + +# use "name" as part of name of the archive. +name=`basename $data` + +mkdir -p $bnfeadir || exit 1; +mkdir -p $data || exit 1; +mkdir -p $logdir || exit 1; + + +srcscp=$srcdata/feats.scp +scp=$data/feats.scp + +required="$srcscp $nndir/final.nnet" + +for f in $required; do + if [ ! -f $f ]; then + echo "$0: no such file $f" + exit 1; + fi +done + +if [ ! -d $srcdata/split$nj -o $srcdata/split$nj -ot $srcdata/feats.scp ]; then + utils/split_data.sh $srcdata $nj +fi + + +#cut the MLP +nnet=$bnfeadir/feature_extractor.nnet +copy-nnet --remove-last-layers=$remove_last_layers --binary=false $nndir/final.nnet $nnet 2>$logdir/feature_extractor.log + +#get the feature transform +feature_transform=$nndir/final.feature_transform + +echo "Creating bn-feats into $data" + +### +### Prepare feature pipeline +feats="ark,s,cs:copy-feats scp:$srcdata/split$nj/JOB/feats.scp ark:- |" +# Optionally add cmvn +if [ -f $nndir/norm_vars ]; then + norm_vars=$(cat $nndir/norm_vars 2>/dev/null) + feats="$feats apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$srcdata/utt2spk scp:$srcdata/cmvn.scp ark:- ark:- |" +fi +# Optionally add deltas +if [ -f $nndir/delta_order ]; then + delta_order=$(cat $nndir/delta_order) + feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |" +fi +### +### + +#Run the forward pass +$cmd JOB=1:$nj $logdir/make_bnfeats.JOB.log \ + nnet-forward --feature-transform=$feature_transform $nnet "$feats" \ + ark,scp:$bnfeadir/raw_bnfea_$name.JOB.ark,$bnfeadir/raw_bnfea_$name.JOB.scp \ + || exit 1; + + +N0=$(cat $srcdata/feats.scp | wc -l) +N1=$(cat $bnfeadir/raw_bnfea_$name.*.scp | wc -l) +if [[ "$N0" != "$N1" ]]; then + echo "Error producing bnfea features for $name:" + echo "Original feats : $N0 Bottleneck feats : $N1" + exit 1; +fi + +# concatenate the .scp files together. +for ((n=1; n<=nj; n++)); do + cat $bnfeadir/raw_bnfea_$name.$n.scp >> $data/feats.scp +done + + +echo "Succeeded creating MLP-BN features for $name ($data)" + diff --git a/egs/chime_wsj0/s5/steps/make_denlats.sh b/egs/chime_wsj0/s5/steps/make_denlats.sh new file mode 100755 index 000000000..786407e1e --- /dev/null +++ b/egs/chime_wsj0/s5/steps/make_denlats.sh @@ -0,0 +1,146 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# Create denominator lattices for MMI/MPE training. +# Creates its output in $dir/lat.*.gz + +# Begin configuration section. +nj=4 +cmd=run.pl +sub_split=1 +beam=13.0 +lattice_beam=7.0 +acwt=0.1 +max_active=5000 +transform_dir= +max_mem=20000000 # This will stop the processes getting too large. +# This is in bytes, but not "real" bytes-- you have to multiply +# by something like 5 or 10 to get real bytes (not sure why so large) +num_threads=1 +parallel_opts= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. 
parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: steps/make_denlats.sh [options] " + echo " e.g.: steps/make_denlats.sh data/train data/lang exp/tri1 exp/tri1_denlats" + echo "Works for (delta|lda) features, and (with --transform-dir option) such features" + echo " plus transforms." + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --sub-split # e.g. 40; use this for " + echo " # large databases so your jobs will be smaller and" + echo " # will (individually) finish reasonably soon." + echo " --transform-dir # directory to find fMLLR transforms." + echo " --num-threads # number of threads per decoding job" + echo " --parallel-opts # if >1 thread, add this to 'cmd', e.g. -pe smp 6" + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +sdata=$data/split$nj +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +oov=`cat $lang/oov.int` || exit 1; + +mkdir -p $dir + +cp -r $lang $dir/ + +# Compute grammar FST which corresponds to unigram decoding graph. +new_lang="$dir/"$(basename "$lang") +echo "Making unigram grammar FST in $new_lang" +cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \ + awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \ + utils/make_unigram_grammar.pl | fstcompile > $new_lang/G.fst \ + || exit 1; + +# mkgraph.sh expects a whole directory "lang", so put everything in one directory... +# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and +# final.mdl from $srcdir; the output HCLG.fst goes in $dir/graph. + +echo "Compiling decoding graph in $dir/dengraph" +if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then + echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation." +else + utils/mkgraph.sh $new_lang $srcdir $dir/dengraph || exit 1; +fi + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "align_si.sh: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +if [ ! -z "$transform_dir" ]; then # add transforms to features... + echo "$0: using fMLLR transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." + [ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \ + && echo "$0: mismatch in number of jobs with $transform_dir" && exit 1; + [ -f $srcdir/final.mat ] && ! 
cmp $transform_dir/final.mat $srcdir/final.mat && \ + echo "$0: LDA transforms differ between $srcdir and $transform_dir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" +else + if [ -f $srcdir/final.alimdl ]; then + echo "$0: you seem to have a SAT system but you did not supply the --transform-dir option."; + exit 1; + fi +fi + +if [ $sub_split -eq 1 ]; then + $cmd $parallel_opts JOB=1:$nj $dir/log/decode_den.JOB.log \ + gmm-latgen-faster$thread_string --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \ + --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; +else + for n in `seq $nj`; do + if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $alidir/final.mdl ]; then + echo "Not processing subset $n as already done (delete $dir/.done.$n if not)"; + else + sdata2=$data/split$nj/$n/split$sub_split; + if [ ! -d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then + split_data.sh --per-utt $sdata/$n $sub_split || exit 1; + fi + mkdir -p $dir/log/$n + mkdir -p $dir/part + feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g` + $cmd $parallel_opts JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \ + gmm-latgen-faster$thread_string --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \ + --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1; + echo Merging archives for data subset $n + rm $dir/.error 2>/dev/null; + for k in `seq $sub_split`; do + gunzip -c $dir/lat.$n.$k.gz || touch $dir/.error; + done | gzip -c > $dir/lat.$n.gz || touch $dir/.error; + [ -f $dir/.error ] && echo Merging lattices for subset $n failed && exit 1; + rm $dir/lat.$n.*.gz + touch $dir/.done.$n + fi + done +fi + + +echo "$0: done generating denominator lattices." diff --git a/egs/chime_wsj0/s5/steps/make_denlats_nnet.sh b/egs/chime_wsj0/s5/steps/make_denlats_nnet.sh new file mode 100755 index 000000000..0ba20982e --- /dev/null +++ b/egs/chime_wsj0/s5/steps/make_denlats_nnet.sh @@ -0,0 +1,177 @@ +#!/bin/bash +# Copyright 2012-2013 Karel Vesely, Daniel Povey +# Apache 2.0. + +# Create denominator lattices for MMI/MPE/sMBR training. +# Creates its output in $dir/lat.*.ark,$dir/lat.scp +# The lattices are uncompressed, we need random access for DNN training. + +# Begin configuration section. +nj=4 +cmd=run.pl +sub_split=1 +beam=13.0 +lattice_beam=7.0 +acwt=0.1 +max_active=5000 +nnet= +max_mem=20000000 # This will stop the processes getting too large. +# This is in bytes, but not "real" bytes-- you have to multiply +# by something like 5 or 10 to get real bytes (not sure why so large) +# End configuration section. +use_gpu_id=-1 # disable gpu +parallel_opts="-pe smp 2" + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: steps/$0 [options] " + echo " e.g.: steps/$0 data/train data/lang exp/tri1 exp/tri1_denlats" + echo "Works for plain features (or CMN, delta), forwarded through feature-transform." + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 
+ echo " --sub-split # e.g. 40; use this for " + echo " # large databases so your jobs will be smaller and" + echo " # will (individually) finish reasonably soon." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +sdata=$data/split$nj +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +oov=`cat $lang/oov.int` || exit 1; + +mkdir -p $dir + +cp -r $lang $dir/ + +# Compute grammar FST which corresponds to unigram decoding graph. +new_lang="$dir/"$(basename "$lang") +echo "Making unigram grammar FST in $new_lang" +cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \ + awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \ + utils/make_unigram_grammar.pl | fstcompile > $new_lang/G.fst \ + || exit 1; + +# mkgraph.sh expects a whole directory "lang", so put everything in one directory... +# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and +# final.mdl from $srcdir; the output HCLG.fst goes in $dir/graph. + +echo "Compiling decoding graph in $dir/dengraph" +if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then + echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation." +else + utils/mkgraph.sh $new_lang $srcdir $dir/dengraph || exit 1; +fi + + + +#Get the files we will need +cp $srcdir/{tree,final.mdl} $dir + +[ -z "$nnet" ] && nnet=$srcdir/final.nnet; +[ ! -f "$nnet" ] && echo "Error nnet '$nnet' does not exist!" && exit 1; + +class_frame_counts=$srcdir/ali_train_pdf.counts +[ -z "$class_frame_counts" ] && echo "Error class_frame_counts '$class_frame_counts' does not exist!" && exit 1; + +feature_transform=$srcdir/final.feature_transform +if [ ! -f $feature_transform ]; then + echo "Missing feature_transform '$feature_transform'" + exit 1 +fi + +model=$dir/final.mdl +[ -z "$model" ] && echo "Error transition model '$model' does not exist!" && exit 1; + +### +### Prepare feature pipeline (same as for decoding) +### +# Create the feature stream: +feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |" +# Optionally add cmvn +if [ -f $srcdir/norm_vars ]; then + norm_vars=$(cat $srcdir/norm_vars 2>/dev/null) + [ ! -f $sdata/1/cmvn.scp ] && echo "$0: cannot find cmvn stats $sdata/1/cmvn.scp" && exit 1 + feats="$feats apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |" +fi +# Optionally add deltas +if [ -f $srcdir/delta_order ]; then + delta_order=$(cat $srcdir/delta_order) + feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |" +fi + +# Finally add feature_transform and the MLP +feats="$feats nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet ark:- ark:- |" +### +### +### + + + +### +### We will produce lattices, where the correct path is not necessarily present +### + +#1) We don't use reference path here... 
+ +echo "Generating the denlats" +#2) Generate the denominator lattices +if [ $sub_split -eq 1 ]; then + $cmd $parallel_opts JOB=1:$nj $dir/log/decode_den.JOB.log \ + latgen-faster-mapped --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \ + --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats" "ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp" || exit 1; +else + for n in `seq $nj`; do + if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $alidir/final.mdl ]; then + echo "Not processing subset $n as already done (delete $dir/.done.$n if not)"; + else + sdata2=$data/split$nj/$n/split$sub_split; + if [ ! -d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then + split_data.sh --per-utt $sdata/$n $sub_split || exit 1; + fi + mkdir -p $dir/log/$n + mkdir -p $dir/part + feats_subset=$(echo $feats | sed s:JOB/:$n/split$sub_split/JOB/:g) + $cmd $parallel_opts JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \ + latgen-faster-mapped --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \ + --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats_subset" "ark,scp:$dir/lat.$n.JOB.ark,$dir/lat.$n.JOB.scp" || exit 1; + echo Merging lists for data subset $n + for k in `seq $sub_split`; do + cat $dir/lat.$n.$k.scp + done > $dir/lat.$n.all.scp + echo Merge the ark $n + lattice-copy scp:$dir/lat.$n.all.scp ark,scp:$dir/lat.$n.ark,$dir/lat.$n.scp || exit 1; + #remove the data + rm $dir/lat.$n.*.ark $dir/lat.$n.*.scp $dir/lat.$n.all.scp + touch $dir/.done.$n + fi + done +fi + + + +#3) Merge the SCPs to create full list of lattices (will use random access) +echo Merging to single list $dir/lat.scp +for ((n=1; n<=nj; n++)); do + cat $dir/lat.$n.scp +done > $dir/lat.scp + + +echo "$0: done generating denominator lattices." diff --git a/egs/chime_wsj0/s5/steps/make_denlats_nnet_cpu.sh b/egs/chime_wsj0/s5/steps/make_denlats_nnet_cpu.sh new file mode 100755 index 000000000..7dbb9c3f8 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/make_denlats_nnet_cpu.sh @@ -0,0 +1,146 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# Create denominator lattices for MMI/MPE training. +# Creates its output in $dir/lat.*.gz + +# Begin configuration section. +transform_dir= # dir to find fMLLR transforms. +nj=4 # number of decoding jobs. If --transform-dir set, must match that number! +cmd=run.pl +sub_split=1 +beam=13.0 +lattice_beam=7.0 +acwt=0.1 +max_active=5000 +transform_dir= +max_mem=20000000 # This will stop the processes getting too large. +# This is in bytes, but not "real" bytes-- you have to multiply +# by something like 5 or 10 to get real bytes (not sure why so large) +# End configuration section. +num_threads=1 # Number of threads used in nnet-logprob computation. If you set + # this to a different value, make sure to also set the appropriate + # queue options. If you set this too high it won't use all the + # threads as most of the time will be taken in the decoder. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: steps/make_denlats_nnet_cpu.sh [options] " + echo " e.g.: steps/make_denlats_nnet_cpu.sh data/train data/lang exp/tri1 exp/tri1_denlats" + echo "Works for (delta|lda) features, and (with --transform-dir option) such features" + echo " plus transforms." 
+ echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --sub-split # e.g. 40; use this for " + echo " # large databases so your jobs will be smaller and" + echo " # will (individually) finish reasonably soon." + echo " --transform-dir # directory to find fMLLR transforms." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +sdata=$data/split$nj +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +oov=`cat $lang/oov.int` || exit 1; + +mkdir -p $dir + +cp -r $lang $dir/ + +# Compute grammar FST which corresponds to unigram decoding graph. +new_lang="$dir/"$(basename "$lang") +echo "Making unigram grammar FST in $new_lang" +cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \ + awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \ + utils/make_unigram_grammar.pl | fstcompile > $new_lang/G.fst \ + || exit 1; + +# mkgraph.sh expects a whole directory "lang", so put everything in one directory... +# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and +# final.mdl from $srcdir; the output HCLG.fst goes in $dir/graph. + +echo "Compiling decoding graph in $dir/dengraph" +if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then + echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation." +else + utils/mkgraph.sh $new_lang $srcdir $dir/dengraph || exit 1; +fi + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "align_si.sh: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +if [ ! -z "$transform_dir" ]; then # add transforms to features... + echo "$0: using fMLLR transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." + [ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \ + && echo "$0: mismatch in number of jobs with $transform_dir" && exit 1; + [ -f $srcdir/final.mat ] && ! 
cmp $transform_dir/final.mat $srcdir/final.mat && \ + echo "$0: LDA transforms differ between $srcdir and $transform_dir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" +else + if [ -f $srcdir/final.alimdl ]; then + echo "$0: you seem to have a SAT system but you did not supply the --transform-dir option."; + exit 1; + fi +fi + +if [ $sub_split -eq 1 ]; then + $cmd JOB=1:$nj $dir/log/decode_den.JOB.log \ + nnet-logprob-parallel --num-threads=$num_threads $srcdir/final.mdl "$feats" ark:- \| \ + latgen-faster-mapped --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \ + --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \ + $dir/dengraph/HCLG.fst ark:- "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; +else + for n in `seq $nj`; do + if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $alidir/final.mdl ]; then + echo "Not processing subset $n as already done (delete $dir/.done.$n if not)"; + else + sdata2=$data/split$nj/$n/split$sub_split; + if [ ! -d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then + split_data.sh --per-utt $sdata/$n $sub_split || exit 1; + fi + mkdir -p $dir/log/$n + mkdir -p $dir/part + feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g` + $cmd JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \ + nnet-logprob-parallel --num-threads=$num_threads $srcdir/final.mdl "$feats_subset" ark:- \| \ + latgen-faster-mapped --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \ + --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \ + $dir/dengraph/HCLG.fst ark:- "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1; + echo Merging archives for data subset $n + rm $dir/.error 2>/dev/null; + for k in `seq $sub_split`; do + gunzip -c $dir/lat.$n.$k.gz || touch $dir/.error; + done | gzip -c > $dir/lat.$n.gz || touch $dir/.error; + [ -f $dir/.error ] && echo Merging lattices for subset $n failed && exit 1; + rm $dir/lat.$n.*.gz + touch $dir/.done.$n + fi + done +fi + + +echo "$0: done generating denominator lattices." diff --git a/egs/chime_wsj0/s5/steps/make_denlats_sgmm.sh b/egs/chime_wsj0/s5/steps/make_denlats_sgmm.sh new file mode 100755 index 000000000..4f63bae5a --- /dev/null +++ b/egs/chime_wsj0/s5/steps/make_denlats_sgmm.sh @@ -0,0 +1,159 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# Create denominator lattices for MMI/MPE training, with SGMM models. If the +# features have fMLLR transforms you have to supply the --transform-dir option. +# It gets any speaker vectors from the "alignment dir" ($alidir). Note: this is +# possibly a slight mismatch because the speaker vectors come from supervised +# adaptation. + +# Begin configuration section. +nj=4 +cmd=run.pl +sub_split=1 +beam=13.0 +lattice_beam=7.0 +acwt=0.1 +max_active=5000 +transform_dir= +max_mem=20000000 # This will stop the processes getting too large. +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: steps/make_denlats_sgmm.sh [options] " + echo " e.g.: steps/make_denlats_sgmm.sh data/train data/lang exp/sgmm4a_ali exp/sgmm4a_denlats" + echo "Works for (delta|lda) features, and (with --transform-dir option) such features" + echo " plus transforms." 
+ echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --sub-split # e.g. 40; use this for " + echo " # large databases so your jobs will be smaller and" + echo " # will (individually) finish reasonably soon." + echo " --transform-dir # directory to find fMLLR transforms." + exit 1; +fi + +data=$1 +lang=$2 +alidir=$3 # could also be $srcdir, but only if no vectors supplied. +dir=$4 + +sdata=$data/split$nj +splice_opts=`cat $alidir/splice_opts 2>/dev/null` +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +oov=`cat $lang/oov.int` || exit 1; + +mkdir -p $dir + +cp -r $lang $dir/ + +# Compute grammar FST which corresponds to unigram decoding graph. +new_lang="$dir/"$(basename "$lang") +echo "Making unigram grammar FST in $new_lang" +cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \ + awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \ + utils/make_unigram_grammar.pl | fstcompile > $new_lang/G.fst \ + || exit 1; + +# mkgraph.sh expects a whole directory "lang", so put everything in one directory... +# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and +# final.mdl from $alidir; the output HCLG.fst goes in $dir/graph. + +echo "Compiling decoding graph in $dir/dengraph" +if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then + echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation." +else + utils/mkgraph.sh $new_lang $alidir $dir/dengraph || exit 1; +fi + +if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "align_si.sh: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" + cp $alidir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +if [ ! -z "$transform_dir" ]; then # add transforms to features... + echo "$0: using fMLLR transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." + [ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \ + && echo "$0: mismatch in number of jobs with $transform_dir" && exit 1; + [ -f $alidir/final.mat ] && ! cmp $transform_dir/final.mat $alidir/final.mat && \ + echo "$0: LDA transforms differ between $alidir and $transform_dir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" +else + echo "Assuming you don't have a SAT system, since no --transform-dir option supplied " +fi + +if [ -f $alidir/gselect.1.gz ]; then + gselect_opt="--gselect=ark,s,cs:gunzip -c $alidir/gselect.JOB.gz|" +else + echo "$0: no such file $alidir/gselect.1.gz" && exit 1; +fi + +if [ -f $alidir/vecs.1 ]; then + spkvecs_opt="--spk-vecs=ark:$alidir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk" +else + if [ -f $alidir/final.alimdl ]; then + echo "You seem to have an SGMM system with speaker vectors," + echo "yet we can't find speaker vectors. 
Perhaps you supplied" + echo "the model director instead of the alignment directory?" + exit 1; + fi +fi + +if [ $sub_split -eq 1 ]; then + $cmd JOB=1:$nj $dir/log/decode_den.JOB.log \ + sgmm-latgen-faster $spkvecs_opt "$gselect_opt" --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt \ + --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $alidir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; +else + for n in `seq $nj`; do + if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $alidir/final.mdl ]; then + echo "Not processing subset $n as already done (delete $dir/.done.$n if not)"; + else + sdata2=$data/split$nj/$n/split$sub_split; + if [ ! -d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then + split_data.sh --per-utt $sdata/$n $sub_split || exit 1; + fi + mkdir -p $dir/log/$n + mkdir -p $dir/part + feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g` + spkvecs_opt_subset=`echo $spkvecs_opt | sed "s/JOB/$n/g"` + gselect_opt_subset=`echo $gselect_opt | sed "s/JOB/$n/g"` + $cmd JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \ + sgmm-latgen-faster $spkvecs_opt_subset "$gselect_opt_subset" \ + --beam=$beam --lattice-beam=$lattice_beam \ + --acoustic-scale=$acwt --max-mem=$max_mem --max-active=$max_active \ + --word-symbol-table=$lang/words.txt $alidir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1; + echo Merging archives for data subset $n + rm $dir/.error 2>/dev/null; + for k in `seq $sub_split`; do + gunzip -c $dir/lat.$n.$k.gz || touch $dir/.error; + done | gzip -c > $dir/lat.$n.gz || touch $dir/.error; + [ -f $dir/.error ] && echo Merging lattices for subset $n failed && exit 1; + rm $dir/lat.$n.*.gz + touch $dir/.done.$n + fi + done +fi + + +echo "$0: done generating denominator lattices with SGMMs." diff --git a/egs/chime_wsj0/s5/steps/make_denlats_sgmm2.sh b/egs/chime_wsj0/s5/steps/make_denlats_sgmm2.sh new file mode 100755 index 000000000..b6b901252 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/make_denlats_sgmm2.sh @@ -0,0 +1,170 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# Create denominator lattices for MMI/MPE training, with SGMM models. If the +# features have fMLLR transforms you have to supply the --transform-dir option. +# It gets any speaker vectors from the "alignment dir" ($alidir). Note: this is +# possibly a slight mismatch because the speaker vectors come from supervised +# adaptation. + +# Begin configuration section. +nj=4 +cmd=run.pl +sub_split=1 +beam=13.0 +lattice_beam=7.0 +acwt=0.1 +max_active=5000 +transform_dir= +max_mem=20000000 # This will stop the processes getting too large. +num_threads=1 +parallel_opts= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: steps/make_denlats_sgmm2.sh [options] " + echo " e.g.: steps/make_denlats_sgmm2.sh data/train data/lang exp/sgmm4a_ali exp/sgmm4a_denlats" + echo "Works for (delta|lda) features, and (with --transform-dir option) such features" + echo " plus transforms." + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --sub-split # e.g. 
40; use this for " + echo " # large databases so your jobs will be smaller and" + echo " # will (individually) finish reasonably soon." + echo " --transform-dir # directory to find fMLLR transforms." + echo " --num-threads # number of threads per decoding job" + echo " --parallel-opts # if >1 thread, add this to 'cmd', e.g. -pe smp 6" + exit 1; +fi + +data=$1 +lang=$2 +alidir=$3 # could also be $srcdir, but only if no vectors supplied. +dir=$4 + +sdata=$data/split$nj +splice_opts=`cat $alidir/splice_opts 2>/dev/null` +if [ $num_threads -gt 1 ]; then + # the -parallel becomes part of the binary name we decode with. + thread_string="-parallel --num-threads=$num_threads" +fi + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +oov=`cat $lang/oov.int` || exit 1; + +mkdir -p $dir + +cp -r $lang $dir/ + +# Compute grammar FST which corresponds to unigram decoding graph. +new_lang="$dir/"$(basename "$lang") +echo "Making unigram grammar FST in $new_lang" +cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \ + awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \ + utils/make_unigram_grammar.pl | fstcompile > $new_lang/G.fst \ + || exit 1; + +# mkgraph.sh expects a whole directory "lang", so put everything in one directory... +# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and +# final.mdl from $alidir; the output HCLG.fst goes in $dir/graph. + +echo "Compiling decoding graph in $dir/dengraph" +if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then + echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation." +else + utils/mkgraph.sh $new_lang $alidir $dir/dengraph || exit 1; +fi + +if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "align_si.sh: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" + cp $alidir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +if [ ! -z "$transform_dir" ]; then # add transforms to features... + echo "$0: using fMLLR transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." + [ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \ + && echo "$0: mismatch in number of jobs with $transform_dir" && exit 1; + [ -f $alidir/final.mat ] && ! 
cmp $transform_dir/final.mat $alidir/final.mat && \ + echo "$0: LDA transforms differ between $alidir and $transform_dir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" +else + echo "Assuming you don't have a SAT system, since no --transform-dir option supplied " +fi + +if [ -f $alidir/gselect.1.gz ]; then + gselect_opt="--gselect=ark,s,cs:gunzip -c $alidir/gselect.JOB.gz|" +else + echo "$0: no such file $alidir/gselect.1.gz" && exit 1; +fi + +if [ -f $alidir/vecs.1 ]; then + spkvecs_opt="--spk-vecs=ark:$alidir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk" + [ "`cat $alidir/num_jobs`" -ne "$nj" ] \ + && echo "$0: mismatch in number of jobs with $alidir" && exit 1; +else + if [ -f $alidir/final.alimdl ]; then + echo "$0: You seem to have an SGMM system with speaker vectors," + echo "yet we can't find speaker vectors. Perhaps you supplied" + echo "the model director instead of the alignment directory?" + exit 1; + fi +fi + +if [ $sub_split -eq 1 ]; then + $cmd $parallel_opts JOB=1:$nj $dir/log/decode_den.JOB.log \ + sgmm2-latgen-faster$thread_string $spkvecs_opt "$gselect_opt" --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt \ + --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $alidir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; +else + for n in `seq $nj`; do + if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $alidir/final.mdl ]; then + echo "Not processing subset $n as already done (delete $dir/.done.$n if not)"; + else + sdata2=$data/split$nj/$n/split$sub_split; + if [ ! -d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then + split_data.sh --per-utt $sdata/$n $sub_split || exit 1; + fi + mkdir -p $dir/log/$n + mkdir -p $dir/part + feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g` + spkvecs_opt_subset=`echo $spkvecs_opt | sed "s/JOB/$n/g"` + gselect_opt_subset=`echo $gselect_opt | sed "s/JOB/$n/g"` + $cmd $parallel_opts JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \ + sgmm2-latgen-faster$thread_string $spkvecs_opt_subset "$gselect_opt_subset" \ + --beam=$beam --lattice-beam=$lattice_beam \ + --acoustic-scale=$acwt --max-mem=$max_mem --max-active=$max_active \ + --word-symbol-table=$lang/words.txt $alidir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1; + echo Merging archives for data subset $n + rm $dir/.error 2>/dev/null; + for k in `seq $sub_split`; do + gunzip -c $dir/lat.$n.$k.gz || touch $dir/.error; + done | gzip -c > $dir/lat.$n.gz || touch $dir/.error; + [ -f $dir/.error ] && echo Merging lattices for subset $n failed && exit 1; + rm $dir/lat.$n.*.gz + touch $dir/.done.$n + fi + done +fi + + +echo "$0: done generating denominator lattices with SGMMs." diff --git a/egs/chime_wsj0/s5/steps/make_fbank.sh b/egs/chime_wsj0/s5/steps/make_fbank.sh new file mode 100755 index 000000000..45255058c --- /dev/null +++ b/egs/chime_wsj0/s5/steps/make_fbank.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# Copyright 2012 Karel Vesely Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 +# To be run from .. (one directory up from here) +# see ../run.sh for example + +# Begin configuration section. +nj=4 +cmd=run.pl +fbank_config=conf/fbank.conf +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "usage: make_fbank.sh [options] "; + echo "options: " + echo " --fbank-config # config passed to compute-fbank-feats " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +logdir=$2 +fbankdir=$3 + + +# make $fbankdir an absolute pathname. +fbankdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $fbankdir ${PWD}` + +# use "name" as part of name of the archive. +name=`basename $data` + +mkdir -p $fbankdir || exit 1; +mkdir -p $logdir || exit 1; + +scp=$data/wav.scp + +required="$scp $fbank_config" + +for f in $required; do + if [ ! -f $f ]; then + echo "make_fbank.sh: no such file $f" + exit 1; + fi +done + +# note: in general, the double-parenthesis construct in bash "((" is "C-style +# syntax" where we can get rid of the $ for variable names, and omit spaces. +# The "for" loop in this style is a special construct. + + +if [ -f $data/segments ]; then + echo "$0 [info]: segments file exists: using that." + split_segments="" + for ((n=1; n<=nj; n++)); do + split_segments="$split_segments $logdir/segments.$n" + done + + utils/split_scp.pl $data/segments $split_segments || exit 1; + rm $logdir/.error 2>/dev/null + + $cmd JOB=1:$nj $logdir/make_fbank.JOB.log \ + extract-segments scp:$scp $logdir/segments.JOB ark:- \| \ + compute-fbank-feats --verbose=2 --config=$fbank_config ark:- \ + ark,scp:$fbankdir/raw_fbank_$name.JOB.ark,$fbankdir/raw_fbank_$name.JOB.scp \ + || exit 1; + +else + echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance." + split_scps="" + for ((n=1; n<=nj; n++)); do + split_scps="$split_scps $logdir/wav.$n.scp" + done + + utils/split_scp.pl $scp $split_scps || exit 1; + + $cmd JOB=1:$nj $logdir/make_fbank.JOB.log \ + compute-fbank-feats --verbose=2 --config=$fbank_config scp:$logdir/wav.JOB.scp \ + ark,scp:$fbankdir/raw_fbank_$name.JOB.ark,$fbankdir/raw_fbank_$name.JOB.scp \ + || exit 1; + +fi + + +if [ -f $logdir/.error.$name ]; then + echo "Error producing fbank features for $name:" + tail $logdir/make_fbank.*.log + exit 1; +fi + +# concatenate the .scp files together. +for ((n=1; n<=nj; n++)); do + cat $fbankdir/raw_fbank_$name.$n.scp || exit 1; +done > $data/feats.scp + +rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null + +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` +if [ $nf -ne $nu ]; then + echo "It seems not all of the feature files were successfully ($nf != $nu);" + echo "consider using utils/fix_data_dir.sh $data" +fi + +echo "Succeeded creating filterbank features for $name" diff --git a/egs/chime_wsj0/s5/steps/make_fmllr_feats.sh b/egs/chime_wsj0/s5/steps/make_fmllr_feats.sh new file mode 100755 index 000000000..0c4fc1a22 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/make_fmllr_feats.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +# Copyright 2012 Karel Vesely +# Johns Hopkins University (Author: Daniel Povey), +# +# Apache 2.0. + +# This script is for use in neural network training and testing; it dumps +# (LDA+MLLT or splice+delta) + fMLLR features in a similar format to +# conventional raw MFCC features. + +# Begin configuration section. +nj=4 +cmd=run.pl +transform_dir= +norm_vars=false +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. 
parse_options.sh || exit 1; + +if [ $# != 5 ]; then + echo "Usage: $0 [options] " + echo "e.g.: $0 data-fmllr/train data/train exp/tri5a exp/make_fmllr_feats/log plp/processed/" + echo "" + echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out" + echo "what type of features you used (assuming it's one of these two)" + echo "You can also use fMLLR features-- you have to supply --transform-dir option." + echo "" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --transform-dir # where to find fMLLR transforms." + exit 1; +fi + + +data=$1 +srcdata=$2 +gmmdir=$3 +logdir=$4 +feadir=$5 + + + +#srcdir=$1 -> gmmdir +#data=$2 -> srcdata +#dir=$3 -> ruzne +#tgtdata=$4 -> feadir + +sdata=$srcdata/split$nj; +splice_opts=`cat $gmmdir/splice_opts 2>/dev/null` + +mkdir -p $data $logdir $feadir +[[ -d $sdata && $srcdata/feats.scp -ot $sdata ]] || split_data.sh $srcdata $nj || exit 1; + +for f in $sdata/1/feats.scp $sdata/1/cmvn.scp; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +if [ -f $gmmdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type"; + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $gmmdir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +if [ ! -z "$transform_dir" ]; then # add transforms to features... + echo "Using fMLLR transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." && exit 1 +# [ "`cat $transform_dir/num_jobs`" -ne $nj ] && \ +# echo "Mismatch in number of jobs with $transform_dir" && exit 1; +# feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk \"ark:cat $transform_dir/trans.* |\" ark:- ark:- |" +fi + + +#prepare the dir +cp $srcdata/* $data; rm $data/{feats.scp,cmvn.scp}; + +# make $bnfeadir an absolute pathname. +feadir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $feadir ${PWD}` + +name=`basename $data` + +#forward the feats +$cmd JOB=1:$nj $logdir/make_fmllr_feats.JOB.log \ + copy-feats "$feats" \ + ark,scp:$feadir/feats_fmllr_$name.JOB.ark,$feadir/feats_fmllr_$name.JOB.scp || exit 1; + +#merge the feats to single SCP +for n in $(seq 1 $nj); do + cat $feadir/feats_fmllr_$name.$n.scp +done > $data/feats.scp + +echo "$0 finished... $srcdata -> $data ($gmmdir)" + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/make_fmmi_feats.sh b/egs/chime_wsj0/s5/steps/make_fmmi_feats.sh new file mode 100755 index 000000000..fe6ceee14 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/make_fmmi_feats.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 +# Decoding of fMMI or fMPE models (feature-space discriminative training). +# If transform-dir supplied, expects e.g. fMLLR transforms in that dir. + +# Begin configuration section. 
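+# Example invocation (mirrors the usage message below; the --transform-dir value
+# is an illustrative placeholder, and ngselect must match the value used when
+# the fMMI/fMPE model was trained):
+#   steps/make_fmmi_feats.sh --nj 10 --cmd run.pl --transform-dir exp/tri5a_ali \
+#     data-fmmi/train data/train exp/tri5a_fmmi_b0.1 data-fmmi/train/_log data-fmmi/train/_data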
+iter=final +nj=4 +cmd=run.pl +ngselect=2; # Just use the 2 top Gaussians for fMMI/fMPE. Should match train. +transform_dir= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 5 ]; then + echo "Usage: $0 [options] " + echo "e.g.: $0 data-fmmi/train data/train exp/tri5a_fmmi_b0.1 data-fmmi/train/_log data-fmmi/train/_data " + echo "" + echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out" + echo "what type of features you used (assuming it's one of these two)" + echo "You can also use fMLLR features-- you have to supply --transform-dir option." + echo "" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --iter # Iteration of model to test." + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --transform-dir # where to find fMLLR transforms." + exit 1; +fi + + +data=$1 +srcdata=$2 +gmmdir=$3 +logdir=$4 +feadir=$5 + + + +#srcdir=$1 -> gmmdir +#data=$2 -> srcdata +#dir=$3 -> ruzne +#tgtdata=$4 -> feadir + +sdata=$srcdata/split$nj; +splice_opts=`cat $gmmdir/splice_opts 2>/dev/null` + +mkdir -p $data $logdir $feadir +[[ -d $sdata && $srcdata/feats.scp -ot $sdata ]] || split_data.sh $srcdata $nj || exit 1; + +for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $gmmdir/$iter.fmpe; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +if [ -f $gmmdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type"; + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $gmmdir/final.mat ark:- ark:- |";; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +if [ ! -z "$transform_dir" ]; then # add transforms to features... + echo "Using fMLLR transforms from $transform_dir" + [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." + [ "`cat $transform_dir/num_jobs`" -ne $nj ] && \ + echo "Mismatch in number of jobs with $transform_dir"; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" +fi + +# Get Gaussian selection info. +$cmd JOB=1:$nj $logdir/gselect.JOB.log \ + gmm-gselect --n=$ngselect $gmmdir/$iter.fmpe "$feats" \ + "ark:|gzip -c >$feadir/gselect.JOB.gz" || exit 1; + +#prepare the dir +cp $srcdata/* $data; rm $data/{feats.scp,cmvn.scp}; + +# make $bnfeadir an absolute pathname. +feadir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $feadir ${PWD}` + +#forward the feats +$cmd JOB=1:$nj $logdir/make_fmmi_feats.JOB.log \ + fmpe-apply-transform $gmmdir/$iter.fmpe "$feats" "ark,s,cs:gunzip -c $feadir/gselect.JOB.gz|" \ + ark,scp:$feadir/feats_fmmi.JOB.ark,$feadir/feats_fmmi.JOB.scp || exit 1; + +#merge the feats to single SCP +for n in $(seq 1 $nj); do + cat $feadir/feats_fmmi.$n.scp +done > $data/feats.scp + +echo "$0 finished... 
$srcdata -> $data ($gmmdir)" + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/make_index.sh b/egs/chime_wsj0/s5/steps/make_index.sh new file mode 100755 index 000000000..4eef666ad --- /dev/null +++ b/egs/chime_wsj0/s5/steps/make_index.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen) +# Apache 2.0 + +# Begin configuration section. +model= # You can specify the model to use +cmd=run.pl +acwt=0.083333 +lmwt=1.0 +max_silence_frames=50 +max_states=1000000 +max_expand=20 # limit memory blowup in lattice-align-words +strict=true +word_ins_penalty=0 +silence_word= # Specify this only if you did so in kws_setup +skip_optimization=false # If you only search for few thousands of keywords, you probablly + # can skip the optimization; but if you're going to search for + # millions of keywords, you'd better do set this optimization to + # false and do the optimization on the final index. +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: steps/make_index.sh [options] " + echo "... where is where you have the lattices, and is assumed to be" + echo " a sub-directory of the directory where the model is." + echo "e.g.: steps/make_index.sh data/kws data/lang exp/sgmm2_5a_mmi/decode/ exp/sgmm2_5a_mmi/decode/kws/" + echo "" + echo "main options (for others, see top of script file)" + echo " --acwt # acoustic scale used for lattice" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --lmwt # lm scale used for lattice" + echo " --model # which model to use" + echo " # speaker-adapted decoding" + echo " --max-silence-frames # maximum #frames for silence" + exit 1; +fi + + +kwsdatadir=$1; +langdir=$2; +decodedir=$3; +kwsdir=$4; +srcdir=`dirname $decodedir`; # The model directory is one level up from decoding directory. + +mkdir -p $kwsdir/log; +nj=`cat $decodedir/num_jobs` || exit 1; +echo $nj > $kwsdir/num_jobs; +word_boundary=$langdir/phones/word_boundary.int +utter_id=$kwsdatadir/utter_id + +if [ -z "$model" ]; then # if --model was not specified on the command line... + model=$srcdir/final.mdl; +fi + +for f in $word_boundary $model $decodedir/lat.1.gz; do + [ ! -f $f ] && echo "make_index.sh: no such file $f" && exit 1; +done + +echo "Using model: $model" + +if [ ! 
-z $silence_word ]; then + silence_int=`grep -w $silence_word $langdir/words.txt | awk '{print $2}'` + [ -z $silence_int ] && \ + echo "Error: could not find integer representation of silence word $silence_word" && exit 1; + silence_opt="--silence-label=$silence_int" +fi + +$cmd JOB=1:$nj $kwsdir/log/index.JOB.log \ + lattice-add-penalty --word-ins-penalty=$word_ins_penalty "ark:gzip -cdf $decodedir/lat.JOB.gz|" ark:- \| \ + lattice-align-words $silence_opt --max-expand=$max_expand $word_boundary $model ark:- ark:- \| \ + lattice-scale --acoustic-scale=$acwt --lm-scale=$lmwt ark:- ark:- \| \ + lattice-to-kws-index --max-silence-frames=$max_silence_frames --strict=$strict ark:$utter_id ark:- ark:- \| \ + kws-index-union --skip-optimization=$skip_optimization --strict=$strict --max-states=$max_states \ + ark:- "ark:|gzip -c > $kwsdir/index.JOB.gz" + + +exit 0; diff --git a/egs/chime_wsj0/s5/steps/make_mfcc.sh b/egs/chime_wsj0/s5/steps/make_mfcc.sh new file mode 100755 index 000000000..3ca06c50e --- /dev/null +++ b/egs/chime_wsj0/s5/steps/make_mfcc.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 +# To be run from .. (one directory up from here) +# see ../run.sh for example + +# Begin configuration section. +nj=4 +cmd=run.pl +mfcc_config=conf/mfcc.conf +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "usage: make_mfcc.sh [options] "; + echo "options: " + echo " --mfcc-config # config passed to compute-mfcc-feats " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +logdir=$2 +mfccdir=$3 + + +# make $mfccdir an absolute pathname. +mfccdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $mfccdir ${PWD}` + +# use "name" as part of name of the archive. +name=`basename $data` + +mkdir -p $mfccdir || exit 1; +mkdir -p $logdir || exit 1; + +scp=$data/wav.scp + +required="$scp $mfcc_config" + +for f in $required; do + if [ ! -f $f ]; then + echo "make_mfcc.sh: no such file $f" + exit 1; + fi +done + +# note: in general, the double-parenthesis construct in bash "((" is "C-style +# syntax" where we can get rid of the $ for variable names, and omit spaces. +# The "for" loop in this style is a special construct. + + +if [ -f $data/segments ]; then + echo "$0 [info]: segments file exists: using that." + split_segments="" + for ((n=1; n<=nj; n++)); do + split_segments="$split_segments $logdir/segments.$n" + done + + utils/split_scp.pl $data/segments $split_segments || exit 1; + rm $logdir/.error 2>/dev/null + + $cmd JOB=1:$nj $logdir/make_mfcc.JOB.log \ + extract-segments scp:$scp $logdir/segments.JOB ark:- \| \ + compute-mfcc-feats --verbose=2 --config=$mfcc_config ark:- \ + ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \ + || exit 1; + +else + echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance." 
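+  # A per-utterance wav.scp has one line per utterance id, e.g. (illustrative):
+  #   011c0201 /export/corpora/wsj0/si_tr_s/011/011c0201.wav
+  # utils/split_scp.pl below just divides those lines into $nj roughly equal
+  # lists, one per parallel compute-mfcc-feats job.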
+ split_scps="" + for ((n=1; n<=nj; n++)); do + split_scps="$split_scps $logdir/wav.$n.scp" + done + + utils/split_scp.pl $scp $split_scps || exit 1; + + $cmd JOB=1:$nj $logdir/make_mfcc.JOB.log \ + compute-mfcc-feats --verbose=2 --config=$mfcc_config scp:$logdir/wav.JOB.scp \ + ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \ + || exit 1; + +fi + + +if [ -f $logdir/.error.$name ]; then + echo "Error producing mfcc features for $name:" + tail $logdir/make_mfcc.*.log + exit 1; +fi + +# concatenate the .scp files together. +for ((n=1; n<=nj; n++)); do + cat $mfccdir/raw_mfcc_$name.$n.scp || exit 1; +done > $data/feats.scp + +rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null + +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` +if [ $nf -ne $nu ]; then + echo "It seems not all of the feature files were successfully processed ($nf != $nu);" + echo "consider using utils/fix_data_dir.sh $data" +fi + +echo "Succeeded creating MFCC features for $name" diff --git a/egs/chime_wsj0/s5/steps/make_plp.sh b/egs/chime_wsj0/s5/steps/make_plp.sh new file mode 100755 index 000000000..0e543817b --- /dev/null +++ b/egs/chime_wsj0/s5/steps/make_plp.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 +# To be run from .. (one directory up from here) +# see ../run.sh for example + +# Begin configuration section. +nj=4 +cmd=run.pl +plp_config=conf/plp.conf +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "usage: make_plp.sh [options] "; + echo "options: " + echo " --plp-config # config passed to compute-plp-feats " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +logdir=$2 +plpdir=$3 + + +# make $plpdir an absolute pathname. +plpdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $plpdir ${PWD}` + +# use "name" as part of name of the archive. +name=`basename $data` + +mkdir -p $plpdir || exit 1; +mkdir -p $logdir || exit 1; + +scp=$data/wav.scp + +required="$scp $plp_config" + +for f in $required; do + if [ ! -f $f ]; then + echo "make_plp.sh: no such file $f" + exit 1; + fi +done + +# note: in general, the double-parenthesis construct in bash "((" is "C-style +# syntax" where we can get rid of the $ for variable names, and omit spaces. +# The "for" loop in this style is a special construct. + + +if [ -f $data/segments ]; then + echo "$0 [info]: segments file exists: using that." + split_segments="" + for ((n=1; n<=nj; n++)); do + split_segments="$split_segments $logdir/segments.$n" + done + + utils/split_scp.pl $data/segments $split_segments || exit 1; + rm $logdir/.error 2>/dev/null + + $cmd JOB=1:$nj $logdir/make_plp.JOB.log \ + extract-segments scp:$scp $logdir/segments.JOB ark:- \| \ + compute-plp-feats --verbose=2 --config=$plp_config ark:- \ + ark,scp:$plpdir/raw_plp_$name.JOB.ark,$plpdir/raw_plp_$name.JOB.scp \ + || exit 1; + +else + echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance." 
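+  # conf/plp.conf is passed to compute-plp-feats via --config below; for the
+  # 16 kHz WSJ audio a minimal config might contain just (illustrative):
+  #   --sample-frequency=16000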
+ split_scps="" + for ((n=1; n<=nj; n++)); do + split_scps="$split_scps $logdir/wav.$n.scp" + done + + utils/split_scp.pl $scp $split_scps || exit 1; + + $cmd JOB=1:$nj $logdir/make_plp.JOB.log \ + compute-plp-feats --verbose=2 --config=$plp_config scp:$logdir/wav.JOB.scp \ + ark,scp:$plpdir/raw_plp_$name.JOB.ark,$plpdir/raw_plp_$name.JOB.scp \ + || exit 1; + +fi + + +if [ -f $logdir/.error.$name ]; then + echo "Error producing plp features for $name:" + tail $logdir/make_plp.*.log + exit 1; +fi + +# concatenate the .scp files together. +for ((n=1; n<=nj; n++)); do + cat $plpdir/raw_plp_$name.$n.scp || exit 1; +done > $data/feats.scp + +rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null + +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` +if [ $nf -ne $nu ]; then + echo "It seems not all of the feature files were successfully ($nf != $nu);" + echo "consider using utils/fix_data_dir.sh $data" +fi + +echo "Succeeded creating PLP features for $name" diff --git a/egs/chime_wsj0/s5/steps/mixup.sh b/egs/chime_wsj0/s5/steps/mixup.sh new file mode 100755 index 000000000..6a74eb88d --- /dev/null +++ b/egs/chime_wsj0/s5/steps/mixup.sh @@ -0,0 +1,148 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# mix up (or down); do 3 iters of model training; realign; then do two more +# iterations of model training. + +# Begin configuration section. +cmd=run.pl +beam=10 +retry_beam=40 +boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +num_iters=5 +realign_iters=3 # Space-separated list of iterations to realign on. +stage=0 +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh; +. parse_options.sh || exit 1; + +if [ $# != 5 ]; then + echo "Usage: steps/mixup.sh " + echo " e.g.: steps/mixup.sh 20000 data/train_si84 data/lang exp/tri3b exp/tri3b_20k" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --config # config containing options" + echo " --stage # stage to do partial re-run from." + exit 1; +fi + +numgauss=$1 +data=$2 +lang=$3 +srcdir=$4 +dir=$5 + +for f in $data/feats.scp $srcdir/final.mdl $srcdir/final.mat; do + [ ! -f $f ] && echo "mixup_lda_etc.sh: no such file $f" && exit 1; +done + +nj=`cat $srcdir/num_jobs` || exit 1; +sdata=$data/split$nj; + +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + +mkdir -p $dir/log +cp $srcdir/splice_opts $dir 2>/dev/null +cp $srcdir/final.mat $dir +echo $nj > $dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/tree $dir + + +## Set up features. 
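+# The case statement below composes a single rspecifier string that computes the
+# features on the fly; for the lda case it expands (per job) to roughly:
+#   ark,s,cs:apply-cmvn ... scp:$sdata/JOB/feats.scp ark:- | splice-feats ... | \
+#     transform-feats $srcdir/final.mat ark:- ark:- |
+# If fMLLR transforms (trans.*) exist in $srcdir, a further transform-feats stage
+# is appended to give the speaker-adapted features in $feats.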
+if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac +if [ -f $srcdir/trans.1 ]; then + echo Using transforms from $srcdir; + rm $dir/trans.* 2>/dev/null + ln.pl $srcdir/trans.* $dir # Link those transforms to current directory. + feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" +else + feats="$sifeats" +fi +## Done setting up features. + +rm $dir/fsts.*.gz 2>/dev/null +ln.pl $srcdir/fsts.*.gz $dir # Link training-graph FSTs to current directory. + +## Mix up old model +if [ $stage -le 0 ]; then + echo Mixing up old model to $numgauss Gaussians +# Note: this script also works for mixing down. + $cmd $dir/log/mixup.log \ + gmm-mixup --mix-up=$numgauss --mix-down=$numgauss \ + $srcdir/final.mdl $srcdir/final.occs $dir/1.mdl || exit 1; +fi +## Done. + +cur_alidir=$srcdir # dir to find alignments. +[ -z "$realign_iters" ] && ln.pl $srcdir/ali.*.gz $dir; # link alignments, if + # we won't be generating them. + +x=1 +while [ $x -le $num_iters ]; do + echo "$0: iteration $x" + if echo $realign_iters | grep -w $x >/dev/null; then + if [ $stage -le $x ]; then + echo "$0: realigning data" + mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |" + $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \ + gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 "$mdl" \ + "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \ + "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; + fi + cur_alidir=$dir + fi + if [ $stage -le $x ]; then + echo "$0: accumulating statistics" + $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \ + gmm-acc-stats-ali $dir/$x.mdl "$feats" \ + "ark,s,cs:gunzip -c $cur_alidir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1; + echo "$0: re-estimating model" + [ "`ls $dir/$x.*.acc | wc -w`" -ne $nj ] && echo "$0: wrong #accs" && exit 1; + $cmd $dir/log/update.$x.log \ + gmm-est --write-occs=$dir/$[$x+1].occs $dir/$x.mdl \ + "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1; + rm $dir/$x.mdl $dir/$x.*.acc + rm $dir/$x.occs 2>/dev/null + fi + x=$[$x+1] +done + +rm $dir/final.mdl $dir/final.occs 2>/dev/null +ln -s $x.mdl $dir/final.mdl +ln -s $x.occs $dir/final.occs + +if [ -f $dir/trans.1 ]; then + echo "$0: accumulating stats for alignment model." + $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \ + ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ + gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \ + ark,s,cs:- $dir/$x.JOB.acc || exit 1; + [ "`ls $dir/$x.*.acc | wc -w`" -ne $nj ] && echo "$0: wrong #accs" && exit 1; + echo "$0: Re-estimating alignment model." 
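+  # gmm-acc-stats-twofeats above uses the speaker-adapted features ($feats) to
+  # compute per-Gaussian posteriors but accumulates the statistics on the
+  # speaker-independent features ($sifeats), so the resulting final.alimdl can
+  # align or decode test data before any fMLLR transforms have been estimated.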
+ $cmd $dir/log/est_alimdl.log \ + gmm-est --write-occs=$dir/final.occs --remove-low-count-gaussians=false $dir/$x.mdl \ + "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl || exit 1; + rm $dir/$x.*.acc + rm $dir/final.alimdl 2>/dev/null + ln -s $x.alimdl $dir/final.alimdl +fi + +utils/summarize_warnings.pl $dir/log + +echo Done diff --git a/egs/chime_wsj0/s5/steps/nnet2/align.sh b/egs/chime_wsj0/s5/steps/nnet2/align.sh new file mode 100755 index 000000000..c7a395981 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/nnet2/align.sh @@ -0,0 +1,104 @@ +#!/bin/bash +# Copyright 2012 Brno University of Technology (Author: Karel Vesely) +# 2013 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Computes training alignments using MLP model + +# If you supply the "--use-graphs true" option, it will use the training +# graphs from the source directory (where the model is). In this +# case the number of jobs must match with the source directory. + + +# Begin configuration section. +nj=4 +cmd=run.pl +# Begin configuration. +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +beam=10 +retry_beam=40 +transform_dir= +iter=final +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: $0 [--transform-dir ] " + echo "e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_ali" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +mkdir -p $dir/log +echo $nj > $dir/num_jobs +sdata=$data/split$nj +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +for f in $srcdir/tree $srcdir/${iter}.mdl $data/feats.scp $lang/L.fst; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +cp $srcdir/{tree,${iter}.mdl} $dir || exit 1; + + +## Set up features. Note: these are different from the normal features +## because we have one rspecifier that has the features for the entire +## training set, not separate ones for each batch. +if [ -z "$feat_type" ]; then + if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=raw; fi +fi +echo "$0: feature type is $feat_type" + +case $feat_type in + raw) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" + ;; + lda) + splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + cp $srcdir/splice_opts $dir 2>/dev/null + cp $srcdir/final.mat $dir || exit 1; + feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp $sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then + if ! [ $nj -eq `cat $transform_dir/num_jobs` ]; then + echo "$0: Number of jobs mismatch with transform-dir: $nj versus `cat $transform_dir/num_jobs`"; + exit 1; + fi + if [ $feat_type == "lda" ]; then + [ ! 
-f $transform_dir/raw_trans.1 ] && echo "No such file $transform_dir/raw_trans.1" && exit 1; + echo "$0: using transforms from $transform_dir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" + fi + if [ $feat_type == "raw" ]; then + [ ! -f $transform_dir/raw_trans.1 ] && echo "No such file $transform_dir/raw_trans.1" && exit 1; + echo "$0: using raw-fMLLR transforms from $transform_dir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |" + fi +fi + + +echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir" + +tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + +$cmd JOB=1:$nj $dir/log/align.JOB.log \ + compile-train-graphs $dir/tree $srcdir/${iter}.mdl $lang/L.fst "$tra" ark:- \| \ + nnet-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $srcdir/${iter}.mdl \ + ark:- "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; + +echo "$0: done aligning data." + diff --git a/egs/chime_wsj0/s5/steps/nnet2/decode.sh b/egs/chime_wsj0/s5/steps/nnet2/decode.sh new file mode 100755 index 000000000..fdf6c4109 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/nnet2/decode.sh @@ -0,0 +1,131 @@ +#!/bin/bash + +# Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does decoding with a neural-net. If the neural net was built on +# top of fMLLR transforms from a conventional system, you should provide the +# --transform-dir option. + +# Begin configuration section. +stage=1 +transform_dir= # dir to find fMLLR transforms. +nj=4 # number of decoding jobs. If --transform-dir set, must match that number! +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +cmd=run.pl +beam=15.0 +max_active=7000 +lat_beam=8.0 # Beam we use in lattice generation. +iter=final +num_threads=1 # if >1, will use gmm-latgen-faster-parallel +parallel_opts= # If you supply num-threads, you should supply this too. +scoring_opts= +skip_scoring=false +feat_type= +spk_vecs_dir= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " + echo " e.g.: $0 --transform-dir exp/tri3b/decode_dev93_tgpr \\" + echo " exp/tri3b/graph_tgpr data/test_dev93 exp/tri4a_nnet/decode_dev93_tgpr" + echo "main options (for others, see top of script file)" + echo " --transform-dir # directory of previous decoding" + echo " # where we can find transforms for SAT systems." + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --parallel-opts # e.g. '-pe smp 4' if you supply --num-threads 4" + exit 1; +fi + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. +model=$srcdir/$iter.mdl + +for f in $graphdir/HCLG.fst $data/feats.scp $model; do + [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + + +## Set up features. +if [ -z "$feat_type" ]; then + if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=raw; fi + echo "$0: feature type is $feat_type" +fi + +case $feat_type in + raw) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";; + lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + if [ "$feat_type" == "lda" ]; then + [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; + [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ + && echo "$0: #jobs mismatch with transform-dir." && exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" + else + [ ! -f $transform_dir/raw_trans.1 ] && echo "$0: no such file $transform_dir/raw_trans.1" && exit 1; + [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ + && echo "$0: #jobs mismatch with transform-dir." && exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/raw_trans.JOB ark:- ark:- |" + fi +elif grep 'transform-feats --utt2spk' $srcdir/log/train.1.log >&/dev/null; then + echo "$0: **WARNING**: you seem to be using a neural net system trained with transforms," + echo " but you are not providing the --transform-dir option in test time." +fi +## + +if [ ! -z $spk_vecs_dir ]; then + [ ! -f $spk_vecs_dir/vecs.1 ] && echo "No such file $spk_vecs_dir/vecs.1" && exit 1; + spk_vecs_opt=("--spk-vecs=ark:cat $spk_vecs_dir/vecs.*|" "--utt2spk=ark:$data/utt2spk") +else + spk_vecs_opt=() +fi + +if [ $stage -le 1 ]; then + $cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \ + nnet-latgen-faster$thread_string "${spk_vecs_opt[@]}" --max-active=$max_active --beam=$beam \ + --lattice-beam=$lat_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt "$model" \ + $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; +fi + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. + + +if [ $stage -le 2 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/egs/chime_wsj0/s5/steps/nnet2/get_egs.sh b/egs/chime_wsj0/s5/steps/nnet2/get_egs.sh new file mode 100755 index 000000000..9b52067d0 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/nnet2/get_egs.sh @@ -0,0 +1,276 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. 
+# This script, which will generally be called from other neural-net training +# scripts, extracts the training examples used to train the neural net (and also +# the validation examples used for diagnostics), and puts them in separate archives. + +# Begin configuration section. +cmd=run.pl +feat_type= +num_utts_subset=300 # number of utterances in validation and training + # subsets used for shrinkage and diagnostics +hidden_layer_dim=300 +within_class_factor=0.0001 +num_valid_frames_combine=0 # #valid frames for combination weights at the very end. +num_train_frames_combine=10000 # # train frames for the above. +num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs +samples_per_iter=400000 # each iteration of training, see this many samples + # per job. This is just a guideline; it will pick a number + # that divides the number of samples in the entire data. +transform_dir= # If supplied, overrides alidir +num_jobs_nnet=16 # Number of neural net jobs to run in parallel +stage=0 +io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. +splice_width=4 # meaning +- 4 frames on each side for second LDA +spk_vecs_dir= +random_copy=false + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 4 ]; then + echo "Usage: steps/nnet2/get_egs.sh [opts] " + echo " e.g.: steps/nnet2/get_egs.sh data/train data/lang exp/tri3_ali exp/tri4_nnet" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-jobs-nnet # Number of parallel jobs to use for main neural net" + echo " # training (will affect results as well as speed; try 8, 16)" + echo " # Note: if you increase this, you may want to also increase" + echo " # the learning rate." + echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" + echo " # process." + echo " --splice-width # Number of frames on each side to append for feature input" + echo " # (note: we splice processed, typically 40-dimensional frames" + echo " --num-frames-diagnostic <#frames|4000> # Number of frames used in computing (train,valid) diagnostics" + echo " --num-valid-frames-combine <#frames|10000> # Number of frames used in getting combination weights at the" + echo " # very end." + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + exit 1; +fi + +data=$1 +lang=$2 +alidir=$3 +dir=$4 + +# Check some files. +for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + + +# Set some variables. +oov=`cat $lang/oov.int` +num_leaves=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` || exit 1; + +nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir... +# in this dir we'll have just one job. +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +cp $alidir/tree $dir + + +# Get list of validation utterances. 
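+# (num_utts_subset utterances, 300 by default, are held out at random as a
+# validation set; a further disjoint subset of the same size is taken below via
+# "utils/filter_scp.pl --exclude" as a training subset used only for the
+# compute_prob diagnostics and for the final combination weights.)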
+awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ + > $dir/valid_uttlist || exit 1; + +if [ -f $data/utt2uniq ]; then + echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" + echo "include all perturbed versions of the same 'real' utterances." + mv $dir/valid_uttlist $dir/valid_uttlist.tmp + utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt + cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist + rm $dir/uniq2utt $dir/valid_uttlist.tmp +fi + +awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; + +[ -z "$transform_dir" ] && transform_dir=$alidir + +## Set up features. Note: these are different from the normal features +## because we have one rspecifier that has the features for the entire +## training set, not separate ones for each batch. +if [ -z $feat_type ]; then + if [ -f $alidir/final.mat ] && [ ! -f $transform_dir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi +fi +echo "$0: feature type is $feat_type" + +case $feat_type in + raw) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + ;; + lda) + splice_opts=`cat $alidir/splice_opts 2>/dev/null` + cp $alidir/splice_opts $dir 2>/dev/null + cp $alidir/final.mat $dir + feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac + +if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then + echo "$0: using transforms from $transform_dir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/trans.*|' ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/trans.*|' ark:- ark:- |" +fi +if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then + echo "$0: using raw-fMLLR transforms from $transform_dir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |" + 
valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/raw_trans.*|' ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/raw_trans.*|' ark:- ark:- |" +fi + +if [ $stage -le 0 ]; then + echo "$0: working out number of frames of training data" + num_frames=`feat-to-len scp:$data/feats.scp ark,t:- | awk '{x += $2;} END{print x;}'` || exit 1; + echo $num_frames > $dir/num_frames +else + num_frames=`cat $dir/num_frames` || exit 1; +fi + +# Working out number of iterations per epoch. +iters_per_epoch=`perl -e "print int($num_frames/($samples_per_iter * $num_jobs_nnet) + 0.5);"` || exit 1; +[ $iters_per_epoch -eq 0 ] && iters_per_epoch=1 +samples_per_iter_real=$[$num_frames/($num_jobs_nnet*$iters_per_epoch)] +echo "$0: Every epoch, splitting the data up into $iters_per_epoch iterations," +echo "$0: giving samples-per-iteration of $samples_per_iter_real (you requested $samples_per_iter)." + + +## If --est-lda=true, o LDA on top of whatever features we already have; store +## the matrix which we'll put into the neural network as a constant. + +feat_dim=`feat-to-dim "$train_subset_feats" -` || exit 1; +lda_dim=$[$feat_dim*(1+2*($splice_width))]; # No dim reduction. + +nnet_context_opts="--left-context=$splice_width --right-context=$splice_width" +mkdir -p $dir/egs + +if [ ! -z $spk_vecs_dir ]; then + [ ! -f $spk_vecs_dir/vecs.1 ] && echo "No such file $spk_vecs_dir/vecs.1" && exit 1; + spk_vecs_opt=("--spk-vecs=ark:cat $spk_vecs_dir/vecs.*|" "--utt2spk=ark:$data/utt2spk") +else + spk_vecs_opt=() +fi + +if [ $stage -le 2 ]; then + echo "Getting validation and training subset examples." + rm $dir/.error 2>/dev/null + $cmd $dir/log/create_valid_subset.log \ + nnet-get-egs $nnet_context_opts "${spk_vecs_opt[@]}" "$valid_feats" \ + "ark,cs:gunzip -c $alidir/ali.*.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \ + "ark:$dir/egs/valid_all.egs" || touch $dir/.error & + $cmd $dir/log/create_train_subset.log \ + nnet-get-egs $nnet_context_opts "${spk_vecs_opt[@]}" "$train_subset_feats" \ + "ark,cs:gunzip -c $alidir/ali.*.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \ + "ark:$dir/egs/train_subset_all.egs" || touch $dir/.error & + wait; + [ -f $dir/.error ] && exit 1; + echo "Getting subsets of validation examples for diagnostics and combination." + $cmd $dir/log/create_valid_subset_combine.log \ + nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/egs/valid_all.egs \ + ark:$dir/egs/valid_combine.egs || touch $dir/.error & + $cmd $dir/log/create_valid_subset_diagnostic.log \ + nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/egs/valid_all.egs \ + ark:$dir/egs/valid_diagnostic.egs || touch $dir/.error & + + $cmd $dir/log/create_train_subset_combine.log \ + nnet-subset-egs --n=$num_train_frames_combine ark:$dir/egs/train_subset_all.egs \ + ark:$dir/egs/train_combine.egs || touch $dir/.error & + $cmd $dir/log/create_train_subset_diagnostic.log \ + nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/egs/train_subset_all.egs \ + ark:$dir/egs/train_diagnostic.egs || touch $dir/.error & + wait + cat $dir/egs/valid_combine.egs $dir/egs/train_combine.egs > $dir/egs/combine.egs + + for f in $dir/egs/{combine,train_diagnostic,valid_diagnostic}.egs; do + [ ! 
-s $f ] && echo "No examples in file $f" && exit 1; + done + rm $dir/egs/valid_all.egs $dir/egs/train_subset_all.egs $dir/egs/{train,valid}_combine.egs +fi + +if [ $stage -le 3 ]; then + mkdir -p $dir/temp + + # Other scripts might need to know the following info: + echo $num_jobs_nnet >$dir/egs/num_jobs_nnet + echo $iters_per_epoch >$dir/egs/iters_per_epoch + echo $samples_per_iter_real >$dir/egs/samples_per_iter + + echo "Creating training examples"; + # in $dir/egs, create $num_jobs_nnet separate files with training examples. + # The order is not randomized at this point. + + egs_list= + for n in `seq 1 $num_jobs_nnet`; do + egs_list="$egs_list ark:$dir/egs/egs_orig.$n.JOB.ark" + done + echo "Generating training examples on disk" + # The examples will go round-robin to egs_list. + $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \ + nnet-get-egs $nnet_context_opts "${spk_vecs_opt[@]}" "$feats" \ + "ark,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \ + nnet-copy-egs ark:- $egs_list || exit 1; +fi + +if [ $stage -le 4 ]; then + # combine all the "egs_orig.JOB.*.scp" (over the $nj splits of the data) and + # then split into multiple parts egs.JOB.*.scp for different parts of the + # data, 0 .. $iters_per_epoch-1. + + if [ $iters_per_epoch -eq 1 ]; then + echo "Since iters-per-epoch == 1, just concatenating the data." + for n in `seq 1 $num_jobs_nnet`; do + cat $dir/egs/egs_orig.$n.*.ark > $dir/egs/egs_tmp.$n.0.ark || exit 1; + rm $dir/egs/egs_orig.$n.*.ark # don't "|| exit 1", due to NFS bugs... + done + else # We'll have to split it up using nnet-copy-egs. + egs_list= + for n in `seq 0 $[$iters_per_epoch-1]`; do + egs_list="$egs_list ark:$dir/egs/egs_tmp.JOB.$n.ark" + done + # note, the "|| true" below is a workaround for NFS bugs + # we encountered running this script with Debian-7, NFS-v4. + $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/split_egs.JOB.log \ + nnet-copy-egs --random=$random_copy --srand=JOB \ + "ark:cat $dir/egs/egs_orig.JOB.*.ark|" $egs_list '&&' \ + '(' rm $dir/egs/egs_orig.JOB.*.ark '||' true ')' || exit 1; + fi +fi + +if [ $stage -le 5 ]; then + # Next, shuffle the order of the examples in each of those files. + # Each one should not be too large, so we can do this in memory. + echo "Shuffling the order of training examples" + echo "(in order to avoid stressing the disk, these won't all run at once)." + + + # note, the "|| true" below is a workaround for NFS bugs + # we encountered running this script with Debian-7, NFS-v4. + for n in `seq 0 $[$iters_per_epoch-1]`; do + $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$n.JOB.log \ + nnet-shuffle-egs "--srand=\$[JOB+($num_jobs_nnet*$n)]" \ + ark:$dir/egs/egs_tmp.JOB.$n.ark ark:$dir/egs/egs.JOB.$n.ark '&&' \ + '(' rm $dir/egs/egs_tmp.JOB.$n.ark '||' true ')' || exit 1; + done +fi + +echo "$0: Finished preparing training examples" diff --git a/egs/chime_wsj0/s5/steps/nnet2/get_lda.sh b/egs/chime_wsj0/s5/steps/nnet2/get_lda.sh new file mode 100755 index 000000000..6c261c423 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/nnet2/get_lda.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# This script, which will generally be called from other neural-net training +# scripts, extracts the training examples used to train the neural net (and also +# the validation examples used for diagnostics), and puts them in separate archives. + +# Begin configuration section. 
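The net effect of stages 3 to 5 above is a three-level renaming of the example archives under $dir/egs. A sketch that simply prints the expected file names, with small assumed values for the three counts involved (the real run uses nj alignment splits, num_jobs_nnet training jobs and iters_per_epoch per-epoch splits):

  nj=4                # assumed: number of alignment-directory data splits
  num_jobs_nnet=2     # assumed: parallel neural-net training jobs
  iters_per_epoch=3   # assumed: splits of each job's data within an epoch
  # stage 3: one archive per (training job n, data split j), filled round-robin
  for n in $(seq 1 $num_jobs_nnet); do for j in $(seq 1 $nj); do echo "egs_orig.$n.$j.ark"; done; done
  # stage 4: concatenated over the data splits, then re-split per iteration of an epoch
  for n in $(seq 1 $num_jobs_nnet); do for i in $(seq 0 $[iters_per_epoch-1]); do echo "egs_tmp.$n.$i.ark"; done; done
  # stage 5: same layout, but with the frame order shuffled within each archive
  for n in $(seq 1 $num_jobs_nnet); do for i in $(seq 0 $[iters_per_epoch-1]); do echo "egs.$n.$i.ark"; done; done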
+cmd=run.pl + +feat_type= +stage=0 +splice_width=4 # meaning +- 4 frames on each side for second LDA +rand_prune=4.0 # Relates to a speedup we do for LDA. +within_class_factor=0.0001 # This affects the scaling of the transform rows... + # sorry for no explanation, you'll have to see the code. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 4 ]; then + echo "Usage: steps/nnet2/get_lda.sh [opts] " + echo " e.g.: steps/nnet2/get_lda.sh data/train data/lang exp/tri3_ali exp/tri4_nnet" + echo " As well as extracting the examples, this script will also do the LDA computation," + echo " if --est-lda=true (default:true)" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --splice-width # Number of frames on each side to append for feature input" + echo " # (note: we splice processed, typically 40-dimensional frames" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + exit 1; +fi + +data=$1 +lang=$2 +alidir=$3 +dir=$4 + +# Check some files. +for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + + +# Set some variables. +oov=`cat $lang/oov.int` +num_leaves=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` || exit 1; + +nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir... +# in this dir we'll have just one job. +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +cp $alidir/tree $dir + +## Set up features. Note: these are different from the normal features +## because we have one rspecifier that has the features for the entire +## training set, not separate ones for each batch. +if [ -z $feat_type ]; then + if [ -f $alidir/final.mat ] && ! 
[ -f $alidir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi +fi +echo "$0: feature type is $feat_type" + +case $feat_type in + raw) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + ;; + lda) + splice_opts=`cat $alidir/splice_opts 2>/dev/null` + cp $alidir/splice_opts $dir 2>/dev/null + cp $alidir/final.mat $dir + feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac +if [ -f $alidir/trans.1 ] && [ $feat_type != "raw" ]; then + echo "$0: using transforms from $alidir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$alidir/trans.JOB ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $alidir/trans.*|' ark:- ark:- |" +fi +if [ -f $alidir/raw_trans.1 ] && [ $feat_type == "raw" ]; then + echo "$0: using raw-fMLLR transforms from $alidir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$alidir/raw_trans.JOB ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $alidir/raw_trans.*|' ark:- ark:- |" +fi + + +feat_dim=`feat-to-dim "$train_subset_feats" -` || exit 1; +lda_dim=$[$feat_dim*(1+2*($splice_width))]; # No dim reduction. + +if [ $stage -le 0 ]; then + echo "$0: Accumulating LDA statistics." + $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \ + ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \ + acc-lda --rand-prune=$rand_prune $alidir/final.mdl "$feats splice-feats --left-context=$splice_width --right-context=$splice_width ark:- ark:- |" ark,s,cs:- \ + $dir/lda.JOB.acc || exit 1; +fi + +echo $feat_dim > $dir/feat_dim +echo $lda_dim > $dir/lda_dim + +if [ $stage -le 1 ]; then + nnet-get-feature-transform --within-class-factor=$within_class_factor --dim=$lda_dim $dir/lda.mat $dir/lda.*.acc \ + 2>$dir/log/lda_est.log || exit 1; + rm $dir/lda.*.acc +fi + +echo "$0: Finished estimating LDA" diff --git a/egs/chime_wsj0/s5/steps/nnet2/get_lda_block.sh b/egs/chime_wsj0/s5/steps/nnet2/get_lda_block.sh new file mode 100755 index 000000000..e370be05b --- /dev/null +++ b/egs/chime_wsj0/s5/steps/nnet2/get_lda_block.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# This script, which will generally be called from other neural-net training +# scripts, extracts the training examples used to train the neural net (and also +# the validation examples used for diagnostics), and puts them in separate archives. + +# Begin configuration section. 
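As in get_egs.sh above, the "LDA" computed here does not reduce dimension: lda_dim is the full dimension of the spliced window, and nnet-get-feature-transform mainly decorrelates and rescales it (the within_class_factor comment above hints at the row scaling). A worked example of the dimension bookkeeping, assuming typical 40-dimensional processed features:

  feat_dim=40       # assumed dimension of one processed frame
  splice_width=4    # default above: +-4 frames of context
  context_length=$[1+2*$splice_width]          # 9 frames in the spliced window
  lda_dim=$[$feat_dim*(1+2*($splice_width))]   # 40 * 9 = 360, no reduction
  echo "window of $context_length frames -> lda_dim=$lda_dim"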
+cmd=run.pl + +stage=0 +splice_width=4 # meaning +- 4 frames on each side for second LDA +rand_prune=4.0 # Relates to a speedup we do for LDA. +within_class_factor=0.0001 # This affects the scaling of the transform rows... + # sorry for no explanation, you'll have to see the code. +block_size=10 +block_shift=5 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 4 ]; then + echo "Usage: steps/nnet2/get_lda_block.sh [opts] " + echo " e.g.: steps/nnet2/get_lda.sh data/train data/lang exp/tri3_ali exp/tri4_nnet" + echo " As well as extracting the examples, this script will also do the LDA computation," + echo " if --est-lda=true (default:true)" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --splice-width # Number of frames on each side to append for feature input" + echo " # (note: we splice processed, typically 40-dimensional frames" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + exit 1; +fi + +data=$1 +lang=$2 +alidir=$3 +dir=$4 + +# Check some files. +for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + + +# Set some variables. +oov=`cat $lang/oov.int` +num_leaves=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` || exit 1; + +nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir... +# in this dir we'll have just one job. +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +cp $alidir/tree $dir + +## Set up features. Note: these are different from the normal features +## because we have one rspecifier that has the features for the entire +## training set, not separate ones for each batch. + + +feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" +train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + +feat_dim=`feat-to-dim "$train_subset_feats" -` || exit 1; + +if [ $stage -le 0 ]; then + echo "$0: Accumulating LDA statistics." + $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \ + ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \ + acc-lda --rand-prune=$rand_prune $alidir/final.mdl "$feats splice-feats --left-context=$splice_width --right-context=$splice_width ark:- ark:- |" ark,s,cs:- \ + $dir/lda.JOB.acc || exit 1; +fi + +echo $feat_dim > $dir/feat_dim + +echo -n > $dir/indexes +# Get list of indexes, e.g. a file like: +# 0 1 2 3 4 5 6 7 8 9 +# 5 6 7 8 9 10 11 12 13 14 +# 10 ... 
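Each row of $dir/indexes describes one block: for every base feature dimension in that block it lists the dimension's position in each frame of the spliced window. The spliced vector is laid out frame after frame, so a base dimension recurs with a stride of feat_dim. A toy illustration of the seq call used below, assuming feat_dim=12 and splice_width=1 (a 3-frame window):

  feat_dim=12; splice_width=1      # assumed toy values
  context_length=$[1+2*($splice_width)]
  # positions of base dimension 0 in the spliced vector: 0 12 24
  seq 0 $feat_dim $[0+($feat_dim*($context_length-1))]
  # positions of base dimension 5 in the spliced vector: 5 17 29
  seq 5 $feat_dim $[5+($feat_dim*($context_length-1))]

The loop below strings block_size such runs together on one line and then advances the block start by block_shift, so neighbouring blocks overlap by block_size - block_shift base dimensions.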
+ +cur_index=0 +num_blocks=0 +context_length=$[1+2*($splice_width)] + +while [ $[$cur_index+$block_size] -lt $feat_dim ]; do + for n in `seq $cur_index $[cur_index+$block_size-1]`; do + echo -n `seq $n $feat_dim $[$n+($feat_dim*($context_length-1))]` '' >> $dir/indexes + done + echo >> $dir/indexes + num_blocks=$[$num_blocks+1] + cur_index=$[$cur_index+$block_shift] + if [ $[$cur_index+$block_size] -gt $feat_dim ]; then + cur_index=$[$feat_dim-$block_size]; + fi +done +echo $num_blocks >$dir/num_blocks + +lda_dim=`cat $dir/indexes | wc -w` +echo $lda_dim > $dir/lda_dim + +if [ $stage -le 1 ]; then + nnet-get-feature-transform-multi --within-class-factor=$within_class_factor $dir/indexes $dir/lda.*.acc $dir/lda.mat \ + 2>$dir/log/lda_est.log || exit 1; + rm $dir/lda.*.acc +fi + +echo "$0: Finished estimating LDA" diff --git a/egs/chime_wsj0/s5/steps/nnet2/get_perturbed_feats.sh b/egs/chime_wsj0/s5/steps/nnet2/get_perturbed_feats.sh new file mode 100755 index 000000000..0679da7b2 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/nnet2/get_perturbed_feats.sh @@ -0,0 +1,89 @@ +#!/bin/bash + + +# begin configuration section + +cmd="run.pl" +pairs="1.1-1.0 1.05-1.2 1.0-0.8 0.95-1.1 0.9-0.9" # Pairs of (VTLN warp factor, time-warp factor) +stage=0 +cleanup=true +feature_type=fbank +# end configuration section + +set -e +. utils/parse_options.sh + +if [ $# -ne 5 ]; then + echo "Usage: $0 [options] " + echo "e.g.: $0 mfcc conf/fbank_40.conf exp/perturbed_fbank_train data/train data/train_perturbed_fbank" + echo "Supported options: " + echo "--feature-type (fbank|mfcc|plp) # Type of features we are making" + echo "--cmd 'command-program' # Mechanism to run jobs, e.g. run.pl" + echo "--pairs # Pairs of (vtln-warp, time-warp) factors, " + echo " # default $pairs" + echo "--stage # Use for partial re-run" + echo "--cleanup (true|false) # If false, do not clean up temp files (default: true)" + exit 1; +fi + +base_config=$1 +featdir=$2 +dir=$3 # dir/log* will contain log-files +inputdata=$4 +data=$5 + +for f in $base_config $inputdata/wav.scp; do + if [ ! -f $f ]; then + echo "Expected file $f to exist" + exit 1; + fi +done + +if [ "$feature_type" != "fbank" ] && [ "$feature_type" != "mfcc" ] && \ + [ "$feature_type" != "plp" ]; then + echo "$0: Invalid option --feature-type=$feature_type" + exit 1; +fi + +mkdir -p $featdir +mkdir -p $dir/conf $dir/log + +all_feature_dirs="" + +for pair in $pairs; do + vtln_warp=`echo $pair | cut -d- -f1` + time_warp=`echo $pair | cut -d- -f2` + fs=`perl -e "print ($time_warp*10);"` + conf=$dir/conf/$pair.conf + this_dir=$dir/$pair + + ( cat $base_config; echo; echo "--frame-shift=$fs"; echo "--vtln-warp=$vtln_warp" ) > $conf + + echo "Making ${feature_type} features for VTLN-warp $vtln_warp and time-warp $time_warp" + + feature_data=${data}-$pair + all_feature_dirs="$all_feature_dirs $feature_data" + + utils/copy_data_dir.sh --spk-prefix ${pair}- --utt-prefix ${pair}- $inputdata $feature_data + steps/make_${feature_type}.sh --${feature_type}-config $conf --nj 8 --cmd "$cmd" $feature_data $this_dir $featdir + + steps/compute_cmvn_stats.sh $feature_data $this_dir $featdir +done + +utils/combine_data.sh $data $all_feature_dirs + + +# In the combined feature directory, create a file utt2uniq which maps +# our extended utterance-ids to "unique utterances". This enables the +# script steps/nnet2/get_egs.sh to hold out data in a more proper way. 
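The perl one-liner below reads utt2spk on stdin and strips whichever "<vtln-warp>-<time-warp>-" prefix copy_data_dir.sh added, so every perturbed copy maps back to its original utterance. The same idea as a self-contained sketch with hypothetical utterance-ids:

  pairs="1.1-1.0 0.9-0.9"   # a subset of the default pairs, for illustration
  printf '%s\n' 1.1-1.0-utt_a 0.9-0.9-utt_a 1.1-1.0-utt_b | while read x; do
    y=$x; for p in $pairs; do y=${y#${p}-}; done; echo "$x $y"
  done
  # prints:
  # 1.1-1.0-utt_a utt_a
  # 0.9-0.9-utt_a utt_a
  # 1.1-1.0-utt_b utt_b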
+cat $data/utt2spk | \ + perl -e ' while(){ @A=split; $x=shift @A; $y=$x; + foreach $pair (@ARGV) { $y =~ s/^${pair}-// && last; } print "$x $y\n"; } ' $pairs \ + > $data/utt2uniq + +if $cleanup; then + echo "$0: Cleaning up temporary directories for ${feature_type} features." + # Note, this just removes the .scp files and so on, not the data which is located in + # $featdir and which is still needed. + rm -r $all_feature_dirs +fi diff --git a/egs/chime_wsj0/s5/steps/nnet2/retrain_tanh.sh b/egs/chime_wsj0/s5/steps/nnet2/retrain_tanh.sh new file mode 100755 index 000000000..c58a61dd4 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/nnet2/retrain_tanh.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + +# This script is for training networks with tanh nonlinearities; it starts with +# a given model and supports increasing the hidden-layer dimension. It is +# otherwise similar to train_tanh.sh + +# Begin configuration section. +cmd=run.pl +num_epochs=15 # Number of epochs during which we reduce + # the learning rate; number of iteration is worked out from this. +num_epochs_extra=5 # Number of epochs after we stop reducing + # the learning rate. +num_iters_final=20 # Maximum number of final iterations to give to the + # optimization over the validation set. +initial_learning_rate=0.04 +final_learning_rate=0.004 +softmax_learning_rate_factor=0.5 # Train this layer half as fast as the other layers. + + +minibatch_size=128 # by default use a smallish minibatch size for neural net + # training; this controls instability which would otherwise + # be a problem with multi-threaded update. Note: it also + # interacts with the "preconditioned" update which generally + # works better with larger minibatch size, so it's not + # completely cost free. + +shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples + # on each iter. You could set it to 0 or to a large value for complete + # randomization, but this would both consume memory and cause spikes in + # disk I/O. Smaller is easier on disk and memory but less random. It's + # not a huge deal though, as samples are anyway randomized right at the start. + + +stage=-5 + + +mix_up=0 # Number of components to mix up to (should be > #tree leaves, if + # specified.) Will do this at the start. +widen=0 # If specified, it will increase the hidden-layer dimension + # to this value. Will do this at the start. +bias_stddev=0.5 # will be used for widen + +num_threads=16 +parallel_opts="-pe smp $num_threads" # using a smallish #threads by default, out of stability concerns. + # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads. +cleanup=true +# End configuration section. + + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 --widen 1024 exp/tri4_nnet/egs exp/tri4_nnet exp/tri5_nnet" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 
+ echo " --num-epochs <#epochs|15> # Number of epochs of main training" + echo " # while reducing learning rate (determines #iterations, together" + echo " # with --samples-per-iter and --num-jobs-nnet)" + echo " --num-epochs-extra <#epochs-extra|5> # Number of extra epochs of training" + echo " # after learning rate fully reduced" + echo " --initial-learning-rate # Learning rate at start of training, e.g. 0.02 for small" + echo " # data, 0.01 for large data" + echo " --final-learning-rate # Learning rate at end of training, e.g. 0.004 for small" + echo " # data, 0.001 for large data" + echo " --mix-up <#pseudo-gaussians|0> # Can be used to have multiple targets in final output layer," + echo " # per context-dependent state. Try a number several times #states." + echo " --num-threads # Number of parallel threads per job (will affect results" + echo " # as well as speed; may interact with batch size; if you increase" + echo " # this, you may want to decrease the batch size." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads." + echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" + echo " # should not get too large, e.g. >2k)." + echo " --num-iters-final <#iters|10> # Number of final iterations to give to nnet-combine-fast to " + echo " # interpolate parameters (the weights are learned with a validation set)" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + exit 1; +fi + +egs_dir=$1 +nnet_dir=$2 +dir=$3 + +# Check some files. +for f in $egs_dir/egs.1.0.ark $nnet_dir/final.mdl; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +num_jobs_nnet=`cat $egs_dir/num_jobs_nnet` || exit 1; +iters_per_epoch=`cat $egs_dir/iters_per_epoch` || exit 1; + +mkdir -p $dir/log + +cp $nnet_dir/splice_opts $dir 2>/dev/null +cp $nnet_dir/final.mat $dir 2>/dev/null # any LDA matrix... +cp $nnet_dir/tree $dir + + +if [ $stage -le -2 ] && [ $mix_up -gt 0 ]; then + echo Mixing up to $mix_up components + $cmd $dir/log/mix_up.$x.log \ + nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \ + $nnet_dir/final.mdl $dir/0.mdl || exit 1; +else + cp $nnet_dir/final.mdl $dir/0.mdl || exit 1; +fi + +if [ $stage -le -1 ] && [ $widen -gt 0 ]; then + echo "$0: Widening nnet to hidden-layer-dim=$widen" + $cmd $dir/log/widen.log \ + nnet-am-widen --hidden-layer-dim=$widen $dir/0.mdl $dir/0.mdl || exit 1; +fi + +num_iters_reduce=$[$num_epochs * $iters_per_epoch]; +num_iters_extra=$[$num_epochs_extra * $iters_per_epoch]; +num_iters=$[$num_iters_reduce+$num_iters_extra] + +echo "$0: Will train for $num_epochs + $num_epochs_extra epochs, equalling " +echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, " +echo "$0: (while reducing learning rate) + (with constant learning rate)." + +x=0 +while [ $x -lt $num_iters ]; do + if [ $x -ge 0 ] && [ $stage -le $x ]; then + # Set off jobs doing some diagnostics, in the background. 
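For scale: with the default 15 + 5 epochs and an assumed 20 iterations per epoch (iters_per_epoch is read from the egs directory), the training loop just entered makes 400 passes, reducing the learning rate for the first 300 and holding it constant for the last 100. The arithmetic from a few lines above, as a small sketch:

  num_epochs=15; num_epochs_extra=5   # defaults above
  iters_per_epoch=20                  # assumed; read from $egs_dir in the real script
  num_iters_reduce=$[$num_epochs * $iters_per_epoch]        # 300
  num_iters_extra=$[$num_epochs_extra * $iters_per_epoch]   # 100
  num_iters=$[$num_iters_reduce+$num_iters_extra]           # 400
  echo "$num_iters_reduce + $num_iters_extra = $num_iters iterations"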
+ $cmd $dir/log/compute_prob_valid.$x.log \ + nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs & + $cmd $dir/log/compute_prob_train.$x.log \ + nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs & + + echo "Training neural net (pass $x)" + + $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \ + nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \ + ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \ + nnet-train-parallel --num-threads=$num_threads \ + --minibatch-size=$minibatch_size --srand=$x $dir/$x.mdl \ + ark:- $dir/$[$x+1].JOB.mdl \ + || exit 1; + + nnets_list= + for n in `seq 1 $num_jobs_nnet`; do + nnets_list="$nnets_list $dir/$[$x+1].$n.mdl" + done + + learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters_reduce $initial_learning_rate $final_learning_rate`; + softmax_learning_rate=`perl -e "print $learning_rate * $softmax_learning_rate_factor;"`; + nnet-am-info $dir/$[$x+1].1.mdl > $dir/foo 2>/dev/null || exit 1 + nu=`cat $dir/foo | grep num-updatable-components | awk '{print $2}'` + na=`cat $dir/foo | grep AffineComponent | wc -l` # number of last AffineComopnent layer [one-based] + lr_string="$learning_rate" + for n in `seq 2 $nu`; do + if [ $n -eq $na ]; then lr=$softmax_learning_rate; + else lr=$learning_rate; fi + lr_string="$lr_string:$lr" + done + + $cmd $dir/log/average.$x.log \ + nnet-am-average $nnets_list - \| \ + nnet-am-copy --learning-rates=$lr_string - $dir/$[$x+1].mdl || exit 1; + + rm $nnets_list + fi + x=$[$x+1] +done + +# Now do combination. +# At the end, final.mdl will be a combination of the last e.g. 10 models. +if [ $num_iters_final -gt $num_iters_extra ]; then + echo "Setting num_iters_final=$num_iters_extra" + num_iters_final=$num_iters_extra +fi +start=$[$num_iters-$num_iters_final+1] +nnets_list= +for x in `seq $start $num_iters`; do + nnets_list="$nnets_list $dir/$x.mdl" +done + +if [ $stage -le $num_iters ]; then + num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'` + mb=$[($num_egs+$num_threads-1)/$num_threads] + $cmd $parallel_opts $dir/log/combine.log \ + nnet-combine-fast --num-threads=$num_threads --verbose=3 --minibatch-size=$mb \ + $nnets_list ark:$egs_dir/combine.egs $dir/final.mdl || exit 1; +fi + +sleep 2; # make sure final.mdl exists. + +# Compute the probability of the final, combined model with +# the same subset we used for the previous compute_probs, as the +# different subsets will lead to different probs. +$cmd $dir/log/compute_prob_valid.final.log \ + nnet-compute-prob $dir/final.mdl ark:$egs_dir/valid_diagnostic.egs & +$cmd $dir/log/compute_prob_train.final.log \ + nnet-compute-prob $dir/final.mdl ark:$egs_dir/train_diagnostic.egs & + +echo Done + +if $cleanup; then + echo Removing most of the models + for x in `seq 0 $num_iters`; do + if [ $[$x%10] -ne 0 ] && [ $x -lt $[$num_iters-$num_iters_final+1] ]; then + # delete all but every 10th model; don't delete the ones which combine to form the final model. + rm $dir/$x.mdl + fi + done +fi diff --git a/egs/chime_wsj0/s5/steps/nnet2/train_block.sh b/egs/chime_wsj0/s5/steps/nnet2/train_block.sh new file mode 100755 index 000000000..aa6e2e725 --- /dev/null +++ b/egs/chime_wsj0/s5/steps/nnet2/train_block.sh @@ -0,0 +1,376 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. 
+# this is as train_tanh3.sh but for on top of fbank feats-- we have block-diagonal +# transforms for the first few layers, on separate frequency bands. +# Otherwise it's tanh. + +# Begin configuration section. +cmd=run.pl +num_epochs=15 # Number of epochs during which we reduce + # the learning rate; number of iteration is worked out from this. +num_epochs_extra=5 # Number of epochs after we stop reducing + # the learning rate. +num_iters_final=20 # Maximum number of final iterations to give to the + # optimization over the validation set. +initial_learning_rate=0.04 +final_learning_rate=0.004 +bias_stddev=0.0 +shrink_interval=5 # shrink every $shrink_interval iters except while we are + # still adding layers, when we do it every iter. +shrink=true +num_frames_shrink=2000 # note: must be <= --num-frames-diagnostic option to get_egs.sh, if + # given. +softmax_learning_rate_factor=0.5 # Train this layer half as fast as the other layers. + +hidden_layer_dim=300 # You may want this larger, e.g. 1024 or 2048. + +minibatch_size=128 # by default use a smallish minibatch size for neural net + # training; this controls instability which would otherwise + # be a problem with multi-threaded update. Note: it also + # interacts with the "preconditioned" update which generally + # works better with larger minibatch size, so it's not + # completely cost free. + +samples_per_iter=200000 # each iteration of training, see this many samples + # per job. This option is passed to get_egs.sh +num_jobs_nnet=16 # Number of neural net jobs to run in parallel. This option + # is passed to get_egs.sh. +get_egs_stage=0 +spk_vecs_dir= + +shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples + # on each iter. You could set it to 0 or to a large value for complete + # randomization, but this would both consume memory and cause spikes in + # disk I/O. Smaller is easier on disk and memory but less random. It's + # not a huge deal though, as samples are anyway randomized right at the start. + +add_layers_period=2 # by default, add new layers every 2 iterations. + +num_block_layers=2 +num_normal_layers=2 +block_size=10 +block_shift=5 + +stage=-5 + +io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. +splice_width=7 # meaning +- 7 frames on each side for second LDA +randprune=4.0 # speeds up LDA. +alpha=4.0 +max_change=10.0 +mix_up=0 # Number of components to mix up to (should be > #tree leaves, if + # specified.) +num_threads=16 +parallel_opts="-pe smp $num_threads" # using a smallish #threads by default, out of stability concerns. + # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads. +cleanup=true +egs_dir= +lda_opts= +egs_opts= +# End configuration section. + + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 4 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 
+ echo " --num-epochs <#epochs|15> # Number of epochs of main training" + echo " # while reducing learning rate (determines #iterations, together" + echo " # with --samples-per-iter and --num-jobs-nnet)" + echo " --num-epochs-extra <#epochs-extra|5> # Number of extra epochs of training" + echo " # after learning rate fully reduced" + echo " --initial-learning-rate # Learning rate at start of training, e.g. 0.02 for small" + echo " # data, 0.01 for large data" + echo " --final-learning-rate # Learning rate at end of training, e.g. 0.004 for small" + echo " # data, 0.001 for large data" + echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs" + echo " --initial-num-hidden-layers <#hidden-layers|1> # Number of hidden layers to start with." + echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers" + echo " --mix-up <#pseudo-gaussians|0> # Can be used to have multiple targets in final output layer," + echo " # per context-dependent state. Try a number several times #states." + echo " --num-jobs-nnet # Number of parallel jobs to use for main neural net" + echo " # training (will affect results as well as speed; try 8, 16)" + echo " # Note: if you increase this, you may want to also increase" + echo " # the learning rate." + echo " --num-threads # Number of parallel threads per job (will affect results" + echo " # as well as speed; may interact with batch size; if you increase" + echo " # this, you may want to decrease the batch size." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads." + echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." + echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" + echo " # should not get too large, e.g. >2k)." + echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" + echo " # process." + echo " --splice-width # Number of frames on each side to append for feature input" + echo " # (note: we splice processed, typically 40-dimensional frames" + echo " --lda-dim # Dimension to reduce spliced features to with LDA" + echo " --num-iters-final <#iters|10> # Number of final iterations to give to nnet-combine-fast to " + echo " # interpolate parameters (the weights are learned with a validation set)" + echo " --num-utts-subset <#utts|300> # Number of utterances in subsets used for validation and diagnostics" + echo " # (the validation subset is held out from training)" + echo " --num-frames-diagnostic <#frames|4000> # Number of frames used in computing (train,valid) diagnostics" + echo " --num-valid-frames-combine <#frames|10000> # Number of frames used in getting combination weights at the" + echo " # very end." + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + exit 1; +fi + +data=$1 +lang=$2 +alidir=$3 +dir=$4 + +# Check some files. +for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + + +# Set some variables. +num_leaves=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1; + +nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir... +# in this dir we'll have just one job. 
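The alignment directory's num_jobs dictates how the data directory is split below, and each later JOB placeholder indexes one of those per-job subsets. A sketch of the layout split_data.sh is expected to leave behind, for an assumed nj:

  nj=8   # assumed; the real value is read from $alidir/num_jobs
  for j in $(seq 1 $nj); do
    # each per-job subdirectory carries its own feats.scp, utt2spk and cmvn.scp
    echo "data/train/split$nj/$j/{feats.scp,utt2spk,cmvn.scp}"
  done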
+sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +splice_opts=`cat $alidir/splice_opts 2>/dev/null` +cp $alidir/splice_opts $dir 2>/dev/null +cp $alidir/tree $dir + + +# Get list of validation utterances. +awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ + > $dir/valid_uttlist || exit 1; +awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; + + +if [ $stage -le -4 ]; then + echo "$0: calling get_lda.sh" + steps/nnet2/get_lda_block.sh --block-size $block_size --block-shift $block_shift \ + $lda_opts --splice-width $splice_width --cmd "$cmd" $data $lang $alidir $dir || exit 1; +fi + +# these files will have been written by get_lda_block.sh +feat_dim=`cat $dir/feat_dim` || exit 1; +lda_dim=`cat $dir/lda_dim` || exit 1; +num_blocks=`cat $dir/num_blocks` || exit 1; + +if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then + echo "$0: calling get_egs.sh" + [ ! -z $spk_vecs_dir ] && spk_vecs_opt="--spk-vecs-dir $spk_vecs_dir"; + steps/nnet2/get_egs.sh $spk_vecs_opt --samples-per-iter $samples_per_iter --num-jobs-nnet $num_jobs_nnet \ + --splice-width $splice_width --stage $get_egs_stage --cmd "$cmd" $egs_opts --feat-type raw \ + $data $lang $alidir $dir || exit 1; +fi + +if [ -z $egs_dir ]; then + egs_dir=$dir/egs +fi + +iters_per_epoch=`cat $egs_dir/iters_per_epoch` || exit 1; +! [ $num_jobs_nnet -eq `cat $egs_dir/num_jobs_nnet` ] && \ + echo "$0: Warning: using --num-jobs-nnet=`cat $egs_dir/num_jobs_nnet` from $egs_dir" +num_jobs_nnet=`cat $egs_dir/num_jobs_nnet` + + +if [ $stage -le -2 ]; then + echo "$0: initializing neural net"; + + hidden_block_size=`perl -e "print int(sqrt(($hidden_layer_dim*$hidden_layer_dim)/$num_blocks));"` + echo "Hidden block size is $hidden_block_size" + hidden_block_dim=$[$hidden_block_size*$num_blocks] + block_stddev=`perl -e "print 1.0/sqrt($block_size);"` + hidden_block_stddev=`perl -e "print 1.0/sqrt($hidden_block_size);"` + first_hidden_layer_stddev=`perl -e "print 1.0/sqrt($hidden_block_dim);"` + stddev=`perl -e "print 1.0/sqrt($hidden_layer_dim);"` + + + cat >$dir/nnet.config <>$dir/nnet.config <>$dir/nnet.config <>$dir/nnet.config <>$dir/nnet.config <
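The block sizing above gives each of the num_blocks hidden blocks roughly hidden_layer_dim/sqrt(num_blocks) units, which keeps the block-diagonal weight matrices at about the same total parameter count as one dense hidden_layer_dim x hidden_layer_dim layer. A worked example with the default hidden_layer_dim=300 and an assumed num_blocks=7 (the real count comes from $dir/num_blocks, written by get_lda_block.sh):

  hidden_layer_dim=300   # default from the configuration section above
  num_blocks=7           # assumed for illustration
  hidden_block_size=$(perl -e "print int(sqrt(($hidden_layer_dim*$hidden_layer_dim)/$num_blocks));")
  hidden_block_dim=$[$hidden_block_size*$num_blocks]
  echo "hidden_block_size=$hidden_block_size, hidden_block_dim=$hidden_block_dim"
  # prints 113 and 791; 7 * 113^2 = 89383, close to 300^2 = 90000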