git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@3241 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
David Imseng 2013-12-01 18:58:22 +00:00
Parent 3bf9adc11e
Commit cdd493df56
13 changed files with 846 additions and 9 deletions

View file

@@ -9,8 +9,9 @@
export train_cmd="queue.pl -l arch=*64"
export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G"
export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G"
export big_memory_cmd="queue.pl -l arch=*64,ram_free=8G,mem_free=8G"
export cuda_cmd="queue.pl -l gpu=1"
#export cuda_cmd="..."
#b) BUT cluster options
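# For local runs without a grid engine, the commands above can instead point at
# run.pl (a minimal sketch; the queue memory flags then have no effect):
#export train_cmd=run.pl
#export decode_cmd=run.pl
#export big_memory_cmd=run.pl
#export cuda_cmd=run.pl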

View file

@@ -0,0 +1,24 @@
#!/bin/bash
# Copyright 2013 Idiap Research Institute (Author: David Imseng)
# Apache 2.0
. cmd.sh
states=20000
dir=exp/tri4b_pretrain-dbn_dnn/
steps/kl_hmm/build_tree.sh --cmd "$big_memory_cmd" --thresh -1 --nnet_dir exp/tri4b_pretrain-dbn_dnn/ \
${states} data-fmllr-tri4b/train_si284 data/lang exp/tri4b_ali_si284 exp/tri4b-${states} || exit 1;
utils/mkgraph.sh data/lang_test_bd_tgpr exp/tri4b-${states} exp/tri4b-${states}/graph_bd_tgpr || exit 1;
steps/kl_hmm/train_kl_hmm.sh --nj 30 --cmd "$big_memory_cmd" --model exp/tri4b-${states}/final.mdl data-fmllr-tri4b/train_si284 exp/tri4b-${states} $dir/kl-hmm-${states}
steps/kl_hmm/decode_kl_hmm.sh --nj 10 --cmd "$big_memory_cmd" --acwt 0.1 --nnet $dir/kl-hmm-${states}/final.nnet --model exp/tri4b-${states}/final.mdl \
--config conf/decode_dnn.config exp/tri4b-${states}/graph_bd_tgpr/ data-fmllr-tri4b/test_dev93 $dir/decode_dev93_kl-hmm-bd-${states}_tst
steps/kl_hmm/decode_kl_hmm.sh --nj 8 --cmd "$big_memory_cmd" --acwt 0.1 --nnet $dir/kl-hmm-${states}/final.nnet --model exp/tri4b-${states}/final.mdl \
--config conf/decode_dnn.config exp/tri4b-${states}/graph_bd_tgpr/ data-fmllr-tri4b/test_eval92 $dir/decode_eval92_kl-hmm-bd-${states}_tst
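# Collect the KL-HMM WERs afterwards (same pattern as in run.sh):
# for x in $dir/decode_*kl-hmm*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done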

View file

@@ -324,6 +324,8 @@ local/run_sgmm2.sh
# You probably want to run the hybrid recipe as it is complementary:
local/run_dnn.sh
# You probably want to try KL-HMM
#local/run_kl_hmm.sh
# Getting results [see RESULTS file]
# for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done

View file

@@ -0,0 +1,152 @@
#!/bin/bash
# Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey),
# Idiap Research Institute (Author: David Imseng)
# Apache 2.0
# Begin configuration.
stage=-4 # This allows restarting partway through, when something went wrong.
config=
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
num_iters=35 # Number of iterations of training
max_iter_inc=25 # Last iter to increase #Gauss on.
beam=10
retry_beam=40
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
power=0.25 # Exponent for number of Gaussians according to occurrence counts
cluster_thresh=-1 # controls the final bottom-up clustering of leaves in build-tree
thresh=20
use_gpu="no"
nnet_dir=
context_opts= # e.g. set this to "--context-width=5 --central-position=2" for quinphone.
tmpdir=
no_softmax=true
# End configuration.
echo "$0 $@" # Print the command line for logging
[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;
if [ $# != 5 ]; then
echo "Usage: steps/train_deltas.sh <num-leaves> <data-dir> <lang-dir> <alignment-dir> <exp-dir>"
echo "e.g.: steps/train_deltas.sh 2000 data/train_si84_half data/lang exp/mono_ali exp/tri1"
echo "main options (for others, see top of script file)"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --config <config-file> # config containing options"
echo " --stage <stage> # stage to do partial re-run from."
echo " --thresh "
echo " --cluster_thresh "
echo " --nnet_dir "
echo " --context_opts "
echo " --tmpdir "
echo " --no-softmax "
exit 1;
fi
numleaves=$1
data=$2
lang=$3
alidir=$4
dir=$5
for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt; do
[ ! -f $f ] && echo "train_deltas.sh: no such file $f" && exit 1;
done
numgauss=$numleaves
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
nj=`cat $alidir/num_jobs` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj;
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
nnet=${nnet_dir}/final.nnet
feature_transform=${nnet_dir}/final.feature_transform
featsdim="ark:copy-feats scp:$data/feats.scp ark:- |"
nnetfeats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |"
# Optionally add cmvn
if [ -f ${nnet_dir}/norm_vars ]; then
norm_vars=$(cat ${nnet_dir}/norm_vars 2>/dev/null)
[ ! -f $sdata/1/cmvn.scp ] && echo "$0: cannot find cmvn stats $sdata/1/cmvn.scp" && exit 1
nnetfeats="$nnetfeats apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |"
featsdim="$featsdim apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp ark:- ark:- |"
fi
# Optionally add deltas
if [ -f ${nnet_dir}/delta_order ]; then
delta_order=$(cat ${nnet_dir}/delta_order)
nnetfeats="$nnetfeats add-deltas --delta-order=$delta_order ark:- ark:- |"
featsdim="$featsdim add-deltas --delta-order=$delta_order ark:- ark:- |"
fi
feats="ark,s,cs:nnet-forward "
if [[ ! -z $feature_transform ]]; then
feats=${feats}" --feature-transform=$feature_transform "
fi
feats=${feats}"--no-softmax=$no_softmax --use-gpu=$use_gpu $nnet \"$nnetfeats\" ark:- |"
feat_dim=$(feat-to-dim --print-args=false "$featsdim" -)
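# For illustration only (paths as in local/run_kl_hmm.sh, nj assumed 30), the
# resulting rspecifier expands to roughly:
#   ark,s,cs:nnet-forward --feature-transform=exp/tri4b_pretrain-dbn_dnn/final.feature_transform \
#     --no-softmax=true --use-gpu=no exp/tri4b_pretrain-dbn_dnn/final.nnet \
#     "ark,s,cs:copy-feats scp:data-fmllr-tri4b/train_si284/split30/JOB/feats.scp ark:- |" ark:- |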
rm $dir/.error 2>/dev/null
if [[ ! -z $tmpdir ]]; then
mkdir -p $tmpdir
else
tmpdir=$dir
fi
if [ $stage -le -3 ]; then
echo "$0: accumulating tree stats"
$cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
acc-tree-stats $context_opts --var-floor=1.0 --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
"ark:gunzip -c $alidir/ali.JOB.gz|" $tmpdir/JOB.treeacc || exit 1;
sum-tree-stats $dir/treeacc $tmpdir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
rm $tmpdir/*.treeacc
fi
if [ $stage -le -2 ]; then
echo "$0: getting questions for tree-building, via clustering"
# preparing questions, roots file...
cluster-phones $context_opts $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
cat $lang/phones/extra_questions.int >> $dir/questions.int
compile-questions $context_opts $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;
echo "$0: building the tree"
build-tree $context_opts --verbose=1 --max-leaves=$numleaves --cluster-thresh=$cluster_thresh --thresh=$thresh $dir/treeacc $lang/phones/roots.int \
$dir/questions.qst $lang/topo $dir/tree &> $dir/log/build_tree.log || exit 1;
gmm-init-model-flat --dim=$feat_dim $dir/tree $lang/topo $dir/1.mdl
rm $dir/treeacc
fi
if [ $stage -le -1 ]; then
# Convert the alignments.
echo "$0: converting alignments from $alidir to use current tree"
$cmd JOB=1:$nj $dir/log/convert.JOB.log \
convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
"ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi
if [ $stage -le 0 ]; then
echo "$0: compiling graphs of transcripts"
$cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
compile-train-graphs $dir/tree $dir/1.mdl $lang/L.fst \
"ark:utils/sym2int.pl -f 2- $lang/words.txt < $data/split$nj/JOB/text |" \
"ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi
rm $dir/final.mdl 2>/dev/null
ln -s 1.mdl $dir/final.mdl
# Summarize warning messages...
utils/summarize_warnings.pl $dir/log
echo "$0: Done building the tree in $dir"

View file

@@ -0,0 +1,121 @@
#!/bin/bash
# Copyright 2012-2013 Karel Vesely,
# Daniel Povey,
# Idiap Research Institute (Author: David Imseng)
# Apache 2.0
# Begin configuration section.
nnet= # Optionally pre-select network to use for getting state-likelihoods
feature_transform= # Optionally pre-select feature transform (in front of nnet)
model= # Optionally pre-select transition model
stage=0 # stage=1 skips lattice generation
nj=4
cmd=run.pl
max_active=7000 # maximum number of active tokens
max_mem=50000000 # limit the fst-size to 50MB (larger fsts are minimized)
beam=13.0 # GMM:13.0
latbeam=8.0 # GMM:6.0
acwt=0.1 # GMM:0.0833, note: only really affects pruning (scoring is on lattices).
scoring_opts="--min-lmwt 1 --max-lmwt 12"
skip_scoring=false
use_gpu="no" # disable gpu
parallel_opts=""
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
echo " where the DNN + transition model is."
echo "e.g.: $0 exp/dnn1/graph_tgpr data/test exp/dnn1/decode_tgpr"
echo ""
echo "This script works on plain or modified features (CMN,delta+delta-delta),"
echo "which are then sent through feature-transform. It works out what type"
echo "of features you used from content of srcdir."
echo ""
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo ""
echo " --nnet <nnet> # which nnet to use (opt.)"
echo " --feature-transform <nnet> # select transform in front of nnet (opt.)"
echo " --model <model> # which transition model to use (opt.)"
echo ""
echo " --acwt <float> # select acoustic scale for decoding"
echo " --scoring-opts <opts> # options forwarded to local/score.sh"
exit 1;
fi
graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
sdata=$data/split$nj;
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
if [ -z "$nnet" ]; then # if --nnet <nnet> was not specified on the command line...
nnet=$srcdir/final.nnet;
fi
[ -z "$nnet" ] && echo "Error nnet '$nnet' does not exist!" && exit 1;
if [ -z "$model" ]; then # if --model <mdl> was not specified on the command line...
model=$srcdir/final.mdl;
fi
# find the feature_transform to use
if [ -z "$feature_transform" ]; then
feature_transform=$srcdir/final.feature_transform
fi
if [ ! -f $feature_transform ]; then
echo "Missing feature_transform '$feature_transform'"
exit 1
fi
# check that files exist
for f in $sdata/1/feats.scp $nnet $model $graphdir/HCLG.fst; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
# Create the feature stream:
feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |"
# Optionally add cmvn
if [ -f $srcdir/norm_vars ]; then
norm_vars=$(cat $srcdir/norm_vars 2>/dev/null)
[ ! -f $sdata/1/cmvn.scp ] && echo "$0: cannot find cmvn stats $sdata/1/cmvn.scp" && exit 1
feats="$feats apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |"
fi
# Optionally add deltas
if [ -f $srcdir/delta_order ]; then
delta_order=$(cat $srcdir/delta_order)
feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
fi
# Run the decoding in the queue
if [ $stage -le 0 ]; then
$cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
nnet-forward --feature-transform=$feature_transform --use-gpu=$use_gpu $nnet "$feats" ark:- \| \
latgen-faster-mapped --max-active=$max_active --max-mem=$max_mem --beam=$beam --lattice-beam=$latbeam \
--acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
$model $graphdir/HCLG.fst ark:- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi
# Run the scoring
if ! $skip_scoring ; then
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir || exit 1;
fi
exit 0;
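# Example (illustrative values): widen the LM-weight sweep at scoring time via
# the script's own --scoring-opts flag; the other arguments are as in
# local/run_kl_hmm.sh.
#   steps/kl_hmm/decode_kl_hmm.sh --scoring-opts "--min-lmwt 8 --max-lmwt 16" \
#     --nnet $dir/kl-hmm-${states}/final.nnet --model exp/tri4b-${states}/final.mdl \
#     --config conf/decode_dnn.config exp/tri4b-${states}/graph_bd_tgpr \
#     data-fmllr-tri4b/test_dev93 $dir/decode_dev93_kl-hmm-bd-${states}_tst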

View file

@@ -0,0 +1,121 @@
#!/bin/bash
# Copyright 2012-2013 Karel Vesely,
# Daniel Povey,
# Idiap Research Institute (Author: David Imseng)
# Apache 2.0
# Begin configuration section.
nnet= # Optionally pre-select network to use for getting state-likelihoods
feature_transform= # Optionally pre-select feature transform (in front of nnet)
model= # Optionally pre-select transition model
class_frame_counts= # Optionally pre-select class-counts used to compute PDF priors
stage=0 # stage=1 skips lattice generation
nj=32
cmd=$decode_cmd
max_active=7000 # maximum number of active tokens
max_mem=50000000 # limit the fst-size to 50MB (larger fsts are minimized)
use_gpu="no" # disable gpu
parallel_opts=""
tmpdir=
# End configuration section.
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: $0 [options] <data-dir> <align-dir> <kl-hmm-dir>"
echo "... where <kl-hmm-dir> is assumed to be a sub-directory of the directory"
echo " where the DNN + transition model is."
echo "e.g.: $0 data/train exp/dnn1/kl-hmm-train"
echo ""
echo "This script works on plain or modified features (CMN,delta+delta-delta),"
echo "which are then sent through feature-transform. It works out what type"
echo "of features you used from content of srcdir."
echo ""
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo ""
echo " --nnet <nnet> # which nnet to use (opt.)"
echo " --feature-transform <nnet> # select transform in front of nnet (opt.)"
echo " --model <model> # which transition model to use (opt.)"
echo " --tmpdir >dir> # Temp directory to store the statistics, becuase they can get big (opt.)"
exit 1;
fi
data=$1
alidir=$2
dir=$3
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
sdata=$data/split$nj;
mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
if [ -z "$nnet" ]; then # if --nnet <nnet> was not specified on the command line...
nnet=$srcdir/final.nnet;
fi
[ -z "$nnet" ] && echo "Error nnet '$nnet' does not exist!" && exit 1;
if [ -z "$model" ]; then # if --model <mdl> was not specified on the command line...
model=$srcdir/final.mdl;
fi
# find the feature_transform to use
if [ -z "$feature_transform" ]; then
feature_transform=$srcdir/final.feature_transform
fi
if [ ! -f $feature_transform ]; then
echo "Missing feature_transform '$feature_transform'"
exit 1
fi
# check that files exist
for f in $sdata/1/feats.scp $nnet $model; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
# Create the feature stream:
feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |"
# Optionally add cmvn
if [ -f $srcdir/norm_vars ]; then
norm_vars=$(cat $srcdir/norm_vars 2>/dev/null)
[ ! -f $sdata/1/cmvn.scp ] && echo "$0: cannot find cmvn stats $sdata/1/cmvn.scp" && exit 1
feats="$feats apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |"
fi
# Optionally add deltas
if [ -f $srcdir/delta_order ]; then
delta_order=$(cat $srcdir/delta_order)
feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
fi
ali="ark:gunzip -c $alidir/ali.*.gz |"
if [[ ! -z $tmpdir ]]; then
mkdir -p $tmpdir
else
tmpdir=$dir
fi
nkl_states=$(hmm-info --print-args=false $alidir/final.mdl | grep pdfs | awk '{ print $NF }')
if [ $stage -le 0 ]; then
$cmd $parallel_opts JOB=1:$nj $dir/log/acc-stats.JOB.log \
nnet-kl-hmm-acc --nkl-states=${nkl_states} "ark:nnet-forward --feature-transform=$feature_transform --use-gpu=$use_gpu $nnet \"$feats\" ark:- |" "ark:ali-to-pdf --print-args=false $alidir/final.mdl \"$ali\" ark:- |" $tmpdir/kl-hmm-stats.JOB
fi
sum-matrices $dir/accumulated-kl-hmm-stats $tmpdir/kl-hmm-stats.*
rm $tmpdir/kl-hmm-stats.*
nnet-kl-hmm-mat-to-component $dir/kl-hmm.nnet $dir/accumulated-kl-hmm-stats
nnet-concat $dir/../final.nnet $dir/kl-hmm.nnet $dir/final.nnet
exit 0;
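# Optional sanity check: the last component of the concatenated network should
# now be the KL-HMM layer (marker <klhmm>), e.g.:
#   nnet-info $dir/final.nnet | tail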

View file

@@ -21,9 +21,11 @@
#include "nnet/nnet-nnet.h"
#include "nnet/nnet-activation.h"
#include "nnet/nnet-kl-hmm.h"
#include "nnet/nnet-affine-transform.h"
#include "nnet/nnet-rbm.h"
#include "nnet/nnet-various.h"
#include "nnet/nnet-kl-hmm.h"
namespace kaldi {
namespace nnet1 {
@@ -40,6 +42,7 @@ const struct Component::key_value Component::kMarkerMap[] = {
{ Component::kCopy,"<copy>" },
{ Component::kAddShift,"<addshift>" },
{ Component::kRescale,"<rescale>" },
{ Component::kKlHmm,"<klhmm>" }
};
@@ -119,6 +122,9 @@ Component* Component::Read(std::istream &is, bool binary) {
case Component::kRescale :
p_comp = new Rescale(dim_in, dim_out);
break;
case Component::kKlHmm :
p_comp = new KlHmm(dim_in, dim_out);
break;
case Component::kUnknown :
default :
KALDI_ERR << "Missing type: " << token;

View file

@@ -64,7 +64,8 @@ class Component {
kTranspose,
kBlockLinearity,
kAddShift,
kRescale
kRescale,
kKlHmm
} ComponentType;
/// A pair of type and marker
struct key_value {

src/nnet/nnet-kl-hmm.h (new file, 150 lines)
View file

@@ -0,0 +1,150 @@
// nnet/nnet-kl-hmm.h
// Copyright 2013 Idiap Research Institute (Author: David Imseng)
// Karlsruhe Institute of Technology (Author: Ngoc Thang Vu)
// Brno University of Technology (Author: Karel Vesely)
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_NNET_NNET_KL_HMM_H_
#define KALDI_NNET_NNET_KL_HMM_H_
#include "nnet/nnet-component.h"
#include "cudamatrix/cu-math.h"
#include "cudamatrix/cu-rand.h"
#include "matrix/kaldi-vector.h"
#include "matrix/kaldi-matrix.h"
namespace kaldi {
namespace nnet1 {
class KlHmm : public Component {
public:
KlHmm(int32 dim_in, int32 dim_out)
: Component(dim_in, dim_out), kl_stats_(dim_out, dim_in, kSetZero)
{ }
~KlHmm()
{ }
Component* Copy() const { return new KlHmm(*this); }
ComponentType GetType() const {
return kKlHmm;
}
void PropagateFnc(const CuMatrix<BaseFloat> &in, CuMatrix<BaseFloat> *out) {
if (kl_inv_q_.NumRows() == 0) {
// Copy the CudaMatrix to a Matrix
Matrix<BaseFloat> in_tmp(in.NumRows(), in.NumCols());
in.CopyToMat(&in_tmp);
// Check that the input contains posteriors (it suffices to check the first row)
BaseFloat post_sum = in_tmp.Row(0).Sum();
KALDI_ASSERT(ApproxEqual(post_sum, 1.0));
// Get a tmp Matrix of the stats
Matrix<BaseFloat> kl_stats_tmp(kl_stats_);
// Init a vector to get the sum of the rows (for normalization)
Vector<BaseFloat> row_sum(kl_stats_.NumRows(), kSetZero);
// Get the sum of the posteriors for normalization
row_sum.AddColSumMat(1, kl_stats_tmp);
// Apply floor to make sure there is no zero
row_sum.ApplyFloor(1e-20);
// Invert the sum (to normalize)
row_sum.InvertElements();
// Normalizing the statistics vector
kl_stats_tmp.MulRowsVec(row_sum);
// Apply floor before inversion and logarithm
kl_stats_tmp.ApplyFloor(1e-20);
// Apply inversion
kl_stats_tmp.InvertElements();
// Apply logarithm
kl_stats_tmp.ApplyLog();
// Inverted and logged values
kl_inv_q_.Resize(kl_stats_.NumRows(), kl_stats_.NumCols());
// Now holds log(1/Q)
kl_inv_q_.CopyFromMat(kl_stats_tmp);
}
// Get the logarithm of the features for the Entropy calculation
// Copy the CudaMatrix to a Matrix
Matrix<BaseFloat> in_log_tmp(in.NumRows(), in.NumCols());
in.CopyToMat(&in_log_tmp);
// Flooring and log
in_log_tmp.ApplyFloor(1e-20);
in_log_tmp.ApplyLog();
CuMatrix<BaseFloat> log_in(in.NumRows(), in.NumCols());
log_in.CopyFromMat(in_log_tmp);
// P*logP
CuMatrix<BaseFloat> tmp_entropy(in);
tmp_entropy.MulElements(log_in);
// Getting the entropy (sum P*logP)
CuVector<BaseFloat> in_entropy(in.NumRows(), kSetZero);
in_entropy.AddColSumMat(1,tmp_entropy);
// sum P*log (1/Q)
out->AddMatMat(1, in, kNoTrans, kl_inv_q_, kTrans, 0);
// (sum P*logP) + (sum P*log(1/Q))
out->AddVecToCols(1, in_entropy);
// return the negative KL-divergence
out->Scale(-1);
}
void BackpropagateFnc(const CuMatrix<BaseFloat> &in, const CuMatrix<BaseFloat> &out,
const CuMatrix<BaseFloat> &out_diff, CuMatrix<BaseFloat> *in_diff) {
KALDI_ERR << "Unimplemented";
}
/// Reads the component content
void ReadData(std::istream &is, bool binary) {
kl_stats_.Read(is, binary);
KALDI_ASSERT(kl_stats_.NumRows() == output_dim_);
KALDI_ASSERT(kl_stats_.NumCols() == input_dim_);
}
/// Writes the component content
void WriteData(std::ostream &os, bool binary) const {
kl_stats_.Write(os, binary);
}
/// Set the statistics matrix
void SetStats(const Matrix<BaseFloat> &mat) {
KALDI_ASSERT(mat.NumRows() == output_dim_);
KALDI_ASSERT(mat.NumCols() == input_dim_);
kl_stats_.Resize(mat.NumRows(), mat.NumCols());
kl_stats_.CopyFromMat(mat);
}
/// Accumulate the statistics for KL-HMM parameter estimation
void Accumulate(const Matrix<BaseFloat> &posteriors, const std::vector<int32> &alignment) {
KALDI_ASSERT(posteriors.NumRows() == alignment.size());
KALDI_ASSERT(posteriors.NumCols() == kl_stats_.NumCols());
int32 num_frames = alignment.size();
for(int32 i = 0; i < num_frames; i++) {
// Convert the float posteriors to double (for higher precision during accumulation)
Vector<double> temp(posteriors.Row(i));
// Sum all the posteriors associated with a particular state
kl_stats_.Row(alignment[i]).AddVec(1,temp);
}
}
private:
Matrix<double> kl_stats_;
CuMatrix<BaseFloat> kl_inv_q_;
};
} // namespace nnet1
} // namespace kaldi
#endif
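// Note on PropagateFnc() above: with P a row of network posteriors and Q_k the
// normalized statistics of KL-HMM state k, the output is the negative KL
// divergence per state,
//   out_k = -KL(P||Q_k) = -sum_i P_i log(P_i/Q_{k,i})
//         = -( sum_i P_i log P_i + sum_i P_i log(1/Q_{k,i}) ),
// i.e. the entropy term (AddVecToCols) plus the cross term (AddMatMat), negated.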

View file

@@ -9,12 +9,13 @@ LDLIBS += $(CUDA_LDLIBS)
BINFILES = nnet-train-frmshuff \
nnet-train-xent-hardlab-perutt \
nnet-train-xent-hardlab-frmshuff \
nnet-train-mse-tgtmat-frmshuff \
nnet-train-mmi-sequential \
nnet-train-mpe-sequential \
rbm-train-cd1-frmshuff rbm-convert-to-nnet \
nnet-forward nnet-copy nnet-info nnet-concat \
transf-to-nnet cmvn-to-nnet \
nnet-kl-hmm-acc nnet-kl-hmm-mat-to-component
OBJFILES =
@@ -24,6 +25,6 @@ TESTFILES =
ADDLIBS = ../nnet/kaldi-nnet.a ../cudamatrix/kaldi-cudamatrix.a ../lat/kaldi-lat.a \
../hmm/kaldi-hmm.a ../tree/kaldi-tree.a ../matrix/kaldi-matrix.a \
../util/kaldi-util.a ../base/kaldi-base.a
include ../makefiles/default_rules.mk
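# With the default rules included above, the two new binaries build as ordinary
# per-target goals (assuming a configured Kaldi source tree):
#   cd src/nnetbin && make nnet-kl-hmm-acc nnet-kl-hmm-mat-to-component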

View file

@@ -0,0 +1,107 @@
// nnetbin/nnet-kl-hmm-acc.cc
// Copyright 2013 Idiap Research Institute (Author: David Imseng)
// Karlsruhe Institute of Technology (Author: Ngoc Thang Vu)
// Brno University of Technology (Author: Karel Vesely)
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "nnet/nnet-nnet.h"
#include "nnet/nnet-kl-hmm.h"
#include "base/kaldi-common.h"
#include "util/common-utils.h"
int main(int argc, char *argv[]) {
using namespace kaldi;
using namespace kaldi::nnet1;
try {
const char *usage =
"Collect the statistics for the Kl-HMM trainign.\n"
"Usage: nnet-kl-hmm-acc [options] <feature-rspecifier> <alignments-rspecifier> <kl-hmm-accumulator>\n"
"e.g.: \n"
" nnet-kl-hmm-acc scp:train.scp ark:train.ali kl-hmm.acc\n";
ParseOptions po(usage);
bool binary = false;
int32 n_kl_states = 0;
po.Register("binary", &binary, "Write output in binary mode");
po.Register("nkl-states", &n_kl_states, "Number of states in Kl-HMM");
po.Read(argc, argv);
if (po.NumArgs() != 3) {
po.PrintUsage();
exit(1);
}
std::string feature_rspecifier = po.GetArg(1),
alignments_rspecifier = po.GetArg(2),
kl_hmm_accumulator = po.GetArg(3);
typedef kaldi::int32 int32;
kaldi::int64 total_frames = 0;
SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
RandomAccessInt32VectorReader alignments_reader(alignments_rspecifier);
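// Peek at the first utterance to obtain the posterior dimensionality
// (assumes the feature stream is non-empty).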
int32 posterior_dim = feature_reader.Value().NumCols();
KlHmm kl_hmm(posterior_dim,n_kl_states);
int32 num_done = 0, num_no_alignment = 0, num_other_error = 0;
for (; !feature_reader.Done(); feature_reader.Next()) {
std::string utt = feature_reader.Key();
if (!alignments_reader.HasKey(utt)) {
num_no_alignment++;
} else {
const Matrix<BaseFloat> &mat = feature_reader.Value();
const std::vector<int32> &alignment = alignments_reader.Value(utt);
if ((int32)alignment.size() != mat.NumRows()) {
KALDI_WARN << "Alignment has wrong size "<< (alignment.size()) << " vs. "<< (mat.NumRows());
num_other_error++;
continue;
}
// Accumulate the statistics
kl_hmm.Accumulate(mat, alignment);
// log
KALDI_VLOG(2) << "utt " << utt << ", frames " << alignment.size();
total_frames += mat.NumRows();
num_done++;
}
}
KALDI_LOG << "Done " << num_done << " files, " << num_no_alignment
<< " with no alignments, " << num_other_error
<< " with other errors.";
// Store the accumulator
{
Output out(kl_hmm_accumulator, binary);
kl_hmm.WriteData(out.Stream(), binary);
}
return 0;
} catch(const std::exception &e) {
std::cerr << e.what();
return -1;
}
}

View file

@@ -0,0 +1,77 @@
// nnetbin/nnet-kl-hmm-mat-to-component.cc
// Copyright 2013 Idiap Research Institute (Author: David Imseng)
// Karlsruhe Institute of Technology (Author: Ngoc Thang Vu)
// Brno University of Technology (Author: Karel Vesely)
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "nnet/nnet-nnet.h"
#include "nnet/nnet-kl-hmm.h"
#include "base/kaldi-common.h"
#include "util/common-utils.h"
int main(int argc, char *argv[]) {
using namespace kaldi;
using namespace kaldi::nnet1;
try {
typedef kaldi::int32 int32;
const char *usage =
"Convert matrix of KL-HMM training to nnet component.\n"
"Usage: nnet-kl-hmm-mat-to-component [options] nnet-component matrix\n";
bool binary = true;
int32 n_kl_states = 0;
int32 n_posterior_dim = 0;
ParseOptions po(usage);
po.Register("binary", &binary, "Write output in binary mode");
po.Register("nkl-states", &n_kl_states, "Number of states in Kl-HMM");
po.Register("posterior-dim", &n_posterior_dim, "Dimensionality of posterior features");
po.Read(argc, argv);
if (po.NumArgs() != 2) {
po.PrintUsage();
exit(1);
}
std::string nnet_component_filename = po.GetArg(1);
std::string mat_filename = po.GetArg(2);
Matrix<BaseFloat> kl_stats;
{
bool binary_read;
Input ki(mat_filename, &binary_read);
kl_stats.Read(ki.Stream(), binary_read);
}
KlHmm kl_hmm(kl_stats.NumCols(), kl_stats.NumRows());
kl_hmm.SetStats(kl_stats);
// Write out the nnet component
{
Output ko(nnet_component_filename, binary);
kl_hmm.Write(ko.Stream(), binary);
}
KALDI_LOG << "Written nnet component to " << nnet_component_filename;
} catch(const std::exception &e) {
std::cerr << e.what() << '\n';
return -1;
}
}
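// Typical use (a sketch mirroring steps/kl_hmm/train_kl_hmm.sh; $posteriors and
// $pdf_ali stand for the nnet-forward and ali-to-pdf pipes used there):
//   nnet-kl-hmm-acc --nkl-states=$nkl_states "$posteriors" "$pdf_ali" stats.JOB
//   sum-matrices accumulated-stats stats.*
//   nnet-kl-hmm-mat-to-component kl-hmm.nnet accumulated-stats
//   nnet-concat dnn/final.nnet kl-hmm.nnet final.nnet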

View file

@@ -0,0 +1,74 @@
// nnetbin/nnet-kl-hmm-sum-accs.cc
// Copyright 2013 Idiap Research Institute (Author: David Imseng)
// Karlsruhe Institute of Technology (Author: Ngoc Thang Vu)
// Brno University of Technology (Author: Karel Vesely)
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "nnet/nnet-nnet.h"
#include "nnet/nnet-kl-hmm.h"
#include "base/kaldi-common.h"
#include "util/common-utils.h"
int main(int argc, char *argv[]) {
using namespace kaldi;
using namespace kaldi::nnet1;
try {
typedef kaldi::int32 int32;
const char *usage =
"Sum multiple accumulated stats files for KL-HMM training.\n"
"Usage: nnet-kl-hmm-sum-accs [options] nnet-component stats-in1 stats-in2 ...\n";
bool binary = true;
int32 n_kl_states = 0;
int32 n_posterior_dim = 0;
ParseOptions po(usage);
po.Register("binary", &binary, "Write output in binary mode");
po.Register("nkl-states", &n_kl_states, "Number of states in Kl-HMM");
po.Register("posterior-dim", &n_posterior_dim, "Dimensionality of posterior features");
po.Read(argc, argv);
if (po.NumArgs() < 2) {
po.PrintUsage();
exit(1);
}
std::string nnet_component_filename = po.GetArg(1);
KlHmm kl_hmm(n_posterior_dim, n_kl_states);
int num_accs = po.NumArgs() - 1;
for (int i = 2, max = po.NumArgs(); i <= max; i++) {
std::string stats_in_filename = po.GetArg(i);
bool binary_read;
Input ki(stats_in_filename, &binary_read);
kl_hmm.AddStats(ki.Stream(), binary_read);
}
// Write out the nnet component
{
Output ko(nnet_component_filename, binary);
kl_hmm.Write(ko.Stream(), binary);
}
KALDI_LOG << "Summed " << num_accs << " stats ";
KALDI_LOG << "Written nnet component to " << nnet_component_filename;
} catch(const std::exception &e) {
std::cerr << e.what() << '\n';
return -1;
}
}