Script changes (esp. RE SGMMs).

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@10 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
2011-05-15 05:18:18 +00:00 · 2011-05-15 05:18:18 +00:00 · 4330846ff0
--- a/egs/rm/s1/run.sh
+++ b/egs/rm/s1/run.sh
@ -38,7 +38,6 @@ cd data_prep
 ./run.sh /path/to/RM
 cd ..

-
 mkdir -p data
 ( cd data; cp ../data_prep/{train,test*}.{spk2utt,utt2spk} . ; cp ../data_prep/spk2gender.map . )

@ -50,14 +49,14 @@ steps/prepare_graphs.sh
 # data to (e.g. make it a link to a file on some reasonably large file system).
 # If it doesn't exist, the scripts below will make the directory "exp".

-# tempdir should be set to some place to put training mfcc's
+# mfccdir should be set to some place to put training mfcc's
 # where you have space.
-#e.g.: tempdir=/mnt/matylda6/jhu09/qpovey/kaldi_rm_mfccb
+#e.g.: mfccdir=/mnt/matylda6/jhu09/qpovey/kaldi_rm_mfccb
 mfccdir=/path/to/mfccdir
 steps/make_mfcc_train.sh $mfccdir
 steps/make_mfcc_test.sh $mfccdir

-steps/train_mono.sh    
+steps/train_mono.sh
 steps/decode_mono.sh  &
 steps/train_tri1.sh
 steps/decode_tri1.sh  &
--- a/egs/rm/s1/steps/decode_tri1_fmllr.sh
+++ b/egs/rm/s1/steps/decode_tri1_fmllr.sh
@ -34,8 +34,6 @@ scripts/mkgraph.sh $tree $model $graphdir
 for test in mar87 oct87 feb89 oct89 feb91 sep92; do
 (
  # Comment the two lines below to make this per-utterance.
-  # This would only work if $srcdir was also per-utterance [otherwise
-  # you'd have to mess with the script a bit].
  spk2utt_opt=--spk2utt=ark:data/test_${test}.spk2utt
  utt2spk_opt=--utt2spk=ark:data/test_${test}.utt2spk

--- a/egs/rm/s1/steps/decode_tri1_regtree_fmllr.sh
+++ b/egs/rm/s1/steps/decode_tri1_regtree_fmllr.sh
@ -38,8 +38,6 @@ scripts/mkgraph.sh $tree $model $graphdir
 for test in mar87 oct87 feb89 oct89 feb91 sep92; do
 (
  # Comment the two lines below to make this per-utterance.
-  # This would only work if $srcdir was also per-utterance [otherwise
-  # you'd have to mess with the script a bit].
  spk2utt_opt=--spk2utt=ark:data/test_${test}.spk2utt
  utt2spk_opt=--utt2spk=ark:data/test_${test}.utt2spk

--- a/egs/rm/s1/steps/init_sgmm.sh
+++ b/egs/rm/s1/steps/init_sgmm.sh
@ -1,48 +0,0 @@
-#!/bin/bash
-# Copyright 2010-2011 Microsoft Corporation
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Initialize SGMM from a trained HMM/GMM system.
-
-if [ -f path.sh ]; then . path.sh; fi
-
-dir=exp/sgmm/init
-mkdir -p $dir
-srcdir=exp/tri1
-model=exp/sgmm/0.mdl
-
-init-ubm --intermediate-numcomps=2000 --ubm-numcomps=400 --verbose=2 \
-    --fullcov-ubm=true $srcdir/final.mdl $srcdir/final.occs \
-    $dir/ubm0 2> $dir/cluster.log
-
-
-subset[0]=1000
-subset[1]=1500
-subset[2]=2000
-subset[3]=2500
-
-for x in 0 1 2 3; do
-    echo "Pass $x"
-    feats="ark:scripts/subset_scp.pl ${subset[$x]} data/train.scp | add-deltas --print-args=false scp:- ark:- |"
-    fgmm-global-acc-stats --diag-gmm-nbest=15 --binary=false --verbose=2 $dir/ubm$x "$feats" $dir/$x.acc \
-	2> $dir/acc.$x.log  || exit 1;
-    fgmm-global-est --verbose=2 $dir/ubm$x $dir/$x.acc \
-	$dir/ubm$[$x+1] 2> $dir/update.$x.log || exit 1;
-    rm $dir/$x.acc
-done
-
-sgmm-init $srcdir/final.mdl $dir/ubm4 $model 2> $dir/sgmm_init.log
-
--- a/egs/rm/s1/steps/train_sgmma.sh
+++ b/egs/rm/s1/steps/train_sgmma.sh
@ -28,9 +28,11 @@ numiters=25   # Total number of iterations

 realign_iters="5 10 15";
 silphonelist=`cat data/silphones.csl`
-numsubstates=1500 # Initial #-substates.
-totsubstates=5000 # Target #-substates.
+numleaves=2500
+numsubstates=2500 # Initial #-substates.
+totsubstates=7500 # Target #-substates.
 maxiterinc=15 # Last iter to increase #substates on.
+
 incsubstates=$[($totsubstates-$numsubstates)/$maxiterinc] # per-iter increment for #substates
 gselect_opt="--gselect=ark:gunzip -c $dir/gselect.gz|"
 randprune=0.1
@ -38,12 +40,12 @@ mkdir -p $dir

 feats="ark:add-deltas --print-args=false scp:data/train.scp ark:- |"

-cp $srcdir/tree $dir
+cp $srcdir/topo $dir

 if [ ! -f $ubm ]; then
-  echo "No UBM in $ubm"
+  echo "No UBM in $ubm";
+  exit 1
 fi
-sgmm-init $srcdir/final.mdl $ubm $dir/0.mdl 2> $dir/sgmm_init.log

 echo "aligning all training data"
 if [ ! -f $dir/0.ali ]; then
@ -51,6 +53,33 @@ if [ ! -f $dir/0.ali ]; then
        "$feats" ark,t:$dir/0.ali 2> $dir/align.0.log || exit 1;
 fi

+# We rebuild the tree because we want a larger #states than for a normal
+# GMM system (the optimum #states for SGMMs tends to be a bit higher).
+
+if [ ! -f $dir/treeacc ]; then
+  acc-tree-stats  --ci-phones=$silphonelist $srcmodel "$feats" ark:$dir/0.ali \
+    $dir/treeacc 2> $dir/acc.tree.log  || exit 1;
+fi
+
+cat data/phones.txt | awk '{print $NF}' | grep -v -w 0 > $dir/phones.list
+cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/questions.log || exit 1;
+scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
+compile-questions $dir/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;
+scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;
+
+build-tree --verbose=1 --max-leaves=$numleaves \
+    $dir/treeacc $dir/roots.txt \
+    $dir/questions.qst $dir/topo $dir/tree  2> $dir/train_tree.log || exit 1;
+
+# the sgmm-init program accepts a GMM, so we just create a temporary GMM "0.gmm"
+
+gmm-init-model  --write-occs=$dir/0.occs  \
+    $dir/tree $dir/treeacc $dir/topo $dir/0.gmm 2> $dir/init_gmm.log || exit 1;
+
+sgmm-init $dir/0.gmm $ubm $dir/0.mdl 2> $dir/init_sgmm.log || exit 1;
+
+rm $dir/0.gmm
+
 if [ ! -f $dir/gselect.gz ]; then
 sgmm-gselect $dir/0.mdl "$feats" ark,t:- 2>$dir/gselect.log | gzip -c > $dir/gselect.gz || exit 1;
 fi
@ -76,13 +105,12 @@ while [ $iter -lt $numiters ]; do
     sgmm-acc-stats-ali --update-flags=$flags "$gselect_opt" --rand-prune=$randprune --binary=false $dir/$iter.mdl "$feats" ark:$dir/cur.ali $dir/$iter.acc 2> $dir/acc.$iter.log  || exit 1;
     sgmm-est --update-flags=$flags --split-substates=$numsubstates --write-occs=$dir/$[$iter+1].occs $dir/$iter.mdl $dir/$iter.acc $dir/$[$iter+1].mdl 2> $dir/update.$iter.log || exit 1;
   fi
-# TEMP: will restore these statements later.
-#  	rm $dir/$iter.mdl $dir/$iter.acc
-#  	rm $dir/$iter.occs 
-    if [ $iter -lt $maxiterinc ]; then
-       numsubstates=$[$numsubstates+$incsubstates]
-    fi
-    iter=$[$iter+1];
+   rm $dir/$iter.mdl $dir/$iter.acc
+   rm $dir/$iter.occs 
+   if [ $iter -lt $maxiterinc ]; then
+     numsubstates=$[$numsubstates+$incsubstates]
+   fi
+   iter=$[$iter+1];
 done

 ( cd $dir; rm final.mdl final.occs 2>/dev/null; ln -s $iter.mdl final.mdl; ln -s $iter.occs final.occs )
--- a/egs/rm/s1/steps/train_sgmmb.sh
+++ b/egs/rm/s1/steps/train_sgmmb.sh
@ -33,8 +33,9 @@ ubm=exp/ubma/4.ubm
 realign_iters="5 10 15"; 
 spkvec_iters="5 8 12 17 22"
 silphonelist=`cat data/silphones.csl`
-numsubstates=1500 # Initial #-substates.
-totsubstates=5000 # Target #-substates.
+numleaves=2500
+numsubstates=2500 # Initial #-substates.
+totsubstates=7500 # Target #-substates.
 maxiterinc=15 # Last iter to increase #substates on.
 incsubstates=$[($totsubstates-$numsubstates)/$maxiterinc] # per-iter increment for #substates
 gselect_opt="--gselect=ark:gunzip -c $dir/gselect.gz|"
@ -52,9 +53,7 @@ if [ ! -f $ubm ]; then
  echo "No UBM in $ubm"
 fi

-sgmm-init --spk-space-dim=39 $srcdir/final.mdl $ubm $dir/0.mdl 2> $dir/sgmm_init.log || exit 1;
-
-cp $srcdir/tree $dir
+cp $srcdir/topo $dir

 echo "aligning all training data"
 if [ ! -f $dir/0.ali ]; then
@ -62,8 +61,33 @@ if [ ! -f $dir/0.ali ]; then
        "$feats" ark,t:$dir/0.ali 2> $dir/align.0.log || exit 1;
 fi

+# We rebuild the tree because we want a larger #states than for a normal
+# GMM system (the optimum #states for SGMMs tends to be a bit higher).
+
+if [ ! -f $dir/treeacc ]; then
+  acc-tree-stats  --ci-phones=$silphonelist $srcmodel "$feats" ark:$dir/0.ali \
+    $dir/treeacc 2> $dir/acc.tree.log  || exit 1;
+fi
+
+cat data/phones.txt | awk '{print $NF}' | grep -v -w 0 > $dir/phones.list
+cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/questions.log || exit 1;
+scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
+compile-questions $dir/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;
+scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;
+
+build-tree --verbose=1 --max-leaves=$numleaves \
+    $dir/treeacc $dir/roots.txt \
+    $dir/questions.qst $dir/topo $dir/tree  2> $dir/train_tree.log || exit 1;
+
+# the sgmm-init program accepts a GMM, so we just create a temporary GMM "0.gmm"
+
+gmm-init-model  --write-occs=$dir/0.occs  \
+    $dir/tree $dir/treeacc $dir/topo $dir/0.gmm 2> $dir/init_gmm.log || exit 1;
+
+sgmm-init --spk-space-dim=39 $dir/0.gmm $ubm $dir/0.mdl 2> $dir/init_sgmm.log || exit 1;
+
 if [ ! -f $dir/0.mdl ]; then
-   echo "you must run init_sgmm.sh before train_sgmm1.sh"
+   echo "you must run init_sgmm.sh before train_sgmmb.sh"
   exit 1
 fi

--- a/egs/rm/s1/steps/train_tri1.sh
+++ b/egs/rm/s1/steps/train_tri1.sh
@ -58,7 +58,7 @@ scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.tx
 compile-questions $dir/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;

 # Have to make silence root not-shared because we will not split it.
-scripts/make_roots.pl --separate data/phones.txt `cat data/silphones.csl` shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;
+scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;


 build-tree --verbose=1 --max-leaves=$numleaves \
--- a/egs/rm/s1/steps/train_tri2a.sh
+++ b/egs/rm/s1/steps/train_tri2a.sh
@ -55,7 +55,7 @@ cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/question
 scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
 compile-questions $dir/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;

-scripts/make_roots.pl --separate data/phones.txt `cat data/silphones.csl` shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;
+scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;

 build-tree --verbose=1 --max-leaves=$numleaves \
    $dir/treeacc $dir/roots.txt \
--- a/egs/rm/s1/steps/train_tri2b.sh
+++ b/egs/rm/s1/steps/train_tri2b.sh
@ -78,7 +78,7 @@ cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/question
 scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
 compile-questions $dir/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;

-scripts/make_roots.pl --separate data/phones.txt `cat data/silphones.csl` shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;
+scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;

 build-tree --verbose=1 --max-leaves=$numleaves \
    $dir/treeacc $dir/roots.txt \
--- a/egs/rm/s1/steps/train_tri2c.sh
+++ b/egs/rm/s1/steps/train_tri2c.sh
@ -77,7 +77,7 @@ cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/question
 scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
 compile-questions $dir/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;

-scripts/make_roots.pl --separate data/phones.txt `cat data/silphones.csl` shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;
+scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;

 build-tree --verbose=1 --max-leaves=$numleaves \
    $dir/treeacc $dir/roots.txt \
--- a/egs/rm/s1/steps/train_tri2d.sh
+++ b/egs/rm/s1/steps/train_tri2d.sh
@ -56,7 +56,7 @@ cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/question
 scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
 compile-questions $dir/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;

-scripts/make_roots.pl --separate data/phones.txt `cat data/silphones.csl` shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;
+scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;

 build-tree --verbose=1 --max-leaves=$numleaves \
    $dir/treeacc $dir/roots.txt \
--- a/egs/rm/s1/steps/train_tri2e.sh
+++ b/egs/rm/s1/steps/train_tri2e.sh
@ -63,7 +63,7 @@ cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/question
 scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
 compile-questions $dir/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;

-scripts/make_roots.pl --separate data/phones.txt `cat data/silphones.csl` shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;
+scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;

 build-tree --verbose=1 --max-leaves=$numleaves \
    $dir/treeacc $dir/roots.txt \
--- a/egs/rm/s1/steps/train_tri2f.sh
+++ b/egs/rm/s1/steps/train_tri2f.sh
@ -65,7 +65,7 @@ cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/question
 scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
 compile-questions $dir/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;

-scripts/make_roots.pl --separate data/phones.txt `cat data/silphones.csl` shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;
+scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;

 build-tree --verbose=1 --max-leaves=$numleaves \
    $dir/treeacc $dir/roots.txt \
--- a/egs/rm/s1/steps/train_tri2g.sh
+++ b/egs/rm/s1/steps/train_tri2g.sh
@ -82,7 +82,7 @@ done


 # just a single element. :-separated integer list of context-independent
-scripts/make_roots.pl --separate data/phones.txt `cat data/silphones.csl` shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;
+scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;
 # script below tells it not to cluster, but here we avoid accumulating
 # CD-stats for silence.

@ -106,7 +106,7 @@ cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/question
 scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
 compile-questions $dir/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;

-scripts/make_roots.pl --separate data/phones.txt `cat data/silphones.csl` shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;
+scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;

 build-tree --verbose=1 --max-leaves=$numleaves \
    $dir/treeacc $dir/roots.txt \
--- a/egs/rm/s1/steps/train_tri2h.sh
+++ b/egs/rm/s1/steps/train_tri2h.sh
@ -65,7 +65,7 @@ cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/question
 scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
 compile-questions $dir/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;

-scripts/make_roots.pl --separate data/phones.txt `cat data/silphones.csl` shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;
+scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;

 build-tree --verbose=1 --max-leaves=$numleaves \
    $dir/treeacc $dir/roots.txt \
--- a/egs/rm/s1/steps/train_tri2i.sh
+++ b/egs/rm/s1/steps/train_tri2i.sh
@ -69,7 +69,7 @@ cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/question
 scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
 compile-questions $dir/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;

-scripts/make_roots.pl --separate data/phones.txt `cat data/silphones.csl` shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;
+scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;

 build-tree --verbose=1 --max-leaves=$numleaves \
    $dir/treeacc $dir/roots.txt \
--- a/egs/rm/s1/steps/train_tri2j.sh
+++ b/egs/rm/s1/steps/train_tri2j.sh
@ -66,7 +66,7 @@ cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/question
 scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
 compile-questions $dir/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;

-scripts/make_roots.pl --separate data/phones.txt `cat data/silphones.csl` shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;
+scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;

 build-tree --verbose=1 --max-leaves=$numleaves \
    $dir/treeacc $dir/roots.txt \
@ -182,7 +182,7 @@ cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/question
 scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
 compile-questions $dir/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;

-scripts/make_roots.pl --separate data/phones.txt `cat data/silphones.csl` shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;
+scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;

 build-tree --verbose=1 --max-leaves=$numleaves \
    $dir/treeacc $dir/roots.txt \
--- a/egs/rm/s1/steps/train_tri2k.sh
+++ b/egs/rm/s1/steps/train_tri2k.sh
@ -92,7 +92,7 @@ cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/question
 scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
 compile-questions $dir/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;

-scripts/make_roots.pl --separate data/phones.txt `cat data/silphones.csl` shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;
+scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;


 build-tree --verbose=1 --max-leaves=$numleaves \
--- a/egs/rm/s1/steps/train_tri2l.sh
+++ b/egs/rm/s1/steps/train_tri2l.sh
@ -68,7 +68,7 @@ cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/question
 scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
 compile-questions $dir/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;

-scripts/make_roots.pl --separate data/phones.txt `cat data/silphones.csl` shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;
+scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;

 build-tree --verbose=1 --max-leaves=$numleaves \
    $dir/treeacc $dir/roots.txt \
--- a/egs/rm/s1/steps/train_ubma.sh
+++ b/egs/rm/s1/steps/train_ubma.sh
@ -37,9 +37,9 @@ subset[3]=2500
 for x in 0 1 2 3; do
    echo "Pass $x"
    feats="ark:scripts/subset_scp.pl ${subset[$x]} data/train.scp | add-deltas --print-args=false scp:- ark:- |"
-    fgmm-acc-stats --diag-gmm-nbest=15 --binary=false --verbose=2 $dir/$x.ubm "$feats" $dir/$x.acc \
+    fgmm-global-acc-stats --diag-gmm-nbest=15 --binary=false --verbose=2 $dir/$x.ubm "$feats" $dir/$x.acc \
 	2> $dir/acc.$x.log  || exit 1;
-    fgmm-est --verbose=2 $dir/$x.ubm $dir/$x.acc \
+    fgmm-global-est --verbose=2 $dir/$x.ubm $dir/$x.acc \
 	$dir/$[$x+1].ubm 2> $dir/update.$x.log || exit 1;
    rm $dir/$x.acc $dir/$x.ubm
 done
--- a/egs/wsj/s1/run.sh
+++ b/egs/wsj/s1/run.sh
@ -84,12 +84,14 @@ cat data/lexicon.txt | awk '{print $1}' | sort | uniq  | \

 cd data_prep

-#TODO: remove following system-specific comments.
-#On BUT system, do:
-./run.sh /mnt/matylda2/data/WSJ?/??-{?,??}.?

-# On Geoff Hinton's system we can do:
-#  ./run.sh  /ais/gobi2/speech/WSJ/*/??-{?,??}.?
+# Run the WSJ data preparation (the disk paths are site-specific):
+# The following command needs a list of directory names from
+# the LDC's WSJ disks.  These will end in e.g. 11-1.1.
+# examples:
+# /ais/gobi2/speech/WSJ/*/??-{?,??}.?
+# /mnt/matylda2/data/WSJ?/??-{?,??}.?
+./run.sh [list-of-directory-names]


 cd ..
--- a/src/TODO
+++ b/src/TODO
@ -6,6 +6,7 @@

 TODO items (mainly for Dan to do):

+  Add separate min-count at root of tree for regression-tree fMLLR/MLLR
  Add fMLLR scripts for SGMM, and also use deeper trees.
  Document configure script.
  Remove cpplint.py from distribution.
--- a/src/doc/model.dox
+++ b/src/doc/model.dox
@ -20,7 +20,7 @@
 namespace kaldi {

 /**
-   \page model Acoustic modeling code
+  \page model Acoustic modeling code

  \section model_intro Introduction

@ -29,7 +29,7 @@ namespace kaldi {
  models (i.e. diagonal GMMs) and Subspace Gaussian Mixture Models (SGMMs), but
  also to be easily extensible to new kinds of model.  In a previous iteration of
  designing this software, we used a virtual base class that both the GMM and
-  SGMM classes inherited from, and wrote command-line tools to handle both types of
+  SGMM classes inherited from, and wrote command-line tools that handled both types of
  model.  Our experience was that a base class is not as useful as one might
  think, because there are too many differences in the models (e.g. they support
  different types of adaptation), and we were forced to constantly expand the
@ -100,12 +100,22 @@ namespace kaldi {
 for such things as model estimation (e.g. see MlEstimateAmDiagGmm), or transform
 estimation (there are various pieces of code that do this; see \ref transforms.

+ \subsection model_full_gmm Full-covariance GMMs
+
+ We have a class \ref FullGmm for full-covariance GMMs, which has similar functionality
+ to the \ref DiagGmm class but with full covariances.  This is mainly of use for training
+ full-covariance Universal Background Models (UBMs) in the SGMM recipe (see below).
+ The only command-line tools available for full GMMs are used to train global mixture models
+ (i.e. UBMs); we have not
+ implemented a full-covariance version of the AmDiagGmm class or the corresponding
+ command-line tools, although doing so would be fairly easy.
+
 \section model_sgmm Subspace Gaussian Mixture Models (SGMMs)

 Subspace Gaussian Mixture Models (SGMMs) are implemented by class
 AmSgmm.  This class essentially implements the approach described in
- ``The Subspace Gaussian Mixture Model – a Structured Model for Speech 
- Recognition'', by D. Povey, Lukas Burget et. al Computer Speech and Language, 
+ ``The Subspace Gaussian Mixture Model -- a Structured Model for Speech 
+ Recognition'', by D. Povey, Lukas Burget et al., Computer Speech and Language, 
 2011.
 The class AmSgmm represents a whole collection of pdf's; there
 is no class that represents a single pdf of the SGMM (as there is for
@ -118,3 +128,4 @@ namespace kaldi {


 */
+}