Committing some scripts.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@103 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
2011-06-26 05:04:49 +00:00 · 2011-06-26 05:04:49 +00:00 · d7a6d499aa
--- a/egs/wsj/s1/steps/train_sgmm3b.sh
+++ b/egs/wsj/s1/steps/train_sgmm3b.sh
@ -77,6 +77,7 @@ cp $srcdir/topo $dir
 # Note: a small number of utterances don't have graphs at this stage because of differences
 # in how the data splitting is done when we switch to using speaker information.

+
 echo "Aligning all training data"

 rm -f $dir/.error
@ -89,7 +90,6 @@ done
 wait;
 [ -f $dir/.error ] &&  echo align error RE old system && exit 1

-
 acc-tree-stats  --ci-phones=$silphonelist $srcmodel "$feats" \
  "ark:gunzip -c $dir/0.?.ali.gz|" $dir/treeacc 2> $dir/acc.tree.log  || exit 1;

@ -166,7 +166,7 @@ while [ $x -lt $numiters ]; do
     rm -f $dir/.error
     for n in 1 2 3; do
       sgmm-align-compiled ${spkvecs_opt[$n]} --utt2spk=ark:$dir/train$n.utt2spk \
-            "--gselect=ark:gunzip -c $dir/gselect$n.gz|" \
+            "--gselect=ark,s,cs:gunzip -c $dir/gselect$n.gz|" \
           $scale_opts --beam=8 --retry-beam=40 $dir/$x.mdl \
           "ark:gunzip -c $dir/graphs${n}.fsts.gz|" "${featspart[$n]}" \
           "ark:|gzip -c >$dir/cur${n}.ali.gz" 2> $dir/align.$x.$n.log \
@ -181,9 +181,9 @@ while [ $x -lt $numiters ]; do
      ( ali-to-post "ark:gunzip -c $dir/cur${n}.ali.gz|" ark:- | \
        weight-silence-post 0.01 $silphonelist $dir/$x.mdl ark:- ark:- | \
        sgmm-est-spkvecs --spk2utt=ark:$dir/train$n.spk2utt ${spkvecs_opt[$n]} \
-         "--gselect=ark:gunzip -c $dir/gselect$n.gz|" \
+         "--gselect=ark,s,cs:gunzip -c $dir/gselect$n.gz|" \
          --rand-prune=$randprune $dir/$x.mdl \
-         "${featspart[$n]}" ark:- ark:$dir/tmp$n.vecs && mv $dir/tmp$n.vecs $cur/cur$n.vecs ) \
+         "${featspart[$n]}" ark,s,cs:- ark:$dir/tmp$n.vecs && mv $dir/tmp$n.vecs $dir/cur$n.vecs ) \
                   2>$dir/spkvecs.$x.$n.log \
           || touch $dir/.error &
        spkvecs_opt[$n]="--spk-vecs=ark:$dir/cur$n.vecs"
@ -202,7 +202,7 @@ while [ $x -lt $numiters ]; do

   for n in 1 2 3; do
     sgmm-acc-stats-ali ${spkvecs_opt[$n]} --utt2spk=ark:$dir/train$n.utt2spk \
-       --update-flags=$flags "--gselect=ark:gunzip -c $dir/gselect$n.gz|" \
+       --update-flags=$flags "--gselect=ark,s,cs:gunzip -c $dir/gselect$n.gz|" \
       --rand-prune=$randprune --binary=true $dir/$x.mdl "${featspart[$n]}" \
      "ark:gunzip -c $dir/cur$n.ali.gz|" $dir/$x.$n.acc 2> $dir/acc.$x.$n.log \
        || touch $dir/.error &
@ -225,7 +225,7 @@ flags=MwcS
 for n in 1 2 3; do
 ( ali-to-post "ark:gunzip -c $dir/cur$n.ali.gz|" ark:- | \
   sgmm-post-to-gpost ${spkvecs_opt[$n]} --utt2spk=ark:$dir/train$n.utt2spk \
-                "--gselect=ark:gunzip -c $dir/gselect$n.gz|" \
+                "--gselect=ark,s,cs:gunzip -c $dir/gselect$n.gz|" \
                 $dir/$x.mdl "${featspart[$n]}" ark,s,cs:- ark:- | \
  sgmm-acc-stats-gpost --update-flags=$flags  $dir/$x.mdl "${featspart[$n]}" \
            ark,s,cs:- $dir/$x.$n.aliacc ) 2> $dir/acc_ali.$x.$n.log || touch $dir/.error &
--- a/egs/wsj/s1/steps/train_sgmm3b2.sh
+++ b/egs/wsj/s1/steps/train_sgmm3b2.sh
@ -0,0 +1,248 @@
+#!/bin/bash
+# Copyright 2010-2011 Microsoft Corporation  Arnab Ghoshal
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# 3b2 is with dim=50
+# sgmm3b is as sgmm2b (SGMM with speaker vectors), but using all
+# the training data. 
+# Instead of starting from sgmm2b we start from tri3a.  This means we can
+# essentially reuse the train_sgmm2b.sh script, and don't have to do
+# alignment of the model with speaker vectors (which requires multiple
+# passes to do properly and is a bit of a hassle).
+
+if [ -f path.sh ]; then . path.sh; fi
+
+dir=exp/sgmm3b2 # output directory: models, logs, alignments and temporary files all go here.
+srcdir=exp/tri3a # more convenient as has graphs and alignments for this data already.
+ubm=exp/ubm3a/final.ubm # 600 UBM comps
+srcmodel=$srcdir/final.mdl # model used for the initial alignment and for accumulating tree stats.
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" # passed to both aligners.
+
+numiters=35 # Total number of iterations.
+realign_iters="5 15 25"; # realign a bit earlier than we did in tri2a, 
+    # since SGMM system quite different
+    # from normal triphone system.
+spkvec_iters="5 8 12 17 22 32" # iterations on which speaker vectors are (re-)estimated.
+maxiterinc=20 # By this iter, we have all the substates.
+numleaves=6000 # was 4.2k for GMM system: increasing it for SGMM system.
+numsubstates=6000 # initial #-substates
+totsubstates=35000 # a little less than #Gauss for baseline GMM system (40k)
+incsubstates=$[($totsubstates-$numsubstates)/$maxiterinc] # per-iter increment for #substates
+phn_dim=50 # value given to sgmm-est as --increase-phn-dim on iteration $phn_dim_iter.
+phn_dim_iter=3 # iter to increase phn dim.
+
+silphonelist=`cat data/silphones.csl` # used for --ci-phones and for weight-silence-post.
+randprune=0.1 # passed as --rand-prune to sgmm-est-spkvecs and sgmm-acc-stats-ali.
+
+mkdir -p $dir
+cp $srcdir/train.scp $dir
+cp $srcdir/train.tra $dir
+scripts/filter_scp.pl $dir/train.scp data/train.utt2spk > $dir/train.utt2spk
+scripts/utt2spk_to_spk2utt.pl $dir/train.utt2spk > $dir/train.spk2utt
+
+scripts/split_scp.pl --utt2spk=$dir/train.utt2spk $dir/train{,1,2,3}.scp
+scripts/split_scp.pl --utt2spk=$dir/train.utt2spk $dir/train{,1,2,3}.tra
+scripts/split_scp.pl --utt2spk=$dir/train.utt2spk $dir/train{,1,2,3}.utt2spk
+
+for n in 1 2 3 ""; do # The "" handles the un-split one.  Creating spk2utt files..
+  scripts/utt2spk_to_spk2utt.pl $dir/train$n.utt2spk > $dir/train$n.spk2utt
+done
+
+# also see featspart below, used for sub-parts of the features;
+# try to keep them in sync.
+feats="ark:add-deltas --print-args=false scp:$dir/train.scp ark:- |"
+for n in 1 2 3; do
+   featspart[$n]="ark:add-deltas --print-args=false scp:$dir/train${n}.scp ark:- |"
+done
+
+if [ ! -f $ubm ]; then
+  echo "No UBM in $ubm";
+  exit 1
+fi
+
+cp $srcdir/topo $dir
+
+# Align all training data using old model (and old graphs, since we
+# use the same data-subset as last time). 
+# Note: a small number of utterances don't have graphs at this stage because of differences
+# in how the data splitting is done when we switch to using speaker information.
+
+echo "Aligning all training data"
+
+rm -f $dir/.error
+for n in 1 2 3; do
+   gmm-align-compiled $scale_opts --beam=8 --retry-beam=40 $srcmodel \
+       "ark:gunzip -c $srcdir/graphs${n}.fsts.gz|" "${featspart[$n]}" \
+       "ark:|gzip -c >$dir/0.${n}.ali.gz" \
+           2> $dir/align.0.${n}.log || touch $dir/.error &
+done
+wait;
+[ -f $dir/.error ] &&  echo align error RE old system && exit 1
+
+acc-tree-stats  --ci-phones=$silphonelist $srcmodel "$feats" \
+  "ark:gunzip -c $dir/0.?.ali.gz|" $dir/treeacc 2> $dir/acc.tree.log  || exit 1;
+
+
+# The next few commands are involved with making the questions
+# for tree clustering.  The extra complexity vs. the RM recipe has
+# to do with the desire to ask questions about the "real" phones
+# ignoring things like stress and position-in-word, and ask questions
+# separately about stress and position-in-word.
+
+# Don't include silences as things to be clustered -> --nosil option.
+scripts/make_shared_phones.sh --nosil | scripts/sym2int.pl data/phones.txt > $dir/phone_sets.list
+cluster-phones $dir/treeacc $dir/phone_sets.list $dir/questions.txt 2> $dir/cluster_phones.log || exit 1;
+scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
+scripts/make_extra_questions.sh | cat $dir/questions_syms.txt - > $dir/questions_syms_all.txt
+scripts/sym2int.pl data/phones.txt < $dir/questions_syms_all.txt > $dir/questions_all.txt
+
+compile-questions $dir/topo $dir/questions_all.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;
+
+scripts/make_roots.sh > $dir/roots_syms.txt
+scripts/sym2int.pl --ignore-oov data/phones.txt  < $dir/roots_syms.txt > $dir/roots.txt
+
+build-tree --verbose=1 --max-leaves=$numleaves \
+  $dir/treeacc $dir/roots.txt \
+  $dir/questions.qst $dir/topo $dir/tree  2> $dir/train_tree.log || exit 1;
+
+# the sgmm-init program accepts a GMM, so we just create a temporary GMM "0.gmm"
+
+gmm-init-model  --write-occs=$dir/0.occs  \
+    $dir/tree $dir/treeacc $dir/topo $dir/0.gmm 2> $dir/init_gmm.log || exit 1;
+
+sgmm-init --spk-space-dim=39 $dir/0.gmm $ubm $dir/0.mdl 2> $dir/init_sgmm.log || exit 1;
+
+rm $dir/0.gmm
+
+rm $dir/treeacc
+
+for n in 1 2 3; do # pipefail below: otherwise only gzip's status is seen and a failed sgmm-gselect goes unnoticed
+  ( set -o pipefail; sgmm-gselect $dir/0.mdl "${featspart[$n]}" ark,t:- 2>$dir/gselect$n.log | \
+   gzip -c > $dir/gselect${n}.gz ) || touch $dir/.error &
+done
+wait # barrier: all three background jobs must finish before the error marker is checked
+[ -f $dir/.error ] && echo "Error in gselect phase" && exit 1;
+
+# Convert alignments generated from previous model, to use as 
+# initial alignments.
+
+for n in 1 2 3; do
+  convert-ali $srcmodel $dir/0.mdl $dir/tree \
+      "ark:gunzip -c $dir/0.$n.ali.gz|" \
+      "ark:|gzip -c > $dir/cur$n.ali.gz" \
+     2>$dir/convert.$n.log || exit 1; # don't parallelize: mostly I/O.
+done
+rm $dir/0.?.ali.gz
+
+# Make training graphs
+echo "Compiling training graphs"
+
+rm -f $dir/.error
+for n in 1 2 3; do
+  compile-train-graphs $dir/tree $dir/0.mdl  data/L.fst ark:$dir/train${n}.tra \
+     "ark:|gzip -c > $dir/graphs${n}.fsts.gz" \
+     2>$dir/compile_graphs.${n}.log || touch $dir/.error &
+done
+wait
+[ -f $dir/.error ] &&  echo compile-graphs error && exit 1
+
+
+x=0
+while [ $x -lt $numiters ]; do
+   echo "Pass $x"
+   if echo $realign_iters | grep -w $x >/dev/null; then
+     echo "Aligning data"
+     rm -f $dir/.error
+     for n in 1 2 3; do
+       sgmm-align-compiled ${spkvecs_opt[$n]} --utt2spk=ark:$dir/train$n.utt2spk \
+            "--gselect=ark,s,cs:gunzip -c $dir/gselect$n.gz|" \
+           $scale_opts --beam=8 --retry-beam=40 $dir/$x.mdl \
+           "ark:gunzip -c $dir/graphs${n}.fsts.gz|" "${featspart[$n]}" \
+           "ark:|gzip -c >$dir/cur${n}.ali.gz" 2> $dir/align.$x.$n.log \
+             || touch $dir/.error &
+     done
+     wait 
+     [ -f $dir/.error ] && echo error aligning data && exit 1
+   fi
+   if echo $spkvec_iters | grep -w $x >/dev/null; then # x is one of the speaker-vector iterations
+     echo "Computing speaker vectors"
+     for n in 1 2 3; do # one background job per split; posteriors are piped in as ark,s,cs:- like the gselect input
+      ( ali-to-post "ark:gunzip -c $dir/cur${n}.ali.gz|" ark:- | \
+        weight-silence-post 0.01 $silphonelist $dir/$x.mdl ark:- ark:- | \
+        sgmm-est-spkvecs --spk2utt=ark:$dir/train$n.spk2utt ${spkvecs_opt[$n]} \
+         "--gselect=ark,s,cs:gunzip -c $dir/gselect$n.gz|" \
+          --rand-prune=$randprune $dir/$x.mdl \
+         "${featspart[$n]}" ark,s,cs:- ark:$dir/tmp$n.vecs && mv $dir/tmp$n.vecs $dir/cur$n.vecs ) \
+                   2>$dir/spkvecs.$x.$n.log \
+           || touch $dir/.error &
+        spkvecs_opt[$n]="--spk-vecs=ark:$dir/cur$n.vecs" # set after the job is launched, so the job itself still sees the previous value
+     done
+     wait; # all splits must finish before later steps read cur$n.vecs
+     [ -f $dir/.error ] && echo error computing speaker vectors && exit 1
+   fi
+
+   if [ $x -eq 0 ]; then # first pass: update flags contain neither M nor N
+     flags=vwcS
+   elif [ $[$x%2] -eq 1 -a $x -gt 4 ]; then # odd x above 4 (x=5,7,9,...), i.e. the 6th, 8th, ... pass counting from 1
+     flags=vNwcS # N in place of M; presumably the speaker-space projections -- confirm against sgmm-est
+   else # every other pass (x=1..4 and even x from 6 on)
+     flags=vMwcS # M in place of N; presumably the phonetic-space projections -- confirm against sgmm-est
+   fi
+
+   for n in 1 2 3; do
+     sgmm-acc-stats-ali ${spkvecs_opt[$n]} --utt2spk=ark:$dir/train$n.utt2spk \
+       --update-flags=$flags "--gselect=ark,s,cs:gunzip -c $dir/gselect$n.gz|" \
+       --rand-prune=$randprune --binary=true $dir/$x.mdl "${featspart[$n]}" \
+      "ark:gunzip -c $dir/cur$n.ali.gz|" $dir/$x.$n.acc 2> $dir/acc.$x.$n.log \
+        || touch $dir/.error &
+   done
+   wait;
+   [ -f $dir/.error ] && echo error accumulating stats on iter $x && exit 1  
+   if [ $x == $phn_dim_iter ]; then 
+     phn_dim_opt=--increase-phn-dim=$phn_dim
+   else
+     phn_dim_opt=
+   fi
+   sgmm-est $phn_dim_opt --update-flags=$flags --split-substates=$numsubstates \
+      --write-occs=$dir/$[$x+1].occs $dir/$x.mdl "sgmm-sum-accs - $dir/$x.?.acc|" \
+      $dir/$[$x+1].mdl  2> $dir/update.$x.log || exit 1;
+   rm $dir/$x.mdl $dir/$x.?.acc $dir/$x.occs 2>/dev/null
+   if [ $x -lt $maxiterinc ]; then 
+     numsubstates=$[$numsubstates+$incsubstates]
+   fi
+   x=$[$x+1];
+done
+
+( cd $dir; rm final.mdl final.occs 2>/dev/null; ln -s $x.mdl final.mdl; ln -s $x.occs final.occs )
+
+# Create "alignment model"
+flags=MwcS
+for n in 1 2 3; do
+ ( ali-to-post "ark:gunzip -c $dir/cur$n.ali.gz|" ark:- | \
+   sgmm-post-to-gpost ${spkvecs_opt[$n]} --utt2spk=ark:$dir/train$n.utt2spk \
+                "--gselect=ark,s,cs:gunzip -c $dir/gselect$n.gz|" \
+                 $dir/$x.mdl "${featspart[$n]}" ark,s,cs:- ark:- | \
+  sgmm-acc-stats-gpost --update-flags=$flags  $dir/$x.mdl "${featspart[$n]}" \
+            ark,s,cs:- $dir/$x.$n.aliacc ) 2> $dir/acc_ali.$x.$n.log || touch $dir/.error &
+done
+wait;
+[ -f $dir/.error ] && echo error accumulating stats for alignment model && exit 1  
+
+sgmm-est --update-flags=$flags --remove-speaker-space=true $dir/$x.mdl \
+    "sgmm-sum-accs - $dir/$x.?.aliacc|" $dir/$x.alimdl 2>$dir/update_ali.$x.log || exit 1;
+rm $dir/$x.?.aliacc
+
+( cd $dir; rm final.alimdl 2>/dev/null; ln -s $x.alimdl final.alimdl; )
+