Mirror of https://github.com/mozilla/kaldi.git
sandbox/pitch: Merging trunk changes.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/pitch@3226 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
Parent: 05673e83b6
Commit: f1bcf2419a
@@ -0,0 +1,665 @@
//
// Copyright 2013-2014, Hong Kong University of Science and Technology (author: Ricky Chan Ho Yin)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


// This is a Chinese word segmentation program; it supports 5 modes of operation.
// It assumes the input Chinese characters are UTF-8 encoded.
//
// Usage: java ChiUtf8Segmenter [-mode1|-mode2|-mode3|-mode4|-mode5] input_file wordprob_map [numbers_identity_file]
//
// Default option:  left longest segments
// Option: -mode1   left longest segments
// Option: -mode2   right longest segments
// Option: -mode3   choose left longest or right longest segmentation, whichever gives fewer segments
// Option: -mode4   choose left longest or right longest segmentation, whichever has the higher unigram probability
// Option: -mode5   Viterbi search segmentation (by unigram log probability path cost)
//
// input_file             name of the input file to segment
// wordprob_map           wordlist with log probabilities used for segmentation
// numbers_identity_file  name of the file used for number identification (optional input)
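//
// For reference, a minimal sketch of the expected inputs (the words and the
// log-probability values below are made-up examples, not shipped data): each
// line of wordprob_map holds one word and its unigram log probability,
// separated by whitespace; duplicate words keep the maximum value, e.g.
//
//   香港    -3.51
//   天气    -4.02
//   怎么样  -4.87
//
// A typical invocation would then be
//
//   java ChiUtf8Segmenter -mode5 input.txt wordprob_map snumbers_u8.txt
//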

import java.io.*;
import java.lang.*;
import java.util.*;
import java.util.regex.*;

public class ChiUtf8Segmenter {
  private final String encoding = "UTF-8";
  private final int maxword_length = 8;
  private final String seg_separator = " ";
  private final float unkstr_prob = -12.0f;

  private HashMap<String, Float> wordprob_mapdata = null; // "name to negative cost" or "name to log probability" table
  private WordProbMap wordprobmap = null;
  private TreeSet<String> numbers_data = null;

  private static int running_mode;

  private static ArrayList<String> segstrLeftBuffer = new ArrayList<String>();
  private static LinkedList<String> segstrRightBuffer = new LinkedList<String>();

  public ChiUtf8Segmenter(String wordprobFile) throws IOException {
    wordprobmap = new WordProbMap(wordprobFile, encoding);
    wordprob_mapdata = wordprobmap.getProbMap();
  }

  public ChiUtf8Segmenter(String wordprobFile, String numbersFile) throws IOException {
    wordprobmap = new WordProbMap(wordprobFile, encoding);
    wordprob_mapdata = wordprobmap.getProbMap();

    numbers_data = new TreeSet<String>();
    loadChiRes(numbers_data, numbersFile);
  }

  public void cleanRes() {
    if(wordprobmap!=null) {
      wordprobmap.clearMap();
      wordprobmap = null;
    }

    if(numbers_data!=null) {
      numbers_data.clear();
      numbers_data = null;
    }
  }

  private void loadChiRes(TreeSet<String> resdata, String sourcefile) {
    String dataline;
    try {
      InputStream in = getClass().getResourceAsStream(sourcefile);
      BufferedReader rd = new BufferedReader(new InputStreamReader(in, encoding));

      dataline = rd.readLine();
      while(dataline != null) {
        dataline = dataline.trim();
        if(dataline.length() > 0) {  // skip empty lines but keep reading
          resdata.add(dataline);
        }
        dataline = rd.readLine();
      }
      in.close();
      rd.close();
    }
    catch (Exception e) {
      System.err.println("Load resources for "+sourcefile+" error: " + e);
    }
  }

  private boolean isNumber(String word) {
    String tmp;

    if(numbers_data == null) return false;

    int ll = word.length();
    if(ll == 0) return false;

    for(int i = 0; i<ll; i++) {
      tmp = word.substring(i, i+1);
      if(numbers_data.contains(tmp) == false) {
        return false;
      }
    }
    return true;
  }

  public String segmentLine(String cline, String separator, int mode) {
    int[] boundaries = null;
    int[] rboundaries = null;
    int i, lsepn, rsepn, clen;
    String concatStr = null;

    clen = cline.length();
    if(clen==0) return "";

    if(mode == 1) {
      boundaries = new int[clen];
      segmentLineLeftOffsets(cline, boundaries);
      if(boundaries.length == 0) { return cline; }
    }
    else if(mode == 2) {
      rboundaries = new int[clen];
      segmentLineRightOffsets(cline, rboundaries);
      if(rboundaries.length == 0) { return cline; }
    }
    else if(mode == 3 || mode == 4) {
      boundaries = new int[clen];
      rboundaries = new int[clen];
      segmentLineLeftOffsets(cline, boundaries);
      segmentLineRightOffsets(cline, rboundaries);
      if(boundaries.length == 0 && rboundaries.length == 0) { return cline; }
    }
    else {
    }

    if(mode == 1) {
      concatStr = concatLineLeft(cline, boundaries, separator);
    }
    else if(mode == 2) {
      concatStr = concatLineRight(cline, rboundaries, separator);
    }
    else if(mode == 3) {
      lsepn = rsepn = 0;
      for(i=0; i<boundaries.length; i++) {
        if(boundaries[i] > 0) lsepn++;
      }
      for(i = rboundaries.length-1; i >= 0; i--) {
        if(rboundaries[i] > 0) rsepn++;
      }
      if(rsepn < lsepn) { // choose right
        concatStr = concatLineRight(cline, rboundaries, separator);
      }
      else {
        concatStr = concatLineLeft(cline, boundaries, separator);
      }
    }
    else if(mode == 4) {
      String tmpstr;
      float lvalue,rvalue;
      lvalue = rvalue = 0.0f;
      concatStr = "";

      concatLineLeft(cline, boundaries);
      concatLineRight(cline, rboundaries);

      for(i=0; i<segstrLeftBuffer.size(); i++) {
        tmpstr = segstrLeftBuffer.get(i);
        if(wordprob_mapdata.containsKey(tmpstr))
          lvalue += wordprob_mapdata.get(tmpstr);
        else lvalue += unkstr_prob;
      }

      ListIterator<String> listIterator = segstrRightBuffer.listIterator();
      while (listIterator.hasNext()) {
        tmpstr = listIterator.next();
        if(wordprob_mapdata.containsKey(tmpstr))
          rvalue += wordprob_mapdata.get(tmpstr);
        else rvalue += unkstr_prob;
      }

      if(lvalue >= rvalue) {
        for(i=0; i<segstrLeftBuffer.size(); i++) {
          concatStr += segstrLeftBuffer.get(i);
          concatStr += separator;
        }
      }
      else {
        listIterator = segstrRightBuffer.listIterator();
        while(listIterator.hasNext()) {
          concatStr += listIterator.next();
          concatStr += separator;
        }
      }
    }
    else if(mode == 5) {
      concatStr = viterbiSeg(cline, separator);
    }
    else {
      concatStr = ""; // to be implemented for other algorithms
    }

    return concatStr;
  }
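
  // A worked statement of the recurrence implemented below (a sketch in the
  // same terms as the code): let best[k] be the highest-scoring path that ends
  // exactly at character index k of cline. For every candidate word
  // w = cline[i+1 .. i+j] that is in wordprob_mapdata (up to maxword_length
  // characters), or a single unknown character scored with unkstr_prob,
  //
  //   best[i+j] = max( best[i+j], best[i] + logP(w) )
  //
  // Runs of whitespace, Latin letters, digits, and the <s>/</s> markers are
  // carried over at the previous path score. The SearchHistoryPath surviving
  // at best[clength-1] is read out as the final segmentation.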
  private String viterbiSeg(String cline, String separator) {
    int i, j=0;
    String segstr, substr = "";
    ArrayList<String> history_path = null;
    int history_num_element;
    float oldpath_prob, newpath_prob;
    SearchHistoryPath shp;
    boolean skip_flag;

    int clength = cline.length();
    if(clength < 1) return substr;

    ArrayList<SearchHistoryPath> bestState = new ArrayList<SearchHistoryPath>(clength);

    for(i=0; i<clength; i++) {
      bestState.add(new SearchHistoryPath());
    }

    i = -1;
    history_num_element = 0;
    oldpath_prob = 0.0f;

    while(i<clength-1) {
      if(i>-1 && bestState.get(i).getNumElement() == 0) {
        i++;
        continue;
      }
      if(i>-1) {
        history_num_element = bestState.get(i).getNumElement();
        history_path = bestState.get(i).getList();
        oldpath_prob = bestState.get(i).getLogProb();
      }

      skip_flag = false;
      if( (i+3 <= clength-1) && cline.substring(i+1, i+4).compareTo("<s>")==0 ) {
        j=3;
        substr = cline.substring(i+1, i+j+1);
        skip_flag = true;
      }
      else if( (i+4 <= clength-1) && cline.substring(i+1, i+5).compareTo("</s>")==0 ) {
        j=4;
        substr = cline.substring(i+1, i+j+1);
        skip_flag = true;
      }
      else if(Character.UnicodeBlock.of(cline.charAt(i+1)) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
        j=1;
        substr = cline.substring(i+1, i+j+1);
        if(wordprob_mapdata.containsKey(substr))
          newpath_prob = oldpath_prob + wordprob_mapdata.get(substr);
        else
          newpath_prob = oldpath_prob + unkstr_prob;
        if(bestState.get(i+j).getNumElement()==0 || bestState.get(i+j).getLogProb()<newpath_prob) {
          shp = new SearchHistoryPath();
          if(history_path != null) shp.setList(history_path);
          shp.addElement(substr, newpath_prob);
          bestState.set(i+j, shp);
        }

        for(j=2; j<=maxword_length && (i+j<clength); j++) {
          substr = cline.substring(i+1, i+j+1);
          if(wordprob_mapdata.containsKey(substr)) {
            newpath_prob = wordprob_mapdata.get(substr) + oldpath_prob;
            if(bestState.get(i+j).getNumElement()==0 || bestState.get(i+j).getLogProb()<newpath_prob) {
              shp = new SearchHistoryPath();
              if(history_path != null) shp.setList(history_path);
              shp.addElement(substr, newpath_prob);
              bestState.set(i+j, shp);
            }
          }
        }

      }
      else if(Character.isWhitespace(cline.charAt(i+1))) {
        j=1;
        while ( i+j < clength-1 && Character.isWhitespace(cline.charAt(i+j+1)) && (Character.UnicodeBlock.of(cline.charAt(i+j+1)) != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) ) {
          j++;
        }
        substr = "";
        skip_flag = true;
      }
      else if(Character.isLetter(cline.charAt(i+1))) {
        j=1;
        while ( i+j < clength-1 && Character.isLetter(cline.charAt(i+j+1)) && (Character.UnicodeBlock.of(cline.charAt(i+j+1)) != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) ) {
          j++;
        }
        substr = cline.substring(i+1, i+j+1);
        skip_flag = true;
      }
      else if(Character.isDigit(cline.charAt(i+1))) {
        j=1;
        while ( i+j < clength-1 && (Character.isDigit(cline.charAt(i+j+1)) || cline.charAt(i+j+1)=='.') && (Character.UnicodeBlock.of(cline.charAt(i+j+1)) != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) ) {
          j++;
        }
        substr = cline.substring(i+1, i+j+1);
        skip_flag = true;
      }
      else {
        j=1;
        newpath_prob = oldpath_prob + unkstr_prob;
        substr = cline.substring(i+1, i+j+1);
        if(bestState.get(i+j).getNumElement()==0 || bestState.get(i+j).getLogProb()<newpath_prob) {
          shp = new SearchHistoryPath();
          if(history_path != null) shp.setList(history_path);
          shp.addElement(substr, newpath_prob);
          bestState.set(i+j, shp);
        }
      }

      if(skip_flag==true) {
        shp = new SearchHistoryPath();
        if(history_path != null) shp.setList(history_path);
        shp.addElement(substr, oldpath_prob);
        bestState.set(i+j, shp);
        i+=j;
      }
      else { i++; }
    }

    boolean former_num_flag = false;
    shp = bestState.get(i);
    segstr = "";
    ListIterator<String> listIterator = shp.getList().listIterator();
    while (listIterator.hasNext()) {
      substr = listIterator.next();
      if(substr.length()>0) {
        if(isNumber(substr)==false) {
          if(former_num_flag==true) segstr += separator;
          segstr += substr;
          segstr += separator;
          former_num_flag = false;
        }
        else {
          segstr += substr;
          former_num_flag = true;
        }
      }
    }

    shp = null;
    bestState = null;

    return segstr;
  }

  private static ArrayList<String> concatLineLeft(String cline, int [] boundaries) {
    int i;

    segstrLeftBuffer.clear();

    for(i=0; i<boundaries.length; i++) {
      if(boundaries[i] > 0) segstrLeftBuffer.add(cline.substring(i, i+boundaries[i]));
    }

    return segstrLeftBuffer;
  }

  private static LinkedList<String> concatLineRight(String cline, int [] boundaries) {
    int i;
    String substr;

    segstrRightBuffer.clear();

    for(i = boundaries.length-1; i >= 0; i--) {
      if(boundaries[i] > 0 && i-boundaries[i]+1 >= 0)
      {
        substr = cline.substring(i-boundaries[i]+1, i+1);
        segstrRightBuffer.addFirst(substr);
      }
    }

    return segstrRightBuffer;
  }

  private static String concatLineLeft(String cline, int [] boundaries, String separator) {
    int i;

    StringBuffer clinebuffer = new StringBuffer();

    for(i=0; i<boundaries.length; i++) {
      if(boundaries[i] > 0)
      {
        clinebuffer.append(cline.substring(i, i+boundaries[i]));
        clinebuffer.append(separator);
      }
    }

    return clinebuffer.toString();
  }

  private static String concatLineRight(String cline, int [] boundaries, String separator) {
    int i;
    String substr;

    StringBuffer clinebuffer = new StringBuffer();

    for(i = boundaries.length-1; i >= 0; i--) {
      if(boundaries[i] > 0 && i-boundaries[i]+1 >= 0)
      {
        substr = cline.substring(i-boundaries[i]+1, i+1);
        clinebuffer.insert(0, substr);
        clinebuffer.insert(substr.length(), separator);
      }
    }

    return clinebuffer.toString();
  }

  private void segmentLineLeftOffsets(String cline, int[] offsets) {
    int i, j, tmpoffset;
    int clength = cline.length();

    i = 0;
    while (i < clength) {
      if(i+3 <= clength && cline.substring(i, i+3).compareTo("<s>")==0) {
        offsets[i] = 3;
        i += 3;
        continue;
      }

      if(i+4 <= clength && cline.substring(i, i+4).compareTo("</s>")==0) {
        offsets[i] = 4;
        i += 4;
        continue;
      }

      if (Character.UnicodeBlock.of(cline.charAt(i)) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
        j = maxword_length;
        if (i+j > clength) { j = clength - i; }
        while(i+j <= clength && j > 1) {
          if (wordprob_mapdata.containsKey(cline.substring(i, i+j))) break;
          j--;
        }
        offsets[i] = j;
        i += j;
      } else if (Character.isWhitespace(cline.charAt(i))) {
        j=1;
        while ( i+j < clength && Character.isWhitespace(cline.charAt(i+j)) && (Character.UnicodeBlock.of(cline.charAt(i+j)) != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) ) {
          j++;
        }
        i += j;
      } else if (Character.isLetter(cline.charAt(i))) {
        j=1;
        while( i+j < clength && Character.isLetter(cline.charAt(i+j)) && (Character.UnicodeBlock.of(cline.charAt(i+j)) != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) ) {
          j++;
        }
        offsets[i] = j;
        i += j;
      } else if (Character.isDigit(cline.charAt(i))) {
        j=1;
        while( i+j < clength && (Character.isDigit(cline.charAt(i+j)) || cline.charAt(i+j)=='.') && (Character.UnicodeBlock.of(cline.charAt(i+j)) != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) ){
          j++;
        }
        offsets[i] = j;
        i += j;
      }
      else {
        offsets[i] = 1;
        i++;
      }
    }

    i = 0;
    while (i < clength) {
      if (offsets[i] > 0) {
        while( i+offsets[i] < clength && offsets[i+offsets[i]] > 0 && i+offsets[i]+offsets[i+offsets[i]] <= clength && isNumber(cline.substring(i, i+offsets[i]+offsets[i+offsets[i]])) ) {
          tmpoffset = offsets[i+offsets[i]];
          offsets[i+offsets[i]] = 0;
          offsets[i] = offsets[i] + tmpoffset;
        }
      }
      i++;
    }

    return;
  }

  private void segmentLineRightOffsets(String cline, int[] offsets) {
    int i, j, k, tmpoffset;
    int clength = cline.length();

    i = clength;
    while (i > 0) {
      if(i-3 > -1 && cline.substring(i-3, i).compareTo("<s>")==0) {
        offsets[i-1] = 3;
        i -= 3;
        continue;
      }

      if(i-4 > -1 && cline.substring(i-4, i).compareTo("</s>")==0) {
        offsets[i-1] = 4;
        i -= 4;
        continue;
      }

      if (Character.UnicodeBlock.of(cline.charAt(i-1)) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
        j = maxword_length;
        if (i-j < 0) { j = i; }
        while(j > 1) {
          if (wordprob_mapdata.containsKey(cline.substring(i-j, i))) break;
          j--;
        }
        offsets[i-1] = j;
        i -= j;
      } else if (Character.isWhitespace(cline.charAt(i-1))) {
        j=1;
        k = i-j-1;
        while( (k>=0 && Character.isWhitespace(cline.charAt(k))) && (Character.UnicodeBlock.of(cline.charAt(k)) != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) ) {
          j++;
          k = i-j-1;
        }
        i -= j;
      } else if (Character.isLetter(cline.charAt(i-1))) {
        j=1;
        k = i-j-1;
        while( (k>=0 && Character.isLetter(cline.charAt(k))) && (Character.UnicodeBlock.of(cline.charAt(k)) != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) ) {
          j++;
          k = i-j-1;
        }
        offsets[i-1] = j;
        i -= j;
      } else if (Character.isDigit(cline.charAt(i-1))) {
        j=1;
        k = i-j-1;
        while( (k>=0 && Character.isDigit(cline.charAt(k))) && (Character.UnicodeBlock.of(cline.charAt(k)) != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) || (k>=0 && cline.charAt(k)=='.') ){
          j++;
          k = i-j-1;
        }
        offsets[i-1] = j;
        i -= j;
      }
      else {
        offsets[i-1] = 1;
        i--;
      }
    }

    i = clength-1;
    while (i > 0) {
      if(offsets[i] > 0) {
        while(i-offsets[i]+1 > 0 && offsets[i-offsets[i]] > 0 && i-offsets[i]-offsets[i-offsets[i]]+1 >= 0 && isNumber(cline.substring(i-offsets[i]-offsets[i-offsets[i]]+1, i+1))) {
          tmpoffset = offsets[i-offsets[i]];
          offsets[i-offsets[i]] = 0;
          offsets[i] = offsets[i] + tmpoffset;
        }
      }
      i--;
    }

    return;
  }

  public void segmentFile(String inputfile, int mode) {
    String outfile = inputfile + ".seg";
    String segstring;

    try {
      String dataline;
      InputStream in = new FileInputStream(inputfile);
      BufferedReader rd = new BufferedReader(new InputStreamReader(in, encoding));
      BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outfile), encoding));

      dataline = rd.readLine();
      while(dataline != null) {
        segstring = segmentLine(dataline, seg_separator, mode);
        out.write(segstring);
        out.newLine();
        dataline = rd.readLine();
      }

      in.close();
      rd.close();
      out.close();
    }
    catch (Exception e) {
      System.err.println("Exception " + e.toString());
    }

    // System.gc();
  }

  private static int setInputMode(String modeStr) {

    if(modeStr.equals("-mode1")) running_mode = 1;       // left
    else if(modeStr.equals("-mode2")) running_mode = 2;  // right
    else if(modeStr.equals("-mode3")) running_mode = 3;  // left right short
    else if(modeStr.equals("-mode4")) running_mode = 4;  // left right best prob
    else if(modeStr.equals("-mode5")) running_mode = 5;  // viterbi
    else running_mode = 0;

    return running_mode;
  }

  public static void printHelp() {
    System.out.println("Usage: java ChiUtf8Segmenter [-mode1|-mode2|-mode3|-mode4|-mode5] input_file wordprob_map [numbers_identity_file]\n");
    System.out.println("Default option:\t\tleft longest segments");
    System.out.println("Option: -mode1\t\tleft longest segments");
    System.out.println("Option: -mode2\t\tright longest segments");
    System.out.println("Option: -mode3\t\tchoose left longest or right longest segmentation, whichever gives fewer segments");
    System.out.println("Option: -mode4\t\tchoose left longest or right longest segmentation, whichever has the higher unigram probability");
    System.out.println("Option: -mode5\t\tViterbi search segmentation (by unigram log probability path cost)\n");
    System.out.println("Segmented text will be saved to input_file.seg");
    System.exit(0);
  }

  public static void main(String[] argv) throws IOException {
    int mode;
    String inputfile;
    ChiUtf8Segmenter segmenter = null;

    if(argv.length<2 || argv.length>4) {
      printHelp();
      System.exit(0);
    }

    if(argv.length == 2) {
      if(setInputMode(argv[0])!=0) {
        printHelp();
        System.exit(0);
      }
      mode = 1; // default mode
      inputfile = argv[0];
      segmenter = new ChiUtf8Segmenter(argv[1]);  // wordprob_map
    }
    else if(argv.length ==4) {
      mode = setInputMode(argv[0]);
      if(mode == 0) {
        printHelp();
        System.exit(0);
      }
      inputfile = argv[1];
      segmenter = new ChiUtf8Segmenter(argv[2], argv[3]);  // wordprob_map numbers_idt_file
    }
    else {
      mode = setInputMode(argv[0]);
      if(mode == 0) { // unknown here, so we assume no mode was given and use the default mode
        mode = 1;
        inputfile = argv[0];
        segmenter = new ChiUtf8Segmenter(argv[1], argv[2]);  // wordprob_map numbers_idt_file
      }
      else {
        inputfile = argv[1];
        segmenter = new ChiUtf8Segmenter(argv[2]);  // wordprob_map
      }
    }

    System.out.println("Total keys " + segmenter.wordprob_mapdata.size());
    segmenter.segmentFile(inputfile, mode);
    System.out.println("Segmentation finished, " + inputfile + " => " + inputfile + ".seg\n");
    segmenter.cleanRes();
  }
}
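
For reference, a minimal sketch of driving the class from Java instead of the command line (the file names are placeholders; only the constructor, segmentLine and cleanRes calls come from the code above, and the snippet would sit inside a method that declares throws IOException):

    // wordprob_map is read from disk; snumbers_u8.txt is resolved via the classpath
    ChiUtf8Segmenter seg = new ChiUtf8Segmenter("wordprob_map", "snumbers_u8.txt");
    // mode 5 = Viterbi segmentation, words joined with a single space
    String out = seg.segmentLine("香港的天气怎么样", " ", 5);
    System.out.println(out);
    seg.cleanRes();
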
@@ -0,0 +1,3 @@
Manifest-Version: 1.0
Class-Path: .
Main-Class: ChiUtf8Segmenter
@@ -0,0 +1,22 @@
JAVA_COMPILER=javac
SOURCE_DIR=.
BIN_DIR=.
JAR_FILE=ChiUtf8Segmenter.jar


SOURCES = $(wildcard $(SOURCE_DIR)/*.java)
CLASSES = $(patsubst $(SOURCE_DIR)/%.java, $(BIN_DIR)/%.class, $(SOURCES))

all: $(JAR_FILE)

$(JAR_FILE): $(CLASSES)
	jar -cmf MANIFEST.MF $(JAR_FILE) -C $(BIN_DIR) .
	chmod +x $(JAR_FILE)

$(BIN_DIR)/%.class: $(SOURCE_DIR)/%.java
	$(JAVA_COMPILER) -d $(BIN_DIR) $(SOURCE_DIR)/*.java

clean:
	rm -f $(BIN_DIR)/*.class
	rm -f $(JAR_FILE)
@@ -0,0 +1,67 @@
//
// Copyright 2013-2014, Hong Kong University of Science and Technology (author: Ricky Chan Ho Yin)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import java.lang.*;
import java.util.*;

// class for search history path storage
public class SearchHistoryPath {

  private int number_element;
  private ArrayList<String> element = null;
  private float log_prob;


  public SearchHistoryPath() {
    number_element = 0;
    element = new ArrayList<String>();
    log_prob = 0.0f;
  }

  public void addElement(String strVal, float strProb) {
    number_element++;
    element.add(strVal);
    log_prob+=strProb;
  }

  public int getNumElement() {
    return number_element;
  }

  public float getLogProb() {
    return log_prob;
  }

  public ArrayList<String> getList() {
    return element;
  }

  public void setList(ArrayList<String> element_path) {
    element.clear();
    ListIterator<String> listIterator = element_path.listIterator();
    while (listIterator.hasNext()) {
      element.add(listIterator.next());
    }
    number_element = element.size();
  }

  public void clear() {
    number_element = 0;
    element.clear();
    element = null;
    log_prob = 0.0f;
  }

}
@@ -0,0 +1,88 @@
//
// Copyright 2013-2014, Hong Kong University of Science and Technology (author: Ricky Chan Ho Yin)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import java.io.*;
import java.lang.*;
import java.util.*;
import java.util.regex.*;

// class for a wordlist and its corresponding log probabilities (or costs, as negative values, for segmentation)
class WordProbMap {
  private String mapName = "wordprobmap";
  private String encoding = "UTF-8";
  private HashMap<String, Float> probmap = null;

  public void setName(String mapName) { this.mapName = mapName; }
  public void setEncoding(String encoding) { this.encoding = encoding; }
  public String getName() { return mapName; }
  public String getEncoding() { return encoding; }
  public HashMap<String, Float> getProbMap() { return probmap; }

  public WordProbMap() throws IOException {
    if(readWordProbMap()==false) throw new IOException("read wordprobmap error in WordProbMap.java\n");
  }

  public WordProbMap(String wordMapFile, String encoding) throws IOException {
    setName(wordMapFile);
    setEncoding(encoding);
    if(readWordProbMap()==false) throw new IOException("read wordprobmap: " + wordMapFile + " error in WordProbMap.java\n");
  }

  public void clearMap() {
    if(probmap != null) {
      probmap.clear();
      probmap = null;
    }
  }

  private boolean readWordProbMap() {
    try {
      FileInputStream fin = new FileInputStream(mapName);
      BufferedReader rd = new BufferedReader(new InputStreamReader(fin, encoding));
      probmap = new HashMap<String, Float>();

      Pattern p = Pattern.compile("[ \t\r\n]+");
      String [] b;
      int line_num = 0;

      String a = rd.readLine();
      while(a != null) {
        line_num++;
        b = p.split(a);
        if(b.length == 0) {  // empty line: read the next line and keep going
          a = rd.readLine();
          continue;
        }
        else if(b.length != 2) {
          throw new IOException("read wordprobmap: "+mapName+" error in line "+line_num+"\n");
        }
        if(probmap.containsKey(b[0]) && probmap.get(b[0])>Float.valueOf(b[1]) ) { // word appears multiple times: keep the maximum
          a = rd.readLine();
          continue;
        }
        probmap.put(b[0], Float.valueOf(b[1]));
        a = rd.readLine();
      }
      fin.close();
      rd.close();
    }
    catch (IOException e) {
      System.err.println(e);
      return false;
    }

    return true;
  }
}
@@ -0,0 +1,25 @@
<s></s>
<s>What is it?</s>
Give me a number!!
123.89%%??
<s>Hello!!Friends!!</s>
一二三四点五六
89.567
123健康贴士
全球约20%人口使用社交媒体
Facebook倚重移动平台大力发展亚洲市场
应用商店助力 智能电视将淘汰传统有线电视
感受美国最受欢迎小镇魅力
一睹全球十大自然奇观
德国韩国泰国新加坡马尔代夫东京首尔巴厘岛迪拜济州岛北海道巴黎普罗旺斯柏林巴塞罗那伦敦纽约旧金山夏威夷日本马来西亚南非瑞士法国英国澳大利亚加拿大美国
香港的天气怎么样
上海的天气怎么样
下周二下午会不会很热
什么时候会放晴
今天下午有没有下雨
今天北京的天气如何
今天天气是阴天吗
今天纽约会不会有雪
伦敦的天气
你是哪里人
欧洲天气预报
@@ -0,0 +1,15 @@
# clean and build
make clean
make all

# print the usage message
java -jar ChiUtf8Segmenter.jar

# example
java -jar ChiUtf8Segmenter.jar -mode5 example/test_sent.txt 186k_wordprobmap
mv example/test_sent.txt.seg example/test_sent.txt.seg0

# another example
java -jar ChiUtf8Segmenter.jar -mode5 example/test_sent.txt 186k_wordprobmap snumbers_u8.txt
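
# in either case the segmenter writes its output next to the input file
# (input_file.seg), so the result can be inspected with, for example:
head example/test_sent.txt.seg
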
@@ -0,0 +1,37 @@
零
一
二
三
四
五
六
七
八
九
十
百
千
万
亿
0
1
2
3
4
5
6
7
8
9
.
点
壹
贰
叁
肆
伍
陆
柒
捌
玖
拾
@ -61,14 +61,15 @@ cp $srcdir/{tree,final.mdl} $dir || exit 1;
|
|||
cp $srcdir/final.occs $dir;
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
|
||||
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
cp $srcdir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
|
||||
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
||||
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
cp $srcdir/final.mat $dir
|
||||
;;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@ -23,7 +23,6 @@ beam=10
|
|||
retry_beam=40
|
||||
boost_silence=1.0 # factor by which to boost silence during alignment.
|
||||
fmllr_update_type=full
|
||||
norm_vars=false
|
||||
# End configuration options.
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
@ -60,7 +59,8 @@ cp $srcdir/{tree,final.mdl} $dir || exit 1;
|
|||
cp $srcdir/final.occs $dir;
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
|
||||
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
cp $srcdir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
|
||||
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
||||
echo "$0: feature type is $feat_type"
|
||||
|
|
|
@ -18,7 +18,7 @@ align_to_lats=false # optionally produce alignment in lattice format
|
|||
lats_decode_opts="--acoustic-scale=0.1 --beam=20 --latbeam=10"
|
||||
lats_graph_scales="--transition-scale=1.0 --self-loop-scale=0.1"
|
||||
|
||||
use_gpu_id=-1 # disable gpu
|
||||
use_gpu="no" # yes|no|optionally
|
||||
# End configuration options.
|
||||
|
||||
[ $# -gt 0 ] && echo "$0 $@" # Print the command line for logging
|
||||
|
@ -76,7 +76,7 @@ if [ -f $srcdir/delta_order ]; then
|
|||
feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
|
||||
fi
|
||||
# Finally add feature_transform and the MLP
|
||||
feats="$feats nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet ark:- ark:- |"
|
||||
feats="$feats nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $nnet ark:- ark:- |"
|
||||
|
||||
|
||||
echo "$0: aligning data '$data' using nnet/model '$srcdir', putting alignments in '$dir'"
|
||||
|
|
|
@ -22,7 +22,6 @@ scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
|
|||
beam=10
|
||||
retry_beam=40
|
||||
boost_silence=1.0 # factor by which to boost silence during alignment.
|
||||
norm_vars=false
|
||||
# End configuration options.
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
@ -58,6 +57,8 @@ cp $srcdir/{tree,final.mdl} $dir || exit 1;
|
|||
cp $srcdir/final.occs $dir;
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
cp $srcdir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
|
||||
if [[ ! -f $srcdir/final.mat || ! -f $srcdir/full.mat ]]; then
|
||||
echo "$0: we require final.mat and full.mat in the source directory $srcdir"
|
||||
|
|
|
@ -50,10 +50,12 @@ dir=$4
|
|||
oov=`cat $lang/oov.int` || exit 1;
|
||||
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
sdata=$data/split$nj
|
||||
|
||||
mkdir -p $dir/log
|
||||
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
|
||||
cp $srcdir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
echo $nj > $dir/num_jobs
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
||||
|
@ -66,8 +68,8 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
|||
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
cp $srcdir/final.mat $dir
|
||||
;;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@ -50,10 +50,12 @@ dir=$4
|
|||
oov=`cat $lang/oov.int` || exit 1;
|
||||
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
sdata=$data/split$nj
|
||||
|
||||
mkdir -p $dir/log
|
||||
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
|
||||
cp $srcdir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
echo $nj > $dir/num_jobs
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
||||
|
@ -66,8 +68,8 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
|||
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
cp $srcdir/final.mat $dir
|
||||
;;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@ -48,6 +48,9 @@ echo $nj > $dir/num_jobs
|
|||
sdata=$data/split$nj
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
cp $srcdir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
||||
cp $srcdir/{tree,final.mdl} $dir || exit 1;
|
||||
|
@ -58,8 +61,8 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
|||
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
cp $srcdir/final.mat $dir
|
||||
;;
|
||||
*) echo "$0: invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@ -75,13 +75,15 @@ done
|
|||
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
||||
echo "decode.sh: feature type is $feat_type";
|
||||
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
thread_string=
|
||||
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
if [ ! -z "$transform_dir" ]; then # add transforms to features...
|
||||
|
|
|
@ -84,6 +84,7 @@ mkdir -p $dir/log
|
|||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
echo $nj > $dir/num_jobs
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
silphonelist=`cat $graphdir/phones/silence.csl` || exit 1;
|
||||
|
||||
|
@ -124,8 +125,8 @@ done
|
|||
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
||||
echo "$0: feature type is $feat_type";
|
||||
case $feat_type in
|
||||
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
|
||||
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
##
|
||||
|
|
|
@ -43,6 +43,7 @@ dir=$5
|
|||
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
|
||||
sdata=$data/split$nj;
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
mkdir -p $dir/log
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
@ -58,8 +59,8 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
|||
echo "decode_si.sh: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
|
||||
|
|
|
@ -47,7 +47,6 @@ num_threads=1 # if >1, will use gmm-latgen-faster-parallel
|
|||
parallel_opts= # If you supply num-threads, you should supply this too.
|
||||
skip_scoring=false
|
||||
scoring_opts=
|
||||
norm_vars=false
|
||||
# End configuration section
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
||||
|
@ -91,6 +90,7 @@ mkdir -p $dir/log
|
|||
split_data.sh $data $nj || exit 1;
|
||||
echo $nj > $dir/num_jobs
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
silphonelist=`cat $graphdir/phones/silence.csl` || exit 1;
|
||||
|
||||
|
|
|
@ -100,6 +100,7 @@ mkdir -p $dir/log
|
|||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
echo $nj > $dir/num_jobs
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
silphonelist=`cat $graphdir/phones/silence.csl` || exit 1;
|
||||
|
||||
|
@ -142,8 +143,8 @@ done
|
|||
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
||||
echo "$0: feature type is $feat_type";
|
||||
case $feat_type in
|
||||
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
|
||||
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
##
|
||||
|
|
|
@ -57,6 +57,7 @@ dir=$3
|
|||
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
|
||||
sdata=$data/split$nj;
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
thread_string=
|
||||
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
|
||||
|
||||
|
@ -74,8 +75,8 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
|||
echo "decode_fmmi.sh: feature type is $feat_type";
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
|
||||
|
|
|
@ -53,6 +53,7 @@ mkdir -p $dir/log
|
|||
|
||||
nj=`cat $olddir/num_jobs` || exit 1;
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
sdata=$data/split$nj
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
echo $nj >$dir/num_jobs
|
||||
|
@ -67,8 +68,8 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
|||
echo "decode_si.sh: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
|
||||
|
|
|
@ -73,10 +73,11 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
|||
echo "decode_fwdbwd.sh: feature type is $feat_type";
|
||||
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
if [ ! -z "$transform_dir" ]; then # add transforms to features...
|
||||
|
|
|
@ -25,7 +25,7 @@ scoring_opts="--min-lmwt 4 --max-lmwt 15"
|
|||
|
||||
num_threads=1 # if >1, will use latgen-faster-parallel
|
||||
parallel_opts="-pe smp $((num_threads+1))" # use 2 CPUs (1 DNN-forward, 1 decoder)
|
||||
use_gpu_id=-1 # -1 disable gpu
|
||||
use_gpu="no" # yes|no|optionally
|
||||
# End configuration section.
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
@ -104,7 +104,7 @@ fi
|
|||
# Run the decoding in the queue
|
||||
if [ $stage -le 0 ]; then
|
||||
$cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
|
||||
nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet "$feats" ark:- \| \
|
||||
nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $nnet "$feats" ark:- \| \
|
||||
latgen-faster-mapped$thread_string --max-active=$max_active --max-mem=$max_mem --beam=$beam \
|
||||
--lattice-beam=$latbeam --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
|
||||
$model $graphdir/HCLG.fst ark:- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
|
||||
|
|
|
@ -62,6 +62,7 @@ done
|
|||
|
||||
sdata=$data/split$nj;
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
thread_string=
|
||||
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
|
||||
|
||||
|
@ -77,9 +78,9 @@ if [ -z "$feat_type" ]; then
|
|||
fi
|
||||
|
||||
case $feat_type in
|
||||
raw) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";;
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
raw) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";;
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
;;
|
||||
*) echo "$0: invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
|
|
|
@ -78,12 +78,13 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
|||
echo "decode.sh: feature type is $feat_type";
|
||||
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
thread_string=
|
||||
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
if [ ! -z "$transform_dir" ]; then # add transforms to features...
|
||||
|
|
|
@ -50,7 +50,6 @@ num_threads=1 # if >1, will use gmm-latgen-faster-parallel
|
|||
parallel_opts= # If you supply num-threads, you should supply this too.
|
||||
skip_scoring=false
|
||||
scoring_opts=
|
||||
norm_vars=false
|
||||
# End configuration section
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
||||
|
@ -94,6 +93,7 @@ mkdir -p $dir/log
|
|||
split_data.sh $data $nj || exit 1;
|
||||
echo $nj > $dir/num_jobs
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
silphonelist=`cat $graphdir/phones/silence.csl` || exit 1;
|
||||
|
||||
|
|
|
@ -66,6 +66,7 @@ done
|
|||
sdata=$data/split$nj;
|
||||
silphonelist=`cat $graphdir/phones/silence.csl` || exit 1
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
|
||||
gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |"
|
||||
|
||||
|
@ -79,8 +80,8 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
|||
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
;;
|
||||
*) echo "$0: invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
|
|
|
@@ -76,6 +76,7 @@ mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
echo $nj > $dir/num_jobs
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
thread_string=
|
||||
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
|
||||
|
||||
|
@@ -84,8 +85,8 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
;;
|
||||
*) echo "$0: invalid feature type $feat_type" && exit 1;
|
||||
esac
@@ -96,7 +97,7 @@ if [ ! -z "$transform_dir" ]; then
 echo "$0: using transforms from $transform_dir"
 feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
 elif [ -f $transform_dir/raw_trans.1 ]; then
-feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
+feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
 else
 echo "$0: no such file $transform_dir/trans.1 or $transform_dir/raw_trans.1, invalid --transform-dir option?"
 exit 1;
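
This hunk brings the fMLLR branch in line with the same $norm_vars convention. When --transform-dir is supplied, speaker transforms are composed on top of the base features; a condensed sketch of the two branches, with the trans.1 test inferred from the error message above (raw_trans.JOB transforms are applied before splicing and the LDA matrix, trans.JOB after):

  if [ -f $transform_dir/trans.1 ]; then        # transforms in the final feature space
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
  elif [ -f $transform_dir/raw_trans.1 ]; then  # raw fMLLR: transform, then splice, then LDA
    feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
  fi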
|
||||
|
|
|
@@ -70,6 +70,7 @@ nj=`cat $olddir/num_jobs` || exit 1;
sdata=$data/split$nj;
|
||||
silphonelist=`cat $lang/phones/silence.csl` || exit 1
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
|
||||
gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |"
|
||||
|
||||
|
@@ -87,8 +88,8 @@ if [ -z "$transform_dir" ] && [ -f $olddir/trans.1 ]; then
fi
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
;;
|
||||
*) echo "$0: invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
|
|
|
@@ -53,6 +53,7 @@ nj=`cat $olddir/num_jobs` || exit 1;
sdata=$data/split$nj;
|
||||
gselect_opt="--gselect=ark,s,cs:gunzip -c $olddir/gselect.JOB.gz|"
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
mkdir -p $dir/log
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
@@ -72,8 +73,8 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
;;
|
||||
*) echo "$0: invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
|
|
|
@@ -60,6 +60,7 @@ done
nj=`cat $olddir/num_jobs` || exit 1;
|
||||
sdata=$data/split$nj;
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
mkdir -p $dir/log
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
@@ -108,7 +109,7 @@ cur_lats="ark:gunzip -c $olddir/lat.JOB.gz | lattice-scale --acoustic-scale=$inv
|
||||
for model_type in left right; do
|
||||
|
||||
feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |" # spliced features.
|
||||
feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |" # spliced features.
|
||||
if [ ! -z "$transform_dir" ]; then # using speaker-specific transforms.
|
||||
# we want to transform in the sequence: $dir/full.mat, then the result of
|
||||
# (extend-transform-dim $transform_dir/trans.JOB), then $dir/full_inv.mat to
|
||||
|
|
|
@@ -73,6 +73,7 @@ nj=`cat $olddir/num_jobs` || exit 1;
sdata=$data/split$nj;
|
||||
silphonelist=`cat $lang/phones/silence.csl` || exit 1
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
|
||||
gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |"
|
||||
|
||||
|
@@ -90,8 +91,8 @@ if [ -z "$transform_dir" ] && [ -f $olddir/trans.1 ]; then
fi
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
;;
|
||||
*) echo "$0: invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
|
|
|
@@ -51,6 +51,7 @@ nj=`cat $olddir/num_jobs` || exit 1;
sdata=$data/split$nj;
|
||||
gselect_opt="--gselect=ark,s,cs:gunzip -c $olddir/gselect.JOB.gz|"
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
mkdir -p $dir/log
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
@@ -70,8 +71,8 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
;;
|
||||
*) echo "$0: invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
|
|
|
@@ -69,10 +69,11 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "decode.sh: feature type is $feat_type";
|
||||
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
if [ ! -z "$transform_dir" ]; then # add transforms to features...
|
||||
|
|
|
@@ -42,6 +42,7 @@ sdata=$data/split$nj;
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
||||
splice_opts=`cat $dir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
|
||||
|
||||
|
@@ -54,8 +55,8 @@ done
if [ -f $dir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
||||
echo "$0: feature type is $feat_type";
|
||||
case $feat_type in
|
||||
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |";;
|
||||
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |";;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
|
||||
|
|
|
@@ -51,6 +51,7 @@ dir=$4
|
||||
sdata=$data/split$nj
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
thread_string=
|
||||
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
|
||||
|
||||
|
@@ -87,8 +88,8 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "align_si.sh: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
cp $srcdir/final.mat $dir
|
||||
;;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
|
@@ -116,7 +117,7 @@ if [ $sub_split -eq 1 ]; then
$dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
|
||||
else
|
||||
for n in `seq $nj`; do
|
||||
if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $alidir/final.mdl ]; then
|
||||
if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then
|
||||
echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
|
||||
else
|
||||
sdata2=$data/split$nj/$n/split$sub_split;
|
||||
|
|
|
@@ -19,7 +19,7 @@ max_mem=20000000 # This will stop the processes getting too large.
# This is in bytes, but not "real" bytes-- you have to multiply
|
||||
# by something like 5 or 10 to get real bytes (not sure why so large)
|
||||
# End configuration section.
|
||||
use_gpu_id=-1 # disable gpu
|
||||
use_gpu=no # yes|no|optional
|
||||
parallel_opts="-pe smp 2"
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
@@ -48,7 +48,6 @@ srcdir=$3
dir=$4
|
||||
|
||||
sdata=$data/split$nj
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
|
||||
mkdir -p $dir/log
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
echo $nj > $dir/num_jobs
|
||||
|
@@ -108,7 +107,7 @@ if [ -f $srcdir/delta_order ]; then
 feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
 fi
 # Finally add feature_transform and the MLP
-feats="$feats nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet ark:- ark:- |"
+feats="$feats nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $nnet ark:- ark:- |"
 
 
 echo "$0: generating denlats from data '$data', putting lattices in '$dir'"
|
||||
|
@@ -126,7 +125,7 @@ if [ $sub_split -eq 1 ]; then
$dir/dengraph/HCLG.fst "$feats" "scp:$dir/lat.store_separately_as_gz.scp" || exit 1;
|
||||
else
|
||||
for n in `seq $nj`; do
|
||||
if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $alidir/final.mdl ]; then
|
||||
if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then
|
||||
echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
|
||||
else
|
||||
sdata2=$data/split$nj/$n/split$sub_split;
|
||||
|
|
|
@@ -52,6 +52,7 @@ dir=$4
|
||||
sdata=$data/split$nj
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
mkdir -p $dir/log
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
echo $nj > $dir/num_jobs
|
||||
|
@@ -85,8 +86,8 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "align_si.sh: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
cp $srcdir/final.mat $dir
|
||||
;;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
|
@@ -115,7 +116,7 @@ if [ $sub_split -eq 1 ]; then
$dir/dengraph/HCLG.fst ark:- "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
|
||||
else
|
||||
for n in `seq $nj`; do
|
||||
if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $alidir/final.mdl ]; then
|
||||
if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then
|
||||
echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
|
||||
else
|
||||
sdata2=$data/split$nj/$n/split$sub_split;
|
||||
|
|
|
@@ -48,6 +48,7 @@ dir=$4
|
||||
sdata=$data/split$nj
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
mkdir -p $dir/log
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
echo $nj > $dir/num_jobs
|
||||
|
@@ -81,8 +82,8 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "align_si.sh: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
cp $alidir/final.mat $dir
|
||||
;;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@@ -52,6 +52,7 @@ dir=$4
|
||||
sdata=$data/split$nj
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
if [ $num_threads -gt 1 ]; then
|
||||
# the -parallel becomes part of the binary name we decode with.
|
||||
thread_string="-parallel --num-threads=$num_threads"
|
||||
|
@@ -90,8 +91,8 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "align_si.sh: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
cp $alidir/final.mat $dir
|
||||
;;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@@ -1,19 +1,16 @@
#!/bin/bash
|
||||
|
||||
# Copyright 2012 Karel Vesely
|
||||
# Johns Hopkins University (Author: Daniel Povey),
|
||||
# Copyright 2012-2013 Brno University of Technology (Author: Karel Vesely),
|
||||
#
|
||||
# Apache 2.0.
|
||||
|
||||
# This script is for use in neural network training and testing; it dumps
|
||||
# (LDA+MLLT or splice+delta) + fMLLR features in a similar format to
|
||||
# conventional raw MFCC features.
|
||||
#
|
||||
# This script dumps fMLLR features in a new data directory,
|
||||
# which is later used for neural network training/testing.
|
||||
|
||||
# Begin configuration section.
|
||||
nj=4
|
||||
cmd=run.pl
|
||||
transform_dir=
|
||||
norm_vars=false
|
||||
# End configuration section.
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
@@ -37,22 +34,15 @@ if [ $# != 5 ]; then
exit 1;
|
||||
fi
|
||||
|
||||
|
||||
data=$1
|
||||
srcdata=$2
|
||||
gmmdir=$3
|
||||
logdir=$4
|
||||
feadir=$5
|
||||
|
||||
|
||||
|
||||
#srcdir=$1 -> gmmdir
|
||||
#data=$2 -> srcdata
|
||||
#dir=$3 -> ruzne
|
||||
#tgtdata=$4 -> feadir
|
||||
|
||||
sdata=$srcdata/split$nj;
|
||||
splice_opts=`cat $gmmdir/splice_opts 2>/dev/null`
|
||||
splice_opts=`cat $gmmdir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $gmmdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
mkdir -p $data $logdir $feadir
|
||||
[[ -d $sdata && $srcdata/feats.scp -ot $sdata ]] || split_data.sh $srcdata $nj || exit 1;
|
||||
|
@@ -73,27 +63,23 @@ esac
if [ ! -z "$transform_dir" ]; then # add transforms to features...
|
||||
echo "Using fMLLR transforms from $transform_dir"
|
||||
[ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." && exit 1
|
||||
# [ "`cat $transform_dir/num_jobs`" -ne $nj ] && \
|
||||
# echo "Mismatch in number of jobs with $transform_dir" && exit 1;
|
||||
# feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
|
||||
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk \"ark:cat $transform_dir/trans.* |\" ark:- ark:- |"
|
||||
fi
|
||||
|
||||
|
||||
#prepare the dir
|
||||
cp $srcdata/* $data; rm $data/{feats.scp,cmvn.scp};
|
||||
# prepare the dir
|
||||
cp $srcdata/* $data 2>/dev/null; rm $data/{feats,cmvn}.scp;
|
||||
|
||||
# make $bnfeadir an absolute pathname.
|
||||
feadir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $feadir ${PWD}`
|
||||
|
||||
name=`basename $data`
|
||||
|
||||
#forward the feats
|
||||
# forward the feats
|
||||
$cmd JOB=1:$nj $logdir/make_fmllr_feats.JOB.log \
|
||||
copy-feats "$feats" \
|
||||
ark,scp:$feadir/feats_fmllr_$name.JOB.ark,$feadir/feats_fmllr_$name.JOB.scp || exit 1;
|
||||
|
||||
#merge the feats to single SCP
|
||||
# merge the SCPs
|
||||
for n in $(seq 1 $nj); do
|
||||
cat $feadir/feats_fmllr_$name.$n.scp
|
||||
done > $data/feats.scp
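
The end result of this fMLLR feature-dumping script is that the adapted features are materialized as per-job ark/scp pairs and the per-job scp files are concatenated into a single feats.scp for the new data directory. Condensed from the lines above (copy-feats writes the archives; the loop builds the merged scp):

  $cmd JOB=1:$nj $logdir/make_fmllr_feats.JOB.log \
    copy-feats "$feats" ark,scp:$feadir/feats_fmllr_$name.JOB.ark,$feadir/feats_fmllr_$name.JOB.scp || exit 1;
  for n in $(seq 1 $nj); do cat $feadir/feats_fmllr_$name.$n.scp; done > $data/feats.scp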
|
||||
|
|
|
@@ -1,9 +1,11 @@
#!/bin/bash
|
||||
|
||||
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
|
||||
# Copyright 2012-2013 Brno University of Technology (Author: Karel Vesely),
|
||||
#
|
||||
# Apache 2.0
|
||||
# Decoding of fMMI or fMPE models (feature-space discriminative training).
|
||||
# If transform-dir supplied, expects e.g. fMLLR transforms in that dir.
|
||||
#
|
||||
# This script dumps fMMI features in a new data directory,
|
||||
# which is later used for neural network training/testing.
|
||||
|
||||
# Begin configuration section.
|
||||
iter=final
|
||||
|
@@ -35,22 +37,15 @@ if [ $# != 5 ]; then
exit 1;
|
||||
fi
|
||||
|
||||
|
||||
data=$1
|
||||
srcdata=$2
|
||||
gmmdir=$3
|
||||
logdir=$4
|
||||
feadir=$5
|
||||
|
||||
|
||||
|
||||
#srcdir=$1 -> gmmdir
|
||||
#data=$2 -> srcdata
|
||||
#dir=$3 -> ruzne
|
||||
#tgtdata=$4 -> feadir
|
||||
|
||||
sdata=$srcdata/split$nj;
|
||||
splice_opts=`cat $gmmdir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $gmmdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
mkdir -p $data $logdir $feadir
|
||||
[[ -d $sdata && $srcdata/feats.scp -ot $sdata ]] || split_data.sh $srcdata $nj || exit 1;
|
||||
|
@@ -63,8 +58,8 @@ if [ -f $gmmdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type";
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $gmmdir/final.mat ark:- ark:- |";;
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $gmmdir/final.mat ark:- ark:- |";;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
esac
|
||||
|
||||
|
@@ -81,18 +76,18 @@ $cmd JOB=1:$nj $logdir/gselect.JOB.log \
gmm-gselect --n=$ngselect $gmmdir/$iter.fmpe "$feats" \
|
||||
"ark:|gzip -c >$feadir/gselect.JOB.gz" || exit 1;
|
||||
|
||||
#prepare the dir
|
||||
cp $srcdata/* $data; rm $data/{feats.scp,cmvn.scp};
|
||||
# prepare the dir
|
||||
cp $srcdata/* $data 2>/dev/null; rm $data/{feats,cmvn}.scp;
|
||||
|
||||
# make $bnfeadir an absolute pathname.
|
||||
feadir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $feadir ${PWD}`
|
||||
|
||||
#forward the feats
|
||||
# forward the feats
|
||||
$cmd JOB=1:$nj $logdir/make_fmmi_feats.JOB.log \
|
||||
fmpe-apply-transform $gmmdir/$iter.fmpe "$feats" "ark,s,cs:gunzip -c $feadir/gselect.JOB.gz|" \
|
||||
ark,scp:$feadir/feats_fmmi.JOB.ark,$feadir/feats_fmmi.JOB.scp || exit 1;
|
||||
|
||||
#merge the feats to single SCP
|
||||
# merge the feats to single SCP
|
||||
for n in $(seq 1 $nj); do
|
||||
cat $feadir/feats_fmmi.$n.scp
|
||||
done > $data/feats.scp
|
||||
|
|
|
@@ -45,9 +45,11 @@ nj=`cat $srcdir/num_jobs` || exit 1;
sdata=$data/split$nj;
|
||||
|
||||
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
mkdir -p $dir/log
|
||||
cp $srcdir/splice_opts $dir 2>/dev/null
|
||||
cp $srcdir/norm_vars $dir 2>/dev/null
|
||||
cp $srcdir/final.mat $dir
|
||||
echo $nj > $dir/num_jobs
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
@@ -60,8 +62,8 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
|
||||
cp $srcdir/final.mat $dir
|
||||
;;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@@ -50,8 +50,6 @@ splice_step=1 # Stepsize of the splicing (1 is consecutive splice,
# value 2 would do [ -10 -8 -6 -4 -2 0 2 4 6 8 10 ] splicing)
|
||||
# misc.
|
||||
verbose=1 # enable per-cache reports
|
||||
# gpu config
|
||||
use_gpu_id= # manually select GPU id to run on, (-1 disables GPU)
|
||||
# End configuration.
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
@@ -172,7 +170,7 @@ else
feature_transform_old=$feature_transform
|
||||
feature_transform=${feature_transform%.nnet}_cmvn-g.nnet
|
||||
echo "Renormalizing MLP input features into $feature_transform"
|
||||
nnet-forward ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
|
||||
nnet-forward --use-gpu=yes \
|
||||
$feature_transform_old "$(echo $feats | sed 's|train.scp|train.scp.10k|')" \
|
||||
ark:- 2>$dir/log/cmvn_glob_fwd.log |\
|
||||
compute-cmvn-stats ark:- - | cmvn-to-nnet - - |\
|
||||
|
@@ -186,7 +184,7 @@ fi
|
||||
|
||||
###### GET THE DIMENSIONS ######
|
||||
num_fea=$(feat-to-dim --print-args=false "$feats nnet-forward --use-gpu-id=-1 $feature_transform ark:- ark:- |" - 2>/dev/null)
|
||||
num_fea=$(feat-to-dim --print-args=false "$feats nnet-forward --use-gpu=no $feature_transform ark:- ark:- |" - 2>/dev/null)
|
||||
num_hid=$hid_dim
|
||||
|
||||
|
||||
|
@@ -208,14 +206,14 @@ for depth in $(seq 1 $nn_depth); do
rbm-train-cd1-frmshuff --learn-rate=$rbm_lrate_low --l2-penalty=$rbm_l2penalty \
|
||||
--num-iters=$((2*$rbm_iter)) --drop-data=$rbm_drop_data --verbose=$verbose \
|
||||
--feature-transform=$feature_transform \
|
||||
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} $rbm_extra_opts \
|
||||
$rbm_extra_opts \
|
||||
$RBM.init "$feats" $RBM 2>$dir/log/rbm.$depth.log || exit 1
|
||||
else
|
||||
#This is Bernoulli-Bernoulli RBM
|
||||
#cmvn stats for init
|
||||
echo "Computing cmvn stats '$dir/$depth.cmvn' for RBM initialization"
|
||||
if [ ! -f $dir/$depth.cmvn ]; then
|
||||
nnet-forward ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
|
||||
nnet-forward --use-gpu=yes \
|
||||
"nnet-concat $feature_transform $dir/$((depth-1)).dbn - |" \
|
||||
"$(echo $feats | sed 's|train.scp|train.scp.10k|')" \
|
||||
ark:- 2>$dir/log/cmvn_fwd.$depth.log | \
|
||||
|
@@ -232,7 +230,7 @@ for depth in $(seq 1 $nn_depth); do
rbm-train-cd1-frmshuff --learn-rate=$rbm_lrate --l2-penalty=$rbm_l2penalty \
|
||||
--num-iters=$rbm_iter --drop-data=$rbm_drop_data --verbose=$verbose \
|
||||
--feature-transform="nnet-concat $feature_transform $dir/$((depth-1)).dbn - |" \
|
||||
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} $rbm_extra_opts \
|
||||
$rbm_extra_opts \
|
||||
$RBM.init "$feats" $RBM 2>$dir/log/rbm.$depth.log || exit 1
|
||||
fi
|
||||
|
||||
|
|
|
@@ -16,6 +16,7 @@ retry_beam=40
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
|
||||
power=0.25 # Exponent for number of gaussians according to occurrence counts
|
||||
cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves
|
||||
norm_vars=false # false : cmn, true : cmvn
|
||||
# End configuration.
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
@@ -55,7 +56,9 @@ echo $nj > $dir/num_jobs
sdata=$data/split$nj;
|
||||
split_data.sh $data $nj || exit 1;
|
||||
|
||||
feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
|
||||
echo $norm_vars > $dir/norm_vars # keep track of feature normalization type for decoding, alignment
|
||||
|
||||
feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
|
||||
|
||||
rm $dir/.error 2>/dev/null
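
The training-side half of the norm_vars change is a one-line bookkeeping step: the chosen normalization mode is written into the experiment directory, and the decode/align scripts earlier in this diff read it back. Producer and consumer, as used in these scripts:

  echo $norm_vars > $dir/norm_vars   # training script: record cmn vs cmvn for later stages
  norm_vars=`cat $srcdir/norm_vars 2>/dev/null` || norm_vars=false   # decode/align: default to cmn if the file is absent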
|
||||
|
||||
|
|
|
@@ -52,6 +52,7 @@ silphonelist=`cat $lang/phones/silence.csl` || exit 1;
|
||||
sdata=$data/split$nj
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $alidir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
mkdir -p $dir/log
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
echo $nj > $dir/num_jobs
|
||||
|
@@ -60,8 +61,8 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
cp $alidir/final.mat $dir
|
||||
;;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@@ -21,6 +21,7 @@ randprune=4.0 # This is approximately the ratio by which we will speed up the
# LDA and MLLT calculations via randomized pruning.
|
||||
splice_opts=
|
||||
cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves
|
||||
norm_vars=false # false : cmn, true : cmvn
|
||||
# End configuration.
|
||||
train_tree=true # if false, don't actually train the tree.
|
||||
use_lda_mat= # If supplied, use this LDA[+MLLT] matrix.
|
||||
|
@@ -63,11 +64,13 @@ echo $nj >$dir/num_jobs
echo "$splice_opts" >$dir/splice_opts # keep track of frame-splicing options
|
||||
# so that later stages of system building can know what they were.
|
||||
|
||||
echo $norm_vars > $dir/norm_vars # keep track of feature normalization type
|
||||
# so that later stages of system building can know what they were.
|
||||
|
||||
sdata=$data/split$nj;
|
||||
split_data.sh $data $nj || exit 1;
|
||||
|
||||
|
||||
splicedfeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |"
|
||||
splicedfeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |"
|
||||
# Note: $feats gets overwritten later in the script.
|
||||
feats="$splicedfeats transform-feats $dir/0.mat ark:- ark:- |"
|
||||
|
||||
|
|
|
@@ -56,8 +56,10 @@ nj=`cat $alidir/num_jobs` || exit 1;
|
||||
sdata=$data/split$nj
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $alidir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
mkdir -p $dir/log
|
||||
cp $alidir/splice_opts $dir 2>/dev/null
|
||||
cp $alidir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
echo $nj > $dir/num_jobs
|
||||
|
||||
|
@@ -72,8 +74,8 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
cp $alidir/final.mat $dir
|
||||
;;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@@ -75,8 +75,10 @@ nj=`cat $alidir/num_jobs` || exit 1;
echo "$alidir and $denlatdir have different num-jobs" && exit 1;
|
||||
sdata=$data/split$nj
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $alidir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
mkdir -p $dir/log
|
||||
cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
|
||||
cp $alidir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
||||
|
||||
|
@@ -85,8 +87,8 @@ echo "$0: feature type is $feat_type"
|
||||
# Note: $feats is the features before fMPE.
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
cp $alidir/final.mat $dir
|
||||
;;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@@ -73,8 +73,10 @@ nj=`cat $alidir/num_jobs` || exit 1;
echo "$alidir and $denlatdir have different num-jobs" && exit 1;
|
||||
sdata=$data/split$nj
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $alidir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
mkdir -p $dir/log
|
||||
cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
|
||||
cp $alidir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
||||
|
||||
|
@@ -83,8 +85,8 @@ echo "$0: feature type is $feat_type"
|
||||
# Note: $feats is the features before fMPE.
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
cp $alidir/final.mat $dir
|
||||
;;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@@ -50,9 +50,11 @@ nj=`cat $alidir/num_jobs` || exit 1;
|
||||
sdata=$data/split$nj
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $alidir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
mkdir -p $dir/log
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
cp $alidir/splice_opts $dir 2>/dev/null
|
||||
cp $alidir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
echo $nj > $dir/num_jobs
|
||||
|
||||
cp $alidir/tree $dir
|
||||
|
@@ -66,8 +68,8 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
cp $alidir/final.mat $dir
|
||||
;;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@@ -50,8 +50,10 @@ nj=`cat $alidir/num_jobs` || exit 1;
|
||||
sdata=$data/split$nj
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $alidir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
mkdir -p $dir/log
|
||||
cp $alidir/splice_opts $dir 2>/dev/null
|
||||
cp $alidir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
echo $nj > $dir/num_jobs
|
||||
|
||||
|
@@ -66,8 +68,8 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
cp $alidir/final.mat $dir
|
||||
;;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@@ -20,6 +20,7 @@ config= # name of config file.
stage=-4
|
||||
power=0.25 # exponent to determine number of gaussians from occurrence counts
|
||||
feat_dim=-1 # This option is now ignored but retained for compatibility.
|
||||
norm_vars=false # false : cmn, true : cmvn
|
||||
# End configuration section.
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
@@ -50,8 +51,9 @@ echo $nj > $dir/num_jobs
sdata=$data/split$nj;
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
||||
echo $norm_vars > $dir/norm_vars # keep track of feature normalization type for decoding, alignment
|
||||
|
||||
feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
|
||||
feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
|
||||
example_feats="`echo $feats | sed s/JOB/1/g`";
|
||||
|
||||
echo "$0: Initializing monophone system."
|
||||
|
|
|
@@ -55,8 +55,10 @@ nj=`cat $alidir/num_jobs` || exit 1;
|
||||
sdata=$data/split$nj
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $alidir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
mkdir -p $dir/log
|
||||
cp $alidir/splice_opts $dir 2>/dev/null
|
||||
cp $alidir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
echo $nj > $dir/num_jobs
|
||||
|
||||
|
@@ -70,8 +72,8 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
cp $alidir/final.mat $dir
|
||||
;;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@@ -46,7 +46,6 @@ train_opts= # options, passed to the training script
train_tool= # optionally change the training tool
|
||||
|
||||
# OTHER
|
||||
use_gpu_id= # manually select GPU id to run on, (-1 disables GPU)
|
||||
analyze_alignments=true # run the alignment analysis script
|
||||
seed=777 # seed value used for training data shuffling and initialization
|
||||
# End configuration.
|
||||
|
@@ -186,8 +185,9 @@ echo "Feature dim is : $feat_dim"
|
||||
if [ ! -z "$feature_transform" ]; then
|
||||
echo "Using pre-computed feature-transform : '$feature_transform'"
|
||||
[ ! -f $feature_transform ] && echo "Missing file '$feature_transform'" && exit 1
|
||||
tmp=$dir/$(basename $feature_transform)
|
||||
cp $feature_transform $tmp; feature_transform=$tmp
|
||||
else
|
||||
# Generate the splice transform
|
||||
echo "Using splice +/- $splice , step $splice_step"
|
||||
|
@@ -258,7 +258,7 @@ else
feature_transform_old=$feature_transform
|
||||
feature_transform=${feature_transform%.nnet}_cmvn-g.nnet
|
||||
echo "Renormalizing MLP input features into $feature_transform"
|
||||
nnet-forward ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
|
||||
nnet-forward --use-gpu=yes \
|
||||
$feature_transform_old "$(echo $feats_tr | sed 's|train.scp|train.scp.10k|')" \
|
||||
ark:- 2>$dir/log/nnet-forward-cmvn.log |\
|
||||
compute-cmvn-stats ark:- - | cmvn-to-nnet - - |\
|
||||
|
@@ -299,6 +299,7 @@ else
|
||||
#optionally prepend dbn to the initialization
|
||||
if [ ! -z $dbn ]; then
|
||||
[ ! -f $dbn ] && echo "Missing file '$dbn'" && exit 1
|
||||
mlp_init_old=$mlp_init; mlp_init=$dir/nnet_$(basename $dbn)_dnn.init
|
||||
nnet-concat $dbn $mlp_init_old $mlp_init
|
||||
fi
|
||||
|
@@ -315,7 +316,6 @@ steps/train_nnet_scheduler.sh \
${train_opts} \
|
||||
${train_tool:+ --train-tool "$train_tool"} \
|
||||
${config:+ --config $config} \
|
||||
${use_gpu_id:+ --use-gpu-id $use_gpu_id} \
|
||||
$mlp_init "$feats_tr" "$feats_cv" "$labels_tr" "$labels_cv" $dir || exit 1
|
||||
|
||||
|
||||
|
|
|
@@ -21,7 +21,6 @@ learn_rate=0.00001
halving_factor=1.0 #ie. disable halving
|
||||
drop_frames=true
|
||||
verbose=1
|
||||
use_gpu_id=
|
||||
|
||||
seed=777 # seed value used for training data shuffling
|
||||
# End configuration section
|
||||
|
@@ -168,7 +167,6 @@ while [ $x -le $num_iters ]; do
--learn-rate=$learn_rate \
|
||||
--drop-frames=$drop_frames \
|
||||
--verbose=$verbose \
|
||||
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
|
||||
$cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet || exit 1
|
||||
fi
|
||||
cur_mdl=$dir/$x.nnet
|
||||
|
|
|
@@ -21,7 +21,6 @@ halving_factor=1.0 #ie. disable halving
do_smbr=true
|
||||
use_silphones=false #setting this to something will enable giving siphones to nnet-mpe
|
||||
verbose=1
|
||||
use_gpu_id=
|
||||
|
||||
seed=777 # seed value used for training data shuffling
|
||||
# End configuration section
|
||||
|
@@ -151,7 +150,6 @@ while [ $x -le $num_iters ]; do
--do-smbr=$do_smbr \
|
||||
--verbose=$verbose \
|
||||
$mpe_silphones_arg \
|
||||
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
|
||||
$cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet || exit 1
|
||||
fi
|
||||
cur_mdl=$dir/$x.nnet
|
||||
|
|
|
@@ -25,8 +25,6 @@ end_halving_inc=0.1
halving_factor=0.5
|
||||
# misc.
|
||||
verbose=1
|
||||
# gpu
|
||||
use_gpu_id=
|
||||
# tool
|
||||
train_tool="nnet-train-xent-hardlab-frmshuff"
|
||||
|
||||
|
@@ -73,7 +71,6 @@ mlp_base=${mlp_init##*/}; mlp_base=${mlp_base%.*}
$train_tool --cross-validate=true \
|
||||
--bunchsize=$bunch_size --cachesize=$cache_size --verbose=$verbose \
|
||||
${feature_transform:+ --feature-transform=$feature_transform} \
|
||||
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
|
||||
$mlp_best "$feats_cv" "$labels_cv" \
|
||||
2> $dir/log/prerun.log || exit 1;
|
||||
|
||||
|
@@ -97,7 +94,6 @@ for iter in $(seq -w $max_iters); do
--learn-rate=$learn_rate --momentum=$momentum --l1-penalty=$l1_penalty --l2-penalty=$l2_penalty \
|
||||
--bunchsize=$bunch_size --cachesize=$cache_size --randomize=true --verbose=$verbose \
|
||||
${feature_transform:+ --feature-transform=$feature_transform} \
|
||||
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
|
||||
${seed:+ --seed=$seed} \
|
||||
$mlp_best "$feats_tr" "$labels_tr" $mlp_next \
|
||||
2> $dir/log/iter$iter.log || exit 1;
|
||||
|
@@ -110,7 +106,6 @@ for iter in $(seq -w $max_iters); do
$train_tool --cross-validate=true \
|
||||
--bunchsize=$bunch_size --cachesize=$cache_size --verbose=$verbose \
|
||||
${feature_transform:+ --feature-transform=$feature_transform} \
|
||||
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
|
||||
$mlp_next "$feats_cv" "$labels_cv" \
|
||||
2>>$dir/log/iter$iter.log || exit 1;
|
||||
|
||||
|
|
|
@@ -57,10 +57,12 @@ incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss
nj=`cat $alidir/num_jobs` || exit 1;
|
||||
sdata=$data/split$nj
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $alidir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
mkdir -p $dir/log
|
||||
echo $nj >$dir/num_jobs
|
||||
cp $alidir/splice_opts $dir 2>/dev/null
|
||||
cp $alidir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
||||
## Set up features.
|
||||
|
@@ -68,8 +70,8 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
cp $alidir/final.mat $dir
|
||||
;;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@@ -68,11 +68,13 @@ silphonelist=`cat $lang/phones/silence.csl`
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
|
||||
sdata=$data/split$nj;
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $alidir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
phone_map_opt=
|
||||
[ ! -z "$phone_map" ] && phone_map_opt="--phone-map='$phone_map'"
|
||||
|
||||
mkdir -p $dir/log
|
||||
cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
|
||||
cp $alidir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
|
||||
echo $nj >$dir/num_jobs
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
@@ -84,7 +86,7 @@ if [[ ! -f $alidir/final.mat || ! -f $alidir/full.mat ]]; then
exit 1
|
||||
fi
|
||||
|
||||
sisplicedfeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |"
|
||||
sisplicedfeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |"
|
||||
sifeats="$sisplicedfeats transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
|
||||
|
||||
|
@@ -109,7 +111,7 @@ else
cur_trans_dir=$dir
|
||||
fi
|
||||
|
||||
splicedfeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$cur_trans_dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- |"
|
||||
splicedfeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$cur_trans_dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- |"
|
||||
|
||||
|
||||
if [ $stage -le -5 ]; then
|
||||
|
@ -219,7 +221,7 @@ while [ $x -lt $num_iters ]; do
|
|||
done
|
||||
fi
|
||||
cur_trans_dir=$dir
|
||||
splicedfeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$cur_trans_dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- |"
|
||||
splicedfeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$cur_trans_dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- |"
|
||||
feats="$splicedfeats transform-feats $dir/$cur_lda_iter.mat ark:- ark:- |"
|
||||
fi
|
||||
|
||||
|
|
|
@ -64,11 +64,13 @@ silphonelist=`cat $lang/phones/silence.csl`
|
|||
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
|
||||
sdata=$data/split$nj;
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $alidir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
phone_map_opt=
|
||||
[ ! -z "$phone_map" ] && phone_map_opt="--phone-map='$phone_map'"
|
||||
|
||||
mkdir -p $dir/log
|
||||
cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
|
||||
cp $alidir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
|
||||
echo $nj >$dir/num_jobs
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
@ -80,8 +82,8 @@ echo "$0: feature type is $feat_type"
|
|||
|
||||
## Set up speaker-independent features.
|
||||
case $feat_type in
|
||||
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
cp $alidir/final.mat $dir
|
||||
;;
|
||||
*) echo "$0: invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@ -64,11 +64,13 @@ silphonelist=`cat $lang/phones/silence.csl`
|
|||
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
|
||||
sdata=$data/split$nj;
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $alidir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
phone_map_opt=
|
||||
[ ! -z "$phone_map" ] && phone_map_opt="--phone-map='$phone_map'"
|
||||
|
||||
mkdir -p $dir/log
|
||||
cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
|
||||
cp $alidir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
|
||||
echo $nj >$dir/num_jobs
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
@ -80,8 +82,8 @@ echo "$0: feature type is $feat_type"
|
|||
|
||||
## Set up speaker-independent features.
|
||||
case $feat_type in
|
||||
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
cp $alidir/final.mat $dir
|
||||
;;
|
||||
*) echo "$0: invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@ -49,6 +49,7 @@ numgauss=$numleaves
|
|||
incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter increment for #Gauss
|
||||
oov=`cat $lang/oov.int` || exit 1;
|
||||
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
|
||||
norm_vars=`cat $alidir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
nj=`cat $alidir/num_jobs` || exit 1;
|
||||
mkdir -p $dir/log
|
||||
echo $nj > $dir/num_jobs
|
||||
|
@ -56,7 +57,7 @@ echo $nj > $dir/num_jobs
|
|||
sdata=$data/split$nj;
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
||||
feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
|
||||
feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
|
||||
|
||||
rm $dir/.error 2>/dev/null
|
||||
|
||||
|
|
|
@ -81,7 +81,10 @@ mkdir -p $dir/log
|
|||
echo $nj > $dir/num_jobs
|
||||
sdata=$data/split$nj;
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $alidir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
cp $alidir/splice_opts $dir 2>/dev/null
|
||||
cp $alidir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
||||
spkvecs_opt= # Empty option for now, until we estimate the speaker vectors.
|
||||
|
@ -92,8 +95,8 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
|||
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
cp $alidir/final.mat $dir
|
||||
;;
|
||||
*) echo "$0: invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@ -90,9 +90,11 @@ feat_dim=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/feature dimension/{prin
|
|||
[ -z $spk_dim ] && spk_dim=$feat_dim
|
||||
nj=`cat $alidir/num_jobs` || exit 1;
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $alidir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
mkdir -p $dir/log
|
||||
cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
|
||||
cp $alidir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
echo $nj > $dir/num_jobs
|
||||
sdata=$data/split$nj;
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
@ -105,8 +107,8 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
|||
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
cp $alidir/final.mat $dir
|
||||
;;
|
||||
*) echo "$0: invalid feature type $feat_type" && exit 1;
|
||||
|
@ -116,7 +118,7 @@ if [ -f $alidir/trans.1 ]; then
|
|||
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
|
||||
elif [ -f $alidir/raw_trans.1 ]; then
|
||||
echo "$0: using raw-fMLLR transforms from $alidir"
|
||||
feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
fi
|
||||
##
|
||||
|
||||
|
|
|
@ -94,9 +94,11 @@ feat_dim=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/feature dimension/{prin
|
|||
[ -z $spk_dim ] && spk_dim=$feat_dim
|
||||
nj=`cat $alidir/num_jobs` || exit 1;
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $alidir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
mkdir -p $dir/log
|
||||
cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
|
||||
cp $alidir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
echo $nj > $dir/num_jobs
|
||||
sdata=$data/split$nj;
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
|
@ -109,8 +111,8 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
|||
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
cp $alidir/final.mat $dir
|
||||
;;
|
||||
*) echo "$0: invalid feature type $feat_type" && exit 1;
|
||||
|
@ -120,7 +122,7 @@ if [ -f $alidir/trans.1 ]; then
|
|||
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
|
||||
elif [ -f $alidir/raw_trans.1 ]; then
|
||||
echo "$0: using raw-fMLLR transforms from $alidir"
|
||||
feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
fi
|
||||
##
|
||||
|
||||
|
|
|
@ -53,8 +53,10 @@ nj=`cat $alidir/num_jobs` || exit 1;
|
|||
|
||||
sdata=$data/split$nj
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
|
||||
norm_vars=`cat $alidir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
mkdir -p $dir/log
|
||||
cp $alidir/splice_opts $dir 2>/dev/null
|
||||
cp $alidir/norm_vars $dir 2>/dev/null # cmn/cmvn option.
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
echo $nj > $dir/num_jobs
|
||||
|
||||
|
@ -68,8 +70,8 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
|||
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
cp $alidir/final.mat $dir
|
||||
;;
|
||||
*) echo "Invalid feature type $feat_type" && exit 1;
|
||||
|
|
|
@ -62,14 +62,15 @@ echo $nj > $dir/num_jobs
|
|||
sdata=$data/split$nj;
|
||||
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
|
||||
splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
|
||||
norm_vars=`cat $alidir/norm_vars 2>/dev/null` || norm_vars=false # cmn/cmvn option, default false.
|
||||
|
||||
## Set up features.
|
||||
if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
|
||||
echo "$0: feature type is $feat_type"
|
||||
|
||||
case $feat_type in
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
delta) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
|
||||
lda) feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
cp $alidir/final.mat $dir
|
||||
;;
|
||||
*) echo "$0: invalid feature type $feat_type" && exit 1;
|
||||
|
@ -85,7 +86,7 @@ if [ -f $alidir/trans.1 ]; then
|
|||
fi
|
||||
elif [ -f $alidir/raw_trans.1 ]; then
|
||||
echo "$0: using raw-FMLLR transforms from $alidir"
|
||||
feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
feats="ark,s,cs:apply-cmvn --norm-vars=$norm_vars --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
|
||||
fi
|
||||
##
|
||||
|
||||
|
|
|
@ -28,7 +28,14 @@ mkdir -p $dest;
|
|||
|
||||
export LC_ALL=C
|
||||
|
||||
for file in utt2spk feats.scp text cmvn.scp segments reco2file_and_channel wav.scp $extra_files; do
|
||||
for dir in $*; do
|
||||
if [ ! -f $dir/utt2spk ]; then
|
||||
echo "$0: no such file $dir/utt2spk"
|
||||
exit 1;
|
||||
fi
|
||||
done
|
||||
|
||||
for file in utt2spk feats.scp text cmvn.scp segments reco2file_and_channel wav.scp spk2gender $extra_files; do
|
||||
if [ -f $first_src/$file ]; then
|
||||
( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1;
|
||||
echo "$0: combined $file"
|
||||
|
|
|
@ -107,7 +107,7 @@ $qdir = "$dir/q";
|
|||
$qdir =~ s:/(log|LOG)/*q:/q:; # If qdir ends in .../log/q, make it just .../q.
|
||||
$queue_logfile = "$qdir/$base";
|
||||
|
||||
if (!-d $dir) { system "mkdir $dir 2>/dev/null"; } # another job may be doing this...
|
||||
if (!-d $dir) { system "mkdir -p $dir 2>/dev/null"; } # another job may be doing this...
|
||||
if (!-d $dir) { die "Cannot make the directory $dir\n"; }
|
||||
# make a directory called "q",
|
||||
# where we will put the log created by qsub... normally this doesn't contain
|
||||
|
|
|
@ -21,7 +21,7 @@ for x in `seq 3`; do
|
|||
done
|
||||
|
||||
if [ $# -ne 1 ]; then
|
||||
echo "Usage: $0 [---no-feats] [---no-text] [---no-wav] data-dir"
|
||||
echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] data-dir"
|
||||
echo "e.g.: $0 data/train"
|
||||
fi
|
||||
|
||||
|
@ -80,7 +80,7 @@ check_sorted $data/spk2utt
|
|||
cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts
|
||||
|
||||
if [ ! -f $data/text ] && ! $no_text; then
|
||||
echo "$0: no such file $data/text (if this is by design, specify ---no-text)"
|
||||
echo "$0: no such file $data/text (if this is by design, specify --no-text)"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
|
@ -104,7 +104,7 @@ fi
|
|||
|
||||
|
||||
if [ ! -f $data/wav.scp ] && ! $no_wav; then
|
||||
echo "$0: no such file $data/wav.scp (if this is by design, specify ---no-wav)"
|
||||
echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
|
@ -187,7 +187,7 @@ if [ -f $data/wav.scp ]; then
|
|||
fi
|
||||
|
||||
if [ ! -f $data/feats.scp ] && ! $no_feats; then
|
||||
echo "$0: no such file $data/feats.scp (if this is by design, specify ---no-feats)"
|
||||
echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
|
|
|
@ -9,12 +9,16 @@ OPENFST_LDLIBS =
|
|||
include ../kaldi.mk
|
||||
|
||||
LDFLAGS += $(CUDA_LDFLAGS)
|
||||
LDLIBS += $(CUDA_LDLIBS)
|
||||
|
||||
TESTFILES = cuda-matrix-test
|
||||
TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test cu-packed-matrix-test cu-tp-matrix-test \
|
||||
cu-block-matrix-test cu-matrix-speed-test cu-vector-speed-test cu-sp-matrix-speed-test cu-array-test
|
||||
|
||||
OBJFILES = cu-device.o cu-math.o cu-matrix.o
|
||||
|
||||
OBJFILES = cu-device.o cu-math.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \
|
||||
cu-vector.o cu-common.o cu-tp-matrix.o cu-rand.o cu-block-matrix.o
|
||||
ifeq ($(CUDA), true)
|
||||
OBJFILES += cu-kernels.o cu-randkernels.o
|
||||
OBJFILES += cu-kernels.o cu-randkernels.o cu-choleskykernels.o
|
||||
endif
|
||||
|
||||
LIBNAME = kaldi-cudamatrix
|
||||
|
|
|
@ -0,0 +1,208 @@
|
|||
// cudamatrix/cu-array-inl.h
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
|
||||
#ifndef KALDI_CUDAMATRIX_CU_ARRAY_INL_H_
|
||||
#define KALDI_CUDAMATRIX_CU_ARRAY_INL_H_
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
#include <cuda_runtime_api.h>
|
||||
#include "cudamatrix/cu-common.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
#include "cudamatrix/cu-kernels.h"
|
||||
#endif
|
||||
|
||||
#include "util/timer.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
template<typename T>
|
||||
void CuArray<T>::Resize(MatrixIndexT dim, MatrixResizeType resize_type) {
|
||||
KALDI_ASSERT((resize_type == kSetZero || resize_type == kUndefined) && dim >= 0);
|
||||
if (dim_ == dim) {
|
||||
if (resize_type == kSetZero)
|
||||
SetZero();
|
||||
return;
|
||||
}
|
||||
|
||||
Destroy();
|
||||
|
||||
if (dim == 0) return;
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
CU_SAFE_CALL(cudaMalloc((void**)&data_, dim*sizeof(T)));
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
data_ = static_cast<T*>(malloc(dim * sizeof(T)));
|
||||
// We allocate with malloc because we don't want constructors being called.
|
||||
// We basically ignore memory alignment issues here-- we assume the malloc
|
||||
// implementation is forgiving enough that it will automatically align on
|
||||
// sensible boundaries.
|
||||
if (data_ == 0)
|
||||
KALDI_ERR << "Memory allocation failed when initializing CuVector "
|
||||
<< "with dimension " << dim << " object size in bytes: "
|
||||
<< sizeof(T);
|
||||
}
|
||||
|
||||
dim_ = dim;
|
||||
if (resize_type == kSetZero)
|
||||
SetZero();
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void CuArray<T>::Destroy() {
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
if (data_ != NULL) {
|
||||
CU_SAFE_CALL(cudaFree(data_));
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
if (data_ != NULL)
|
||||
free(data_);
|
||||
}
|
||||
dim_ = 0;
|
||||
data_ = NULL;
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
void CuArray<T>::CopyFromVec(const std::vector<T> &src) {
|
||||
Resize(src.size(), kUndefined);
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
CU_SAFE_CALL(cudaMemcpy(data_, &src.front(), src.size()*sizeof(T), cudaMemcpyHostToDevice));
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
memcpy(data_, &src.front(), src.size()*sizeof(T));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename T>
|
||||
void CuArray<T>::CopyToVec(std::vector<T> *dst) const {
|
||||
if (static_cast<MatrixIndexT>(dst->size()) != dim_) {
|
||||
dst->resize(dim_);
|
||||
}
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
CU_SAFE_CALL(cudaMemcpy(&dst->front(), Data(), dim_*sizeof(T), cudaMemcpyDeviceToHost));
|
||||
CuDevice::Instantiate().AccuProfile("CuArray::CopyToVecD2H", tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
memcpy(&dst->front(), data_, dim_*sizeof(T));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
void CuArray<T>::SetZero() {
|
||||
if (dim_ == 0) return;
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
CU_SAFE_CALL(cudaMemset(data_, 0, dim_ * sizeof(T)));
|
||||
CuDevice::Instantiate().AccuProfile("CuArray::SetZero", tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
memset(static_cast<void*>(data_), 0, dim_ * sizeof(T));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Print the vector to stream
|
||||
*/
|
||||
template<typename T>
|
||||
std::ostream &operator << (std::ostream &out, const CuArray<T> &vec) {
|
||||
std::vector<T> tmp;
|
||||
vec.CopyToVec(&tmp);
|
||||
out << "[";
|
||||
for(int32 i=0; i<tmp.size(); i++) {
|
||||
out << " " << tmp[i];
|
||||
}
|
||||
out << " ]\n";
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
template<class T>
|
||||
inline void CuArray<T>::Set(const T &value) {
|
||||
// This is not implemented yet, we'll do so if it's needed.
|
||||
KALDI_ERR << "CuArray<T>::Set not implemented yet for this type.";
|
||||
}
|
||||
|
||||
template<>
|
||||
inline void CuArray<int32>::Set(const int32 &value) {
|
||||
if (dim_ == 0) return;
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
dim3 dimBlock(CU2DBLOCK);
|
||||
dim3 dimGrid(n_blocks(Dim(), CU2DBLOCK));
|
||||
::MatrixDim d = { 1, Dim(), Dim() };
|
||||
|
||||
cudaI32_set_const(dimGrid, dimBlock, data_, value, d);
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
for (int32 i = 0; i < dim_; i++)
|
||||
data_[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void CuArray<T>::CopyFromArray(const CuArray<T> &src) {
|
||||
this->Resize(src.Dim(), kUndefined);
|
||||
if (dim_ == 0) return;
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
CU_SAFE_CALL(cudaMemcpy(this->data_, src.data_, dim_ * sizeof(T),
|
||||
cudaMemcpyDeviceToDevice));
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
memcpy(this->data_, src.data_, dim_ * sizeof(T));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
#endif
|
|
@ -0,0 +1,124 @@
|
|||
// cudamatrix/cu-array-test.cc
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "cudamatrix/cu-array.h"
|
||||
|
||||
using namespace kaldi;
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
|
||||
|
||||
template<class T>
|
||||
static void UnitTestCuArray() {
|
||||
for (int32 i = 0; i < 30; i++) {
|
||||
int32 size = rand() % 5;
|
||||
size = size * size * size; // Have a good distribution of sizes, including >256.
|
||||
int32 size2 = rand() % 4;
|
||||
std::vector<T> vec(size);
|
||||
std::vector<T> garbage_vec(size2); // We just use garbage_vec to make sure
|
||||
// we sometimes resize from empty,
|
||||
// sometimes not.
|
||||
|
||||
int32 byte_size = size * sizeof(T);
|
||||
std::vector<char> rand_c(byte_size);
|
||||
for (size_t i = 0; i < byte_size; i++)
|
||||
rand_c[i] = rand() % 256;
|
||||
if (!vec.empty()) {
|
||||
std::memcpy((void*)&(vec[0]), (void*)&(rand_c[0]),
|
||||
byte_size);
|
||||
}
|
||||
|
||||
{ // test constructor from vector and CopyToVec.
|
||||
CuArray<T> cu_vec(vec);
|
||||
std::vector<T> vec2;
|
||||
cu_vec.CopyToVec(&vec2);
|
||||
KALDI_ASSERT(vec2 == vec);
|
||||
}
|
||||
|
||||
{ // test assignment operator from CuArray.
|
||||
CuArray<T> cu_vec(vec);
|
||||
CuArray<T> cu_vec2(garbage_vec);
|
||||
cu_vec2 = cu_vec;
|
||||
std::vector<T> vec2;
|
||||
cu_vec2.CopyToVec(&vec2);
|
||||
KALDI_ASSERT(vec2 == vec);
|
||||
KALDI_ASSERT(cu_vec2.Dim() == int32(vec2.size())); // test Dim()
|
||||
}
|
||||
|
||||
{ // test resize with resize_type = kSetZero.
|
||||
CuArray<T> cu_vec(vec);
|
||||
cu_vec.Resize(size, kSetZero);
|
||||
std::vector<T> vec2(vec);
|
||||
|
||||
if (!vec2.empty())
|
||||
std::memset(&(vec2[0]), 0, vec2.size() * sizeof(T));
|
||||
std::vector<T> vec3;
|
||||
cu_vec.CopyToVec(&vec3);
|
||||
KALDI_ASSERT(vec2 == vec3); // testing equality of zero arrays.
|
||||
}
|
||||
|
||||
if (sizeof(T) == sizeof(int32) && size > 0) { // test Set for type int32, or same size.
|
||||
CuArray<T> cu_vec(vec);
|
||||
cu_vec.Set(vec[0]);
|
||||
for (size_t i = 1; i < vec.size(); i++) vec[i] = vec[0];
|
||||
std::vector<T> vec2;
|
||||
cu_vec.CopyToVec(&vec2);
|
||||
KALDI_ASSERT(vec2 == vec);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
int main() {
|
||||
for (int32 loop = 0; loop < 2; loop++) {
|
||||
#if HAVE_CUDA == 1
|
||||
if (loop == 0)
|
||||
CuDevice::Instantiate().SelectGpuId("no");
|
||||
else
|
||||
CuDevice::Instantiate().SelectGpuId("yes");
|
||||
#endif
|
||||
|
||||
//kaldi::UnitTestCuArray<float>();
|
||||
kaldi::UnitTestCuArray<double>();
|
||||
kaldi::UnitTestCuArray<int32>();
|
||||
kaldi::UnitTestCuArray<std::pair<int32, int32> >();
|
||||
|
||||
if (loop == 0)
|
||||
KALDI_LOG << "Tests without GPU use succeeded.\n";
|
||||
else
|
||||
KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
|
||||
}
|
||||
#if HAVE_CUDA == 1
|
||||
CuDevice::Instantiate().PrintProfile();
|
||||
#endif
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,123 @@
|
|||
// cudamatrix/cu-array.h
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
|
||||
#ifndef KALDI_CUDAMATRIX_CU_ARRAY_H_
|
||||
#define KALDI_CUDAMATRIX_CU_ARRAY_H_
|
||||
|
||||
#include "matrix/kaldi-vector.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
/**
|
||||
* std::vector equivalent for CUDA computing. This class is mostly intended as
|
||||
* a CUDA-based mirror of a std::vector object that lives on the CPU. We don't
|
||||
* call constructors, initializers, etc., on the GPU.
|
||||
*/
|
||||
template<typename T>
|
||||
class CuArray {
|
||||
typedef CuArray<T> ThisType;
|
||||
public:
|
||||
|
||||
/// Default Constructor
|
||||
CuArray<T>() : dim_(0), data_(NULL) { }
|
||||
|
||||
/// Constructor with memory initialisation. resize_type may be kSetZero or
|
||||
/// kUndefined.
|
||||
explicit CuArray<T>(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero):
|
||||
dim_(0), data_(NULL) { Resize(dim, resize_type); }
|
||||
|
||||
/// Constructor from CPU-based int vector
|
||||
explicit CuArray<T>(const std::vector<T> &src):
|
||||
dim_(0), data_(NULL) { CopyFromVec(src); }
|
||||
|
||||
explicit CuArray<T>(const CuArray<T> &src):
|
||||
dim_(0), data_(NULL) { CopyFromArray(src); }
|
||||
|
||||
/// Destructor
|
||||
~CuArray() { Destroy(); }
|
||||
|
||||
/// Return the vector dimension
|
||||
MatrixIndexT Dim() const { return dim_; }
|
||||
|
||||
/// Get raw pointer
|
||||
const T* Data() const { return data_; }
|
||||
|
||||
T* Data() { return data_; }
|
||||
|
||||
/// Allocate the memory. resize_type may be kSetZero or kUndefined.
|
||||
/// kCopyData not yet supported (can be implemented if needed).
|
||||
void Resize(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero);
|
||||
|
||||
/// Deallocate the memory and set dim_ and data_ to zero. Does not call any
|
||||
/// destructors of the objects stored.
|
||||
void Destroy();
|
||||
|
||||
/// This function resizes if needed. Note: copying to GPU is done via memcpy,
|
||||
/// and any constructors or assignment operators are not called.
|
||||
void CopyFromVec(const std::vector<T> &src);
|
||||
|
||||
/// This function resizes if needed.
|
||||
void CopyFromArray(const CuArray<T> &src);
|
||||
|
||||
/// This function resizes *dst if needed. On resize of "dst", the STL vector
|
||||
/// may call copy-constructors, initializers, and assignment operators for
|
||||
/// existing objects (which will be overwritten), but the copy from GPU to CPU
|
||||
/// is done via memcpy. So be very careful calling this function if your
|
||||
/// objects are more than plain structs.
|
||||
void CopyToVec(std::vector<T> *dst) const;
|
||||
|
||||
/// Sets the memory for the object to zero, via memset. You should verify
|
||||
/// that this makes sense for type T.
|
||||
void SetZero();
|
||||
|
||||
/// Set to a constant value. Note: any copying is done as if using memcpy, and
|
||||
/// assignment operators or destructors are not called. This is NOT IMPLEMENTED
|
||||
/// YET except for T == int32 (the current implementation will just crash).
|
||||
void Set(const T &value);
|
||||
|
||||
CuArray<T> &operator= (const CuArray<T> &in) {
|
||||
this->CopyFromArray(in); return *this;
|
||||
}
|
||||
|
||||
CuArray<T> &operator= (const std::vector<T> &in) {
|
||||
this->CopyFromVec(in); return *this;
|
||||
}
|
||||
|
||||
private:
|
||||
MatrixIndexT dim_; ///< dimension of the vector
|
||||
T *data_; ///< GPU data pointer (if GPU not available,
|
||||
///< will point to CPU memory).
|
||||
};
|
||||
|
||||
|
||||
/// I/O
|
||||
template<typename T>
|
||||
std::ostream &operator << (std::ostream &out, const CuArray<T> &vec);
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
#include "cudamatrix/cu-array-inl.h"
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,254 @@
|
|||
// cudamatrix/cu-block-matrix-test.cc
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "cudamatrix/cu-matrix-lib.h"
|
||||
|
||||
using namespace kaldi;
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
/*
|
||||
* ASSERTS
|
||||
*/
|
||||
template<typename Real>
|
||||
static void AssertEqual(const MatrixBase<Real> &A,
|
||||
const MatrixBase<Real> &B,
|
||||
float tol = 0.001) {
|
||||
KALDI_ASSERT(A.NumRows() == B.NumRows()&&A.NumCols() == B.NumCols());
|
||||
for (MatrixIndexT i = 0;i < A.NumRows();i++) {
|
||||
for (MatrixIndexT j = 0;j < A.NumCols();j++) {
|
||||
KALDI_ASSERT(std::abs(A(i, j)-B(i, j)) <= tol*std::max(1.0, (double) (std::abs(A(i, j))+std::abs(B(i, j)))));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
static void AssertEqual(const CuMatrixBase<Real> &A,
|
||||
const CuMatrixBase<Real> &B,
|
||||
float tol = 0.001) {
|
||||
Real Anorm = A.FrobeniusNorm(), Bnorm = B.FrobeniusNorm();
|
||||
CuMatrix<Real> diff(A);
|
||||
diff.AddMat(-1.0, B);
|
||||
Real diff_norm = diff.FrobeniusNorm();
|
||||
if (diff_norm > tol * 0.5 * (Anorm + Bnorm)) {
|
||||
KALDI_LOG << "A = " << A;
|
||||
KALDI_LOG << "B = " << B;
|
||||
KALDI_ERR << "Matrices differ, " << diff_norm << " > " << tol << " * 0.5 * ( "
|
||||
<< Anorm << " + " << Bnorm << " ). ";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
static void AssertEqual(const CuBlockMatrix<Real> &A,
|
||||
const CuBlockMatrix<Real> &B,
|
||||
float tol = 0.001) {
|
||||
CuMatrix<Real> Acopy(A), Bcopy(B);
|
||||
AssertEqual(Acopy, Bcopy, tol);
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
static bool ApproxEqual(const CuBlockMatrix<Real> &A,
|
||||
const CuBlockMatrix<Real> &B,
|
||||
float tol = 0.001) {
|
||||
CuMatrix<Real> Acopy(A), Bcopy(B);
|
||||
return Acopy.ApproxEqual(Bcopy, tol);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuBlockMatrixIO() {
|
||||
for (int32 i = 0; i < 10; i++) {
|
||||
int32 num_blocks = rand() % 5;
|
||||
std::vector<CuMatrix<Real> > data(num_blocks);
|
||||
for (int32 b = 0; b < num_blocks; b++) {
|
||||
int32 dimM = 100 + rand() % 255, dimN = 10 + rand() % 20;
|
||||
if (b % 2 == 0) std::swap(dimM, dimN);
|
||||
data[b].Resize(dimM, dimN);
|
||||
data[b].SetRandn();
|
||||
}
|
||||
CuBlockMatrix<Real> B(data);
|
||||
|
||||
std::ostringstream os;
|
||||
bool binary = (i % 4 < 2);
|
||||
B.Write(os, binary);
|
||||
|
||||
CuBlockMatrix<Real> B2;
|
||||
std::istringstream is(os.str());
|
||||
B2.Read(is, binary);
|
||||
|
||||
CuMatrix<Real> mat(B), mat2(B2);
|
||||
AssertEqual(mat, mat2);
|
||||
if (!data.empty())
|
||||
KALDI_ASSERT(mat.Sum() != 0.0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuBlockMatrixAddMatBlock() {
|
||||
for (int32 i = 0; i < 20; i++) {
|
||||
int32 num_blocks = rand() % 5;
|
||||
std::vector<CuMatrix<Real> > data(num_blocks);
|
||||
for (int32 b = 0; b < num_blocks; b++) {
|
||||
int32 dimM = 100 + rand() % 255, dimN = 10 + rand() % 20;
|
||||
// early failures will have small dim for easier eyeballing.
|
||||
if (b % 2 == 0) std::swap(dimM, dimN);
|
||||
data[b].Resize(dimM, dimN);
|
||||
data[b].SetRandn();
|
||||
}
|
||||
CuBlockMatrix<Real> B(data);
|
||||
int32 B_num_rows = B.NumRows(), B_num_cols = B.NumCols();
|
||||
// will do X += A B
|
||||
|
||||
MatrixTransposeType transB = (i % 2 == 1 ? kTrans : kNoTrans),
|
||||
transA = (i % 3 == 1 ? kTrans : kNoTrans);
|
||||
if (transB == kTrans) std::swap(B_num_rows, B_num_cols);
|
||||
|
||||
int32 X_num_rows = 100 + rand() % 255, X_num_cols = B_num_cols,
|
||||
A_num_rows = X_num_rows, A_num_cols = B_num_rows;
|
||||
if (data.size() == 0) { X_num_rows = 0; A_num_rows = 0; }
|
||||
if (transA == kTrans) std::swap(A_num_rows, A_num_cols);
|
||||
|
||||
Real alpha = 2.0, beta = -1.0;
|
||||
CuMatrix<Real> X(X_num_rows, X_num_cols);
|
||||
X.SetRandn();
|
||||
CuMatrix<Real> A(A_num_rows, A_num_cols);
|
||||
A.SetRandn();
|
||||
|
||||
CuMatrix<Real> Xcopy(X), Bcopy(B), Xorig(X), Aorig(A);
|
||||
Xcopy.AddMatMat(alpha, A, transA, Bcopy, transB, beta);
|
||||
X.AddMatBlock(alpha, A, transA, B, transB, beta);
|
||||
|
||||
AssertEqual(X, Xcopy);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuBlockMatrixAddMatMat() {
|
||||
for (int32 i = 0; i < 20; i++) {
|
||||
int32 num_blocks = rand() % 5;
|
||||
std::vector<CuMatrix<Real> > data(num_blocks);
|
||||
for (int32 b = 0; b < num_blocks; b++) {
|
||||
int32 dimM = 100 + rand() % 255, dimN = 10 + rand() % 20;
|
||||
if (i == 0) { dimM = 1; dimN = 1; }
|
||||
// early failures will have small dim for easier eyeballing.
|
||||
if (b % 2 == 0) std::swap(dimM, dimN);
|
||||
data[b].Resize(dimM, dimN);
|
||||
data[b].SetRandn();
|
||||
}
|
||||
|
||||
CuBlockMatrix<Real> B(data);
|
||||
int32 B_num_rows = B.NumRows(), B_num_cols = B.NumCols();
|
||||
// will do B += C D
|
||||
|
||||
int32 C_num_rows = B_num_rows, C_num_cols = 100 + rand() % 255;
|
||||
if (C_num_rows == 0) C_num_cols = 0;
|
||||
int32 D_num_rows = C_num_cols, D_num_cols = B_num_cols;
|
||||
|
||||
MatrixTransposeType transC = (i % 2 == 1 ? kTrans : kNoTrans),
|
||||
transD = (i % 3 == 1 ? kTrans : kNoTrans);
|
||||
if (transC == kTrans) std::swap(C_num_rows, C_num_cols);
|
||||
if (transD == kTrans) std::swap(D_num_rows, D_num_cols);
|
||||
|
||||
CuMatrix<Real> C(C_num_rows, C_num_cols), D(D_num_rows, D_num_cols);
|
||||
C.SetRandn();
|
||||
D.SetRandn();
|
||||
|
||||
CuMatrix<Real> Bmat(B);
|
||||
|
||||
Real alpha = 2.0, beta = -1.0;
|
||||
|
||||
CuBlockMatrix<Real> Bcopy(B);
|
||||
|
||||
B.AddMatMat(alpha, C, transC, D, transD, beta);
|
||||
|
||||
Bmat.AddMatMat(alpha, C, transC, D, transD, beta);
|
||||
|
||||
|
||||
// Now check that the block-structured part of Bmat is the
|
||||
// same as B.
|
||||
Bcopy.CopyFromMat(Bmat); // copy block-structured part from Bmat to Bcopy.
|
||||
|
||||
if (!ApproxEqual(B, Bcopy)) {
|
||||
KALDI_WARN << "CuBlockMatrixTest failure, please report to maintainers: Bcopy = "
|
||||
<< Bcopy << ", B = " << B << ", C = " << C << ", D = " << D
|
||||
<< ", Bmat = " << B << " transD = " << transD << ", transC = "
|
||||
<< transC;
|
||||
KALDI_ERR << "Please give this log to the maintainers.";
|
||||
}
|
||||
KALDI_ASSERT(Bmat.Sum() != 0 || B_num_rows == 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real> void CuBlockMatrixUnitTest() {
|
||||
UnitTestCuBlockMatrixIO<Real>();
|
||||
UnitTestCuBlockMatrixAddMatBlock<Real>();
|
||||
UnitTestCuBlockMatrixAddMatMat<Real>();
|
||||
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
int main() {
|
||||
for (int32 loop = 0; loop < 2; loop++) {
|
||||
#if HAVE_CUDA == 1
|
||||
if (loop == 0)
|
||||
CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
|
||||
else
|
||||
CuDevice::Instantiate().SelectGpuId("yes"); // -2 .. automatic selection
|
||||
#endif
|
||||
|
||||
kaldi::CuBlockMatrixUnitTest<float>();
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().DoublePrecisionSupported()) {
|
||||
kaldi::CuBlockMatrixUnitTest<double>();
|
||||
} else {
|
||||
KALDI_WARN << "Double precision not supported";
|
||||
}
|
||||
#else
|
||||
kaldi::CuBlockMatrixUnitTest<double>();
|
||||
#endif
|
||||
if (loop == 0)
|
||||
KALDI_LOG << "Tests without GPU use succeeded.\n";
|
||||
else
|
||||
KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
|
||||
}
|
||||
#if HAVE_CUDA == 1
|
||||
CuDevice::Instantiate().PrintProfile();
|
||||
#endif
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,330 @@
|
|||
// cudamatrix/cu-block-matrix.cc
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <cublas.h>
|
||||
#endif
|
||||
|
||||
#include "util/timer.h"
|
||||
#include "cudamatrix/cu-block-matrix.h"
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
template<class Real>
|
||||
CuBlockMatrix<Real>::CuBlockMatrix() {
|
||||
#if HAVE_CUDA == 1
|
||||
cu_data_ = NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
CuBlockMatrix<Real>::CuBlockMatrix(const std::vector<CuMatrix<Real> >&data) {
|
||||
#if HAVE_CUDA == 1
|
||||
cu_data_ = NULL;
|
||||
#endif
|
||||
block_data_.resize(data.size());
|
||||
MatrixIndexT row_offset = 0, col_offset = 0, max_num_rows = 0;
|
||||
for (size_t b = 0; b < data.size(); b++) {
|
||||
MatrixIndexT num_rows = data[b].NumRows(), num_cols = data[b].NumCols();
|
||||
KALDI_ASSERT(num_rows > 0 && num_cols > 0);
|
||||
BlockMatrixData block_data;
|
||||
block_data.num_rows = num_rows;
|
||||
block_data.num_cols = num_cols;
|
||||
block_data.row_offset = row_offset;
|
||||
block_data.col_offset = col_offset;
|
||||
row_offset += num_rows;
|
||||
col_offset += num_cols;
|
||||
max_num_rows = std::max(max_num_rows, num_rows);
|
||||
block_data_[b] = block_data;
|
||||
}
|
||||
num_rows_ = row_offset;
|
||||
data_.Resize(max_num_rows, col_offset);
|
||||
for (int32 b = 0; b < NumBlocks(); b++)
|
||||
Block(b).CopyFromMat(data[b]);
|
||||
SetCudaData();
|
||||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
const CuSubMatrix<Real> CuBlockMatrix<Real>::Block(int32 b) const {
|
||||
KALDI_ASSERT(static_cast<size_t>(b) < block_data_.size());
|
||||
const BlockMatrixData &block_data = block_data_[b];
|
||||
return CuSubMatrix<Real>(data_, 0, block_data.num_rows,
|
||||
block_data.col_offset, block_data.num_cols);
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
CuSubMatrix<Real> CuBlockMatrix<Real>::Block(int32 b) {
|
||||
KALDI_ASSERT(static_cast<size_t>(b) < block_data_.size());
|
||||
BlockMatrixData &block_data = block_data_[b];
|
||||
return CuSubMatrix<Real>(data_, 0, block_data.num_rows,
|
||||
block_data.col_offset, block_data.num_cols);
|
||||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
CuBlockMatrix<Real>::CuBlockMatrix(const CuBlockMatrix<Real> &other):
|
||||
data_(other.data_), block_data_(other.block_data_), num_rows_(other.num_rows_) {
|
||||
#if HAVE_CUDA == 1
|
||||
cu_data_ = NULL;
|
||||
#endif
|
||||
SetCudaData();
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
CuBlockMatrix<Real> &CuBlockMatrix<Real>::operator =(const CuBlockMatrix<Real> &other) {
|
||||
FreeCudaData();
|
||||
data_ = other.data_;
|
||||
block_data_ = other.block_data_;
|
||||
num_rows_ = other.num_rows_;
|
||||
SetCudaData();
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
void CuBlockMatrix<Real>::FreeCudaData() {
|
||||
#if HAVE_CUDA == 1
|
||||
if (cu_data_ != NULL) {
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
CuDevice::Instantiate().Free(cu_data_);
|
||||
cu_data_ = NULL;
|
||||
} else {
|
||||
KALDI_ERR << "CuBlockMatrix: you have CUDA data pointer but "
|
||||
<< "no GPU is enabled: likely code error.";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
void CuBlockMatrix<Real>::SetCudaData() {
|
||||
#if HAVE_CUDA == 1
|
||||
KALDI_ASSERT(cu_data_ == NULL);
|
||||
if (block_data_.size() == 0) return; // Nothing to do.
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
std::vector<CuBlockMatrixData> tmp_cu_data(NumBlocks());
|
||||
int32 row_offset = 0, col_offset = 0;
|
||||
for (size_t b = 0; b < NumBlocks(); b++) {
|
||||
CuSubMatrix<Real> this_mat = Block(b);
|
||||
CuBlockMatrixData &this_cu_data = tmp_cu_data[b];
|
||||
this_cu_data.row_offset = row_offset;
|
||||
this_cu_data.col_offset = col_offset;
|
||||
this_cu_data.matrix_dim = this_mat.Dim();
|
||||
this_cu_data.matrix_data = static_cast<void*>(this_mat.Data());
|
||||
row_offset += this_mat.NumRows();
|
||||
col_offset += this_mat.NumCols();
|
||||
}
|
||||
size_t size = NumBlocks() * sizeof(CuBlockMatrixData);
|
||||
cu_data_ = static_cast<CuBlockMatrixData*>(
|
||||
CuDevice::Instantiate().Malloc(size));
|
||||
CU_SAFE_CALL(cudaMemcpy(cu_data_, &(tmp_cu_data[0]), size, cudaMemcpyHostToDevice));
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
void CuBlockMatrix<Real>::Swap(CuBlockMatrix<Real> *other) {
|
||||
data_.Swap(&other->data_);
|
||||
block_data_.swap(other->block_data_);
|
||||
std::swap(num_rows_, other->num_rows_);
|
||||
#if HAVE_CUDA == 1
|
||||
std::swap(cu_data_, other->cu_data_);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
void CuBlockMatrix<Real>::Write(std::ostream &os, bool binary) const {
|
||||
WriteToken(os, binary, "<CuBlockMatrix>");
|
||||
int32 num_blocks = NumBlocks();
|
||||
WriteBasicType(os, binary, num_blocks);
|
||||
for (int32 b = 0; b < num_blocks; b++)
|
||||
this->Block(b).Write(os, binary);
|
||||
WriteToken(os, binary, "</CuBlockMatrix>");
|
||||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
void CuBlockMatrix<Real>::Read(std::istream &is, bool binary) {
|
||||
Destroy();
|
||||
int i = Peek(is, binary);
|
||||
std::vector<CuMatrix<Real> > data;
|
||||
if (i != static_cast<int>('<')) {
|
||||
// back-compatibility code so we can read the older format of
|
||||
// MixtureProbComponent. This code should be deleted eventually.
|
||||
int32 size;
|
||||
ReadBasicType(is, binary, &size);
|
||||
KALDI_ASSERT(size >= 0);
|
||||
data.resize(size);
|
||||
for (int32 i = 0; i < size; i++)
|
||||
data[i].Read(is, binary);
|
||||
} else {
|
||||
ExpectToken(is, binary, "<CuBlockMatrix>");
|
||||
int32 size;
|
||||
ReadBasicType(is, binary, &size);
|
||||
KALDI_ASSERT(size >= 0);
|
||||
data.resize(size);
|
||||
for (int32 i = 0; i < size; i++)
|
||||
data[i].Read(is, binary);
|
||||
ExpectToken(is, binary, "</CuBlockMatrix>");
|
||||
}
|
||||
|
||||
CuBlockMatrix<Real> block_mat(data); // initializer from std::vector<CuMatrix<Real> > does
|
||||
// the main job of initialization.
|
||||
this->Swap(&block_mat);
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
void CuBlockMatrix<Real>::Destroy() {
|
||||
data_.Resize(0, 0);
|
||||
block_data_.clear();
|
||||
num_rows_ = 0;
|
||||
FreeCudaData();
|
||||
}
|
||||
|
||||
// Does *this = alpha A B + beta * *this, discarding elements outside
|
||||
// the block structure of the *this matrix.
|
||||
template<class Real>
|
||||
void CuBlockMatrix<Real>::AddMatMat(
|
||||
BaseFloat alpha,
|
||||
const CuMatrix<Real> &A, MatrixTransposeType transA,
|
||||
const CuMatrix<Real> &B, MatrixTransposeType transB,
|
||||
BaseFloat beta) {
|
||||
MatrixIndexT A_num_rows = A.NumRows(), A_num_cols = A.NumCols(),
|
||||
A_row_stride = A.Stride(), A_col_stride = 1,
|
||||
B_num_rows = B.NumRows(), B_num_cols = B.NumCols(),
|
||||
B_row_stride = B.Stride(), B_col_stride = 1;
|
||||
if (transA == kTrans) {
|
||||
std::swap(A_num_rows, A_num_cols);
|
||||
std::swap(A_row_stride, A_col_stride);
|
||||
}
|
||||
if (transB == kTrans) {
|
||||
std::swap(B_num_rows, B_num_cols);
|
||||
std::swap(B_row_stride, B_col_stride);
|
||||
}
|
||||
KALDI_ASSERT(A_num_rows == NumRows() && B_num_cols == NumCols()
|
||||
&& A_num_cols == B_num_rows);
|
||||
if (NumBlocks() == 0) return; // empty matrix.
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
// (x,y,z) dimensions are (block-id, row-of-block, col-of-block)
|
||||
// First some logic to choose block dims...
|
||||
// we assume (which we can, safely) that CU1DBLOCK is <= the max threads per block.
|
||||
int32 x_blocksize = std::min(CU1DBLOCK, NumBlocks()); // x dim corresponds to block-idx.
|
||||
int32 max_block_rows = MaxBlockRows(), max_block_cols = MaxBlockCols();
|
||||
int32 y_blocksize = max_block_rows;
|
||||
while (y_blocksize * x_blocksize > CU1DBLOCK || y_blocksize > CU2DBLOCK)
|
||||
y_blocksize--;
|
||||
int32 z_blocksize = max_block_cols;
|
||||
while (z_blocksize * x_blocksize * y_blocksize > CU1DBLOCK || z_blocksize > CU2DBLOCK)
|
||||
z_blocksize--;
|
||||
|
||||
dim3 dimBlock(x_blocksize, y_blocksize, z_blocksize);
|
||||
dim3 dimGrid(n_blocks(NumBlocks(), x_blocksize),
|
||||
n_blocks(max_block_rows, y_blocksize),
|
||||
n_blocks(max_block_cols, z_blocksize));
|
||||
cuda_block_add_mat_mat(dimGrid, dimBlock, cu_data_, NumBlocks(),
|
||||
A.Data(), A_num_cols, A_row_stride, A_col_stride,
|
||||
B.Data(), B_row_stride, B_col_stride, alpha, beta);
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
int32 row_offset = 0, col_offset = 0;
|
||||
for (MatrixIndexT b = 0; b < NumBlocks(); b++) {
|
||||
CuSubMatrix<Real> this_block = Block(b);
|
||||
MatrixIndexT this_num_rows = this_block.NumRows(),
|
||||
this_num_cols = this_block.NumCols();
|
||||
CuSubMatrix<Real> A_part = (transA == kNoTrans ?
|
||||
A.Range(row_offset, this_num_rows,
|
||||
0, A.NumCols()) :
|
||||
A.Range(0, A.NumRows(),
|
||||
row_offset, this_num_rows)),
|
||||
B_part = (transB == kNoTrans ?
|
||||
B.Range(0, B.NumRows(),
|
||||
col_offset, this_num_cols) :
|
||||
B.Range(col_offset, this_num_cols,
|
||||
0, B.NumCols()));
|
||||
this_block.AddMatMat(alpha, A_part, transA, B_part, transB, beta);
|
||||
row_offset += this_num_rows;
|
||||
col_offset += this_num_cols;
|
||||
}
|
||||
KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
|
||||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
MatrixIndexT CuBlockMatrix<Real>::MaxBlockCols() const {
|
||||
MatrixIndexT max_cols = 0;
|
||||
for (size_t i = 0; i < block_data_.size(); i++)
|
||||
max_cols = std::max(max_cols, block_data_[i].num_cols);
|
||||
return max_cols;
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
MatrixIndexT CuBlockMatrix<Real>::MaxBlockRows() const {
|
||||
return data_.NumRows();
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
void CuBlockMatrix<Real>::CopyFromMat(const CuMatrix<Real> &M) {
|
||||
KALDI_ASSERT(NumRows() == M.NumRows() && NumCols() == M.NumCols());
|
||||
MatrixIndexT row_offset = 0, col_offset = 0;
|
||||
for (MatrixIndexT b = 0; b < NumBlocks(); b++) {
|
||||
CuSubMatrix<Real> this_block = Block(b);
|
||||
MatrixIndexT this_num_rows = this_block.NumRows(),
|
||||
this_num_cols = this_block.NumCols();
|
||||
const CuSubMatrix<Real> src(M, row_offset, this_num_rows,
|
||||
col_offset, this_num_cols);
|
||||
this_block.CopyFromMat(src);
|
||||
row_offset += this_num_rows;
|
||||
col_offset += this_num_cols;
|
||||
}
|
||||
KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
|
||||
}
|
||||
|
||||
/**
|
||||
* Print the matrix to stream
|
||||
*/
|
||||
template<typename Real>
|
||||
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<Real> &mat) {
|
||||
bool binary = false;
|
||||
mat.Write(out, binary);
|
||||
return out;
|
||||
}
|
||||
// instantiate the template
|
||||
template
|
||||
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<float> &mat);
|
||||
template
|
||||
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<double> &mat);
|
||||
|
||||
// Instantiate the class for float and double.
|
||||
template class CuBlockMatrix<float>;
|
||||
template class CuBlockMatrix<double>;
|
||||
|
||||
} // namespace kaldi
|
|
@ -0,0 +1,150 @@
|
|||
// cudamatrix/cu-block-matrix.h
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABILITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
|
||||
#ifndef KALDI_CUDAMATRIX_CU_BLOCK_MATRIX_H_
|
||||
#define KALDI_CUDAMATRIX_CU_BLOCK_MATRIX_H_
|
||||
|
||||
#include <sstream>
|
||||
|
||||
#include <vector>
|
||||
#include "cudamatrix/cu-common.h"
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
/**
   The class CuBlockMatrix holds a vector of objects of type CuMatrix,
   say, M_1, M_2, .. M_N, and it represents the block-diagonal matrix
   diag(M_1, M_2, ... M_N).  Note: the individual matrices do not have to
   be square.  The main reason the class is needed is so that we can
   efficiently multiply by this block-diagonal structure in a parallel way.

   If we have a GPU available, CuBlockMatrix will store a copy of the
   individual CuMatrix quantities M_1 .. M_N on the GPU, but their
   'primary' home remains on the CPU; what we mean by this is that while
   the data itself lives on the GPU, the "primary" version of the Matrix
   object that holds the pointers remains on the CPU, and we just copy it
   over to the GPU whenever it is changed.
*/
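
// A hypothetical usage sketch (illustrative only, not part of this header):
//
//   std::vector<CuMatrix<BaseFloat> > parts(2);
//   parts[0].Resize(3, 4);                   // block M_1
//   parts[1].Resize(2, 5);                   // block M_2
//   CuBlockMatrix<BaseFloat> blocks(parts);  // represents diag(M_1, M_2), size 5 x 9
//   CuMatrix<BaseFloat> A(5, 7), B(7, 9);
//   // blocks := 1.0 * A * B, keeping only entries inside the block structure:
//   blocks.AddMatMat(1.0, A, kNoTrans, B, kNoTrans, 0.0);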

template<typename Real>
class CuBlockMatrix {
 public:
  friend class CuMatrixBase<Real>;

  CuBlockMatrix();

  CuBlockMatrix(const std::vector<CuMatrix<Real> > &data);

  ~CuBlockMatrix() { Destroy(); }

  /// Copy constructor
  CuBlockMatrix(const CuBlockMatrix &other);

  /// Assignment operator
  CuBlockMatrix &operator= (const CuBlockMatrix &other);

  void Write(std::ostream &os, bool binary) const;

  void Read(std::istream &is, bool binary);

  MatrixIndexT NumRows() const { return num_rows_; }

  MatrixIndexT NumCols() const { return data_.num_cols_; }

  MatrixIndexT NumBlocks() const { return block_data_.size(); }

  // Returns max num-columns of any block
  MatrixIndexT MaxBlockCols() const;

  // Returns max num-rows of any block
  MatrixIndexT MaxBlockRows() const;

  const CuSubMatrix<Real> Block(MatrixIndexT b) const;

  CuSubMatrix<Real> Block(MatrixIndexT b); // returns a sub-matrix (not a full
                                           // CuMatrix) to disallow resizes.

  /// Does *this = alpha A B + beta * *this, discarding elements of the product
  /// outside the block structure of the *this matrix. The transA and transB
  /// parameters can be used to substitute A^T for A and B^T for B, respectively.
  void AddMatMat(BaseFloat alpha,
                 const CuMatrix<Real> &A, MatrixTransposeType transA,
                 const CuMatrix<Real> &B, MatrixTransposeType transB,
                 BaseFloat beta);

  /// Copies elements within the block structure from matrix M, discarding others.
  /// Note: this has not been implemented in a very efficient way, it's used only
  /// for testing.
  void CopyFromMat(const CuMatrix<Real> &M);

  /// Normalizes the columns of *this so that each one sums to one.
  /// On error (e.g. inf's), will set the column to a constant value that
  /// sums to one.
  void NormalizeColumns();

  void Swap(CuBlockMatrix *other);

 protected:
  CuMatrix<Real> data_; // This is a single matrix into which
                        // we pack all the blocks (possibly with spaces left over)

  struct BlockMatrixData {
    MatrixIndexT num_rows;
    MatrixIndexT num_cols;
    MatrixIndexT row_offset;
    MatrixIndexT col_offset;
  };

#if HAVE_CUDA == 1
  const CuBlockMatrixData* CuData() const { return cu_data_; }
#endif
 private:

  /// If using GPU and cu_data_ != NULL, free cu_data_ and set it to NULL
  void FreeCudaData();
  /// If using GPU, allocate and set cu_data_ on the GPU to reflect "data_".
  void SetCudaData();

  /// Frees and deinitializes everything.
  void Destroy();

  std::vector<BlockMatrixData> block_data_;

  MatrixIndexT num_rows_; // sum of num_rows of elements of block_data_.
#if HAVE_CUDA == 1
  CuBlockMatrixData *cu_data_; // We store the pointers and some additional info
                               // on the GPU card in a form more suited to
                               // use by CUDA kernels.
#endif
}; // class CuBlockMatrix

template<typename Real>
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<Real> &mat);

} // namespace kaldi

#endif
|
|
@ -0,0 +1,53 @@
|
|||
// cudamatrix/cu-choleskykernel-ansi.h
|
||||
|
||||
// Copyright 2010-2013  Dr. Stephan Kramer
|
||||
// Institut für Numerische und Angewandte Mathematik
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABILITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_CUDAMATRIX_CU_CHOLESKYKERNELS_ANSI_H_
|
||||
#define KALDI_CUDAMATRIX_CU_CHOLESKYKERNELS_ANSI_H_
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "cudamatrix/cu-matrixdim.h"
|
||||
|
||||
#if HAVE_CUDA == 1

extern "C" {

/*********************************************************
 * float CUDA kernel calls
 */
void cudaF_factorize_diagonal_block(float* A, int block_offset, MatrixDim d);
void cudaF_strip_update(float* A, int block_offset, int n_remaining_blocks, MatrixDim d);
void cudaF_diag_update(float* A, int block_offset, int n_remaining_blocks, MatrixDim d);
void cudaF_lo_update(float* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d);

/*********************************************************
 * double CUDA kernel calls
 */
void cudaD_factorize_diagonal_block(double* A, int block_offset, MatrixDim d);
void cudaD_strip_update(double* A, int block_offset, int n_remaining_blocks, MatrixDim d);
void cudaD_diag_update(double* A, int block_offset, int n_remaining_blocks, MatrixDim d);
void cudaD_lo_update(double* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d);
}

#endif // HAVE_CUDA

#endif
|
|
@ -0,0 +1,359 @@
|
|||
// cudamatrix/cu-choleskykernel.cu
|
||||
|
||||
// Copyright 2010-2013 Dr. Stephan Kramer
|
||||
// Institut für Numerische und Angewandte Mathematik
|
||||
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABILITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "cudamatrix/cu-choleskykernels-ansi.h"
|
||||
#include <stdio.h>
|
||||
|
||||
|
||||
#define TILE_SIZE 16
|
||||
|
||||
/***********************************************************************
|
||||
* CUDA kernels
|
||||
* some functions are templated to have the float/double operations
|
||||
*/
|
||||
__device__ int lex_index_2D (int r, int c, int row_length) {
|
||||
return c + r*row_length;
|
||||
}
|
||||
|
||||
|
||||
__device__ int global_pos(int t_pos, int block_offset) {
|
||||
return t_pos + TILE_SIZE*block_offset;
|
||||
}
|
||||
|
||||
|
||||
__device__ float inv_sqrt(float x) {
|
||||
return rsqrtf(x);
|
||||
}
|
||||
|
||||
|
||||
__device__ double inv_sqrt(double x) {
|
||||
return rsqrt(x);
|
||||
}
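
// The four kernels below together implement one step of a blocked (tiled)
// Cholesky factorization on TILE_SIZE x TILE_SIZE tiles of a matrix stored
// with row stride d.stride (a summary of the code that follows):
//   __factorize_diagonal_block : in-place Cholesky of the current diagonal tile;
//   __strip_update             : triangular solves forming the panel of tiles
//                                below the current diagonal tile;
//   __diag_update              : rank-TILE_SIZE update of the remaining diagonal tiles;
//   __lo_update                : corresponding update of the remaining
//                                off-diagonal (lower) tiles.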
|
||||
|
||||
|
||||
template<typename T>
|
||||
__global__
|
||||
void __factorize_diagonal_block(T* A, int block_offset, MatrixDim d) {
|
||||
int global_row_length = d.stride;
|
||||
|
||||
int col = threadIdx.x;
|
||||
int row = threadIdx.y;
|
||||
|
||||
int global_row = global_pos(row,block_offset);
|
||||
int global_col = global_pos(col,block_offset);
|
||||
|
||||
if ((global_row >= d.cols) || (global_col >= d.cols))
|
||||
return;
|
||||
|
||||
int k_max = TILE_SIZE;
|
||||
if (d.cols - global_pos(0,block_offset) < TILE_SIZE)
|
||||
k_max = d.cols % TILE_SIZE;
|
||||
|
||||
|
||||
int idx = lex_index_2D(global_row, global_col, global_row_length);
|
||||
|
||||
__shared__ T L[TILE_SIZE][TILE_SIZE+1];
|
||||
|
||||
L[row][col] = 0;
|
||||
L[row][col] = A[idx];
|
||||
__syncthreads();
|
||||
|
||||
if ((row >= k_max) || (col >= k_max))
|
||||
return;
|
||||
|
||||
|
||||
T fac;
|
||||
|
||||
for (int k = 0; k < k_max; k++) {
|
||||
__syncthreads();
|
||||
fac = inv_sqrt(L[k][k]);
|
||||
__syncthreads();
|
||||
|
||||
if ((row==k)&&(col>=k))
|
||||
L[col][row] = (L[col][row])*fac;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if ((row>=col)&&(col>k))
|
||||
L[row][col] = L[row][col] - L[col][k]*L[row][k];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
|
||||
if (row >= col) {
|
||||
A[idx] = L[row][col];
|
||||
if (A[idx] > 100000)
|
||||
A[idx] = 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
__global__
|
||||
void __strip_update(T* A, int block_offset, MatrixDim d) {
|
||||
int global_row_length = d.stride;
|
||||
|
||||
int boffy = block_offset;
|
||||
int boffx = blockIdx.x + boffy + 1;
|
||||
|
||||
int col = threadIdx.x;
|
||||
int row = threadIdx.y;
|
||||
|
||||
__shared__ T topleft[TILE_SIZE][TILE_SIZE+1];
|
||||
__shared__ T workingmat[TILE_SIZE][TILE_SIZE+1];
|
||||
|
||||
int global_row = global_pos(row,block_offset);
|
||||
int global_col = global_pos(col,block_offset);
|
||||
|
||||
if ((global_row >= d.cols) || (global_col >= d.cols))
|
||||
return;
|
||||
|
||||
int idx = lex_index_2D(global_row, global_col, global_row_length);
|
||||
|
||||
topleft[row][col] = 0;
|
||||
topleft[row][col] = A[idx];
|
||||
//__syncthreads();
|
||||
|
||||
global_row = global_pos(row,boffx);
|
||||
|
||||
if (global_row >= d.cols)
|
||||
return;
|
||||
|
||||
int idx_w = lex_index_2D(global_row, global_col, global_row_length);
|
||||
//int row2 = row + block_offset * TILE_SIZE;
|
||||
//int idx_w = row2 + col*global_row_length;
|
||||
workingmat[col][row]=0;
|
||||
workingmat[col][row]=A[idx_w];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (row==0) {
|
||||
for (int k = 0; k < TILE_SIZE; k++) {
|
||||
T sum=0.0;
|
||||
for (int m = 0; m < k; m++)
|
||||
sum = sum + topleft[k][m]*workingmat[m][col];
|
||||
|
||||
workingmat[k][col] = (workingmat[k][col] - sum) / topleft[k][k];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
A[idx_w] = workingmat[col][row];
|
||||
if (A[idx_w] > 100000)
|
||||
A[idx_w] = 1;
|
||||
//A[idx_w] = 1;
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
__global__
|
||||
void __diag_update(T* A, int block_offset, MatrixDim d) {
|
||||
int global_row_length = d.stride;
|
||||
int boffx = blockIdx.x + block_offset + 1;
|
||||
|
||||
int col = threadIdx.x;
|
||||
int row = threadIdx.y;
|
||||
|
||||
int global_row = global_pos(row,boffx);
|
||||
int global_col = global_pos(col,block_offset);
|
||||
|
||||
if ((global_row >= d.cols) || (global_col >= d.cols))
|
||||
return;
|
||||
|
||||
int idx = lex_index_2D(global_row, global_col, global_row_length);
|
||||
|
||||
__shared__ T left[TILE_SIZE][TILE_SIZE+1];
|
||||
|
||||
left[row][col] = 0;
|
||||
left[row][col] = A[idx];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
T sum = 0.0;
|
||||
|
||||
|
||||
if (row >= col) {
|
||||
for (int kk = 0; kk < TILE_SIZE; kk++)
|
||||
sum = sum + left[row][kk]*left[col][kk];
|
||||
|
||||
//__syncthreads();
|
||||
|
||||
global_col = global_pos(col, boffx);
|
||||
|
||||
if (global_col >= d.cols)
|
||||
return;
|
||||
|
||||
idx = lex_index_2D(global_row, global_col, global_row_length);
|
||||
|
||||
A[idx] = A[idx] - sum;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
__global__
|
||||
void __lo_update(T* A, int block_offset, int n_blocks, MatrixDim d) {
|
||||
int global_row_length = d.stride;
|
||||
int col = threadIdx.x;
|
||||
int row = threadIdx.y;
|
||||
|
||||
int boffy = blockIdx.y + block_offset + 1;
|
||||
//int boffx = boffy + 1;
|
||||
int boffx = boffy + 1;
|
||||
|
||||
__shared__ T left[TILE_SIZE][TILE_SIZE];
|
||||
|
||||
__shared__ T upt[TILE_SIZE][TILE_SIZE + 1];
|
||||
|
||||
int global_row = global_pos(row,boffy);
|
||||
int global_col_src = global_pos(col,block_offset);
|
||||
|
||||
if ((global_row >= d.cols) || (global_col_src >= d.cols))
|
||||
return;
|
||||
|
||||
int idx = lex_index_2D(global_row, global_col_src, global_row_length);
|
||||
|
||||
upt[row][col] = 0;
|
||||
upt[row][col] = A[idx];
|
||||
__syncthreads();
|
||||
|
||||
for (; boffx < n_blocks; boffx++) {
|
||||
global_row = global_pos(row,boffx);
|
||||
|
||||
if (global_row >= d.cols)
|
||||
return;
|
||||
|
||||
idx = lex_index_2D(global_row, global_col_src, global_row_length);
|
||||
|
||||
left[row][col] = 0;
|
||||
left[row][col] = A[idx];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (global_row >= d.cols)
|
||||
return;
|
||||
|
||||
T matrixprod = 0.0;
|
||||
|
||||
for (int kk = 0; kk < TILE_SIZE; kk++)
|
||||
matrixprod += left[row][kk]*upt[col][kk];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
int global_col = global_pos(col,boffy);
|
||||
if (global_col >= d.cols)
|
||||
return;
|
||||
|
||||
idx = lex_index_2D(global_row, global_col, global_row_length);
|
||||
A[idx] = A[idx] - matrixprod;
|
||||
}
|
||||
}
|
||||
|
||||
/***********************************************************************
|
||||
* ANSI-C wrappers of CUDA kernels
|
||||
*/
|
||||
|
||||
/*
|
||||
* float
|
||||
*/
|
||||
|
||||
void cudaF_factorize_diagonal_block(float* A, int block_offset, MatrixDim d) {
|
||||
dim3 threads(TILE_SIZE,TILE_SIZE);
|
||||
__factorize_diagonal_block<<<1,threads>>>(A,block_offset,d);
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
|
||||
void cudaF_strip_update(float* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
|
||||
dim3 threads(TILE_SIZE,TILE_SIZE);
|
||||
if (n_remaining_blocks >= 2) {
|
||||
dim3 stripgrid(n_remaining_blocks-1);
|
||||
__strip_update<<<stripgrid,threads>>>(A,block_offset,d);
|
||||
cudaThreadSynchronize();
|
||||
} else {
|
||||
int stripgrid = 1;
|
||||
__strip_update<<<stripgrid,threads>>>(A,block_offset,d);
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
}
|
||||
|
||||
void cudaF_diag_update(float* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
|
||||
dim3 threads(TILE_SIZE,TILE_SIZE);
|
||||
if (n_remaining_blocks >= 2) {
|
||||
dim3 diaggrid(n_remaining_blocks-1);
|
||||
__diag_update<<<diaggrid,threads>>>(A,block_offset,d);
|
||||
cudaThreadSynchronize();
|
||||
} else {
|
||||
int diaggrid = 1;
|
||||
__diag_update<<<diaggrid,threads>>>(A,block_offset,d);
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
}
|
||||
|
||||
void cudaF_lo_update(float* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) {
|
||||
dim3 logrid;
|
||||
logrid.x = 1;
|
||||
logrid.y = n_remaining_blocks-2;
|
||||
dim3 threads(TILE_SIZE,TILE_SIZE);
|
||||
__lo_update<<<logrid,threads>>>(A,block_offset,n_blocks,d);
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
/*
|
||||
* double
|
||||
*/
|
||||
void cudaD_factorize_diagonal_block(double* A, int block_offset, MatrixDim d) {
|
||||
dim3 threads(TILE_SIZE,TILE_SIZE);
|
||||
__factorize_diagonal_block<<<1,threads>>>(A,block_offset,d);
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
|
||||
void cudaD_strip_update(double* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
|
||||
dim3 threads(TILE_SIZE,TILE_SIZE);
|
||||
if (n_remaining_blocks >= 2) {
|
||||
dim3 stripgrid(n_remaining_blocks-1);
|
||||
__strip_update<<<stripgrid,threads>>>(A,block_offset,d);
|
||||
cudaThreadSynchronize();
|
||||
} else {
|
||||
int stripgrid = 1;
|
||||
__strip_update<<<stripgrid,threads>>>(A,block_offset,d);
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
}
|
||||
|
||||
void cudaD_diag_update(double* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
|
||||
dim3 threads(TILE_SIZE,TILE_SIZE);
|
||||
if (n_remaining_blocks >= 2) {
|
||||
dim3 diaggrid(n_remaining_blocks-1);
|
||||
__diag_update<<<diaggrid,threads>>>(A,block_offset,d);
|
||||
cudaThreadSynchronize();
|
||||
} else {
|
||||
int diaggrid = 1;
|
||||
__diag_update<<<diaggrid,threads>>>(A,block_offset,d);
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
}
|
||||
|
||||
void cudaD_lo_update(double* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) {
|
||||
dim3 logrid;
|
||||
logrid.x = 1;
|
||||
logrid.y = n_remaining_blocks-2;
|
||||
dim3 threads(TILE_SIZE,TILE_SIZE);
|
||||
__lo_update<<<logrid,threads>>>(A,block_offset,n_blocks,d);
|
||||
cudaThreadSynchronize();
|
||||
}
|
|
@ -0,0 +1,62 @@
|
|||
// cudamatrix/cu-choleskykernel.h
|
||||
|
||||
// Copyright 2010-2013 Dr. Stephan Kramer
|
||||
// Institut für Numerische und Angewandte Mathematik
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABILITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_CUDAMATRIX_CU_CHOLESKYKERNELS_H_
|
||||
#define KALDI_CUDAMATRIX_CU_CHOLESKYKERNELS_H_
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
|
||||
#include "base/kaldi-error.h"
|
||||
#include "cudamatrix/cu-choleskykernels-ansi.h"
|
||||
|
||||
/*
|
||||
* In this file are C++ templated wrappers
|
||||
* of the ANSI-C CUDA kernels
|
||||
*/
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
/*********************************************************
|
||||
* base templates
|
||||
*/
|
||||
template<typename Real> inline void cuda_factorize_diagonal_block(Real* A, int block_offset, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_strip_update(Real* A, int block_offset, int n_remaining_blocks, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_diag_update(Real* A, int block_offset, int n_remaining_blocks, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_lo_update(Real* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
/*********************************************************
|
||||
* float specialization
|
||||
*/
|
||||
template<> inline void cuda_factorize_diagonal_block<float>(float* A, int block_offset, MatrixDim d) { cudaF_factorize_diagonal_block(A,block_offset,d); }
|
||||
template<> inline void cuda_strip_update<float>(float* A, int block_offset, int n_remaining_blocks, MatrixDim d) { cudaF_strip_update(A,block_offset,n_remaining_blocks,d); }
|
||||
template<> inline void cuda_diag_update<float>(float* A, int block_offset, int n_remaining_blocks, MatrixDim d) { cudaF_diag_update(A,block_offset,n_remaining_blocks,d); }
|
||||
template<> inline void cuda_lo_update<float>(float* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) { cudaF_lo_update(A,block_offset,n_blocks,n_remaining_blocks,d); }
|
||||
/*********************************************************
|
||||
* double specialization
|
||||
*/
|
||||
template<> inline void cuda_factorize_diagonal_block<double>(double* A, int block_offset, MatrixDim d) { cudaD_factorize_diagonal_block(A,block_offset,d); }
|
||||
template<> inline void cuda_strip_update<double>(double* A, int block_offset, int n_remaining_blocks, MatrixDim d) { cudaD_strip_update(A,block_offset,n_remaining_blocks,d); }
|
||||
template<> inline void cuda_diag_update<double>(double* A, int block_offset, int n_remaining_blocks, MatrixDim d) { cudaD_diag_update(A,block_offset,n_remaining_blocks,d); }
|
||||
template<> inline void cuda_lo_update<double>(double* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) { cudaD_lo_update(A,block_offset,n_blocks,n_remaining_blocks,d); }
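
// A hypothetical host-side driver (illustrative sketch only, not part of this
// header): it shows the order in which a blocked, right-looking Cholesky
// factorization would call the wrappers above. Here n_blocks is the number of
// TILE_SIZE-wide tiles covering the matrix; the exact loop bounds are an
// assumption, not a description of the actual Kaldi caller.
//
//   template<typename Real>
//   void BlockedCholesky(Real *A, int n_blocks, MatrixDim d) {
//     for (int i = n_blocks; i > 2; i--) {
//       int offset = n_blocks - i;                    // index of the current diagonal tile
//       cuda_factorize_diagonal_block(A, offset, d);  // L_kk = chol(A_kk)
//       cuda_strip_update(A, offset, i, d);           // solve for the panel below L_kk
//       cuda_diag_update(A, offset, i, d);            // update remaining diagonal tiles
//       cuda_lo_update(A, offset, n_blocks, i, d);    // update remaining off-diagonal tiles
//     }
//     if (n_blocks > 1) {
//       cuda_factorize_diagonal_block(A, n_blocks - 2, d);
//       cuda_strip_update(A, n_blocks - 2, 2, d);
//       cuda_diag_update(A, n_blocks - 2, 2, d);
//     }
//     cuda_factorize_diagonal_block(A, n_blocks - 1, d);
//   }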
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // HAVE_CUDA
|
||||
|
||||
#endif
|
|
@ -0,0 +1,32 @@
|
|||
#ifndef KALDI_CUDAMATRIX_COMMON_H_
|
||||
#define KALDI_CUDAMATRIX_COMMON_H_
|
||||
|
||||
// This file contains some #includes, forward declarations
|
||||
// and typedefs that are needed by all the main header
|
||||
// files in this directory.
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "matrix/kaldi-blas.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
#include "cudamatrix/cu-common.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
cublasOperation_t KaldiTransToCuTrans(MatrixTransposeType kaldi_trans) {
|
||||
cublasOperation_t cublas_trans;
|
||||
|
||||
if (kaldi_trans == kNoTrans)
|
||||
cublas_trans = CUBLAS_OP_N;
|
||||
else if (kaldi_trans == kTrans)
|
||||
cublas_trans = CUBLAS_OP_T;
|
||||
else
|
||||
cublas_trans = CUBLAS_OP_C;
|
||||
return cublas_trans;
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
#endif // KALDI_CUDAMATRIX_COMMON_H_
|
|
@ -22,20 +22,20 @@
|
|||
|
||||
#ifndef KALDI_CUDAMATRIX_CU_COMMON_H_
|
||||
#define KALDI_CUDAMATRIX_CU_COMMON_H_
|
||||
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
|
||||
#include "cudamatrix/cu-matrixdim.h" // for CU1DBLOCK and CU2DBLOCK
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include "base/kaldi-error.h"
|
||||
#include "matrix/matrix-common.h"
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
#include <cublas.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
#include "base/kaldi-error.h"
|
||||
|
||||
|
||||
#define cuSafeCall(fun) \
|
||||
#define CU_SAFE_CALL(fun) \
|
||||
{ \
|
||||
int32 ret; \
|
||||
if ((ret = (fun)) != 0) { \
|
||||
|
@ -47,19 +47,19 @@
|
|||
|
||||
namespace kaldi {
|
||||
|
||||
/** The size of edge of CUDA square block **/
|
||||
static const int32 CUBLOCK = 16;
|
||||
/** Number of blocks in which the task of size 'size' is split **/
|
||||
inline int32 n_blocks(int32 size, int32 block_size) {
|
||||
return size / block_size + ((size % block_size == 0)? 0 : 1);
|
||||
}
|
||||
|
||||
/** Number of blocks in which the task of size 'size' is split **/
|
||||
inline int32 n_blocks(int32 size, int32 block_size) {
|
||||
return size / block_size + ((size % block_size == 0)? 0 : 1);
|
||||
}
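// For example (illustrative): n_blocks(1000, 16) returns 63, since
// 1000 = 62 * 16 + 8, so 63 blocks of size 16 are needed to cover the task.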
|
||||
cublasOperation_t KaldiTransToCuTrans(MatrixTransposeType kaldi_trans);
|
||||
|
||||
}
|
||||
|
||||
#endif // HAVE_CUDA
|
||||
|
||||
namespace kaldi {
|
||||
// Some forward declarations, frequently needed
|
||||
// Some forward declarations, needed for friend declarations.
|
||||
template<typename Real> class CuVectorBase;
|
||||
template<typename Real> class CuVector;
|
||||
template<typename Real> class CuSubVector;
|
||||
|
@ -67,7 +67,13 @@ template<typename Real> class CuRand;
|
|||
template<typename Real> class CuMatrixBase;
|
||||
template<typename Real> class CuMatrix;
|
||||
template<typename Real> class CuSubMatrix;
|
||||
template<typename Real> class CuRand;
|
||||
template<typename Real> class CuPackedMatrix;
|
||||
template<typename Real> class CuSpMatrix;
|
||||
template<typename Real> class CuTpMatrix;
|
||||
|
||||
template<typename Real> class CuBlockMatrix; // this has no non-CU counterpart.
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
// cudamatrix/cu-device.cc
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// 2013 Lucas Ondel
|
||||
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
|
@ -19,140 +21,137 @@
|
|||
|
||||
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
#if HAVE_CUDA == 1
|
||||
|
||||
#include <cublas.h>
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <dlfcn.h>
|
||||
#include <unistd.h> // for sleep
|
||||
|
||||
#include "cudamatrix/cu-common.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
#include "base/kaldi-error.h"
|
||||
|
||||
#include "util/common-utils.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
CuDevice::CuDevice()
|
||||
: active_gpu_id_(-3), verbose_(true)
|
||||
{ }
|
||||
|
||||
|
||||
|
||||
CuDevice::~CuDevice() {
|
||||
if (Enabled()) {
|
||||
cuSafeCall(cublasShutdown());
|
||||
} else if (active_gpu_id_ == -2) {
|
||||
KALDI_WARN << "CUDA was NOT used! No CUDA GPU detected!";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* SelectGpuId(gpu_id)
|
||||
* SelectGpuId(use_gpu)
|
||||
*
|
||||
* The argument 'gpu_id' meaning: 0..N selects a GPU,
|
||||
* -1 disables CUDA, -2 performs GPU auto-detection.
|
||||
* There are 3 'use_gpu' modes for GPU selection:
|
||||
* "yes" -- Select GPU automatically (or get one by exclusive mode)
|
||||
* and die if this fails.
|
||||
* "optional" -- Do as above, but if it fails, back off to CPU.
|
||||
* "no" -- Run on CPU.
|
||||
*
|
||||
* If there is no GPU in the system, and we have GPU auto-detection,
|
||||
* or GPU is manually disabled the computation will run on CPU.
|
||||
* In other cases it is an error (manual selection).
|
||||
* In case of Compute exclusive mode, the GPU is selected by OS.
|
||||
*
|
||||
* In case of Compute exclusive mode, the GPU is selected by OS,
|
||||
* this has priority over manual/auto selection of GPU.
|
||||
* Otherwise GPU selection is based on largest proportion of free memory.
|
||||
* This can eventually lead to multiple processes computing on single GPU,
|
||||
* which is slow. More practical is to use "compute exclusive mode".
|
||||
*
|
||||
* Since the autoselection of GPU is not perfect, it may still
|
||||
* happen that two processes compute on single GPU, which is slow.
|
||||
* The users are advised to use manual selection or exclusive mode.
|
||||
*
|
||||
* This method must be called at the very beginning of the program
|
||||
* (before the cudamatrix objects allocate memory for the data),
|
||||
* or not at all (when we intentionally want to run on the CPU).
|
||||
* This method is to be called at the very beginning of the program
|
||||
* (before first allocation in cudamatrix), or not at all (default to CPU).
|
||||
*
|
||||
*/
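
// A typical call site in a Kaldi binary looks roughly like the following
// (an illustrative sketch; the exact option wiring is an assumption, not a
// quote from any particular binary):
//
//   std::string use_gpu = "yes";
//   po.Register("use-gpu", &use_gpu,
//               "yes|no|optional, only has effect if compiled with CUDA");
//   po.Read(argc, argv);
//   ...
// #if HAVE_CUDA == 1
//   CuDevice::Instantiate().SelectGpuId(use_gpu);
// #endif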
|
||||
void CuDevice::SelectGpuId(int32 gpu_id) {
|
||||
void CuDevice::SelectGpuId(std::string use_gpu) {
|
||||
// Possible modes
|
||||
if (use_gpu != "yes" && use_gpu != "no" && use_gpu != "optional") {
|
||||
KALDI_ERR << "Please choose : --use-gpu=yes|no|optional, passed '" << use_gpu << "'";
|
||||
}
|
||||
|
||||
// Make sure this function is not called twice!
|
||||
if(Enabled()) {
|
||||
if (Enabled()) {
|
||||
KALDI_ERR << "There is already an active GPU " << active_gpu_id_
|
||||
<< ", cannot change it on the fly!";
|
||||
}
|
||||
// Allow the GPU to stay disabled
|
||||
if(!Enabled() && gpu_id == -1) {
|
||||
KALDI_LOG << "Selected device: " << gpu_id
|
||||
<< ", we don't even try to get a GPU. We run on CPU.";
|
||||
active_gpu_id_ = -1;
|
||||
if(!Enabled() && use_gpu == "no") {
|
||||
KALDI_LOG << "Manually selected to compute on CPU.";
|
||||
return;
|
||||
}
|
||||
|
||||
// Check that we have a gpu available
|
||||
int32 n_gpu = 0;
|
||||
cudaGetDeviceCount(&n_gpu);
|
||||
if(n_gpu == 0 && gpu_id == -2) {
|
||||
// If we do automatic selection and no GPU is found, we run on a CPU
|
||||
KALDI_WARN << "CUDA will NOT be used!!! No CUDA capable GPU detected...";
|
||||
active_gpu_id_ = -2;
|
||||
return;
|
||||
}
|
||||
// In other cases it is an error, no GPU is an error
|
||||
if(n_gpu == 0) {
|
||||
KALDI_ERR << "No CUDA capable GPU detected, while explicitly asked for gpu-id '"
|
||||
<< gpu_id << "'.";
|
||||
}
|
||||
|
||||
|
||||
//Now we know that there is a GPU in the system,
|
||||
//and we don't want to have it disabled.
|
||||
//
|
||||
//For the GPU selection there are 3 possibilities,
|
||||
//with priorities according to the order:
|
||||
//
|
||||
//1.) We have compute exclusive mode on (GPU is selected by OS)
|
||||
//2.) User did not specify the GPU-id (default value -2),
|
||||
// we will do automatic selection.
|
||||
//3.) User specified the GPU to run on, so we select it.
|
||||
if(IsComputeExclusive()) {
|
||||
//we have the GPU context now...
|
||||
;
|
||||
} else if(gpu_id == -2) {
|
||||
SelectGpuIdAuto();
|
||||
} else {
|
||||
//try to select the desired GPU
|
||||
int32 ret = cudaSetDevice(gpu_id);
|
||||
//handle the possible errors (no recovery!!!)
|
||||
switch(ret) {
|
||||
case cudaSuccess : {
|
||||
//create the GPU context
|
||||
cudaError_t e;
|
||||
e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
|
||||
if(e != cudaSuccess) {
|
||||
KALDI_ERR << "Failed to create CUDA context on a GPU.";
|
||||
}
|
||||
//this was okay, so we are done!
|
||||
KALDI_LOG << "Selected device: " << gpu_id << " (manually)";
|
||||
break;
|
||||
}
|
||||
case cudaErrorInvalidDevice : {
|
||||
int32 n_gpu = 0;
|
||||
cudaGetDeviceCount(&n_gpu);
|
||||
KALDI_ERR << "cudaSetDevice(" << gpu_id << "):"
|
||||
<< " '" << gpu_id << "' is not a VALID CUDA device! "
|
||||
<< " (system has " << n_gpu << " GPUs,"
|
||||
<< " valid IDs 0.." << n_gpu-1 << ")";
|
||||
break;
|
||||
}
|
||||
default :
|
||||
KALDI_ERR << "cudaSetDevice(" << gpu_id << "): "
|
||||
<< "returned " << ret << ", "
|
||||
<< cudaGetErrorString((cudaError_t)ret);
|
||||
if (use_gpu == "yes") {
|
||||
KALDI_ERR << "No CUDA GPU detected!";
|
||||
}
|
||||
if (use_gpu == "optional") {
|
||||
KALDI_WARN << "Running on CPU!!! No CUDA GPU detected...";
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Now we should have an active GPU,
|
||||
// so we can query its name and memory stats
|
||||
// and notify user which GPU is finally used.
|
||||
//
|
||||
// Create a CUDA context : in case of compute-exclusive mode OS selects gpu_id,
|
||||
// or default gpu_id=0. In the case with no free GPUs a context cannot be created
|
||||
// (compute-exclusive mode).
|
||||
//
|
||||
cudaError_t e;
|
||||
e = cudaThreadSynchronize(); //<< CUDA context gets created here.
|
||||
if (e != cudaSuccess) {
|
||||
// So far we don't have a CUDA context; sleep a bit and retry.
|
||||
int32 sec_sleep = 2;
|
||||
KALDI_WARN << "Will try again to get a GPU after " << sec_sleep
|
||||
<< " seconds.";
|
||||
sleep(sec_sleep);
|
||||
//
|
||||
e = cudaThreadSynchronize(); //<< 2nd trial to get CUDA context.
|
||||
if (e != cudaSuccess) {
|
||||
if (use_gpu == "yes") {
|
||||
KALDI_ERR << "Failed to create CUDA context, no more unused GPUs?";
|
||||
}
|
||||
if (use_gpu == "optional") {
|
||||
KALDI_WARN << "Running on CPU!!! No more unused CUDA GPUs?";
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Re-assure we have the context
|
||||
KALDI_ASSERT(cudaSuccess == cudaThreadSynchronize());
|
||||
|
||||
// Check if the machine use compute exclusive mode
|
||||
if (IsComputeExclusive()) {
|
||||
FinalizeActiveGpu();
|
||||
return;
|
||||
} else {
|
||||
// Or suggest to use compute exclusive mode
|
||||
if(n_gpu > 1) {
|
||||
KALDI_WARN << "Hint: It is practical to set the GPUs into ``compute exclusive mode''."
|
||||
<< " Selection of free GPUs would be done by OS automatically.";
|
||||
}
|
||||
// And select the GPU according to proportion of free memory
|
||||
if(SelectGpuIdAuto()) {
|
||||
FinalizeActiveGpu();
|
||||
return;
|
||||
} else {
|
||||
// Could not get a GPU, after previously having the CUDA context?
|
||||
// Strange but not impossible...
|
||||
if (use_gpu == "yes") {
|
||||
KALDI_ERR << "Error acquiring GPU.";
|
||||
}
|
||||
if (use_gpu == "optional") {
|
||||
KALDI_WARN << "Running on CPU!!! Error acquiring GPU.";
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void CuDevice::FinalizeActiveGpu() {
|
||||
// The device at this point should have active GPU, so we can query its name
|
||||
// and memory stats and notify user which GPU is finally used.
|
||||
|
||||
// Get the device-id of active device:
|
||||
{
|
||||
int32 act_gpu_id;
|
||||
|
@ -164,44 +163,38 @@ void CuDevice::SelectGpuId(int32 gpu_id) {
|
|||
// Remember the id of active GPU
|
||||
active_gpu_id_ = act_gpu_id; //CuDevice::Enabled() is true from now on
|
||||
// Initialize the CUBLAS
|
||||
cuSafeCall(cublasInit());
|
||||
CU_SAFE_CALL(cublasInit());
|
||||
|
||||
// Notify user which GPU is finally used
|
||||
char name[128];
|
||||
DeviceGetName(name,128,act_gpu_id);
|
||||
KALDI_LOG << "The active GPU is [" << act_gpu_id << "]: "
|
||||
<< name << "\t" << GetFreeMemory(NULL, NULL);
|
||||
}
|
||||
|
||||
CU_SAFE_CALL(cudaGetDeviceProperties(&properties_, act_gpu_id));
|
||||
|
||||
KALDI_LOG << "The active GPU is [" << act_gpu_id << "]: " << name << "\t"
|
||||
<< GetFreeMemory(&free_memory_at_startup_, NULL) << " version "
|
||||
<< properties_.major << "." << properties_.minor;
|
||||
|
||||
if (verbose_) PrintMemoryUsage();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
bool CuDevice::DoublePrecisionSupported() {
|
||||
if (!Enabled()) return true;
|
||||
return properties_.major > 1 || (properties_.major == 1 && properties_.minor >= 3);
|
||||
// Double precision is supported from version 1.3
|
||||
}
|
||||
|
||||
|
||||
bool CuDevice::IsComputeExclusive() {
|
||||
// check that we have a gpu
|
||||
int32 n_gpu = 0;
|
||||
cudaGetDeviceCount(&n_gpu);
|
||||
if(n_gpu == 0) {
|
||||
KALDI_LOG << "No CUDA devices found";
|
||||
return false;
|
||||
}
|
||||
|
||||
// Create a GPU context
|
||||
// This will be kept if we detect compute exclusive mode
|
||||
// or released in the other case.
|
||||
//
|
||||
// It does not harm if the function gets called twice,
|
||||
// and the context is already created.
|
||||
cudaError_t e;
|
||||
e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
|
||||
if(e != cudaSuccess) {
|
||||
KALDI_ERR << "Failed to create CUDA context on a GPU. No more unused GPUs in compute exclusive mode?";
|
||||
}
|
||||
|
||||
// assume we already have a CUDA context created
|
||||
KALDI_ASSERT(cudaSuccess == cudaThreadSynchronize());
|
||||
|
||||
// get the device-id and its device-properties
|
||||
int32 gpu_id = -1;
|
||||
e = cudaGetDevice(&gpu_id);
|
||||
cudaError_t e = cudaGetDevice(&gpu_id);
|
||||
if(e != cudaSuccess) {
|
||||
KALDI_ERR << "Failed to get current device";
|
||||
}
|
||||
|
@ -216,12 +209,12 @@ bool CuDevice::IsComputeExclusive() {
|
|||
KALDI_LOG << "CUDA setup operating under Compute Exclusive Mode.";
|
||||
return true;
|
||||
break;
|
||||
#if (CUDA_VERSION >= 4000)
|
||||
#if (CUDA_VERSION >= 4000)
|
||||
case cudaComputeModeExclusiveProcess :
|
||||
KALDI_LOG << "CUDA setup operating under Compute Exclusive Process Mode.";
|
||||
return true;
|
||||
break;
|
||||
#endif
|
||||
#endif
|
||||
default :
|
||||
// The computation mode is not compute-exclusive,
|
||||
// in this case we release the GPU context...
|
||||
|
@ -234,21 +227,20 @@ bool CuDevice::IsComputeExclusive() {
|
|||
}
|
||||
|
||||
|
||||
|
||||
void CuDevice::SelectGpuIdAuto() {
|
||||
// check that we have at least one gpu
|
||||
bool CuDevice::SelectGpuIdAuto() {
|
||||
// Check that we have at least one gpu
|
||||
int32 n_gpu = 0;
|
||||
cudaGetDeviceCount(&n_gpu);
|
||||
if(n_gpu == 0) {
|
||||
KALDI_ERR << "No CUDA devices found";
|
||||
return;
|
||||
KALDI_WARN << "No CUDA devices found";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// The GPU is selected according to maximal free memory ratio
|
||||
std::vector<float> free_mem_ratio(n_gpu+1, 0.0);
|
||||
//get ratios of memory use, if possible
|
||||
// Get ratios of memory use, if possible
|
||||
KALDI_LOG << "Selecting from " << n_gpu << " GPUs";
|
||||
for(int32 n=0; n<n_gpu; n++) {
|
||||
for(int32 n = 0; n < n_gpu; n++) {
|
||||
int32 ret = cudaSetDevice(n);
|
||||
switch(ret) {
|
||||
case cudaSuccess : {
|
||||
|
@ -292,23 +284,22 @@ void CuDevice::SelectGpuIdAuto() {
|
|||
if(free_mem_ratio[n] > free_mem_ratio[max_id]) max_id=n;
|
||||
}
|
||||
//the free_mem_ratio should be bigger than zero
|
||||
if(!free_mem_ratio[max_id] > 0.0) {
|
||||
KALDI_ERR << "No device could be selected (this should never happen)";
|
||||
}
|
||||
KALDI_ASSERT(free_mem_ratio[max_id] > 0.0);
|
||||
|
||||
//finally select the GPU
|
||||
KALDI_LOG << "Selected device: " << max_id << " (automatically)";
|
||||
cuSafeCall(cudaSetDevice(max_id));
|
||||
CU_SAFE_CALL(cudaSetDevice(max_id));
|
||||
//create the context
|
||||
cudaError_t e;
|
||||
e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
|
||||
if(e != cudaSuccess) {
|
||||
KALDI_ERR << "Failed to create CUDA context on a GPU.";
|
||||
KALDI_WARN << "Failed to create CUDA context on a GPU.";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void CuDevice::AccuProfile(const std::string &key, double time) {
|
||||
if (profile_map_.find(key) == profile_map_.end()) {
|
||||
profile_map_[key] = 0.0;
|
||||
|
@ -316,23 +307,35 @@ void CuDevice::AccuProfile(const std::string &key, double time) {
|
|||
profile_map_[key] += time;
|
||||
}
|
||||
|
||||
|
||||
void CuDevice::PrintMemoryUsage() const {
|
||||
if (Enabled()) {
|
||||
int64 free_memory_now;
|
||||
GetFreeMemory(&free_memory_now, NULL);
|
||||
KALDI_LOG << "Memory used: " << (free_memory_at_startup_ - free_memory_now) << " bytes.";
|
||||
}
|
||||
}
|
||||
|
||||
void CuDevice::PrintProfile() {
|
||||
if (verbose_ && Enabled()) {
|
||||
std::ostringstream os;
|
||||
os << "-----\n[cudevice profile]\n";
|
||||
std::map<std::string, double>::iterator it;
|
||||
for(it = profile_map_.begin(); it != profile_map_.end(); ++it) {
|
||||
os << it->first << "\t" << it->second << "s\n";
|
||||
}
|
||||
std::vector<std::pair<double, std::string> > pairs;
|
||||
for(it = profile_map_.begin(); it != profile_map_.end(); ++it)
|
||||
pairs.push_back(std::make_pair(it->second, it->first));
|
||||
std::sort(pairs.begin(), pairs.end());
|
||||
size_t max_print = 15, start_pos = (pairs.size() <= max_print ?
|
||||
0 : pairs.size() - max_print);
|
||||
for (size_t i = start_pos; i < pairs.size(); i++)
|
||||
os << pairs[i].second << "\t" << pairs[i].first << "s\n";
|
||||
os << "-----";
|
||||
KALDI_LOG << os.str();
|
||||
PrintMemoryUsage();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
std::string CuDevice::GetFreeMemory(int64* free, int64* total) {
|
||||
std::string CuDevice::GetFreeMemory(int64* free, int64* total) const {
|
||||
// WARNING! The CUDA API is inconsistent across versions!
|
||||
#if (CUDA_VERSION >= 3020)
|
||||
//define the function signature type
|
||||
|
@ -406,14 +409,356 @@ void CuDevice::DeviceGetName(char* name, int32 len, int32 dev) {
|
|||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////
|
||||
// The instance of the static singleton
|
||||
//
|
||||
CuDevice CuDevice::msDevice;
|
||||
//
|
||||
////////////////////////////////////////////////
|
||||
struct CuAllocatorOptions {
|
||||
int32 count; // Number of times we free and delete a particular size before we
|
||||
// start to cache it.
|
||||
int32 cleanup_interval_bytes;
|
||||
CuAllocatorOptions(): count(1), cleanup_interval_bytes(1000000) { }
|
||||
};
|
||||
|
||||
|
||||
/// We define class CuAllocator inside the .cc file, because we don't want to
|
||||
/// expose it in the header. Its purpose is to hang on to memory that we have
|
||||
/// freed, so that we don't waste time in cudaMalloc and cudaMallocPitch().
|
||||
/// For some reason, they are sometimes very slow.
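/// In outline (a summary of the code below): for each distinct allocation size
/// we keep a record (MemInfoForSize).  The first 'count' frees of that size go
/// straight to cudaFree(); once that countdown reaches zero we treat the size
/// as "hot", and subsequently freed pointers of that size are cached in a
/// free-list and handed back to later Malloc()/MallocPitch() requests of the
/// same (or slightly smaller) size.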
|
||||
class CuAllocator {
|
||||
public:
|
||||
CuAllocator(const CuAllocatorOptions &opts, CuDevice *device):
|
||||
device_(device), opts_(opts),
|
||||
cleanup_countdown_bytes_(opts.cleanup_interval_bytes) { }
|
||||
|
||||
inline void *Malloc(size_t size);
|
||||
|
||||
inline void *MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch);
|
||||
|
||||
inline void Free(void *ptr);
|
||||
|
||||
~CuAllocator();
|
||||
private:
|
||||
inline void *MallocInternal(size_t row_bytes, size_t num_rows, size_t *pitch);
|
||||
|
||||
// struct MemInfoForSize stores information associated with a particular size
|
||||
// of allocated memory. The row_bytes and num_rows refer to the arguments of
|
||||
// a cudaMallocPitch call; for regular, non-pitch allocations with cudaMalloc,
|
||||
// we make "row_bytes" zero and the size in bytes is "num_rows"... there is a
|
||||
// reason why we do it this way round (make num_rows contain the size in
|
||||
// bytes); it relates to the ordering of the map, and the behavior when
|
||||
// we didn't find the exact size and want to find a larger match.
|
||||
|
||||
|
||||
struct MemInfoForSize {
|
||||
size_t row_bytes; // or zero, if a regular CudaMalloc, not
|
||||
// CudaMallocPitch.
|
||||
size_t num_rows; // or the number of rows, if it's a regular CudaMalloc
|
||||
// call, not CudaMallocPitch.
|
||||
size_t pitch; // If CudaMallocPitch, the pitch returned by CudaMallocPitch;
|
||||
// this code assumes (and checks) that it's a deterministic
|
||||
// function of row_bytes and num_rows.
|
||||
size_t countdown; // number that have been freed and not cached.
|
||||
size_t currently_used; // number that are "in the wild".. kept for
|
||||
// diagnostics and error detection.
|
||||
std::vector<void*> freed; // freed and cached...
|
||||
|
||||
MemInfoForSize(size_t row_bytes,
|
||||
size_t num_rows,
|
||||
int32 count):
|
||||
row_bytes(row_bytes),
|
||||
num_rows(num_rows),
|
||||
pitch(0),
|
||||
countdown(count),
|
||||
currently_used(0) { }
|
||||
};
|
||||
|
||||
|
||||
// FindMemInfo returns the MemInfoForSize object for this (row_bytes,
|
||||
// num_rows) combination if it exists; otherwise...
|
||||
// if there is a MemInfoForSize object with the same row_bytes and larger (but
|
||||
// not more than twice larger) num_rows that has freed memory waiting, it
|
||||
// returns that; otherwise, it returns a new MemInfoForSize object for the
|
||||
// requested size).
|
||||
|
||||
inline MemInfoForSize *FindMemInfo(size_t row_bytes,
|
||||
size_t num_rows) {
|
||||
if (row_bytes >= size_to_list_.size())
|
||||
size_to_list_.resize(row_bytes + 1, NULL);
|
||||
|
||||
// note: we set row_bytes to 0 for regular, linear allocation.
|
||||
KALDI_ASSERT(num_rows != 0);
|
||||
|
||||
if (size_to_list_[row_bytes] == NULL)
|
||||
size_to_list_[row_bytes] = new std::map<size_t, MemInfoForSize*>;
|
||||
|
||||
|
||||
std::map<size_t, MemInfoForSize*> &size_to_list = *(size_to_list_[row_bytes]);
|
||||
|
||||
typedef std::map<size_t, MemInfoForSize* >::iterator IterType;
|
||||
|
||||
// get an iterator to the requested object or the next-larger one.
|
||||
// Here, upper_bound(num_rows - 1) returns an object strictly greater
|
||||
// than num_rows - 1, which could be num_rows itself. We need to
|
||||
// treat num_rows == 0 as a special case because of size_t being
|
||||
// unsigned.
|
||||
IterType iter = (num_rows == 0 ? size_to_list.begin() :
|
||||
size_to_list.upper_bound(num_rows - 1));
|
||||
|
||||
if (iter != size_to_list.end() && iter->first == num_rows) {
|
||||
// Found a MemInfoForSize object
|
||||
// with the requested size -> return it.
|
||||
KALDI_ASSERT(iter->second->row_bytes == row_bytes &&
|
||||
iter->second->num_rows == num_rows);
|
||||
return iter->second;
|
||||
} else if (iter != size_to_list.end() &&
|
||||
iter->second->num_rows <= 2 * num_rows &&
|
||||
!iter->second->freed.empty()) {
|
||||
// Return the non-matching one with freed memory, which is larger than
|
||||
// this one but not more than twice larger.
|
||||
KALDI_ASSERT(iter->second->row_bytes == row_bytes &&
|
||||
iter->second->num_rows > num_rows); // confirm expectations.
|
||||
return iter->second;
|
||||
} else {
|
||||
// There was no such object, and the next-larger object either did not
|
||||
// exist, had more than twice the num-rows requested, or had no free
|
||||
// memory -> create an object with the requested size.
|
||||
return (size_to_list[num_rows] = new MemInfoForSize(row_bytes, num_rows,
|
||||
opts_.count));
|
||||
}
|
||||
}
|
||||
|
||||
void PossiblyCleanup(size_t num_bytes);
|
||||
|
||||
// A periodic housekeeping task..
|
||||
void Cleanup();
|
||||
|
||||
// Frees all memory in the "freed" vectors; memory that the
|
||||
// user freed but we held on to. If destroy == true, also
|
||||
// clean up all memory held in the size_to_list_ object (i.e.
|
||||
// allocated maps and MemInfoForSize objects).
|
||||
void ReleaseAllCachedMemory(bool destroy = false);
|
||||
|
||||
CuDevice *device_; // device this is attached to...
|
||||
CuAllocatorOptions opts_;
|
||||
|
||||
|
||||
unordered_map<void*, MemInfoForSize*> addr_to_list_;
|
||||
|
||||
// size_to_list_ is indexed first by row_bytes (which is zero for linear
|
||||
// mallocs) and then by num_rows (which for linear mallocs, is the actual size
|
||||
// in bytes).
|
||||
std::vector<std::map<size_t, MemInfoForSize*>* > size_to_list_;
|
||||
|
||||
int32 cleanup_countdown_bytes_; // countdown in bytes, until the next time we check
|
||||
// whether we should do cleanup
|
||||
};
|
||||
|
||||
|
||||
void* CuAllocator::Malloc(size_t size) {
|
||||
KALDI_ASSERT(size > 0);
|
||||
return MallocInternal(0, size, NULL);
|
||||
}
|
||||
|
||||
void* CuAllocator::MallocPitch(size_t num_rows, size_t row_bytes,
|
||||
size_t *pitch) {
|
||||
KALDI_ASSERT(num_rows > 0 && row_bytes > 0 && pitch != NULL);
|
||||
return MallocInternal(num_rows, row_bytes, pitch);
|
||||
}
|
||||
|
||||
void* CuAllocator::MallocInternal(size_t row_bytes,
|
||||
size_t num_rows,
|
||||
size_t *pitch_out) {
|
||||
// we share the code for standard cudaMalloc and cudaMallocPitch
|
||||
// because most of it is the same. for cudaMalloc, we'll have
|
||||
// row_bytes == 0, and num_rows is just the size to be allocated.
|
||||
KALDI_ASSERT(num_rows != 0 && (row_bytes != 0) == (pitch_out != NULL));
|
||||
|
||||
MemInfoForSize *info = FindMemInfo(row_bytes, num_rows);
|
||||
if (!info->freed.empty()) { // We can satisfy the request with cached,
|
||||
// previously-allocated memory.
|
||||
void *ans = info->freed.back();
|
||||
info->freed.pop_back();
|
||||
info->currently_used++;
|
||||
addr_to_list_[ans] = info;
|
||||
if (pitch_out) *pitch_out = info->pitch;
|
||||
return ans;
|
||||
} else {
|
||||
PossiblyCleanup(row_bytes == 0 ? num_rows : row_bytes * num_rows);
|
||||
void *ans;
|
||||
if (row_bytes == 0) { // Simple malloc request, not "MallocPitch".
|
||||
size_t size = num_rows;
|
||||
int32 ret = cudaMalloc(&ans, size);
|
||||
if (ret != 0) {
|
||||
KALDI_WARN << "Allocation of memory block of " << size << " bytes "
|
||||
<< "failed, releasing cached memory and retrying.";
|
||||
cudaGetLastError(); // reset the error state
|
||||
ReleaseAllCachedMemory();
|
||||
ret = cudaMalloc(&ans, size);
|
||||
      if (ret != 0) {
        KALDI_WARN << "Allocation failed for the second time. Printing "
                   << "device memory usage and exiting";
        device_->PrintMemoryUsage();
        KALDI_ERR << "Memory allocation failure";
      }
    }
|
||||
} else {
|
||||
size_t pitch;
|
||||
int32 ret = cudaMallocPitch(&ans, &pitch, row_bytes, num_rows);
|
||||
if (ret != 0) { // allocation failed...
|
||||
KALDI_WARN << "Allocation of " << num_rows << " rows, each of size "
|
||||
<< row_bytes << " bytes failed, releasing cached "
|
||||
<< "memory and retrying.";
|
||||
cudaGetLastError(); // reset the error state
|
||||
ReleaseAllCachedMemory();
|
||||
ret = cudaMallocPitch(&ans, &pitch, row_bytes, num_rows);
|
||||
if (ret != 0) {
|
||||
KALDI_WARN << "Allocation failed for the second time. Printing "
|
||||
<< "device memory usage and exiting";
|
||||
device_->PrintMemoryUsage();
|
||||
KALDI_ERR << "Memory allocation failure";
|
||||
}
|
||||
}
|
||||
KALDI_ASSERT(pitch > 0);
|
||||
if (info->pitch == 0) { // First allocation; have not set info->pitch yet.
|
||||
info->pitch = pitch;
|
||||
} else if (pitch != info->pitch) {
|
||||
KALDI_ERR << "Pitch differs between multiple calls with the same "
|
||||
<< "parameters: " << pitch << " vs. " << info->pitch;
|
||||
}
|
||||
*pitch_out = info->pitch;
|
||||
}
|
||||
addr_to_list_[ans] = info;
|
||||
info->currently_used++;
|
||||
return ans;
|
||||
}
|
||||
}
|
||||
|
||||
void CuAllocator::Free(void *addr) {
|
||||
unordered_map<void*, MemInfoForSize*>::iterator iter
|
||||
= addr_to_list_.find(addr);
|
||||
if (iter == addr_to_list_.end()) {
|
||||
KALDI_ERR << "Attempt to free address " << addr << " that was not allocated "
|
||||
<< "by CuDevice::Malloc() (or was previously freed);";
|
||||
}
|
||||
MemInfoForSize *info = iter->second;
|
||||
addr_to_list_.erase(addr); // Erase this element in the addr_to_list_ map.
|
||||
info->currently_used--;
|
||||
if (info->countdown == 0) { // We have freed [i.e. actually freed with
|
||||
// CudaFree()] enough of these that we think
|
||||
// we're wasting too much time this way and
|
||||
// need to start caching them.
|
||||
info->freed.push_back(addr);
|
||||
} else { // Actually free the address, and decrease "countdown".
|
||||
info->countdown--;
|
||||
CU_SAFE_CALL(cudaFree(addr)); // This is how we free, even if allocated with
|
||||
// cudaMallocPitch().
|
||||
}
|
||||
}
|
||||
|
||||
void CuAllocator::ReleaseAllCachedMemory(bool destroy) {
|
||||
KALDI_VLOG(2) << "Releasing all cached memory.";
|
||||
for (size_t i = 0; i < size_to_list_.size(); i++) {
|
||||
if (size_to_list_[i] == NULL)
|
||||
continue;
|
||||
typedef std::map<size_t, MemInfoForSize*>::iterator IterType;
|
||||
for (IterType iter = size_to_list_[i]->begin();
|
||||
iter != size_to_list_[i]->end(); ++iter) {
|
||||
MemInfoForSize *info = iter->second;
|
||||
if (destroy && !info->freed.empty()) {
|
||||
// When called from the destructor at program end, if verbose level is
|
||||
// high, say the sizes we had.
|
||||
if (info->row_bytes == 0) {
|
||||
KALDI_VLOG(3) << "Releasing " << info->freed.size() << " blocks of "
|
||||
<< info->num_rows << " bytes.";
|
||||
} else {
|
||||
KALDI_VLOG(3) << "Releasing " << info->freed.size()
|
||||
<< " 2-d blocks of " << info->num_rows << " rows of "
|
||||
<< info->row_bytes << " bytes each.";
|
||||
}
|
||||
}
|
||||
if (!destroy) {
|
||||
// We only do this freeing part when we're *not* called from the
|
||||
// destructor (destroy = false). This leads to a crash when called from
|
||||
// the destructor, with cudaFree returning "unload of CUDA runtime
|
||||
// failed". Presumably this has to do with the destruction order of
|
||||
// C++, which we can't really control.
|
||||
while (!info->freed.empty()) {
|
||||
CU_SAFE_CALL(cudaFree(info->freed.back()));
|
||||
info->freed.pop_back();
|
||||
}
|
||||
}
|
||||
if (destroy)
|
||||
delete info;
|
||||
}
|
||||
if (destroy) {
|
||||
delete size_to_list_[i];
|
||||
size_to_list_[i] = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CuAllocator::Cleanup() {
|
||||
// TODO: implement this or remove it (and also PossiblyCleanup).
|
||||
// Actually we may never implement this, as just calling
|
||||
// ReleaseAllCachedMemory whenever an allocation fails is probably
|
||||
// sufficient.
|
||||
}
|
||||
void CuAllocator::PossiblyCleanup(size_t num_bytes) {
|
||||
if (static_cast<size_t>(cleanup_countdown_bytes_) <= num_bytes) {
|
||||
Cleanup();
|
||||
cleanup_countdown_bytes_ = opts_.cleanup_interval_bytes;
|
||||
} else {
|
||||
cleanup_countdown_bytes_ -= static_cast<int32>(num_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
CuAllocator::~CuAllocator() {
|
||||
// Check that nothing was allocated by the user and not freed.
|
||||
std::set<MemInfoForSize*> unfreed_set;
|
||||
typedef unordered_map<void*, MemInfoForSize *>::iterator IterType;
|
||||
for (IterType iter = addr_to_list_.begin(); iter != addr_to_list_.end();
|
||||
++iter)
|
||||
unfreed_set.insert(iter->second);
|
||||
for (std::set<MemInfoForSize*>::iterator iter = unfreed_set.begin();
|
||||
iter != unfreed_set.end(); ++iter) {
|
||||
MemInfoForSize *info = *iter;
|
||||
KALDI_ASSERT(info->currently_used > 0); // Or should not be in this set
|
||||
// (code error or memory corruption)
|
||||
    if (info->row_bytes == 0) {
      KALDI_WARN << info->currently_used << " memory chunks of size "
                 << info->num_rows << " bytes were allocated and not freed.";
|
||||
} else {
|
||||
KALDI_WARN << info->currently_used << " memory chunks of size "
|
||||
<< info->row_bytes << " per row, and " << info->num_rows
|
||||
<< " rows, were allocated and not freed.";
|
||||
}
|
||||
}
|
||||
|
||||
bool destroy = true;
|
||||
ReleaseAllCachedMemory(destroy);
|
||||
}
|
||||
|
||||
void CuDevice::Free(void *ptr) { allocator_->Free(ptr); }
|
||||
|
||||
void* CuDevice::MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) {
|
||||
return allocator_->MallocPitch(row_bytes, num_rows, pitch);
|
||||
}
|
||||
|
||||
void* CuDevice::Malloc(size_t size) {
|
||||
return allocator_->Malloc(size);
|
||||
}
|
||||
|
||||
CuDevice::CuDevice(): active_gpu_id_(-1), verbose_(true),
|
||||
allocator_(new CuAllocator(CuAllocatorOptions(), this))
|
||||
{ }
|
||||
|
||||
|
||||
CuDevice::~CuDevice() {
|
||||
if (allocator_ != NULL)
|
||||
delete allocator_;
|
||||
if (Enabled())
|
||||
CU_SAFE_CALL(cublasShutdown());
|
||||
}
|
||||
|
||||
// The instance of the static singleton
|
||||
CuDevice CuDevice::global_device_;
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -22,75 +22,105 @@
|
|||
#ifndef KALDI_CUDAMATRIX_CU_DEVICE_H_
|
||||
#define KALDI_CUDAMATRIX_CU_DEVICE_H_
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
#if HAVE_CUDA == 1
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
class CuAllocator; // Forward declaration.
|
||||
|
||||
/**
|
||||
* Singleton object which represents CUDA device
|
||||
* responsible for CUBLAS initialisation; it also collects profiling info
|
||||
*/
|
||||
class CuDevice {
|
||||
// Singleton interface...
|
||||
private:
|
||||
CuDevice();
|
||||
CuDevice(CuDevice&);
|
||||
CuDevice &operator=(CuDevice&);
|
||||
|
||||
// Singleton object (there should only be one instantiated per program)
|
||||
public:
|
||||
~CuDevice();
|
||||
static CuDevice& Instantiate() {
|
||||
return msDevice;
|
||||
}
|
||||
static inline CuDevice& Instantiate() { return global_device_; }
|
||||
|
||||
private:
|
||||
static CuDevice msDevice;
|
||||
// We provide functions Malloc, MallocPitch and Free which replace cudaMalloc,
|
||||
// cudaMallocPitch and cudaFree. Their function is to cache the results of
|
||||
// previous allocations to avoid the very large overhead that CUDA's
|
||||
// allocation seems to give for some setups.
|
||||
void* Malloc(size_t size);
|
||||
|
||||
void* MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch);
|
||||
|
||||
void Free(void *ptr);
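// Usage sketch (illustrative only, assuming Kaldi is built with HAVE_CUDA == 1
// and a GPU has already been selected; num_rows/num_cols are placeholders):
// allocation goes through the singleton so the caching allocator can hand back
// previously freed device memory instead of calling cudaMallocPitch again.
//
//   size_t pitch;
//   void *data = CuDevice::Instantiate().MallocPitch(
//       num_cols * sizeof(BaseFloat), num_rows, &pitch);
//   // ... launch kernels that read/write 'data' ...
//   CuDevice::Instantiate().Free(data);  // returned to the cache, not cudaFree'd.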
|
||||
|
||||
/// Select a GPU for computation, the 'use_gpu' modes are:
|
||||
/// "yes" -- Select GPU automatically and die if this fails.
|
||||
/// "optional" -- Do as above, but if it fails, back off to CPU.
|
||||
/// "no" -- Run on CPU.
|
||||
/// (more comments in cu-device.cc)
|
||||
void SelectGpuId(std::string use_gpu);
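// Example (illustrative only): a typical caller passes one of the strings
// documented above, usually taken from a command-line option, e.g.
//
//   #if HAVE_CUDA == 1
//     CuDevice::Instantiate().SelectGpuId("yes");   // die if no GPU is usable
//   #endif
//
// with "optional" the call would back off to the CPU instead of dying.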
|
||||
|
||||
|
||||
/**********************************/
|
||||
// Instance interface
|
||||
public:
|
||||
|
||||
/// Check if the CUDA device is selected for use
|
||||
bool Enabled() {
|
||||
/// Check if the CUDA GPU is selected for use
|
||||
bool Enabled() const {
|
||||
return (active_gpu_id_ > -1);
|
||||
}
|
||||
|
||||
/// Manually select GPU by id (more comments in cu-device.cc)
|
||||
void SelectGpuId(int32 gpu_id);
|
||||
/// Get the active GPU id
|
||||
int32 ActiveGpuId() {
|
||||
return active_gpu_id_;
|
||||
}
|
||||
|
||||
void Verbose(bool verbose) {
|
||||
verbose_ = verbose;
|
||||
}
|
||||
/// Returns true if either we have no GPU, or we have a GPU
|
||||
/// and it supports double precision.
|
||||
bool DoublePrecisionSupported();
|
||||
|
||||
void SetVerbose(bool verbose) { verbose_ = verbose; }
|
||||
|
||||
/// Sum the IO time
|
||||
void AccuProfile(const std::string &key, double time);
|
||||
void PrintProfile();
|
||||
|
||||
void PrintMemoryUsage() const;
|
||||
|
||||
void ResetProfile() {
|
||||
profile_map_.clear();
|
||||
}
|
||||
|
||||
/// Get the actual GPU memory use stats
|
||||
std::string GetFreeMemory(int64* free = NULL, int64* total = NULL);
|
||||
std::string GetFreeMemory(int64* free = NULL, int64* total = NULL) const;
|
||||
/// Get the name of the GPU
|
||||
void DeviceGetName(char* name, int32 len, int32 dev);
|
||||
|
||||
private:
|
||||
/// Check if the GPU runs in compute exclusive mode
|
||||
bool IsComputeExclusive();
|
||||
/// Automatically select GPU
|
||||
void SelectGpuIdAuto();
|
||||
CuDevice();
|
||||
CuDevice(CuDevice&); // Disallow.
|
||||
CuDevice &operator=(CuDevice&); // Disallow.
|
||||
|
||||
static CuDevice global_device_;
|
||||
|
||||
/// Check whether the GPU runs in compute exclusive mode. Returns true if it is
/// running in compute exclusive mode and we have a GPU. Returns false
/// otherwise. Sets error to true if there was some error, such as that we
/// were running in compute exclusive mode but no GPUs were available;
/// otherwise sets it to false.
|
||||
bool IsComputeExclusive();
|
||||
|
||||
/// Automatically select GPU and get CUDA context. Returns true on success.
|
||||
bool SelectGpuIdAuto();
|
||||
|
||||
/// Try to get CUDA context on manually selected GPU. Return true on success.
|
||||
bool SelectGpuIdManual(int32 gpu_id);
|
||||
|
||||
void FinalizeActiveGpu();
|
||||
|
||||
/// Should only be called if Enabled() == true.
|
||||
int32 MajorDeviceVersion();
|
||||
|
||||
/// Should only be called if Enabled() == true.
|
||||
int32 MinorDeviceVersion();
|
||||
|
||||
private:
|
||||
std::map<std::string, double> profile_map_;
|
||||
|
||||
/// active_gpu_id_ values:
|
||||
|
@ -99,14 +129,20 @@ class CuDevice {
|
|||
/// -1 SelectGpuId was called, but the GPU was manually disabled
|
||||
/// 0..N Normal GPU IDs
|
||||
int32 active_gpu_id_;
|
||||
///
|
||||
|
||||
int64 free_memory_at_startup_;
|
||||
|
||||
cudaDeviceProp properties_;
|
||||
|
||||
bool verbose_;
|
||||
|
||||
CuAllocator *allocator_;
|
||||
|
||||
}; // class CuDevice
|
||||
|
||||
|
||||
}// namespace
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // HAVE_CUDA
|
||||
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
// cudamatrix/cu-kernels-ansi.h
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
// 2013 Hainan Xu
|
||||
// 2013 Xiaohui Zhang
|
||||
// 2013 Johns Hopkins University (author: Guoguo Chen)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
|
@ -25,8 +29,7 @@
|
|||
|
||||
#include "cudamatrix/cu-matrixdim.h"
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
extern "C" {
|
||||
|
||||
/*********************************************************
|
||||
|
@ -43,13 +46,39 @@ void cudaI32_set_const(dim3 Gr, dim3 Bl, int32_cuda *mat, int32_cuda value, Matr
|
|||
/*
|
||||
* CuMatrix
|
||||
*/
|
||||
void cudaF_copy_upp_low(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA);
|
||||
void cudaF_copy_low_upp(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA);
|
||||
void cudaF_add_diag_vec_mat(dim3 Gr, dim3 Bl, float alpha, float *mat, MatrixDim mat_dim,
|
||||
const float *vec, const float *mat2, int mat2_row_stride,
|
||||
int mat2_col_stride, float beta);
|
||||
void cudaF_copy_from_tp_trans(int Gr, int Bl, float* A, const float* B, MatrixDim dmat);
|
||||
void cudaFD_copy_from_tp_trans(int Gr, int Bl, float* A, const double* B, MatrixDim dmat);
|
||||
void cudaF_copy_from_tp(int Gr, int Bl, float* A, const float* B, MatrixDim dmat);
|
||||
void cudaFD_copy_from_tp(int Gr, int Bl, float* A, const double* B, MatrixDim dmat);
|
||||
void cudaF_copy_col_from_vec(int Gr, int Bl, float* mat, const float* v, int col, MatrixDim d);
|
||||
void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
|
||||
void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d);
|
||||
void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
|
||||
void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim d);
|
||||
void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
|
||||
void cudaF_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
|
||||
void cudaF_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim d);
|
||||
void cudaF_set_diag(int Gr, int Bl, float* mat, float value, MatrixDim d);
|
||||
void cudaF_set_diag_packed(int Gr, int Bl, float* mat, float value, int dim);
|
||||
void cudaF_add_diag_packed(int Gr, int Bl, float* mat, float value, int dim);
|
||||
void cudaF_set_const(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d);
|
||||
void cudaF_set_zero_above_diag(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
|
||||
void cudaF_add(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d);
|
||||
void cudaF_add_vec2(dim3 Gr, dim3 Bl, float* mat, const float* vec, const float alpha, int dim);
|
||||
void cudaF_scale_diag(int Gr, int Bl, float* mat, float value, int dim);
|
||||
void cudaF_scale(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d);
|
||||
void cudaF_apply_log(dim3 Gr, dim3 Bl, float *mat, MatrixDim d);
|
||||
void cudaF_mul_elements(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim d);
|
||||
void cudaF_mul_elements(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim dst_d, int src_stride);
|
||||
void cudaF_max(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim dst_d, int src_stride);
|
||||
void cudaF_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d);
|
||||
void cudaF_mul_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d);
|
||||
void cudaF_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size);
|
||||
void cudaF_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size, float power);
|
||||
void cudaF_div_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *vec_div, MatrixDim d);
|
||||
void cudaF_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *A, float beta, float *dst, MatrixDim d);
|
||||
void cudaF_add_vec_to_cols(dim3 Gr, dim3 Bl, float alpha, const float *col, float beta, float *dst, MatrixDim d);
|
||||
|
@ -58,29 +87,82 @@ void cudaF_add_vec_to_rows(dim3 Gr, dim3 Bl, float alpha, const float *row, floa
|
|||
/*
|
||||
* CuVector
|
||||
*/
|
||||
void cudaF_replace_value(int Gr, int Bl, float *v, int dim, float orig, float changed);
|
||||
void cudaF_set_bias_params(int Gr, int Bl, float* v, const float* a, float param_1, float param_2, float param_3, int* flag, int dim);
|
||||
void cudaF_copy_from_vec_df(int Gr, int Bl, double* v_out, const float* v_in, int dim);
|
||||
void cudaF_copy_from_vec_fd(int Gr, int Bl, float* v_out, const float* v_in, int dim);
|
||||
void cudaF_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim);
|
||||
void cudaF_vec_soft_max(int Gr, int Bl, float* v, int dim);
|
||||
void cudaF_vec_min(const float* v, float* value, int dim);
|
||||
void cudaF_vec_max(const float* v, float* value, int dim);
|
||||
void cudaF_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value);
|
||||
void cudaF_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value);
|
||||
void cudaF_add_diag_mat_trans(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim);
|
||||
void cudaF_add_diag_mat(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim);
|
||||
void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M,
|
||||
int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride,
|
||||
int N_col_stride, int threads_per_element, float beta);
|
||||
void cudaF_add_vec_vec(int Gr, int Bl, float alpha, float* v, const float* x, const float* y, float beta, int dim);
|
||||
void cudaF_copy_col_from_mat(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim);
|
||||
void cudaF_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const float* mat, MatrixDim dmat, int dim);
|
||||
void cudaF_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim);
|
||||
void cudaF_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc);
|
||||
void cudaF_pvec_sum(int Gr, int Bl, float* vec, float* pvec_sum, int dim, int size);
|
||||
void cudaF_vec_copy_diag_from_packed(int Gr, int Bl, float *dst, const float *src, int dim);
|
||||
void cudaF_vec_apply_floor(int Gr, int Bl, float* v, float floor_val, float* num, int dim);
|
||||
void cudaF_vec_apply_exp(int Gr, int Bl, float* v, int dim);
|
||||
void cudaF_vec_apply_log(int Gr, int Bl, float* v, float* flag, int dim);
|
||||
void cudaF_trace(int Gr, int Bl, float* mat, float* value, int dim);
|
||||
void cudaF_add_row_sum_mat(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d);
|
||||
void cudaF_add_col_sum_mat(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d);
|
||||
void cudaF_invert_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim d);
|
||||
|
||||
// Note: B_trans is nonzero if B is transposed.
|
||||
void cudaF_add_mat_blockmat(dim3 Gr, dim3 Bl, float *data, MatrixDim d, const float *Adata,
|
||||
int A_num_rows, int A_num_cols, int A_row_stride, int A_col_stride,
|
||||
const CuBlockMatrixData *B_cu_data, int B_num_blocks,
|
||||
float alpha, float beta, int B_trans);
|
||||
void cudaF_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int num_blocks,
|
||||
const float *C_data, int C_num_cols, int C_row_stride, int C_col_stride,
|
||||
const float *D_data, int D_row_stride, int D_col_stride,
|
||||
float alpha, float beta);
|
||||
/*
|
||||
* cu::
|
||||
*/
|
||||
void cudaF_softmax(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d);
|
||||
void cudaF_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d, int src_stride);
|
||||
void cudaF_softmax_part(dim3 Gr, dim3 Bl, const float *X, const int32_cuda *vec_ids, float* Y, MatrixDim d);
|
||||
void cudaF_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d);
|
||||
void cudaF_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d);
|
||||
void cudaF_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d);
|
||||
void cudaF_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride);
|
||||
void cudaF_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power);
|
||||
void cudaF_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride);
|
||||
void cudaF_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int src_stride);
|
||||
void cudaF_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride);
|
||||
void cudaF_diff_tanh(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d);
|
||||
|
||||
void cudaF_regularize_l1(dim3 Gr, dim3 Bl, float *wei, float *grad, float l1, float lr, MatrixDim d);
|
||||
void cudaF_find_row_max_id(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d);
|
||||
void cudaF_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, float *mat_net_out, float *vec_log_post, MatrixDim d);
|
||||
void cudaF_copy_rows_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, const float *v_in);
|
||||
|
||||
void cudaF_randomize(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in);
|
||||
void cudaF_splice(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in);
|
||||
void cudaF_one(int Gr, int Bl, float* x, int dim);
|
||||
void cudaF_copy(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in);
|
||||
void cudaF_copy_from_sp(int Gr, int Bl, const float* x, float* y, int d_in, MatrixDim d_out);
|
||||
void cudaF_take_lower(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in);
|
||||
void cudaF_take_upper(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in);
|
||||
void cudaF_take_mean(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in);
|
||||
void cudaF_comp_obj_deriv(dim3 Gr,dim3 Bl, MatrixElement<float>* x, int s, const float* z, MatrixDim d, float* z2, MatrixDim d2, float* t);
|
||||
void cudaF_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
|
||||
void cudaF_sy_add_tr2(dim3 Gr, dim3 Bl, float alpha, float beta, const float* T, MatrixDim tdim,
|
||||
float *S, MatrixDim sdim);
|
||||
void cudaF_sum_column_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim,
|
||||
const float *src_data, MatrixDim src_dim,
|
||||
const Int32Pair *indices);
|
||||
void cudaF_matrix_lookup(dim3 Gr, dim3 Bl, const float *data, MatrixDim dim,
|
||||
const Int32Pair *indices, int indices_size,
|
||||
float *output);
|
||||
|
||||
|
||||
|
||||
/*********************************************************
|
||||
* double CUDA kernel calls
|
||||
*/
|
||||
|
@ -88,13 +170,39 @@ void cudaF_copy(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *co
|
|||
/*
|
||||
* CuMatrix
|
||||
*/
|
||||
void cudaD_copy_upp_low(dim3 Gr, dim3 Bl, double* A, MatrixDim dimB);
|
||||
void cudaD_copy_low_upp(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA);
|
||||
void cudaD_add_diag_vec_mat(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim,
|
||||
const double *vec, const double *mat2, int mat2_row_stride,
|
||||
int mat2_col_stride, double beta);
|
||||
void cudaD_copy_from_tp_trans(int Gr, int Bl, double* A, const double* B, MatrixDim dmat);
|
||||
void cudaDF_copy_from_tp_trans(int Gr, int Bl, double* A, const float* B, MatrixDim dmat);
|
||||
void cudaD_copy_from_tp(int Gr, int Bl, double* A, const double* B, MatrixDim dmat);
|
||||
void cudaDF_copy_from_tp(int Gr, int Bl, double* A, const float* B, MatrixDim dmat);
|
||||
void cudaD_copy_col_from_vec(int Gr, int Bl, double* mat, const double* v, int col, MatrixDim d);
|
||||
void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
|
||||
void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d);
|
||||
void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
|
||||
void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim d);
|
||||
void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
|
||||
void cudaD_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
|
||||
void cudaD_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val, MatrixDim d);
|
||||
void cudaD_set_diag(int Gr, int Bl, double* mat, double value, MatrixDim d);
|
||||
void cudaD_set_diag_packed(int Gr, int Bl, double* mat, double value, int dim);
|
||||
void cudaD_add_diag_packed(int Gr, int Bl, double* mat, double value, int dim);
|
||||
void cudaD_set_const(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d);
|
||||
void cudaD_set_zero_above_diag(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
|
||||
void cudaD_add(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d);
|
||||
void cudaD_add_vec2(dim3 Gr, dim3 Bl, double *mat, const double *vec, const double alpha, int dim);
|
||||
void cudaD_scale_diag(int Gr, int Bl, double* mat, double value, int dim);
|
||||
void cudaD_scale(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d);
|
||||
void cudaD_apply_log(dim3 Gr, dim3 Bl, double *mat, MatrixDim d);
|
||||
void cudaD_mul_elements(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim d);
|
||||
void cudaD_mul_elements(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride);
|
||||
void cudaD_max(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride);
|
||||
void cudaD_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d);
|
||||
void cudaD_mul_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d);
|
||||
void cudaD_mul_rows_group_mat(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size);
|
||||
void cudaD_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size, double power);
|
||||
void cudaD_div_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *vec_div, MatrixDim d);
|
||||
void cudaD_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *A, double beta, double *dst, MatrixDim d);
|
||||
void cudaD_add_vec_to_cols(dim3 Gr, dim3 Bl, double alpha, const double *col, double beta, double *dst, MatrixDim d);
|
||||
|
@ -103,31 +211,101 @@ void cudaD_add_vec_to_rows(dim3 Gr, dim3 Bl, double alpha, const double *row, do
|
|||
/*
|
||||
* CuVector
|
||||
*/
|
||||
void cudaD_replace_value(int Gr, int Bl, double *v, int dim, double orig, double changed);
|
||||
void cudaD_set_bias_params(int Gr, int Bl, double* v, const double* a, double param_1, double param_2, double param_3, int* flag, int dim);
|
||||
void cudaD_copy_from_vec_df(int Gr, int Bl, double* v_out, const double* v_in, int dim);
|
||||
void cudaD_copy_from_vec_fd(int Gr, int Bl, float* v_out, const double* v_in, int dim);
|
||||
void cudaD_vec_mul_elements(int Gr, int Bl, double* v, const double* a, int dim);
|
||||
void cudaD_vec_soft_max(int Gr, int Bl, double* v, int dim);
|
||||
void cudaD_vec_min(const double* v, double* value, int dim);
|
||||
void cudaD_vec_max(const double* v, double* value, int dim);
|
||||
void cudaD_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value);
|
||||
void cudaD_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value);
|
||||
void cudaD_add_diag_mat_trans(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim);
|
||||
void cudaD_add_diag_mat(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim);
|
||||
void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M,
|
||||
int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride,
|
||||
int N_col_stride, int threads_per_element, double beta);
|
||||
void cudaD_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, const double* y, double beta, int dim);
|
||||
void cudaD_copy_col_from_mat(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim);
|
||||
void cudaD_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim);
|
||||
void cudaD_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const double* mat, MatrixDim dmat, int dim);
|
||||
void cudaD_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc);
|
||||
void cudaD_pvec_sum(int Gr, int Bl, double* vec, double* pvec_sum, int dim, int size);
|
||||
void cudaD_vec_copy_diag_from_packed(int Gr, int Bl, double *dst, const double *src, int dim);
|
||||
void cudaD_vec_apply_floor(int Gr, int Bl, double* v, double floor_val, float* num, int dim);
|
||||
void cudaD_vec_apply_exp(int Gr, int Bl, double* v, int dim);
|
||||
void cudaD_vec_apply_log(int Gr, int Bl, double* v, double* flag, int dim);
|
||||
void cudaD_trace(int Gr, int Bl, double* mat, double* value, int dim);
|
||||
void cudaD_add_row_sum_mat(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d);
|
||||
void cudaD_add_col_sum_mat(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d);
|
||||
void cudaD_invert_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim d);
|
||||
// Note: B_trans is nonzero if B is transposed.
|
||||
void cudaD_add_mat_blockmat(dim3 Gr, dim3 Bl, double *data, MatrixDim d, const double *Adata,
|
||||
int A_num_rows, int A_num_cols, int A_row_stride, int A_col_stride,
|
||||
const CuBlockMatrixData *B_cu_data, int B_num_blocks,
|
||||
double alpha, double beta, int B_trans);
|
||||
void cudaD_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int num_blocks,
|
||||
const double *C_data, int C_num_cols, int C_row_stride, int C_col_stride,
|
||||
const double *D_data, int D_row_stride, int D_col_stride,
|
||||
double alpha, double beta);
|
||||
|
||||
|
||||
/*
|
||||
* cu::
|
||||
*/
|
||||
void cudaD_softmax(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d);
|
||||
void cudaD_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d, int src_stride);
|
||||
void cudaD_softmax_part(dim3 Gr, dim3 Bl, const double *X, const int32_cuda *vec_ids, double* Y, MatrixDim d);
|
||||
void cudaD_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d);
|
||||
void cudaD_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d);
|
||||
void cudaD_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d);
|
||||
void cudaD_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride);
|
||||
void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power);
|
||||
void cudaD_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride);
|
||||
void cudaD_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int src_stride);
|
||||
void cudaD_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride);
|
||||
void cudaD_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d);
|
||||
|
||||
void cudaD_regularize_l1(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1, double lr, MatrixDim d);
|
||||
void cudaD_find_row_max_id(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d);
|
||||
void cudaD_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, double *mat_net_out, double *vec_log_post, MatrixDim d);
|
||||
void cudaD_copy_rows_from_vec(dim3 Gr, dim3 Bl, double *mat_out, MatrixDim d_out, const double *v_in);
|
||||
|
||||
void cudaD_randomize(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in);
|
||||
void cudaD_splice(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in);
|
||||
void cudaD_one(int Gr, int Bl, double* x, int dim);
|
||||
void cudaD_copy(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in);
|
||||
void cudaD_copy_from_sp(int Gr, int Bl, const double* x, double* y, int d_in, MatrixDim d_out);
|
||||
void cudaD_take_lower(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in);
|
||||
void cudaD_take_upper(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in);
|
||||
void cudaD_take_mean(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in);
|
||||
|
||||
|
||||
// some mostly mixed-type kernels.
|
||||
void cuda_copy_from_mat_df(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in);
|
||||
void cuda_copy_from_mat_ff(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in);
|
||||
void cuda_copy_from_mat_fd(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in);
|
||||
void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in);
|
||||
void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in);
|
||||
void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in);
|
||||
void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in);
|
||||
void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in);
|
||||
|
||||
void cudaD_comp_obj_deriv(dim3 Gr,dim3 Bl, MatrixElement<double>* x, int s, const double* z, MatrixDim d, double* z2, MatrixDim d2, double* t);
|
||||
|
||||
void cudaD_transpose_matrix(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
|
||||
void cudaD_sy_add_tr2(dim3 Gr, dim3 Bl, double alpha, double beta, const double* T, MatrixDim tdim,
|
||||
double *S, MatrixDim sdim);
|
||||
void cudaD_sum_column_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim,
|
||||
const double *src_data, MatrixDim src_dim,
|
||||
const Int32Pair *indices);
|
||||
void cudaD_matrix_lookup(dim3 Gr, dim3 Bl, const double *data, MatrixDim dim,
|
||||
const Int32Pair *indices, int indices_size,
|
||||
double *output);
|
||||
|
||||
|
||||
|
||||
} // extern "C"
|
||||
|
||||
#endif // HAVE_CUDA
|
||||
|
||||
|
||||
#endif
|
||||
|
|
The diff for this file is not shown because of its large size.
|
@ -1,6 +1,11 @@
|
|||
// cudamatrix/cu-kernels.h
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// 2013 Ehsan Variani
|
||||
// 2014 Johns Hopkins University (author: Daniel Povey)
|
||||
// 2013 Hainan Xu
|
||||
// 2013 Xiaohui Zhang
|
||||
// 2013 Johns Hopkins University (author: Guoguo Chen)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
|
@ -22,7 +27,7 @@
|
|||
#ifndef KALDI_CUDAMATRIX_CU_KERNELS_H_
|
||||
#define KALDI_CUDAMATRIX_CU_KERNELS_H_
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
#if HAVE_CUDA == 1
|
||||
|
||||
#include "base/kaldi-error.h"
|
||||
#include "cudamatrix/cu-kernels-ansi.h"
|
||||
|
@ -34,147 +39,366 @@
|
|||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
|
||||
/*********************************************************
|
||||
* base templates
|
||||
*/
|
||||
|
||||
/*
|
||||
* CuMatrix
|
||||
*/
|
||||
template<typename Real> inline void cuda_set_const(dim3 Gr, dim3 Bl, Real *mat, Real value, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_add(dim3 Gr, dim3 Bl, Real *mat, Real value, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_scale(dim3 Gr, dim3 Bl, Real *mat, Real value, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_apply_log(dim3 Gr, dim3 Bl, Real *mat, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_mul_elements(dim3 Gr, dim3 Bl, Real *mat, const Real *A, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, Real *mat, const Real *scale, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, Real *mat, const Real *scale, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_div_rows_vec(dim3 Gr, dim3 Bl, Real *mat, const Real *vec_div, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_add_mat(dim3 Gr, dim3 Bl, Real alpha, const Real *A, Real beta, Real *dst, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, Real alpha, const Real *col, Real beta, Real *dst, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, Real alpha, const Real *row, Real beta, Real *dst, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
|
||||
/*
|
||||
* CuVector
|
||||
*/
|
||||
template<typename Real> inline void cuda_add_row_sum_mat(dim3 Gr, dim3 Bl, const Real *mat, Real *vec_sum, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_add_col_sum_mat(dim3 Gr, dim3 Bl, const Real *mat, Real *vec_sum, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_invert_elements(dim3 Gr, dim3 Bl, Real *data, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
|
||||
template<typename Real> inline void cuda_sigmoid(dim3 Gr, dim3 Bl, Real *y, const Real *x, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, Real *eout, const Real *e, const Real *y, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_tanh(dim3 Gr, dim3 Bl, Real *y, const Real *x, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, Real *eout, const Real *e, const Real *y, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_softmax(size_t Gr, size_t Bl, Real *y, const Real *x, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_softmax_part(dim3 Gr, dim3 Bl, const Real *X, const int32_cuda *vec_ids, Real* Y, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
|
||||
template<typename Real> inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, Real *wei, Real *grad, Real l1, Real lr, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const Real *mat, Real *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, Real *mat_net_out, Real *vec_log_post, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
|
||||
template<typename Real> inline void cuda_randomize(dim3 Gr, dim3 Bl, Real *y, const Real *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_splice(dim3 Gr, dim3 Bl, Real *y, const Real *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_copy(dim3 Gr, dim3 Bl, Real *y, const Real *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
|
||||
|
||||
|
||||
/*********************************************************
|
||||
* float specializations
|
||||
*/
|
||||
|
||||
/*
|
||||
* CuMatrix
|
||||
*/
|
||||
template<> inline void cuda_set_const<float>(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_set_const(Gr,Bl,mat,value,d); }
|
||||
template<> inline void cuda_add<float>(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_add(Gr,Bl,mat,value,d); }
|
||||
template<> inline void cuda_scale<float>(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_scale(Gr,Bl,mat,value,d); }
|
||||
template<> inline void cuda_apply_log<float>(dim3 Gr, dim3 Bl, float *mat, MatrixDim d) { cudaF_apply_log(Gr,Bl,mat,d); }
|
||||
template<> inline void cuda_mul_elements<float>(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim d) { cudaF_mul_elements(Gr,Bl,mat,A,d); }
|
||||
template<> inline void cuda_mul_cols_vec<float>(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_cols_vec(Gr,Bl,mat,scale,d); }
|
||||
template<> inline void cuda_mul_rows_vec<float>(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_rows_vec(Gr,Bl,mat,scale,d); }
|
||||
template<> inline void cuda_div_rows_vec<float>(dim3 Gr, dim3 Bl, float *mat, const float *vec_div, MatrixDim d) { cudaF_div_rows_vec(Gr,Bl,mat,vec_div,d); }
|
||||
template<> inline void cuda_add_mat<float>(dim3 Gr, dim3 Bl, float alpha, const float *A, float beta, float *dst, MatrixDim d) { cudaF_add_mat(Gr,Bl,alpha,A,beta,dst,d); }
|
||||
template<> inline void cuda_add_vec_to_cols<float>(dim3 Gr, dim3 Bl, float alpha, const float *col, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); }
|
||||
template<> inline void cuda_add_vec_to_rows<float>(dim3 Gr, dim3 Bl, float alpha, const float *row, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_rows(Gr,Bl,alpha,row,beta,dst,d); }
|
||||
|
||||
inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA) { cudaF_copy_upp_low(Gr, Bl, A, dimA); }
|
||||
inline void cuda_copy_low_upp(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA) { cudaF_copy_low_upp(Gr, Bl, A, dimA); }
|
||||
inline void cuda_add_diag_vec_mat(dim3 Gr, dim3 Bl, float alpha, float *mat, MatrixDim mat_dim,
|
||||
const float *vec, const float *mat2, int mat2_row_stride,
|
||||
int mat2_col_stride, float beta) {
|
||||
cudaF_add_diag_vec_mat(Gr, Bl, alpha, mat, mat_dim, vec, mat2,
|
||||
mat2_row_stride, mat2_col_stride, beta);
|
||||
}
|
||||
inline void cuda_copy_from_tp_trans(int Gr, int Bl, float* A, const float* B, MatrixDim dmat) { cudaF_copy_from_tp_trans(Gr,Bl,A,B,dmat); }
|
||||
inline void cuda_copy_from_tp_trans(int Gr, int Bl, float* A, const double* B, MatrixDim dmat) { cudaFD_copy_from_tp_trans(Gr,Bl,A,B,dmat); }
|
||||
inline void cuda_copy_from_tp(int Gr, int Bl, float* A, const float* B, MatrixDim dmat) { cudaF_copy_from_tp(Gr,Bl,A,B,dmat); }
|
||||
inline void cuda_copy_from_tp(int Gr, int Bl, float* A, const double* B, MatrixDim dmat) { cudaFD_copy_from_tp(Gr,Bl,A,B,dmat); }
|
||||
|
||||
inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl, float* mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) {
|
||||
cuda_copy_from_mat_fd(Gr, Bl, mat_out, mat_in, d_out, d_in);
|
||||
}
|
||||
inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) {
|
||||
cuda_copy_from_mat_ff(Gr, Bl, mat_out, mat_in, d_out, d_in);
|
||||
}
|
||||
inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl, double* mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) {
|
||||
cuda_copy_from_mat_dd(Gr, Bl, mat_out, mat_in, d_out, d_in);
|
||||
}
|
||||
inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) {
|
||||
cuda_copy_from_mat_df(Gr, Bl, mat_out, mat_in, d_out, d_in);
|
||||
}
|
||||
|
||||
inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl, float* mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) {
|
||||
cuda_copy_from_mat_fd_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
|
||||
}
|
||||
inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) {
|
||||
cuda_copy_from_mat_ff_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
|
||||
}
|
||||
inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl, double* mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) {
|
||||
cuda_copy_from_mat_dd_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
|
||||
}
|
||||
inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) {
|
||||
cuda_copy_from_mat_df_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
|
||||
}
|
||||
|
||||
inline void cuda_copy_col_from_vec(int Gr, int Bl, float* mat, const float* v, int col, MatrixDim d) { cudaF_copy_col_from_vec(Gr,Bl,mat,v,col,d); }
|
||||
inline void cuda_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_apply_exp(Gr,Bl,mat,d); }
|
||||
inline void cuda_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim dim) { cudaF_apply_pow(Gr,Bl,mat,power,dim); }
|
||||
inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim dim) { cudaF_apply_heaviside(Gr,Bl,mat,dim); }
|
||||
inline void cuda_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim dim) { cudaF_apply_floor(Gr,Bl,mat,floor_val,dim); }
|
||||
inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim dim) { cudaF_apply_ceiling(Gr,Bl,mat,ceiling_val,dim); }
|
||||
inline void cuda_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
|
||||
cudaF_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
|
||||
}
|
||||
inline void cuda_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
|
||||
cudaF_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
|
||||
}
|
||||
inline void cuda_trace(int Gr, int Bl, float* mat, float* value, int dim) { cudaF_trace(Gr,Bl,mat,value,dim); }
|
||||
inline void cuda_set_diag(int Gr, int Bl, float* mat, float value, MatrixDim d) { cudaF_set_diag(Gr,Bl,mat,value,d); }
|
||||
inline void cuda_set_diag_packed(int Gr, int Bl, float* mat, float value, int dim) { cudaF_set_diag_packed(Gr,Bl,mat,value,dim); }
|
||||
inline void cuda_add_diag_packed(int Gr, int Bl, float* mat, float value, int dim) { cudaF_add_diag_packed(Gr,Bl,mat,value,dim); }
|
||||
inline void cuda_set_const(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_set_const(Gr,Bl,mat,value,d); }
|
||||
inline void cuda_set_zero_above_diag(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_set_zero_above_diag(Gr,Bl,mat,d); }
|
||||
inline void cuda_add(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_add(Gr,Bl,mat,value,d); }
|
||||
inline void cuda_add_vec2(dim3 Gr, dim3 Bl, float *mat, const float *vec, const float alpha, int dim) { cudaF_add_vec2(Gr,Bl,mat,vec,alpha,dim); }
|
||||
inline void cuda_scale_diag(int Gr, int Bl, float* mat, float value, int dim) { cudaF_scale_diag(Gr,Bl,mat,value,dim); }
|
||||
inline void cuda_scale(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_scale(Gr,Bl,mat,value,d); }
|
||||
inline void cuda_apply_log(dim3 Gr, dim3 Bl, float *mat, MatrixDim d) { cudaF_apply_log(Gr,Bl,mat,d); }
|
||||
inline void cuda_mul_elements(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim dst_d, int src_stride) {
|
||||
cudaF_mul_elements(Gr,Bl,mat,A,dst_d,src_stride);
|
||||
}
|
||||
inline void cuda_max(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim dst_d, int src_stride) {
|
||||
cudaF_max(Gr,Bl,mat,A,dst_d,src_stride);
|
||||
}
|
||||
inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_cols_vec(Gr,Bl,mat,scale,d); }
|
||||
inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_rows_vec(Gr,Bl,mat,scale,d); }
|
||||
inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size) { cudaF_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size); }
|
||||
inline void cuda_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size, float power) {cudaF_calc_pnorm_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size, power); }
|
||||
inline void cuda_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *A, float beta, float *dst, MatrixDim d) { cudaF_add_mat(Gr,Bl,alpha,A,beta,dst,d); }
|
||||
inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, float alpha, const float *col, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); }
|
||||
inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, float alpha, const float *row, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_rows(Gr,Bl,alpha,row,beta,dst,d); }
|
||||
inline void cuda_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_transpose_matrix(Gr, Bl, mat, d); }
|
||||
inline void cuda_sy_add_tr2(dim3 Gr, dim3 Bl, float alpha, float beta, const float* T, MatrixDim tdim,
|
||||
float *S, MatrixDim sdim) {
|
||||
cudaF_sy_add_tr2(Gr, Bl, alpha, beta, T, tdim, S, sdim);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* CuVector
|
||||
*/
|
||||
template<> inline void cuda_add_row_sum_mat<float>(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d) { cudaF_add_row_sum_mat(Gr,Bl,mat,vec_sum,d); }
|
||||
template<> inline void cuda_add_col_sum_mat<float>(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d) { cudaF_add_col_sum_mat(Gr,Bl,mat,vec_sum,d); }
|
||||
template<> inline void cuda_invert_elements<float>(dim3 Gr, dim3 Bl, float *data, MatrixDim d) { cudaF_invert_elements(Gr,Bl,data,d); }
|
||||
inline void cuda_replace_value(int Gr, int Bl, float *v, int dim, float orig, float changed) {cudaF_replace_value(Gr, Bl, v, dim, orig, changed); }
|
||||
inline void cuda_div_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *vec_div, MatrixDim d) { cudaF_div_rows_vec(Gr,Bl,mat,vec_div,d); }
|
||||
inline void cuda_set_bias_params(int Gr, int Bl, float* v, const float* a, float param_1, float param_2, float param_3, int* flag, int dim) { cudaF_set_bias_params(Gr,Bl,v,a,param_1,param_2,param_3,flag,dim); }
|
||||
inline void cuda_copy_from_vec_df(int Gr, int Bl, double* v_out, const float* v_in, int dim) { cudaF_copy_from_vec_df(Gr,Bl,v_out,v_in,dim); }
|
||||
inline void cuda_copy_from_vec_fd(int Gr, int Bl, float* v_out, const float* v_in, int dim) { cudaF_copy_from_vec_fd(Gr,Bl,v_out,v_in,dim); }
|
||||
inline void cuda_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim) { cudaF_vec_mul_elements(Gr,Bl,v,a,dim); }
|
||||
inline void cuda_vec_soft_max(int Gr, int Bl, float* v, int dim) { cudaF_vec_soft_max(Gr,Bl,v,dim); }
|
||||
inline void cuda_vec_min(const float* v, float* value, int dim) { cudaF_vec_min(v,value,dim); }
|
||||
inline void cuda_vec_max(const float* v, float* value, int dim) { cudaF_vec_max(v,value,dim); }
|
||||
inline void cuda_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat_trans(A,B,dA,B_stride,value); }
|
||||
inline void cuda_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat(A,B,dA,B_stride,value); }
|
||||
inline void cuda_add_diag_mat_trans(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim) { cudaF_add_diag_mat_trans(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
|
||||
inline void cuda_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M,
|
||||
int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride,
|
||||
int N_col_stride, int threads_per_element, float beta) {
|
||||
cudaF_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride,
|
||||
N_col_stride, threads_per_element, beta);
|
||||
}
|
||||
inline void cuda_add_diag_mat(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim) { cudaF_add_diag_mat(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
|
||||
inline void cuda_add_vec_vec(int Gr, int Bl, float alpha, float* v, const float* x, const float* y, float beta, int dim) { cudaF_add_vec_vec(Gr,Bl,alpha,v,x,y,beta,dim); }
|
||||
inline void cuda_copy_col_from_mat(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat(Gr,Bl,v,col,mat,dmat,dim); }
|
||||
inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat_df(Gr,Bl,v,col,mat,dmat,dim); }
|
||||
inline void cuda_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat_fd(Gr,Bl,v,col,mat,dmat,dim); }
|
||||
inline void cuda_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc) { cudaF_vec_sum(Gr,Bl,v,value,dim,inc); }
|
||||
inline void cuda_pvec_sum(int Gr, int Bl, float* vec, float* pvec_sum, int dim, int size) { cudaF_pvec_sum(Gr, Bl, vec, pvec_sum, dim, size); }
|
||||
inline void cuda_vec_copy_diag_from_packed(int Gr, int Bl, float *dst, const float *src, int dim) { cudaF_vec_copy_diag_from_packed(Gr,Bl,dst,src,dim); }
|
||||
inline void cuda_vec_apply_floor(int Gr, int Bl, float* v, float floor_val, float* num, int dim) { cudaF_vec_apply_floor(Gr,Bl,v,floor_val,num,dim); }
|
||||
inline void cuda_vec_apply_exp(int Gr, int Bl, float* v, int dim) { cudaF_vec_apply_exp(Gr,Bl,v,dim); }
|
||||
inline void cuda_vec_apply_log(int Gr, int Bl, float* v, float* flag, int dim) { cudaF_vec_apply_log(Gr,Bl,v,flag,dim); }
|
||||
inline void cuda_add_row_sum_mat(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d) { cudaF_add_row_sum_mat(Gr,Bl,mat,vec_sum,d); }
|
||||
inline void cuda_add_col_sum_mat(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d) { cudaF_add_col_sum_mat(Gr,Bl,mat,vec_sum,d); }
|
||||
inline void cuda_invert_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim d) { cudaF_invert_elements(Gr,Bl,data,d); }
|
||||
// B_trans nonzero if B transposed.
|
||||
inline void cuda_add_mat_blockmat(dim3 Gr, dim3 Bl, float *data, MatrixDim d, const float *Adata,
|
||||
int A_num_rows, int A_num_cols, int A_row_stride, int A_col_stride,
|
||||
const CuBlockMatrixData *B_cu_data, int B_num_blocks,
|
||||
float alpha, float beta, int B_trans) {
|
||||
cudaF_add_mat_blockmat(Gr, Bl, data, d, Adata, A_num_rows, A_num_cols, A_row_stride, A_col_stride,
|
||||
B_cu_data, B_num_blocks, alpha, beta, B_trans);
|
||||
}
|
||||
inline void cuda_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int num_blocks,
|
||||
const float *C_data, int C_num_cols, int C_row_stride, int C_col_stride,
|
||||
const float *D_data, int D_row_stride, int D_col_stride,
|
||||
float alpha, float beta) {
|
||||
cudaF_block_add_mat_mat(Gr, Bl, B_cu_data, num_blocks, C_data, C_num_cols,
|
||||
C_row_stride, C_col_stride, D_data, D_row_stride,
|
||||
D_col_stride, alpha, beta);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* cu::
|
||||
*/
|
||||
template<> inline void cuda_sigmoid<float>(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d) { cudaF_sigmoid(Gr,Bl,y,x,d); }
|
||||
template<> inline void cuda_diff_sigmoid<float>(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d) { cudaF_diff_sigmoid(Gr,Bl,eout,e,y,d); }
|
||||
template<> inline void cuda_tanh<float>(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d) { cudaF_tanh(Gr,Bl,y,x,d); }
|
||||
template<> inline void cuda_diff_tanh<float>(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d) { cudaF_diff_tanh(Gr,Bl,eout,e,y,d); }
|
||||
template<> inline void cuda_softmax<float>(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d) { cudaF_softmax(Gr,Bl,y,x,d); }
|
||||
template<> inline void cuda_softmax_part<float>(dim3 Gr, dim3 Bl, const float *X, const int32_cuda *vec_ids, float* Y, MatrixDim d) { cudaF_softmax_part(Gr,Bl,X,vec_ids,Y,d); }
|
||||
inline void cuda_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_soft_hinge(Gr,Bl,y,x,d,src_stride); }
|
||||
inline void cuda_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power) { cudaF_group_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power);}
|
||||
inline void cuda_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_sigmoid(Gr,Bl,y,x,d,src_stride); }
|
||||
inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int src_stride) { cudaF_diff_sigmoid(Gr,Bl,eout,e,y,d,src_stride); }
|
||||
inline void cuda_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_tanh(Gr,Bl,y,x,d,src_stride); }
|
||||
inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d) { cudaF_diff_tanh(Gr,Bl,eout,e,y,d); }
|
||||
inline void cuda_softmax(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d) { cudaF_softmax(Gr,Bl,y,x,d); }
|
||||
/*
|
||||
Bl: the dimBlock value is fixed at min(d.col, CU1DBLOCK), i.e. up to CU1DBLOCK threads cooperate to reduce one row at a time.
|
||||
Gr: the number of rows
|
||||
*/
|
||||
inline void cuda_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_softmax_reduce(Gr,Bl,y,x,d,src_stride); }
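// Call-shape sketch (illustrative only; CU1DBLOCK and the y_data/x_data
// pointers are assumed from the surrounding code, not defined here):
//
//   size_t Gr = d.rows;                                   // one block per row
//   size_t Bl = std::min(d.cols, (int32_cuda)CU1DBLOCK);  // threads reducing a row
//   cuda_softmax_reduce(Gr, Bl, y_data, x_data, d, src_stride);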
|
||||
inline void cuda_softmax_part(dim3 Gr, dim3 Bl, const float *X, const int32_cuda *vec_ids, float* Y, MatrixDim d) { cudaF_softmax_part(Gr,Bl,X,vec_ids,Y,d); }
|
||||
|
||||
template<> inline void cuda_regularize_l1<float>(dim3 Gr, dim3 Bl, float *wei, float *grad, float l1, float lr, MatrixDim d) { cudaF_regularize_l1(Gr,Bl,wei,grad,l1,lr,d); }
|
||||
template<> inline void cuda_find_row_max_id<float>(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaF_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); }
|
||||
template<> inline void cuda_diff_xent<float>(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, float *mat_net_out, float *vec_log_post, MatrixDim d) { cudaF_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d); }
|
||||
|
||||
template<> inline void cuda_randomize<float>(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaF_randomize(Gr,Bl,y,x,copy_from,d_out,d_in); }
|
||||
|
||||
template<> inline void cuda_splice<float>(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { cudaF_splice(Gr,Bl,y,x,off,d_out,d_in); }
|
||||
template<> inline void cuda_copy<float>(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaF_copy(Gr,Bl,y,x,copy_from,d_out,d_in); }
|
||||
inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, float *wei, float *grad, float l1, float lr, MatrixDim d) { cudaF_regularize_l1(Gr,Bl,wei,grad,l1,lr,d); }
|
||||
inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaF_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); }
|
||||
inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, float *mat_net_out, float *vec_log_post, MatrixDim d) { cudaF_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d); }
|
||||
inline void cuda_copy_rows_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, const float *v_in) {
|
||||
cudaF_copy_rows_from_vec(Gr, Bl, mat_out, d_out, v_in);
|
||||
}
|
||||
|
||||
|
||||
/*********************************************************
|
||||
* double specializations
|
||||
*/
|
||||
inline void cuda_randomize(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaF_randomize(Gr,Bl,y,x,copy_from,d_out,d_in); }
|
||||
|
||||
inline void cuda_splice(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { cudaF_splice(Gr,Bl,y,x,off,d_out,d_in); }
|
||||
inline void cuda_one(int Gr,int Bl,float* x,int dim) { cudaF_one(Gr,Bl,x,dim); }
|
||||
inline void cuda_copy(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaF_copy(Gr,Bl,y,x,copy_from,d_out,d_in); }
|
||||
inline void cuda_copy_from_sp(int Gr, int Bl, const float* x, float* y, int d_in, MatrixDim d_out) { cudaF_copy_from_sp(Gr,Bl,x,y,d_in,d_out); }
|
||||
inline void cuda_take_lower(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in) { cudaF_take_lower(Gr,Bl,x,y,d_in); }
|
||||
inline void cuda_take_upper(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in) { cudaF_take_upper(Gr,Bl,x,y,d_in); }
|
||||
inline void cuda_take_mean(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in) { cudaF_take_mean(Gr,Bl,x,y,d_in); }
|
||||
inline void cuda_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement<float>* x, int32 size, const float* z, MatrixDim d, float* z2, MatrixDim d2, float* t) {cudaF_comp_obj_deriv(Gr,Bl,x,size,z,d,z2,d2,t); }
|
||||
inline void cuda_sum_column_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim,
|
||||
const float *src_data, MatrixDim src_dim,
|
||||
const Int32Pair *indices) {
|
||||
cudaF_sum_column_ranges(Gr, Bl, data, dim, src_data, src_dim, indices);
|
||||
}
|
||||
inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl, const float *data,
|
||||
MatrixDim dim, const Int32Pair *indices,
|
||||
int indices_size, float *output) {
|
||||
cudaF_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output);
|
||||
}
|
||||
|
||||
|
||||
// double versions
|
||||
|
||||
/*
|
||||
* CuMatrix
|
||||
*/
|
||||
template<> inline void cuda_set_const<double>(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_set_const(Gr,Bl,mat,value,d); }
|
||||
template<> inline void cuda_add<double>(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_add(Gr,Bl,mat,value,d); }
|
||||
template<> inline void cuda_scale<double>(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_scale(Gr,Bl,mat,value,d); }
|
||||
template<> inline void cuda_apply_log<double>(dim3 Gr, dim3 Bl, double *mat, MatrixDim d) { cudaD_apply_log(Gr,Bl,mat,d); }
|
||||
template<> inline void cuda_mul_elements<double>(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim d) { cudaD_mul_elements(Gr,Bl,mat,A,d); }
|
||||
template<> inline void cuda_mul_cols_vec<double>(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d) { cudaD_mul_cols_vec(Gr,Bl,mat,scale,d); }
|
||||
template<> inline void cuda_mul_rows_vec<double>(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d) { cudaD_mul_rows_vec(Gr,Bl,mat,scale,d); }
|
||||
template<> inline void cuda_div_rows_vec<double>(dim3 Gr, dim3 Bl, double *mat, const double *vec_div, MatrixDim d) { cudaD_div_rows_vec(Gr,Bl,mat,vec_div,d); }
|
||||
template<> inline void cuda_add_mat<double>(dim3 Gr, dim3 Bl, double alpha, const double *A, double beta, double *dst, MatrixDim d) { cudaD_add_mat(Gr,Bl,alpha,A,beta,dst,d); }
|
||||
template<> inline void cuda_add_vec_to_cols<double>(dim3 Gr, dim3 Bl, double alpha, const double *col, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); }
|
||||
template<> inline void cuda_add_vec_to_rows<double>(dim3 Gr, dim3 Bl, double alpha, const double *row, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_rows(Gr,Bl,alpha,row,beta,dst,d); }
|
||||
|
||||
inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA) { cudaD_copy_upp_low(Gr, Bl, A, dimA); }
|
||||
inline void cuda_copy_low_upp(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA) { cudaD_copy_low_upp(Gr, Bl, A, dimA); }
|
||||
inline void cuda_add_diag_vec_mat(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim,
|
||||
const double *vec, const double *mat2, int mat2_row_stride,
|
||||
int mat2_col_stride, double beta) {
|
||||
cudaD_add_diag_vec_mat(Gr, Bl, alpha, mat, mat_dim, vec, mat2,
|
||||
mat2_row_stride, mat2_col_stride, beta);
|
||||
}
|
||||
inline void cuda_copy_from_tp_trans(int Gr, int Bl, double* A, const double* B, MatrixDim dmat) { cudaD_copy_from_tp_trans(Gr,Bl,A,B,dmat); }
|
||||
inline void cuda_copy_from_tp_trans(int Gr, int Bl, double* A, const float* B, MatrixDim dmat) { cudaDF_copy_from_tp_trans(Gr,Bl,A,B,dmat); }
|
||||
inline void cuda_copy_from_tp(int Gr, int Bl, double* A, const double* B, MatrixDim dmat) { cudaD_copy_from_tp(Gr,Bl,A,B,dmat); }
|
||||
inline void cuda_copy_from_tp(int Gr, int Bl, double* A, const float* B, MatrixDim dmat) { cudaDF_copy_from_tp(Gr,Bl,A,B,dmat); }
|
||||
inline void cuda_copy_col_from_vec(int Gr, int Bl, double* mat, const double* v, int col, MatrixDim d) { cudaD_copy_col_from_vec(Gr,Bl,mat,v,col,d); }
|
||||
inline void cuda_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { cudaD_apply_exp(Gr,Bl,mat,d); }
|
||||
inline void cuda_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim dim) { cudaD_apply_pow(Gr,Bl,mat,power,dim); }
|
||||
inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim dim) { cudaD_apply_heaviside(Gr,Bl,mat,dim); }
|
||||
inline void cuda_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim dim) { cudaD_apply_floor(Gr,Bl,mat,floor_val,dim); }
|
||||
inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val, MatrixDim dim) { cudaD_apply_ceiling(Gr,Bl,mat,ceiling_val,dim); }
|
||||
inline void cuda_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
|
||||
cudaD_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
|
||||
}
|
||||
inline void cuda_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
|
||||
cudaD_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
|
||||
}
|
||||
inline void cuda_trace(int Gr, int Bl, double* mat, double* value, int dim) { cudaD_trace(Gr,Bl,mat,value,dim); }
|
||||
inline void cuda_set_diag(int Gr, int Bl, double* mat, double value, MatrixDim d) { cudaD_set_diag(Gr,Bl,mat,value,d); }
|
||||
inline void cuda_set_diag_packed(int Gr, int Bl, double* mat, double value, int dim) { cudaD_set_diag_packed(Gr,Bl,mat,value,dim); }
|
||||
inline void cuda_add_diag_packed(int Gr, int Bl, double* mat, double value, int dim) { cudaD_add_diag_packed(Gr,Bl,mat,value,dim); }
|
||||
inline void cuda_set_const(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_set_const(Gr,Bl,mat,value,d); }
|
||||
inline void cuda_set_zero_above_diag(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { cudaD_set_zero_above_diag(Gr,Bl,mat,d); }
|
||||
inline void cuda_add(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_add(Gr,Bl,mat,value,d); }
|
||||
inline void cuda_add_vec2(dim3 Gr, dim3 Bl, double *mat, const double *vec, const double alpha, int dim) { cudaD_add_vec2(Gr,Bl,mat,vec,alpha,dim); }
|
||||
inline void cuda_scale_diag(int Gr, int Bl, double* mat, double value, int dim) { cudaD_scale_diag(Gr,Bl,mat,value,dim); }
|
||||
inline void cuda_scale(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_scale(Gr,Bl,mat,value,d); }
|
||||
inline void cuda_apply_log(dim3 Gr, dim3 Bl, double *mat, MatrixDim d) { cudaD_apply_log(Gr,Bl,mat,d); }
|
||||
inline void cuda_mul_elements(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride) {
|
||||
cudaD_mul_elements(Gr,Bl,mat,A,dst_d,src_stride);
|
||||
}
|
||||
inline void cuda_max(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride) {
|
||||
cudaD_max(Gr,Bl,mat,A,dst_d,src_stride);
|
||||
}
|
||||
inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d) { cudaD_mul_cols_vec(Gr,Bl,mat,scale,d); }
|
||||
inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d) { cudaD_mul_rows_vec(Gr,Bl,mat,scale,d); }
|
||||
inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size) { cudaD_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size); }
|
||||
inline void cuda_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size, double power) {cudaD_calc_pnorm_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size, power); }
|
||||
inline void cuda_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *A, double beta, double *dst, MatrixDim d) { cudaD_add_mat(Gr,Bl,alpha,A,beta,dst,d); }
|
||||
inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, double alpha, const double *col, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); }
|
||||
inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, double alpha, const double *row, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_rows(Gr,Bl,alpha,row,beta,dst,d); }
|
||||
inline void cuda_transpose_matrix(dim3 Gr, dim3 Bl, double *mat, MatrixDim d) { cudaD_transpose_matrix(Gr, Bl, mat, d); }
|
||||
inline void cuda_sy_add_tr2(dim3 Gr, dim3 Bl, double alpha, double beta, const double* T, MatrixDim tdim,
|
||||
double *S, MatrixDim sdim) {
|
||||
cudaD_sy_add_tr2(Gr, Bl, alpha, beta, T, tdim, S, sdim);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* CuVector
|
||||
*/
|
||||
template<> inline void cuda_add_row_sum_mat<double>(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d) { cudaD_add_row_sum_mat(Gr,Bl,mat,vec_sum,d); }
|
||||
template<> inline void cuda_add_col_sum_mat<double>(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d) { cudaD_add_col_sum_mat(Gr,Bl,mat,vec_sum,d); }
|
||||
template<> inline void cuda_invert_elements<double>(dim3 Gr, dim3 Bl, double *data, MatrixDim d) { cudaD_invert_elements(Gr,Bl,data,d); }
|
||||
inline void cuda_replace_value(int Gr, int Bl, double *v, int dim, double orig, double changed) {cudaD_replace_value(Gr, Bl, v, dim, orig, changed); }
|
||||
inline void cuda_div_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *vec_div, MatrixDim d) { cudaD_div_rows_vec(Gr,Bl,mat,vec_div,d); }
|
||||
inline void cuda_set_bias_params(int Gr, int Bl, double* v, const double* a, double param_1, double param_2, double param_3, int* flag, int dim) { cudaD_set_bias_params(Gr,Bl,v,a,param_1,param_2,param_3,flag,dim); }
|
||||
inline void cuda_copy_from_vec_df(int Gr, int Bl, double* v_out, const double* v_in, int dim) { cudaD_copy_from_vec_df(Gr,Bl,v_out,v_in,dim); }
|
||||
inline void cuda_copy_from_vec_fd(int Gr, int Bl, float* v_out, const double* v_in, int dim) { cudaD_copy_from_vec_fd(Gr,Bl,v_out,v_in,dim); }
|
||||
inline void cuda_vec_mul_elements(int Gr, int Bl, double* v, const double* a, int dim) { cudaD_vec_mul_elements(Gr,Bl,v,a,dim); }
|
||||
inline void cuda_vec_soft_max(int Gr, int Bl, double* v, int dim) { cudaD_vec_soft_max(Gr,Bl,v,dim); }
|
||||
inline void cuda_vec_min(const double* v, double* value, int dim) { cudaD_vec_min(v,value,dim); }
|
||||
inline void cuda_vec_max(const double* v, double* value, int dim) { cudaD_vec_max(v,value,dim); }
|
||||
inline void cuda_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat_trans(A,B,dA,B_stride,value); }
|
||||
inline void cuda_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat(A,B,dA,B_stride,value); }
|
||||
inline void cuda_add_diag_mat_trans(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) { cudaD_add_diag_mat_trans(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
|
||||
inline void cuda_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M,
|
||||
int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride,
|
||||
int N_col_stride, int threads_per_element, double beta) {
|
||||
cudaD_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride,
|
||||
N_col_stride, threads_per_element, beta);
|
||||
}
|
||||
inline void cuda_add_diag_mat(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) { cudaD_add_diag_mat(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
|
||||
inline void cuda_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, const double* y, double beta, int dim) { cudaD_add_vec_vec(Gr,Bl,alpha,v,x,y,beta,dim); }
|
||||
inline void cuda_copy_col_from_mat(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat(Gr,Bl,v,col,mat,dmat,dim); }
|
||||
inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat_df(Gr,Bl,v,col,mat,dmat,dim); }
|
||||
inline void cuda_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat_fd(Gr,Bl,v,col,mat,dmat,dim); }
|
||||
inline void cuda_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc) { cudaD_vec_sum(Gr,Bl,v,value,dim,inc); }
|
||||
inline void cuda_pvec_sum(int Gr, int Bl, double* vec, double* pvec_sum, int dim, int size) { cudaD_pvec_sum(Gr,Bl,vec,pvec_sum,dim,size); }
|
||||
inline void cuda_vec_copy_diag_from_packed(int Gr, int Bl, double *dst, const double *src, int dim) { cudaD_vec_copy_diag_from_packed(Gr,Bl,dst,src,dim); }
|
||||
inline void cuda_vec_apply_floor(int Gr, int Bl, double* v, double floor_val, float* num, int dim) { cudaD_vec_apply_floor(Gr,Bl,v,floor_val,num,dim); }
|
||||
inline void cuda_vec_apply_exp(int Gr, int Bl, double* v, int dim) { cudaD_vec_apply_exp(Gr,Bl,v,dim); }
|
||||
inline void cuda_vec_apply_log(int Gr, int Bl, double* v, double* flag, int dim) { cudaD_vec_apply_log(Gr,Bl,v,flag,dim); }
|
||||
inline void cuda_add_row_sum_mat(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d) { cudaD_add_row_sum_mat(Gr,Bl,mat,vec_sum,d); }
|
||||
inline void cuda_add_col_sum_mat(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d) { cudaD_add_col_sum_mat(Gr,Bl,mat,vec_sum,d); }
|
||||
inline void cuda_invert_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim d) { cudaD_invert_elements(Gr,Bl,data,d); }
|
||||
// B_trans nonzero if B transposed.
|
||||
inline void cuda_add_mat_blockmat(dim3 Gr, dim3 Bl, double *data, MatrixDim d, const double *Adata,
|
||||
int A_num_rows, int A_num_cols, int A_row_stride, int A_col_stride,
|
||||
const CuBlockMatrixData *B_cu_data, int B_num_blocks,
|
||||
double alpha, double beta, int B_trans) {
|
||||
cudaD_add_mat_blockmat(Gr, Bl, data, d, Adata, A_num_rows, A_num_cols, A_row_stride, A_col_stride,
|
||||
B_cu_data, B_num_blocks, alpha, beta, B_trans);
|
||||
}
|
||||
inline void cuda_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int num_blocks,
|
||||
const double *C_data, int C_num_cols, int C_row_stride, int C_col_stride,
|
||||
const double *D_data, int D_row_stride, int D_col_stride,
|
||||
double alpha, double beta) {
|
||||
cudaD_block_add_mat_mat(Gr, Bl, B_cu_data, num_blocks, C_data, C_num_cols,
|
||||
C_row_stride, C_col_stride, D_data, D_row_stride,
|
||||
D_col_stride, alpha, beta);
|
||||
}
|
||||
|
||||
/*
|
||||
* cu::
|
||||
*/
|
||||
template<> inline void cuda_sigmoid<double>(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d) { cudaD_sigmoid(Gr,Bl,y,x,d); }
|
||||
template<> inline void cuda_diff_sigmoid<double>(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d) { cudaD_diff_sigmoid(Gr,Bl,eout,e,y,d); }
|
||||
template<> inline void cuda_tanh<double>(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d) { cudaD_tanh(Gr,Bl,y,x,d); }
|
||||
template<> inline void cuda_diff_tanh<double>(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d) { cudaD_diff_tanh(Gr,Bl,eout,e,y,d); }
|
||||
template<> inline void cuda_softmax<double>(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d) { cudaD_softmax(Gr,Bl,y,x,d); }
|
||||
template<> inline void cuda_softmax_part<double>(dim3 Gr, dim3 Bl, const double *X, const int32_cuda *vec_ids, double* Y, MatrixDim d) { cudaD_softmax_part(Gr,Bl,X,vec_ids,Y,d); }
|
||||
inline void cuda_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_soft_hinge(Gr,Bl,y,x,d,src_stride); }
|
||||
inline void cuda_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power) { cudaD_group_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power); }
|
||||
inline void cuda_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_sigmoid(Gr,Bl,y,x,d,src_stride); }
|
||||
inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int src_stride) { cudaD_diff_sigmoid(Gr,Bl,eout,e,y,d,src_stride); }
|
||||
inline void cuda_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_tanh(Gr,Bl,y,x,d,src_stride); }
|
||||
inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d) { cudaD_diff_tanh(Gr,Bl,eout,e,y,d); }
|
||||
inline void cuda_softmax(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d) { cudaD_softmax(Gr,Bl,y,x,d); }
|
||||
inline void cuda_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_softmax_reduce(Gr,Bl,y,x,d,src_stride); }
|
||||
inline void cuda_softmax_part(dim3 Gr, dim3 Bl, const double *X, const int32_cuda *vec_ids, double* Y, MatrixDim d) { cudaD_softmax_part(Gr,Bl,X,vec_ids,Y,d); }
|
||||
|
||||
template<> inline void cuda_regularize_l1<double>(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1, double lr, MatrixDim d) { cudaD_regularize_l1(Gr,Bl,wei,grad,l1,lr,d); }
|
||||
template<> inline void cuda_find_row_max_id<double>(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaD_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); }
|
||||
template<> inline void cuda_diff_xent<double>(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, double *mat_net_out, double *vec_log_post, MatrixDim d) { cudaD_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d); }
|
||||
inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1, double lr, MatrixDim d) { cudaD_regularize_l1(Gr,Bl,wei,grad,l1,lr,d); }
|
||||
inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaD_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); }
|
||||
inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, double *mat_net_out, double *vec_log_post, MatrixDim d) {
|
||||
cudaD_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d);
|
||||
}
|
||||
inline void cuda_copy_rows_from_vec(dim3 Gr, dim3 Bl, double *mat_out, MatrixDim d_out, const double *v_in) {
|
||||
cudaD_copy_rows_from_vec(Gr, Bl, mat_out, d_out, v_in);
|
||||
}
|
||||
|
||||
template<> inline void cuda_randomize<double>(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaD_randomize(Gr,Bl,y,x,copy_from,d_out,d_in); }
|
||||
template<> inline void cuda_splice<double>(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { cudaD_splice(Gr,Bl,y,x,off,d_out,d_in); }
|
||||
template<> inline void cuda_copy<double>(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaD_copy(Gr,Bl,y,x,copy_from,d_out,d_in); }
|
||||
inline void cuda_randomize(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaD_randomize(Gr,Bl,y,x,copy_from,d_out,d_in); }
|
||||
inline void cuda_splice(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { cudaD_splice(Gr,Bl,y,x,off,d_out,d_in); }
|
||||
inline void cuda_one(int Gr,int Bl,double* x,int dim) { cudaD_one(Gr,Bl,x,dim); }
|
||||
inline void cuda_copy(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaD_copy(Gr,Bl,y,x,copy_from,d_out,d_in); }
|
||||
inline void cuda_copy_from_sp(int Gr, int Bl, const double* x, double* y, int d_in, MatrixDim d_out) { cudaD_copy_from_sp(Gr,Bl,x,y,d_in,d_out); }
|
||||
inline void cuda_take_lower(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in) { cudaD_take_lower(Gr,Bl,x,y,d_in); }
|
||||
inline void cuda_take_upper(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in) { cudaD_take_upper(Gr,Bl,x,y,d_in); }
|
||||
inline void cuda_take_mean(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in) { cudaD_take_mean(Gr,Bl,x,y,d_in); }
|
||||
inline void cuda_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement<double>* x, int32 size, const double* z, MatrixDim d, double* z2, MatrixDim d2, double* t) {cudaD_comp_obj_deriv(Gr,Bl,x,size,z,d,z2,d2,t); }
|
||||
inline void cuda_sum_column_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim,
|
||||
const double *src_data, MatrixDim src_dim, const Int32Pair *indices) {
|
||||
cudaD_sum_column_ranges(Gr, Bl, data, dim, src_data, src_dim, indices);
|
||||
}
|
||||
inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl, const double *data,
|
||||
MatrixDim dim, const Int32Pair *indices,
|
||||
int indices_size, double *output) {
|
||||
cudaD_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Also include some template-friendly wrappers of cublas functions:
|
||||
inline void cuda_axpy(int n, float alpha, const float *x, int incx, float *y, int incy) {
|
||||
cublasSaxpy(n, alpha, x, incx, y, incy);
|
||||
}
|
||||
inline void cuda_axpy(int n, double alpha, const double *x, int incx, double *y, int incy) {
|
||||
cublasDaxpy(n, alpha, x, incx, y, incy);
|
||||
}
|
||||
inline void cuda_scal(int n, float alpha, float *x, int incx) {
|
||||
cublasSscal(n, alpha, x, incx);
|
||||
}
|
||||
inline void cuda_scal(int n, double alpha, double *x, int incx) {
|
||||
cublasDscal(n, alpha, x, incx);
|
||||
}
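// A minimal sketch of why these overloads exist: templated host code can call
// cuda_axpy/cuda_scal with either precision and ordinary overload resolution
// picks the matching cuBLAS routine (AxpyExample below is a hypothetical
// helper, not a Kaldi function):
//
//   template<typename Real>
//   void AxpyExample(int n, Real alpha, const Real *x, Real *y) {
//     cuda_axpy(n, alpha, x, 1, y, 1);  // cublasSaxpy for float, cublasDaxpy for double
//   }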
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
|
||||
#endif // HAVE_CUDA
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,181 @@
|
|||
// cudamatrix/cuda-math-test.cc
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (Author: David Snyder)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "cudamatrix/cu-matrix-lib.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
#include "cudamatrix/cu-array.h"
|
||||
|
||||
using namespace kaldi;
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
/*
|
||||
* Unit tests
|
||||
*/
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuMathRandomize() {
|
||||
int32 M = 100 + rand() % 200, N = 100 + rand() % 200;
|
||||
CuMatrix<Real> src(M, N);
|
||||
CuMatrix<Real> tgt(M, N);
|
||||
CuArray<int32> copy_from_idx;
|
||||
|
||||
src.SetRandn();
|
||||
int32 n_rows = src.NumRows();
|
||||
int32 n_columns = src.NumCols();
|
||||
std::vector<int32> copy_from_idx_vec;
|
||||
|
||||
for (int32 i = 0; i < n_rows; i++) {
|
||||
copy_from_idx_vec.push_back(rand() % n_rows);
|
||||
}
|
||||
copy_from_idx.CopyFromVec(copy_from_idx_vec);
|
||||
cu::Randomize(src, copy_from_idx, &tgt);
|
||||
|
||||
for (int32 i = 0; i < n_rows; i++) {
|
||||
for (int32 j = 0; j < n_columns; j++) {
|
||||
Real src_val = src(copy_from_idx_vec.at(i), j);
|
||||
Real tgt_val = tgt(i, j);
|
||||
AssertEqual(src_val, tgt_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuMathCopy() {
|
||||
int32 M = 100 + rand() % 200, N = 100 + rand() % 200;
|
||||
CuMatrix<Real> src(M, N);
|
||||
CuMatrix<Real> tgt(M, N);
|
||||
CuArray<int32> copy_from_idx;
|
||||
|
||||
src.SetRandn();
|
||||
int32 n_rows = src.NumRows();
|
||||
int32 n_columns = src.NumCols();
|
||||
std::vector<int32> copy_from_idx_vec;
|
||||
|
||||
for (int32 i = 0; i < n_columns; i++) {
|
||||
copy_from_idx_vec.push_back(rand() % n_columns);
|
||||
}
|
||||
copy_from_idx.CopyFromVec(copy_from_idx_vec);
|
||||
cu::Copy(src, copy_from_idx, &tgt);
|
||||
|
||||
for (int32 i = 0; i < n_rows; i++) {
|
||||
for (int32 j = 0; j < n_columns; j++) {
|
||||
Real src_val = src(i, copy_from_idx_vec.at(j));
|
||||
Real tgt_val = tgt(i, j);
|
||||
AssertEqual(src_val, tgt_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuMathSplice() {
|
||||
int32 M = 100 + rand() % 200, N = 100 + rand() % 200;
|
||||
CuMatrix<Real> src(M, N);
|
||||
CuArray<int32> frame_offsets;
|
||||
|
||||
src.SetRandn();
|
||||
int32 n_rows = src.NumRows();
|
||||
int32 n_columns = src.NumCols();
|
||||
std::vector<int32> frame_offsets_vec;
|
||||
|
||||
// The number of columns of tgt is rows(src)
|
||||
// times n_frame_offsets, so we keep n_frame_offsets
|
||||
// reasonably small (2 <= n <= 8).
|
||||
int32 n_frame_offsets = rand() % 7 + 2;
|
||||
for (int32 i = 0; i < n_frame_offsets; i++) {
|
||||
frame_offsets_vec.push_back(rand() % 2 * n_columns - n_columns);
|
||||
}
|
||||
|
||||
CuMatrix<Real> tgt(M, N * n_frame_offsets);
|
||||
frame_offsets.CopyFromVec(frame_offsets_vec);
|
||||
cu::Splice(src, frame_offsets, &tgt);
|
||||
|
||||
for (int32 i = 0; i < n_rows; i++) {
|
||||
for (int32 k = 0; k < n_frame_offsets; k++) {
|
||||
for (int32 j = 0; j < n_columns; j++) {
|
||||
Real src_val;
|
||||
if (i + frame_offsets_vec.at(k) >= n_rows) {
|
||||
src_val = src(n_rows-1, j);
|
||||
} else if (i + frame_offsets_vec.at(k) <= 0) {
|
||||
src_val = src(0, j);
|
||||
} else {
|
||||
src_val = src(i + frame_offsets_vec.at(k), j);
|
||||
}
|
||||
Real tgt_val = tgt(i, k * n_columns + j);
|
||||
AssertEqual(src_val, tgt_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> void CudaMathUnitTest() {
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().DoublePrecisionSupported())
|
||||
#endif
|
||||
UnitTestCuMathRandomize<Real>();
|
||||
UnitTestCuMathSplice<Real>();
|
||||
UnitTestCuMathCopy<Real>();
|
||||
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
int main() {
|
||||
for (int32 loop = 0; loop < 2; loop++) {
|
||||
#if HAVE_CUDA == 1
|
||||
if (loop == 0)
|
||||
CuDevice::Instantiate().SelectGpuId("no"); // disable GPU use
|
||||
else
|
||||
CuDevice::Instantiate().SelectGpuId("yes"); // automatic GPU selection
|
||||
#endif
|
||||
srand(time(NULL));
|
||||
kaldi::CudaMathUnitTest<float>();
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().DoublePrecisionSupported()) {
|
||||
kaldi::CudaMathUnitTest<double>();
|
||||
} else {
|
||||
KALDI_WARN << "Double precision not supported";
|
||||
}
|
||||
#else
|
||||
kaldi::CudaMathUnitTest<float>();
|
||||
#endif
|
||||
|
||||
if (loop == 0)
|
||||
KALDI_LOG << "Tests without GPU use succeeded.\n";
|
||||
else
|
||||
KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
|
||||
}
|
||||
#if HAVE_CUDA == 1
|
||||
CuDevice::Instantiate().PrintProfile();
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -36,15 +36,15 @@ namespace cu {
|
|||
template<typename Real>
|
||||
void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1, Real lr) {
|
||||
KALDI_ASSERT(SameDim(*weight, *grad));
|
||||
#if HAVE_CUDA==1
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
dim3 dimBlock(CUBLOCK, CUBLOCK);
|
||||
dim3 dimGrid(n_blocks(weight->NumCols(), CUBLOCK), n_blocks(weight->NumRows(), CUBLOCK));
|
||||
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
|
||||
dim3 dimGrid(n_blocks(weight->NumCols(), CU2DBLOCK), n_blocks(weight->NumRows(), CU2DBLOCK));
|
||||
|
||||
cuda_regularize_l1(dimGrid, dimBlock, weight->data_, grad->data_, l1, lr, weight->Dim());
|
||||
cuSafeCall(cudaGetLastError());
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
|
@ -77,21 +77,21 @@ void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1,
|
|||
|
||||
template<typename Real>
|
||||
void Randomize(const CuMatrixBase<Real> &src,
|
||||
const CuStlVector<int32> &copy_from_idx,
|
||||
const CuArray<int32> &copy_from_idx,
|
||||
CuMatrixBase<Real> *tgt) {
|
||||
|
||||
KALDI_ASSERT(src.NumCols() == tgt->NumCols());
|
||||
KALDI_ASSERT(src.NumRows() == tgt->NumRows());
|
||||
KALDI_ASSERT(copy_from_idx.Dim() <= tgt->NumRows());
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
/*
|
||||
Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535
|
||||
dim3 dimBlock(CUBLOCK, CUBLOCK);
|
||||
dim3 dimGrid(n_blocks(tgt->NumCols(), CUBLOCK), n_blocks(copy_from_idx.Dim(), CUBLOCK));
|
||||
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
|
||||
dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK));
|
||||
*/
|
||||
|
||||
/*
|
||||
|
@ -108,7 +108,7 @@ void Randomize(const CuMatrixBase<Real> &src,
|
|||
MatrixDim dimtgt = tgt->Dim(); dimtgt.rows=copy_from_idx.Dim();
|
||||
|
||||
cuda_randomize(dimGrid, dimBlock, tgt->data_, src.data_, copy_from_idx.Data(), dimtgt, dimsrc);
|
||||
cuSafeCall(cudaGetLastError());
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
|
@ -116,7 +116,7 @@ void Randomize(const CuMatrixBase<Real> &src,
|
|||
{
|
||||
// randomize in CPU
|
||||
const MatrixBase<Real> &srcmat = src.Mat();
|
||||
const std::vector<int32> &copy_from_idxvec = copy_from_idx.Vec();
|
||||
const int32 *copy_from_idxvec = copy_from_idx.Data();
|
||||
MatrixBase<Real> &tgtmat = tgt->Mat();
|
||||
for(int32 i=0; i<copy_from_idx.Dim(); i++) {
|
||||
tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i]));
|
||||
|
@ -127,20 +127,20 @@ void Randomize(const CuMatrixBase<Real> &src,
|
|||
|
||||
|
||||
template<typename Real>
|
||||
void Splice(const CuMatrix<Real> &src, const CuStlVector<int32> &frame_offsets, CuMatrix<Real> *tgt) {
|
||||
void Splice(const CuMatrix<Real> &src, const CuArray<int32> &frame_offsets, CuMatrix<Real> *tgt) {
|
||||
|
||||
KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols());
|
||||
KALDI_ASSERT(src.NumRows() == tgt->NumRows());
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
dim3 dimBlock(CUBLOCK, CUBLOCK);
|
||||
dim3 dimGrid(n_blocks(tgt->NumCols(), CUBLOCK), n_blocks(tgt->NumRows(), CUBLOCK));
|
||||
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
|
||||
dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
|
||||
|
||||
cuda_splice(dimGrid, dimBlock, tgt->data_, src.data_, frame_offsets.Data(), tgt->Dim(), src.Dim());
|
||||
cuSafeCall(cudaGetLastError());
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
|
@ -148,11 +148,12 @@ void Splice(const CuMatrix<Real> &src, const CuStlVector<int32> &frame_offsets,
|
|||
{
|
||||
// expand in CPU
|
||||
const MatrixBase<Real> &srcmat = src.Mat();
|
||||
const std::vector<int32> &frame_offsetvec = frame_offsets.Vec();
|
||||
const int32 *frame_offsetvec = frame_offsets.Data();
|
||||
int32 dim = frame_offsets.Dim();
|
||||
MatrixBase<Real> &tgtmat = tgt->Mat();
|
||||
//
|
||||
for(int32 r=0; r < tgtmat.NumRows(); r++) {
|
||||
for(int32 off=0; off < static_cast<int32>(frame_offsetvec.size()); off++) {
|
||||
for(int32 off=0; off < dim; off++) {
|
||||
int32 r_off = r + frame_offsetvec[off];
|
||||
if(r_off < 0) r_off = 0;
|
||||
if(r_off >= srcmat.NumRows()) r_off = srcmat.NumRows()-1;
|
||||
|
@ -165,20 +166,20 @@ void Splice(const CuMatrix<Real> &src, const CuStlVector<int32> &frame_offsets,
|
|||
|
||||
|
||||
template<typename Real>
|
||||
void Copy(const CuMatrix<Real> &src, const CuStlVector<int32> ©_from_indices, CuMatrix<Real> *tgt) {
|
||||
void Copy(const CuMatrix<Real> &src, const CuArray<int32> ©_from_indices, CuMatrix<Real> *tgt) {
|
||||
|
||||
KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols());
|
||||
KALDI_ASSERT(src.NumRows() == tgt->NumRows());
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
dim3 dimBlock(CUBLOCK, CUBLOCK);
|
||||
dim3 dimGrid(n_blocks(tgt->NumCols(), CUBLOCK), n_blocks(tgt->NumRows(), CUBLOCK));
|
||||
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
|
||||
dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
|
||||
|
||||
cuda_copy(dimGrid, dimBlock, tgt->data_, src.data_, copy_from_indices.Data(), tgt->Dim(), src.Dim());
|
||||
cuSafeCall(cudaGetLastError());
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
|
@ -186,11 +187,12 @@ void Copy(const CuMatrix<Real> &src, const CuStlVector<int32> &copy_from_indices
|
|||
{
|
||||
// expand in CPU
|
||||
const MatrixBase<Real> &srcmat = src.Mat();
|
||||
const std::vector<int32> &copy_from_indicesvec = copy_from_indices.Vec();
|
||||
const int32 *copy_from_indicesvec = copy_from_indices.Data();
|
||||
int32 dim = copy_from_indices.Dim();
|
||||
MatrixBase<Real> &tgtmat = tgt->Mat();
|
||||
//
|
||||
for(int32 r=0; r < tgtmat.NumRows(); r++) {
|
||||
for(int32 c=0; c < static_cast<int32>(copy_from_indicesvec.size()); c++) {
|
||||
for(int32 r = 0; r < tgtmat.NumRows(); r++) {
|
||||
for(int32 c = 0; c < dim; c++) {
|
||||
tgtmat(r,c) = srcmat(r,copy_from_indicesvec[c]);
|
||||
}
|
||||
}
|
||||
|
@ -204,21 +206,21 @@ template
|
|||
void RegularizeL1(CuMatrixBase<double> *weight, CuMatrixBase<double> *grad, double l1, double lr);
|
||||
|
||||
template
|
||||
void Splice(const CuMatrix<float> &src, const CuStlVector<int32> &frame_offsets, CuMatrix<float> *tgt);
|
||||
void Splice(const CuMatrix<float> &src, const CuArray<int32> &frame_offsets, CuMatrix<float> *tgt);
|
||||
template
|
||||
void Splice(const CuMatrix<double> &src, const CuStlVector<int32> &frame_offsets, CuMatrix<double> *tgt);
|
||||
void Splice(const CuMatrix<double> &src, const CuArray<int32> &frame_offsets, CuMatrix<double> *tgt);
|
||||
template
|
||||
void Copy(const CuMatrix<float> &src, const CuStlVector<int32> &copy_from_indices, CuMatrix<float> *tgt);
|
||||
void Copy(const CuMatrix<float> &src, const CuArray<int32> &copy_from_indices, CuMatrix<float> *tgt);
|
||||
template
|
||||
void Copy(const CuMatrix<double> &src, const CuStlVector<int32> &copy_from_indices, CuMatrix<double> *tgt);
|
||||
void Copy(const CuMatrix<double> &src, const CuArray<int32> &copy_from_indices, CuMatrix<double> *tgt);
|
||||
|
||||
template
|
||||
void Randomize(const CuMatrixBase<float> &src,
|
||||
const CuStlVector<int32> &copy_from_idx,
|
||||
const CuArray<int32> &copy_from_idx,
|
||||
CuMatrixBase<float> *tgt);
|
||||
template
|
||||
void Randomize(const CuMatrixBase<double> &src,
|
||||
const CuStlVector<int32> &copy_from_idx,
|
||||
const CuArray<int32> &copy_from_idx,
|
||||
CuMatrixBase<double> *tgt);
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
// cudamatrix/cu-math.h
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// 2013 Johns Hopkins University (Author: David Snyder)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
|
@ -22,7 +23,7 @@
|
|||
#ifndef KALDI_CUDAMATRIX_CU_MATH_H_
|
||||
#define KALDI_CUDAMATRIX_CU_MATH_H_
|
||||
#include "cudamatrix/cu-common.h"
|
||||
#include "cudamatrix/cu-stlvector.h"
|
||||
#include "cudamatrix/cu-array.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
#include "util/timer.h"
|
||||
|
||||
|
@ -38,21 +39,38 @@ template<typename Real>
|
|||
void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *gradient,
|
||||
Real l1_penalty, Real learning_rate);
|
||||
|
||||
/// ie. switch rows according to copy_from_idx
|
||||
/// Copies a permutation of src into tgt. The row permutation is specified in
|
||||
/// copy_from_idx such that src.Row(copy_from_idx[r]) == tgt.Row(r). The
|
||||
/// dimensions of copy_from_idx must be equivalent to the number of rows in
|
||||
/// tgt and src and all elements in the vector must be in [0, src.NumRows()-1].
|
||||
template<typename Real>
|
||||
void Randomize(const CuMatrixBase<Real> &src,
|
||||
const CuStlVector<int32> &copy_from_idx,
|
||||
const CuArray<int32> &copy_from_idx,
|
||||
CuMatrixBase<Real> *tgt);
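// A minimal usage sketch, following the pattern of cu-math-test.cc above
// (src, tgt and idx are assumed to be a CuMatrix pair of equal size and a
// host index vector):
//
//   std::vector<int32> idx(src.NumRows());
//   for (int32 r = 0; r < src.NumRows(); r++) idx[r] = rand() % src.NumRows();
//   CuArray<int32> copy_from_idx;
//   copy_from_idx.CopyFromVec(idx);
//   cu::Randomize(src, copy_from_idx, &tgt);  // now tgt.Row(r) == src.Row(idx[r])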
|
||||
|
||||
/// ie. concatenate the frames with offsets from frame_offsets
|
||||
/// Splice concatenates frames of src as specified in frame_offsets into tgt.
|
||||
/// The number of rows of tgt must equal the number of rows of src,
|
||||
/// and it must be that tgt.NumColumns == src.NumColumns * frame_offsets.Dim().
|
||||
/// As a result, tgt(i, k*n_cols + j) == src(i + frame_offsets[k], j) for the
|
||||
/// general case where i in [0..src.NumRows()-1],
|
||||
/// k in [0..frame_offsets.Dim()-1], j in [0..src.NumCols()-1]
|
||||
/// and n_cols = src.NumColumns(). If i + frame_offsets[k] is greater than the
|
||||
/// number of rows in src or less than 0 then the right side of the equation
|
||||
/// is replaced by src(src.NumRows()-1, j) or src(0, j) respectively, to avoid
|
||||
/// an index out of bounds.
|
||||
template<typename Real>
|
||||
void Splice(const CuMatrix<Real> &src,
|
||||
const CuStlVector<int32> &frame_offsets,
|
||||
const CuArray<int32> &frame_offsets,
|
||||
CuMatrix<Real> *tgt);
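// Worked example of the indexing above: with frame_offsets = {-1, 0, 1} and
// n_cols = src.NumCols(), row i of tgt is the concatenation of rows i-1, i
// and i+1 of src, clamped at the matrix edges:
//
//   tgt(i, 0*n_cols + j) == src(max(i-1, 0), j)
//   tgt(i, 1*n_cols + j) == src(i, j)
//   tgt(i, 2*n_cols + j) == src(min(i+1, src.NumRows()-1), j)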
|
||||
|
||||
/// Copies elements from src into tgt as given by copy_from_indices.
|
||||
/// The matrices src and tgt must have the same number of rows, and
|
||||
/// the dimension of copy_from_indices must equal the number of columns
|
||||
/// in the tgt matrix. As a result, tgt(i, j) == src(i, copy_from_indices[j]).
|
||||
/// Also see CuMatrix::CopyCols(), which is more general.
|
||||
template<typename Real>
|
||||
void Copy(const CuMatrix<Real> &src,
|
||||
const CuStlVector<int32> &copy_from_indices,
|
||||
const CuArray<int32> &copy_from_indices,
|
||||
CuMatrix<Real> *tgt);
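// Worked example (a single-row src for brevity): with src = [10 20 30] and
// copy_from_indices = {2, 0, 0, 1}, cu::Copy leaves tgt = [30 10 10 20],
// i.e. tgt(i, j) == src(i, copy_from_indices[j]) for every row i.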
|
||||
|
||||
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
|
||||
namespace kaldi {
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
inline CuSubMatrix<Real>::CuSubMatrix(const CuMatrixBase<Real> &mat,
|
||||
const MatrixIndexT row_offset,
|
||||
const MatrixIndexT num_rows,
|
||||
|
|
|
@ -1,31 +1,32 @@
|
|||
// matrix/packed-matrix-inl.h
|
||||
// cudamatrix/cu-matrix-lib.h
|
||||
|
||||
// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; Lukas Burget;
|
||||
// Saarland University; Yanmin Qian; Jan Silovsky;
|
||||
// Haihua Xu
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#ifndef KALDI_MATRIX_PACKED_MATRIX_INL_H_
|
||||
#define KALDI_MATRIX_PACKED_MATRIX_INL_H_
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
#ifndef KALDI_CUDAMATRIX_CU_MATRIX_LIB_H_
|
||||
#define KALDI_CUDAMATRIX_CU_MATRIX_LIB_H_
|
||||
|
||||
#include "cudamatrix/cu-vector.h"
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
#include "cudamatrix/cu-sp-matrix.h"
|
||||
#include "cudamatrix/cu-tp-matrix.h"
|
||||
#include "cudamatrix/cu-block-matrix.h"
|
||||
#include "cudamatrix/cu-rand.h"
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,196 @@
|
|||
// cudamatrix/cu-matrix-speed-test.cc
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
#include "cudamatrix/cu-vector.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
|
||||
using namespace kaldi;
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
template<typename Real>
|
||||
std::string NameOf() {
|
||||
return (sizeof(Real) == 8 ? "<double>" : "<float>");
|
||||
}
|
||||
|
||||
template<typename Real> void TestCuMatrixMatMat(int32 dim) {
|
||||
BaseFloat time_in_secs = 0.05;
|
||||
CuMatrix<Real> M(dim, dim), N(dim, dim), O(dim, dim);
|
||||
M.SetRandn();
|
||||
N.SetRandn();
|
||||
Timer tim;
|
||||
int32 iter = 0;
|
||||
for (;tim.Elapsed() < time_in_secs; iter++) {
|
||||
O.AddMatMat(1.0, M, kNoTrans, N, kNoTrans, 0.0);
|
||||
}
|
||||
|
||||
BaseFloat fdim = dim;
|
||||
BaseFloat gflops = (fdim * fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
|
||||
KALDI_LOG << "For CuMatrix::AddMatMat" << NameOf<Real>() << ", for dim = "
|
||||
<< dim << ", speed was " << gflops << " gigaflops.";
|
||||
}
|
||||
|
||||
template<typename Real> void TestCuMatrixSigmoid(int32 dim) {
|
||||
BaseFloat time_in_secs = 0.05;
|
||||
CuMatrix<Real> M(dim, dim), N(dim, dim);
|
||||
M.SetRandn();
|
||||
N.SetRandn();
|
||||
Timer tim;
|
||||
int32 iter = 0;
|
||||
for (;tim.Elapsed() < time_in_secs; iter++) {
|
||||
N.Sigmoid(M);
|
||||
}
|
||||
|
||||
BaseFloat fdim = dim;
|
||||
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
|
||||
KALDI_LOG << "For CuMatrix::Sigmoid" << NameOf<Real>() << ", for dim = "
|
||||
<< dim << ", speed was " << gflops << " gigaflops.";
|
||||
}
|
||||
|
||||
|
||||
template<typename Real> void TestCuMatrixSoftmax(int32 dim) {
|
||||
BaseFloat time_in_secs = 0.05;
|
||||
CuMatrix<Real> M(256, dim), N(256, dim);
|
||||
M.SetRandn();
|
||||
N.SetRandn();
|
||||
Timer tim;
|
||||
int32 iter = 0;
|
||||
for (;tim.Elapsed() < time_in_secs; iter++) {
|
||||
N.ApplySoftMaxPerRow(M);
|
||||
}
|
||||
|
||||
BaseFloat fdim = dim;
|
||||
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
|
||||
KALDI_LOG << "For CuMatrix::Softmax" << NameOf<Real>() << ", for dim = "
|
||||
<< dim << ", speed was " << gflops << " gigaflops.";
|
||||
}
|
||||
|
||||
template<typename Real> void TestCuMatrixTraceMatMat(int32 dim) {
|
||||
for (int32 n = 0; n < 2; n++) {
|
||||
MatrixTransposeType trans = (n == 0 ? kNoTrans : kTrans);
|
||||
BaseFloat time_in_secs = 0.08;
|
||||
|
||||
CuMatrix<Real> M(dim, dim), N(dim, dim);
|
||||
M.SetRandn();
|
||||
N.SetRandn();
|
||||
Timer tim;
|
||||
int32 iter = 0;
|
||||
for (;tim.Elapsed() < time_in_secs; iter++) {
|
||||
TraceMatMat(M, N, trans);
|
||||
}
|
||||
BaseFloat fdim = dim;
|
||||
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
|
||||
KALDI_LOG << "For CuMatrix::TraceMatMat" << NameOf<Real>()
|
||||
<< (trans == kTrans ? " [transposed]" : "") << ", for dim = "
|
||||
<< dim << ", speed was " << gflops << " gigaflops.";
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> void TestCuMatrixCopyLowerToUpper(int32 dim) {
|
||||
BaseFloat time_in_secs = 0.05;
|
||||
CuMatrix<Real> M(dim, dim);
|
||||
M.SetRandn();
|
||||
Timer tim;
|
||||
int32 iter = 0;
|
||||
for (; tim.Elapsed() < time_in_secs; iter++) {
|
||||
M.CopyLowerToUpper();
|
||||
}
|
||||
CuMatrix<Real> M2(M, kTrans);
|
||||
AssertEqual(M, M2);
|
||||
BaseFloat fdim = dim;
|
||||
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
|
||||
KALDI_LOG << "For CuMatrix::CopyLowerToUpper" << NameOf<Real>() << ", for dim = "
|
||||
<< dim << ", speed was " << gflops << " gigaflops.";
|
||||
}
|
||||
|
||||
|
||||
template<typename Real> void TestCuMatrixCopyUpperToLower(int32 dim) {
|
||||
BaseFloat time_in_secs = 0.05;
|
||||
CuMatrix<Real> M(dim, dim);
|
||||
M.SetRandn();
|
||||
Timer tim;
|
||||
int32 iter = 0;
|
||||
for (; tim.Elapsed() < time_in_secs; iter++) {
|
||||
M.CopyUpperToLower();
|
||||
}
|
||||
CuMatrix<Real> M2(M, kTrans);
|
||||
AssertEqual(M, M2);
|
||||
BaseFloat fdim = dim;
|
||||
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
|
||||
KALDI_LOG << "For CuMatrix::CopyUpperToLower" << NameOf<Real>() << ", for dim = "
|
||||
<< dim << ", speed was " << gflops << " gigaflops.";
|
||||
}
|
||||
|
||||
|
||||
template<typename Real> void CudaMatrixSpeedTest() {
|
||||
std::vector<int32> sizes;
|
||||
sizes.push_back(16);
|
||||
sizes.push_back(128);
|
||||
sizes.push_back(256);
|
||||
sizes.push_back(1024);
|
||||
int32 ns = sizes.size();
|
||||
for (int32 s = 0; s < ns; s++)
|
||||
TestCuMatrixMatMat<Real>(sizes[s]);
|
||||
for (int32 s = 0; s < ns; s++)
|
||||
TestCuMatrixSigmoid<Real>(sizes[s]);
|
||||
|
||||
for (int32 s = 0; s < ns; s++)
|
||||
TestCuMatrixSoftmax<Real>(sizes[s]);
|
||||
for (int32 s = 0; s < ns; s++)
|
||||
TestCuMatrixTraceMatMat<Real>(sizes[s]);
|
||||
for (int32 s = 0; s < ns; s++)
|
||||
TestCuMatrixCopyLowerToUpper<Real>(sizes[s]);
|
||||
for (int32 s = 0; s < ns; s++)
|
||||
TestCuMatrixCopyUpperToLower<Real>(sizes[s]);
|
||||
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
int main() {
|
||||
//Select the GPU
|
||||
#if HAVE_CUDA == 1
|
||||
CuDevice::Instantiate().SelectGpuId("yes"); // automatic GPU selection
|
||||
#endif
|
||||
|
||||
kaldi::CudaMatrixSpeedTest<float>();
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().DoublePrecisionSupported()) {
|
||||
kaldi::CudaMatrixSpeedTest<double>();
|
||||
} else {
|
||||
KALDI_WARN << "Double precision not supported";
|
||||
}
|
||||
#else
|
||||
kaldi::CudaMatrixSpeedTest<double>();
|
||||
#endif
|
||||
#if HAVE_CUDA == 1
|
||||
CuDevice::Instantiate().PrintProfile();
|
||||
#endif
|
||||
std::cout << "Tests succeeded.\n";
|
||||
}
|
|
@ -1,7 +1,10 @@
|
|||
// cudamatrix/cu-matrix.h
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// Johns Hopkins University (author: Daniel Povey)
|
||||
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
// 2013 Hainan Xu
|
||||
// 2013 Xiaohui Zhang
|
||||
// 2013 Johns Hopkins University (author: Guoguo Chen)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
|
@ -27,14 +30,18 @@
|
|||
|
||||
#include "cudamatrix/cu-matrixdim.h"
|
||||
#include "cudamatrix/cu-common.h"
|
||||
#include "cudamatrix/cu-value.h"
|
||||
#include "matrix/matrix-common.h"
|
||||
#include "matrix/kaldi-matrix.h"
|
||||
#include "cudamatrix/cu-stlvector.h"
|
||||
#include "cudamatrix/cu-array.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
#include "cudamatrix/cu-rand.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
template<typename Real>
|
||||
Real TraceMatMat(const CuMatrixBase<Real> &A, const CuMatrixBase<Real> &B,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
/**
|
||||
* Matrix for CUDA computing.
|
||||
* Does the computation on the CUDA card when CUDA is compiled in and
|
||||
|
@ -42,23 +49,77 @@ namespace kaldi {
|
|||
* otherwise, does it on the CPU.
|
||||
*/
|
||||
|
||||
/*
|
||||
template<typename Real>
|
||||
struct MatrixElement {
|
||||
int row;
|
||||
int column;
|
||||
Real weight;
|
||||
};
|
||||
// */
|
||||
|
||||
template<typename Real>
|
||||
class CuMatrixBase {
|
||||
public:
|
||||
friend class CuMatrixBase<float>;
|
||||
friend class CuMatrixBase<double>;
|
||||
friend class CuVectorBase<float>;
|
||||
friend class CuVectorBase<double>;
|
||||
friend class VectorBase<Real>;
|
||||
friend class CuSpMatrix<Real>;
|
||||
friend class CuTpMatrix<float>;
|
||||
friend class CuTpMatrix<double>;
|
||||
friend class CuVectorBase<Real>;
|
||||
friend class CuSubMatrix<Real>;
|
||||
friend class CuRand<Real>;
|
||||
friend class CuSubVector<Real>;
|
||||
friend class CuBlockMatrix<Real>;
|
||||
friend void cu::RegularizeL1<Real>(CuMatrixBase<Real> *weight,
|
||||
CuMatrixBase<Real> *grad, Real l1, Real lr);
|
||||
friend void cu::Splice<Real>(const CuMatrix<Real> &src,
|
||||
const CuStlVector<int32> &frame_offsets,
|
||||
const CuArray<int32> &frame_offsets,
|
||||
CuMatrix<Real> *tgt);
|
||||
friend void cu::Copy<Real>(const CuMatrix<Real> &src,
|
||||
const CuStlVector<int32> &copy_from_indices,
|
||||
const CuArray<int32> &copy_from_indices,
|
||||
CuMatrix<Real> *tgt);
|
||||
friend void cu::Randomize<Real>(const CuMatrixBase<Real> &src,
|
||||
const CuStlVector<int32> &copy_from_idx,
|
||||
const CuArray<int32> &copy_from_idx,
|
||||
CuMatrixBase<Real> *tgt);
|
||||
|
||||
/// Copies column r from column indices[r] of src.
|
||||
/// As a special case, if indexes[i] == -1, sets column i to zero
|
||||
/// indices.size() must equal this->NumCols(),
|
||||
/// all elements of "reorder" must be in [-1, src.NumCols()-1],
|
||||
/// and src.NumRows() must equal this.NumRows()
|
||||
void CopyCols(const CuMatrixBase<Real> &src,
|
||||
const std::vector<MatrixIndexT> &indices);
|
||||
|
||||
/// Version of CopyCols that takes CuArray argument.
|
||||
void CopyCols(const CuMatrixBase<Real> &src,
|
||||
const CuArray<MatrixIndexT> &indices);
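// Worked example (single-row src): with src = [5 6 7] and
// indices = {1, -1, 0}, CopyCols produces [6 0 5]; column 0 is taken from
// src column 1, column 1 is zeroed because its index is -1, and column 2
// is taken from src column 0.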
|
||||
|
||||
|
||||
/// Copies row r from row indices[r] of src.
|
||||
/// As a special case, if indexes[i] == -1, sets row i to zero
|
||||
/// "reorder".size() must equal this->NumRows(),
|
||||
/// all elements of "reorder" must be in [0, src.NumRows()-1],
|
||||
/// and src.NumCols() must equal this.NumCols()
|
||||
void CopyRows(const CuMatrixBase<Real> &src,
|
||||
const std::vector<MatrixIndexT> &indices);
|
||||
|
||||
|
||||
/// For each row r of this and for each column c, sets (*this)(r, c) to the
|
||||
/// sum \sum_j src(r, j), where j ranges from indices[c].first through
|
||||
/// indices[c].second - 1.
|
||||
void SumColumnRanges(const CuMatrixBase<Real> &src,
|
||||
const CuArray<Int32Pair> &indices);
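// Worked example (single-row src): with src = [1 2 3 4] and
// indices = { {0,2}, {2,4} }, the result row is [1+2, 3+4] = [3 7]; each
// Int32Pair selects the half-open column range [first, second) of src.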
|
||||
|
||||
|
||||
friend Real TraceMatMat<Real>(const CuMatrixBase<Real> &A,
|
||||
const CuMatrixBase<Real> &B,
|
||||
MatrixTransposeType trans);
|
||||
|
||||
void AddToDiag(Real value);
|
||||
|
||||
/// Dimensions
|
||||
MatrixIndexT NumRows() const { return num_rows_; }
|
||||
|
@ -72,26 +133,67 @@ class CuMatrixBase {
|
|||
return d;
|
||||
}
|
||||
|
||||
Real FrobeniusNorm() const { return sqrt(TraceMatMat(*this, *this, kTrans)); }
|
||||
|
||||
bool IsUnit(Real tol = 0.001) const;
|
||||
|
||||
/// True if ((*this)-other).FrobeniusNorm() <= tol * this->FrobeniusNorm()
|
||||
bool ApproxEqual(const CuMatrixBase<Real> &other, float tol = 0.01) const;
|
||||
|
||||
/// Get size of matrix in bytes
|
||||
MatrixIndexT SizeInBytes() const { return num_rows_*stride_*sizeof(Real); }
|
||||
|
||||
/// Get size of matrix row in bytes
|
||||
MatrixIndexT RowSizeInBytes() const { return num_cols_*sizeof(Real); }
|
||||
|
||||
/// Get size of matrix stride in bytes
|
||||
MatrixIndexT StrideSizeInBytes() const { return stride_*sizeof(Real); }
|
||||
// Copy functions. These do not resize.
|
||||
template<typename OtherReal>
|
||||
void CopyFromMat(const MatrixBase<OtherReal> &src,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
void CopyFromMat(const MatrixBase<Real> &src,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
/// Copy functions (reallocates when needed, but note from Dan: eventually
|
||||
/// I'll change it to just die if the sizes don't match, like the Matrix class.)
|
||||
void CopyFromMat(const CuMatrixBase<Real> &src);
|
||||
void CopyFromMat(const MatrixBase<Real> &src);
|
||||
void CopyToMat(MatrixBase<Real> *dst) const;
|
||||
void CopyFromSp(const CuSpMatrix<Real> &M);
|
||||
|
||||
template<typename OtherReal>
|
||||
void CopyFromTp(const CuTpMatrix<OtherReal> &M,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
template<typename OtherReal>
|
||||
void CopyFromMat(const CuMatrixBase<OtherReal> &M,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
template<typename OtherReal>
|
||||
void CopyToMat(MatrixBase<OtherReal> *dst,
|
||||
MatrixTransposeType trans = kNoTrans) const;
|
||||
|
||||
void CopyRowsFromVec(const CuVectorBase<Real> &v);
|
||||
|
||||
void CopyRowsFromVec(const VectorBase<Real> &v);
|
||||
|
||||
/// Copy vector into specific column of matrix.
|
||||
void CopyColFromVec(const CuVectorBase<Real> &v, const MatrixIndexT col);
|
||||
|
||||
/// Set each element to the sigmoid of the corresponding element of "src":
|
||||
/// element by element, *this = 1 / (1 + exp(-src)).
|
||||
/// element by element, x = 1 / (1 + exp(-x))
|
||||
void Sigmoid(const CuMatrixBase<Real> &src);
|
||||
|
||||
/// Apply the function y = log(1 + exp(x)), to each element.
|
||||
/// Note: the derivative of this function is the sigmoid function.
|
||||
/// This is like a soft ReLU.
|
||||
void SoftHinge(const CuMatrixBase<Real> &src);
|
||||
|
||||
/// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j ^ (power)) ^ (1 / power)
|
||||
/// where G = x.NumCols() / y.NumCols() must be an integer.
|
||||
void GroupPnorm(const CuMatrixBase<Real> &src, Real pow);
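// Worked example: with power = 2 and group size
// G = src.NumCols() / this->NumCols() = 2, a source row [3 4 0 5] gives
// [sqrt(3^2+4^2), sqrt(0^2+5^2)] = [5 5], i.e. the p-norm of each group of
// G consecutive elements.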
|
||||
|
||||
/// Calculate derivatives for the GroupPnorm function above...
|
||||
/// if "input" is the input to the GroupPnorm function above (i.e. the "src" variable),
|
||||
/// and "output" is the result of the computation (i.e. the "this" of that function
|
||||
/// call), and *this has the same dimension as "input", then it sets each element
|
||||
/// of *this to the derivative d(output-elem)/d(input-elem) for each element of "input", where
|
||||
/// "output-elem" is whichever element of output depends on that input element.
|
||||
void GroupPnormDeriv(const CuMatrixBase<Real> &input,
|
||||
const CuMatrixBase<Real> &output, Real power);
|
||||
|
||||
/// Compute the hyperbolic tangent (tanh) function; element by element,
|
||||
/// *this = tanh(src).
|
||||
void Tanh(const CuMatrixBase<Real> &src);
|
||||
|
@ -105,7 +207,7 @@ class CuMatrixBase {
|
|||
/// tanh output. Does, element-by-element, *this = diff * (1 - value^2).
|
||||
void DiffTanh(const CuMatrixBase<Real> &value,
|
||||
const CuMatrixBase<Real> &diff);
|
||||
|
||||
|
||||
/// Differentiate the block [softmax+cross-entropy] :
|
||||
/// dE/da = posterior_mat - target_mat,
|
||||
/// 'E' is error function, 'a' is activation on softmax input
|
||||
|
@ -115,16 +217,30 @@ class CuMatrixBase {
|
|||
/// net_out_or_diff ... before invocation net output, after diff dE/da
|
||||
/// log_post_tgt ... per-frame statistics for cross-entropy computations :
|
||||
/// log(sum_row(posterior_mat .* target_mat))
|
||||
void DiffXent(const CuStlVector<int32> &tgt,
|
||||
void DiffXent(const CuArray<int32> &tgt,
|
||||
CuVector<Real> *log_post_tgt);
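// Numeric sketch (targets are one-hot, given by the index vector tgt): if a
// row of the net output holds posteriors [0.7 0.2 0.1] and tgt for that frame
// is 0, the row becomes the derivative [0.7-1, 0.2, 0.1] = [-0.3 0.2 0.1] and
// the matching element of log_post_tgt becomes log(0.7).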
|
||||
|
||||
/// This method may be only called for symmetric matrices (it accesses the
|
||||
/// upper as well as lower triangle). The result is put in the lower
|
||||
/// triangle, and the upper triangle zeroed.
|
||||
void Cholesky();
|
||||
|
||||
void SymInvertPosDef(); ///< Inversion for positive definite symmetric matrices.
|
||||
///< Requires that the input is symmetric (we do not check this).
|
||||
///< The output is symmetric.
|
||||
|
||||
void ApplyPow(Real power);
|
||||
void ApplyHeaviside(); ///< For each element, sets x = (x > 0 ? 1.0 : 0.0)
|
||||
void ApplyFloor(Real floor_val);
|
||||
void ApplyCeiling(Real ceiling_val);
|
||||
void ApplyExp();
|
||||
/// Softmax nonlinearity
|
||||
/// Y = Softmax(X) : Yij = e^Xij / sum_k(e^Xik)
|
||||
/// Y = Softmax(X) : Yij = e^Xij / sum_k(e^Xik), done to each row
|
||||
/// for each row, the max value is first subtracted for good numerical stability
|
||||
void Softmax(const CuMatrixBase<Real> &src);
|
||||
void ApplySoftMaxPerRow(const CuMatrixBase<Real> &src);
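// Numeric sketch of the row-wise computation: for a source row x = [1 2 3],
// the row max (3) is subtracted first, so
//   y = exp([-2 -1 0]) / sum(exp([-2 -1 0])) ~= [0.090 0.245 0.665],
// which equals exp(x_j) / sum_k exp(x_k) but cannot overflow for large inputs.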
|
||||
|
||||
/// Find the id of the maximal element for each row
|
||||
void FindRowMaxId(CuStlVector<int32> *id) const;
|
||||
void FindRowMaxId(CuArray<int32> *id) const;
|
||||
|
||||
/*
|
||||
// Copy row interval from matrix
|
||||
|
@ -139,27 +255,90 @@ class CuMatrixBase {
|
|||
void SetZero();
|
||||
void Set(Real value);
|
||||
void Add(Real value);
|
||||
void SetZeroUpperDiag();
|
||||
void Scale(Real value);
|
||||
void ApplyLog();
|
||||
/// Multiply two matrices elementhwise: C = A .* C
|
||||
void MulElements(const CuMatrixBase<Real>& A);
|
||||
|
||||
/// Multiply two matrices elementwise: C = A .* C
|
||||
void MulElements(const CuMatrixBase<Real> &A);
|
||||
/// Do, elementwise, *this = max(*this, A).
|
||||
void Max(const CuMatrixBase<Real> &A);
|
||||
/// scale i'th column by scale[i]
|
||||
void MulColsVec(const CuVectorBase<Real> &scale);
|
||||
/// scale i'th row by scale[i]
|
||||
void MulRowsVec(const CuVectorBase<Real> &scale);
|
||||
void MulRowsVec(const CuVectorBase<Real> &scale);
|
||||
/// divide each row into src.NumCols() groups, and then scale i'th row's jth group of elements by src[i, j].
|
||||
void MulRowsGroupMat(const CuMatrixBase<Real> &src);
|
||||
/// divide i'th row by scale[i]
|
||||
void DivRowsVec(const CuVectorBase<Real> &div);
|
||||
/// B = alpha * A + beta * B
|
||||
void AddMat(Real alpha, const CuMatrixBase<Real>& A, Real beta=1.0);
|
||||
void AddMat(Real alpha, const CuMatrixBase<Real> &A, Real beta=1.0);
|
||||
/// B = alpha * col + beta * B
|
||||
void AddVecToCols(Real alpha, const CuVectorBase<Real> &col, Real beta=1.0);
|
||||
void AddVecToCols(Real alpha, const CuVectorBase<Real> &col, Real beta = 1.0);
|
||||
/// B = alpha * row + beta * B
|
||||
void AddVecToRows(Real alpha, const CuVectorBase<Real> &row, Real beta=1.0);
|
||||
void AddVecToRows(Real alpha, const CuVectorBase<Real> &row, Real beta = 1.0);
|
||||
/// C = alpha * A(^T)*B(^T) + beta * C
|
||||
void AddMatMat(Real alpha, const CuMatrixBase<Real>& A, MatrixTransposeType transA,
|
||||
const CuMatrixBase<Real>& B, MatrixTransposeType transB, Real beta);
|
||||
void AddMatMat(Real alpha, const CuMatrixBase<Real> &A, MatrixTransposeType transA,
|
||||
const CuMatrixBase<Real> &B, MatrixTransposeType transB, Real beta);
|
||||
|
||||
/// *this = beta * *this + alpha * M M^T, for symmetric matrices. It only
|
||||
/// updates the lower triangle of *this. It will leave the matrix asymmetric;
|
||||
/// if you need it symmetric as a regular matrix, do CopyLowerToUpper().
|
||||
void SymAddMat2(const Real alpha, const CuMatrixBase<Real> &M,
|
||||
MatrixTransposeType transA, Real beta);

  /// This function is like AddMatMat, but for the case where the second argument
  /// is of type CuBlockMatrix (a block-diagonal matrix of blocks).
  void AddMatBlock(Real alpha, const CuMatrixBase<Real> &A, MatrixTransposeType transA,
                   const CuBlockMatrix<Real> &B, MatrixTransposeType transB, Real beta);

  /// *this = beta * *this + alpha * diag(v) * M [or M^T].
  /// The same as adding M, but scaling each row M_i by v(i).
  void AddDiagVecMat(const Real alpha, CuVectorBase<Real> &v,
                     const CuMatrixBase<Real> &M, MatrixTransposeType transM,
                     Real beta = 1.0);
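
  // Editor's note: illustrative sketch (not from the original header) of using
  // AddDiagVecMat to apply per-row weights "v" to a hypothetical matrix "M":
  //
  //   CuVector<BaseFloat> v(M.NumRows());
  //   v.Set(0.5);                                  // hypothetical per-row scales
  //   X.AddDiagVecMat(1.0, v, M, kNoTrans, 0.0);   // X = diag(v) * M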

  /// this <-- beta*this + alpha*A*B.
  void AddMatSp(const Real alpha,
                const CuMatrixBase<Real> &A, MatrixTransposeType transA,
                const CuSpMatrix<Real> &B,
                const Real beta) {
    CuMatrix<Real> M(B);
    return AddMatMat(alpha, A, transA, M, kNoTrans, beta);
  }

  /// this <-- beta*this + alpha*SpA*B.
  void AddSpMat(const Real alpha,
                const CuSpMatrix<Real> &A,
                const CuMatrixBase<Real> &B, MatrixTransposeType transB,
                const Real beta) {
    CuMatrix<Real> M(A);
    return AddMatMat(alpha, M, kNoTrans, B, transB, beta);
  }

  /// this <-- beta*this + alpha*A*B.
  void AddTpMat(const Real alpha,
                const CuTpMatrix<Real> &A, MatrixTransposeType transA,
                const CuMatrixBase<Real> &B, MatrixTransposeType transB,
                const Real beta) {
    CuMatrix<Real> M(A);
    return AddMatMat(alpha, M, transA, B, transB, beta);
  }

  /// this <-- beta*this + alpha*A*B.
  void AddMatTp(const Real alpha,
                const CuMatrixBase<Real> &A, MatrixTransposeType transA,
                const CuTpMatrix<Real> &B, MatrixTransposeType transB,
                const Real beta) {
    CuMatrix<Real> M(B);
    return AddMatMat(alpha, A, transA, M, transB, beta);
  }
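
  // Editor's note (not part of the original header): the four wrappers above
  // share one pattern -- the packed (Sp/Tp) argument is first expanded into a
  // temporary full CuMatrix and the work is then delegated to AddMatMat. This
  // trades some extra memory for reuse of the single general GEMM path, e.g.:
  //
  //   X.AddMatSp(1.0, A, kNoTrans, S, 0.0);   // internally: CuMatrix tmp(S); then AddMatMat(...)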

  void CopyFromBlock(const CuBlockMatrix<Real> &B,
                     MatrixTransposeType trans = kNoTrans);
  void CopyLowerToUpper();
  void CopyUpperToLower();
  inline CuSubMatrix<Real> Range(const MatrixIndexT row_offset,
                                 const MatrixIndexT num_rows,
                                 const MatrixIndexT col_offset,
@ -177,11 +356,67 @@ class CuMatrixBase {
    return CuSubMatrix<Real>(*this, 0, num_rows_, col_offset, num_cols);
  }

  inline const CuSubVector<Real> Row(MatrixIndexT i) const {
    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                 static_cast<UnsignedMatrixIndexT>(num_rows_));
    return CuSubVector<Real>(data_ + (i * stride_), NumCols());
  }

  inline CuSubVector<Real> Row(MatrixIndexT i) {
    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                 static_cast<UnsignedMatrixIndexT>(num_rows_));
    return CuSubVector<Real>(data_ + (i * stride_), NumCols());
  }

  inline CuValue<Real> operator() (MatrixIndexT r, MatrixIndexT c) {
    KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
                          static_cast<UnsignedMatrixIndexT>(num_rows_) &&
                          static_cast<UnsignedMatrixIndexT>(c) <
                          static_cast<UnsignedMatrixIndexT>(num_cols_));
    return CuValue<Real>(data_ + r * stride_ + c);
  }

  inline Real operator() (MatrixIndexT r, MatrixIndexT c) const {
    KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
                          static_cast<UnsignedMatrixIndexT>(num_rows_) &&
                          static_cast<UnsignedMatrixIndexT>(c) <
                          static_cast<UnsignedMatrixIndexT>(num_cols_));
    return CuValue<Real>(data_ + r * stride_ + c);  // will be cast to Real.
  }
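
  // Editor's note: illustrative sketch, not part of the original header.
  // operator() returns a CuValue<Real> proxy so single elements can be read or
  // written even when the data lives on the GPU; such element-by-element access
  // is convenient but is best kept out of inner loops.
  //
  //   CuMatrix<BaseFloat> m;
  //   m.Resize(2, 2, kSetZero);
  //   m(0, 1) = 3.5;            // write through the CuValue proxy
  //   BaseFloat x = m(0, 1);    // read back (converted to Real)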

  Real Sum() const;

  /// Return the trace. If check_square = true, will crash if matrix is not square.
  Real Trace(bool check_square = true) const;

  void SetRandn();

  void SetRandUniform();

  void Write(std::ostream &os, bool binary) const;

  // This function resizes "output" to indices.size(); it interprets each element
  // of "indices" as a (row, column) index into *this, and puts (*this)(row, column)
  // into the corresponding element of "output".
  void Lookup(const std::vector<Int32Pair> &indices,
              std::vector<Real> *output) const;
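
  // Editor's note: illustrative sketch of Lookup() (not part of the original
  // header); Int32Pair is the (row, column) struct declared in cu-matrixdim.h,
  // and "m" is a hypothetical CuMatrix<BaseFloat>.
  //
  //   std::vector<Int32Pair> idx(2);
  //   idx[0].first = 0; idx[0].second = 3;   // (row 0, col 3)
  //   idx[1].first = 5; idx[1].second = 1;   // (row 5, col 1)
  //   std::vector<BaseFloat> vals;
  //   m.Lookup(idx, &vals);                  // vals[i] = m(idx[i].first, idx[i].second)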
 protected:
  // The following two functions should only be called if we did not compile with CUDA
  // or could not get a CUDA card; in that case the contents are interpreted the
  // same as a regular matrix.
  inline const MatrixBase<Real> &Mat() const {
    return *(reinterpret_cast<const MatrixBase<Real>* >(this));
  }
  inline MatrixBase<Real> &Mat() {
    return *(reinterpret_cast<MatrixBase<Real>* >(this));
  }

  /// Get raw row pointer.
  inline const Real* RowData(MatrixIndexT r) const { return data_ + r * stride_; }
  inline Real* RowData(MatrixIndexT r) { return data_ + r * stride_; }
  inline const Real *Data() const { return data_; }
  inline Real *Data() { return data_; }

  // The constructors are protected to prevent the user creating an instance of
@ -198,19 +433,9 @@ class CuMatrixBase {
               MatrixIndexT stride):
    data_(data), num_cols_(num_cols), num_rows_(num_rows), stride_(stride) { }

  Real *data_;  ///< GPU data pointer (or regular matrix data pointer,
                ///< if either CUDA was not compiled in or we could not
                ///< acquire the device).

  // Note: it might seem a bit backwards that we have the number of columns
  // first here; it's necessary because we need the data to be laid out the same
  // as for MatrixBase so the Mat() function call will work. We don't want to
@ -239,15 +464,34 @@ class CuMatrix: public CuMatrixBase<Real> {

  // Note: we had to remove the "explicit" keyword due
  // to problems with STL vectors of CuMatrixBase.
  CuMatrix(const CuMatrix<Real> &other,
           MatrixTransposeType trans = kNoTrans);

  explicit CuMatrix(const CuBlockMatrix<Real> &other,
                    MatrixTransposeType trans = kNoTrans);

  explicit CuMatrix(const CuMatrixBase<Real> &other,
                    MatrixTransposeType trans = kNoTrans);

  template<typename OtherReal>
  explicit CuMatrix(const MatrixBase<OtherReal> &other,
                    MatrixTransposeType trans = kNoTrans);

  /// Copy constructor taking SpMatrix...
  explicit CuMatrix(const CuSpMatrix<Real> &M) : CuMatrixBase<Real>() {
    Resize(M.NumRows(), M.NumRows(), kUndefined);
    this->CopyFromSp(M);
  }

  /// Copy constructor taking TpMatrix...
  template <typename OtherReal>
  explicit CuMatrix(const CuTpMatrix<OtherReal> &M,
                    MatrixTransposeType trans = kNoTrans);

  /// Copy constructor: as above, but from another type.
  template<typename OtherReal>
  explicit CuMatrix(const CuMatrixBase<OtherReal> &M,
                    MatrixTransposeType trans = kNoTrans);
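
  // Editor's note: illustrative construction sketches (not part of the original
  // header); "cpu_feats" is a hypothetical CPU-side Matrix<BaseFloat>.
  //
  //   CuMatrix<BaseFloat> gpu_feats(cpu_feats);            // copy host -> device
  //   CuMatrix<BaseFloat> gpu_feats_t(cpu_feats, kTrans);  // copy and transpose
  //   CuMatrix<double> gpu_feats_d(gpu_feats);             // change precision on the device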

  CuMatrix<Real> &operator = (const CuMatrixBase<Real> &other) {
    this->Resize(other.NumRows(), other.NumCols(), kUndefined);
@ -265,21 +509,45 @@ class CuMatrix: public CuMatrixBase<Real> {
    this->Resize(other.NumRows(), other.NumCols(), kUndefined);
    this->CopyFromMat(other);
    return *this;
  }

  void Transpose();

  /// Allocate the memory.
  void Resize(MatrixIndexT rows, MatrixIndexT cols,
              MatrixResizeType resize_type = kSetZero);
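
  // Editor's note: a small sketch (not from the original header) of the resize
  // semantics: kSetZero zeroes the new contents, while kUndefined leaves them
  // uninitialised (cheaper when every element will be overwritten anyway).
  //
  //   CuMatrix<BaseFloat> buf;
  //   buf.Resize(100, 40);              // zeroed by default
  //   buf.Resize(200, 40, kUndefined);  // no clearing; contents undefined
  //   Matrix<BaseFloat> host;
  //   buf.Swap(&host);                  // exchange contents with a CPU matrix (see Swap below)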

  void Swap(Matrix<Real> *mat);
  void Swap(CuMatrix<Real> *mat);

  template<typename OtherReal>
  void Swap(CuMatrix<OtherReal> *mat);

  /// I/O functions
  void Read(std::istream &is, bool binary);
  void Write(std::ostream &os, bool binary) const;

  /// Destructor
  ~CuMatrix() { Destroy(); }

  inline const Matrix<Real> &Mat() const {
    return *(reinterpret_cast<const Matrix<Real>* >(this));
  }
  inline Matrix<Real> &Mat() {
    return *(reinterpret_cast<Matrix<Real>* >(this));
  }

  /// This function does: for each element { row, column, weight } indexed i in
  /// the vector "elements", let x(i) = A(row(i), column(i)); then it does
  /// (*this)(row(i), column(i)) += weight(i) / x(i), and
  /// *tot_objf = \sum_i weight(i) * log(x(i)), and
  /// *tot_weight = \sum_i weight(i).
  /// Preconditions: A must be strictly positive, and no (row, column) pair
  /// may be repeated within "elements".
  void CompObjfAndDeriv(const std::vector<MatrixElement<Real> > &elements,
                        const CuMatrix<Real> &A,
                        Real *tot_objf,
                        Real *tot_weight);
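
  // Editor's note: illustrative sketch (not part of the original header) of how
  // CompObjfAndDeriv might be used with posteriors "post" (strictly positive)
  // and one supervised (frame, pdf-id, weight) triple per frame:
  //
  //   std::vector<MatrixElement<BaseFloat> > sv;   // supervision
  //   MatrixElement<BaseFloat> e;
  //   e.row = 0; e.column = 17; e.weight = 1.0;    // hypothetical label for frame 0
  //   sv.push_back(e);
  //   CuMatrix<BaseFloat> deriv(post.NumRows(), post.NumCols());
  //   BaseFloat tot_objf, tot_weight;
  //   deriv.CompObjfAndDeriv(sv, post, &tot_objf, &tot_weight);
  //   // tot_objf / tot_weight is then the average log-probability of the labels.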

 private:
  void Destroy();
};
@ -305,27 +573,55 @@ class CuSubMatrix: public CuMatrixBase<Real> {
  CuSubMatrix<Real> &operator = (const CuSubMatrix<Real> &other);
};


template<typename Real>
bool ApproxEqual(const CuMatrixBase<Real> &A,
                 const CuMatrixBase<Real> &B, Real tol = 0.01) {
  return A.ApproxEqual(B, tol);
}

template<typename Real>
inline void AssertEqual(CuMatrixBase<Real> &A, CuMatrixBase<Real> &B,
                        float tol = 0.01) {
  KALDI_ASSERT(A.ApproxEqual(B, tol));
}

template<typename Real>
bool SameDim(const CuMatrixBase<Real> &M, const CuMatrixBase<Real> &N) {
  return (M.NumRows() == N.NumRows() && M.NumCols() == N.NumCols());
}

template<typename Real>
bool SameDimAndStride(const CuMatrixBase<Real> &M, const CuMatrixBase<Real> &N) {
  return (M.NumRows() == N.NumRows() && M.NumCols() == N.NumCols()
          && M.Stride() == N.Stride());
}
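
// Editor's note: these helpers are mainly used in the cudamatrix unit tests;
// an illustrative check (a sketch, not from the original header):
//
//   CuMatrix<BaseFloat> a(50, 20), b;
//   a.SetRandn();
//   b = a;                       // uses the operator= defined above
//   AssertEqual(a, b);           // passes: identical within the default 0.01 tolerance
//   KALDI_ASSERT(SameDim(a, b));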

/// I/O
template<typename Real>
std::ostream &operator << (std::ostream &out, const CuMatrixBase<Real> &mat);


template<typename Real>
template<typename OtherReal>
Matrix<Real>::Matrix(const CuMatrixBase<OtherReal> &M,
                     MatrixTransposeType trans) {
  if (trans == kNoTrans) Init(M.NumRows(), M.NumCols());
  else Init(M.NumCols(), M.NumRows());
  M.CopyToMat(this, trans);
}
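
// Editor's note: the template above is what lets a CPU Matrix be built straight
// from GPU data; an illustrative round trip (a sketch, not from the original
// header):
//
//   CuMatrix<BaseFloat> gpu(100, 40);
//   gpu.SetRandn();
//   Matrix<BaseFloat> cpu(gpu);             // device -> host copy
//   Matrix<BaseFloat> cpu_t(gpu, kTrans);   // copy the transpose instead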

template<typename Real>
template<typename OtherReal>
void MatrixBase<Real>::CopyFromMat(const CuMatrixBase<OtherReal> &cu,
                                   MatrixTransposeType trans) {
  cu.CopyToMat(this, trans);
}


} // namespace

#include "cudamatrix/cu-matrix-inl.h"

#endif

@ -1,6 +1,7 @@
// cudamatrix/cu-matrixdim.h

// Copyright 2009-2012  Karel Vesely
//           2013       Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
@ -28,12 +29,20 @@
#ifdef _MSC_VER
  typedef unsigned __int32 uint32_cuda;
  typedef __int32 int32_cuda;
  typedef __int32 MatrixIndexT_cuda;  // you'd have to change this if you changed MatrixIndexT from int32.
#else
  #include <stdint.h>
  typedef uint32_t uint32_cuda;
  typedef int32_t int32_cuda;
  typedef int32_t MatrixIndexT_cuda;  // you'd have to change this if you changed MatrixIndexT from int32.
#endif

template<typename Real>
struct MatrixElement {
  int32_cuda row;
  int32_cuda column;
  Real weight;
};

extern "C" {
  /**
@ -45,8 +54,37 @@ extern "C" {
    int32_cuda cols;
    int32_cuda stride;
  } MatrixDim;

  // We define the following constants here because this file is included
  // both by the C++ code and by the CUDA code.

  // The size of a CUDA 1-d block, e.g. for vector operations.
#define CU1DBLOCK 256

  // The size of the edge of a CUDA square block, e.g. for matrix operations.
  // Must be defined the same in cu-kernels-ansi.h.
#define CU2DBLOCK 16
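
  // Editor's note (illustrative, not part of the original header): in the CUDA
  // code a matrix kernel is typically launched with CU2DBLOCK x CU2DBLOCK
  // threads per block and enough blocks to cover the whole matrix, roughly:
  //
  //   dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
  //   dim3 dimGrid((d.cols + CU2DBLOCK - 1) / CU2DBLOCK,   // d is a MatrixDim
  //                (d.rows + CU2DBLOCK - 1) / CU2DBLOCK);
  //   // my_kernel<<<dimGrid, dimBlock>>>(data, d);         // hypothetical kernel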

  /** This structure is used in cu-block-matrix.h to store information
      about a block-diagonal matrix. We declare it here so that it
      will be accessible from the CUDA code as well as from the C++ code.
   */
  typedef struct CuBlockMatrixData_ {
    int32_cuda row_offset;  // sum of #rows of previous M_i
    int32_cuda col_offset;  // sum of #cols of previous M_i
    MatrixDim matrix_dim;   // dimension of this M_i
    void *matrix_data;      // data for M_i. This is a pointer to either float* or
                            // double*. Because C doesn't support templates, and to
                            // avoid extra coding to support the two cases, we
                            // decided to make this a void* pointer.
  } CuBlockMatrixData;

  typedef struct Int32Pair {
    int32_cuda first;
    int32_cuda second;
  } Int32Pair;
}

#endif