syncing 'sandbox/dan2->trunk' of src/matrix,src/cudamatrix

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@3194 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
2013-11-21 21:24:54 +00:00 · 2013-11-21 21:24:54 +00:00 · c3a5fa2187
--- a/egs/wsj/s5/steps/align_nnet.sh
+++ b/egs/wsj/s5/steps/align_nnet.sh
@ -18,7 +18,7 @@ align_to_lats=false # optionally produce alignment in lattice format
 lats_decode_opts="--acoustic-scale=0.1 --beam=20 --latbeam=10"
 lats_graph_scales="--transition-scale=1.0 --self-loop-scale=0.1"

-use_gpu_id=-1 # disable gpu
+use_gpu="no" # yes|no|optional
 # End configuration options.

 [ $# -gt 0 ] && echo "$0 $@"  # Print the command line for logging
@ -76,7 +76,7 @@ if [ -f $srcdir/delta_order ]; then
  feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
 fi
 # Finally add feature_transform and the MLP
-feats="$feats nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet ark:- ark:- |"
+feats="$feats nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $nnet ark:- ark:- |"


 echo "$0: aligning data '$data' using nnet/model '$srcdir', putting alignments in '$dir'"
--- a/egs/wsj/s5/steps/decode_nnet.sh
+++ b/egs/wsj/s5/steps/decode_nnet.sh
@ -25,7 +25,7 @@ scoring_opts="--min-lmwt 4 --max-lmwt 15"

 num_threads=1 # if >1, will use latgen-faster-parallel
 parallel_opts="-pe smp $((num_threads+1))" # use 2 CPUs (1 DNN-forward, 1 decoder)
-use_gpu_id=-1 # -1 disable gpu
+use_gpu="no" # yes|no|optional
 # End configuration section.

 echo "$0 $@"  # Print the command line for logging
@ -104,7 +104,7 @@ fi
 # Run the decoding in the queue
 if [ $stage -le 0 ]; then
  $cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
-    nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet "$feats" ark:- \| \
+    nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $nnet "$feats" ark:- \| \
    latgen-faster-mapped$thread_string --max-active=$max_active --max-mem=$max_mem --beam=$beam \
    --lattice-beam=$latbeam --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
    $model $graphdir/HCLG.fst ark:- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
--- a/egs/wsj/s5/steps/pretrain_dbn.sh
+++ b/egs/wsj/s5/steps/pretrain_dbn.sh
@ -50,8 +50,6 @@ splice_step=1      # Stepsize of the splicing (1 is consecutive splice,
                   # value 2 would do [ -10 -8 -6 -4 -2 0 2 4 6 8 10 ] splicing)
 # misc.
 verbose=1 # enable per-cache reports
-# gpu config
-use_gpu_id= # manually select GPU id to run on, (-1 disables GPU) 
 # End configuration.

 echo "$0 $@"  # Print the command line for logging
@ -172,7 +170,7 @@ else
  feature_transform_old=$feature_transform
  feature_transform=${feature_transform%.nnet}_cmvn-g.nnet
  echo "Renormalizing MLP input features into $feature_transform"
-  nnet-forward ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
+  nnet-forward --use-gpu=yes \
    $feature_transform_old "$(echo $feats | sed 's|train.scp|train.scp.10k|')" \
    ark:- 2>$dir/log/cmvn_glob_fwd.log |\
  compute-cmvn-stats ark:- - | cmvn-to-nnet - - |\
@ -186,7 +184,7 @@ fi


 ###### GET THE DIMENSIONS ######
-num_fea=$(feat-to-dim --print-args=false "$feats nnet-forward --use-gpu-id=-1 $feature_transform ark:- ark:- |" - 2>/dev/null)
+num_fea=$(feat-to-dim --print-args=false "$feats nnet-forward --use-gpu=no $feature_transform ark:- ark:- |" - 2>/dev/null)
 num_hid=$hid_dim


@ -208,14 +206,14 @@ for depth in $(seq 1 $nn_depth); do
    rbm-train-cd1-frmshuff --learn-rate=$rbm_lrate_low --l2-penalty=$rbm_l2penalty \
      --num-iters=$((2*$rbm_iter)) --drop-data=$rbm_drop_data --verbose=$verbose \
      --feature-transform=$feature_transform \
-      ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} $rbm_extra_opts \
+      $rbm_extra_opts \
      $RBM.init "$feats" $RBM 2>$dir/log/rbm.$depth.log || exit 1
  else
    #This is Bernoulli-Bernoulli RBM
    #cmvn stats for init
    echo "Computing cmvn stats '$dir/$depth.cmvn' for RBM initialization"
    if [ ! -f $dir/$depth.cmvn ]; then 
-      nnet-forward ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
+      nnet-forward --use-gpu=yes \
       "nnet-concat $feature_transform $dir/$((depth-1)).dbn - |" \
        "$(echo $feats | sed 's|train.scp|train.scp.10k|')" \
        ark:- 2>$dir/log/cmvn_fwd.$depth.log | \
@ -232,7 +230,7 @@ for depth in $(seq 1 $nn_depth); do
    rbm-train-cd1-frmshuff --learn-rate=$rbm_lrate --l2-penalty=$rbm_l2penalty \
      --num-iters=$rbm_iter --drop-data=$rbm_drop_data --verbose=$verbose \
      --feature-transform="nnet-concat $feature_transform $dir/$((depth-1)).dbn - |" \
-      ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} $rbm_extra_opts \
+      $rbm_extra_opts \
      $RBM.init "$feats" $RBM 2>$dir/log/rbm.$depth.log || exit 1
  fi

--- a/egs/wsj/s5/steps/train_nnet.sh
+++ b/egs/wsj/s5/steps/train_nnet.sh
@ -46,7 +46,6 @@ train_opts=        # options, passed to the training script
 train_tool=        # optionally change the training tool

 # OTHER
-use_gpu_id= # manually select GPU id to run on, (-1 disables GPU)
 analyze_alignments=true # run the alignment analysis script
 seed=777    # seed value used for training data shuffling and initialization
 # End configuration.
@ -258,7 +257,7 @@ else
  feature_transform_old=$feature_transform
  feature_transform=${feature_transform%.nnet}_cmvn-g.nnet
  echo "Renormalizing MLP input features into $feature_transform"
-  nnet-forward ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
+  nnet-forward --use-gpu=yes \
    $feature_transform_old "$(echo $feats_tr | sed 's|train.scp|train.scp.10k|')" \
    ark:- 2>$dir/log/nnet-forward-cmvn.log |\
  compute-cmvn-stats ark:- - | cmvn-to-nnet - - |\
@ -315,7 +314,6 @@ steps/train_nnet_scheduler.sh \
  ${train_opts} \
  ${train_tool:+ --train-tool "$train_tool"} \
  ${config:+ --config $config} \
-  ${use_gpu_id:+ --use-gpu-id $use_gpu_id} \
  $mlp_init "$feats_tr" "$feats_cv" "$labels_tr" "$labels_cv" $dir || exit 1


--- a/egs/wsj/s5/steps/train_nnet_mmi.sh
+++ b/egs/wsj/s5/steps/train_nnet_mmi.sh
@ -21,7 +21,6 @@ learn_rate=0.00001
 halving_factor=1.0 #ie. disable halving
 drop_frames=true
 verbose=1
-use_gpu_id=

 seed=777    # seed value used for training data shuffling
 # End configuration section
@ -168,7 +167,6 @@ while [ $x -le $num_iters ]; do
       --learn-rate=$learn_rate \
       --drop-frames=$drop_frames \
       --verbose=$verbose \
-       ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
       $cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet || exit 1
  fi
  cur_mdl=$dir/$x.nnet
--- a/egs/wsj/s5/steps/train_nnet_mpe.sh
+++ b/egs/wsj/s5/steps/train_nnet_mpe.sh
@ -21,7 +21,6 @@ halving_factor=1.0 #ie. disable halving
 do_smbr=true
 use_silphones=false #setting this to something will enable giving siphones to nnet-mpe
 verbose=1
-use_gpu_id=

 seed=777    # seed value used for training data shuffling
 # End configuration section
@ -151,7 +150,6 @@ while [ $x -le $num_iters ]; do
       --do-smbr=$do_smbr \
       --verbose=$verbose \
       $mpe_silphones_arg \
-       ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
       $cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet || exit 1
  fi
  cur_mdl=$dir/$x.nnet
--- a/egs/wsj/s5/steps/train_nnet_scheduler.sh
+++ b/egs/wsj/s5/steps/train_nnet_scheduler.sh
@ -25,8 +25,6 @@ end_halving_inc=0.1
 halving_factor=0.5
 # misc.
 verbose=1
-# gpu
-use_gpu_id=
 # tool
 train_tool="nnet-train-xent-hardlab-frmshuff"
 
@ -73,7 +71,6 @@ mlp_base=${mlp_init##*/}; mlp_base=${mlp_base%.*}
 $train_tool --cross-validate=true \
 --bunchsize=$bunch_size --cachesize=$cache_size --verbose=$verbose \
 ${feature_transform:+ --feature-transform=$feature_transform} \
- ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
 $mlp_best "$feats_cv" "$labels_cv" \
 2> $dir/log/prerun.log || exit 1;

@ -97,7 +94,6 @@ for iter in $(seq -w $max_iters); do
   --learn-rate=$learn_rate --momentum=$momentum --l1-penalty=$l1_penalty --l2-penalty=$l2_penalty \
   --bunchsize=$bunch_size --cachesize=$cache_size --randomize=true --verbose=$verbose \
   ${feature_transform:+ --feature-transform=$feature_transform} \
-   ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
   ${seed:+ --seed=$seed} \
   $mlp_best "$feats_tr" "$labels_tr" $mlp_next \
   2> $dir/log/iter$iter.log || exit 1; 
@ -110,7 +106,6 @@ for iter in $(seq -w $max_iters); do
  $train_tool --cross-validate=true \
   --bunchsize=$bunch_size --cachesize=$cache_size --verbose=$verbose \
   ${feature_transform:+ --feature-transform=$feature_transform} \
-   ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
   $mlp_next "$feats_cv" "$labels_cv" \
   2>>$dir/log/iter$iter.log || exit 1;
  
--- a/src/cudamatrix/Makefile
+++ b/src/cudamatrix/Makefile
@ -9,12 +9,16 @@ OPENFST_LDLIBS =
 include ../kaldi.mk

 LDFLAGS += $(CUDA_LDFLAGS)
+LDLIBS += $(CUDA_LDLIBS)

-TESTFILES = cuda-matrix-test 
+TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test cu-packed-matrix-test cu-tp-matrix-test \
+            cu-block-matrix-test cu-matrix-speed-test cu-vector-speed-test cu-sp-matrix-speed-test cu-array-test

-OBJFILES = cu-device.o cu-math.o cu-matrix.o
+
+OBJFILES = cu-device.o cu-math.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \
+           cu-vector.o cu-common.o cu-tp-matrix.o cu-rand.o cu-block-matrix.o
 ifeq ($(CUDA), true)
-  OBJFILES += cu-kernels.o cu-randkernels.o
+  OBJFILES += cu-kernels.o cu-randkernels.o cu-choleskykernels.o
 endif

 LIBNAME = kaldi-cudamatrix
--- a/src/cudamatrix/cu-array-inl.h
+++ b/src/cudamatrix/cu-array-inl.h
@ -0,0 +1,208 @@
+// cudamatrix/cu-array-inl.h
+
+// Copyright 2009-2012  Karel Vesely
+//                2013  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+
+#ifndef KALDI_CUDAMATRIX_CU_ARRAY_INL_H_
+#define KALDI_CUDAMATRIX_CU_ARRAY_INL_H_
+
+#if HAVE_CUDA == 1
+#include <cuda_runtime_api.h>
+#include "cudamatrix/cu-common.h"
+#include "cudamatrix/cu-device.h"
+#include "cudamatrix/cu-kernels.h"
+#endif
+
+#include "util/timer.h"
+
+namespace kaldi {
+
+
+/// Resizes the array to hold "dim" elements; the previous contents are not
+/// preserved.  resize_type must be kSetZero (the array is cleared with
+/// SetZero()) or kUndefined (the memory is left as allocated); anything else,
+/// or a negative dim, trips the assertion.  If the dimension is unchanged no
+/// reallocation takes place.  Memory comes from cudaMalloc when the CuDevice
+/// singleton reports it is enabled and from malloc otherwise, so constructors
+/// of T are never run.
+template<typename T>
+void CuArray<T>::Resize(MatrixIndexT dim, MatrixResizeType resize_type) {
+  KALDI_ASSERT((resize_type == kSetZero || resize_type == kUndefined) && dim >= 0);
+  if (dim_ == dim) {
+    // Same size: keep the existing buffer and only clear it if requested.
+    if (resize_type == kSetZero)
+      SetZero();
+    return;
+  }
+
+  Destroy();
+
+  if (dim == 0) return;
+
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    CU_SAFE_CALL(cudaMalloc((void**)&data_, dim*sizeof(T)));
+  } else
+#endif
+  {
+    data_ = static_cast<T*>(malloc(dim * sizeof(T)));
+    // We allocate with malloc because we don't want constructors being called.
+    // We basically ignore memory alignment issues here-- we assume the malloc
+    // implementation is forgiving enough that it will automatically align on
+    // sensible boundaries.
+    if (data_ == 0)
+      KALDI_ERR << "Memory allocation failed when initializing CuArray "
+                << "with dimension " << dim << " object size in bytes: "
+                << sizeof(T);
+  }
+
+  // dim_ is only updated once the allocation has succeeded.
+  dim_ = dim;
+  if (resize_type == kSetZero)
+    SetZero();
+}
+
+/// Releases the buffer held by this array, if any, and resets dim_ to 0 and
+/// data_ to NULL.  No destructors of the stored objects are run.
+template<typename T>
+void CuArray<T>::Destroy() {
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) { 
+    if (data_ != NULL) {
+      // NOTE(review): the free routine is chosen from the device state at the
+      // time of this call, not at allocation time; presumably the device is
+      // never switched while arrays are alive -- confirm.
+      CU_SAFE_CALL(cudaFree(data_));
+    }
+  } else
+#endif
+  {
+    if (data_ != NULL)
+      free(data_);
+  }
+  dim_ = 0;
+  data_ = NULL;
+}
+
+
+/// Resizes this array to src.size() and copies the elements of "src" into it
+/// byte-for-byte (cudaMemcpy host-to-device when the CuDevice singleton is
+/// enabled, memcpy otherwise).  No constructors or assignment operators of T
+/// are called.
+template<typename T>
+void CuArray<T>::CopyFromVec(const std::vector<T> &src) {
+  Resize(src.size(), kUndefined);
+  // Nothing to copy; also avoids calling front() on an empty vector, which is
+  // undefined behaviour.
+  if (src.empty()) return;
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    Timer tim;
+    CU_SAFE_CALL(cudaMemcpy(data_, &src.front(), src.size()*sizeof(T), cudaMemcpyHostToDevice));
+    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
+  } else
+#endif
+  {
+    memcpy(data_, &src.front(), src.size()*sizeof(T));
+  }
+}
+
+
+
+/// Copies the contents of this array into *dst, resizing *dst to Dim()
+/// elements first if its size differs.  The element data is transferred
+/// byte-for-byte (cudaMemcpy device-to-host when the CuDevice singleton is
+/// enabled, memcpy otherwise).
+template<typename T>
+void CuArray<T>::CopyToVec(std::vector<T> *dst) const {
+  if (static_cast<MatrixIndexT>(dst->size()) != dim_) {
+    dst->resize(dim_);
+  }
+  // Nothing to copy; also avoids calling front() on an empty vector, which is
+  // undefined behaviour.
+  if (dim_ == 0) return;
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    Timer tim;
+    CU_SAFE_CALL(cudaMemcpy(&dst->front(), Data(), dim_*sizeof(T), cudaMemcpyDeviceToHost));
+    CuDevice::Instantiate().AccuProfile("CuArray::CopyToVecD2H", tim.Elapsed());
+  } else
+#endif
+  {
+    memcpy(&dst->front(), data_, dim_*sizeof(T));
+  }
+}
+
+
+/// Overwrites every byte of the array with zero (cudaMemset when the CuDevice
+/// singleton is enabled, memset otherwise).  A zero-length array is left
+/// untouched.
+template<typename T>
+void CuArray<T>::SetZero() {
+  if (dim_ == 0)
+    return;  // no buffer to clear.
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    Timer timer;
+    CU_SAFE_CALL(cudaMemset(data_, 0, dim_ * sizeof(T)));
+    const double elapsed = timer.Elapsed();
+    CuDevice::Instantiate().AccuProfile("CuArray::SetZero", elapsed);
+  } else
+#endif
+  {
+    memset(static_cast<void*>(data_), 0, dim_ * sizeof(T));
+  }
+}
+
+
+
+/**
+ * Print the array to a stream as "[ e0 e1 ... ]" followed by a newline.
+ * The contents are first copied into a host-side std::vector via CopyToVec().
+ */
+template<typename T>
+std::ostream &operator << (std::ostream &out, const CuArray<T> &vec) {
+  std::vector<T> tmp;
+  vec.CopyToVec(&tmp);
+  out << "[";
+  // The index is size_t because tmp.size() is unsigned; an int32 counter would
+  // make this a mixed signed/unsigned comparison.
+  for (size_t i = 0; i < tmp.size(); i++) {
+    out << " " << tmp[i];
+  }
+  out << " ]\n";
+  return out;
+}
+
+
+/// Generic version of Set(): there is no implementation for arbitrary T, so
+/// this only reports an error through KALDI_ERR.  "value" is not used.
+template<class T> 
+inline void CuArray<T>::Set(const T &value) {
+  // This is not implemented yet, we'll do so if it's needed.
+  KALDI_ERR << "CuArray<T>::Set not implemented yet for this type.";
+}
+
+/// Specialization of Set() for int32: assigns "value" to every element.  When
+/// the CuDevice singleton is enabled this launches cudaI32_set_const;
+/// otherwise the elements are written one by one on the host.  A zero-length
+/// array is left untouched.
+template<>
+inline void CuArray<int32>::Set(const int32 &value) {
+  if (dim_ == 0)
+    return;
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    Timer timer;
+
+    // The array is described to the kernel as a matrix with a single row.
+    dim3 block_dim(CU2DBLOCK);
+    dim3 grid_dim(n_blocks(Dim(), CU2DBLOCK));
+    ::MatrixDim dims = { 1, Dim(), Dim() };
+
+    cudaI32_set_const(grid_dim, block_dim, data_, value, dims);
+    CU_SAFE_CALL(cudaGetLastError());
+
+    CuDevice::Instantiate().AccuProfile(__func__, timer.Elapsed());
+  } else
+#endif
+  {
+    int32 *const end = data_ + dim_;
+    for (int32 *elem = data_; elem != end; ++elem)
+      *elem = value;
+  }
+}
+
+/// Makes this array a copy of "src": resizes to src.Dim() and then copies the
+/// element data byte-for-byte (cudaMemcpy device-to-device when the CuDevice
+/// singleton is enabled, memcpy otherwise).
+template<typename T>
+void CuArray<T>::CopyFromArray(const CuArray<T> &src) {
+  // Self-copy is a no-op.  Without this check the code below would call
+  // memcpy with identical source and destination, i.e. overlapping ranges.
+  if (this == &src) return;
+  this->Resize(src.Dim(), kUndefined);
+  if (dim_ == 0) return;
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    Timer tim;
+    CU_SAFE_CALL(cudaMemcpy(this->data_, src.data_, dim_ * sizeof(T),
+                            cudaMemcpyDeviceToDevice));
+    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
+  } else
+#endif
+  {
+    memcpy(this->data_, src.data_, dim_ * sizeof(T));
+  }
+}
+
+
+} // namespace kaldi
+
+#endif
--- a/src/cudamatrix/cu-array-test.cc
+++ b/src/cudamatrix/cu-array-test.cc
@ -0,0 +1,124 @@
+// cudamatrix/cu-array-test.cc
+
+// Copyright 2013  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <vector>
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "cudamatrix/cu-array.h"
+
+using namespace kaldi;
+
+
+namespace kaldi {
+
+
+
+
+/// Returns true if "a" and "b" have the same number of elements and identical
+/// bytes.  The test data below is random bytes, so for floating-point T an
+/// element may be a NaN, which never compares equal to itself with
+/// operator==; comparing the raw bytes avoids such spurious failures.
+template<class T>
+static bool SameBytes(const std::vector<T> &a, const std::vector<T> &b) {
+  if (a.size() != b.size()) return false;
+  return a.empty() ||
+      std::memcmp(&(a[0]), &(b[0]), a.size() * sizeof(T)) == 0;
+}
+
+/// Exercises CuArray<T> on host vectors filled with random bytes: construction
+/// from a std::vector, CopyToVec, assignment from another CuArray, Resize with
+/// kSetZero, and (only when sizeof(T) == sizeof(int32)) Set.
+template<class T>
+static void UnitTestCuArray() {
+  for (int32 i = 0; i < 30; i++) {
+    int32 size = rand() % 5;
+    size = size * size * size; // Cubing spreads the sizes out: 0, 1, 8, 27 or 64.
+    int32 size2 = rand() % 4;
+    std::vector<T> vec(size);
+    std::vector<T> garbage_vec(size2); // We just use garbage_vec to make sure
+                                       // we sometimes resize from empty,
+                                       // sometimes not.
+
+    // Fill vec with arbitrary bytes; T is treated as plain memory here.
+    const size_t byte_size = size * sizeof(T);
+    std::vector<char> rand_c(byte_size);
+    for (size_t j = 0; j < byte_size; j++)
+      rand_c[j] = rand() % 256;
+    if (!vec.empty()) {
+      std::memcpy((void*)&(vec[0]), (void*)&(rand_c[0]),
+                  byte_size);
+    }
+
+    { // test constructor from vector and CopyToVec.
+      CuArray<T> cu_vec(vec);
+      std::vector<T> vec2;
+      cu_vec.CopyToVec(&vec2);
+      KALDI_ASSERT(SameBytes(vec2, vec));
+    }
+
+    { // test assignment operator from CuArray.
+      CuArray<T> cu_vec(vec);
+      CuArray<T> cu_vec2(garbage_vec);
+      cu_vec2 = cu_vec;
+      std::vector<T> vec2;
+      cu_vec2.CopyToVec(&vec2);
+      KALDI_ASSERT(SameBytes(vec2, vec));
+      KALDI_ASSERT(cu_vec2.Dim() == int32(vec2.size())); // test Dim()
+    }
+
+    { // test resize with resize_type = kSetZero.
+      CuArray<T> cu_vec(vec);
+      cu_vec.Resize(size, kSetZero);
+      std::vector<T> vec2(vec);
+
+      if (!vec2.empty())
+        std::memset(&(vec2[0]), 0, vec2.size() * sizeof(T));
+      std::vector<T> vec3;
+      cu_vec.CopyToVec(&vec3);
+      KALDI_ASSERT(SameBytes(vec2, vec3)); // testing equality of zero arrays.
+    }
+
+    if (sizeof(T) == sizeof(int32) && size > 0) { // test Set for type int32, or same size.
+      CuArray<T> cu_vec(vec);
+      cu_vec.Set(vec[0]);
+      for (size_t j = 1; j < vec.size(); j++) vec[j] = vec[0];
+      std::vector<T> vec2;
+      cu_vec.CopyToVec(&vec2);
+      KALDI_ASSERT(SameBytes(vec2, vec));
+    }
+  }
+}
+
+
+} // namespace kaldi
+
+
+/// Runs the CuArray tests twice: the first pass selects GPU id "no", the
+/// second selects "yes" (the selection only happens in CUDA builds).
+int main() {
+  for (int32 pass = 0; pass < 2; pass++) {
+    const bool first_pass = (pass == 0);
+#if HAVE_CUDA == 1
+    CuDevice::Instantiate().SelectGpuId(first_pass ? "no" : "yes");
+#endif
+
+    //kaldi::UnitTestCuArray<float>();
+    kaldi::UnitTestCuArray<double>();
+    kaldi::UnitTestCuArray<int32>();
+    kaldi::UnitTestCuArray<std::pair<int32, int32> >();
+
+    if (first_pass) {
+      KALDI_LOG << "Tests without GPU use succeeded.\n";
+    } else {
+      KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
+    }
+  }
+#if HAVE_CUDA == 1
+  CuDevice::Instantiate().PrintProfile();
+#endif
+  return 0;
+}
--- a/src/cudamatrix/cu-array.h
+++ b/src/cudamatrix/cu-array.h
@ -0,0 +1,123 @@
+// cudamatrix/cu-array.h
+
+// Copyright 2009-2012  Karel Vesely
+//                2013  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+
+#ifndef KALDI_CUDAMATRIX_CU_ARRAY_H_
+#define KALDI_CUDAMATRIX_CU_ARRAY_H_
+
+#include "matrix/kaldi-vector.h"
+
+namespace kaldi {
+
+
+/**
+ * std::vector equivalent for CUDA computing.  This class is mostly intended as
+ * a CUDA-based mirror of a std::vector object that lives on the CPU.  We don't
+ * call constructors, initializers, etc., on the GPU.
+ */
+template<typename T>
+class CuArray {
+  typedef CuArray<T> ThisType;
+ public:
+
+  /// Default Constructor; creates an empty array (dimension 0, NULL data).
+  CuArray<T>() : dim_(0), data_(NULL) {  }
+
+  /// Constructor with memory initialisation.  resize_type may be kSetZero or
+  /// kUndefined.
+  explicit CuArray<T>(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero):
+    dim_(0), data_(NULL) { Resize(dim, resize_type); }
+
+  /// Constructor from a CPU-based std::vector<T>; the elements are copied
+  /// with CopyFromVec().
+  explicit CuArray<T>(const std::vector<T> &src):
+    dim_(0), data_(NULL) { CopyFromVec(src); }
+
+  /// Copy constructor; the elements are copied with CopyFromArray().  Note
+  /// that it is declared explicit.
+  explicit CuArray<T>(const CuArray<T> &src):
+   dim_(0), data_(NULL) { CopyFromArray(src); }
+
+  /// Destructor
+  ~CuArray() { Destroy(); }
+
+  /// Return the vector dimension
+  MatrixIndexT Dim() const { return dim_;  }
+
+  /// Get raw pointer (const version).  The pointer is NULL for an empty array.
+  const T* Data() const { return data_; }
+
+  /// Get raw pointer (non-const version).
+  T* Data() { return data_; }
+ 
+  /// Allocate the memory.  resize_type may be kSetZero or kUndefined.
+  /// kCopyData not yet supported (can be implemented if needed).
+  void Resize(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero);
+  
+  /// Deallocate the memory and set dim_ and data_ to zero.  Does not call any
+  /// destructors of the objects stored.
+  void Destroy();
+  
+  /// This function resizes if needed.  Note: copying to GPU is done via memcpy,
+  /// and any constructors or assignment operators are not called.
+  void CopyFromVec(const std::vector<T> &src);
+
+  /// This function resizes if needed.
+  void CopyFromArray(const CuArray<T> &src);
+
+  /// This function resizes *dst if needed.  On resize of "dst", the STL vector
+  /// may call copy-constructors, initializers, and assignment operators for
+  /// existing objects (which will be overwritten), but the copy from GPU to CPU
+  /// is done via memcpy.  So be very careful calling this function if your
+  /// objects are more than plain structs.
+  void CopyToVec(std::vector<T> *dst) const;
+
+  /// Sets the memory for the object to zero, via memset.  You should verify
+  /// that this makes sense for type T.
+  void SetZero();
+  
+  /// Set to a constant value.  Note: any copying is done as if using memcpy, and
+  /// assignment operators or destructors are not called.  This is NOT IMPLEMENTED
+  /// YET except for T == int32 (the current implementation will just crash).
+  void Set(const T &value);
+
+  /// Assignment from another CuArray; equivalent to CopyFromArray(in).
+  CuArray<T> &operator= (const CuArray<T> &in) {
+    this->CopyFromArray(in); return *this;
+  }
+
+  /// Assignment from a CPU-based std::vector<T>; equivalent to CopyFromVec(in).
+  CuArray<T> &operator= (const std::vector<T> &in) {
+    this->CopyFromVec(in); return *this;
+  }
+  
+ private:
+  MatrixIndexT dim_;     ///< dimension of the vector
+  T *data_;  ///< GPU data pointer (if GPU not available,
+             ///< will point to CPU memory).
+};
+
+
+/// I/O
+template<typename T>
+std::ostream &operator << (std::ostream &out, const CuArray<T> &vec);
+ 
+} // namespace
+
+
+#include "cudamatrix/cu-array-inl.h"
+
+#endif
+
--- a/src/cudamatrix/cu-block-matrix-test.cc
+++ b/src/cudamatrix/cu-block-matrix-test.cc
@ -0,0 +1,239 @@
+// cudamatrix/cu-block-matrix-test.cc
+
+// Copyright 2013  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "cudamatrix/cu-matrix-lib.h"
+
+using namespace kaldi;
+
+
+namespace kaldi {
+
+/*
+ * ASSERTS
+ */
+/// Asserts that the CPU matrices A and B have the same dimensions and that
+/// every pair of corresponding elements differs by at most
+/// tol * max(1.0, |A(i,j)| + |B(i,j)|).
+template<typename Real> 
+static void AssertEqual(const MatrixBase<Real> &A,
+                        const MatrixBase<Real> &B,
+                        float tol = 0.001) {
+  KALDI_ASSERT(A.NumRows() == B.NumRows()&&A.NumCols() == B.NumCols());
+  for (MatrixIndexT i = 0;i < A.NumRows();i++) {
+    for (MatrixIndexT j = 0;j < A.NumCols();j++) {
+      // The max(1.0, ...) term makes the bound absolute for small entries and
+      // relative for large ones.
+      KALDI_ASSERT(std::abs(A(i, j)-B(i, j)) <= tol*std::max(1.0, (double) (std::abs(A(i, j))+std::abs(B(i, j)))));
+    }
+  }
+}
+
+
+template<typename Real> 
+static void AssertEqual(const CuMatrixBase<Real> &A,
+                        const CuMatrixBase<Real> &B,
+                        float tol = 0.001) {
+  Real Anorm = A.FrobeniusNorm(), Bnorm = B.FrobeniusNorm();
+  CuMatrix<Real> diff(A);
+  diff.AddMat(-1.0, B);
+  Real diff_norm = diff.FrobeniusNorm();
+  if (diff_norm > tol * 0.5 * (Anorm + Bnorm)) {
+    KALDI_LOG << "A = " << A;
+    KALDI_LOG << "B = " << B;
+    KALDI_ERR << "Matrices differ, " << diff_norm << " > " << tol << " * 0.5 *  ( "
+              << Anorm << " + " << Bnorm << " ). ";
+  }
+}
+
+
+/// Compares two block matrices by converting each one to a CuMatrix and
+/// handing the pair to the CuMatrixBase overload of AssertEqual.
+template<typename Real>
+static void AssertEqual(const CuBlockMatrix<Real> &A,
+                        const CuBlockMatrix<Real> &B,
+                        float tol = 0.001) {
+  CuMatrix<Real> A_full(A);
+  CuMatrix<Real> B_full(B);
+  AssertEqual(A_full, B_full, tol);
+}
+
+
+
+
+
+/// Round-trip test for CuBlockMatrix::Write / CuBlockMatrix::Read: builds a
+/// block matrix from 0 to 4 random blocks, writes it to a string stream, reads
+/// it back into a second object and checks that both convert to equal
+/// CuMatrix objects.
+template<class Real>
+static void UnitTestCuBlockMatrixIO() {
+  for (int32 i = 0; i < 10; i++) {
+    int32 num_blocks = rand() % 5;
+    std::vector<CuMatrix<Real> > data(num_blocks);
+    for (int32 b = 0; b < num_blocks; b++) {
+      int32 dimM = 100 + rand() % 255, dimN = 10 + rand() % 20;
+      // Even-numbered blocks get their dimensions swapped, so the block
+      // shapes alternate.
+      if (b % 2 == 0) std::swap(dimM, dimN);
+      data[b].Resize(dimM, dimN);
+      data[b].SetRandn();
+    }
+    CuBlockMatrix<Real> B(data);
+
+    std::ostringstream os;
+    // Both values of the "binary" flag are exercised over the iterations.
+    bool binary = (i % 4 < 2);
+    B.Write(os, binary);
+
+    CuBlockMatrix<Real> B2;
+    std::istringstream is(os.str());
+    B2.Read(is, binary);
+
+    CuMatrix<Real> mat(B), mat2(B2);
+    AssertEqual(mat, mat2);
+    // Guards against the comparison passing trivially on all-zero data.
+    if (!data.empty())
+      KALDI_ASSERT(mat.Sum() != 0.0);
+  }
+}
+
+
+
+/// Checks CuMatrix::AddMatBlock against CuMatrix::AddMatMat called with a
+/// dense copy of the same randomly generated block matrix; the transpose
+/// flags for A and B vary with the iteration number.
+template<class Real>
+static void UnitTestCuBlockMatrixAddMatBlock() {
+  for (int32 i = 0; i < 20; i++) {
+    int32 num_blocks = rand() % 5;
+    std::vector<CuMatrix<Real> > data(num_blocks);
+    for (int32 b = 0; b < num_blocks; b++) {
+      int32 dimM = 100 + rand() % 255, dimN = 10 + rand() % 20;
+      // Even-numbered blocks get their dimensions swapped, so the block
+      // shapes alternate.
+      if (b % 2 == 0) std::swap(dimM, dimN);
+      data[b].Resize(dimM, dimN);
+      data[b].SetRandn();
+    }
+    CuBlockMatrix<Real> B(data);
+    int32 B_num_rows = B.NumRows(), B_num_cols = B.NumCols();
+    // will do X += A B
+
+    MatrixTransposeType transB = (i % 2 == 1 ? kTrans : kNoTrans),
+        transA = (i % 3 == 1 ? kTrans : kNoTrans);
+    // From here on B_num_rows / B_num_cols describe B as it enters the
+    // product, i.e. after the optional transpose.
+    if (transB == kTrans) std::swap(B_num_rows, B_num_cols);
+
+    int32 X_num_rows = 100 + rand() % 255, X_num_cols = B_num_cols,
+        A_num_rows = X_num_rows, A_num_cols = B_num_rows;
+    // With no blocks, B is empty; make X and A empty as well.
+    if (data.size() == 0) { X_num_rows = 0; A_num_rows = 0; }
+    if (transA == kTrans) std::swap(A_num_rows, A_num_cols);
+
+    Real alpha = 2.0, beta = -1.0;
+    CuMatrix<Real> X(X_num_rows, X_num_cols);
+    X.SetRandn();
+    CuMatrix<Real> A(A_num_rows, A_num_cols);
+    A.SetRandn();
+
+    // Xcopy receives the reference result, computed with a dense copy of B.
+    CuMatrix<Real> Xcopy(X), Bcopy(B);
+    Xcopy.AddMatMat(alpha, A, transA, Bcopy, transB, beta);
+    X.AddMatBlock(alpha, A, transA, B, transB, beta);
+
+    AssertEqual(X, Xcopy);
+  }
+}
+
+
+/// Checks CuBlockMatrix::AddMatMat: the same AddMatMat call is applied to a
+/// block matrix B and to a dense copy of it (Bmat), and the block-structured
+/// part of the dense result is then compared with B.
+template<class Real>
+static void UnitTestCuBlockMatrixAddMatMat() {
+  for (int32 i = 0; i < 20; i++) {
+    int32 num_blocks = rand() % 5;
+    std::vector<CuMatrix<Real> > data(num_blocks);
+    for (int32 b = 0; b < num_blocks; b++) {
+      int32 dimM = 100 + rand() % 255, dimN = 10 + rand() % 20;
+      if (i == 0) { dimM = 1; dimN = 1; }
+      // early failures will have small dim for easier eyeballing.
+      if (b % 2 == 0) std::swap(dimM, dimN);
+      data[b].Resize(dimM, dimN);
+      data[b].SetRandn();
+    }    
+    
+    CuBlockMatrix<Real> B(data);
+    int32 B_num_rows = B.NumRows(), B_num_cols = B.NumCols();
+    // will do B += C D
+
+    int32 C_num_rows = B_num_rows, C_num_cols = 100 + rand() % 255;
+    // With an empty B, make the inner dimension empty as well.
+    if (C_num_rows == 0) C_num_cols = 0;
+    int32 D_num_rows = C_num_cols, D_num_cols = B_num_cols;
+
+    MatrixTransposeType transC = (i % 2 == 1 ? kTrans : kNoTrans),
+        transD = (i % 3 == 1 ? kTrans : kNoTrans);
+    // A matrix that is passed with kTrans is stored with swapped dimensions.
+    if (transC == kTrans) std::swap(C_num_rows, C_num_cols);
+    if (transD == kTrans) std::swap(D_num_rows, D_num_cols);
+
+    CuMatrix<Real> C(C_num_rows, C_num_cols), D(D_num_rows, D_num_cols);
+    C.SetRandn();
+    D.SetRandn();
+    
+    // Dense copy of B, taken before B is modified.
+    CuMatrix<Real> Bmat(B);
+
+    Real alpha = 2.0, beta = -1.0;
+
+    CuBlockMatrix<Real> Bcopy(B);
+
+    B.AddMatMat(alpha, C, transC, D, transD, beta);
+    
+    Bmat.AddMatMat(alpha, C, transC, D, transD, beta);
+
+
+    // Now check that the block-structured part of Bmat is the
+    // same as B.
+    Bcopy.CopyFromMat(Bmat); // copy block-structured part from Bmat to Bcopy.
+
+    AssertEqual(B, Bcopy);
+    // Guards against the comparison passing trivially on all-zero data.
+    KALDI_ASSERT(Bmat.Sum() != 0 || B_num_rows == 0);
+  }
+}
+
+
+/// Runs all of the CuBlockMatrix unit tests in this file for one floating-point
+/// type.
+template<typename Real> void CuBlockMatrixUnitTest() {
+  UnitTestCuBlockMatrixIO<Real>();
+  UnitTestCuBlockMatrixAddMatBlock<Real>();
+  UnitTestCuBlockMatrixAddMatMat<Real>();
+}
+
+
+} // namespace kaldi
+
+
+/// Runs the CuBlockMatrix tests twice: the first pass selects GPU id "no", the
+/// second selects "yes" (the selection only happens in CUDA builds).  In CUDA
+/// builds the double-precision tests run only if the device reports support
+/// for double precision.
+int main() {
+  for (int32 pass = 0; pass < 2; pass++) {
+    const bool first_pass = (pass == 0);
+#if HAVE_CUDA == 1
+    CuDevice::Instantiate().SelectGpuId(first_pass ? "no" : "yes");
+#endif
+
+    kaldi::CuBlockMatrixUnitTest<float>();
+#if HAVE_CUDA == 1
+    if (!CuDevice::Instantiate().DoublePrecisionSupported()) {
+      KALDI_WARN << "Double precision not supported";
+    } else {
+      kaldi::CuBlockMatrixUnitTest<double>();
+    }
+#else
+    kaldi::CuBlockMatrixUnitTest<double>();
+#endif
+    if (first_pass) {
+      KALDI_LOG << "Tests without GPU use succeeded.\n";
+    } else {
+      KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
+    }
+  }
+#if HAVE_CUDA == 1
+  CuDevice::Instantiate().PrintProfile();
+#endif
+  return 0;
+}
--- a/src/cudamatrix/cu-block-matrix.cc
+++ b/src/cudamatrix/cu-block-matrix.cc
@ -0,0 +1,330 @@
+// cudamatrix/cu-block-matrix.cc
+
+// Copyright 2013      Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#if HAVE_CUDA == 1
+#include <cuda_runtime_api.h>
+#include <cublas.h>
+#endif
+
+#include "util/timer.h"
+#include "cudamatrix/cu-block-matrix.h"
+#include "cudamatrix/cu-matrix.h"
+#include "cudamatrix/cu-device.h"
+
+namespace kaldi {
+
+// Default constructor: creates an empty block matrix (no blocks, zero rows).
+// num_rows_ is initialized explicitly so that NumRows() is well defined on a
+// default-constructed object; data_ and block_data_ default-construct to empty.
+template<class Real>
+CuBlockMatrix<Real>::CuBlockMatrix(): num_rows_(0) {
+#if HAVE_CUDA == 1
+  cu_data_ = NULL;  // no device-side descriptor table yet.
+#endif
+}
+
+// Builds a block-diagonal matrix whose diagonal blocks are copies of the
+// matrices in "data" (each must have at least one row and one column).
+// The blocks are packed side by side into data_: block b occupies rows
+// [0, num_rows) and columns [col_offset, col_offset + num_cols) of data_,
+// so data_ has as many rows as the tallest block and as many columns as the
+// sum of the block widths.
+template<class Real>
+CuBlockMatrix<Real>::CuBlockMatrix(const std::vector<CuMatrix<Real> >&data) {
+#if HAVE_CUDA == 1
+  cu_data_ = NULL;
+#endif
+  const size_t block_count = data.size();
+  block_data_.resize(block_count);
+  MatrixIndexT total_rows = 0, total_cols = 0, tallest_block = 0;
+  for (size_t idx = 0; idx < block_count; idx++) {
+    const MatrixIndexT num_rows = data[idx].NumRows(),
+        num_cols = data[idx].NumCols();
+    KALDI_ASSERT(num_rows > 0 && num_cols > 0);
+    // Record where this block sits inside the full block-diagonal matrix.
+    BlockMatrixData &entry = block_data_[idx];
+    entry.num_rows = num_rows;
+    entry.num_cols = num_cols;
+    entry.row_offset = total_rows;
+    entry.col_offset = total_cols;
+    total_rows += num_rows;
+    total_cols += num_cols;
+    if (num_rows > tallest_block)
+      tallest_block = num_rows;
+  }
+  num_rows_ = total_rows;
+  data_.Resize(tallest_block, total_cols);
+  // Copy the contents only after data_ has its final size.
+  for (int32 b = 0; b < NumBlocks(); b++)
+    Block(b).CopyFromMat(data[b]);
+  SetCudaData();
+}
+
+
+// Returns a read-only view of block b: rows [0, num_rows) and columns
+// [col_offset, col_offset + num_cols) of the packed matrix data_.
+template<class Real>
+const CuSubMatrix<Real> CuBlockMatrix<Real>::Block(int32 b) const {
+  KALDI_ASSERT(static_cast<size_t>(b) < block_data_.size());
+  const BlockMatrixData &info = block_data_[b];
+  const MatrixIndexT first_row = 0;  // every block starts at row 0 of data_.
+  return CuSubMatrix<Real>(data_,
+                           first_row, info.num_rows,
+                           info.col_offset, info.num_cols);
+}
+
+// Returns a writable view of block b: rows [0, num_rows) and columns
+// [col_offset, col_offset + num_cols) of the packed matrix data_.
+template<class Real>
+CuSubMatrix<Real> CuBlockMatrix<Real>::Block(int32 b) {
+  KALDI_ASSERT(static_cast<size_t>(b) < block_data_.size());
+  BlockMatrixData &info = block_data_[b];
+  const MatrixIndexT first_row = 0;  // every block starts at row 0 of data_.
+  return CuSubMatrix<Real>(data_,
+                           first_row, info.num_rows,
+                           info.col_offset, info.num_cols);
+}
+
+
+// Copy constructor: copies the packed data and the block layout from "other",
+// then builds this object's own device-side descriptor table via
+// SetCudaData() (which does nothing if there are no blocks or no GPU).
+template<class Real>
+CuBlockMatrix<Real>::CuBlockMatrix(const CuBlockMatrix<Real> &other)
+    : data_(other.data_),
+      block_data_(other.block_data_),
+      num_rows_(other.num_rows_) {
+#if HAVE_CUDA == 1
+  cu_data_ = NULL;  // SetCudaData() asserts that no table is allocated yet.
+#endif
+  SetCudaData();
+}
+
+// Assignment operator: makes *this a copy of "other" (packed data and block
+// layout) and rebuilds the device-side descriptor table so that it refers to
+// this object's own data_.
+template<class Real>
+CuBlockMatrix<Real> &CuBlockMatrix<Real>::operator =(const CuBlockMatrix<Real> &other) {
+  // Self-assignment: nothing to copy.  Returning early avoids needlessly
+  // freeing and re-creating the device table, and avoids relying on
+  // CuMatrix self-assignment being safe.
+  if (this == &other) return *this;
+  FreeCudaData();  // the old table points into the old data_.
+  data_ = other.data_;
+  block_data_ = other.block_data_;
+  num_rows_ = other.num_rows_;
+  SetCudaData();
+  return *this;
+}
+
+// Releases the device-side descriptor table, if one exists, and resets
+// cu_data_ to NULL.  Holding a table while no GPU is enabled is reported as an
+// error.  Compiles to a no-op without CUDA.
+template<class Real>
+void CuBlockMatrix<Real>::FreeCudaData() {
+#if HAVE_CUDA == 1
+  if (cu_data_ == NULL) return;  // nothing was allocated.
+  if (!CuDevice::Instantiate().Enabled()) {
+    KALDI_ERR << "CuBlockMatrix: you have CUDA data pointer but "
+              << "no GPU is enabled: likely code error.";
+  } else {
+    CuDevice::Instantiate().Free(cu_data_);
+    cu_data_ = NULL;
+  }
+#endif
+}
+
+
+// Allocates cu_data_ on the device and fills it with one CuBlockMatrixData
+// record per block (offsets within the full matrix, dimensions and data
+// pointer of the block).  Requires cu_data_ == NULL on entry.  Does nothing
+// if there are no blocks or if no GPU is enabled; compiles to a no-op without
+// CUDA.
+template<class Real>
+void CuBlockMatrix<Real>::SetCudaData() {
+#if HAVE_CUDA == 1
+  KALDI_ASSERT(cu_data_ == NULL);
+  if (block_data_.size() == 0) return; // Nothing to do.
+  if (!CuDevice::Instantiate().Enabled()) return;
+
+  Timer tim;
+  const int32 block_count = NumBlocks();
+  // Host-side staging copy of the table that is uploaded below.
+  std::vector<CuBlockMatrixData> tmp_cu_data(block_count);
+  int32 next_row = 0, next_col = 0;
+  for (int32 b = 0; b < block_count; b++) {
+    CuSubMatrix<Real> block = Block(b);
+    CuBlockMatrixData &entry = tmp_cu_data[b];
+    entry.row_offset = next_row;
+    entry.col_offset = next_col;
+    entry.matrix_dim = block.Dim();
+    entry.matrix_data = static_cast<void*>(block.Data());
+    next_row += block.NumRows();
+    next_col += block.NumCols();
+  }
+  size_t size = block_count * sizeof(CuBlockMatrixData);
+  cu_data_ = static_cast<CuBlockMatrixData*>(
+      CuDevice::Instantiate().Malloc(size));
+  CU_SAFE_CALL(cudaMemcpy(cu_data_, &(tmp_cu_data[0]), size, cudaMemcpyHostToDevice));
+  CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
+#endif
+}
+
+// Exchanges the complete state of *this and *other: packed data, block
+// layout, row count and (with CUDA) the device-side descriptor table pointer.
+// Nothing is copied or reallocated.
+template<class Real>
+void CuBlockMatrix<Real>::Swap(CuBlockMatrix<Real> *other) {
+#if HAVE_CUDA == 1
+  std::swap(cu_data_, other->cu_data_);
+#endif
+  std::swap(num_rows_, other->num_rows_);
+  block_data_.swap(other->block_data_);
+  data_.Swap(&other->data_);
+}
+
+// Serializes the matrix as: the token <CuBlockMatrix>, the number of blocks,
+// each block in order, and the token </CuBlockMatrix>.
+template<class Real>
+void CuBlockMatrix<Real>::Write(std::ostream &os, bool binary) const {
+  int32 block_count = NumBlocks();
+  WriteToken(os, binary, "<CuBlockMatrix>");
+  WriteBasicType(os, binary, block_count);
+  int32 b = 0;
+  while (b < block_count) {
+    Block(b).Write(os, binary);
+    ++b;
+  }
+  WriteToken(os, binary, "</CuBlockMatrix>");
+}
+
+
+// Replaces the contents of *this with a matrix read from "is".  Two layouts
+// are accepted:
+//  - the current one, as produced by Write(): block count and blocks wrapped
+//    in <CuBlockMatrix> ... </CuBlockMatrix>;
+//  - an older one without the tokens (block count followed by the blocks),
+//    kept for back-compatibility with the older format of
+//    MixtureProbComponent.  That support should be deleted eventually.
+template<class Real>
+void CuBlockMatrix<Real>::Read(std::istream &is, bool binary) {
+  Destroy();
+  // The current format starts with '<'; anything else is the legacy format.
+  const bool has_tokens = (Peek(is, binary) == static_cast<int>('<'));
+  if (has_tokens)
+    ExpectToken(is, binary, "<CuBlockMatrix>");
+
+  int32 size;
+  ReadBasicType(is, binary, &size);
+  KALDI_ASSERT(size >= 0);
+  std::vector<CuMatrix<Real> > data(size);
+  for (int32 b = 0; b < size; b++)
+    data[b].Read(is, binary);
+
+  if (has_tokens)
+    ExpectToken(is, binary, "</CuBlockMatrix>");
+
+  // The constructor from std::vector<CuMatrix<Real> > does the main job of
+  // initialization; take over its result.
+  CuBlockMatrix<Real> block_mat(data);
+  this->Swap(&block_mat);
+}
+
+// Returns the object to the empty state: releases the packed data, forgets
+// the block layout, zeroes the row count and frees the device-side
+// descriptor table (if any).  Called from the destructor and from Read().
+template<class Real>
+void CuBlockMatrix<Real>::Destroy() {
+  data_.Resize(0, 0);
+  block_data_.clear();
+  num_rows_ = 0;  
+  // May raise an error if a table exists but no GPU is enabled.
+  FreeCudaData();
+}
+
+// Does *this = alpha A B + beta * *this, discarding elements outside
+// the block structure of the *this matrix. 
+//
+//   alpha, beta     scalars of the update.
+//                   NOTE(review): these are BaseFloat rather than Real, so a
+//                   double instantiation may lose precision if BaseFloat is
+//                   float -- confirm this is intended.
+//   A, transA       left factor; op(A) must have NumRows() rows.
+//   B, transB       right factor; op(B) must have NumCols() columns, and the
+//                   inner dimensions of op(A) and op(B) must match.
+//
+// With a GPU enabled, a single kernel handles all blocks using the
+// device-side table cu_data_; otherwise each block is updated by a separate
+// AddMatMat call on the matching row range of op(A) and column range of op(B).
+template<class Real>
+void CuBlockMatrix<Real>::AddMatMat(
+    BaseFloat alpha,
+    const CuMatrix<Real> &A, MatrixTransposeType transA,
+    const CuMatrix<Real> &B, MatrixTransposeType transB,
+    BaseFloat beta) {
+  // Dimensions and strides of op(A) and op(B): transposition is expressed by
+  // swapping the dimensions and the row/column strides.
+  MatrixIndexT A_num_rows = A.NumRows(), A_num_cols = A.NumCols(),
+      A_row_stride = A.Stride(), A_col_stride = 1,
+      B_num_rows = B.NumRows(), B_num_cols = B.NumCols(),
+      B_row_stride = B.Stride(), B_col_stride = 1;
+  if (transA == kTrans) {
+    std::swap(A_num_rows, A_num_cols);
+    std::swap(A_row_stride, A_col_stride);
+  }
+  if (transB == kTrans) {
+    std::swap(B_num_rows, B_num_cols);
+    std::swap(B_row_stride, B_col_stride);
+  }
+  KALDI_ASSERT(A_num_rows == NumRows() && B_num_cols == NumCols()
+               && A_num_cols == B_num_rows);
+  if (NumBlocks() == 0) return; // empty matrix.
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    Timer tim;
+
+    // (x,y,z) dimensions are (block-id, row-of-block, col-of-block)
+    // First some logic to choose block dims...
+    // we assume (which we can, safely) that CU1DBLOCK is <= the max threads per block.
+    int32 x_blocksize = std::min(CU1DBLOCK, NumBlocks()); // x dim corresponds to block-idx.
+    int32 max_block_rows = MaxBlockRows(), max_block_cols = MaxBlockCols();
+    // Shrink y, then z, until the thread block holds at most CU1DBLOCK threads
+    // and neither y nor z exceeds CU2DBLOCK.
+    int32 y_blocksize = max_block_rows;
+    while (y_blocksize * x_blocksize > CU1DBLOCK || y_blocksize > CU2DBLOCK)
+      y_blocksize--;
+    int32 z_blocksize = max_block_cols;
+    while (z_blocksize * x_blocksize * y_blocksize > CU1DBLOCK || z_blocksize > CU2DBLOCK)
+      z_blocksize--;
+    
+    // The grid covers all blocks and the largest block's rows and columns.
+    dim3 dimBlock(x_blocksize, y_blocksize, z_blocksize);
+    dim3 dimGrid(n_blocks(NumBlocks(), x_blocksize),
+                 n_blocks(max_block_rows, y_blocksize),
+                 n_blocks(max_block_cols, z_blocksize));
+    cuda_block_add_mat_mat(dimGrid, dimBlock, cu_data_, NumBlocks(),
+                           A.Data(), A_num_cols, A_row_stride, A_col_stride,
+                           B.Data(), B_row_stride, B_col_stride, alpha, beta);
+    CU_SAFE_CALL(cudaGetLastError());    
+    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());    
+  } else
+#endif
+  {
+    // Non-GPU path: walk down the diagonal one block at a time.
+    int32 row_offset = 0, col_offset = 0;    
+    for (MatrixIndexT b = 0; b < NumBlocks(); b++) {
+      CuSubMatrix<Real> this_block = Block(b);
+      MatrixIndexT this_num_rows = this_block.NumRows(),
+          this_num_cols = this_block.NumCols();
+      // A_part holds the rows of op(A) for this block (columns of A if A is
+      // transposed); B_part holds the columns of op(B) for this block (rows
+      // of B if B is transposed).
+      CuSubMatrix<Real> A_part = (transA == kNoTrans ?
+                                  A.Range(row_offset, this_num_rows,
+                                          0, A.NumCols()) :
+                                  A.Range(0, A.NumRows(),
+                                          row_offset, this_num_rows)),
+          B_part = (transB == kNoTrans ?
+                    B.Range(0, B.NumRows(),
+                            col_offset, this_num_cols) :
+                    B.Range(col_offset, this_num_cols,
+                            0, B.NumCols()));
+      this_block.AddMatMat(alpha, A_part, transA, B_part, transB, beta);
+      row_offset += this_num_rows;
+      col_offset += this_num_cols;
+    }
+    // Sanity check: the blocks must tile the full diagonal.
+    KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
+  }
+}
+
+// Returns the number of columns of the widest block, or 0 if there are no
+// blocks.
+template<class Real>
+MatrixIndexT CuBlockMatrix<Real>::MaxBlockCols() const {
+  MatrixIndexT widest = 0;
+  typename std::vector<BlockMatrixData>::const_iterator it =
+      block_data_.begin();
+  for (; it != block_data_.end(); ++it) {
+    if (it->num_cols > widest)
+      widest = it->num_cols;
+  }
+  return widest;
+}
+
+// Returns the number of rows of the tallest block.  No search is needed: the
+// constructor from a vector of matrices sizes the packed matrix data_ to
+// exactly that many rows.
+template<class Real>
+MatrixIndexT CuBlockMatrix<Real>::MaxBlockRows() const {
+  return data_.NumRows();
+}
+
+// Overwrites every block of *this with the corresponding diagonal block of
+// the full matrix M, which must have the same overall dimensions as *this.
+// Elements of M outside the block structure are ignored.
+template<class Real>
+void CuBlockMatrix<Real>::CopyFromMat(const CuMatrix<Real> &M) {
+  KALDI_ASSERT(NumRows() == M.NumRows() && NumCols() == M.NumCols());
+  const MatrixIndexT block_count = NumBlocks();
+  MatrixIndexT row_offset = 0, col_offset = 0;
+  MatrixIndexT b = 0;
+  while (b < block_count) {
+    CuSubMatrix<Real> dest = Block(b);
+    const MatrixIndexT rows = dest.NumRows(), cols = dest.NumCols();
+    // Part of M that lines up with block b on the diagonal.
+    const CuSubMatrix<Real> diag_part(M, row_offset, rows, col_offset, cols);
+    dest.CopyFromMat(diag_part);
+    row_offset += rows;
+    col_offset += cols;
+    ++b;
+  }
+  KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
+}
+
+/**
+ * Writes the matrix to the stream in text (non-binary) mode and returns the
+ * stream.
+ */
+template<typename Real>
+std::ostream &operator << (std::ostream &out, const CuBlockMatrix<Real> &mat) {
+  mat.Write(out, false);  // false: text mode.
+  return out;
+}
+// Explicitly instantiate the stream operator for float and double; its
+// definition lives in this .cc file, so other translation units rely on
+// these instantiations.
+template
+std::ostream &operator << (std::ostream &out, const CuBlockMatrix<float> &mat);
+template 
+std::ostream &operator << (std::ostream &out, const CuBlockMatrix<double> &mat);
+
+// Instantiate the class for float and double.
+template class CuBlockMatrix<float>;
+template class CuBlockMatrix<double>;
+
+} // namespace kaldi
--- a/src/cudamatrix/cu-block-matrix.h
+++ b/src/cudamatrix/cu-block-matrix.h
@ -0,0 +1,150 @@
+// cudamatrix/cu-block-matrix.h
+
+// Copyright 2013      Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+
+#ifndef KALDI_CUDAMATRIX_CU_BLOCK_MATRIX_H_
+#define KALDI_CUDAMATRIX_CU_BLOCK_MATRIX_H_
+
+#include <sstream>
+
+#include <vector>
+#include "cudamatrix/cu-common.h"
+
+
+namespace kaldi {
+
+
+/**
+   The class CuBlockMatrix represents a block-diagonal matrix
+   diag(M_1, M_2, ... M_N), where the individual blocks M_1 .. M_N do not
+   have to be square.  The reason the class is needed is mostly so that we
+   can efficiently multiply by this block-diagonal structure in a parallel
+   way.
+
+   The blocks are not stored as separate matrices: they are packed side by
+   side into one CuMatrix (data_), each block starting at row zero, and the
+   position and size of every block is recorded in block_data_.
+
+   If we have a GPU available, a small table describing the blocks (offsets,
+   dimensions and data pointers) is additionally kept in GPU memory for use
+   by CUDA kernels; the 'primary' description of the block layout stays on
+   the CPU, and the GPU table is rebuilt from it when the object is
+   constructed, copied or assigned.
+ */
+
+template<typename Real>
+class CuBlockMatrix {
+ public:
+  friend class CuMatrixBase<Real>;
+  
+  /// Creates an empty matrix with no blocks.
+  CuBlockMatrix();
+
+  /// Creates the matrix diag(data[0], data[1], ...); every element of
+  /// "data" must have at least one row and one column.
+  CuBlockMatrix(const std::vector<CuMatrix<Real> > &data);
+
+  ~CuBlockMatrix() { Destroy(); }
+  
+  /// Copy constructor
+  CuBlockMatrix(const CuBlockMatrix &other); 
+
+  /// Assignment operator
+  CuBlockMatrix &operator= (const CuBlockMatrix &other); 
+
+  /// Writes the block count followed by each block, wrapped in
+  /// <CuBlockMatrix> ... </CuBlockMatrix> tokens.
+  void Write(std::ostream &os, bool binary) const;
+  
+  /// Replaces the contents with a matrix read from the stream; also accepts
+  /// an older format without the enclosing tokens.
+  void Read(std::istream &is, bool binary);
+
+  /// Number of rows of the full block-diagonal matrix (sum over blocks).
+  MatrixIndexT NumRows() const { return num_rows_; }
+
+  /// Number of columns of the full block-diagonal matrix; equals the number
+  /// of columns of data_, where the blocks are laid out side by side.
+  MatrixIndexT NumCols() const { return data_.num_cols_; }
+
+  MatrixIndexT NumBlocks() const { return block_data_.size(); }
+  
+  // Returns max num-columns of any block
+  MatrixIndexT MaxBlockCols() const ;
+
+  // Returns max num-rows of any block
+  MatrixIndexT MaxBlockRows() const;
+    
+  /// Returns a view of block b (0 <= b < NumBlocks()).
+  const CuSubMatrix<Real> Block(MatrixIndexT b) const;
+
+  CuSubMatrix<Real> Block(MatrixIndexT b); // returns a sub-matrix view rather than a CuMatrix, to disallow resizes.
+
+
+  /// Does *this = alpha A B + beta * *this, discarding elements of the product outside
+  /// the block structure of the *this matrix.  The transA and transB parameters
+  /// can be used to substitute A^T for A and B^T for B, respectively.
+  void AddMatMat(BaseFloat alpha,
+                 const CuMatrix<Real> &A, MatrixTransposeType transA,
+                 const CuMatrix<Real> &B, MatrixTransposeType transB,
+                 BaseFloat beta);
+
+
+  /// Copies elements within the block structure from matrix M, discarding others.
+  /// Note: this has not been implemented in a very efficient way, it's used only
+  /// for testing.
+  void CopyFromMat(const CuMatrix<Real> &M);
+
+  /// Normalizes the columns of *this so that each one sums to one.
+  /// On error (e.g. inf's), will set the column to a constant value that
+  /// sums to one.
+  /// NOTE(review): cu-block-matrix.cc as added together with this header
+  /// contains no definition of this function -- confirm it is defined
+  /// before it is called.
+  void NormalizeColumns();
+
+  /// Exchanges the entire contents of *this and *other.
+  void Swap(CuBlockMatrix *other);
+  
+ protected:
+  CuMatrix<Real> data_; // This is a single matrix into which
+  // we pack all the blocks (possibly with spaces left over)
+
+  // Size of one block and the position of its top-left element within the
+  // full block-diagonal matrix.
+  struct BlockMatrixData{
+    MatrixIndexT num_rows;
+    MatrixIndexT num_cols;
+    MatrixIndexT row_offset;
+    MatrixIndexT col_offset;
+  };
+  
+
+#if HAVE_CUDA == 1
+  // Pointer to the block table in GPU memory (NULL if none has been set up).
+  const CuBlockMatrixData* CuData() const { return cu_data_; }
+#endif
+ private:
+  
+  /// If using GPU and cu_data_ != NULL, free cu_data_ and set it to NULL
+  void FreeCudaData();
+  /// If using GPU, allocate and set cu_data_ on the GPU to reflect "data_".
+  void SetCudaData();
+
+
+  /// Frees and deinitializes everything.
+  void Destroy();
+
+  std::vector<BlockMatrixData> block_data_; // one entry per block, in order.
+  
+  MatrixIndexT num_rows_; // sum of num_rows of elements of block_data_.
+#if HAVE_CUDA == 1
+  CuBlockMatrixData *cu_data_; // We store the pointers and some additional info
+                               // on the GPU card in a form more suited to
+                               // use by CUDA kernels.
+#endif
+}; // class CuBlockMatrix
+
+/// Prints the matrix to the stream in text mode.  Defined and explicitly
+/// instantiated for float and double in cu-block-matrix.cc.
+template<typename Real>
+std::ostream &operator << (std::ostream &out, const CuBlockMatrix<Real> &mat);
+
+
+} // namespace Kaldi
+#endif
--- a/src/cudamatrix/cu-choleskykernels-ansi.h
+++ b/src/cudamatrix/cu-choleskykernels-ansi.h
@ -0,0 +1,53 @@
+// cudamatrix/cu-choleskykernels-ansi.h
+
+// Copyright 2010-2013  Dr. Stephan Kramer
+//  Institut für Numerische und Angewandte Mathematik
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_CUDAMATRIX_CU_CHOLESKYKERNELS_ANSI_H_
+#define KALDI_CUDAMATRIX_CU_CHOLESKYKERNELS_ANSI_H_
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "cudamatrix/cu-matrixdim.h"
+
+#if HAVE_CUDA == 1
+
+// C-linkage entry points for the tiled Cholesky kernels; the definitions are
+// in cu-choleskykernels.cu.  In all of them A is the matrix data, d its
+// dimensions, and block_offset the index (in units of tiles) of the diagonal
+// tile currently being processed.
+extern "C" {
+
+/*********************************************************
+ * float CUDA kernel calls
+ */
+void cudaF_factorize_diagonal_block(float* A, int block_offset, MatrixDim d);
+void cudaF_strip_update(float* A, int block_offset, int n_remaining_blocks, MatrixDim d);
+void cudaF_diag_update(float* A, int block_offset, int n_remaining_blocks, MatrixDim d);
+void cudaF_lo_update(float* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d);
+
+
+/*********************************************************
+ * double CUDA kernel calls
+ */
+void cudaD_factorize_diagonal_block(double* A, int block_offset, MatrixDim d);
+void cudaD_strip_update(double* A, int block_offset, int n_remaining_blocks, MatrixDim d);
+void cudaD_diag_update(double* A, int block_offset, int n_remaining_blocks, MatrixDim d);
+void cudaD_lo_update(double* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d);
+}
+
+#endif // HAVE_CUDA
+
+#endif
--- a/src/cudamatrix/cu-choleskykernels.cu
+++ b/src/cudamatrix/cu-choleskykernels.cu
@ -0,0 +1,359 @@
+// cudamatrix/cu-choleskykernels.cu
+
+// Copyright 2010-2013  Dr. Stephan Kramer
+//  Institut fur Numerische und Angewandte Mathematik
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.    
+// You may obtain a copy of the License at    
+//   
+//  http://www.apache.org/licenses/LICENSE-2.0    
+//   
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED    
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,    
+// MERCHANTABLITY OR NON-INFRINGEMENT.                             
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cudamatrix/cu-choleskykernels-ansi.h"
+#include <stdio.h>
+
+
+#define TILE_SIZE 16
+
+/***********************************************************************
+ * CUDA kernels
+ * some functions are templated to have the float/double operations
+ */
+// Linear offset of element (r, c) in a matrix whose consecutive rows are
+// row_length elements apart.
+__device__ int lex_index_2D (int r, int c, int row_length) {
+  const int row_start = r*row_length;
+  return row_start + c;
+}
+
+
+// Converts a position inside a tile (t_pos) into a position in the whole
+// matrix, for the tile with index block_offset along that dimension.
+__device__ int global_pos(int t_pos, int block_offset) {
+  const int tile_start = TILE_SIZE*block_offset;
+  return tile_start + t_pos;
+}
+
+
+// Single-precision reciprocal square root, 1/sqrt(x); the overload lets the
+// templated kernels call inv_sqrt() for either floating-point type.
+__device__ float inv_sqrt(float x) {
+  return rsqrtf(x);
+}
+
+
+// Double-precision reciprocal square root, 1/sqrt(x); the overload lets the
+// templated kernels call inv_sqrt() for either floating-point type.
+__device__ double inv_sqrt(double x) {
+  return rsqrt(x);
+}
+
+
+// Factorizes, in place, the diagonal tile of A whose top-left element is at
+// (TILE_SIZE*block_offset, TILE_SIZE*block_offset).  Each thread handles one
+// element of the tile: threadIdx.x is the column and threadIdx.y the row
+// within the tile.  Only the lower triangle (row >= col) is written back.
+//   A             matrix data, element (r, c) at offset r*d.stride + c
+//   block_offset  index of the diagonal tile, in units of TILE_SIZE
+//   d             matrix dimensions; d.cols bounds both rows and columns
+// NOTE(review): some threads return before later __syncthreads() calls
+// (out-of-range elements of a partial tile) -- confirm this is safe on the
+// targeted hardware.
+template<typename T>
+__global__
+void __factorize_diagonal_block(T* A, int block_offset, MatrixDim d) {
+  int global_row_length = d.stride;
+
+  int col = threadIdx.x;
+  int row = threadIdx.y;
+
+  int global_row = global_pos(row,block_offset);
+  int global_col = global_pos(col,block_offset);
+
+  // Threads that fall outside the matrix have nothing to do.
+  if ((global_row >= d.cols) || (global_col >= d.cols))
+    return;
+
+  // Number of valid rows/columns in this tile: a full tile, or the
+  // remainder if fewer than TILE_SIZE columns are left.
+  int k_max = TILE_SIZE;
+  if (d.cols - global_pos(0,block_offset) < TILE_SIZE)
+    k_max = d.cols % TILE_SIZE;
+
+
+  int idx = lex_index_2D(global_row, global_col, global_row_length);
+  
+  // Shared copy of the tile; rows are one element longer than the tile
+  // (presumably padding against shared-memory bank conflicts -- TODO confirm).
+  __shared__ T L[TILE_SIZE][TILE_SIZE+1];
+
+  L[row][col] = 0;
+  L[row][col] = A[idx];
+  __syncthreads();
+
+  if ((row >= k_max) || (col >= k_max))
+    return;
+
+
+  T fac;
+
+  for (int k = 0; k < k_max; k++) {
+    __syncthreads();
+    fac = inv_sqrt(L[k][k]);
+    __syncthreads();
+
+    // Scale column k, from the diagonal downwards, by 1/sqrt(L[k][k]).
+    if ((row==k)&&(col>=k))
+      L[col][row] = (L[col][row])*fac;
+
+    __syncthreads();
+
+    // Subtract the contribution of column k from the remaining lower
+    // triangle (columns to the right of k).
+    if ((row>=col)&&(col>k))
+      L[row][col] = L[row][col] - L[col][k]*L[row][k];
+  }
+  __syncthreads();
+
+    
+  if (row >= col) {
+    A[idx] = L[row][col];
+    // Values above 100000 are replaced by 1.
+    // NOTE(review): the reason for this clamp is not evident from the code.
+    if (A[idx] > 100000)
+      A[idx] = 1;
+  }
+}
+
+
+// Updates, in place, the tiles below diagonal tile block_offset, in the same
+// tile column.  Thread block blockIdx.x handles the tile in tile row
+// block_offset + blockIdx.x + 1.  The diagonal tile is loaded into "topleft"
+// and the tile being updated into "workingmat" (stored transposed); the
+// threads with row == 0 then perform a forward substitution with "topleft",
+// one per column of "workingmat", and the result is written back to A.
+//   A             matrix data, element (r, c) at offset r*d.stride + c
+//   block_offset  index of the diagonal tile, in units of TILE_SIZE
+//   d             matrix dimensions; d.cols bounds both rows and columns
+// NOTE(review): threads that are out of range return before the
+// __syncthreads() calls below, and the substitution loop always runs over
+// TILE_SIZE entries even for a partial tile -- confirm both are safe.
+template<typename T>
+__global__
+void __strip_update(T* A, int block_offset, MatrixDim d) {
+  int global_row_length = d.stride;
+
+  int boffy = block_offset;
+  int boffx = blockIdx.x + boffy + 1;
+  
+  int col = threadIdx.x;
+  int row = threadIdx.y;
+
+  __shared__ T topleft[TILE_SIZE][TILE_SIZE+1];
+  __shared__ T workingmat[TILE_SIZE][TILE_SIZE+1];
+
+  int global_row = global_pos(row,block_offset);
+  int global_col = global_pos(col,block_offset);
+
+  if ((global_row >= d.cols) || (global_col >= d.cols))
+    return;
+
+  int idx = lex_index_2D(global_row, global_col, global_row_length);
+
+  // Load this thread's element of the diagonal tile.
+  topleft[row][col] = 0;  
+  topleft[row][col] = A[idx];
+  //__syncthreads();
+  
+  // From here on global_row refers to the tile being updated.
+  global_row = global_pos(row,boffx);
+  
+  if (global_row >= d.cols)
+    return;
+
+  int idx_w = lex_index_2D(global_row, global_col, global_row_length);
+  //int row2 = row + block_offset * TILE_SIZE;
+  //int idx_w = row2 + col*global_row_length;
+  // Note the swapped indices: workingmat holds the tile transposed.
+  workingmat[col][row]=0;
+  workingmat[col][row]=A[idx_w];
+
+  __syncthreads();
+  
+  // One thread per column of workingmat does the forward substitution.
+  if (row==0) {
+    for (int k = 0; k < TILE_SIZE; k++) {
+      T sum=0.0;
+      for (int m = 0; m < k; m++) 
+        sum = sum + topleft[k][m]*workingmat[m][col];
+	
+      workingmat[k][col] = (workingmat[k][col] - sum) / topleft[k][k];
+    }
+  }
+
+  __syncthreads();
+
+  A[idx_w] = workingmat[col][row];
+  // Values above 100000 are replaced by 1.
+  // NOTE(review): the reason for this clamp is not evident from the code.
+  if (A[idx_w] > 100000)
+    A[idx_w] = 1;
+  //A[idx_w] = 1;
+}
+
+
+// Updates, in place, the lower triangle of the diagonal tiles that follow
+// diagonal tile block_offset.  Thread block blockIdx.x works on tile index
+// boffx = block_offset + blockIdx.x + 1: it loads the tile in tile row boffx
+// and tile column block_offset into "left", and subtracts left * left^T
+// (entries with row >= col only) from the diagonal tile (boffx, boffx).
+//   A             matrix data, element (r, c) at offset r*d.stride + c
+//   block_offset  index of the current diagonal tile, in units of TILE_SIZE
+//   d             matrix dimensions; d.cols bounds both rows and columns
+// NOTE(review): out-of-range threads return before __syncthreads(), and the
+// sum always runs over TILE_SIZE entries of "left" -- confirm both are safe
+// for partial tiles.
+template<typename T>
+__global__
+void __diag_update(T* A, int block_offset, MatrixDim d) {
+  int global_row_length = d.stride;
+  int boffx = blockIdx.x + block_offset + 1;
+
+  int col = threadIdx.x;
+  int row = threadIdx.y;
+
+  int global_row = global_pos(row,boffx);
+  int global_col = global_pos(col,block_offset);
+
+  if ((global_row >= d.cols) || (global_col >= d.cols))
+    return;
+
+  int idx = lex_index_2D(global_row, global_col, global_row_length);
+
+  __shared__ T left[TILE_SIZE][TILE_SIZE+1];
+  
+  left[row][col] = 0;
+  left[row][col] = A[idx];
+  
+  __syncthreads();
+
+  T sum = 0.0;
+
+
+  if (row >= col) {
+    // Element (row, col) of left * left^T.
+    for (int kk = 0; kk < TILE_SIZE; kk++)
+      sum = sum + left[row][kk]*left[col][kk];
+    
+    //__syncthreads();
+  
+    // Switch the column index over to the diagonal tile (boffx, boffx).
+    global_col = global_pos(col, boffx);
+ 
+    if (global_col >= d.cols)
+      return;
+
+    idx = lex_index_2D(global_row, global_col, global_row_length);
+
+    A[idx] = A[idx] - sum;
+ 
+  }
+}
+
+
+// Updates, in place, the off-diagonal tiles below the diagonal that follow
+// tile column block_offset.  Thread block blockIdx.y is assigned tile index
+// boffy = block_offset + blockIdx.y + 1.  It loads tile (boffy, block_offset)
+// into "upt"; then, for every tile row boffx in (boffy, n_blocks), it loads
+// tile (boffx, block_offset) into "left" and subtracts left * upt^T from
+// tile (boffx, boffy).
+//   A             matrix data, element (r, c) at offset r*d.stride + c
+//   block_offset  index of the current diagonal tile, in units of TILE_SIZE
+//   n_blocks      exclusive upper bound on the tile rows visited
+//   d             matrix dimensions; d.cols bounds both rows and columns
+// NOTE(review): out-of-range threads return before later __syncthreads()
+// calls, and unlike the other kernels "left" is declared without the extra
+// padding column -- confirm both are intended.
+template<typename T>
+__global__
+void __lo_update(T* A, int block_offset, int n_blocks, MatrixDim d) {
+  int global_row_length = d.stride;
+  int col = threadIdx.x;
+  int row = threadIdx.y;
+  
+  int boffy = blockIdx.y + block_offset + 1;
+  //int boffx = boffy + 1;
+  int boffx = boffy + 1;
+
+  __shared__ T left[TILE_SIZE][TILE_SIZE];
+
+  __shared__ T upt[TILE_SIZE][TILE_SIZE + 1];
+  
+  int global_row = global_pos(row,boffy);
+  int global_col_src = global_pos(col,block_offset);
+
+  if ((global_row >= d.cols) || (global_col_src >= d.cols))
+    return;
+
+  int idx = lex_index_2D(global_row, global_col_src, global_row_length);
+  
+  upt[row][col] = 0;
+  upt[row][col] = A[idx];
+  __syncthreads();
+
+  for (; boffx < n_blocks; boffx++) {
+    global_row = global_pos(row,boffx);
+
+    if (global_row >= d.cols) 
+      return;
+
+    idx = lex_index_2D(global_row, global_col_src, global_row_length);
+    
+    left[row][col] = 0;    
+    left[row][col] = A[idx];
+    
+    __syncthreads();
+
+    // Same condition as the check at the top of this iteration, so it cannot
+    // trigger here.
+    if (global_row >= d.cols)
+      return;
+
+    // Element (row, col) of left * upt^T.
+    T matrixprod = 0.0;
+    
+    for (int kk = 0; kk < TILE_SIZE; kk++)
+      matrixprod += left[row][kk]*upt[col][kk];
+
+    __syncthreads();
+
+    int global_col = global_pos(col,boffy);
+    if (global_col >= d.cols)
+      return;
+        
+    idx = lex_index_2D(global_row, global_col, global_row_length);
+    A[idx] = A[idx] - matrixprod;
+  }
+}
+
+/***********************************************************************
+ * ANSI-C wrappers of CUDA kernels
+ */
+
+/*
+ * float
+ */
+
+// Launches __factorize_diagonal_block<float> with one thread block of
+// TILE_SIZE x TILE_SIZE threads, then waits for it to finish.
+void cudaF_factorize_diagonal_block(float* A, int block_offset, MatrixDim d) {
+  const dim3 one_block(1);
+  const dim3 tile_threads(TILE_SIZE, TILE_SIZE);
+  __factorize_diagonal_block<<<one_block, tile_threads>>>(A, block_offset, d);
+  cudaThreadSynchronize();
+}
+
+// Launches __strip_update<float> with n_remaining_blocks - 1 thread blocks
+// (but never fewer than one) of TILE_SIZE x TILE_SIZE threads, then waits
+// for it to finish.
+void cudaF_strip_update(float* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
+  const int grid_size = (n_remaining_blocks >= 2) ? n_remaining_blocks - 1 : 1;
+  const dim3 strip_grid(grid_size);
+  const dim3 tile_threads(TILE_SIZE, TILE_SIZE);
+  __strip_update<<<strip_grid, tile_threads>>>(A, block_offset, d);
+  cudaThreadSynchronize();
+}
+
+// Launches __diag_update<float> with n_remaining_blocks - 1 thread blocks
+// (but never fewer than one) of TILE_SIZE x TILE_SIZE threads, then waits
+// for it to finish.
+void cudaF_diag_update(float* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
+  const int grid_size = (n_remaining_blocks >= 2) ? n_remaining_blocks - 1 : 1;
+  const dim3 diag_grid(grid_size);
+  const dim3 tile_threads(TILE_SIZE, TILE_SIZE);
+  __diag_update<<<diag_grid, tile_threads>>>(A, block_offset, d);
+  cudaThreadSynchronize();
+}
+
+// Launches __lo_update<float> with a 1 x (n_remaining_blocks - 2) grid of
+// TILE_SIZE x TILE_SIZE thread blocks, then waits for it to finish.
+// When n_remaining_blocks <= 2 there are no tiles to update; the launch is
+// skipped, because a grid dimension of zero (or a negative value converted
+// to unsigned) is an invalid launch configuration.
+void cudaF_lo_update(float* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) {
+  const int n_grid_rows = n_remaining_blocks - 2;
+  if (n_grid_rows > 0) {
+    dim3 logrid;
+    logrid.x = 1;
+    logrid.y = n_grid_rows;
+    dim3 threads(TILE_SIZE,TILE_SIZE);
+    __lo_update<<<logrid,threads>>>(A,block_offset,n_blocks,d);
+  }
+  cudaThreadSynchronize();
+}
+/*
+ * double
+ */
+// Launches __factorize_diagonal_block<double> with one thread block of
+// TILE_SIZE x TILE_SIZE threads, then waits for it to finish.
+void cudaD_factorize_diagonal_block(double* A, int block_offset, MatrixDim d) {
+  const dim3 one_block(1);
+  const dim3 tile_threads(TILE_SIZE, TILE_SIZE);
+  __factorize_diagonal_block<<<one_block, tile_threads>>>(A, block_offset, d);
+  cudaThreadSynchronize();
+}
+
+// Launches __strip_update<double> with n_remaining_blocks - 1 thread blocks
+// (but never fewer than one) of TILE_SIZE x TILE_SIZE threads, then waits
+// for it to finish.
+void cudaD_strip_update(double* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
+  const int grid_size = (n_remaining_blocks >= 2) ? n_remaining_blocks - 1 : 1;
+  const dim3 strip_grid(grid_size);
+  const dim3 tile_threads(TILE_SIZE, TILE_SIZE);
+  __strip_update<<<strip_grid, tile_threads>>>(A, block_offset, d);
+  cudaThreadSynchronize();
+}
+
+// Launches __diag_update<double> with n_remaining_blocks - 1 thread blocks
+// (but never fewer than one) of TILE_SIZE x TILE_SIZE threads, then waits
+// for it to finish.
+void cudaD_diag_update(double* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
+  const int grid_size = (n_remaining_blocks >= 2) ? n_remaining_blocks - 1 : 1;
+  const dim3 diag_grid(grid_size);
+  const dim3 tile_threads(TILE_SIZE, TILE_SIZE);
+  __diag_update<<<diag_grid, tile_threads>>>(A, block_offset, d);
+  cudaThreadSynchronize();
+}
+
+// Launches __lo_update<double> with a 1 x (n_remaining_blocks - 2) grid of
+// TILE_SIZE x TILE_SIZE thread blocks, then waits for it to finish.
+// When n_remaining_blocks <= 2 there are no tiles to update; the launch is
+// skipped, because a grid dimension of zero (or a negative value converted
+// to unsigned) is an invalid launch configuration.
+void cudaD_lo_update(double* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) {
+  const int n_grid_rows = n_remaining_blocks - 2;
+  if (n_grid_rows > 0) {
+    dim3 logrid;
+    logrid.x = 1;
+    logrid.y = n_grid_rows;
+    dim3 threads(TILE_SIZE,TILE_SIZE);
+    __lo_update<<<logrid,threads>>>(A,block_offset,n_blocks,d);
+  }
+  cudaThreadSynchronize();
+}
--- a/src/cudamatrix/cu-choleskykernels.h
+++ b/src/cudamatrix/cu-choleskykernels.h
@ -0,0 +1,62 @@
+// cudamatrix/cu-choleskykernels.h
+
+// Copyright 2010-2013  Dr. Stephan Kramer
+//  Institut für Numerische und Angewandte Mathematik
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_CUDAMATRIX_CU_CHOLESKYKERNELS_H_
+#define KALDI_CUDAMATRIX_CU_CHOLESKYKERNELS_H_
+
+#if HAVE_CUDA == 1
+
+#include "base/kaldi-error.h"
+#include "cudamatrix/cu-choleskykernels-ansi.h"
+
+/*
+ * In this file are C++ templated wrappers
+ * of the ANSI-C CUDA kernels
+ */
+
+namespace kaldi {
+
+/*********************************************************
+* base templates
+*/
+// Fallbacks for element types other than float and double: each one only
+// reports that it is not implemented.
+template<typename Real>
+inline void cuda_factorize_diagonal_block(Real* A, int block_offset, MatrixDim d) {
+  KALDI_ERR << __func__ << " Not implemented!";
+}
+template<typename Real>
+inline void cuda_strip_update(Real* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
+  KALDI_ERR << __func__ << " Not implemented!";
+}
+template<typename Real>
+inline void cuda_diag_update(Real* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
+  KALDI_ERR << __func__ << " Not implemented!";
+}
+template<typename Real>
+inline void cuda_lo_update(Real* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) {
+  KALDI_ERR << __func__ << " Not implemented!";
+}
+/*********************************************************
+* float specialization
+*/
+// float: forward to the cudaF_* C entry points.
+template<>
+inline void cuda_factorize_diagonal_block<float>(float* A, int block_offset, MatrixDim d) {
+  cudaF_factorize_diagonal_block(A, block_offset, d);
+}
+template<>
+inline void cuda_strip_update<float>(float* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
+  cudaF_strip_update(A, block_offset, n_remaining_blocks, d);
+}
+template<>
+inline void cuda_diag_update<float>(float* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
+  cudaF_diag_update(A, block_offset, n_remaining_blocks, d);
+}
+template<>
+inline void cuda_lo_update<float>(float* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) {
+  cudaF_lo_update(A, block_offset, n_blocks, n_remaining_blocks, d);
+}
+/*********************************************************
+* double specialization
+*/
+// double: forward to the cudaD_* C entry points.
+template<>
+inline void cuda_factorize_diagonal_block<double>(double* A, int block_offset, MatrixDim d) {
+  cudaD_factorize_diagonal_block(A, block_offset, d);
+}
+template<>
+inline void cuda_strip_update<double>(double* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
+  cudaD_strip_update(A, block_offset, n_remaining_blocks, d);
+}
+template<>
+inline void cuda_diag_update<double>(double* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
+  cudaD_diag_update(A, block_offset, n_remaining_blocks, d);
+}
+template<>
+inline void cuda_lo_update<double>(double* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) {
+  cudaD_lo_update(A, block_offset, n_blocks, n_remaining_blocks, d);
+}
+
+} // namespace
+
+#endif // HAVE_CUDA
+
+#endif
--- a/src/cudamatrix/cu-common.cc
+++ b/src/cudamatrix/cu-common.cc
@ -0,0 +1,32 @@
+#ifndef KALDI_CUDAMATRIX_COMMON_H_
+#define KALDI_CUDAMATRIX_COMMON_H_
+
+// This file contains the definitions of the non-inline utility
+// functions that are declared in cudamatrix/cu-common.h.
+
+#include "base/kaldi-common.h"
+#include "matrix/kaldi-blas.h"
+#include "cudamatrix/cu-device.h"
+#include "cudamatrix/cu-common.h"
+
+namespace kaldi {
+
+#if HAVE_CUDA == 1
+// Translates Kaldi's MatrixTransposeType into a cublasOperation_t:
+// kNoTrans gives CUBLAS_OP_N, kTrans gives CUBLAS_OP_T, and any other
+// value gives CUBLAS_OP_C.
+cublasOperation_t KaldiTransToCuTrans(MatrixTransposeType kaldi_trans) {
+  if (kaldi_trans == kNoTrans)
+    return CUBLAS_OP_N;
+  if (kaldi_trans == kTrans)
+    return CUBLAS_OP_T;
+  return CUBLAS_OP_C;
+}
+#endif
+
+} // namespace
+
+
+#endif  // KALDI_CUDAMATRIX_COMMON_H_
--- a/src/cudamatrix/cu-common.h
+++ b/src/cudamatrix/cu-common.h
@ -22,20 +22,20 @@

 #ifndef KALDI_CUDAMATRIX_CU_COMMON_H_
 #define KALDI_CUDAMATRIX_CU_COMMON_H_
-
-
-#if HAVE_CUDA==1
-
+#include "cudamatrix/cu-matrixdim.h" // for CU1DBLOCK and CU2DBLOCK

 #include <iostream>
 #include <sstream>
+#include "base/kaldi-error.h"
+#include "matrix/matrix-common.h"

+#if HAVE_CUDA == 1
+#include <cublas.h>
 #include <cuda_runtime_api.h>

-#include "base/kaldi-error.h"


-#define cuSafeCall(fun) \
+#define CU_SAFE_CALL(fun) \
 { \
  int32 ret; \
  if ((ret = (fun)) != 0) { \
@ -47,19 +47,19 @@

 namespace kaldi {

-  /** The size of edge of CUDA square block **/
-  static const int32 CUBLOCK = 16;
+/** Number of blocks into which a task of size 'size' is split when each
+ *  block holds 'block_size' elements: size / block_size, plus one extra
+ *  block when the division leaves a remainder. **/
+inline int32 n_blocks(int32 size, int32 block_size) {
+  int32 num_whole_blocks = size / block_size;
+  if (size % block_size != 0)
+    return num_whole_blocks + 1;
+  return num_whole_blocks;
+}

-  /** Number of blocks in which the task of size 'size' is splitted **/
-  inline int32 n_blocks(int32 size, int32 block_size) { 
-    return size / block_size + ((size % block_size == 0)? 0 : 1); 
-  }
+cublasOperation_t KaldiTransToCuTrans(MatrixTransposeType kaldi_trans);
+  
 }

 #endif // HAVE_CUDA

 namespace kaldi {
-// Some forward declarations, frequently needed
+// Some forward declarations, needed for friend declarations.
 template<typename Real> class CuVectorBase;
 template<typename Real> class CuVector;
 template<typename Real> class CuSubVector;
@ -67,7 +67,13 @@ template<typename Real> class CuRand;
 template<typename Real> class CuMatrixBase;
 template<typename Real> class CuMatrix;
 template<typename Real> class CuSubMatrix;
-template<typename Real> class CuRand;
+template<typename Real> class CuPackedMatrix;
+template<typename Real> class CuSpMatrix;
+template<typename Real> class CuTpMatrix;
+
+template<typename Real> class CuBlockMatrix; // this has no non-CU counterpart.
+
+
 }


--- a/src/cudamatrix/cu-device.cc
+++ b/src/cudamatrix/cu-device.cc
@ -1,6 +1,8 @@
 // cudamatrix/cu-device.cc

 // Copyright 2009-2012  Karel Vesely
+//                2013  Lucas Ondel
+//                2013  Johns Hopkins University (author: Daniel Povey)

 // See ../../COPYING for clarification regarding multiple authors
 //
@ -19,140 +21,137 @@



-#if HAVE_CUDA==1
+#if HAVE_CUDA == 1

 #include <cublas.h>
 #include <cuda.h>
+#include <cuda_runtime_api.h>

+#include <string>
 #include <vector>
+#include <algorithm>
 #include <dlfcn.h>
+#include <unistd.h> // for sleep

 #include "cudamatrix/cu-common.h"
 #include "cudamatrix/cu-device.h"
 #include "base/kaldi-error.h"
-
+#include "util/common-utils.h"

 namespace kaldi {

-CuDevice::CuDevice()
- : active_gpu_id_(-3), verbose_(true) 
-{ }
-
-
-
-CuDevice::~CuDevice() {
-  if (Enabled()) {
-    cuSafeCall(cublasShutdown());
-  } else if (active_gpu_id_ == -2) {
-    KALDI_WARN << "CUDA was NOT used! No CUDA GPU detected!";
-  }
-}
-
-

 /** 
- * SelectGpuId(gpu_id) 
+ * SelectGpuId(use_gpu) 
 *
- * The argument 'gpu_id' meaning: 0..N selects a GPU, 
- * -1 disables CUDA, -2 performs GPU auto-detection.
+ * There are 3 'use_gpu' modes for GPU selection:
+ * "yes"      -- Select GPU automatically (or get one by exclusive mode) 
+ *               and die if this fails.
+ * "optional" -- Do as above, but if it fails, back off to CPU.
+ * "no"       -- Run on CPU.
 *
- * If there is no GPU in the system, and we have GPU auto-detection,
- * or GPU is manually disabled the computation will run on CPU. 
- * In other cases it is an error (manual selection).
+ * In case of Compute exclusive mode, the GPU is selected by OS.
 *
- * In case of Compute exclusive mode, the GPU is selected by OS, 
- * this has priority over manual/auto selection of GPU.
+ * Otherwise GPU selection is based on largest proportion of free memory.
+ * This can eventually lead to multiple processes computing on single GPU,
+ * which is slow. More practical is to use "compute exclusive mode".
 *
- * Since the autoselection of GPU is not perfect, it may still 
- * happen that two processes compute on single GPU, which is slow. 
- * The users are advised to use manual selection or exclusive mode.
- *
- * This method must be called at the very beginning of the program
- * (before the cudamatrix objects allocate memory for the data), 
- * or not at all (when we intentionally want to run on the CPU). 
+ * This method is to be called at the very beginning of the program
+ * (before first allocation in cudamatrix), or not at all (default to CPU).
 *
 */
-void CuDevice::SelectGpuId(int32 gpu_id) {
+void CuDevice::SelectGpuId(std::string use_gpu) {
+  // Possible modes  
+  if (use_gpu != "yes" && use_gpu != "no" && use_gpu != "optional") {
+    KALDI_ERR << "Please choose : --use-gpu=yes|no|optional, passed '" << use_gpu << "'";
+  }
+ 
  // Make sure this function is not called twice!
-  if(Enabled()) {
+  if (Enabled()) {
    KALDI_ERR << "There is already an active GPU " << active_gpu_id_ 
              << ", cannot change it on the fly!";
  }
  // Allow the GPU to stay disabled
-  if(!Enabled() && gpu_id == -1) { 
-    KALDI_LOG << "Selected device: " << gpu_id 
-              << ", we don't even try to get a GPU. We run on CPU.";
-    active_gpu_id_ = -1;
+  if(!Enabled() && use_gpu == "no") { 
+    KALDI_LOG << "Manually selected to compute on CPU.";
    return;
  }
+
  // Check that we have a gpu available
  int32 n_gpu = 0;
  cudaGetDeviceCount(&n_gpu);
-  if(n_gpu == 0 && gpu_id == -2) {
-    // If we do automatic selection and no GPU is found, we run on a CPU
-    KALDI_WARN << "CUDA will NOT be used!!! No CUDA capable GPU detected...";
-    active_gpu_id_ = -2;
-    return;
-  }
-  // In other cases it is an error, no GPU is an error
  if(n_gpu == 0) {
-    KALDI_ERR << "No CUDA capable GPU detected, while explicitly asked for gpu-id '"
-              << gpu_id << "'.";
-  }
-
-
-  //Now we know that there is a GPU in the system, 
-  //and we don't want to have it disabled. 
-  //
-  //For the GPU selection there are 3 possibilities, 
-  //with priorities according to the order:
-  //
-  //1.) We have compute exclusive mode on (GPU is selected by OS)
-  //2.) User did not specify the GPU-id (default value -2), 
-  //    we will do automatic selection.
-  //3.) User specified the GPU to run on, so we select it.
-  if(IsComputeExclusive()) {
-    //we have the GPU context now...
-    ;
-  } else if(gpu_id == -2) {
-    SelectGpuIdAuto();
-  } else {
-    //try to select the desired GPU
-    int32 ret = cudaSetDevice(gpu_id);
-    //handle the possible errors (no recovery!!!)
-    switch(ret) {
-      case cudaSuccess : {
-        //create the GPU context
-        cudaError_t e;
-        e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
-        if(e != cudaSuccess) {
-          KALDI_ERR << "Failed to create CUDA context on a GPU.";
-        }
-        //this was okay, so we are done!
-        KALDI_LOG << "Selected device: " << gpu_id << " (manually)";
-        break;
-      }
-      case cudaErrorInvalidDevice : { 
-        int32 n_gpu = 0;
-        cudaGetDeviceCount(&n_gpu);
-        KALDI_ERR << "cudaSetDevice(" << gpu_id << "):"
-                  << " '" << gpu_id << "' is not a VALID CUDA device! "
-                  << " (system has " << n_gpu << " GPUs,"
-                  << " valid IDs 0.." << n_gpu-1 << ")";
-        break;
-      }
-      default :
-        KALDI_ERR << "cudaSetDevice(" << gpu_id << "): "
-                  << "returned " << ret << ", " 
-                  << cudaGetErrorString((cudaError_t)ret);
+    if (use_gpu == "yes") {
+      KALDI_ERR << "No CUDA GPU detected!";
+    }
+    if (use_gpu == "optional") {
+      KALDI_WARN << "Running on CPU!!! No CUDA GPU detected...";
+      return;
    }
  }

-
-  // Now the we should have active GPU, 
-  // so we can query its name and memory stats
-  // and notify user which GPU is finally used.
  //
+  // Create a CUDA context : in case of compute-exclusive mode OS selects gpu_id,
+  // or default gpu_id=0. In the case with no free GPUs a context cannot be created
+  // (compute-exclusive mode).
+  //
+  cudaError_t e;
+  e = cudaThreadSynchronize(); //<< CUDA context gets created here.
+  if (e != cudaSuccess) {
+    // So far we don't have a context; sleep a bit and retry.
+    int32 sec_sleep = 2;
+    KALDI_WARN << "Will try again to get a GPU after " << sec_sleep 
+               << " seconds.";
+    sleep(sec_sleep);
+    //
+    e = cudaThreadSynchronize(); //<< 2nd trial to get CUDA context.
+    if (e != cudaSuccess) {
+      if (use_gpu == "yes") {
+        KALDI_ERR << "Failed to create CUDA context, no more unused GPUs?";
+      }
+      if (use_gpu == "optional") {
+        KALDI_WARN << "Running on CPU!!! No more unused CUDA GPUs?";
+        return;
+      }
+    }
+  }
+
+  // Re-assure we have the context
+  KALDI_ASSERT(cudaSuccess == cudaThreadSynchronize());
+
+  // Check if the machine use compute exclusive mode 
+  if (IsComputeExclusive()) {
+    FinalizeActiveGpu();
+    return;
+  } else {
+    // Or suggest to use compute exclusive mode
+    if(n_gpu > 1) { 
+      KALDI_WARN << "Hint: It is practical to set the GPUs into ``compute exclusive mode''."
+                 << " Selection of free GPUs would be done by OS automatically.";
+    }
+    // And select the GPU according to proportion of free memory
+    if(SelectGpuIdAuto()) {
+      FinalizeActiveGpu();
+      return;
+    } else { 
+      // Could not get GPU, after previously having the CUDA context?
+      // Strange but not impossible...
+      if (use_gpu == "yes") {
+        KALDI_ERR << "Error acquiring GPU.";
+      }
+      if (use_gpu == "optional") {
+        KALDI_WARN << "Running on CPU!!! Error acquiring GPU.";
+        return;
+      }
+    }
+  }
+}
+
+
+void CuDevice::FinalizeActiveGpu() {
+  // The device at this point should have active GPU, so we can query its name
+  // and memory stats and notify user which GPU is finally used.
+
  // Get the device-id of active device:
  {
    int32 act_gpu_id;
@ -164,44 +163,38 @@ void CuDevice::SelectGpuId(int32 gpu_id) {
    // Remember the id of active GPU 
    active_gpu_id_ = act_gpu_id; //CuDevice::Enabled() is true from now on
    // Initialize the CUBLAS
-    cuSafeCall(cublasInit());
+    CU_SAFE_CALL(cublasInit());

    // Notify user which GPU is finally used
    char name[128];
    DeviceGetName(name,128,act_gpu_id);
-    KALDI_LOG << "The active GPU is [" << act_gpu_id << "]: "
-              << name << "\t" << GetFreeMemory(NULL, NULL);
-  }

+    CU_SAFE_CALL(cudaGetDeviceProperties(&properties_, act_gpu_id));
+    
+    KALDI_LOG << "The active GPU is [" << act_gpu_id << "]: " << name << "\t"
+              << GetFreeMemory(&free_memory_at_startup_, NULL) << " version "
+              << properties_.major << "." << properties_.minor;
+
+    if (verbose_) PrintMemoryUsage();
+  }
  return;
 }


+// Returns true when no GPU is enabled, or when the version recorded in
+// properties_ (major.minor) is at least 1.3; double precision is supported
+// from version 1.3.
+bool CuDevice::DoublePrecisionSupported() {
+  if (Enabled()) {
+    if (properties_.major != 1)
+      return properties_.major > 1;
+    return properties_.minor >= 3;
+  }
+  return true;
+}
+

 bool CuDevice::IsComputeExclusive() {
-  // check that we have a gpu
-  int32 n_gpu = 0;
-  cudaGetDeviceCount(&n_gpu);
-  if(n_gpu == 0) {
-    KALDI_LOG << "No CUDA devices found";
-    return false;
-  }
-  
-  // Create a GPU context
-  // This will be kept if we detect compute exclusive mode
-  // or released in the other case.
-  //
-  // It does not harm if the function gets called twice,
-  // and the context is already created.
-  cudaError_t e;
-  e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
-  if(e != cudaSuccess) {
-    KALDI_ERR << "Failed to create CUDA context on a GPU. No more unused GPUs in compute exclusive mode?";
-  }
-  
+  // Assume that a CUDA context has already been created.
+  KALDI_ASSERT(cudaSuccess == cudaThreadSynchronize());
+
  // get the device-id and its device-properties
  int32 gpu_id = -1;
-  e = cudaGetDevice(&gpu_id);
+  cudaError_t e = cudaGetDevice(&gpu_id);
  if(e != cudaSuccess) {
    KALDI_ERR << "Failed to get current device";
  }
@ -216,12 +209,12 @@ bool CuDevice::IsComputeExclusive() {
      KALDI_LOG << "CUDA setup operating under Compute Exclusive Mode.";
      return true;
      break;
-    #if (CUDA_VERSION >= 4000)
+#if (CUDA_VERSION >= 4000)
    case cudaComputeModeExclusiveProcess :
      KALDI_LOG << "CUDA setup operating under Compute Exclusive Process Mode.";
      return true;
      break;
-    #endif
+#endif
    default :
      // The computation mode is not compute-exclusive,
      // in this case we release the GPU context...
@ -234,21 +227,20 @@ bool CuDevice::IsComputeExclusive() {
 }


-
-void CuDevice::SelectGpuIdAuto() {
-  // check that we have at least one gpu
+bool CuDevice::SelectGpuIdAuto() {
+  // Check that we have at least one gpu
  int32 n_gpu = 0;
  cudaGetDeviceCount(&n_gpu);
  if(n_gpu == 0) {
-    KALDI_ERR << "No CUDA devices found";
-    return;
+    KALDI_WARN << "No CUDA devices found";
+    return false;
  }
-
+  
  // The GPU is selected according to maximal free memory ratio
  std::vector<float> free_mem_ratio(n_gpu+1, 0.0);
-  //get ratios of memory use, if possible
+  // Get ratios of memory use, if possible
  KALDI_LOG << "Selecting from " << n_gpu << " GPUs";
-  for(int32 n=0; n<n_gpu; n++) {
+  for(int32 n = 0; n < n_gpu; n++) {
    int32 ret = cudaSetDevice(n);
    switch(ret) {
      case cudaSuccess : {
@ -292,23 +284,22 @@ void CuDevice::SelectGpuIdAuto() {
    if(free_mem_ratio[n] > free_mem_ratio[max_id]) max_id=n;
  }
  //the free_mem_ratio should be bigger than zero
-  if(!free_mem_ratio[max_id] > 0.0) {
-    KALDI_ERR << "No device could be selected (this should never happen)";
-  }
+  KALDI_ASSERT(free_mem_ratio[max_id] > 0.0);

  //finally select the GPU
  KALDI_LOG << "Selected device: " << max_id << " (automatically)";
-  cuSafeCall(cudaSetDevice(max_id));
+  CU_SAFE_CALL(cudaSetDevice(max_id));
  //create the context
  cudaError_t e;
  e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
  if(e != cudaSuccess) {
-    KALDI_ERR << "Failed to create CUDA context on a GPU.";
+    KALDI_WARN << "Failed to create CUDA context on a GPU.";
+    return false;
  }
+  return true;
 }


-
 void CuDevice::AccuProfile(const std::string &key, double time) { 
  if (profile_map_.find(key) == profile_map_.end()) {
    profile_map_[key] = 0.0;
@ -316,23 +307,35 @@ void CuDevice::AccuProfile(const std::string &key, double time) {
  profile_map_[key] += time;
 }

-
+// Logs the difference between free_memory_at_startup_ and the amount of free
+// memory that GetFreeMemory() reports now.  Does nothing unless Enabled().
+void CuDevice::PrintMemoryUsage() const {
+  if (!Enabled())
+    return;
+  int64 free_now;
+  GetFreeMemory(&free_now, NULL);
+  KALDI_LOG << "Memory used: "
+            << (free_memory_at_startup_ - free_now)
+            << " bytes.";
+}

 void CuDevice::PrintProfile() {
  if (verbose_ && Enabled()) { 
    std::ostringstream os;
    os << "-----\n[cudevice profile]\n";
    std::map<std::string, double>::iterator it;
-    for(it = profile_map_.begin(); it != profile_map_.end(); ++it) {
-      os << it->first << "\t" << it->second << "s\n";
-    }
+    std::vector<std::pair<double, std::string> > pairs;
+    for(it = profile_map_.begin(); it != profile_map_.end(); ++it)
+      pairs.push_back(std::make_pair(it->second, it->first));
+    std::sort(pairs.begin(), pairs.end());
+    size_t max_print = 15, start_pos = (pairs.size() <= max_print ?
+                                        0 : pairs.size() - max_print);
+    for (size_t i = start_pos; i < pairs.size(); i++) 
+      os << pairs[i].second << "\t" << pairs[i].first << "s\n";
    os << "-----";
    KALDI_LOG << os.str();
+    PrintMemoryUsage();
  }
 }


-std::string CuDevice::GetFreeMemory(int64* free, int64* total) {
+std::string CuDevice::GetFreeMemory(int64* free, int64* total) const {
 // WARNING! the CUDA API is inconsistent accross versions!
 #if (CUDA_VERSION >= 3020)
  //define the function signature type
@ -406,14 +409,354 @@ void CuDevice::DeviceGetName(char* name, int32 len, int32 dev) {
 }


-////////////////////////////////////////////////
-// The instance of the static singleton 
-//
-CuDevice CuDevice::msDevice;
-//
-////////////////////////////////////////////////
+// Configuration values consumed by CuAllocator.
+struct CuAllocatorOptions {
+  // How many times a block of a particular size is freed for real before the
+  // allocator starts to cache freed blocks of that size.
+  int32 count;
+  // Initial value (and reset value) of the allocator's cleanup countdown,
+  // expressed in bytes.
+  int32 cleanup_interval_bytes;
+
+  CuAllocatorOptions()
+      : count(1),
+        cleanup_interval_bytes(1000000) { }
+};


+/// We define class CuAllocator inside the .cc file, because we don't want to
+/// expose it in the header.  Its purpose is to hang on to memory that we have
+/// freed, so that we don't waste time in cudaMalloc and cudaMallocPitch().
+/// For some reason, they are sometimes very slow.
+///
+/// Bookkeeping overview: every distinct (row_bytes, num_rows) request size has
+/// one MemInfoForSize record, reachable through size_to_list_; every block
+/// currently handed out to the user is listed in addr_to_list_, which maps its
+/// address back to the record it was allocated under.
+class CuAllocator {
+ public:
+  // Stores the device pointer and a copy of the options, and starts the
+  // cleanup countdown at opts.cleanup_interval_bytes.
+  CuAllocator(const CuAllocatorOptions &opts, CuDevice *device):
+      device_(device), opts_(opts),
+      cleanup_countdown_bytes_(opts.cleanup_interval_bytes) { }
+  
+  // Linear allocation of 'size' bytes (replacement for cudaMalloc).
+  inline void *Malloc(size_t size);
+  
+  // Pitched 2-d allocation (replacement for cudaMallocPitch); the pitch is
+  // written to *pitch.
+  inline void *MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch);
+  
+  // Gives back a pointer previously returned by Malloc or MallocPitch; the
+  // block is either cached for re-use or really freed.
+  inline void Free(void *ptr);
+
+  ~CuAllocator();
+ private:
+  // Common implementation behind Malloc (row_bytes == 0, pitch == NULL) and
+  // MallocPitch.
+  inline void *MallocInternal(size_t row_bytes, size_t num_rows, size_t *pitch);
+  
+  // struct MemInfoForSize stores information associated with a particular size
+  // of allocated memory.  The row_bytes and num_rows refer to the arguments of
+  // a cudaMallocPitch call; for regular, non-pitch allocations with cudaMalloc,
+  // we make "row_bytes" zero and the size in bytes is "num_rows"... there is a
+  // reason why we do it this way round (make num_rows contain the size in
+  // bytes); it relates to the ordering of the map, and the behavior when
+  // we didn't find the exact size and want to find larger match.
+
+  
+  struct MemInfoForSize {
+    size_t row_bytes; // or zero, if a regular CudaMalloc, not
+                      // CudaMallocPitch.
+    size_t num_rows; // number of rows; or the size in bytes, if it's a
+                     // regular CudaMalloc call, not CudaMallocPitch.
+    size_t pitch; // If CudaMallocPitch, the pitch returned by CudaMallocPitch;
+                  // this code assumes (and checks) that it's a deterministic
+                  // function of row_bytes and num_rows.
+    size_t countdown; // number of further frees of this size that will really
+                      // call cudaFree; once it reaches zero, freed blocks are
+                      // cached instead (see Free()).
+    size_t currently_used; // number that are "in the wild".. kept for
+                           // diagnostics and error detection.
+    std::vector<void*> freed; // freed and cached...
+      
+    // 'count' is the initial value of the countdown (opts_.count at the
+    // call site in FindMemInfo).
+    MemInfoForSize(size_t row_bytes,
+                   size_t num_rows,
+                   int32 count):
+        row_bytes(row_bytes),
+        num_rows(num_rows),
+        pitch(0),
+        countdown(count),
+        currently_used(0) { }
+  };
+
+
+  // FindMemInfo returns the MemInfoForSize object for this (row_bytes,
+  // num_rows) combination if it exists; otherwise...
+  // if there is a MemInfoForSize object with the same row_bytes and larger (but
+  // not more than twice larger) num_rows that has freed memory waiting, it
+  // returns that; otherwise, it returns a new MemInfoForSize object for the
+  // requested size).
+  // Note that only the immediately next-larger entry in the map is considered
+  // as a substitute; entries beyond it are not searched.
+  
+  inline MemInfoForSize *FindMemInfo(size_t row_bytes,
+                                     size_t num_rows) {
+    // size_to_list_ is indexed directly by row_bytes, so grow it on demand.
+    if (row_bytes >= size_to_list_.size())
+      size_to_list_.resize(row_bytes + 1, NULL);
+    
+    // note: we set row_bytes to 0 for regular, linear allocation.
+    KALDI_ASSERT(num_rows != 0);
+
+    // The per-row_bytes map is created lazily, the first time it is needed.
+    if (size_to_list_[row_bytes] == NULL)
+      size_to_list_[row_bytes] = new std::map<size_t, MemInfoForSize*>;
+
+
+    std::map<size_t, MemInfoForSize*> &size_to_list = *(size_to_list_[row_bytes]);
+
+    typedef std::map<size_t, MemInfoForSize* >::iterator IterType;
+
+    // get an iterator to the requested object or the next-larger one.
+    // Here, upper_bound(num_rows - 1) returns an object strictly greater
+    // than num_rows - 1, which could be num_rows itself.  We need to
+    // treat num_rows == 0 as a special case because of size_t being
+    // unsigned.  (The KALDI_ASSERT above rejects num_rows == 0 whenever
+    // assertions are active, so that special case is only a safeguard.)
+    IterType iter = (num_rows == 0 ? size_to_list.begin() :
+                     size_to_list.upper_bound(num_rows - 1));
+    
+    if (iter != size_to_list.end() && iter->first == num_rows) {
+      // Found a MemInfoForSize object
+      // with the requested size -> return it.
+      KALDI_ASSERT(iter->second->row_bytes == row_bytes &&
+                   iter->second->num_rows == num_rows);
+      return iter->second;
+    } else if (iter != size_to_list.end() &&
+               iter->second->num_rows <= 2 * num_rows &&
+               !iter->second->freed.empty()) {
+      // Return the non-matching one with freed memory, which is larger than
+      // this one but not more than twice larger.
+      KALDI_ASSERT(iter->second->row_bytes == row_bytes &&
+                   iter->second->num_rows > num_rows); // confirm expectations.
+      return iter->second;
+    } else {
+      // There was no such object, and the next-larger object either did not
+      // exist, had more than twice the num-rows requested, or had no free
+      // memory -> create an object with the requested size.
+      return (size_to_list[num_rows] =  new MemInfoForSize(row_bytes, num_rows,
+                                                           opts_.count));
+    }
+  }
+                 
+  // Decrements the cleanup countdown by num_bytes, and calls Cleanup() when
+  // the countdown would be used up.
+  void PossiblyCleanup(size_t num_bytes);
+
+  // A periodic housekeeping task..
+  void Cleanup();
+
+  // Frees all memory in the "freed" vectors; memory that the
+  // user freed but we held on to.  If destroy == true, also
+  // clean up all memory held in the size_to_list_ object (i.e.
+  // allocated maps and MemInfoForSize objects).
+  void ReleaseAllCachedMemory(bool destroy = false);
+
+  CuDevice *device_; // device this is attached to...
+  CuAllocatorOptions opts_;
+
+
+  // Maps the address of each block currently handed out to the user to the
+  // MemInfoForSize record it was allocated under.
+  unordered_map<void*, MemInfoForSize*> addr_to_list_;
+
+  // size_to_list_ is indexed first by row_bytes (which is zero for linear
+  // mallocs) and then by num_rows (which for linear mallocs, is the actual size
+  // in bytes).
+  std::vector<std::map<size_t, MemInfoForSize*>* > size_to_list_;
+  
+  int32 cleanup_countdown_bytes_; // countdown in bytes, until the next time we check
+                                  // whether we should do cleanup
+};
+
+
+// Linear (non-pitched) allocation of 'size' bytes, which must be nonzero.
+// Forwards to MallocInternal() with row_bytes == 0 and no pitch output.
+void* CuAllocator::Malloc(size_t size) {
+  KALDI_ASSERT(size > 0);
+  void *ans = MallocInternal(0, size, NULL);
+  return ans;
+}
+
+// Pitched 2-d allocation of num_rows rows of row_bytes bytes each; the pitch
+// is written to *pitch.  All three arguments must be nonzero / non-NULL.
+// The parameter names now follow the declaration in the class and the
+// parameter order of MallocInternal(); previously the first two names were
+// swapped here, which only worked because they were forwarded positionally.
+void* CuAllocator::MallocPitch(size_t row_bytes, size_t num_rows,
+                               size_t *pitch) {
+  KALDI_ASSERT(num_rows > 0 && row_bytes > 0 && pitch != NULL);
+  return MallocInternal(row_bytes, num_rows, pitch);
+}
+
+// Shared implementation of Malloc() and MallocPitch().
+//  - Linear request: row_bytes == 0, num_rows is the size in bytes and
+//    pitch_out is NULL.
+//  - Pitched request: row_bytes and num_rows are the cudaMallocPitch()
+//    arguments and the pitch is written to *pitch_out.
+// If the MemInfoForSize record chosen by FindMemInfo() has a cached block, that
+// block is returned.  Otherwise memory is requested from CUDA; if that fails,
+// all cached memory is released and the request is retried once, and only a
+// failure of the retry is reported with KALDI_ERR.
+// The returned address is registered in addr_to_list_ so Free() can find it.
+void* CuAllocator::MallocInternal(size_t row_bytes,
+                                  size_t num_rows,
+                                  size_t *pitch_out) {
+  // we share the code for standard cudaMalloc and cudaMallocPitch
+  // because most of it is the same.  for cudaMalloc, we'll have
+  // row_bytes == 0, and num_rows is just the size to be allocated.
+  KALDI_ASSERT(num_rows != 0 && (row_bytes != 0) == (pitch_out != NULL));
+  
+  MemInfoForSize *info = FindMemInfo(row_bytes, num_rows);
+  if (!info->freed.empty()) { // We can satisfy the request with cached,
+                              // previously-allocated memory.
+    void *ans = info->freed.back();
+    info->freed.pop_back();
+    info->currently_used++;
+    addr_to_list_[ans] = info;
+    if (pitch_out) *pitch_out = info->pitch;
+    return ans;
+  } else {
+    PossiblyCleanup(row_bytes == 0 ? num_rows : row_bytes * num_rows);
+    void *ans;
+    if (row_bytes == 0) { // Simple malloc request, not "MallocPitch".
+      size_t size = num_rows;
+      int32 ret = cudaMalloc(&ans, size);
+      if (ret != 0) {
+        KALDI_WARN << "Allocation of memory block of " << size << " bytes "
+                   << "failed, releasing cached memory and retrying.";
+        ReleaseAllCachedMemory();
+        ret = cudaMalloc(&ans, size);
+        // The braces matter: without them the memory report and the fatal
+        // error below would run even when the retry succeeded.
+        if (ret != 0) {
+          KALDI_WARN << "Allocation failed for the second time.    Printing "
+                     << "device memory usage and exiting";
+          device_->PrintMemoryUsage();
+          KALDI_ERR << "Memory allocation failure";
+        }
+      }
+    } else {
+      size_t pitch;
+      int32 ret = cudaMallocPitch(&ans, &pitch, row_bytes, num_rows);
+      if (ret != 0) { // allocation failed...
+        KALDI_WARN << "Allocation of " << num_rows << " rows, each of size "
+                   << row_bytes << " bytes failed,  releasing cached "
+                   << "memory and retrying.";
+        ReleaseAllCachedMemory();
+        ret = cudaMallocPitch(&ans, &pitch, row_bytes, num_rows);
+        if (ret != 0) {
+          KALDI_WARN << "Allocation failed for the second time.    Printing "
+                     << "device memory usage and exiting";
+          device_->PrintMemoryUsage();
+          KALDI_ERR << "Memory allocation failure";
+        }
+      }
+      KALDI_ASSERT(pitch > 0);
+      if (info->pitch == 0) { // First allocation; have not set info->pitch yet.
+        info->pitch = pitch;
+      } else if (pitch != info->pitch) {
+        KALDI_ERR << "Pitch differs between multiple calls with the same "
+                  << "parameters: " << pitch << " vs. " << info->pitch;
+      }
+      *pitch_out = info->pitch;
+    }
+    addr_to_list_[ans] = info;
+    info->currently_used++;
+    return ans;
+  }
+}
+
+// Takes back a block that was handed out by Malloc()/MallocPitch().  The
+// address must be present in addr_to_list_, otherwise an error is reported.
+// While the size record's countdown is nonzero the block is really released
+// with cudaFree() and the countdown decremented; once the countdown has
+// reached zero, the block is kept in the record's 'freed' list for re-use.
+void CuAllocator::Free(void *addr) {
+  typedef unordered_map<void*, MemInfoForSize*>::iterator AddrIter;
+  AddrIter pos = addr_to_list_.find(addr);
+  if (pos == addr_to_list_.end()) {
+    KALDI_ERR << "Attempt to free address " << addr << " that was not allocated "
+              << "by CuDevice::Malloc() (or was previously freed);";
+  }
+  MemInfoForSize *size_info = pos->second;
+  addr_to_list_.erase(pos);  // The block is no longer "in the wild".
+  size_info->currently_used -= 1;
+  if (size_info->countdown != 0) {
+    // Not caching this size yet: release the block for real.  cudaFree() is
+    // used even for memory obtained with cudaMallocPitch().
+    size_info->countdown -= 1;
+    CU_SAFE_CALL(cudaFree(addr));
+    return;
+  }
+  // Enough blocks of this size have been really freed; cache from now on.
+  size_info->freed.push_back(addr);
+}
+
+// With destroy == false: calls cudaFree() on every cached block and empties
+// the 'freed' lists, keeping the bookkeeping records.
+// With destroy == true (used by the destructor): does NOT call cudaFree(), but
+// deletes every MemInfoForSize record and every per-row_bytes map, optionally
+// logging what was still cached.
+void CuAllocator::ReleaseAllCachedMemory(bool destroy) {
+  KALDI_VLOG(2) << "Releasing all cached memory.";
+  typedef std::map<size_t, MemInfoForSize*> SizeMap;
+  for (size_t idx = 0; idx < size_to_list_.size(); idx++) {
+    SizeMap *size_map = size_to_list_[idx];
+    if (size_map == NULL)
+      continue;
+    for (SizeMap::iterator it = size_map->begin();
+         it != size_map->end(); ++it) {
+      MemInfoForSize *info = it->second;
+      if (!destroy) {
+        // We only do this freeing part when we're *not* called from the
+        // destructor (destroy = false).  This leads to a crash when called
+        // from the destructor, with cudaFree returning "unload of CUDA runtime
+        // failed".  Presumably this has to do with the destruction order of
+        // C++, which we can't really control.
+        while (!info->freed.empty()) {
+          CU_SAFE_CALL(cudaFree(info->freed.back()));
+          info->freed.pop_back();
+        }
+        continue;
+      }
+      if (!info->freed.empty()) {
+        // When called from the destructor at program end, if verbose level is
+        // high, say the sizes we had.
+        if (info->row_bytes == 0) {
+          KALDI_VLOG(3) << "Releasing " << info->freed.size() << " blocks of "
+                        << info->num_rows << " bytes.";
+        } else {
+          KALDI_VLOG(3) << "Releasing " << info->freed.size()
+                        << " 2-d blocks of " << info->num_rows << " rows of "
+                        << info->row_bytes << " bytes each.";
+        }
+      }
+      delete info;
+    }
+    if (destroy) {
+      delete size_map;
+      size_to_list_[idx] = NULL;
+    }
+  }
+}
+
+// Housekeeping hook that PossiblyCleanup() calls when its byte countdown runs
+// out.  The body is intentionally empty at present (see the TODO below).
+void CuAllocator::Cleanup() {
+  // TODO: implement this or remove it (and also PossiblyCleanup).
+  // Actually we may never implement this, as just calling
+  // ReleaseAllCachedMemory whenever an allocation fails is probably
+  // sufficient.
+}
+
+// Accounts for num_bytes of newly requested memory.  If the remaining
+// countdown is larger than num_bytes it is simply reduced; otherwise Cleanup()
+// is called and the countdown restarts from opts_.cleanup_interval_bytes.
+void CuAllocator::PossiblyCleanup(size_t num_bytes) {
+  const size_t bytes_left = static_cast<size_t>(cleanup_countdown_bytes_);
+  if (num_bytes < bytes_left) {
+    cleanup_countdown_bytes_ -= static_cast<int32>(num_bytes);
+    return;
+  }
+  Cleanup();
+  cleanup_countdown_bytes_ = opts_.cleanup_interval_bytes;
+}
+
+// Warns about every allocation size that still has blocks handed out to the
+// user (i.e. still present in addr_to_list_), then destroys the allocator's
+// bookkeeping structures via ReleaseAllCachedMemory(true).
+CuAllocator::~CuAllocator() {
+  // Check that nothing was allocated by the user and not freed.
+  // Several leaked addresses can share one size record, so collect the
+  // distinct records first and warn once per record.
+  std::set<MemInfoForSize*> unfreed_set;
+  typedef unordered_map<void*, MemInfoForSize *>::iterator IterType;
+  for (IterType iter = addr_to_list_.begin(); iter != addr_to_list_.end();
+       ++iter)
+    unfreed_set.insert(iter->second);
+  for (std::set<MemInfoForSize*>::iterator iter = unfreed_set.begin();
+       iter != unfreed_set.end(); ++iter) {
+    MemInfoForSize *info = *iter;
+    KALDI_ASSERT(info->currently_used > 0); // Or should not be in this set
+                                            // (code error or memory corruption)
+    // Linear allocations are recorded with row_bytes == 0 and their size in
+    // bytes in num_rows (num_rows itself is never zero), so row_bytes is the
+    // field that distinguishes the two kinds of allocation.
+    if (info->row_bytes == 0) {
+      KALDI_WARN << info->currently_used << " memory chunks of size "
+                 << info->num_rows << " were allocated and not freed.";
+    } else {
+      KALDI_WARN << info->currently_used << " memory chunks of size "
+                 << info->row_bytes << " per row, and " << info->num_rows
+                 << " rows, were allocated and not freed.";
+    }
+  }
+  
+  bool destroy = true;
+  ReleaseAllCachedMemory(destroy);
+}
+
+// Returns 'ptr' to the caching allocator owned by this device.
+void CuDevice::Free(void *ptr) {
+  allocator_->Free(ptr);
+}
+
+// Pitched allocation through the caching allocator; the arguments are passed
+// on unchanged and the pitch is written to *pitch.
+void* CuDevice::MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) {
+  void *ans = allocator_->MallocPitch(row_bytes, num_rows, pitch);
+  return ans;
+}
+
+// Linear allocation of 'size' bytes through the caching allocator.
+void* CuDevice::Malloc(size_t size) {
+  void *ans = allocator_->Malloc(size);
+  return ans;
+}
+
+// Starts with active_gpu_id_ == -1 and verbose_ == true, and creates the
+// caching allocator (with default options) attached to this device.
+CuDevice::CuDevice()
+    : active_gpu_id_(-1),
+      verbose_(true),
+      allocator_(new CuAllocator(CuAllocatorOptions(), this)) { }
+
+
+// Destroys the caching allocator, then shuts CUBLAS down if a GPU was in use.
+CuDevice::~CuDevice() {
+  delete allocator_;  // deleting a NULL pointer is a no-op.
+  if (Enabled()) {
+    CU_SAFE_CALL(cublasShutdown());
+  }
+}
+  
+// The instance of the static singleton 
+CuDevice CuDevice::global_device_;
+

 }

--- a/src/cudamatrix/cu-device.h
+++ b/src/cudamatrix/cu-device.h
@ -22,75 +22,105 @@
 #ifndef KALDI_CUDAMATRIX_CU_DEVICE_H_
 #define KALDI_CUDAMATRIX_CU_DEVICE_H_

-#if HAVE_CUDA==1
+#if HAVE_CUDA == 1

 #include <map>
 #include <string>
 #include <iostream>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+

 namespace kaldi {

+class CuAllocator; // Forward declaration.
+
 /**
 * Singleton object which represents CUDA device
 * responsible for CUBLAS initilalisation, collects profiling info
 */
 class CuDevice {
- // Singleton interface...
- private:
-  CuDevice();
-  CuDevice(CuDevice&);
-  CuDevice &operator=(CuDevice&);
-
+ // Singleton object (there should only be one instantiated per program)
 public:
  ~CuDevice();
-  static CuDevice& Instantiate() { 
-    return msDevice; 
-  }
+  static inline CuDevice& Instantiate() { return global_device_; }

- private:
-  static CuDevice msDevice;
+  // We provide functions Malloc, MallocPitch and Free which replace cudaMalloc,
+  // cudaMallocPitch and cudaFree.  Their function is to cache the results of
+  // previous allocations to avoid the very large overhead that CUDA's
+  // allocation seems to give for some setups.
+  void* Malloc(size_t size);
+  
+  void* MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch);
+  
+  void Free(void *ptr);
+  
+  /// Select a GPU for computation, the 'use_gpu' modes are:
+  ///  "yes"      -- Select GPU automatically and die if this fails.
+  ///  "optional" -- Do as above, but if it fails, back off to CPU. 
+  ///  "no"       -- Run on CPU. 
+  ///  (more comments in cu-device.cc)
+  void SelectGpuId(std::string use_gpu);

-
- /**********************************/
- // Instance interface
- public:
- 
-  /// Check if the CUDA device is selected for use
-  bool Enabled() { 
+  /// Check if the CUDA GPU is selected for use
+  bool Enabled() const {
    return (active_gpu_id_ > -1); 
  }

-  /// Manually select GPU by id (more comments in cu-device.cc)
-  void SelectGpuId(int32 gpu_id);
  /// Get the active GPU id
  int32 ActiveGpuId() {
    return active_gpu_id_;
  }

-  void Verbose(bool verbose) { 
-    verbose_ = verbose; 
-  }
+  /// Returns true if either we have no GPU, or we have a GPU
+  /// and it supports double precision.
+  bool DoublePrecisionSupported();
+
+  void SetVerbose(bool verbose) {  verbose_ = verbose; }

  /// Sum the IO time
  void AccuProfile(const std::string &key, double time);
  void PrintProfile(); 
+
+  void PrintMemoryUsage() const;
  
  void ResetProfile() { 
    profile_map_.clear(); 
  }
  
  /// Get the actual GPU memory use stats
-  std::string GetFreeMemory(int64* free = NULL, int64* total = NULL);
+  std::string GetFreeMemory(int64* free = NULL, int64* total = NULL) const;
  /// Get the name of the GPU
  void DeviceGetName(char* name, int32 len, int32 dev); 
  
 private:
-  /// Check if the GPU run in compute exclusive mode
-  bool IsComputeExclusive();
-  /// Automatically select GPU
-  void SelectGpuIdAuto();
+  CuDevice();
+  CuDevice(CuDevice&); // Disallow.
+  CuDevice &operator=(CuDevice&);  // Disallow.
+
+  static CuDevice global_device_;
+  
+  /// Check whether the GPU runs in compute exclusive mode.  Returns true if it
+  /// is running in compute exclusive mode and we have a GPU.  Returns false
+  /// otherwise.  (This declaration has no 'error' output argument, so errors
+  /// such as compute exclusive mode with no GPUs available are not reported
+  /// through a parameter here.)
+  bool IsComputeExclusive();
+
+  /// Automatically select GPU and get CUDA context.  Returns true on success.
+  bool SelectGpuIdAuto();
+
+  /// Try to get CUDA context on manually selected GPU.  Return true on success.
+  bool SelectGpuIdManual(int32 gpu_id);
+
+  void FinalizeActiveGpu();
+  
+  /// Should only be called if Enabled() == true. 
+  int32 MajorDeviceVersion();
+
+  /// Should only be called if Enabled() == true. 
+  int32 MinorDeviceVersion();

- private:
  std::map<std::string, double> profile_map_;
  
  /// active_gpu_id_ values:
@ -99,14 +129,20 @@ class CuDevice {
  /// -1 SelectGpuId was called, but the GPU was manually disabled
  /// 0..N Normal GPU IDs
  int32 active_gpu_id_; 
-  ///
+  
+  int64 free_memory_at_startup_;
+  
+  cudaDeviceProp properties_;

  bool verbose_;

+  CuAllocator *allocator_;
+  
 }; // class CuDevice


-}// namespace
+
+}  // namespace

 #endif // HAVE_CUDA

--- a/src/cudamatrix/cu-kernels-ansi.h
+++ b/src/cudamatrix/cu-kernels-ansi.h
@ -1,6 +1,10 @@
 // cudamatrix/cu-kernels-ansi.h

 // Copyright 2009-2012  Karel Vesely
+//                2013  Johns Hopkins University (author: Daniel Povey)
+//                2013  Hainan Xu	
+//                2013  Xiaohui Zhang
+//                2013	Johns Hopkins University (author: Guoguo Chen)

 // See ../../COPYING for clarification regarding multiple authors
 //
@ -25,8 +29,7 @@

 #include "cudamatrix/cu-matrixdim.h"

-#if HAVE_CUDA==1
-
+#if HAVE_CUDA == 1
 extern "C" {

 /*********************************************************
@ -43,13 +46,39 @@ void cudaI32_set_const(dim3 Gr, dim3 Bl, int32_cuda *mat, int32_cuda value, Matr
 /*
 * CuMatrix 
 */
+void cudaF_copy_upp_low(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA);
+void cudaF_copy_low_upp(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA);
+void cudaF_add_diag_vec_mat(dim3 Gr, dim3 Bl, float alpha, float *mat, MatrixDim mat_dim,
+                            const float *vec, const float *mat2, int mat2_row_stride,
+                            int mat2_col_stride, float beta);
+void cudaF_copy_from_tp_trans(int Gr, int Bl, float* A, const float* B, MatrixDim dmat);
+void cudaFD_copy_from_tp_trans(int Gr, int Bl, float* A, const double* B, MatrixDim dmat);
+void cudaF_copy_from_tp(int Gr, int Bl, float* A, const float* B, MatrixDim dmat);
+void cudaFD_copy_from_tp(int Gr, int Bl, float* A, const double* B, MatrixDim dmat);
+void cudaF_copy_col_from_vec(int Gr, int Bl, float* mat, const float* v, int col, MatrixDim d);
+void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
+void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d);
+void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);  
+void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim d);
+void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
+void cudaF_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
+void cudaF_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim d);
+void cudaF_set_diag(int Gr, int Bl, float* mat, float value, MatrixDim d);
+void cudaF_set_diag_packed(int Gr, int Bl, float* mat, float value, int dim);
+void cudaF_add_diag_packed(int Gr, int Bl, float* mat, float value, int dim);
 void cudaF_set_const(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d);
+void cudaF_set_zero_above_diag(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
 void cudaF_add(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d);
+void cudaF_add_vec2(dim3 Gr, dim3 Bl, float* mat, const float* vec, const float alpha, int dim);
+void cudaF_scale_diag(int Gr, int Bl, float* mat, float value, int dim);
 void cudaF_scale(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d);
 void cudaF_apply_log(dim3 Gr, dim3 Bl, float *mat, MatrixDim d);
-void cudaF_mul_elements(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim d);
+void cudaF_mul_elements(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim dst_d, int src_stride);
+void cudaF_max(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim dst_d, int src_stride);
 void cudaF_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d);
 void cudaF_mul_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d);
+void cudaF_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size);
+void cudaF_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2,  MatrixDim d, int src_stride, int group_size, float power);
 void cudaF_div_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *vec_div, MatrixDim d);
 void cudaF_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *A, float beta, float *dst, MatrixDim d);
 void cudaF_add_vec_to_cols(dim3 Gr, dim3 Bl, float alpha, const float *col, float beta, float *dst, MatrixDim d);
@ -58,29 +87,82 @@ void cudaF_add_vec_to_rows(dim3 Gr, dim3 Bl, float alpha, const float *row, floa
 /*
 * CuVector
 */
+void cudaF_replace_value(int Gr, int Bl, float *v, int dim, float orig, float changed);
+void cudaF_set_bias_params(int Gr, int Bl, float* v, const float* a, float param_1, float param_2, float param_3, int* flag, int dim);
+void cudaF_copy_from_vec_df(int Gr, int Bl, double* v_out, const float* v_in, int dim);
+void cudaF_copy_from_vec_fd(int Gr, int Bl, float* v_out, const float* v_in, int dim);
+void cudaF_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim);
+void cudaF_vec_soft_max(int Gr, int Bl, float* v, int dim);
+void cudaF_vec_min(const float* v, float* value, int dim);
+void cudaF_vec_max(const float* v, float* value, int dim);
+void cudaF_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value);
+void cudaF_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value);
+void cudaF_add_diag_mat_trans(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim);
+void cudaF_add_diag_mat(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim);
+void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, 
+                            int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, 
+                            int N_col_stride, int threads_per_element, float beta);  
+void cudaF_add_vec_vec(int Gr, int Bl, float alpha, float* v, const float* x, const float* y, float beta, int dim);
+void cudaF_copy_col_from_mat(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim);
+void cudaF_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const float* mat, MatrixDim dmat, int dim);
+void cudaF_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim);
+void cudaF_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc);
+void cudaF_pvec_sum(int Gr, int Bl, float* vec, float* pvec_sum, int dim, int size);
+void cudaF_vec_copy_diag_from_packed(int Gr, int Bl, float *dst, const float *src, int dim);
+void cudaF_vec_apply_floor(int Gr, int Bl, float* v, float floor_val, float* num, int dim);
+void cudaF_vec_apply_exp(int Gr, int Bl, float* v, int dim);
+void cudaF_vec_apply_log(int Gr, int Bl, float* v, float* flag, int dim);
+void cudaF_trace(int Gr, int Bl, float* mat, float* value, int dim);
 void cudaF_add_row_sum_mat(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d);
 void cudaF_add_col_sum_mat(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d);
 void cudaF_invert_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim d);
-
+// Note: B_trans is nonzero if B is transposed.
+void cudaF_add_mat_blockmat(dim3 Gr, dim3 Bl, float *data, MatrixDim d, const float *Adata,
+                            int A_num_rows, int A_num_cols, int A_row_stride, int A_col_stride,
+                            const CuBlockMatrixData *B_cu_data, int B_num_blocks,
+                            float alpha, float beta, int B_trans);
+void cudaF_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int num_blocks,
+                             const float *C_data, int C_num_cols, int C_row_stride, int C_col_stride,
+                             const float *D_data, int D_row_stride, int D_col_stride,
+                             float alpha, float beta);
 /*
 * cu::
 */
 void cudaF_softmax(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d);
+void cudaF_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d, int src_stride);
 void cudaF_softmax_part(dim3 Gr, dim3 Bl, const float *X, const int32_cuda *vec_ids, float* Y, MatrixDim d);
-void cudaF_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d);
-void cudaF_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d);
-void cudaF_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d);
+void cudaF_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride);
+void cudaF_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power);
+void cudaF_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride);
+void cudaF_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int src_stride);
+void cudaF_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride);
 void cudaF_diff_tanh(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d);

 void cudaF_regularize_l1(dim3 Gr, dim3 Bl, float *wei, float *grad, float l1, float lr, MatrixDim d);
 void cudaF_find_row_max_id(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d);
 void cudaF_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, float *mat_net_out, float *vec_log_post, MatrixDim d);
+void cudaF_copy_rows_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, const float *v_in);

 void cudaF_randomize(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in);
 void cudaF_splice(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in);
+void cudaF_one(int Gr, int Bl, float* x, int dim);
 void cudaF_copy(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in);
+void cudaF_copy_from_sp(int Gr, int Bl, const float* x, float* y, int d_in, MatrixDim d_out);
+void cudaF_take_lower(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in);
+void cudaF_take_upper(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in);
+void cudaF_take_mean(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in);
+void cudaF_comp_obj_deriv(dim3 Gr,dim3 Bl, MatrixElement<float>* x, int s, const float* z, MatrixDim d, float* z2, MatrixDim d2, float* t);
+void cudaF_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);  
+void cudaF_sy_add_tr2(dim3 Gr, dim3 Bl, float alpha, float beta, const float* T, MatrixDim tdim,
+                      float *S, MatrixDim sdim);
+void cudaF_sum_column_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim,
+                             const float *src_data, MatrixDim src_dim,
+                             const Int32Pair *indices);  
+void cudaF_matrix_lookup(dim3 Gr, dim3 Bl, const float *data, MatrixDim dim,
+                         const Int32Pair *indices, int indices_size,
+                         float *output);

-
+  
 /*********************************************************
 * double CUDA kernel calls
 */
@ -88,13 +170,39 @@ void cudaF_copy(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *co
 /*
 * CuMatrix 
 */
+void cudaD_copy_upp_low(dim3 Gr, dim3 Bl, double* A, MatrixDim dimB);
+void cudaD_copy_low_upp(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA);
+void cudaD_add_diag_vec_mat(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim,
+                            const double *vec, const double *mat2, int mat2_row_stride,
+                            int mat2_col_stride, double beta);  
+void cudaD_copy_from_tp_trans(int Gr, int Bl, double* A, const double* B, MatrixDim dmat);
+void cudaDF_copy_from_tp_trans(int Gr, int Bl, double* A, const float* B, MatrixDim dmat);
+void cudaD_copy_from_tp(int Gr, int Bl, double* A, const double* B, MatrixDim dmat);
+void cudaDF_copy_from_tp(int Gr, int Bl, double* A, const float* B, MatrixDim dmat);
+void cudaD_copy_col_from_vec(int Gr, int Bl, double* mat, const double* v, int col, MatrixDim d);
+void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
+void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d);
+void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);  
+void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim d);
+void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
+void cudaD_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
+void cudaD_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val, MatrixDim d);
+void cudaD_set_diag(int Gr, int Bl, double* mat, double value, MatrixDim d);
+void cudaD_set_diag_packed(int Gr, int Bl, double* mat, double value, int dim);
+void cudaD_add_diag_packed(int Gr, int Bl, double* mat, double value, int dim);
 void cudaD_set_const(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d);
+void cudaD_set_zero_above_diag(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
 void cudaD_add(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d);
+void cudaD_add_vec2(dim3 Gr, dim3 Bl, double *mat, const double *vec, const double alpha, int dim);
+void cudaD_scale_diag(int Gr, int Bl, double* mat, double value, int dim);
 void cudaD_scale(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d);
 void cudaD_apply_log(dim3 Gr, dim3 Bl, double *mat, MatrixDim d);
-void cudaD_mul_elements(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim d);
+void cudaD_mul_elements(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride);
+void cudaD_max(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride);
 void cudaD_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d);
 void cudaD_mul_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d);
+void cudaD_mul_rows_group_mat(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size);
+void cudaD_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2,  MatrixDim d, int src_stride, int group_size, double power);
 void cudaD_div_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *vec_div, MatrixDim d);
 void cudaD_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *A, double beta, double *dst, MatrixDim d);
 void cudaD_add_vec_to_cols(dim3 Gr, dim3 Bl, double alpha, const double *col, double beta, double *dst, MatrixDim d);
@ -103,31 +211,101 @@ void cudaD_add_vec_to_rows(dim3 Gr, dim3 Bl, double alpha, const double *row, do
 /*
 * CuVector
 */
+void cudaD_replace_value(int Gr, int Bl, double *v, int dim, double orig, double changed);
+void cudaD_set_bias_params(int Gr, int Bl, double* v, const double* a, double param_1, double param_2, double param_3, int* flag, int dim);
+void cudaD_copy_from_vec_df(int Gr, int Bl, double* v_out, const double* v_in, int dim);
+void cudaD_copy_from_vec_fd(int Gr, int Bl, float* v_out, const double* v_in, int dim);
+void cudaD_vec_mul_elements(int Gr, int Bl, double* v, const double* a, int dim);
+void cudaD_vec_soft_max(int Gr, int Bl, double* v, int dim);
+void cudaD_vec_min(const double* v, double* value, int dim);
+void cudaD_vec_max(const double* v, double* value, int dim);
+void cudaD_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value);
+void cudaD_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value);
+void cudaD_add_diag_mat_trans(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim);
+void cudaD_add_diag_mat(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim);
+void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, 
+                            int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, 
+                            int N_col_stride, int threads_per_element, double beta);  
+void cudaD_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, const double* y, double beta, int dim);
+void cudaD_copy_col_from_mat(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim);
+void cudaD_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim);
+void cudaD_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const double* mat, MatrixDim dmat, int dim);
+void cudaD_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc);
+void cudaD_pvec_sum(int Gr, int Bl, double* vec, double* pvec_sum, int dim, int size);
+void cudaD_vec_copy_diag_from_packed(int Gr, int Bl, double *dst, const double *src, int dim);
+void cudaD_vec_apply_floor(int Gr, int Bl, double* v, double floor_val, float* num, int dim);
+void cudaD_vec_apply_exp(int Gr, int Bl, double* v, int dim);
+void cudaD_vec_apply_log(int Gr, int Bl, double* v, double* flag, int dim);
+void cudaD_trace(int Gr, int Bl, double* mat, double* value, int dim);
 void cudaD_add_row_sum_mat(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d);
 void cudaD_add_col_sum_mat(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d);
 void cudaD_invert_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim d);
+// Note: B_trans is nonzero if B is transposed.
+void cudaD_add_mat_blockmat(dim3 Gr, dim3 Bl, double *data, MatrixDim d, const double *Adata,
+                            int A_num_rows, int A_num_cols, int A_row_stride, int A_col_stride,
+                            const CuBlockMatrixData *B_cu_data, int B_num_blocks,
+                            double alpha, double beta, int B_trans);
+void cudaD_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int num_blocks,
+                             const double *C_data, int C_num_cols, int C_row_stride, int C_col_stride,
+                             const double *D_data, int D_row_stride, int D_col_stride,
+                             double alpha, double beta);  
+

 /*
 * cu::
 */
 void cudaD_softmax(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d);
+void cudaD_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d, int src_stride);
 void cudaD_softmax_part(dim3 Gr, dim3 Bl, const double *X, const int32_cuda *vec_ids, double* Y, MatrixDim d);
-void cudaD_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d);
-void cudaD_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d);
-void cudaD_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d);
+void cudaD_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride);
+void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power);
+void cudaD_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride);
+void cudaD_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int src_stride);
+void cudaD_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride);
 void cudaD_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d);

 void cudaD_regularize_l1(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1, double lr, MatrixDim d);
 void cudaD_find_row_max_id(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d);
 void cudaD_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, double *mat_net_out, double *vec_log_post, MatrixDim d);
+void cudaD_copy_rows_from_vec(dim3 Gr, dim3 Bl, double *mat_out, MatrixDim d_out, const double *v_in);

 void cudaD_randomize(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in);
 void cudaD_splice(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in);
+void cudaD_one(int Gr, int Bl, double* x, int dim);
 void cudaD_copy(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in);
+void cudaD_copy_from_sp(int Gr, int Bl, const double* x, double* y, int d_in, MatrixDim d_out);
+void cudaD_take_lower(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in);
+void cudaD_take_upper(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in);
+void cudaD_take_mean(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in);


+// some mostly mixed-type kernels.
+void cuda_copy_from_mat_df(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in);
+void cuda_copy_from_mat_ff(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in);
+void cuda_copy_from_mat_fd(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in);
+void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in);
+void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in);
+void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in);
+void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in);
+void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in);
+
+void cudaD_comp_obj_deriv(dim3 Gr,dim3 Bl, MatrixElement<double>* x, int s, const double* z, MatrixDim d, double* z2, MatrixDim d2, double* t);
+
+void cudaD_transpose_matrix(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
+void cudaD_sy_add_tr2(dim3 Gr, dim3 Bl, double alpha, double beta, const double* T, MatrixDim tdim,
+                      double *S, MatrixDim sdim);
+void cudaD_sum_column_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim,
+                             const double *src_data, MatrixDim src_dim,
+                             const Int32Pair *indices);
+void cudaD_matrix_lookup(dim3 Gr, dim3 Bl, const double *data, MatrixDim dim,
+                         const Int32Pair *indices, int indices_size,
+                         double *output);
+  
+  
+  
 } // extern "C" 

 #endif // HAVE_CUDA

+
 #endif
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@ -1,6 +1,11 @@
 // cudamatrix/cu-kernels.h

 // Copyright 2009-2012  Karel Vesely
+//                2013  Ehsan Variani
+//                2014  Johns Hopkins University (author: Daniel Povey)
+//                2013  Hainan Xu
+//                2013  Xiaohui Zhang	
+//                2013  Johns Hopkins University (author: Guoguo Chen)

 // See ../../COPYING for clarification regarding multiple authors
 //
@ -22,7 +27,7 @@
 #ifndef KALDI_CUDAMATRIX_CU_KERNELS_H_
 #define KALDI_CUDAMATRIX_CU_KERNELS_H_

-#if HAVE_CUDA==1
+#if HAVE_CUDA == 1

 #include "base/kaldi-error.h"
 #include "cudamatrix/cu-kernels-ansi.h"
@ -34,147 +39,366 @@

 namespace kaldi {

-
-
-/*********************************************************
- * base templates
- */
-
-/*
- * CuMatrix
- */
-template<typename Real> inline void cuda_set_const(dim3 Gr, dim3 Bl, Real *mat, Real value, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_add(dim3 Gr, dim3 Bl, Real *mat, Real value, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_scale(dim3 Gr, dim3 Bl, Real *mat, Real value, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_apply_log(dim3 Gr, dim3 Bl, Real *mat, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_mul_elements(dim3 Gr, dim3 Bl, Real *mat, const Real *A, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, Real *mat, const Real *scale, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, Real *mat, const Real *scale, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_div_rows_vec(dim3 Gr, dim3 Bl, Real *mat, const Real *vec_div, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_add_mat(dim3 Gr, dim3 Bl, Real alpha, const Real *A, Real beta, Real *dst, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, Real alpha, const Real *col, Real beta, Real *dst, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, Real alpha, const Real *row, Real beta, Real *dst, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
- 
-/*
- * CuVector
- */
-template<typename Real> inline void cuda_add_row_sum_mat(dim3 Gr, dim3 Bl, const Real *mat, Real *vec_sum, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_add_col_sum_mat(dim3 Gr, dim3 Bl, const Real *mat, Real *vec_sum, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_invert_elements(dim3 Gr, dim3 Bl, Real *data, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-
-template<typename Real> inline void cuda_sigmoid(dim3 Gr, dim3 Bl, Real *y, const Real *x, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, Real *eout, const Real *e, const Real *y, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_tanh(dim3 Gr, dim3 Bl, Real *y, const Real *x, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, Real *eout, const Real *e, const Real *y, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_softmax(size_t Gr, size_t Bl, Real *y, const Real *x, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_softmax_part(dim3 Gr, dim3 Bl, const Real *X, const int32_cuda *vec_ids, Real* Y, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-
-template<typename Real> inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, Real *wei, Real *grad, Real l1, Real lr, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const Real *mat, Real *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, Real *mat_net_out, Real *vec_log_post, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
-
-template<typename Real> inline void cuda_randomize(dim3 Gr, dim3 Bl, Real *y, const Real *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_splice(dim3 Gr, dim3 Bl, Real *y, const Real *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { KALDI_ERR << __func__ << " Not implemented!"; }
-template<typename Real> inline void cuda_copy(dim3 Gr, dim3 Bl, Real *y, const Real *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { KALDI_ERR << __func__ << " Not implemented!"; }
-
-
-
-/*********************************************************
- * float specializations
- */
-
 /*
 * CuMatrix 
 */
-template<> inline void cuda_set_const<float>(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_set_const(Gr,Bl,mat,value,d); }
-template<> inline void cuda_add<float>(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_add(Gr,Bl,mat,value,d); }
-template<> inline void cuda_scale<float>(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_scale(Gr,Bl,mat,value,d); }
-template<> inline void cuda_apply_log<float>(dim3 Gr, dim3 Bl, float *mat, MatrixDim d) { cudaF_apply_log(Gr,Bl,mat,d); }
-template<> inline void cuda_mul_elements<float>(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim d) { cudaF_mul_elements(Gr,Bl,mat,A,d); }
-template<> inline void cuda_mul_cols_vec<float>(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_cols_vec(Gr,Bl,mat,scale,d); }
-template<> inline void cuda_mul_rows_vec<float>(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_rows_vec(Gr,Bl,mat,scale,d); }
-template<> inline void cuda_div_rows_vec<float>(dim3 Gr, dim3 Bl, float *mat, const float *vec_div, MatrixDim d) { cudaF_div_rows_vec(Gr,Bl,mat,vec_div,d); }
-template<> inline void cuda_add_mat<float>(dim3 Gr, dim3 Bl, float alpha, const float *A, float beta, float *dst, MatrixDim d) { cudaF_add_mat(Gr,Bl,alpha,A,beta,dst,d); }
-template<> inline void cuda_add_vec_to_cols<float>(dim3 Gr, dim3 Bl, float alpha, const float *col, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); }
-template<> inline void cuda_add_vec_to_rows<float>(dim3 Gr, dim3 Bl, float alpha, const float *row, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_rows(Gr,Bl,alpha,row,beta,dst,d); }
+
+// Single-precision overloads.  Each wrapper hands its arguments, unchanged and
+// in the same order, to the cudaF_* entry point of the same name (cudaFD_*
+// when the source pointer B is double).
+inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA) {
+  cudaF_copy_upp_low(Gr, Bl, A, dimA);
+}
+inline void cuda_copy_low_upp(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA) {
+  cudaF_copy_low_upp(Gr, Bl, A, dimA);
+}
+inline void cuda_add_diag_vec_mat(dim3 Gr, dim3 Bl, float alpha,
+                                  float* mat, MatrixDim mat_dim,
+                                  const float* vec, const float* mat2,
+                                  int mat2_row_stride, int mat2_col_stride,
+                                  float beta) {
+  cudaF_add_diag_vec_mat(Gr, Bl, alpha, mat, mat_dim, vec, mat2,
+                         mat2_row_stride, mat2_col_stride, beta);
+}
+inline void cuda_copy_from_tp_trans(int Gr, int Bl, float* A, const float* B,
+                                    MatrixDim dmat) {
+  cudaF_copy_from_tp_trans(Gr, Bl, A, B, dmat);
+}
+inline void cuda_copy_from_tp_trans(int Gr, int Bl, float* A, const double* B,
+                                    MatrixDim dmat) {
+  cudaFD_copy_from_tp_trans(Gr, Bl, A, B, dmat);
+}
+inline void cuda_copy_from_tp(int Gr, int Bl, float* A, const float* B,
+                              MatrixDim dmat) {
+  cudaF_copy_from_tp(Gr, Bl, A, B, dmat);
+}
+inline void cuda_copy_from_tp(int Gr, int Bl, float* A, const double* B,
+                              MatrixDim dmat) {
+  cudaFD_copy_from_tp(Gr, Bl, A, B, dmat);
+}
+
+// cuda_copy_from_mat: one overload per (mat_out, mat_in) element-type pair.
+// Each forwards to the cuda_copy_from_mat_XY entry point, where X is the
+// destination type and Y the source type (f = float, d = double).
+inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl,
+                               float* mat_out, const double* mat_in,
+                               MatrixDim d_out, MatrixDim d_in) {
+  cuda_copy_from_mat_fd(Gr, Bl, mat_out, mat_in, d_out, d_in);
+}
+inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl,
+                               float* mat_out, const float* mat_in,
+                               MatrixDim d_out, MatrixDim d_in) {
+  cuda_copy_from_mat_ff(Gr, Bl, mat_out, mat_in, d_out, d_in);
+}
+inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl,
+                               double* mat_out, const double* mat_in,
+                               MatrixDim d_out, MatrixDim d_in) {
+  cuda_copy_from_mat_dd(Gr, Bl, mat_out, mat_in, d_out, d_in);
+}
+inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl,
+                               double* mat_out, const float* mat_in,
+                               MatrixDim d_out, MatrixDim d_in) {
+  cuda_copy_from_mat_df(Gr, Bl, mat_out, mat_in, d_out, d_in);
+}
+
+// cuda_copy_from_mat_trans: one overload per (mat_out, mat_in) element-type
+// pair.  Each forwards to cuda_copy_from_mat_XY_trans, where X is the
+// destination type and Y the source type (f = float, d = double).
+inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl,
+                                     float* mat_out, const double* mat_in,
+                                     MatrixDim d_out, MatrixDim d_in) {
+  cuda_copy_from_mat_fd_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
+}
+inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl,
+                                     float* mat_out, const float* mat_in,
+                                     MatrixDim d_out, MatrixDim d_in) {
+  cuda_copy_from_mat_ff_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
+}
+inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl,
+                                     double* mat_out, const double* mat_in,
+                                     MatrixDim d_out, MatrixDim d_in) {
+  cuda_copy_from_mat_dd_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
+}
+inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl,
+                                     double* mat_out, const float* mat_in,
+                                     MatrixDim d_out, MatrixDim d_in) {
+  cuda_copy_from_mat_df_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
+}
+
+// Float overloads; each forwards its arguments unchanged to the cudaF_* entry
+// point of the same name.
+inline void cuda_copy_col_from_vec(int Gr, int Bl, float* mat, const float* v,
+                                   int col, MatrixDim d) {
+  cudaF_copy_col_from_vec(Gr, Bl, mat, v, col, d);
+}
+inline void cuda_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) {
+  cudaF_apply_exp(Gr, Bl, mat, d);
+}
+inline void cuda_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power,
+                           MatrixDim dim) {
+  cudaF_apply_pow(Gr, Bl, mat, power, dim);
+}
+inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim dim) {
+  cudaF_apply_heaviside(Gr, Bl, mat, dim);
+}
+inline void cuda_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val,
+                             MatrixDim dim) {
+  cudaF_apply_floor(Gr, Bl, mat, floor_val, dim);
+}
+inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val,
+                               MatrixDim dim) {
+  cudaF_apply_ceiling(Gr, Bl, mat, ceiling_val, dim);
+}
+inline void cuda_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src,
+                           const MatrixIndexT_cuda* reorder,
+                           MatrixDim dst_dim, int src_stride) {
+  cudaF_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
+}
+inline void cuda_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src,
+                           const MatrixIndexT_cuda* reorder,
+                           MatrixDim dst_dim, int src_stride) {
+  cudaF_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
+}
+// Float overloads; each forwards its arguments unchanged to the cudaF_* entry
+// point of the same name.  Note that some take int launch parameters and
+// others dim3.
+inline void cuda_trace(int Gr, int Bl, float* mat, float* value, int dim) {
+  cudaF_trace(Gr, Bl, mat, value, dim);
+}
+inline void cuda_set_diag(int Gr, int Bl, float* mat, float value, MatrixDim d) {
+  cudaF_set_diag(Gr, Bl, mat, value, d);
+}
+inline void cuda_set_diag_packed(int Gr, int Bl, float* mat, float value, int dim) {
+  cudaF_set_diag_packed(Gr, Bl, mat, value, dim);
+}
+inline void cuda_add_diag_packed(int Gr, int Bl, float* mat, float value, int dim) {
+  cudaF_add_diag_packed(Gr, Bl, mat, value, dim);
+}
+inline void cuda_set_const(dim3 Gr, dim3 Bl, float* mat, float value, MatrixDim d) {
+  cudaF_set_const(Gr, Bl, mat, value, d);
+}
+inline void cuda_set_zero_above_diag(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) {
+  cudaF_set_zero_above_diag(Gr, Bl, mat, d);
+}
+inline void cuda_add(dim3 Gr, dim3 Bl, float* mat, float value, MatrixDim d) {
+  cudaF_add(Gr, Bl, mat, value, d);
+}
+inline void cuda_add_vec2(dim3 Gr, dim3 Bl, float* mat, const float* vec,
+                          const float alpha, int dim) {
+  cudaF_add_vec2(Gr, Bl, mat, vec, alpha, dim);
+}
+inline void cuda_scale_diag(int Gr, int Bl, float* mat, float value, int dim) {
+  cudaF_scale_diag(Gr, Bl, mat, value, dim);
+}
+inline void cuda_scale(dim3 Gr, dim3 Bl, float* mat, float value, MatrixDim d) {
+  cudaF_scale(Gr, Bl, mat, value, d);
+}
+inline void cuda_apply_log(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) {
+  cudaF_apply_log(Gr, Bl, mat, d);
+}
+// Float element-wise binary operations on (mat, A); both take the destination
+// dimensions plus a separate stride for the source A, and forward to cudaF_*.
+inline void cuda_mul_elements(dim3 Gr, dim3 Bl, float* mat, const float* A,
+                              MatrixDim dst_d, int src_stride) {
+  cudaF_mul_elements(Gr, Bl, mat, A, dst_d, src_stride);
+}
+inline void cuda_max(dim3 Gr, dim3 Bl, float* mat, const float* A,
+                     MatrixDim dst_d, int src_stride) {
+  cudaF_max(Gr, Bl, mat, A, dst_d, src_stride);
+}
+// Float overloads; each forwards its arguments unchanged to the cudaF_* entry
+// point of the same name.
+inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, float* mat, const float* scale,
+                              MatrixDim d) {
+  cudaF_mul_cols_vec(Gr, Bl, mat, scale, d);
+}
+inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, float* mat, const float* scale,
+                              MatrixDim d) {
+  cudaF_mul_rows_vec(Gr, Bl, mat, scale, d);
+}
+inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, float* y, const float* x,
+                                    MatrixDim d, int src_stride, int group_size) {
+  cudaF_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size);
+}
+inline void cuda_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float* y,
+                                  const float* x1, const float* x2,
+                                  MatrixDim d, int src_stride, int group_size,
+                                  float power) {
+  cudaF_calc_pnorm_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size, power);
+}
+inline void cuda_add_mat(dim3 Gr, dim3 Bl, float alpha, const float* A,
+                         float beta, float* dst, MatrixDim d) {
+  cudaF_add_mat(Gr, Bl, alpha, A, beta, dst, d);
+}
+inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, float alpha, const float* col,
+                                 float beta, float* dst, MatrixDim d) {
+  cudaF_add_vec_to_cols(Gr, Bl, alpha, col, beta, dst, d);
+}
+inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, float alpha, const float* row,
+                                 float beta, float* dst, MatrixDim d) {
+  cudaF_add_vec_to_rows(Gr, Bl, alpha, row, beta, dst, d);
+}
+inline void cuda_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) {
+  cudaF_transpose_matrix(Gr, Bl, mat, d);
+}
+inline void cuda_sy_add_tr2(dim3 Gr, dim3 Bl, float alpha, float beta,
+                            const float* T, MatrixDim tdim,
+                            float* S, MatrixDim sdim) {
+  cudaF_sy_add_tr2(Gr, Bl, alpha, beta, T, tdim, S, sdim);
+}
+
 
 /*
 * CuVector
 */
-template<> inline void cuda_add_row_sum_mat<float>(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d) { cudaF_add_row_sum_mat(Gr,Bl,mat,vec_sum,d); }
-template<> inline void cuda_add_col_sum_mat<float>(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d) { cudaF_add_col_sum_mat(Gr,Bl,mat,vec_sum,d); }
-template<> inline void cuda_invert_elements<float>(dim3 Gr, dim3 Bl, float *data, MatrixDim d) { cudaF_invert_elements(Gr,Bl,data,d); }
+// Float overloads; each forwards its arguments unchanged to the cudaF_* entry
+// point of the same name.  cuda_vec_min/max and cuda_trace_mat_mat(_trans)
+// take no Gr/Bl launch parameters.
+inline void cuda_replace_value(int Gr, int Bl, float* v, int dim,
+                               float orig, float changed) {
+  cudaF_replace_value(Gr, Bl, v, dim, orig, changed);
+}
+inline void cuda_div_rows_vec(dim3 Gr, dim3 Bl, float* mat, const float* vec_div,
+                              MatrixDim d) {
+  cudaF_div_rows_vec(Gr, Bl, mat, vec_div, d);
+}
+inline void cuda_set_bias_params(int Gr, int Bl, float* v, const float* a,
+                                 float param_1, float param_2, float param_3,
+                                 int* flag, int dim) {
+  cudaF_set_bias_params(Gr, Bl, v, a, param_1, param_2, param_3, flag, dim);
+}
+inline void cuda_copy_from_vec_df(int Gr, int Bl, double* v_out,
+                                  const float* v_in, int dim) {
+  cudaF_copy_from_vec_df(Gr, Bl, v_out, v_in, dim);
+}
+inline void cuda_copy_from_vec_fd(int Gr, int Bl, float* v_out,
+                                  const float* v_in, int dim) {
+  cudaF_copy_from_vec_fd(Gr, Bl, v_out, v_in, dim);
+}
+inline void cuda_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim) {
+  cudaF_vec_mul_elements(Gr, Bl, v, a, dim);
+}
+inline void cuda_vec_soft_max(int Gr, int Bl, float* v, int dim) {
+  cudaF_vec_soft_max(Gr, Bl, v, dim);
+}
+inline void cuda_vec_min(const float* v, float* value, int dim) {
+  cudaF_vec_min(v, value, dim);
+}
+inline void cuda_vec_max(const float* v, float* value, int dim) {
+  cudaF_vec_max(v, value, dim);
+}
+inline void cuda_trace_mat_mat_trans(const float* A, const float* B,
+                                     MatrixDim dA, int B_stride, float* value) {
+  cudaF_trace_mat_mat_trans(A, B, dA, B_stride, value);
+}
+inline void cuda_trace_mat_mat(const float* A, const float* B,
+                               MatrixDim dA, int B_stride, float* value) {
+  cudaF_trace_mat_mat(A, B, dA, B_stride, value);
+}
+inline void cuda_add_diag_mat_trans(int Gr, int Bl, float alpha, float* v,
+                                    const float* mat, float beta,
+                                    MatrixDim dmat, int dim) {
+  cudaF_add_diag_mat_trans(Gr, Bl, alpha, v, mat, beta, dmat, dim);
+}
+// Float overload: forwards all arguments, in order, to cudaF_add_diag_mat_mat.
+// M and N are each described by a data pointer plus explicit row and column
+// strides.
+inline void cuda_add_diag_mat_mat(int Gr, int Bl, float alpha,
+                                  float* v, int v_dim,
+                                  const float* M, int M_cols,
+                                  int M_row_stride, int M_col_stride,
+                                  const float* N,
+                                  int N_row_stride, int N_col_stride,
+                                  int threads_per_element, float beta) {
+  cudaF_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim,
+                         M, M_cols, M_row_stride, M_col_stride,
+                         N, N_row_stride, N_col_stride,
+                         threads_per_element, beta);
+}
+// Float overloads; each forwards its arguments unchanged to the cudaF_* entry
+// point of the same name.
+inline void cuda_add_diag_mat(int Gr, int Bl, float alpha, float* v,
+                              const float* mat, float beta,
+                              MatrixDim dmat, int dim) {
+  cudaF_add_diag_mat(Gr, Bl, alpha, v, mat, beta, dmat, dim);
+}
+inline void cuda_add_vec_vec(int Gr, int Bl, float alpha, float* v,
+                             const float* x, const float* y,
+                             float beta, int dim) {
+  cudaF_add_vec_vec(Gr, Bl, alpha, v, x, y, beta, dim);
+}
+inline void cuda_copy_col_from_mat(int Gr, int Bl, float* v, int col,
+                                   const float* mat, MatrixDim dmat, int dim) {
+  cudaF_copy_col_from_mat(Gr, Bl, v, col, mat, dmat, dim);
+}
+inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col,
+                                      const float* mat, MatrixDim dmat, int dim) {
+  cudaF_copy_col_from_mat_df(Gr, Bl, v, col, mat, dmat, dim);
+}
+inline void cuda_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col,
+                                      const float* mat, MatrixDim dmat, int dim) {
+  cudaF_copy_col_from_mat_fd(Gr, Bl, v, col, mat, dmat, dim);
+}
+inline void cuda_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc) {
+  cudaF_vec_sum(Gr, Bl, v, value, dim, inc);
+}
+inline void cuda_pvec_sum(int Gr, int Bl, float* vec, float* pvec_sum,
+                          int dim, int size) {
+  cudaF_pvec_sum(Gr, Bl, vec, pvec_sum, dim, size);
+}
+inline void cuda_vec_copy_diag_from_packed(int Gr, int Bl, float* dst,
+                                           const float* src, int dim) {
+  cudaF_vec_copy_diag_from_packed(Gr, Bl, dst, src, dim);
+}
+inline void cuda_vec_apply_floor(int Gr, int Bl, float* v, float floor_val,
+                                 float* num, int dim) {
+  cudaF_vec_apply_floor(Gr, Bl, v, floor_val, num, dim);
+}
+inline void cuda_vec_apply_exp(int Gr, int Bl, float* v, int dim) {
+  cudaF_vec_apply_exp(Gr, Bl, v, dim);
+}
+inline void cuda_vec_apply_log(int Gr, int Bl, float* v, float* flag, int dim) {
+  cudaF_vec_apply_log(Gr, Bl, v, flag, dim);
+}
+inline void cuda_add_row_sum_mat(dim3 Gr, dim3 Bl, const float* mat,
+                                 float* vec_sum, MatrixDim d) {
+  cudaF_add_row_sum_mat(Gr, Bl, mat, vec_sum, d);
+}
+inline void cuda_add_col_sum_mat(dim3 Gr, dim3 Bl, const float* mat,
+                                 float* vec_sum, MatrixDim d) {
+  cudaF_add_col_sum_mat(Gr, Bl, mat, vec_sum, d);
+}
+inline void cuda_invert_elements(dim3 Gr, dim3 Bl, float* data, MatrixDim d) {
+  cudaF_invert_elements(Gr, Bl, data, d);
+}
+// Float overload: forwards all arguments, in order, to cudaF_add_mat_blockmat.
+// B_trans is nonzero if B is transposed.
+inline void cuda_add_mat_blockmat(dim3 Gr, dim3 Bl,
+                                  float* data, MatrixDim d,
+                                  const float* Adata,
+                                  int A_num_rows, int A_num_cols,
+                                  int A_row_stride, int A_col_stride,
+                                  const CuBlockMatrixData* B_cu_data,
+                                  int B_num_blocks,
+                                  float alpha, float beta, int B_trans) {
+  cudaF_add_mat_blockmat(Gr, Bl, data, d,
+                         Adata, A_num_rows, A_num_cols,
+                         A_row_stride, A_col_stride,
+                         B_cu_data, B_num_blocks,
+                         alpha, beta, B_trans);
+}
+// Float overload: forwards all arguments, in order, to
+// cudaF_block_add_mat_mat.  C and D are each given as a data pointer plus
+// explicit row and column strides.
+inline void cuda_block_add_mat_mat(dim3 Gr, dim3 Bl,
+                                   CuBlockMatrixData* B_cu_data, int num_blocks,
+                                   const float* C_data, int C_num_cols,
+                                   int C_row_stride, int C_col_stride,
+                                   const float* D_data,
+                                   int D_row_stride, int D_col_stride,
+                                   float alpha, float beta) {
+  cudaF_block_add_mat_mat(Gr, Bl, B_cu_data, num_blocks,
+                          C_data, C_num_cols, C_row_stride, C_col_stride,
+                          D_data, D_row_stride, D_col_stride,
+                          alpha, beta);
+}
+
+

 /*
 * cu::
 */
-template<> inline void cuda_sigmoid<float>(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d) { cudaF_sigmoid(Gr,Bl,y,x,d); }
-template<> inline void cuda_diff_sigmoid<float>(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d) { cudaF_diff_sigmoid(Gr,Bl,eout,e,y,d); }
-template<> inline void cuda_tanh<float>(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d) { cudaF_tanh(Gr,Bl,y,x,d); }
-template<> inline void cuda_diff_tanh<float>(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d) { cudaF_diff_tanh(Gr,Bl,eout,e,y,d); }
-template<> inline void cuda_softmax<float>(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d) { cudaF_softmax(Gr,Bl,y,x,d); }
-template<> inline void cuda_softmax_part<float>(dim3 Gr, dim3 Bl, const float *X, const int32_cuda *vec_ids, float* Y, MatrixDim d) { cudaF_softmax_part(Gr,Bl,X,vec_ids,Y,d); }
+// Float overloads; each forwards its arguments unchanged to the cudaF_* entry
+// point of the same name.  cuda_softmax takes size_t launch parameters rather
+// than dim3.
+inline void cuda_soft_hinge(dim3 Gr, dim3 Bl, float* y, const float* x,
+                            MatrixDim d, int src_stride) {
+  cudaF_soft_hinge(Gr, Bl, y, x, d, src_stride);
+}
+inline void cuda_group_pnorm(dim3 Gr, dim3 Bl, float* y, const float* x,
+                             MatrixDim d, int src_stride, int group_size,
+                             float power) {
+  cudaF_group_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power);
+}
+inline void cuda_sigmoid(dim3 Gr, dim3 Bl, float* y, const float* x,
+                         MatrixDim d, int src_stride) {
+  cudaF_sigmoid(Gr, Bl, y, x, d, src_stride);
+}
+inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, float* eout, const float* e,
+                              const float* y, MatrixDim d, int src_stride) {
+  cudaF_diff_sigmoid(Gr, Bl, eout, e, y, d, src_stride);
+}
+inline void cuda_tanh(dim3 Gr, dim3 Bl, float* y, const float* x,
+                      MatrixDim d, int src_stride) {
+  cudaF_tanh(Gr, Bl, y, x, d, src_stride);
+}
+inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, float* eout, const float* e,
+                           const float* y, MatrixDim d) {
+  cudaF_diff_tanh(Gr, Bl, eout, e, y, d);
+}
+inline void cuda_softmax(size_t Gr, size_t Bl, float* y, const float* x,
+                         MatrixDim d) {
+  cudaF_softmax(Gr, Bl, y, x, d);
+}
+/*
+  Launch configuration for cuda_softmax_reduce:
+    Bl: the block size (dimBlock), fixed at min(d.col, CU1DBLOCK), so that up
+        to CU1DBLOCK threads reduce one row at the same time.
+    Gr: the number of rows.
+*/
+inline void cuda_softmax_reduce(size_t Gr, size_t Bl, float* y, const float* x,
+                                MatrixDim d, int src_stride) {
+  cudaF_softmax_reduce(Gr, Bl, y, x, d, src_stride);
+}
+// Float overload; forwards to cudaF_softmax_part.
+inline void cuda_softmax_part(dim3 Gr, dim3 Bl, const float* X,
+                              const int32_cuda* vec_ids, float* Y, MatrixDim d) {
+  cudaF_softmax_part(Gr, Bl, X, vec_ids, Y, d);
+}

-template<> inline void cuda_regularize_l1<float>(dim3 Gr, dim3 Bl, float *wei, float *grad, float l1, float lr, MatrixDim d) { cudaF_regularize_l1(Gr,Bl,wei,grad,l1,lr,d); }
-template<> inline void cuda_find_row_max_id<float>(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaF_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); }
-template<> inline void cuda_diff_xent<float>(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, float *mat_net_out, float *vec_log_post, MatrixDim d) { cudaF_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d); }
-
-template<> inline void cuda_randomize<float>(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaF_randomize(Gr,Bl,y,x,copy_from,d_out,d_in); }
-
-template<> inline void cuda_splice<float>(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { cudaF_splice(Gr,Bl,y,x,off,d_out,d_in); }
-template<> inline void cuda_copy<float>(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaF_copy(Gr,Bl,y,x,copy_from,d_out,d_in); }
+// Float overloads; each forwards its arguments unchanged to the cudaF_* entry
+// point of the same name.
+inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, float* wei, float* grad,
+                               float l1, float lr, MatrixDim d) {
+  cudaF_regularize_l1(Gr, Bl, wei, grad, l1, lr, d);
+}
+inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const float* mat,
+                                 float* vec_val, int32_cuda* vec_id,
+                                 int32_cuda voff, MatrixDim d) {
+  cudaF_find_row_max_id(Gr, Bl, mat, vec_val, vec_id, voff, d);
+}
+inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda* vec_tgt,
+                           float* mat_net_out, float* vec_log_post, MatrixDim d) {
+  cudaF_diff_xent(Gr, Bl, vec_tgt, mat_net_out, vec_log_post, d);
+}
+inline void cuda_copy_rows_from_vec(dim3 Gr, dim3 Bl, float* mat_out,
+                                    MatrixDim d_out, const float* v_in) {
+  cudaF_copy_rows_from_vec(Gr, Bl, mat_out, d_out, v_in);
+}


-/*********************************************************
- * double specializations
- */
+// Float overloads; each forwards its arguments unchanged to the cudaF_* entry
+// point of the same name.
+inline void cuda_randomize(dim3 Gr, dim3 Bl, float* y, const float* x,
+                           const int32_cuda* copy_from,
+                           MatrixDim d_out, MatrixDim d_in) {
+  cudaF_randomize(Gr, Bl, y, x, copy_from, d_out, d_in);
+}
+
+inline void cuda_splice(dim3 Gr, dim3 Bl, float* y, const float* x,
+                        const int32_cuda* off,
+                        MatrixDim d_out, MatrixDim d_in) {
+  cudaF_splice(Gr, Bl, y, x, off, d_out, d_in);
+}
+inline void cuda_one(int Gr, int Bl, float* x, int dim) {
+  cudaF_one(Gr, Bl, x, dim);
+}
+inline void cuda_copy(dim3 Gr, dim3 Bl, float* y, const float* x,
+                      const int32_cuda* copy_from,
+                      MatrixDim d_out, MatrixDim d_in) {
+  cudaF_copy(Gr, Bl, y, x, copy_from, d_out, d_in);
+}
+inline void cuda_copy_from_sp(int Gr, int Bl, const float* x, float* y,
+                              int d_in, MatrixDim d_out) {
+  cudaF_copy_from_sp(Gr, Bl, x, y, d_in, d_out);
+}
+inline void cuda_take_lower(dim3 Gr, dim3 Bl, const float* x, float* y,
+                            MatrixDim d_in) {
+  cudaF_take_lower(Gr, Bl, x, y, d_in);
+}
+inline void cuda_take_upper(dim3 Gr, dim3 Bl, const float* x, float* y,
+                            MatrixDim d_in) {
+  cudaF_take_upper(Gr, Bl, x, y, d_in);
+}
+inline void cuda_take_mean(dim3 Gr, dim3 Bl, const float* x, float* y,
+                           MatrixDim d_in) {
+  cudaF_take_mean(Gr, Bl, x, y, d_in);
+}
+inline void cuda_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement<float>* x,
+                                int32 size, const float* z, MatrixDim d,
+                                float* z2, MatrixDim d2, float* t) {
+  cudaF_comp_obj_deriv(Gr, Bl, x, size, z, d, z2, d2, t);
+}
+// Float overloads taking Int32Pair index arrays; each forwards its arguments
+// unchanged to the cudaF_* entry point of the same name.
+inline void cuda_sum_column_ranges(dim3 Gr, dim3 Bl,
+                                   float* data, MatrixDim dim,
+                                   const float* src_data, MatrixDim src_dim,
+                                   const Int32Pair* indices) {
+  cudaF_sum_column_ranges(Gr, Bl, data, dim, src_data, src_dim, indices);
+}
+inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl,
+                               const float* data, MatrixDim dim,
+                               const Int32Pair* indices, int indices_size,
+                               float* output) {
+  cudaF_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output);
+}
+
+
+// double versions

 /*
 * CuMatrix 
 */
-template<> inline void cuda_set_const<double>(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_set_const(Gr,Bl,mat,value,d); }
-template<> inline void cuda_add<double>(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_add(Gr,Bl,mat,value,d); }
-template<> inline void cuda_scale<double>(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_scale(Gr,Bl,mat,value,d); }
-template<> inline void cuda_apply_log<double>(dim3 Gr, dim3 Bl, double *mat, MatrixDim d) { cudaD_apply_log(Gr,Bl,mat,d); }
-template<> inline void cuda_mul_elements<double>(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim d) { cudaD_mul_elements(Gr,Bl,mat,A,d); }
-template<> inline void cuda_mul_cols_vec<double>(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d) { cudaD_mul_cols_vec(Gr,Bl,mat,scale,d); }
-template<> inline void cuda_mul_rows_vec<double>(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d) { cudaD_mul_rows_vec(Gr,Bl,mat,scale,d); }
-template<> inline void cuda_div_rows_vec<double>(dim3 Gr, dim3 Bl, double *mat, const double *vec_div, MatrixDim d) { cudaD_div_rows_vec(Gr,Bl,mat,vec_div,d); }
-template<> inline void cuda_add_mat<double>(dim3 Gr, dim3 Bl, double alpha, const double *A, double beta, double *dst, MatrixDim d) { cudaD_add_mat(Gr,Bl,alpha,A,beta,dst,d); }
-template<> inline void cuda_add_vec_to_cols<double>(dim3 Gr, dim3 Bl, double alpha, const double *col, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); }
-template<> inline void cuda_add_vec_to_rows<double>(dim3 Gr, dim3 Bl, double alpha, const double *row, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_rows(Gr,Bl,alpha,row,beta,dst,d); }
- 
+// Double-precision overloads.  Each wrapper hands its arguments, unchanged and
+// in the same order, to the cudaD_* entry point of the same name (cudaDF_*
+// when the source pointer B is float).
+inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA) {
+  cudaD_copy_upp_low(Gr, Bl, A, dimA);
+}
+inline void cuda_copy_low_upp(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA) {
+  cudaD_copy_low_upp(Gr, Bl, A, dimA);
+}
+inline void cuda_add_diag_vec_mat(dim3 Gr, dim3 Bl, double alpha,
+                                  double* mat, MatrixDim mat_dim,
+                                  const double* vec, const double* mat2,
+                                  int mat2_row_stride, int mat2_col_stride,
+                                  double beta) {
+  cudaD_add_diag_vec_mat(Gr, Bl, alpha, mat, mat_dim, vec, mat2,
+                         mat2_row_stride, mat2_col_stride, beta);
+}
+inline void cuda_copy_from_tp_trans(int Gr, int Bl, double* A, const double* B,
+                                    MatrixDim dmat) {
+  cudaD_copy_from_tp_trans(Gr, Bl, A, B, dmat);
+}
+inline void cuda_copy_from_tp_trans(int Gr, int Bl, double* A, const float* B,
+                                    MatrixDim dmat) {
+  cudaDF_copy_from_tp_trans(Gr, Bl, A, B, dmat);
+}
+inline void cuda_copy_from_tp(int Gr, int Bl, double* A, const double* B,
+                              MatrixDim dmat) {
+  cudaD_copy_from_tp(Gr, Bl, A, B, dmat);
+}
+inline void cuda_copy_from_tp(int Gr, int Bl, double* A, const float* B,
+                              MatrixDim dmat) {
+  cudaDF_copy_from_tp(Gr, Bl, A, B, dmat);
+}
+// Double overloads; each forwards its arguments unchanged to the cudaD_* entry
+// point of the same name.
+inline void cuda_copy_col_from_vec(int Gr, int Bl, double* mat, const double* v,
+                                   int col, MatrixDim d) {
+  cudaD_copy_col_from_vec(Gr, Bl, mat, v, col, d);
+}
+inline void cuda_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) {
+  cudaD_apply_exp(Gr, Bl, mat, d);
+}
+inline void cuda_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power,
+                           MatrixDim dim) {
+  cudaD_apply_pow(Gr, Bl, mat, power, dim);
+}
+inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim dim) {
+  cudaD_apply_heaviside(Gr, Bl, mat, dim);
+}
+inline void cuda_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val,
+                             MatrixDim dim) {
+  cudaD_apply_floor(Gr, Bl, mat, floor_val, dim);
+}
+inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val,
+                               MatrixDim dim) {
+  cudaD_apply_ceiling(Gr, Bl, mat, ceiling_val, dim);
+}
+inline void cuda_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src,
+                           const MatrixIndexT_cuda* reorder,
+                           MatrixDim dst_dim, int src_stride) {
+  cudaD_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
+}
+inline void cuda_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src,
+                           const MatrixIndexT_cuda* reorder,
+                           MatrixDim dst_dim, int src_stride) {
+  cudaD_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
+}
+// Double overloads; each forwards its arguments unchanged to the cudaD_* entry
+// point of the same name.  Note that some take int launch parameters and
+// others dim3.
+inline void cuda_trace(int Gr, int Bl, double* mat, double* value, int dim) {
+  cudaD_trace(Gr, Bl, mat, value, dim);
+}
+inline void cuda_set_diag(int Gr, int Bl, double* mat, double value, MatrixDim d) {
+  cudaD_set_diag(Gr, Bl, mat, value, d);
+}
+inline void cuda_set_diag_packed(int Gr, int Bl, double* mat, double value, int dim) {
+  cudaD_set_diag_packed(Gr, Bl, mat, value, dim);
+}
+inline void cuda_add_diag_packed(int Gr, int Bl, double* mat, double value, int dim) {
+  cudaD_add_diag_packed(Gr, Bl, mat, value, dim);
+}
+inline void cuda_set_const(dim3 Gr, dim3 Bl, double* mat, double value, MatrixDim d) {
+  cudaD_set_const(Gr, Bl, mat, value, d);
+}
+inline void cuda_set_zero_above_diag(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) {
+  cudaD_set_zero_above_diag(Gr, Bl, mat, d);
+}
+inline void cuda_add(dim3 Gr, dim3 Bl, double* mat, double value, MatrixDim d) {
+  cudaD_add(Gr, Bl, mat, value, d);
+}
+inline void cuda_add_vec2(dim3 Gr, dim3 Bl, double* mat, const double* vec,
+                          const double alpha, int dim) {
+  cudaD_add_vec2(Gr, Bl, mat, vec, alpha, dim);
+}
+inline void cuda_scale_diag(int Gr, int Bl, double* mat, double value, int dim) {
+  cudaD_scale_diag(Gr, Bl, mat, value, dim);
+}
+inline void cuda_scale(dim3 Gr, dim3 Bl, double* mat, double value, MatrixDim d) {
+  cudaD_scale(Gr, Bl, mat, value, d);
+}
+inline void cuda_apply_log(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) {
+  cudaD_apply_log(Gr, Bl, mat, d);
+}
+// Double element-wise binary operations on (mat, A); both take the destination
+// dimensions plus a separate stride for the source A, and forward to cudaD_*.
+inline void cuda_mul_elements(dim3 Gr, dim3 Bl, double* mat, const double* A,
+                              MatrixDim dst_d, int src_stride) {
+  cudaD_mul_elements(Gr, Bl, mat, A, dst_d, src_stride);
+}
+inline void cuda_max(dim3 Gr, dim3 Bl, double* mat, const double* A,
+                     MatrixDim dst_d, int src_stride) {
+  cudaD_max(Gr, Bl, mat, A, dst_d, src_stride);
+}
+// Double overloads; each forwards its arguments unchanged to the cudaD_* entry
+// point of the same name.
+inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, double* mat, const double* scale,
+                              MatrixDim d) {
+  cudaD_mul_cols_vec(Gr, Bl, mat, scale, d);
+}
+inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, double* mat, const double* scale,
+                              MatrixDim d) {
+  cudaD_mul_rows_vec(Gr, Bl, mat, scale, d);
+}
+inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, double* y, const double* x,
+                                    MatrixDim d, int src_stride, int group_size) {
+  cudaD_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size);
+}
+inline void cuda_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double* y,
+                                  const double* x1, const double* x2,
+                                  MatrixDim d, int src_stride, int group_size,
+                                  double power) {
+  cudaD_calc_pnorm_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size, power);
+}
+inline void cuda_add_mat(dim3 Gr, dim3 Bl, double alpha, const double* A,
+                         double beta, double* dst, MatrixDim d) {
+  cudaD_add_mat(Gr, Bl, alpha, A, beta, dst, d);
+}
+inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, double alpha, const double* col,
+                                 double beta, double* dst, MatrixDim d) {
+  cudaD_add_vec_to_cols(Gr, Bl, alpha, col, beta, dst, d);
+}
+inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, double alpha, const double* row,
+                                 double beta, double* dst, MatrixDim d) {
+  cudaD_add_vec_to_rows(Gr, Bl, alpha, row, beta, dst, d);
+}
+inline void cuda_transpose_matrix(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) {
+  cudaD_transpose_matrix(Gr, Bl, mat, d);
+}
+inline void cuda_sy_add_tr2(dim3 Gr, dim3 Bl, double alpha, double beta,
+                            const double* T, MatrixDim tdim,
+                            double* S, MatrixDim sdim) {
+  cudaD_sy_add_tr2(Gr, Bl, alpha, beta, T, tdim, S, sdim);
+}
+
+
 /*
 * CuVector
 */
-template<> inline void cuda_add_row_sum_mat<double>(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d) { cudaD_add_row_sum_mat(Gr,Bl,mat,vec_sum,d); }
-template<> inline void cuda_add_col_sum_mat<double>(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d) { cudaD_add_col_sum_mat(Gr,Bl,mat,vec_sum,d); }
-template<> inline void cuda_invert_elements<double>(dim3 Gr, dim3 Bl, double *data, MatrixDim d) { cudaD_invert_elements(Gr,Bl,data,d); }
+// Double-precision overloads of the CuVector kernel wrappers.  Each one
+// forwards its arguments unchanged to the cudaD_* function of the same name.
+inline void cuda_replace_value(int Gr, int Bl, double *v, int dim, double orig, double changed) {cudaD_replace_value(Gr, Bl, v, dim, orig, changed); }
+inline void cuda_div_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *vec_div, MatrixDim d) { cudaD_div_rows_vec(Gr,Bl,mat,vec_div,d); }
+inline void cuda_set_bias_params(int Gr, int Bl, double* v, const double* a, double param_1, double param_2, double param_3, int* flag, int dim) { cudaD_set_bias_params(Gr,Bl,v,a,param_1,param_2,param_3,flag,dim); }
+// The _df / _fd variants differ in the element type of the output pointer:
+// _df writes to double*, _fd writes to float*; both read from const double*.
+inline void cuda_copy_from_vec_df(int Gr, int Bl, double* v_out, const double* v_in, int dim) { cudaD_copy_from_vec_df(Gr,Bl,v_out,v_in,dim); }
+inline void cuda_copy_from_vec_fd(int Gr, int Bl, float* v_out, const double* v_in, int dim) { cudaD_copy_from_vec_fd(Gr,Bl,v_out,v_in,dim); }
+inline void cuda_vec_mul_elements(int Gr, int Bl, double* v, const double* a, int dim) { cudaD_vec_mul_elements(Gr,Bl,v,a,dim); }
+inline void cuda_vec_soft_max(int Gr, int Bl, double* v, int dim) { cudaD_vec_soft_max(Gr,Bl,v,dim); }
+// The next four wrappers take no Gr/Bl arguments; the result is written
+// through the 'value' pointer.
+inline void cuda_vec_min(const double* v, double* value, int dim) { cudaD_vec_min(v,value,dim); }
+inline void cuda_vec_max(const double* v, double* value, int dim) { cudaD_vec_max(v,value,dim); }
+inline void cuda_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat_trans(A,B,dA,B_stride,value); }
+inline void cuda_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat(A,B,dA,B_stride,value); }
+inline void cuda_add_diag_mat_trans(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) { cudaD_add_diag_mat_trans(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
+// M and N are each described by separate row and column strides.
+inline void cuda_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, 
+                                  int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, 
+                                  int N_col_stride, int threads_per_element, double beta) {
+  cudaD_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride,
+                         N_col_stride, threads_per_element, beta);
+}
+inline void cuda_add_diag_mat(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) { cudaD_add_diag_mat(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
+inline void cuda_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, const double* y, double beta, int dim) { cudaD_add_vec_vec(Gr,Bl,alpha,v,x,y,beta,dim); }
+inline void cuda_copy_col_from_mat(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat(Gr,Bl,v,col,mat,dmat,dim); }
+inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat_df(Gr,Bl,v,col,mat,dmat,dim); }
+inline void cuda_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat_fd(Gr,Bl,v,col,mat,dmat,dim); }
+inline void cuda_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc) { cudaD_vec_sum(Gr,Bl,v,value,dim,inc); }
+inline void cuda_pvec_sum(int Gr, int Bl, double* vec, double* pvec_sum, int dim, int size) { cudaD_pvec_sum(Gr,Bl,vec,pvec_sum,dim,size); }
+inline void cuda_vec_copy_diag_from_packed(int Gr, int Bl, double *dst, const double *src, int dim) { cudaD_vec_copy_diag_from_packed(Gr,Bl,dst,src,dim); }
+// Note: 'num' is a float* even in this double-precision overload.
+inline void cuda_vec_apply_floor(int Gr, int Bl, double* v, double floor_val, float* num, int dim) { cudaD_vec_apply_floor(Gr,Bl,v,floor_val,num,dim); }
+inline void cuda_vec_apply_exp(int Gr, int Bl, double* v, int dim) { cudaD_vec_apply_exp(Gr,Bl,v,dim); }
+inline void cuda_vec_apply_log(int Gr, int Bl, double* v, double* flag, int dim) { cudaD_vec_apply_log(Gr,Bl,v,flag,dim); }
+inline void cuda_add_row_sum_mat(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d) { cudaD_add_row_sum_mat(Gr,Bl,mat,vec_sum,d); }
+inline void cuda_add_col_sum_mat(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d) { cudaD_add_col_sum_mat(Gr,Bl,mat,vec_sum,d); }
+inline void cuda_invert_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim d) { cudaD_invert_elements(Gr,Bl,data,d); }
+// Double-precision wrappers for the block-matrix kernels; both forward their
+// arguments unchanged to the cudaD_* function of the same name.
+// In cuda_add_mat_blockmat, A is described by explicit row and column strides
+// and B by an array of B_num_blocks CuBlockMatrixData entries.
+// B_trans nonzero if B transposed.
+inline void cuda_add_mat_blockmat(dim3 Gr, dim3 Bl, double *data, MatrixDim d, const double *Adata,
+                                  int A_num_rows, int A_num_cols, int A_row_stride, int A_col_stride,
+                                  const CuBlockMatrixData *B_cu_data, int B_num_blocks,
+                                  double alpha, double beta, int B_trans) {
+  cudaD_add_mat_blockmat(Gr, Bl, data, d, Adata, A_num_rows, A_num_cols, A_row_stride, A_col_stride,
+                         B_cu_data, B_num_blocks, alpha, beta, B_trans);
+}
+// Here the block-matrix data B_cu_data is the non-const operand; C and D are
+// read-only and each described by explicit row and column strides.
+inline void cuda_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int num_blocks,
+                                   const double *C_data, int C_num_cols, int C_row_stride, int C_col_stride,
+                                   const double *D_data, int D_row_stride, int D_col_stride,
+                                   double alpha, double beta) {
+  cudaD_block_add_mat_mat(Gr, Bl, B_cu_data, num_blocks, C_data, C_num_cols,
+                          C_row_stride, C_col_stride, D_data, D_row_stride,
+                          D_col_stride, alpha, beta);
+}

 /*
 * cu::
 */
-template<> inline void cuda_sigmoid<double>(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d) { cudaD_sigmoid(Gr,Bl,y,x,d); }
-template<> inline void cuda_diff_sigmoid<double>(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d) { cudaD_diff_sigmoid(Gr,Bl,eout,e,y,d); }
-template<> inline void cuda_tanh<double>(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d) { cudaD_tanh(Gr,Bl,y,x,d); }
-template<> inline void cuda_diff_tanh<double>(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d) { cudaD_diff_tanh(Gr,Bl,eout,e,y,d); }
-template<> inline void cuda_softmax<double>(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d) { cudaD_softmax(Gr,Bl,y,x,d); }
-template<> inline void cuda_softmax_part<double>(dim3 Gr, dim3 Bl, const double *X, const int32_cuda *vec_ids, double* Y, MatrixDim d) { cudaD_softmax_part(Gr,Bl,X,vec_ids,Y,d); }
+// Double-precision overloads of the cu:: activation-function wrappers; each
+// forwards its arguments unchanged to the cudaD_* function of the same name.
+// Several of them take the source stride separately (src_stride) in addition
+// to the MatrixDim d; cuda_diff_tanh, cuda_softmax and cuda_softmax_part take
+// only d.
+inline void cuda_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_soft_hinge(Gr,Bl,y,x,d,src_stride); }
+inline void cuda_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power) { cudaD_group_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power); }
+inline void cuda_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_sigmoid(Gr,Bl,y,x,d,src_stride); }
+inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int src_stride) { cudaD_diff_sigmoid(Gr,Bl,eout,e,y,d,src_stride); }
+inline void cuda_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_tanh(Gr,Bl,y,x,d,src_stride); }
+inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d) { cudaD_diff_tanh(Gr,Bl,eout,e,y,d); }
+// Note: the two softmax wrappers below take size_t Gr, Bl rather than dim3.
+inline void cuda_softmax(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d) { cudaD_softmax(Gr,Bl,y,x,d); }
+inline void cuda_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_softmax_reduce(Gr,Bl,y,x,d,src_stride); }
+inline void cuda_softmax_part(dim3 Gr, dim3 Bl, const double *X, const int32_cuda *vec_ids, double* Y, MatrixDim d) { cudaD_softmax_part(Gr,Bl,X,vec_ids,Y,d); }

-template<> inline void cuda_regularize_l1<double>(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1, double lr, MatrixDim d) { cudaD_regularize_l1(Gr,Bl,wei,grad,l1,lr,d); }
-template<> inline void cuda_find_row_max_id<double>(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaD_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); }
-template<> inline void cuda_diff_xent<double>(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, double *mat_net_out, double *vec_log_post, MatrixDim d) { cudaD_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d); }
+// Double-precision overloads; each forwards its arguments unchanged to the
+// cudaD_* function of the same name.
+// In cuda_regularize_l1 both wei and grad are non-const pointers.
+inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1, double lr, MatrixDim d) { cudaD_regularize_l1(Gr,Bl,wei,grad,l1,lr,d); }
+inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaD_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); }
+// vec_tgt is read-only; mat_net_out and vec_log_post are non-const pointers.
+inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, double *mat_net_out, double *vec_log_post, MatrixDim d) {
+  cudaD_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d);
+}
+inline void cuda_copy_rows_from_vec(dim3 Gr, dim3 Bl, double *mat_out, MatrixDim d_out, const double *v_in) {
+  cudaD_copy_rows_from_vec(Gr, Bl, mat_out, d_out, v_in);
+}

-template<> inline void cuda_randomize<double>(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaD_randomize(Gr,Bl,y,x,copy_from,d_out,d_in); }
-template<> inline void cuda_splice<double>(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { cudaD_splice(Gr,Bl,y,x,off,d_out,d_in); }
-template<> inline void cuda_copy<double>(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaD_copy(Gr,Bl,y,x,copy_from,d_out,d_in); }
+// Double-precision overloads; each forwards its arguments unchanged to the
+// cudaD_* function of the same name.
+// cuda_randomize, cuda_splice and cuda_copy share one parameter layout:
+// output pointer y, input pointer x, an int32_cuda index array, then the
+// output and input MatrixDim.
+inline void cuda_randomize(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaD_randomize(Gr,Bl,y,x,copy_from,d_out,d_in); }
+inline void cuda_splice(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { cudaD_splice(Gr,Bl,y,x,off,d_out,d_in); }
+inline void cuda_one(int Gr,int Bl,double* x,int dim) { cudaD_one(Gr,Bl,x,dim); }
+inline void cuda_copy(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaD_copy(Gr,Bl,y,x,copy_from,d_out,d_in); }
+// Note the argument order in the next four wrappers: input x first, then
+// output y.
+inline void cuda_copy_from_sp(int Gr, int Bl, const double* x, double* y, int d_in, MatrixDim d_out) { cudaD_copy_from_sp(Gr,Bl,x,y,d_in,d_out); }
+inline void cuda_take_lower(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in) { cudaD_take_lower(Gr,Bl,x,y,d_in); }
+inline void cuda_take_upper(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in) { cudaD_take_upper(Gr,Bl,x,y,d_in); }
+inline void cuda_take_mean(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in) { cudaD_take_mean(Gr,Bl,x,y,d_in); }
+inline void cuda_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement<double>* x, int32 size, const double* z, MatrixDim d, double* z2, MatrixDim d2, double* t) {cudaD_comp_obj_deriv(Gr,Bl,x,size,z,d,z2,d2,t); }
+// 'indices' is an array of Int32Pair (presumably one column range per output
+// column -- confirm against the kernel).
+inline void cuda_sum_column_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim,
+                                   const double *src_data, MatrixDim src_dim, const Int32Pair *indices) {
+  cudaD_sum_column_ranges(Gr, Bl, data, dim, src_data, src_dim, indices);
+}
+// 'indices' holds indices_size Int32Pair entries; results are written to
+// 'output'.
+inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl, const double *data,
+                               MatrixDim dim, const Int32Pair *indices,
+                               int indices_size, double *output) {
+  cudaD_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output);
+}

-} // namespace
+
+// Also include some template-friendly wrappers of cublas functions:
+// Single-precision axpy: forwards to cublasSaxpy, which per the cuBLAS
+// documentation computes y := alpha * x + y over n elements with strides
+// incx / incy.
+inline void cuda_axpy(int n, float alpha, const float *x, int incx, float *y, int incy) {
+  cublasSaxpy(n, alpha, x, incx, y, incy);
+}
+// Double-precision axpy: forwards to cublasDaxpy.
+inline void cuda_axpy(int n, double alpha, const double *x, int incx, double *y, int incy) {
+  cublasDaxpy(n, alpha, x, incx, y, incy);
+}
+// Single-precision scal: forwards to cublasSscal, which per the cuBLAS
+// documentation computes x := alpha * x over n elements with stride incx.
+inline void cuda_scal(int n, float alpha, float *x, int incx) {
+  cublasSscal(n, alpha, x, incx);
+}
+// Double-precision scal: forwards to cublasDscal.
+inline void cuda_scal(int n, double alpha, double *x, int incx) {
+  cublasDscal(n, alpha, x, incx);
+}
+
+
+} // namespace kaldi



 #endif // HAVE_CUDA

 #endif
-
-
--- a/src/cudamatrix/cu-math-test.cc
+++ b/src/cudamatrix/cu-math-test.cc
@ -0,0 +1,181 @@
+// cudamatrix/cu-math-test.cc
+
+// Copyright 2013 Johns Hopkins University (Author: David Snyder)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "cudamatrix/cu-matrix-lib.h"
+#include "cudamatrix/cu-math.h"
+#include "cudamatrix/cu-array.h"
+
+using namespace kaldi;
+
+
+namespace kaldi {
+
+
+/*
+ * Unit tests
+ */
+      
+// Checks cu::Randomize(): draws one random source-row index per target row,
+// runs the row shuffle, and verifies tgt(r, c) == src(row_map[r], c) for
+// every element.
+template<typename Real>
+static void UnitTestCuMathRandomize() {
+  int32 M = 100 + rand() % 200, N = 100 + rand() % 200;
+  CuMatrix<Real> src(M, N);
+  CuMatrix<Real> tgt(M, N);
+  CuArray<int32> copy_from_idx;
+  src.SetRandn();
+
+  const int32 num_rows = src.NumRows(), num_cols = src.NumCols();
+
+  // Host-side copy of the row mapping, kept for the verification below.
+  std::vector<int32> row_map(num_rows);
+  for (int32 r = 0; r < num_rows; r++)
+    row_map[r] = rand() % num_rows;
+
+  copy_from_idx.CopyFromVec(row_map);
+  cu::Randomize(src, copy_from_idx, &tgt);
+
+  for (int32 r = 0; r < num_rows; r++) {
+    const int32 src_row = row_map[r];
+    for (int32 c = 0; c < num_cols; c++) {
+      Real expected = src(src_row, c);
+      Real actual = tgt(r, c);
+      AssertEqual(expected, actual);
+    }
+  }
+}
+
+
+// Checks cu::Copy(): draws one random source-column index per target column,
+// runs the column copy, and verifies tgt(r, c) == src(r, col_map[c]) for
+// every element.
+template<typename Real>
+static void UnitTestCuMathCopy() {
+  int32 M = 100 + rand() % 200, N = 100 + rand() % 200;
+  CuMatrix<Real> src(M, N);
+  CuMatrix<Real> tgt(M, N);
+  CuArray<int32> copy_from_idx;
+  src.SetRandn();
+
+  const int32 num_rows = src.NumRows(), num_cols = src.NumCols();
+
+  // Host-side copy of the column mapping, kept for the verification below.
+  std::vector<int32> col_map(num_cols);
+  for (int32 c = 0; c < num_cols; c++)
+    col_map[c] = rand() % num_cols;
+
+  copy_from_idx.CopyFromVec(col_map);
+  cu::Copy(src, copy_from_idx, &tgt);
+
+  for (int32 r = 0; r < num_rows; r++) {
+    for (int32 c = 0; c < num_cols; c++) {
+      Real expected = src(r, col_map[c]);
+      Real actual = tgt(r, c);
+      AssertEqual(expected, actual);
+    }
+  }
+}
+
+// Checks cu::Splice(): builds a random set of row offsets, splices src into
+// tgt, and verifies that block k of row i of tgt equals row
+// (i + frame_offsets[k]) of src, with the row index clamped to
+// [0, n_rows - 1].
+template<typename Real> 
+static void UnitTestCuMathSplice() {
+  int32 M = 100 + rand() % 200, N = 100 + rand() % 200;
+  CuMatrix<Real> src(M, N);
+  CuArray<int32> frame_offsets;
+
+  src.SetRandn(); 
+  int32 n_rows = src.NumRows();
+  int32 n_columns = src.NumCols();
+  std::vector<int32> frame_offsets_vec;
+
+  // The number of columns of tgt is cols(src) 
+  // times n_frame_offsets, so we keep n_frame_offsets 
+  // reasonably small (2 <= n <= 8).
+  int32 n_frame_offsets = rand() % 7 + 2;
+  for (int32 i = 0; i < n_frame_offsets; i++) {
+    // Offsets are row offsets, drawn uniformly from [-n_rows, n_rows] so that
+    // in-range shifts and clamping at both the first and the last row are all
+    // exercised.  (Without the parentheses, "rand() % 2 * n - n" only ever
+    // yields -n or 0.)
+    frame_offsets_vec.push_back(rand() % (2 * n_rows + 1) - n_rows);
+  }
+
+  CuMatrix<Real> tgt(M, N * n_frame_offsets);
+  frame_offsets.CopyFromVec(frame_offsets_vec);
+  cu::Splice(src, frame_offsets, &tgt);
+
+  for (int32 i = 0; i < n_rows; i++) {
+    for (int32 k = 0; k < n_frame_offsets; k++) {
+      for (int32 j = 0; j < n_columns; j++) {
+        Real src_val; 
+        if (i + frame_offsets_vec.at(k) >= n_rows) {
+          // Past the last row: Splice repeats the last row.
+          src_val = src(n_rows-1, j);
+        } else if (i + frame_offsets_vec.at(k) <= 0) {
+          // Before (or at) the first row: Splice repeats the first row.
+          src_val = src(0, j);
+        } else {
+          src_val = src(i + frame_offsets_vec.at(k), j); 
+        }
+        Real tgt_val = tgt(i, k * n_columns + j);
+        AssertEqual(src_val, tgt_val);
+      }
+    }
+  }
+}
+
+// Runs every cu-math unit test for the floating-point type Real.
+// No device-capability check is done here: main() only instantiates this with
+// Real = double after checking DoublePrecisionSupported().  A check placed
+// here as a brace-less "if" would guard only the first call and would also
+// skip it for Real = float on devices without double-precision support.
+template<typename Real> void CudaMathUnitTest() {
+  UnitTestCuMathRandomize<Real>();
+  UnitTestCuMathSplice<Real>();
+  UnitTestCuMathCopy<Real>();
+}
+
+
+} // namespace kaldi
+
+
+// Runs the cu-math unit tests twice.  When compiled with CUDA the first pass
+// selects "no" GPU and the second pass selects "yes"; without CUDA both
+// passes run the same CPU code.  Float is always tested; double is tested
+// whenever it is available.
+int main() {
+  for (int32 loop = 0; loop < 2; loop++) {
+#if HAVE_CUDA == 1
+    if (loop == 0)
+      CuDevice::Instantiate().SelectGpuId("no"); // "no": do not use a GPU
+    else
+      CuDevice::Instantiate().SelectGpuId("yes"); // "yes": use a GPU
+#endif
+    srand(time(NULL));
+    kaldi::CudaMathUnitTest<float>();
+    
+#if HAVE_CUDA == 1
+    if (CuDevice::Instantiate().DoublePrecisionSupported()) {
+      kaldi::CudaMathUnitTest<double>();
+    } else {
+      KALDI_WARN << "Double precision not supported";
+    }
+#else
+    // Without CUDA there is no device restriction on double precision; the
+    // float tests have already run above, so run the double tests here.
+    kaldi::CudaMathUnitTest<double>();
+#endif
+
+    if (loop == 0)
+      KALDI_LOG << "Tests without GPU use succeeded.\n";
+    else
+      KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
+  }
+#if HAVE_CUDA == 1
+  CuDevice::Instantiate().PrintProfile();
+#endif
+  return 0;
+}
+
--- a/src/cudamatrix/cu-math.cc
+++ b/src/cudamatrix/cu-math.cc
@ -36,15 +36,15 @@ namespace cu {
 template<typename Real>
 void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1, Real lr) {
  KALDI_ASSERT(SameDim(*weight, *grad));
-#if HAVE_CUDA==1 
+#if HAVE_CUDA == 1 
  if (CuDevice::Instantiate().Enabled()) { 
    Timer tim;

-    dim3 dimBlock(CUBLOCK, CUBLOCK);
-    dim3 dimGrid(n_blocks(weight->NumCols(), CUBLOCK), n_blocks(weight->NumRows(), CUBLOCK));
+    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
+    dim3 dimGrid(n_blocks(weight->NumCols(), CU2DBLOCK), n_blocks(weight->NumRows(), CU2DBLOCK));

    cuda_regularize_l1(dimGrid, dimBlock, weight->data_, grad->data_, l1, lr, weight->Dim());
-    cuSafeCall(cudaGetLastError());
+    CU_SAFE_CALL(cudaGetLastError());
    
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
@ -77,21 +77,21 @@ void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1,

 template<typename Real>
 void Randomize(const CuMatrixBase<Real> &src,
-               const CuStlVector<int32> &copy_from_idx,
+               const CuArray<int32> &copy_from_idx,
               CuMatrixBase<Real> *tgt) {

  KALDI_ASSERT(src.NumCols() == tgt->NumCols());
  KALDI_ASSERT(src.NumRows() == tgt->NumRows());
  KALDI_ASSERT(copy_from_idx.Dim() <= tgt->NumRows());

-  #if HAVE_CUDA==1
+  #if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    
    /*
    Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535 
-    dim3 dimBlock(CUBLOCK, CUBLOCK);
-    dim3 dimGrid(n_blocks(tgt->NumCols(), CUBLOCK), n_blocks(copy_from_idx.Dim(), CUBLOCK));
+    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
+    dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK));
    */

    /*
@ -108,7 +108,7 @@ void Randomize(const CuMatrixBase<Real> &src,
    MatrixDim dimtgt = tgt->Dim(); dimtgt.rows=copy_from_idx.Dim();

    cuda_randomize(dimGrid, dimBlock, tgt->data_, src.data_, copy_from_idx.Data(), dimtgt, dimsrc);
-    cuSafeCall(cudaGetLastError());
+    CU_SAFE_CALL(cudaGetLastError());
    
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
@ -116,7 +116,7 @@ void Randomize(const CuMatrixBase<Real> &src,
  {
    // randomize in CPU
    const MatrixBase<Real> &srcmat = src.Mat();
-    const std::vector<int32> &copy_from_idxvec = copy_from_idx.Vec();
+    const int32 *copy_from_idxvec = copy_from_idx.Data();
    MatrixBase<Real> &tgtmat = tgt->Mat();
    for(int32 i=0; i<copy_from_idx.Dim(); i++) {
      tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i]));
@ -127,20 +127,20 @@ void Randomize(const CuMatrixBase<Real> &src,


 template<typename Real>
-void Splice(const CuMatrix<Real> &src, const CuStlVector<int32> &frame_offsets, CuMatrix<Real> *tgt) {
+void Splice(const CuMatrix<Real> &src, const CuArray<int32> &frame_offsets, CuMatrix<Real> *tgt) {

  KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols());
  KALDI_ASSERT(src.NumRows() == tgt->NumRows());

-  #if HAVE_CUDA==1
+  #if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    
-    dim3 dimBlock(CUBLOCK, CUBLOCK);
-    dim3 dimGrid(n_blocks(tgt->NumCols(), CUBLOCK), n_blocks(tgt->NumRows(), CUBLOCK));
+    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
+    dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
    
    cuda_splice(dimGrid, dimBlock, tgt->data_, src.data_, frame_offsets.Data(), tgt->Dim(), src.Dim());
-    cuSafeCall(cudaGetLastError());
+    CU_SAFE_CALL(cudaGetLastError());
    
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
@ -148,11 +148,12 @@ void Splice(const CuMatrix<Real> &src, const CuStlVector<int32> &frame_offsets,
  {
    // expand in CPU
    const MatrixBase<Real> &srcmat = src.Mat();
-    const std::vector<int32> &frame_offsetvec = frame_offsets.Vec();
+    const int32 *frame_offsetvec = frame_offsets.Data();
+    int32 dim = frame_offsets.Dim();
    MatrixBase<Real> &tgtmat = tgt->Mat();
    //
    for(int32 r=0; r < tgtmat.NumRows(); r++) {
-      for(int32 off=0; off < static_cast<int32>(frame_offsetvec.size()); off++) {
+      for(int32 off=0; off < dim; off++) {
        int32 r_off = r + frame_offsetvec[off];
        if(r_off < 0) r_off = 0;
        if(r_off >= srcmat.NumRows()) r_off = srcmat.NumRows()-1;
@ -165,20 +166,20 @@ void Splice(const CuMatrix<Real> &src, const CuStlVector<int32> &frame_offsets,


 template<typename Real>
-void Copy(const CuMatrix<Real> &src, const CuStlVector<int32> &copy_from_indices, CuMatrix<Real> *tgt) { 
+void Copy(const CuMatrix<Real> &src, const CuArray<int32> &copy_from_indices, CuMatrix<Real> *tgt) { 

  KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols());
  KALDI_ASSERT(src.NumRows() == tgt->NumRows());

-  #if HAVE_CUDA==1
+  #if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    
-    dim3 dimBlock(CUBLOCK, CUBLOCK);
-    dim3 dimGrid(n_blocks(tgt->NumCols(), CUBLOCK), n_blocks(tgt->NumRows(), CUBLOCK));
+    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
+    dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
    
    cuda_copy(dimGrid, dimBlock, tgt->data_, src.data_, copy_from_indices.Data(), tgt->Dim(), src.Dim());
-    cuSafeCall(cudaGetLastError());
+    CU_SAFE_CALL(cudaGetLastError());
    
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
@ -186,11 +187,12 @@ void Copy(const CuMatrix<Real> &src, const CuStlVector<int32> &copy_from_indices
  {
    // expand in CPU
    const MatrixBase<Real> &srcmat = src.Mat();
-    const std::vector<int32> &copy_from_indicesvec = copy_from_indices.Vec();
+    const int32 *copy_from_indicesvec = copy_from_indices.Data();
+    int32 dim = copy_from_indices.Dim();
    MatrixBase<Real> &tgtmat = tgt->Mat();
    //
-    for(int32 r=0; r < tgtmat.NumRows(); r++) {
-      for(int32 c=0; c < static_cast<int32>(copy_from_indicesvec.size()); c++) {
+    for(int32 r = 0; r < tgtmat.NumRows(); r++) {
+      for(int32 c = 0; c < dim; c++) {
        tgtmat(r,c) = srcmat(r,copy_from_indicesvec[c]);
      }
    }
@ -204,21 +206,21 @@ template
 void RegularizeL1(CuMatrixBase<double> *weight, CuMatrixBase<double> *grad, double l1, double lr);

 template
-void Splice(const CuMatrix<float> &src, const CuStlVector<int32> &frame_offsets, CuMatrix<float> *tgt);
+void Splice(const CuMatrix<float> &src, const CuArray<int32> &frame_offsets, CuMatrix<float> *tgt);
 template
-void Splice(const CuMatrix<double> &src, const CuStlVector<int32> &frame_offsets, CuMatrix<double> *tgt);
+void Splice(const CuMatrix<double> &src, const CuArray<int32> &frame_offsets, CuMatrix<double> *tgt);
 template
-void Copy(const CuMatrix<float> &src, const CuStlVector<int32> &copy_from_indices, CuMatrix<float> *tgt);
+void Copy(const CuMatrix<float> &src, const CuArray<int32> &copy_from_indices, CuMatrix<float> *tgt);
 template
-void Copy(const CuMatrix<double> &src, const CuStlVector<int32> &copy_from_indices, CuMatrix<double> *tgt);
+void Copy(const CuMatrix<double> &src, const CuArray<int32> &copy_from_indices, CuMatrix<double> *tgt);

 template
 void Randomize(const CuMatrixBase<float> &src,
-               const CuStlVector<int32> &copy_from_idx,
+               const CuArray<int32> &copy_from_idx,
               CuMatrixBase<float> *tgt);
 template
 void Randomize(const CuMatrixBase<double> &src,
-               const CuStlVector<int32> &copy_from_idx,
+               const CuArray<int32> &copy_from_idx,
               CuMatrixBase<double> *tgt);


--- a/src/cudamatrix/cu-math.h
+++ b/src/cudamatrix/cu-math.h
@ -1,6 +1,7 @@
 // cudamatrix/cu-math.h

 // Copyright 2009-2012  Karel Vesely
+//                2013  Johns Hopkins University (Author: David Snyder) 

 // See ../../COPYING for clarification regarding multiple authors
 //
@ -22,7 +23,7 @@
 #ifndef KALDI_CUDAMATRIX_CU_MATH_H_
 #define KALDI_CUDAMATRIX_CU_MATH_H_
 #include "cudamatrix/cu-common.h"
-#include "cudamatrix/cu-stlvector.h"
+#include "cudamatrix/cu-array.h"
 #include "cudamatrix/cu-device.h"
 #include "util/timer.h"

@ -38,21 +39,38 @@ template<typename Real>
 void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *gradient,
                  Real l1_penalty, Real learning_rate);

-/// ie. switch rows according to copy_from_idx
+/// Copies rows of src into tgt in the order given by copy_from_idx, such that
+/// tgt.Row(r) == src.Row(copy_from_idx[r]) for each r < copy_from_idx.Dim().
+/// src and tgt must have the same dimensions, copy_from_idx.Dim() must not
+/// exceed the number of rows in tgt, and all elements of copy_from_idx must
+/// be in [0, src.NumRows()-1].
 template<typename Real>
 void Randomize(const CuMatrixBase<Real> &src,
-               const CuStlVector<int32> &copy_from_idx,
+               const CuArray<int32> &copy_from_idx,
               CuMatrixBase<Real> *tgt);

-/// ie. concatenate the frames with offsets from frame_offsets
+/// Splice concatenates frames of src as specified in frame_offsets into tgt.
+/// tgt must have the same number of rows as src, and it must be that
+/// tgt.NumCols() == src.NumCols() * frame_offsets.Dim().
+/// As a result, tgt(i, k*n_cols + j) == src(i + frame_offsets[k], j) for the
+/// general case where i in [0..src.NumRows()-1],
+/// k in [0..frame_offsets.Dim()-1], j in [0..src.NumCols()-1]
+/// and n_cols = src.NumCols(). If i + frame_offsets[k] is greater than or
+/// equal to the number of rows in src, or less than 0, then the right side of
+/// the equation is replaced by src(src.NumRows()-1, j) or src(0, j)
+/// respectively, to avoid an index out of bounds.
 template<typename Real>
 void Splice(const CuMatrix<Real> &src,
-            const CuStlVector<int32> &frame_offsets,
+            const CuArray<int32> &frame_offsets,
            CuMatrix<Real> *tgt);

+/// Copies columns from src into tgt as given by copy_from_indices.
+/// The matrices src and tgt must have the same number of rows, and
+/// the dimension of copy_from_indices must equal the number of columns
+/// in the tgt matrix. As a result, tgt(i, j) == src(i, copy_from_indices[j]).
+/// Also see CuMatrix::CopyCols(), which is more general.
 template<typename Real>
 void Copy(const CuMatrix<Real> &src,
-          const CuStlVector<int32> &copy_from_indices,
+          const CuArray<int32> &copy_from_indices,
          CuMatrix<Real> *tgt);


--- a/src/cudamatrix/cu-matrix-inl.h
+++ b/src/cudamatrix/cu-matrix-inl.h
@ -24,7 +24,7 @@

 namespace kaldi {

-template<class Real>
+template<typename Real>
 inline CuSubMatrix<Real>::CuSubMatrix(const CuMatrixBase<Real> &mat,
                                      const MatrixIndexT row_offset,
                                      const MatrixIndexT num_rows,
--- a/src/cudamatrix/cu-matrix-lib.h
+++ b/src/cudamatrix/cu-matrix-lib.h
@ -1,31 +1,32 @@
-// matrix/packed-matrix-inl.h
+// cudamatrix/cu-matrix-lib.h

-// Copyright 2009-2011  Ondrej Glembek;  Microsoft Corporation;  Lukas Burget;
-//                      Saarland University;  Yanmin Qian;   Jan Silovsky;
-//                      Haihua Xu
+// Copyright 2013   Johns Hopkins University (author: Daniel Povey)

 // See ../../COPYING for clarification regarding multiple authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
-
+//
 //  http://www.apache.org/licenses/LICENSE-2.0
-
+//
 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
 // MERCHANTABLITY OR NON-INFRINGEMENT.
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
-#ifndef KALDI_MATRIX_PACKED_MATRIX_INL_H_
-#define KALDI_MATRIX_PACKED_MATRIX_INL_H_
-
-namespace kaldi {



-}  // namespace kaldi
+#ifndef KALDI_CUDAMATRIX_CU_MATRIX_LIB_H_
+#define KALDI_CUDAMATRIX_CU_MATRIX_LIB_H_
+
+#include "cudamatrix/cu-vector.h"
+#include "cudamatrix/cu-matrix.h"
+#include "cudamatrix/cu-sp-matrix.h"
+#include "cudamatrix/cu-tp-matrix.h"
+#include "cudamatrix/cu-block-matrix.h"
+#include "cudamatrix/cu-rand.h"

 #endif
-
--- a/src/cudamatrix/cu-matrix-speed-test.cc
+++ b/src/cudamatrix/cu-matrix-speed-test.cc
@ -0,0 +1,196 @@
+// cudamatrix/cu-matrix-speed-test.cc
+
+// Copyright 2013  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "cudamatrix/cu-matrix.h"
+#include "cudamatrix/cu-vector.h"
+#include "cudamatrix/cu-math.h"
+
+using namespace kaldi;
+
+
+namespace kaldi {
+
+// Returns the tag appended to the log messages to identify the precision
+// under test: "<double>" when Real occupies 8 bytes, "<float>" otherwise.
+template<typename Real>
+std::string NameOf() {
+  if (sizeof(Real) == 8)
+    return "<double>";
+  return "<float>";
+}
+    
+// Benchmarks CuMatrix::AddMatMat on square dim x dim matrices: repeats the
+// product until about 0.05 seconds have elapsed, then logs the rate, counting
+// dim^3 operations per call.
+template<typename Real> void TestCuMatrixMatMat(int32 dim) {
+  const BaseFloat time_limit = 0.05;  // in seconds
+  CuMatrix<Real> A(dim, dim), B(dim, dim), C(dim, dim);
+  A.SetRandn();
+  B.SetRandn();
+  Timer timer;
+  int32 num_iters = 0;
+  while (timer.Elapsed() < time_limit) {
+    C.AddMatMat(1.0, A, kNoTrans, B, kNoTrans, 0.0);
+    num_iters++;
+  }
+
+  const BaseFloat d = dim;
+  const BaseFloat gflops =
+      (d * d * d * num_iters) / (timer.Elapsed() * 1.0e+09);
+  KALDI_LOG << "For CuMatrix::AddMatMat" << NameOf<Real>() << ", for dim = "
+            << dim << ", speed was " << gflops << " gigaflops.";
+}
+
+// Benchmarks CuMatrix::Sigmoid on dim x dim matrices: repeats the call until
+// about 0.05 seconds have elapsed, then logs the rate, counting one operation
+// per matrix element per call.
+template<typename Real> void TestCuMatrixSigmoid(int32 dim) {
+  const BaseFloat time_limit = 0.05;  // in seconds
+  CuMatrix<Real> input(dim, dim), output(dim, dim);
+  input.SetRandn();
+  output.SetRandn();
+  Timer timer;
+  int32 num_iters = 0;
+  while (timer.Elapsed() < time_limit) {
+    output.Sigmoid(input);
+    num_iters++;
+  }
+
+  const BaseFloat d = dim;
+  const BaseFloat gflops = (d * d * num_iters) / (timer.Elapsed() * 1.0e+09);
+  KALDI_LOG << "For CuMatrix::Sigmoid" << NameOf<Real>() << ", for dim = "
+            << dim << ", speed was " << gflops << " gigaflops.";
+}
+
+
+// Benchmarks CuMatrix::ApplySoftMaxPerRow on matrices with a fixed 256 rows
+// and "dim" columns: repeats the call until about 0.05 seconds have elapsed,
+// then logs the rate, counting one operation per matrix element per call.
+template<typename Real> void TestCuMatrixSoftmax(int32 dim) {
+  BaseFloat time_in_secs = 0.05;
+  const int32 num_rows = 256;
+  CuMatrix<Real> M(num_rows, dim), N(num_rows, dim);
+  M.SetRandn();
+  N.SetRandn();
+  Timer tim;
+  int32 iter = 0;
+  for (;tim.Elapsed() < time_in_secs; iter++) {
+    N.ApplySoftMaxPerRow(M);
+  }
+
+  // The matrices are num_rows x dim (not dim x dim), so the amount of work is
+  // proportional to num_rows * dim.
+  BaseFloat fdim = dim, frows = num_rows;
+  BaseFloat gflops = (frows * fdim * iter) / (tim.Elapsed() * 1.0e+09);
+  KALDI_LOG << "For CuMatrix::Softmax" << NameOf<Real>() << ", for dim = "
+            << dim << ", speed was " << gflops << " gigaflops.";
+}
+
+// Benchmarks TraceMatMat on dim x dim matrices, once with trans == kNoTrans
+// and once with trans == kTrans.  Each variant runs for about 0.08 seconds and
+// logs its own rate, counting dim^2 operations per call.
+template<typename Real> void TestCuMatrixTraceMatMat(int32 dim) {
+  const MatrixTransposeType variants[2] = { kNoTrans, kTrans };
+  for (int32 v = 0; v < 2; v++) {
+    const MatrixTransposeType trans = variants[v];
+    const BaseFloat time_limit = 0.08;  // in seconds
+
+    CuMatrix<Real> A(dim, dim), B(dim, dim);
+    A.SetRandn();
+    B.SetRandn();
+    Timer timer;
+    int32 num_iters = 0;
+    while (timer.Elapsed() < time_limit) {
+      TraceMatMat(A, B, trans);  // result is discarded; only speed matters.
+      num_iters++;
+    }
+    const BaseFloat d = dim;
+    const BaseFloat gflops =
+        (d * d * num_iters) / (timer.Elapsed() * 1.0e+09);
+    KALDI_LOG << "For CuMatrix::TraceMatMat" << NameOf<Real>()
+              << (trans == kTrans ? " [transposed]" : "") << ", for dim = "
+              << dim << ", speed was " << gflops << " gigaflops.";
+  }
+}
+
+// Benchmarks CuMatrix::CopyLowerToUpper on a dim x dim matrix for about 0.05
+// seconds, checks that the result equals its own transpose, and logs the
+// rate, counting dim^2 operations per call.
+template<typename Real> void TestCuMatrixCopyLowerToUpper(int32 dim) {
+  BaseFloat time_in_secs = 0.05;
+  CuMatrix<Real> M(dim, dim);
+  M.SetRandn();
+  Timer tim;
+  int32 iter = 0;
+  for (; tim.Elapsed() < time_in_secs; iter++) {
+    M.CopyLowerToUpper();
+  }
+  // Read the timer before the check below, so that the transposed copy and
+  // the comparison are not counted as part of the benchmarked operation.
+  double elapsed = tim.Elapsed();
+  CuMatrix<Real> M2(M, kTrans);
+  AssertEqual(M, M2);
+  BaseFloat fdim = dim;
+  BaseFloat gflops = (fdim * fdim * iter) / (elapsed * 1.0e+09);
+  KALDI_LOG << "For CuMatrix::CopyLowerToUpper" << NameOf<Real>() << ", for dim = "
+            << dim << ", speed was " << gflops << " gigaflops.";
+}
+
+
+// Benchmarks CuMatrix::CopyUpperToLower on a dim x dim matrix for about 0.05
+// seconds, checks that the result equals its own transpose, and logs the
+// rate, counting dim^2 operations per call.
+template<typename Real> void TestCuMatrixCopyUpperToLower(int32 dim) {
+  BaseFloat time_in_secs = 0.05;
+  CuMatrix<Real> M(dim, dim);
+  M.SetRandn();
+  Timer tim;
+  int32 iter = 0;
+  for (; tim.Elapsed() < time_in_secs; iter++) {
+    M.CopyUpperToLower();
+  }
+  // Read the timer before the check below, so that the transposed copy and
+  // the comparison are not counted as part of the benchmarked operation.
+  double elapsed = tim.Elapsed();
+  CuMatrix<Real> M2(M, kTrans);
+  AssertEqual(M, M2);
+  BaseFloat fdim = dim;
+  BaseFloat gflops = (fdim * fdim * iter) / (elapsed * 1.0e+09);
+  KALDI_LOG << "For CuMatrix::CopyUpperToLower" << NameOf<Real>() << ", for dim = "
+            << dim << ", speed was " << gflops << " gigaflops.";
+}
+
+
+// Runs every benchmark in this file for the given Real type.  Each benchmark
+// is run for all the dimensions 16, 128, 256 and 1024 before moving on to the
+// next benchmark.
+template<typename Real> void CudaMatrixSpeedTest() {
+  const int32 dims[] = { 16, 128, 256, 1024 };
+  const int32 num_dims = sizeof(dims) / sizeof(dims[0]);
+
+  for (int32 i = 0; i < num_dims; i++) TestCuMatrixMatMat<Real>(dims[i]);
+  for (int32 i = 0; i < num_dims; i++) TestCuMatrixSigmoid<Real>(dims[i]);
+  for (int32 i = 0; i < num_dims; i++) TestCuMatrixSoftmax<Real>(dims[i]);
+  for (int32 i = 0; i < num_dims; i++) TestCuMatrixTraceMatMat<Real>(dims[i]);
+  for (int32 i = 0; i < num_dims; i++)
+    TestCuMatrixCopyLowerToUpper<Real>(dims[i]);
+  for (int32 i = 0; i < num_dims; i++)
+    TestCuMatrixCopyUpperToLower<Real>(dims[i]);
+}
+
+
+} // namespace kaldi
+
+
+// Entry point: runs the float benchmarks, then the double benchmarks (when
+// compiled with CUDA, only if the device reports double-precision support),
+// and, when compiled with CUDA, prints the device profile at the end.
+int main() {
+  // Select the GPU ("yes" is interpreted by CuDevice::SelectGpuId).
+#if HAVE_CUDA == 1
+  CuDevice::Instantiate().SelectGpuId("yes");
+#endif
+
+  kaldi::CudaMatrixSpeedTest<float>();
+
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().DoublePrecisionSupported()) {
+    kaldi::CudaMatrixSpeedTest<double>();
+  } else {
+    KALDI_WARN << "Double precision not supported";
+  }
+  CuDevice::Instantiate().PrintProfile();
+#else
+  kaldi::CudaMatrixSpeedTest<double>();
+#endif
+  std::cout << "Tests succeeded.\n";
+  return 0;
+}
--- a/src/cudamatrix/cu-matrix-test.cc
+++ b/src/cudamatrix/cu-matrix-test.cc
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
--- a/src/cudamatrix/cu-matrix.h
+++ b/src/cudamatrix/cu-matrix.h
@ -1,7 +1,10 @@
 // cudamatrix/cu-matrix.h

 // Copyright 2009-2012  Karel Vesely
-//                      Johns Hopkins University (author: Daniel Povey)
+//                2013  Johns Hopkins University (author: Daniel Povey)
+//                2013  Hainan Xu
+//                2013  Xiaohui Zhang
+//                2013  Johns Hopkins University (author: Guoguo Chen)

 // See ../../COPYING for clarification regarding multiple authors
 //
@ -27,14 +30,18 @@

 #include "cudamatrix/cu-matrixdim.h"
 #include "cudamatrix/cu-common.h"
+#include "cudamatrix/cu-value.h"
 #include "matrix/matrix-common.h"
 #include "matrix/kaldi-matrix.h"
-#include "cudamatrix/cu-stlvector.h"
+#include "cudamatrix/cu-array.h"
 #include "cudamatrix/cu-math.h"
+#include "cudamatrix/cu-rand.h"

 namespace kaldi {

-
+/// Forward declaration, needed because CuMatrixBase below declares this
+/// function as a friend and calls it from FrobeniusNorm().
+/// NOTE(review): presumably returns trace(A * B), or trace(A * B^T) when
+/// trans == kTrans -- confirm against its definition.
+template<typename Real>
+Real TraceMatMat(const CuMatrixBase<Real> &A, const CuMatrixBase<Real> &B,
+                 MatrixTransposeType trans = kNoTrans);
 /**
 * Matrix for CUDA computing.
 * Does the computation on the CUDA card when CUDA is compiled in and
@ -42,23 +49,77 @@ namespace kaldi {
 * otherwise, does it on the CPU.
 */

+/*
+template<typename Real>
+struct MatrixElement {
+  int row;
+  int column;
+  Real weight;
+};
+// */
+
 template<typename Real>
 class CuMatrixBase {
 public:
+  friend class CuMatrixBase<float>;
+  friend class CuMatrixBase<double>;
+  friend class CuVectorBase<float>;
+  friend class CuVectorBase<double>;
+  friend class VectorBase<Real>;
+  friend class CuSpMatrix<Real>;
+  friend class CuTpMatrix<float>;
+  friend class CuTpMatrix<double>;
  friend class CuVectorBase<Real>;
  friend class CuSubMatrix<Real>;
  friend class CuRand<Real>;
+  friend class CuSubVector<Real>;
+  friend class CuBlockMatrix<Real>;
  friend void cu::RegularizeL1<Real>(CuMatrixBase<Real> *weight,
                                     CuMatrixBase<Real> *grad, Real l1, Real lr);
  friend void cu::Splice<Real>(const CuMatrix<Real> &src,
-                               const CuStlVector<int32> &frame_offsets,
+                               const CuArray<int32> &frame_offsets,
                               CuMatrix<Real> *tgt);
  friend void cu::Copy<Real>(const CuMatrix<Real> &src,
-                             const CuStlVector<int32> &copy_from_indices,
+                             const CuArray<int32> &copy_from_indices,
                             CuMatrix<Real> *tgt);
  friend void cu::Randomize<Real>(const CuMatrixBase<Real> &src,
-                                  const CuStlVector<int32> &copy_from_idx,
+                                  const CuArray<int32> &copy_from_idx,
                                  CuMatrixBase<Real> *tgt);
+
+  /// Copies column r from column indices[r] of src.
+  /// As a special case, if indices[r] == -1, sets column r to zero.
+  /// indices.size() must equal this->NumCols(),
+  /// all elements of "indices" must be in [-1, src.NumCols()-1],
+  /// and src.NumRows() must equal this->NumRows().
+  void CopyCols(const CuMatrixBase<Real> &src,
+                const std::vector<MatrixIndexT> &indices);
+
+  /// Version of CopyCols that takes CuArray argument.
+  void CopyCols(const CuMatrixBase<Real> &src,
+                const CuArray<MatrixIndexT> &indices);
+
+  
+  /// Copies row r from row indices[r] of src.
+  /// As a special case, if indices[r] == -1, sets row r to zero.
+  /// indices.size() must equal this->NumRows(),
+  /// all elements of "indices" must be in [-1, src.NumRows()-1],
+  /// and src.NumCols() must equal this->NumCols().
+  void CopyRows(const CuMatrixBase<Real> &src,
+                const std::vector<MatrixIndexT> &indices);
+
+
+  /// For each row r of this and for each column c, sets (*this)(r, c) to the
+  /// sum \sum_j src(r, j), where j ranges from indices[c].first through
+  /// indices[c].second - 1.
+  void SumColumnRanges(const CuMatrixBase<Real> &src,
+                       const CuArray<Int32Pair> &indices);
+
+
+  friend Real TraceMatMat<Real>(const CuMatrixBase<Real> &A,
+                                const CuMatrixBase<Real> &B,
+                                MatrixTransposeType trans);
+
+  void AddToDiag(Real value);
  
  /// Dimensions
  MatrixIndexT NumRows() const { return num_rows_;  }
@ -72,26 +133,66 @@ class CuMatrixBase {
    return d; 
  }

+  /// Returns the Frobenius norm of this matrix, computed as
+  /// sqrt(TraceMatMat(*this, *this, kTrans)).
+  Real FrobeniusNorm() const {
+    const Real trace = TraceMatMat(*this, *this, kTrans);
+    return sqrt(trace);
+  }
+
+  bool IsUnit(Real tol = 0.001) const;  
+
+  bool ApproxEqual(const CuMatrixBase<Real> &other, float tol = 0.01) const;
+  
  /// Get size of matrix in bytes
  MatrixIndexT SizeInBytes() const { return num_rows_*stride_*sizeof(Real); }
-
-  /// Get size of matrix row in bytes
-  MatrixIndexT RowSizeInBytes() const { return num_cols_*sizeof(Real); }
  
-  /// Get size of matrix stride in bytes
-  MatrixIndexT StrideSizeInBytes() const { return stride_*sizeof(Real); }
+  // Copy functions.  These do not resize.
+  template<typename OtherReal>
+  void CopyFromMat(const MatrixBase<OtherReal> &src,
+                   MatrixTransposeType trans = kNoTrans);

+  void CopyFromMat(const MatrixBase<Real> &src,
+                   MatrixTransposeType trans = kNoTrans);
  
-  /// Copy functions (reallocates when needed, but note from Dan: eventually
-  /// I'll change it to just die if the sizes don't match, like the Matrix class.)
-  void CopyFromMat(const CuMatrixBase<Real> &src);
-  void CopyFromMat(const MatrixBase<Real> &src);
-  void CopyToMat(MatrixBase<Real> *dst) const;
+  void CopyFromSp(const CuSpMatrix<Real> &M);
+  
+  template<typename OtherReal>
+  void CopyFromTp(const CuTpMatrix<OtherReal> &M,
+                  MatrixTransposeType trans = kNoTrans);
+  
+  template<typename OtherReal>
+  void CopyFromMat(const CuMatrixBase<OtherReal> &M,
+                   MatrixTransposeType trans = kNoTrans); 
+
+  template<typename OtherReal>
+  void CopyToMat(MatrixBase<OtherReal> *dst,
+                 MatrixTransposeType trans = kNoTrans) const;
+  
+  void CopyRowsFromVec(const CuVectorBase<Real> &v);
+
+  void CopyRowsFromVec(const VectorBase<Real> &v);
+  
+  /// Copy vector into specific column of matrix.
+  void CopyColFromVec(const CuVectorBase<Real> &v, const MatrixIndexT col);

  /// Set each element to the sigmoid of the corresponding element of "src":
-  /// element by element, *this = 1 / (1 + exp(-src)).
+  /// element by element, *this = 1 / (1 + exp(-src)).
  void Sigmoid(const CuMatrixBase<Real> &src);

+  /// Apply the function y = log(1 + exp(x)), to each element.
+  /// Note: the derivative of this function is the sigmoid function.
+  /// This is like a soft ReLU.
+  void SoftHinge(const CuMatrixBase<Real> &src);
+
+  /// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j ^ pow) ^ (1 / pow)
+  /// where x is "src", y is *this, and G = x.NumCols() / y.NumCols() must be
+  /// an integer.
+  void GroupPnorm(const CuMatrixBase<Real> &src, Real pow);
+
+  /// Calculate derivatives for the GroupPnorm function above...
+  /// if "input" is the input to the GroupPnorm function above (i.e. the "src" variable),
+  /// and "output" is the result of the computation (i.e. the "this" of that function
+  /// call), and *this has the same dimension as "input", then it sets each element
+  /// of *this to the derivative d(output-elem)/d(input-elem) for each element of "input", where
+  /// "output-elem" is whichever element of output depends on that input element.
+  void GroupPnormDeriv(const CuMatrixBase<Real> &input,
+                       const CuMatrixBase<Real> &output, Real power);
+  
  /// Compute the hyperbolic tangent (tanh) function; element by element,
  /// *this = tanh(src).
  void Tanh(const CuMatrixBase<Real> &src);
@ -105,7 +206,7 @@ class CuMatrixBase {
  /// tanh output.  Does, element-by-element, *this = diff * (1 - value^2).
  void DiffTanh(const CuMatrixBase<Real> &value,
                const CuMatrixBase<Real> &diff);
-
+  
  /// Differentiate the block [softmax+cross-entropy] :
  /// dE/da = posterior_mat - target_mat, 
  /// 'E' is error function, 'a' is activation on softmax input
@ -115,16 +216,30 @@ class CuMatrixBase {
  /// net_out_or_diff ... before invocation net output, after diff dE/da
  /// log_post_tgt ... per-frame statistics for cross-entropy computations :
  ///                  log(sum_row(posterior_mat .* target_mat))
-  void DiffXent(const CuStlVector<int32> &tgt,
+  void DiffXent(const CuArray<int32> &tgt,
                CuVector<Real> *log_post_tgt);  
+
+  /// This method may be only called for symmetric matrices (it accesses the
+  /// upper as well as lower triangle).  The result is put in the lower
+  /// triangle, and the upper triangle zeroed.
+  void Cholesky();
  
+  void SymInvertPosDef(); ///< Inversion for positive definite symmetric matrices.
+                          ///< Requires that the input is symmetric (we do not check this).
+                          ///< The output is symmetric.
+  
+  void ApplyPow(Real power);
+  void ApplyHeaviside(); ///< For each element, sets x = (x > 0 ? 1.0 : 0.0)
+  void ApplyFloor(Real floor_val);
+  void ApplyCeiling(Real ceiling_val);
+  void ApplyExp();
  /// Softmax nonlinearity
-  /// Y = Softmax(X) : Yij = e^Xij / sum_k(e^Xik)
+  /// Y = Softmax(X) : Yij = e^Xij / sum_k(e^Xik), done to each row
  /// for each row, the max value is first subtracted for good numerical stability
-  void Softmax(const CuMatrixBase<Real> &src);
+  void ApplySoftMaxPerRow(const CuMatrixBase<Real> &src);

  /// Find the id of the maximal element for each row
-  void FindRowMaxId(CuStlVector<int32> *id) const;
+  void FindRowMaxId(CuArray<int32> *id) const;
  
  /*
  // Copy row interval from matrix
@ -139,27 +254,90 @@ class CuMatrixBase {
  void SetZero();
  void Set(Real value);
  void Add(Real value);
+  void SetZeroUpperDiag();
  void Scale(Real value);
  void ApplyLog();
-  /// Multiply two matrices elementhwise: C = A .* C
-  void MulElements(const CuMatrixBase<Real>& A);
+  
+  /// Multiply two matrices elementwise: C = A .* C
+  void MulElements(const CuMatrixBase<Real> &A);
+  /// Do, elementwise, *this = max(*this, A).
+  void Max(const CuMatrixBase<Real> &A);
  /// scale i'th column by scale[i]
  void MulColsVec(const CuVectorBase<Real> &scale); 
  /// scale i'th row by scale[i]
-  void MulRowsVec(const CuVectorBase<Real> &scale); 
+  void MulRowsVec(const CuVectorBase<Real> &scale);
+  /// divide each row into src.NumCols() groups, and then scale i'th row's jth group of elements by src[i, j].   
+  void MulRowsGroupMat(const CuMatrixBase<Real> &src);
  /// divide i'th row by scale[i]
  void DivRowsVec(const CuVectorBase<Real> &div);
  /// B = aplha * A + beta * B
-  void AddMat(Real alpha, const CuMatrixBase<Real>& A, Real beta=1.0);
+  void AddMat(Real alpha, const CuMatrixBase<Real> &A, Real beta=1.0);
  /// B = aplha * row + beta * B
-  void AddVecToCols(Real alpha, const CuVectorBase<Real> &col, Real beta=1.0);
+  void AddVecToCols(Real alpha, const CuVectorBase<Real> &col, Real beta = 1.0);
  /// B = aplha * row + beta * B
-  void AddVecToRows(Real alpha, const CuVectorBase<Real> &row, Real beta=1.0);
+  void AddVecToRows(Real alpha, const CuVectorBase<Real> &row, Real beta = 1.0);
  /// C = alpha * A(^T)*B(^T) + beta * C
-  void AddMatMat(Real alpha, const CuMatrixBase<Real>& A, MatrixTransposeType transA,
-                 const CuMatrixBase<Real>& B, MatrixTransposeType transB, Real beta);
+  void AddMatMat(Real alpha, const CuMatrixBase<Real> &A, MatrixTransposeType transA,
+                 const CuMatrixBase<Real> &B, MatrixTransposeType transB, Real beta);

+  /// *this = beta * *this + alpha * M M^T, for symmetric matrices.  It only
+  /// updates the lower triangle of *this.  It will leave the matrix asymmetric;
+  /// if you need it symmetric as a regular matrix, do CopyLowerToUpper().
+  void SymAddMat2(const Real alpha, const CuMatrixBase<Real> &M,
+                  MatrixTransposeType transA, Real beta);

+  
+  /// This function is like AddMatMat but for where the second argument is of
+  /// type CuBlockMatrix (a block-diagonal matrix of blocks).
+  void AddMatBlock(Real alpha, const CuMatrixBase<Real> &A, MatrixTransposeType transA,
+                   const CuBlockMatrix<Real> &B, MatrixTransposeType transB, Real beta);
+  
+  /// *this = beta * *this + alpha * diag(v) * M [or M^T].
+  /// The same as adding M but scaling each row M_i by v(i).
+  void AddDiagVecMat(const Real alpha, CuVectorBase<Real> &v,
+                     const CuMatrixBase<Real> &M, MatrixTransposeType transM, 
+                     Real beta = 1.0);  
+  
+  /// this <-- beta*this + alpha*A*B, where B is a CuSpMatrix.
+  /// B is first copied into a temporary full CuMatrix and the product is then
+  /// done with AddMatMat, so every call pays for that extra copy.
+  void AddMatSp(const Real alpha,
+                const CuMatrixBase<Real> &A, MatrixTransposeType transA,
+                const CuSpMatrix<Real> &B,
+                const Real beta) {
+    const CuMatrix<Real> full_B(B);
+    AddMatMat(alpha, A, transA, full_B, kNoTrans, beta);
+  }
+
+  /// this <-- beta*this + alpha*SpA*B, where A is a CuSpMatrix.
+  /// A is first copied into a temporary full CuMatrix and the product is then
+  /// done with AddMatMat, so every call pays for that extra copy.
+  void AddSpMat(const Real alpha,
+                const CuSpMatrix<Real> &A,
+                const CuMatrixBase<Real> &B, MatrixTransposeType transB,
+                const Real beta) {
+    const CuMatrix<Real> full_A(A);
+    AddMatMat(alpha, full_A, kNoTrans, B, transB, beta);
+  }
+
+  /// this <-- beta*this + alpha*A*B, where A is a CuTpMatrix; transA and
+  /// transB are passed through to AddMatMat.
+  /// A is first copied into a temporary full CuMatrix, so every call pays for
+  /// that extra copy.
+  void AddTpMat(const Real alpha,
+                const CuTpMatrix<Real> &A, MatrixTransposeType transA,
+                const CuMatrixBase<Real> &B, MatrixTransposeType transB,
+                const Real beta) {
+    const CuMatrix<Real> full_A(A);
+    AddMatMat(alpha, full_A, transA, B, transB, beta);
+  }
+
+  /// this <-- beta*this + alpha*A*B, where B is a CuTpMatrix; transA and
+  /// transB are passed through to AddMatMat.
+  /// B is first copied into a temporary full CuMatrix, so every call pays for
+  /// that extra copy.
+  void AddMatTp(const Real alpha,
+                const CuMatrixBase<Real> &A, MatrixTransposeType transA,
+                const CuTpMatrix<Real> &B, MatrixTransposeType transB,
+                const Real beta) {
+    const CuMatrix<Real> full_B(B);
+    AddMatMat(alpha, A, transA, full_B, transB, beta);
+  }
+
+  void CopyFromBlock(const CuBlockMatrix<Real> &B,
+                     MatrixTransposeType trans = kNoTrans);
+  void CopyLowerToUpper();
+  void CopyUpperToLower();
  inline CuSubMatrix<Real> Range(const MatrixIndexT row_offset,
                                 const MatrixIndexT num_rows,
                                 const MatrixIndexT col_offset,
@ -177,11 +355,67 @@ class CuMatrixBase {
    return CuSubMatrix<Real>(*this, 0, num_rows_, col_offset, num_cols); 
  }

+  /// Returns row i as a CuSubVector constructed from the row's start address
+  /// (data_ + i * stride_) and NumCols() elements (const version).
+  /// i must be in [0, num_rows_); comparing as unsigned values makes a
+  /// negative i fail the same assertion.
+  inline const CuSubVector<Real> Row(MatrixIndexT i) const {
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
+                 static_cast<UnsignedMatrixIndexT>(num_rows_));
+    return CuSubVector<Real>(data_ + (i * stride_), NumCols());
+  }
+
+  /// Returns row i as a CuSubVector constructed from the row's start address
+  /// (data_ + i * stride_) and NumCols() elements (non-const version).
+  /// i must be in [0, num_rows_); comparing as unsigned values makes a
+  /// negative i fail the same assertion.
+  inline CuSubVector<Real> Row(MatrixIndexT i) {
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
+                 static_cast<UnsignedMatrixIndexT>(num_rows_));
+    return CuSubVector<Real>(data_ + (i * stride_), NumCols());
+  }
+
+  /// Element access (non-const): returns a CuValue<Real> constructed from the
+  /// address of element (r, c), i.e. data_ + r * stride_ + c.
+  /// The indexes are range-checked only through KALDI_PARANOID_ASSERT.
+  inline CuValue<Real> operator() (MatrixIndexT r, MatrixIndexT c) {
+    KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
+                          static_cast<UnsignedMatrixIndexT>(num_rows_) &&
+                          static_cast<UnsignedMatrixIndexT>(c) <
+                          static_cast<UnsignedMatrixIndexT>(num_cols_));
+    return CuValue<Real>(data_ + r * stride_ + c);
+  }
  
+  /// Element access (const): returns the value of element (r, c), obtained by
+  /// converting a CuValue<Real> for the address data_ + r * stride_ + c to Real.
+  /// The indexes are range-checked only through KALDI_PARANOID_ASSERT.
+  inline Real operator() (MatrixIndexT r, MatrixIndexT c) const {
+    KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
+                          static_cast<UnsignedMatrixIndexT>(num_rows_) &&
+                          static_cast<UnsignedMatrixIndexT>(c) <
+                          static_cast<UnsignedMatrixIndexT>(num_cols_));
+    return CuValue<Real>(data_ + r * stride_ + c);  // will be casted to Real.
+  }
+
+  Real Sum() const;
+
+  /// Return the trace. If check_square = true, will crash if matrix is not square.
+  Real Trace(bool check_square = true) const;
+
+  void SetRandn();
+
+  void SetRandUniform();
+
+  void Write(std::ostream &os, bool binary) const;
+
+  // This function resizes the output to indices.size(), and for each element of
+  // "indices" it interprets it as a (row, column) index into *this, and puts
+  // (*this)(row, column) into the corresponding element of "output".
+  void Lookup(const std::vector<Int32Pair> &indices,
+              std::vector<Real> *output) const;
 protected:
+  // The following two functions should only be called if we did not compile with CUDA
+  // or could not get a CUDA card; in that case the contents are interpreted the
+  // same as a regular matrix.
+  // They reinterpret this object as a MatrixBase<Real> without copying, which
+  // relies on the data members of this class being laid out the same as those
+  // of MatrixBase.
+  inline const MatrixBase<Real> &Mat() const {
+    return *(reinterpret_cast<const MatrixBase<Real>* >(this));
+  }
+  inline MatrixBase<Real> &Mat() {
+    return *(reinterpret_cast<MatrixBase<Real>* >(this));
+  }
+  
  /// Get raw row pointer
  inline const Real* RowData(MatrixIndexT r) const { return data_ + r * stride_; }
  inline Real* RowData(MatrixIndexT r) { return data_ + r * stride_; }
+  /// Get the raw data pointer data_ (const and non-const versions).
+  inline const Real *Data() const { return data_; }
+  inline Real *Data() { return data_; }
+

  
  // The constructors are protected to prevent the user creating an instance of
@ -198,19 +432,9 @@ class CuMatrixBase {
                     MatrixIndexT stride):
  data_(data), num_cols_(num_cols), num_rows_(num_rows), stride_(stride) { }

-  // The following two functions should only be called if we did not compile with CUDA
-  // or could not get a CUDA card; in that case the contents are interpreted the
-  // same as a regular matrix.
-  inline const MatrixBase<Real> &Mat() const {
-    return *(reinterpret_cast<const MatrixBase<Real>* >(this));
-  }
-  inline MatrixBase<Real> &Mat() {
-    return *(reinterpret_cast<MatrixBase<Real>* >(this));
-  }
-  
  Real *data_;       ///< GPU data pointer (or regular matrix data pointer,
-                     ///< if either CUDA was not compiled in or we could not
-                     ///< acquire the device).
+  ///< if either CUDA was not compiled in or we could not
+  ///< acquire the device).
  // Note: it might seem a bit backwards that we have the number of columns
  // first here; it's necessary because we need the data to be laid out the same
  // as for MatrixBase so the Mat() function call will work.  We don't want to
@ -239,15 +463,34 @@ class CuMatrix: public CuMatrixBase<Real> {

  // Note: we had to remove the "explicit" keyword due
  // to problems with STL vectors of CuMatrixBase.
-  CuMatrix(const CuMatrix<Real> &other) {
-    this->Resize(other.NumRows(), other.NumCols(), kUndefined);
-    this->CopyFromMat(other);
+  CuMatrix(const CuMatrix<Real> &other,
+           MatrixTransposeType trans = kNoTrans);
+
+  explicit CuMatrix(const CuBlockMatrix<Real> &other,
+                    MatrixTransposeType trans = kNoTrans);
+  
+  explicit CuMatrix(const CuMatrixBase<Real> &other,
+                    MatrixTransposeType trans = kNoTrans);
+
+  template<typename OtherReal>
+  explicit CuMatrix(const MatrixBase<OtherReal> &other,
+                    MatrixTransposeType trans = kNoTrans);
+
+  /// Constructs a square matrix from the CuSpMatrix M: resizes to
+  /// M.NumRows() by M.NumRows() with kUndefined, then fills it via CopyFromSp.
+  explicit CuMatrix(const CuSpMatrix<Real> &M) : CuMatrixBase<Real>() {
+    const MatrixIndexT dim = M.NumRows();
+    this->Resize(dim, dim, kUndefined);
+    this->CopyFromSp(M);
+  }

-  explicit CuMatrix(const MatrixBase<Real> &other) {
-    this->Resize(other.NumRows(), other.NumCols(), kUndefined);
-    this->CopyFromMat(other);
-  }
+  /// Copy constructor taking TpMatrix...
+  template <typename OtherReal>
+  explicit CuMatrix(const CuTpMatrix<OtherReal> & M,
+                    MatrixTransposeType trans = kNoTrans);
+  
+  /// Copy constructor: as above, but from another type.
+  template<typename OtherReal>
+  explicit CuMatrix(const CuMatrixBase<OtherReal> &M,
+                    MatrixTransposeType trans = kNoTrans);
  
  CuMatrix<Real> &operator = (const CuMatrixBase<Real> &other) {
    this->Resize(other.NumRows(), other.NumCols(), kUndefined);
@ -265,21 +508,45 @@ class CuMatrix: public CuMatrixBase<Real> {
    this->Resize(other.NumRows(), other.NumCols(), kUndefined);
    this->CopyFromMat(other);
    return *this;
-  }   
+  }
+
+  void Transpose();

  /// Allocate the memory
  void Resize(MatrixIndexT rows, MatrixIndexT cols,
              MatrixResizeType resize_type = kSetZero);
-  
-  
+    
  void Swap(Matrix<Real> *mat);
+  void Swap(CuMatrix<Real> *mat);
+
+  template<typename OtherReal>
+  void Swap(CuMatrix<OtherReal> *mat);
  
  /// I/O functions
  void Read(std::istream &is, bool binary);
-  void Write(std::ostream &os, bool binary) const;

  /// Destructor
  ~CuMatrix() { Destroy(); }
+
+  /// Reinterpret this object as a Matrix<Real>, without copying (const and
+  /// non-const versions).
+  /// NOTE(review): the CuMatrixBase versions of Mat() are documented as valid
+  /// only when no CUDA device is in use; presumably the same restriction
+  /// applies here -- confirm.
+  inline const Matrix<Real> &Mat() const {
+    return *(reinterpret_cast<const Matrix<Real>* >(this));
+  }
+  inline Matrix<Real> &Mat() {
+    return *(reinterpret_cast<Matrix<Real>* >(this));
+  }
+
+  /// This function does: for each element { row, column, weight } indexed i in
+  /// the vector "elements", let x(i) = A(row(i), column(i)); then it does
+  /// (*this)(row(i), column(i)) += weight(i) / x(i), and
+  /// *tot_objf = \sum_i weight(i) * log(x(i)), and
+  /// *tot_weight = \sum_i weight(i)
+  /// Preconditions: A must be strictly positive, and no (row, column) pair
+  /// may be repeated within "elements"
+  void CompObjfAndDeriv(const std::vector<MatrixElement<Real> > &elements,
+                        const CuMatrix<Real> &A,
+                        Real *tot_objf,
+                        Real* tot_weight);
+
 private:
  void Destroy();
 };
@ -305,27 +572,55 @@ class CuSubMatrix: public CuMatrixBase<Real> {
  CuSubMatrix<Real> &operator = (const CuSubMatrix<Real> &other);
 };

-template<class Real>
+
+/// Non-member convenience wrapper: returns A.ApproxEqual(B, tol).
+/// Note that tol has type Real here, whereas the member function takes a
+/// float.
+template<typename Real>
+bool ApproxEqual(const CuMatrixBase<Real> &A,
+                 const CuMatrixBase<Real> &B, Real tol = 0.01) {
+  return A.ApproxEqual(B, tol);
+}
+
+/// Asserts (via KALDI_ASSERT) that A.ApproxEqual(B, tol) holds.
+/// The arguments are taken by const reference because the comparison does not
+/// modify them; this also lets const matrices be checked.
+template<typename Real>
+inline void AssertEqual(const CuMatrixBase<Real> &A,
+                        const CuMatrixBase<Real> &B,
+                        float tol = 0.01) {
+  KALDI_ASSERT(A.ApproxEqual(B, tol));
+}
+
+template<typename Real>
 bool SameDim(const CuMatrixBase<Real> &M, const CuMatrixBase<Real> &N) {
  return (M.NumRows() == N.NumRows() && M.NumCols() == N.NumCols());
 }

-template<class Real>
+template<typename Real>
 bool SameDimAndStride(const CuMatrixBase<Real> &M, const CuMatrixBase<Real> &N) {
  return (M.NumRows() == N.NumRows() && M.NumCols() == N.NumCols()
          && M.Stride() == N.Stride());
 }

-
 /// I/O
 template<typename Real>
 std::ostream &operator << (std::ostream &out, const CuMatrixBase<Real> &mat);


-  
-} // namespace
+/// Constructs a Matrix from a CuMatrixBase, possibly of a different
+/// floating-point type.  The new matrix is initialized with the dimensions of
+/// M (swapped unless trans == kNoTrans) and then filled by M.CopyToMat().
+template<typename Real>
+template<typename OtherReal>
+Matrix<Real>::Matrix(const CuMatrixBase<OtherReal> &M,
+                     MatrixTransposeType trans) {
+  const bool transposed = (trans != kNoTrans);
+  if (transposed) {
+    Init(M.NumCols(), M.NumRows());
+  } else {
+    Init(M.NumRows(), M.NumCols());
+  }
+  M.CopyToMat(this, trans);
+}
+
+/// Copies from a CuMatrixBase into this matrix by forwarding to
+/// cu.CopyToMat(this, trans).
+template<typename Real>
+template<typename OtherReal>
+void MatrixBase<Real>::CopyFromMat(const CuMatrixBase<OtherReal> &cu,
+                                   MatrixTransposeType trans) {
+  cu.CopyToMat(this, trans);
+}


-#include "cu-matrix-inl.h"
+}  // namespace
+
+
+#include "cudamatrix/cu-matrix-inl.h"

 #endif
--- a/src/cudamatrix/cu-matrixdim.h
+++ b/src/cudamatrix/cu-matrixdim.h
@ -1,6 +1,7 @@
 // cudamatrix/cu-matrixdim.h

 // Copyright 2009-2012  Karel Vesely
+//                2013  Johns Hopkins University (author: Daniel Povey)

 // See ../../COPYING for clarification regarding multiple authors
 //
@ -28,12 +29,20 @@
 #ifdef _MSC_VER
  typedef unsigned __int32 uint32_cuda;
  typedef __int32          int32_cuda;
+  typedef __int32          MatrixIndexT_cuda; // you'd have to change this if you changed MatrixIndexT from int32.
 #else
  #include <stdint.h>
  typedef uint32_t         uint32_cuda;
  typedef int32_t          int32_cuda;
+  typedef int32_t          MatrixIndexT_cuda; // you'd have to change this if you changed MatrixIndexT from int32.
 #endif

+// One (row, column, weight) entry referring to an element of a matrix; used,
+// for example, as the element type of the "elements" argument of
+// CuMatrix::CompObjfAndDeriv.
+template<typename Real>
+struct MatrixElement {
+  int32_cuda row;     // row index of the element.
+  int32_cuda column;  // column index of the element.
+  Real weight;        // weight attached to that element.
+};

 extern "C" {
  /**
@ -45,8 +54,37 @@ extern "C" {
    int32_cuda cols;
    int32_cuda stride;
  } MatrixDim;
+
+// we define the following constants here because this file is included
+// both by the C++ code and also CUDA code.
+  
+
+// The size of a CUDA 1-d block, e.g. for vector operations..
+#define CU1DBLOCK 256
+
+// The size of edge of CUDA square block, e.g. for matrix operations.
+// Must be defined the same in cu-kernels-ansi.h
+#define CU2DBLOCK 16
+
+
+  /** This structure is used in cu-block-matrix.h to store information
+      about a block-diagonal matrix.  We declare it here so that it
+      will be accessible to both the C++ code and the CUDA code, which
+      both include this file.
+   */
+  typedef struct CuBlockMatrixData_ {
+    int32_cuda row_offset; // sum of #rows of previous M_i
+    int32_cuda col_offset; // sum of #cols of previous M_i
+    MatrixDim matrix_dim; // dimension of this M_i
+    void *matrix_data; // data for M_i.  This is a pointer to either float* or
+                       // double*.  Because C doesn't support templates and to
+                       // avoid extra coding to support the two cases, we
+                       // decided to make this a void* pointer.
+  } CuBlockMatrixData;
+
+  /** A pair of 32-bit integers.  In cu-matrix.h it is used, for example, as
+      a column range (first through second - 1) by SumColumnRanges and as a
+      (row, column) index by Lookup.
+   */
+  typedef struct Int32Pair {
+    int32_cuda first;
+    int32_cuda second;
+  } Int32Pair;
 }

 #endif
-
-
--- a/src/cudamatrix/cu-packed-matrix-test.cc
+++ b/src/cudamatrix/cu-packed-matrix-test.cc
@ -0,0 +1,265 @@
+// cudamatrix/cu-packed-matrix-test.cc
+//
+// Copyright 2013  Ehsan Variani
+//                 Lucas Ondel
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+// UnitTests for testing cu-packed-matrix.h methods.
+//
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+
+#include "base/kaldi-common.h"
+#include "cudamatrix/cu-device.h"
+#include "cudamatrix/cu-sp-matrix.h"
+#include "cudamatrix/cu-vector.h"
+#include "cudamatrix/cu-math.h"
+
+using namespace kaldi;
+
+namespace kaldi {
+
+/*
+ * INITIALIZERS
+ */
+
+/*
+ * ASSERTS
+ */
+/// Asserts that two CuPackedMatrix objects have the same number of rows and
+/// that every stored element (column <= row) differs by less than
+/// tol * max(1, |a| + |b|).
+template<typename Real>
+static void AssertEqual(const CuPackedMatrix<Real> &A,
+                        const CuPackedMatrix<Real> &B,
+                        float tol = 0.001) {
+  KALDI_ASSERT(A.NumRows() == B.NumRows());
+  for (MatrixIndexT row = 0; row < A.NumRows(); ++row) {
+    for (MatrixIndexT col = 0; col <= row; ++col) {
+      // Only the lower triangle is visited; that is all a packed matrix holds.
+      KALDI_ASSERT(std::abs(A(row, col) - B(row, col)) <
+                   tol * std::max(1.0, static_cast<double>(
+                       std::abs(A(row, col)) + std::abs(B(row, col)))));
+    }
+  }
+}
+
+/// Asserts that two host PackedMatrix objects have the same number of rows
+/// and that every stored element (column <= row) differs by less than
+/// tol * max(1, |a| + |b|).
+template<typename Real>
+static void AssertEqual(const PackedMatrix<Real> &A,
+                        const PackedMatrix<Real> &B,
+                        float tol = 0.001) {
+  KALDI_ASSERT(A.NumRows() == B.NumRows());
+  for (MatrixIndexT row = 0; row < A.NumRows(); ++row) {
+    for (MatrixIndexT col = 0; col <= row; ++col) {
+      // Only the lower triangle is visited; that is all a packed matrix holds.
+      KALDI_ASSERT(std::abs(A(row, col) - B(row, col)) <
+                   tol * std::max(1.0, static_cast<double>(
+                       std::abs(A(row, col)) + std::abs(B(row, col)))));
+    }
+  }
+}
+
+/// Asserts that the diagonal of B equals the diagonal of A shifted by
+/// `value`, i.e. |(A(i, i) + value) - B(i, i)| < tol * max(1, scale), where
+/// scale is the summed magnitude of the two quantities being compared.
+/// Off-diagonal elements are not examined.
+template<typename Real>
+static void AssertDiagEqual(const PackedMatrix<Real> &A,
+                        const CuPackedMatrix<Real> &B,
+                        float value,
+                        float tol = 0.001) {
+  // Without this check a smaller B would be indexed out of range below.
+  KALDI_ASSERT(A.NumRows() == B.NumRows());
+  for (MatrixIndexT i = 0; i < A.NumRows(); i++) {
+    // The tolerance is scaled by the same two quantities that are compared:
+    // the expected value A(i, i) + value and the actual value B(i, i).
+    KALDI_ASSERT(std::abs((A(i, i) + value) - B(i, i))
+                 < tol * std::max(1.0, (double) (std::abs(A(i, i) + value) + std::abs(B(i, i)))));
+  }
+}
+/// Asserts that the diagonal of B equals the diagonal of A shifted by
+/// `value`, i.e. |(A(i, i) + value) - B(i, i)| < tol * max(1, scale), where
+/// scale is the summed magnitude of the two quantities being compared.
+/// Off-diagonal elements are not examined.
+template<typename Real>
+static void AssertDiagEqual(const PackedMatrix<Real> &A,
+                        const PackedMatrix<Real> &B,
+                        float value,
+                        float tol = 0.001) {
+  // Without this check a smaller B would be indexed out of range below.
+  KALDI_ASSERT(A.NumRows() == B.NumRows());
+  for (MatrixIndexT i = 0; i < A.NumRows(); i++) {
+    // The tolerance is scaled by the same two quantities that are compared:
+    // the expected value A(i, i) + value and the actual value B(i, i).
+    KALDI_ASSERT(std::abs((A(i, i) + value) - B(i, i))
+                 < tol * std::max(1.0, (double) (std::abs(A(i, i) + value) + std::abs(B(i, i)))));
+  }
+}
+
+/// Asserts that a host PackedMatrix and a CuPackedMatrix have the same number
+/// of rows and that every stored element (column <= row) differs by less
+/// than tol * max(1, |a| + |b|).
+template<typename Real>
+static void AssertEqual(const PackedMatrix<Real> &A,
+                        const CuPackedMatrix<Real> &B,
+                        float tol = 0.001) {
+  KALDI_ASSERT(A.NumRows() == B.NumRows());
+  for (MatrixIndexT row = 0; row < A.NumRows(); ++row) {
+    for (MatrixIndexT col = 0; col <= row; ++col) {
+      // Only the lower triangle is visited; that is all a packed matrix holds.
+      KALDI_ASSERT(std::abs(A(row, col) - B(row, col)) <
+                   tol * std::max(1.0, static_cast<double>(
+                       std::abs(A(row, col)) + std::abs(B(row, col)))));
+    }
+  }
+}
+
+/// Returns true if A and B are approximately equal: the largest absolute
+/// element of (A - B) must not exceed `tol` times the largest absolute
+/// element found in A or B.  Both matrices must have the same dimension.
+template<typename Real>
+static bool ApproxEqual(const PackedMatrix<Real> &A,
+                        const PackedMatrix<Real> &B, Real tol = 0.001) {
+  KALDI_ASSERT(A.NumRows() == B.NumRows());
+  PackedMatrix<Real> diff(A);
+  // diff = A - B.  (Adding B with weight +1.0 would give A + B, which makes
+  // two identical non-zero matrices compare as different.)
+  diff.AddPacked(-1.0, B);
+  // Largest magnitude in each matrix, obtained from its max and min element.
+  Real a = std::max(A.Max(), -A.Min()), b = std::max(B.Max(), -B.Min()),
+      d = std::max(diff.Max(), -diff.Min());
+  return (d <= tol * std::max(a, b));
+}
+
+/*
+ * Unit Tests
+ */
+/// For dimensions 10, 20, ..., 90: builds a CuPackedMatrix from a random host
+/// PackedMatrix, copy-constructs a second CuPackedMatrix from the first, and
+/// checks that the two compare equal.
+template<typename Real>
+static void UnitTestCuPackedMatrixConstructor() {
+  for (MatrixIndexT iter = 1; iter < 10; ++iter) {
+    const MatrixIndexT dim = 10 * iter;
+
+    PackedMatrix<Real> host_mat(dim);
+    host_mat.SetRandn();
+
+    CuPackedMatrix<Real> first(host_mat);  // from a host matrix
+    CuPackedMatrix<Real> second(first);    // from another CuPackedMatrix
+    AssertEqual(first, second);
+  }
+}
+
+/// For dimensions 10, 20, ..., 90: checks that CopyFromPacked gives the same
+/// result whether the source is a host PackedMatrix or a CuPackedMatrix, and
+/// that CopyToPacked reproduces the original host matrix.
+template<typename Real>
+static void UnitTestCuPackedMatrixCopy() {
+  for (MatrixIndexT iter = 1; iter < 10; ++iter) {
+    const MatrixIndexT dim = 10 * iter;
+
+    PackedMatrix<Real> host_src(dim);
+    host_src.SetRandn();
+    CuPackedMatrix<Real> cu_src(host_src);
+
+    // Copy from the host matrix and from its Cu counterpart; both must agree.
+    CuPackedMatrix<Real> from_host(dim);
+    from_host.CopyFromPacked(host_src);
+    CuPackedMatrix<Real> from_cu(dim);
+    from_cu.CopyFromPacked(cu_src);
+    AssertEqual(from_host, from_cu);
+
+    // Round trip back to a host matrix must give the source again.
+    PackedMatrix<Real> round_trip(dim);
+    from_cu.CopyToPacked(&round_trip);
+    AssertEqual(host_src, round_trip);
+  }
+}
+
+/// Checks, for nine randomly sized matrices, that CuPackedMatrix::Trace()
+/// agrees with PackedMatrix::Trace() on the same data.
+template<typename Real>
+static void UnitTestCuPackedMatrixTrace() {
+  for (MatrixIndexT iter = 1; iter < 10; ++iter) {
+    const MatrixIndexT dim = 5 * iter + rand() % 10;
+
+    PackedMatrix<Real> host_mat(dim);
+    host_mat.SetRandn();
+    CuPackedMatrix<Real> cu_mat(host_mat);
+
+    // Scalar AssertEqual (not one of the matrix overloads in this file).
+    AssertEqual(host_mat.Trace(), cu_mat.Trace());
+  }
+}
+
+/// Checks, for nine randomly sized matrices, that CuPackedMatrix::Scale()
+/// matches PackedMatrix::Scale() when both are given the same factor.
+template<typename Real>
+static void UnitTestCuPackedMatrixScale() {
+  const Real scale_factor = 23.5896223;
+  for (MatrixIndexT iter = 1; iter < 10; ++iter) {
+    const MatrixIndexT dim = 5 * iter + rand() % 10;
+
+    PackedMatrix<Real> host_mat(dim);
+    host_mat.SetRandn();
+    CuPackedMatrix<Real> cu_mat(host_mat);
+
+    host_mat.Scale(scale_factor);
+    cu_mat.Scale(scale_factor);
+    AssertEqual(host_mat, cu_mat);
+  }
+}
+
+/// Checks, for nine randomly sized matrices, that
+/// CuPackedMatrix::ScaleDiag() matches PackedMatrix::ScaleDiag() when both
+/// are given the same factor.
+template<typename Real>
+static void UnitTestCuPackedMatrixScaleDiag() {
+  const Real scale_factor = 23.5896223;
+  for (MatrixIndexT iter = 1; iter < 10; ++iter) {
+    const MatrixIndexT dim = 5 * iter + rand() % 10;
+
+    PackedMatrix<Real> host_mat(dim);
+    host_mat.SetRandn();
+    CuPackedMatrix<Real> cu_mat(host_mat);
+
+    host_mat.ScaleDiag(scale_factor);
+    cu_mat.ScaleDiag(scale_factor);
+    AssertEqual(host_mat, cu_mat);
+  }
+}
+
+
+
+/// Checks, for nine randomly sized matrices, that after
+/// CuPackedMatrix::AddToDiag(value) the diagonal equals the original host
+/// diagonal plus `value`.  Only the diagonal is compared.
+template<typename Real>
+static void UnitTestCuPackedMatrixAddToDiag() {
+  for (MatrixIndexT iter = 1; iter < 10; ++iter) {
+    const MatrixIndexT dim = 5 * iter + rand() % 10;
+
+    PackedMatrix<Real> host_mat(dim);
+    host_mat.SetRandn();
+    CuPackedMatrix<Real> cu_mat(host_mat);
+
+    // Integer-valued shift in [0, 49]; the host matrix is left untouched.
+    const Real value = rand() % 50;
+    cu_mat.AddToDiag(value);
+
+    AssertDiagEqual(host_mat, cu_mat, value);
+  }
+}
+
+/// Checks, for nine randomly sized matrices, that SetUnit() produces exactly
+/// 1 on the diagonal and exactly 0 everywhere else.
+template<typename Real>
+static void UnitTestCuPackedMatrixSetUnit() {
+  for (MatrixIndexT iter = 1; iter < 10; ++iter) {
+    const MatrixIndexT dim = 5 * iter + rand() % 10;
+
+    CuPackedMatrix<Real> A(dim);
+    A.SetUnit();
+
+    // Walk the full square; operator() accepts indices in either order.
+    for (MatrixIndexT r = 0; r < A.NumRows(); r++) {
+      for (MatrixIndexT c = 0; c < A.NumRows(); c++) {
+        if (r == c) {
+          KALDI_ASSERT(A(r, c) == 1.0);
+        } else {
+          KALDI_ASSERT(A(r, c) == 0);
+        }
+      }
+    }
+  }
+}
+
+
+/// Runs the CuPackedMatrix unit tests for one floating-point type.
+template<typename Real> void CudaPackedMatrixUnitTest() {
+  UnitTestCuPackedMatrixConstructor<Real>();
+  //UnitTestCuPackedMatrixCopy<Real>();
+  UnitTestCuPackedMatrixTrace<Real>();
+  UnitTestCuPackedMatrixScale<Real>();
+  // This test was defined but never invoked, so ScaleDiag() went untested.
+  UnitTestCuPackedMatrixScaleDiag<Real>();
+  UnitTestCuPackedMatrixAddToDiag<Real>();
+  UnitTestCuPackedMatrixSetUnit<Real>();
+}
+
+} // namespace kaldi
+
+
+// Entry point: runs the test suite for float, then for double when double
+// precision is available, and prints the device profile when built with CUDA.
+int main() {
+  using namespace kaldi;
+#if HAVE_CUDA == 1
+  // Select the GPU
+  CuDevice::Instantiate().SelectGpuId("yes");
+#endif
+
+  CudaPackedMatrixUnitTest<float>();
+
+  // In a build without CUDA the double-precision tests always run.
+  bool run_double = true;
+#if HAVE_CUDA == 1
+  run_double = CuDevice::Instantiate().DoublePrecisionSupported();
+#endif
+  if (run_double) {
+    CudaPackedMatrixUnitTest<double>();
+  } else {
+    KALDI_WARN << "Double precision not supported";
+  }
+
+  KALDI_LOG << "Tests succeeded";
+#if HAVE_CUDA == 1
+  CuDevice::Instantiate().PrintProfile();
+#endif
+  return 0;
+}
--- a/src/cudamatrix/cu-packed-matrix.cc
+++ b/src/cudamatrix/cu-packed-matrix.cc
@ -0,0 +1,400 @@
+// cudamatrix/cu-packed-matrix.cc
+
+// Copyright 2009-2013  Johns Hopkins University (author: Daniel Povey)
+//                      Karel Vesely
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+
+#if HAVE_CUDA == 1
+#include <cuda_runtime_api.h>
+#include <cublas.h>
+#endif
+
+#include "util/timer.h"
+#include "cudamatrix/cu-common.h"
+#include "cudamatrix/cu-vector.h"
+#include "cudamatrix/cu-device.h"
+#include "cudamatrix/cu-kernels.h"
+#include "cudamatrix/cu-math.h"
+#include "cudamatrix/cu-packed-matrix.h"
+#include "cudamatrix/cublas-wrappers.h"
+
+namespace kaldi {
+
+/// Resizes the matrix to `rows` rows, i.e. rows * (rows + 1) / 2 stored
+/// elements.  Only kSetZero and kUndefined are accepted as resize_type.
+/// If the size does not change the existing storage is kept (and zeroed for
+/// kSetZero); otherwise the old storage is released and new storage obtained.
+template<typename Real>
+void CuPackedMatrix<Real>::Resize(MatrixIndexT rows,
+                                  MatrixResizeType resize_type) {
+  // The other resize_type options are not implemented here.
+  const bool zero_fill = (resize_type == kSetZero);
+  KALDI_ASSERT(zero_fill || resize_type == kUndefined);
+
+  if (rows == this->num_rows_) {
+    if (zero_fill) this->SetZero();
+    return;
+  }
+
+  // The size changes: drop whatever is currently held.
+  if (this->num_rows_ != 0) this->Destroy();
+  if (rows == 0) return;
+
+#if HAVE_CUDA == 1
+  CuDevice &device = CuDevice::Instantiate();
+  if (device.Enabled()) {
+    Timer tim;
+    this->num_rows_ = rows;
+    // Byte count is computed in size_t.
+    const size_t dim = static_cast<size_t>(num_rows_);
+    const size_t num_bytes = ((dim * (dim + 1)) / 2) * sizeof(Real);
+    this->data_ = static_cast<Real*>(device.Malloc(num_bytes));
+
+    if (zero_fill) this->SetZero();
+    device.AccuProfile("CuPackedMatrix::Resize", tim.Elapsed());
+    return;
+  }
+#endif
+  // No GPU in use: let the SpMatrix constructor do the allocation (and the
+  // zeroing, if requested), then take over its storage through Swap().
+  SpMatrix<Real> mat(rows, resize_type);
+  this->Swap(&mat);
+}
+
+/// Fills the stored elements with values from CuRand::RandGaussian.
+/// Does nothing for an empty matrix.
+template<typename Real>
+void CuPackedMatrix<Real>::SetRandn() {
+  if (num_rows_ == 0) return;
+
+  // Treat the packed storage as one flat vector of n * (n + 1) / 2 elements
+  // and let the vector random generator fill it.
+  MatrixIndexT num_elements = num_rows_ * (num_rows_ + 1) / 2;
+  CuSubVector<Real> flat_view(data_, num_elements);
+  CuRand<Real> generator;
+  generator.RandGaussian(&flat_view);
+}
+
+/// Releases the storage (if any) and leaves the matrix empty: data_ == NULL
+/// and num_rows_ == 0.  The memory is returned through CuDevice::Free when
+/// the device is enabled, and through KALDI_MEMALIGN_FREE otherwise.
+template<typename Real>
+void CuPackedMatrix<Real>::Destroy() {
+  const bool have_data = (this->data_ != NULL);
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    if (have_data) {
+      CuDevice::Instantiate().Free(this->data_);
+    }
+  } else
+#endif
+  {
+    if (have_data) KALDI_MEMALIGN_FREE(this->data_);
+  }
+  this->num_rows_ = 0;
+  this->data_ = NULL;
+}
+
+/// Exchanges the contents of *this with the host matrix *mat.
+/// When the device is enabled the data has to be copied between the two
+/// objects; otherwise only the pointers and dimensions are swapped.
+template<typename Real>
+void CuPackedMatrix<Real>::Swap(PackedMatrix<Real> *mat) {
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    const bool this_empty = (this->num_rows_ == 0);
+    const bool mat_empty = (mat->num_rows_ == 0);
+
+    if (this_empty && mat_empty) return;  // nothing to exchange.
+
+    if (this_empty) {
+      // Only *mat holds data: copy it into *this and empty *mat.
+      Resize(mat->num_rows_, kUndefined);
+      CopyFromPacked(*mat);
+      mat->Resize(0);
+      return;
+    }
+
+    if (mat_empty) {
+      // Only *this holds data: copy it into *mat and empty *this.
+      mat->Resize(this->num_rows_, kUndefined);
+      this->CopyToPacked(mat);
+      this->Destroy();
+      return;
+    }
+
+    // Both hold data.  Recurse to the simpler cases through a temporary;
+    // this could be done more efficiently when the size does not change.
+    PackedMatrix<Real> temp;
+    this->Swap(&temp);  // temp now has the data of *this; *this is empty.
+    mat->Swap(&temp);   // *mat has the data of *this; temp has that of *mat.
+    this->Swap(mat);    // *this (empty) takes what *mat holds; *mat is emptied.
+    return;
+  }
+#endif
+  std::swap(mat->data_, this->data_);
+  std::swap(mat->num_rows_, this->num_rows_);
+}
+
+/// Copies the contents of another CuPackedMatrix of the same dimension into
+/// *this.  Does not resize.
+template<typename Real>
+void CuPackedMatrix<Real>::CopyFromPacked(const CuPackedMatrix<Real> &src) {
+  KALDI_ASSERT(src.NumRows() == num_rows_);
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    if (num_rows_ == 0) return; // Nothing to do.
+    Timer tim;
+    const size_t dim = static_cast<size_t>(num_rows_);
+    const size_t num_bytes = ((dim * (dim + 1)) / 2) * sizeof(Real);
+
+    // Both buffers are device buffers on this path.
+    CU_SAFE_CALL(cudaMemcpy(data_, src.data_, num_bytes,
+                            cudaMemcpyDeviceToDevice));
+    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::CopyFromPacked1",tim.Elapsed());
+    return;
+  }
+#endif
+  Mat().CopyFromPacked(src.Mat());
+}
+
+/// Copies the contents of a host PackedMatrix of the same dimension into
+/// *this.  Does not resize.
+template<typename Real>
+void CuPackedMatrix<Real>::CopyFromPacked(const PackedMatrix<Real> &src) {
+  KALDI_ASSERT(src.NumRows() == num_rows_);
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    if (num_rows_ == 0) return; // Nothing to do.
+    Timer tim;
+    // Host source, device destination; the byte count comes from the source.
+    CU_SAFE_CALL(cudaMemcpy(data_, src.data_, src.SizeInBytes(),
+                            cudaMemcpyHostToDevice));
+    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::CopyFromPacked2",tim.Elapsed());
+    return;
+  }
+#endif
+  Mat().CopyFromPacked(src);
+}
+
+/// Copies the contents of *this into the host matrix *dst, which must
+/// already have the same dimension.  Does not resize.
+template<typename Real>
+void CuPackedMatrix<Real>::CopyToPacked(PackedMatrix<Real> *dst) const {
+  KALDI_ASSERT(dst->NumRows() == NumRows());
+
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    if (num_rows_ == 0) return; // Nothing to do.
+    Timer tim;
+    size_t nr = static_cast<size_t>(num_rows_),
+      num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real);
+
+    // Device source, host destination.
+    CU_SAFE_CALL(cudaMemcpy(dst->data_, data_, num_bytes,
+                            cudaMemcpyDeviceToHost));
+    // The profile label carries the real class name, so this entry is
+    // listed together with the other CuPackedMatrix entries.
+    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::CopyToPackedD2H",tim.Elapsed());
+  } else
+#endif
+  {
+    dst->CopyFromPacked(Mat());
+  }
+}
+
+/*
+template<typename Real>
+void CuPackedMatrix<Real>::CopyRowsFromPacked(int32 r, const CuPackedMatrix<Real> &src, int32 src_ro, int32 dst_ro) {
+  KALDI_ASSERT(r+src_ro <= src.NumRows());
+  KALDI_ASSERT(r+dst_ro <= NumRows());
+  KALDI_ASSERT(NumCols() == src.NumCols());
+   
+  #if HAVE_CUDA == 1 
+  if (CuDevice::Instantiate().Enabled()) { 
+    Timer tim;
+
+    MatrixIndexT dst_pitch = stride_*sizeof(Real);
+    MatrixIndexT src_pitch = src.Stride()*sizeof(Real);
+    MatrixIndexT width = src.NumCols()*sizeof(Real);
+
+    const Real *p_src = src.Data() + src_ro*src.Stride();  
+    Real *p_dst = data_ + dst_ro*stride_;
+
+    CU_SAFE_CALL(cudaMemcpy2D(p_dst, dst_pitch, p_src, src_pitch, width, r, cudaMemcpyDeviceToDevice));
+
+    CuDevice::Instantiate().AccuProfile("CuMatrix::CopyRowsD2D",tim.Elapsed());
+  } else
+  #endif
+  {
+    memcpy(Data()+dst_ro*stride_, src.Data()+src_ro*src.Stride(), r*stride_*sizeof(Real));
+  }
+} */
+
+
+
+/// Reads a matrix from `is` by reading a host PackedMatrix and then taking
+/// over its contents; any data previously held by *this is discarded.
+template<typename Real>
+void CuPackedMatrix<Real>::Read(std::istream &is, bool binary) {
+  PackedMatrix<Real> host_copy;
+  host_copy.Read(is, binary);
+  // Empty *this first, so that Swap() only has to move data one way.
+  Destroy();
+  Swap(&host_copy);
+}
+
+/// Writes the matrix to `os` by copying it into a host PackedMatrix and
+/// writing that.
+template<typename Real>
+void CuPackedMatrix<Real>::Write(std::ostream &os, bool binary) const {
+  // kUndefined: every element is overwritten by the copy below.
+  PackedMatrix<Real> host_copy(this->num_rows_, kUndefined);
+  this->CopyToPacked(&host_copy);
+  host_copy.Write(os, binary);
+}
+
+/// Sets every stored element to zero.
+template<typename Real>
+void CuPackedMatrix<Real>::SetZero() {
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    Timer tim;
+    const size_t dim = static_cast<size_t>(num_rows_);
+    const size_t num_bytes = ((dim * (dim + 1)) / 2) * sizeof(Real);
+
+    CU_SAFE_CALL(cudaMemset(reinterpret_cast<void*>(this->data_), 0, num_bytes));
+    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::SetZero", tim.Elapsed());
+    return;
+  }
+#endif
+  Mat().SetZero();
+}
+
+/// Returns the sum of the diagonal elements (0 for an empty matrix on the
+/// device path).
+template<typename Real>
+Real CuPackedMatrix<Real>::Trace() const {
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    if (num_rows_ == 0) return 0.0;
+    // Pull the diagonal out into a vector and sum that.
+    CuVector<Real> diag(num_rows_, kUndefined);
+    diag.CopyDiagFromPacked(*this);
+    return diag.Sum();
+  }
+#endif
+  Real result = Mat().Trace();
+  return result;
+}
+
+/// Sets every diagonal element to `alpha`.
+template<typename Real>
+void CuPackedMatrix<Real>::SetDiag(Real alpha) {
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    if (num_rows_ == 0) return;
+    Timer tim;
+    // 1-d launch: CU1DBLOCK threads per block, enough blocks for NumRows().
+    int threads_per_block(CU1DBLOCK);
+    int num_blocks(n_blocks(NumRows(), CU1DBLOCK));
+    cuda_set_diag_packed(num_blocks, threads_per_block, data_, alpha, num_rows_);
+    CU_SAFE_CALL(cudaGetLastError());
+    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::SetDiag", tim.Elapsed());
+    return;
+  }
+#endif
+  Mat().SetDiag(alpha);
+}
+
+/// Multiplies every stored element by `alpha`.
+template<typename Real>
+void CuPackedMatrix<Real>::Scale(Real alpha) {
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    Timer tim;
+    // The packed storage is scaled as one contiguous vector.
+    size_t dim = static_cast<size_t>(num_rows_);
+    size_t num_elements = ((dim * (dim + 1)) / 2);
+    cublas_scal(num_elements, alpha, data_, 1);
+    CU_SAFE_CALL(cudaGetLastError());
+    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::Scale", tim.Elapsed());
+    return;
+  }
+#endif
+  Mat().Scale(alpha);
+}
+
+/// Multiplies every diagonal element by `alpha`; off-diagonal elements are
+/// left unchanged.  Does nothing for an empty matrix.
+template<typename Real>
+void CuPackedMatrix<Real>::ScaleDiag(Real alpha) {
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    // With zero rows the grid size below would be zero, which is not a valid
+    // launch configuration; SetDiag() and AddToDiag() guard the same way.
+    if (num_rows_ == 0) return;
+    Timer tim;
+    int dimBlock(CU1DBLOCK);
+    int dimGrid(n_blocks(NumRows(),CU1DBLOCK));
+    cuda_scale_diag(dimGrid,dimBlock,data_,alpha,num_rows_);
+    CU_SAFE_CALL(cudaGetLastError());
+    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::ScaleDiag", tim.Elapsed());
+  } else
+#endif
+  {
+    Mat().ScaleDiag(alpha);
+  }
+}
+
+/// Computes *this += alpha * M.  M must have the same dimension as *this.
+template<typename Real>
+void CuPackedMatrix<Real>::AddPacked(const Real alpha, const CuPackedMatrix<Real> &M) {
+  KALDI_ASSERT(num_rows_ == M.NumRows());
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    if (num_rows_ == 0) return;
+    Timer tim;
+    // Both packed buffers are treated as flat vectors of n * (n + 1) / 2
+    // elements.
+    size_t dim = num_rows_;
+    size_t num_elements = (dim * (dim + 1)) / 2;
+    cublas_axpy(num_elements, alpha, M.Data(), 1, data_, 1);
+    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::AddPacked", tim.Elapsed());
+    return;
+  }
+#endif
+  Mat().AddPacked(alpha, M.Mat());
+}
+
+/// Adds `r` to every diagonal element; off-diagonal elements are left
+/// unchanged.  Does nothing for an empty matrix.
+template<typename Real>
+void CuPackedMatrix<Real>::AddToDiag(Real r) {
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    if (num_rows_ == 0) return;
+    Timer tim;
+    int dimBlock(CU1DBLOCK);
+    int dimGrid(n_blocks(NumRows(),CU1DBLOCK));
+    cuda_add_diag_packed(dimGrid,dimBlock,data_,r,num_rows_);
+    // Check the launch right away, as SetDiag() and ScaleDiag() do, so that a
+    // failure is reported here and not by some later unrelated call.
+    CU_SAFE_CALL(cudaGetLastError());
+    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::AddToDiag", tim.Elapsed());
+  } else
+#endif
+  {
+    Mat().AddToDiag(r);
+  }
+}
+
+/// Sets the matrix to the unit matrix: ones on the diagonal, zeros elsewhere.
+template<typename Real>
+void CuPackedMatrix<Real>::SetUnit() {
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    // Clear everything, then write ones on the diagonal.
+    this->SetZero();
+    this->SetDiag(1.0);
+    return;
+  }
+#endif
+  Mat().SetUnit();
+}
+
+/**
+ * Prints the matrix to a stream, by copying it into a host PackedMatrix of
+ * the same dimension and printing that.
+ */
+template<typename Real>
+std::ostream &operator << (std::ostream &out, const CuPackedMatrix<Real> &mat) {
+  PackedMatrix<Real> host_copy(mat.NumRows());
+  mat.CopyToPacked(&host_copy);
+  out << host_copy;
+  return out;
+}
+
+// Explicitly instantiate operator << for float and double; its definition
+// lives in this .cc file, not in the header.
+template
+std::ostream &operator << (std::ostream &out, const CuPackedMatrix<float> &mat);
+template
+std::ostream &operator << (std::ostream &out, const CuPackedMatrix<double> &mat);
+
+
+// Explicitly instantiate class CuPackedMatrix for float and double; the
+// member functions above are defined in this .cc file.
+template class CuPackedMatrix<float>;
+template class CuPackedMatrix<double>;
+
+
+} // namespace kaldi
--- a/src/cudamatrix/cu-packed-matrix.h
+++ b/src/cudamatrix/cu-packed-matrix.h
@ -0,0 +1,176 @@
+// cudamatrix/cu-packed-matrix.h
+
+// Copyright 2009-2013  Johns Hopkins University (author: Daniel Povey)
+//                      Karel Vesely
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+
+#ifndef KALDI_CUDAMATRIX_CU_PACKED_MATRIX_H_
+#define KALDI_CUDAMATRIX_CU_PACKED_MATRIX_H_
+
+#include <sstream>
+
+#include "cudamatrix/cu-common.h"
+#include "cudamatrix/cu-value.h"
+#include "matrix/matrix-common.h"
+#include "matrix/kaldi-matrix.h"
+#include "matrix/packed-matrix.h"
+#include "matrix/sp-matrix.h"
+#include "cudamatrix/cu-array.h"
+#include "cudamatrix/cu-math.h"
+#include "cudamatrix/cu-matrix.h"
+
+namespace kaldi {
+
+
+/**
+ * Matrix for CUDA computing.  This is a base class for packed
+ * triangular and symmetric matrices. 
+ * Does the computation on the CUDA card when CUDA is compiled in and
+ * we have a suitable GPU (CuDevice::Instantiate().Enabled() == true);
+ * otherwise, does it on the CPU.
+ */
+
+
+/// @brief Packed CUDA matrix: base class for triangular and symmetric matrices on
+///        a GPU card.
+///
+/// Only the lower triangle of the square matrix is stored, row by row:
+/// element (r, c) with c <= r is at data_[r * (r + 1) / 2 + c] (see
+/// operator()).  The storage is obtained from the CUDA device when
+/// CuDevice::Instantiate().Enabled() is true, and on the host otherwise.
+template<typename Real>
+class CuPackedMatrix {
+ public:
+  friend class CuMatrixBase<Real>;
+  friend class CuVectorBase<Real>;
+  friend class CuSubMatrix<Real>;
+  friend class CuRand<Real>;
+
+  /// Creates an empty matrix (zero rows, no storage).
+  CuPackedMatrix() : data_(NULL), num_rows_(0) {}
+
+  /// Creates a matrix with r rows; see Resize() for resize_type.
+  explicit CuPackedMatrix(MatrixIndexT r,
+                          MatrixResizeType resize_type = kSetZero):
+      data_(NULL), num_rows_(0) {  Resize(r, resize_type);  }
+
+  /// Creates a copy of a host packed matrix.
+  explicit CuPackedMatrix(const PackedMatrix<Real> &orig) : data_(NULL), num_rows_(0) {
+    Resize(orig.num_rows_, kUndefined);
+    CopyFromPacked(orig);
+  }
+
+  /// Creates a deep copy of another CuPackedMatrix.
+  explicit CuPackedMatrix(const CuPackedMatrix<Real> &orig) : data_(NULL), num_rows_(0) {
+    Resize(orig.NumRows(), kUndefined);
+    CopyFromPacked(orig);
+  }
+
+  void SetZero();  ///< Set to zero
+  void SetUnit();  ///< Set to unit matrix.
+  void SetRandn(); ///< Set to random values of a normal distribution
+  void SetDiag(Real alpha); ///< Set the diagonal value to alpha
+  void AddToDiag(Real r); ///< Add this quantity to the diagonal of the matrix.
+
+  void Scale(Real alpha); ///< Multiply every element by alpha.
+  void ScaleDiag(Real alpha); ///< Multiply the diagonal elements by alpha.
+  Real Trace() const; ///< Sum of the diagonal elements.
+
+  ~CuPackedMatrix() { Destroy(); }
+
+  /// Set packed matrix to a specified size (can be zero).
+  /// The value of the new data depends on resize_type:
+  ///   -if kSetZero, the new data will be zero
+  ///   -if kUndefined, the new data will be undefined
+  /// Other values of resize_type (e.g. kCopyData) are not supported: the
+  /// implementation asserts that resize_type is kSetZero or kUndefined.
+  void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero);
+
+  // Copy functions (do not resize).
+  void CopyFromPacked(const CuPackedMatrix<Real> &src);
+  void CopyFromPacked(const PackedMatrix<Real> &src);
+  void CopyToPacked(PackedMatrix<Real> *dst) const;
+
+  void Read(std::istream &in, bool binary);
+
+  void Write(std::ostream &out, bool binary) const;
+
+  /// Frees the storage and leaves the matrix empty.
+  void Destroy();
+
+  /// Swaps the contents of *this and *other.  Shallow swap.
+  /// NOTE(review): no definition of this overload is visible in
+  /// cu-packed-matrix.cc -- confirm it is defined before relying on it.
+  void Swap(CuPackedMatrix<Real> *other);
+
+  /// Swaps the contents of *this and *other.
+  void Swap(PackedMatrix<Real> *other);
+  Real* Data() { return data_; }
+  const Real* Data() const { return data_; }
+
+  /// Returns element (r, c); the indices may be given in either order, since
+  /// only the lower triangle is stored.  When the device is enabled each call
+  /// copies a single element back with cudaMemcpy.
+  inline Real operator() (MatrixIndexT r, MatrixIndexT c) const {
+    // Map an upper-triangle index onto the stored lower triangle.
+    if (static_cast<UnsignedMatrixIndexT>(c) >
+        static_cast<UnsignedMatrixIndexT>(r))
+      std::swap(c, r);
+    // The unsigned comparison also rejects negative indices.
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
+                 static_cast<UnsignedMatrixIndexT>(this->num_rows_));
+#if HAVE_CUDA == 1
+    if (CuDevice::Instantiate().Enabled()) {
+      Real value;
+      CU_SAFE_CALL(cudaMemcpy(&value, this->data_ + (r * (r+1)) / 2 + c,
+                              sizeof(Real), cudaMemcpyDeviceToHost));
+      return value;
+    } else
+#endif
+    return this->data_[(r * (r+1)) / 2 + c];
+  }
+
+  inline MatrixIndexT NumRows() const { return num_rows_; }
+  inline MatrixIndexT NumCols() const { return num_rows_; }
+
+  /// Returns size in bytes of the data held by the matrix.
+  size_t  SizeInBytes() const {
+    size_t nr = static_cast<size_t>(num_rows_),
+      num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real);
+    return num_bytes;
+  }
+
+
+ protected:
+  // The following two functions should only be called if we did not compile with CUDA
+  // or could not get a CUDA card; in that case the contents are interpreted the
+  // same as a regular matrix.
+  inline const PackedMatrix<Real> &Mat() const {
+    return *(reinterpret_cast<const PackedMatrix<Real>* >(this));
+  }
+  inline PackedMatrix<Real> &Mat() {
+    return *(reinterpret_cast<PackedMatrix<Real>* >(this));
+  }
+
+
+  // Will only be called from this class or derived classes.
+
+  Real *data_;             // packed lower-triangle storage (NULL when empty).
+  MatrixIndexT num_rows_;  // number of rows (== number of columns).
+
+  /// *this += alpha * M; M must have the same dimension.
+  void AddPacked(const Real alpha, const CuPackedMatrix<Real> &M);
+
+ private:
+  // Disallow assignment.  The declaration has to name CuPackedMatrix itself:
+  // otherwise the compiler still generates a copy assignment that copies
+  // data_ shallowly, and both objects would free the same buffer.
+  CuPackedMatrix<Real> & operator=(const CuPackedMatrix<Real> &other);
+}; // class CuPackedMatrix
+
+
+/// I/O
+template<typename Real>
+std::ostream &operator << (std::ostream &out, const CuPackedMatrix<Real> &mat);
+
+} // namespace
+
+
+#endif
--- a/src/cudamatrix/cu-rand-inl.h
+++ b/src/cudamatrix/cu-rand-inl.h
@ -1,6 +1,7 @@
-// cudamatrix/cu-rand-inl.h
+// cudamatrix/cu-rand.cc

 // Copyright 2012  Karel Vesely
+//           2013  Johns Hopkins University (author: Daniel Povey)

 // See ../../COPYING for clarification regarding multiple authors
 //
@ -18,14 +19,8 @@
 // limitations under the License.


-
-#ifndef KALDI_CUDAMATRIX_CU_RAND_INL_H_
-#define KALDI_CUDAMATRIX_CU_RAND_INL_H_
-
 #include "base/kaldi-math.h"
-
-#include "cudamatrix/cu-common.h"
-#include "cudamatrix/cu-rand.h"
+#include "cudamatrix/cu-matrix-lib.h"
 #include "cudamatrix/cu-randkernels.h"


@ -34,112 +29,120 @@ namespace kaldi {

 template<typename Real> 
 void CuRand<Real>::SeedGpu(MatrixIndexT state_size) {
-  if(NULL != host_) delete[] host_; 
-  host_ = new uint32[state_size]; 
-  host_size_ = state_size;
-
-  SeedBuffer(&z1_, state_size);
-  SeedBuffer(&z2_, state_size);
-  SeedBuffer(&z3_, state_size);
-  SeedBuffer(&z4_, state_size);
+  KALDI_ASSERT(state_size >= 0);
  state_size_ = state_size;
-
-  delete[] host_;
-  host_ = NULL;
-  host_size_ = 0;
+  SeedBuffer(state_size, &z1_);
+  SeedBuffer(state_size, &z2_);
+  SeedBuffer(state_size, &z3_);
+  SeedBuffer(state_size, &z4_);
 }


-
 template<typename Real> 
-void CuRand<Real>::SeedBuffer(uint32* *tgt, MatrixIndexT state_size) {
-  // generate random state
-  for(MatrixIndexT i=0; i<host_size_; i++) {
-    host_[i] = RandInt(128, RAND_MAX);
-  }
-  #if HAVE_CUDA==1
-  // push it to the GPU
-  if (CuDevice::Instantiate().Enabled()) {
-    int32 state_size_in_bytes = state_size*sizeof(uint32);
-    // resize the GPU buffer
-    if (state_size_ != state_size) {
-      cudaFree(*tgt);
-      cudaMalloc((void**)tgt, state_size_in_bytes);
+void CuRand<Real>::SeedBuffer(MatrixIndexT state_size, uint32 **tgt) {
+#if HAVE_CUDA == 1
+  CuDevice &device = CuDevice::Instantiate();
+  if (device.Enabled()) {
+    if (*tgt != NULL) {
+      device.Free(*tgt);
+      *tgt = NULL;
    }
-    // copy the values
-    cudaMemcpy(*tgt, host_, state_size_in_bytes, cudaMemcpyHostToDevice);
-  } else
-  #endif
-  { // use back-off host buffer
-    if (state_size_ != state_size) {
-      delete[] (*tgt);
-      *tgt = new uint32[state_size];
-    }
-    int32 state_size_in_bytes = state_size*sizeof(uint32);
-    memcpy(*tgt, host_, state_size_in_bytes);
+    if (state_size == 0) return; // Nothing to do.
+    std::vector<uint32> temp_rand_data(state_size);
+    for(MatrixIndexT i = 0; i < state_size; i++)
+      temp_rand_data[i] = RandInt(128, RAND_MAX);
+    int32 state_size_in_bytes = state_size * sizeof(uint32);
+    *tgt = static_cast<uint32*>(device.Malloc(state_size_in_bytes));
+    CU_SAFE_CALL(cudaMemcpy(*tgt, &(temp_rand_data[0]),
+                            state_size_in_bytes, cudaMemcpyHostToDevice));
  }
+#endif
+}
+
+template<class Real>
+CuRand<Real>::~CuRand() {
+  SeedBuffer(0, &z1_);
+  SeedBuffer(0, &z2_);
+  SeedBuffer(0, &z3_);
+  SeedBuffer(0, &z4_);
 }



-template<typename Real> void CuRand<Real>::RandUniform(CuMatrix<Real> *tgt) {
-  #if HAVE_CUDA==1 
+template<typename Real> void CuRand<Real>::RandUniform(CuMatrixBase<Real> *tgt) {
+#if HAVE_CUDA == 1 
  if (CuDevice::Instantiate().Enabled()) { 
    Timer tim;

-    int32 tgt_size = tgt->NumRows()*tgt->Stride();
+    int32 tgt_size = tgt->NumRows() * tgt->Stride();
    if (tgt_size != state_size_) SeedGpu(tgt_size);

-    dim3 dimBlock(CUBLOCK, CUBLOCK);
-    dim3 dimGrid(n_blocks(tgt->num_cols_, CUBLOCK), n_blocks(tgt->num_rows_, CUBLOCK));
+    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
+    dim3 dimGrid(n_blocks(tgt->num_cols_, CU2DBLOCK), n_blocks(tgt->num_rows_, CU2DBLOCK));

    cuda_rand(dimGrid, dimBlock, tgt->data_, z1_, z2_, z3_, z4_, tgt->Dim());
-    cuSafeCall(cudaGetLastError());
+    CU_SAFE_CALL(cudaGetLastError());
  
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
-  #endif
+#endif
  {
-    for(int32 r=0; r<tgt->NumRows(); r++) {
-      for(int32 c=0; c<tgt->num_cols_; c++) {
-        tgt->Mat()(r, c) = kaldi::RandUniform();
-      }
-    }
+    tgt->SetRandUniform();
  }
 }



-template<typename Real> void CuRand<Real>::RandGaussian(CuMatrix<Real> *tgt) {
-  #if HAVE_CUDA==1 
+template<typename Real> void CuRand<Real>::RandGaussian(CuMatrixBase<Real> *tgt) {
+#if HAVE_CUDA == 1 
  if (CuDevice::Instantiate().Enabled()) { 
    Timer tim;
-
-    int32 tgt_size = tgt->NumRows()*tgt->Stride();
-    if (tgt_size != state_size_) SeedGpu(tgt_size);
-
-    dim3 dimBlock(CUBLOCK, CUBLOCK);
-    dim3 dimGrid(n_blocks(tgt->num_cols_, CUBLOCK), n_blocks(tgt->NumRows(), CUBLOCK));
-
+    int32 tgt_size = tgt->NumRows() * tgt->Stride();
+    if (tgt_size == 0)
+      return;
+    if (tgt_size > state_size_) SeedGpu(tgt_size);
+    
+    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
+    dim3 dimGrid(n_blocks(tgt->num_cols_, CU2DBLOCK), n_blocks(tgt->num_rows_, CU2DBLOCK));
+    
    cuda_gauss_rand(dimGrid, dimBlock, tgt->data_, z1_, z2_, z3_, z4_, tgt->Dim());
-    cuSafeCall(cudaGetLastError());
+    CU_SAFE_CALL(cudaGetLastError());
  
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
-  #endif
+#endif
  {
-    for(int32 r=0; r<tgt->NumRows(); r++) {
-      for(int32 c=0; c<tgt->num_cols_; c++) {
-        tgt->Mat()(r, c) = RandGauss();
-      }
-    }
+    tgt->SetRandn();
  }
 }


+/// Fills the vector *tgt with random values: through the
+/// cuda_vec_gauss_rand kernel when the device is enabled, and through
+/// Vec().SetRandn() otherwise.  An empty vector is left untouched.
+template<typename Real> void CuRand<Real>::RandGaussian(CuVectorBase<Real> *tgt) {
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    Timer tim;
+
+    int32 tgt_size = tgt->Dim();
+    // Nothing to generate for an empty vector; a zero-sized grid would also
+    // not be a valid launch configuration.  The matrix overload of
+    // RandGaussian returns early in the same way.
+    if (tgt_size == 0)
+      return;
+    // Re-seed the generator state buffers whenever the size differs.
+    if (tgt_size != state_size_) SeedGpu(tgt_size);
+
+    int dimBlock(CU1DBLOCK);
+    int dimGrid(n_blocks(tgt->Dim(), CU1DBLOCK));
+
+    cuda_vec_gauss_rand(dimGrid, dimBlock, tgt->Data(), z1_, z2_, z3_, z4_, tgt->Dim());
+
+    CU_SAFE_CALL(cudaGetLastError());
+    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
+
+  } else
+#endif
+  {
+    tgt->Vec().SetRandn();
+  }
+}
+

 template<typename Real> void CuRand<Real>::BinarizeProbs(const CuMatrix<Real> &probs, CuMatrix<Real> *states) {
-  #if HAVE_CUDA==1 
+#if HAVE_CUDA == 1 
  if (CuDevice::Instantiate().Enabled()) { 
    Timer tim;

@ -156,15 +159,15 @@ template<typename Real> void CuRand<Real>::BinarizeProbs(const CuMatrix<Real> &p
    RandUniform(&tmp_);

    // use the uniform random numbers to compute discrete 0/1 states
-    dim3 dimBlock(CUBLOCK, CUBLOCK);
-    dim3 dimGrid(n_blocks(states->num_cols_, CUBLOCK), n_blocks(states->num_rows_, CUBLOCK));
+    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
+    dim3 dimGrid(n_blocks(states->num_cols_, CU2DBLOCK), n_blocks(states->num_rows_, CU2DBLOCK));

    cuda_binarize_probs(dimGrid, dimBlock, states->data_, probs.data_, tmp_.data_, states->Dim());
-    cuSafeCall(cudaGetLastError());
+    CU_SAFE_CALL(cudaGetLastError());
  
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
-  #endif
+#endif
  {
    for(int32 r=0; r<states->num_rows_; r++) {
      for(int32 c=0; c<states->num_cols_; c++) {
@ -182,10 +185,12 @@ template<typename Real> void CuRand<Real>::AddGaussNoise(CuMatrix<Real> *tgt, Re
  tgt->AddMat(gscale, tmp_, 1.0);
 }

-
+// Instantiate the class for float and double.
+template class CuRand<float>;
+template class CuRand<double>;

 } // namespace

-#endif
+


--- a/src/cudamatrix/cu-rand.h
+++ b/src/cudamatrix/cu-rand.h
@ -24,7 +24,7 @@


 #include "cudamatrix/cu-matrix.h"
-
+#include "base/kaldi-math.h"

 namespace kaldi {

@ -33,25 +33,18 @@ template<typename Real>
 class CuRand {
 public:

-  CuRand()
-   : z1_(NULL), z2_(NULL), z3_(NULL), z4_(NULL), state_size_(0),
-     host_(NULL), host_size_(0)
-  { }
-
-  ~CuRand() {
-#if HAVE_CUDA == 1
-    cudaFree(z1_); cudaFree(z2_); cudaFree(z3_); cudaFree(z4_);
-#endif
-    delete[] host_;
-  }
+  CuRand(): z1_(NULL), z2_(NULL), z3_(NULL), z4_(NULL), state_size_(0) { }

+  ~CuRand();
+  
  /// on demand seeding of all the buffers
  void SeedGpu(MatrixIndexT state_size);

-  /// fill with uniform random numbers (0.0-1.0)
-  void RandUniform(CuMatrix<Real> *tgt);
+  /// fill with numbers drawn from uniform distribution on [0, 1]
+  void RandUniform(CuMatrixBase<Real> *tgt);
  /// fill with normal random numbers
-  void RandGaussian(CuMatrix<Real> *tgt);
+  void RandGaussian(CuMatrixBase<Real> *tgt);
+  void RandGaussian(CuVectorBase<Real> *tgt);

  /// align probabilities to discrete 0/1 states (use uniform samplig)
  void BinarizeProbs(const CuMatrix<Real> &probs, CuMatrix<Real> *states);
@ -59,8 +52,9 @@ class CuRand {
  void AddGaussNoise(CuMatrix<Real> *tgt, Real gscale = 1.0);

 private:
-  /// seed one buffer
-  void SeedBuffer(uint32* *tgt, MatrixIndexT state_size);
+  /// seed one buffer on the GPU.  If state_size == 0, just frees any
+  /// existing buffers.
+  void SeedBuffer(MatrixIndexT state_size, uint32 **tgt);
   
 private:

@ -75,19 +69,13 @@ class CuRand {
  /// Inner state of the ``grid-like'' random number generator
  uint32 *z1_, *z2_, *z3_, *z4_; 
  int32 state_size_; ///< size of the buffers
-
-  uint32 *host_; ///< host bufer, used for initializing
-  int32 host_size_; ///< size of the host buffer
-
+  
  CuMatrix<Real> tmp_; ///< auxiliary matrix
 };


-
 } // namsepace

-#include "cudamatrix/cu-rand-inl.h"
-
 #endif


--- a/src/cudamatrix/cu-randkernels-ansi.h
+++ b/src/cudamatrix/cu-randkernels-ansi.h
@ -25,7 +25,7 @@
 #include "cudamatrix/cu-matrixdim.h"
 #include "cudamatrix/cu-kernels-ansi.h"

-#if HAVE_CUDA==1
+#if HAVE_CUDA == 1

 extern "C" {

@ -34,6 +34,7 @@ extern "C" {
 */
 void cudaF_rand(dim3 Gr, dim3 Bl, float *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d);
 void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d);
+void cudaF_vec_gauss_rand(int Gr, int Bl, float *v, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, int dim);
 void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float *states, const float *probs, float *rand, MatrixDim d);

 /*********************************************************
@ -41,6 +42,7 @@ void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float *states, const float *probs, f
 */
 void cudaD_rand(dim3 Gr, dim3 Bl, double *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d);
 void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d);
+void cudaD_vec_gauss_rand(int Gr, int Bl, double *v, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, int dim);
 void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double *states, const double *probs, double *rand, MatrixDim d);

 }
--- a/src/cudamatrix/cu-randkernels.cu
+++ b/src/cudamatrix/cu-randkernels.cu
@ -1,6 +1,7 @@
 // cudamatrix/cu-randkernels.cu

 // Copyright 2012  Karel Vesely
+//           2013 Johns Hopkins University (author: Daniel Povey)

 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -108,6 +109,20 @@ static void _gauss_rand(Real* mat, uint32_cuda* z1, uint32_cuda* z2, uint32_cuda



+template<typename Real>  // Kernel: writes one BoxMuller<Real> value into each of the first `dim` elements of v.
+__global__
+static void _vec_gauss_rand(Real* v, uint32_cuda* z1, uint32_cuda* z2, uint32_cuda* z3, uint32_cuda* z4, int dim) {
+  int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;  // global index along x
+  if (blockIdx.y > 0)  // only blocks with blockIdx.y == 0 do any work
+     return;
+
+  if ( i < dim ) {  // threads past the end of v do nothing
+    v[i] = BoxMuller<Real>(z1[i],z2[i],z3[i],z4[i]);  // element i uses the i-th entry of each state buffer
+  }
+}
+
+
+
 template<typename Real>
 __global__
 static void _binarize_probs(Real* states, const Real* probs, const Real* rand, MatrixDim d) {
@ -136,6 +151,10 @@ void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, uint32_cuda* z1, uint32_cuda
  _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); 
 }

+void cudaF_vec_gauss_rand(int Gr, int Bl, float* v, uint32_cuda* z1, uint32_cuda* z2, uint32_cuda* z3, uint32_cuda* z4, int dim) {
+  _vec_gauss_rand<<<Gr,Bl>>>(v,z1,z2,z3,z4,dim);  // float launch: Gr blocks of Bl threads
+}
+
 void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d) { 
  _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d); 
 }
@ -153,6 +172,10 @@ void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, uint32_cuda* z1, uint32_cud
  _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); 
 }

+void cudaD_vec_gauss_rand(int Gr, int Bl, double* v, uint32_cuda* z1, uint32_cuda* z2, uint32_cuda* z3, uint32_cuda* z4, int dim) {
+  _vec_gauss_rand<<<Gr,Bl>>>(v,z1,z2,z3,z4,dim);  // double launch: Gr blocks of Bl threads
+}
+
 void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d) { 
  _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d); 
 }
--- a/src/cudamatrix/cu-randkernels.h
+++ b/src/cudamatrix/cu-randkernels.h
@ -22,7 +22,7 @@
 #ifndef KALDI_CUDAMATRIX_CU_RANDKERNELS_H_
 #define KALDI_CUDAMATRIX_CU_RANDKERNELS_H_

-#if HAVE_CUDA==1
+#if HAVE_CUDA == 1

 #include "base/kaldi-error.h"
 #include "cudamatrix/cu-randkernels-ansi.h"
@ -38,6 +38,7 @@ namespace kaldi {
 */
 template<typename Real> inline void cuda_rand(dim3 Gr, dim3 Bl, Real *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
 template<typename Real> inline void cuda_gauss_rand(dim3 Gr, dim3 Bl, Real *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
+template<typename Real> inline void cuda_vec_gauss_rand(int Gr, int Bl, Real *v, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, int dim) { KALDI_ERR << __func__ << " Not implemented!"; }  // generic fallback, used for any Real without a specialization
 template<typename Real> inline void cuda_binarize_probs(dim3 Gr, dim3 Bl, Real *states, const Real *probs, Real *rand, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }

 /*********************************************************
@ -45,6 +46,7 @@ template<typename Real> inline void cuda_binarize_probs(dim3 Gr, dim3 Bl, Real *
 */
 template<> inline void cuda_rand<float>(dim3 Gr, dim3 Bl, float *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d) { cudaF_rand(Gr,Bl,mat,z1,z2,z3,z4,d); }
 template<> inline void cuda_gauss_rand<float>(dim3 Gr, dim3 Bl, float *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d) { cudaF_gauss_rand(Gr,Bl,mat,z1,z2,z3,z4,d); } 
+template<> inline void cuda_vec_gauss_rand<float>(int Gr, int Bl, float *v, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, int dim) { cudaF_vec_gauss_rand(Gr,Bl,v,z1,z2,z3,z4,dim); }  // float: forwards to the C wrapper cudaF_vec_gauss_rand
 template<> inline void cuda_binarize_probs<float>(dim3 Gr, dim3 Bl, float *states, const float *probs, float *rand, MatrixDim d) { cudaF_binarize_probs(Gr,Bl,states,probs,rand,d); } 

 /*********************************************************
@ -52,6 +54,7 @@ template<> inline void cuda_binarize_probs<float>(dim3 Gr, dim3 Bl, float *state
 */
 template<> inline void cuda_rand<double>(dim3 Gr, dim3 Bl, double *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d) { cudaD_rand(Gr,Bl,mat,z1,z2,z3,z4,d); }
 template<> inline void cuda_gauss_rand<double>(dim3 Gr, dim3 Bl, double *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d) { cudaD_gauss_rand(Gr,Bl,mat,z1,z2,z3,z4,d); } 
+template<> inline void cuda_vec_gauss_rand<double>(int Gr, int Bl, double *v, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, int dim) { cudaD_vec_gauss_rand(Gr,Bl,v,z1,z2,z3,z4,dim); }  // double: forwards to the C wrapper cudaD_vec_gauss_rand
 template<> inline void cuda_binarize_probs<double>(dim3 Gr, dim3 Bl, double *states, const double *probs, double *rand, MatrixDim d) { cudaD_binarize_probs(Gr,Bl,states,probs,rand,d); } 

 } // namespace
--- a/src/cudamatrix/cu-sp-matrix-speed-test.cc
+++ b/src/cudamatrix/cu-sp-matrix-speed-test.cc
@ -0,0 +1,187 @@
+// cudamatrix/cu-sp-matrix-speed-test.cc
+
+// Copyright 2013  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "cudamatrix/cu-matrix.h"
+#include "cudamatrix/cu-vector.h"
+#include "cudamatrix/cu-math.h"
+#include "cudamatrix/cu-sp-matrix.h"
+
+using namespace kaldi;
+
+
+namespace kaldi {
+
+template<typename Real>  // Returns "<double>" when sizeof(Real) is 8 and "<float>" otherwise; used to label log lines.
+std::string NameOf() {
+  return (sizeof(Real) == 8 ? "<double>" : "<float>");
+}
+
+template<typename Real>  // Times CuSpMatrix::Invert for one dimension; the first iteration also cross-checks results.
+static void UnitTestCuSpMatrixInvert(int32 dim) {
+  BaseFloat time_in_secs = 0.5;  // keep iterating until this much time has elapsed
+  int32 iter = 0;
+  Timer tim;
+  CuSpMatrix<Real> A(dim);
+  A.SetRandn();
+  for (;tim.Elapsed() < time_in_secs; iter++) {
+    KALDI_ASSERT(A.Trace() != 0.0); // true with probability 1...
+    CuSpMatrix<Real> B(A);  // fresh copy of A each iteration
+
+    if (iter  > 0) {
+      B.Invert();  // the operation being timed
+    } else { // first iteration only: update A and B identically, invert both, and compare
+    
+      CuMatrix<Real> D(A);
+      A.AddMat2(1.0, D, kTrans, 1.0);
+      A.AddToDiag(0.1 * dim);
+
+      CuMatrix<Real> C(B);
+      B.AddMat2(1.0, C, kTrans, 1.0);
+      B.AddToDiag(0.1 * dim);
+    
+      A.Invert();  // A keeps this inverted value for all later iterations
+      B.Invert();
+    
+      SpMatrix<Real> E(dim);
+      B.CopyToSp(&E);
+
+      SpMatrix<Real> A2(A);
+      AssertEqual(A2, E);
+    }
+  }
+  BaseFloat fdim = dim;
+  BaseFloat gflops = (fdim * fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);  // counts dim^3 operations per iteration
+  KALDI_LOG << "For CuSpMatrix::Invert" << NameOf<Real>() << ", for dim = "
+            << dim << ", speed was " << gflops << " gigaflops.";
+}
+
+
+
+// Times CuSpMatrix::CopyFromMat(A, copy_type) on a dim x dim CuMatrix and logs the measured rate.
+template<typename Real>
+static void UnitTestCuSpMatrixCopyFromMat(int32 dim, SpCopyType copy_type) {
+  BaseFloat time_in_secs = 0.1;  // keep iterating until this much time has elapsed
+  int32 iter = 0;
+  Timer tim;
+  CuMatrix<Real> A(dim, dim);
+  CuSpMatrix<Real> S(dim);
+  for (;tim.Elapsed() < time_in_secs; iter++) {
+    S.CopyFromMat(A, copy_type);
+  }
+  BaseFloat fdim = dim;
+  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);  // counts dim^2 elements per iteration
+  KALDI_LOG << "For CuSpMatrix::CopyFromMat" << NameOf<Real>()
+            << ", with copy-type "
+            << (copy_type == kTakeLower ? "kTakeLower" :
+                (copy_type == kTakeUpper ? "kTakeUpper" :
+                 (copy_type == kTakeMean ? "kTakeMean" : "kTakeMeanAndCheck")))
+            << " and dim = " << dim << ", speed was " << gflops << " gigaflops.";
+}
+
+
+
+template<typename Real>  // Times CuSpMatrix::InvertPosDefApprox(0.1) on a matrix built with a controlled eigenvalue range.
+static void UnitTestCuMatrixApproxInvert(int32 dim) {
+  BaseFloat time_in_secs = 0.5;  // keep iterating until this much time has elapsed
+  int32 iter = 0;
+
+  // Get random orthogonal matrix: random rows, each orthogonalized against the earlier rows and normalized.
+  Matrix<Real> Q_cpu(dim, dim);
+
+  Q_cpu.SetRandn();
+  for (int32 r = 0; r < dim; r++) {
+    for (int32 s = 0; s < r; s++)
+      Q_cpu.Row(r).AddVec(-1.0 * VecVec(Q_cpu.Row(r), Q_cpu.Row(s)), Q_cpu.Row(s));
+    Q_cpu.Row(r).Scale(1.0 / Q_cpu.Row(r).Norm(2.0));
+  }
+  CuMatrix<Real> Q(Q_cpu);
+  
+  CuVector<Real> s(dim);
+  Real eig_range = 50.0; // factor of 50 on eigenvalues.. this affects the speed.
+  Real first_eig = 0.001 + RandUniform() * 5.0;
+  for (int32 r = 0; r < dim; r++)
+    s(r) = first_eig * exp(r * log(eig_range) / dim);  // geometric spacing, starting at first_eig
+
+  s.ApplyPow(0.5);  // square roots, applied to the columns of Q below
+  Q.MulColsVec(s);
+  CuSpMatrix<Real> A(dim);
+  A.AddMat2(1.0, Q, kNoTrans, 0.0);  // presumably A = Q * Q^T -- confirm against AddMat2
+
+
+  Timer tim;  // started only after setup, so construction of A is not timed
+  
+  for (;tim.Elapsed() < time_in_secs; iter++) {  
+    CuSpMatrix<Real> Atmp(A);
+    Atmp.InvertPosDefApprox(0.1);
+  }
+  BaseFloat fdim = dim;
+  BaseFloat gflops = (fdim * fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);  // counts dim^3 operations per iteration
+  KALDI_LOG << "For CuSpMatrix::InvertPosDefApprox" << NameOf<Real>() << ", for dim = "
+            << dim << ", speed was " << gflops << " gigaflops.";
+}  
+
+template<typename Real> void CuSpMatrixSpeedTest() {  // Runs every speed test in this file at each size below.
+  std::vector<int32> sizes;
+  sizes.push_back(16);
+  sizes.push_back(128);
+  sizes.push_back(256);
+  sizes.push_back(1024);
+  int32 ns = sizes.size();
+
+  for (int32 s = 0; s < ns; s++) {
+    UnitTestCuSpMatrixInvert<Real>(sizes[s]);
+    UnitTestCuMatrixApproxInvert<Real>(sizes[s]);
+    UnitTestCuSpMatrixCopyFromMat<Real>(sizes[s], kTakeLower);
+    UnitTestCuSpMatrixCopyFromMat<Real>(sizes[s], kTakeUpper);
+    UnitTestCuSpMatrixCopyFromMat<Real>(sizes[s], kTakeMean);
+  }
+}
+
+
+} // namespace kaldi
+
+
+int main() {  // Runs the float speed tests, then the double ones when double precision is available.
+    //Select the GPU
+#if HAVE_CUDA == 1
+    CuDevice::Instantiate().SelectGpuId("yes"); // "yes": use a GPU (the argument is a string, not a numeric id)
+#endif
+
+    kaldi::CuSpMatrixSpeedTest<float>();
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().DoublePrecisionSupported()) {
+    kaldi::CuSpMatrixSpeedTest<double>();
+  } else {
+    KALDI_WARN << "Double precision not supported";
+  }
+#else
+  kaldi::CuSpMatrixSpeedTest<double>();  // built without CUDA: no device restriction to check
+#endif
+#if HAVE_CUDA == 1
+  CuDevice::Instantiate().PrintProfile();
+#endif
+  std::cout << "Tests succeeded.\n";
+}
--- a/src/cudamatrix/cu-sp-matrix-test.cc
+++ b/src/cudamatrix/cu-sp-matrix-test.cc
@ -0,0 +1,437 @@
+// cudamatrix/cu-sp-matrix-test.cc
+//
+// Copyright 2013  Ehsan Variani
+//                 Lucas Ondel
+//                 Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//  http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+//
+//
+// UnitTests for testing cu-sp-matrix.h methods.
+//
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+
+#include "base/kaldi-common.h"
+#include "cudamatrix/cu-device.h"
+#include "cudamatrix/cu-sp-matrix.h"
+#include "cudamatrix/cu-vector.h"
+#include "cudamatrix/cu-math.h"
+
+using namespace kaldi;
+
+namespace kaldi {
+
+/*
+ * Unit Tests
+ */
+template<typename Real>  // Checks CuSpMatrix built from a CuMatrix (kTakeLower) against the equivalent CPU SpMatrix.
+static void UnitTestCuSpMatrixConstructor() { 
+  for (MatrixIndexT i = 1; i < 10; i++) {
+    MatrixIndexT dim = 10 * i;  // dims 10, 20, ..., 90
+
+    Matrix<Real> A(dim, dim);
+    A.SetRandn();
+    SpMatrix<Real> B(A, kTakeLower);  // CPU reference
+
+    CuMatrix<Real> C(A);
+    CuSpMatrix<Real> D(C, kTakeLower);  // same construction through the Cu classes
+
+    SpMatrix<Real> E(dim);
+    D.CopyToSp(&E);
+
+    SpMatrix<Real> F(D);
+    
+    AssertEqual(F, B);
+     // added by hxu: F above exercises constructing an SpMatrix from a CuSpMatrix
+
+    AssertEqual(B, E);
+
+    KALDI_ASSERT(!B.IsUnit());  // sanity-check IsUnit() on the CPU matrix before and after resetting it
+    B.SetZero();
+    B.SetDiag(1.0);
+    KALDI_ASSERT(B.IsUnit());
+  }
+}
+
+template<typename Real>  // Checks SpMatrix::ApproxEqual against an explicit norm test; only CPU SpMatrix objects are used.
+static void UnitTestCuSpMatrixApproxEqual() {
+
+  for (int32 i = 0; i < 10; i++) {
+    int32 dim = 1 + rand() % 10;  // dims 1..10
+    SpMatrix<Real> A(dim), B(dim);
+    A.SetRandn();
+    B.SetRandn();
+    BaseFloat threshold = 0.01;
+    for (int32 j = 0; j < 20; j++, threshold *= 1.3) {  // 20 thresholds, each 1.3x the previous
+      bool b1 = A.ApproxEqual(B, threshold);
+      SpMatrix<Real> diff(A);
+      diff.AddSp(-1.0, B);  // diff = A - B
+      bool b2 = (diff.FrobeniusNorm() < threshold * std::max(A.FrobeniusNorm(),
+                                                             B.FrobeniusNorm()));
+      KALDI_ASSERT(b1 == b2);
+    }
+  }
+  
+}
+
+
+
+template<typename Real>  // Checks element access B(i, j) on a CuSpMatrix against the SpMatrix it was copied from.
+static void UnitTestCuSpMatrixOperator() {
+  SpMatrix<Real> A(100);
+  A.SetRandn();
+
+  CuSpMatrix<Real> B(100);
+  B.CopyFromSp(A);
+
+  for (MatrixIndexT i = 0; i < A.NumRows(); i++) {
+    for (MatrixIndexT j = 0; j <= i; j++)  // lower triangle, diagonal included
+      KALDI_ASSERT(std::abs(A(i, j) - B(i, j)) < 0.0001);
+  }
+}
+
+template<typename Real>  // Checks CuSpMatrix::AddToDiag against SpMatrix::AddToDiag.
+static void UnitTestCuSpMatrixAddToDiag() {
+  for (MatrixIndexT i = 1; i < 10; i++) {
+    MatrixIndexT dim = 10*i;
+    SpMatrix<Real> A(dim);
+    A.SetRandn();
+    CuSpMatrix<Real> B(A);
+    
+    Matrix<Real> D(A);  // not used after construction
+    A.AddToDiag(i);
+
+    CuMatrix<Real> C(B);  // not used after construction
+    B.AddToDiag(i);
+    
+    SpMatrix<Real> E(dim);
+    B.CopyToSp(&E);
+    
+    AssertEqual(A, E);    
+  }
+}
+
+
+template<typename Real>  // Checks CuSpMatrix(CuMatrix, copy_type) against SpMatrix(Matrix, copy_type) for three copy types.
+static void UnitTestCuSpMatrixCopyFromMat() {
+  for (MatrixIndexT i = 1; i < 10; i++) {
+    SpCopyType copy_type = (i % 3 == 0 ? kTakeMean :
+                            (i % 3 == 1 ? kTakeLower : kTakeUpper));  // cycle through the copy types
+    MatrixIndexT dim = 10 * i + rand() % 5;
+    CuMatrix<Real> A(dim, dim);
+    A.SetRandn();
+    Matrix<Real> A2(A);  // CPU copy of the same data
+    
+    CuSpMatrix<Real> B(A, copy_type);
+    SpMatrix<Real> B2(A2, copy_type);
+    SpMatrix<Real> B3(B);  // Cu result brought back for comparison with B2
+    if (!ApproxEqual(B2, B3) ) {
+      KALDI_ERR << "Matrices differ, A = " << A << ", B2 = " << B2 << ", B3(CUDA) = " << B3;
+    }
+    KALDI_ASSERT(B3.Trace() != 0);
+  }
+}
+
+template<typename Real>  // Checks that InvertPosDefApprox(max_error) times the original matrix passes IsUnit(max_error).
+static void UnitTestCuSpMatrixApproxInvert(int32 dim) {
+  // Get random orthogonal matrix: random rows, each orthogonalized against the earlier rows and normalized.
+  CuMatrix<Real> Q(dim, dim);
+
+  Q.SetRandn();
+  for (int32 r = 0; r < dim; r++) {
+    for (int32 s = 0; s < r; s++)
+      Q.Row(r).AddVec(-1.0 * VecVec(Q.Row(r), Q.Row(s)), Q.Row(s));
+    Q.Row(r).Scale(1.0 / Q.Row(r).Norm(2.0));
+  }
+  
+  CuVector<Real> s(dim); // eigenvalues span a factor of eig_range, evenly spaced in log.
+  Real eig_range = 50.0;
+  Real first_eig = 0.001 + RandUniform() * 5.0;
+  for (int32 r = 0; r < dim; r++)
+    s(r) = first_eig * exp(r * log(eig_range) / dim);
+
+  s.ApplyPow(0.5);  // square roots, applied to the columns of Q below
+  Q.MulColsVec(s);
+  CuSpMatrix<Real> A(dim);
+  A.AddMat2(1.0, Q, kNoTrans, 0.0);  // presumably A = Q * Q^T -- confirm against AddMat2
+  CuMatrix<Real> A_orig(A);  // keep the un-inverted matrix for the product check
+
+  BaseFloat max_error = 0.1;
+  A.InvertPosDefApprox(max_error);
+
+
+  CuMatrix<Real> prod(dim, dim);
+  prod.AddSpMat(1.0, A, A_orig, kNoTrans, 0.0);  // approximate inverse times the original
+  KALDI_ASSERT(prod.IsUnit(max_error));  
+}  
+
+template<typename Real>  // Checks CuSpMatrix::Invert against SpMatrix::Invert and against the product with the original.
+static void UnitTestCuSpMatrixInvert() {
+  for (MatrixIndexT i = 1; i < 10; i++) {
+    MatrixIndexT dim = 10*i + rand() % 5;
+    CuSpMatrix<Real> A(dim);
+    A.SetRandn();
+    KALDI_ASSERT(A.Trace() != 0.0); // true with probability 1...
+    SpMatrix<Real> B(A);  // CPU copy that receives the same updates as A
+    
+    CuMatrix<Real> D(A);
+    A.AddMat2(1.0, D, kTrans, 1.0);
+    A.AddToDiag(i);
+
+    Matrix<Real> C(B);
+    B.AddMat2(1.0, C, kTrans, 1.0);
+    B.AddToDiag(i);
+
+    CuSpMatrix<Real> Acopy(A);  // the matrix before inversion
+    A.Invert();
+    B.Invert();
+    
+    SpMatrix<Real> A2(A);
+    AssertEqual(A2, B);
+
+    CuMatrix<Real> I(dim, dim);
+    I.AddMatMat(1.0, CuMatrix<Real>(Acopy), kNoTrans, CuMatrix<Real>(A), kNoTrans, 0.0);  // original times inverse
+    KALDI_ASSERT(I.IsUnit(0.01));
+  }
+}
+
+// TODO (variani) : fails for dim = 0 
+template<typename Real>  // Checks CuSpMatrix::AddVec2 against SpMatrix::AddVec2 on random inputs.
+static void UnitTestCuSpMatrixAddVec2() {
+  for (int32 i = 0; i < 50; i++) {
+    MatrixIndexT dim = 1 + rand() % 200;  // dims 1..200, never 0
+    SpMatrix<Real> A(dim);
+    A.SetRandn();
+    CuSpMatrix<Real> B(A);
+    
+    Vector<Real> C(dim);
+    C.SetRandn();
+    CuVector<Real> D(C);
+    Real alpha = RandGauss();
+
+    A.AddVec2(alpha, C);
+    B.AddVec2(alpha, D);
+
+    SpMatrix<Real> E(dim);
+    B.CopyToSp(&E);
+
+    AssertEqual(A, E);
+  }
+}
+
+template<typename Real>  // Checks CuSpMatrix::AddMat2 (kTrans) against SpMatrix::AddMat2 on non-square inputs.
+static void UnitTestCuSpMatrixAddMat2() {
+  for (MatrixIndexT i = 1; i < 10; i++) {
+    MatrixIndexT dim_row = 15 * i + rand() % 10;
+    MatrixIndexT dim_col = 7 *i + rand() % 10;
+    Matrix<Real> A(dim_row, dim_col);
+    A.SetRandn();
+    CuMatrix<Real> B(A);
+
+    SpMatrix<Real> C(dim_col);  // the updated matrices have dimension dim_col
+    C.SetRandn();
+    CuSpMatrix<Real> D(C);
+
+    const Real alpha = 2.0;
+    const Real beta = 3.0;
+
+    C.AddMat2(alpha, A, kTrans, beta);
+    D.AddMat2(alpha, B, kTrans, beta);
+
+    SpMatrix<Real> E(dim_col);
+    D.CopyToSp(&E);
+
+    AssertEqual(C, E);
+  }
+}
+
+template<typename Real>  // Checks CuSpMatrix::AddSp against SpMatrix::AddSp.
+static void UnitTestCuSpMatrixAddSp() {
+  for (MatrixIndexT i = 1; i < 50; i++) {
+    MatrixIndexT dim = 7 * i + rand() % 10;
+    
+    SpMatrix<Real> A(dim);
+    A.SetRandn();
+    CuSpMatrix<Real> B(A);
+
+    SpMatrix<Real> C(dim);
+    C.SetRandn();
+    const CuSpMatrix<Real> D(C);
+
+    const Real alpha = 2.0;
+    
+    A.AddSp(alpha, C);
+    B.AddSp(alpha, D);
+
+    SpMatrix<Real> E(dim);
+    B.CopyToSp(&E);  // bring the Cu result back for comparison
+
+    AssertEqual(A, E);
+  }
+}
+
+template<typename Real, typename OtherReal>  // Checks TraceSpSp on CuSpMatrix operands against the SpMatrix result.
+static void UnitTestCuSpMatrixTraceSpSp() {
+  for (MatrixIndexT i = 1; i < 2; i++) {  // a single iteration
+    MatrixIndexT dim = 100 + rand() % 255;
+    
+    SpMatrix<Real> A(dim);
+    A.SetRandn();
+    const CuSpMatrix<Real> B(A);
+    SpMatrix<OtherReal> C(dim);
+    C.SetRandn();
+    const CuSpMatrix<OtherReal> D(C);
+
+    Real t1 = TraceSpSp(A, C), t2 = TraceSpSp(B, D);  // t1: SpMatrix operands, t2: CuSpMatrix operands
+    KALDI_ASSERT(ApproxEqual(t1, t2));
+  }
+}
+
+
+template<typename Real>  // Checks three ways of producing a unit CuSpMatrix against a CPU SpMatrix::SetUnit().
+void UnitTestCuSpMatrixSetUnit() {
+  for (MatrixIndexT i = 1; i < 10; i++) {
+    MatrixIndexT dim = 100 * i + rand() % 255;
+    if (i % 5 == 0) dim = 0;  // also cover the empty matrix
+    CuSpMatrix<Real> S1(dim), S2(dim), S4(dim);
+    S1.SetRandn();
+    S2.SetRandn();
+    S4.SetRandn();
+    SpMatrix<Real> S3(dim);
+    S3.SetUnit();  // CPU reference
+    S1.SetUnit();  // way 1: SetUnit()
+    S2.SetZero();  // way 2: SetZero() then SetDiag(1.0)
+    S2.SetDiag(1.0);
+    S4.SetZero();  // way 3: SetZero() then AddToDiag twice (0.4 + 0.6)
+    S4.AddToDiag(0.4);
+    S4.AddToDiag(0.6);
+    CuSpMatrix<Real> cu_S3(S3);
+    KALDI_LOG << "S1 norm is " << S1.FrobeniusNorm();
+    KALDI_LOG << "S2 norm is " << S2.FrobeniusNorm();
+    KALDI_LOG << "S3 norm is " << S3.FrobeniusNorm();
+    AssertEqual(S1, cu_S3);
+    AssertEqual(S2, cu_S3);
+    AssertEqual(S4, cu_S3);
+  }
+}
+   
+template<class Real>  // Checks that a CuSpMatrix written by Write() is reproduced by Read(), in both modes.
+static void UnitTestCuSpMatrixIO() {
+  for (int32 i = 0; i < 10; i++) {
+    int32 dimM = rand() % 255;
+    if (i % 5 == 0) { dimM = 0; }  // also cover the empty matrix
+    CuSpMatrix<Real> mat(dimM);
+    mat.SetRandn();
+    std::ostringstream os;
+    bool binary = (i % 4 < 2);  // binary flag is true, true, false, false, repeating
+    mat.Write(os, binary);
+
+    CuSpMatrix<Real> mat2;
+    std::istringstream is(os.str());
+    mat2.Read(is, binary);
+    AssertEqual(mat, mat2);
+  }
+}
+
+
+
+
+// Mixed-precision AddSp check: adds an OtherReal matrix to a Real one on both the CPU and Cu classes.
+// NOTE(review): the CudaSpMatrixUnitTest drivers in this file do not call this overload.
+template<typename Real, typename OtherReal>
+static void UnitTestCuSpMatrixAddSp() {
+  for (MatrixIndexT i = 1; i < 10; i++) {
+    MatrixIndexT dim = 100 * i + rand() % 255;
+    
+    SpMatrix<Real> A(dim);
+    A.SetRandn();
+    CuSpMatrix<Real> B(A);  // non-const: AddSp below modifies B
+    SpMatrix<OtherReal> C(dim);
+    C.SetRandn();
+    const CuSpMatrix<OtherReal> D(C);
+    A.AddSp(1.0, C);
+    B.AddSp(1.0, D);
+    
+    AssertEqual(A, B);
+  }
+}
+
+template<typename Real> void CudaSpMatrixUnitTest() {  // Runs all single-precision-type tests for Real.
+  UnitTestCuSpMatrixIO<Real>();
+  UnitTestCuSpMatrixConstructor<Real>();
+  UnitTestCuSpMatrixOperator<Real>();
+  UnitTestCuSpMatrixApproxEqual<Real>();
+  UnitTestCuSpMatrixInvert<Real>();
+  UnitTestCuSpMatrixApproxInvert<Real>(300);  // approximate inversion at three sizes
+  UnitTestCuSpMatrixApproxInvert<Real>(100);
+  UnitTestCuSpMatrixApproxInvert<Real>(10);
+  UnitTestCuSpMatrixCopyFromMat<Real>();
+  UnitTestCuSpMatrixAddVec2<Real>();
+  UnitTestCuSpMatrixAddMat2<Real>();
+  UnitTestCuSpMatrixAddSp<Real>();
+  UnitTestCuSpMatrixAddToDiag<Real>();
+  UnitTestCuSpMatrixSetUnit<Real>();
+}
+
+template<typename Real, typename OtherReal> void CudaSpMatrixUnitTest() {  // Runs the two-type tests; only TraceSpSp is called.
+  UnitTestCuSpMatrixTraceSpSp<Real, OtherReal>();
+
+}
+
+} // namespace kaldi
+
+
+// Runs the CuSpMatrix unit tests twice: first with GPU use disabled, then with it requested.
+int main() {
+  using namespace kaldi;
+  for (int32 loop = 0; loop < 2; loop++) {
+#if HAVE_CUDA == 1
+    if (loop == 0)
+      CuDevice::Instantiate().SelectGpuId("no"); // first pass: no GPU
+    else
+      CuDevice::Instantiate().SelectGpuId("yes"); // second pass: use a GPU
+#endif
+
+    kaldi::CudaSpMatrixUnitTest<float>();
+    kaldi::CudaSpMatrixUnitTest<float, float>();
+#if HAVE_CUDA == 1
+    if (CuDevice::Instantiate().DoublePrecisionSupported()) {
+      kaldi::CudaSpMatrixUnitTest<double>();
+      kaldi::CudaSpMatrixUnitTest<float, double>();
+      kaldi::CudaSpMatrixUnitTest<double, float>();
+      kaldi::CudaSpMatrixUnitTest<double, double>();
+    } else {
+      KALDI_WARN << "Double precision not supported";
+    }
+#else
+    kaldi::CudaSpMatrixUnitTest<double>();  // built without CUDA: run the double tests unconditionally
+    kaldi::CudaSpMatrixUnitTest<float, double>();
+    kaldi::CudaSpMatrixUnitTest<double, float>();
+    kaldi::CudaSpMatrixUnitTest<double, double>();
+#endif
+    if (loop == 0)
+      KALDI_LOG << "Tests without GPU use succeeded.\n";
+    else
+      KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
+  }
+#if HAVE_CUDA == 1
+  CuDevice::Instantiate().PrintProfile();
+#endif
+  return 0;
+}
--- a/src/cudamatrix/cu-sp-matrix.cc
+++ b/src/cudamatrix/cu-sp-matrix.cc
@ -0,0 +1,361 @@
+#if HAVE_CUDA == 1
+#include <cuda_runtime_api.h>
+#include <cublas.h>
+#endif
+
+#include "util/timer.h"
+#include "cudamatrix/cu-common.h"
+#include "cudamatrix/cu-vector.h"
+#include "cudamatrix/cu-device.h"
+#include "cudamatrix/cu-kernels.h"
+#include "cudamatrix/cu-math.h"
+#include "cudamatrix/cu-sp-matrix.h"
+#include "cudamatrix/cu-matrix.h"
+#include "cudamatrix/cublas-wrappers.h"
+
+namespace kaldi {
+
+/// Fills this packed symmetric matrix from the square matrix M.
+/// @param M          Source matrix; must be num_rows_ x num_rows_ (asserted).
+/// @param copy_type  Which part of M to take: kTakeLower, kTakeUpper or
+///                   kTakeMean.  kTakeMeanAndCheck is rejected on the GPU
+///                   path, and any other value is now reported with
+///                   KALDI_ERR.
+/// When no GPU is enabled the call is forwarded to the host SpMatrix.
+template<typename Real>
+void CuSpMatrix<Real>::CopyFromMat(const CuMatrixBase<Real> &M,
+                                   SpCopyType copy_type) {
+  KALDI_ASSERT(this->num_rows_ == M.NumRows() &&
+               this->num_rows_ == M.NumCols());
+  if (this->num_rows_ == 0)
+    return;
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    Timer tim;
+    MatrixIndexT D = this->NumRows();
+    if (D == 0)
+      return;
+    switch (copy_type) {
+      case kTakeMeanAndCheck:
+        // No break: KALDI_ERR is expected not to return (TODO confirm).
+        KALDI_ERR << "kTakeMeanAndCheck not supported!";
+      // The grid/block dimensions have been very roughly tuned for the
+      // individual cases.
+      case kTakeMean:
+        {
+          dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
+          dim3 dimGrid(n_blocks(D, CU2DBLOCK), n_blocks(D, CU2DBLOCK));
+          cuda_take_mean(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
+          CU_SAFE_CALL(cudaGetLastError());
+        }
+        break;
+      case kTakeLower:
+        {
+          dim3 dimBlock(1, CU1DBLOCK);
+          dim3 dimGrid(D, n_blocks(D, CU1DBLOCK));
+          cuda_take_lower(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
+          CU_SAFE_CALL(cudaGetLastError());
+          cudaThreadSynchronize();
+        }
+        break;
+      case kTakeUpper:
+        {
+          dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
+          dim3 dimGrid(n_blocks(D, CU2DBLOCK), n_blocks(D, CU2DBLOCK));
+          cuda_take_upper(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
+          CU_SAFE_CALL(cudaGetLastError());
+        }
+        break;
+      default:
+        // This used to be KALDI_ASSERT("..."), which asserts on a non-null
+        // string literal and therefore could never fire.
+        KALDI_ERR << "Invalid argument to CuSpMatrix::CopyFromMat";
+    }
+    CuDevice::Instantiate().AccuProfile("CuSpMatrix::CopyFromMat(from CuMatrixBase)", tim.Elapsed());
+  } else
+#endif
+  {
+    Mat().CopyFromMat(M.Mat(), copy_type);
+  }
+}
+
+/// Inverts this matrix in place.  The GPU path goes through
+/// CuMatrix::SymInvertPosDef(); the non-GPU path uses SpMatrix::Invert().
+template<typename Real>
+void CuSpMatrix<Real>::Invert() {
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    // Expand to a full square matrix, invert that, then pack the result back.
+    const MatrixIndexT dim = this->num_rows_;
+    CuMatrix<Real> full(dim, dim);
+    full.CopyFromSp(*this);
+    full.SymInvertPosDef();
+    this->CopyFromMat(full);
+  } else
+#endif
+  {
+    // No GPU in use: defer to the CPU-based SpMatrix inversion.
+    Mat().Invert();
+  }
+}
+
+
+
+/// Approximately inverts this matrix in place using only matrix
+/// multiplications.
+///
+/// The matrix is first rescaled so its trace equals its dimension.  A cubic
+/// polynomial X = v0 I + v1 A + v2 A^2 + v3 A^3 is then fitted to minimise
+/// ||AX - I||^2 (squared Frobenius norm), using traces of powers of A.  If
+/// that error exceeds max_error, X is refined with up to 10 iterations of
+/// X <- X (2I - AX).
+///
+/// @param max_error  Upper bound on the squared Frobenius norm ||AX - I||^2
+///                   of the result (for the prescaled A).
+/// Raises KALDI_ERR if the bound is not reached within the iteration limit;
+/// previously this case was silently ignored and *this was left holding a
+/// scratch value.
+template<typename Real>
+void CuSpMatrix<Real>::InvertPosDefApprox(BaseFloat max_error) {
+  if (this->num_rows_ == 0) return;
+  MatrixIndexT dim = this->num_rows_;
+  // temp holds five dim x dim blocks stacked vertically: A, AA, AAA, AAAA and
+  // one spare block that later holds X.
+  CuMatrix<Real> temp(dim * 5, dim);
+  CuSubMatrix<Real> A(temp, 0, dim, 0, dim),
+      AA(temp, dim, dim, 0, dim),
+      AAA(temp, 2 * dim, dim, 0, dim),
+      AAAA(temp, 3 * dim, dim, 0, dim);
+  Real prescale = dim / this->Trace();
+  this->Scale(prescale); // We'll compute the inverse of the prescaled A, and then
+                         // put that factor back later.  This is useful since we
+                         // deal with high powers of A that could get large or small.
+  A.CopyFromSp(*this);
+  // use *this as a temporary SpMatrix; we've stored its contents in "A".
+  this->AddMat2(1.0, A, kNoTrans, 0.0);
+  AA.CopyFromSp(*this);
+  { // now create AAA and AAAA using a single multiplication.
+    CuSubMatrix<Real> A_and_AA(temp, 0, dim * 2, 0, dim),
+        AAA_and_AAAA(temp, dim * 2, dim * 2, 0, dim);
+    // Note: below, the transpose-ness of AA is arbitrary since it's symmetric;
+    // I guess that transposed may be faster.
+    AAA_and_AAAA.AddMatMat(1.0, A_and_AA, kNoTrans, AA, kTrans, 0.0);
+  }
+
+  // Note: below, trace_A equals dim because of the prescaling, we
+  // ensured that.
+  Vector<double> trace(8); // trace(i) is trace(A^(i+1))
+  trace(0) = dim;
+  {
+    CuVector<Real> trace_vec(dim * 5);
+    CuSubVector<Real> trace_lower4(trace_vec, 0, dim * 4),
+        trace_lower3(trace_vec, 0, dim * 3),
+        trace1(trace_vec, 0, dim), trace2(trace_vec, dim, dim),
+        trace3(trace_vec, dim * 2, dim), trace4(trace_vec, dim * 3, dim),
+        ones(trace_vec, dim * 4, dim);
+    // Row-wise squared norms of A..AAAA give the even powers' traces.
+    trace_lower4.AddDiagMat2(1.0, temp.Range(0, dim * 4, 0, dim),
+                             kNoTrans, 0.0);
+    ones.Set(1.0);
+    // TODO: can make these vecvec's faster as fake matrix multiplies.
+    trace(1) = VecVec(trace1, ones);
+    trace(3) = VecVec(trace2, ones);
+    trace(5) = VecVec(trace3, ones);
+    trace(7) = VecVec(trace4, ones);
+    // Now we want to get the remaining trace quantities (A^3, A^5, A^7), so
+    // multiply the rows of A through AAA with the rows of AA through AAAA.
+    CuSubMatrix<Real> lower_three(temp, 0, dim * 3, 0, dim),
+        upper_three(temp, dim, dim * 3, 0, dim);
+    trace_lower3.AddDiagMatMat(1.0, lower_three, kNoTrans, upper_three, kTrans, 0.0);
+    trace(2) = VecVec(trace1, ones);
+    trace(4) = VecVec(trace2, ones);
+    trace(6) = VecVec(trace3, ones);
+  }
+  { // Check the trace values.
+    // NOTE(review): this recomputes all eight powers with full matrix
+    // multiplies; it looks like a debugging check -- confirm whether it
+    // should stay in production code.
+    CuMatrix<Real> power(A);
+    for (int32 i = 0; i < 8; i++) {
+      double this_trace = power.Trace();
+      AssertEqual(this_trace, trace(i));
+      CuMatrix<Real> temp_power(power);
+      power.AddMatMat(1.0, temp_power, kNoTrans, A, kNoTrans, 0.0);
+    }
+  }
+
+  // We'll use a and B to get the coefficients.  These operations are in very
+  // tiny dimensions -> faster and more convenient to use CPU.
+  SubVector<double> a(trace, 0, 4);
+  SpMatrix<double> B(4);
+  for (int32 r = 0; r < 4; r++)
+    for (int32 c = 0; c <= r; c++)
+      B(r, c) = trace(r + c + 1);
+
+  // Solve B v = a through a Cholesky factorisation of B.
+  TpMatrix<double> C(4);
+  C.Cholesky(B);
+  C.Invert();
+  SpMatrix<double> Binv(4);
+  Binv.AddTp2(1.0, C, kTrans, 0.0);
+  Vector<double> v(4);
+  v.AddSpVec(1.0, Binv, a, 0.0);
+
+  Real av = VecVec(a, v), vBv = VecSpVec(v, B, v),
+      error = (vBv + dim) - 2.0 * av;
+
+
+  KALDI_ASSERT(error >= 0.0); // note: error is a squared Frobenius
+                                      // norm.
+
+  KALDI_VLOG(5) << "a is " << a << ", B is " << B;
+  KALDI_VLOG(5) << "Dim is " << dim << ", error norm is " << sqrt(error);
+
+  if (error <= max_error) {
+    // It's sufficient to return with the approximation up to A^3.
+    A.Scale(v(1));
+    A.AddToDiag(v(0));
+    A.AddMat(v(2), AA);
+    A.AddMat(v(3), AAA);
+    this->CopyFromMat(A, kTakeLower);
+    this->Scale(prescale);
+    return;
+  } else {
+    // Let X be the approximate inverse of A: X = v(0) I + v(1) A + v(2) A^2 + v(3) A^3.
+    // Let AX be A times X: AX = v(0) A + v(1) A^2 + v(2) A^3 + v(3) A^4.
+    // We can construct both X and AX out of quantities we've already computed.
+
+    CuSubMatrix<Real> X(temp, dim * 4, dim, 0, dim),
+        AX(temp, dim * 3, dim, 0, dim);
+
+    AX.Scale(v(3));  // AX re-uses memory of AAAA: scale that.
+    AX.AddMat(v(2), AAA);
+    AX.AddMat(v(1), AA);
+    AX.AddMat(v(0), A);
+
+    X.AddMat(v(3), AAA); // X was zero before; space never used.
+    X.AddMat(v(2), AA);
+    X.AddMat(v(1), A);
+    X.AddToDiag(v(0));
+
+    int32 num_iters = 10;
+    for (int32 i = 0; i < num_iters; i++) {
+      CuSubMatrix<Real> AX_and_X(temp, dim * 3, dim * 2, 0, dim),
+          AAXX_and_AXX(temp, dim, dim * 2, 0, dim); // Note: in our variable-naming
+      // conventions we put the A's first; since all quantities commute it doesn't
+      // matter which order we put them in.  Note: the transpose of AX below is
+      // arbitrary (it's symmetric); I think it might be more efficient.
+      AAXX_and_AXX.AddMatMat(1.0, AX_and_X, kNoTrans, AX, kTrans, 0.0);
+
+      // The iteration now is X' <--- X (2I - AX).  This is the iteration of
+      // Schulz/Hoteling/whatever.  To get the objf (and for the next iteration)
+      // we also want AX'.  Use X' <-- 2X - AXX, and AX' <-- 2AX - AAXX.
+      // They go in the same place as before.  From now on, forget about the dash
+      // on the X, we'll just call it X.
+      AX_and_X.Scale(2.0);
+      AX_and_X.AddMat(-1.0, AAXX_and_AXX);
+
+      // The squared error is  ||AX - I||^2 = tr((AX - I)(AX - I)) = tr(AX^T AX) + dim - 2 tr(AX)
+      // (locals renamed so they no longer shadow the SubVector "a" above).
+      Real tr_axax = TraceMatMat(AX, AX, kTrans), tr_ax = AX.Trace();
+      error = tr_axax + dim - 2 * tr_ax;
+
+      KALDI_VLOG(5) << "Better-inverse error is "
+                    <<  sqrt(error);
+      if (error <= max_error) {
+        this->CopyFromMat(X, kTakeLower);
+        this->Scale(prescale);
+        return;
+      }
+    }
+    // Not converged.  *this still holds the scratch value written by AddMat2
+    // above, so returning normally would hand the caller garbage.  The former
+    // KALDI_ASSERT("...") on a string literal could never fire.
+    KALDI_ERR << "Error: max iters reached.";
+  }
+}
+
+
+/// Adds alpha times the outer product of v with itself to this matrix.
+/// The GPU path uses the packed symmetric rank-one update cublas_spr; the
+/// non-GPU path forwards to SpMatrix::AddVec2.
+/// @param alpha  Scale applied to the outer product.
+/// @param v      Vector whose dimension must equal NumRows() (asserted).
+template<typename Real>
+void CuSpMatrix<Real>::AddVec2(const Real alpha, const CuVectorBase<Real> &v) {
+  KALDI_ASSERT(v.Dim() == this->NumRows());
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    Timer tim;
+    // No kernel is launched here, so the grid/block dimensions that used to
+    // be computed (and never used) have been dropped.
+    cublas_spr('U', this->num_rows_, alpha, v.Data(),
+               1, this->Data());
+    CU_SAFE_CALL(cudaGetLastError());
+    CuDevice::Instantiate().AccuProfile("CuSpMatrix::AddVec2", tim.Elapsed());
+  } else
+#endif
+  {
+    Mat().AddVec2(alpha, v.Vec());
+  }
+}
+
+/// Symmetric rank-k style update of this matrix from M:
+/// this <- beta * this + alpha * (M M^T if transM == kNoTrans, else M^T M),
+/// as performed by cublas_syrk on the GPU path and SpMatrix::AddMat2 otherwise.
+template<typename Real>
+void CuSpMatrix<Real>::AddMat2(const Real alpha, const CuMatrixBase<Real> &M,
+                               MatrixTransposeType transM, const Real beta) {
+  KALDI_ASSERT((transM == kNoTrans && this->NumRows() == M.NumRows())
+               || (transM == kTrans && this->NumRows() == M.NumCols()));
+
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    Timer tim;
+    const MatrixIndexT dim = this->NumRows();
+    // The dimension of M that gets summed over.
+    MatrixIndexT inner_dim = M.NumRows();
+    if (transM == kNoTrans) inner_dim = M.NumCols();
+
+    if (dim == 0) return;
+    if (alpha == 0.0) {
+      // Nothing to add; only the scaling by beta remains.
+      if (beta != 1.0) this->Scale(beta);
+      return;
+    }
+
+    // The flag handed to cuBLAS is the opposite of transM -- presumably
+    // because cuBLAS reads the row-major data as column-major (TODO confirm).
+    char cublas_trans = 'T';
+    if (transM == kTrans) cublas_trans = 'N';
+
+    // syrk works on a full square matrix, so expand, update, and pack back.
+    CuMatrix<Real> full(*this);
+    cublas_syrk('U', cublas_trans, dim, inner_dim, alpha, M.Data(),
+                M.Stride(), beta, full.Data(), full.Stride());
+    this->CopyFromMat(full, kTakeLower);
+
+    CuDevice::Instantiate().AccuProfile("CuSpMatrix::AddMat2", tim.Elapsed());
+  } else
+#endif
+  {
+    Mat().AddMat2(alpha, M.Mat(), transM, beta);
+  }
+}
+
+/**
+ * Returns tr(A B) for two packed symmetric matrices of equal size, which may
+ * have different floating-point types.
+ *
+ * On the GPU path the packed storage of each matrix is viewed as a flat
+ * vector of nr*(nr+1)/2 elements.  Twice the dot product of those vectors
+ * counts every diagonal term twice, so the dot product of the two diagonals
+ * is subtracted once.  Without a GPU the host TraceSpSp is used.
+ */
+
+template<typename Real, typename OtherReal>
+Real TraceSpSp(const CuSpMatrix<Real> &A, const CuSpMatrix<OtherReal> &B) {
+  KALDI_ASSERT(A.NumRows() == B.NumRows());
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    // size is the number of stored elements in the packed representation.
+    MatrixIndexT nr = A.NumRows(), size = nr * (nr+1) / 2;
+    CuVector<Real> Adiag(nr, kUndefined);
+    CuVector<OtherReal> Bdiag(nr, kUndefined);
+    Adiag.CopyDiagFromPacked(A);
+    Bdiag.CopyDiagFromPacked(B);
+    // Non-owning views over the packed data of A and B.
+    CuSubVector<Real> Aall(A.Data(), size);
+    CuSubVector<OtherReal> Ball(B.Data(), size);
+    // Below, we subtract VecVec(Adiag, Bdiag) to remove double-counting
+    // on the diagonal.
+    return 2.0 * VecVec(Aall, Ball) - VecVec(Adiag, Bdiag);
+  } else
+#endif
+  {
+    return TraceSpSp(A.Mat(), B.Mat());
+  }
+}
+template
+float TraceSpSp(const CuSpMatrix<float> &A, const CuSpMatrix<float> &B);
+template
+float TraceSpSp(const CuSpMatrix<float> &A, const CuSpMatrix<double> &B);
+template
+double TraceSpSp(const CuSpMatrix<double> &A, const CuSpMatrix<float> &B);
+template
+double TraceSpSp(const CuSpMatrix<double> &A, const CuSpMatrix<double> &B);
+
+
+/// Returns true when the Frobenius norm of (*this - B) is at most tol times
+/// the larger of the two matrices' Frobenius norms.
+template<typename Real>
+bool CuSpMatrix<Real>::ApproxEqual(const CuSpMatrix<Real> &B, Real tol) const {
+  KALDI_ASSERT(this->NumRows() == B.NumRows());
+  CuSpMatrix<Real> delta(*this);
+  delta.AddSp(-1.0, B);
+  const Real this_norm = this->FrobeniusNorm();
+  const Real other_norm = B.FrobeniusNorm();
+  const Real delta_norm = delta.FrobeniusNorm();
+  const Real scale = std::max(this_norm, other_norm);
+  return delta_norm <= tol * scale;
+}
+
+/// Tests whether this matrix is close to the identity.  With S = *this, the
+/// test actually performed is
+///   trace(S S) + dim - 2 trace(S) <= tol * dim,
+/// whose left-hand side is the squared Frobenius norm of (S - I).
+template<typename Real>
+bool CuSpMatrix<Real>::IsUnit(Real tol) const {
+  // want to return:
+  //FrobeniusNorm(*this - I) <= tol * NumRows(), i.e.:
+  //sqrt (trace((*this - I)(*this-I)) <= tol * NumRows()
+  //    trace((*this - I)(*this - I)) <= tol * NumRows()
+  // trace(*this * *this) + trace(I) - 2 * trace(*this) <= tol * NumRows()
+  // trace(*this * *this) + dim - 2*this.Trace() <= tol * NumRows()
+  //
+  // NOTE(review): the step from the second to the third line above drops the
+  // sqrt without squaring the right-hand side, so the code compares the
+  // *squared* norm against tol * NumRows() rather than implementing the first
+  // line literally.  Confirm which threshold is intended before changing it.
+
+  // Note: we could do this more efficiently still, by slightly changing the
+  // definition of IsUnit and getting rid of the extra stuff inside TraceSpSp
+  // that corrects for the diagonal being counted twice.
+  
+  return (TraceSpSp(*this, *this) + this->NumRows() - 2.0 * this->Trace() <=
+          tol * this->NumRows());
+}
+
+
+template class CuSpMatrix<float>;
+template class CuSpMatrix<double>;
+
+
+
+} // namespace
--- a/src/cudamatrix/cu-sp-matrix.h
+++ b/src/cudamatrix/cu-sp-matrix.h
@ -0,0 +1,146 @@
+#ifndef KALDI_CUDAMATRIX_CU_SP_MATRIX_H_
+#define KALDI_CUDAMATRIX_CU_SP_MATRIX_H_
+
+#include <sstream>
+
+#include "cudamatrix/cu-common.h"
+#include "matrix/matrix-common.h"
+#include "matrix/sp-matrix.h"
+#include "cudamatrix/cu-array.h"
+#include "cudamatrix/cu-math.h"
+#include "cudamatrix/cu-packed-matrix.h"
+#include "cudamatrix/cu-matrix.h"
+
+namespace kaldi {
+
+/// TraceSpSp returns tr(A B)
+template<typename Real, typename OtherReal>
+Real TraceSpSp(const CuSpMatrix<Real> &A, const CuSpMatrix<OtherReal> &B);
+
+/// Symmetric matrix in packed storage, built on CuPackedMatrix.  Operations
+/// declared here are implemented in cu-sp-matrix.cc.
+template<typename Real>
+class CuSpMatrix : public CuPackedMatrix<Real> {
+  friend class CuMatrixBase<Real>;
+  friend class CuVectorBase<Real>;
+  friend class CuTpMatrix<Real>;
+  friend class CuSubMatrix<Real>;
+  friend class CuRand<Real>;
+
+  template<class R, class S>
+  friend R TraceSpSp(const CuSpMatrix<R> &A, const CuSpMatrix<S> &B);
+ public:
+  
+  /// Creates an empty matrix.
+  CuSpMatrix(): CuPackedMatrix<Real>() {}
+  
+  /// Creates an r x r matrix; resize_type is passed on to CuPackedMatrix.
+  explicit CuSpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero)
+    : CuPackedMatrix<Real>(r, resize_type) {}
+
+  /// Copies from a host SpMatrix.
+  explicit CuSpMatrix(const SpMatrix<Real> &orig)
+    : CuPackedMatrix<Real>(orig) {}
+
+  /// Copy constructor (note that it is declared explicit).
+  explicit CuSpMatrix(const CuSpMatrix<Real> &orig)
+    : CuPackedMatrix<Real>(orig) {}
+
+  /// Builds from a square CuMatrixBase, taking the part selected by copy_type.
+  explicit CuSpMatrix(const CuMatrixBase<Real> &orig,
+                      SpCopyType copy_type = kTakeLower)
+      : CuPackedMatrix<Real>(orig.NumRows(), kUndefined) {
+    CopyFromMat(orig, copy_type);
+  }
+
+  ~CuSpMatrix() {}  
+
+  /// Resizes to nRows x nRows via CuPackedMatrix::Resize.
+  inline void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero) {
+    CuPackedMatrix<Real>::Resize(nRows, resize_type);
+  }
+
+  /// Frobenius norm, computed as sqrt(tr(this * this)).
+  Real FrobeniusNorm() const { return sqrt(TraceSpSp(*this, *this)); }
+
+  /// Tests closeness to the identity matrix; see the definition for the
+  /// exact criterion.
+  bool IsUnit(Real tol = 0.001) const;
+
+  /// True if ||this - other|| <= tol * max(||this||, ||other||) in
+  /// Frobenius norm.
+  bool ApproxEqual(const CuSpMatrix<Real> &other, Real tol = 0.001) const;
+  
+  /// Copies from another CuSpMatrix.
+  void CopyFromSp(const CuSpMatrix<Real> &other) {
+    CuPackedMatrix<Real>::CopyFromPacked(other);
+  }
+  /// Copies from a host SpMatrix.
+  void CopyFromSp(const SpMatrix<Real> &other) {
+    CuPackedMatrix<Real>::CopyFromPacked(other);
+  }
+
+  /// Fills this matrix from a square matrix; copy_type selects which part
+  /// of orig is used.
+  void CopyFromMat(const CuMatrixBase<Real> &orig,
+                   SpCopyType copy_type = kTakeLower);
+  
+  /// Copies this matrix into a host SpMatrix.
+  void CopyToSp(SpMatrix<Real> *dst) const { //added const by hxu
+    CuPackedMatrix<Real>::CopyToPacked(dst);
+  }
+
+  /// Element access returning a CuValue proxy.  Indices are swapped if
+  /// needed so that the lower triangle (c <= r) is addressed.
+  inline CuValue<Real> operator() (MatrixIndexT r, MatrixIndexT c) {
+    if (static_cast<UnsignedMatrixIndexT>(c) >
+        static_cast<UnsignedMatrixIndexT>(r))
+      std::swap(c, r);
+    // After the swap r is the larger index, so one unsigned comparison
+    // bounds-checks both.
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
+                 static_cast<UnsignedMatrixIndexT>(this->num_rows_));
+    return CuValue<Real>(this->data_ + (r * (r+1)) / 2 + c);
+  }
+  
+  /// Read-only element access; same indexing as the non-const overload.
+  inline Real operator() (MatrixIndexT r, MatrixIndexT c) const {
+    if (static_cast<UnsignedMatrixIndexT>(c) >
+        static_cast<UnsignedMatrixIndexT>(r))
+      std::swap(c, r);
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
+                 static_cast<UnsignedMatrixIndexT>(this->num_rows_));
+    return CuValue<Real>(this->data_ + (r * (r+1)) / 2 + c); // will be
+    // casted to Real.
+  }
+
+  /// Approximate inversion of positive definite matrices, using repeated
+  /// multiplication.  Limits the error by ensuring that
+  /// || I - A Ainv ||^2 <= max_error, using Frobenius norm (so guarantees
+  /// that (I - A Ainv).IsUnit(max_error) == true).
+  void InvertPosDefApprox(BaseFloat max_error = 0.1);
+  
+  /// Note: the CuMatrix version of the Invert() function will only work for
+  /// positive definite matrices; it is based on Cholesky.
+  void Invert();
+
+  /// Adds alpha * v v^T to this matrix.
+  void AddVec2(const Real alpha, const CuVectorBase<Real> &v);
+
+  /// this <- beta * this + alpha * (M M^T or M^T M, depending on transM).
+  void AddMat2(const Real alpha, const CuMatrixBase<Real> &M,
+               MatrixTransposeType transM, const Real beta);
+  
+  /// Adds alpha * Ma to this matrix.
+  void AddSp(const Real alpha, const CuSpMatrix<Real> &Ma) {
+    this->AddPacked(alpha, Ma);
+  }
+
+ protected:
+  /// Reinterprets this object as a host SpMatrix; used by the non-GPU code
+  /// paths.  NOTE(review): relies on the two classes sharing a compatible
+  /// layout -- confirm against CuPackedMatrix / PackedMatrix.
+  inline const SpMatrix<Real> &Mat() const {
+    return *(reinterpret_cast<const SpMatrix<Real>* >(this));
+  }
+  inline SpMatrix<Real> &Mat() {
+    return *(reinterpret_cast<SpMatrix<Real>* >(this));
+  }
+
+};
+
+/// Free-function form of CuSpMatrix::ApproxEqual: returns A.ApproxEqual(B, tol).
+template<typename Real>
+inline bool ApproxEqual(const CuSpMatrix<Real> &A,
+                 const CuSpMatrix<Real> &B, Real tol = 0.001) {
+  return A.ApproxEqual(B, tol);
+}
+
+/// Asserts (via KALDI_ASSERT) that A and B are approximately equal within tol.
+template<typename Real>
+inline void AssertEqual(const CuSpMatrix<Real> &A,
+                        const CuSpMatrix<Real> &B, Real tol = 0.001) {
+  KALDI_ASSERT(ApproxEqual(A, B, tol));
+}
+
+
+/// Constructs a host SpMatrix from a CuSpMatrix: resizes to cu's dimension
+/// and then copies cu's contents into this object.
+template<typename Real>
+SpMatrix<Real>::SpMatrix(const CuSpMatrix<Real> &cu) {
+   Resize(cu.NumRows());
+   cu.CopyToSp(this);
+}
+
+
+
+} // namespace
+
+#endif
--- a/src/cudamatrix/cu-stlvector-inl.h
+++ b/src/cudamatrix/cu-stlvector-inl.h
@ -1,213 +0,0 @@
-// cudamatrix/cu-stlvector-inl.h
-
-// Copyright 2009-2012  Karel Vesely
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-
-#ifndef KALDI_CUDAMATRIX_CU_STLVECTOR_INL_H_
-#define KALDI_CUDAMATRIX_CU_STLVECTOR_INL_H_
-
-#if HAVE_CUDA==1
-  #include <cuda_runtime_api.h>
-  #include "cudamatrix/cu-common.h"
-  #include "cudamatrix/cu-device.h"
-  #include "cudamatrix/cu-kernels.h"
-#endif
-
-#include "util/timer.h"
-
-namespace kaldi {
-
-
-template<typename IntType>
-const IntType* CuStlVector<IntType>::Data() const {
-  #if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    return data_; 
-  } else
-  #endif
-  {
-    return &vec_.front();
-  }
-}
-
-
-
-template<typename IntType>
-IntType* CuStlVector<IntType>::Data() { 
-  #if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    return data_; 
-  } else
-  #endif
-  {
-    return &vec_.front();
-  }
-}
-
-
-
-template<typename IntType>
-CuStlVector<IntType>& CuStlVector<IntType>::Resize(MatrixIndexT dim) {
-  if (dim_ == dim) {
-    // SetZero();
-    return *this;
-  }
-
-  Destroy();
-
-  #if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    cuSafeCall(cudaMalloc((void**)&data_, dim*sizeof(IntType)));
-  } else
-  #endif
-  {
-    vec_.resize(dim);
-  }
-
-  dim_ = dim;
-  SetZero();
-
-  return *this;
-}
-
-
-
-template<typename IntType>
-void CuStlVector<IntType>::Destroy() {
-  #if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    if (NULL != data_) {
-      cuSafeCall(cudaFree(data_));
-      data_ = NULL;
-    }
-  } else
-  #endif
-  {
-    vec_.resize(0);
-  }
-
-  dim_ = 0;
-}
-
-
-
-template<typename IntType>
-CuStlVector<IntType>& CuStlVector<IntType>::CopyFromVec(const std::vector<IntType> &src) {
-  Resize(src.size());
-
-  #if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    Timer tim;
-
-    cuSafeCall(cudaMemcpy(data_, &src.front(), src.size()*sizeof(IntType), cudaMemcpyHostToDevice));
-
-    CuDevice::Instantiate().AccuProfile("CuStlVector::CopyFromVecH2D",tim.Elapsed());
-  } else
-  #endif
-  {
-    memcpy(&vec_.front(), &src.front(), src.size()*sizeof(IntType));
-  }
-  return *this;
-}
-
-
-
-template<typename IntType>
-void CuStlVector<IntType>::CopyToVec(std::vector<IntType> *dst) const {
-  if (static_cast<MatrixIndexT>(dst->size()) != dim_) {
-    dst->resize(dim_);
-  }
-
-  #if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    Timer tim;
-    cuSafeCall(cudaMemcpy(&dst->front(), Data(), dim_*sizeof(IntType), cudaMemcpyDeviceToHost));
-    CuDevice::Instantiate().AccuProfile("CuStlVector::CopyToVecD2H",tim.Elapsed());
-  } else
-  #endif
-  {
-    memcpy(&dst->front(), &vec_.front(), dim_*sizeof(IntType));
-  }
-}
-
-
-
-template<typename IntType>
-void CuStlVector<IntType>::SetZero() {
-  #if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    Timer tim;
-    cuSafeCall(cudaMemset(data_, 0, dim_*sizeof(IntType)));
-    CuDevice::Instantiate().AccuProfile("CuStlVector::SetZero",tim.Elapsed());
-  } else
-  #endif
-  {
-    vec_.assign(dim_, 0);
-  }
-}
-
-
-
-/**
- * Print the vector to stream
- */
-template<typename IntType>
-std::ostream &operator << (std::ostream &out, const CuStlVector<IntType> &vec) {
-  std::vector<IntType> tmp;
-  vec.CopyToVec(&tmp);
-  out << "[";
-  for(int32 i=0; i<tmp.size(); i++) {
-    out << " " << tmp[i];
-  }
-  out << " ]\n";
-  return out;
-}
-
-
-
-/*
- * Methods wrapping the ANSI-C CUDA kernels
- */
-template<> 
-inline void CuStlVector<int32>::Set(int32 value) {
-  #if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    Timer tim;
-
-    dim3 dimBlock(CUBLOCK);
-    dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
-    ::MatrixDim d = { 1, Dim(), Dim() };
-
-    cudaI32_set_const(dimGrid, dimBlock, data_, value, d);
-    cuSafeCall(cudaGetLastError());
-
-    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
-  } else
-  #endif
-  {
-    vec_.assign(vec_.size(), value);
-  }
-}
-
-
-} // namespace kaldi
-
-#endif
-
-
--- a/src/cudamatrix/cu-stlvector.h
+++ b/src/cudamatrix/cu-stlvector.h
@ -1,109 +0,0 @@
-// cudamatrix/cu-stlvector.h
-
-// Copyright 2009-2012  Karel Vesely
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-
-#ifndef KALDI_CUDAMATRIX_CU_STLVECTOR_H_
-#define KALDI_CUDAMATRIX_CU_STLVECTOR_H_
-
-#include "matrix/kaldi-vector.h"
-
-namespace kaldi {
-
-template<typename IntType> class CuMatrix;
-
-/**
- * std::vector equivalent for CUDA computing
- */
-template<typename IntType>
-class CuStlVector {
-  typedef CuStlVector<IntType> ThisType;
- public:
-
-  /// Default Constructor
-  CuStlVector<IntType>()
-   : dim_(0), data_(NULL) { 
-  }
-  /// Constructor with memory initialisation
-  CuStlVector<IntType>(MatrixIndexT dim)
-   : dim_(0), data_(NULL) { 
-    Resize(dim); 
-  }
-
-  /// Destructor
-  ~CuStlVector() {
-    Destroy(); 
-  }
-
-  /// Dimensions
-  MatrixIndexT Dim() const { 
-    return dim_; 
-  }
-
-  /// Get raw pointer
-  const IntType* Data() const;
-  IntType* Data();
- 
-  /// Allocate the memory
-  ThisType& Resize(MatrixIndexT dim);
-
-  /// Deallocate the memory
-  void Destroy();
-
-  /// Copy functions (reallocates when needed)
-  ThisType&        CopyFromVec(const std::vector<IntType> &src);
-  void             CopyToVec(std::vector<IntType> *dst) const;
-  
-  /// Math operations
-  void SetZero();
-  void Set(IntType value);
-
-  /// Accessor to non-GPU vector
-  const std::vector<IntType>& Vec() const {
-    return vec_;
-  }
-  std::vector<IntType>& Vec() {
-    return vec_;
-  }
-
- private:
-  MatrixIndexT dim_;     ///< dimension of the vector
-  IntType *data_;  ///< GPU data pointer
-  std::vector<IntType> vec_; ///< non-GPU vector as back-up
-};
-
-
-/*
- * Signatures of general/specialized methods
- */
-template<typename Real> void CuStlVector<Real>::Set(Real value) { KALDI_ERR << __func__ << " Not implemented"; }
-template<> inline void CuStlVector<int32>::Set(int32 value);
-
-
-/// I/O
-template<typename IntType>
-std::ostream &operator << (std::ostream &out, const CuStlVector<IntType> &vec);
- 
-} // namespace
-
-
-#include "cu-stlvector-inl.h"
-
-#endif
-
--- a/src/cudamatrix/cu-test.cc
+++ b/src/cudamatrix/cu-test.cc
@ -0,0 +1,582 @@
+// cudamatrix/cu-test.cc
+//
+//
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include <ctime>
+
+#include "base/kaldi-common.h"
+#include "cudamatrix/cu-device.h"
+#include "cudamatrix/cu-sp-matrix.h"
+#include "cudamatrix/cu-tp-matrix.h"
+#include "cudamatrix/cu-packed-matrix.h"
+#include "cudamatrix/cu-vector.h"
+#include <numeric>
+#include <time.h>
+
+namespace kaldi {
+
+/*
+ * INITIALIZERS
+ */ 
+/// Fills the lower triangle of *M with Gaussian random values, redrawing the
+/// whole matrix until its condition number is at most 100 (an empty matrix
+/// is accepted immediately).
+template<typename Real>
+static void InitRand(SpMatrix<Real> *M) {
+  const MatrixIndexT dim = M->NumRows();
+  do {
+    for (MatrixIndexT row = 0; row < dim; row++)
+      for (MatrixIndexT col = 0; col <= row; col++)
+        (*M)(row, col) = RandGauss();
+  } while (dim != 0 && M->Cond() > 100);
+}
+
+/// Overwrites every element of *v with a Gaussian random value.
+template<typename Real>
+static void InitRand(VectorBase<Real> *v) {
+  const MatrixIndexT dim = v->Dim();
+  for (MatrixIndexT idx = 0; idx < dim; idx++)
+    (*v)(idx) = RandGauss();
+}
+
+/// Exercises CuMatrix::SetZeroUpperDiag() on random square matrices of size
+/// 10, 20, ..., 90.  It logs the sum of the strictly-upper-triangular
+/// elements before and after the call; nothing is asserted.
+template<typename Real>
+static void UnitTestSetZeroUpperDiag() {
+  for (MatrixIndexT dim = 10; dim < 100; dim += 10) {
+    Matrix<Real> A(dim,dim);
+    A.SetRandn();
+    CuMatrix<Real> B(A);
+
+    B.SetZeroUpperDiag();
+
+    // Sum above the diagonal of the untouched host copy.
+    Real sum_before = 0.0;
+    for (MatrixIndexT r = 0; r < dim; r++)
+      for (MatrixIndexT c = r + 1; c < dim; c++)
+        sum_before += A(r,c);
+    KALDI_LOG << "the upper diaganoal sum for A is : " << sum_before;
+
+    // Bring the modified matrix back to the host and sum again.
+    B.CopyToMat(&A);
+    Real sum_after = 0.0;
+    for (MatrixIndexT r = 0; r < dim; r++)
+      for (MatrixIndexT c = r + 1; c < dim; c++)
+        sum_after += A(r,c);
+    KALDI_LOG << "the upper diaganoal sum for B is : " << sum_after;
+  }
+}
+
+
+// Checks CuMatrix::Cholesky(): builds B = I + c c^T, factorises it on the
+// CuMatrix side, and asserts that the factor D satisfies D D^T == B.
+template<typename Real> static void UnitTestCholesky() {
+  for (MatrixIndexT iter = 0; iter < 3; iter++) {
+    MatrixIndexT dim = 300 + rand() %  200;
+    // set dimension
+    // computing the matrix for cholesky input
+    // CuMatrix is cuda matrix class while Matrix is cpu matrix class
+    CuMatrix<Real> A(dim,dim);
+    Matrix<Real> B(dim,dim);
+    Vector<Real> C(dim);
+    for (MatrixIndexT i = 0; i < dim; i++) {
+      B(i,i) = 1;
+      C(i) = i + 1;
+    }
+    B.AddVecVec(1.0, C, C);
+    // copy the matrix to cudamatrix object
+    A.CopyFromMat(B);
+    A.CopyToMat(&B);
+    //KALDI_LOG << B << '\n';
+    // doing cholesky
+    A.Cholesky();
+
+    Matrix<Real> D(dim,dim);
+    A.CopyToMat(&D);
+    
+    //KALDI_LOG << "D is: " << D << '\n';
+    Matrix<Real> E(dim,dim);
+    E.AddMatMat(1.0, D, kNoTrans, D, kTrans, 0.0);
+    // check that E = D D^T reproduces B
+    AssertEqual(B, E);
+  }
+}
+
+// Logs the trace of random symmetric matrices of dimension 1..17 as computed
+// by SpMatrix and by CuSpMatrix.  The two values are only printed, not
+// compared.
+template<typename Real> static void UnitTestTrace() {
+  for (MatrixIndexT iter = 1; iter < 18; iter++) {
+    MatrixIndexT dim = iter;
+    KALDI_LOG << "dim is : " << iter;
+    SpMatrix<Real> A(dim);
+    A.SetRandn();
+    CuSpMatrix<Real> B(A);
+    KALDI_LOG << "cpu trace is : " << A.Trace();
+    KALDI_LOG << "gpu trace is : " << B.Trace();
+  }
+  // The block below is a disabled timing experiment for Trace() on large
+  // matrices.
+  /*
+  Vector<Real> tim(100);
+  Vector<Real> d(100);
+  for (MatrixIndexT iter = 0; iter < 100; iter++) {
+    MatrixIndexT dim = 10000 + rand() % 400;
+    Matrix<Real> A(dim,dim);
+    A.SetRandn();
+    CuMatrix<Real> B(A);
+    CuSpMatrix<Real> C(B,kTakeLower);
+    clock_t t1 = clock();
+    tim(iter) = C.Trace();
+    clock_t t2 = clock();
+    //tim(iter) = t2 - t1;
+    d(iter) = dim;
+    KALDI_LOG << tim(iter) << iter << '\n';
+    KALDI_LOG << d(iter) << iter << '\n';
+  }
+  KALDI_LOG << "tim is " << tim << '\n';
+  KALDI_LOG << "dim is " << d << '\n';
+  */
+}
+
+// Small (8x8) demonstration of CuMatrix::SymInvertPosDef(): builds
+// B = I + c c^T, inverts it on the CuMatrix side, and logs B, the inverse D
+// and the product B * D.  Nothing is asserted; the output is for inspection.
+template<typename Real> static void UnitInvert() {
+  //MatrixIndexT dim = 15 + rand() %  40;;
+  MatrixIndexT dim = 8;
+  CuMatrix<Real> A(dim,dim);
+  Matrix<Real> B(dim,dim);
+  Vector<Real> C(dim);
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    B(i,i) = 1;
+    C(i) = i + 1;
+  }
+  B.AddVecVec(1.0,C,C);
+  // (An unused CuMatrix temporary that was allocated here has been removed.)
+  A.CopyFromMat(B);
+  //A.Cholesky();
+  A.CopyToMat(&B);
+  KALDI_LOG << "B is : " << '\n';
+  KALDI_LOG << B << '\n';
+  A.SymInvertPosDef();
+  Matrix<Real> D(dim,dim);
+  A.CopyToMat(&D);
+  KALDI_LOG << "D is : " << '\n';
+  KALDI_LOG << D << '\n';
+  // X = B * D; it is only logged here.
+  Matrix<Real> X(dim,dim);
+  X.AddMatMat(1,B,kNoTrans,D,kNoTrans,0);
+  KALDI_LOG << X << '\n';
+  //for (MatrixIndexT i = 0; i < dim; i++) {
+  //  for (MatrixIndexT j = i+1; j < dim; j++)
+  //    D(i,j) = 0;
+  //}
+  //Matrix<Real> E(dim,dim);
+  //E.AddMatMat(1,D,kNoTrans,D,kTrans,0);
+  //AssertEqual(B,E);
+}
+
+// Checks CuMatrix::SymInvertPosDef() on three random-sized matrices
+// (dimension 500..899): the product of the matrix with its computed inverse
+// must be close to the identity.
+template<typename Real> static void UnitTestInvert() {
+  for (MatrixIndexT iter = 0; iter < 3; iter++) {
+    const MatrixIndexT dim = 500 + rand() % 400;
+
+    KALDI_LOG << "dim is : " << '\n';
+    KALDI_LOG << dim << '\n';
+    CuMatrix<Real> cu_mat(dim,dim);
+    Matrix<Real> spd(dim,dim);
+    Vector<Real> ramp(dim);
+    for (MatrixIndexT k = 0; k < dim; k++) {
+      spd(k,k) = 1;
+      ramp(k) = (k/(1.0*dim)) + 1;
+    }
+    // At this point spd is still the identity; keep a copy as the reference.
+    Matrix<Real> identity(spd);
+    // spd = I + ramp ramp^T, which is positive definite (the inversion
+    // would fail otherwise).
+    spd.AddVecVec(1.0, ramp, ramp);
+
+    cu_mat.CopyFromMat(spd);
+    cu_mat.SymInvertPosDef();
+
+    Matrix<Real> inverse(dim,dim);
+    cu_mat.CopyToMat(&inverse);
+
+    Matrix<Real> product(dim,dim);
+    product.AddMatMat(1.0, spd, kNoTrans, inverse, kNoTrans, 0.0);
+    // Looser tolerance for single precision.
+    AssertEqual(identity, product, (sizeof(Real) == 4 ? 0.1 : 0.001));
+  }
+}
+
+// Exercises the CuSpMatrix(CuMatrixBase, kTakeLower) constructor on an 8x8
+// matrix whose upper triangle differs from its lower triangle, and prints
+// the resulting packed matrix.  Nothing is asserted.
+template<typename Real> static void UnitTestConstructor() {
+  MatrixIndexT dim = 8;
+  CuMatrix<Real> A(dim,dim);
+  Matrix<Real> B(dim,dim);
+  // Lower triangle (incl. diagonal) holds i+j, strict upper triangle i+j+4,
+  // so the two halves can be told apart in the output.
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    for (MatrixIndexT j = 0; j <=i; j++)
+      B(i,j) = i+j;
+    for (MatrixIndexT j = i+1; j < dim; j++)
+      B(i,j) = i+j+4;
+  }
+  KALDI_LOG << "A is : " << '\n';
+  KALDI_LOG << B << '\n';
+  A.CopyFromMat(B);
+  //CuSpMatrix<Real> C(dim);
+  //C.CopyFromMat(A,kTakeLower);
+  CuSpMatrix<Real> C(A, kTakeLower);
+  SpMatrix<Real> D(dim);
+  C.CopyToSp(&D);
+  KALDI_LOG << "C is : " << '\n';
+  // Print the lower triangle row by row.
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    for (MatrixIndexT j = 0; j <= i; j++)
+      std::cout << D(i,j) << " ";
+    std::cout << '\n';
+  }  
+}
+
+// Exercises copying a (symmetric) CuMatrix into a CuSpMatrix.  The actual
+// comparisons are currently commented out, so this only checks that the
+// copies run and logs intermediate values.
+template<typename Real> static void UnitTestCopySp() {
+  // Checking that the various versions of copying                                 
+  // matrix to SpMatrix work the same in the symmetric case.                         
+  for (MatrixIndexT iter = 0;iter < 5;iter++) {
+    int32 dim = 5 + rand() %  10;
+    SpMatrix<Real> A(dim), B(dim);
+    A.SetRandn();
+    Matrix<Real> C(A);
+    //CuMatrix<Real> D(C);
+    
+    {
+      // D2 is logged before and after each step, which shows whether
+      // building or filling E changes it.
+      CuMatrix<Real> D2(dim,dim);
+      D2.CopyFromMat(C);
+      KALDI_LOG << "D2 is " << D2;
+      CuSpMatrix<Real> E(D2.NumRows(), kUndefined);
+      KALDI_LOG << "D2 is " << D2;
+      E.CopyFromMat(D2, kTakeLower);
+      KALDI_LOG << "D2 is " << D2;
+    }
+    
+    CuMatrix<Real> D(dim,dim);
+    D.CopyFromMat(C);
+    KALDI_LOG << "D stride is : " << D.Stride() <<'\n';
+    
+    CuSpMatrix<Real> E(D,kTakeLower);
+    ///CuSpMatrix<Real> E(dim);
+    //E.CopyFromMat(D,kTakeLower);
+    // Disabled comparison of the kTakeMean / kTakeLower / kTakeUpper results
+    // against A:
+    /*
+    KALDI_LOG << D.NumRows() << '\n';
+    //E.CopyFromMat(D, kTakeMean);
+    //E(D, kTakeMean);
+    //KALDI_LOG << E.NumRows() << '\n';
+
+    E.CopyToMat(&B);
+    AssertEqual(A, B);
+    B.SetZero();
+    //E.CopyFromMat(D, kTakeLower);
+    CuSpMatrix<Real> F(D,kTakeLower);
+    //F(D, kTakeLower);
+    F.CopyToMat(&B);
+    AssertEqual(A, B);
+    B.SetZero();
+    //E.CopyFromMat(D, kTakeUpper);
+    //E(D, kTakeUpper);
+    CuSpMatrix<Real> G(D, kTakeUpper);
+    G.CopyToMat(&B);
+    AssertEqual(A, B);
+    */  
+  }
+  
+}
+
+// Copies a non-symmetric 8x8 matrix into a CuSpMatrix with each of the three
+// copy modes used below (kTakeLower, kTakeUpper, kTakeMean) and prints the
+// lower triangle of the result each time.  Nothing is asserted; the output is
+// intended for visual inspection.
+template<typename Real> static void UnitTestCopyFromMat() {
+  MatrixIndexT dim = 8;
+  CuMatrix<Real> A(dim,dim);
+  Matrix<Real> B(dim,dim);
+  // Lower triangle (including the diagonal) holds i+j, the strict upper
+  // triangle holds i+j+4, so the three copy modes give different results.
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    for (MatrixIndexT j = 0; j <=i; j++)
+      B(i,j) = i+j;
+    for (MatrixIndexT j = i+1; j < dim; j++)
+      B(i,j) = i+j+4;
+  }
+  KALDI_LOG << "A is : " << '\n';
+  KALDI_LOG << B << '\n';
+  A.CopyFromMat(B);
+  // Mode 1: kTakeLower.
+  CuSpMatrix<Real> C(dim);
+  C.CopyFromMat(A,kTakeLower);
+  SpMatrix<Real> D(dim);
+  C.CopyToSp(&D);
+  KALDI_LOG << "C is : " << '\n';
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    for (MatrixIndexT j = 0; j <= i; j++)
+      std::cout << D(i,j) << " ";
+    std::cout << '\n';
+  }
+  // Mode 2: kTakeUpper.
+  C.CopyFromMat(A,kTakeUpper);
+  C.CopyToSp(&D);
+  KALDI_LOG << "C is : " << '\n';
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    for (MatrixIndexT j = 0; j <= i; j++)
+      std::cout << D(i,j) << " ";
+    std::cout << '\n';
+  }
+  
+  // Mode 3: kTakeMean.
+  C.CopyFromMat(A,kTakeMean);
+  C.CopyToSp(&D);
+  KALDI_LOG << "C is : " << '\n';
+  for (MatrixIndexT i = 0; i < dim; i++) {
+    for (MatrixIndexT j = 0; j <= i; j++)
+      std::cout << D(i,j) << " ";
+    std::cout << '\n';
+  }
+  
+  //KALDI_LOG << D << '\n';
+}
+
+// Tests element access through operator() on CuMatrix, CuSpMatrix and
+// CuVector (reading, writing, and reading back), and then runs
+// CuMatrix::SetRandn followed by a copy to the host a few times.
+template<typename Real> static void UnitTestMatrix() {
+  //operator()
+  for (MatrixIndexT iter = 0; iter < 2; iter++) {
+    // Minimum sizes are 6 and 8, so indices (3, 4) and 2 used below are
+    // always in range.
+    int32 dim1 = 6 + rand() % 10;
+    int32 dim2 = 8 + rand() % 10;
+    Matrix<Real> A(dim1,dim2);
+    A.SetRandn();
+    CuMatrix<Real> B(A);
+    KALDI_ASSERT(A(3, 4) == B(3, 4));
+    B(3, 4) = 2.0;
+    A(3, 4) = B(3, 4);
+    KALDI_ASSERT(A(3, 4) == B(3, 4));
+
+    SpMatrix<Real> As(dim1);
+    CuSpMatrix<Real> Bs(As);
+    KALDI_ASSERT(As(3, 4) == Bs(3, 4));
+    Bs(3, 4) = 2.0;
+    // Randomly read the value back through either the non-const or the
+    // const operator(), so that both get exercised across runs.
+    if (rand() % 2 == 0)
+      As(3, 4) = Bs(3, 4);
+    else
+      As(3, 4) = (const_cast<const CuSpMatrix<Real>&>(Bs))(3, 4);
+    
+    KALDI_ASSERT(As(3, 4) == Bs(3, 4));
+
+    Vector<Real> v(dim1);
+    CuVector<Real> w(v);
+    KALDI_ASSERT(w(2) == v(2));
+    w(2) = 3.0;
+    v(2) = w(2);
+    KALDI_ASSERT(w(2) == v(2));
+  }
+
+  //SetRandn
+  // NOTE(review): nothing is asserted in this loop (the sum comparison is
+  // commented out); it only checks that SetRandn and CopyToMat run.
+  for (MatrixIndexT iter = 0; iter < 10; iter++) {
+    int32 dim1 = 15 + rand() % 10;
+    int32 dim2 = dim1;//10 + rand() % 14;
+    //KALDI_LOG << "dimension is " << dim1
+    //          << " " << dim2 << '\n';
+    CuMatrix<Real> A(dim1,dim2);
+    A.SetRandn();
+    Matrix<Real> A1(dim1,dim2);
+    A.CopyToMat(&A1);
+    //KALDI_LOG << "gpu sum is: " << A.Sum() << '\n';
+    //KALDI_LOG << "cpu sum is: " << A1.Sum() << '\n';
+  }
+}
+
+// Checks that CuVector::MulTp produces the same result as Vector::MulTp for
+// random triangular matrices, alternating between the transposed and the
+// non-transposed product.
+template<typename Real> static void UnitTestMulTp() {
+  for (MatrixIndexT trial = 0; trial < 10; trial++) {
+    const int32 dim = 1 + rand() % 30;
+    Vector<Real> host_vec(dim);
+    host_vec.SetRandn();
+    TpMatrix<Real> host_tp(dim);
+    host_tp.SetRandn();
+    CuVector<Real> cu_vec(host_vec);
+    CuTpMatrix<Real> cu_tp(host_tp);
+
+    // The copy held by the CuVector must read back unchanged before any
+    // arithmetic is applied.
+    Vector<Real> readback(dim);
+    cu_vec.CopyToVec(&readback);
+    AssertEqual(host_vec, readback);
+
+    // Even trials use the transpose, odd trials do not.
+    const MatrixTransposeType trans = (trial % 2 == 0 ? kTrans : kNoTrans);
+    host_vec.MulTp(host_tp, trans);
+    cu_vec.MulTp(cu_tp, trans);
+    cu_vec.CopyToVec(&readback);
+    AssertEqual(host_vec, readback);
+  }
+}
+
+// Compares CuVector::Scale and CuVector::MulElements against the host Vector
+// implementations on random data.  The remaining experiments in this function
+// are inside a block comment and therefore do not run.
+template<typename Real> static void UnitTestVector() {
+  // Scale
+  for (MatrixIndexT iter = 0; iter < 10; iter++) {
+    int32 dim = 24 + rand() % 10;
+    Vector<Real> A(dim);
+    A.SetRandn();
+    CuVector<Real> B(A);
+    Vector<Real> C(dim);
+    Real r = 1.43;
+    B.Scale(r);
+    B.CopyToVec(&C);
+    A.Scale(r);
+    //KALDI_LOG << A;
+    //KALDI_LOG << (A.Scale(r));
+    //KALDI_LOG << C;
+    AssertEqual(A, C);
+  }
+  
+  // MulElements: the random data is generated in the CuVectors, copied to the
+  // host, and the element-wise product is then computed on both sides.
+  for (MatrixIndexT iter = 0; iter < 10; iter++) {
+    int32 dim = 15 + rand() % 10;
+    CuVector<Real> A(dim);
+    CuVector<Real> B(dim);
+    Vector<Real> A1(dim);
+    Vector<Real> B1(dim);
+    A.SetRandn();
+    B.SetRandn();
+    A.CopyToVec(&A1);
+    B.CopyToVec(&B1);
+    A.MulElements(B);
+    A1.MulElements(B1);
+    Vector<Real> A2(dim);
+    A.CopyToVec(&A2);
+    AssertEqual(A1,A2);
+  }
+  // Everything from here to the end of the function is disabled.
+  /*
+  for (MatrixIndexT iter = 0; iter < 10; iter++) {
+    int32 dim = 72;
+    CuVector<Real> A(dim);
+    Vector<Real> A1(dim);
+    CuMatrix<Real> B(9,8);
+    Matrix<Real> B1(9,8);
+    B.SetRandn();
+    B.CopyToMat(&B1);
+    A.CopyRowsFromMat(B);
+    A1.CopyRowsFromMat(B1);
+    Vector<Real> A2(dim);
+    A.CopyToVec(&A2);
+    AssertEqual(A1,A2);
+  }
+
+  for (MatrixIndexT iter = 0; iter < 10; iter++) {
+    int32 dim = 15 + rand() % 10;
+    CuVector<Real> A(dim);
+    A.SetRandn();
+    Vector<Real> A1(dim);
+    A.CopyToVec(&A1);
+    KALDI_LOG << "cpu min is : " << A1.Min() << '\n';
+    KALDI_LOG << "gpu min is : " << A.Min() << '\n';    
+  }
+
+  for (MatrixIndexT iter = 0; iter < 10; iter++) {
+    int32 dim = 15 + rand() % 10;
+    CuVector<Real> A(dim);
+    A.SetRandn();
+    Vector<Real> A1(dim);
+    A.CopyToVec(&A1);
+    CuVector<Real> B(dim);
+    B.SetRandn();
+    Vector<Real> B1(dim);
+    B.CopyToVec(&B1);
+    CuVector<Real> C(dim);
+    C.SetRandn();
+    Vector<Real> C1(dim);
+    C.CopyToVec(&C1);
+    Real alpha = 2;
+    Real beta = 3;
+    A.AddVecVec(alpha, B, C, beta);
+    A1.AddVecVec(alpha,B1,C1,beta);
+    Vector<Real> D(dim);
+    A.CopyToVec(&D);
+    AssertEqual(D,A1);
+  }
+  
+  for (MatrixIndexT iter = 0; iter < 10; iter++) {
+    int32 dim1 = 15 + rand() % 10;
+    int32 dim2 = 10 + rand() % 10;
+    Matrix<Real> A(dim1,dim2);
+    for (MatrixIndexT i = 0; i < dim1; i++) {
+      for (MatrixIndexT j = 0; j < dim2; j++)
+        A(i,j) = i + 2 * j + 1;
+    }
+    KALDI_LOG << A;
+    CuMatrix<Real> B(dim1,dim2);
+    B.CopyFromMat(A);
+    CuVector<Real> C(dim1);
+    C.SetZero();
+    Real alpha = 1;
+    Real beta = 1;
+    C.AddDiagMat2(alpha, B, kNoTrans, beta);
+    Vector<Real> D(dim1);
+    C.CopyToVec(&D);
+    KALDI_LOG << D << '\n';
+    Vector<Real> E(dim1);
+    E.AddDiagMat2(alpha, A, kNoTrans, beta);
+    KALDI_LOG << E;
+    AssertEqual(D,E);
+  }
+
+  for (MatrixIndexT iter = 0; iter < 10; iter++) {
+    int32 dim1 = 15 + rand() % 10;
+    int32 dim2 = 10 + rand() % 10;
+    Matrix<Real> A(dim1,dim2);
+    for (MatrixIndexT i = 0; i < dim1; i++) {
+      for (MatrixIndexT j = 0; j < dim2; j++)
+        A(i,j) = i + 2 * j + 1;
+    }
+    KALDI_LOG << A;
+    CuMatrix<Real> B(dim1,dim2);
+    B.CopyFromMat(A);
+    CuSubVector<Real> C(B,1);
+    Vector<Real> D(dim2);
+    C.CopyToVec(&D);
+    KALDI_LOG << D;
+  }
+
+  for (MatrixIndexT iter = 0; iter < 10; iter++) {
+    int32 dim = 15 + rand() % 10;
+    CuVector<Real> A(dim);
+    A.SetRandn();
+    Vector<Real> A1(dim);
+    A.CopyToVec(&A1);
+    CuVector<Real> B(dim);
+    B.SetRandn();
+    Vector<Real> B1(dim);
+    B.CopyToVec(&B1);
+    Real dot = VecVec(A,B);
+    KALDI_LOG << "dot product in gpu: " << dot << '\n';
+    dot = VecVec(A1,B1);
+    KALDI_LOG << "dot product in cpu: " << dot << '\n';    
+  }
+
+  for (MatrixIndexT iter = 0; iter < 10; iter++) {
+    int32 dim = 15 + rand() % 10;
+    CuVector<Real> A(dim);
+    Vector<Real> A1(dim);
+    for (MatrixIndexT i = 0; i < dim; i++)
+      A1(i) = i;
+    A.CopyFromVec(A1);
+    KALDI_LOG << A(dim-2) << '\n';
+    KALDI_LOG << A1(dim-2) << '\n';
+  }
+  */
+}
+
+// Runs every unit test in this file for the given floating-point type.
+// The tests are independent of each other; they are simply called in sequence
+// and any failure aborts through the assertion macros they use.
+template<typename Real>
+static void CuMatrixUnitTest() {
+  UnitTestTrace<Real>();
+  UnitTestCholesky<Real>();
+  UnitTestInvert<Real>();
+  UnitInvert<Real>();
+  UnitTestCopyFromMat<Real>();
+  UnitTestCopySp<Real>();
+  UnitTestConstructor<Real>();
+  UnitTestVector<Real>();
+  UnitTestMulTp<Real>();
+  UnitTestMatrix<Real>();
+  UnitTestSetZeroUpperDiag<Real>();
+}
+} //namespace
+
+// Test driver: runs the suite in single precision and, unless the selected
+// device reports that double precision is unsupported, in double precision.
+int main() {
+  using namespace kaldi;
+#if HAVE_CUDA == 1
+  // NOTE(review): "yes" presumably makes GPU use mandatory for this test --
+  // confirm against CuDevice::SelectGpuId.
+  kaldi::CuDevice::Instantiate().SelectGpuId("yes");
+#endif
+  
+  kaldi::CuMatrixUnitTest<float>();
+
+  // When compiled with CUDA the braces below form the 'else' branch of the
+  // capability check; without CUDA they are a plain block that always runs.
+#if HAVE_CUDA == 1
+  if (!kaldi::CuDevice::Instantiate().DoublePrecisionSupported()) {
+    KALDI_WARN << "Double precision not supported, not testing that code";
+  } else
+#endif
+  {
+    kaldi::CuMatrixUnitTest<double>();
+  }
+
+#if HAVE_CUDA == 1
+  kaldi::CuDevice::Instantiate().PrintProfile();
+#endif
+
+  
+  KALDI_LOG << "Tests succeeded.\n";
+  return 0;
+}
--- a/src/cudamatrix/cu-tp-matrix-test.cc
+++ b/src/cudamatrix/cu-tp-matrix-test.cc
@ -0,0 +1,218 @@
+// cudamatrix/cu-tp-matrix-test.cc
+//
+// Copyright 2013  Ehsan Variani
+//                 Lucas Ondel
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//  http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+//
+// UnitTests for testing cu-tp-matrix.h methods.
+//
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+
+#include "base/kaldi-common.h"
+#include "cudamatrix/cu-device.h"
+#include "cudamatrix/cu-tp-matrix.h"
+#include "cudamatrix/cu-vector.h"
+#include "cudamatrix/cu-math.h"
+#include "cudamatrix/cu-sp-matrix.h"
+
+using namespace kaldi;
+
+namespace kaldi {
+
+
+// Asserts that two CuPackedMatrix objects have the same number of rows and
+// that every element (i, j) with j <= i agrees to within 'tol', measured
+// relative to max(1, |A(i,j)| + |B(i,j)|).
+template<typename Real>
+static void AssertEqual(const CuPackedMatrix<Real> &A,
+                        const CuPackedMatrix<Real> &B,
+                        float tol = 0.001) {
+  KALDI_ASSERT(A.NumRows() == B.NumRows());
+  for (MatrixIndexT i = 0; i < A.NumRows(); i++)
+    for (MatrixIndexT j = 0; j <= i; j++)
+      KALDI_ASSERT(std::abs(A(i, j) - B(i, j))
+                   < tol * std::max(1.0, (double) (std::abs(A(i, j)) + std::abs(B(i, j)))));
+}
+
+// Host-side overload: asserts that two PackedMatrix objects have the same
+// number of rows and that every element (i, j) with j <= i agrees to within
+// 'tol', measured relative to max(1, |A(i,j)| + |B(i,j)|).
+template<typename Real>
+static void AssertEqual(const PackedMatrix<Real> &A,
+                        const PackedMatrix<Real> &B,
+                        float tol = 0.001) {
+  KALDI_ASSERT(A.NumRows() == B.NumRows());
+  for (MatrixIndexT i = 0; i < A.NumRows(); i++)
+    for (MatrixIndexT j = 0; j <= i; j++)
+      KALDI_ASSERT(std::abs(A(i, j) - B(i, j))
+                   < tol * std::max(1.0, (double) (std::abs(A(i, j)) + std::abs(B(i, j)))));
+}
+
+// Mixed overload: compares a host PackedMatrix against a CuPackedMatrix.
+// Asserts equal row counts and that every element (i, j) with j <= i agrees
+// to within 'tol', measured relative to max(1, |A(i,j)| + |B(i,j)|).
+template<typename Real>
+static void AssertEqual(const PackedMatrix<Real> &A,
+                        const CuPackedMatrix<Real> &B,
+                        float tol = 0.001) {
+  KALDI_ASSERT(A.NumRows() == B.NumRows());
+  for (MatrixIndexT i = 0; i < A.NumRows(); i++)
+    for (MatrixIndexT j = 0; j <= i; j++)
+      KALDI_ASSERT(std::abs(A(i, j) - B(i, j))
+                   < tol * std::max(1.0, (double) (std::abs(A(i, j)) + std::abs(B(i, j)))));
+}
+
+
+
+/*
+ * Unit Tests
+ */
+// Inverts the same random triangular matrix with TpMatrix::Invert and with
+// CuTpMatrix::Invert and checks that the two results agree.
+template<typename Real>
+static void UnitTestCuTpMatrixInvert() {
+  for (MatrixIndexT trial = 1; trial < 10; trial++) {
+    const MatrixIndexT dim = 5 * trial + rand() % 10;
+
+    TpMatrix<Real> host_tp(dim);
+    host_tp.SetRandn();
+    CuTpMatrix<Real> cu_tp(host_tp);
+
+    // First make sure the copy itself is faithful, then compare the inverses.
+    AssertEqual<Real>(host_tp, cu_tp, 0.005);
+    host_tp.Invert();
+    cu_tp.Invert();
+    AssertEqual<Real>(host_tp, cu_tp, 0.005);
+  }
+}
+
+// Checks both CopyFromTp overloads: copying from a host TpMatrix and copying
+// from another CuTpMatrix must each reproduce the source.
+template<typename Real>
+static void UnitTestCuTpMatrixCopyFromTp() {
+  for (MatrixIndexT trial = 1; trial < 10; trial++) {
+    const MatrixIndexT dim = 5 * trial + rand() % 10;
+
+    TpMatrix<Real> host_tp(dim);
+    host_tp.SetRandn();
+
+    // Host -> CuTpMatrix.
+    CuTpMatrix<Real> from_host(dim);
+    from_host.CopyFromTp(host_tp);
+
+    // CuTpMatrix -> CuTpMatrix.
+    CuTpMatrix<Real> from_cu(dim);
+    from_cu.CopyFromTp(from_host);
+
+    AssertEqual<Real>(host_tp, from_host);
+    AssertEqual<Real>(from_host, from_cu);
+  }
+}
+
+// Checks that CuTpMatrix::CopyFromMat agrees with TpMatrix::CopyFromMat when
+// extracting a triangular matrix from the same random square matrix, for
+// both transpose settings.
+template<typename Real>
+static void UnitTestCuTpMatrixCopyFromMat() {
+  for (MatrixIndexT i = 1; i < 10; i++) {
+    // Even iterations use kNoTrans, odd iterations use kTrans.
+    MatrixTransposeType trans = (i % 2 == 0 ? kNoTrans : kTrans);
+
+    MatrixIndexT dim = 10*i + rand() % 5;
+    CuMatrix<Real> A(dim, dim);
+    A.SetRandn();
+    Matrix<Real> A2(A);
+    
+    CuTpMatrix<Real> B(dim);
+    B.CopyFromMat(A, trans);
+    TpMatrix<Real> B2(dim);
+    B2.CopyFromMat(A2, trans);
+    // Bring the CuTpMatrix result back to the host for the comparison.
+    TpMatrix<Real> B3(B);
+    AssertEqual(B2, B3);
+    // Guards against the trivial case where both results are all zeros.
+    KALDI_ASSERT(B3.Trace() != 0);
+  }
+}
+
+
+
+// Compares CuTpMatrix::Cholesky with TpMatrix::Cholesky on random symmetric
+// matrices built as M * M^T.
+template<typename Real>
+static void UnitTestCuTpMatrixCholesky() {
+  for (MatrixIndexT trial = 1; trial < 10; trial++) {
+    MatrixIndexT dim = 1 + rand() % 10;
+    // Later trials add a random multiple of 32 so larger sizes are covered too.
+    if (trial >= 5)
+      dim += 32 * (rand() % 5);
+
+    // With two more columns than rows, M * M^T is almost surely positive
+    // definite.
+    Matrix<Real> rand_mat(dim, dim + 2);
+    rand_mat.SetRandn();
+    SpMatrix<Real> host_sp(dim);
+    host_sp.AddMat2(1.0, rand_mat, kNoTrans, 0.0);
+    CuSpMatrix<Real> cu_sp(host_sp);
+
+    // The factors start out with random contents, which Cholesky() must
+    // overwrite.
+    TpMatrix<Real> host_chol(dim);
+    host_chol.SetRandn();
+    CuTpMatrix<Real> cu_chol(host_chol);
+    host_chol.Cholesky(host_sp);
+    cu_chol.Cholesky(cu_sp);
+
+    AssertEqual<Real>(host_chol, cu_chol);
+  }
+}
+
+// Round-trips a CuTpMatrix through Write() and Read() on in-memory streams,
+// covering an empty matrix as well as binary and text mode.
+template<class Real>
+static void UnitTestCuTpMatrixIO() {
+  for (int32 trial = 0; trial < 3; trial++) {
+    // rand() is always called, even when the size is then forced to zero.
+    int32 dimM = rand() % 255 + 10;
+    if (trial % 5 == 0)
+      dimM = 0;
+    CuTpMatrix<Real> written(dimM);
+    written.SetRandn();
+
+    const bool binary = (trial % 4 < 2);
+    std::ostringstream out_stream;
+    written.Write(out_stream, binary);
+
+    std::istringstream in_stream(out_stream.str());
+    CuTpMatrix<Real> read_back;
+    read_back.Read(in_stream, binary);
+    AssertEqual(written, read_back);
+  }
+}
+
+// Runs all CuTpMatrix unit tests in this file for the given floating-point
+// type, one after another.
+template<typename Real> void CudaTpMatrixUnitTest() {
+  UnitTestCuTpMatrixIO<Real>();
+  UnitTestCuTpMatrixInvert<Real>();
+  UnitTestCuTpMatrixCopyFromTp<Real>();
+  UnitTestCuTpMatrixCholesky<Real>();
+  UnitTestCuTpMatrixCopyFromMat<Real>();
+}
+
+} // namespace kaldi
+
+
+// Test driver: runs the whole suite twice, first with GPU use switched off
+// and then with it switched on (when compiled with CUDA), in single and
+// double precision.
+int main() {
+  using namespace kaldi;
+
+
+  for (int32 loop = 0; loop < 2; loop++) {
+#if HAVE_CUDA == 1
+    if (loop == 0)
+      CuDevice::Instantiate().SelectGpuId("no"); // "no": run without a GPU
+    else
+      CuDevice::Instantiate().SelectGpuId("yes"); // "yes": select a GPU
+#endif
+    kaldi::CudaTpMatrixUnitTest<float>();
+#if HAVE_CUDA == 1
+    // Double precision is only tested if the device reports support for it.
+    if (CuDevice::Instantiate().DoublePrecisionSupported()) {
+      kaldi::CudaTpMatrixUnitTest<double>();
+    } else {
+      KALDI_WARN << "Double precision not supported";
+    }
+#else
+    kaldi::CudaTpMatrixUnitTest<double>();
+#endif
+  
+    if (loop == 0)
+      KALDI_LOG << "Tests without GPU use succeeded.\n";
+    else
+      KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
+  }
+#if HAVE_CUDA == 1
+  CuDevice::Instantiate().PrintProfile();
+#endif
+  return 0;
+}
--- a/src/cudamatrix/cu-tp-matrix.cc
+++ b/src/cudamatrix/cu-tp-matrix.cc
@ -0,0 +1,112 @@
+#if HAVE_CUDA==1
+#include <cuda_runtime_api.h>
+#include <cublas.h>
+#endif
+
+#include "util/timer.h"
+#include "cudamatrix/cu-common.h"
+#include "cudamatrix/cu-vector.h"
+#include "cudamatrix/cu-device.h"
+#include "cudamatrix/cu-kernels.h"
+#include "cudamatrix/cu-math.h"
+#include "cudamatrix/cu-matrix.h"
+#include "cudamatrix/cu-tp-matrix.h"
+#include "cudamatrix/cu-sp-matrix.h"
+#include "cudamatrix/cublas-wrappers.h"
+
+namespace kaldi {
+
+// Constructs a triangular matrix from a square CuMatrixBase.
+// The storage is allocated without initialisation (kUndefined) because
+// CopyFromMat() is called on it straight away; 'trans' is passed through to
+// CopyFromMat().  Asserts that 'orig' is square.
+template<typename Real>
+CuTpMatrix<Real>::CuTpMatrix(const CuMatrixBase<Real> &orig, MatrixTransposeType trans):
+    CuPackedMatrix<Real>(orig.NumRows(), kUndefined) {
+  KALDI_ASSERT(orig.NumRows() == orig.NumCols());
+  this->CopyFromMat(orig, trans);
+}
+
+
+// Sets *this to the Cholesky factor of 'orig'.
+// With the GPU enabled, 'orig' is first expanded into a full CuMatrix, which
+// is factored in place by CuMatrix::Cholesky(); the result is then copied
+// into *this with CopyFromMat(tmp, kNoTrans).  Otherwise the host
+// TpMatrix::Cholesky() is called on the underlying host matrices.
+template<typename Real>
+void CuTpMatrix<Real>::Cholesky(const CuSpMatrix<Real> &orig) {
+#if HAVE_CUDA==1
+  if (CuDevice::Instantiate().Enabled()) {
+    CuMatrix<Real> tmp(orig);
+    tmp.Cholesky();
+    this->CopyFromMat(tmp, kNoTrans);
+  } else
+#endif
+  {
+    this->Mat().Cholesky(orig.Mat());
+  }
+}
+
+
+// Inverts this triangular matrix in place.
+// With the GPU enabled, a full square matrix 'tmp' gets its diagonal set to
+// one, *this is expanded into a second full matrix, and cublas_trsm() is
+// called with both; 'tmp' is then copied back into *this.
+// NOTE(review): this presumably relies on 'tmp' being zero-initialised by the
+// CuMatrix constructor (so that it starts as the identity) and on
+// cublas_trsm() overwriting 'tmp' with the solution -- confirm against
+// cu-matrix.h and cublas-wrappers.h.
+// Without the GPU the host TpMatrix::Invert() is used.
+template<typename Real>
+void CuTpMatrix<Real>::Invert() {
+#if HAVE_CUDA==1
+  if (CuDevice::Instantiate().Enabled()) {
+    // Nothing to invert for an empty matrix; returning here also avoids
+    // launching a kernel with a zero-sized grid.
+    if (this->num_rows_ == 0) return;
+    Timer tim;
+    int dimBlock(CU2DBLOCK);
+    int dimGrid(n_blocks(this->NumRows(), CU2DBLOCK));
+    CuMatrix<Real> tmp(this->NumRows(), this->NumRows());
+    int dim = this->NumRows();
+    Real alpha = 1.0;
+    cuda_set_diag(dimGrid, dimBlock, tmp.Data(), alpha, tmp.Dim());
+    // Report a failed kernel launch here instead of letting it surface in a
+    // later, unrelated CUDA call.
+    CU_SAFE_CALL(cudaGetLastError());
+    CuMatrix<Real> tmp2(dim, dim);
+    tmp2.CopyFromTp(*this);
+    cublas_trsm(dim, dim, alpha, tmp2.Data(), tmp2.Dim().stride, 
+      tmp.Data(), tmp.Dim().stride);
+    this->CopyFromMat(tmp, kNoTrans);
+    // The timer was previously started but never reported.
+    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
+  } else
+#endif
+  {
+    Mat().Invert();
+  }
+}
+
+// Copies one triangle of the square matrix M into this packed triangular
+// matrix.  With the GPU enabled, kNoTrans uses the cuda_take_lower kernel and
+// any other value uses cuda_take_upper; without the GPU the host
+// TpMatrix::CopyFromMat() is called with the same arguments.
+// On the GPU path M must be square with the same number of rows as *this
+// (asserted); an empty matrix is a no-op.
+template<typename Real>
+void CuTpMatrix<Real>::CopyFromMat(const CuMatrixBase<Real> &M,
+                                   MatrixTransposeType Trans) {
+#if HAVE_CUDA==1
+  if (CuDevice::Instantiate().Enabled()) {
+    MatrixIndexT num_rows = this->num_rows_;
+    KALDI_ASSERT(num_rows == M.NumRows() && this->num_rows_ == M.NumCols());
+    if (num_rows == 0)
+      return;
+    Timer tim;
+    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
+    dim3 dimGrid(n_blocks(num_rows, CU2DBLOCK), n_blocks(num_rows, CU2DBLOCK));
+    if (Trans == kNoTrans) {
+      cuda_take_lower(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
+    } else {
+      cuda_take_upper(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
+    }
+    // Wait for the kernel, then report any launch or execution error here
+    // instead of letting it surface in a later, unrelated CUDA call.
+    cudaThreadSynchronize();
+    CU_SAFE_CALL(cudaGetLastError());
+    // The timer was previously started but never reported.
+    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
+  } else
+#endif
+  {
+    Mat().CopyFromMat(M.Mat(), Trans);
+  }
+}
+
+// Constructs a host TpMatrix holding a copy of the contents of a CuTpMatrix.
+// The definition lives in this file, and is explicitly instantiated for float
+// and double below.
+template<class Real>
+TpMatrix<Real>::TpMatrix(const CuTpMatrix<Real> &cu) {
+  this->Resize(cu.NumRows());
+  this->CopyFromMat(cu);
+}
+template TpMatrix<float>::TpMatrix(const CuTpMatrix<float> &cu);
+template TpMatrix<double>::TpMatrix(const CuTpMatrix<double> &cu);
+
+// Copies the contents of a CuTpMatrix into this host TpMatrix by delegating
+// to CuTpMatrix::CopyToPacked().  This function does not resize *this.
+template<class Real>
+void TpMatrix<Real>::CopyFromMat(const CuTpMatrix<Real> &other) {
+  other.CopyToPacked(this);
+}
+// instantiate the template above.
+template void TpMatrix<float>::CopyFromMat(const CuTpMatrix<float> &other);
+template void TpMatrix<double>::CopyFromMat(const CuTpMatrix<double> &other);
+
+template class CuTpMatrix<float>;
+template class CuTpMatrix<double>;
+
+} // namespace
--- a/src/cudamatrix/cu-tp-matrix.h
+++ b/src/cudamatrix/cu-tp-matrix.h
@ -0,0 +1,83 @@
+// cudamatrix/cu-tp-matrix.h
+// Copyright 2013  Ehsan Variani
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//  http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef KALDI_CUDAMATRIX_CU_TP_MATRIX_H_
+#define KALDI_CUDAMATRIX_CU_TP_MATRIX_H_
+
+#include <sstream>
+
+#include "cudamatrix/cu-common.h"
+#include "matrix/matrix-common.h"
+#include "matrix/tp-matrix.h"
+#include "cudamatrix/cu-array.h"
+#include "cudamatrix/cu-math.h"
+#include "cudamatrix/cu-packed-matrix.h"
+#include "cudamatrix/cu-matrix.h"
+
+namespace kaldi {
+
+template<typename Real> class CuTpMatrix;
+
+/// Triangular matrix in packed storage, built on CuPackedMatrix.  Its
+/// operations (see cu-tp-matrix.cc) run on the GPU when CuDevice reports it
+/// as enabled and otherwise fall back to the host TpMatrix implementation.
+template<typename Real>
+class CuTpMatrix : public CuPackedMatrix<Real> {
+  friend class CuMatrixBase<float>;
+  friend class CuMatrixBase<double>;
+  friend class CuVectorBase<Real>;
+  friend class CuSubMatrix<Real>;
+  friend class CuRand<Real>;
+  friend class CuTpMatrix<float>;
+  friend class CuTpMatrix<double>;
+ public:
+  /// Creates an empty matrix.
+  CuTpMatrix() : CuPackedMatrix<Real>() {}
+  /// Creates a matrix with r rows; resize_type is forwarded to
+  /// CuPackedMatrix and defaults to kSetZero.
+  explicit CuTpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero)
+      : CuPackedMatrix<Real>(r, resize_type) {}
+  /// Creates a copy of a host TpMatrix.
+  explicit CuTpMatrix<Real>(const TpMatrix<Real> &orig)
+      : CuPackedMatrix<Real>(orig) {}
+  /// Creates a copy of another CuTpMatrix.
+  explicit CuTpMatrix<Real>(const CuTpMatrix<Real> &orig)
+      : CuPackedMatrix<Real>(orig) {}
+  /// Creates a triangular matrix from a square full matrix; see CopyFromMat()
+  /// for the meaning of 'trans'.
+  explicit CuTpMatrix<Real>(const CuMatrixBase<Real> &orig,
+                            MatrixTransposeType trans = kNoTrans);
+
+  
+  ~CuTpMatrix() {}
+
+  /// Copies one triangle of the square matrix M into *this; Trans selects
+  /// which one (defined in cu-tp-matrix.cc).
+  void CopyFromMat(const CuMatrixBase<Real> &M,
+                   MatrixTransposeType Trans = kNoTrans);
+
+  /// Copies the contents of another CuTpMatrix.
+  void CopyFromTp(const CuTpMatrix<Real> &other) {
+    CuPackedMatrix<Real>::CopyFromPacked(other);
+  }
+  /// Copies the contents of a host TpMatrix.
+  void CopyFromTp(const TpMatrix<Real> &other) {
+    CuPackedMatrix<Real>::CopyFromPacked(other);
+  }  
+  /// Sets *this to the Cholesky factor of Orig.
+  void Cholesky(const CuSpMatrix<Real>& Orig);
+  /// Inverts *this in place.
+  void Invert();
+
+ protected:
+  // View this object as a host TpMatrix; used by the non-GPU code paths.
+  // NOTE(review): the reinterpret_cast assumes CuTpMatrix and TpMatrix share
+  // the same object layout -- confirm against cu-packed-matrix.h and
+  // packed-matrix.h.
+  inline const TpMatrix<Real> &Mat() const {
+    return *(reinterpret_cast<const TpMatrix<Real>* >(this));
+  }
+  inline TpMatrix<Real> &Mat() {
+    return *(reinterpret_cast<TpMatrix<Real>* >(this));
+  }
+  
+};
+
+} // namespace
+
+#endif
--- a/src/cudamatrix/cu-value.h
+++ b/src/cudamatrix/cu-value.h
@ -0,0 +1,88 @@
+// cudamatrix/cu-value.h
+
+// Copyright      2013  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+
+#ifndef KALDI_CUDAMATRIX_CU_VALUE_H_
+#define KALDI_CUDAMATRIX_CU_VALUE_H_
+
+#include <cudamatrix/cu-device.h>
+
+namespace kaldi {
+
+/// The following class is used to simulate non-const
+/// references to Real, e.g. as returned by the non-const operator ().
+/// This class is also used as a convenient way of
+/// reading a single Real value from the device.
+template<typename Real>
+class CuValue {
+ public:
+  /// Wraps a pointer to a single Real.  The pointer is only stored; nothing
+  /// in this class allocates or frees it.
+  CuValue(Real *data): data_(data) { }
+  /// Copies the pointer, so both objects then refer to the same element.
+  CuValue(const CuValue &other): data_(other.data_) { }
+
+  /// Copies the *value* referred to by 'other' into the element referred to
+  /// by *this (the pointers themselves are not changed).  Uses a
+  /// device-to-device cudaMemcpy when the GPU is enabled, otherwise a plain
+  /// assignment.
+  inline CuValue operator = (const CuValue<Real> &other) {
+#if HAVE_CUDA == 1
+    if (CuDevice::Instantiate().Enabled()) {
+      CU_SAFE_CALL(cudaMemcpy(data_, other.data_, sizeof(Real), cudaMemcpyDeviceToDevice));
+      return *this;
+    } else
+#endif
+    {
+      *data_ = *other.data_;
+      return *this;
+    }
+  }
+  
+  /// Writes r to the referenced element (host-to-device cudaMemcpy when the
+  /// GPU is enabled) and returns r.
+  inline Real operator = (Real r) { // assignment from Real
+#if HAVE_CUDA == 1
+    if (CuDevice::Instantiate().Enabled()) {
+      CU_SAFE_CALL(cudaMemcpy(data_, &r, sizeof(Real), cudaMemcpyHostToDevice));
+      return r;
+    } else
+#endif
+    {
+      *data_ = r;
+      return r;
+    }
+  }
+
+  /// Adds r to the referenced element: reads the current value through the
+  /// conversion operator, then writes the sum back through operator=(Real).
+  inline Real operator += (Real r) { return (*this = r + Real(*this)); }
+    
+
+  /// Conversion to Real: reads the referenced element (device-to-host
+  /// cudaMemcpy when the GPU is enabled) and returns its value.
+  inline operator Real () const { // conversion to Real
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    Real value;
+    CU_SAFE_CALL(cudaMemcpy(&value, data_,
+                            sizeof(Real), cudaMemcpyDeviceToHost));
+    return value;
+  } else
+#endif
+    return *data_;
+  }
+ private:
+  Real *data_;  // Element this object refers to.
+}; // class CuValue<Real>
+
+
+}  // namespace
+
+
+
+#endif  // KALDI_CUDAMATRIX_CU_VALUE_H_
--- a/src/cudamatrix/cu-vector-inl.h
+++ b/src/cudamatrix/cu-vector-inl.h
@ -1,462 +0,0 @@
-// cudamatrix/cu-vector-inl.h
-
-// Copyright 2009-2012  Karel Vesely
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-
-#ifndef KALDI_CUDAMATRIX_CU_VECTOR_INL_H_
-#define KALDI_CUDAMATRIX_CU_VECTOR_INL_H_
-
-#if HAVE_CUDA==1
-  #include <cuda_runtime_api.h>
-#endif
-
-#include "util/timer.h"
-#include "cudamatrix/cu-common.h"
-#include "cudamatrix/cu-matrix.h"
-#include "cudamatrix/cu-device.h"
-#include "cudamatrix/cu-kernels.h"
-
-namespace kaldi {
-
-
-template<typename Real>
-CuVector<Real>::CuVector(const CuVector<Real> &v) {
-  this->Resize(v.dim_);
-  this->CopyFromVec(v);
-}
-
-template<typename Real>
-CuVector<Real>::CuVector(const CuVectorBase<Real> &v) {
-  this->Resize(v.dim_);
-  this->CopyFromVec(v);
-}
-
-template<typename Real>
-CuVector<Real>::CuVector(const VectorBase<Real> &v) {
-  this->Resize(v.dim_);
-  this->CopyFromVec(v);
-}
-
-template<typename Real>
-void CuVector<Real>::Resize(MatrixIndexT dim, MatrixResizeType t) {
-  KALDI_ASSERT(t == kSetZero || t == kUndefined); // Others not implemented
-  // yet.
-  if (this->dim_ == dim) {
-    this->SetZero();
-    return;
-  }
-  if (this->dim_ != 0)
-    this->Destroy();
-  if (dim == 0) return;
-#if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    cuSafeCall(cudaMalloc(reinterpret_cast<void**>(&this->data_), dim * sizeof(Real)));
-    this->dim_ = dim;
-    if (t == kSetZero) this->SetZero();
-  } else
-#endif
-  {
-    Vector<Real> vec(dim);
-    this->Swap(&vec); 
-  }
-}
-
-template<typename Real>
-void CuVector<Real>::Swap(Vector<Real> *vec) {
-#if HAVE_CUDA==1 
-  if (CuDevice::Instantiate().Enabled()) {
-    if (this->dim_ == 0) {
-      if (vec->dim_ != 0) {
-        // *this is empty, but vec is nonempty.
-        Resize(vec->dim_, kUndefined);
-        this->CopyFromVec(*vec);
-        vec->Resize(0);
-      }
-      // else both are empty.
-    } else { // *this is nonempty.
-      if (vec->dim_ != 0) {
-        // Both *this and *vec are nonempty.  Recurse to simpler cases.
-        // this could be done more efficiently in the case where
-        // the size does not change.
-        Vector<Real> temp;
-        this->Swap(&temp); // now temp is full, *this is empty.
-        vec->Swap(&temp); // now vec has data from *this, temp has
-        // data from vec.
-        Swap(vec); // copy data in vec to *this, which is now empty.
-      } else { // *this is full but *vec is empty.
-        vec->Resize(this->dim_, kUndefined);
-        this->CopyToVec(vec);
-        this->Destroy();
-      }
-    }
-  } else
-#endif
-  {
-    std::swap(vec->data_, this->data_);
-    std::swap(vec->dim_, this->dim_);
-  }
-}
-
-template<typename Real>
-void CuVector<Real>::Destroy() {
-#if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    if (this->data_ != NULL) {
-      cuSafeCall(cudaFree(this->data_));
-    }
-  } else
-#endif
-  {
-    if (this->data_ != NULL) KALDI_MEMALIGN_FREE(this->data_);
-  }
-  this->data_ = NULL;
-  this->dim_ = 0;
-}
-
-
-
-template<typename Real>
-void CuVectorBase<Real>::CopyFromVec(const CuVectorBase<Real> &src) {
-  KALDI_ASSERT(src.Dim() == dim_);
-#if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    Timer tim;
-    cuSafeCall(cudaMemcpy(data_, src.data_, src.dim_ * sizeof(Real), cudaMemcpyDeviceToDevice));
-    CuDevice::Instantiate().AccuProfile("CuVector::CopyFromVecD2D",tim.Elapsed());
-  } else
-  #endif
-  {
-    memcpy(static_cast<void*>(data_), static_cast<void*>(src.data_),
-           dim_ * sizeof(Real));
-  }
-}
-
-
-
-template<typename Real>
-void CuVectorBase<Real>::CopyFromVec(const VectorBase<Real> &src) {
-  KALDI_ASSERT(src.Dim() == dim_);
-  #if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    Timer tim;
-
-    cuSafeCall(cudaMemcpy(data_, src.Data(), src.Dim()*sizeof(Real), cudaMemcpyHostToDevice));
-
-    CuDevice::Instantiate().AccuProfile("CuVector::CopyFromVecH2D",tim.Elapsed());
-  } else
-  #endif
-  {
-    memcpy(static_cast<void*>(data_), static_cast<const void*>(src.Data()),
-           dim_ * sizeof(Real));
-  }
-}
-
-
-
-template<typename Real>
-void CuVectorBase<Real>::CopyToVec(VectorBase<Real> *dst) const {
-  KALDI_ASSERT(dst->Dim() == dim_);
-  #if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    Timer tim;
-    cuSafeCall(cudaMemcpy(dst->Data(), this->data_,
-                          dim_*sizeof(Real), cudaMemcpyDeviceToHost));
-    CuDevice::Instantiate().AccuProfile("CuVector::CopyToVecD2H",tim.Elapsed());
-  } else
-  #endif
-  {
-    dst->CopyFromVec(Vec());
-  }
-}
-
-
-
-template<typename Real>
-void CuVector<Real>::Read(std::istream &is, bool binary) {
-  Vector<BaseFloat> temp;
-  temp.Read(is, binary);
-  Destroy();
-  Swap(&temp);
-}
-
-
-
-template<typename Real>
-void CuVector<Real>::Write(std::ostream &os, bool binary) const {
-  Vector<BaseFloat> temp(this->dim_);
-  this->CopyToVec(&temp);
-  temp.Write(os, binary); 
-}
-
-
-
-template<typename Real>
-void CuVectorBase<Real>::SetZero() {
-#if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    KALDI_ASSERT(dim_>0);
-    KALDI_ASSERT(data_!=NULL);
-    Timer tim;
-    cuSafeCall(cudaMemset(data_, 0, dim_*sizeof(Real)));
-    CuDevice::Instantiate().AccuProfile("CuVector::SetZero",tim.Elapsed());
-  } else
-#endif
-  {
-    Vec().SetZero();
-  }
-}
-
-
-
-/**
- * Print the vector to stream
- */
-template<typename Real>
-std::ostream &operator << (std::ostream &out, const CuVectorBase<Real> &vec) {
-  Vector<Real> temp;
-  vec.CopyToVec(&temp);
-  out << temp;
-  return out;
-}
-
-
-
-
-/*
- * Methods wrapping the ANSI-C CUDA kernels
- */
-template<typename Real>
-void CuVectorBase<Real>::Set(Real value) {
-#if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    Timer tim;
-    
-    dim3 dimBlock(CUBLOCK);
-    dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
-    ::MatrixDim d = { 1, Dim(), Dim() };
-    
-    cuda_set_const(dimGrid, dimBlock, data_, value, d);
-    cuSafeCall(cudaGetLastError());
-    
-    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
-  } else
-#endif
-  {
-    Vec().Set(value);
-  }
-}
-
-
-
-template<typename Real>
-void CuVectorBase<Real>::Add(Real value) {
-  #if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    Timer tim;
-
-    dim3 dimBlock(CUBLOCK);
-    dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
-    ::MatrixDim d = { 1, Dim(), Dim() };
-
-    cuda_add(dimGrid, dimBlock, data_, value, d);
-    cuSafeCall(cudaGetLastError());
-
-    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
-  } else
-  #endif
-  {
-    Vec().Add(value);
-  }
-}
-
-
-
-template<typename Real>
-void CuVectorBase<Real>::Scale(Real value) {
-  #if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    Timer tim;
-
-    dim3 dimBlock(CUBLOCK);
-    dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
-    ::MatrixDim d = { 1, Dim(), Dim() };
-
-    cuda_scale(dimGrid, dimBlock, data_, value, d);
-    cuSafeCall(cudaGetLastError());
-
-    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
-  } else
-  #endif
-  {
-    Vec().Scale(value);
-  }
-}
-
-
-template<class Real>
-void CuVectorBase<Real>::AddVec(Real alpha, const CuVectorBase<Real> &vec,
-                                Real beta) {
-  KALDI_ASSERT(vec.Dim() == Dim());
-#if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    Timer tim;
-
-    dim3 dimBlock(CUBLOCK);
-    dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
-    ::MatrixDim d = { 1, Dim(), Dim() };
-
-    cuda_add_mat(dimGrid, dimBlock, alpha, vec.data_, beta, data_, d);
-    cuSafeCall(cudaGetLastError());
-    
-    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
-  } else
-  #endif
-  {
-    if (beta != 1.0) Vec().Scale(beta);
-    Vec().AddVec(alpha, vec.Vec());
-  }
-}
-
-
-
-template<typename Real>
-void CuVectorBase<Real>::AddRowSumMat(Real alpha, const CuMatrixBase<Real> &mat,
-                                      Real beta) {
-  KALDI_ASSERT(mat.NumCols() == Dim());
-#if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    Timer tim;
-   
-    CuVector<Real> temp(Dim()); // create a buffer
-    temp.SetZero();
-    
-    MatrixDim d = mat.Dim(); // only stride will be used!
-  
-    // process per 256 row blocks 
-    for(int32 block=0; (block+1)*256 <= mat.NumRows(); block++) {
-      // 1st dim ... rows, 2nd dim ... cols
-      dim3 dimBlock(256, 1); 
-      dim3 dimGrid(1, mat.NumCols());
-      int32 offset = block*256*d.stride;
-
-      cuda_add_row_sum_mat(dimGrid, dimBlock, mat.data_ + offset, temp.data_, d);
-    }
-    
-    // process the remainder
-    int32 div = mat.NumRows() / 256;
-    int32 mod = mat.NumRows() % 256;
-    if (mod != 0) {
-      // 1st dim ... rows, 2nd dim ... cols
-      dim3 dimBlock(mod, 1);
-      dim3 dimGrid(1, mat.NumCols());
-      int32 offset = div*256*d.stride;
-      
-      cuda_add_row_sum_mat(dimGrid, dimBlock, mat.data_ + offset, temp.data_, d);
-    }
-    // now we have the sum!
-    
-    // add buffer rmp to this vector using alpha and beta
-    this->AddVec(alpha,temp,beta);
-
-    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
-  } else
-#endif
-  {
-    Vec().AddRowSumMat(alpha, mat.Mat(), beta);
-  }
-}
-
-
-
-template<typename Real>
-void CuVectorBase<Real>::AddColSumMat(Real alpha,
-                                      const CuMatrixBase<Real> &mat,
-                                      Real beta) {
-  KALDI_ASSERT(mat.NumRows() == Dim());
-#if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    Timer tim;
-
-    CuVector<Real> temp(Dim()); // create a buffer
-    
-    MatrixDim d = mat.Dim(); // only stride will be used!
-  
-    // process per 256 column blocks 
-    for(int32 block=0; (block+1)*256 <= mat.NumCols(); block++) {
-      // 1st dim ... cols, 2nd dim ... rows
-      dim3 dimBlock(256, 1);
-      dim3 dimGrid(1, mat.NumRows());
-      int32 offset = block*256;
-
-      cuda_add_col_sum_mat(dimGrid, dimBlock, mat.data_ + offset, temp.data_, d);
-    }
-    
-    // process the remainder
-    int32 div = mat.NumCols() / 256;
-    int32 mod = mat.NumCols() % 256;
-    if (mod != 0) {
-      // 1st dim ... cols, 2nd dim ... rows
-      dim3 dimBlock(mod, 1);
-      dim3 dimGrid(1, mat.NumRows());
-      int32 offset=div*256;
-      
-      cuda_add_col_sum_mat(dimGrid, dimBlock, mat.data_ +offset, temp.data_, d);
-    }
-    // now we have the sum!
-    
-    // add buffer rmp to this vector using alpha and beta
-    this->AddVec(alpha, temp, beta);
-    
-    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
-  } else
-  #endif
-  {
-    Vec().AddColSumMat(alpha, mat.Mat(), beta);
-  }
-}
-
-
- 
-template<typename Real> 
-void CuVectorBase<Real>::InvertElements() {
-#if HAVE_CUDA==1
-  if (CuDevice::Instantiate().Enabled()) { 
-    Timer tim;
-    
-    dim3 dimBlock(CUBLOCK*8, 1);
-    dim3 dimGrid(n_blocks(dim_, CUBLOCK*8));
-    MatrixDim d = {1, dim_, dim_};
-
-    cuda_invert_elements(dimGrid, dimBlock, data_, d);
-    cuSafeCall(cudaGetLastError());
-    
-    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
-  } else
-#endif
-  {
-    Vec().InvertElements();
-  }
-}
-
- 
-} // namespace kaldi
-
-#endif
-
-
--- a/src/cudamatrix/cu-vector-speed-test.cc
+++ b/src/cudamatrix/cu-vector-speed-test.cc
@ -0,0 +1,169 @@
+// cudamatrix/cu-vector-speed-test.cc
+
+// Copyright 2013  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "cudamatrix/cu-matrix.h"
+#include "cudamatrix/cu-vector.h"
+#include "cudamatrix/cu-math.h"
+
+using namespace kaldi;
+
+
+namespace kaldi {
+
+// Returns the label "<double>" when Real is 8 bytes wide and "<float>"
+// otherwise; used to tag log lines with the precision being timed.
+template<typename Real>
+std::string NameOf() {
+  if (sizeof(Real) == 8)
+    return "<double>";
+  return "<float>";
+}
+
+// Times CuVector::ApplySoftMax() on a random vector of dimension `dim`: the
+// call is repeated until about 0.05 seconds have elapsed, then the rate
+// dim * iterations / (elapsed_seconds * 1e9) is logged as "gigaflops".
+template<typename Real> void TestCuVectorSoftmax(int32 dim) {
+  const BaseFloat time_budget = 0.05;
+  CuVector<Real> vec(dim);
+  vec.SetRandn();
+
+  Timer timer;
+  int32 num_iters = 0;
+  while (timer.Elapsed() < time_budget) {
+    vec.ApplySoftMax();
+    num_iters++;
+  }
+
+  const BaseFloat dim_f = dim;
+  const BaseFloat gflops = (dim_f * num_iters) / (timer.Elapsed() * 1.0e+09);
+  KALDI_LOG << "For CuVector::Softmax" << NameOf<Real>() << ", for dim = "
+            << dim << ", speed was " << gflops << " gigaflops.";
+}
+
+
+// Times CuVector::Sum() on a random vector of dimension `dim`: the call is
+// repeated until about 0.05 seconds have elapsed (the returned sum is
+// discarded), then dim * iterations / (elapsed_seconds * 1e9) is logged.
+template<typename Real> void TestCuVectorSum(int32 dim) {
+  const BaseFloat time_budget = 0.05;
+  CuVector<Real> vec(dim);
+  vec.SetRandn();
+
+  Timer timer;
+  int32 num_iters = 0;
+  while (timer.Elapsed() < time_budget) {
+    vec.Sum();
+    num_iters++;
+  }
+
+  const BaseFloat dim_f = dim;
+  const BaseFloat gflops = (dim_f * num_iters) / (timer.Elapsed() * 1.0e+09);
+  KALDI_LOG << "For CuVector::Sum" << NameOf<Real>() << ", for dim = "
+            << dim << ", speed was " << gflops << " gigaflops.";
+}
+
+
+// Times the "dot product with a vector of ones" way of summing a vector of
+// dimension `dim`.  Note that the ones vector is constructed and filled inside
+// the timed loop, so its setup cost is part of the reported figure.
+template<typename Real> void TestCuVectorVecVecOne(int32 dim) {
+  const BaseFloat time_budget = 0.05;
+  CuVector<Real> vec(dim);
+  vec.SetRandn();
+
+  Timer timer;
+  int32 num_iters = 0;
+  while (timer.Elapsed() < time_budget) {
+    CuVector<Real> all_ones(dim);
+    all_ones.Set(1.0);
+    VecVec(vec, all_ones);
+    num_iters++;
+  }
+
+  const BaseFloat dim_f = dim;
+  const BaseFloat gflops = (dim_f * num_iters) / (timer.Elapsed() * 1.0e+09);
+  KALDI_LOG << "For CuVector::VecVecOne" << NameOf<Real>() << ", for dim = "
+            << dim << ", speed was " << gflops << " gigaflops.";
+}
+
+
+
+
+// Times CuVector::AddDiagMatMat() with two random dim x dim matrices (both
+// kNoTrans, alpha = beta = 1) for about 0.05 seconds and logs
+// dim * dim * iterations / (elapsed_seconds * 1e9) as "gigaflops".
+template<typename Real> void TestCuVectorAddDiagMatMat(int32 dim) {
+  const BaseFloat time_budget = 0.05;
+  CuVector<Real> diag(dim);
+  diag.SetRandn();
+  CuMatrix<Real> left(dim, dim), right(dim, dim);
+  left.SetRandn();
+  right.SetRandn();
+
+  Timer timer;
+  int32 num_iters = 0;
+  while (timer.Elapsed() < time_budget) {
+    diag.AddDiagMatMat(1.0, left, kNoTrans, right, kNoTrans, 1.0);
+    num_iters++;
+  }
+
+  const BaseFloat dim_f = dim;
+  const BaseFloat gflops =
+      (dim_f * dim_f * num_iters) / (timer.Elapsed() * 1.0e+09);
+  KALDI_LOG << "For CuVector::AddDiagMatMat" << NameOf<Real>() << ", for dim = "
+            << dim << ", speed was " << gflops << " gigaflops.";
+}
+
+
+
+// Runs each speed test in this file for the dimensions 16, 128, 256 and 1024.
+// All sizes are run for one benchmark before moving on to the next benchmark.
+template<typename Real> void CudaVectorSpeedTest() {
+  const int32 kDims[] = { 16, 128, 256, 1024 };
+  std::vector<int32> sizes(kDims, kDims + 4);
+  const int32 num_sizes = sizes.size();
+
+  for (int32 i = 0; i < num_sizes; i++)
+    TestCuVectorSoftmax<Real>(sizes[i]);
+
+  for (int32 i = 0; i < num_sizes; i++)
+    TestCuVectorSum<Real>(sizes[i]);
+
+  for (int32 i = 0; i < num_sizes; i++)
+    TestCuVectorVecVecOne<Real>(sizes[i]);
+
+  for (int32 i = 0; i < num_sizes; i++)
+    TestCuVectorAddDiagMatMat<Real>(sizes[i]);
+}
+
+
+} // namespace kaldi
+
+
+// Entry point of the speed test: requests a GPU when compiled with CUDA, runs
+// the float benchmarks, then the double benchmarks (skipped with a warning if
+// the CUDA device reports no double-precision support).
+int main() {
+#if HAVE_CUDA == 1
+  // "yes" is handed unchanged to CuDevice::SelectGpuId().
+  CuDevice::Instantiate().SelectGpuId("yes");
+#endif
+
+  kaldi::CudaVectorSpeedTest<float>();
+#if HAVE_CUDA == 1
+  if (!CuDevice::Instantiate().DoublePrecisionSupported()) {
+    KALDI_WARN << "Double precision not supported";
+  } else {
+    kaldi::CudaVectorSpeedTest<double>();
+  }
+#else
+  kaldi::CudaVectorSpeedTest<double>();
+#endif
+  std::cout << "Tests succeeded.\n";
+  return 0;
+}
+
--- a/src/cudamatrix/cu-vector-test.cc
+++ b/src/cudamatrix/cu-vector-test.cc
@ -0,0 +1,751 @@
+// cudamatrix/cu-vector-test.cc
+
+// Copyright 2013 Lucas Ondel
+//           2013 Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "cudamatrix/cu-matrix.h"
+#include "cudamatrix/cu-vector.h"
+#include "cudamatrix/cu-tp-matrix.h"
+#include "cudamatrix/cu-sp-matrix.h"
+#include "cudamatrix/cu-math.h"
+
+
+namespace kaldi {
+
+/*
+ * INITIALIZERS
+ */
+
+
+/*
+ * Unit tests
+ */
+
+// Round-trips CuVector through Write()/Read() using string streams, in both
+// binary and text mode, and checks that the vector read back equals the one
+// written.  Every fifth iteration uses a zero-dimensional vector.
+template<class Real>
+static void UnitTestCuVectorIO() {
+  for (int32 iter = 0; iter < 10; iter++) {
+    int32 dim = rand() % 255;
+    if (iter % 5 == 0) dim = 0;
+    const bool binary = (iter % 4 < 2);
+
+    CuVector<Real> written(dim);
+    written.SetRandn();
+    std::ostringstream os;
+    written.Write(os, binary);
+
+    std::istringstream is(os.str());
+    CuVector<Real> read_back;
+    read_back.Read(is, binary);
+    AssertEqual(written, read_back);
+  }
+}
+
+
+// Pushes a random vector through a chain of copies that alternates between
+// Vector and CuVector and between Real and OtherReal, then checks that the
+// final CuVector<Real> still equals a direct CuVector<Real> copy of the input.
+template<typename Real, typename OtherReal>
+static void UnitTestCuVectorCopyFromVec() {
+  for (int32 i = 1; i < 10; i++) {
+    const MatrixIndexT dim = 10 * i;
+    Vector<Real> source(dim);
+    source.SetRandn();
+
+    CuVector<OtherReal> step1(source);   // Vector<Real> -> CuVector<OtherReal>
+    Vector<Real> step2(step1);           // CuVector<OtherReal> -> Vector<Real>
+    CuVector<Real> step3(dim);
+    step3.CopyFromVec(step2);            // Vector<Real> -> CuVector<Real>
+    Vector<OtherReal> step4(dim);
+    step4.CopyFromVec(step3);            // CuVector<Real> -> Vector<OtherReal>
+    CuVector<Real> result(step4);        // Vector<OtherReal> -> CuVector<Real>
+
+    CuVector<Real> expected(source);
+    AssertEqual(result, expected);
+  }
+}
+
+// Checks that a CuSubVector built directly and one obtained through Range()
+// both alias the expected elements of the parent vector.  The parent has
+// M1 + M2 + M3 elements; the sub-vector covers the middle M2 of them.
+template<typename Real>
+static void UnitTestCuSubVector() {
+  for (int32 iter = 0 ; iter < 10; iter++) {
+    // M2 was previously "1 + rand() % 1", which is always 1 and made the
+    // probed index m always 0; use the same range as M1 and M3 instead.
+    int32 M1 = 1 + rand () % 10, M2 = 1 + rand() % 10, M3 = 1 + rand() % 10,
+        M = M1 + M2 + M3,
+        m = rand() % M2;  // index inside the sub-vector to compare
+    CuVector<Real> vec(M);
+    vec.SetRandn();
+    CuSubVector<Real> subvec1(vec, M1, M2),
+        subvec2 = vec.Range(M1, M2);
+    Real f1 = vec(M1 + m), f2 = subvec1(m), f3 = subvec2(m);
+    // These are reads of the same stored element, so exact equality holds.
+    KALDI_ASSERT(f1 == f2);
+    KALDI_ASSERT(f2 == f3);
+  }
+}
+
+
+
+// Compares CuVector::MulTp() with Vector::MulTp() (kNoTrans) for dimensions
+// 10, 20, ..., 90, starting from the same random vector and triangular matrix.
+template<typename Real>
+static void UnitTestCuVectorMulTp() {
+  for (int32 i = 1; i < 10; i++) {
+    const MatrixIndexT dim = 10 * i;
+    Vector<Real> ref_vec(dim);
+    ref_vec.SetRandn();
+    TpMatrix<Real> ref_tp(dim);
+    ref_tp.SetRandn();
+
+    // Take the Cu copies before either side is modified.
+    CuVector<Real> cu_vec(ref_vec);
+    CuTpMatrix<Real> cu_tp(ref_tp);
+
+    ref_vec.MulTp(ref_tp, kNoTrans);
+    cu_vec.MulTp(cu_tp, kNoTrans);
+
+    CuVector<Real> expected(ref_vec);
+    AssertEqual(cu_vec, expected);
+  }
+}
+
+// Compares CuVector::AddTpVec() with Vector::AddTpVec() (alpha = beta = 1,
+// kNoTrans) for dimensions 10, 20, ..., 90 on identical random inputs.
+template<typename Real>
+static void UnitTestCuVectorAddTp() {
+  for (int32 i = 1; i < 10; i++) {
+    const MatrixIndexT dim = 10 * i;
+    Vector<Real> ref_dst(dim);
+    ref_dst.SetRandn();
+    TpMatrix<Real> ref_tp(dim);
+    ref_tp.SetRandn();
+    Vector<Real> ref_src(dim);
+    ref_src.SetRandn();
+
+    // Cu copies of all three operands, taken before the update.
+    CuVector<Real> cu_dst(ref_dst);
+    CuTpMatrix<Real> cu_tp(ref_tp);
+    CuVector<Real> cu_src(ref_src);
+
+    ref_dst.AddTpVec(1.0, ref_tp, kNoTrans, ref_src, 1.0);
+    cu_dst.AddTpVec(1.0, cu_tp, kNoTrans, cu_src, 1.0);
+
+    CuVector<Real> expected(ref_dst);
+    AssertEqual(cu_dst, expected);
+  }
+}
+
+// Checks VecVec() on two random CuVectors against a dot product accumulated
+// element by element.
+template<typename Real> void CuVectorUnitTestVecVec() {
+  // Was "10 % rand() % 100": that divides by zero if rand() returns 0 and is
+  // otherwise almost always exactly 10.  The intent is a size in [10, 109].
+  int32 M = 10 + rand() % 100;
+  CuVector<Real> vec1(M), vec2(M);
+  vec1.SetRandn();
+  vec2.SetRandn();
+  Real prod = 0.0;
+  for (int32 i = 0; i < M; i++)
+    prod += vec1(i) * vec2(i);
+  AssertEqual(prod, VecVec(vec1, vec2));
+}
+
+// Checks the two-argument AddVec(alpha, vec) element by element:
+// result(i) must equal original(i) + alpha * vec(i).
+template<typename Real> void CuVectorUnitTestAddVec() {
+  // Was "10 % rand() % 100": that divides by zero if rand() returns 0 and is
+  // otherwise almost always exactly 10.  The intent is a size in [10, 109].
+  int32 M = 10 + rand() % 100;
+  CuVector<Real> vec1(M);
+  CuVector<Real> vec2(M);
+  vec1.SetRandn();
+  vec2.SetRandn();
+  CuVector<Real> vec1_orig(vec1);  // snapshot before the in-place update
+  BaseFloat alpha = 0.43243;
+  vec1.AddVec(alpha, vec2);
+
+  for (int32 i = 0; i < M; i++)
+    AssertEqual(vec1_orig(i) + alpha * vec2(i), vec1(i));
+}
+
+// Checks AddVec() between a CuVector<float> and a CuVector<Real>.  The first
+// iteration adds the Real vector into the float one; the remaining iterations
+// add the float vector into the Real one.
+template<typename Real> void CuVectorUnitTestAddVecCross() {
+  for (int32 i = 0; i < 4; i++) {
+    // Was "10 % rand() % 100": that divides by zero if rand() returns 0 and
+    // is otherwise almost always exactly 10.
+    int32 M = 10 + rand() % 100;
+    CuVector<float> vec1(M);
+    CuVector<Real> vec2(M);
+    vec1.SetRandn();
+    vec2.SetRandn();
+
+    if (i == 0) {
+      CuVector<Real> vec1_orig(vec1);  // snapshot before the in-place update
+      Real alpha = 0.43243;
+      vec1.AddVec(alpha, vec2);
+
+      // Index renamed from "i" so it no longer shadows the outer loop counter.
+      for (int32 j = 0; j < M; j++)
+        AssertEqual(vec1_orig(j) + alpha * vec2(j), vec1(j));
+    } else {
+      CuVector<Real> vec2_orig(vec2);  // snapshot before the in-place update
+      Real alpha = 0.43243;
+      vec2.AddVec(alpha, vec1);
+      for (int32 j = 0; j < M; j++)
+        AssertEqual(vec2_orig(j) + alpha * vec1(j), vec2(j));
+    }
+  }
+}
+
+// Checks the three-argument AddVec(alpha, vec, beta) element by element:
+// result(i) must equal beta * original(i) + alpha * vec(i).
+template<typename Real> void CuVectorUnitTestAddVecExtra() {
+  // Was "10 % rand() % 100": that divides by zero if rand() returns 0 and is
+  // otherwise almost always exactly 10.  The intent is a size in [10, 109].
+  int32 M = 10 + rand() % 100;
+  CuVector<Real> vec1(M), vec2(M);
+  vec1.SetRandn();
+  vec2.SetRandn();
+  CuVector<Real> vec1_orig(vec1);  // snapshot before the in-place update
+  BaseFloat alpha = 0.43243, beta = 1.4321;
+  vec1.AddVec(alpha, vec2, beta);
+
+  for (int32 i = 0; i < M; i++)
+    AssertEqual(beta * vec1_orig(i) + alpha * vec2(i), vec1(i));
+}
+
+
+// Compares CuVector::AddRowSumMat() with Vector::AddRowSumMat() on the same
+// N x M random matrix and the same random starting vector of dimension M.
+template<typename Real> void CuVectorUnitTestAddRowSumMat() {
+  int32 M = 10 + rand() % 280, N = 10 + rand() % 20;
+  BaseFloat alpha = 10.0143432, beta = 43.4321;
+  CuMatrix<Real> mat(N, M);
+  mat.SetRandn();
+  CuVector<Real> vec(M);
+  // Previously mat.SetRandn() was called a second time here and both vectors
+  // started at zero, so beta only ever scaled zeros.  Randomise the
+  // destination so the beta term is actually checked.
+  vec.SetRandn();
+  Matrix<Real> mat2(mat);
+  Vector<Real> vec2(vec);  // reference starts from the same values
+  vec.AddRowSumMat(alpha, mat, beta);
+  vec2.AddRowSumMat(alpha, mat2, beta);
+  Vector<Real> vec3(vec);
+  AssertEqual(vec2, vec3);
+}
+
+// Compares CuVector::AddColSumMat() with Vector::AddColSumMat() on the same
+// M x N random matrix and the same random starting vector of dimension M.
+template<typename Real> void CuVectorUnitTestAddColSumMat() {
+  int32 M = 10 + rand() % 280, N = 10 + rand() % 20;
+  BaseFloat alpha = 10.0143432, beta = 43.4321;
+  CuMatrix<Real> mat(M, N);
+  mat.SetRandn();
+  CuVector<Real> vec(M);
+  // Previously mat.SetRandn() was called a second time here and both vectors
+  // started at zero, so beta only ever scaled zeros.  Randomise the
+  // destination so the beta term is actually checked.
+  vec.SetRandn();
+  Matrix<Real> mat2(mat);
+  Vector<Real> vec2(vec);  // reference starts from the same values
+  vec.AddColSumMat(alpha, mat, beta);
+  vec2.AddColSumMat(alpha, mat2, beta);
+  Vector<Real> vec3(vec);
+  AssertEqual(vec2, vec3);
+}
+
+
+// Checks CuVector::ApproxEqual(other, tol) against the criterion
+// ||vec1 - vec2|| <= tol * ||vec1|| for tol = 0.5, 1, 2, ... (ten values).
+template<typename Real> void CuVectorUnitTestApproxEqual() {
+  int32 M = 10 + rand() % 100;
+  CuVector<Real> vec1(M), vec2(M);
+  vec1.SetRandn();
+  vec2.SetRandn();
+
+  // Neither vector changes below, so the two norms are computed once instead
+  // of being re-read element by element on every tolerance iteration.
+  Real sumsq = 0.0, sumsq_orig = 0.0;
+  for (int32 j = 0; j < M; j++) {
+    Real a = vec1(j), b = vec2(j);
+    sumsq += (a - b) * (a - b);
+    sumsq_orig += a * a;
+  }
+  Real rms = sqrt(sumsq), rms_orig = sqrt(sumsq_orig);
+
+  Real tol = 0.5;
+  for (int32 i = 0; i < 10; i++) {
+    KALDI_ASSERT(vec1.ApproxEqual(vec2, tol) == (rms <= tol * rms_orig));
+    tol *= 2.0;
+  }
+}
+
+// Compares CuVector::ReplaceValue() with Vector::ReplaceValue().  The middle
+// element is set to the value being replaced so at least one entry matches.
+template<typename Real> static void UnitTestCuVectorReplaceValue() {
+  for (int32 iter = 0; iter < 5; iter++) {
+    const int32 dim = 100 + rand() % 200;
+    Real from_value = 0.1 * (rand() % 100), to_value = 0.1 * (rand() % 50);
+    Vector<Real> ref_vec(dim);
+    ref_vec.SetRandn();
+    ref_vec(dim / 2) = from_value;
+    CuVector<Real> cu_vec(ref_vec);
+
+    ref_vec.ReplaceValue(from_value, to_value);
+    cu_vec.ReplaceValue(from_value, to_value);
+
+    Vector<Real> cu_result(cu_vec);
+    AssertEqual(ref_vec, cu_result);
+  }
+}
+
+// Tests InvertElements() together with MulElements(): a vector multiplied
+// element-wise by its own inverse should be all ones, so its squared norm
+// should equal its dimension.
+template<typename Real> void CuVectorUnitTestInvertElements() {
+  const int32 dim = 256 + rand() % 100;
+  CuVector<Real> original(dim);
+  original.SetRandn();
+
+  CuVector<Real> inverse(original);
+  inverse.InvertElements();
+
+  CuVector<Real> product(original);
+  product.MulElements(inverse);
+
+  Real squared_norm = VecVec(product, product);
+  AssertEqual(squared_norm, static_cast<Real>(dim));
+}
+
+// Checks CuVector::Sum() against the dot product with a vector of ones, for
+// nine dimensions of roughly 2048 * i.
+template<typename Real> void CuVectorUnitTestSum() {
+  for (int32 i =1; i < 10; i++) {
+    // Was "100 % rand()": operands swapped, which divides by zero if rand()
+    // returns 0 and otherwise almost always adds exactly 100.
+    MatrixIndexT dim = 2048 * i + rand() % 100;
+    CuVector<Real> A(dim), ones(dim);
+    A.SetRandn();
+    ones.Set(1.0);
+
+    AssertEqual(VecVec(A, ones), A.Sum());
+  }
+}
+
+// Compares CuVector::Scale() with Vector::Scale() on the same random data.
+template<typename Real> void CuVectorUnitTestScale() {
+  for (int32 i = 0; i < 4; i++) {
+    // Was "100 + 400 % rand()": operands swapped, which divides by zero if
+    // rand() returns 0 and otherwise almost always yields exactly 500.
+    int32 dim = 100 + rand() % 400;
+    CuVector<Real> cu_vec(dim);
+    cu_vec.SetRandn();
+    Vector<Real> vec(cu_vec);  // reference copy taken before scaling
+    BaseFloat scale = 0.333;
+    cu_vec.Scale(scale);
+    vec.Scale(scale);
+    Vector<Real> vec2(cu_vec);
+    KALDI_ASSERT(ApproxEqual(vec, vec2));
+  }
+}
+
+// Tests CopyColFromMat() on every column of a random matrix, then checks the
+// various CopyRowsFromMat() / CopyRowsFromVec() combinations between Cu and
+// non-Cu types.  The flattened copies are only spot-checked on a random
+// subset of positions because element-wise access was slow.
+template<typename Real> void CuVectorUnitTestCopyFromMat() {
+  int32 num_rows = 100 + rand() % 255, num_cols = 100 + rand() % 255;
+  CuMatrix<Real> cu_mat(num_rows, num_cols);
+  cu_mat.SetRandn();
+
+  // Column extraction: compare every element of every column exactly.
+  for (int32 c = 0; c < num_cols; c++) {
+    CuVector<Real> column(num_rows);
+    column.CopyColFromMat(cu_mat, c);
+    for (int32 r = 0; r < num_rows; r++) {
+      KALDI_ASSERT(column(r)==cu_mat(r, c));
+    }
+  }
+
+  Matrix<Real> cpu_mat(cu_mat), cpu_mat_back(num_rows, num_cols);
+  CuMatrix<Real> cu_mat_back(num_rows, num_cols);
+
+  const int32 total = num_rows * num_cols;
+  CuVector<Real> flat_from_cu(total), flat_from_cpu(total);
+  flat_from_cu.CopyRowsFromMat(cu_mat);
+  flat_from_cpu.CopyRowsFromMat(cpu_mat);
+  cpu_mat_back.CopyRowsFromVec(flat_from_cpu);
+  cu_mat_back.CopyRowsFromVec(Vector<Real>(flat_from_cpu));
+  Vector<Real> cpu_flat(total);
+  cpu_flat.CopyRowsFromMat(cu_mat);
+
+  for (int32 k = 0; k < total; k++) {
+    if (rand() % 500 != 0) continue;  // random small subset (it was slow)
+    const int32 r = k / num_cols, c = k % num_cols;
+    KALDI_ASSERT(flat_from_cu(k) == cu_mat(r, c));
+    KALDI_ASSERT(flat_from_cpu(k) == cu_mat(r, c));
+    KALDI_ASSERT(flat_from_cpu(k) == cpu_mat_back(r, c));
+    KALDI_ASSERT(cpu_flat(k) == cpu_mat_back(r, c));
+    KALDI_ASSERT(cpu_flat(k) == cu_mat_back(r, c));
+  }
+}
+
+// Compares CuVector::CopyDiagFromPacked() on a random CuSpMatrix with
+// Vector::CopyDiagFromPacked() on the SpMatrix copy of the same matrix.
+template<typename Real> void CuVectorUnitTestCopyDiagFromPacked() {
+  for (int32 iter = 0; iter < 5; iter++) {
+    const int32 dim = 100 + rand() % 255;
+    CuSpMatrix<Real> cu_sp(dim);
+    cu_sp.SetRandn();
+    CuVector<Real> cu_diag(dim, kUndefined);
+    cu_diag.CopyDiagFromPacked(cu_sp);
+
+    SpMatrix<Real> ref_sp(cu_sp);
+    Vector<Real> ref_diag(dim);
+    ref_diag.CopyDiagFromPacked(ref_sp);
+
+    Vector<Real> cu_diag_copy(cu_diag);
+    KALDI_ASSERT(ref_diag.ApproxEqual(cu_diag_copy));
+  }
+}
+
+// Round-trips a random CuVector<Real> through a CuVector<float> with
+// CopyFromVec() and checks the result still equals the original.  About one
+// iteration in three uses a zero-dimensional vector.
+template<typename Real> void CuVectorUnitTestCopyCross() {
+  for (int32 iter = 0; iter < 10; iter++) {
+    int32 dim = 100 + rand() % 255;
+    if (rand() % 3 == 0) dim = 0;
+    CuVector<Real> original(dim);
+    original.SetRandn();
+    CuVector<float> as_float(dim);
+    as_float.CopyFromVec(original);
+    CuVector<Real> round_trip(dim);
+    round_trip.CopyFromVec(as_float);
+    AssertEqual(original, round_trip);
+  }
+}
+
+// Round-trips a random CuVector<Real> through a (non-Cu) Vector<float> with
+// CopyFromVec() and checks the result still equals the original.  About one
+// iteration in three uses a zero-dimensional vector.
+template<typename Real> void CuVectorUnitTestCopyCross2() {
+  for (int32 iter = 0; iter < 10; iter++) {
+    int32 dim = 100 + rand() % 255;
+    if (rand() % 3 == 0) dim = 0;
+    CuVector<Real> original(dim);
+    original.SetRandn();
+    Vector<float> as_float(dim);
+    as_float.CopyFromVec(original);
+    CuVector<Real> round_trip(dim);
+    round_trip.CopyFromVec(as_float);
+    AssertEqual(original, round_trip);
+  }
+}
+
+// Compares CuVector::CopyDiagFromMat() with Vector::CopyDiagFromMat() on a
+// matrix that is square or has one extra column (transposed on even
+// iterations), and cross-checks the diagonal sums against Trace(false).
+template<typename Real> void CuVectorUnitTestCopyDiagFromMat() {
+  for (int32 iter = 0; iter < 5; iter++) {
+    int32 small_dim = 100 + rand() % 255, large_dim = small_dim + rand() % 2;
+    Matrix<Real> ref_mat(small_dim, large_dim);
+    if (iter % 2 == 0) ref_mat.Transpose();
+    ref_mat.SetRandn();
+    Vector<Real> ref_diag(small_dim, kUndefined);
+    ref_diag.CopyDiagFromMat(ref_mat);
+
+    CuMatrix<Real> cu_mat(ref_mat);
+    CuVector<Real> cu_diag(small_dim, kUndefined);
+    cu_diag.CopyDiagFromMat(cu_mat);
+
+    Vector<Real> cu_diag_copy(cu_diag);
+    AssertEqual(ref_diag, cu_diag_copy);
+    AssertEqual(ref_diag.Sum(), cu_mat.Trace(false));
+    AssertEqual(cu_diag.Sum(), ref_mat.Trace(false));
+  }
+}
+
+
+// Checks Norm(1) and Norm(2) on the fixed vector (1, -2): the expected values
+// are 3 and sqrt(5).
+template<typename Real> void CuVectorUnitTestNorm() {
+  const int32 kDim = 2;
+  CuVector<Real> v(kDim);
+  v(0) = 1.0;
+  v(1) = -2.0;
+  KALDI_ASSERT(ApproxEqual(v.Norm(1.0), 3.0));
+  KALDI_ASSERT(ApproxEqual(v.Norm(2.0), sqrt(5.0)));
+}
+               
+
+// Checks that CuVector::Min() returns exactly the same value as Vector::Min()
+// on a copy of the same random data.
+template<typename Real> void CuVectorUnitTestMin() {
+  for (int32 iter = 0; iter < 5; iter++) {
+    const int32 dim = 100 + rand() % 500;
+    CuVector<Real> cu_vec(dim);
+    cu_vec.SetRandn();
+    Vector<Real> ref_vec(cu_vec);
+    Real cu_min = cu_vec.Min(), ref_min = ref_vec.Min();
+    KALDI_ASSERT(cu_min == ref_min);
+  }
+}
+
+
+// Checks that CuVector::Max() returns exactly the same value as Vector::Max()
+// on a copy of the same random data.
+template<typename Real> void CuVectorUnitTestMax() {
+  for (int32 iter = 0; iter < 5; iter++) {
+    const int32 dim = 100 + rand() % 500;
+    CuVector<Real> cu_vec(dim);
+    cu_vec.SetRandn();
+    Vector<Real> ref_vec(cu_vec);
+    Real cu_max = cu_vec.Max(), ref_max = ref_vec.Max();
+    KALDI_ASSERT(cu_max == ref_max);
+  }
+}
+
+
+// Compares CuVector::ApplySoftMax() with Vector::ApplySoftMax() on the same
+// random input, for ten random dimensions in [100, 399].
+template<typename Real> void CuVectorUnitTestApplySoftMax() {
+  for (int32 iter = 0; iter < 10; iter++) {
+    const int32 dim = 100 + rand() % 300;
+    CuVector<Real> cu_vec(dim);
+    cu_vec.SetRandn();
+    Vector<Real> ref_vec(cu_vec);
+
+    cu_vec.ApplySoftMax();
+    ref_vec.ApplySoftMax();
+
+    CuVector<Real> expected(ref_vec);
+    AssertEqual(cu_vec, expected);
+  }
+}
+
+// Checks CuVector::ApplyExp() element by element against exp() applied to a
+// copy of the original values, with an absolute tolerance of 1e-6.
+template<typename Real> void CuVectorUnitTestApplyExp() {
+  int32 dim = 100;
+  CuVector<Real> vector(dim);
+  vector.SetRandn();
+  CuVector<Real> vector2(vector);  // untouched copy of the input
+
+  vector.ApplyExp();
+  for(int32 j = 0; j < dim; j++) {
+    // std::abs selects the floating-point overload; the unqualified abs() used
+    // before could bind to the integer overload and truncate the difference
+    // (NOTE(review): depends on which headers are visible).  The statement
+    // was also missing its terminating semicolon.
+    KALDI_ASSERT(std::abs(exp(vector2(j)) - vector(j)) < 0.000001);
+  }
+}
+
+// Checks CuVector::ApplyLog() element by element against log() applied to a
+// copy of the original values, with an absolute tolerance of 1e-6.
+template<typename Real> void CuVectorUnitTestApplyLog() {
+  int32 dim = 100;
+  CuVector<Real> vector(dim);
+  vector.SetRandn();
+  // Make every element strictly positive so the logarithm is defined:
+  // x <= 0 is mapped to 1 - x >= 1.
+  for(int32 j = 0; j < dim; j++) {
+    if(vector(j) <= 0.0)
+      vector(j) = 1.0 - vector(j);
+  }
+
+  CuVector<Real> vector2(vector);  // untouched copy of the input
+
+  vector.ApplyLog();
+  for(int32 j = 0; j < dim; j++) {
+    // std::abs selects the floating-point overload; the unqualified abs() used
+    // before could bind to the integer overload and truncate the difference
+    // (NOTE(review): depends on which headers are visible).  The statement
+    // was also missing its terminating semicolon.
+    KALDI_ASSERT(std::abs(log(vector2(j)) - vector(j)) < 0.000001);
+  }
+}
+
+// Compares CuVector::ApplyFloor() with Vector::ApplyFloor(): both the floored
+// contents and the returned counts must agree.
+template<typename Real> void CuVectorUnitTestApplyFloor() {
+  for (int32 iter = 0; iter < 10; iter++) {
+    const int32 dim = 100 + rand() % 700;
+    CuVector<Real> cu_vec(dim);
+    cu_vec.SetRandn();
+    Vector<Real> ref_vec(cu_vec);
+
+    // Floor value is one of 0.33 * {-5, ..., 4}.
+    BaseFloat floor_val = 0.33 * (-5 + rand() % 10);
+    int32 cu_count = cu_vec.ApplyFloor(floor_val);
+    int32 ref_count = ref_vec.ApplyFloor(floor_val);
+
+    CuVector<Real> expected(ref_vec);
+    AssertEqual(expected, cu_vec);
+
+    if (cu_count != ref_count) {
+      KALDI_WARN << "ApplyFloor return code broken...";
+    }
+    KALDI_ASSERT(cu_count==ref_count);
+  }
+}
+
+// Compares CuVector::ApplyPow() with Vector::ApplyPow() for an integer-valued
+// exponent drawn from {-2, -1, 0, 1, 2}.
+template<typename Real> void CuVectorUnitTestApplyPow() {
+  for (int32 iter = 0; iter < 10; iter++) {
+    const int32 dim = 100 + rand() % 700;
+    CuVector<Real> cu_vec(dim);
+    cu_vec.SetRandn();
+    Vector<Real> ref_vec(cu_vec);
+
+    BaseFloat exponent = -2 + (rand() % 5);
+    cu_vec.ApplyPow(exponent);
+    ref_vec.ApplyPow(exponent);
+
+    CuVector<Real> expected(ref_vec);
+    AssertEqual(expected, cu_vec);
+  }
+}
+
+// Compares CuVector::AddVecVec() with Vector::AddVecVec() on identical random
+// operands, with alpha and beta taken directly from rand().  The first
+// element of each result is also printed to stdout.
+template<typename Real> void CuVectorUnitTestAddVecVec() {
+  const int32 dim = 100;
+  CuVector<Real> cu_dst(dim);
+  cu_dst.SetRandn();
+  Vector<Real> ref_dst(cu_dst);
+
+  // beta is drawn before alpha, as in the original call order.
+  Real beta = rand();
+  Real alpha = rand();
+  Vector<Real> ref_v(dim), ref_r(dim);
+  ref_v.SetRandn();
+  ref_r.SetRandn();
+  CuVector<Real> cu_v(ref_v), cu_r(ref_r);
+
+  cu_dst.AddVecVec(alpha, cu_r, cu_v, beta);
+  ref_dst.AddVecVec(alpha, ref_r, ref_v, beta);
+
+  CuVector<Real> expected(ref_dst);
+  std::cout<<expected(0)<<' '<<cu_dst(0)<<std::endl;
+  AssertEqual(expected, cu_dst);
+}
+
+// Compares CuVector::AddDiagMat2() with Vector::AddDiagMat2() for both
+// transpose settings.  The matrix is stored pre-transposed when kTrans is
+// used, so the destination dimension always matches.
+template<typename Real> void CuVectorUnitTestAddDiagMat2() {
+  for (int iter = 0; iter < 4; iter++) {
+    int32 num_rows = 230 + rand() % 100, num_cols = 230 + rand() % 100;
+    BaseFloat alpha = 0.2 + rand() % 3, beta = 0.3 + rand() % 2;
+    CuVector<Real> cu_dst(num_rows);
+    cu_dst.SetRandn();
+
+    CuMatrix<Real> cu_mat_untransposed(num_rows, num_cols);
+    cu_mat_untransposed.SetRandn();
+    MatrixTransposeType trans = (iter % 2 == 0 ? kNoTrans : kTrans);
+    CuMatrix<Real> cu_mat(cu_mat_untransposed, trans);
+
+    Vector<Real> ref_dst(cu_dst);
+    Matrix<Real> ref_mat(cu_mat);
+
+    ref_dst.AddDiagMat2(alpha, ref_mat, trans, beta);
+    cu_dst.AddDiagMat2(alpha, cu_mat, trans, beta);
+
+    Vector<Real> cu_result(cu_dst);
+    AssertEqual(ref_dst, cu_result);
+  }
+}
+
+// Checks CuVector::AddDiagMatMat() for all four transpose combinations
+// against a reference built from AddMatMat(), CopyDiagFromMat(), Scale() and
+// AddVec(): expected = beta * v + alpha * diag(M N).
+template<typename Real>
+static void CuVectorUnitTestAddDiagMatMat() {
+  for (MatrixIndexT iter = 0; iter < 4; iter++) {
+    BaseFloat alpha = 0.432 + rand() % 5, beta = 0.043 + rand() % 2;
+    MatrixIndexT dimM = 10 + rand() % 300,
+                 dimN = 5 + rand() % 300;
+    CuVector<Real> start(dimM);
+    CuMatrix<Real> left_orig(dimM, dimN), right_orig(dimN, dimM);
+    left_orig.SetRandn();
+    right_orig.SetRandn();
+    MatrixTransposeType transM = (iter % 2 == 0 ? kNoTrans : kTrans);
+    MatrixTransposeType transN = ((iter/2) % 2 == 0 ? kNoTrans : kTrans);
+    // Store each operand pre-transposed when its flag is kTrans.
+    CuMatrix<Real> left(left_orig, transM), right(right_orig, transN);
+
+    start.SetRandn();
+    CuVector<Real> actual(start);
+    actual.AddDiagMatMat(alpha, left, transM, right, transN, beta);
+
+    // Reference: form the full product and take its diagonal.
+    CuMatrix<Real> product(dimM, dimM);
+    product.AddMatMat(1.0, left, transM, right, transN, 0.0);
+    CuVector<Real> product_diag(dimM);
+    product_diag.CopyDiagFromMat(product);
+
+    CuVector<Real> expected(start);
+    expected.Scale(beta);
+    expected.AddVec(alpha, product_diag);
+    AssertEqual(actual, expected);
+  }
+}
+
+
+
+// Compares CuVector::AddMatVec() with Vector::AddMatVec(), alternating
+// between kTrans (even iterations) and kNoTrans (odd iterations).
+template<typename Real> void CuVectorUnitTestAddMatVec() {
+  for (int32 iter = 0; iter < 10; iter++) {
+    int32 src_dim = 10 + rand() % 500, dst_dim = 10 + rand() % 400;
+    const bool transpose = (iter % 2 == 0);
+    const MatrixTransposeType trans = transpose ? kTrans : kNoTrans;
+
+    CuVector<Real> cu_src(src_dim);
+    cu_src.SetRandn();
+    Vector<Real> ref_src(cu_src);
+
+    CuVector<Real> cu_dst(dst_dim);
+    cu_dst.SetRandn();
+    Vector<Real> ref_dst(cu_dst);
+
+    // Shape is chosen so that the (possibly transposed) matrix maps a vector
+    // of size src_dim to one of size dst_dim.
+    CuMatrix<Real> cu_mat(transpose ? src_dim : dst_dim,
+                          transpose ? dst_dim : src_dim);
+    cu_mat.SetRandn();
+    Matrix<Real> ref_mat(cu_mat);
+
+    BaseFloat alpha = 0.5 * (rand() % 10), beta = 0.5 * (rand() % 10);
+    cu_dst.AddMatVec(alpha, cu_mat, trans, cu_src, beta);
+    ref_dst.AddMatVec(alpha, ref_mat, trans, ref_src, beta);
+
+    Vector<Real> cu_result(cu_dst);
+    AssertEqual(ref_dst, cu_result);
+  }
+}
+
+
+// Compares CuVector::AddSpVec() with Vector::AddSpVec() on the same random
+// symmetric matrix and vectors.
+template<typename Real> void CuVectorUnitTestAddSpVec() {
+  for (int32 iter = 0; iter < 5; iter++) {
+    const int32 dim = 100 + rand() % 256;
+
+    CuVector<Real> cu_src(dim);
+    cu_src.SetRandn();
+    Vector<Real> ref_src(cu_src);
+
+    CuVector<Real> cu_dst(dim);
+    cu_dst.SetRandn();
+    Vector<Real> ref_dst(cu_dst);
+
+    CuSpMatrix<Real> cu_sp(dim);
+    cu_sp.SetRandn();
+    SpMatrix<Real> ref_sp(cu_sp);
+
+    BaseFloat alpha = 0.5 * (rand() % 5), beta = 0.5 * (rand() % 5);
+    cu_dst.AddSpVec(alpha, cu_sp, cu_src, beta);
+    ref_dst.AddSpVec(alpha, ref_sp, ref_src, beta);
+
+    Vector<Real> cu_result(cu_dst);
+    AssertEqual(ref_dst, cu_result);
+  }
+}
+
+
+
+// Runs every CuVector unit test defined in this file for the precision Real.
+// The tests draw from rand(), so the order of the calls below determines the
+// random inputs each test sees.
+template<typename Real> void CuVectorUnitTest() {
+  UnitTestCuVectorCopyFromVec<Real, float>();
+  // When compiled with CUDA, the following "if" guards only the single
+  // <Real, double> call that comes right after the #endif.
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().DoublePrecisionSupported())
+#endif
+  UnitTestCuVectorCopyFromVec<Real, double>();
+  UnitTestCuVectorIO<Real>();
+  CuVectorUnitTestVecVec<Real>();
+  CuVectorUnitTestAddVec<Real>();
+  CuVectorUnitTestAddVecCross<Real>();
+  CuVectorUnitTestAddVecExtra<Real>();
+  CuVectorUnitTestApproxEqual<Real>();
+  CuVectorUnitTestScale<Real>();
+  CuVectorUnitTestSum<Real>();
+  CuVectorUnitTestInvertElements<Real>();
+  CuVectorUnitTestAddRowSumMat<Real>();
+  CuVectorUnitTestAddColSumMat<Real>();
+  UnitTestCuVectorReplaceValue<Real>();
+  UnitTestCuVectorAddTp<Real>();
+  UnitTestCuVectorMulTp<Real>();
+  UnitTestCuSubVector<Real>();
+  CuVectorUnitTestCopyFromMat<Real>();
+  CuVectorUnitTestMin<Real>();
+  CuVectorUnitTestMax<Real>();
+  CuVectorUnitTestApplySoftMax<Real>();
+  CuVectorUnitTestCopyDiagFromPacked<Real>();
+  CuVectorUnitTestCopyDiagFromMat<Real>();
+  CuVectorUnitTestCopyCross<Real>();
+  CuVectorUnitTestCopyCross2<Real>();  
+  CuVectorUnitTestNorm<Real>();  
+  CuVectorUnitTestApplyExp<Real>();
+  CuVectorUnitTestApplyLog<Real>();
+  CuVectorUnitTestApplyFloor<Real>();
+  CuVectorUnitTestApplyPow<Real>();
+  CuVectorUnitTestAddMatVec<Real>();
+  CuVectorUnitTestAddSpVec<Real>();
+  CuVectorUnitTestAddVecVec<Real>();
+  CuVectorUnitTestAddDiagMat2<Real>();
+  CuVectorUnitTestAddDiagMatMat<Real>();
+}
+
+
+} // namespace kaldi
+
+
+// Entry point of the CuVector unit tests.  Accepts --use-gpu=yes|no|optional
+// (default "yes") and no positional arguments.  The whole suite is run twice:
+// first with the GPU explicitly disabled, then with the requested setting.
+int main(int argc, char *argv[]) {
+  using namespace kaldi;
+  const char *usage = "Usage: cu-vector-test [options]";
+
+  ParseOptions po(usage);
+  std::string use_gpu = "yes";
+  po.Register("use-gpu", &use_gpu, "yes|no|optional");
+  po.Read(argc, argv);
+
+  if (po.NumArgs() != 0) {
+    po.PrintUsage();
+    exit(1);
+  }
+
+  for (int32 pass = 0; pass < 2; pass++) {
+    const bool cpu_pass = (pass == 0);
+#if HAVE_CUDA == 1
+    // First pass: "no" disables the GPU; second pass: honour --use-gpu.
+    if (cpu_pass)
+      CuDevice::Instantiate().SelectGpuId("no");
+    else
+      CuDevice::Instantiate().SelectGpuId(use_gpu);
+#endif
+
+    kaldi::CuVectorUnitTest<float>();
+#if HAVE_CUDA == 1
+    if (!CuDevice::Instantiate().DoublePrecisionSupported()) {
+      KALDI_WARN << "Double precision not supported";
+    } else {
+      kaldi::CuVectorUnitTest<double>();
+    }
+#else
+    kaldi::CuVectorUnitTest<double>();
+#endif
+
+    if (cpu_pass)
+      KALDI_LOG << "Tests without GPU use succeeded.\n";
+    else
+      KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
+  }
+#if HAVE_CUDA == 1
+  CuDevice::Instantiate().PrintProfile();
+#endif
+  return 0;
+}
--- a/src/cudamatrix/cu-vector.cc
+++ b/src/cudamatrix/cu-vector.cc
--- a/src/cudamatrix/cu-vector.h
+++ b/src/cudamatrix/cu-vector.h
@ -2,6 +2,8 @@

 // Copyright 2009-2012  Karel Vesely
 //                      Johns Hopkins University (author: Daniel Povey)
+//                      Lucas Ondel
+//		  2013  Xiaohui Zhang	

 // See ../../COPYING for clarification regarding multiple authors
 //
@ -25,49 +27,170 @@

 #include "matrix/kaldi-vector.h"
 #include "cudamatrix/cu-common.h"
+#include "cudamatrix/cu-value.h"
 #include "cudamatrix/cu-math.h"

 namespace kaldi {

 template<typename Real> class CuMatrixBase;

+template<typename Real>
+Real VecVec(const CuVectorBase<Real> &v1, const CuVectorBase<Real> &v2);
+
+template<typename Real, typename OtherReal>
+Real VecVec(const CuVectorBase<Real> &v1, const CuVectorBase<OtherReal> &v2);
+
 /**
 * Vector for CUDA computing
 */
 template<typename Real>
 class CuVectorBase {
 public:
+  friend class CuVectorBase<float>;
+  friend class CuVectorBase<double>;
  friend class CuMatrixBase<Real>;
+  friend class MatrixBase<Real>;
+  friend class CuPackedMatrix<Real>;
+  friend class CuSpMatrix<Real>;
+  friend class CuTpMatrix<Real>;
+
+  template <typename OtherReal>
+  friend OtherReal VecVec(const CuVectorBase<OtherReal> &v1,
+                          const CuVectorBase<OtherReal> &v2);
  friend void cu::Splice<Real>(const CuMatrix<Real> &src,
-                               const CuStlVector<int32> &frame_offsets,
+                               const CuArray<int32> &frame_offsets,
                               CuMatrix<Real> *tgt);
-  
+  friend class CuRand<Real>;
  
  /// Dimensions
  MatrixIndexT Dim() const { return dim_;  }   

+  /// Returns a pointer to the start of the vector's data.
+  inline Real* Data() { return data_; }
+  /// Returns a pointer to the start of the vector's data (const).
+  inline const Real* Data() const { return data_; }
+  
  /// Copy functions; these will crash if the dimension
  /// do not match.  The operator = in class CuVector will
  /// also change the sizes for you.
  void CopyFromVec(const CuVectorBase<Real> &src);
-  void CopyFromVec(const VectorBase<Real> &src);
-  void CopyToVec(VectorBase<Real> *dst) const;
+  
+  template<typename OtherReal>
+  void CopyFromVec(const CuVectorBase<OtherReal> &M);

+  template<typename OtherReal>
+  void CopyFromVec(const VectorBase<OtherReal> &src);
+
+
+  template<typename OtherReal>
+  void CopyToVec(VectorBase<OtherReal> *dst) const;
+  
+  void CopyRowsFromMat(const CuMatrixBase<Real> &M);
+
+  void CopyRowsFromMat(const MatrixBase<Real> &M);
+  
  /// Math operations
  void SetZero();
  void Set(Real value);
  void Add(Real value);
  void Scale(Real value);
+  
  void AddVec(Real alpha, const CuVectorBase<Real> &vec, Real beta = 1.0);

+  template<typename OtherReal>
+  void AddVec(Real alpha, const CuVectorBase<OtherReal> &vec, Real beta = 1.0);
+
  /// Sum the rows of the matrix, add to vector
  void AddRowSumMat(Real alpha, const CuMatrixBase<Real> &mat, Real beta = 1.0);
  /// Sum the columns of the matrix, add to vector
  void AddColSumMat(Real alpha, const CuMatrixBase<Real> &mat, Real beta = 1.0); 
+
+  /// Add triangular matrix times vector: this <-- beta*this + alpha*M*v.
+  /// Works even if rv == *this.
+  void AddTpVec(const Real alpha, const CuTpMatrix<Real>&M,
+                const MatrixTransposeType trans, const CuVectorBase<Real> &v,
+                const Real beta);  // **beta previously defaulted to 0.0**
+  
+  /// Multiplies this vector by lower-triangular matrix:  *this <-- *this *M
+  void MulTp(const CuTpMatrix<Real> &M, const MatrixTransposeType trans);
+
+  bool ApproxEqual(const CuVectorBase<Real> &other, float tol = 0.01) const;
+  
  void InvertElements(); 

+  void ApplySoftMax();
+  void ApplyExp();
+  void ApplyLog();
+  MatrixIndexT ApplyFloor(Real floor_val);
+  void ApplyPow(Real power);
+  Real Sum() const;
+  void SetRandn();
+  
+  CuSubVector<Real> Range(const MatrixIndexT o, const MatrixIndexT l) {
+    return CuSubVector<Real>(*this, o, l);
+  }
+
+  const CuSubVector<Real> Range(const MatrixIndexT o,
+                                const MatrixIndexT l) const {
+    return CuSubVector<Real>(*this, o, l);
+  }
+
+  void CopyColFromMat(const CuMatrixBase<Real> &mat, MatrixIndexT col);
+
+  template<typename OtherReal>
+  void CopyColFromMat(const CuMatrixBase<OtherReal> &mat, MatrixIndexT col);
+
+  void AddMatVec(const Real alpha, const CuMatrixBase<Real> &M,
+                 MatrixTransposeType trans, const CuVectorBase<Real> &v,
+                 const Real beta);
+  void AddVecVec(Real alpha, const CuVectorBase<Real> &v,
+                 const CuVectorBase<Real> &r, Real beta);
+
+  void AddSpVec(const Real alpha, const CuSpMatrix<Real> &S,
+                const CuVectorBase<Real> &v, const Real beta);
+
+  /// Add the diagonal of a matrix times itself:
+  /// *this = diag(M M^T) +  beta * *this (if trans == kNoTrans), or
+  /// *this = diag(M^T M) +  beta * *this (if trans == kTrans).
+  void AddDiagMat2(Real alpha, const CuMatrixBase<Real> &M,
+                   MatrixTransposeType trans, Real beta);
+
+  /// Add the diagonal of a matrix product:
+  /// *this = alpha * diag(M N) + beta * *this, assuming the "trans" arguments
+  /// are both kNoTrans; for transpose arguments, it behaves as you would expect.
+  void AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M, MatrixTransposeType transM,
+                     const CuMatrixBase<Real> &N, MatrixTransposeType transN,
+                     Real beta = 1.0);  
+
+  inline CuValue<Real> operator() (MatrixIndexT i) {
+    KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
+                          static_cast<UnsignedMatrixIndexT>(dim_));
+    return CuValue<Real>(data_ + i);
+  }
+
+  Real Norm(BaseFloat p); // Only works for p = 1 and p = 2.
+            
+  inline Real operator() (MatrixIndexT i) const {
+    KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
+                          static_cast<UnsignedMatrixIndexT>(dim_));
+    return CuValue<Real>(data_ + i); // will be casted to Real.
+  }
+
+  /// Extracts the diagonal of a packed matrix M; works for Sp or Tp.
+  void CopyDiagFromPacked(const CuPackedMatrix<Real> &M);
+
+  /// Extracts the diagonal of a matrix.
+  void CopyDiagFromMat(const CuMatrix<Real> &M);
+  
+  Real Max() const;  
+  Real Min() const;
+  
+  // Set each element to y = (x == orig ? changed : x).
+  void ReplaceValue(Real orig, Real changed);
+  
+  void MulElements(const CuVectorBase<Real> &v);
+ protected:

-protected:
  // The following two functions should only be called if we did not compile
  // with CUDA or could not get a CUDA card; in that case the contents are
  // interpreted the same as a regular vector.
@ -78,7 +201,7 @@ protected:
    return *(reinterpret_cast<VectorBase<Real>* >(this));
  }
  
-  /// Default constructor: make it private so the user cannot
+  /// Default constructor: make it protected so the user cannot
  /// instantiate this class.
  CuVectorBase<Real>(): data_(NULL), dim_(0) { }
  
@ -89,14 +212,38 @@ protected:
  KALDI_DISALLOW_COPY_AND_ASSIGN(CuVectorBase);
 };

-template<class Real>
+template<typename Real>
 class CuVector: public CuVectorBase<Real> {
+  friend class CuVectorBase<float>;
+  friend class CuVectorBase<double>;
+  friend class CuMatrixBase<Real>;
+  friend class CuPackedMatrix<Real>;
+  friend class CuSpMatrix<Real>;
+  friend class CuTpMatrix<Real>;
+  
 public:
  CuVector() { }
  CuVector(MatrixIndexT dim, MatrixResizeType t = kSetZero) { Resize(dim, t); }
-  CuVector(const CuVector<Real> &v);
+  
  CuVector(const CuVectorBase<Real> &v);
+
  CuVector(const VectorBase<Real> &v);  
+  explicit CuVector(const CuVector<Real> &v) : CuVectorBase<Real>() {
+    Resize(v.Dim(), kUndefined);
+    this->CopyFromVec(v);
+  }
+
+  template<typename OtherReal>
+  explicit CuVector(const CuVectorBase<OtherReal> &v) : CuVectorBase<Real>() {
+    Resize(v.Dim(), kUndefined);
+    this->CopyFromVec(v);
+  }
+
+  template<typename OtherReal>
+  explicit CuVector(const VectorBase<OtherReal> &v) : CuVectorBase<Real>() {
+    Resize(v.Dim(), kUndefined);
+    this->CopyFromVec(Vector<Real>(v));
+  }

  /// Allocate the memory
  void Resize(MatrixIndexT dim, MatrixResizeType t = kSetZero);
@ -104,12 +251,20 @@ class CuVector: public CuVectorBase<Real> {
  ~CuVector() { Destroy(); }

  CuVector<Real> &operator = (const CuVectorBase<Real> &other) {
-    Resize(other.Dim());
-    CopyFromVec(other);
+    Resize(other.Dim(), kUndefined);
+    this->CopyFromVec(other);
+    return *this;
+  }
+
+  CuVector<Real> &operator = (const CuVector<Real> &other) {
+    Resize(other.Dim(), kUndefined);
+    this->CopyFromVec(other);
+    return *this;
  }
  CuVector<Real> &operator = (const VectorBase<Real> &other) {
-    Resize(other.Dim());
-    CopyFromVec(other);
+    Resize(other.Dim(), kUndefined);  // as in the sibling operators: the copy below fills the data.
+    this->CopyFromVec(other);
+    return *this;
  }
      

@ -118,27 +273,91 @@ class CuVector: public CuVectorBase<Real> {
  void Write(std::ostream &is, bool binary) const;

  void Swap(Vector<Real> *vec);
+
 private:
  void Destroy();
 };

 // We'll fill out the following class if it's needed.
-template<class Real>
+template<typename Real>
 class CuSubVector: public CuVectorBase<Real> {
- public:
- private:
+ public:  
+  CuSubVector(const CuVectorBase<Real> &t, const MatrixIndexT origin,
+              const MatrixIndexT length) : CuVectorBase<Real>() {
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(origin)+
+                 static_cast<UnsignedMatrixIndexT>(length) <=
+                 static_cast<UnsignedMatrixIndexT>(t.Dim()));
+    CuVectorBase<Real>::data_ = const_cast<Real*>(t.Data()+origin);
+    CuVectorBase<Real>::dim_ = length;
+  }
+  /// Copy constructor
+  /// this constructor needed for Range() to work in base class.
+  CuSubVector(const CuSubVector &other) : CuVectorBase<Real> () {
+    CuVectorBase<Real>::data_ = other.data_;
+    CuVectorBase<Real>::dim_ = other.dim_;
+  }
+
+  CuSubVector(const Real* data, MatrixIndexT length) : CuVectorBase<Real> () {
+    // Yes, we're evading C's restrictions on const here, and yes, it can be used
+    // to do wrong stuff; unfortunately the workaround would be very difficult.
+    CuVectorBase<Real>::data_ = const_cast<Real*>(data);
+    CuVectorBase<Real>::dim_ = length;
+  }
+    
+  /// This operation does not preserve const-ness, so be careful.
+  CuSubVector(const CuMatrixBase<Real> &matrix, MatrixIndexT row) {
+    CuVectorBase<Real>::data_ = const_cast<Real*>(matrix.RowData(row));
+    CuVectorBase<Real>::dim_ = matrix.NumCols();
+  }
+  
+
 };

-
-
 /// I/O
 template<typename Real>
 std::ostream &operator << (std::ostream &out, const CuVectorBase<Real> &vec);
 
-  
+
+template<typename Real>
+bool ApproxEqual(const CuVectorBase<Real> &a,
+                 const CuVectorBase<Real> &b, Real tol = 0.01) {
+  return a.ApproxEqual(b, tol);
+}
+
+template<typename Real>
+inline void AssertEqual(const CuVectorBase<Real> &a,
+                        const CuVectorBase<Real> &b, float tol = 0.01) {
+  KALDI_ASSERT(a.ApproxEqual(b, tol));  // read-only check, hence const refs.
+}
+
+template<typename Real>
+template<typename OtherReal>
+void CuVectorBase<Real>::CopyFromVec(const CuVectorBase<OtherReal> &v) {
+  v.CopyToVec(&this);  // NOTE(review): '&this' is ill-formed ('this' is a prvalue); would fail if this generic body were instantiated -- confirm intent.
+}
+
+template<typename Real>
+template<typename OtherReal>
+void VectorBase<Real>::CopyFromVec(const CuVectorBase<OtherReal> &cu) {
+  cu.CopyToVec(this);
+}
+
+// declare template specializations.
+template <>
+template <>    
+void CuVectorBase<double>::CopyFromVec<float>(const CuVectorBase<float> &src);
+
+template<>
+template <>
+void CuVectorBase<float>::CopyFromVec<double>(const CuVectorBase<double> &src);
+
+template<typename Real>
+template<typename OtherReal>
+Vector<Real>::Vector(const CuVectorBase<OtherReal> &cu) {
+  Init(cu.Dim());
+  cu.CopyToVec(this);
+}
+
 } // namespace

-
-#include "cu-vector-inl.h"
-
 #endif
--- a/src/cudamatrix/cublas-wrappers.h
+++ b/src/cudamatrix/cublas-wrappers.h
@ -0,0 +1,136 @@
+// cudamatrix/cublas-wrappers.h
+
+// Copyright 2013  Johns Hopkins University (author: Daniel Povey);
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//  http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+#ifndef KALDI_MATRIX_CUBLAS_WRAPPERS_H_
+#define KALDI_MATRIX_CUBLAS_WRAPPERS_H_ 1
+
+// Do not include this file directly.  It is to be included
+// by .cc files in this directory.
+
+namespace kaldi {
+#if HAVE_CUDA == 1
+
+inline void cublas_gemm(char transa, char transb, int m, int n,int k, float alpha, const float *A, int lda,const float *B, int ldb, float beta, float *C, int ldc) {
+  cublasSgemm(transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
+}
+inline void cublas_gemm(char transa, char transb, int m, int n,int k, double alpha, const double *A, int lda,const double *B, int ldb, double beta, double *C, int ldc) {
+  cublasDgemm(transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
+}
+inline void cublas_trsm(int m, int n, float alpha, const float* A, int lda, float* B, int ldb) {
+  cublasStrsm('l','u','n','n',m,n,alpha,A,lda,B,ldb);
+}
+inline void cublas_trsm(int m, int n, double alpha, const double* A, int lda, double* B, int ldb) {
+  cublasDtrsm('l','u','n','n',m,n,alpha,A,lda,B,ldb);
+}
+inline void cublas_syrk(char uplo, char trans, int n, int k,
+                        float alpha, const float *A, int lda,
+                        float beta, float *C, int ldc) {
+  cublasSsyrk(uplo,trans,n,k,alpha,A,lda,beta,C,ldc);
+}
+inline void cublas_syrk(char uplo, char trans, int n, int k,
+                        double alpha, const double *A, int lda,
+                        double beta, double *C, int ldc) {
+  cublasDsyrk(uplo,trans,n,k,alpha,A,lda,beta,C,ldc);
+}
+inline float cublas_dot(int n, const float *x, int incx, const float *y, int incy) {
+  return cublasSdot(n, x, incx, y, incy);
+}
+inline double cublas_dot(int n, const double *x, int incx, const double *y, int incy) {
+  return cublasDdot(n, x, incx, y, incy);
+}
+inline float cublas_asum(int n, const float* x, int incx) {
+  return cublasSasum(n, x, incx);
+}
+inline double cublas_asum(int n, const double* x, int incx) {
+  return cublasDasum(n, x, incx);
+}
+inline float cublas_nrm2(int n, const float* x, int incx) {
+  return cublasSnrm2(n, x, incx);
+}
+inline double cublas_nrm2(int n, const double* x, int incx) {
+  return cublasDnrm2(n, x, incx);
+}
+inline void cublas_copy(int n, const float* x, int incx,
+                        float* y, int incy) {
+  cublasScopy(n,x,incx,y,incy);
+}
+inline void cublas_copy(int n, const double* x, int incx,
+                          double* y, int incy) {
+  cublasDcopy(n,x,incx,y,incy);
+}
+inline void cublas_scal(int n, float alpha, float* mat, int incx) {
+  cublasSscal(n, alpha, mat, incx);
+}
+inline void cublas_scal(int n, double alpha, double* mat, int incx) {
+  cublasDscal(n, alpha, mat, incx);
+}
+
+inline void cublas_axpy(int n, float alpha, const float* x, int incx, float* y, int incy) {
+  cublasSaxpy(n, alpha, x, incx, y, incy);
+}
+inline void cublas_axpy(int n, double alpha, const double* x, int incx, double* y, int incy) {
+  cublasDaxpy(n, alpha, x, incx, y, incy);
+}
+inline void cublas_gemv(char trans, int m, int n, float alpha,
+                        const float* A, int lda, const float* x,
+                        int incx, float beta, float* y, int incy) {
+  cublasSgemv(trans,m,n,alpha,A,lda,x,incx,beta,y,incy);
+}
+inline void cublas_gemv(char trans, int m, int n, double alpha,
+                        const double* A, int lda, const double* x,
+                        int incx, double beta, double* y, int incy) {
+  cublasDgemv(trans,m,n,alpha,A,lda,x,incx,beta,y,incy);
+}
+
+inline void cublas_spmv(char uplo, int n, float alpha, const float *AP, const float *x,
+                        int incx, float beta, float *y, int incy) {
+  cublasSspmv(uplo, n, alpha, AP, x, incx, beta, y, incy);
+}
+inline void cublas_spmv(char uplo, int n, double alpha, const double *AP, const double *x,
+                        int incx, double beta, double *y, int incy) {
+  cublasDspmv(uplo, n, alpha, AP, x, incx, beta, y, incy);
+}
+
+// Use caution with these, the 'transpose' argument is the opposite of what it
+// should really be, due to CUDA storing things in column major order.  We also
+// had to switch 'l' to 'u'; we view our packed matrices as lower-triangular,
+// row-by-row, but CUDA views the same layout as upper-triangular,
+// column-by-column.
+inline void cublas_tpmv(char trans, int n,
+                        const float* Ap, float* x, int incx) {
+  cublasStpmv('u', trans, 'n', n, Ap, x, incx);
+}
+inline void cublas_tpmv(char trans, int n, const double* Ap,
+                        double* x, int incx) {
+  cublasDtpmv('u', trans, 'n', n, Ap, x, incx);
+}
+
+inline void cublas_spr(char uplo, int n, float alpha, const float *x,
+                      int incx, float *AP) {
+  cublasSspr(uplo, n, alpha, x, incx, AP);
+}
+inline void cublas_spr(char uplo, int n, double alpha, const double *x,
+                      int incx, double *AP) {
+  cublasDspr(uplo, n, alpha, x, incx, AP);
+}
+
+#endif
+}
+// namespace kaldi
+
+#endif
--- a/src/cudamatrix/cuda-matrix-test.cc
+++ b/src/cudamatrix/cuda-matrix-test.cc
@ -1,713 +0,0 @@
-// cudamatrix/cuda-matrix-test.cc
-
-// Copyright 2010  Karel Vesely
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include <iostream>
-#include <vector>
-#include <cstdlib>
-
-#include "base/kaldi-common.h"
-#include "cudamatrix/cu-matrix.h"
-#include "cudamatrix/cu-vector.h"
-#include "cudamatrix/cu-math.h"
-
-using namespace kaldi;
-
-
-namespace kaldi {
-
-/*
- * INITIALIZERS
- */
-template<class Real> 
-static void InitRand(VectorBase<Real> *v) {
-  for (MatrixIndexT i = 0;i < v->Dim();i++)
-	(*v)(i) = RandGauss();
-}
-
-
-
-template<class Real> 
-static void InitRand(MatrixBase<Real> *M) {
-  do {
-    for (MatrixIndexT i = 0;i < M->NumRows();i++)
-      for (MatrixIndexT j = 0;j < M->NumCols();j++)
-        (*M)(i, j) = RandGauss();
-  } while (M->NumRows() != 0 && M->Cond() > 100);
-}
-
-
-
-template<class Real> 
-static void RandGaussMatrix(MatrixBase<Real>* mat) {
-  for(int32 r=0; r<mat->NumRows(); r++)
-    for(int32 c=0; c<mat->NumCols(); c++)
-      (*mat)(r,c) = RandGauss();
-}
-
-
-
-template<class Real> 
-static void RandZeroToOneMatrix(MatrixBase<Real>* mat) {
-  for(int32 r=0; r<mat->NumRows(); r++)
-    for(int32 c=0; c<mat->NumCols(); c++)
-      (*mat)(r,c) = RandUniform();
-}
-
-
-
-
-/*
- * ASSERTS
- */
-template<class Real> 
-static void AssertEqual(const MatrixBase<Real> &A,
-                        const MatrixBase<Real> &B,
-                        float tol = 0.001) {
-  KALDI_ASSERT(A.NumRows() == B.NumRows()&&A.NumCols() == B.NumCols());
-  for (MatrixIndexT i = 0;i < A.NumRows();i++) {
-    for (MatrixIndexT j = 0;j < A.NumCols();j++) {
-      KALDI_ASSERT(std::abs(A(i, j)-B(i, j)) < tol*std::max(1.0, (double) (std::abs(A(i, j))+std::abs(B(i, j)))));
-    }
-  }
-}
-
-
-
-template<class Real>
-static bool ApproxEqual(const MatrixBase<Real> &A,
-                        const MatrixBase<Real> &B, Real tol = 0.001) {
-  KALDI_ASSERT(A.NumRows() == B.NumRows());
-  MatrixBase<Real> diff(A);
-  diff.AddSp(1.0, B);
-  Real a = std::max(A.Max(), -A.Min()), b = std::max(B.Max(), -B.Min),
-      d = std::max(diff.Max(), -diff.Min());
-  return (d <= tol * std::max(a, b));
-}
-
-
-
-template<class Real> 
-static void AssertEqual(VectorBase<Real> &A, VectorBase<Real> &B, float tol = 0.001) {
-  KALDI_ASSERT(A.Dim() == B.Dim());
-  for (MatrixIndexT i=0; i < A.Dim(); i++)
-    KALDI_ASSERT(std::abs(A(i)-B(i)) < tol);
-}
-
-
-
-template<class Real> 
-static bool ApproxEqual(VectorBase<Real> &A, VectorBase<Real> &B, float tol = 0.001) {
-  KALDI_ASSERT(A.Dim() == B.Dim());
-  for (MatrixIndexT i=0; i < A.Dim(); i++)
-    if (std::abs(A(i)-B(i)) > tol) return false;
-  return true;
-}
-
-
-
-static void AssertEqual(std::vector<int32> &A, std::vector<int32> &B) {
-  KALDI_ASSERT(A.size() == B.size());
-  for (size_t i=0; i < A.size(); i++)
-    KALDI_ASSERT(A[i] == B[i]);
-}
-
-
-
-/*
- * Unit tests
- */
-
-/*
- * CuMatrix
- */
-template<class Real> 
-static void UnitTestCuMatrixApplyLog() {
-  Matrix<Real> H(100,100);
-  RandGaussMatrix(&H);
-  H.MulElements(H); //make numbers positive
-
-  CuMatrix<Real> D(100,100);
-  D.CopyFromMat(H);
-
-  D.ApplyLog();
-  H.ApplyLog();
-
-  Matrix<Real> H2(100,100);
-  D.CopyToMat(&H2);
-
-  AssertEqual(H,H2);
-}
-
-
-template<class Real> 
-static void UnitTestCuMatrixMulElements() {
-  Matrix<Real> Ha(100,100);
-  Matrix<Real> Hb(100,100);
-  RandGaussMatrix(&Ha);
-  RandGaussMatrix(&Hb);
-
-  CuMatrix<Real> Da(100,100);
-  CuMatrix<Real> Db(100,100);
-  Da.CopyFromMat(Ha);
-  Db.CopyFromMat(Hb);
-
-  Da.MulElements(Db);
-  Ha.MulElements(Hb);
-
-  Matrix<Real> Ha2(100,100);
-  Da.CopyToMat(&Ha2);
-
-  AssertEqual(Ha,Ha2);
-}
-
-
-
-template<class Real> 
-static void UnitTestCuMatrixMulColsVec() {
-  Matrix<Real> Hm(100,99);
-  Vector<Real> Hv(99);
-  RandGaussMatrix(&Hm);
-  InitRand(&Hv);
-
-  CuMatrix<Real> Dm(100,99);
-  CuVector<Real> Dv(99);
-  Dm.CopyFromMat(Hm);
-  Dv.CopyFromVec(Hv);
-
-  Dm.MulColsVec(Dv);
-  Hm.MulColsVec(Hv);
-
-  Matrix<Real> Hm2(100,99);
-  Dm.CopyToMat(&Hm2);
-
-  AssertEqual(Hm,Hm2);
-}
-
-
-
-template<class Real> 
-static void UnitTestCuMatrixMulRowsVec() {
-  Matrix<Real> Hm(100,99);
-  Vector<Real> Hv(100);
-  RandGaussMatrix(&Hm);
-  InitRand(&Hv);
-
-  CuMatrix<Real> Dm(100,99);
-  CuVector<Real> Dv(100);
-  Dm.CopyFromMat(Hm);
-  Dv.CopyFromVec(Hv);
-
-  Dm.MulRowsVec(Dv);
-  Hm.MulRowsVec(Hv);
-
-  Matrix<Real> Hm2(100,99);
-  Dm.CopyToMat(&Hm2);
-
-  AssertEqual(Hm,Hm2);
-}
-
-
-
-template<class Real> 
-static void UnitTestCuMatrixDivRowsVec() {
-  Matrix<Real> Hm(100,99);
-  Vector<Real> Hv(100);
-  RandGaussMatrix(&Hm);
-  InitRand(&Hv);
-
-  CuMatrix<Real> Dm(100,99);
-  CuVector<Real> Dv(100);
-  Dm.CopyFromMat(Hm);
-  Dv.CopyFromVec(Hv);
-
-  Dm.DivRowsVec(Dv);
-  Hv.InvertElements();
-  Hm.MulRowsVec(Hv);
-
-  Matrix<Real> Hm2(100,99);
-  Dm.CopyToMat(&Hm2);
-
-  AssertEqual(Hm,Hm2);
-}
-
-
-
-template<class Real> 
-static void UnitTestCuMatrixAddMat() {
-  Matrix<Real> Ha(100,100);
-  Matrix<Real> Hb(100,100);
-  RandGaussMatrix(&Ha);
-  RandGaussMatrix(&Hb);
-
-  CuMatrix<Real> Da(100,100);
-  CuMatrix<Real> Db(100,100);
-  Da.CopyFromMat(Ha);
-  Db.CopyFromMat(Hb);
-
-  Da.AddMat(0.5,Db);
-  Ha.AddMat(0.5,Hb);
-
-  Matrix<Real> Ha2(100,100);
-  Da.CopyToMat(&Ha2);
-
-  AssertEqual(Ha,Ha2);
-}
-
-
-
-template<class Real> 
-static void UnitTestCuMatrixAddVecToCols() {
-  Matrix<Real> Hm(100,99);
-  Vector<Real> Hv(100);
-  RandGaussMatrix(&Hm);
-  InitRand(&Hv);
-
-  CuMatrix<Real> Dm(100,99);
-  CuVector<Real> Dv(100);
-  Dm.CopyFromMat(Hm);
-  Dv.CopyFromVec(Hv);
-
-  Dm.AddVecToCols(0.5,Dv);
-  Hm.AddVecToCols(0.5,Hv);
-
-  Matrix<Real> Hm2(100,99);
-  Dm.CopyToMat(&Hm2);
-
-  AssertEqual(Hm,Hm2);
-}
-
-
-
-template<class Real> 
-static void UnitTestCuMatrixAddVecToRows() {
-  Matrix<Real> Hm(100,99);
-  Vector<Real> Hv(99);
-  RandGaussMatrix(&Hm);
-  InitRand(&Hv);
-
-  CuMatrix<Real> Dm(100,99);
-  CuVector<Real> Dv(99);
-  Dm.CopyFromMat(Hm);
-  Dv.CopyFromVec(Hv);
-
-  Dm.AddVecToRows(0.5,Dv);
-  Hm.AddVecToRows(0.5,Hv);
-
-  Matrix<Real> Hm2(100,99);
-  Dm.CopyToMat(&Hm2);
-
-  AssertEqual(Hm,Hm2);
-}
-
-
-
-template<class Real> 
-static void UnitTestCuMatrixAddMatMat() {
-  Matrix<Real> Ha(200,100);
-  Matrix<Real> Hb(100,200);
-  Matrix<Real> Hc1(200,200);
-  Matrix<Real> Hc2(100,100);
-  RandGaussMatrix(&Ha);
-  RandGaussMatrix(&Hb);
-
-  CuMatrix<Real> Da(200,100);
-  CuMatrix<Real> Db(100,200);
-  Da.CopyFromMat(Ha);
-  Db.CopyFromMat(Hb);
-  CuMatrix<Real> Dc1(200,200);
-  CuMatrix<Real> Dc2(100,100);
-
-  Dc1.AddMatMat(0.5f,Da,kNoTrans,Db,kNoTrans,0.0f);
-  Dc2.AddMatMat(0.5f,Da,kTrans,Db,kTrans,0.0f);
-  Hc1.AddMatMat(0.5f,Ha,kNoTrans,Hb,kNoTrans,0.0f);
-  Hc2.AddMatMat(0.5f,Ha,kTrans,Hb,kTrans,0.0f);
-
-  Matrix<Real> Hc1a(200,200);
-  Matrix<Real> Hc2a(100,100);
-  Dc1.CopyToMat(&Hc1a);
-  Dc2.CopyToMat(&Hc2a);
-
-  AssertEqual(Hc1,Hc1a);
-  AssertEqual(Hc2,Hc2a);
-}
-
-
-/*
- * CuVector unit tests
- */
-template<class Real> 
-static void UnitTestCuVectorAddVec() {
-  Vector<Real> Hv(777);
-  Vector<Real> Hw(777);
-  InitRand(&Hv);
-  InitRand(&Hw);
-
-  CuVector<Real> Dv(777);
-  CuVector<Real> Dw(777);
-  Dv.CopyFromVec(Hv);
-  Dw.CopyFromVec(Hw);
-
-  Dv.AddVec(0.1,Dw,0.9);
-  Hv.Scale(0.9);
-  Hv.AddVec(0.1,Hw);
-
-  Vector<Real> Hv2(777);
-  Dv.CopyToVec(&Hv2);
-  
-  AssertEqual(Hv,Hv2);
-}
-
-
-
-template<class Real> 
-static void UnitTestCuVectorAddRowSumMat() {
-  const int32 X=4321, Y=19;
-  Real alpha=0.1, beta=0.7;
-
-  Matrix<Real> Hm(X,Y);
-  Vector<Real> Hv(Y);
-  Vector<Real> Hv_accu(Y);
-  RandGaussMatrix(&Hm);
-  InitRand(&Hv);
-
-  CuMatrix<Real> Dm(X,Y);
-  CuVector<Real> Dv(Y);
-  Dm.CopyFromMat(Hm);
-  Dv.CopyFromVec(Hv);
-
-  Dv.AddRowSumMat(alpha,Dm,beta);
-  
-  Hv_accu.SetZero();
-  Hv_accu.AddRowSumMat(1.0, Hm);
-  Hv.Scale(beta);
-  Hv.AddVec(alpha,Hv_accu);
-
-  Vector<Real> Hv2(Y);
-  Dv.CopyToVec(&Hv2);
-
-  AssertEqual(Hv,Hv2);
-}
-
-
-
-template<class Real> 
-static void UnitTestCuVectorAddRowSumMatLarge() {
-  Matrix<Real> Hm(1000,990);
-  Vector<Real> Hv(990);
-  Vector<Real> Hv_accu(990);
-  RandGaussMatrix(&Hm);
-  InitRand(&Hv);
-
-  CuMatrix<Real> Dm(1000,990);
-  CuVector<Real> Dv(990);
-  Dm.CopyFromMat(Hm);
-  Dv.CopyFromVec(Hv);
-
-  Dv.AddRowSumMat(0.5,Dm,0.7);
-  
-  Hv_accu.SetZero();
-  Hv_accu.AddRowSumMat(1.0, Hm);
-  Hv.Scale(0.7);
-  Hv.AddVec(0.5,Hv_accu);
-
-  Vector<Real> Hv2(990);
-  Dv.CopyToVec(&Hv2);
-
-  AssertEqual(Hv,Hv2);
-}
-
-
-
-template<class Real> 
-static void UnitTestCuVectorAddColSumMat() {
-  const int32 X=19, Y=4321;
-  Real alpha=0.5, beta=0.7;
-
-  Matrix<Real> Hm(X,Y);
-  Vector<Real> Hv(X);
-  Vector<Real> Hv_accu(X);
-  RandGaussMatrix(&Hm);
-  InitRand(&Hv);
-
-  CuMatrix<Real> Dm(X,Y);
-  CuVector<Real> Dv(X);
-  Dm.CopyFromMat(Hm);
-  Dv.CopyFromVec(Hv);
-
-  Dv.AddColSumMat(alpha,Dm,beta);
-  
-  Hv_accu.SetZero();
-  Hv_accu.AddColSumMat(1.0, Hm);
-  Hv.Scale(beta);
-  Hv.AddVec(alpha, Hv_accu);
-
-  Vector<Real> Hv2(X);
-  Dv.CopyToVec(&Hv2);
-
-  AssertEqual(Hv,Hv2);
-}
-
-
-
-template<class Real> 
-static void UnitTestCuVectorAddColSumMatLarge() {
-  Matrix<Real> Hm(1000,990);
-  Vector<Real> Hv(1000);
-  Vector<Real> Hv_accu(1000);
-  RandGaussMatrix(&Hm);
-  InitRand(&Hv);
-
-  CuMatrix<Real> Dm(1000,990);
-  CuVector<Real> Dv(1000);
-  Dm.CopyFromMat(Hm);
-  Dv.CopyFromVec(Hv);
-
-  Dv.AddColSumMat(0.5, Dm, 0.7);
-  
-  Hv_accu.SetZero();
-  Hv_accu.AddColSumMat(1.0, Hm);
-  Hv.Scale(0.7);
-  Hv.AddVec(0.5,Hv_accu);
-
-  Vector<Real> Hv2(1000);
-  Dv.CopyToVec(&Hv2);
-
-  AssertEqual(Hv,Hv2);
-}
-
-
-
-template<class Real> 
-static void UnitTestCuVectorInvertElements() {
-  Vector<Real> Hv(777);
-  InitRand(&Hv);
-
-  CuVector<Real> Dv(777);
-  Dv.CopyFromVec(Hv);
-
-  Dv.InvertElements();
-  Hv.InvertElements();
-
-  Vector<Real> Hv2(777);
-  Dv.CopyToVec(&Hv2);
-  
-  AssertEqual(Hv,Hv2);
-}
-
-
-
-/*
- * cu:: unit tests
- */
-template<class Real> 
-static void UnitTestCuSigmoid() {
-  Matrix<Real> Hi(100,111);
-  Matrix<Real> Ho(100,111);
-  RandGaussMatrix(&Hi);
-
-  CuMatrix<Real> Di(100,111);
-  CuMatrix<Real> Do(100,111);
-  Di.CopyFromMat(Hi);
-
-  //gpu
-  Do.Sigmoid(Di);
-  //cpu
-  for(MatrixIndexT r=0; r<Hi.NumRows(); r++) {
-    for(MatrixIndexT c=0; c<Hi.NumCols(); c++) {
-      Ho(r, c) = 1.0/(1.0+exp(-Hi(r, c)));
-    }
-  }
-
-  Matrix<Real> Ho2(100,111);
-  Do.CopyToMat(&Ho2);
-
-  AssertEqual(Ho,Ho2);
-}
-
-
-
-template<class Real> 
-static void UnitTestCuDiffSigmoid() {
-  Matrix<Real> Hi(100,111);
-  Matrix<Real> Ho(100,111);
-  Matrix<Real> Hy(100,111);
-  RandGaussMatrix(&Hi);
-  RandZeroToOneMatrix(&Hy);
-
-  CuMatrix<Real> Di(100,111);
-  CuMatrix<Real> Do(100,111);
-  CuMatrix<Real> Dy(100,111);
-  Di.CopyFromMat(Hi);
-  Dy.CopyFromMat(Hy);
-
-  //gpu
-  Do.DiffSigmoid(Dy, Di);
-  //cpu
-  for(MatrixIndexT r=0; r<Ho.NumRows(); r++) {
-    for(MatrixIndexT c=0; c<Ho.NumCols(); c++) {
-      Ho(r, c) = Hy(r, c)*(1.0 - Hy(r, c)) * Hi(r, c);
-    }
-  }
-
-  Matrix<Real> Ho2(100,111);
-  Do.CopyToMat(&Ho2);
-
-  AssertEqual(Ho,Ho2);
-}
-
-
-
-template<class Real> 
-static void UnitTestCuSoftmax() {
-  Matrix<Real> Hi(100,111);
-  Matrix<Real> Ho(100,111);
-  RandGaussMatrix(&Hi);
-  
-  CuMatrix<Real> Di(100,111);
-  CuMatrix<Real> Do(100,111);
-  Di.CopyFromMat(Hi);
-
-  //gpu
-  Do.Softmax(Di);
-  //cpu
-  Ho.CopyFromMat(Hi);
-  for(MatrixIndexT r=0; r<Ho.NumRows(); r++) {
-    Ho.Row(r).ApplySoftMax();
-  }
-
-  Matrix<Real> Ho2(100,111);
-  Do.CopyToMat(&Ho2);
-
-  AssertEqual(Ho,Ho2);
-}
-
-
-
-template<class Real> 
-static void UnitTestCuFindRowMaxId() {
-  Matrix<Real> Hi(100,111);
-  RandGaussMatrix(&Hi);
-
-  CuMatrix<Real> Di(100,111);
-  Di.CopyFromMat(Hi);
-
-  std::vector<int32> Hmax(100);
-  CuStlVector<int32> Dmax(100);
-
-  //gpu
-  Di.FindRowMaxId(&Dmax);
-
-  //cpu
-  for(MatrixIndexT r=0; r<Hi.NumRows(); r++) {
-    Real max=-1e20; int32 idx=-1;
-    for(MatrixIndexT c=0; c<Hi.NumCols(); c++) {
-      if(Hi(r,c) > max) { idx=c; max=Hi(r,c); }
-    }
-    Hmax[r] = idx;
-  }
-
-  std::vector<int32> Hmax2(100);
-  Dmax.CopyToVec(&Hmax2);
-
-  AssertEqual(Hmax,Hmax2);
-}
-
-
-
-template<class Real> 
-static void UnitTestCuDiffXent() {
-  int32 X=100, Y=111;
-  //nnet output / diff
-  Matrix<Real> Hi(X,Y);
-  RandZeroToOneMatrix(&Hi);
-  CuMatrix<Real> Di(X,Y);
-  Di.CopyFromMat(Hi);
-  //target vector
-  std::vector<int32> Htgt(X);
-  for(int32 i=0; i<X; i++) {
-    Htgt[i] = rand()%Y;
-  }
-  CuStlVector<int32> Dtgt(X);
-  Dtgt.CopyFromVec(Htgt);
-  //logpost vector
-  Vector<Real> Hlogpost(X);
-  CuVector<Real> Dlogpost(X);
-  
-  //gpu
-  Di.DiffXent(Dtgt, &Dlogpost);
-  //cpu
-  for(MatrixIndexT r=0; r<Hi.NumRows(); r++) {
-    int32 col_tgt = Htgt[r];
-    Hlogpost(r) = log(Hi(r, col_tgt));
-    Hi(r, col_tgt) -= 1.0;
-  }
-
-  Matrix<Real> Hi2(X,Y);
-  Di.CopyToMat(&Hi2);
-  Vector<Real> Hlogpost2(X);
-  Dlogpost.CopyToVec(&Hlogpost2);
-
-  AssertEqual(Hi,Hi2);
-  AssertEqual(Hlogpost,Hlogpost2);
-}
-
-
-
-template<class Real> void CudaMatrixUnitTest() {
-  //test CuMatrix<Real> methods by cross-check with Matrix
-  UnitTestCuMatrixApplyLog<Real>();
-  UnitTestCuMatrixMulElements<Real>();
-  UnitTestCuMatrixMulColsVec<Real>();
-  UnitTestCuMatrixMulRowsVec<Real>();
-  UnitTestCuMatrixDivRowsVec<Real>();
-  UnitTestCuMatrixAddMat<Real>();
-  UnitTestCuMatrixAddVecToCols<Real>();
-  UnitTestCuMatrixAddVecToRows<Real>();
-  UnitTestCuMatrixAddMatMat<Real>();
-  //test CuVector<Real> methods
-  UnitTestCuVectorAddVec<Real>();
-  UnitTestCuVectorAddRowSumMat<Real>();
-  UnitTestCuVectorAddRowSumMatLarge<Real>();
-  UnitTestCuVectorAddColSumMat<Real>();
-  UnitTestCuVectorAddColSumMatLarge<Real>();
-  UnitTestCuVectorInvertElements<Real>();
-
-  UnitTestCuSigmoid<Real>();
-  UnitTestCuDiffSigmoid<Real>();
-  UnitTestCuFindRowMaxId<Real>();
-  UnitTestCuSoftmax<Real>();
-  UnitTestCuDiffXent<Real>();
-}
-
-
-} // namespace kaldi
-
-
-int main() {
-    //Select the GPU
-#if HAVE_CUDA==1
-    CuDevice::Instantiate().SelectGpuId(-2); //-2 .. automatic selection
-#endif
-
-
-  kaldi::CudaMatrixUnitTest<float>();
-  kaldi::CudaMatrixUnitTest<double>();
-  std::cout << "Tests succeeded.\n";
-}
--- a/src/matrix/cblas-wrappers.h
+++ b/src/matrix/cblas-wrappers.h
@ -17,6 +17,9 @@
 // MERCHANTABLITY OR NON-INFRINGEMENT.
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
+#ifndef KALDI_MATRIX_CBLAS_WRAPPERS_H_
+#define KALDI_MATRIX_CBLAS_WRAPPERS_H_ 1
+

 #include <limits>
 #include "matrix/sp-matrix.h"
@ -235,6 +238,8 @@ inline void cblas_Xgemm(const double alpha,
              alpha, Adata, a_stride, Bdata, b_stride,
              beta, Mdata, stride); 
 }
+
+
 inline void cblas_Xsymm(const float alpha,
                        MatrixIndexT sz,
                        const float *Adata,MatrixIndexT a_stride,
@ -470,3 +475,5 @@ inline void clapack_Xgetri(MatrixIndexT num_rows, double *Mdata, MatrixIndexT st

 }
 // namespace kaldi
+
+#endif
--- a/src/matrix/compressed-matrix.cc
+++ b/src/matrix/compressed-matrix.cc
@ -23,7 +23,7 @@

 namespace kaldi {

-template<class Real>
+template<typename Real>
 void CompressedMatrix::CopyFromMat(
    const MatrixBase<Real> &mat) {
  if (data_ != NULL) {
@ -95,6 +95,20 @@ void CompressedMatrix::CopyFromMat(const MatrixBase<float> &mat);
 template
 void CompressedMatrix::CopyFromMat(const MatrixBase<double> &mat);

+
+template<typename Real>
+CompressedMatrix &CompressedMatrix::operator =(const MatrixBase<Real> &mat) {
+  this->CopyFromMat(mat);
+  return *this;
+}
+
+// Instantiate the template for float and double.
+template
+CompressedMatrix& CompressedMatrix::operator =(const MatrixBase<float> &mat);
+
+template
+CompressedMatrix& CompressedMatrix::operator =(const MatrixBase<double> &mat);
+
 inline uint16 CompressedMatrix::FloatToUint16(
    const GlobalHeader &global_header,
    float value) {
@ -114,7 +128,7 @@ inline float CompressedMatrix::Uint16ToFloat(
      + global_header.range * 1.52590218966964e-05 * value;
 }

-template<class Real>  // static
+template<typename Real>  // static
 void CompressedMatrix::ComputeColHeader(
    const GlobalHeader &global_header,
    const Real *data, MatrixIndexT stride,
@ -229,7 +243,7 @@ inline float CompressedMatrix::CharToFloat(
 }


-template<class Real>  // static
+template<typename Real>  // static
 void CompressedMatrix::CompressColumn(
    const GlobalHeader &global_header,
    const Real *data, MatrixIndexT stride,
@ -383,7 +397,7 @@ void CompressedMatrix::Read(std::istream &is, bool binary) {
    KALDI_ERR << "Failed to read data.";
 }

-template<class Real>
+template<typename Real>
 void CompressedMatrix::CopyToMat(MatrixBase<Real> *mat) const {
  if (data_ == NULL) {
    KALDI_ASSERT(mat->NumRows() == 0);
--- a/src/matrix/compressed-matrix.h
+++ b/src/matrix/compressed-matrix.h
@ -46,20 +46,24 @@ class CompressedMatrix {

  ~CompressedMatrix() { Destroy(); }
  
-  template<class Real>
+  template<typename Real>
  CompressedMatrix(const MatrixBase<Real> &mat): data_(NULL) { CopyFromMat(mat); }

+
  /// This will resize *this and copy the contents of mat to *this.
-  template<class Real>
+  template<typename Real>
  void CopyFromMat(const MatrixBase<Real> &mat);
  
  CompressedMatrix(const CompressedMatrix &mat);
  
  CompressedMatrix &operator = (const CompressedMatrix &mat); // assignment operator.
+
+  template<typename Real>
+  CompressedMatrix &operator = (const MatrixBase<Real> &mat); // assignment operator.
  
  // Note: mat must have the correct size, CopyToMat no longer attempts
  // to resize the matrix
-  template<class Real>
+  template<typename Real>
  void CopyToMat(MatrixBase<Real> *mat) const;

  void Write(std::ostream &os, bool binary) const;
@ -122,12 +126,12 @@ class CompressedMatrix {
    uint16 percentile_100;
  };

-  template<class Real>
+  template<typename Real>
  static void CompressColumn(const GlobalHeader &global_header,
                             const Real *data, MatrixIndexT stride,
                             int32 num_rows, PerColHeader *header,
                             unsigned char *byte_data);
-  template<class Real>
+  template<typename Real>
  static void ComputeColHeader(const GlobalHeader &global_header,
                               const Real *data, MatrixIndexT stride,
                               int32 num_rows, PerColHeader *header);
--- a/src/matrix/jama-eig.h
+++ b/src/matrix/jama-eig.h
@ -36,7 +36,7 @@ namespace kaldi {
 // This class is not to be used externally.  See the Eig function in the Matrix
 // class in kaldi-matrix.h.  This is the external interface.

-template<class Real> class EigenvalueDecomposition {
+template<typename Real> class EigenvalueDecomposition {
  // This class is based on the EigenvalueDecomposition class from the JAMA
  // library (version 1.0.2).
 public:
@ -110,7 +110,7 @@ template<class Real> class EigenvalueDecomposition {
 template class EigenvalueDecomposition<float>;  // force instantiation.
 template class EigenvalueDecomposition<double>;  // force instantiation.

-template<class Real> void  EigenvalueDecomposition<Real>::Tred2() {
+template<typename Real> void  EigenvalueDecomposition<Real>::Tred2() {
  //  This is derived from the Algol procedures tred2 by
  //  Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
  //  Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
@ -224,7 +224,7 @@ template<class Real> void  EigenvalueDecomposition<Real>::Tred2() {
   e_[0] = 0.0;
 }

-template<class Real> void EigenvalueDecomposition<Real>::Tql2() {
+template<typename Real> void EigenvalueDecomposition<Real>::Tql2() {
  //  This is derived from the Algol procedures tql2, by
  //  Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
  //  Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
@ -341,7 +341,7 @@ template<class Real> void EigenvalueDecomposition<Real>::Tql2() {
  }
 }

-template<class Real>
+template<typename Real>
 void EigenvalueDecomposition<Real>::Orthes() {

  //  This is derived from the Algol procedures orthes and ortran,
@ -433,7 +433,7 @@ void EigenvalueDecomposition<Real>::Orthes() {
  }
 }

-template<class Real> void  EigenvalueDecomposition<Real>::Hqr2() {
+template<typename Real> void  EigenvalueDecomposition<Real>::Hqr2() {
  //  This is derived from the Algol procedure hqr2,
  //  by Martin and Wilkinson, Handbook for Auto. Comp.,
  //  Vol.ii-Linear Algebra, and the corresponding
@ -872,7 +872,7 @@ template<class Real> void  EigenvalueDecomposition<Real>::Hqr2() {
  }
 }

-template<class Real>
+template<typename Real>
 EigenvalueDecomposition<Real>::EigenvalueDecomposition(const MatrixBase<Real> &A) {
  KALDI_ASSERT(A.NumCols() == A.NumRows() && A.NumCols() >= 1);
  n_ = A.NumRows();
@ -907,7 +907,7 @@ EigenvalueDecomposition<Real>::EigenvalueDecomposition(const MatrixBase<Real> &A
  }
 }

-template<class Real>
+template<typename Real>
 EigenvalueDecomposition<Real>::~EigenvalueDecomposition() {
  delete [] d_;
  delete [] e_;
--- a/src/matrix/jama-svd.h
+++ b/src/matrix/jama-svd.h
@ -61,7 +61,7 @@ namespace kaldi {
 */


-template<class Real>
+template<typename Real>
 bool MatrixBase<Real>::JamaSvd(VectorBase<Real> *s_in,
                               MatrixBase<Real> *U_in,
                               MatrixBase<Real> *V_in) {  //  Destructive!
--- a/src/matrix/kaldi-gpsr-test.cc
+++ b/src/matrix/kaldi-gpsr-test.cc
@ -27,12 +27,12 @@ namespace ut = kaldi::unittest;

 namespace kaldi {

-template<class Real> static void InitRand(VectorBase<Real> *v) {
+template<typename Real> static void InitRand(VectorBase<Real> *v) {
  for (MatrixIndexT i = 0;i < v->Dim();i++)
    (*v)(i) = RandGauss();
 }

-template<class Real> static void InitRand(MatrixBase<Real> *M) {
+template<typename Real> static void InitRand(MatrixBase<Real> *M) {
 start:
  for (MatrixIndexT i = 0;i < M->NumRows();i++)
    for (MatrixIndexT j = 0;j < M->NumCols();j++)
@ -44,7 +44,7 @@ template<class Real> static void InitRand(MatrixBase<Real> *M) {
    }
 }

-template<class Real> static void InitRand(SpMatrix<Real> *M) {
+template<typename Real> static void InitRand(SpMatrix<Real> *M) {
 start_sp:
  for (MatrixIndexT i = 0;i < M->NumRows();i++)
    for (MatrixIndexT j = 0;j<=i;j++)
@ -56,7 +56,7 @@ template<class Real> static void InitRand(SpMatrix<Real> *M) {
  }
 }

-template<class Real> static void UnitTestGpsr() {
+template<typename Real> static void UnitTestGpsr() {
  for (int32 i = 0; i < 5; i++) {
    MatrixIndexT dim1 = (rand() % 10) + 10;
    MatrixIndexT dim2 = (rand() % 10) + 10;
--- a/src/matrix/kaldi-matrix.cc
+++ b/src/matrix/kaldi-matrix.cc
@ -5,7 +5,6 @@
 //                       Yanmin Qian;  Petr Schwarz;  Jan Silovsky;
 //                       Haihua Xu

-
 // See ../../COPYING for clarification regarding multiple authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@ -30,9 +29,14 @@
 namespace kaldi {

 template<typename Real>
-void MatrixBase<Real>::Invert(Real *LogDet, Real *DetSign,
+void MatrixBase<Real>::Invert(Real *log_det, Real *det_sign,
                              bool inverse_needed) {
  KALDI_ASSERT(num_rows_ == num_cols_);
+  if (num_rows_ == 0) {
+    if (det_sign) *det_sign = 1;
+    if (log_det) *log_det = 0.0;
+    return;
+  }
 #ifndef HAVE_ATLAS
  KaldiBlasInt *pivot = new KaldiBlasInt[num_rows_];
  KaldiBlasInt M = num_rows_;
@ -60,26 +64,26 @@ void MatrixBase<Real>::Invert(Real *LogDet, Real *DetSign,
    if (inverse_needed) {
      KALDI_ERR << "Cannot invert: matrix is singular";
    } else {
-      if (LogDet) *LogDet = -std::numeric_limits<Real>::infinity();
-      if (DetSign) *DetSign = 0;
+      if (log_det) *log_det = -std::numeric_limits<Real>::infinity();
+      if (det_sign) *det_sign = 0;
      return;
    }
  }
-  if (DetSign != NULL) {
+  if (det_sign != NULL) {
    int sign = 1;
    for (MatrixIndexT i = 0; i < num_rows_; i++)
      if (pivot[i] != static_cast<int>(i) + pivot_offset) sign *= -1;
-    *DetSign = sign;
+    *det_sign = sign;
  }
-  if (LogDet != NULL || DetSign != NULL) {  // Compute log determinant.
-    if (LogDet != NULL) *LogDet = 0.0;
+  if (log_det != NULL || det_sign != NULL) {  // Compute log determinant.
+    if (log_det != NULL) *log_det = 0.0;
    Real prod = 1.0;
    for (MatrixIndexT i = 0; i < num_rows_; i++) {
      prod *= (*this)(i, i);
      if (i == num_rows_ - 1 || std::fabs(prod) < 1.0e-10 ||
          std::fabs(prod) > 1.0e+10) {
-        if (LogDet != NULL) *LogDet += log(fabs(prod));
-        if (DetSign != NULL) *DetSign *= (prod > 0 ? 1.0 : -1.0);
+        if (log_det != NULL) *log_det += log(fabs(prod));
+        if (det_sign != NULL) *det_sign *= (prod > 0 ? 1.0 : -1.0);
        prod = 1.0;
      }
    }
@ -108,8 +112,8 @@ void MatrixBase<float>::AddVecVec(const float alpha,
             1, data_, stride_);
 }

-template<class Real>
-template<class OtherReal>
+template<typename Real>
+template<typename OtherReal>
 void MatrixBase<Real>::AddVecVec(const Real alpha,
                                 const VectorBase<OtherReal> &a,
                                 const VectorBase<OtherReal> &b) {
@ -146,6 +150,7 @@ void MatrixBase<double>::AddVecVec(const double alpha,
                                   const VectorBase<double> &a,
                                   const VectorBase<double> &rb) {
  KALDI_ASSERT(a.Dim() == num_rows_ && rb.Dim() == num_cols_);
+  if (num_rows_ == 0) return;
  cblas_Xger(a.Dim(), rb.Dim(), alpha, a.Data(), 1, rb.Data(),
             1, data_, stride_);
 }
@ -162,11 +167,50 @@ void MatrixBase<Real>::AddMatMat(const Real alpha,
               || (transA == kNoTrans && transB == kTrans && A.num_cols_ == B.num_cols_ && A.num_rows_ == num_rows_ && B.num_rows_ == num_cols_)
               || (transA == kTrans && transB == kTrans && A.num_rows_ == B.num_cols_ && A.num_cols_ == num_rows_ && B.num_rows_ == num_cols_));
  KALDI_ASSERT(&A !=  this && &B != this);
+  if (num_rows_ == 0) return;
  cblas_Xgemm(alpha, transA, A.data_, A.num_rows_, A.num_cols_, A.stride_,
              transB, B.data_, B.stride_, beta, data_, num_rows_, num_cols_, stride_);

 }

+// Mirrors the strictly-lower triangle of this (square) matrix into the upper
+// triangle: element (j, i) is overwritten with element (i, j) for every
+// j < i.  The diagonal and the lower triangle itself are not modified.
+template<typename Real>
+void MatrixBase<Real>::CopyLowerToUpper() {
+  KALDI_ASSERT(num_rows_ == num_cols_);
+  const MatrixIndexT dim = num_rows_, stride = stride_;
+  Real *base = data_;
+  // Row 0 has nothing below the diagonal, so start from row 1.
+  for (MatrixIndexT row = 1; row < dim; row++) {
+    const Real *lower = base + row * stride;  // element (row, 0): source.
+    Real *upper = base + row;                 // element (0, row): target.
+    for (MatrixIndexT col = 0; col < row; col++, upper += stride)
+      *upper = lower[col];
+  }
+}
+
+
+// Mirrors the strictly-upper triangle of this (square) matrix into the lower
+// triangle: element (i, j) is overwritten with element (j, i) for every
+// j < i.  The diagonal and the upper triangle itself are not modified.
+template<typename Real>
+void MatrixBase<Real>::CopyUpperToLower() {
+  KALDI_ASSERT(num_rows_ == num_cols_);
+  const MatrixIndexT dim = num_rows_, stride = stride_;
+  Real *base = data_;
+  // Row 0 has nothing below the diagonal, so start from row 1.
+  for (MatrixIndexT row = 1; row < dim; row++) {
+    Real *lower = base + row * stride;  // element (row, 0): target.
+    const Real *upper = base + row;     // element (0, row): source.
+    for (MatrixIndexT col = 0; col < row; col++, upper += stride)
+      lower[col] = *upper;
+  }
+}
+
+// Forwards to cblas_Xsyrk with (transA, num_rows_, the free dimension of A,
+// alpha, A, beta, *this).  Presumably this computes
+// *this = beta * *this + alpha * A * A^T (or A^T * A for kTrans) -- confirm
+// against the cblas_Xsyrk wrapper.
+// Requirements checked here: *this is square; A has as many rows (kNoTrans)
+// or columns (kTrans) as *this; A does not share its data pointer with
+// *this.  An empty matrix is a no-op.
+template<typename Real>
+void MatrixBase<Real>::SymAddMat2(const Real alpha,
+                                  const MatrixBase<Real> &A,
+                                  MatrixTransposeType transA,
+                                  Real beta) {
+  KALDI_ASSERT(num_rows_ == num_cols_ &&
+               ((transA == kNoTrans && A.num_rows_ == num_rows_) ||
+                (transA == kTrans && A.num_cols_ == num_cols_)));
+  KALDI_ASSERT(A.data_ != data_);
+  if (num_rows_ == 0) return;
+
+  // The dimension of A that the assertion above does not constrain.
+  MatrixIndexT free_dim;
+  if (transA == kNoTrans)
+    free_dim = A.num_cols_;
+  else
+    free_dim = A.num_rows_;
+
+  // This function call is hard-coded to update the lower triangle.
+  cblas_Xsyrk(transA, num_rows_, free_dim, alpha, A.Data(), A.Stride(),
+              beta, data_, stride_);
+}
+

 template<typename Real>
 void MatrixBase<Real>::AddMatSmat(const Real alpha,
@ -253,13 +297,14 @@ void MatrixBase<Real>::AddSpSp(const Real alpha, const SpMatrix<Real> &A_in,
  // CblasLower or CblasUpper would work below as symmetric matrix is copied
  // fully (to save work, we used the matrix constructor from SpMatrix).
  // CblasLeft means A is on the left: C <-- alpha A B + beta C
+  if (sz == 0) return;
  cblas_Xsymm(alpha, sz, A.data_, A.stride_, B.data_, B.stride_, beta, data_, stride_);
 }

 template<typename Real>
 void MatrixBase<Real>::AddMat(const Real alpha, const MatrixBase<Real>& A,
                               MatrixTransposeType transA) {
-  if (&A == this) {  // Make it work in this case.
+  if (&A == this) {
    if (transA == kNoTrans) {
      Scale(alpha + 1.0);
    } else {
@ -293,20 +338,22 @@ void MatrixBase<Real>::AddMat(const Real alpha, const MatrixBase<Real>& A,
    Real *adata = A.data_, *data = data_;
    if (transA == kNoTrans) {
      KALDI_ASSERT(A.num_rows_ == num_rows_ && A.num_cols_ == num_cols_);
+      if (num_rows_ == 0) return;
      for (MatrixIndexT row = 0; row < num_rows_; row++, adata += aStride,
               data += stride) {
        cblas_Xaxpy(num_cols_, alpha, adata, 1, data, 1);
      }
    } else {
      KALDI_ASSERT(A.num_cols_ == num_rows_ && A.num_rows_ == num_cols_);
+      if (num_rows_ == 0) return;      
      for (MatrixIndexT row = 0; row < num_rows_; row++, adata++, data += stride)
        cblas_Xaxpy(num_cols_, alpha, adata, aStride, data, 1);
    }
  }
 }

-template<class Real>
-template<class OtherReal>
+template<typename Real>
+template<typename OtherReal>
 void MatrixBase<Real>::AddSp(const Real alpha, const SpMatrix<OtherReal> &S) {
  KALDI_ASSERT(S.NumRows() == NumRows() && S.NumRows() == NumCols());
  Real *data = data_; const OtherReal *sdata = S.Data();
@ -331,6 +378,31 @@ template
 void MatrixBase<double>::AddSp(const double alpha, const SpMatrix<float> &S);


+// Computes *this = beta * *this + alpha * diag(v) * M, using the transpose of
+// M in place of M when transM == kTrans.  Concretely, *this is scaled by beta
+// and then row i receives alpha * v(i) times row i of M (column i of M in the
+// transposed case).
+//   alpha, beta  scalar weights as in the formula above.
+//   v            vector whose dimension must equal NumRows().
+//   M            matrix whose (possibly transposed) size must match *this.
+template<typename Real>
+void MatrixBase<Real>::AddDiagVecMat(
+    const Real alpha, VectorBase<Real> &v,
+    const MatrixBase<Real> &M,
+    MatrixTransposeType transM,
+    Real beta) {
+  // Check every dimension before modifying *this, so that a failed check
+  // cannot leave the matrix scaled by beta but otherwise not updated.
+  if (transM == kNoTrans) {
+    KALDI_ASSERT(SameDim(*this, M));
+  } else {
+    KALDI_ASSERT(M.NumRows() == NumCols() && M.NumCols() == NumRows());
+  }
+  KALDI_ASSERT(v.Dim() == this->NumRows());
+  if (num_rows_ == 0) return;  // empty matrix: nothing to scale or add.
+
+  if (beta != 1.0) this->Scale(beta);
+
+  // Strides used to walk "row i" of M; swapping them walks column i instead,
+  // which is row i of the transpose.
+  MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1, stride = stride_,
+      num_rows = num_rows_, num_cols = num_cols_;
+  if (transM == kTrans) std::swap(M_row_stride, M_col_stride);
+  Real *data = data_;
+  const Real *Mdata = M.Data(), *vdata = v.Data();
+  for (MatrixIndexT i = 0; i < num_rows;
+       i++, data += stride, Mdata += M_row_stride, vdata++) {
+    // row_i(*this) += (alpha * v(i)) * row_i(M or M^T)
+    cblas_Xaxpy(num_cols, alpha * *vdata, Mdata, M_col_stride, data, 1);
+  }
+}
+
 #if !defined(HAVE_ATLAS) && !defined(USE_KALDI_SVD)
 // ****************************************************************************
 // ****************************************************************************
@ -869,6 +941,7 @@ template<typename Real> void MatrixBase<Real>::Max(const MatrixBase<Real> &A) {

 template<typename Real> void MatrixBase<Real>::Scale(Real alpha) {
  if (alpha == 1.0) return;
+  if (num_rows_ == 0) return;
  if (num_cols_ == stride_) {
    cblas_Xscal(static_cast<size_t>(num_rows_) * static_cast<size_t>(num_cols_),
                alpha, data_,1);
@ -893,6 +966,58 @@ void MatrixBase<Real>::MulRowsVec(const VectorBase<Real> &scale) {
  }
 }

+// Multiplies element (r, c) of *this by src(r, c / group_size), i.e. every
+// run of group_size consecutive columns in a row shares one scale taken from
+// src.  group_size is NumCols() / src.NumCols(), rounded up when the division
+// is not exact.
+template<typename Real>
+void MatrixBase<Real>::MulRowsGroupMat(const MatrixBase<Real> &src) {
+  KALDI_ASSERT(src.NumCols() > 0 && src.NumCols() <= this->NumCols());
+  KALDI_ASSERT(this->NumCols() % src.NumCols() == 0 ||
+               this->NumCols() % (src.NumCols() - 1) < this->NumCols() / (src.NumCols() - 1));
+
+  // Ceiling of NumCols() / src.NumCols().
+  int group_size = this->NumCols() / src.NumCols();
+  if (this->NumCols() % src.NumCols() != 0)
+    group_size++;
+
+  const MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
+  for (MatrixIndexT r = 0; r < num_rows; r++) {
+    for (MatrixIndexT c = 0; c < num_cols; c++) {
+      (*this)(r, c) *= src(r, c / group_size);
+    }
+  }
+}
+
+// Fills *this element by element from src1 (indexed with the same (i, j) as
+// *this) and src2 (indexed with (i, j / group_size), i.e. one column per group
+// of consecutive columns of *this).
+// NOTE(review): presumably src1 is the input that was given to GroupPnorm and
+// src2 is its output, so that this is the derivative of that function --
+// confirm against the declaration in kaldi-matrix.h.
+//  - power == 1.0: element (i, j) becomes 0, 1 or -1 according to the sign of
+//    src1(i, j); src2 is not read in this case.
+//  - otherwise: element (i, j) becomes 0 when the group's src2 value is 0;
+//    else |src1(i, j)|^(power - 1), times src2^(1 - power) (this factor is
+//    replaced by 1 when the src2 value is not positive), times +1 if
+//    src1(i, j) >= 0 and -1 otherwise.
+// The assertions in this function only constrain the column count of src2.
+template<typename Real> 
+void MatrixBase<Real>::GroupPnormDeriv(const MatrixBase<Real> &src1,
+                                       const MatrixBase<Real> &src2,
+                                       Real power) {
+  KALDI_ASSERT(src2.NumCols() > 0 && src2.NumCols() <= this->NumCols());
+  KALDI_ASSERT(this->NumCols() % src2.NumCols() == 0 || 
+  	this->NumCols() % (src2.NumCols() - 1) < this->NumCols() / (src2.NumCols() - 1));
+  // group_size is NumCols() / src2.NumCols(), rounded up when not exact.
+  int group_size = 0;
+  if (this->NumCols() % src2.NumCols() == 0) {
+    group_size = this->NumCols() / src2.NumCols();
+  } else {
+    group_size = this->NumCols() / src2.NumCols() + 1; 
+  }
+  MatrixIndexT M = this->NumRows(), N = this->NumCols(); 
+
+  if (power == 1.0) {   
+    // Sign of src1, with an exact zero mapping to zero.
+    for (MatrixIndexT i = 0; i < M; i++) 
+      for (MatrixIndexT j = 0; j < N; j++) 
+	  (*this)(i, j) = (src1(i, j) == 0 ? 0 : (src1(i, j) > 0 ? 1 : -1));
+  } else {
+    for (MatrixIndexT i = 0; i < M; i++) {
+      for (MatrixIndexT j = 0; j < N; j++) {
+        if (src2(i, j / group_size) == 0) {
+          // Avoids raising zero to the (possibly negative) power 1 - power.
+          (*this)(i, j) = 0;
+        } else {
+      	  (*this)(i, j) = pow(std::abs(src1(i, j)), power - 1) * 
+              (src2(i, j / group_size) > 0 ? pow(src2(i, j / group_size), 1 - power) : 1) * 
+              (src1(i, j) >= 0 ? 1 : -1) ;
+        }
+      }
+    }
+  }
+}
+
 template<typename Real>  // scales each column by scale[i].
 void MatrixBase<Real>::MulColsVec(const VectorBase<Real> &scale) {
  KALDI_ASSERT(scale.Dim() == num_cols_);
@ -932,8 +1057,19 @@ void MatrixBase<Real>::SetUnit() {
 template<typename Real>
 void MatrixBase<Real>::SetRandn() {
  for (MatrixIndexT row = 0; row < num_rows_; row++) {
-    for (MatrixIndexT col = 0; col < num_cols_; col++) {
-      (*this)(row, col) = static_cast<Real>(kaldi::RandGauss());
+    Real *row_data = this->RowData(row);
+    for (MatrixIndexT col = 0; col < num_cols_; col++, row_data++) {
+      *row_data = static_cast<Real>(kaldi::RandGauss());
+    }
+  }
+}
+
+template<typename Real>
+void MatrixBase<Real>::SetRandUniform() {
+  for (MatrixIndexT row = 0; row < num_rows_; row++) {
+    Real *row_data = this->RowData(row);
+    for (MatrixIndexT col = 0; col < num_cols_; col++, row_data++) {
+      *row_data = static_cast<Real>(kaldi::RandUniform());
    }
  }
 }
@ -1218,7 +1354,7 @@ SubMatrix<Real>::SubMatrix(Real *data,
 }


-template<class Real>
+template<typename Real>
 void MatrixBase<Real>::Add(const Real alpha) {
  Real *data = data_;
  MatrixIndexT stride = stride_;
@ -1227,8 +1363,17 @@ void MatrixBase<Real>::Add(const Real alpha) {
      data[c + stride*r] += alpha;
 }

+// Adds alpha to every element on the main diagonal, i.e. to element (r, r)
+// for r < min(NumRows(), NumCols()); the matrix does not have to be square.
+template<typename Real>
+void MatrixBase<Real>::AddToDiag(const Real alpha) {
+  // Consecutive diagonal elements are one row down and one column right of
+  // each other, hence stride_ + 1 apart in memory.
+  const MatrixIndexT diag_step = stride_ + 1;
+  const MatrixIndexT diag_len = std::min(num_rows_, num_cols_);
+  for (MatrixIndexT r = 0; r < diag_len; r++) {
+    data_[r * diag_step] += alpha;
+  }
+}

-template<class Real>
+
+template<typename Real>
 Real MatrixBase<Real>::Cond() const {
  KALDI_ASSERT(num_rows_ > 0&&num_cols_ > 0);
  Vector<Real> singular_values(std::min(num_rows_, num_cols_));
@ -1241,7 +1386,7 @@ Real MatrixBase<Real>::Cond() const {
  else return 1.0e+100;
 }

-template<class Real>
+template<typename Real>
 Real MatrixBase<Real>::Trace(bool check_square) const  {
  KALDI_ASSERT(!check_square || num_rows_ == num_cols_);
  Real ans = 0.0;
@ -1249,7 +1394,7 @@ Real MatrixBase<Real>::Trace(bool check_square) const  {
  return ans;
 }

-template<class Real>
+template<typename Real>
 Real MatrixBase<Real>::Max() const {
  KALDI_ASSERT(num_rows_ > 0 && num_cols_ > 0);
  Real ans= *data_;
@ -1260,7 +1405,7 @@ Real MatrixBase<Real>::Max() const {
  return ans;
 }

-template<class Real>
+template<typename Real>
 Real MatrixBase<Real>::Min() const {
  KALDI_ASSERT(num_rows_ > 0 && num_cols_ > 0);
  Real ans= *data_;
@ -1273,7 +1418,7 @@ Real MatrixBase<Real>::Min() const {



-template <class Real>
+template <typename Real>
 void MatrixBase<Real>::AddMatMatMat(Real alpha,
                                    const MatrixBase<Real> &A, MatrixTransposeType transA,
                                    const MatrixBase<Real> &B, MatrixTransposeType transB,
@ -1313,7 +1458,7 @@ void MatrixBase<Real>::AddMatMatMat(Real alpha,



-template<class Real>
+template<typename Real>
 void MatrixBase<Real>::DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U, MatrixBase<Real> *Vt) {
  // Svd, *this = U*diag(s)*Vt.
  // With (*this).num_rows_ == m, (*this).num_cols_ == n,
@ -1357,7 +1502,7 @@ void MatrixBase<Real>::DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
  if (prescale != 1.0) s->Scale(1.0/prescale);
 }

-template<class Real>
+template<typename Real>
 void MatrixBase<Real>::Svd(VectorBase<Real> *s, MatrixBase<Real> *U, MatrixBase<Real> *Vt) const {
  try {
    if (num_rows_ >= num_cols_) {
@ -1380,7 +1525,7 @@ void MatrixBase<Real>::Svd(VectorBase<Real> *s, MatrixBase<Real> *U, MatrixBase<
  }
 }

-template<class Real>
+template<typename Real>
 bool MatrixBase<Real>::IsSymmetric(Real cutoff) const {
  MatrixIndexT R = num_rows_, C = num_cols_;
  if (R != C) return false;
@ -1396,7 +1541,7 @@ bool MatrixBase<Real>::IsSymmetric(Real cutoff) const {
  return true;
 }

-template<class Real>
+template<typename Real>
 bool MatrixBase<Real>::IsDiagonal(Real cutoff) const{
  MatrixIndexT R = num_rows_, C = num_cols_;
  Real bad_sum = 0.0, good_sum = 0.0;
@ -1422,7 +1567,7 @@ void MatrixBase<Real>::TestUninitialized() const {
 }
  

-template<class Real>
+template<typename Real>
 bool MatrixBase<Real>::IsUnit(Real cutoff) const {
  MatrixIndexT R = num_rows_, C = num_cols_;
  // if (R != C) return false;
@ -1433,7 +1578,7 @@ bool MatrixBase<Real>::IsUnit(Real cutoff) const {
  return (bad_max <= cutoff);
 }

-template<class Real>
+template<typename Real>
 bool MatrixBase<Real>::IsZero(Real cutoff)const {
  MatrixIndexT R = num_rows_, C = num_cols_;
  Real bad_max = 0.0;
@ -1443,16 +1588,9 @@ bool MatrixBase<Real>::IsZero(Real cutoff)const {
  return (bad_max <= cutoff);
 }

-template<class Real>
+template<typename Real>
 Real MatrixBase<Real>::FrobeniusNorm() const{
-  MatrixIndexT R = num_rows_, C = num_cols_;
-  Real sum = 0.0;
-  for (MatrixIndexT i = 0;i < R;i++)
-    for (MatrixIndexT j = 0;j < C;j++) {
-      Real tmp = (*this)(i, j);
-      sum +=  tmp*tmp;
-    }
-  return sqrt(sum);
+  return sqrt(TraceMatMat(*this, *this, kTrans));
 }

 template<typename Real>
@ -1477,7 +1615,7 @@ bool MatrixBase<Real>::Equal(const MatrixBase<Real> &other) const {
 }


-template<class Real>
+template<typename Real>
 Real MatrixBase<Real>::LargestAbsElem() const{
  MatrixIndexT R = num_rows_, C = num_cols_;
  Real largest = 0.0;
@ -1488,7 +1626,7 @@ Real MatrixBase<Real>::LargestAbsElem() const{
 }


-template<class Real>
+template<typename Real>
 void MatrixBase<Real>::OrthogonalizeRows() {
  KALDI_ASSERT(NumRows() <= NumCols());
  MatrixIndexT num_rows = num_rows_;
@ -1529,7 +1667,7 @@ void MatrixBase<Real>::OrthogonalizeRows() {
 // Throws exception if this failed to within supplied precision (typically because *this was not
 // symmetric positive definite).

-template<class Real>
+template<typename Real>
 void MatrixBase<Real>::SymPosSemiDefEig(VectorBase<Real> *rs, MatrixBase<Real> *rU, Real check_thresh) // e.g. check_thresh = 0.001
 {
  const MatrixIndexT D = num_rows_;
@ -1571,7 +1709,7 @@ void MatrixBase<Real>::SymPosSemiDefEig(VectorBase<Real> *rs, MatrixBase<Real> *
 }


-template<class Real>
+template<typename Real>
 Real MatrixBase<Real>::LogDet(Real *det_sign) const {
  Real log_det;
  Matrix<Real> tmp(*this);
@ -1579,15 +1717,15 @@ Real MatrixBase<Real>::LogDet(Real *det_sign) const {
  return log_det;
 }

-template<class Real>
-void MatrixBase<Real>::InvertDouble(Real *LogDet, Real *DetSign,
+template<typename Real>
+void MatrixBase<Real>::InvertDouble(Real *log_det, Real *det_sign,
                                    bool inverse_needed) {
-  double LogDet_tmp, DetSign_tmp;
+  double log_det_tmp, det_sign_tmp;
  Matrix<double> dmat(*this);
-  dmat.Invert(&LogDet_tmp, &DetSign_tmp, inverse_needed);
+  dmat.Invert(&log_det_tmp, &det_sign_tmp, inverse_needed);
  if (inverse_needed) (*this).CopyFromMat(dmat);
-  if (LogDet) *LogDet = LogDet_tmp;
-  if (DetSign) *DetSign = DetSign_tmp;
+  if (log_det) *log_det = log_det_tmp;
+  if (det_sign) *det_sign = det_sign_tmp;
 }

 template<class Real>
@ -1610,7 +1748,7 @@ void MatrixBase<Real>::InvertElements() {
  }
 }

-template<class Real>
+template<typename Real>
 void MatrixBase<Real>::Transpose() {
  KALDI_ASSERT(num_rows_ == num_cols_);
  MatrixIndexT M = num_rows_;
@ -1622,7 +1760,7 @@ void MatrixBase<Real>::Transpose() {
 }


-template<class Real>
+template<typename Real>
 void Matrix<Real>::Transpose() {
  if (this->num_rows_ != this->num_cols_) {
    Matrix<Real> tmp(*this, kTrans);
@ -1633,7 +1771,7 @@ void Matrix<Real>::Transpose() {
  }
 }

-template<class Real>
+template<typename Real>
 void MatrixBase<Real>::ApplyFloor(Real floor_val) {
  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
  for (MatrixIndexT i = 0; i < num_rows; i++) {
@ -1643,7 +1781,7 @@ void MatrixBase<Real>::ApplyFloor(Real floor_val) {
  }
 }

-template<class Real>
+template<typename Real>
 void MatrixBase<Real>::ApplyCeiling(Real ceiling_val) {
  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
  for (MatrixIndexT i = 0; i < num_rows; i++) {
@ -1653,28 +1791,28 @@ void MatrixBase<Real>::ApplyCeiling(Real ceiling_val) {
  }
 }

-template<class Real>
+template<typename Real>
 void MatrixBase<Real>::ApplyLog() {
  for (MatrixIndexT i = 0; i < num_rows_; i++) {
    Row(i).ApplyLog();
  }
 }

-template<class Real>
+template<typename Real>
 void MatrixBase<Real>::ApplyExp() {
  for (MatrixIndexT i = 0; i < num_rows_; i++) {
    Row(i).ApplyExp();
  }
 }

-template<class Real>
+template<typename Real>
 void MatrixBase<Real>::ApplyPow(Real power) {
  for (MatrixIndexT i = 0; i < num_rows_; i++) {
    Row(i).ApplyPow(power);
  }
 }

-template<class Real>
+template<typename Real>
 void MatrixBase<Real>::ApplyHeaviside() {
  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
  for (MatrixIndexT i = 0; i < num_rows; i++) {
@ -1685,7 +1823,7 @@ void MatrixBase<Real>::ApplyHeaviside() {
 }


-template<class Real>
+template<typename Real>
 bool MatrixBase<Real>::Power(Real power) {
  KALDI_ASSERT(num_rows_ > 0 && num_rows_ == num_cols_);
  MatrixIndexT n = num_rows_;
@ -1708,7 +1846,7 @@ bool MatrixBase<Real>::Power(Real power) {
  return true;
 }

-template<class Real>
+template<typename Real>
 void Matrix<Real>::Swap(Matrix<Real> *other) {
  std::swap(this->data_, other->data_);
  std::swap(this->num_cols_, other->num_cols_);
@ -1733,7 +1871,7 @@ void Matrix<Real>::Swap(Matrix<Real> *other) {
 // By making the pointer arguments non-NULL or NULL, the user can choose to take
 // not to take the eigenvalues directly, and/or the matrix D which is block-diagonal
 // with 2x2 blocks.
-template<class Real>
+template<typename Real>
 void MatrixBase<Real>::Eig(MatrixBase<Real> *P,
                           VectorBase<Real> *r,
                           VectorBase<Real> *i) const {
@ -1756,7 +1894,7 @@ void MatrixBase<Real>::Eig(MatrixBase<Real> *P,
 // INT_32 mSampSize;
 // };

-template<class Real>
+template<typename Real>
 bool ReadHtk(std::istream &is, Matrix<Real> *M_ptr, HtkHeader *header_ptr)
 {
  // check instantiated with double or float.
@ -1856,7 +1994,7 @@ bool ReadHtk(std::istream &is, Matrix<float> *M, HtkHeader *header_ptr);
 template
 bool ReadHtk(std::istream &is, Matrix<double> *M, HtkHeader *header_ptr);

-template<class Real>
+template<typename Real>
 bool WriteHtk(std::ostream &os, const MatrixBase<Real> &M, HtkHeader htk_hdr) // header may be derived from a previous call to ReadHtk.  Must be in binary mode.
 {
  KALDI_ASSERT(M.NumRows() == static_cast<MatrixIndexT>(htk_hdr.mNSamples));
@ -1910,7 +2048,7 @@ template
 bool WriteHtk(std::ostream &os, const MatrixBase<double> &M, HtkHeader htk_hdr);


-template <class Real>
+template <typename Real>
 Real TraceMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
                    const MatrixBase<Real> &B, MatrixTransposeType transB,
                    const MatrixBase<Real> &C, MatrixTransposeType transC) {
@ -1946,7 +2084,7 @@ double TraceMatMatMat(const MatrixBase<double> &A, MatrixTransposeType transA,
                      const MatrixBase<double> &C, MatrixTransposeType transC);


-template <class Real>
+template <typename Real>
 Real TraceMatMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
                       const MatrixBase<Real> &B, MatrixTransposeType transB,
                       const MatrixBase<Real> &C, MatrixTransposeType transC,
@ -1989,7 +2127,7 @@ double TraceMatMatMatMat(const MatrixBase<double> &A, MatrixTransposeType transA
                         const MatrixBase<double> &C, MatrixTransposeType transC,
                         const MatrixBase<double> &D, MatrixTransposeType transD);

-template<class Real> void  SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
+template<typename Real> void  SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
                                   MatrixBase<Real> *Vt, bool sort_on_absolute_value) {
  /// Makes sure the Svd is sorted (from greatest to least absolute value).
  MatrixIndexT num_singval = s->Dim();
@ -2031,7 +2169,7 @@ template
 void SortSvd(VectorBase<double> *s, MatrixBase<double> *U,
             MatrixBase<double> *Vt, bool);

-template<class Real>
+template<typename Real>
 void CreateEigenvalueMatrix(const VectorBase<Real> &re, const VectorBase<Real> &im,
                            MatrixBase<Real> *D) {
  MatrixIndexT n = re.Dim();
@ -2067,7 +2205,7 @@ void CreateEigenvalueMatrix(const VectorBase<double> &re, const VectorBase<doubl



-template<class Real>
+template<typename Real>
 bool AttemptComplexPower(Real *x_re, Real *x_im, Real power) {
  // Used in Matrix<Real>::Power().
  // Attempts to take the complex value x to the power "power",
@ -2100,7 +2238,7 @@ bool AttemptComplexPower(double *x_re, double *x_im, double power);



-template <class Real>
+template <typename Real>
 Real TraceMatMat(const MatrixBase<Real> &A,
                  const MatrixBase<Real> &B,
                  MatrixTransposeType trans) {  // tr(A B), equivalent to sum of each element of A times same element in B'
@ -2186,6 +2324,75 @@ void MatrixBase<Real>::Tanh(const MatrixBase<Real> &src) {
  }
 }

+// Sets each element of *this to log(1 + exp(x)), where x is the element of
+// src at the same position.  src must have the same dimensions as *this.
+template<typename Real>
+void MatrixBase<Real>::SoftHinge(const MatrixBase<Real> &src) {
+  KALDI_ASSERT(SameDim(*this, src));
+  const int32 num_rows = num_rows_, num_cols = num_cols_;
+  for (MatrixIndexT r = 0; r < num_rows; r++) {
+    Real *out = this->RowData(r);
+    const Real *in = src.RowData(r);
+    for (const Real *in_end = in + num_cols; in != in_end; ++in, ++out) {
+      const Real x = *in;
+      if (x > 10.0) {
+        // avoid exponentiating large numbers; function approaches y=x.
+        *out = x;
+      } else {
+        *out = log1p(exp(x));
+      }
+    }
+  }
+}
+// Sets element (i, j) of *this to Norm(power) of the j'th group of group_size
+// consecutive elements of row i of src, where
+// group_size = src.NumCols() / NumCols() must be an exact integer.
+//   src    input matrix; must have the same number of rows as *this.
+//   power  the exponent handed to VectorBase::Norm for each group.
+template<typename Real>
+void MatrixBase<Real>::GroupPnorm(const MatrixBase<Real> &src, Real power) {
+  // Every row of src is read and the matching row of *this written, so the
+  // two row counts have to agree.
+  KALDI_ASSERT(src.NumRows() == this->NumRows());
+  const MatrixIndexT num_rows = this->NumRows(), num_cols = this->NumCols();
+  if (num_cols == 0) {
+    // No output columns: nothing to compute, and the group-size computation
+    // below would otherwise divide by zero.
+    KALDI_ASSERT(src.NumCols() == 0);
+    return;
+  }
+  const int group_size = src.NumCols() / num_cols;
+  KALDI_ASSERT(src.NumCols() == num_cols * group_size);
+  for (MatrixIndexT i = 0; i < num_rows; i++)
+    for (MatrixIndexT j = 0; j < num_cols; j++)
+      (*this)(i, j) = src.Row(i).Range(j * group_size, group_size).Norm(power);
+}
+
+// Sets column c of *this to column indices[c] of src, or to zero when
+// indices[c] is negative.
+//   src      source matrix; must have the same number of rows as *this.
+//   indices  one entry per column of *this; each entry is expected to be in
+//            [-1, src.NumCols() - 1] (only verified if KALDI_PARANOID is
+//            defined).
+template<typename Real>
+void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
+                                const std::vector<MatrixIndexT> &indices) {
+  KALDI_ASSERT(NumRows() == src.NumRows());
+  KALDI_ASSERT(NumCols() == static_cast<MatrixIndexT>(indices.size()));
+  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
+      this_stride = stride_, src_stride = src.stride_;
+  Real *this_data = this->data_;
+  const Real *src_data = src.data_;
+#ifdef KALDI_PARANOID
+  MatrixIndexT src_cols = src.NumCols();
+  for (std::vector<MatrixIndexT>::const_iterator iter = indices.begin();
+       iter != indices.end(); ++iter)
+    KALDI_ASSERT(*iter >= -1 && *iter < src_cols);
+#endif
+
+  // Nothing to copy.  This also guarantees "indices" is non-empty below, so
+  // that taking the address of indices[0] is well defined.
+  if (num_rows == 0 || num_cols == 0) return;
+  const MatrixIndexT *index_begin = &(indices[0]);
+
+  // For the sake of memory locality we do this row by row, rather
+  // than doing it column-wise using cblas_Xcopy
+  for (MatrixIndexT r = 0; r < num_rows;
+       r++, this_data += this_stride, src_data += src_stride) {
+    const MatrixIndexT *index_ptr = index_begin;
+    for (MatrixIndexT c = 0; c < num_cols; c++, index_ptr++) {
+      if (*index_ptr < 0) this_data[c] = 0;
+      else this_data[c] = src_data[*index_ptr];
+    }
+  }
+}
+
+// Sets row r of *this to row indices[r] of src, or to zero when indices[r]
+// is negative.
+//   src      source matrix; must have the same number of columns as *this.
+//   indices  one entry per row of *this; each entry is expected to be in
+//            [-1, src.NumRows() - 1] (only verified if KALDI_PARANOID is
+//            defined, mirroring CopyCols).
+template<typename Real>
+void MatrixBase<Real>::CopyRows(const MatrixBase<Real> &src,
+                                const std::vector<MatrixIndexT> &indices) {
+  KALDI_ASSERT(NumCols() == src.NumCols());
+  KALDI_ASSERT(NumRows() == static_cast<MatrixIndexT>(indices.size()));
+  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
+      this_stride = stride_;
+  Real *this_data = this->data_;
+#ifdef KALDI_PARANOID
+  MatrixIndexT src_rows = src.NumRows();
+  for (std::vector<MatrixIndexT>::const_iterator iter = indices.begin();
+       iter != indices.end(); ++iter)
+    KALDI_ASSERT(*iter >= -1 && *iter < src_rows);
+#endif
+
+  for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride) {
+    MatrixIndexT index = indices[r];
+    // A negative index requests an all-zero row rather than a copy.
+    if (index < 0) memset(this_data, 0, sizeof(Real) * num_cols);
+    else cblas_Xcopy(num_cols, src.RowData(index), 1, this_data, 1);
+  }
+}
+
+
 template<typename Real>
 void MatrixBase<Real>::Sigmoid(const MatrixBase<Real> &src) {
  KALDI_ASSERT(SameDim(*this, src));
@ -2237,8 +2444,8 @@ void MatrixBase<Real>::DiffTanh(const MatrixBase<Real> &value,
 }


-template<class Real>
-template<class OtherReal>
+template<typename Real>
+template<typename OtherReal>
 void MatrixBase<Real>::AddVecToRows(const Real alpha, const VectorBase<OtherReal> &v) {
  const MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
      stride = stride_;
@ -2262,8 +2469,8 @@ template void MatrixBase<double>::AddVecToRows(const double alpha,
                                               const VectorBase<double> &v);


-template<class Real>
-template<class OtherReal>
+template<typename Real>
+template<typename OtherReal>
 void MatrixBase<Real>::AddVecToCols(const Real alpha, const VectorBase<OtherReal> &v) {
  const MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
      stride = stride_;
--- a/src/matrix/kaldi-matrix.h
+++ b/src/matrix/kaldi-matrix.h
@ -41,7 +41,7 @@ Real TraceMatMat(const MatrixBase<Real> &A, const MatrixBase<Real> &B,
 /// Base class which provides matrix operations not involving resizing
 /// or allocation.   Classes Matrix and SubMatrix inherit from it and take care
 /// of allocation and resizing.
-template<class Real>
+template<typename Real>
 class MatrixBase {
 public:
  // so this child can access protected members of other instances.
@ -50,6 +50,9 @@ class MatrixBase {
  friend class CuMatrixBase<Real>;
  friend class CuMatrix<Real>;
  friend class CuSubMatrix<Real>;
+  friend class CuPackedMatrix<Real>;
+  
+  friend class PackedMatrix<Real>;

  /// Returns number of rows (or zero for emtpy matrix).
  inline MatrixIndexT  NumRows() const { return num_rows_; }
@ -121,13 +124,16 @@ class MatrixBase {
  void SetUnit();
  /// Sets to random values of a normal distribution
  void SetRandn();
+  /// Sets to numbers uniformly distributed on (0, 1)
+  void SetRandUniform();

  /*  Copying functions.  These do not resize the matrix! */

+
  /// Copy given matrix. (no resize is done).
  template<typename OtherReal>
  void CopyFromMat(const MatrixBase<OtherReal> & M,
-                   MatrixTransposeType Trans = kNoTrans);
+                   MatrixTransposeType trans = kNoTrans);

  /// Copy from compressed matrix.
  void CopyFromMat(const CompressedMatrix &M);
@ -139,12 +145,21 @@ class MatrixBase {
  /// Copy given tpmatrix. (no resize is done).
  template<typename OtherReal>
  void CopyFromTp(const TpMatrix<OtherReal> &M,
-                  MatrixTransposeType Trans = kNoTrans);
+                  MatrixTransposeType trans = kNoTrans);
  
+  /// Copy from CUDA matrix.  Implemented in ../cudamatrix/cu-matrix.h
+  template<typename OtherReal>  
+  void CopyFromMat(const CuMatrixBase<OtherReal> &M,
+                   MatrixTransposeType trans = kNoTrans);
+
  /// Inverse of vec() operator. Copies vector into matrix, row-by-row.
  /// Note that rv.Dim() must either equal NumRows()*NumCols() or
  /// NumCols()-- this has two modes of operation.
  void CopyRowsFromVec(const VectorBase<Real> &v);
+
+  /// This version of CopyRowsFromVec is implemented in ../cudamatrix/cu-vector.cc
+  void CopyRowsFromVec(const CuVectorBase<Real> &v);
+  
  template<typename OtherReal>
  void CopyRowsFromVec(const VectorBase<OtherReal> &v);

@ -225,6 +240,10 @@ class MatrixBase {
  /// each row by a scalar taken from that dimension of the vector.
  void MulRowsVec(const VectorBase<Real> &scale);

+  /// divide each row into src.NumCols() groups, 
+  /// and then scale i'th row's jth group of elements by src[i, j].   
+  void MulRowsGroupMat(const MatrixBase<Real> &src);
+    
  /// Returns logdet of matrix.
  Real LogDet(Real *det_sign = NULL) const;
  
@ -248,6 +267,22 @@ class MatrixBase {
  /// Matrix child class works also for non-square.
  void Transpose();

+  /// Copies column r from column indices[r] of src.
+  /// As a special case, if indices[r] == -1, sets column r to zero.
+  /// indices.size() must equal this->NumCols(),
+  /// all elements of "indices" must be in [-1, src.NumCols()-1],
+  /// and src.NumRows() must equal this->NumRows().
+  void CopyCols(const MatrixBase<Real> &src,
+                const std::vector<MatrixIndexT> &indices);
+
+  /// Copies row r from row indices[r] of src.
+  /// As a special case, if indices[r] == -1, sets row r to zero.
+  /// indices.size() must equal this->NumRows(),
+  /// all elements of "indices" must be in [-1, src.NumRows()-1],
+  /// and src.NumCols() must equal this->NumCols().
+  void CopyRows(const MatrixBase<Real> &src,
+                const std::vector<MatrixIndexT> &indices);
+  
  /// Applies floor to all matrix elements
  void ApplyFloor(Real floor_val);

@ -374,6 +409,24 @@ class MatrixBase {
  /// Set each element to the sigmoid of the corresponding element of "src".
  void Sigmoid(const MatrixBase<Real> &src);

+  /// Set each element to y = log(1 + exp(x))
+  void SoftHinge(const MatrixBase<Real> &src);
+  
+  /// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j ^ (power)) ^ (1 / power)
+  /// where G = x.NumCols() / y.NumCols() must be an integer.
+  void GroupPnorm(const MatrixBase<Real> &src, Real power);
+
+
+  /// Calculate derivatives for the GroupPnorm function above...
+  /// if "input" is the input to the GroupPnorm function above (i.e. the "src" variable),
+  /// and "output" is the result of the computation (i.e. the "this" of that function
+  /// call), and *this has the same dimension as "input", then it sets each element
+  /// of *this to the derivative d(output-elem)/d(input-elem) for each element of "input", where
+  /// "output-elem" is whichever element of output depends on that input element.
+  void GroupPnormDeriv(const MatrixBase<Real> &input, const MatrixBase<Real> &output,
+                       Real power);
+
+
  /// Set each element to the tanh of the corresponding element of "src".
  void Tanh(const MatrixBase<Real> &src);

@ -406,25 +459,40 @@ class MatrixBase {
  /// Add a scalar to each element
  void Add(const Real alpha);

+  /// Add a scalar to each diagonal element.
+  void AddToDiag(const Real alpha);
+
  /// *this += alpha * a * b^T
-  template<class OtherReal>
+  template<typename OtherReal>
  void AddVecVec(const Real alpha, const VectorBase<OtherReal> &a,
                 const VectorBase<OtherReal> &b);

  /// [each row of *this] += alpha * v
-  template<class OtherReal>
+  template<typename OtherReal>
  void AddVecToRows(const Real alpha, const VectorBase<OtherReal> &v);
  
  /// [each col of *this] += alpha * v
-  template<class OtherReal>
+  template<typename OtherReal>
  void AddVecToCols(const Real alpha, const VectorBase<OtherReal> &v);      
  
  /// *this += alpha * M [or M^T]
  void AddMat(const Real alpha, const MatrixBase<Real> &M,
              MatrixTransposeType transA = kNoTrans);

+  /// *this = beta * *this + alpha * M M^T, for symmetric matrices.  It only
+  /// updates the lower triangle of *this.  It will leave the matrix asymmetric;
+  /// if you need it symmetric as a regular matrix, do CopyLowerToUpper().
+  void SymAddMat2(const Real alpha, const MatrixBase<Real> &M,
+                  MatrixTransposeType transA, Real beta);
+
+  /// *this = beta * *this + alpha * diag(v) * M [or M^T].
+  /// The same as adding M but scaling each row M_i by v(i).
+  void AddDiagVecMat(const Real alpha, VectorBase<Real> &v,
+                     const MatrixBase<Real> &M, MatrixTransposeType transM, 
+                     Real beta = 1.0);
+  
  /// *this += alpha * S
-  template<class OtherReal>
+  template<typename OtherReal>
  void AddSp(const Real alpha, const SpMatrix<OtherReal> &S);

  void AddMatMat(const Real alpha,
@ -512,6 +580,12 @@ class MatrixBase {
               const SpMatrix<Real>& A, const SpMatrix<Real>& B,
               const Real beta);

+  /// Copy lower triangle to upper triangle (symmetrize)
+  void CopyLowerToUpper();
+
+  /// Copy upper triangle to lower triangle (symmetrize)
+  void CopyUpperToLower();
+  
  /// This function orthogonalizes the rows of a matrix using the Gram-Schmidt
  /// process.  It is only applicable if NumRows() <= NumCols().  It will use
  /// random number generation to fill in rows with something nonzero, in cases
@ -580,7 +654,7 @@ class MatrixBase {
 };

 /// A class for storing matrices.
-template<class Real>
+template<typename Real>
 class Matrix : public MatrixBase<Real> {
 public:

@ -589,12 +663,23 @@ class Matrix : public MatrixBase<Real> {

  /// Basic constructor.  Sets to zero by default.
  /// if set_zero == false, memory contents are undefined.
-  Matrix(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type = kSetZero):
+  Matrix(const MatrixIndexT r, const MatrixIndexT c,
+         MatrixResizeType resize_type = kSetZero):
      MatrixBase<Real>() { Resize(r, c, resize_type); }
+  
+  /// Copy constructor from CUDA matrix
+  /// This is defined in ../cudamatrix/cu-matrix.h
+  template<typename OtherReal>
+  explicit Matrix(const CuMatrixBase<OtherReal> &cu,
+                  MatrixTransposeType trans = kNoTrans);
+

  /// Swaps the contents of *this and *other.  Shallow swap.
  void Swap(Matrix<Real> *other);

+  /// Defined in ../cudamatrix/cu-matrix.cc
+  void Swap(CuMatrix<Real> *mat);
+
  /// Constructor from any MatrixBase. Can also copy with transpose.
  /// Allocates new memory.
  explicit Matrix(const MatrixBase<Real> & M,
@ -707,11 +792,11 @@ struct HtkHeader {
 };

 // Read HTK formatted features from file into matrix.
-template<class Real>
+template<typename Real>
 bool ReadHtk(std::istream &is, Matrix<Real> *M, HtkHeader *header_ptr);

 // Write (HTK format) features to file from matrix.
-template<class Real>
+template<typename Real>
 bool WriteHtk(std::ostream &os, const MatrixBase<Real> &M, HtkHeader htk_hdr);


@ -764,19 +849,32 @@ class SubMatrix : public MatrixBase<Real> {

 // Some declarations.  These are traces of products.

+
+template<typename Real>
+bool ApproxEqual(const MatrixBase<Real> &A,
+                 const MatrixBase<Real> &B, Real tol = 0.01) {
+  return A.ApproxEqual(B, tol);
+}
+
+template<typename Real>  // Asserts (via KALDI_ASSERT) that A.ApproxEqual(B, tol) holds.
+inline void AssertEqual(const MatrixBase<Real> &A, const MatrixBase<Real> &B,
+                        float tol = 0.01) {
+  KALDI_ASSERT(A.ApproxEqual(B, tol));  // A and B are only read, hence const refs.
+}
+
 /// Returns trace of matrix.
-template <class Real>
+template <typename Real>
 double TraceMat(const MatrixBase<Real> &A) { return A.Trace(); }


 /// Returns tr(A B C)
-template <class Real>
+template <typename Real>
 Real TraceMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
                      const MatrixBase<Real> &B, MatrixTransposeType transB,
                      const MatrixBase<Real> &C, MatrixTransposeType transC);

 /// Returns tr(A B C D)
-template <class Real>
+template <typename Real>
 Real TraceMatMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
                         const MatrixBase<Real> &B, MatrixTransposeType transB,
                         const MatrixBase<Real> &C, MatrixTransposeType transC,
@ -796,7 +894,7 @@ Real TraceMatMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
 /// otherwise, moving the columns of U, if it exists, and the rows of Vt, if it
 /// exists around in the same way.  Note: the "absolute value" part won't matter
 /// if this is an actual SVD, since singular values are non-negative.
-template<class Real> void SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
+template<typename Real> void SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
                                  MatrixBase<Real>* Vt = NULL,
                                  bool sort_on_absolute_value = true);

@ -806,7 +904,7 @@ template<class Real> void SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
 /// 2x2 block [lambda, mu; -mu, lambda].
 /// This function will throw if any complex eigenvalues are not in complex conjugate
 /// pairs (or the members of such pairs are not consecutively numbered).
-template<class Real>
+template<typename Real>
 void CreateEigenvalueMatrix(const VectorBase<Real> &real, const VectorBase<Real> &imag,
                            MatrixBase<Real> *D);

@ -814,7 +912,7 @@ void CreateEigenvalueMatrix(const VectorBase<Real> &real, const VectorBase<Real>
 /// declare it here mainly for the testing code to see.  It takes a complex value to
 /// a power using a method that will work for noninteger powers (but will fail if the
 /// complex value is real and negative).
-template<class Real>
+template<typename Real>
 bool AttemptComplexPower(Real *x_re, Real *x_im, Real power);


@ -834,7 +932,7 @@ template<typename Real>
 std::istream & operator >> (std::istream & In, Matrix<Real> & M);


-template<class Real>
+template<typename Real>
 bool SameDim(const MatrixBase<Real> &M, const MatrixBase<Real> &N) {
  return (M.NumRows() == N.NumRows() && M.NumCols() == N.NumCols());
 }
--- a/src/matrix/kaldi-vector.cc
+++ b/src/matrix/kaldi-vector.cc
@ -45,7 +45,7 @@ template
 double VecVec<>(const VectorBase<double> &a,
                const VectorBase<double> &b);

-template<class Real, class OtherReal>
+template<typename Real, typename OtherReal>
 Real VecVec(const VectorBase<Real> &ra,
            const VectorBase<OtherReal> &rb) {
  MatrixIndexT adim = ra.Dim();
@ -470,20 +470,25 @@ Real VectorBase<Real>::Norm(Real p) const {
    return sqrt(sum);
  } else {
    Real tmp;
+    bool ok = true;
    for (MatrixIndexT i = 0; i < dim_; i++) {
      tmp = pow(std::abs(data_[i]), p);
-      if (tmp == HUGE_VAL) {  // HUGE_VAL is what pow returns on error.
-        KALDI_ERR << "Could not raise element " << i << "to power " << p
-                  << ": returned value = " << tmp;
-      }
+      if (tmp == HUGE_VAL) // HUGE_VAL is what pow returns on error.
+        ok = false;
      sum += tmp;
    }
    tmp = pow(sum, static_cast<Real>(1.0/p));
-    if (tmp == HUGE_VAL) {  // HUGE_VAL is what errno returns on error.
-      KALDI_ERR << "Could not take the " << p << "-th root of " << sum
-                << "; returned value = " << tmp;
-    }
-    return tmp;
+    KALDI_ASSERT(tmp != HUGE_VAL); // NOTE(review): if ok is false and p > 0, sum is already +inf, so this seems to fire before the fallback below -- confirm.
+    if (ok) {
+      return tmp;
+    } else {
+      Real maximum = this->Max(), minimum = this->Min(),
+          max_abs = std::max(maximum, -minimum);
+      KALDI_ASSERT(max_abs > 0); // Or should not have reached here.
+      Vector<Real> tmp(*this);
+      tmp.Scale(1.0 / max_abs);
+      return tmp.Norm(p) * max_abs;
+    }      
  }
 }

@ -612,9 +617,7 @@ void VectorBase<double>::CopyColFromMat(const MatrixBase<double> &mat, MatrixInd
 template<typename Real>
 void VectorBase<Real>::CopyDiagFromMat(const MatrixBase<Real> &M) {
  KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols()));
-  for (MatrixIndexT i = 0; i < dim_; i++)
-    data_[i] = M(i, i);
-  // could make this more efficient.
+  cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, 1);
 }

 template<typename Real>
@ -774,12 +777,13 @@ MatrixIndexT VectorBase<Real>::ApplyFloor(const VectorBase<Real> &floor_vec) {

 template<typename Real>
 Real VectorBase<Real>::ApplySoftMax() {
-  Real max = this->Max(), sum = 0.0;
+Real max = this->Max(), sum = 0.0;
  for (MatrixIndexT i = 0; i < dim_; i++) {
    sum += (data_[i] = exp(data_[i] - max));
  }
  this->Scale(1.0 / sum);
  return max + log(sum);
+
 }

 #ifdef HAVE_MKL
@ -868,7 +872,12 @@ void VectorBase<Real>::MulElements(const VectorBase<Real> &v) {
  }
 }

-
+template<typename Real>  // Overwrites every element equal to "orig" with "changed".
+void VectorBase<Real>::ReplaceValue(Real orig, Real changed) {
+  // Exact (==) comparison; elements that differ from "orig" are left untouched.
+  for (Real *elem = data_, *end = data_ + dim_; elem < end; ++elem)
+    if (*elem == orig) *elem = changed;
+}


 template<typename Real>
@ -1136,7 +1145,7 @@ void VectorBase<Real>::Write(std::ostream & os, bool binary) const {
 }


-template<class Real>
+template<typename Real>
 void VectorBase<Real>::AddVec2(const Real alpha, const VectorBase<Real> &v) {
  KALDI_ASSERT(dim_ == v.dim_);
  for (MatrixIndexT i = 0; i < dim_; i++)
@ -1144,7 +1153,7 @@ void VectorBase<Real>::AddVec2(const Real alpha, const VectorBase<Real> &v) {
 }

 // this <-- beta*this + alpha*M*v.
-template<class Real>
+template<typename Real>
 void VectorBase<Real>::AddTpVec(const Real alpha, const TpMatrix<Real> &M,
                                const MatrixTransposeType trans,
                                const VectorBase<Real> &v,
@ -1162,7 +1171,7 @@ void VectorBase<Real>::AddTpVec(const Real alpha, const TpMatrix<Real> &M,
  }
 }

-template<class Real>
+template<typename Real>
 Real VecMatVec(const VectorBase<Real> &v1, const MatrixBase<Real> &M,
               const VectorBase<Real> &v2) {
  KALDI_ASSERT(v1.Dim() == M.NumRows() && v2.Dim() == M.NumCols());
@ -1178,7 +1187,7 @@ template
 double VecMatVec(const VectorBase<double> &v1, const MatrixBase<double> &M,
                 const VectorBase<double> &v2);

-template<class Real>
+template<typename Real>
 void Vector<Real>::Swap(Vector<Real> *other) {
  std::swap(this->data_, other->data_);
  std::swap(this->dim_, other->dim_);
@ -1209,6 +1218,29 @@ void VectorBase<Real>::AddDiagMat2(
  }
 }

+// *this = beta * *this + alpha * diag(op(M) op(N)), op(X) = X or X^T per its trans flag.
+template<typename Real>
+void VectorBase<Real>::AddDiagMatMat(
+    Real alpha, const MatrixBase<Real> &M, MatrixTransposeType transM,
+    const MatrixBase<Real> &N, MatrixTransposeType transN, Real beta) {
+  const bool M_trans = (transM == kTrans), N_trans = (transN == kTrans);
+  const MatrixIndexT dim = this->dim_,
+      M_row_dim = (M_trans ? M.NumCols() : M.NumRows()),
+      M_col_dim = (M_trans ? M.NumRows() : M.NumCols()),
+      N_row_dim = (N_trans ? N.NumCols() : N.NumRows()),
+      N_col_dim = (N_trans ? N.NumRows() : N.NumCols());
+  KALDI_ASSERT(M_col_dim == N_row_dim);  // this is the dimension we sum over
+  // Element i reads row i of op(M) and column i of op(N); reject a *this that
+  // is longer than either, which would otherwise read past the matrix data.
+  KALDI_ASSERT(dim <= M_row_dim && dim <= N_col_dim);
+  const MatrixIndexT M_row_stride = (M_trans ? 1 : M.Stride()), M_col_stride = (M_trans ? M.Stride() : 1),
+      N_row_stride = (N_trans ? 1 : N.Stride()), N_col_stride = (N_trans ? N.Stride() : 1);
+  const Real *Mdata = M.Data(), *Ndata = N.Data();
+  for (MatrixIndexT i = 0; i < dim; i++, Mdata += M_row_stride, Ndata += N_col_stride)
+    this->data_[i] = beta * this->data_[i] + alpha * cblas_Xdot(M_col_dim, Mdata, M_col_stride, Ndata, N_row_stride);
+}
+
+
 template class Vector<float>;
 template class Vector<double>;
 template class VectorBase<float>;
@ -1216,5 +1248,3 @@ template class VectorBase<double>;

 }  // namespace kaldi

-
-
--- a/src/matrix/kaldi-vector.h
+++ b/src/matrix/kaldi-vector.h
@ -109,6 +109,11 @@ class VectorBase {
  template<typename OtherReal>
  void CopyFromVec(const VectorBase<OtherReal> &v);

+  /// Copy from CuVector.  This is defined in ../cudamatrix/cu-vector.h
+  template<typename OtherReal>
+  void CopyFromVec(const CuVectorBase<OtherReal> &v);
+
+  
  /// Apply natural log to all elements.  Throw if any element of
  /// the vector is negative (but doesn't complain about zero; the
  /// log will be -infinity
@ -157,7 +162,7 @@ class VectorBase {

  /// Add vector : *this = *this + alpha * rv (with casting between floats and
  /// doubles)
-  template<class OtherReal>
+  template<typename OtherReal>
  void AddVec(const Real alpha, const VectorBase<OtherReal> &v);

  /// Add vector : *this = *this + alpha * rv^2  [element-wise squaring].
@ -165,7 +170,7 @@ class VectorBase {

  /// Add vector : *this = *this + alpha * rv^2  [element-wise squaring],
  /// with casting between floats and doubles.
-  template<class OtherReal>
+  template<typename OtherReal>
  void AddVec2(const Real alpha, const VectorBase<OtherReal> &v);

  /// Add matrix times vector : this <-- beta*this + alpha*M*v.
@ -192,6 +197,9 @@ class VectorBase {
                const MatrixTransposeType trans, const VectorBase<Real> &v,
                const Real beta);  // **beta previously defaulted to 0.0**

+  /// Set each element to y = (x == orig ? changed : x).
+  void ReplaceValue(Real orig, Real changed);
+
  /// Multipy element-by-element by another vector.
  void MulElements(const VectorBase<Real> &v);
  /// Multipy element-by-element by another vector of different type.
@ -228,6 +236,8 @@ class VectorBase {
  template<typename OtherReal>
  void CopyRowsFromMat(const MatrixBase<OtherReal> &M);

+  /// The following is implemented in ../cudamatrix/cu-matrix.cc
+  void CopyRowsFromMat(const CuMatrixBase<Real> &M);

  /// Performs a column stack of the matrix M
  void CopyColsFromMat(const MatrixBase<Real> &M);
@ -292,6 +302,13 @@ class VectorBase {
  void AddDiagMat2(Real alpha, const MatrixBase<Real> &M,
                   MatrixTransposeType trans = kNoTrans, Real beta = 1.0);

+  /// Add the diagonal of a matrix product:
+  /// *this = beta * *this + alpha * diag(M N), assuming the "trans" arguments
+  /// are both kNoTrans; for transpose arguments, it behaves as you would expect.
+  void AddDiagMatMat(Real alpha, const MatrixBase<Real> &M, MatrixTransposeType transM,
+                     const MatrixBase<Real> &N, MatrixTransposeType transN,
+                     Real beta = 1.0);  
+
  /// Returns log(sum(exp())) without exp overflow
  /// If prune > 0.0, ignores terms less than the max - prune.
  /// [Note: in future, if prune = 0.0, it will take the max.
@ -354,6 +371,11 @@ class Vector: public VectorBase<Real> {
                  MatrixResizeType resize_type = kSetZero)
      : VectorBase<Real>() {  Resize(s, resize_type);  }

+  /// Copy constructor from CUDA vector
+  /// This is defined in ../cudamatrix/cu-vector.h
+  template<typename OtherReal>
+  explicit Vector(const CuVectorBase<OtherReal> &cu);
+
  /// Copy constructor.  The need for this is controversial.
  Vector(const Vector<Real> &v) : VectorBase<Real>()  { //  (cannot be explicit)
    Resize(v.Dim(), kUndefined);
@ -432,7 +454,7 @@ class Vector: public VectorBase<Real> {

 /// Represents a non-allocating general vector which can be defined
 /// as a sub-vector of higher-level vector [or as the row of a matrix].
-template<class Real>
+template<typename Real>
 class SubVector : public VectorBase<Real> {
 public:
  /// Constructor from a Vector or SubVector.
@ -506,6 +528,20 @@ std::istream & operator >> (std::istream & in, Vector<Real> & v);
 /// \addtogroup matrix_funcs_scalar
 /// @{

+
+template<typename Real>
+bool ApproxEqual(const VectorBase<Real> &a,
+                 const VectorBase<Real> &b, Real tol = 0.01) {
+  return a.ApproxEqual(b, tol);
+}
+
+template<typename Real>  // Asserts (via KALDI_ASSERT) that a.ApproxEqual(b, tol) holds.
+inline void AssertEqual(const VectorBase<Real> &a, const VectorBase<Real> &b,
+                        float tol = 0.01) {
+  KALDI_ASSERT(a.ApproxEqual(b, tol));  // a and b are only read, hence const refs.
+}
+
+
 /// Returns dot product between v1 and v2.
 template<typename Real>
 Real VecVec(const VectorBase<Real> &v1, const VectorBase<Real> &v2);
@ -516,7 +552,7 @@ Real VecVec(const VectorBase<Real> &v1, const VectorBase<OtherReal> &v2);

 /// Returns \f$ v_1^T M v_2  \f$ .
 /// Not as efficient as it could be where v1 == v2.
-template<class Real>
+template<typename Real>
 Real VecMatVec(const VectorBase<Real> &v1, const MatrixBase<Real> &M,
               const VectorBase<Real> &v2);

--- a/src/matrix/matrix-common.h
+++ b/src/matrix/matrix-common.h
@ -38,6 +38,12 @@ typedef enum {
  kCopyData
 } MatrixResizeType;

+typedef enum {
+  kTakeLower,
+  kTakeUpper,
+  kTakeMean,
+  kTakeMeanAndCheck
+} SpCopyType;

 template<typename Real> class VectorBase;
 template<typename Real> class Vector;
@ -57,6 +63,9 @@ template<typename Real> class CuMatrix;
 template<typename Real> class CuVectorBase;
 template<typename Real> class CuSubVector;
 template<typename Real> class CuVector;
+template<typename Real> class CuPackedMatrix;
+template<typename Real> class CuSpMatrix;
+template<typename Real> class CuTpMatrix;

 class CompressedMatrix;

--- a/src/matrix/matrix-functions-inl.h
+++ b/src/matrix/matrix-functions-inl.h
@ -28,14 +28,14 @@
 namespace kaldi {

 //! ComplexMul implements, inline, the complex multiplication b *= a.
-template<class Real> inline void ComplexMul(const Real &a_re, const Real &a_im,
+template<typename Real> inline void ComplexMul(const Real &a_re, const Real &a_im,
                                            Real *b_re, Real *b_im) {
  Real tmp_re = (*b_re * a_re) - (*b_im * a_im);
  *b_im = *b_re * a_im + *b_im * a_re;
  *b_re = tmp_re;
 }

-template<class Real> inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
+template<typename Real> inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
                                                   const Real &b_re, const Real &b_im,
                                                   Real *c_re, Real *c_im) {
  *c_re += b_re*a_re - b_im*a_im;
@ -43,7 +43,7 @@ template<class Real> inline void ComplexAddProduct(const Real &a_re, const Real
 }


-template<class Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im) {
+template<typename Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im) {
  *a_re = std::cos(x);
  *a_im = std::sin(x);
 }
--- a/src/matrix/matrix-functions.cc
+++ b/src/matrix/matrix-functions.cc
@ -26,7 +26,7 @@

 namespace kaldi {

-template<class Real> void ComplexFt (const VectorBase<Real> &in,
+template<typename Real> void ComplexFt (const VectorBase<Real> &in,
                                     VectorBase<Real> *out, bool forward) {
  int exp_sign = (forward ? -1 : 1);
  KALDI_ASSERT(out != NULL);
@ -93,7 +93,7 @@ void ComplexFt (const VectorBase<double> &in,
 //! of the recursion.


-template<class Real>
+template<typename Real>
 void ComplexFftRecursive (Real *data, int nffts, int N,
                          const int *factor_begin,
                          const int *factor_end, bool forward,
@ -331,7 +331,7 @@ void ComplexFftRecursive (Real *data, int nffts, int N,

 // This is the outer-layer calling code for ComplexFftRecursive.
 // It factorizes the dimension and then calls the FFT routine.
-template<class Real> void ComplexFft(VectorBase<Real> *v, bool forward, Vector<Real> *tmp_in) {
+template<typename Real> void ComplexFft(VectorBase<Real> *v, bool forward, Vector<Real> *tmp_in) {
  KALDI_ASSERT(v != NULL);

  if (v->Dim()<=1) return;
@ -347,7 +347,7 @@ template<class Real> void ComplexFft(VectorBase<Real> *v, bool forward, Vector<R
 }

 //! Inefficient version of Fourier transform, for testing purposes.
-template<class Real> void RealFftInefficient (VectorBase<Real> *v, bool forward) {
+template<typename Real> void RealFftInefficient (VectorBase<Real> *v, bool forward) {
  KALDI_ASSERT(v != NULL);
  MatrixIndexT N = v->Dim();
  KALDI_ASSERT(N%2 == 0);
@ -388,7 +388,7 @@ void ComplexFft(VectorBase<double> *v, bool forward, Vector<double> *tmp_in);


 // See the long comment below for the math behind this.
-template<class Real> void RealFft (VectorBase<Real> *v, bool forward) {
+template<typename Real> void RealFft (VectorBase<Real> *v, bool forward) {
  KALDI_ASSERT(v != NULL);
  MatrixIndexT N = v->Dim(), N2 = N/2;
  KALDI_ASSERT(N%2 == 0);
@ -589,7 +589,7 @@ so: re(D_k)= 1/2 (im(B_k) + im(B_{N/2-k}))                             (z2)

 */

-template<class Real> void ComputeDctMatrix(Matrix<Real> *M) {
+template<typename Real> void ComputeDctMatrix(Matrix<Real> *M) {
  //KALDI_ASSERT(M->NumRows() == M->NumCols());
  MatrixIndexT K = M->NumRows();
  MatrixIndexT N = M->NumCols();
@ -612,7 +612,7 @@ template void ComputeDctMatrix(Matrix<float> *M);
 template void ComputeDctMatrix(Matrix<double> *M);


-template<class Real>
+template<typename Real>
 void MatrixExponential<Real>::Clear() {
  N_ = 0;
  P_.Resize(0, 0);
@ -620,7 +620,7 @@ void MatrixExponential<Real>::Clear() {
  powers_.clear();
 }

-template<class Real>
+template<typename Real>
 void MatrixExponential<Real>::Compute(const MatrixBase<Real> &M,
                                      MatrixBase<Real> *X) {
  // does *X = exp(M)
@ -650,7 +650,7 @@ void MatrixExponential<Real>::Compute(const MatrixBase<Real> &M,
    (*X)(i, i) += 1.0;
 };

-template<class Real>
+template<typename Real>
 void MatrixExponential<Real>::Compute(const SpMatrix<Real> &M,
                                      SpMatrix<Real> *X) {
  Matrix<Real> Mfull(M), Xfull(M.NumRows(), M.NumCols());
@ -659,7 +659,7 @@ void MatrixExponential<Real>::Compute(const SpMatrix<Real> &M,
 }


-template<class Real>
+template<typename Real>
 MatrixIndexT MatrixExponential<Real>::ComputeN(const MatrixBase<Real> &M) {
  // Computes the power of two we want to use.  Aim to get
  // AScaled.FrobeniusNorm() < 1/10.
@ -674,7 +674,7 @@ MatrixIndexT MatrixExponential<Real>::ComputeN(const MatrixBase<Real> &M) {
  return N;
 }

-template<class Real>
+template<typename Real>
 void MatrixExponential<Real>::ComputeTaylor(const MatrixBase<Real> &P, MatrixBase<Real> *B0) {
  KALDI_ASSERT(P.FrobeniusNorm() < 1.001);  // should actually be << 1
  // for this to work fast enough.
@ -710,7 +710,7 @@ void MatrixExponential<Real>::ComputeTaylor(const MatrixBase<Real> &P, MatrixBas
  }
 }

-template<class Real>
+template<typename Real>
 void MatrixExponential<Real>::Backprop(const MatrixBase<Real> &hX,
                                       MatrixBase<Real> *hM) const {
  MatrixIndexT dim = P_.NumRows();
@ -747,7 +747,7 @@ void MatrixExponential<Real>::Backprop(const MatrixBase<Real> &hX,
 }


-template<class Real>
+template<typename Real>
 void MatrixExponential<Real>::Backprop(const SpMatrix<Real> &hX,
                                       SpMatrix<Real> *hM) const {
  Matrix<Real> hXfull(hX), hMfull(hX.NumRows(), hX.NumCols());
@ -756,7 +756,7 @@ void MatrixExponential<Real>::Backprop(const SpMatrix<Real> &hX,
 }


-template<class Real>
+template<typename Real>
 void MatrixExponential<Real>::BackpropTaylor(const MatrixBase<Real> &hB0,
                                             MatrixBase<Real> *hP) const {
  // Backprop through the Taylor-series computation.
@ -819,7 +819,7 @@ template class MatrixExponential<float>;
 template class MatrixExponential<double>;


-template<class Real>
+template<typename Real>
 void ComputePca(const MatrixBase<Real> &X,
                MatrixBase<Real> *U,
                MatrixBase<Real> *A,
@ -861,7 +861,7 @@ void ComputePca(const MatrixBase<Real> &X,
      A->AddMatMat(1.0, X, kNoTrans, *U, kTrans, 0.0);
  } else {  // Do inner-product PCA.
    SpMatrix<Real> Nsp(N);  // Matrix of inner products.
-    Nsp.AddMat2(1.0, X, kNoTrans);  // M <-- X X^T
+    Nsp.AddMat2(1.0, X, kNoTrans, 0.0);  // M <-- X X^T

    Matrix<Real> Vtmp;
    Vector<Real> l;
@ -929,7 +929,7 @@ void ComputePca(const MatrixBase<double> &X,
 // Added by Dan, Feb. 13 2012. 
 // This function does: *plus += max(0, a b^T),
 // *minus += max(0, -(a b^T)).
-template<class Real>
+template<typename Real>
 void AddOuterProductPlusMinus(Real alpha,
                              const VectorBase<Real> &a,
                              const VectorBase<Real> &b,
--- a/src/matrix/matrix-functions.h
+++ b/src/matrix/matrix-functions.h
@ -59,12 +59,12 @@ namespace kaldi {
   in some contexts, the transform is made symmetric by multiplying
   by sqrt(N) in both passes.   The user can do this by themselves.
 */
-template<class Real> void ComplexFft (VectorBase<Real> *v, bool forward, Vector<Real> *tmp_work = NULL);
+template<typename Real> void ComplexFft (VectorBase<Real> *v, bool forward, Vector<Real> *tmp_work = NULL);

 /// ComplexFt is the same as ComplexFft but it implements the Fourier
 /// transform in an inefficient way.  It is mainly included for testing purposes.
 /// See comment for ComplexFft to describe the input and outputs and what it does.
-template<class Real> void ComplexFt (const VectorBase<Real> &in,
+template<typename Real> void ComplexFt (const VectorBase<Real> &in,
                                     VectorBase<Real> *out, bool forward);

 /// RealFft is a fourier transform of real inputs.  Internally it uses
@ -76,12 +76,12 @@ template<class Real> void ComplexFt (const VectorBase<Real> &in,
 /// The interpretation of the complex-FFT data is as follows: the array
 /// is a sequence of complex numbers C_n of length N/2 with (real, im) format,
 /// i.e. [real0, real_{N/2}, real1, im1, real2, im2, real3, im3, ...].
-template<class Real> void RealFft (VectorBase<Real> *v, bool forward);
+template<typename Real> void RealFft (VectorBase<Real> *v, bool forward);


 /// RealFt has the same input and output format as RealFft above, but it is
 /// an inefficient implementation included for testing purposes.
-template<class Real> void RealFftInefficient (VectorBase<Real> *v, bool forward);
+template<typename Real> void RealFftInefficient (VectorBase<Real> *v, bool forward);

 /// ComputeDctMatrix computes a matrix corresponding to the DCT, such that
 /// M * v equals the DCT of vector v.  M must be square at input.
@ -97,21 +97,21 @@ template<class Real> void RealFftInefficient (VectorBase<Real> *v, bool forward)
 /// because it was this way from the start and changing it would affect the
 /// feature generation.

-template<class Real> void ComputeDctMatrix(Matrix<Real> *M);
+template<typename Real> void ComputeDctMatrix(Matrix<Real> *M);


 /// ComplexMul implements, inline, the complex multiplication b *= a.
-template<class Real> inline void ComplexMul(const Real &a_re, const Real &a_im,
+template<typename Real> inline void ComplexMul(const Real &a_re, const Real &a_im,
                                            Real *b_re, Real *b_im);

 /// ComplexMul implements, inline, the complex operation c += (a * b).
-template<class Real> inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
+template<typename Real> inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
                                                   const Real &b_re, const Real &b_im,
                                                   Real *c_re, Real *c_im);


 /// ComplexImExp implements a <-- exp(i x), inline.
-template<class Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im);
+template<typename Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im);


 // This class allows you to compute the matrix exponential function
@ -122,7 +122,7 @@ template<class Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im);
 // It also provides a function that allows you do back-propagate the
 // derivative of a scalar function through this calculation.
 // The
-template<class Real>
+template<typename Real>
 class MatrixExponential {
 public:
  MatrixExponential() { }
@ -194,7 +194,7 @@ class MatrixExponential {
         method.
 */

-template<class Real>
+template<typename Real>
 void ComputePca(const MatrixBase<Real> &X,
                MatrixBase<Real> *U,
                MatrixBase<Real> *A,
@ -205,14 +205,14 @@ void ComputePca(const MatrixBase<Real> &X,

 // This function does: *plus += max(0, a b^T),
 // *minus += max(0, -(a b^T)).
-template<class Real>
+template<typename Real>
 void AddOuterProductPlusMinus(Real alpha,
                              const VectorBase<Real> &a,
                              const VectorBase<Real> &b,
                              MatrixBase<Real> *plus, 
                              MatrixBase<Real> *minus);

-template<class Real1, class Real2>
+template<typename Real1, typename Real2>
 inline void AssertSameDim(const MatrixBase<Real1> &mat1, const MatrixBase<Real2> &mat2) {
  KALDI_ASSERT(mat1.NumRows() == mat2.NumRows()
               && mat1.NumCols() == mat2.NumCols());
--- a/src/matrix/matrix-lib-test.cc
+++ b/src/matrix/matrix-lib-test.cc
--- a/src/matrix/optimization.cc
+++ b/src/matrix/optimization.cc
@ -28,7 +28,7 @@ namespace kaldi {

 // Below, N&W refers to Nocedal and Wright, "Numerical Optimization", 2nd Ed.

-template<class Real>
+template<typename Real>
 OptimizeLbfgs<Real>::OptimizeLbfgs(const VectorBase<Real> &x,
                                   const LbfgsOptions &opts):
    opts_(opts), k_(0), computation_state_(kBeforeStep), H_was_set_(false) {
@ -48,7 +48,7 @@ OptimizeLbfgs<Real>::OptimizeLbfgs(const VectorBase<Real> &x,
 }


-template<class Real>
+template<typename Real>
 Real OptimizeLbfgs<Real>::RecentStepLength() const {
  size_t n = step_lengths_.size();
  if (n == 0) return std::numeric_limits<Real>::infinity();
@ -63,7 +63,7 @@ Real OptimizeLbfgs<Real>::RecentStepLength() const {
  }
 }

-template<class Real>
+template<typename Real>
 void OptimizeLbfgs<Real>::ComputeHifNeeded(const VectorBase<Real> &gradient) {
  if (k_ == 0) {
    if (H_.Dim() == 0) {
@ -107,7 +107,7 @@ void OptimizeLbfgs<Real>::ComputeHifNeeded(const VectorBase<Real> &gradient) {
 // This represents the first 2 lines of Algorithm 7.5 (N&W), which
 // in fact is mostly a call to Algorithm 7.4.
 // Note: this is valid whether we are minimizing or maximizing.
-template<class Real>
+template<typename Real>
 void OptimizeLbfgs<Real>::ComputeNewDirection(Real function_value,
                                              const VectorBase<Real> &gradient) {
  KALDI_ASSERT(computation_state_ == kBeforeStep);
@ -166,7 +166,7 @@ void OptimizeLbfgs<Real>::ComputeNewDirection(Real function_value,
 }


-template<class Real>
+template<typename Real>
 bool OptimizeLbfgs<Real>::AcceptStep(Real function_value,
                                     const VectorBase<Real> &gradient) {
  // Save s_k = x_{k+1} - x_{k}, and y_k = \nabla f_{k+1} - \nabla f_k.
@ -200,7 +200,7 @@ bool OptimizeLbfgs<Real>::AcceptStep(Real function_value,
  return true; // We successfully accepted the step.
 }

-template<class Real>
+template<typename Real>
 void OptimizeLbfgs<Real>::RecordStepLength(Real s) {
  step_lengths_.push_back(s);
  if (step_lengths_.size() > static_cast<size_t>(opts_.avg_step_length))
@ -208,7 +208,7 @@ void OptimizeLbfgs<Real>::RecordStepLength(Real s) {
 }


-template<class Real>
+template<typename Real>
 void OptimizeLbfgs<Real>::Restart(const VectorBase<Real> &x,
                                  Real f,
                                  const VectorBase<Real> &gradient) {
@ -231,7 +231,7 @@ void OptimizeLbfgs<Real>::Restart(const VectorBase<Real> &x,
  ComputeNewDirection(f, gradient);
 }

-template<class Real>
+template<typename Real>
 void OptimizeLbfgs<Real>::StepSizeIteration(Real function_value,
                                            const VectorBase<Real> &gradient) {
  KALDI_VLOG(3) << "In step size iteration, function value changed "
@ -376,7 +376,7 @@ void OptimizeLbfgs<Real>::StepSizeIteration(Real function_value,
  }
 }

-template<class Real>
+template<typename Real>
 void OptimizeLbfgs<Real>::DoStep(Real function_value,
                                 const VectorBase<Real> &gradient) {
  if (opts_.minimize ? function_value < best_f_ : function_value > best_f_) {
@ -389,7 +389,7 @@ void OptimizeLbfgs<Real>::DoStep(Real function_value,
    StepSizeIteration(function_value, gradient);
 }

-template<class Real>
+template<typename Real>
 void OptimizeLbfgs<Real>::DoStep(Real function_value,
                                 const VectorBase<Real> &gradient,
                                 const VectorBase<Real> &diag_approx_2nd_deriv) {
@ -408,7 +408,7 @@ void OptimizeLbfgs<Real>::DoStep(Real function_value,
  DoStep(function_value, gradient);
 }

-template<class Real>
+template<typename Real>
 const VectorBase<Real>&
 OptimizeLbfgs<Real>::GetValue(Real *objf_value) const {
  if (objf_value != NULL) *objf_value = best_f_;
--- a/src/matrix/optimization.h
+++ b/src/matrix/optimization.h
@ -83,7 +83,7 @@ struct LbfgsOptions {
      avg_step_length(4) { }
 };

-template<class Real>
+template<typename Real>
 class OptimizeLbfgs {
 public:
  /// Initializer takes the starting value of x.
--- a/src/matrix/packed-matrix.cc
+++ b/src/matrix/packed-matrix.cc
@ -36,12 +36,6 @@ void PackedMatrix<Real>::Scale(Real alpha) {
  cblas_Xscal(sz, alpha, data_, 1);
 }

-template<typename Real>
-void PackedMatrix<Real>::AddVec2(const Real alpha, const Vector<Real> &rv) {
-  KALDI_ASSERT(rv.Dim() == num_rows_);
-  cblas_Xspr(rv.Dim(), alpha, rv.Data(), 1, data_);
-}
-
 template<typename Real>
 void PackedMatrix<Real>::AddPacked(const Real alpha, const PackedMatrix<Real> &rMa) {
  KALDI_ASSERT(num_rows_ == rMa.NumRows());
@ -50,7 +44,7 @@ void PackedMatrix<Real>::AddPacked(const Real alpha, const PackedMatrix<Real> &r
  cblas_Xaxpy(sz, alpha, rMa.Data(), 1, data_, 1);
 }

-template<class Real>
+template<typename Real>
 void PackedMatrix<Real>::SetRandn() {
  Real *data = data_;
  size_t dim = num_rows_, size = ((dim*(dim+1))/2);
@ -89,6 +83,12 @@ void PackedMatrix<Real>::Swap(PackedMatrix<Real> *other) {
  std::swap(num_rows_, other->num_rows_);
 }

+template<typename Real>
+void PackedMatrix<Real>::Swap(Matrix<Real> *other) {  // Shallow swap with a Matrix: pointers are exchanged, no elements are copied.
+  std::swap(data_, other->data_);  // exchange the raw data pointers
+  std::swap(num_rows_, other->num_rows_);  // NOTE(review): only the row count is exchanged; any other dimension fields of Matrix are untouched here -- confirm callers account for that
+}
+

 template<typename Real>
 void PackedMatrix<Real>::Resize(MatrixIndexT r, MatrixResizeType resize_type) {
@ -119,6 +119,15 @@ void PackedMatrix<Real>::Resize(MatrixIndexT r, MatrixResizeType resize_type) {



+template<typename Real>
+void PackedMatrix<Real>::AddToDiag(Real r) {  // Adds r to every diagonal element of the packed matrix.
+  Real *ptr = data_;  // starts at element (0,0), the first diagonal entry
+  for (MatrixIndexT i = 2; i <= num_rows_+1; i++) {  // executes num_rows_ times, once per row
+    *ptr += r;
+    ptr += i;  // row k holds k+1 packed values, so diagonal offsets are 0, 2, 5, 9, ...
+  }
+}
+
 template<typename Real>
 void PackedMatrix<Real>::ScaleDiag(Real alpha) {
  Real *ptr = data_;
@ -138,6 +147,7 @@ void PackedMatrix<Real>::SetDiag(Real alpha) {
 }


+
 template<typename Real>
 template<typename OtherReal>
 void PackedMatrix<Real>::CopyFromPacked(const PackedMatrix<OtherReal> &orig) {
@ -221,35 +231,45 @@ void PackedMatrix<Real>::Destroy() {
  num_rows_ = 0;
 }

+
 template<typename Real>
 void PackedMatrix<Real>::Write(std::ostream &os, bool binary) const {
  if (!os.good()) {
    KALDI_ERR << "Failed to write vector to stream: stream not good";
  }
-  std::string my_token = (sizeof(Real) == 4 ? "FP" : "DP");
-
-  WriteToken(os, binary, my_token);

  int32 size = this->NumRows();  // make the size 32-bit on disk.
  KALDI_ASSERT(this->NumRows() == (MatrixIndexT) size);
-  WriteBasicType(os, binary, size);
+  MatrixIndexT num_elems = ((size+1)*(MatrixIndexT)size)/2;

+  if(binary) {  
+    std::string my_token = (sizeof(Real) == 4 ? "FP" : "DP");
+    WriteToken(os, binary, my_token);
+    WriteBasicType(os, binary, size);
  // We don't use the built-in Kaldi write routines for the floats, as they are
  // not efficient enough.
-  MatrixIndexT num_elems = ((size+1)*(MatrixIndexT)size)/2;
-  if (!binary) {
-    for (MatrixIndexT i = 0; i < num_elems; i++)
-      WriteBasicType(os, binary, data_[i]);
-    os << '\n';
-  } else {
    os.write((const char*) data_, sizeof(Real) * num_elems);
  }
+  else {
+    if(size == 0)
+      os<<"[ ]\n";
+    else {
+      os<<"[\n";
+      MatrixIndexT i = 0;
+      for (int32 j = 0; j < size; j++) {  
+        for (int32 k = 0; k < j + 1; k++) {
+          WriteBasicType(os, binary, data_[i++]);
+        }
+        os << ( (j==size-1)? "]\n" : "\n");
+      }
+      KALDI_ASSERT(i == num_elems);
+    }
+  }
  if (os.fail()) {
    KALDI_ERR << "Failed to write packed matrix to stream";
  }
 }

-
 // template<typename Real>
 //   void Save (std::ostream & os, const PackedMatrix<Real>& rM)
 //   {
@ -275,7 +295,7 @@ void PackedMatrix<Real>::Write(std::ostream &os, bool binary) const {


 template<typename Real>
-void PackedMatrix<Real>::Read(std::istream & is, bool binary, bool add) {
+void PackedMatrix<Real>::Read(std::istream& is, bool binary, bool add) {
  if (add) {
    PackedMatrix<Real> tmp;
    tmp.Read(is, binary, false);  // read without adding.
@ -295,6 +315,8 @@ void PackedMatrix<Real>::Read(std::istream & is, bool binary, bool add) {
  MatrixIndexT pos_at_start = is.tellg();
  int peekval = Peek(is, binary);
  const char *my_token =  (sizeof(Real) == 4 ? "FP" : "DP");
+  const char *new_format_token = "[";
+  bool is_new_format = false;//added by hxu
  char other_token_start = (sizeof(Real) == 4 ? 'D' : 'F');
  int32 size;
  MatrixIndexT num_elems;
@ -310,25 +332,93 @@ void PackedMatrix<Real>::Read(std::istream & is, bool binary, bool add) {
  std::string token;
  ReadToken(is, binary, &token);
  if (token != my_token) {
-    specific_error << ": Expected token " << my_token << ", got " << token;
-    goto bad;
-  }
-  ReadBasicType(is, binary, &size);  // throws on error.
-  if ((MatrixIndexT)size != this->NumRows()) {
-    KALDI_ASSERT(size>=0);
-    this->Resize(size);
-  }
-  num_elems = ((size+1)*(MatrixIndexT)size)/2;
-  if (!binary) {
-    for (MatrixIndexT i = 0; i < num_elems; i++) {
-      ReadBasicType(is, false, data_+i);  // will throw on error.
+    if(token != new_format_token) {
+      specific_error << ": Expected token " << my_token << ", got " << token;
+      goto bad;
    }
-  } else {
-    if (num_elems)
-      is.read(reinterpret_cast<char*>(data_), sizeof(Real)*num_elems);
+    //new format it is
+    is_new_format = true; 
+  }
+  if(!is_new_format) {
+    ReadBasicType(is, binary, &size);  // throws on error.
+    if ((MatrixIndexT)size != this->NumRows()) {
+      KALDI_ASSERT(size>=0);
+      this->Resize(size);
+    }
+    num_elems = ((size+1)*(MatrixIndexT)size)/2;
+    if (!binary) {
+      for (MatrixIndexT i = 0; i < num_elems; i++) {
+        ReadBasicType(is, false, data_+i);  // will throw on error.
+      }
+    } else {
+      if (num_elems)
+        is.read(reinterpret_cast<char*>(data_), sizeof(Real)*num_elems);
+    }
+    if (is.fail()) goto bad;
+    return;
+  }
+  else {
+    std::vector<Real> data;
+    while(1) {
+      int32 num_lines = 0;
+      int i = is.peek();
+      if (i == -1) { specific_error << "Got EOF while reading matrix data"; goto bad; }
+      else if (static_cast<char>(i) == ']') {  // Finished reading matrix.
+        is.get();  // eat the "]".
+        i = is.peek();
+        if (static_cast<char>(i) == '\r') {
+          is.get();
+          is.get();  // get \r\n (must eat what we wrote)
+        }  // '\r' seen: consume it and the character after it (expected to be '\n').
+        else if (static_cast<char>(i) == '\n') { is.get(); } // get \n (must eat what we wrote)
+
+        if (is.fail()) {
+          KALDI_WARN << "After end of matrix data, read error.";
+          // we got the data we needed, so just warn for this error.
+        }
+        //now process the data:
+        num_lines = int32(sqrt(data.size()*2));
+        
+        KALDI_ASSERT(data.size() == num_lines*(num_lines+1)/2);
+
+        this->Resize(num_lines);
+
+        //std::cout<<data.size()<<' '<<num_lines<<'\n';
+
+        for(int32 i = 0; i < data.size(); i++) {
+          data_[i] = data[i];
+        }
+        return;
+        //std::cout<<"here!!!!!hxu!!!!!"<<std::endl;
+      }
+      else if ( (i >= '0' && i <= '9') || i == '-' ) {  // A number...
+        Real r; 
+        is >> r;
+        if (is.fail()) {
+          specific_error << "Stream failure/EOF while reading matrix data.";
+          goto bad;
+        } 
+        data.push_back(r);
+      }
+      else if (isspace(i)) {
+        is.get();  // eat the space and do nothing.
+      } else {  // NaN or inf or error.
+        std::string str;
+        is >> str;
+        if (!KALDI_STRCASECMP(str.c_str(), "inf") ||
+            !KALDI_STRCASECMP(str.c_str(), "infinity")) {
+          data.push_back(std::numeric_limits<Real>::infinity());
+          KALDI_WARN << "Reading infinite value into matrix.";
+        } else if (!KALDI_STRCASECMP(str.c_str(), "nan")) {
+          data.push_back(std::numeric_limits<Real>::quiet_NaN());
+          KALDI_WARN << "Reading NaN value into matrix.";
+        } else {
+          specific_error << "Expecting numeric matrix data, got " << str;
+          goto bad;
+        } 
+      }       
+    } 
  }
-  if (is.fail()) goto bad;
-  return;
 bad:
  KALDI_ERR << "Failed to read packed matrix from stream. " << specific_error
            << " File position at start is "
--- a/src/matrix/packed-matrix.h
+++ b/src/matrix/packed-matrix.h
@ -1,7 +1,8 @@
 // matrix/packed-matrix.h

-// Copyright 2009-2012  Ondrej Glembek;  Lukas Burget;  Microsoft Corporation;
-//                      Saarland University;  Yanmin Qian;  Johns Hopkins University (Author: Daniel Povey)
+// Copyright 2009-2013  Ondrej Glembek;  Lukas Burget;  Microsoft Corporation;
+//                      Saarland University;  Yanmin Qian;
+//                      Johns Hopkins University (Author: Daniel Povey)

 // See ../../COPYING for clarification regarding multiple authors
 //
@ -37,28 +38,29 @@ std::ostream & operator <<(std::ostream & out, const PackedMatrix<Real>& M);

 /// @brief Packed matrix: base class for triangular and symmetric matrices.
 template<typename Real> class PackedMatrix {
+  friend class CuPackedMatrix<Real>;
 public:
+  //friend class CuPackedMatrix<Real>;
+
  PackedMatrix() : data_(NULL), num_rows_(0) {}

  explicit PackedMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero):
      data_(NULL) {  Resize(r, resize_type);  }

  explicit PackedMatrix(const PackedMatrix<Real> &orig) : data_(NULL) {
-    Resize(orig.num_rows_);
+    Resize(orig.num_rows_, kUndefined);
    CopyFromPacked(orig);
  }

-  template<class OtherReal>
+  template<typename OtherReal>
  explicit PackedMatrix(const PackedMatrix<OtherReal> &orig) : data_(NULL) {
-    Resize(orig.NumRows());
+    Resize(orig.NumRows(), kUndefined);
    CopyFromPacked(orig);
  }
  
-  void SetZero();
+  void SetZero();  /// < Set to zero
  void SetUnit();  /// < Set to unit matrix.
-
-  /// Sets to random values of a normal distribution
-  void SetRandn();
+  void SetRandn(); /// < Set to random values of a normal distribution

  Real Trace() const;

@ -82,17 +84,19 @@ template<typename Real> class PackedMatrix {
  /// This function takes time proportional to the number of data elements.
  void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero);

+  void AddToDiag(const Real r); // Adds r to the diagonal.
+
  void ScaleDiag(const Real alpha);  // Scales diagonal by alpha.

  void SetDiag(const Real alpha);  // Sets diagonal to this value.

-  template<class OtherReal>
+  template<typename OtherReal>
  void CopyFromPacked(const PackedMatrix<OtherReal> &orig);
-
+  
  /// CopyFromVec just interprets the vector as having the same layout
  /// as the packed matrix.  Must have the same dimension, i.e.
  /// orig.Dim() == (NumRows()*(NumRows()+1)) / 2;
-  template<class OtherReal>
+  template<typename OtherReal>
  void CopyFromVec(const SubVector<OtherReal> &orig);
  
  Real* Data() { return data_; }
@ -104,6 +108,8 @@ template<typename Real> class PackedMatrix {
    return ((nr * (nr+1)) / 2) * sizeof(Real);
  }

+  //MatrixIndexT Stride() const { return stride_; }
+
  // This code is duplicated in child classes to avoid extra levels of calls.
  Real operator() (MatrixIndexT r, MatrixIndexT c) const {
    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
@ -134,10 +140,6 @@ template<typename Real> class PackedMatrix {
    return * (std::min_element(data_, data_ + ((num_rows_*(num_rows_+1))/2) ));
  }

-
-  // *this <-- *this + alpha* rV * rV^T.
-  // The "2" in the name is because the argument is repeated.
-  void AddVec2(const Real alpha, const Vector<Real> &rv);
  void Scale(Real c);

  friend std::ostream & operator << <> (std::ostream & out,
@ -147,18 +149,20 @@ template<typename Real> class PackedMatrix {
  void Read(std::istream &in, bool binary, bool add = false);

  void Write(std::ostream &out, bool binary) const;
-  // binary = true is not yet supported.
-
+  
  void Destroy();

  /// Swaps the contents of *this and *other.  Shallow swap.
  void Swap(PackedMatrix<Real> *other);
+  void Swap(Matrix<Real> *other);
+

 protected:
  // Will only be called from this class or derived classes.
  void AddPacked(const Real alpha, const PackedMatrix<Real>& M);
  Real *data_;
  MatrixIndexT num_rows_;
+  //MatrixIndexT stride_;
 private:
  /// Init assumes the current contents of the class are is invalid (i.e. junk or
  /// has already been freed), and it sets the matrixd to newly allocated memory
@ -189,9 +193,5 @@ std::istream & operator >> (std::istream &is, PackedMatrix<Real> &M) {

 }  // namespace kaldi

-
-// Including the implementation
-#include "matrix/packed-matrix-inl.h"
-
 #endif

--- a/src/matrix/qr.cc
+++ b/src/matrix/qr.cc
@ -37,7 +37,7 @@ namespace kaldi {
   x is the input of dimensino dim, v is the output of dimension
   dim, and beta is a scalar. Note: we use zero-based
   not one-based indexing. */
-template<class Real>
+template<typename Real>
 void House(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
  KALDI_ASSERT(dim > 0);
  // To avoid overflow, we first compute the max of x_ (or
@ -84,7 +84,7 @@ void House(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
 // the vector that is "special".  This is convenient in
 // the Tridiagonalize routine that uses reversed indexes for
 // compatibility with the packed lower triangular format.
-template<class Real>
+template<typename Real>
 void HouseBackward(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
  KALDI_ASSERT(dim > 0);
  // To avoid overflow, we first compute the max of x_ (or
@ -138,7 +138,7 @@ void HouseBackward(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
   Caution: Q is transposed vs. Golub and Van Loan.
   If Q != NULL it outputs Q. 
 */
-template<class Real>
+template<typename Real>
 void SpMatrix<Real>::Tridiagonalize(MatrixBase<Real> *Q) {
  MatrixIndexT n = this->NumRows();
  KALDI_ASSERT(Q == NULL || (Q->NumRows() == n &&
@ -194,7 +194,7 @@ template
 void SpMatrix<double>::Tridiagonalize(MatrixBase<double> *Q);

 /// Create Givens rotations, as in Golub and Van Loan 3rd ed., page 216.
-template<class Real>
+template<typename Real>
 inline void Givens(Real a, Real b, Real *c, Real *s) {
  if (b == 0) {
    *c = 1;
@ -218,7 +218,7 @@ inline void Givens(Real a, Real b, Real *c, Real *s) {
 // with Wilkinson shift."  A couple of differences: this code is
 // in zero based arithmetic, and we represent Q transposed from
 // their Q for memory locality with row-major-indexed matrices.
-template <class Real>
+template <typename Real>
 void QrStep(MatrixIndexT n,
            Real *diag,
            Real *off_diag,
@ -294,7 +294,7 @@ void QrStep(MatrixIndexT n,
 // Internal code for the QR algorithm, where the diagonal
 // and off-diagonal of the symmetric matrix are represented as
 // vectors of length n and n-1.
-template <class Real>
+template <typename Real>
 void QrInternal(MatrixIndexT n,
                Real *diag,
                Real *off_diag,
@ -372,7 +372,7 @@ void QrInternal(MatrixIndexT n,
   This is the symmetric QR algorithm, from Golub and Van Loan 3rd ed., Algorithm
   8.3.3.  Q is transposed w.r.t. there, though.
 */
-template <class Real>
+template <typename Real>
 void SpMatrix<Real>::Qr(MatrixBase<Real> *Q) {
  KALDI_ASSERT(this->IsTridiagonal());
  // We envisage that Q would be square but we don't check for this,
@ -396,7 +396,7 @@ void SpMatrix<Real>::Qr(MatrixBase<Real> *Q) {
  }
 }

-template<class Real>
+template<typename Real>
 void SpMatrix<Real>::Eig(VectorBase<Real> *s, MatrixBase<Real> *P) const {
  MatrixIndexT dim = this->NumRows();
  KALDI_ASSERT(s->Dim() == dim);
@ -417,7 +417,7 @@ void SpMatrix<Real>::Eig(VectorBase<Real> *s, MatrixBase<Real> *P) const {
 }


-template<class Real>
+template<typename Real>
 void SpMatrix<Real>::TopEigs(VectorBase<Real> *s, MatrixBase<Real> *P,
                             MatrixIndexT lanczos_dim) const {
  const SpMatrix<Real> &S(*this); // call this "S" for easy notation.
--- a/src/matrix/sp-matrix.cc
+++ b/src/matrix/sp-matrix.cc
@ -193,7 +193,7 @@ void SpMatrix<Real>::CopyFromMat(const MatrixBase<Real> &M,
  }
 }

-template<class Real>
+template<typename Real>
 Real SpMatrix<Real>::Trace() const {
  const Real *data = this->data_;
  MatrixIndexT num_rows = this->num_rows_;
@ -204,8 +204,8 @@ Real SpMatrix<Real>::Trace() const {
 }

 // diagonal update, this <-- this + diag(v)
-template<class Real>
-template<class OtherReal>
+template<typename Real>
+template<typename OtherReal>
 void  SpMatrix<Real>::AddVec(const Real alpha, const VectorBase<OtherReal> &v) {
  int32 num_rows = this->num_rows_;
  KALDI_ASSERT(num_rows == v.Dim() && num_rows > 0);
@ -316,7 +316,7 @@ void SpMatrix<Real>::Invert(Real *logdet, Real *det_sign, bool need_inverse) {
 }
 #else
 // in the ATLAS case, these are not implemented using a library and we back off to something else.
-template<class Real>
+template<typename Real>
 void SpMatrix<Real>::Invert(Real *logdet, Real *det_sign, bool need_inverse) {
  Matrix<Real> M(this->NumRows(), this->NumCols());
  M.CopyFromSp(*this);
@ -481,7 +481,7 @@ double TraceMatSpMatSp(const MatrixBase<double> &A, MatrixTransposeType transA,
                       MatrixTransposeType transC, const SpMatrix<double> &D);


-template<class Real>
+template<typename Real>
 bool SpMatrix<Real>::IsDiagonal(Real cutoff) const {
  MatrixIndexT R = this->NumRows();
  Real bad_sum = 0.0, good_sum = 0.0;
@ -496,7 +496,7 @@ bool SpMatrix<Real>::IsDiagonal(Real cutoff) const {
  return (!(bad_sum > good_sum * cutoff));
 }

-template<class Real>
+template<typename Real>
 bool SpMatrix<Real>::IsUnit(Real cutoff) const {
  MatrixIndexT R = this->NumRows();
  Real max = 0.0;  // max error
@ -507,7 +507,7 @@ bool SpMatrix<Real>::IsUnit(Real cutoff) const {
  return (max <= cutoff);
 }

-template<class Real>
+template<typename Real>
 bool SpMatrix<Real>::IsTridiagonal(Real cutoff) const {
  MatrixIndexT R = this->NumRows();
  Real max_abs_2diag = 0.0, max_abs_offdiag = 0.0;
@ -523,13 +523,13 @@ bool SpMatrix<Real>::IsTridiagonal(Real cutoff) const {
  return (max_abs_offdiag <= cutoff * max_abs_2diag);
 }

-template<class Real>
+template<typename Real>
 bool SpMatrix<Real>::IsZero(Real cutoff) const {
  if (this->num_rows_ == 0) return true;
  return (this->Max() <= cutoff && this->Min() >= -cutoff);
 }

-template<class Real>
+template<typename Real>
 Real SpMatrix<Real>::FrobeniusNorm() const {
  Real sum = 0.0;
  MatrixIndexT R = this->NumRows();
@ -541,14 +541,14 @@ Real SpMatrix<Real>::FrobeniusNorm() const {
  return sqrt(sum);
 }

-template<class Real>
+template<typename Real>
 bool SpMatrix<Real>::ApproxEqual(const SpMatrix<Real> &other, float tol) const {
  if (this->NumRows() != other.NumRows())
    KALDI_ERR << "SpMatrix::AproxEqual, size mismatch, "
              << this->NumRows() << " vs. " << other.NumRows();
  SpMatrix<Real> tmp(*this);
  tmp.AddSp(-1.0, other);
-  return (tmp.FrobeniusNorm() <= tol * this->FrobeniusNorm());
+  return (tmp.FrobeniusNorm() <= tol * std::max(this->FrobeniusNorm(), other.FrobeniusNorm()));
 }

 // function Floor: A = Floor(B, alpha * C) ... see tutorial document.
@ -600,7 +600,7 @@ int SpMatrix<Real>::ApplyFloor(const SpMatrix<Real> &C, Real alpha,
  return nfloored;
 }

-template<class Real>
+template<typename Real>
 Real SpMatrix<Real>::LogDet(Real *det_sign) const {
  Real log_det;
  SpMatrix<Real> tmp(*this);
@ -648,7 +648,7 @@ MatrixIndexT SpMatrix<Real>::LimitCond(Real maxCond, bool invert) {  // e.g. max
      s(i) = sqrt(std::max(s(i), floor));
  }
  P.MulColsVec(s);
-  (*this).AddMat2(1.0, P, kNoTrans);  // (*this) = P*P^T.  ... (*this) = P * floor(s) * P^T  ... if P was original P.
+  (*this).AddMat2(1.0, P, kNoTrans, 0.0);  // (*this) = P*P^T.  ... (*this) = P * floor(s) * P^T  ... if P was original P.
  return nfloored;
 }

@ -965,8 +965,8 @@ void SpMatrix<double>::AddVec2(const double alpha, const VectorBase<double> &v)
 }


-template<class Real>
-template<class OtherReal>
+template<typename Real>
+template<typename OtherReal>
 void SpMatrix<Real>::AddVec2(const Real alpha, const VectorBase<OtherReal> &v) {
  KALDI_ASSERT(v.Dim() == this->NumRows());
  Real *data = this->data_;
@ -984,7 +984,7 @@ template
 void SpMatrix<double>::AddVec2(const double alpha, const VectorBase<float> &v);


-template<class Real>
+template<typename Real>
 Real VecSpVec(const VectorBase<Real> &v1, const SpMatrix<Real> &M,
              const VectorBase<Real> &v2) {
  MatrixIndexT D = M.NumRows();
@ -1002,7 +1002,7 @@ double VecSpVec(const VectorBase<double> &v1, const SpMatrix<double> &M,
                const VectorBase<double> &v2);


-template<class Real>
+template<typename Real>
 void SpMatrix<Real>::AddMat2Sp(
    const Real alpha, const MatrixBase<Real> &M,
    MatrixTransposeType transM, const SpMatrix<Real> &A, const Real beta) {
@ -1046,7 +1046,7 @@ void SpMatrix<Real>::AddMat2Sp(
  }
 }

-template<class Real>
+template<typename Real>
 void SpMatrix<Real>::AddSmat2Sp(
    const Real alpha, const MatrixBase<Real> &M,
    MatrixTransposeType transM, const SpMatrix<Real> &A,
@ -1101,7 +1101,7 @@ void SpMatrix<Real>::AddSmat2Sp(
  }
 }

-template<class Real>
+template<typename Real>
 void SpMatrix<Real>::AddMat2Vec(const Real alpha,
                                const MatrixBase<Real> &M,
                                MatrixTransposeType transM,
@ -1130,7 +1130,7 @@ void SpMatrix<Real>::AddMat2Vec(const Real alpha,
  }
 }

-template<class Real>
+template<typename Real>
 void SpMatrix<Real>::AddMat2(const Real alpha, const MatrixBase<Real> &M,
                             MatrixTransposeType transM, const Real beta)  {
  KALDI_ASSERT((transM == kNoTrans && this->NumRows() == M.NumRows())
@ -1159,7 +1159,7 @@ void SpMatrix<Real>::AddMat2(const Real alpha, const MatrixBase<Real> &M,
  this->CopyFromMat(temp_mat, kTakeLower);
 }

-template<class Real>
+template<typename Real>
 void SpMatrix<Real>::AddTp2Sp(const Real alpha, const TpMatrix<Real> &T,
                              MatrixTransposeType transM, const SpMatrix<Real> &A,
                              const Real beta) {
@ -1167,7 +1167,7 @@ void SpMatrix<Real>::AddTp2Sp(const Real alpha, const TpMatrix<Real> &T,
  AddMat2Sp(alpha, Tmat, transM, A, beta);
 }

-template<class Real>
+template<typename Real>
 void SpMatrix<Real>::AddVecVec(const Real alpha, const VectorBase<Real> &v,
                               const VectorBase<Real> &w) {
  int32 dim = this->NumRows();
@ -1176,7 +1176,7 @@ void SpMatrix<Real>::AddVecVec(const Real alpha, const VectorBase<Real> &v,
 }


-template<class Real>
+template<typename Real>
 void SpMatrix<Real>::AddTp2(const Real alpha, const TpMatrix<Real> &T,
                            MatrixTransposeType transM, const Real beta) {
  Matrix<Real> Tmat(T);
@ -1191,7 +1191,7 @@ template class SpMatrix<float>;
 template class SpMatrix<double>;


-template<class Real>
+template<typename Real>
 Real TraceSpSpLower(const SpMatrix<Real> &A, const SpMatrix<Real> &B) {
  MatrixIndexT adim = A.NumRows();
  KALDI_ASSERT(adim == B.NumRows());
--- a/src/matrix/sp-matrix.h
+++ b/src/matrix/sp-matrix.h
@ -28,14 +28,6 @@

 namespace kaldi {

-/// \weakgroup matrix_funcs_misc
-typedef enum {
-  kTakeLower,
-  kTakeUpper,
-  kTakeMean,
-  kTakeMeanAndCheck
-} SpCopyType;
-

 /// \addtogroup matrix_group
 /// @{
@ -47,19 +39,25 @@ template<typename Real> class SpMatrix;
 */
 template<typename Real>
 class SpMatrix : public PackedMatrix<Real> {
+  friend class CuSpMatrix<Real>;
 public:
  // so it can use our assignment operator.
  friend class std::vector<Matrix<Real> >;

  SpMatrix(): PackedMatrix<Real>() {}

+  /// Copy constructor from CUDA version of SpMatrix
+  /// This is defined in ../cudamatrix/cu-sp-matrix.h
+  
+  explicit SpMatrix(const CuSpMatrix<Real> &cu);
+ 
  explicit SpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero)
      : PackedMatrix<Real>(r, resize_type) {}

  SpMatrix(const SpMatrix<Real> &orig)
      : PackedMatrix<Real>(orig) {}

-  template<class OtherReal>
+  template<typename OtherReal>
  explicit SpMatrix(const SpMatrix<OtherReal> &orig)
      : PackedMatrix<Real>(orig) {}

@ -77,8 +75,6 @@ class SpMatrix : public PackedMatrix<Real> {
  }
 #endif

-  ~SpMatrix() {}
-
  /// Shallow swap.
  void Swap(SpMatrix *other);

@ -90,7 +86,7 @@ class SpMatrix : public PackedMatrix<Real> {
    PackedMatrix<Real>::CopyFromPacked(other);
  }

-  template<class OtherReal>
+  template<typename OtherReal>
  void CopyFromSp(const SpMatrix<OtherReal> &other) {
    PackedMatrix<Real>::CopyFromPacked(other);
  }
@ -231,7 +227,7 @@ class SpMatrix : public PackedMatrix<Real> {
  Real LogDet(Real *det_sign = NULL) const;

  /// rank-one update, this <-- this + alpha v v'
-  template<class OtherReal>
+  template<typename OtherReal>
  void AddVec2(const Real alpha, const VectorBase<OtherReal> &v);

  /// rank-two update, this <-- this + alpha (v w' + w v').
@ -243,7 +239,7 @@ class SpMatrix : public PackedMatrix<Real> {
                 const SpMatrix<Real> &S, const Real beta);
  
  /// diagonal update, this <-- this + diag(v)
-  template<class OtherReal>
+  template<typename OtherReal>
  void AddVec(const Real alpha, const VectorBase<OtherReal> &v);

  /// rank-N update:
@ -251,8 +247,9 @@ class SpMatrix : public PackedMatrix<Real> {
  /// (*this) = beta*(*this) + alpha * M * M^T,
  /// or  (if transM == kTrans)
  ///  (*this) = beta*(*this) + alpha * M^T * M
+  /// Note: beta used to default to 0.0.
  void AddMat2(const Real alpha, const MatrixBase<Real> &M,
-               MatrixTransposeType transM, const Real beta = 0.0);
+               MatrixTransposeType transM, const Real beta);

  /// Extension of rank-N update:
  /// this <-- beta*this  +  alpha * M * A * M^T.
@ -286,8 +283,7 @@ class SpMatrix : public PackedMatrix<Real> {
  /// can implement it more efficiently.
  void AddTp2(const Real alpha, const TpMatrix<Real> &T,
              MatrixTransposeType transM, const Real beta = 0.0);
-  
-  
+
  /// Extension of rank-N update:
  /// this <-- beta*this + alpha * M * diag(v) * M^T.
  /// if transM == kTrans, then
@ -381,6 +377,20 @@ float TraceSpSp(const SpMatrix<float> &A, const SpMatrix<float> &B);
 double TraceSpSp(const SpMatrix<double> &A, const SpMatrix<double> &B);


+template<typename Real>  // Free-function wrapper around SpMatrix<Real>::ApproxEqual.
+inline bool ApproxEqual(const SpMatrix<Real> &A,
+                        const SpMatrix<Real> &B, Real tol = 0.01) {  // tol: tolerance relative to the Frobenius norm
+  return  A.ApproxEqual(B, tol);  // the member takes a float tol, so a double tol is converted to float
+}
+
+template<typename Real>  // Asserts, via KALDI_ASSERT, that A and B are approximately equal.
+inline void AssertEqual(const SpMatrix<Real> &A,
+                        const SpMatrix<Real> &B, Real tol = 0.01) {  // tol: forwarded unchanged to ApproxEqual
+  KALDI_ASSERT(ApproxEqual(A, B, tol));  // the check is whatever ApproxEqual(A, B, tol) returns
+}
+
+
+
 /// Returns tr(A B).
 template<typename Real, typename OtherReal>
 Real TraceSpSp(const SpMatrix<Real> &A, const SpMatrix<OtherReal> &B);
@ -419,7 +429,7 @@ Real TraceMatSpMatSp(const MatrixBase<Real> &A, MatrixTransposeType transA,

 /// Returns \f$ v_1^T M v_2 \f$
 /// Not as efficient as it could be where v1 == v2.
-template<class Real>
+template<typename Real>
 Real VecSpVec(const VectorBase<Real> &v1, const SpMatrix<Real> &M,
               const VectorBase<Real> &v2);

@ -461,7 +471,7 @@ struct SolverOptions {
 /// Assumes H positive semidefinite.
 /// Returns the objective-function change.

-template<class Real>
+template<typename Real>
 Real SolveQuadraticProblem(const SpMatrix<Real> &H,
                           const VectorBase<Real> &g,
                           const SolverOptions &opts,
@ -479,7 +489,7 @@ Real SolveQuadraticProblem(const SpMatrix<Real> &H,
 /// diagonal_precondition option is newly added, to handle problems
 /// where different dimensions have very different scaling (we recommend to use
 /// the option but it's set false for back compatibility).
-template<class Real>
+template<typename Real>
 Real SolveQuadraticMatrixProblem(const SpMatrix<Real> &Q,
                                 const MatrixBase<Real> &Y,
                                 const SpMatrix<Real> &P,
@ -490,7 +500,7 @@ Real SolveQuadraticMatrixProblem(const SpMatrix<Real> &Q,
 /// \f[   Q(M) =  tr(M^T G) -0.5 tr(P_1 M Q_1 M^T) -0.5 tr(P_2 M Q_2 M^T).   \f]
 /// Encountered in matrix update with a prior. We also apply a limit on the
 /// condition but it should be less frequently necessary, and can be set larger.
-template<class Real>
+template<typename Real>
 Real SolveDoubleQuadraticMatrixProblem(const MatrixBase<Real> &G,
                                       const SpMatrix<Real> &P1,
                                       const SpMatrix<Real> &P2,
--- a/src/matrix/srfft.cc
+++ b/src/matrix/srfft.cc
@ -31,7 +31,7 @@
 namespace kaldi {


-template<class Real>
+template<typename Real>
 SplitRadixComplexFft<Real>::SplitRadixComplexFft(MatrixIndexT N) {
  if ( (N & (N-1)) != 0 || N <= 1)
    KALDI_ERR << "SplitRadixComplexFft called with invalid number of points "
@ -46,7 +46,7 @@ SplitRadixComplexFft<Real>::SplitRadixComplexFft(MatrixIndexT N) {
  temp_buffer = NULL;
 }

-template<class Real>
+template<typename Real>
 void SplitRadixComplexFft<Real>::ComputeTables() {
  MatrixIndexT    imax, lg2, i, j;
  MatrixIndexT     m, m2, m4, m8, nel, n;
@ -97,7 +97,7 @@ void SplitRadixComplexFft<Real>::ComputeTables() {
  }
 }

-template<class Real>
+template<typename Real>
 SplitRadixComplexFft<Real>::~SplitRadixComplexFft() {
  delete [] brseed;
  if (tab != NULL) {
@ -109,7 +109,7 @@ SplitRadixComplexFft<Real>::~SplitRadixComplexFft() {
    delete [] temp_buffer;
 }

-template<class Real>
+template<typename Real>
 void SplitRadixComplexFft<Real>::Compute(Real *xr, Real *xi, bool forward) const {
  if (!forward) {  // reverse real and imaginary parts for complex FFT.
    Real *tmp = xr;
@ -123,7 +123,7 @@ void SplitRadixComplexFft<Real>::Compute(Real *xr, Real *xi, bool forward) const
  }
 }

-template<class Real>
+template<typename Real>
 void SplitRadixComplexFft<Real>::Compute(Real *x, bool forward) {
  if (temp_buffer == NULL)
    temp_buffer = new Real[N_];
@ -150,7 +150,7 @@ void SplitRadixComplexFft<Real>::Compute(Real *x, bool forward) {
  x[1] = temp_buffer[0];  // special case of i = 0.
 }

-template<class Real>
+template<typename Real>
 void SplitRadixComplexFft<Real>::BitReversePermute(Real *x, MatrixIndexT logm) const {
  MatrixIndexT      i, j, lg2, n;
  MatrixIndexT      off, fj, gno, *brp;
@ -176,7 +176,7 @@ void SplitRadixComplexFft<Real>::BitReversePermute(Real *x, MatrixIndexT logm) c
 }


-template<class Real>
+template<typename Real>
 void SplitRadixComplexFft<Real>::ComputeRecursive(Real *xr, Real *xi, MatrixIndexT logm) const {

  MatrixIndexT    m, m2, m4, m8, nel, n;
@ -321,7 +321,7 @@ void SplitRadixComplexFft<Real>::ComputeRecursive(Real *xr, Real *xi, MatrixInde

 // This code is mostly the same as the RealFft function.  It would be
 // possible to replace it with more efficient code from Rico's book.
-template<class Real>
+template<typename Real>
 void SplitRadixRealFft<Real>::Compute(Real *data, bool forward) {
  MatrixIndexT N = N_, N2 = N/2;
  KALDI_ASSERT(N%2 == 0);
--- a/src/matrix/srfft.h
+++ b/src/matrix/srfft.h
@ -41,7 +41,7 @@ namespace kaldi {
 // Microsoft Corporation
 // This is a more efficient way of doing the complex FFT than ComplexFft
 // above, but it only works for powers of 2.
-template<class Real>
+template<typename Real>
 class SplitRadixComplexFft {
 public:
  typedef MatrixIndexT Integer;
@ -83,7 +83,7 @@ class SplitRadixComplexFft {
  // data.
 };

-template<class Real>
+template<typename Real>
 class SplitRadixRealFft: private SplitRadixComplexFft<Real> {
 public:
  SplitRadixRealFft(MatrixIndexT N):  // will fail unless N>=4 and N is a power of 2.
--- a/src/matrix/tp-matrix.cc
+++ b/src/matrix/tp-matrix.cc
@ -69,7 +69,7 @@ void TpMatrix<Real>::Invert() {
 }

 /*
-template<class Real>
+template<typename Real>
 void TpMatrix<Real>::Invert() {
  Matrix<Real> tmp(*this);
  tmp.Invert();
@ -127,7 +127,7 @@ void TpMatrix<Real>::Cholesky(const SpMatrix<Real> &orig) {
 }

 template<typename Real>
-void TpMatrix<Real>::CopyFromMat(MatrixBase<Real> &M,
+void TpMatrix<Real>::CopyFromMat(const MatrixBase<Real> &M,
                                 MatrixTransposeType Trans) {
  if (Trans == kNoTrans) {
    KALDI_ASSERT(this->NumRows() == M.NumRows() && M.NumRows() == M.NumCols());
--- a/src/matrix/tp-matrix.h
+++ b/src/matrix/tp-matrix.h
@ -2,6 +2,7 @@

 // Copyright 2009-2011  Ondrej Glembek;  Lukas Burget;  Microsoft Corporation;
 //                      Saarland University;  Yanmin Qian;   Haihua Xu
+//                2013  Johns Hopkins University (author: Daniel Povey)


 // See ../../COPYING for clarification regarding multiple authors
@ -33,15 +34,22 @@ template<typename Real> class TpMatrix;
 /// @brief Packed symetric matrix class
 template<typename Real>
 class TpMatrix : public PackedMatrix<Real> {
+  friend class CuTpMatrix<float>;
+  friend class CuTpMatrix<double>;
 public:
  TpMatrix() : PackedMatrix<Real>() {}
  explicit TpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero)
      : PackedMatrix<Real>(r, resize_type) {}
  TpMatrix(const TpMatrix<Real>& Orig) : PackedMatrix<Real>(Orig) {}
-  template<class OtherReal> explicit TpMatrix(const TpMatrix<OtherReal>& Orig)
-      : PackedMatrix<Real>(Orig) {}
-  ~TpMatrix() {}

+  /// Copy constructor from CUDA TpMatrix
+  /// This is defined in ../cudamatrix/cu-tp-matrix.cc
+  explicit TpMatrix(const CuTpMatrix<Real> &cu);
+  
+  
+  template<typename OtherReal> explicit TpMatrix(const TpMatrix<OtherReal>& Orig)
+      : PackedMatrix<Real>(Orig) {}
+  
  Real operator() (MatrixIndexT r, MatrixIndexT c) const {
    if (static_cast<UnsignedMatrixIndexT>(c) >
        static_cast<UnsignedMatrixIndexT>(r)) {
@ -85,15 +93,18 @@ class TpMatrix : public PackedMatrix<Real> {

  /// CopyFromMat copies the lower triangle of M into *this
  /// (or the upper triangle, if Trans == kTrans).
-  void CopyFromMat(MatrixBase<Real> &M,
+  void CopyFromMat(const MatrixBase<Real> &M,
                   MatrixTransposeType Trans = kNoTrans);

-  /// CopyFromTp copies andother triangular matrix into this one.
+  /// This is implemented in ../cudamatrix/cu-tp-matrix.cc
+  void CopyFromMat(const CuTpMatrix<Real> &other);
+  
+  /// CopyFromTp copies another triangular matrix into this one.
  void CopyFromTp(const TpMatrix<Real> &other) {
    PackedMatrix<Real>::CopyFromPacked(other);
  }

-  template<class OtherReal> void CopyFromTp(const TpMatrix<OtherReal> &other) {
+  template<typename OtherReal> void CopyFromTp(const TpMatrix<OtherReal> &other) {
    PackedMatrix<Real>::CopyFromPacked(other);
  }

--- a/src/nnet/Makefile
+++ b/src/nnet/Makefile
@ -5,6 +5,7 @@ all:
 include ../kaldi.mk

 LDFLAGS += $(CUDA_LDFLAGS)
+LDLIBS += $(CUDA_LDLIBS)

 TESTFILES = nnet-test nnet-randomizer-test

--- a/src/nnet/nnet-activation.h
+++ b/src/nnet/nnet-activation.h
@ -41,7 +41,7 @@ class Softmax : public Component {

  void PropagateFnc(const CuMatrix<BaseFloat> &in, CuMatrix<BaseFloat> *out) {
    // y = e^x_j/sum_j(e^x_j)
-    out->Softmax(in);
+    out->ApplySoftMaxPerRow(in);
  }

  void BackpropagateFnc(const CuMatrix<BaseFloat> &in, const CuMatrix<BaseFloat> &out,
--- a/src/nnet/nnet-cache-conf.h
+++ b/src/nnet/nnet-cache-conf.h
@ -96,7 +96,7 @@ class CacheConf {
  Vector<BaseFloat> confidence_leftover_;

  std::vector<int32> randmask_;
-  CuStlVector<int32> randmask_device_;
+  CuArray<int32> randmask_device_;

 }; 
 
--- a/src/nnet/nnet-cache-tgtmat.h
+++ b/src/nnet/nnet-cache-tgtmat.h
@ -94,7 +94,7 @@ class CacheTgtMat {
  CuMatrix<BaseFloat> targets_leftover_;  ///< Desired vector cache

  std::vector<int32> randmask_;
-  CuStlVector<int32> randmask_device_;
+  CuArray<int32> randmask_device_;

 }; 
 
--- a/src/nnet/nnet-cache.h
+++ b/src/nnet/nnet-cache.h
@ -94,7 +94,7 @@ class Cache {
  std::vector<int32> targets_leftover_;  ///< Desired vector cache

  std::vector<int32> randmask_;
-  CuStlVector<int32> randmask_device_;
+  CuArray<int32> randmask_device_;

 }; 
 
--- a/src/nnet/nnet-loss-prior.h
+++ b/src/nnet/nnet-loss-prior.h
@ -23,7 +23,7 @@
 #include "base/kaldi-common.h"
 #include "cudamatrix/cu-matrix.h"
 #include "cudamatrix/cu-vector.h"
-#include "cudamatrix/cu-stlvector.h"
+#include "cudamatrix/cu-array.h"

 namespace kaldi {
 namespace nnet1 {
@ -67,10 +67,10 @@ class XentPrior {
  double frames_scaled_nosil_;
  double correct_scaled_nosil_;

-  CuStlVector<int32> max_id_;
+  CuArray<int32> max_id_;
  std::vector<int32> max_id_host_;

-  CuStlVector<int32>  target_device_;
+  CuArray<int32>  target_device_;
  CuVector<BaseFloat> log_post_tgt_;
  Vector<BaseFloat>   log_post_tgt_host_;

--- a/src/nnet/nnet-loss.h
+++ b/src/nnet/nnet-loss.h
@ -24,7 +24,7 @@
 #include "util/kaldi-holder.h"
 #include "cudamatrix/cu-matrix.h"
 #include "cudamatrix/cu-vector.h"
-#include "cudamatrix/cu-stlvector.h"
+#include "cudamatrix/cu-array.h"

 namespace kaldi {
 namespace nnet1 {
@ -61,16 +61,17 @@ class Xent {
  std::vector<float> loss_vec_;

  // loss computation buffers
-  CuStlVector<int32>  target_device_;
+  CuArray<int32>  target_device_;
+
  CuVector<BaseFloat> log_post_tgt_;
  Vector<BaseFloat>   log_post_tgt_host_;
  CuMatrix<BaseFloat> tgt_mat_device_;
  CuMatrix<BaseFloat> xentropy_aux_;

  // frame classification buffers 
-  CuStlVector<int32> max_id_out_;
+  CuArray<int32> max_id_out_;
  std::vector<int32> max_id_out_host_;
-  CuStlVector<int32> max_id_tgt_;
+  CuArray<int32> max_id_tgt_;
  std::vector<int32> max_id_tgt_host_;

 };
--- a/src/nnet/nnet-randomizer.cc
+++ b/src/nnet/nnet-randomizer.cc
@ -76,7 +76,7 @@ void MatrixRandomizer::Randomize(const std::vector<int32>& mask) {
  // Use auxiliary buffer for unshuffled data
  CuMatrix<BaseFloat> data_aux(data_);
  // Put the mask to GPU 
-  CuStlVector<int32> mask_in_gpu(mask.size());
+  CuArray<int32> mask_in_gpu(mask.size());
  mask_in_gpu.CopyFromVec(mask);
  // randomize the data, mask is used to index rows in source matrix
  cu::Randomize(data_aux, mask_in_gpu, &data_);
--- a/src/nnet/nnet-various.h
+++ b/src/nnet/nnet-various.h
@ -155,7 +155,7 @@ class Splice : public Component {
  }

 protected:
-  CuStlVector<int32> frame_offsets_;
+  CuArray<int32> frame_offsets_;
 };


@ -218,7 +218,7 @@ class CopyComponent: public Component {
  }

 protected:
-  CuStlVector<int32> copy_from_indices_;
+  CuArray<int32> copy_from_indices_;
 };


--- a/src/nnetbin/Makefile
+++ b/src/nnetbin/Makefile
@ -4,6 +4,7 @@ EXTRA_CXXFLAGS = -Wno-sign-compare
 include ../kaldi.mk

 LDFLAGS += $(CUDA_LDFLAGS)
+LDLIBS += $(CUDA_LDLIBS)

 BINFILES = nnet-train-frmshuff \
        nnet-train-xent-hardlab-perutt \
--- a/src/nnetbin/nnet-forward.cc
+++ b/src/nnetbin/nnet-forward.cc
@ -51,13 +51,8 @@ int main(int argc, char *argv[]) {
    bool apply_log = false;
    po.Register("apply-log", &apply_log, "Transform MLP output to logscale");

-#if HAVE_CUDA==1
-    int32 use_gpu_id=-2;
-    po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic selection, -1 disable GPU, 0..N select GPU)");
-#else
-    int32 use_gpu_id=0;
-    po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
-#endif
+    std::string use_gpu="no";
+    po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA"); 

    po.Read(argc, argv);

@ -76,7 +71,7 @@ int main(int argc, char *argv[]) {

    //Select the GPU
 #if HAVE_CUDA==1
-    CuDevice::Instantiate().SelectGpuId(use_gpu_id);
+    CuDevice::Instantiate().SelectGpuId(use_gpu);
 #endif

    Nnet nnet_transf;
--- a/src/nnetbin/nnet-train-frmshuff.cc
+++ b/src/nnetbin/nnet-train-frmshuff.cc
@ -60,13 +60,8 @@ int main(int argc, char *argv[]) {
    std::string frame_weights;
    po.Register("frame-weights", &frame_weights, "Per-frame weights to scale gradients (frame selection/weighting).");

-#if HAVE_CUDA==1
-    int32 use_gpu_id=-2;
-    po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic selection, -1 disable GPU, 0..N select GPU)");
-#else
-    int32 use_gpu_id=0;
-    po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
-#endif
+    std::string use_gpu="yes";
+    po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA"); 
    
    po.Read(argc, argv);

@ -90,7 +85,7 @@ int main(int argc, char *argv[]) {

    //Select the GPU
 #if HAVE_CUDA==1
-    CuDevice::Instantiate().SelectGpuId(use_gpu_id);
+    CuDevice::Instantiate().SelectGpuId(use_gpu);
 #endif

    Nnet nnet_transf;
--- a/src/nnetbin/nnet-train-mmi-sequential.cc
+++ b/src/nnetbin/nnet-train-mmi-sequential.cc
@ -128,16 +128,9 @@ int main(int argc, char *argv[]) {
    po.Register("drop-frames", &drop_frames, 
                "Drop frames, where is zero den-posterior under numerator path "
                "(ie. path not in lattice)");
-    

-#if HAVE_CUDA == 1
-    kaldi::int32 use_gpu_id=-2;
-    po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID "
-                "(-2 automatic selection, -1 disable GPU, 0..N select GPU)");
-#else
-    int32 use_gpu_id=0;
-    po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
-#endif
+    std::string use_gpu="yes";
+    po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA"); 

    po.Read(argc, argv);

@ -162,7 +155,7 @@ int main(int argc, char *argv[]) {

    // Select the GPU
 #if HAVE_CUDA == 1
-    CuDevice::Instantiate().SelectGpuId(use_gpu_id);
+    CuDevice::Instantiate().SelectGpuId(use_gpu);
 #endif

    Nnet nnet_transf;
@ -257,7 +250,7 @@ int main(int argc, char *argv[]) {
      if (old_acoustic_scale != 1.0) {
        fst::ScaleLattice(fst::AcousticLatticeScale(old_acoustic_scale), &den_lat);
      }
-      // optionaly sort it topologically
+      // optionally sort it topologically
      kaldi::uint64 props = den_lat.Properties(fst::kFstProperties, false);
      if (!(props & fst::kTopSorted)) {
        if (fst::TopSort(&den_lat) == false)
--- a/src/nnetbin/nnet-train-mpe-sequential.cc
+++ b/src/nnetbin/nnet-train-mpe-sequential.cc
@ -129,15 +129,9 @@ int main(int argc, char *argv[]) {
    po.Register("do-smbr", &do_smbr, "Use state-level accuracies instead of "
                "phone accuracies.");

-#if HAVE_CUDA == 1
-    kaldi::int32 use_gpu_id=-2;
-    po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID "
-                "(-2 automatic selection, -1 disable GPU, 0..N select GPU)");
-#else
-    int32 use_gpu_id=0;
-    po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
-#endif
-
+    std::string use_gpu="yes";
+    po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA");
+     
    po.Read(argc, argv);

    if (po.NumArgs() != 6) {
@ -164,7 +158,7 @@ int main(int argc, char *argv[]) {

    // Select the GPU
 #if HAVE_CUDA == 1
-    CuDevice::Instantiate().SelectGpuId(use_gpu_id);
+    CuDevice::Instantiate().SelectGpuId(use_gpu);
 #endif

    Nnet nnet_transf;
@ -248,7 +242,7 @@ int main(int argc, char *argv[]) {
        fst::ScaleLattice(fst::AcousticLatticeScale(old_acoustic_scale),
                          &den_lat);
      }
-      // optionaly sort it topologically
+      // optionally sort it topologically
      kaldi::uint64 props = den_lat.Properties(fst::kFstProperties, false);
      if (!(props & fst::kTopSorted)) {
        if (fst::TopSort(&den_lat) == false)
--- a/Показать больше
+++ b/Показать больше