зеркало из https://github.com/mozilla/kaldi.git
syncing 'sandbox/dan2->trunk' of src/matrix,src/cudamatrix
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@3194 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Родитель
c5bba859e4
Коммит
c3a5fa2187
|
@ -18,7 +18,7 @@ align_to_lats=false # optionally produce alignment in lattice format
|
|||
lats_decode_opts="--acoustic-scale=0.1 --beam=20 --latbeam=10"
|
||||
lats_graph_scales="--transition-scale=1.0 --self-loop-scale=0.1"
|
||||
|
||||
use_gpu_id=-1 # disable gpu
|
||||
use_gpu="no" # yes|no|optional
|
||||
# End configuration options.
|
||||
|
||||
[ $# -gt 0 ] && echo "$0 $@" # Print the command line for logging
|
||||
|
@ -76,7 +76,7 @@ if [ -f $srcdir/delta_order ]; then
|
|||
feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
|
||||
fi
|
||||
# Finally add feature_transform and the MLP
|
||||
feats="$feats nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet ark:- ark:- |"
|
||||
feats="$feats nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $nnet ark:- ark:- |"
|
||||
|
||||
|
||||
echo "$0: aligning data '$data' using nnet/model '$srcdir', putting alignments in '$dir'"
|
||||
|
|
|
@ -25,7 +25,7 @@ scoring_opts="--min-lmwt 4 --max-lmwt 15"
|
|||
|
||||
num_threads=1 # if >1, will use latgen-faster-parallel
|
||||
parallel_opts="-pe smp $((num_threads+1))" # use 2 CPUs (1 DNN-forward, 1 decoder)
|
||||
use_gpu_id=-1 # -1 disable gpu
|
||||
use_gpu="no" # yes|no|optional
|
||||
# End configuration section.
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
@ -104,7 +104,7 @@ fi
|
|||
# Run the decoding in the queue
|
||||
if [ $stage -le 0 ]; then
|
||||
$cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
|
||||
nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet "$feats" ark:- \| \
|
||||
nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $nnet "$feats" ark:- \| \
|
||||
latgen-faster-mapped$thread_string --max-active=$max_active --max-mem=$max_mem --beam=$beam \
|
||||
--lattice-beam=$latbeam --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
|
||||
$model $graphdir/HCLG.fst ark:- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
|
||||
|
|
|
@ -50,8 +50,6 @@ splice_step=1 # Stepsize of the splicing (1 is consecutive splice,
|
|||
# value 2 would do [ -10 -8 -6 -4 -2 0 2 4 6 8 10 ] splicing)
|
||||
# misc.
|
||||
verbose=1 # enable per-cache reports
|
||||
# gpu config
|
||||
use_gpu_id= # manually select GPU id to run on, (-1 disables GPU)
|
||||
# End configuration.
|
||||
|
||||
echo "$0 $@" # Print the command line for logging
|
||||
|
@ -172,7 +170,7 @@ else
|
|||
feature_transform_old=$feature_transform
|
||||
feature_transform=${feature_transform%.nnet}_cmvn-g.nnet
|
||||
echo "Renormalizing MLP input features into $feature_transform"
|
||||
nnet-forward ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
|
||||
nnet-forward --use-gpu=yes \
|
||||
$feature_transform_old "$(echo $feats | sed 's|train.scp|train.scp.10k|')" \
|
||||
ark:- 2>$dir/log/cmvn_glob_fwd.log |\
|
||||
compute-cmvn-stats ark:- - | cmvn-to-nnet - - |\
|
||||
|
@ -186,7 +184,7 @@ fi
|
|||
|
||||
|
||||
###### GET THE DIMENSIONS ######
|
||||
num_fea=$(feat-to-dim --print-args=false "$feats nnet-forward --use-gpu-id=-1 $feature_transform ark:- ark:- |" - 2>/dev/null)
|
||||
num_fea=$(feat-to-dim --print-args=false "$feats nnet-forward --use-gpu=no $feature_transform ark:- ark:- |" - 2>/dev/null)
|
||||
num_hid=$hid_dim
|
||||
|
||||
|
||||
|
@ -208,14 +206,14 @@ for depth in $(seq 1 $nn_depth); do
|
|||
rbm-train-cd1-frmshuff --learn-rate=$rbm_lrate_low --l2-penalty=$rbm_l2penalty \
|
||||
--num-iters=$((2*$rbm_iter)) --drop-data=$rbm_drop_data --verbose=$verbose \
|
||||
--feature-transform=$feature_transform \
|
||||
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} $rbm_extra_opts \
|
||||
$rbm_extra_opts \
|
||||
$RBM.init "$feats" $RBM 2>$dir/log/rbm.$depth.log || exit 1
|
||||
else
|
||||
#This is Bernoulli-Bernoulli RBM
|
||||
#cmvn stats for init
|
||||
echo "Computing cmvn stats '$dir/$depth.cmvn' for RBM initialization"
|
||||
if [ ! -f $dir/$depth.cmvn ]; then
|
||||
nnet-forward ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
|
||||
nnet-forward --use-gpu=yes \
|
||||
"nnet-concat $feature_transform $dir/$((depth-1)).dbn - |" \
|
||||
"$(echo $feats | sed 's|train.scp|train.scp.10k|')" \
|
||||
ark:- 2>$dir/log/cmvn_fwd.$depth.log | \
|
||||
|
@ -232,7 +230,7 @@ for depth in $(seq 1 $nn_depth); do
|
|||
rbm-train-cd1-frmshuff --learn-rate=$rbm_lrate --l2-penalty=$rbm_l2penalty \
|
||||
--num-iters=$rbm_iter --drop-data=$rbm_drop_data --verbose=$verbose \
|
||||
--feature-transform="nnet-concat $feature_transform $dir/$((depth-1)).dbn - |" \
|
||||
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} $rbm_extra_opts \
|
||||
$rbm_extra_opts \
|
||||
$RBM.init "$feats" $RBM 2>$dir/log/rbm.$depth.log || exit 1
|
||||
fi
|
||||
|
||||
|
|
|
@ -46,7 +46,6 @@ train_opts= # options, passed to the training script
|
|||
train_tool= # optionally change the training tool
|
||||
|
||||
# OTHER
|
||||
use_gpu_id= # manually select GPU id to run on, (-1 disables GPU)
|
||||
analyze_alignments=true # run the alignment analysis script
|
||||
seed=777 # seed value used for training data shuffling and initialization
|
||||
# End configuration.
|
||||
|
@ -258,7 +257,7 @@ else
|
|||
feature_transform_old=$feature_transform
|
||||
feature_transform=${feature_transform%.nnet}_cmvn-g.nnet
|
||||
echo "Renormalizing MLP input features into $feature_transform"
|
||||
nnet-forward ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
|
||||
nnet-forward --use-gpu=yes \
|
||||
$feature_transform_old "$(echo $feats_tr | sed 's|train.scp|train.scp.10k|')" \
|
||||
ark:- 2>$dir/log/nnet-forward-cmvn.log |\
|
||||
compute-cmvn-stats ark:- - | cmvn-to-nnet - - |\
|
||||
|
@ -315,7 +314,6 @@ steps/train_nnet_scheduler.sh \
|
|||
${train_opts} \
|
||||
${train_tool:+ --train-tool "$train_tool"} \
|
||||
${config:+ --config $config} \
|
||||
${use_gpu_id:+ --use-gpu-id $use_gpu_id} \
|
||||
$mlp_init "$feats_tr" "$feats_cv" "$labels_tr" "$labels_cv" $dir || exit 1
|
||||
|
||||
|
||||
|
|
|
@ -21,7 +21,6 @@ learn_rate=0.00001
|
|||
halving_factor=1.0 #ie. disable halving
|
||||
drop_frames=true
|
||||
verbose=1
|
||||
use_gpu_id=
|
||||
|
||||
seed=777 # seed value used for training data shuffling
|
||||
# End configuration section
|
||||
|
@ -168,7 +167,6 @@ while [ $x -le $num_iters ]; do
|
|||
--learn-rate=$learn_rate \
|
||||
--drop-frames=$drop_frames \
|
||||
--verbose=$verbose \
|
||||
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
|
||||
$cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet || exit 1
|
||||
fi
|
||||
cur_mdl=$dir/$x.nnet
|
||||
|
|
|
@ -21,7 +21,6 @@ halving_factor=1.0 #ie. disable halving
|
|||
do_smbr=true
|
||||
use_silphones=false #setting this to something will enable giving siphones to nnet-mpe
|
||||
verbose=1
|
||||
use_gpu_id=
|
||||
|
||||
seed=777 # seed value used for training data shuffling
|
||||
# End configuration section
|
||||
|
@ -151,7 +150,6 @@ while [ $x -le $num_iters ]; do
|
|||
--do-smbr=$do_smbr \
|
||||
--verbose=$verbose \
|
||||
$mpe_silphones_arg \
|
||||
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
|
||||
$cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet || exit 1
|
||||
fi
|
||||
cur_mdl=$dir/$x.nnet
|
||||
|
|
|
@ -25,8 +25,6 @@ end_halving_inc=0.1
|
|||
halving_factor=0.5
|
||||
# misc.
|
||||
verbose=1
|
||||
# gpu
|
||||
use_gpu_id=
|
||||
# tool
|
||||
train_tool="nnet-train-xent-hardlab-frmshuff"
|
||||
|
||||
|
@ -73,7 +71,6 @@ mlp_base=${mlp_init##*/}; mlp_base=${mlp_base%.*}
|
|||
$train_tool --cross-validate=true \
|
||||
--bunchsize=$bunch_size --cachesize=$cache_size --verbose=$verbose \
|
||||
${feature_transform:+ --feature-transform=$feature_transform} \
|
||||
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
|
||||
$mlp_best "$feats_cv" "$labels_cv" \
|
||||
2> $dir/log/prerun.log || exit 1;
|
||||
|
||||
|
@ -97,7 +94,6 @@ for iter in $(seq -w $max_iters); do
|
|||
--learn-rate=$learn_rate --momentum=$momentum --l1-penalty=$l1_penalty --l2-penalty=$l2_penalty \
|
||||
--bunchsize=$bunch_size --cachesize=$cache_size --randomize=true --verbose=$verbose \
|
||||
${feature_transform:+ --feature-transform=$feature_transform} \
|
||||
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
|
||||
${seed:+ --seed=$seed} \
|
||||
$mlp_best "$feats_tr" "$labels_tr" $mlp_next \
|
||||
2> $dir/log/iter$iter.log || exit 1;
|
||||
|
@ -110,7 +106,6 @@ for iter in $(seq -w $max_iters); do
|
|||
$train_tool --cross-validate=true \
|
||||
--bunchsize=$bunch_size --cachesize=$cache_size --verbose=$verbose \
|
||||
${feature_transform:+ --feature-transform=$feature_transform} \
|
||||
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
|
||||
$mlp_next "$feats_cv" "$labels_cv" \
|
||||
2>>$dir/log/iter$iter.log || exit 1;
|
||||
|
||||
|
|
|
@ -9,12 +9,16 @@ OPENFST_LDLIBS =
|
|||
include ../kaldi.mk
|
||||
|
||||
LDFLAGS += $(CUDA_LDFLAGS)
|
||||
LDLIBS += $(CUDA_LDLIBS)
|
||||
|
||||
TESTFILES = cuda-matrix-test
|
||||
TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test cu-packed-matrix-test cu-tp-matrix-test \
|
||||
cu-block-matrix-test cu-matrix-speed-test cu-vector-speed-test cu-sp-matrix-speed-test cu-array-test
|
||||
|
||||
OBJFILES = cu-device.o cu-math.o cu-matrix.o
|
||||
|
||||
OBJFILES = cu-device.o cu-math.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \
|
||||
cu-vector.o cu-common.o cu-tp-matrix.o cu-rand.o cu-block-matrix.o
|
||||
ifeq ($(CUDA), true)
|
||||
OBJFILES += cu-kernels.o cu-randkernels.o
|
||||
OBJFILES += cu-kernels.o cu-randkernels.o cu-choleskykernels.o
|
||||
endif
|
||||
|
||||
LIBNAME = kaldi-cudamatrix
|
||||
|
|
|
@ -0,0 +1,208 @@
|
|||
// cudamatrix/cu-array-inl.h
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
|
||||
#ifndef KALDI_CUDAMATRIX_CU_ARRAY_INL_H_
|
||||
#define KALDI_CUDAMATRIX_CU_ARRAY_INL_H_
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
#include <cuda_runtime_api.h>
|
||||
#include "cudamatrix/cu-common.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
#include "cudamatrix/cu-kernels.h"
|
||||
#endif
|
||||
|
||||
#include "util/timer.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
template<typename T>
|
||||
void CuArray<T>::Resize(MatrixIndexT dim, MatrixResizeType resize_type) {
|
||||
KALDI_ASSERT((resize_type == kSetZero || resize_type == kUndefined) && dim >= 0);
|
||||
if (dim_ == dim) {
|
||||
if (resize_type == kSetZero)
|
||||
SetZero();
|
||||
return;
|
||||
}
|
||||
|
||||
Destroy();
|
||||
|
||||
if (dim == 0) return;
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
CU_SAFE_CALL(cudaMalloc((void**)&data_, dim*sizeof(T)));
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
data_ = static_cast<T*>(malloc(dim * sizeof(T)));
|
||||
// We allocate with malloc because we don't want constructors being called.
|
||||
// We basically ignore memory alignment issues here-- we assume the malloc
|
||||
// implementation is forgiving enough that it will automatically align on
|
||||
// sensible boundaries.
|
||||
if (data_ == 0)
|
||||
KALDI_ERR << "Memory allocation failed when initializing CuVector "
|
||||
<< "with dimension " << dim << " object size in bytes: "
|
||||
<< sizeof(T);
|
||||
}
|
||||
|
||||
dim_ = dim;
|
||||
if (resize_type == kSetZero)
|
||||
SetZero();
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void CuArray<T>::Destroy() {
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
if (data_ != NULL) {
|
||||
CU_SAFE_CALL(cudaFree(data_));
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
if (data_ != NULL)
|
||||
free(data_);
|
||||
}
|
||||
dim_ = 0;
|
||||
data_ = NULL;
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
void CuArray<T>::CopyFromVec(const std::vector<T> &src) {
|
||||
Resize(src.size(), kUndefined);
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
CU_SAFE_CALL(cudaMemcpy(data_, &src.front(), src.size()*sizeof(T), cudaMemcpyHostToDevice));
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
memcpy(data_, &src.front(), src.size()*sizeof(T));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename T>
|
||||
void CuArray<T>::CopyToVec(std::vector<T> *dst) const {
|
||||
if (static_cast<MatrixIndexT>(dst->size()) != dim_) {
|
||||
dst->resize(dim_);
|
||||
}
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
CU_SAFE_CALL(cudaMemcpy(&dst->front(), Data(), dim_*sizeof(T), cudaMemcpyDeviceToHost));
|
||||
CuDevice::Instantiate().AccuProfile("CuArray::CopyToVecD2H", tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
memcpy(&dst->front(), data_, dim_*sizeof(T));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
void CuArray<T>::SetZero() {
|
||||
if (dim_ == 0) return;
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
CU_SAFE_CALL(cudaMemset(data_, 0, dim_ * sizeof(T)));
|
||||
CuDevice::Instantiate().AccuProfile("CuArray::SetZero", tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
memset(static_cast<void*>(data_), 0, dim_ * sizeof(T));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Print the vector to stream
|
||||
*/
|
||||
template<typename T>
|
||||
std::ostream &operator << (std::ostream &out, const CuArray<T> &vec) {
|
||||
std::vector<T> tmp;
|
||||
vec.CopyToVec(&tmp);
|
||||
out << "[";
|
||||
for(int32 i=0; i<tmp.size(); i++) {
|
||||
out << " " << tmp[i];
|
||||
}
|
||||
out << " ]\n";
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
template<class T>
|
||||
inline void CuArray<T>::Set(const T &value) {
|
||||
// This is not implemented yet, we'll do so if it's needed.
|
||||
KALDI_ERR << "CuArray<T>::Set not implemented yet for this type.";
|
||||
}
|
||||
|
||||
template<>
|
||||
inline void CuArray<int32>::Set(const int32 &value) {
|
||||
if (dim_ == 0) return;
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
dim3 dimBlock(CU2DBLOCK);
|
||||
dim3 dimGrid(n_blocks(Dim(), CU2DBLOCK));
|
||||
::MatrixDim d = { 1, Dim(), Dim() };
|
||||
|
||||
cudaI32_set_const(dimGrid, dimBlock, data_, value, d);
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
for (int32 i = 0; i < dim_; i++)
|
||||
data_[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void CuArray<T>::CopyFromArray(const CuArray<T> &src) {
|
||||
this->Resize(src.Dim(), kUndefined);
|
||||
if (dim_ == 0) return;
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
CU_SAFE_CALL(cudaMemcpy(this->data_, src.data_, dim_ * sizeof(T),
|
||||
cudaMemcpyDeviceToDevice));
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
memcpy(this->data_, src.data_, dim_ * sizeof(T));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
#endif
|
|
@ -0,0 +1,124 @@
|
|||
// cudamatrix/cu-array-test.cc
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <vector>
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "cudamatrix/cu-array.h"
|
||||
|
||||
using namespace kaldi;
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
|
||||
|
||||
template<class T>
|
||||
static void UnitTestCuArray() {
|
||||
for (int32 i = 0; i < 30; i++) {
|
||||
int32 size = rand() % 5;
|
||||
size = size * size * size; // Have a good distribution of sizes, including >256.
|
||||
int32 size2 = rand() % 4;
|
||||
std::vector<T> vec(size);
|
||||
std::vector<T> garbage_vec(size2); // We just use garbage_vec to make sure
|
||||
// we sometimes resize from empty,
|
||||
// sometimes not.
|
||||
|
||||
int32 byte_size = size * sizeof(T);
|
||||
std::vector<char> rand_c(byte_size);
|
||||
for (size_t i = 0; i < byte_size; i++)
|
||||
rand_c[i] = rand() % 256;
|
||||
if (!vec.empty()) {
|
||||
std::memcpy((void*)&(vec[0]), (void*)&(rand_c[0]),
|
||||
byte_size);
|
||||
}
|
||||
|
||||
{ // test constructor from vector and CopyToVec.
|
||||
CuArray<T> cu_vec(vec);
|
||||
std::vector<T> vec2;
|
||||
cu_vec.CopyToVec(&vec2);
|
||||
KALDI_ASSERT(vec2 == vec);
|
||||
}
|
||||
|
||||
{ // test assignment operator from CuArray.
|
||||
CuArray<T> cu_vec(vec);
|
||||
CuArray<T> cu_vec2(garbage_vec);
|
||||
cu_vec2 = cu_vec;
|
||||
std::vector<T> vec2;
|
||||
cu_vec2.CopyToVec(&vec2);
|
||||
KALDI_ASSERT(vec2 == vec);
|
||||
KALDI_ASSERT(cu_vec2.Dim() == int32(vec2.size())); // test Dim()
|
||||
}
|
||||
|
||||
{ // test resize with resize_type = kSetZero.
|
||||
CuArray<T> cu_vec(vec);
|
||||
cu_vec.Resize(size, kSetZero);
|
||||
std::vector<T> vec2(vec);
|
||||
|
||||
if (!vec2.empty())
|
||||
std::memset(&(vec2[0]), 0, vec2.size() * sizeof(T));
|
||||
std::vector<T> vec3;
|
||||
cu_vec.CopyToVec(&vec3);
|
||||
KALDI_ASSERT(vec2 == vec3); // testing equality of zero arrays.
|
||||
}
|
||||
|
||||
if (sizeof(T) == sizeof(int32) && size > 0) { // test Set for type int32, or same size.
|
||||
CuArray<T> cu_vec(vec);
|
||||
cu_vec.Set(vec[0]);
|
||||
for (size_t i = 1; i < vec.size(); i++) vec[i] = vec[0];
|
||||
std::vector<T> vec2;
|
||||
cu_vec.CopyToVec(&vec2);
|
||||
KALDI_ASSERT(vec2 == vec);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
int main() {
|
||||
for (int32 loop = 0; loop < 2; loop++) {
|
||||
#if HAVE_CUDA == 1
|
||||
if (loop == 0)
|
||||
CuDevice::Instantiate().SelectGpuId("no");
|
||||
else
|
||||
CuDevice::Instantiate().SelectGpuId("yes");
|
||||
#endif
|
||||
|
||||
//kaldi::UnitTestCuArray<float>();
|
||||
kaldi::UnitTestCuArray<double>();
|
||||
kaldi::UnitTestCuArray<int32>();
|
||||
kaldi::UnitTestCuArray<std::pair<int32, int32> >();
|
||||
|
||||
if (loop == 0)
|
||||
KALDI_LOG << "Tests without GPU use succeeded.\n";
|
||||
else
|
||||
KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
|
||||
}
|
||||
#if HAVE_CUDA == 1
|
||||
CuDevice::Instantiate().PrintProfile();
|
||||
#endif
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,123 @@
|
|||
// cudamatrix/cu-array.h
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
|
||||
#ifndef KALDI_CUDAMATRIX_CU_ARRAY_H_
|
||||
#define KALDI_CUDAMATRIX_CU_ARRAY_H_
|
||||
|
||||
#include "matrix/kaldi-vector.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
/**
|
||||
* std::vector equivalent for CUDA computing. This class is mostly intended as
|
||||
* a CUDA-based mirror of a std::vector object that lives on the CPU. We don't
|
||||
* call constructors, initializers, etc., on the GPU.
|
||||
*/
|
||||
template<typename T>
|
||||
class CuArray {
|
||||
typedef CuArray<T> ThisType;
|
||||
public:
|
||||
|
||||
/// Default Constructor
|
||||
CuArray<T>() : dim_(0), data_(NULL) { }
|
||||
|
||||
/// Constructor with memory initialisation. resize_type may be kSetZero or
|
||||
/// kUndefined.
|
||||
explicit CuArray<T>(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero):
|
||||
dim_(0), data_(NULL) { Resize(dim, resize_type); }
|
||||
|
||||
/// Constructor from CPU-based int vector
|
||||
explicit CuArray<T>(const std::vector<T> &src):
|
||||
dim_(0), data_(NULL) { CopyFromVec(src); }
|
||||
|
||||
explicit CuArray<T>(const CuArray<T> &src):
|
||||
dim_(0), data_(NULL) { CopyFromArray(src); }
|
||||
|
||||
/// Destructor
|
||||
~CuArray() { Destroy(); }
|
||||
|
||||
/// Return the vector dimension
|
||||
MatrixIndexT Dim() const { return dim_; }
|
||||
|
||||
/// Get raw pointer
|
||||
const T* Data() const { return data_; }
|
||||
|
||||
T* Data() { return data_; }
|
||||
|
||||
/// Allocate the memory. resize_type may be kSetZero or kUndefined.
|
||||
/// kCopyData not yet supported (can be implemented if needed).
|
||||
void Resize(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero);
|
||||
|
||||
/// Deallocate the memory and set dim_ and data_ to zero. Does not call any
|
||||
/// destructors of the objects stored.
|
||||
void Destroy();
|
||||
|
||||
/// This function resizes if needed. Note: copying to GPU is done via memcpy,
|
||||
/// and any constructors or assignment operators are not called.
|
||||
void CopyFromVec(const std::vector<T> &src);
|
||||
|
||||
/// This function resizes if needed.
|
||||
void CopyFromArray(const CuArray<T> &src);
|
||||
|
||||
/// This function resizes *dst if needed. On resize of "dst", the STL vector
|
||||
/// may call copy-constructors, initializers, and assignment operators for
|
||||
/// existing objects (which will be overwritten), but the copy from GPU to CPU
|
||||
/// is done via memcpy. So be very careful calling this function if your
|
||||
/// objects are more than plain structs.
|
||||
void CopyToVec(std::vector<T> *dst) const;
|
||||
|
||||
/// Sets the memory for the object to zero, via memset. You should verify
|
||||
/// that this makes sense for type T.
|
||||
void SetZero();
|
||||
|
||||
/// Set to a constant value. Note: any copying is done as if using memcpy, and
|
||||
/// assignment operators or destructors are not called. This is NOT IMPLEMENTED
|
||||
/// YET except for T == int32 (the current implementation will just crash).
|
||||
void Set(const T &value);
|
||||
|
||||
CuArray<T> &operator= (const CuArray<T> &in) {
|
||||
this->CopyFromArray(in); return *this;
|
||||
}
|
||||
|
||||
CuArray<T> &operator= (const std::vector<T> &in) {
|
||||
this->CopyFromVec(in); return *this;
|
||||
}
|
||||
|
||||
private:
|
||||
MatrixIndexT dim_; ///< dimension of the vector
|
||||
T *data_; ///< GPU data pointer (if GPU not available,
|
||||
///< will point to CPU memory).
|
||||
};
|
||||
|
||||
|
||||
/// I/O
|
||||
template<typename T>
|
||||
std::ostream &operator << (std::ostream &out, const CuArray<T> &vec);
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
#include "cudamatrix/cu-array-inl.h"
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,239 @@
|
|||
// cudamatrix/cu-block-matrix-test.cc
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "cudamatrix/cu-matrix-lib.h"
|
||||
|
||||
using namespace kaldi;
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
/*
|
||||
* ASSERTS
|
||||
*/
|
||||
template<typename Real>
|
||||
static void AssertEqual(const MatrixBase<Real> &A,
|
||||
const MatrixBase<Real> &B,
|
||||
float tol = 0.001) {
|
||||
KALDI_ASSERT(A.NumRows() == B.NumRows()&&A.NumCols() == B.NumCols());
|
||||
for (MatrixIndexT i = 0;i < A.NumRows();i++) {
|
||||
for (MatrixIndexT j = 0;j < A.NumCols();j++) {
|
||||
KALDI_ASSERT(std::abs(A(i, j)-B(i, j)) <= tol*std::max(1.0, (double) (std::abs(A(i, j))+std::abs(B(i, j)))));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
static void AssertEqual(const CuMatrixBase<Real> &A,
|
||||
const CuMatrixBase<Real> &B,
|
||||
float tol = 0.001) {
|
||||
Real Anorm = A.FrobeniusNorm(), Bnorm = B.FrobeniusNorm();
|
||||
CuMatrix<Real> diff(A);
|
||||
diff.AddMat(-1.0, B);
|
||||
Real diff_norm = diff.FrobeniusNorm();
|
||||
if (diff_norm > tol * 0.5 * (Anorm + Bnorm)) {
|
||||
KALDI_LOG << "A = " << A;
|
||||
KALDI_LOG << "B = " << B;
|
||||
KALDI_ERR << "Matrices differ, " << diff_norm << " > " << tol << " * 0.5 * ( "
|
||||
<< Anorm << " + " << Bnorm << " ). ";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
static void AssertEqual(const CuBlockMatrix<Real> &A,
|
||||
const CuBlockMatrix<Real> &B,
|
||||
float tol = 0.001) {
|
||||
CuMatrix<Real> Acopy(A), Bcopy(B);
|
||||
AssertEqual(Acopy, Bcopy, tol);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuBlockMatrixIO() {
|
||||
for (int32 i = 0; i < 10; i++) {
|
||||
int32 num_blocks = rand() % 5;
|
||||
std::vector<CuMatrix<Real> > data(num_blocks);
|
||||
for (int32 b = 0; b < num_blocks; b++) {
|
||||
int32 dimM = 100 + rand() % 255, dimN = 10 + rand() % 20;
|
||||
if (b % 2 == 0) std::swap(dimM, dimN);
|
||||
data[b].Resize(dimM, dimN);
|
||||
data[b].SetRandn();
|
||||
}
|
||||
CuBlockMatrix<Real> B(data);
|
||||
|
||||
std::ostringstream os;
|
||||
bool binary = (i % 4 < 2);
|
||||
B.Write(os, binary);
|
||||
|
||||
CuBlockMatrix<Real> B2;
|
||||
std::istringstream is(os.str());
|
||||
B2.Read(is, binary);
|
||||
|
||||
CuMatrix<Real> mat(B), mat2(B2);
|
||||
AssertEqual(mat, mat2);
|
||||
if (!data.empty())
|
||||
KALDI_ASSERT(mat.Sum() != 0.0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuBlockMatrixAddMatBlock() {
|
||||
for (int32 i = 0; i < 20; i++) {
|
||||
int32 num_blocks = rand() % 5;
|
||||
std::vector<CuMatrix<Real> > data(num_blocks);
|
||||
for (int32 b = 0; b < num_blocks; b++) {
|
||||
int32 dimM = 100 + rand() % 255, dimN = 10 + rand() % 20;
|
||||
// early failures will have small dim for easier eyeballing.
|
||||
if (b % 2 == 0) std::swap(dimM, dimN);
|
||||
data[b].Resize(dimM, dimN);
|
||||
data[b].SetRandn();
|
||||
}
|
||||
CuBlockMatrix<Real> B(data);
|
||||
int32 B_num_rows = B.NumRows(), B_num_cols = B.NumCols();
|
||||
// will do X += A B
|
||||
|
||||
MatrixTransposeType transB = (i % 2 == 1 ? kTrans : kNoTrans),
|
||||
transA = (i % 3 == 1 ? kTrans : kNoTrans);
|
||||
if (transB == kTrans) std::swap(B_num_rows, B_num_cols);
|
||||
|
||||
int32 X_num_rows = 100 + rand() % 255, X_num_cols = B_num_cols,
|
||||
A_num_rows = X_num_rows, A_num_cols = B_num_rows;
|
||||
if (data.size() == 0) { X_num_rows = 0; A_num_rows = 0; }
|
||||
if (transA == kTrans) std::swap(A_num_rows, A_num_cols);
|
||||
|
||||
Real alpha = 2.0, beta = -1.0;
|
||||
CuMatrix<Real> X(X_num_rows, X_num_cols);
|
||||
X.SetRandn();
|
||||
CuMatrix<Real> A(A_num_rows, A_num_cols);
|
||||
A.SetRandn();
|
||||
|
||||
CuMatrix<Real> Xcopy(X), Bcopy(B), Xorig(X), Aorig(A);
|
||||
Xcopy.AddMatMat(alpha, A, transA, Bcopy, transB, beta);
|
||||
X.AddMatBlock(alpha, A, transA, B, transB, beta);
|
||||
|
||||
AssertEqual(X, Xcopy);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuBlockMatrixAddMatMat() {
|
||||
for (int32 i = 0; i < 20; i++) {
|
||||
int32 num_blocks = rand() % 5;
|
||||
std::vector<CuMatrix<Real> > data(num_blocks);
|
||||
for (int32 b = 0; b < num_blocks; b++) {
|
||||
int32 dimM = 100 + rand() % 255, dimN = 10 + rand() % 20;
|
||||
if (i == 0) { dimM = 1; dimN = 1; }
|
||||
// early failures will have small dim for easier eyeballing.
|
||||
if (b % 2 == 0) std::swap(dimM, dimN);
|
||||
data[b].Resize(dimM, dimN);
|
||||
data[b].SetRandn();
|
||||
}
|
||||
|
||||
CuBlockMatrix<Real> B(data);
|
||||
int32 B_num_rows = B.NumRows(), B_num_cols = B.NumCols();
|
||||
// will do B += C D
|
||||
|
||||
int32 C_num_rows = B_num_rows, C_num_cols = 100 + rand() % 255;
|
||||
if (C_num_rows == 0) C_num_cols = 0;
|
||||
int32 D_num_rows = C_num_cols, D_num_cols = B_num_cols;
|
||||
|
||||
MatrixTransposeType transC = (i % 2 == 1 ? kTrans : kNoTrans),
|
||||
transD = (i % 3 == 1 ? kTrans : kNoTrans);
|
||||
if (transC == kTrans) std::swap(C_num_rows, C_num_cols);
|
||||
if (transD == kTrans) std::swap(D_num_rows, D_num_cols);
|
||||
|
||||
CuMatrix<Real> C(C_num_rows, C_num_cols), D(D_num_rows, D_num_cols);
|
||||
C.SetRandn();
|
||||
D.SetRandn();
|
||||
|
||||
CuMatrix<Real> Bmat(B);
|
||||
|
||||
Real alpha = 2.0, beta = -1.0;
|
||||
|
||||
CuBlockMatrix<Real> Bcopy(B);
|
||||
|
||||
B.AddMatMat(alpha, C, transC, D, transD, beta);
|
||||
|
||||
Bmat.AddMatMat(alpha, C, transC, D, transD, beta);
|
||||
|
||||
|
||||
// Now check that the block-structured part of Bmat is the
|
||||
// same as B.
|
||||
Bcopy.CopyFromMat(Bmat); // copy block-structured part from Bmat to Bcopy.
|
||||
|
||||
AssertEqual(B, Bcopy);
|
||||
KALDI_ASSERT(Bmat.Sum() != 0 || B_num_rows == 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Runs every CuBlockMatrix unit test for the given floating-point type.
template<typename Real>
void CuBlockMatrixUnitTest() {
  UnitTestCuBlockMatrixIO<Real>();
  UnitTestCuBlockMatrixAddMatBlock<Real>();
  UnitTestCuBlockMatrixAddMatMat<Real>();
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
int main() {
|
||||
for (int32 loop = 0; loop < 2; loop++) {
|
||||
#if HAVE_CUDA == 1
|
||||
if (loop == 0)
|
||||
CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
|
||||
else
|
||||
CuDevice::Instantiate().SelectGpuId("yes"); // -2 .. automatic selection
|
||||
#endif
|
||||
|
||||
kaldi::CuBlockMatrixUnitTest<float>();
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().DoublePrecisionSupported()) {
|
||||
kaldi::CuBlockMatrixUnitTest<double>();
|
||||
} else {
|
||||
KALDI_WARN << "Double precision not supported";
|
||||
}
|
||||
#else
|
||||
kaldi::CuBlockMatrixUnitTest<double>();
|
||||
#endif
|
||||
if (loop == 0)
|
||||
KALDI_LOG << "Tests without GPU use succeeded.\n";
|
||||
else
|
||||
KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
|
||||
}
|
||||
#if HAVE_CUDA == 1
|
||||
CuDevice::Instantiate().PrintProfile();
|
||||
#endif
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,330 @@
|
|||
// cudamatrix/cu-block-matrix.cc
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <cublas.h>
|
||||
#endif
|
||||
|
||||
#include "util/timer.h"
|
||||
#include "cudamatrix/cu-block-matrix.h"
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
template<class Real>
|
||||
CuBlockMatrix<Real>::CuBlockMatrix() {
|
||||
#if HAVE_CUDA == 1
|
||||
cu_data_ = NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
CuBlockMatrix<Real>::CuBlockMatrix(const std::vector<CuMatrix<Real> >&data) {
|
||||
#if HAVE_CUDA == 1
|
||||
cu_data_ = NULL;
|
||||
#endif
|
||||
block_data_.resize(data.size());
|
||||
MatrixIndexT row_offset = 0, col_offset = 0, max_num_rows = 0;
|
||||
for (size_t b = 0; b < data.size(); b++) {
|
||||
MatrixIndexT num_rows = data[b].NumRows(), num_cols = data[b].NumCols();
|
||||
KALDI_ASSERT(num_rows > 0 && num_cols > 0);
|
||||
BlockMatrixData block_data;
|
||||
block_data.num_rows = num_rows;
|
||||
block_data.num_cols = num_cols;
|
||||
block_data.row_offset = row_offset;
|
||||
block_data.col_offset = col_offset;
|
||||
row_offset += num_rows;
|
||||
col_offset += num_cols;
|
||||
max_num_rows = std::max(max_num_rows, num_rows);
|
||||
block_data_[b] = block_data;
|
||||
}
|
||||
num_rows_ = row_offset;
|
||||
data_.Resize(max_num_rows, col_offset);
|
||||
for (int32 b = 0; b < NumBlocks(); b++)
|
||||
Block(b).CopyFromMat(data[b]);
|
||||
SetCudaData();
|
||||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
const CuSubMatrix<Real> CuBlockMatrix<Real>::Block(int32 b) const {
|
||||
KALDI_ASSERT(static_cast<size_t>(b) < block_data_.size());
|
||||
const BlockMatrixData &block_data = block_data_[b];
|
||||
return CuSubMatrix<Real>(data_, 0, block_data.num_rows,
|
||||
block_data.col_offset, block_data.num_cols);
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
CuSubMatrix<Real> CuBlockMatrix<Real>::Block(int32 b) {
|
||||
KALDI_ASSERT(static_cast<size_t>(b) < block_data_.size());
|
||||
BlockMatrixData &block_data = block_data_[b];
|
||||
return CuSubMatrix<Real>(data_, 0, block_data.num_rows,
|
||||
block_data.col_offset, block_data.num_cols);
|
||||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
CuBlockMatrix<Real>::CuBlockMatrix(const CuBlockMatrix<Real> &other):
|
||||
data_(other.data_), block_data_(other.block_data_), num_rows_(other.num_rows_) {
|
||||
#if HAVE_CUDA == 1
|
||||
cu_data_ = NULL;
|
||||
#endif
|
||||
SetCudaData();
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
CuBlockMatrix<Real> &CuBlockMatrix<Real>::operator =(const CuBlockMatrix<Real> &other) {
|
||||
FreeCudaData();
|
||||
data_ = other.data_;
|
||||
block_data_ = other.block_data_;
|
||||
num_rows_ = other.num_rows_;
|
||||
SetCudaData();
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
void CuBlockMatrix<Real>::FreeCudaData() {
|
||||
#if HAVE_CUDA == 1
|
||||
if (cu_data_ != NULL) {
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
CuDevice::Instantiate().Free(cu_data_);
|
||||
cu_data_ = NULL;
|
||||
} else {
|
||||
KALDI_ERR << "CuBlockMatrix: you have CUDA data pointer but "
|
||||
<< "no GPU is enabled: likely code error.";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
void CuBlockMatrix<Real>::SetCudaData() {
|
||||
#if HAVE_CUDA == 1
|
||||
KALDI_ASSERT(cu_data_ == NULL);
|
||||
if (block_data_.size() == 0) return; // Nothing to do.
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
std::vector<CuBlockMatrixData> tmp_cu_data(NumBlocks());
|
||||
int32 row_offset = 0, col_offset = 0;
|
||||
for (size_t b = 0; b < NumBlocks(); b++) {
|
||||
CuSubMatrix<Real> this_mat = Block(b);
|
||||
CuBlockMatrixData &this_cu_data = tmp_cu_data[b];
|
||||
this_cu_data.row_offset = row_offset;
|
||||
this_cu_data.col_offset = col_offset;
|
||||
this_cu_data.matrix_dim = this_mat.Dim();
|
||||
this_cu_data.matrix_data = static_cast<void*>(this_mat.Data());
|
||||
row_offset += this_mat.NumRows();
|
||||
col_offset += this_mat.NumCols();
|
||||
}
|
||||
size_t size = NumBlocks() * sizeof(CuBlockMatrixData);
|
||||
cu_data_ = static_cast<CuBlockMatrixData*>(
|
||||
CuDevice::Instantiate().Malloc(size));
|
||||
CU_SAFE_CALL(cudaMemcpy(cu_data_, &(tmp_cu_data[0]), size, cudaMemcpyHostToDevice));
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
void CuBlockMatrix<Real>::Swap(CuBlockMatrix<Real> *other) {
|
||||
data_.Swap(&other->data_);
|
||||
block_data_.swap(other->block_data_);
|
||||
std::swap(num_rows_, other->num_rows_);
|
||||
#if HAVE_CUDA == 1
|
||||
std::swap(cu_data_, other->cu_data_);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
void CuBlockMatrix<Real>::Write(std::ostream &os, bool binary) const {
|
||||
WriteToken(os, binary, "<CuBlockMatrix>");
|
||||
int32 num_blocks = NumBlocks();
|
||||
WriteBasicType(os, binary, num_blocks);
|
||||
for (int32 b = 0; b < num_blocks; b++)
|
||||
this->Block(b).Write(os, binary);
|
||||
WriteToken(os, binary, "</CuBlockMatrix>");
|
||||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
void CuBlockMatrix<Real>::Read(std::istream &is, bool binary) {
|
||||
Destroy();
|
||||
int i = Peek(is, binary);
|
||||
std::vector<CuMatrix<Real> > data;
|
||||
if (i != static_cast<int>('<')) {
|
||||
// back-compatibility code so we can read the older format of
|
||||
// MixtureProbComponent. This code should be deleted eventually.
|
||||
int32 size;
|
||||
ReadBasicType(is, binary, &size);
|
||||
KALDI_ASSERT(size >= 0);
|
||||
data.resize(size);
|
||||
for (int32 i = 0; i < size; i++)
|
||||
data[i].Read(is, binary);
|
||||
} else {
|
||||
ExpectToken(is, binary, "<CuBlockMatrix>");
|
||||
int32 size;
|
||||
ReadBasicType(is, binary, &size);
|
||||
KALDI_ASSERT(size >= 0);
|
||||
data.resize(size);
|
||||
for (int32 i = 0; i < size; i++)
|
||||
data[i].Read(is, binary);
|
||||
ExpectToken(is, binary, "</CuBlockMatrix>");
|
||||
}
|
||||
|
||||
CuBlockMatrix<Real> block_mat(data); // initializer from std::vector<CuMatrix<Real> > does
|
||||
// the main job of initialization.
|
||||
this->Swap(&block_mat);
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
void CuBlockMatrix<Real>::Destroy() {
|
||||
data_.Resize(0, 0);
|
||||
block_data_.clear();
|
||||
num_rows_ = 0;
|
||||
FreeCudaData();
|
||||
}
|
||||
|
||||
// Does *this = alpha A B + beta * *this, discarding elements outside
|
||||
// the block structure of the *this matrix.
|
||||
template<class Real>
|
||||
void CuBlockMatrix<Real>::AddMatMat(
|
||||
BaseFloat alpha,
|
||||
const CuMatrix<Real> &A, MatrixTransposeType transA,
|
||||
const CuMatrix<Real> &B, MatrixTransposeType transB,
|
||||
BaseFloat beta) {
|
||||
MatrixIndexT A_num_rows = A.NumRows(), A_num_cols = A.NumCols(),
|
||||
A_row_stride = A.Stride(), A_col_stride = 1,
|
||||
B_num_rows = B.NumRows(), B_num_cols = B.NumCols(),
|
||||
B_row_stride = B.Stride(), B_col_stride = 1;
|
||||
if (transA == kTrans) {
|
||||
std::swap(A_num_rows, A_num_cols);
|
||||
std::swap(A_row_stride, A_col_stride);
|
||||
}
|
||||
if (transB == kTrans) {
|
||||
std::swap(B_num_rows, B_num_cols);
|
||||
std::swap(B_row_stride, B_col_stride);
|
||||
}
|
||||
KALDI_ASSERT(A_num_rows == NumRows() && B_num_cols == NumCols()
|
||||
&& A_num_cols == B_num_rows);
|
||||
if (NumBlocks() == 0) return; // empty matrix.
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
// (x,y,z) dimensions are (block-id, row-of-block, col-of-block)
|
||||
// First some logic to choose block dims...
|
||||
// we assume (which we can, safely) that CU1DBLOCK is <= the max threads per block.
|
||||
int32 x_blocksize = std::min(CU1DBLOCK, NumBlocks()); // x dim corresponds to block-idx.
|
||||
int32 max_block_rows = MaxBlockRows(), max_block_cols = MaxBlockCols();
|
||||
int32 y_blocksize = max_block_rows;
|
||||
while (y_blocksize * x_blocksize > CU1DBLOCK || y_blocksize > CU2DBLOCK)
|
||||
y_blocksize--;
|
||||
int32 z_blocksize = max_block_cols;
|
||||
while (z_blocksize * x_blocksize * y_blocksize > CU1DBLOCK || z_blocksize > CU2DBLOCK)
|
||||
z_blocksize--;
|
||||
|
||||
dim3 dimBlock(x_blocksize, y_blocksize, z_blocksize);
|
||||
dim3 dimGrid(n_blocks(NumBlocks(), x_blocksize),
|
||||
n_blocks(max_block_rows, y_blocksize),
|
||||
n_blocks(max_block_cols, z_blocksize));
|
||||
cuda_block_add_mat_mat(dimGrid, dimBlock, cu_data_, NumBlocks(),
|
||||
A.Data(), A_num_cols, A_row_stride, A_col_stride,
|
||||
B.Data(), B_row_stride, B_col_stride, alpha, beta);
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
int32 row_offset = 0, col_offset = 0;
|
||||
for (MatrixIndexT b = 0; b < NumBlocks(); b++) {
|
||||
CuSubMatrix<Real> this_block = Block(b);
|
||||
MatrixIndexT this_num_rows = this_block.NumRows(),
|
||||
this_num_cols = this_block.NumCols();
|
||||
CuSubMatrix<Real> A_part = (transA == kNoTrans ?
|
||||
A.Range(row_offset, this_num_rows,
|
||||
0, A.NumCols()) :
|
||||
A.Range(0, A.NumRows(),
|
||||
row_offset, this_num_rows)),
|
||||
B_part = (transB == kNoTrans ?
|
||||
B.Range(0, B.NumRows(),
|
||||
col_offset, this_num_cols) :
|
||||
B.Range(col_offset, this_num_cols,
|
||||
0, B.NumCols()));
|
||||
this_block.AddMatMat(alpha, A_part, transA, B_part, transB, beta);
|
||||
row_offset += this_num_rows;
|
||||
col_offset += this_num_cols;
|
||||
}
|
||||
KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
|
||||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
MatrixIndexT CuBlockMatrix<Real>::MaxBlockCols() const {
|
||||
MatrixIndexT max_cols = 0;
|
||||
for (size_t i = 0; i < block_data_.size(); i++)
|
||||
max_cols = std::max(max_cols, block_data_[i].num_cols);
|
||||
return max_cols;
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
MatrixIndexT CuBlockMatrix<Real>::MaxBlockRows() const {
|
||||
return data_.NumRows();
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
void CuBlockMatrix<Real>::CopyFromMat(const CuMatrix<Real> &M) {
|
||||
KALDI_ASSERT(NumRows() == M.NumRows() && NumCols() == M.NumCols());
|
||||
MatrixIndexT row_offset = 0, col_offset = 0;
|
||||
for (MatrixIndexT b = 0; b < NumBlocks(); b++) {
|
||||
CuSubMatrix<Real> this_block = Block(b);
|
||||
MatrixIndexT this_num_rows = this_block.NumRows(),
|
||||
this_num_cols = this_block.NumCols();
|
||||
const CuSubMatrix<Real> src(M, row_offset, this_num_rows,
|
||||
col_offset, this_num_cols);
|
||||
this_block.CopyFromMat(src);
|
||||
row_offset += this_num_rows;
|
||||
col_offset += this_num_cols;
|
||||
}
|
||||
KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
|
||||
}
|
||||
|
||||
/**
|
||||
* Print the matrix to stream
|
||||
*/
|
||||
template<typename Real>
|
||||
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<Real> &mat) {
|
||||
bool binary = false;
|
||||
mat.Write(out, binary);
|
||||
return out;
|
||||
}
|
||||
// instantiate the template
|
||||
template
|
||||
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<float> &mat);
|
||||
template
|
||||
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<double> &mat);
|
||||
|
||||
// Instantiate the class for float and double.
|
||||
template class CuBlockMatrix<float>;
|
||||
template class CuBlockMatrix<double>;
|
||||
|
||||
} // namespace kaldi
|
|
@ -0,0 +1,150 @@
|
|||
// cudamatrix/cu-block-matrix.h
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
|
||||
#ifndef KALDI_CUDAMATRIX_CU_BLOCK_MATRIX_H_
|
||||
#define KALDI_CUDAMATRIX_CU_BLOCK_MATRIX_H_
|
||||
|
||||
#include <sstream>
|
||||
|
||||
#include <vector>
|
||||
#include "cudamatrix/cu-common.h"
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
/**
|
||||
The class CuBlockMatrix holds a vector of objects of type CuMatrix,
|
||||
say, M_1, M_2, .. M_N
|
||||
and it represents the matrix diag(M_1, M_2, ... M_N). Note:
|
||||
the individual matrices do not have to be square. The reason the
|
||||
class is needed is mostly so that we can efficiently multiply by this
|
||||
block-diagonal structure in a parallel way.
|
||||
|
||||
If we have a GPU available, CuBlockMatrix will store a copy of the
|
||||
individual CuMatrix quantities M_1 .. M_N on the GPU, but their
|
||||
'primary' home remains on the CPU.. what we mean by this is that
|
||||
while the data remains on the GPU, the "primary" version of the
|
||||
Matrix object that holds the pointers will remain on the CPU.
|
||||
We just copy it over to the GPU whenever it is changed.
|
||||
*/
|
||||
|
||||
template<typename Real>
|
||||
class CuBlockMatrix {
|
||||
public:
|
||||
friend class CuMatrixBase<Real>;
|
||||
|
||||
CuBlockMatrix();
|
||||
|
||||
CuBlockMatrix(const std::vector<CuMatrix<Real> > &data);
|
||||
|
||||
~CuBlockMatrix() { Destroy(); }
|
||||
|
||||
/// Copy constructor
|
||||
CuBlockMatrix(const CuBlockMatrix &other);
|
||||
|
||||
/// Assignment operator
|
||||
CuBlockMatrix &operator= (const CuBlockMatrix &other);
|
||||
|
||||
void Write(std::ostream &os, bool binary) const;
|
||||
|
||||
void Read(std::istream &is, bool binary);
|
||||
|
||||
MatrixIndexT NumRows() const { return num_rows_; }
|
||||
|
||||
MatrixIndexT NumCols() const { return data_.num_cols_; }
|
||||
|
||||
MatrixIndexT NumBlocks() const { return block_data_.size(); }
|
||||
|
||||
// Returns max num-columns of any block
|
||||
MatrixIndexT MaxBlockCols() const ;
|
||||
|
||||
// Returns max num-rows of any block
|
||||
MatrixIndexT MaxBlockRows() const;
|
||||
|
||||
const CuSubMatrix<Real> Block(MatrixIndexT b) const;
|
||||
|
||||
CuSubMatrix<Real> Block(MatrixIndexT b); // return CuMatrixBase to disallow resizes.
|
||||
|
||||
|
||||
/// Does *this = alpha A B + beta * *this, discarding elements of the product outside
|
||||
/// the block structure of the *this matrix. The transA and transB parameters
|
||||
/// can be used to substitute A^T for A and B^T for B, respectively.
|
||||
void AddMatMat(BaseFloat alpha,
|
||||
const CuMatrix<Real> &A, MatrixTransposeType transA,
|
||||
const CuMatrix<Real> &B, MatrixTransposeType transB,
|
||||
BaseFloat beta);
|
||||
|
||||
|
||||
/// Copies elements within the block structure from matrix M, discarding others.
|
||||
/// Note: this has not been implemented in a very efficient way, it's used only
|
||||
/// for testing.
|
||||
void CopyFromMat(const CuMatrix<Real> &M);
|
||||
|
||||
/// Normalizes the columns of *this so that each one sums to one.
|
||||
/// On error (e.g. inf's), will set the column to a constant value that
|
||||
/// sums to one.
|
||||
void NormalizeColumns();
|
||||
|
||||
void Swap(CuBlockMatrix *other);
|
||||
|
||||
protected:
|
||||
CuMatrix<Real> data_; // This is a single matrix into which
|
||||
// we pack all the blocks (possibly with spaces left over)
|
||||
|
||||
struct BlockMatrixData{
|
||||
MatrixIndexT num_rows;
|
||||
MatrixIndexT num_cols;
|
||||
MatrixIndexT row_offset;
|
||||
MatrixIndexT col_offset;
|
||||
};
|
||||
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
const CuBlockMatrixData* CuData() const { return cu_data_; }
|
||||
#endif
|
||||
private:
|
||||
|
||||
/// If using GPU and cu_data_ != NULL, free cu_data_ and set it to NULL
|
||||
void FreeCudaData();
|
||||
/// If using GPU, allocate and set cu_data_ on the GPU to reflect "data_".
|
||||
void SetCudaData();
|
||||
|
||||
|
||||
/// Frees and deinitializes everything.
|
||||
void Destroy();
|
||||
|
||||
std::vector<BlockMatrixData> block_data_;
|
||||
|
||||
MatrixIndexT num_rows_; // sum of num_rows of elements of block_data_.
|
||||
#if HAVE_CUDA == 1
|
||||
CuBlockMatrixData *cu_data_; // We store the pointers and some additional info
|
||||
// on the GPU card in a form more suited to
|
||||
// use by CUDA kernels.
|
||||
#endif
|
||||
}; // class CuBlockMatrix
|
||||
|
||||
template<typename Real>
|
||||
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<Real> &mat);
|
||||
|
||||
|
||||
} // namespace Kaldi
|
||||
#endif
|
|
@ -0,0 +1,53 @@
|
|||
// cudamatrix/cu-choleskykernels-ansi.h
|
||||
|
||||
// Copyright 2010-2013 Dr. Stephan Kramer
|
||||
// Institut für Numerische und Angewandte Mathematik
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_CUDAMATRIX_CU_CHOLESKYKERNELS_ANSI_H_
|
||||
#define KALDI_CUDAMATRIX_CU_CHOLESKYKERNELS_ANSI_H_
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "cudamatrix/cu-matrixdim.h"
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
|
||||
extern "C" {
|
||||
|
||||
/*********************************************************
|
||||
* float CUDA kernel calls
|
||||
*/
|
||||
void cudaF_factorize_diagonal_block(float* A, int block_offset, MatrixDim d);
|
||||
void cudaF_strip_update(float* A, int block_offset, int n_remaining_blocks, MatrixDim d);
|
||||
void cudaF_diag_update(float* A, int block_offset, int n_remaining_blocks, MatrixDim d);
|
||||
void cudaF_lo_update(float* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d);
|
||||
|
||||
|
||||
/*********************************************************
|
||||
* double CUDA kernel calls
|
||||
*/
|
||||
void cudaD_factorize_diagonal_block(double* A, int block_offset, MatrixDim d);
|
||||
void cudaD_strip_update(double* A, int block_offset, int n_remaining_blocks, MatrixDim d);
|
||||
void cudaD_diag_update(double* A, int block_offset, int n_remaining_blocks, MatrixDim d);
|
||||
void cudaD_lo_update(double* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d);
|
||||
}
|
||||
|
||||
#endif // HAVE_CUDA
|
||||
|
||||
#endif
|
|
@ -0,0 +1,359 @@
|
|||
// cudamatrix/cu-choleskykernel.cu
|
||||
|
||||
// Copyright 2010-2013 Dr. Stephan Kramer
|
||||
// Institut fur Numerische und Angewandte Mathematik
|
||||
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "cudamatrix/cu-choleskykernels-ansi.h"
|
||||
#include <stdio.h>
|
||||
|
||||
|
||||
#define TILE_SIZE 16
|
||||
|
||||
/***********************************************************************
|
||||
* CUDA kernels
|
||||
* some functions are templated to have the float/double operations
|
||||
*/
|
||||
// Converts a (row, column) pair into a linear row-major index, where
// row_length is the distance in elements between consecutive rows.
__device__ int lex_index_2D (int r, int c, int row_length) {
  const int row_start = r * row_length;
  return row_start + c;
}
|
||||
|
||||
|
||||
// Maps a position t_pos inside a tile to a position in the whole matrix,
// given the tile's index block_offset (tiles are TILE_SIZE wide).
__device__ int global_pos(int t_pos, int block_offset) {
  const int tile_start = TILE_SIZE * block_offset;
  return tile_start + t_pos;
}
|
||||
|
||||
|
||||
// Single-precision reciprocal square root; overload used by the templated
// kernels when T is float.
__device__ float inv_sqrt(float x) {
  const float result = rsqrtf(x);
  return result;
}
|
||||
|
||||
|
||||
// Double-precision reciprocal square root; overload used by the templated
// kernels when T is double.
__device__ double inv_sqrt(double x) {
  const double result = rsqrt(x);
  return result;
}
|
||||
|
||||
|
||||
// Factorizes, in place, the TILE_SIZE x TILE_SIZE diagonal tile number
// block_offset of the matrix A (presumably one step of a tiled Cholesky
// factorization, going by the file name -- confirm against the caller).
// Launched with a single TILE_SIZE x TILE_SIZE thread block; thread (x, y)
// handles column x and row y of the tile.  d.stride is the row stride of A,
// and d.cols is used as the dimension of the (square) matrix.
// Only the lower triangle (row >= col) of the tile is written back.
template<typename T>
__global__
void __factorize_diagonal_block(T* A, int block_offset, MatrixDim d) {
  const int global_row_length = d.stride;

  const int col = threadIdx.x;
  const int row = threadIdx.y;

  const int global_row = global_pos(row, block_offset);
  const int global_col = global_pos(col, block_offset);

  // Threads that fall outside the matrix (last, partial tile) do nothing.
  // NOTE(review): these threads exit before the __syncthreads() calls below;
  // confirm this is acceptable on the targeted architectures.
  if ((global_row >= d.cols) || (global_col >= d.cols))
    return;

  // Number of valid rows/columns in this tile: TILE_SIZE, or the remainder
  // for the last tile.
  int k_max = TILE_SIZE;
  if (d.cols - global_pos(0, block_offset) < TILE_SIZE)
    k_max = d.cols % TILE_SIZE;

  const int idx = lex_index_2D(global_row, global_col, global_row_length);

  // Working copy of the tile (one extra column of padding per row).
  __shared__ T L[TILE_SIZE][TILE_SIZE+1];

  // Load this thread's element.  (The former zero-store immediately before
  // this load was dead and has been removed.)
  L[row][col] = A[idx];
  __syncthreads();

  if ((row >= k_max) || (col >= k_max))
    return;

  for (int k = 0; k < k_max; k++) {
    __syncthreads();
    // Every thread reads the current pivot and forms 1/sqrt(pivot).
    const T fac = inv_sqrt(L[k][k]);
    __syncthreads();

    // Thread-row k scales column k (from the diagonal downwards).
    if ((row == k) && (col >= k))
      L[col][row] = (L[col][row]) * fac;

    __syncthreads();

    // Update the lower triangle to the right of column k.
    if ((row >= col) && (col > k))
      L[row][col] = L[row][col] - L[col][k] * L[row][k];
  }
  __syncthreads();

  if (row >= col) {
    // Write back the lower triangle; values above 100000 are replaced by 1
    // (same clamp as before, applied to the local value instead of
    // re-reading A[idx] from global memory).
    const T value = L[row][col];
    A[idx] = (value > 100000) ? T(1) : value;
  }
}
|
||||
|
||||
|
||||
// Updates the tiles below the diagonal tile block_offset, within the same
// tile column.  CUDA block b (blockIdx.x) handles the tile in tile-row
// block_offset + 1 + b.  For every column of the transposed working tile
// it performs a forward substitution with the (already factorized) diagonal
// tile `topleft`.  Thread (x, y) handles column x / row y of the tile.
template<typename T>
__global__
void __strip_update(T* A, int block_offset, MatrixDim d) {
  const int global_row_length = d.stride;

  const int boffy = block_offset;
  const int boffx = blockIdx.x + boffy + 1;  // tile-row handled by this block

  const int col = threadIdx.x;
  const int row = threadIdx.y;

  __shared__ T topleft[TILE_SIZE][TILE_SIZE+1];
  __shared__ T workingmat[TILE_SIZE][TILE_SIZE+1];

  int global_row = global_pos(row, block_offset);
  const int global_col = global_pos(col, block_offset);

  // NOTE(review): threads that exit here (or at the second check below)
  // skip the later __syncthreads() calls and leave their shared-memory
  // entries unwritten; confirm this is intended for partial tiles.
  if ((global_row >= d.cols) || (global_col >= d.cols))
    return;

  const int idx = lex_index_2D(global_row, global_col, global_row_length);

  // Load the diagonal tile.  (The dead zero-store before it was removed.)
  topleft[row][col] = A[idx];

  // Switch to the row of the tile that this block updates.
  global_row = global_pos(row, boffx);

  if (global_row >= d.cols)
    return;

  const int idx_w = lex_index_2D(global_row, global_col, global_row_length);

  // Load the working tile transposed.  (Dead zero-store removed.)
  workingmat[col][row] = A[idx_w];

  __syncthreads();

  // Only the threads with row == 0 do the substitution, one per column.
  if (row == 0) {
    for (int k = 0; k < TILE_SIZE; k++) {
      T sum = 0.0;
      for (int m = 0; m < k; m++)
        sum = sum + topleft[k][m] * workingmat[m][col];

      workingmat[k][col] = (workingmat[k][col] - sum) / topleft[k][k];
    }
  }

  __syncthreads();

  // Write back; values above 100000 are replaced by 1 (same clamp as
  // before, applied to the local value instead of re-reading A[idx_w]).
  const T value = workingmat[col][row];
  A[idx_w] = (value > 100000) ? T(1) : value;
}
|
||||
|
||||
|
||||
// Updates the diagonal tiles that follow tile block_offset.  CUDA block b
// (blockIdx.x) handles tile-row boffx = block_offset + 1 + b: it loads the
// tile at (boffx, block_offset) into `left` and subtracts left * left^T
// from the lower triangle of the diagonal tile (boffx, boffx).
// Thread (x, y) handles column x / row y of the tile.
template<typename T>
__global__
void __diag_update(T* A, int block_offset, MatrixDim d) {
  const int global_row_length = d.stride;
  const int boffx = blockIdx.x + block_offset + 1;

  const int col = threadIdx.x;
  const int row = threadIdx.y;

  const int global_row = global_pos(row, boffx);
  int global_col = global_pos(col, block_offset);

  // NOTE(review): threads exiting here skip the __syncthreads() below and
  // leave their entry of `left` unwritten; confirm for partial tiles.
  if ((global_row >= d.cols) || (global_col >= d.cols))
    return;

  int idx = lex_index_2D(global_row, global_col, global_row_length);

  __shared__ T left[TILE_SIZE][TILE_SIZE+1];

  // Load the source tile.  (The dead zero-store before it was removed.)
  left[row][col] = A[idx];

  __syncthreads();

  T sum = 0.0;

  // Only the lower triangle of the diagonal tile is updated.
  if (row >= col) {
    for (int kk = 0; kk < TILE_SIZE; kk++)
      sum = sum + left[row][kk] * left[col][kk];

    // Column of the destination element, now inside the diagonal tile.
    global_col = global_pos(col, boffx);

    if (global_col >= d.cols)
      return;

    idx = lex_index_2D(global_row, global_col, global_row_length);

    A[idx] = A[idx] - sum;
  }
}
|
||||
|
||||
|
||||
// Updates the off-diagonal tiles below the diagonal, for the tile columns
// that follow block_offset.  CUDA block b (blockIdx.y) handles tile-column
// boffy = block_offset + 1 + b.  It loads tile (boffy, block_offset) into
// `upt` once, then for every tile-row boffx in (boffy, n_blocks) loads tile
// (boffx, block_offset) into `left` and subtracts left * upt^T from tile
// (boffx, boffy).  Thread (x, y) handles column x / row y of the tile.
template<typename T>
__global__
void __lo_update(T* A, int block_offset, int n_blocks, MatrixDim d) {
  const int global_row_length = d.stride;
  const int col = threadIdx.x;
  const int row = threadIdx.y;

  const int boffy = blockIdx.y + block_offset + 1;

  __shared__ T left[TILE_SIZE][TILE_SIZE];

  __shared__ T upt[TILE_SIZE][TILE_SIZE + 1];

  int global_row = global_pos(row, boffy);
  const int global_col_src = global_pos(col, block_offset);

  // NOTE(review): threads exiting here, or at the returns inside the loop,
  // skip later __syncthreads() calls; confirm for partial tiles.
  if ((global_row >= d.cols) || (global_col_src >= d.cols))
    return;

  int idx = lex_index_2D(global_row, global_col_src, global_row_length);

  // Load the tile shared by all updates of this block.  (The dead
  // zero-store before it was removed.)
  upt[row][col] = A[idx];
  __syncthreads();

  for (int boffx = boffy + 1; boffx < n_blocks; boffx++) {
    global_row = global_pos(row, boffx);

    if (global_row >= d.cols)
      return;

    idx = lex_index_2D(global_row, global_col_src, global_row_length);

    // Load the left factor for this tile-row.  (Dead zero-store removed.)
    left[row][col] = A[idx];

    __syncthreads();

    // A second, identical `global_row >= d.cols` check used to follow the
    // barrier; it could never fire because global_row is unchanged since
    // the check above, so it has been removed.

    T matrixprod = 0.0;

    for (int kk = 0; kk < TILE_SIZE; kk++)
      matrixprod += left[row][kk] * upt[col][kk];

    // All threads must be done reading `left` before it is reloaded.
    __syncthreads();

    const int global_col = global_pos(col, boffy);
    if (global_col >= d.cols)
      return;

    idx = lex_index_2D(global_row, global_col, global_row_length);
    A[idx] = A[idx] - matrixprod;
  }
}
|
||||
|
||||
/***********************************************************************
|
||||
* ANSI-C wrappers of CUDA kernels
|
||||
*/
|
||||
|
||||
/*
|
||||
* float
|
||||
*/
|
||||
|
||||
// float wrapper: launches __factorize_diagonal_block on one thread block of
// TILE_SIZE x TILE_SIZE threads and waits for it to finish.
void cudaF_factorize_diagonal_block(float* A, int block_offset, MatrixDim d) {
  const dim3 tile_threads(TILE_SIZE, TILE_SIZE);
  __factorize_diagonal_block<<<1, tile_threads>>>(A, block_offset, d);
  cudaThreadSynchronize();
}
|
||||
|
||||
// float wrapper: launches __strip_update with n_remaining_blocks - 1 thread
// blocks (a single block when n_remaining_blocks < 2), each of TILE_SIZE x
// TILE_SIZE threads, and waits for completion.
void cudaF_strip_update(float* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
  const dim3 tile_threads(TILE_SIZE, TILE_SIZE);
  const dim3 stripgrid(n_remaining_blocks >= 2 ? n_remaining_blocks - 1 : 1);
  __strip_update<<<stripgrid, tile_threads>>>(A, block_offset, d);
  cudaThreadSynchronize();
}
|
||||
|
||||
// float wrapper: launches __diag_update with n_remaining_blocks - 1 thread
// blocks (a single block when n_remaining_blocks < 2), each of TILE_SIZE x
// TILE_SIZE threads, and waits for completion.
void cudaF_diag_update(float* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
  const dim3 tile_threads(TILE_SIZE, TILE_SIZE);
  const dim3 diaggrid(n_remaining_blocks >= 2 ? n_remaining_blocks - 1 : 1);
  __diag_update<<<diaggrid, tile_threads>>>(A, block_offset, d);
  cudaThreadSynchronize();
}
|
||||
|
||||
// float wrapper: launches __lo_update on a 1 x (n_remaining_blocks - 2)
// grid of TILE_SIZE x TILE_SIZE thread blocks and waits for completion.
// With n_remaining_blocks <= 2 the grid's y dimension would be zero (or
// wrap around to a huge unsigned value), which is not a valid launch
// configuration; there is no tile to update in that case, so the launch is
// skipped.
void cudaF_lo_update(float* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) {
  if (n_remaining_blocks > 2) {
    dim3 logrid;
    logrid.x = 1;
    logrid.y = n_remaining_blocks-2;
    dim3 threads(TILE_SIZE,TILE_SIZE);
    __lo_update<<<logrid,threads>>>(A,block_offset,n_blocks,d);
  }
  cudaThreadSynchronize();
}
|
||||
/*
|
||||
* double
|
||||
*/
|
||||
// double wrapper: launches __factorize_diagonal_block on one thread block
// of TILE_SIZE x TILE_SIZE threads and waits for it to finish.
void cudaD_factorize_diagonal_block(double* A, int block_offset, MatrixDim d) {
  const dim3 tile_threads(TILE_SIZE, TILE_SIZE);
  __factorize_diagonal_block<<<1, tile_threads>>>(A, block_offset, d);
  cudaThreadSynchronize();
}
|
||||
|
||||
// double wrapper: launches __strip_update with n_remaining_blocks - 1
// thread blocks (a single block when n_remaining_blocks < 2), each of
// TILE_SIZE x TILE_SIZE threads, and waits for completion.
void cudaD_strip_update(double* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
  const dim3 tile_threads(TILE_SIZE, TILE_SIZE);
  const dim3 stripgrid(n_remaining_blocks >= 2 ? n_remaining_blocks - 1 : 1);
  __strip_update<<<stripgrid, tile_threads>>>(A, block_offset, d);
  cudaThreadSynchronize();
}
|
||||
|
||||
// double wrapper: launches __diag_update with n_remaining_blocks - 1 thread
// blocks (a single block when n_remaining_blocks < 2), each of TILE_SIZE x
// TILE_SIZE threads, and waits for completion.
void cudaD_diag_update(double* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
  const dim3 tile_threads(TILE_SIZE, TILE_SIZE);
  const dim3 diaggrid(n_remaining_blocks >= 2 ? n_remaining_blocks - 1 : 1);
  __diag_update<<<diaggrid, tile_threads>>>(A, block_offset, d);
  cudaThreadSynchronize();
}
|
||||
|
||||
// Host-side launcher for the double-precision __lo_update kernel.
//   A, block_offset, n_blocks, d   forwarded unchanged to the kernel.
//   n_remaining_blocks             sets the grid to 1 x (n_remaining_blocks - 2).
// Each thread block is a TILE_SIZE x TILE_SIZE tile.  The host waits for the
// device to finish before returning.
// NOTE(review): nothing here guards against n_remaining_blocks <= 2;
// presumably callers only invoke this with a larger value -- confirm.
void cudaD_lo_update(double* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) {
  const dim3 tile(TILE_SIZE, TILE_SIZE);
  dim3 grid(1, n_remaining_blocks - 2);
  __lo_update<<<grid, tile>>>(A, block_offset, n_blocks, d);
  cudaThreadSynchronize();
}
|
|
@ -0,0 +1,62 @@
|
|||
// cudamatrix/cu-choleskykernels.h
|
||||
|
||||
// Copyright 2010-2013 Dr. Stephan Kramer
|
||||
// Institut für Numerische und Angewandte Mathematik
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef KALDI_CUDAMATRIX_CU_CHOLESKYKERNELS_H_
|
||||
#define KALDI_CUDAMATRIX_CU_CHOLESKYKERNELS_H_
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
|
||||
#include "base/kaldi-error.h"
|
||||
#include "cudamatrix/cu-choleskykernels-ansi.h"
|
||||
|
||||
/*
|
||||
* In this file are C++ templated wrappers
|
||||
* of the ANSI-C CUDA kernels
|
||||
*/
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
/*********************************************************
|
||||
* base templates
|
||||
*/
|
||||
template<typename Real> inline void cuda_factorize_diagonal_block(Real* A, int block_offset, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_strip_update(Real* A, int block_offset, int n_remaining_blocks, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_diag_update(Real* A, int block_offset, int n_remaining_blocks, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_lo_update(Real* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
/*********************************************************
|
||||
* float specialization
|
||||
*/
|
||||
template<> inline void cuda_factorize_diagonal_block<float>(float* A, int block_offset, MatrixDim d) { cudaF_factorize_diagonal_block(A,block_offset,d); }
|
||||
template<> inline void cuda_strip_update<float>(float* A, int block_offset, int n_remaining_blocks, MatrixDim d) { cudaF_strip_update(A,block_offset,n_remaining_blocks,d); }
|
||||
template<> inline void cuda_diag_update<float>(float* A, int block_offset, int n_remaining_blocks, MatrixDim d) { cudaF_diag_update(A,block_offset,n_remaining_blocks,d); }
|
||||
template<> inline void cuda_lo_update<float>(float* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) { cudaF_lo_update(A,block_offset,n_blocks,n_remaining_blocks,d); }
|
||||
/*********************************************************
|
||||
* double specialization
|
||||
*/
|
||||
template<> inline void cuda_factorize_diagonal_block<double>(double* A, int block_offset, MatrixDim d) { cudaD_factorize_diagonal_block(A,block_offset,d); }
|
||||
template<> inline void cuda_strip_update<double>(double* A, int block_offset, int n_remaining_blocks, MatrixDim d) { cudaD_strip_update(A,block_offset,n_remaining_blocks,d); }
|
||||
template<> inline void cuda_diag_update<double>(double* A, int block_offset, int n_remaining_blocks, MatrixDim d) { cudaD_diag_update(A,block_offset,n_remaining_blocks,d); }
|
||||
template<> inline void cuda_lo_update<double>(double* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) { cudaD_lo_update(A,block_offset,n_blocks,n_remaining_blocks,d); }
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // HAVE_CUDA
|
||||
|
||||
#endif
|
|
@ -0,0 +1,32 @@
|
|||
#ifndef KALDI_CUDAMATRIX_COMMON_H_
|
||||
#define KALDI_CUDAMATRIX_COMMON_H_
|
||||
|
||||
// This file contains some #includes, forward declarations
|
||||
// and typedefs that are needed by all the main header
|
||||
// files in this directory.
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "matrix/kaldi-blas.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
#include "cudamatrix/cu-common.h"
|
||||
|
||||
namespace kaldi {

#if HAVE_CUDA == 1
// Translates a Kaldi transpose flag into the corresponding cuBLAS operation:
// kNoTrans -> CUBLAS_OP_N, kTrans -> CUBLAS_OP_T, and any other value
// -> CUBLAS_OP_C.
cublasOperation_t KaldiTransToCuTrans(MatrixTransposeType kaldi_trans) {
  if (kaldi_trans == kNoTrans) return CUBLAS_OP_N;
  if (kaldi_trans == kTrans) return CUBLAS_OP_T;
  return CUBLAS_OP_C;
}
#endif

} // namespace
|
||||
|
||||
|
||||
#endif // KALDI_CUDAMATRIX_COMMON_H_
|
|
@ -22,20 +22,20 @@
|
|||
|
||||
#ifndef KALDI_CUDAMATRIX_CU_COMMON_H_
|
||||
#define KALDI_CUDAMATRIX_CU_COMMON_H_
|
||||
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
|
||||
#include "cudamatrix/cu-matrixdim.h" // for CU1DBLOCK and CU2DBLOCK
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include "base/kaldi-error.h"
|
||||
#include "matrix/matrix-common.h"
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
#include <cublas.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
#include "base/kaldi-error.h"
|
||||
|
||||
|
||||
#define cuSafeCall(fun) \
|
||||
#define CU_SAFE_CALL(fun) \
|
||||
{ \
|
||||
int32 ret; \
|
||||
if ((ret = (fun)) != 0) { \
|
||||
|
@ -47,19 +47,19 @@
|
|||
|
||||
namespace kaldi {
|
||||
|
||||
/** The size of edge of CUDA square block **/
|
||||
static const int32 CUBLOCK = 16;
|
||||
/** Number of blocks in which the task of size 'size' is splitted **/
|
||||
inline int32 n_blocks(int32 size, int32 block_size) {
|
||||
return size / block_size + ((size % block_size == 0)? 0 : 1);
|
||||
}
|
||||
|
||||
/** Number of blocks in which the task of size 'size' is splitted **/
|
||||
inline int32 n_blocks(int32 size, int32 block_size) {
|
||||
return size / block_size + ((size % block_size == 0)? 0 : 1);
|
||||
}
|
||||
cublasOperation_t KaldiTransToCuTrans(MatrixTransposeType kaldi_trans);
|
||||
|
||||
}
|
||||
|
||||
#endif // HAVE_CUDA
|
||||
|
||||
namespace kaldi {
|
||||
// Some forward declarations, frequently needed
|
||||
// Some forward declarations, needed for friend declarations.
|
||||
template<typename Real> class CuVectorBase;
|
||||
template<typename Real> class CuVector;
|
||||
template<typename Real> class CuSubVector;
|
||||
|
@ -67,7 +67,13 @@ template<typename Real> class CuRand;
|
|||
template<typename Real> class CuMatrixBase;
|
||||
template<typename Real> class CuMatrix;
|
||||
template<typename Real> class CuSubMatrix;
|
||||
template<typename Real> class CuRand;
|
||||
template<typename Real> class CuPackedMatrix;
|
||||
template<typename Real> class CuSpMatrix;
|
||||
template<typename Real> class CuTpMatrix;
|
||||
|
||||
template<typename Real> class CuBlockMatrix; // this has no non-CU counterpart.
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
// cudamatrix/cu-device.cc
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// 2013 Lucas Ondel
|
||||
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
|
@ -19,140 +21,137 @@
|
|||
|
||||
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
#if HAVE_CUDA == 1
|
||||
|
||||
#include <cublas.h>
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <dlfcn.h>
|
||||
#include <unistd.h> // for sleep
|
||||
|
||||
#include "cudamatrix/cu-common.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
#include "base/kaldi-error.h"
|
||||
|
||||
#include "util/common-utils.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
// Default constructor: starts with active_gpu_id_ == -3 and verbose output on.
// NOTE(review): -3 presumably marks "no GPU selection attempted yet" -- confirm.
CuDevice::CuDevice(): active_gpu_id_(-3), verbose_(true) { }
|
||||
|
||||
|
||||
|
||||
// Destructor: shuts CUBLAS down if a GPU was in use; otherwise, if
// active_gpu_id_ is -2, warns that CUDA ended up not being used.
CuDevice::~CuDevice() {
  if (!Enabled()) {
    if (active_gpu_id_ == -2) {
      KALDI_WARN << "CUDA was NOT used! No CUDA GPU detected!";
    }
    return;
  }
  cuSafeCall(cublasShutdown());
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* SelectGpuId(gpu_id)
|
||||
* SelectGpuId(use_gpu)
|
||||
*
|
||||
* The argument 'gpu_id' meaning: 0..N selects a GPU,
|
||||
* -1 disables CUDA, -2 performs GPU auto-detection.
|
||||
* There are 3 'use_gpu' modes for GPU selection:
|
||||
* "yes" -- Select GPU automatically (or get one by exclusive mode)
|
||||
* and die if this fails.
|
||||
* "optional" -- Do as above, but if it fails, back off to CPU.
|
||||
* "no" -- Run on CPU.
|
||||
*
|
||||
* If there is no GPU in the system, and we have GPU auto-detection,
|
||||
* or GPU is manually disabled the computation will run on CPU.
|
||||
* In other cases it is an error (manual selection).
|
||||
* In case of Compute exclusive mode, the GPU is selected by OS.
|
||||
*
|
||||
* In case of Compute exclusive mode, the GPU is selected by OS,
|
||||
* this has priority over manual/auto selection of GPU.
|
||||
* Otherwise GPU selection is based on largest proportion of free memory.
|
||||
* This can eventually lead to multiple processes computing on single GPU,
|
||||
* which is slow. More practical is to use "compute exclusive mode".
|
||||
*
|
||||
* Since the autoselection of GPU is not perfect, it may still
|
||||
* happen that two processes compute on single GPU, which is slow.
|
||||
* The users are advised to use manual selection or exclusive mode.
|
||||
*
|
||||
* This method must be called at the very beginning of the program
|
||||
* (before the cudamatrix objects allocate memory for the data),
|
||||
* or not at all (when we intentionally want to run on the CPU).
|
||||
* This method is to be called at the very beginning of the program
|
||||
* (before first allocation in cudamatrix), or not at all (default to CPU).
|
||||
*
|
||||
*/
|
||||
void CuDevice::SelectGpuId(int32 gpu_id) {
|
||||
void CuDevice::SelectGpuId(std::string use_gpu) {
|
||||
// Possible modes
|
||||
if (use_gpu != "yes" && use_gpu != "no" && use_gpu != "optional") {
|
||||
KALDI_ERR << "Please choose : --use-gpu=yes|no|optional, passed '" << use_gpu << "'";
|
||||
}
|
||||
|
||||
// Make sure this function is not called twice!
|
||||
if(Enabled()) {
|
||||
if (Enabled()) {
|
||||
KALDI_ERR << "There is already an active GPU " << active_gpu_id_
|
||||
<< ", cannot change it on the fly!";
|
||||
}
|
||||
// Allow the GPU to stay disabled
|
||||
if(!Enabled() && gpu_id == -1) {
|
||||
KALDI_LOG << "Selected device: " << gpu_id
|
||||
<< ", we don't even try to get a GPU. We run on CPU.";
|
||||
active_gpu_id_ = -1;
|
||||
if(!Enabled() && use_gpu == "no") {
|
||||
KALDI_LOG << "Manually selected to compute on CPU.";
|
||||
return;
|
||||
}
|
||||
|
||||
// Check that we have a gpu available
|
||||
int32 n_gpu = 0;
|
||||
cudaGetDeviceCount(&n_gpu);
|
||||
if(n_gpu == 0 && gpu_id == -2) {
|
||||
// If we do automatic selection and no GPU is found, we run on a CPU
|
||||
KALDI_WARN << "CUDA will NOT be used!!! No CUDA capable GPU detected...";
|
||||
active_gpu_id_ = -2;
|
||||
return;
|
||||
}
|
||||
// In other cases it is an error, no GPU is an error
|
||||
if(n_gpu == 0) {
|
||||
KALDI_ERR << "No CUDA capable GPU detected, while explicitly asked for gpu-id '"
|
||||
<< gpu_id << "'.";
|
||||
}
|
||||
|
||||
|
||||
//Now we know that there is a GPU in the system,
|
||||
//and we don't want to have it disabled.
|
||||
//
|
||||
//For the GPU selection there are 3 possibilities,
|
||||
//with priorities according to the order:
|
||||
//
|
||||
//1.) We have compute exclusive mode on (GPU is selected by OS)
|
||||
//2.) User did not specify the GPU-id (default value -2),
|
||||
// we will do automatic selection.
|
||||
//3.) User specified the GPU to run on, so we select it.
|
||||
if(IsComputeExclusive()) {
|
||||
//we have the GPU context now...
|
||||
;
|
||||
} else if(gpu_id == -2) {
|
||||
SelectGpuIdAuto();
|
||||
} else {
|
||||
//try to select the desired GPU
|
||||
int32 ret = cudaSetDevice(gpu_id);
|
||||
//handle the possible errors (no recovery!!!)
|
||||
switch(ret) {
|
||||
case cudaSuccess : {
|
||||
//create the GPU context
|
||||
cudaError_t e;
|
||||
e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
|
||||
if(e != cudaSuccess) {
|
||||
KALDI_ERR << "Failed to create CUDA context on a GPU.";
|
||||
}
|
||||
//this was okay, so we are done!
|
||||
KALDI_LOG << "Selected device: " << gpu_id << " (manually)";
|
||||
break;
|
||||
}
|
||||
case cudaErrorInvalidDevice : {
|
||||
int32 n_gpu = 0;
|
||||
cudaGetDeviceCount(&n_gpu);
|
||||
KALDI_ERR << "cudaSetDevice(" << gpu_id << "):"
|
||||
<< " '" << gpu_id << "' is not a VALID CUDA device! "
|
||||
<< " (system has " << n_gpu << " GPUs,"
|
||||
<< " valid IDs 0.." << n_gpu-1 << ")";
|
||||
break;
|
||||
}
|
||||
default :
|
||||
KALDI_ERR << "cudaSetDevice(" << gpu_id << "): "
|
||||
<< "returned " << ret << ", "
|
||||
<< cudaGetErrorString((cudaError_t)ret);
|
||||
if (use_gpu == "yes") {
|
||||
KALDI_ERR << "No CUDA GPU detected!";
|
||||
}
|
||||
if (use_gpu == "optional") {
|
||||
KALDI_WARN << "Running on CPU!!! No CUDA GPU detected...";
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Now we should have an active GPU,
|
||||
// so we can query its name and memory stats
|
||||
// and notify user which GPU is finally used.
|
||||
//
|
||||
// Create a CUDA context : in case of compute-exclusive mode OS selects gpu_id,
|
||||
// or default gpu_id=0. In the case with no free GPUs a context cannot be created
|
||||
// (compute-exclusive mode).
|
||||
//
|
||||
cudaError_t e;
|
||||
e = cudaThreadSynchronize(); //<< CUDA context gets created here.
|
||||
if (e != cudaSuccess) {
|
||||
// So far we don't have a context; sleep a bit and retry.
|
||||
int32 sec_sleep = 2;
|
||||
KALDI_WARN << "Will try again to get a GPU after " << sec_sleep
|
||||
<< " seconds.";
|
||||
sleep(sec_sleep);
|
||||
//
|
||||
e = cudaThreadSynchronize(); //<< 2nd trial to get CUDA context.
|
||||
if (e != cudaSuccess) {
|
||||
if (use_gpu == "yes") {
|
||||
KALDI_ERR << "Failed to create CUDA context, no more unused GPUs?";
|
||||
}
|
||||
if (use_gpu == "optional") {
|
||||
KALDI_WARN << "Running on CPU!!! No more unused CUDA GPUs?";
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Re-assure we have the context
|
||||
KALDI_ASSERT(cudaSuccess == cudaThreadSynchronize());
|
||||
|
||||
// Check if the machine use compute exclusive mode
|
||||
if (IsComputeExclusive()) {
|
||||
FinalizeActiveGpu();
|
||||
return;
|
||||
} else {
|
||||
// Or suggest to use compute exclusive mode
|
||||
if(n_gpu > 1) {
|
||||
KALDI_WARN << "Hint: It is practical to set the GPUs into ``compute exclusive mode''."
|
||||
<< " Selection of free GPUs would be done by OS automatically.";
|
||||
}
|
||||
// And select the GPU according to proportion of free memory
|
||||
if(SelectGpuIdAuto()) {
|
||||
FinalizeActiveGpu();
|
||||
return;
|
||||
} else {
|
||||
// Could not get GPU, after previously having the CUDA context?
|
||||
// Strange but not impossible...
|
||||
if (use_gpu == "yes") {
|
||||
KALDI_ERR << "Error acquiring GPU.";
|
||||
}
|
||||
if (use_gpu == "optional") {
|
||||
KALDI_WARN << "Running on CPU!!! Error acquiring GPU.";
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void CuDevice::FinalizeActiveGpu() {
|
||||
// The device at this point should have active GPU, so we can query its name
|
||||
// and memory stats and notify user which GPU is finally used.
|
||||
|
||||
// Get the device-id of active device:
|
||||
{
|
||||
int32 act_gpu_id;
|
||||
|
@ -164,44 +163,38 @@ void CuDevice::SelectGpuId(int32 gpu_id) {
|
|||
// Remember the id of active GPU
|
||||
active_gpu_id_ = act_gpu_id; //CuDevice::Enabled() is true from now on
|
||||
// Initialize the CUBLAS
|
||||
cuSafeCall(cublasInit());
|
||||
CU_SAFE_CALL(cublasInit());
|
||||
|
||||
// Notify user which GPU is finally used
|
||||
char name[128];
|
||||
DeviceGetName(name,128,act_gpu_id);
|
||||
KALDI_LOG << "The active GPU is [" << act_gpu_id << "]: "
|
||||
<< name << "\t" << GetFreeMemory(NULL, NULL);
|
||||
}
|
||||
|
||||
CU_SAFE_CALL(cudaGetDeviceProperties(&properties_, act_gpu_id));
|
||||
|
||||
KALDI_LOG << "The active GPU is [" << act_gpu_id << "]: " << name << "\t"
|
||||
<< GetFreeMemory(&free_memory_at_startup_, NULL) << " version "
|
||||
<< properties_.major << "." << properties_.minor;
|
||||
|
||||
if (verbose_) PrintMemoryUsage();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
bool CuDevice::DoublePrecisionSupported() {
|
||||
if (!Enabled()) return true;
|
||||
return properties_.major > 1 || (properties_.major == 1 && properties_.minor >= 3);
|
||||
// Double precision is supported from version 1.3
|
||||
}
|
||||
|
||||
|
||||
bool CuDevice::IsComputeExclusive() {
|
||||
// check that we have a gpu
|
||||
int32 n_gpu = 0;
|
||||
cudaGetDeviceCount(&n_gpu);
|
||||
if(n_gpu == 0) {
|
||||
KALDI_LOG << "No CUDA devices found";
|
||||
return false;
|
||||
}
|
||||
|
||||
// Create a GPU context
|
||||
// This will be kept if we detect compute exclusive mode
|
||||
// or released in the other case.
|
||||
//
|
||||
// It does not harm if the function gets called twice,
|
||||
// and the context is already created.
|
||||
cudaError_t e;
|
||||
e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
|
||||
if(e != cudaSuccess) {
|
||||
KALDI_ERR << "Failed to create CUDA context on a GPU. No more unused GPUs in compute exclusive mode?";
|
||||
}
|
||||
|
||||
// assume we already have an CUDA context created
|
||||
KALDI_ASSERT(cudaSuccess == cudaThreadSynchronize());
|
||||
|
||||
// get the device-id and its device-properties
|
||||
int32 gpu_id = -1;
|
||||
e = cudaGetDevice(&gpu_id);
|
||||
cudaError_t e = cudaGetDevice(&gpu_id);
|
||||
if(e != cudaSuccess) {
|
||||
KALDI_ERR << "Failed to get current device";
|
||||
}
|
||||
|
@ -216,12 +209,12 @@ bool CuDevice::IsComputeExclusive() {
|
|||
KALDI_LOG << "CUDA setup operating under Compute Exclusive Mode.";
|
||||
return true;
|
||||
break;
|
||||
#if (CUDA_VERSION >= 4000)
|
||||
#if (CUDA_VERSION >= 4000)
|
||||
case cudaComputeModeExclusiveProcess :
|
||||
KALDI_LOG << "CUDA setup operating under Compute Exclusive Process Mode.";
|
||||
return true;
|
||||
break;
|
||||
#endif
|
||||
#endif
|
||||
default :
|
||||
// The computation mode is not compute-exclusive,
|
||||
// in this case we release the GPU context...
|
||||
|
@ -234,21 +227,20 @@ bool CuDevice::IsComputeExclusive() {
|
|||
}
|
||||
|
||||
|
||||
|
||||
void CuDevice::SelectGpuIdAuto() {
|
||||
// check that we have at least one gpu
|
||||
bool CuDevice::SelectGpuIdAuto() {
|
||||
// Check that we have at least one gpu
|
||||
int32 n_gpu = 0;
|
||||
cudaGetDeviceCount(&n_gpu);
|
||||
if(n_gpu == 0) {
|
||||
KALDI_ERR << "No CUDA devices found";
|
||||
return;
|
||||
KALDI_WARN << "No CUDA devices found";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// The GPU is selected according to maximal free memory ratio
|
||||
std::vector<float> free_mem_ratio(n_gpu+1, 0.0);
|
||||
//get ratios of memory use, if possible
|
||||
// Get ratios of memory use, if possible
|
||||
KALDI_LOG << "Selecting from " << n_gpu << " GPUs";
|
||||
for(int32 n=0; n<n_gpu; n++) {
|
||||
for(int32 n = 0; n < n_gpu; n++) {
|
||||
int32 ret = cudaSetDevice(n);
|
||||
switch(ret) {
|
||||
case cudaSuccess : {
|
||||
|
@ -292,23 +284,22 @@ void CuDevice::SelectGpuIdAuto() {
|
|||
if(free_mem_ratio[n] > free_mem_ratio[max_id]) max_id=n;
|
||||
}
|
||||
//the free_mem_ratio should be bigger than zero
|
||||
if(!free_mem_ratio[max_id] > 0.0) {
|
||||
KALDI_ERR << "No device could be selected (this should never happen)";
|
||||
}
|
||||
KALDI_ASSERT(free_mem_ratio[max_id] > 0.0);
|
||||
|
||||
//finally select the GPU
|
||||
KALDI_LOG << "Selected device: " << max_id << " (automatically)";
|
||||
cuSafeCall(cudaSetDevice(max_id));
|
||||
CU_SAFE_CALL(cudaSetDevice(max_id));
|
||||
//create the context
|
||||
cudaError_t e;
|
||||
e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
|
||||
if(e != cudaSuccess) {
|
||||
KALDI_ERR << "Failed to create CUDA context on a GPU.";
|
||||
KALDI_WARN << "Failed to create CUDA context on a GPU.";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void CuDevice::AccuProfile(const std::string &key, double time) {
|
||||
if (profile_map_.find(key) == profile_map_.end()) {
|
||||
profile_map_[key] = 0.0;
|
||||
|
@ -316,23 +307,35 @@ void CuDevice::AccuProfile(const std::string &key, double time) {
|
|||
profile_map_[key] += time;
|
||||
}
|
||||
|
||||
|
||||
void CuDevice::PrintMemoryUsage() const {
|
||||
if (Enabled()) {
|
||||
int64 free_memory_now;
|
||||
GetFreeMemory(&free_memory_now, NULL);
|
||||
KALDI_LOG << "Memory used: " << (free_memory_at_startup_ - free_memory_now) << " bytes.";
|
||||
}
|
||||
}
|
||||
|
||||
void CuDevice::PrintProfile() {
|
||||
if (verbose_ && Enabled()) {
|
||||
std::ostringstream os;
|
||||
os << "-----\n[cudevice profile]\n";
|
||||
std::map<std::string, double>::iterator it;
|
||||
for(it = profile_map_.begin(); it != profile_map_.end(); ++it) {
|
||||
os << it->first << "\t" << it->second << "s\n";
|
||||
}
|
||||
std::vector<std::pair<double, std::string> > pairs;
|
||||
for(it = profile_map_.begin(); it != profile_map_.end(); ++it)
|
||||
pairs.push_back(std::make_pair(it->second, it->first));
|
||||
std::sort(pairs.begin(), pairs.end());
|
||||
size_t max_print = 15, start_pos = (pairs.size() <= max_print ?
|
||||
0 : pairs.size() - max_print);
|
||||
for (size_t i = start_pos; i < pairs.size(); i++)
|
||||
os << pairs[i].second << "\t" << pairs[i].first << "s\n";
|
||||
os << "-----";
|
||||
KALDI_LOG << os.str();
|
||||
PrintMemoryUsage();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
std::string CuDevice::GetFreeMemory(int64* free, int64* total) {
|
||||
std::string CuDevice::GetFreeMemory(int64* free, int64* total) const {
|
||||
// WARNING! the CUDA API is inconsistent across versions!
|
||||
#if (CUDA_VERSION >= 3020)
|
||||
//define the function signature type
|
||||
|
@ -406,14 +409,354 @@ void CuDevice::DeviceGetName(char* name, int32 len, int32 dev) {
|
|||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////
|
||||
// The instance of the static singleton
|
||||
//
|
||||
CuDevice CuDevice::msDevice;
|
||||
//
|
||||
////////////////////////////////////////////////
|
||||
struct CuAllocatorOptions {
|
||||
int32 count; // Number of times we free and delete a particular size before we
|
||||
// start to cache it.
|
||||
int32 cleanup_interval_bytes;
|
||||
CuAllocatorOptions(): count(1), cleanup_interval_bytes(1000000) { }
|
||||
};
|
||||
|
||||
|
||||
/// We define class CuAllocator inside the .cc file, because we don't want to
|
||||
/// expose it in the header. Its purpose is to hang on to memory that we have
|
||||
/// freed, so that we don't waste time in cudaMalloc and cudaMallocPitch().
|
||||
/// For some reason, they are sometimes very slow.
|
||||
class CuAllocator {
|
||||
public:
|
||||
CuAllocator(const CuAllocatorOptions &opts, CuDevice *device):
|
||||
device_(device), opts_(opts),
|
||||
cleanup_countdown_bytes_(opts.cleanup_interval_bytes) { }
|
||||
|
||||
inline void *Malloc(size_t size);
|
||||
|
||||
inline void *MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch);
|
||||
|
||||
inline void Free(void *ptr);
|
||||
|
||||
~CuAllocator();
|
||||
private:
|
||||
inline void *MallocInternal(size_t row_bytes, size_t num_rows, size_t *pitch);
|
||||
|
||||
// struct MemInfoForSize stores information associated with a particular size
|
||||
// of allocated memory. The row_bytes and num_rows refer to the arguments of
|
||||
// a cudaMallocPitch call; for regular, non-pitch allocations with cudaMalloc,
|
||||
// we make "row_bytes" zero and the size in bytes is "num_rows"... there is a
|
||||
// reason why we do it this way round (make num_rows contain the size in
|
||||
// bytes); it relates to the ordering of the map, and the behavior when
|
||||
// we didn't find the exact size and want to find larger match.
|
||||
|
||||
|
||||
struct MemInfoForSize {
|
||||
size_t row_bytes; // or zero, if a regular CudaMalloc, not
|
||||
// CudaMallocPitch.
|
||||
size_t num_rows; // or the number of rows, if it's a regular CudaMalloc
|
||||
// call, not CudaMallocPitch.
|
||||
size_t pitch; // If CudaMallocPitch, the pitch returned by CudaMallocPitch;
|
||||
// this code assumes (and checks) that it's a deterministic
|
||||
// function of row_bytes and num_rows.
|
||||
size_t countdown; // number that have been freed and not cached.
|
||||
size_t currently_used; // number that are "in the wild".. kept for
|
||||
// diagnostics and error detection.
|
||||
std::vector<void*> freed; // freed and cached...
|
||||
|
||||
MemInfoForSize(size_t row_bytes,
|
||||
size_t num_rows,
|
||||
int32 count):
|
||||
row_bytes(row_bytes),
|
||||
num_rows(num_rows),
|
||||
pitch(0),
|
||||
countdown(count),
|
||||
currently_used(0) { }
|
||||
};
|
||||
|
||||
|
||||
// FindMemInfo returns the MemInfoForSize object for this (row_bytes,
|
||||
// num_rows) combination if it exists; otherwise...
|
||||
// if there is a MemInfoForSize object with the same row_bytes and larger (but
|
||||
// not more than twice larger) num_rows that has freed memory waiting, it
|
||||
// returns that; otherwise, it returns a new MemInfoForSize object for the
|
||||
// requested size).
|
||||
|
||||
inline MemInfoForSize *FindMemInfo(size_t row_bytes,
|
||||
size_t num_rows) {
|
||||
if (row_bytes >= size_to_list_.size())
|
||||
size_to_list_.resize(row_bytes + 1, NULL);
|
||||
|
||||
// note: we set row_bytes to 0 for regular, linear allocation.
|
||||
KALDI_ASSERT(num_rows != 0);
|
||||
|
||||
if (size_to_list_[row_bytes] == NULL)
|
||||
size_to_list_[row_bytes] = new std::map<size_t, MemInfoForSize*>;
|
||||
|
||||
|
||||
std::map<size_t, MemInfoForSize*> &size_to_list = *(size_to_list_[row_bytes]);
|
||||
|
||||
typedef std::map<size_t, MemInfoForSize* >::iterator IterType;
|
||||
|
||||
// get an iterator to the requested object or the next-larger one.
|
||||
// Here, upper_bound(num_rows - 1) returns an object strictly greater
|
||||
// than num_rows - 1, which could be num_rows itself. We need to
|
||||
// treat num_rows == 0 as a special case because of size_t being
|
||||
// unsigned.
|
||||
IterType iter = (num_rows == 0 ? size_to_list.begin() :
|
||||
size_to_list.upper_bound(num_rows - 1));
|
||||
|
||||
if (iter != size_to_list.end() && iter->first == num_rows) {
|
||||
// Found a MemInfoForSize object
|
||||
// with the requested size -> return it.
|
||||
KALDI_ASSERT(iter->second->row_bytes == row_bytes &&
|
||||
iter->second->num_rows == num_rows);
|
||||
return iter->second;
|
||||
} else if (iter != size_to_list.end() &&
|
||||
iter->second->num_rows <= 2 * num_rows &&
|
||||
!iter->second->freed.empty()) {
|
||||
// Return the non-matching one with freed memory, which is larger than
|
||||
// this one but not more than twice larger.
|
||||
KALDI_ASSERT(iter->second->row_bytes == row_bytes &&
|
||||
iter->second->num_rows > num_rows); // confirm expectations.
|
||||
return iter->second;
|
||||
} else {
|
||||
// There was no such object, and the next-larger object either did not
|
||||
// exist, had more than twice the num-rows requested, or had no free
|
||||
// memory -> create an object with the requested size.
|
||||
return (size_to_list[num_rows] = new MemInfoForSize(row_bytes, num_rows,
|
||||
opts_.count));
|
||||
}
|
||||
}
|
||||
|
||||
void PossiblyCleanup(size_t num_bytes);
|
||||
|
||||
// A periodic housekeeping task..
|
||||
void Cleanup();
|
||||
|
||||
// Frees all memory in the "freed" vectors; memory that the
|
||||
// user freed but we held on to. If destroy == true, also
|
||||
// clean up all memory held in the size_to_list_ object (i.e.
|
||||
// allocated maps and MemInfoForSize objects).
|
||||
void ReleaseAllCachedMemory(bool destroy = false);
|
||||
|
||||
CuDevice *device_; // device this is attached to...
|
||||
CuAllocatorOptions opts_;
|
||||
|
||||
|
||||
unordered_map<void*, MemInfoForSize*> addr_to_list_;
|
||||
|
||||
// size_to_list_ is indexed first by row_bytes (which is zero for linear
|
||||
// mallocs) and then by num_rows (which for linear mallocs, is the actual size
|
||||
// in bytes).
|
||||
std::vector<std::map<size_t, MemInfoForSize*>* > size_to_list_;
|
||||
|
||||
int32 cleanup_countdown_bytes_; // countdown in bytes, until the next time we check
|
||||
// whether we should do cleanup
|
||||
};
|
||||
|
||||
|
||||
void* CuAllocator::Malloc(size_t size) {
|
||||
KALDI_ASSERT(size > 0);
|
||||
return MallocInternal(0, size, NULL);
|
||||
}
|
||||
|
||||
void* CuAllocator::MallocPitch(size_t num_rows, size_t row_bytes,
|
||||
size_t *pitch) {
|
||||
KALDI_ASSERT(num_rows > 0 && row_bytes > 0 && pitch != NULL);
|
||||
return MallocInternal(num_rows, row_bytes, pitch);
|
||||
}
|
||||
|
||||
void* CuAllocator::MallocInternal(size_t row_bytes,
|
||||
size_t num_rows,
|
||||
size_t *pitch_out) {
|
||||
// we share the code for standard cudaMalloc and cudaMallocPitch
|
||||
// because most of it is the same. for cudaMalloc, we'll have
|
||||
// row_bytes == 0, and num_rows is just the size to be allocated.
|
||||
KALDI_ASSERT(num_rows != 0 && (row_bytes != 0) == (pitch_out != NULL));
|
||||
|
||||
MemInfoForSize *info = FindMemInfo(row_bytes, num_rows);
|
||||
if (!info->freed.empty()) { // We can satisfy the request with cached,
|
||||
// previously-allocated memory.
|
||||
void *ans = info->freed.back();
|
||||
info->freed.pop_back();
|
||||
info->currently_used++;
|
||||
addr_to_list_[ans] = info;
|
||||
if (pitch_out) *pitch_out = info->pitch;
|
||||
return ans;
|
||||
} else {
|
||||
PossiblyCleanup(row_bytes == 0 ? num_rows : row_bytes * num_rows);
|
||||
void *ans;
|
||||
if (row_bytes == 0) { // Simple malloc request, not "MallocPitch".
|
||||
size_t size = num_rows;
|
||||
int32 ret = cudaMalloc(&ans, size);
|
||||
if (ret != 0) {
|
||||
KALDI_WARN << "Allocation of memory block of " << size << " bytes "
|
||||
<< "failed, releasing cached memory and retrying.";
|
||||
ReleaseAllCachedMemory();
|
||||
ret = cudaMalloc(&ans, size);
|
||||
if (ret != 0)
|
||||
KALDI_WARN << "Allocation failed for the second time. Printing "
|
||||
<< "device memory usage and exiting";
|
||||
device_->PrintMemoryUsage();
|
||||
KALDI_ERR << "Memory allocation failure";
|
||||
}
|
||||
} else {
|
||||
size_t pitch;
|
||||
int32 ret = cudaMallocPitch(&ans, &pitch, row_bytes, num_rows);
|
||||
if (ret != 0) { // allocation failed...
|
||||
KALDI_WARN << "Allocation of " << num_rows << " rows, each of size "
|
||||
<< row_bytes << " bytes failed, releasing cached "
|
||||
<< "memory and retrying.";
|
||||
ReleaseAllCachedMemory();
|
||||
ret = cudaMallocPitch(&ans, &pitch, row_bytes, num_rows);
|
||||
if (ret != 0) {
|
||||
KALDI_WARN << "Allocation failed for the second time. Printing "
|
||||
<< "device memory usage and exiting";
|
||||
device_->PrintMemoryUsage();
|
||||
KALDI_ERR << "Memory allocation failure";
|
||||
}
|
||||
}
|
||||
KALDI_ASSERT(pitch > 0);
|
||||
if (info->pitch == 0) { // First allocation; have not set info->pitch yet.
|
||||
info->pitch = pitch;
|
||||
} else if (pitch != info->pitch) {
|
||||
KALDI_ERR << "Pitch differs between multiple calls with the same "
|
||||
<< "parameters: " << pitch << " vs. " << info->pitch;
|
||||
}
|
||||
*pitch_out = info->pitch;
|
||||
}
|
||||
addr_to_list_[ans] = info;
|
||||
info->currently_used++;
|
||||
return ans;
|
||||
}
|
||||
}
|
||||
|
||||
void CuAllocator::Free(void *addr) {
|
||||
unordered_map<void*, MemInfoForSize*>::iterator iter
|
||||
= addr_to_list_.find(addr);
|
||||
if (iter == addr_to_list_.end()) {
|
||||
KALDI_ERR << "Attempt to free address " << addr << " that was not allocated "
|
||||
<< "by CuDevice::Malloc() (or was previously freed);";
|
||||
}
|
||||
MemInfoForSize *info = iter->second;
|
||||
addr_to_list_.erase(addr); // Erase this element in the addr_to_list_ map.
|
||||
info->currently_used--;
|
||||
if (info->countdown == 0) { // We have freed [i.e. actually freed with
|
||||
// CudaFree()] enough of these that we think
|
||||
// we're wasting too much time this way and
|
||||
// need to start caching them.
|
||||
info->freed.push_back(addr);
|
||||
} else { // Actually free the address, and decrease "countdown".
|
||||
info->countdown--;
|
||||
CU_SAFE_CALL(cudaFree(addr)); // This is how we free, even if allocated with
|
||||
// cudaMallocPitch().
|
||||
}
|
||||
}
|
||||
|
||||
void CuAllocator::ReleaseAllCachedMemory(bool destroy) {
|
||||
KALDI_VLOG(2) << "Releasing all cached memory.";
|
||||
for (size_t i = 0; i < size_to_list_.size(); i++) {
|
||||
if (size_to_list_[i] == NULL)
|
||||
continue;
|
||||
typedef std::map<size_t, MemInfoForSize*>::iterator IterType;
|
||||
for (IterType iter = size_to_list_[i]->begin();
|
||||
iter != size_to_list_[i]->end(); ++iter) {
|
||||
MemInfoForSize *info = iter->second;
|
||||
if (destroy && !info->freed.empty()) {
|
||||
// When called from the destructor at program end, if verbose level is
|
||||
// high, say the sizes we had.
|
||||
if (info->row_bytes == 0) {
|
||||
KALDI_VLOG(3) << "Releasing " << info->freed.size() << " blocks of "
|
||||
<< info->num_rows << " bytes.";
|
||||
} else {
|
||||
KALDI_VLOG(3) << "Releasing " << info->freed.size()
|
||||
<< " 2-d blocks of " << info->num_rows << " rows of "
|
||||
<< info->row_bytes << " bytes each.";
|
||||
}
|
||||
}
|
||||
if (!destroy) {
|
||||
// We only do this freeing part when we're *not* called from the
|
||||
// destuctor (destroy = false). This leads to a crash when called from
|
||||
// the destructor, with cudaFree returning "unload of CUDA runtime
|
||||
// failed". Presumably this has to do with the destruction order of
|
||||
// C++, which we can't really control.
|
||||
while (!info->freed.empty()) {
|
||||
CU_SAFE_CALL(cudaFree(info->freed.back()));
|
||||
info->freed.pop_back();
|
||||
}
|
||||
}
|
||||
if (destroy)
|
||||
delete info;
|
||||
}
|
||||
if (destroy) {
|
||||
delete size_to_list_[i];
|
||||
size_to_list_[i] = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Placeholder: currently does nothing.  It is invoked from PossiblyCleanup()
// each time the cleanup countdown is exhausted.
void CuAllocator::Cleanup() {
  // TODO: implement this or remove it (and also PossiblyCleanup).
  // Actually we may never implement this, as just calling
  // ReleaseAllCachedMemory whenever an allocation fails is probably
  // sufficient.
}
|
||||
void CuAllocator::PossiblyCleanup(size_t num_bytes) {
|
||||
if (static_cast<size_t>(cleanup_countdown_bytes_) <= num_bytes) {
|
||||
Cleanup();
|
||||
cleanup_countdown_bytes_ = opts_.cleanup_interval_bytes;
|
||||
} else {
|
||||
cleanup_countdown_bytes_ -= static_cast<int32>(num_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
// Destructor.  First reports (as warnings) every size-class that still has
// blocks handed out to the user and never returned via Free(), then deletes
// the allocator's bookkeeping through ReleaseAllCachedMemory(true).
CuAllocator::~CuAllocator() {
  // Check that nothing was allocated by the user and not freed.  Several
  // outstanding addresses may share one record, so gather the distinct
  // records first.
  std::set<MemInfoForSize*> unfreed_set;
  typedef unordered_map<void*, MemInfoForSize *>::iterator IterType;
  for (IterType iter = addr_to_list_.begin(); iter != addr_to_list_.end();
       ++iter)
    unfreed_set.insert(iter->second);
  for (std::set<MemInfoForSize*>::iterator iter = unfreed_set.begin();
       iter != unfreed_set.end(); ++iter) {
    MemInfoForSize *info = *iter;
    KALDI_ASSERT(info->currently_used > 0);  // Or should not be in this set
                                             // (code error or memory corruption)
    // A record with row_bytes == 0 describes a plain (non-pitched)
    // allocation whose size in bytes is stored in num_rows; this is the same
    // convention the allocation and cache-release code use.
    if (info->row_bytes == 0) {
      KALDI_WARN << info->currently_used << " memory chunks of size "
                 << info->num_rows << " were allocated and not freed.";
    } else {
      KALDI_WARN << info->currently_used << " memory chunks of size "
                 << info->row_bytes << " per row, and " << info->num_rows
                 << " rows, were allocated and not freed.";
    }
  }

  bool destroy = true;
  ReleaseAllCachedMemory(destroy);
}
|
||||
|
||||
// Replacement for cudaFree(): forwards the pointer to the allocator.
void CuDevice::Free(void *ptr) {
  allocator_->Free(ptr);
}
|
||||
|
||||
void* CuDevice::MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) {
|
||||
return allocator_->MallocPitch(row_bytes, num_rows, pitch);
|
||||
}
|
||||
|
||||
void* CuDevice::Malloc(size_t size) {
|
||||
return allocator_->Malloc(size);
|
||||
}
|
||||
|
||||
// Constructor (private; used for the static singleton instance).  Starts
// with active_gpu_id_ == -1 (Enabled() returns false for that value),
// verbose output switched on, and a freshly created allocator that is given
// default options and a pointer back to this device.
CuDevice::CuDevice()
    : active_gpu_id_(-1),
      verbose_(true),
      allocator_(new CuAllocator(CuAllocatorOptions(), this)) {
}
|
||||
|
||||
|
||||
// Destructor: destroys the allocator first and then, if a GPU is in use,
// shuts down CUBLAS.
CuDevice::~CuDevice() {
  // Deleting a null pointer is a no-op, so no explicit NULL test is needed.
  delete allocator_;
  if (Enabled()) {
    CU_SAFE_CALL(cublasShutdown());
  }
}
|
||||
|
||||
// The instance of the static singleton
|
||||
CuDevice CuDevice::global_device_;
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -22,75 +22,105 @@
|
|||
#ifndef KALDI_CUDAMATRIX_CU_DEVICE_H_
|
||||
#define KALDI_CUDAMATRIX_CU_DEVICE_H_
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
#if HAVE_CUDA == 1
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
class CuAllocator; // Forward declaration.
|
||||
|
||||
/**
|
||||
* Singleton object which represents CUDA device
|
||||
* responsible for CUBLAS initialisation, collects profiling info
|
||||
*/
|
||||
class CuDevice {
|
||||
// Singleton interface...
|
||||
private:
|
||||
CuDevice();
|
||||
CuDevice(CuDevice&);
|
||||
CuDevice &operator=(CuDevice&);
|
||||
|
||||
// Singleton object (there should only be one instantiated per program)
|
||||
public:
|
||||
~CuDevice();
|
||||
static CuDevice& Instantiate() {
|
||||
return msDevice;
|
||||
}
|
||||
static inline CuDevice& Instantiate() { return global_device_; }
|
||||
|
||||
private:
|
||||
static CuDevice msDevice;
|
||||
// We provide functions Malloc, MallocPitch and Free which replace cudaMalloc,
|
||||
// cudaMallocPitch and cudaFree. Their function is to cache the results of
|
||||
// previous allocations to avoid the very large overhead that CUDA's
|
||||
// allocation seems to give for some setups.
|
||||
void* Malloc(size_t size);
|
||||
|
||||
void* MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch);
|
||||
|
||||
void Free(void *ptr);
|
||||
|
||||
/// Select a GPU for computation, the 'use_gpu' modes are:
|
||||
/// "yes" -- Select GPU automatically and die if this fails.
|
||||
/// "optional" -- Do as above, but if it fails, back off to CPU.
|
||||
/// "no" -- Run on CPU.
|
||||
/// (more comments in cu-device.cc)
|
||||
void SelectGpuId(std::string use_gpu);
|
||||
|
||||
|
||||
/**********************************/
|
||||
// Instance interface
|
||||
public:
|
||||
|
||||
/// Check if the CUDA device is selected for use
|
||||
bool Enabled() {
|
||||
/// Check if the CUDA GPU is selected for use
|
||||
bool Enabled() const {
|
||||
return (active_gpu_id_ > -1);
|
||||
}
|
||||
|
||||
/// Manually select GPU by id (more comments in cu-device.cc)
|
||||
void SelectGpuId(int32 gpu_id);
|
||||
/// Get the active GPU id
|
||||
int32 ActiveGpuId() {
|
||||
return active_gpu_id_;
|
||||
}
|
||||
|
||||
void Verbose(bool verbose) {
|
||||
verbose_ = verbose;
|
||||
}
|
||||
/// Returns true if either we have no GPU, or we have a GPU
|
||||
/// and it supports double precision.
|
||||
bool DoublePrecisionSupported();
|
||||
|
||||
void SetVerbose(bool verbose) { verbose_ = verbose; }
|
||||
|
||||
/// Sum the IO time
|
||||
void AccuProfile(const std::string &key, double time);
|
||||
void PrintProfile();
|
||||
|
||||
void PrintMemoryUsage() const;
|
||||
|
||||
void ResetProfile() {
|
||||
profile_map_.clear();
|
||||
}
|
||||
|
||||
/// Get the actual GPU memory use stats
|
||||
std::string GetFreeMemory(int64* free = NULL, int64* total = NULL);
|
||||
std::string GetFreeMemory(int64* free = NULL, int64* total = NULL) const;
|
||||
/// Get the name of the GPU
|
||||
void DeviceGetName(char* name, int32 len, int32 dev);
|
||||
|
||||
private:
|
||||
/// Check if the GPU run in compute exclusive mode
|
||||
bool IsComputeExclusive();
|
||||
/// Automatically select GPU
|
||||
void SelectGpuIdAuto();
|
||||
CuDevice();
|
||||
CuDevice(CuDevice&); // Disallow.
|
||||
CuDevice &operator=(CuDevice&); // Disallow.
|
||||
|
||||
static CuDevice global_device_;
|
||||
|
||||
/// Check if the GPU runs in compute exclusive mode.  Returns true if it is
|
||||
/// running in compute exclusive mode and we have a GPU. Returns false
|
||||
/// otherwise. Sets error to true if there was some error, such as that we
|
||||
/// were running in compute exclusive modes but no GPUs available; otherwise
|
||||
/// sets it to false.
|
||||
bool IsComputeExclusive();
|
||||
|
||||
/// Automatically select GPU and get CUDA context. Returns true on success.
|
||||
bool SelectGpuIdAuto();
|
||||
|
||||
/// Try to get CUDA context on manually selected GPU. Return true on success.
|
||||
bool SelectGpuIdManual(int32 gpu_id);
|
||||
|
||||
void FinalizeActiveGpu();
|
||||
|
||||
/// Should only be called if Enabled() == true.
|
||||
int32 MajorDeviceVersion();
|
||||
|
||||
/// Should only be called if Enabled() == true.
|
||||
int32 MinorDeviceVersion();
|
||||
|
||||
private:
|
||||
std::map<std::string, double> profile_map_;
|
||||
|
||||
/// active_gpu_id_ values:
|
||||
|
@ -99,14 +129,20 @@ class CuDevice {
|
|||
/// -1 SelectGpuId was called, but the GPU was manually disabled
|
||||
/// 0..N Normal GPU IDs
|
||||
int32 active_gpu_id_;
|
||||
///
|
||||
|
||||
int64 free_memory_at_startup_;
|
||||
|
||||
cudaDeviceProp properties_;
|
||||
|
||||
bool verbose_;
|
||||
|
||||
CuAllocator *allocator_;
|
||||
|
||||
}; // class CuDevice
|
||||
|
||||
|
||||
}// namespace
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // HAVE_CUDA
|
||||
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
// cudamatrix/cu-kernels-ansi.h
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
// 2013 Hainan Xu
|
||||
// 2013 Xiaohui Zhang
|
||||
// 2013 Johns Hopkins University (author: Guoguo Chen)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
|
@ -25,8 +29,7 @@
|
|||
|
||||
#include "cudamatrix/cu-matrixdim.h"
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
extern "C" {
|
||||
|
||||
/*********************************************************
|
||||
|
@ -43,13 +46,39 @@ void cudaI32_set_const(dim3 Gr, dim3 Bl, int32_cuda *mat, int32_cuda value, Matr
|
|||
/*
|
||||
* CuMatrix
|
||||
*/
|
||||
void cudaF_copy_upp_low(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA);
|
||||
void cudaF_copy_low_upp(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA);
|
||||
void cudaF_add_diag_vec_mat(dim3 Gr, dim3 Bl, float alpha, float *mat, MatrixDim mat_dim,
|
||||
const float *vec, const float *mat2, int mat2_row_stride,
|
||||
int mat2_col_stride, float beta);
|
||||
void cudaF_copy_from_tp_trans(int Gr, int Bl, float* A, const float* B, MatrixDim dmat);
|
||||
void cudaFD_copy_from_tp_trans(int Gr, int Bl, float* A, const double* B, MatrixDim dmat);
|
||||
void cudaF_copy_from_tp(int Gr, int Bl, float* A, const float* B, MatrixDim dmat);
|
||||
void cudaFD_copy_from_tp(int Gr, int Bl, float* A, const double* B, MatrixDim dmat);
|
||||
void cudaF_copy_col_from_vec(int Gr, int Bl, float* mat, const float* v, int col, MatrixDim d);
|
||||
void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
|
||||
void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d);
|
||||
void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
|
||||
void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim d);
|
||||
void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
|
||||
void cudaF_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
|
||||
void cudaF_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim d);
|
||||
void cudaF_set_diag(int Gr, int Bl, float* mat, float value, MatrixDim d);
|
||||
void cudaF_set_diag_packed(int Gr, int Bl, float* mat, float value, int dim);
|
||||
void cudaF_add_diag_packed(int Gr, int Bl, float* mat, float value, int dim);
|
||||
void cudaF_set_const(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d);
|
||||
void cudaF_set_zero_above_diag(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
|
||||
void cudaF_add(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d);
|
||||
void cudaF_add_vec2(dim3 Gr, dim3 Bl, float* mat, const float* vec, const float alpha, int dim);
|
||||
void cudaF_scale_diag(int Gr, int Bl, float* mat, float value, int dim);
|
||||
void cudaF_scale(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d);
|
||||
void cudaF_apply_log(dim3 Gr, dim3 Bl, float *mat, MatrixDim d);
|
||||
void cudaF_mul_elements(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim d);
|
||||
void cudaF_mul_elements(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim dst_d, int src_stride);
|
||||
void cudaF_max(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim dst_d, int src_stride);
|
||||
void cudaF_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d);
|
||||
void cudaF_mul_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d);
|
||||
void cudaF_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size);
|
||||
void cudaF_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size, float power);
|
||||
void cudaF_div_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *vec_div, MatrixDim d);
|
||||
void cudaF_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *A, float beta, float *dst, MatrixDim d);
|
||||
void cudaF_add_vec_to_cols(dim3 Gr, dim3 Bl, float alpha, const float *col, float beta, float *dst, MatrixDim d);
|
||||
|
@ -58,29 +87,82 @@ void cudaF_add_vec_to_rows(dim3 Gr, dim3 Bl, float alpha, const float *row, floa
|
|||
/*
|
||||
* CuVector
|
||||
*/
|
||||
void cudaF_replace_value(int Gr, int Bl, float *v, int dim, float orig, float changed);
|
||||
void cudaF_set_bias_params(int Gr, int Bl, float* v, const float* a, float param_1, float param_2, float param_3, int* flag, int dim);
|
||||
void cudaF_copy_from_vec_df(int Gr, int Bl, double* v_out, const float* v_in, int dim);
|
||||
void cudaF_copy_from_vec_fd(int Gr, int Bl, float* v_out, const float* v_in, int dim);
|
||||
void cudaF_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim);
|
||||
void cudaF_vec_soft_max(int Gr, int Bl, float* v, int dim);
|
||||
void cudaF_vec_min(const float* v, float* value, int dim);
|
||||
void cudaF_vec_max(const float* v, float* value, int dim);
|
||||
void cudaF_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value);
|
||||
void cudaF_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value);
|
||||
void cudaF_add_diag_mat_trans(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim);
|
||||
void cudaF_add_diag_mat(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim);
|
||||
void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M,
|
||||
int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride,
|
||||
int N_col_stride, int threads_per_element, float beta);
|
||||
void cudaF_add_vec_vec(int Gr, int Bl, float alpha, float* v, const float* x, const float* y, float beta, int dim);
|
||||
void cudaF_copy_col_from_mat(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim);
|
||||
void cudaF_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const float* mat, MatrixDim dmat, int dim);
|
||||
void cudaF_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim);
|
||||
void cudaF_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc);
|
||||
void cudaF_pvec_sum(int Gr, int Bl, float* vec, float* pvec_sum, int dim, int size);
|
||||
void cudaF_vec_copy_diag_from_packed(int Gr, int Bl, float *dst, const float *src, int dim);
|
||||
void cudaF_vec_apply_floor(int Gr, int Bl, float* v, float floor_val, float* num, int dim);
|
||||
void cudaF_vec_apply_exp(int Gr, int Bl, float* v, int dim);
|
||||
void cudaF_vec_apply_log(int Gr, int Bl, float* v, float* flag, int dim);
|
||||
void cudaF_trace(int Gr, int Bl, float* mat, float* value, int dim);
|
||||
void cudaF_add_row_sum_mat(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d);
|
||||
void cudaF_add_col_sum_mat(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d);
|
||||
void cudaF_invert_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim d);
|
||||
|
||||
// Note: B_trans is nonzero if B is transposed.
|
||||
void cudaF_add_mat_blockmat(dim3 Gr, dim3 Bl, float *data, MatrixDim d, const float *Adata,
|
||||
int A_num_rows, int A_num_cols, int A_row_stride, int A_col_stride,
|
||||
const CuBlockMatrixData *B_cu_data, int B_num_blocks,
|
||||
float alpha, float beta, int B_trans);
|
||||
void cudaF_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int num_blocks,
|
||||
const float *C_data, int C_num_cols, int C_row_stride, int C_col_stride,
|
||||
const float *D_data, int D_row_stride, int D_col_stride,
|
||||
float alpha, float beta);
|
||||
/*
|
||||
* cu::
|
||||
*/
|
||||
void cudaF_softmax(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d);
|
||||
void cudaF_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d, int src_stride);
|
||||
void cudaF_softmax_part(dim3 Gr, dim3 Bl, const float *X, const int32_cuda *vec_ids, float* Y, MatrixDim d);
|
||||
void cudaF_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d);
|
||||
void cudaF_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d);
|
||||
void cudaF_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d);
|
||||
void cudaF_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride);
|
||||
void cudaF_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power);
|
||||
void cudaF_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride);
|
||||
void cudaF_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int src_stride);
|
||||
void cudaF_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride);
|
||||
void cudaF_diff_tanh(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d);
|
||||
|
||||
void cudaF_regularize_l1(dim3 Gr, dim3 Bl, float *wei, float *grad, float l1, float lr, MatrixDim d);
|
||||
void cudaF_find_row_max_id(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d);
|
||||
void cudaF_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, float *mat_net_out, float *vec_log_post, MatrixDim d);
|
||||
void cudaF_copy_rows_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, const float *v_in);
|
||||
|
||||
void cudaF_randomize(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in);
|
||||
void cudaF_splice(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in);
|
||||
void cudaF_one(int Gr, int Bl, float* x, int dim);
|
||||
void cudaF_copy(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in);
|
||||
void cudaF_copy_from_sp(int Gr, int Bl, const float* x, float* y, int d_in, MatrixDim d_out);
|
||||
void cudaF_take_lower(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in);
|
||||
void cudaF_take_upper(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in);
|
||||
void cudaF_take_mean(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in);
|
||||
void cudaF_comp_obj_deriv(dim3 Gr,dim3 Bl, MatrixElement<float>* x, int s, const float* z, MatrixDim d, float* z2, MatrixDim d2, float* t);
|
||||
void cudaF_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
|
||||
void cudaF_sy_add_tr2(dim3 Gr, dim3 Bl, float alpha, float beta, const float* T, MatrixDim tdim,
|
||||
float *S, MatrixDim sdim);
|
||||
void cudaF_sum_column_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim,
|
||||
const float *src_data, MatrixDim src_dim,
|
||||
const Int32Pair *indices);
|
||||
void cudaF_matrix_lookup(dim3 Gr, dim3 Bl, const float *data, MatrixDim dim,
|
||||
const Int32Pair *indices, int indices_size,
|
||||
float *output);
|
||||
|
||||
|
||||
|
||||
/*********************************************************
|
||||
* double CUDA kernel calls
|
||||
*/
|
||||
|
@ -88,13 +170,39 @@ void cudaF_copy(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *co
|
|||
/*
|
||||
* CuMatrix
|
||||
*/
|
||||
void cudaD_copy_upp_low(dim3 Gr, dim3 Bl, double* A, MatrixDim dimB);
|
||||
void cudaD_copy_low_upp(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA);
|
||||
void cudaD_add_diag_vec_mat(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim,
|
||||
const double *vec, const double *mat2, int mat2_row_stride,
|
||||
int mat2_col_stride, double beta);
|
||||
void cudaD_copy_from_tp_trans(int Gr, int Bl, double* A, const double* B, MatrixDim dmat);
|
||||
void cudaDF_copy_from_tp_trans(int Gr, int Bl, double* A, const float* B, MatrixDim dmat);
|
||||
void cudaD_copy_from_tp(int Gr, int Bl, double* A, const double* B, MatrixDim dmat);
|
||||
void cudaDF_copy_from_tp(int Gr, int Bl, double* A, const float* B, MatrixDim dmat);
|
||||
void cudaD_copy_col_from_vec(int Gr, int Bl, double* mat, const double* v, int col, MatrixDim d);
|
||||
void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
|
||||
void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d);
|
||||
void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
|
||||
void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim d);
|
||||
void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
|
||||
void cudaD_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
|
||||
void cudaD_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val, MatrixDim d);
|
||||
void cudaD_set_diag(int Gr, int Bl, double* mat, double value, MatrixDim d);
|
||||
void cudaD_set_diag_packed(int Gr, int Bl, double* mat, double value, int dim);
|
||||
void cudaD_add_diag_packed(int Gr, int Bl, double* mat, double value, int dim);
|
||||
void cudaD_set_const(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d);
|
||||
void cudaD_set_zero_above_diag(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
|
||||
void cudaD_add(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d);
|
||||
void cudaD_add_vec2(dim3 Gr, dim3 Bl, double *mat, const double *vec, const double alpha, int dim);
|
||||
void cudaD_scale_diag(int Gr, int Bl, double* mat, double value, int dim);
|
||||
void cudaD_scale(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d);
|
||||
void cudaD_apply_log(dim3 Gr, dim3 Bl, double *mat, MatrixDim d);
|
||||
void cudaD_mul_elements(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim d);
|
||||
void cudaD_mul_elements(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride);
|
||||
void cudaD_max(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride);
|
||||
void cudaD_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d);
|
||||
void cudaD_mul_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d);
|
||||
void cudaD_mul_rows_group_mat(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size);
|
||||
void cudaD_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size, double power);
|
||||
void cudaD_div_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *vec_div, MatrixDim d);
|
||||
void cudaD_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *A, double beta, double *dst, MatrixDim d);
|
||||
void cudaD_add_vec_to_cols(dim3 Gr, dim3 Bl, double alpha, const double *col, double beta, double *dst, MatrixDim d);
|
||||
|
@ -103,31 +211,101 @@ void cudaD_add_vec_to_rows(dim3 Gr, dim3 Bl, double alpha, const double *row, do
|
|||
/*
|
||||
* CuVector
|
||||
*/
|
||||
void cudaD_replace_value(int Gr, int Bl, double *v, int dim, double orig, double changed);
|
||||
void cudaD_set_bias_params(int Gr, int Bl, double* v, const double* a, double param_1, double param_2, double param_3, int* flag, int dim);
|
||||
void cudaD_copy_from_vec_df(int Gr, int Bl, double* v_out, const double* v_in, int dim);
|
||||
void cudaD_copy_from_vec_fd(int Gr, int Bl, float* v_out, const double* v_in, int dim);
|
||||
void cudaD_vec_mul_elements(int Gr, int Bl, double* v, const double* a, int dim);
|
||||
void cudaD_vec_soft_max(int Gr, int Bl, double* v, int dim);
|
||||
void cudaD_vec_min(const double* v, double* value, int dim);
|
||||
void cudaD_vec_max(const double* v, double* value, int dim);
|
||||
void cudaD_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value);
|
||||
void cudaD_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value);
|
||||
void cudaD_add_diag_mat_trans(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim);
|
||||
void cudaD_add_diag_mat(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim);
|
||||
void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M,
|
||||
int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride,
|
||||
int N_col_stride, int threads_per_element, double beta);
|
||||
void cudaD_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, const double* y, double beta, int dim);
|
||||
void cudaD_copy_col_from_mat(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim);
|
||||
void cudaD_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim);
|
||||
void cudaD_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const double* mat, MatrixDim dmat, int dim);
|
||||
void cudaD_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc);
|
||||
void cudaD_pvec_sum(int Gr, int Bl, double* vec, double* pvec_sum, int dim, int size);
|
||||
void cudaD_vec_copy_diag_from_packed(int Gr, int Bl, double *dst, const double *src, int dim);
|
||||
void cudaD_vec_apply_floor(int Gr, int Bl, double* v, double floor_val, float* num, int dim);
|
||||
void cudaD_vec_apply_exp(int Gr, int Bl, double* v, int dim);
|
||||
void cudaD_vec_apply_log(int Gr, int Bl, double* v, double* flag, int dim);
|
||||
void cudaD_trace(int Gr, int Bl, double* mat, double* value, int dim);
|
||||
void cudaD_add_row_sum_mat(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d);
|
||||
void cudaD_add_col_sum_mat(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d);
|
||||
void cudaD_invert_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim d);
|
||||
// note: B_trans is nonzero if B is transposed.
|
||||
void cudaD_add_mat_blockmat(dim3 Gr, dim3 Bl, double *data, MatrixDim d, const double *Adata,
|
||||
int A_num_rows, int A_num_cols, int A_row_stride, int A_col_stride,
|
||||
const CuBlockMatrixData *B_cu_data, int B_num_blocks,
|
||||
double alpha, double beta, int B_trans);
|
||||
void cudaD_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int num_blocks,
|
||||
const double *C_data, int C_num_cols, int C_row_stride, int C_col_stride,
|
||||
const double *D_data, int D_row_stride, int D_col_stride,
|
||||
double alpha, double beta);
|
||||
|
||||
|
||||
/*
|
||||
* cu::
|
||||
*/
|
||||
void cudaD_softmax(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d);
|
||||
void cudaD_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d, int src_stride);
|
||||
void cudaD_softmax_part(dim3 Gr, dim3 Bl, const double *X, const int32_cuda *vec_ids, double* Y, MatrixDim d);
|
||||
void cudaD_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d);
|
||||
void cudaD_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d);
|
||||
void cudaD_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d);
|
||||
void cudaD_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride);
|
||||
void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power);
|
||||
void cudaD_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride);
|
||||
void cudaD_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int src_stride);
|
||||
void cudaD_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride);
|
||||
void cudaD_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d);
|
||||
|
||||
void cudaD_regularize_l1(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1, double lr, MatrixDim d);
|
||||
void cudaD_find_row_max_id(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d);
|
||||
void cudaD_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, double *mat_net_out, double *vec_log_post, MatrixDim d);
|
||||
void cudaD_copy_rows_from_vec(dim3 Gr, dim3 Bl, double *mat_out, MatrixDim d_out, const double *v_in);
|
||||
|
||||
void cudaD_randomize(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in);
|
||||
void cudaD_splice(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in);
|
||||
void cudaD_one(int Gr, int Bl, double* x, int dim);
|
||||
void cudaD_copy(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in);
|
||||
void cudaD_copy_from_sp(int Gr, int Bl, const double* x, double* y, int d_in, MatrixDim d_out);
|
||||
void cudaD_take_lower(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in);
|
||||
void cudaD_take_upper(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in);
|
||||
void cudaD_take_mean(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in);
|
||||
|
||||
|
||||
// some mostly mixed-type kernels.
|
||||
void cuda_copy_from_mat_df(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in);
|
||||
void cuda_copy_from_mat_ff(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in);
|
||||
void cuda_copy_from_mat_fd(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in);
|
||||
void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in);
|
||||
void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in);
|
||||
void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in);
|
||||
void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in);
|
||||
void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in);
|
||||
|
||||
void cudaD_comp_obj_deriv(dim3 Gr,dim3 Bl, MatrixElement<double>* x, int s, const double* z, MatrixDim d, double* z2, MatrixDim d2, double* t);
|
||||
|
||||
void cudaD_transpose_matrix(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
|
||||
void cudaD_sy_add_tr2(dim3 Gr, dim3 Bl, double alpha, double beta, const double* T, MatrixDim tdim,
|
||||
double *S, MatrixDim sdim);
|
||||
void cudaD_sum_column_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim,
|
||||
const double *src_data, MatrixDim src_dim,
|
||||
const Int32Pair *indices);
|
||||
void cudaD_matrix_lookup(dim3 Gr, dim3 Bl, const double *data, MatrixDim dim,
|
||||
const Int32Pair *indices, int indices_size,
|
||||
double *output);
|
||||
|
||||
|
||||
|
||||
} // extern "C"
|
||||
|
||||
#endif // HAVE_CUDA
|
||||
|
||||
|
||||
#endif
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -1,6 +1,11 @@
|
|||
// cudamatrix/cu-kernels.h
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// 2013 Ehsan Variani
|
||||
// 2014 Johns Hopkins University (author: Daniel Povey)
|
||||
// 2013 Hainan Xu
|
||||
// 2013 Xiaohui Zhang
|
||||
// 2013 Johns Hopkins University (author: Guoguo Chen)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
|
@ -22,7 +27,7 @@
|
|||
#ifndef KALDI_CUDAMATRIX_CU_KERNELS_H_
|
||||
#define KALDI_CUDAMATRIX_CU_KERNELS_H_
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
#if HAVE_CUDA == 1
|
||||
|
||||
#include "base/kaldi-error.h"
|
||||
#include "cudamatrix/cu-kernels-ansi.h"
|
||||
|
@ -34,147 +39,366 @@
|
|||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
|
||||
/*********************************************************
|
||||
* base templates
|
||||
*/
|
||||
|
||||
/*
|
||||
* CuMatrix
|
||||
*/
|
||||
// Primary template for cuda_set_const. No generic implementation is provided:
// an instantiation that is not explicitly specialized reports an error
// through KALDI_ERR, tagged with the function name.
template<typename Real>
inline void cuda_set_const(dim3 Gr, dim3 Bl, Real *mat, Real value, MatrixDim d) {
  KALDI_ERR << __func__ << " Not implemented!";
}
|
||||
template<typename Real> inline void cuda_add(dim3 Gr, dim3 Bl, Real *mat, Real value, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_scale(dim3 Gr, dim3 Bl, Real *mat, Real value, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_apply_log(dim3 Gr, dim3 Bl, Real *mat, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_mul_elements(dim3 Gr, dim3 Bl, Real *mat, const Real *A, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, Real *mat, const Real *scale, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, Real *mat, const Real *scale, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_div_rows_vec(dim3 Gr, dim3 Bl, Real *mat, const Real *vec_div, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
// Primary template for cuda_add_mat. It only reports "Not implemented!" via
// KALDI_ERR; usable behaviour comes from the explicit specializations.
template<typename Real>
inline void cuda_add_mat(dim3 Gr, dim3 Bl, Real alpha, const Real *A,
                         Real beta, Real *dst, MatrixDim d) {
  KALDI_ERR << __func__ << " Not implemented!";
}
|
||||
template<typename Real> inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, Real alpha, const Real *col, Real beta, Real *dst, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, Real alpha, const Real *row, Real beta, Real *dst, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
|
||||
/*
|
||||
* CuVector
|
||||
*/
|
||||
template<typename Real> inline void cuda_add_row_sum_mat(dim3 Gr, dim3 Bl, const Real *mat, Real *vec_sum, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_add_col_sum_mat(dim3 Gr, dim3 Bl, const Real *mat, Real *vec_sum, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_invert_elements(dim3 Gr, dim3 Bl, Real *data, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
|
||||
template<typename Real> inline void cuda_sigmoid(dim3 Gr, dim3 Bl, Real *y, const Real *x, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, Real *eout, const Real *e, const Real *y, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_tanh(dim3 Gr, dim3 Bl, Real *y, const Real *x, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, Real *eout, const Real *e, const Real *y, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
// Primary template for cuda_softmax. Note that, unlike most wrappers in this
// header, the launch configuration is taken as size_t rather than dim3.
// The generic version only reports an error through KALDI_ERR.
template<typename Real>
inline void cuda_softmax(size_t Gr, size_t Bl, Real *y, const Real *x, MatrixDim d) {
  KALDI_ERR << __func__ << " Not implemented!";
}
|
||||
template<typename Real> inline void cuda_softmax_part(dim3 Gr, dim3 Bl, const Real *X, const int32_cuda *vec_ids, Real* Y, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
|
||||
template<typename Real> inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, Real *wei, Real *grad, Real l1, Real lr, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const Real *mat, Real *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, Real *mat_net_out, Real *vec_log_post, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
|
||||
template<typename Real> inline void cuda_randomize(dim3 Gr, dim3 Bl, Real *y, const Real *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_splice(dim3 Gr, dim3 Bl, Real *y, const Real *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
// Primary template for cuda_copy. The generic version has no kernel to call
// and only reports "Not implemented!" through KALDI_ERR.
template<typename Real>
inline void cuda_copy(dim3 Gr, dim3 Bl, Real *y, const Real *x,
                      const int32_cuda *copy_from,
                      MatrixDim d_out, MatrixDim d_in) {
  KALDI_ERR << __func__ << " Not implemented!";
}
|
||||
|
||||
|
||||
|
||||
/*********************************************************
|
||||
* float specializations
|
||||
*/
|
||||
|
||||
/*
|
||||
* CuMatrix
|
||||
*/
|
||||
// Explicit float specialization of cuda_set_const: passes every argument
// through unchanged to cudaF_set_const.
template<>
inline void cuda_set_const<float>(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) {
  cudaF_set_const(Gr, Bl, mat, value, d);
}
|
||||
template<> inline void cuda_add<float>(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_add(Gr,Bl,mat,value,d); }
|
||||
template<> inline void cuda_scale<float>(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_scale(Gr,Bl,mat,value,d); }
|
||||
template<> inline void cuda_apply_log<float>(dim3 Gr, dim3 Bl, float *mat, MatrixDim d) { cudaF_apply_log(Gr,Bl,mat,d); }
|
||||
template<> inline void cuda_mul_elements<float>(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim d) { cudaF_mul_elements(Gr,Bl,mat,A,d); }
|
||||
template<> inline void cuda_mul_cols_vec<float>(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_cols_vec(Gr,Bl,mat,scale,d); }
|
||||
template<> inline void cuda_mul_rows_vec<float>(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_rows_vec(Gr,Bl,mat,scale,d); }
|
||||
template<> inline void cuda_div_rows_vec<float>(dim3 Gr, dim3 Bl, float *mat, const float *vec_div, MatrixDim d) { cudaF_div_rows_vec(Gr,Bl,mat,vec_div,d); }
|
||||
// Explicit float specialization of cuda_add_mat: passes every argument
// through unchanged to cudaF_add_mat.
template<>
inline void cuda_add_mat<float>(dim3 Gr, dim3 Bl, float alpha, const float *A,
                                float beta, float *dst, MatrixDim d) {
  cudaF_add_mat(Gr, Bl, alpha, A, beta, dst, d);
}
|
||||
template<> inline void cuda_add_vec_to_cols<float>(dim3 Gr, dim3 Bl, float alpha, const float *col, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); }
|
||||
template<> inline void cuda_add_vec_to_rows<float>(dim3 Gr, dim3 Bl, float alpha, const float *row, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_rows(Gr,Bl,alpha,row,beta,dst,d); }
|
||||
|
||||
// Float overload: forwards to cudaF_copy_upp_low with the same arguments.
inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA) {
  cudaF_copy_upp_low(Gr, Bl, A, dimA);
}
|
||||
// Float overload: forwards to cudaF_copy_low_upp with the same arguments.
inline void cuda_copy_low_upp(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA) {
  cudaF_copy_low_upp(Gr, Bl, A, dimA);
}
|
||||
// Float overload: forwards to cudaF_add_diag_vec_mat. Arguments are passed
// in the same order as received; mat2 is described by separate row and
// column strides.
inline void cuda_add_diag_vec_mat(dim3 Gr, dim3 Bl,
                                  float alpha,
                                  float *mat, MatrixDim mat_dim,
                                  const float *vec,
                                  const float *mat2,
                                  int mat2_row_stride,
                                  int mat2_col_stride,
                                  float beta) {
  cudaF_add_diag_vec_mat(Gr, Bl, alpha, mat, mat_dim, vec, mat2,
                         mat2_row_stride, mat2_col_stride, beta);
}
|
||||
// float <- float overload: forwards to cudaF_copy_from_tp_trans.
inline void cuda_copy_from_tp_trans(int Gr, int Bl, float* A, const float* B, MatrixDim dmat) {
  cudaF_copy_from_tp_trans(Gr, Bl, A, B, dmat);
}
|
||||
// float <- double overload: the mixed-precision case is routed to
// cudaFD_copy_from_tp_trans.
inline void cuda_copy_from_tp_trans(int Gr, int Bl, float* A, const double* B, MatrixDim dmat) {
  cudaFD_copy_from_tp_trans(Gr, Bl, A, B, dmat);
}
|
||||
// float <- float overload: forwards to cudaF_copy_from_tp.
inline void cuda_copy_from_tp(int Gr, int Bl, float* A, const float* B, MatrixDim dmat) {
  cudaF_copy_from_tp(Gr, Bl, A, B, dmat);
}
|
||||
// float <- double overload: the mixed-precision case is routed to
// cudaFD_copy_from_tp.
inline void cuda_copy_from_tp(int Gr, int Bl, float* A, const double* B, MatrixDim dmat) {
  cudaFD_copy_from_tp(Gr, Bl, A, B, dmat);
}
|
||||
|
||||
// Overload selected for a float destination and a double source;
// dispatches to cuda_copy_from_mat_fd.
inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl,
                               float* mat_out, const double* mat_in,
                               MatrixDim d_out, MatrixDim d_in) {
  cuda_copy_from_mat_fd(Gr, Bl, mat_out, mat_in, d_out, d_in);
}
|
||||
// Overload selected for a float destination and a float source;
// dispatches to cuda_copy_from_mat_ff.
inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl,
                               float* mat_out, const float* mat_in,
                               MatrixDim d_out, MatrixDim d_in) {
  cuda_copy_from_mat_ff(Gr, Bl, mat_out, mat_in, d_out, d_in);
}
|
||||
// Overload selected for a double destination and a double source;
// dispatches to cuda_copy_from_mat_dd.
inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl,
                               double* mat_out, const double* mat_in,
                               MatrixDim d_out, MatrixDim d_in) {
  cuda_copy_from_mat_dd(Gr, Bl, mat_out, mat_in, d_out, d_in);
}
|
||||
// Overload selected for a double destination and a float source;
// dispatches to cuda_copy_from_mat_df.
inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl,
                               double* mat_out, const float* mat_in,
                               MatrixDim d_out, MatrixDim d_in) {
  cuda_copy_from_mat_df(Gr, Bl, mat_out, mat_in, d_out, d_in);
}
|
||||
|
||||
// Overload selected for a float destination and a double source;
// dispatches to cuda_copy_from_mat_fd_trans.
inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl,
                                     float* mat_out, const double* mat_in,
                                     MatrixDim d_out, MatrixDim d_in) {
  cuda_copy_from_mat_fd_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
}
|
||||
// Overload selected for a float destination and a float source;
// dispatches to cuda_copy_from_mat_ff_trans.
inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl,
                                     float* mat_out, const float* mat_in,
                                     MatrixDim d_out, MatrixDim d_in) {
  cuda_copy_from_mat_ff_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
}
|
||||
// Overload selected for a double destination and a double source;
// dispatches to cuda_copy_from_mat_dd_trans.
inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl,
                                     double* mat_out, const double* mat_in,
                                     MatrixDim d_out, MatrixDim d_in) {
  cuda_copy_from_mat_dd_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
}
|
||||
// Overload selected for a double destination and a float source;
// dispatches to cuda_copy_from_mat_df_trans.
inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl,
                                     double* mat_out, const float* mat_in,
                                     MatrixDim d_out, MatrixDim d_in) {
  cuda_copy_from_mat_df_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
}
|
||||
|
||||
// Float overload: forwards to cudaF_copy_col_from_vec. Gr and Bl are plain
// ints here rather than dim3.
inline void cuda_copy_col_from_vec(int Gr, int Bl, float* mat, const float* v,
                                   int col, MatrixDim d) {
  cudaF_copy_col_from_vec(Gr, Bl, mat, v, col, d);
}
|
||||
inline void cuda_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_apply_exp(Gr,Bl,mat,d); }
|
||||
inline void cuda_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim dim) { cudaF_apply_pow(Gr,Bl,mat,power,dim); }
|
||||
inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim dim) { cudaF_apply_heaviside(Gr,Bl,mat,dim); }
|
||||
inline void cuda_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim dim) { cudaF_apply_floor(Gr,Bl,mat,floor_val,dim); }
|
||||
inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim dim) { cudaF_apply_ceiling(Gr,Bl,mat,ceiling_val,dim); }
|
||||
// Float overload: forwards to cudaF_copy_cols. The reorder index array,
// destination dimensions and source stride are passed through untouched.
inline void cuda_copy_cols(dim3 Gr, dim3 Bl,
                           float* dst, const float* src,
                           const MatrixIndexT_cuda* reorder,
                           MatrixDim dst_dim, int src_stride) {
  cudaF_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
|
||||
// Float overload: forwards to cudaF_copy_rows. The reorder index array,
// destination dimensions and source stride are passed through untouched.
inline void cuda_copy_rows(dim3 Gr, dim3 Bl,
                           float* dst, const float* src,
                           const MatrixIndexT_cuda* reorder,
                           MatrixDim dst_dim, int src_stride) {
  cudaF_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
|
||||
// Float overload: forwards to cudaF_trace. The result is delivered through
// the `value` pointer; the wrapper itself returns nothing.
inline void cuda_trace(int Gr, int Bl, float* mat, float* value, int dim) {
  cudaF_trace(Gr, Bl, mat, value, dim);
}
|
||||
inline void cuda_set_diag(int Gr, int Bl, float* mat, float value, MatrixDim d) { cudaF_set_diag(Gr,Bl,mat,value,d); }
|
||||
inline void cuda_set_diag_packed(int Gr, int Bl, float* mat, float value, int dim) { cudaF_set_diag_packed(Gr,Bl,mat,value,dim); }
|
||||
inline void cuda_add_diag_packed(int Gr, int Bl, float* mat, float value, int dim) { cudaF_add_diag_packed(Gr,Bl,mat,value,dim); }
|
||||
inline void cuda_set_const(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_set_const(Gr,Bl,mat,value,d); }
|
||||
inline void cuda_set_zero_above_diag(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_set_zero_above_diag(Gr,Bl,mat,d); }
|
||||
inline void cuda_add(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_add(Gr,Bl,mat,value,d); }
|
||||
inline void cuda_add_vec2(dim3 Gr, dim3 Bl, float *mat, const float *vec, const float alpha, int dim) { cudaF_add_vec2(Gr,Bl,mat,vec,alpha,dim); }
|
||||
inline void cuda_scale_diag(int Gr, int Bl, float* mat, float value, int dim) { cudaF_scale_diag(Gr,Bl,mat,value,dim); }
|
||||
inline void cuda_scale(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_scale(Gr,Bl,mat,value,d); }
|
||||
inline void cuda_apply_log(dim3 Gr, dim3 Bl, float *mat, MatrixDim d) { cudaF_apply_log(Gr,Bl,mat,d); }
|
||||
// Float overload taking an explicit source stride: forwards to
// cudaF_mul_elements with the arguments in the order received.
inline void cuda_mul_elements(dim3 Gr, dim3 Bl,
                              float *mat, const float *A,
                              MatrixDim dst_d, int src_stride) {
  cudaF_mul_elements(Gr, Bl, mat, A, dst_d, src_stride);
}
|
||||
// Float overload: forwards to cudaF_max with the arguments in the order
// received (destination matrix, source matrix, destination dims, source stride).
inline void cuda_max(dim3 Gr, dim3 Bl,
                     float *mat, const float *A,
                     MatrixDim dst_d, int src_stride) {
  cudaF_max(Gr, Bl, mat, A, dst_d, src_stride);
}
|
||||
inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_cols_vec(Gr,Bl,mat,scale,d); }
|
||||
inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_rows_vec(Gr,Bl,mat,scale,d); }
|
||||
// Float overload: forwards to cudaF_mul_rows_group_mat.
inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl,
                                    float *y, const float *x,
                                    MatrixDim d, int src_stride,
                                    int group_size) {
  cudaF_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size);
}
|
||||
// Float overload: forwards to cudaF_calc_pnorm_deriv. Takes two read-only
// inputs (x1, x2) and writes through y.
inline void cuda_calc_pnorm_deriv(dim3 Gr, dim3 Bl,
                                  float *y, const float *x1, const float *x2,
                                  MatrixDim d, int src_stride,
                                  int group_size, float power) {
  cudaF_calc_pnorm_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size, power);
}
|
||||
inline void cuda_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *A, float beta, float *dst, MatrixDim d) { cudaF_add_mat(Gr,Bl,alpha,A,beta,dst,d); }
|
||||
inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, float alpha, const float *col, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); }
|
||||
inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, float alpha, const float *row, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_rows(Gr,Bl,alpha,row,beta,dst,d); }
|
||||
inline void cuda_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_transpose_matrix(Gr, Bl, mat, d); }
|
||||
// Float overload: forwards to cudaF_sy_add_tr2. T is read-only with
// dimensions tdim; S is writable with dimensions sdim.
inline void cuda_sy_add_tr2(dim3 Gr, dim3 Bl,
                            float alpha, float beta,
                            const float* T, MatrixDim tdim,
                            float *S, MatrixDim sdim) {
  cudaF_sy_add_tr2(Gr, Bl, alpha, beta, T, tdim, S, sdim);
}
|
||||
|
||||
|
||||
/*
|
||||
* CuVector
|
||||
*/
|
||||
template<> inline void cuda_add_row_sum_mat<float>(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d) { cudaF_add_row_sum_mat(Gr,Bl,mat,vec_sum,d); }
|
||||
template<> inline void cuda_add_col_sum_mat<float>(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d) { cudaF_add_col_sum_mat(Gr,Bl,mat,vec_sum,d); }
|
||||
template<> inline void cuda_invert_elements<float>(dim3 Gr, dim3 Bl, float *data, MatrixDim d) { cudaF_invert_elements(Gr,Bl,data,d); }
|
||||
// Float overload: forwards to cudaF_replace_value with the vector, its
// length, and the (orig, changed) value pair.
inline void cuda_replace_value(int Gr, int Bl, float *v, int dim,
                               float orig, float changed) {
  cudaF_replace_value(Gr, Bl, v, dim, orig, changed);
}
|
||||
inline void cuda_div_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *vec_div, MatrixDim d) { cudaF_div_rows_vec(Gr,Bl,mat,vec_div,d); }
|
||||
// Float overload: forwards to cudaF_set_bias_params. `flag` is a writable
// int pointer handed to the kernel launcher unchanged.
inline void cuda_set_bias_params(int Gr, int Bl, float* v, const float* a,
                                 float param_1, float param_2, float param_3,
                                 int* flag, int dim) {
  cudaF_set_bias_params(Gr, Bl, v, a, param_1, param_2, param_3, flag, dim);
}
|
||||
// double <- float vector copy: forwards to cudaF_copy_from_vec_df.
inline void cuda_copy_from_vec_df(int Gr, int Bl, double* v_out,
                                  const float* v_in, int dim) {
  cudaF_copy_from_vec_df(Gr, Bl, v_out, v_in, dim);
}
|
||||
// Forwards to cudaF_copy_from_vec_fd.
// NOTE(review): despite the "_fd" suffix, both v_out and v_in are float
// pointers in this overload -- confirm this matches the kernel declaration.
inline void cuda_copy_from_vec_fd(int Gr, int Bl, float* v_out,
                                  const float* v_in, int dim) {
  cudaF_copy_from_vec_fd(Gr, Bl, v_out, v_in, dim);
}
|
||||
inline void cuda_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim) { cudaF_vec_mul_elements(Gr,Bl,v,a,dim); }
|
||||
inline void cuda_vec_soft_max(int Gr, int Bl, float* v, int dim) { cudaF_vec_soft_max(Gr,Bl,v,dim); }
|
||||
// Float overload: forwards to cudaF_vec_min. Unlike most wrappers here it
// takes no Gr/Bl launch arguments; the result goes through `value`.
inline void cuda_vec_min(const float* v, float* value, int dim) {
  cudaF_vec_min(v, value, dim);
}
|
||||
// Float overload: forwards to cudaF_vec_max. Unlike most wrappers here it
// takes no Gr/Bl launch arguments; the result goes through `value`.
inline void cuda_vec_max(const float* v, float* value, int dim) {
  cudaF_vec_max(v, value, dim);
}
|
||||
// Float overload: forwards to cudaF_trace_mat_mat_trans. No Gr/Bl launch
// arguments are taken; the result goes through `value`.
inline void cuda_trace_mat_mat_trans(const float* A, const float* B,
                                     MatrixDim dA, int B_stride,
                                     float* value) {
  cudaF_trace_mat_mat_trans(A, B, dA, B_stride, value);
}
|
||||
// Float overload: forwards to cudaF_trace_mat_mat. No Gr/Bl launch
// arguments are taken; the result goes through `value`.
inline void cuda_trace_mat_mat(const float* A, const float* B,
                               MatrixDim dA, int B_stride,
                               float* value) {
  cudaF_trace_mat_mat(A, B, dA, B_stride, value);
}
|
||||
// Float overload: forwards to cudaF_add_diag_mat_trans.
inline void cuda_add_diag_mat_trans(int Gr, int Bl, float alpha, float* v,
                                    const float* mat, float beta,
                                    MatrixDim dmat, int dim) {
  cudaF_add_diag_mat_trans(Gr, Bl, alpha, v, mat, beta, dmat, dim);
}
|
||||
// Float overload: forwards to cudaF_add_diag_mat_mat. M and N are each
// described by a pointer plus separate row and column strides; all
// arguments are passed along in the order received.
inline void cuda_add_diag_mat_mat(int Gr, int Bl,
                                  float alpha, float* v, int v_dim,
                                  const float* M, int M_cols,
                                  int M_row_stride, int M_col_stride,
                                  const float *N,
                                  int N_row_stride, int N_col_stride,
                                  int threads_per_element, float beta) {
  cudaF_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim,
                         M, M_cols, M_row_stride, M_col_stride,
                         N, N_row_stride, N_col_stride,
                         threads_per_element, beta);
}
|
||||
// Float overload: forwards to cudaF_add_diag_mat.
inline void cuda_add_diag_mat(int Gr, int Bl, float alpha, float* v,
                              const float* mat, float beta,
                              MatrixDim dmat, int dim) {
  cudaF_add_diag_mat(Gr, Bl, alpha, v, mat, beta, dmat, dim);
}
|
||||
// Float overload: forwards to cudaF_add_vec_vec. v is writable; x and y
// are read-only inputs.
inline void cuda_add_vec_vec(int Gr, int Bl, float alpha, float* v,
                             const float* x, const float* y,
                             float beta, int dim) {
  cudaF_add_vec_vec(Gr, Bl, alpha, v, x, y, beta, dim);
}
|
||||
inline void cuda_copy_col_from_mat(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat(Gr,Bl,v,col,mat,dmat,dim); }
|
||||
// double <- float column copy: forwards to cudaF_copy_col_from_mat_df.
inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col,
                                      const float* mat, MatrixDim dmat,
                                      int dim) {
  cudaF_copy_col_from_mat_df(Gr, Bl, v, col, mat, dmat, dim);
}
|
||||
// Forwards to cudaF_copy_col_from_mat_fd.
// NOTE(review): despite the "_fd" suffix, both v and mat are float pointers
// in this overload -- confirm this matches the kernel declaration.
inline void cuda_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col,
                                      const float* mat, MatrixDim dmat,
                                      int dim) {
  cudaF_copy_col_from_mat_fd(Gr, Bl, v, col, mat, dmat, dim);
}
|
||||
inline void cuda_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc) { cudaF_vec_sum(Gr,Bl,v,value,dim,inc); }
|
||||
inline void cuda_pvec_sum(int Gr, int Bl, float* vec, float* pvec_sum, int dim, int size) { cudaF_pvec_sum(Gr, Bl, vec, pvec_sum, dim, size); }
|
||||
inline void cuda_vec_copy_diag_from_packed(int Gr, int Bl, float *dst, const float *src, int dim) { cudaF_vec_copy_diag_from_packed(Gr,Bl,dst,src,dim); }
|
||||
inline void cuda_vec_apply_floor(int Gr, int Bl, float* v, float floor_val, float* num, int dim) { cudaF_vec_apply_floor(Gr,Bl,v,floor_val,num,dim); }
|
||||
inline void cuda_vec_apply_exp(int Gr, int Bl, float* v, int dim) { cudaF_vec_apply_exp(Gr,Bl,v,dim); }
|
||||
inline void cuda_vec_apply_log(int Gr, int Bl, float* v, float* flag, int dim) { cudaF_vec_apply_log(Gr,Bl,v,flag,dim); }
|
||||
inline void cuda_add_row_sum_mat(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d) { cudaF_add_row_sum_mat(Gr,Bl,mat,vec_sum,d); }
|
||||
inline void cuda_add_col_sum_mat(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d) { cudaF_add_col_sum_mat(Gr,Bl,mat,vec_sum,d); }
|
||||
inline void cuda_invert_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim d) { cudaF_invert_elements(Gr,Bl,data,d); }
|
||||
// B_trans is nonzero if B is transposed.
|
||||
// Float overload: forwards to cudaF_add_mat_blockmat. A is given as a raw
// pointer with its row/column counts and strides; B is given as an array of
// CuBlockMatrixData with B_num_blocks entries. B_trans is passed through as
// an int flag.
inline void cuda_add_mat_blockmat(dim3 Gr, dim3 Bl,
                                  float *data, MatrixDim d,
                                  const float *Adata,
                                  int A_num_rows, int A_num_cols,
                                  int A_row_stride, int A_col_stride,
                                  const CuBlockMatrixData *B_cu_data,
                                  int B_num_blocks,
                                  float alpha, float beta, int B_trans) {
  cudaF_add_mat_blockmat(Gr, Bl, data, d, Adata,
                         A_num_rows, A_num_cols, A_row_stride, A_col_stride,
                         B_cu_data, B_num_blocks, alpha, beta, B_trans);
}
|
||||
// Float overload: forwards to cudaF_block_add_mat_mat. B_cu_data is the
// writable block-matrix descriptor array; C and D are read-only inputs,
// each with separate row and column strides.
inline void cuda_block_add_mat_mat(dim3 Gr, dim3 Bl,
                                   CuBlockMatrixData *B_cu_data,
                                   int num_blocks,
                                   const float *C_data, int C_num_cols,
                                   int C_row_stride, int C_col_stride,
                                   const float *D_data,
                                   int D_row_stride, int D_col_stride,
                                   float alpha, float beta) {
  cudaF_block_add_mat_mat(Gr, Bl, B_cu_data, num_blocks,
                          C_data, C_num_cols, C_row_stride, C_col_stride,
                          D_data, D_row_stride, D_col_stride,
                          alpha, beta);
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* cu::
|
||||
*/
|
||||
template<> inline void cuda_sigmoid<float>(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d) { cudaF_sigmoid(Gr,Bl,y,x,d); }
|
||||
template<> inline void cuda_diff_sigmoid<float>(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d) { cudaF_diff_sigmoid(Gr,Bl,eout,e,y,d); }
|
||||
template<> inline void cuda_tanh<float>(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d) { cudaF_tanh(Gr,Bl,y,x,d); }
|
||||
template<> inline void cuda_diff_tanh<float>(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d) { cudaF_diff_tanh(Gr,Bl,eout,e,y,d); }
|
||||
template<> inline void cuda_softmax<float>(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d) { cudaF_softmax(Gr,Bl,y,x,d); }
|
||||
template<> inline void cuda_softmax_part<float>(dim3 Gr, dim3 Bl, const float *X, const int32_cuda *vec_ids, float* Y, MatrixDim d) { cudaF_softmax_part(Gr,Bl,X,vec_ids,Y,d); }
|
||||
inline void cuda_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_soft_hinge(Gr,Bl,y,x,d,src_stride); }
|
||||
// Float overload: forwards to cudaF_group_pnorm.
inline void cuda_group_pnorm(dim3 Gr, dim3 Bl,
                             float *y, const float *x,
                             MatrixDim d, int src_stride,
                             int group_size, float power) {
  cudaF_group_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power);
}
|
||||
inline void cuda_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_sigmoid(Gr,Bl,y,x,d,src_stride); }
|
||||
inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int src_stride) { cudaF_diff_sigmoid(Gr,Bl,eout,e,y,d,src_stride); }
|
||||
inline void cuda_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_tanh(Gr,Bl,y,x,d,src_stride); }
|
||||
inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d) { cudaF_diff_tanh(Gr,Bl,eout,e,y,d); }
|
||||
inline void cuda_softmax(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d) { cudaF_softmax(Gr,Bl,y,x,d); }
|
||||
/*
|
||||
Bl: the dimBlock value is fixed at min(d.col, CU1DBLOCK), meaning that CU1DBLOCK threads reduce a row at the same time.
|
||||
Gr: the number of rows
|
||||
*/
|
||||
// Float overload: forwards to cudaF_softmax_reduce. The launch
// configuration is taken as size_t values rather than dim3.
inline void cuda_softmax_reduce(size_t Gr, size_t Bl,
                                float *y, const float *x,
                                MatrixDim d, int src_stride) {
  cudaF_softmax_reduce(Gr, Bl, y, x, d, src_stride);
}
|
||||
inline void cuda_softmax_part(dim3 Gr, dim3 Bl, const float *X, const int32_cuda *vec_ids, float* Y, MatrixDim d) { cudaF_softmax_part(Gr,Bl,X,vec_ids,Y,d); }
|
||||
|
||||
template<> inline void cuda_regularize_l1<float>(dim3 Gr, dim3 Bl, float *wei, float *grad, float l1, float lr, MatrixDim d) { cudaF_regularize_l1(Gr,Bl,wei,grad,l1,lr,d); }
|
||||
template<> inline void cuda_find_row_max_id<float>(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaF_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); }
|
||||
template<> inline void cuda_diff_xent<float>(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, float *mat_net_out, float *vec_log_post, MatrixDim d) { cudaF_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d); }
|
||||
|
||||
template<> inline void cuda_randomize<float>(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaF_randomize(Gr,Bl,y,x,copy_from,d_out,d_in); }
|
||||
|
||||
template<> inline void cuda_splice<float>(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { cudaF_splice(Gr,Bl,y,x,off,d_out,d_in); }
|
||||
template<> inline void cuda_copy<float>(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaF_copy(Gr,Bl,y,x,copy_from,d_out,d_in); }
|
||||
inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, float *wei, float *grad, float l1, float lr, MatrixDim d) { cudaF_regularize_l1(Gr,Bl,wei,grad,l1,lr,d); }
|
||||
inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaF_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); }
|
||||
inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, float *mat_net_out, float *vec_log_post, MatrixDim d) { cudaF_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d); }
|
||||
// Float overload: forwards to cudaF_copy_rows_from_vec. Note the argument
// order: output matrix and its dimensions first, then the input vector.
inline void cuda_copy_rows_from_vec(dim3 Gr, dim3 Bl,
                                    float *mat_out, MatrixDim d_out,
                                    const float *v_in) {
  cudaF_copy_rows_from_vec(Gr, Bl, mat_out, d_out, v_in);
}
|
||||
|
||||
|
||||
/*********************************************************
|
||||
* double specializations
|
||||
*/
|
||||
inline void cuda_randomize(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaF_randomize(Gr,Bl,y,x,copy_from,d_out,d_in); }
|
||||
|
||||
inline void cuda_splice(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { cudaF_splice(Gr,Bl,y,x,off,d_out,d_in); }
|
||||
inline void cuda_one(int Gr,int Bl,float* x,int dim) { cudaF_one(Gr,Bl,x,dim); }
|
||||
inline void cuda_copy(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaF_copy(Gr,Bl,y,x,copy_from,d_out,d_in); }
|
||||
inline void cuda_copy_from_sp(int Gr, int Bl, const float* x, float* y, int d_in, MatrixDim d_out) { cudaF_copy_from_sp(Gr,Bl,x,y,d_in,d_out); }
|
||||
inline void cuda_take_lower(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in) { cudaF_take_lower(Gr,Bl,x,y,d_in); }
|
||||
inline void cuda_take_upper(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in) { cudaF_take_upper(Gr,Bl,x,y,d_in); }
|
||||
inline void cuda_take_mean(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in) { cudaF_take_mean(Gr,Bl,x,y,d_in); }
|
||||
// Float overload: forwards to cudaF_comp_obj_deriv. x is an array of
// MatrixElement<float> whose length is passed as `size`.
inline void cuda_comp_obj_deriv(dim3 Gr, dim3 Bl,
                                MatrixElement<float>* x, int32 size,
                                const float* z, MatrixDim d,
                                float* z2, MatrixDim d2,
                                float* t) {
  cudaF_comp_obj_deriv(Gr, Bl, x, size, z, d, z2, d2, t);
}
|
||||
// Float overload: forwards to cudaF_sum_column_ranges. `data` is the
// writable destination, `src_data` the read-only source, and `indices` a
// read-only array of Int32Pair.
inline void cuda_sum_column_ranges(dim3 Gr, dim3 Bl,
                                   float *data, MatrixDim dim,
                                   const float *src_data, MatrixDim src_dim,
                                   const Int32Pair *indices) {
  cudaF_sum_column_ranges(Gr, Bl, data, dim, src_data, src_dim, indices);
}
|
||||
// Float overload: forwards to cudaF_matrix_lookup. `data` and `indices`
// are read-only; results are written through `output`.
inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl,
                               const float *data, MatrixDim dim,
                               const Int32Pair *indices, int indices_size,
                               float *output) {
  cudaF_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output);
}
|
||||
|
||||
|
||||
// double versions
|
||||
|
||||
/*
|
||||
* CuMatrix
|
||||
*/
|
||||
// Explicit double specialization of cuda_set_const: passes every argument
// through unchanged to cudaD_set_const.
template<>
inline void cuda_set_const<double>(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) {
  cudaD_set_const(Gr, Bl, mat, value, d);
}
|
||||
template<> inline void cuda_add<double>(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_add(Gr,Bl,mat,value,d); }
|
||||
template<> inline void cuda_scale<double>(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_scale(Gr,Bl,mat,value,d); }
|
||||
template<> inline void cuda_apply_log<double>(dim3 Gr, dim3 Bl, double *mat, MatrixDim d) { cudaD_apply_log(Gr,Bl,mat,d); }
|
||||
template<> inline void cuda_mul_elements<double>(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim d) { cudaD_mul_elements(Gr,Bl,mat,A,d); }
|
||||
template<> inline void cuda_mul_cols_vec<double>(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d) { cudaD_mul_cols_vec(Gr,Bl,mat,scale,d); }
|
||||
template<> inline void cuda_mul_rows_vec<double>(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d) { cudaD_mul_rows_vec(Gr,Bl,mat,scale,d); }
|
||||
template<> inline void cuda_div_rows_vec<double>(dim3 Gr, dim3 Bl, double *mat, const double *vec_div, MatrixDim d) { cudaD_div_rows_vec(Gr,Bl,mat,vec_div,d); }
|
||||
// Explicit double specialization of cuda_add_mat: passes every argument
// through unchanged to cudaD_add_mat.
template<>
inline void cuda_add_mat<double>(dim3 Gr, dim3 Bl, double alpha, const double *A,
                                 double beta, double *dst, MatrixDim d) {
  cudaD_add_mat(Gr, Bl, alpha, A, beta, dst, d);
}
|
||||
template<> inline void cuda_add_vec_to_cols<double>(dim3 Gr, dim3 Bl, double alpha, const double *col, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); }
|
||||
template<> inline void cuda_add_vec_to_rows<double>(dim3 Gr, dim3 Bl, double alpha, const double *row, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_rows(Gr,Bl,alpha,row,beta,dst,d); }
|
||||
|
||||
inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA) { cudaD_copy_upp_low(Gr, Bl, A, dimA); }
|
||||
inline void cuda_copy_low_upp(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA) { cudaD_copy_low_upp(Gr, Bl, A, dimA); }
|
||||
// Double overload: forwards to cudaD_add_diag_vec_mat. Arguments are passed
// in the same order as received; mat2 is described by separate row and
// column strides.
inline void cuda_add_diag_vec_mat(dim3 Gr, dim3 Bl,
                                  double alpha,
                                  double *mat, MatrixDim mat_dim,
                                  const double *vec,
                                  const double *mat2,
                                  int mat2_row_stride,
                                  int mat2_col_stride,
                                  double beta) {
  cudaD_add_diag_vec_mat(Gr, Bl, alpha, mat, mat_dim, vec, mat2,
                         mat2_row_stride, mat2_col_stride, beta);
}
|
||||
inline void cuda_copy_from_tp_trans(int Gr, int Bl, double* A, const double* B, MatrixDim dmat) { cudaD_copy_from_tp_trans(Gr,Bl,A,B,dmat); }
|
||||
inline void cuda_copy_from_tp_trans(int Gr, int Bl, double* A, const float* B, MatrixDim dmat) { cudaDF_copy_from_tp_trans(Gr,Bl,A,B,dmat); }
|
||||
inline void cuda_copy_from_tp(int Gr, int Bl, double* A, const double* B, MatrixDim dmat) { cudaD_copy_from_tp(Gr,Bl,A,B,dmat); }
|
||||
inline void cuda_copy_from_tp(int Gr, int Bl, double* A, const float* B, MatrixDim dmat) { cudaDF_copy_from_tp(Gr,Bl,A,B,dmat); }
|
||||
inline void cuda_copy_col_from_vec(int Gr, int Bl, double* mat, const double* v, int col, MatrixDim d) { cudaD_copy_col_from_vec(Gr,Bl,mat,v,col,d); }
|
||||
inline void cuda_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { cudaD_apply_exp(Gr,Bl,mat,d); }
|
||||
inline void cuda_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim dim) { cudaD_apply_pow(Gr,Bl,mat,power,dim); }
|
||||
inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim dim) { cudaD_apply_heaviside(Gr,Bl,mat,dim); }
|
||||
inline void cuda_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim dim) { cudaD_apply_floor(Gr,Bl,mat,floor_val,dim); }
|
||||
inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val, MatrixDim dim) { cudaD_apply_ceiling(Gr,Bl,mat,ceiling_val,dim); }
|
||||
inline void cuda_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
|
||||
cudaD_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
|
||||
}
|
||||
inline void cuda_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
|
||||
cudaD_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
|
||||
}
|
||||
inline void cuda_trace(int Gr, int Bl, double* mat, double* value, int dim) { cudaD_trace(Gr,Bl,mat,value,dim); }
|
||||
inline void cuda_set_diag(int Gr, int Bl, double* mat, double value, MatrixDim d) { cudaD_set_diag(Gr,Bl,mat,value,d); }
|
||||
inline void cuda_set_diag_packed(int Gr, int Bl, double* mat, double value, int dim) { cudaD_set_diag_packed(Gr,Bl,mat,value,dim); }
|
||||
inline void cuda_add_diag_packed(int Gr, int Bl, double* mat, double value, int dim) { cudaD_add_diag_packed(Gr,Bl,mat,value,dim); }
|
||||
inline void cuda_set_const(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_set_const(Gr,Bl,mat,value,d); }
|
||||
inline void cuda_set_zero_above_diag(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { cudaD_set_zero_above_diag(Gr,Bl,mat,d); }
|
||||
inline void cuda_add(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_add(Gr,Bl,mat,value,d); }
|
||||
inline void cuda_add_vec2(dim3 Gr, dim3 Bl, double *mat, const double *vec, const double alpha, int dim) { cudaD_add_vec2(Gr,Bl,mat,vec,alpha,dim); }
|
||||
inline void cuda_scale_diag(int Gr, int Bl, double* mat, double value, int dim) { cudaD_scale_diag(Gr,Bl,mat,value,dim); }
|
||||
inline void cuda_scale(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_scale(Gr,Bl,mat,value,d); }
|
||||
inline void cuda_apply_log(dim3 Gr, dim3 Bl, double *mat, MatrixDim d) { cudaD_apply_log(Gr,Bl,mat,d); }
|
||||
inline void cuda_mul_elements(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride) {
|
||||
cudaD_mul_elements(Gr,Bl,mat,A,dst_d,src_stride);
|
||||
}
|
||||
inline void cuda_max(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride) {
|
||||
cudaD_max(Gr,Bl,mat,A,dst_d,src_stride);
|
||||
}
|
||||
inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d) { cudaD_mul_cols_vec(Gr,Bl,mat,scale,d); }
|
||||
inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d) { cudaD_mul_rows_vec(Gr,Bl,mat,scale,d); }
|
||||
inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size) { cudaD_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size); }
|
||||
inline void cuda_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size, double power) {cudaD_calc_pnorm_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size, power); }
|
||||
inline void cuda_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *A, double beta, double *dst, MatrixDim d) { cudaD_add_mat(Gr,Bl,alpha,A,beta,dst,d); }
|
||||
inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, double alpha, const double *col, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); }
|
||||
inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, double alpha, const double *row, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_rows(Gr,Bl,alpha,row,beta,dst,d); }
|
||||
inline void cuda_transpose_matrix(dim3 Gr, dim3 Bl, double *mat, MatrixDim d) { cudaD_transpose_matrix(Gr, Bl, mat, d); }
|
||||
inline void cuda_sy_add_tr2(dim3 Gr, dim3 Bl, double alpha, double beta, const double* T, MatrixDim tdim,
|
||||
double *S, MatrixDim sdim) {
|
||||
cudaD_sy_add_tr2(Gr, Bl, alpha, beta, T, tdim, S, sdim);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* CuVector
|
||||
*/
|
||||
template<> inline void cuda_add_row_sum_mat<double>(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d) { cudaD_add_row_sum_mat(Gr,Bl,mat,vec_sum,d); }
|
||||
template<> inline void cuda_add_col_sum_mat<double>(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d) { cudaD_add_col_sum_mat(Gr,Bl,mat,vec_sum,d); }
|
||||
template<> inline void cuda_invert_elements<double>(dim3 Gr, dim3 Bl, double *data, MatrixDim d) { cudaD_invert_elements(Gr,Bl,data,d); }
|
||||
inline void cuda_replace_value(int Gr, int Bl, double *v, int dim, double orig, double changed) {cudaD_replace_value(Gr, Bl, v, dim, orig, changed); }
|
||||
inline void cuda_div_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *vec_div, MatrixDim d) { cudaD_div_rows_vec(Gr,Bl,mat,vec_div,d); }
|
||||
inline void cuda_set_bias_params(int Gr, int Bl, double* v, const double* a, double param_1, double param_2, double param_3, int* flag, int dim) { cudaD_set_bias_params(Gr,Bl,v,a,param_1,param_2,param_3,flag,dim); }
|
||||
inline void cuda_copy_from_vec_df(int Gr, int Bl, double* v_out, const double* v_in, int dim) { cudaD_copy_from_vec_df(Gr,Bl,v_out,v_in,dim); }
|
||||
inline void cuda_copy_from_vec_fd(int Gr, int Bl, float* v_out, const double* v_in, int dim) { cudaD_copy_from_vec_fd(Gr,Bl,v_out,v_in,dim); }
|
||||
inline void cuda_vec_mul_elements(int Gr, int Bl, double* v, const double* a, int dim) { cudaD_vec_mul_elements(Gr,Bl,v,a,dim); }
|
||||
inline void cuda_vec_soft_max(int Gr, int Bl, double* v, int dim) { cudaD_vec_soft_max(Gr,Bl,v,dim); }
|
||||
inline void cuda_vec_min(const double* v, double* value, int dim) { cudaD_vec_min(v,value,dim); }
|
||||
inline void cuda_vec_max(const double* v, double* value, int dim) { cudaD_vec_max(v,value,dim); }
|
||||
inline void cuda_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat_trans(A,B,dA,B_stride,value); }
|
||||
inline void cuda_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat(A,B,dA,B_stride,value); }
|
||||
inline void cuda_add_diag_mat_trans(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) { cudaD_add_diag_mat_trans(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
|
||||
inline void cuda_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M,
|
||||
int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride,
|
||||
int N_col_stride, int threads_per_element, double beta) {
|
||||
cudaD_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride,
|
||||
N_col_stride, threads_per_element, beta);
|
||||
}
|
||||
inline void cuda_add_diag_mat(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) { cudaD_add_diag_mat(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
|
||||
inline void cuda_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, const double* y, double beta, int dim) { cudaD_add_vec_vec(Gr,Bl,alpha,v,x,y,beta,dim); }
|
||||
inline void cuda_copy_col_from_mat(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat(Gr,Bl,v,col,mat,dmat,dim); }
|
||||
inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat_df(Gr,Bl,v,col,mat,dmat,dim); }
|
||||
inline void cuda_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat_fd(Gr,Bl,v,col,mat,dmat,dim); }
|
||||
inline void cuda_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc) { cudaD_vec_sum(Gr,Bl,v,value,dim,inc); }
|
||||
inline void cuda_pvec_sum(int Gr, int Bl, double* vec, double* pvec_sum, int dim, int size) { cudaD_pvec_sum(Gr,Bl,vec,pvec_sum,dim,size); }
|
||||
inline void cuda_vec_copy_diag_from_packed(int Gr, int Bl, double *dst, const double *src, int dim) { cudaD_vec_copy_diag_from_packed(Gr,Bl,dst,src,dim); }
|
||||
inline void cuda_vec_apply_floor(int Gr, int Bl, double* v, double floor_val, float* num, int dim) { cudaD_vec_apply_floor(Gr,Bl,v,floor_val,num,dim); }
|
||||
inline void cuda_vec_apply_exp(int Gr, int Bl, double* v, int dim) { cudaD_vec_apply_exp(Gr,Bl,v,dim); }
|
||||
inline void cuda_vec_apply_log(int Gr, int Bl, double* v, double* flag, int dim) { cudaD_vec_apply_log(Gr,Bl,v,flag,dim); }
|
||||
inline void cuda_add_row_sum_mat(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d) { cudaD_add_row_sum_mat(Gr,Bl,mat,vec_sum,d); }
|
||||
inline void cuda_add_col_sum_mat(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d) { cudaD_add_col_sum_mat(Gr,Bl,mat,vec_sum,d); }
|
||||
inline void cuda_invert_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim d) { cudaD_invert_elements(Gr,Bl,data,d); }
|
||||
// B_trans nonzero if B transposed.
|
||||
inline void cuda_add_mat_blockmat(dim3 Gr, dim3 Bl, double *data, MatrixDim d, const double *Adata,
|
||||
int A_num_rows, int A_num_cols, int A_row_stride, int A_col_stride,
|
||||
const CuBlockMatrixData *B_cu_data, int B_num_blocks,
|
||||
double alpha, double beta, int B_trans) {
|
||||
cudaD_add_mat_blockmat(Gr, Bl, data, d, Adata, A_num_rows, A_num_cols, A_row_stride, A_col_stride,
|
||||
B_cu_data, B_num_blocks, alpha, beta, B_trans);
|
||||
}
|
||||
inline void cuda_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int num_blocks,
|
||||
const double *C_data, int C_num_cols, int C_row_stride, int C_col_stride,
|
||||
const double *D_data, int D_row_stride, int D_col_stride,
|
||||
double alpha, double beta) {
|
||||
cudaD_block_add_mat_mat(Gr, Bl, B_cu_data, num_blocks, C_data, C_num_cols,
|
||||
C_row_stride, C_col_stride, D_data, D_row_stride,
|
||||
D_col_stride, alpha, beta);
|
||||
}
|
||||
|
||||
/*
|
||||
* cu::
|
||||
*/
|
||||
template<> inline void cuda_sigmoid<double>(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d) { cudaD_sigmoid(Gr,Bl,y,x,d); }
|
||||
template<> inline void cuda_diff_sigmoid<double>(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d) { cudaD_diff_sigmoid(Gr,Bl,eout,e,y,d); }
|
||||
template<> inline void cuda_tanh<double>(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d) { cudaD_tanh(Gr,Bl,y,x,d); }
|
||||
template<> inline void cuda_diff_tanh<double>(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d) { cudaD_diff_tanh(Gr,Bl,eout,e,y,d); }
|
||||
template<> inline void cuda_softmax<double>(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d) { cudaD_softmax(Gr,Bl,y,x,d); }
|
||||
template<> inline void cuda_softmax_part<double>(dim3 Gr, dim3 Bl, const double *X, const int32_cuda *vec_ids, double* Y, MatrixDim d) { cudaD_softmax_part(Gr,Bl,X,vec_ids,Y,d); }
|
||||
inline void cuda_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_soft_hinge(Gr,Bl,y,x,d,src_stride); }
|
||||
inline void cuda_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power) { cudaD_group_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power); }
|
||||
inline void cuda_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_sigmoid(Gr,Bl,y,x,d,src_stride); }
|
||||
inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int src_stride) { cudaD_diff_sigmoid(Gr,Bl,eout,e,y,d,src_stride); }
|
||||
inline void cuda_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_tanh(Gr,Bl,y,x,d,src_stride); }
|
||||
inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d) { cudaD_diff_tanh(Gr,Bl,eout,e,y,d); }
|
||||
inline void cuda_softmax(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d) { cudaD_softmax(Gr,Bl,y,x,d); }
|
||||
inline void cuda_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_softmax_reduce(Gr,Bl,y,x,d,src_stride); }
|
||||
inline void cuda_softmax_part(dim3 Gr, dim3 Bl, const double *X, const int32_cuda *vec_ids, double* Y, MatrixDim d) { cudaD_softmax_part(Gr,Bl,X,vec_ids,Y,d); }
|
||||
|
||||
template<> inline void cuda_regularize_l1<double>(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1, double lr, MatrixDim d) { cudaD_regularize_l1(Gr,Bl,wei,grad,l1,lr,d); }
|
||||
template<> inline void cuda_find_row_max_id<double>(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaD_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); }
|
||||
template<> inline void cuda_diff_xent<double>(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, double *mat_net_out, double *vec_log_post, MatrixDim d) { cudaD_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d); }
|
||||
inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1, double lr, MatrixDim d) { cudaD_regularize_l1(Gr,Bl,wei,grad,l1,lr,d); }
|
||||
inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaD_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); }
|
||||
inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, double *mat_net_out, double *vec_log_post, MatrixDim d) {
|
||||
cudaD_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d);
|
||||
}
|
||||
inline void cuda_copy_rows_from_vec(dim3 Gr, dim3 Bl, double *mat_out, MatrixDim d_out, const double *v_in) {
|
||||
cudaD_copy_rows_from_vec(Gr, Bl, mat_out, d_out, v_in);
|
||||
}
|
||||
|
||||
template<> inline void cuda_randomize<double>(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaD_randomize(Gr,Bl,y,x,copy_from,d_out,d_in); }
|
||||
template<> inline void cuda_splice<double>(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { cudaD_splice(Gr,Bl,y,x,off,d_out,d_in); }
|
||||
template<> inline void cuda_copy<double>(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaD_copy(Gr,Bl,y,x,copy_from,d_out,d_in); }
|
||||
inline void cuda_randomize(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaD_randomize(Gr,Bl,y,x,copy_from,d_out,d_in); }
|
||||
inline void cuda_splice(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { cudaD_splice(Gr,Bl,y,x,off,d_out,d_in); }
|
||||
inline void cuda_one(int Gr,int Bl,double* x,int dim) { cudaD_one(Gr,Bl,x,dim); }
|
||||
inline void cuda_copy(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaD_copy(Gr,Bl,y,x,copy_from,d_out,d_in); }
|
||||
inline void cuda_copy_from_sp(int Gr, int Bl, const double* x, double* y, int d_in, MatrixDim d_out) { cudaD_copy_from_sp(Gr,Bl,x,y,d_in,d_out); }
|
||||
inline void cuda_take_lower(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in) { cudaD_take_lower(Gr,Bl,x,y,d_in); }
|
||||
inline void cuda_take_upper(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in) { cudaD_take_upper(Gr,Bl,x,y,d_in); }
|
||||
inline void cuda_take_mean(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in) { cudaD_take_mean(Gr,Bl,x,y,d_in); }
|
||||
inline void cuda_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement<double>* x, int32 size, const double* z, MatrixDim d, double* z2, MatrixDim d2, double* t) {cudaD_comp_obj_deriv(Gr,Bl,x,size,z,d,z2,d2,t); }
|
||||
inline void cuda_sum_column_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim,
|
||||
const double *src_data, MatrixDim src_dim, const Int32Pair *indices) {
|
||||
cudaD_sum_column_ranges(Gr, Bl, data, dim, src_data, src_dim, indices);
|
||||
}
|
||||
inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl, const double *data,
|
||||
MatrixDim dim, const Int32Pair *indices,
|
||||
int indices_size, double *output) {
|
||||
cudaD_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Also include some template-friendly wrappers of cublas functions:
|
||||
// Single-precision overload of cuda_axpy.  Passes n, alpha, x, incx, y and
// incy through unchanged to cublasSaxpy; it exists so that code templated on
// the scalar type can call one name for both float and double.
// NOTE(review): presumably computes y += alpha * x as in BLAS axpy -- confirm
// against the cuBLAS documentation.
inline void cuda_axpy(int n, float alpha, const float *x, int incx, float *y, int incy) {
|
||||
cublasSaxpy(n, alpha, x, incx, y, incy);
|
||||
}
|
||||
// Double-precision overload of cuda_axpy.  Passes n, alpha, x, incx, y and
// incy through unchanged to cublasDaxpy; the return value (if any) of the
// cuBLAS call is not inspected here.
inline void cuda_axpy(int n, double alpha, const double *x, int incx, double *y, int incy) {
|
||||
cublasDaxpy(n, alpha, x, incx, y, incy);
|
||||
}
|
||||
// Single-precision overload of cuda_scal.  Passes n, alpha, x and incx
// through unchanged to cublasSscal.
// NOTE(review): presumably scales x in place by alpha as in BLAS scal --
// confirm against the cuBLAS documentation.
inline void cuda_scal(int n, float alpha, float *x, int incx) {
|
||||
cublasSscal(n, alpha, x, incx);
|
||||
}
|
||||
// Double-precision overload of cuda_scal.  Passes n, alpha, x and incx
// through unchanged to cublasDscal.
inline void cuda_scal(int n, double alpha, double *x, int incx) {
|
||||
cublasDscal(n, alpha, x, incx);
|
||||
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
|
||||
#endif // HAVE_CUDA
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,181 @@
|
|||
// cudamatrix/cuda-math-test.cc
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (Author: David Snyder)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "cudamatrix/cu-matrix-lib.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
#include "cudamatrix/cu-array.h"
|
||||
|
||||
using namespace kaldi;
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
/*
|
||||
* Unit tests
|
||||
*/
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuMathRandomize() {
|
||||
int32 M = 100 + rand() % 200, N = 100 + rand() % 200;
|
||||
CuMatrix<Real> src(M, N);
|
||||
CuMatrix<Real> tgt(M, N);
|
||||
CuArray<int32> copy_from_idx;
|
||||
|
||||
src.SetRandn();
|
||||
int32 n_rows = src.NumRows();
|
||||
int32 n_columns = src.NumCols();
|
||||
std::vector<int32> copy_from_idx_vec;
|
||||
|
||||
for (int32 i = 0; i < n_rows; i++) {
|
||||
copy_from_idx_vec.push_back(rand() % n_rows);
|
||||
}
|
||||
copy_from_idx.CopyFromVec(copy_from_idx_vec);
|
||||
cu::Randomize(src, copy_from_idx, &tgt);
|
||||
|
||||
for (int32 i = 0; i < n_rows; i++) {
|
||||
for (int32 j = 0; j < n_columns; j++) {
|
||||
Real src_val = src(copy_from_idx_vec.at(i), j);
|
||||
Real tgt_val = tgt(i, j);
|
||||
AssertEqual(src_val, tgt_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuMathCopy() {
|
||||
int32 M = 100 + rand() % 200, N = 100 + rand() % 200;
|
||||
CuMatrix<Real> src(M, N);
|
||||
CuMatrix<Real> tgt(M, N);
|
||||
CuArray<int32> copy_from_idx;
|
||||
|
||||
src.SetRandn();
|
||||
int32 n_rows = src.NumRows();
|
||||
int32 n_columns = src.NumCols();
|
||||
std::vector<int32> copy_from_idx_vec;
|
||||
|
||||
for (int32 i = 0; i < n_columns; i++) {
|
||||
copy_from_idx_vec.push_back(rand() % n_columns);
|
||||
}
|
||||
copy_from_idx.CopyFromVec(copy_from_idx_vec);
|
||||
cu::Copy(src, copy_from_idx, &tgt);
|
||||
|
||||
for (int32 i = 0; i < n_rows; i++) {
|
||||
for (int32 j = 0; j < n_columns; j++) {
|
||||
Real src_val = src(i, copy_from_idx_vec.at(j));
|
||||
Real tgt_val = tgt(i, j);
|
||||
AssertEqual(src_val, tgt_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuMathSplice() {
|
||||
int32 M = 100 + rand() % 200, N = 100 + rand() % 200;
|
||||
CuMatrix<Real> src(M, N);
|
||||
CuArray<int32> frame_offsets;
|
||||
|
||||
src.SetRandn();
|
||||
int32 n_rows = src.NumRows();
|
||||
int32 n_columns = src.NumCols();
|
||||
std::vector<int32> frame_offsets_vec;
|
||||
|
||||
// The number of columns of tgt is rows(src)
|
||||
// times n_frame_offsets, so we keep n_frame_offsets
|
||||
// reasonably small (2 <= n <= 6).
|
||||
int32 n_frame_offsets = rand() % 7 + 2;
|
||||
for (int32 i = 0; i < n_frame_offsets; i++) {
|
||||
frame_offsets_vec.push_back(rand() % 2 * n_columns - n_columns);
|
||||
}
|
||||
|
||||
CuMatrix<Real> tgt(M, N * n_frame_offsets);
|
||||
frame_offsets.CopyFromVec(frame_offsets_vec);
|
||||
cu::Splice(src, frame_offsets, &tgt);
|
||||
|
||||
for (int32 i = 0; i < n_rows; i++) {
|
||||
for (int32 k = 0; k < n_frame_offsets; k++) {
|
||||
for (int32 j = 0; j < n_columns; j++) {
|
||||
Real src_val;
|
||||
if (i + frame_offsets_vec.at(k) >= n_rows) {
|
||||
src_val = src(n_rows-1, j);
|
||||
} else if (i + frame_offsets_vec.at(k) <= 0) {
|
||||
src_val = src(0, j);
|
||||
} else {
|
||||
src_val = src(i + frame_offsets_vec.at(k), j);
|
||||
}
|
||||
Real tgt_val = tgt(i, k * n_columns + j);
|
||||
AssertEqual(src_val, tgt_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Runs every cu-math unit test in this file for the scalar type Real.
///
/// When built with CUDA, the double-precision instantiation is skipped as a
/// whole if the selected device reports no double-precision support.  The
/// single-precision instantiation always runs all tests.  (Previously an
/// unbraced "if" guarded only the first test, so the Randomize test was
/// skipped for float as well while the remaining tests ran unguarded.)
template<typename Real> void CudaMathUnitTest() {
#if HAVE_CUDA == 1
  // sizeof comparison distinguishes the double instantiation from float.
  if (sizeof(Real) == sizeof(double) &&
      !CuDevice::Instantiate().DoublePrecisionSupported()) {
    return;
  }
#endif
  UnitTestCuMathRandomize<Real>();
  UnitTestCuMathSplice<Real>();
  UnitTestCuMathCopy<Real>();
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
int main() {
|
||||
for (int32 loop = 0; loop < 2; loop++) {
|
||||
#if HAVE_CUDA == 1
|
||||
if (loop == 0)
|
||||
CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
|
||||
else
|
||||
CuDevice::Instantiate().SelectGpuId("yes"); // -2 .. automatic selection
|
||||
#endif
|
||||
srand(time(NULL));
|
||||
kaldi::CudaMathUnitTest<float>();
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().DoublePrecisionSupported()) {
|
||||
kaldi::CudaMathUnitTest<double>();
|
||||
} else {
|
||||
KALDI_WARN << "Double precision not supported";
|
||||
}
|
||||
#else
|
||||
kaldi::CudaMathUnitTest<float>();
|
||||
#endif
|
||||
|
||||
if (loop == 0)
|
||||
KALDI_LOG << "Tests without GPU use succeeded.\n";
|
||||
else
|
||||
KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
|
||||
}
|
||||
#if HAVE_CUDA == 1
|
||||
CuDevice::Instantiate().PrintProfile();
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -36,15 +36,15 @@ namespace cu {
|
|||
template<typename Real>
|
||||
void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1, Real lr) {
|
||||
KALDI_ASSERT(SameDim(*weight, *grad));
|
||||
#if HAVE_CUDA==1
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
dim3 dimBlock(CUBLOCK, CUBLOCK);
|
||||
dim3 dimGrid(n_blocks(weight->NumCols(), CUBLOCK), n_blocks(weight->NumRows(), CUBLOCK));
|
||||
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
|
||||
dim3 dimGrid(n_blocks(weight->NumCols(), CU2DBLOCK), n_blocks(weight->NumRows(), CU2DBLOCK));
|
||||
|
||||
cuda_regularize_l1(dimGrid, dimBlock, weight->data_, grad->data_, l1, lr, weight->Dim());
|
||||
cuSafeCall(cudaGetLastError());
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
|
@ -77,21 +77,21 @@ void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1,
|
|||
|
||||
template<typename Real>
|
||||
void Randomize(const CuMatrixBase<Real> &src,
|
||||
const CuStlVector<int32> ©_from_idx,
|
||||
const CuArray<int32> ©_from_idx,
|
||||
CuMatrixBase<Real> *tgt) {
|
||||
|
||||
KALDI_ASSERT(src.NumCols() == tgt->NumCols());
|
||||
KALDI_ASSERT(src.NumRows() == tgt->NumRows());
|
||||
KALDI_ASSERT(copy_from_idx.Dim() <= tgt->NumRows());
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
/*
|
||||
Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535
|
||||
dim3 dimBlock(CUBLOCK, CUBLOCK);
|
||||
dim3 dimGrid(n_blocks(tgt->NumCols(), CUBLOCK), n_blocks(copy_from_idx.Dim(), CUBLOCK));
|
||||
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
|
||||
dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK));
|
||||
*/
|
||||
|
||||
/*
|
||||
|
@ -108,7 +108,7 @@ void Randomize(const CuMatrixBase<Real> &src,
|
|||
MatrixDim dimtgt = tgt->Dim(); dimtgt.rows=copy_from_idx.Dim();
|
||||
|
||||
cuda_randomize(dimGrid, dimBlock, tgt->data_, src.data_, copy_from_idx.Data(), dimtgt, dimsrc);
|
||||
cuSafeCall(cudaGetLastError());
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
|
@ -116,7 +116,7 @@ void Randomize(const CuMatrixBase<Real> &src,
|
|||
{
|
||||
// randomize in CPU
|
||||
const MatrixBase<Real> &srcmat = src.Mat();
|
||||
const std::vector<int32> ©_from_idxvec = copy_from_idx.Vec();
|
||||
const int32 *copy_from_idxvec = copy_from_idx.Data();
|
||||
MatrixBase<Real> &tgtmat = tgt->Mat();
|
||||
for(int32 i=0; i<copy_from_idx.Dim(); i++) {
|
||||
tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i]));
|
||||
|
@ -127,20 +127,20 @@ void Randomize(const CuMatrixBase<Real> &src,
|
|||
|
||||
|
||||
template<typename Real>
|
||||
void Splice(const CuMatrix<Real> &src, const CuStlVector<int32> &frame_offsets, CuMatrix<Real> *tgt) {
|
||||
void Splice(const CuMatrix<Real> &src, const CuArray<int32> &frame_offsets, CuMatrix<Real> *tgt) {
|
||||
|
||||
KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols());
|
||||
KALDI_ASSERT(src.NumRows() == tgt->NumRows());
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
dim3 dimBlock(CUBLOCK, CUBLOCK);
|
||||
dim3 dimGrid(n_blocks(tgt->NumCols(), CUBLOCK), n_blocks(tgt->NumRows(), CUBLOCK));
|
||||
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
|
||||
dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
|
||||
|
||||
cuda_splice(dimGrid, dimBlock, tgt->data_, src.data_, frame_offsets.Data(), tgt->Dim(), src.Dim());
|
||||
cuSafeCall(cudaGetLastError());
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
|
@ -148,11 +148,12 @@ void Splice(const CuMatrix<Real> &src, const CuStlVector<int32> &frame_offsets,
|
|||
{
|
||||
// expand in CPU
|
||||
const MatrixBase<Real> &srcmat = src.Mat();
|
||||
const std::vector<int32> &frame_offsetvec = frame_offsets.Vec();
|
||||
const int32 *frame_offsetvec = frame_offsets.Data();
|
||||
int32 dim = frame_offsets.Dim();
|
||||
MatrixBase<Real> &tgtmat = tgt->Mat();
|
||||
//
|
||||
for(int32 r=0; r < tgtmat.NumRows(); r++) {
|
||||
for(int32 off=0; off < static_cast<int32>(frame_offsetvec.size()); off++) {
|
||||
for(int32 off=0; off < dim; off++) {
|
||||
int32 r_off = r + frame_offsetvec[off];
|
||||
if(r_off < 0) r_off = 0;
|
||||
if(r_off >= srcmat.NumRows()) r_off = srcmat.NumRows()-1;
|
||||
|
@ -165,20 +166,20 @@ void Splice(const CuMatrix<Real> &src, const CuStlVector<int32> &frame_offsets,
|
|||
|
||||
|
||||
template<typename Real>
|
||||
void Copy(const CuMatrix<Real> &src, const CuStlVector<int32> ©_from_indices, CuMatrix<Real> *tgt) {
|
||||
void Copy(const CuMatrix<Real> &src, const CuArray<int32> ©_from_indices, CuMatrix<Real> *tgt) {
|
||||
|
||||
KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols());
|
||||
KALDI_ASSERT(src.NumRows() == tgt->NumRows());
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
dim3 dimBlock(CUBLOCK, CUBLOCK);
|
||||
dim3 dimGrid(n_blocks(tgt->NumCols(), CUBLOCK), n_blocks(tgt->NumRows(), CUBLOCK));
|
||||
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
|
||||
dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
|
||||
|
||||
cuda_copy(dimGrid, dimBlock, tgt->data_, src.data_, copy_from_indices.Data(), tgt->Dim(), src.Dim());
|
||||
cuSafeCall(cudaGetLastError());
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
|
@ -186,11 +187,12 @@ void Copy(const CuMatrix<Real> &src, const CuStlVector<int32> ©_from_indices
|
|||
{
|
||||
// expand in CPU
|
||||
const MatrixBase<Real> &srcmat = src.Mat();
|
||||
const std::vector<int32> ©_from_indicesvec = copy_from_indices.Vec();
|
||||
const int32 *copy_from_indicesvec = copy_from_indices.Data();
|
||||
int32 dim = copy_from_indices.Dim();
|
||||
MatrixBase<Real> &tgtmat = tgt->Mat();
|
||||
//
|
||||
for(int32 r=0; r < tgtmat.NumRows(); r++) {
|
||||
for(int32 c=0; c < static_cast<int32>(copy_from_indicesvec.size()); c++) {
|
||||
for(int32 r = 0; r < tgtmat.NumRows(); r++) {
|
||||
for(int32 c = 0; c < dim; c++) {
|
||||
tgtmat(r,c) = srcmat(r,copy_from_indicesvec[c]);
|
||||
}
|
||||
}
|
||||
|
@ -204,21 +206,21 @@ template
|
|||
void RegularizeL1(CuMatrixBase<double> *weight, CuMatrixBase<double> *grad, double l1, double lr);
|
||||
|
||||
template
|
||||
void Splice(const CuMatrix<float> &src, const CuStlVector<int32> &frame_offsets, CuMatrix<float> *tgt);
|
||||
void Splice(const CuMatrix<float> &src, const CuArray<int32> &frame_offsets, CuMatrix<float> *tgt);
|
||||
template
|
||||
void Splice(const CuMatrix<double> &src, const CuStlVector<int32> &frame_offsets, CuMatrix<double> *tgt);
|
||||
void Splice(const CuMatrix<double> &src, const CuArray<int32> &frame_offsets, CuMatrix<double> *tgt);
|
||||
template
|
||||
void Copy(const CuMatrix<float> &src, const CuStlVector<int32> ©_from_indices, CuMatrix<float> *tgt);
|
||||
void Copy(const CuMatrix<float> &src, const CuArray<int32> ©_from_indices, CuMatrix<float> *tgt);
|
||||
template
|
||||
void Copy(const CuMatrix<double> &src, const CuStlVector<int32> ©_from_indices, CuMatrix<double> *tgt);
|
||||
void Copy(const CuMatrix<double> &src, const CuArray<int32> ©_from_indices, CuMatrix<double> *tgt);
|
||||
|
||||
template
|
||||
void Randomize(const CuMatrixBase<float> &src,
|
||||
const CuStlVector<int32> ©_from_idx,
|
||||
const CuArray<int32> ©_from_idx,
|
||||
CuMatrixBase<float> *tgt);
|
||||
template
|
||||
void Randomize(const CuMatrixBase<double> &src,
|
||||
const CuStlVector<int32> ©_from_idx,
|
||||
const CuArray<int32> ©_from_idx,
|
||||
CuMatrixBase<double> *tgt);
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
// cudamatrix/cu-math.h
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// 2013 Johns Hopkins University (Author: David Snyder)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
|
@ -22,7 +23,7 @@
|
|||
#ifndef KALDI_CUDAMATRIX_CU_MATH_H_
|
||||
#define KALDI_CUDAMATRIX_CU_MATH_H_
|
||||
#include "cudamatrix/cu-common.h"
|
||||
#include "cudamatrix/cu-stlvector.h"
|
||||
#include "cudamatrix/cu-array.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
#include "util/timer.h"
|
||||
|
||||
|
@ -38,21 +39,38 @@ template<typename Real>
|
|||
void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *gradient,
|
||||
Real l1_penalty, Real learning_rate);
|
||||
|
||||
/// ie. switch rows according to copy_from_idx
|
||||
/// Copies a permutation of src into tgt. The row permutation is specified in
|
||||
/// copy_from_idx such that src.Row(copy_from_idx[r]) == tgt.Row(r). The
|
||||
/// dimensions of copy_from_idx must be equivalent to the number of rows in
|
||||
/// tgt and src and all elements in the vector must be in [0, src.NumRows()-1].
|
||||
template<typename Real>
|
||||
void Randomize(const CuMatrixBase<Real> &src,
|
||||
const CuStlVector<int32> ©_from_idx,
|
||||
const CuArray<int32> ©_from_idx,
|
||||
CuMatrixBase<Real> *tgt);
|
||||
|
||||
/// ie. concatenate the frames with offsets from frame_offsets
|
||||
/// Splice concatenates frames of src as specified in frame_offsets into tgt.
|
||||
/// The number of rows in tgt must be equal to the number of rows in src
|
||||
/// and it must be that tgt.NumCols() == src.NumCols() * frame_offsets.Dim().
|
||||
/// As a result, tgt(i, k*n_cols + j) == src(i + frame_offsets[k], j) for the
|
||||
/// general case where i in [0..src.NumRows()-1],
|
||||
/// k in [0..frame_offsets.Dim()-1], j in [0..src.NumCols()-1]
|
||||
/// and n_cols = src.NumCols(). If i + frame_offsets[k] is >= the
|
||||
/// number of rows in src or less than 0, then the right side of the equation
|
||||
/// is replaced by src(src.NumRows()-1, j) or src(0, j) respectively, to avoid
|
||||
/// an index out of bounds.
|
||||
template<typename Real>
|
||||
void Splice(const CuMatrix<Real> &src,
|
||||
const CuStlVector<int32> &frame_offsets,
|
||||
const CuArray<int32> &frame_offsets,
|
||||
CuMatrix<Real> *tgt);
|
||||
|
||||
/// Copies elements from src into tgt as given by copy_from_indices.
|
||||
/// The matrices src and tgt must have the same dimensions and
|
||||
/// the dimension of copy_from_indices must equal the number of columns
|
||||
/// in the src matrix. As a result, tgt(i, j) == src(i, copy_from_indices[j]).
|
||||
/// Also see CuMatrix::CopyCols(), which is more general.
|
||||
template<typename Real>
|
||||
void Copy(const CuMatrix<Real> &src,
|
||||
const CuStlVector<int32> ©_from_indices,
|
||||
const CuArray<int32> ©_from_indices,
|
||||
CuMatrix<Real> *tgt);
|
||||
|
||||
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
|
||||
namespace kaldi {
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
inline CuSubMatrix<Real>::CuSubMatrix(const CuMatrixBase<Real> &mat,
|
||||
const MatrixIndexT row_offset,
|
||||
const MatrixIndexT num_rows,
|
||||
|
|
|
@ -1,31 +1,32 @@
|
|||
// matrix/packed-matrix-inl.h
|
||||
// cudamatrix/cu-matrix-lib.h
|
||||
|
||||
// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; Lukas Burget;
|
||||
// Saarland University; Yanmin Qian; Jan Silovsky;
|
||||
// Haihua Xu
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#ifndef KALDI_MATRIX_PACKED_MATRIX_INL_H_
|
||||
#define KALDI_MATRIX_PACKED_MATRIX_INL_H_
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
#ifndef KALDI_CUDAMATRIX_CU_MATRIX_LIB_H_
|
||||
#define KALDI_CUDAMATRIX_CU_MATRIX_LIB_H_
|
||||
|
||||
#include "cudamatrix/cu-vector.h"
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
#include "cudamatrix/cu-sp-matrix.h"
|
||||
#include "cudamatrix/cu-tp-matrix.h"
|
||||
#include "cudamatrix/cu-block-matrix.h"
|
||||
#include "cudamatrix/cu-rand.h"
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,196 @@
|
|||
// cudamatrix/cu-matrix-speed-test.cc
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
#include "cudamatrix/cu-vector.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
|
||||
using namespace kaldi;
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
/// Returns a printable tag for the floating-point type a test was
/// instantiated with: "<double>" when Real has the size of a double and
/// "<float>" otherwise.  Only used to label the log lines below.
template<typename Real>
std::string NameOf() {
  // Compare against sizeof(double) rather than a hard-coded byte count.
  if (sizeof(Real) == sizeof(double))
    return "<double>";
  return "<float>";
}
|
||||
|
||||
/// Repeatedly multiplies two random dim x dim matrices with
/// CuMatrix::AddMatMat until about 0.05 seconds have elapsed, then logs the
/// measured rate (dim^3 operations are counted per product).
template<typename Real> void TestCuMatrixMatMat(int32 dim) {
  const BaseFloat time_budget = 0.05;
  CuMatrix<Real> lhs(dim, dim), rhs(dim, dim), product(dim, dim);
  lhs.SetRandn();
  rhs.SetRandn();

  Timer timer;
  int32 num_iters = 0;
  while (timer.Elapsed() < time_budget) {
    product.AddMatMat(1.0, lhs, kNoTrans, rhs, kNoTrans, 0.0);
    num_iters++;
  }

  // Do the arithmetic in BaseFloat so dim^3 * num_iters cannot overflow int32.
  const BaseFloat d = dim;
  const BaseFloat gflops =
      (d * d * d * num_iters) / (timer.Elapsed() * 1.0e+09);
  KALDI_LOG << "For CuMatrix::AddMatMat" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
|
||||
|
||||
/// Repeatedly applies CuMatrix::Sigmoid to a random dim x dim matrix until
/// about 0.05 seconds have elapsed, then logs the measured rate (one
/// operation is counted per matrix element).
template<typename Real> void TestCuMatrixSigmoid(int32 dim) {
  const BaseFloat time_budget = 0.05;
  CuMatrix<Real> input(dim, dim), output(dim, dim);
  input.SetRandn();
  output.SetRandn();

  Timer timer;
  int32 num_iters = 0;
  while (timer.Elapsed() < time_budget) {
    output.Sigmoid(input);
    num_iters++;
  }

  const BaseFloat d = dim;
  const BaseFloat gflops = (d * d * num_iters) / (timer.Elapsed() * 1.0e+09);
  KALDI_LOG << "For CuMatrix::Sigmoid" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
|
||||
|
||||
|
||||
/// Repeatedly applies CuMatrix::ApplySoftMaxPerRow to a random 256 x dim
/// matrix until about 0.05 seconds have elapsed, then logs the measured rate.
/// One operation is counted per matrix element, i.e. 256 * dim per call.
template<typename Real> void TestCuMatrixSoftmax(int32 dim) {
  const int32 num_rows = 256;
  BaseFloat time_in_secs = 0.05;
  CuMatrix<Real> M(num_rows, dim), N(num_rows, dim);
  M.SetRandn();
  N.SetRandn();
  Timer tim;
  int32 iter = 0;
  for (; tim.Elapsed() < time_in_secs; iter++) {
    N.ApplySoftMaxPerRow(M);
  }

  // The matrix is num_rows x dim, not dim x dim, so the element count per
  // call is num_rows * dim.  Using dim * dim mis-reports the rate for every
  // dim other than 256.
  BaseFloat frows = num_rows;
  BaseFloat fdim = dim;
  BaseFloat gflops = (frows * fdim * iter) / (tim.Elapsed() * 1.0e+09);
  KALDI_LOG << "For CuMatrix::Softmax" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
|
||||
|
||||
/// Times TraceMatMat on two random dim x dim matrices, once with kNoTrans and
/// once with kTrans, running each variant for about 0.08 seconds and logging
/// the measured rate (one operation is counted per matrix element).
template<typename Real> void TestCuMatrixTraceMatMat(int32 dim) {
  const MatrixTransposeType modes[2] = { kNoTrans, kTrans };
  for (int32 m = 0; m < 2; m++) {
    const MatrixTransposeType trans = modes[m];
    const BaseFloat time_budget = 0.08;

    CuMatrix<Real> lhs(dim, dim), rhs(dim, dim);
    lhs.SetRandn();
    rhs.SetRandn();

    Timer timer;
    int32 num_iters = 0;
    while (timer.Elapsed() < time_budget) {
      TraceMatMat(lhs, rhs, trans);  // result intentionally discarded
      num_iters++;
    }

    const BaseFloat d = dim;
    const BaseFloat gflops =
        (d * d * num_iters) / (timer.Elapsed() * 1.0e+09);
    const char *suffix = (trans == kTrans ? " [transposed]" : "");
    KALDI_LOG << "For CuMatrix::TraceMatMat" << NameOf<Real>()
              << suffix << ", for dim = "
              << dim << ", speed was " << gflops << " gigaflops.";
  }
}
|
||||
|
||||
/// Repeatedly calls CuMatrix::CopyLowerToUpper on a random dim x dim matrix
/// until about 0.05 seconds have elapsed, checks that the result equals its
/// own transpose, and logs the measured rate (one operation is counted per
/// matrix element).
template<typename Real> void TestCuMatrixCopyLowerToUpper(int32 dim) {
  BaseFloat time_in_secs = 0.05;
  CuMatrix<Real> M(dim, dim);
  M.SetRandn();
  Timer tim;
  int32 iter = 0;
  for (; tim.Elapsed() < time_in_secs; iter++) {
    M.CopyLowerToUpper();
  }
  // Read the clock before the correctness check, so that the transpose copy
  // and the comparison below are not counted as benchmark time.
  const double elapsed = tim.Elapsed();

  // After the copy M must be symmetric, i.e. equal to its transpose.
  CuMatrix<Real> M2(M, kTrans);
  AssertEqual(M, M2);

  BaseFloat fdim = dim;
  BaseFloat gflops = (fdim * fdim * iter) / (elapsed * 1.0e+09);
  KALDI_LOG << "For CuMatrix::CopyLowerToUpper" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
|
||||
|
||||
|
||||
/// Repeatedly calls CuMatrix::CopyUpperToLower on a random dim x dim matrix
/// until about 0.05 seconds have elapsed, checks that the result equals its
/// own transpose, and logs the measured rate (one operation is counted per
/// matrix element).
template<typename Real> void TestCuMatrixCopyUpperToLower(int32 dim) {
  BaseFloat time_in_secs = 0.05;
  CuMatrix<Real> M(dim, dim);
  M.SetRandn();
  Timer tim;
  int32 iter = 0;
  for (; tim.Elapsed() < time_in_secs; iter++) {
    M.CopyUpperToLower();
  }
  // Read the clock before the correctness check, so that the transpose copy
  // and the comparison below are not counted as benchmark time.
  const double elapsed = tim.Elapsed();

  // After the copy M must be symmetric, i.e. equal to its transpose.
  CuMatrix<Real> M2(M, kTrans);
  AssertEqual(M, M2);

  BaseFloat fdim = dim;
  BaseFloat gflops = (fdim * fdim * iter) / (elapsed * 1.0e+09);
  KALDI_LOG << "For CuMatrix::CopyUpperToLower" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
|
||||
|
||||
|
||||
template<typename Real> void CudaMatrixSpeedTest() {
|
||||
std::vector<int32> sizes;
|
||||
sizes.push_back(16);
|
||||
sizes.push_back(128);
|
||||
sizes.push_back(256);
|
||||
sizes.push_back(1024);
|
||||
int32 ns = sizes.size();
|
||||
for (int32 s = 0; s < ns; s++)
|
||||
TestCuMatrixMatMat<Real>(sizes[s]);
|
||||
for (int32 s = 0; s < ns; s++)
|
||||
TestCuMatrixSigmoid<Real>(sizes[s]);
|
||||
|
||||
for (int32 s = 0; s < ns; s++)
|
||||
TestCuMatrixSoftmax<Real>(sizes[s]);
|
||||
for (int32 s = 0; s < ns; s++)
|
||||
TestCuMatrixTraceMatMat<Real>(sizes[s]);
|
||||
for (int32 s = 0; s < ns; s++)
|
||||
TestCuMatrixCopyLowerToUpper<Real>(sizes[s]);
|
||||
for (int32 s = 0; s < ns; s++)
|
||||
TestCuMatrixCopyUpperToLower<Real>(sizes[s]);
|
||||
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
/// Entry point: runs the speed tests in single precision and then in double
/// precision.  When compiled with CUDA, the double-precision run is skipped
/// with a warning if the device reports no double-precision support, and the
/// device profile is printed at the end.
int main() {
#if HAVE_CUDA == 1
  // Select the GPU.  NOTE(review): "yes" presumably makes a GPU mandatory
  // (the modes appear to be yes|no|optional) -- confirm against CuDevice.
  CuDevice::Instantiate().SelectGpuId("yes");
#endif

  kaldi::CudaMatrixSpeedTest<float>();

#if HAVE_CUDA == 1
  if (!CuDevice::Instantiate().DoublePrecisionSupported()) {
    KALDI_WARN << "Double precision not supported";
  } else {
    kaldi::CudaMatrixSpeedTest<double>();
  }
  CuDevice::Instantiate().PrintProfile();
#else
  kaldi::CudaMatrixSpeedTest<double>();
#endif

  std::cout << "Tests succeeded.\n";
  return 0;
}
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -1,7 +1,10 @@
|
|||
// cudamatrix/cu-matrix.h
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// Johns Hopkins University (author: Daniel Povey)
|
||||
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
// 2013 Hainan Xu
|
||||
// 2013 Xiaohui Zhang
|
||||
// 2013 Johns Hopkins University (author: Guoguo Chen)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
|
@ -27,14 +30,18 @@
|
|||
|
||||
#include "cudamatrix/cu-matrixdim.h"
|
||||
#include "cudamatrix/cu-common.h"
|
||||
#include "cudamatrix/cu-value.h"
|
||||
#include "matrix/matrix-common.h"
|
||||
#include "matrix/kaldi-matrix.h"
|
||||
#include "cudamatrix/cu-stlvector.h"
|
||||
#include "cudamatrix/cu-array.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
#include "cudamatrix/cu-rand.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
template<typename Real>
|
||||
Real TraceMatMat(const CuMatrixBase<Real> &A, const CuMatrixBase<Real> &B,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
/**
|
||||
* Matrix for CUDA computing.
|
||||
* Does the computation on the CUDA card when CUDA is compiled in and
|
||||
|
@ -42,23 +49,77 @@ namespace kaldi {
|
|||
* otherwise, does it on the CPU.
|
||||
*/
|
||||
|
||||
/*
|
||||
template<typename Real>
|
||||
struct MatrixElement {
|
||||
int row;
|
||||
int column;
|
||||
Real weight;
|
||||
};
|
||||
// */
|
||||
|
||||
template<typename Real>
|
||||
class CuMatrixBase {
|
||||
public:
|
||||
friend class CuMatrixBase<float>;
|
||||
friend class CuMatrixBase<double>;
|
||||
friend class CuVectorBase<float>;
|
||||
friend class CuVectorBase<double>;
|
||||
friend class VectorBase<Real>;
|
||||
friend class CuSpMatrix<Real>;
|
||||
friend class CuTpMatrix<float>;
|
||||
friend class CuTpMatrix<double>;
|
||||
friend class CuVectorBase<Real>;
|
||||
friend class CuSubMatrix<Real>;
|
||||
friend class CuRand<Real>;
|
||||
friend class CuSubVector<Real>;
|
||||
friend class CuBlockMatrix<Real>;
|
||||
friend void cu::RegularizeL1<Real>(CuMatrixBase<Real> *weight,
|
||||
CuMatrixBase<Real> *grad, Real l1, Real lr);
|
||||
friend void cu::Splice<Real>(const CuMatrix<Real> &src,
|
||||
const CuStlVector<int32> &frame_offsets,
|
||||
const CuArray<int32> &frame_offsets,
|
||||
CuMatrix<Real> *tgt);
|
||||
friend void cu::Copy<Real>(const CuMatrix<Real> &src,
|
||||
const CuStlVector<int32> ©_from_indices,
|
||||
const CuArray<int32> ©_from_indices,
|
||||
CuMatrix<Real> *tgt);
|
||||
friend void cu::Randomize<Real>(const CuMatrixBase<Real> &src,
|
||||
const CuStlVector<int32> ©_from_idx,
|
||||
const CuArray<int32> ©_from_idx,
|
||||
CuMatrixBase<Real> *tgt);
|
||||
|
||||
/// Copies column r from column indices[r] of src.
|
||||
/// As a special case, if indices[i] == -1, sets column i to zero
|
||||
/// indices.size() must equal this->NumCols(),
|
||||
/// all elements of "indices" must be in [-1, src.NumCols()-1],
|
||||
/// and src.NumRows() must equal this.NumRows()
|
||||
void CopyCols(const CuMatrixBase<Real> &src,
|
||||
const std::vector<MatrixIndexT> &indices);
|
||||
|
||||
/// Version of CopyCols that takes CuArray argument.
|
||||
void CopyCols(const CuMatrixBase<Real> &src,
|
||||
const CuArray<MatrixIndexT> &indices);
|
||||
|
||||
|
||||
/// Copies row r from row indices[r] of src.
|
||||
/// As a special case, if indexes[i] <== -1, sets row i to zero
|
||||
/// "reorder".size() must equal this->NumRows(),
|
||||
/// all elements of "reorder" must be in [0, src.NumRows()-1],
|
||||
/// and src.NumCols() must equal this.NumCols()
|
||||
void CopyRows(const CuMatrixBase<Real> &src,
|
||||
const std::vector<MatrixIndexT> &indices);
|
||||
|
||||
|
||||
/// For each row r of this and for each column c, sets (*this)(r, c) to the
|
||||
/// sum \sum_j src(r, j), where j ranges from indices[c].first through
|
||||
/// indices[c].second - 1.
|
||||
void SumColumnRanges(const CuMatrixBase<Real> &src,
|
||||
const CuArray<Int32Pair> &indices);
|
||||
|
||||
|
||||
friend Real TraceMatMat<Real>(const CuMatrixBase<Real> &A,
|
||||
const CuMatrixBase<Real> &B,
|
||||
MatrixTransposeType trans);
|
||||
|
||||
void AddToDiag(Real value);
|
||||
|
||||
/// Dimensions
|
||||
MatrixIndexT NumRows() const { return num_rows_; }
|
||||
|
@ -72,26 +133,66 @@ class CuMatrixBase {
|
|||
return d;
|
||||
}
|
||||
|
||||
Real FrobeniusNorm() const { return sqrt(TraceMatMat(*this, *this, kTrans)); }
|
||||
|
||||
bool IsUnit(Real tol = 0.001) const;
|
||||
|
||||
bool ApproxEqual(const CuMatrixBase<Real> &other, float tol = 0.01) const;
|
||||
|
||||
/// Get size of matrix in bytes
|
||||
MatrixIndexT SizeInBytes() const { return num_rows_*stride_*sizeof(Real); }
|
||||
|
||||
/// Get size of matrix row in bytes
|
||||
MatrixIndexT RowSizeInBytes() const { return num_cols_*sizeof(Real); }
|
||||
|
||||
/// Get size of matrix stride in bytes
|
||||
MatrixIndexT StrideSizeInBytes() const { return stride_*sizeof(Real); }
|
||||
// Copy functions. These do not resize.
|
||||
template<typename OtherReal>
|
||||
void CopyFromMat(const MatrixBase<OtherReal> &src,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
void CopyFromMat(const MatrixBase<Real> &src,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
/// Copy functions (reallocates when needed, but note from Dan: eventually
|
||||
/// I'll change it to just die if the sizes don't match, like the Matrix class.)
|
||||
void CopyFromMat(const CuMatrixBase<Real> &src);
|
||||
void CopyFromMat(const MatrixBase<Real> &src);
|
||||
void CopyToMat(MatrixBase<Real> *dst) const;
|
||||
void CopyFromSp(const CuSpMatrix<Real> &M);
|
||||
|
||||
template<typename OtherReal>
|
||||
void CopyFromTp(const CuTpMatrix<OtherReal> &M,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
template<typename OtherReal>
|
||||
void CopyFromMat(const CuMatrixBase<OtherReal> &M,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
template<typename OtherReal>
|
||||
void CopyToMat(MatrixBase<OtherReal> *dst,
|
||||
MatrixTransposeType trans = kNoTrans) const;
|
||||
|
||||
void CopyRowsFromVec(const CuVectorBase<Real> &v);
|
||||
|
||||
void CopyRowsFromVec(const VectorBase<Real> &v);
|
||||
|
||||
/// Copy vector into specific column of matrix.
|
||||
void CopyColFromVec(const CuVectorBase<Real> &v, const MatrixIndexT col);
|
||||
|
||||
/// Set each element to the sigmoid of the corresponding element of "src":
|
||||
/// element by element, *this = 1 / (1 + exp(-src)).
|
||||
/// element by element, x = 1 / (1 + exp(-x))
|
||||
void Sigmoid(const CuMatrixBase<Real> &src);
|
||||
|
||||
/// Apply the function y = log(1 + exp(x)), to each element.
|
||||
/// Note: the derivative of this function is the sigmoid function.
|
||||
/// This is like a soft ReLU.
|
||||
void SoftHinge(const CuMatrixBase<Real> &src);
|
||||
|
||||
/// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j ^ (power)) ^ (1 / power)
|
||||
/// where G = x.NumCols() / y.NumCols() must be an integer.
|
||||
void GroupPnorm(const CuMatrixBase<Real> &src, Real pow);
|
||||
|
||||
/// Calculate derivatives for the GroupPnorm function above...
|
||||
/// if "input" is the input to the GroupPnorm function above (i.e. the "src" variable),
|
||||
/// and "output" is the result of the computation (i.e. the "this" of that function
|
||||
/// call), and *this has the same dimension as "input", then it sets each element
|
||||
/// of *this to the derivative d(output-elem)/d(input-elem) for each element of "input", where
|
||||
/// "output-elem" is whichever element of output depends on that input element.
|
||||
void GroupPnormDeriv(const CuMatrixBase<Real> &input,
|
||||
const CuMatrixBase<Real> &output, Real power);
|
||||
|
||||
/// Compute the hyperbolic tangent (tanh) function; element by element,
|
||||
/// *this = tanh(src).
|
||||
void Tanh(const CuMatrixBase<Real> &src);
|
||||
|
@ -105,7 +206,7 @@ class CuMatrixBase {
|
|||
/// tanh output. Does, element-by-element, *this = diff * (1 - value^2).
|
||||
void DiffTanh(const CuMatrixBase<Real> &value,
|
||||
const CuMatrixBase<Real> &diff);
|
||||
|
||||
|
||||
/// Differentiate the block [softmax+cross-entropy] :
|
||||
/// dE/da = posterior_mat - target_mat,
|
||||
/// 'E' is error function, 'a' is activation on softmax input
|
||||
|
@ -115,16 +216,30 @@ class CuMatrixBase {
|
|||
/// net_out_or_diff ... before invocation net output, after diff dE/da
|
||||
/// log_post_tgt ... per-frame statistics for cross-entropy computations :
|
||||
/// log(sum_row(posterior_mat .* target_mat))
|
||||
void DiffXent(const CuStlVector<int32> &tgt,
|
||||
void DiffXent(const CuArray<int32> &tgt,
|
||||
CuVector<Real> *log_post_tgt);
|
||||
|
||||
/// This method may be only called for symmetric matrices (it accesses the
|
||||
/// upper as well as lower triangle). The result is put in the lower
|
||||
/// triangle, and the upper triangle zeroed.
|
||||
void Cholesky();
|
||||
|
||||
void SymInvertPosDef(); ///< Inversion for positive definite symmetric matrices.
|
||||
///< Requires that the input is symmetric (we do not check this).
|
||||
///< The output is symmetric.
|
||||
|
||||
void ApplyPow(Real power);
|
||||
void ApplyHeaviside(); ///< For each element, sets x = (x > 0 ? 1.0 : 0.0)
|
||||
void ApplyFloor(Real floor_val);
|
||||
void ApplyCeiling(Real ceiling_val);
|
||||
void ApplyExp();
|
||||
/// Softmax nonlinearity
|
||||
/// Y = Softmax(X) : Yij = e^Xij / sum_k(e^Xik)
|
||||
/// Y = Softmax(X) : Yij = e^Xij / sum_k(e^Xik), done to each row
|
||||
/// for each row, the max value is first subtracted for good numerical stability
|
||||
void Softmax(const CuMatrixBase<Real> &src);
|
||||
void ApplySoftMaxPerRow(const CuMatrixBase<Real> &src);
|
||||
|
||||
/// Find the id of the maximal element for each row
|
||||
void FindRowMaxId(CuStlVector<int32> *id) const;
|
||||
void FindRowMaxId(CuArray<int32> *id) const;
|
||||
|
||||
/*
|
||||
// Copy row interval from matrix
|
||||
|
@ -139,27 +254,90 @@ class CuMatrixBase {
|
|||
void SetZero();
|
||||
void Set(Real value);
|
||||
void Add(Real value);
|
||||
void SetZeroUpperDiag();
|
||||
void Scale(Real value);
|
||||
void ApplyLog();
|
||||
/// Multiply two matrices elementhwise: C = A .* C
|
||||
void MulElements(const CuMatrixBase<Real>& A);
|
||||
|
||||
/// Multiply two matrices elementwise: C = A .* C
|
||||
void MulElements(const CuMatrixBase<Real> &A);
|
||||
/// Do, elementwise, *this = max(*this, A).
|
||||
void Max(const CuMatrixBase<Real> &A);
|
||||
/// scale i'th column by scale[i]
|
||||
void MulColsVec(const CuVectorBase<Real> &scale);
|
||||
/// scale i'th row by scale[i]
|
||||
void MulRowsVec(const CuVectorBase<Real> &scale);
|
||||
void MulRowsVec(const CuVectorBase<Real> &scale);
|
||||
/// divide each row into src.NumCols() groups, and then scale i'th row's jth group of elements by src[i, j].
|
||||
void MulRowsGroupMat(const CuMatrixBase<Real> &src);
|
||||
/// divide i'th row by div[i]
|
||||
void DivRowsVec(const CuVectorBase<Real> &div);
|
||||
/// B = alpha * A + beta * B
|
||||
void AddMat(Real alpha, const CuMatrixBase<Real>& A, Real beta=1.0);
|
||||
void AddMat(Real alpha, const CuMatrixBase<Real> &A, Real beta=1.0);
|
||||
/// B = alpha * col + beta * B
|
||||
void AddVecToCols(Real alpha, const CuVectorBase<Real> &col, Real beta=1.0);
|
||||
void AddVecToCols(Real alpha, const CuVectorBase<Real> &col, Real beta = 1.0);
|
||||
/// B = alpha * row + beta * B
|
||||
void AddVecToRows(Real alpha, const CuVectorBase<Real> &row, Real beta=1.0);
|
||||
void AddVecToRows(Real alpha, const CuVectorBase<Real> &row, Real beta = 1.0);
|
||||
/// C = alpha * A(^T)*B(^T) + beta * C
|
||||
void AddMatMat(Real alpha, const CuMatrixBase<Real>& A, MatrixTransposeType transA,
|
||||
const CuMatrixBase<Real>& B, MatrixTransposeType transB, Real beta);
|
||||
void AddMatMat(Real alpha, const CuMatrixBase<Real> &A, MatrixTransposeType transA,
|
||||
const CuMatrixBase<Real> &B, MatrixTransposeType transB, Real beta);
|
||||
|
||||
/// *this = beta * *this + alpha * M M^T, for symmetric matrices. It only
|
||||
/// updates the lower triangle of *this. It will leave the matrix asymmetric;
|
||||
/// if you need it symmetric as a regular matrix, do CopyLowerToUpper().
|
||||
void SymAddMat2(const Real alpha, const CuMatrixBase<Real> &M,
|
||||
MatrixTransposeType transA, Real beta);
|
||||
|
||||
|
||||
/// This function is like AddMatMat but for where the second argument is of
|
||||
/// type CuBlockMatrix (a block-diagonal matrix of blocks).
|
||||
void AddMatBlock(Real alpha, const CuMatrixBase<Real> &A, MatrixTransposeType transA,
|
||||
const CuBlockMatrix<Real> &B, MatrixTransposeType transB, Real beta);
|
||||
|
||||
/// *this = beta * *this + alpha * diag(v) * M [or M^T].
|
||||
/// The same as adding M but scaling each row M_i by v(i).
|
||||
void AddDiagVecMat(const Real alpha, CuVectorBase<Real> &v,
|
||||
const CuMatrixBase<Real> &M, MatrixTransposeType transM,
|
||||
Real beta = 1.0);
|
||||
|
||||
/// this <-- beta*this + alpha*A*B
|
||||
void AddMatSp(const Real alpha,
|
||||
const CuMatrixBase<Real> &A, MatrixTransposeType transA,
|
||||
const CuSpMatrix<Real> &B,
|
||||
const Real beta) {
|
||||
CuMatrix<Real> M(B);
|
||||
return AddMatMat(alpha, A, transA, M, kNoTrans, beta);
|
||||
}
|
||||
|
||||
/// this <-- beta*this + alpha*SpA*B
|
||||
void AddSpMat(const Real alpha,
|
||||
const CuSpMatrix<Real> &A,
|
||||
const CuMatrixBase<Real> &B, MatrixTransposeType transB,
|
||||
const Real beta) {
|
||||
CuMatrix<Real> M(A);
|
||||
return AddMatMat(alpha, M, kNoTrans, B, transB, beta);
|
||||
}
|
||||
|
||||
/// this <-- beta*this + alpha*A*B.
|
||||
void AddTpMat(const Real alpha,
|
||||
const CuTpMatrix<Real> &A, MatrixTransposeType transA,
|
||||
const CuMatrixBase<Real> &B, MatrixTransposeType transB,
|
||||
const Real beta) {
|
||||
CuMatrix<Real> M(A);
|
||||
return AddMatMat(alpha, M, transA, B, transB, beta);
|
||||
}
|
||||
|
||||
/// this <-- beta*this + alpha*A*B.
|
||||
void AddMatTp(const Real alpha,
|
||||
const CuMatrixBase<Real> &A, MatrixTransposeType transA,
|
||||
const CuTpMatrix<Real> &B, MatrixTransposeType transB,
|
||||
const Real beta) {
|
||||
CuMatrix<Real> M(B);
|
||||
return AddMatMat(alpha, A, transA, M, transB, beta);
|
||||
}
|
||||
|
||||
void CopyFromBlock(const CuBlockMatrix<Real> &B,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
void CopyLowerToUpper();
|
||||
void CopyUpperToLower();
|
||||
inline CuSubMatrix<Real> Range(const MatrixIndexT row_offset,
|
||||
const MatrixIndexT num_rows,
|
||||
const MatrixIndexT col_offset,
|
||||
|
@ -177,11 +355,67 @@ class CuMatrixBase {
|
|||
return CuSubMatrix<Real>(*this, 0, num_rows_, col_offset, num_cols);
|
||||
}
|
||||
|
||||
/// Returns a CuSubVector view of row i (const version).  The view starts at
/// data_ + i * stride_ and has NumCols() elements; i is asserted to lie in
/// [0, num_rows_).
inline const CuSubVector<Real> Row(MatrixIndexT i) const {
  // A single unsigned comparison rejects negative as well as too-large i.
  KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
               static_cast<UnsignedMatrixIndexT>(num_rows_));
  Real *row_start = data_ + (i * stride_);
  return CuSubVector<Real>(row_start, NumCols());
}
|
||||
|
||||
/// Returns a CuSubVector view of row i.  The view starts at
/// data_ + i * stride_ and has NumCols() elements; i is asserted to lie in
/// [0, num_rows_).
inline CuSubVector<Real> Row(MatrixIndexT i) {
  // A single unsigned comparison rejects negative as well as too-large i.
  KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
               static_cast<UnsignedMatrixIndexT>(num_rows_));
  Real *row_start = data_ + (i * stride_);
  return CuSubVector<Real>(row_start, NumCols());
}
|
||||
|
||||
/// Element access: returns a CuValue<Real> wrapping the address of element
/// (r, c), i.e. data_ + r * stride_ + c.  The indices are range-checked only
/// through KALDI_PARANOID_ASSERT.
inline CuValue<Real> operator() (MatrixIndexT r, MatrixIndexT c) {
  KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
                        static_cast<UnsignedMatrixIndexT>(num_rows_) &&
                        static_cast<UnsignedMatrixIndexT>(c) <
                        static_cast<UnsignedMatrixIndexT>(num_cols_));
  Real *element = data_ + r * stride_ + c;
  return CuValue<Real>(element);
}
|
||||
|
||||
/// Read-only element access: builds a CuValue<Real> for the address of
/// element (r, c), i.e. data_ + r * stride_ + c, and returns it converted to
/// Real.  The indices are range-checked only through KALDI_PARANOID_ASSERT.
inline Real operator() (MatrixIndexT r, MatrixIndexT c) const {
  KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
                        static_cast<UnsignedMatrixIndexT>(num_rows_) &&
                        static_cast<UnsignedMatrixIndexT>(c) <
                        static_cast<UnsignedMatrixIndexT>(num_cols_));
  Real *element = data_ + r * stride_ + c;
  return CuValue<Real>(element);  // implicitly converted to Real.
}
|
||||
|
||||
Real Sum() const;
|
||||
|
||||
/// Return the trace. If check_square = true, will crash if matrix is not square.
|
||||
Real Trace(bool check_square = true) const;
|
||||
|
||||
void SetRandn();
|
||||
|
||||
void SetRandUniform();
|
||||
|
||||
void Write(std::ostream &os, bool binary) const;
|
||||
|
||||
// This function resizes the output to indices.size(), and for each element of
|
||||
// "indices" it interprets it as a (row, column) index into *this, and puts
|
||||
// (*this)(row, column) into the corresponding element of "output".
|
||||
void Lookup(const std::vector<Int32Pair> &indices,
|
||||
std::vector<Real> *output) const;
|
||||
protected:
|
||||
// The following two functions should only be called if we did not compile with CUDA
|
||||
// or could not get a CUDA card; in that case the contents are interpreted the
|
||||
// same as a regular matrix.
|
||||
inline const MatrixBase<Real> &Mat() const {
|
||||
return *(reinterpret_cast<const MatrixBase<Real>* >(this));
|
||||
}
|
||||
inline MatrixBase<Real> &Mat() {
|
||||
return *(reinterpret_cast<MatrixBase<Real>* >(this));
|
||||
}
|
||||
|
||||
/// Get raw row pointer
|
||||
inline const Real* RowData(MatrixIndexT r) const { return data_ + r * stride_; }
|
||||
inline Real* RowData(MatrixIndexT r) { return data_ + r * stride_; }
|
||||
inline const Real *Data() const { return data_; }
|
||||
inline Real *Data() { return data_; }
|
||||
|
||||
|
||||
|
||||
// The constructors are protected to prevent the user creating an instance of
|
||||
|
@ -198,19 +432,9 @@ class CuMatrixBase {
|
|||
MatrixIndexT stride):
|
||||
data_(data), num_cols_(num_cols), num_rows_(num_rows), stride_(stride) { }
|
||||
|
||||
// The following two functions should only be called if we did not compile with CUDA
|
||||
// or could not get a CUDA card; in that case the contents are interpreted the
|
||||
// same as a regular matrix.
|
||||
inline const MatrixBase<Real> &Mat() const {
|
||||
return *(reinterpret_cast<const MatrixBase<Real>* >(this));
|
||||
}
|
||||
inline MatrixBase<Real> &Mat() {
|
||||
return *(reinterpret_cast<MatrixBase<Real>* >(this));
|
||||
}
|
||||
|
||||
Real *data_; ///< GPU data pointer (or regular matrix data pointer,
|
||||
///< if either CUDA was not compiled in or we could not
|
||||
///< acquire the device).
|
||||
///< if either CUDA was not compiled in or we could not
|
||||
///< acquire the device).
|
||||
// Note: it might seem a bit backwards that we have the number of columns
|
||||
// first here; it's necessary because we need the data to be laid out the same
|
||||
// as for MatrixBase so the Mat() function call will work. We don't want to
|
||||
|
@ -239,15 +463,34 @@ class CuMatrix: public CuMatrixBase<Real> {
|
|||
|
||||
// Note: we had to remove the "explicit" keyword due
|
||||
// to problems with STL vectors of CuMatrixBase.
|
||||
CuMatrix(const CuMatrix<Real> &other) {
|
||||
this->Resize(other.NumRows(), other.NumCols(), kUndefined);
|
||||
this->CopyFromMat(other);
|
||||
CuMatrix(const CuMatrix<Real> &other,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
explicit CuMatrix(const CuBlockMatrix<Real> &other,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
explicit CuMatrix(const CuMatrixBase<Real> &other,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
template<typename OtherReal>
|
||||
explicit CuMatrix(const MatrixBase<OtherReal> &other,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
/// Copy constructor taking SpMatrix...
|
||||
explicit CuMatrix(const CuSpMatrix<Real> &M) : CuMatrixBase<Real>() {
|
||||
Resize(M.NumRows(), M.NumRows(), kUndefined);
|
||||
this->CopyFromSp(M);
|
||||
}
|
||||
|
||||
explicit CuMatrix(const MatrixBase<Real> &other) {
|
||||
this->Resize(other.NumRows(), other.NumCols(), kUndefined);
|
||||
this->CopyFromMat(other);
|
||||
}
|
||||
/// Copy constructor taking TpMatrix...
|
||||
template <typename OtherReal>
|
||||
explicit CuMatrix(const CuTpMatrix<OtherReal> & M,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
/// Copy constructor: as above, but from another type.
|
||||
template<typename OtherReal>
|
||||
explicit CuMatrix(const CuMatrixBase<OtherReal> &M,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
CuMatrix<Real> &operator = (const CuMatrixBase<Real> &other) {
|
||||
this->Resize(other.NumRows(), other.NumCols(), kUndefined);
|
||||
|
@ -265,21 +508,45 @@ class CuMatrix: public CuMatrixBase<Real> {
|
|||
this->Resize(other.NumRows(), other.NumCols(), kUndefined);
|
||||
this->CopyFromMat(other);
|
||||
return *this;
|
||||
}
|
||||
}
|
||||
|
||||
void Transpose();
|
||||
|
||||
/// Allocate the memory
|
||||
void Resize(MatrixIndexT rows, MatrixIndexT cols,
|
||||
MatrixResizeType resize_type = kSetZero);
|
||||
|
||||
|
||||
|
||||
void Swap(Matrix<Real> *mat);
|
||||
void Swap(CuMatrix<Real> *mat);
|
||||
|
||||
template<typename OtherReal>
|
||||
void Swap(CuMatrix<OtherReal> *mat);
|
||||
|
||||
/// I/O functions
|
||||
void Read(std::istream &is, bool binary);
|
||||
void Write(std::ostream &os, bool binary) const;
|
||||
|
||||
/// Destructor
|
||||
~CuMatrix() { Destroy(); }
|
||||
|
||||
inline const Matrix<Real> &Mat() const {
|
||||
return *(reinterpret_cast<const Matrix<Real>* >(this));
|
||||
}
|
||||
inline Matrix<Real> &Mat() {
|
||||
return *(reinterpret_cast<Matrix<Real>* >(this));
|
||||
}
|
||||
|
||||
/// This function does: for each element { row, column, weight } indexed i in
|
||||
/// the vector "elements", let x(i) = A(row(i), column(i)); then it does
|
||||
/// (*this)(row(i), column(i)) += weight(i) / x(i), and
|
||||
/// *tot_objf = \sum_i weight(i) * log(x(i)), and
|
||||
/// *tot_weight = \sum_i weight(i)
|
||||
/// Preconditions: A must be strictly positive, and no (row, column) pair
|
||||
/// may be repeated within "elements"
|
||||
void CompObjfAndDeriv(const std::vector<MatrixElement<Real> > &elements,
|
||||
const CuMatrix<Real> &A,
|
||||
Real *tot_objf,
|
||||
Real* tot_weight);
|
||||
|
||||
private:
|
||||
void Destroy();
|
||||
};
|
||||
|
@ -305,27 +572,55 @@ class CuSubMatrix: public CuMatrixBase<Real> {
|
|||
CuSubMatrix<Real> &operator = (const CuSubMatrix<Real> &other);
|
||||
};
|
||||
|
||||
template<class Real>
|
||||
|
||||
template<typename Real>
|
||||
bool ApproxEqual(const CuMatrixBase<Real> &A,
|
||||
const CuMatrixBase<Real> &B, Real tol = 0.01) {
|
||||
return A.ApproxEqual(B, tol);
|
||||
}
|
||||
|
||||
/// Asserts (through KALDI_ASSERT) that A.ApproxEqual(B, tol) holds.
/// Note that the tolerance is taken as float regardless of Real.
template<typename Real>
|
||||
inline void AssertEqual(CuMatrixBase<Real> &A, CuMatrixBase<Real> &B,
|
||||
float tol = 0.01) {
|
||||
KALDI_ASSERT(A.ApproxEqual(B, tol));
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
bool SameDim(const CuMatrixBase<Real> &M, const CuMatrixBase<Real> &N) {
|
||||
return (M.NumRows() == N.NumRows() && M.NumCols() == N.NumCols());
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool SameDimAndStride(const CuMatrixBase<Real> &M, const CuMatrixBase<Real> &N) {
|
||||
return (M.NumRows() == N.NumRows() && M.NumCols() == N.NumCols()
|
||||
&& M.Stride() == N.Stride());
|
||||
}
|
||||
|
||||
|
||||
/// I/O
|
||||
template<typename Real>
|
||||
std::ostream &operator << (std::ostream &out, const CuMatrixBase<Real> &mat);
|
||||
|
||||
|
||||
|
||||
} // namespace
|
||||
template<typename Real>
|
||||
template<typename OtherReal>
|
||||
Matrix<Real>::Matrix(const CuMatrixBase<OtherReal> &M,
|
||||
MatrixTransposeType trans) {
|
||||
if (trans == kNoTrans) Init(M.NumRows(), M.NumCols());
|
||||
else Init(M.NumCols(), M.NumRows());
|
||||
M.CopyToMat(this, trans);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
template<typename OtherReal>
|
||||
void MatrixBase<Real>::CopyFromMat(const CuMatrixBase<OtherReal> &cu,
|
||||
MatrixTransposeType trans) {
|
||||
cu.CopyToMat(this, trans);
|
||||
}
|
||||
|
||||
|
||||
#include "cu-matrix-inl.h"
|
||||
} // namespace
|
||||
|
||||
|
||||
#include "cudamatrix/cu-matrix-inl.h"
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
// cudamatrix/cu-matrixdim.h
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
|
@ -28,12 +29,20 @@
|
|||
#ifdef _MSC_VER
|
||||
typedef unsigned __int32 uint32_cuda;
|
||||
typedef __int32 int32_cuda;
|
||||
typedef __int32 MatrixIndexT_cuda; // you'd have to change this if you changed MatrixIndexT from int32.
|
||||
#else
|
||||
#include <stdint.h>
|
||||
typedef uint32_t uint32_cuda;
|
||||
typedef int32_t int32_cuda;
|
||||
typedef int32_t MatrixIndexT_cuda; // you'd have to change this if you changed MatrixIndexT from int32.
|
||||
#endif
|
||||
|
||||
/// One (row, column, weight) triple: a matrix position given by two
/// int32_cuda indices, plus a weight of the element type Real.
template<typename Real>
|
||||
struct MatrixElement {
|
||||
int32_cuda row;  // row index of the element
|
||||
int32_cuda column;  // column index of the element
|
||||
Real weight;  // weight attached to this (row, column) position
|
||||
};
|
||||
|
||||
extern "C" {
|
||||
/**
|
||||
|
@ -45,8 +54,37 @@ extern "C" {
|
|||
int32_cuda cols;
|
||||
int32_cuda stride;
|
||||
} MatrixDim;
|
||||
|
||||
// we define the following constants here because this file is included
|
||||
// both by the C++ code and also CUDA code.
|
||||
|
||||
|
||||
// The size of a CUDA 1-d block, e.g. for vector operations..
|
||||
#define CU1DBLOCK 256
|
||||
|
||||
// The size of edge of CUDA square block, e.g. for matrix operations.
|
||||
// Must be defined the same in cu-kernels-ansi.h
|
||||
#define CU2DBLOCK 16
|
||||
|
||||
|
||||
/** This structure is used in cu-block-matrix.h to store information
|
||||
about a block-diagonal matrix. We declare it here so that it
|
||||
will be accessible to both the C++ code and the CUDA code.
|
||||
*/
|
||||
typedef struct CuBlockMatrixData_ {
|
||||
int32_cuda row_offset; // sum of #rows of previous M_i
|
||||
int32_cuda col_offset; // sum of #cols of previous M_i
|
||||
MatrixDim matrix_dim; // dimension of this M_i
|
||||
void *matrix_data; // data for M_i. This is a pointer to either float* or
|
||||
// double*. Because C doesn't support templates and to
|
||||
// avoid extra coding to support the two cases, we
|
||||
// decided to make this a void* pointer.
|
||||
} CuBlockMatrixData;
|
||||
|
||||
/// A plain pair of int32_cuda values, named first and second.
typedef struct Int32Pair {
|
||||
int32_cuda first;
|
||||
int32_cuda second;
|
||||
} Int32Pair;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,265 @@
|
|||
// cudamatrix/cu-sp-matrix-test.cc
|
||||
//
|
||||
// Copyright 2013 Ehsan Variani
|
||||
// Lucas Ondel
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// UnitTests for testing cu-sp-matrix.h methods.
|
||||
//
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
#include "cudamatrix/cu-sp-matrix.h"
|
||||
#include "cudamatrix/cu-vector.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
|
||||
using namespace kaldi;
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
/*
|
||||
* INITIALIZERS
|
||||
*/
|
||||
|
||||
/*
|
||||
* ASSERTS
|
||||
*/
|
||||
template<typename Real>
|
||||
static void AssertEqual(const CuPackedMatrix<Real> &A,
|
||||
const CuPackedMatrix<Real> &B,
|
||||
float tol = 0.001) {
|
||||
KALDI_ASSERT(A.NumRows() == B.NumRows());
|
||||
for (MatrixIndexT i = 0; i < A.NumRows(); i++)
|
||||
for (MatrixIndexT j = 0; j <= i; j++)
|
||||
KALDI_ASSERT(std::abs(A(i, j) - B(i, j))
|
||||
< tol * std::max(1.0, (double) (std::abs(A(i, j)) + std::abs(B(i, j)))));
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void AssertEqual(const PackedMatrix<Real> &A,
|
||||
const PackedMatrix<Real> &B,
|
||||
float tol = 0.001) {
|
||||
KALDI_ASSERT(A.NumRows() == B.NumRows());
|
||||
for (MatrixIndexT i = 0; i < A.NumRows(); i++)
|
||||
for (MatrixIndexT j = 0; j <= i; j++)
|
||||
KALDI_ASSERT(std::abs(A(i, j) - B(i, j))
|
||||
< tol * std::max(1.0, (double) (std::abs(A(i, j)) + std::abs(B(i, j)))));
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void AssertDiagEqual(const PackedMatrix<Real> &A,
|
||||
const CuPackedMatrix<Real> &B,
|
||||
float value,
|
||||
float tol = 0.001) {
|
||||
for (MatrixIndexT i = 0; i < A.NumRows(); i++) {
|
||||
KALDI_ASSERT(std::abs((A(i, i)+value) - B(i, i))
|
||||
< tol * std::max(1.0, (double) (std::abs(A(i, i)) + std::abs(B(i, i) + value))));
|
||||
}
|
||||
}
|
||||
template<typename Real>
|
||||
static void AssertDiagEqual(const PackedMatrix<Real> &A,
|
||||
const PackedMatrix<Real> &B,
|
||||
float value,
|
||||
float tol = 0.001) {
|
||||
for (MatrixIndexT i = 0; i < A.NumRows(); i++) {
|
||||
KALDI_ASSERT(std::abs((A(i, i)+value) - B(i, i))
|
||||
< tol * std::max(1.0, (double) (std::abs(A(i, i)) + std::abs(B(i, i) + value))));
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void AssertEqual(const PackedMatrix<Real> &A,
|
||||
const CuPackedMatrix<Real> &B,
|
||||
float tol = 0.001) {
|
||||
KALDI_ASSERT(A.NumRows() == B.NumRows());
|
||||
for (MatrixIndexT i = 0; i < A.NumRows(); i++)
|
||||
for (MatrixIndexT j = 0; j <= i; j++)
|
||||
KALDI_ASSERT(std::abs(A(i, j) - B(i, j))
|
||||
< tol * std::max(1.0, (double) (std::abs(A(i, j)) + std::abs(B(i, j)))));
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static bool ApproxEqual(const PackedMatrix<Real> &A,
|
||||
const PackedMatrix<Real> &B, Real tol = 0.001) {
|
||||
KALDI_ASSERT(A.NumRows() == B.NumRows());
|
||||
PackedMatrix<Real> diff(A);
|
||||
diff.AddPacked(1.0, B);
|
||||
Real a = std::max(A.Max(), -A.Min()), b = std::max(B.Max(), -B.Min()),
|
||||
d = std::max(diff.Max(), -diff.Min());
|
||||
return (d <= tol * std::max(a, b));
|
||||
}
|
||||
|
||||
/*
|
||||
* Unit Tests
|
||||
*/
|
||||
template<typename Real>
|
||||
static void UnitTestCuPackedMatrixConstructor() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 10 * i;
|
||||
|
||||
PackedMatrix<Real> A(dim);
|
||||
A.SetRandn();
|
||||
CuPackedMatrix<Real> B(A);
|
||||
CuPackedMatrix<Real> C(B);
|
||||
AssertEqual(B, C);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuPackedMatrixCopy() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 10 * i;
|
||||
|
||||
PackedMatrix<Real> A(dim);
|
||||
A.SetRandn();
|
||||
CuPackedMatrix<Real> B(A);
|
||||
|
||||
CuPackedMatrix<Real> C(dim);
|
||||
C.CopyFromPacked(A);
|
||||
CuPackedMatrix<Real> D(dim);
|
||||
D.CopyFromPacked(B);
|
||||
AssertEqual(C, D);
|
||||
|
||||
PackedMatrix<Real> E(dim);
|
||||
D.CopyToPacked(&E);
|
||||
AssertEqual(A, E);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuPackedMatrixTrace() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 5 * i + rand() % 10;
|
||||
|
||||
PackedMatrix<Real> A(dim);
|
||||
A.SetRandn();
|
||||
CuPackedMatrix<Real> B(A);
|
||||
|
||||
AssertEqual(A.Trace(), B.Trace());
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuPackedMatrixScale() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 5 * i + rand() % 10;
|
||||
|
||||
PackedMatrix<Real> A(dim);
|
||||
A.SetRandn();
|
||||
CuPackedMatrix<Real> B(A);
|
||||
|
||||
Real scale_factor = 23.5896223;
|
||||
A.Scale(scale_factor);
|
||||
B.Scale(scale_factor);
|
||||
AssertEqual(A, B);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuPackedMatrixScaleDiag() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 5 * i + rand() % 10;
|
||||
|
||||
PackedMatrix<Real> A(dim);
|
||||
A.SetRandn();
|
||||
CuPackedMatrix<Real> B(A);
|
||||
|
||||
Real scale_factor = 23.5896223;
|
||||
A.ScaleDiag(scale_factor);
|
||||
B.ScaleDiag(scale_factor);
|
||||
AssertEqual(A, B);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuPackedMatrixAddToDiag() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 5 * i + rand() % 10;
|
||||
|
||||
PackedMatrix<Real> A(dim);
|
||||
A.SetRandn();
|
||||
CuPackedMatrix<Real> B(A);
|
||||
|
||||
Real value = rand() % 50;
|
||||
B.AddToDiag(value);
|
||||
|
||||
AssertDiagEqual(A, B, value);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuPackedMatrixSetUnit() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 5 * i + rand() % 10;
|
||||
|
||||
CuPackedMatrix<Real> A(dim);
|
||||
A.SetUnit();
|
||||
|
||||
for (MatrixIndexT i = 0; i < A.NumRows(); i++) {
|
||||
for (MatrixIndexT j = 0; j < A.NumRows(); j++) {
|
||||
if (i != j) {
|
||||
KALDI_ASSERT(A(i, j) == 0);
|
||||
} else {
|
||||
KALDI_ASSERT(A(i, j) == 1.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real> void CudaPackedMatrixUnitTest() {
|
||||
UnitTestCuPackedMatrixConstructor<Real>();
|
||||
//UnitTestCuPackedMatrixCopy<Real>();
|
||||
UnitTestCuPackedMatrixTrace<Real>();
|
||||
UnitTestCuPackedMatrixScale<Real>();
|
||||
UnitTestCuPackedMatrixAddToDiag<Real>();
|
||||
UnitTestCuPackedMatrixSetUnit<Real>();
|
||||
}
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
int main() {
|
||||
using namespace kaldi;
|
||||
#if HAVE_CUDA == 1
|
||||
// Select the GPU
|
||||
CuDevice::Instantiate().SelectGpuId("yes");
|
||||
#endif
|
||||
kaldi::CudaPackedMatrixUnitTest<float>();
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().DoublePrecisionSupported()) {
|
||||
kaldi::CudaPackedMatrixUnitTest<double>();
|
||||
} else {
|
||||
KALDI_WARN << "Double precision not supported";
|
||||
}
|
||||
#else
|
||||
kaldi::CudaPackedMatrixUnitTest<double>();
|
||||
#endif
|
||||
|
||||
KALDI_LOG << "Tests succeeded";
|
||||
#if HAVE_CUDA == 1
|
||||
CuDevice::Instantiate().PrintProfile();
|
||||
#endif
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,400 @@
|
|||
// cudamatrix/cu-packed-matrix.cc
|
||||
|
||||
// Copyright 2009-2013 Johns Hopkins University (author: Daniel Povey)
|
||||
// Karel Vesely
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <cublas.h>
|
||||
#endif
|
||||
|
||||
#include "util/timer.h"
|
||||
#include "cudamatrix/cu-common.h"
|
||||
#include "cudamatrix/cu-vector.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
#include "cudamatrix/cu-kernels.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
#include "cudamatrix/cu-packed-matrix.h"
|
||||
#include "cudamatrix/cublas-wrappers.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
template<typename Real>
|
||||
void CuPackedMatrix<Real>::Resize(MatrixIndexT rows,
|
||||
MatrixResizeType resize_type) {
|
||||
// This code does not currently support the other resize_type options.
|
||||
KALDI_ASSERT(resize_type == kSetZero || resize_type == kUndefined);
|
||||
|
||||
if (this->num_rows_ == rows) {
|
||||
if (resize_type == kSetZero) this->SetZero();
|
||||
return;
|
||||
}
|
||||
|
||||
if (this->num_rows_ != 0)
|
||||
this->Destroy();
|
||||
if (rows == 0) return;
|
||||
#if HAVE_CUDA == 1
|
||||
CuDevice &device = CuDevice::Instantiate();
|
||||
if (device.Enabled()) {
|
||||
Timer tim;
|
||||
this->num_rows_ = rows;
|
||||
size_t nr = static_cast<size_t>(num_rows_),
|
||||
num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real);
|
||||
this->data_ = static_cast<Real*>(device.Malloc(num_bytes));
|
||||
|
||||
if (resize_type == kSetZero) this->SetZero();
|
||||
device.AccuProfile("CuPackedMatrix::Resize", tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{ // Let the initializer of SpMatrix<Real> handle the allocation,
|
||||
// and then just do Swap which will switch the pointers.
|
||||
// This wastes a few instructions but is simple to code.
|
||||
SpMatrix<Real> mat(rows, resize_type);
|
||||
this->Swap(&mat);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuPackedMatrix<Real>::SetRandn() {
|
||||
if (num_rows_ != 0) {
|
||||
MatrixIndexT size = num_rows_ * (num_rows_ + 1) / 2;
|
||||
CuSubVector<Real> tmp(data_, size);
|
||||
CuRand<Real> rand;
|
||||
rand.RandGaussian(&tmp);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuPackedMatrix<Real>::Destroy() {
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
if (this->data_ != NULL) {
|
||||
CuDevice::Instantiate().Free(this->data_);
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
if (this->data_ != NULL) KALDI_MEMALIGN_FREE(this->data_);
|
||||
}
|
||||
this->data_ = NULL;
|
||||
this->num_rows_ = 0;
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuPackedMatrix<Real>::Swap(PackedMatrix<Real> *mat) {
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
if (this->num_rows_ == 0) {
|
||||
if (mat->num_rows_ != 0) {
|
||||
// *this is empty, but mat is nonempty.
|
||||
Resize(mat->num_rows_, kUndefined);
|
||||
CopyFromPacked(*mat);
|
||||
mat->Resize(0);
|
||||
}
|
||||
// else both are empty.
|
||||
} else { // *this is nonempty.
|
||||
if (mat->num_rows_ != 0) {
|
||||
// Both *this and *mat are nonempty. Recurse to simpler cases.
|
||||
// this could be done more efficiently in the case where
|
||||
// the size does not change.
|
||||
PackedMatrix<Real> temp;
|
||||
this->Swap(&temp); // now temp is full, *this is empty.
|
||||
mat->Swap(&temp); // now mat has data from *this, temp has
|
||||
// data from mat.
|
||||
this->Swap(mat); // copy data in mat to *this, which is now empty.
|
||||
} else { // *this is full but *mat is empty.
|
||||
mat->Resize(this->num_rows_, kUndefined);
|
||||
this->CopyToPacked(mat);
|
||||
this->Destroy();
|
||||
}
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
std::swap(mat->data_, this->data_);
|
||||
std::swap(mat->num_rows_, this->num_rows_);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuPackedMatrix<Real>::CopyFromPacked(const CuPackedMatrix<Real> &src) {
|
||||
KALDI_ASSERT(src.NumRows() == num_rows_);
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
if (num_rows_ == 0) return; // Nothing to do.
|
||||
Timer tim;
|
||||
size_t nr = static_cast<size_t>(num_rows_),
|
||||
num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real);
|
||||
|
||||
CU_SAFE_CALL(cudaMemcpy(data_, src.data_, num_bytes,
|
||||
cudaMemcpyDeviceToDevice));
|
||||
CuDevice::Instantiate().AccuProfile("CuPackedMatrix::CopyFromPacked1",tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Mat().CopyFromPacked(src.Mat());
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuPackedMatrix<Real>::CopyFromPacked(const PackedMatrix<Real> &src) {
|
||||
KALDI_ASSERT(src.NumRows() == num_rows_);
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
if (num_rows_ == 0) return; // Nothing to do.
|
||||
Timer tim;
|
||||
CU_SAFE_CALL(cudaMemcpy(data_, src.data_, src.SizeInBytes(),
|
||||
cudaMemcpyHostToDevice));
|
||||
CuDevice::Instantiate().AccuProfile("CuPackedMatrix::CopyFromPacked2",tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Mat().CopyFromPacked(src);
|
||||
//memcpy(data_, src.Data(), SizeInBytes());
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuPackedMatrix<Real>::CopyToPacked(PackedMatrix<Real> *dst) const {
|
||||
KALDI_ASSERT(dst->NumRows() == NumRows());
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
if (num_rows_ == 0) return; // Nothing to do.
|
||||
Timer tim;
|
||||
size_t nr = static_cast<size_t>(num_rows_),
|
||||
num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real);
|
||||
|
||||
CU_SAFE_CALL(cudaMemcpy(dst->data_, data_, num_bytes,
|
||||
cudaMemcpyDeviceToHost));
|
||||
CuDevice::Instantiate().AccuProfile("CuPackedMatrixMatrix::CopyToPackedD2H",tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
//memcpy(data_, dst->Data(), SizeInBytes());
|
||||
dst->CopyFromPacked(Mat());
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
template<typename Real>
|
||||
void CuPackedMatrix<Real>::CopyRowsFromPacked(int32 r, const CuPackedMatrix<Real> &src, int32 src_ro, int32 dst_ro) {
|
||||
KALDI_ASSERT(r+src_ro <= src.NumRows());
|
||||
KALDI_ASSERT(r+dst_ro <= NumRows());
|
||||
KALDI_ASSERT(NumCols() == src.NumCols());
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
MatrixIndexT dst_pitch = stride_*sizeof(Real);
|
||||
MatrixIndexT src_pitch = src.Stride()*sizeof(Real);
|
||||
MatrixIndexT width = src.NumCols()*sizeof(Real);
|
||||
|
||||
const Real *p_src = src.Data() + src_ro*src.Stride();
|
||||
Real *p_dst = data_ + dst_ro*stride_;
|
||||
|
||||
CU_SAFE_CALL(cudaMemcpy2D(p_dst, dst_pitch, p_src, src_pitch, width, r, cudaMemcpyDeviceToDevice));
|
||||
|
||||
CuDevice::Instantiate().AccuProfile("CuMatrix::CopyRowsD2D",tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
memcpy(Data()+dst_ro*stride_, src.Data()+src_ro*src.Stride(), r*stride_*sizeof(Real));
|
||||
}
|
||||
} */
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void CuPackedMatrix<Real>::Read(std::istream &is, bool binary) {
|
||||
PackedMatrix<Real> temp;
|
||||
temp.Read(is, binary);
|
||||
Destroy();
|
||||
Swap(&temp);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuPackedMatrix<Real>::Write(std::ostream &os, bool binary) const {
|
||||
PackedMatrix<Real> temp(this->num_rows_, kUndefined);
|
||||
this->CopyToPacked(&temp);
|
||||
temp.Write(os, binary);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuPackedMatrix<Real>::SetZero() {
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
size_t nr = static_cast<size_t>(num_rows_),
|
||||
num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real);
|
||||
|
||||
CU_SAFE_CALL(cudaMemset(reinterpret_cast<void*>(this->data_), 0, num_bytes));
|
||||
CuDevice::Instantiate().AccuProfile("CuPackedMatrix::SetZero", tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Mat().SetZero();
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
Real CuPackedMatrix<Real>::Trace() const {
|
||||
Real result = 0.0;
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
if (num_rows_ == 0) return 0.0;
|
||||
CuVector<Real> tmp(num_rows_, kUndefined);
|
||||
tmp.CopyDiagFromPacked(*this);
|
||||
return tmp.Sum();
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
result = Mat().Trace();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuPackedMatrix<Real>::SetDiag(Real alpha) {
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
if (num_rows_ == 0) return;
|
||||
Timer tim;
|
||||
int dimBlock(CU1DBLOCK);
|
||||
int dimGrid(n_blocks(NumRows(),CU1DBLOCK));
|
||||
cuda_set_diag_packed(dimGrid,dimBlock,data_,alpha,num_rows_);
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
CuDevice::Instantiate().AccuProfile("CuPackedMatrix::SetDiag", tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Mat().SetDiag(alpha);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuPackedMatrix<Real>::Scale(Real alpha) {
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
size_t nr = static_cast<size_t>(num_rows_),
|
||||
num_elements = ((nr * (nr+1)) / 2);
|
||||
cublas_scal(num_elements, alpha, data_, 1);
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
CuDevice::Instantiate().AccuProfile("CuPackedMatrix::Scale", tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Mat().Scale(alpha);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuPackedMatrix<Real>::ScaleDiag(Real alpha) {
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
int dimBlock(CU1DBLOCK);
|
||||
int dimGrid(n_blocks(NumRows(),CU1DBLOCK));
|
||||
CU_SAFE_CALL(cudaGetLastError()); // TEMP
|
||||
cuda_scale_diag(dimGrid,dimBlock,data_,alpha,num_rows_);
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
CuDevice::Instantiate().AccuProfile("CuPackedMatrix::ScaleDiag", tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Mat().ScaleDiag(alpha);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuPackedMatrix<Real>::AddPacked(const Real alpha, const CuPackedMatrix<Real> &M) {
|
||||
KALDI_ASSERT(num_rows_ == M.NumRows());
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
if (num_rows_ == 0) return;
|
||||
Timer tim;
|
||||
size_t nr = num_rows_,
|
||||
sz = (nr * (nr + 1)) / 2;
|
||||
cublas_axpy(sz, alpha, M.Data(), 1, data_, 1);
|
||||
CuDevice::Instantiate().AccuProfile("CuPackedMatrix::AddPacked", tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Mat().AddPacked(alpha, M.Mat());
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuPackedMatrix<Real>::AddToDiag(Real r) {
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
if (num_rows_ == 0) return;
|
||||
Timer tim;
|
||||
int dimBlock(CU1DBLOCK);
|
||||
int dimGrid(n_blocks(NumRows(),CU1DBLOCK));
|
||||
cuda_add_diag_packed(dimGrid,dimBlock,data_,r,num_rows_);
|
||||
CuDevice::Instantiate().AccuProfile("CuPackedMatrix::AddToDiag", tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
// TODO
|
||||
Mat().AddToDiag(r);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuPackedMatrix<Real>::SetUnit() {
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
this->SetZero();
|
||||
this->SetDiag(1.0);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Mat().SetUnit();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Print the matrix to stream
|
||||
*/
|
||||
template<typename Real>
|
||||
std::ostream &operator << (std::ostream &out, const CuPackedMatrix<Real> &mat) {
|
||||
PackedMatrix<Real> temp(mat.NumRows());
|
||||
mat.CopyToPacked(&temp);
|
||||
out << temp;
|
||||
return out;
|
||||
}
|
||||
|
||||
// instantiate the template
|
||||
template
|
||||
std::ostream &operator << (std::ostream &out, const CuPackedMatrix<float> &mat);
|
||||
template
|
||||
std::ostream &operator << (std::ostream &out, const CuPackedMatrix<double> &mat);
|
||||
|
||||
|
||||
// Instantiate class CuPackedMatrix for float and double.
|
||||
template class CuPackedMatrix<float>;
|
||||
template class CuPackedMatrix<double>;
|
||||
|
||||
|
||||
} // namespace kaldi
|
|
@ -0,0 +1,176 @@
|
|||
// cudamatrix/cu-packed-matrix.h
|
||||
|
||||
// Copyright 2009-2013 Johns Hopkins University (author: Daniel Povey)
|
||||
// Karel Vesely
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
|
||||
#ifndef KALDI_CUDAMATRIX_CU_PACKED_MATRIX_H_
|
||||
#define KALDI_CUDAMATRIX_CU_PACKED_MATRIX_H_
|
||||
|
||||
#include <sstream>
|
||||
|
||||
#include "cudamatrix/cu-common.h"
|
||||
#include "cudamatrix/cu-value.h"
|
||||
#include "matrix/matrix-common.h"
|
||||
#include "matrix/kaldi-matrix.h"
|
||||
#include "matrix/packed-matrix.h"
|
||||
#include "matrix/sp-matrix.h"
|
||||
#include "cudamatrix/cu-array.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
/**
|
||||
* Matrix for CUDA computing. This is a base class for packed
|
||||
* triangular and symmetric matrices.
|
||||
* Does the computation on the CUDA card when CUDA is compiled in and
|
||||
* we have a suitable GPU (CuDevice::Instantiate().Enabled() == true);
|
||||
* otherwise, does it on the CPU.
|
||||
*/
|
||||
|
||||
|
||||
/// @brief Packed CUDA matrix: base class for triangular and symmetric matrices on
|
||||
/// a GPU card.
|
||||
template<typename Real>
|
||||
class CuPackedMatrix {
|
||||
public:
|
||||
friend class CuMatrixBase<Real>;
|
||||
friend class CuVectorBase<Real>;
|
||||
friend class CuSubMatrix<Real>;
|
||||
friend class CuRand<Real>;
|
||||
|
||||
CuPackedMatrix() : data_(NULL), num_rows_(0) {}
|
||||
|
||||
/// Constructs a matrix with r rows by starting empty and calling
/// Resize(r, resize_type).
explicit CuPackedMatrix(MatrixIndexT r,
                        MatrixResizeType resize_type = kSetZero)
    : data_(NULL), num_rows_(0) {
  Resize(r, resize_type);
}
|
||||
|
||||
explicit CuPackedMatrix(const PackedMatrix<Real> &orig) : data_(NULL), num_rows_(0) {
|
||||
Resize(orig.num_rows_, kUndefined);
|
||||
CopyFromPacked(orig);
|
||||
}
|
||||
|
||||
explicit CuPackedMatrix(const CuPackedMatrix<Real> &orig) : data_(NULL), num_rows_(0) {
|
||||
Resize(orig.NumRows(), kUndefined);
|
||||
CopyFromPacked(orig);
|
||||
}
|
||||
|
||||
void SetZero(); /// < Set to zero
|
||||
void SetUnit(); /// < Set to unit matrix.
|
||||
void SetRandn(); /// < Set to random values of a normal distribution
|
||||
void SetDiag(Real alpha); /// < Set the diagonal value to alpha
|
||||
void AddToDiag(Real r); ///< Add this quantity to the diagonal of the matrix.
|
||||
|
||||
void Scale(Real alpha);
|
||||
void ScaleDiag(Real alpha);
|
||||
Real Trace() const;
|
||||
|
||||
~CuPackedMatrix() { Destroy(); }
|
||||
|
||||
/// Set packed matrix to a specified size (can be zero).
|
||||
/// The value of the new data depends on resize_type:
|
||||
/// -if kSetZero, the new data will be zero
|
||||
/// -if kUndefined, the new data will be undefined
|
||||
/// -if kCopyData, the new data will be the same as the old data in any
|
||||
/// shared positions, and zero elsewhere.
|
||||
/// This function takes time proportional to the number of data elements.
|
||||
void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero);
|
||||
|
||||
// Copy functions (do not resize).
|
||||
void CopyFromPacked(const CuPackedMatrix<Real> &src);
|
||||
void CopyFromPacked(const PackedMatrix<Real> &src);
|
||||
void CopyToPacked(PackedMatrix<Real> *dst) const;
|
||||
|
||||
void Read(std::istream &in, bool binary);
|
||||
|
||||
void Write(std::ostream &out, bool binary) const;
|
||||
|
||||
void Destroy();
|
||||
|
||||
/// Swaps the contents of *this and *other. Shallow swap.
|
||||
void Swap(CuPackedMatrix<Real> *other);
|
||||
|
||||
/// Swaps the contents of *this and *other.
|
||||
void Swap(PackedMatrix<Real> *other);
|
||||
Real* Data() { return data_; }
|
||||
const Real* Data() const { return data_; }
|
||||
|
||||
inline Real operator() (MatrixIndexT r, MatrixIndexT c) const {
|
||||
if (static_cast<UnsignedMatrixIndexT>(c) >
|
||||
static_cast<UnsignedMatrixIndexT>(r))
|
||||
std::swap(c, r);
|
||||
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
|
||||
static_cast<UnsignedMatrixIndexT>(this->num_rows_));
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Real value;
|
||||
CU_SAFE_CALL(cudaMemcpy(&value, this->data_ + (r * (r+1)) / 2 + c,
|
||||
sizeof(Real), cudaMemcpyDeviceToHost));
|
||||
return value;
|
||||
} else
|
||||
#endif
|
||||
return this->data_[(r * (r+1)) / 2 + c];
|
||||
}
|
||||
|
||||
inline MatrixIndexT NumRows() const { return num_rows_; }
|
||||
inline MatrixIndexT NumCols() const { return num_rows_; }
|
||||
|
||||
/// Returns size in bytes of the data held by the matrix.
|
||||
size_t SizeInBytes() const {
|
||||
size_t nr = static_cast<size_t>(num_rows_),
|
||||
num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real);
|
||||
return num_bytes;
|
||||
}
|
||||
|
||||
|
||||
protected:
|
||||
// The following two functions should only be called if we did not compile with CUDA
|
||||
// or could not get a CUDA card; in that case the contents are interpreted the
|
||||
// same as a regular matrix.
|
||||
inline const PackedMatrix<Real> &Mat() const {
|
||||
return *(reinterpret_cast<const PackedMatrix<Real>* >(this));
|
||||
}
|
||||
inline PackedMatrix<Real> &Mat() {
|
||||
return *(reinterpret_cast<PackedMatrix<Real>* >(this));
|
||||
}
|
||||
|
||||
|
||||
// Will only be called from this class or derived classes.
|
||||
|
||||
Real *data_;
|
||||
MatrixIndexT num_rows_;
|
||||
|
||||
void AddPacked(const Real alpha, const CuPackedMatrix<Real> &M);
|
||||
|
||||
private:
|
||||
// Disallow assignment.
|
||||
PackedMatrix<Real> & operator=(const PackedMatrix<Real> &other);
|
||||
}; // class CuPackedMatrix
|
||||
|
||||
|
||||
/// I/O
|
||||
template<typename Real>
|
||||
std::ostream &operator << (std::ostream &out, const CuPackedMatrix<Real> &mat);
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
#endif
|
|
@ -1,6 +1,7 @@
|
|||
// cudamatrix/cu-rand-inl.h
|
||||
// cudamatrix/cu-rand.cc
|
||||
|
||||
// Copyright 2012 Karel Vesely
|
||||
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
|
@ -18,14 +19,8 @@
|
|||
// limitations under the License.
|
||||
|
||||
|
||||
|
||||
#ifndef KALDI_CUDAMATRIX_CU_RAND_INL_H_
|
||||
#define KALDI_CUDAMATRIX_CU_RAND_INL_H_
|
||||
|
||||
#include "base/kaldi-math.h"
|
||||
|
||||
#include "cudamatrix/cu-common.h"
|
||||
#include "cudamatrix/cu-rand.h"
|
||||
#include "cudamatrix/cu-matrix-lib.h"
|
||||
#include "cudamatrix/cu-randkernels.h"
|
||||
|
||||
|
||||
|
@ -34,112 +29,120 @@ namespace kaldi {
|
|||
|
||||
template<typename Real>
|
||||
void CuRand<Real>::SeedGpu(MatrixIndexT state_size) {
|
||||
if(NULL != host_) delete[] host_;
|
||||
host_ = new uint32[state_size];
|
||||
host_size_ = state_size;
|
||||
|
||||
SeedBuffer(&z1_, state_size);
|
||||
SeedBuffer(&z2_, state_size);
|
||||
SeedBuffer(&z3_, state_size);
|
||||
SeedBuffer(&z4_, state_size);
|
||||
KALDI_ASSERT(state_size >= 0);
|
||||
state_size_ = state_size;
|
||||
|
||||
delete[] host_;
|
||||
host_ = NULL;
|
||||
host_size_ = 0;
|
||||
SeedBuffer(state_size, &z1_);
|
||||
SeedBuffer(state_size, &z2_);
|
||||
SeedBuffer(state_size, &z3_);
|
||||
SeedBuffer(state_size, &z4_);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void CuRand<Real>::SeedBuffer(uint32* *tgt, MatrixIndexT state_size) {
|
||||
// generate random state
|
||||
for(MatrixIndexT i=0; i<host_size_; i++) {
|
||||
host_[i] = RandInt(128, RAND_MAX);
|
||||
}
|
||||
#if HAVE_CUDA==1
|
||||
// push it to the GPU
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
int32 state_size_in_bytes = state_size*sizeof(uint32);
|
||||
// resize the GPU buffer
|
||||
if (state_size_ != state_size) {
|
||||
cudaFree(*tgt);
|
||||
cudaMalloc((void**)tgt, state_size_in_bytes);
|
||||
void CuRand<Real>::SeedBuffer(MatrixIndexT state_size, uint32 **tgt) {
|
||||
#if HAVE_CUDA == 1
|
||||
CuDevice &device = CuDevice::Instantiate();
|
||||
if (device.Enabled()) {
|
||||
if (*tgt != NULL) {
|
||||
device.Free(*tgt);
|
||||
*tgt = NULL;
|
||||
}
|
||||
// copy the values
|
||||
cudaMemcpy(*tgt, host_, state_size_in_bytes, cudaMemcpyHostToDevice);
|
||||
} else
|
||||
#endif
|
||||
{ // use back-off host buffer
|
||||
if (state_size_ != state_size) {
|
||||
delete[] (*tgt);
|
||||
*tgt = new uint32[state_size];
|
||||
}
|
||||
int32 state_size_in_bytes = state_size*sizeof(uint32);
|
||||
memcpy(*tgt, host_, state_size_in_bytes);
|
||||
if (state_size == 0) return; // Nothing to do.
|
||||
std::vector<uint32> temp_rand_data(state_size);
|
||||
for(MatrixIndexT i = 0; i < state_size; i++)
|
||||
temp_rand_data[i] = RandInt(128, RAND_MAX);
|
||||
int32 state_size_in_bytes = state_size * sizeof(uint32);
|
||||
*tgt = static_cast<uint32*>(device.Malloc(state_size_in_bytes));
|
||||
CU_SAFE_CALL(cudaMemcpy(*tgt, &(temp_rand_data[0]),
|
||||
state_size_in_bytes, cudaMemcpyHostToDevice));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
CuRand<Real>::~CuRand() {
|
||||
SeedBuffer(0, &z1_);
|
||||
SeedBuffer(0, &z2_);
|
||||
SeedBuffer(0, &z3_);
|
||||
SeedBuffer(0, &z4_);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real> void CuRand<Real>::RandUniform(CuMatrix<Real> *tgt) {
|
||||
#if HAVE_CUDA==1
|
||||
template<typename Real> void CuRand<Real>::RandUniform(CuMatrixBase<Real> *tgt) {
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
int32 tgt_size = tgt->NumRows()*tgt->Stride();
|
||||
int32 tgt_size = tgt->NumRows() * tgt->Stride();
|
||||
if (tgt_size != state_size_) SeedGpu(tgt_size);
|
||||
|
||||
dim3 dimBlock(CUBLOCK, CUBLOCK);
|
||||
dim3 dimGrid(n_blocks(tgt->num_cols_, CUBLOCK), n_blocks(tgt->num_rows_, CUBLOCK));
|
||||
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
|
||||
dim3 dimGrid(n_blocks(tgt->num_cols_, CU2DBLOCK), n_blocks(tgt->num_rows_, CU2DBLOCK));
|
||||
|
||||
cuda_rand(dimGrid, dimBlock, tgt->data_, z1_, z2_, z3_, z4_, tgt->Dim());
|
||||
cuSafeCall(cudaGetLastError());
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
#endif
|
||||
{
|
||||
for(int32 r=0; r<tgt->NumRows(); r++) {
|
||||
for(int32 c=0; c<tgt->num_cols_; c++) {
|
||||
tgt->Mat()(r, c) = kaldi::RandUniform();
|
||||
}
|
||||
}
|
||||
tgt->SetRandUniform();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real> void CuRand<Real>::RandGaussian(CuMatrix<Real> *tgt) {
|
||||
#if HAVE_CUDA==1
|
||||
template<typename Real> void CuRand<Real>::RandGaussian(CuMatrixBase<Real> *tgt) {
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
int32 tgt_size = tgt->NumRows()*tgt->Stride();
|
||||
if (tgt_size != state_size_) SeedGpu(tgt_size);
|
||||
|
||||
dim3 dimBlock(CUBLOCK, CUBLOCK);
|
||||
dim3 dimGrid(n_blocks(tgt->num_cols_, CUBLOCK), n_blocks(tgt->NumRows(), CUBLOCK));
|
||||
|
||||
int32 tgt_size = tgt->NumRows() * tgt->Stride();
|
||||
if (tgt_size == 0)
|
||||
return;
|
||||
if (tgt_size > state_size_) SeedGpu(tgt_size);
|
||||
|
||||
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
|
||||
dim3 dimGrid(n_blocks(tgt->num_cols_, CU2DBLOCK), n_blocks(tgt->num_rows_, CU2DBLOCK));
|
||||
|
||||
cuda_gauss_rand(dimGrid, dimBlock, tgt->data_, z1_, z2_, z3_, z4_, tgt->Dim());
|
||||
cuSafeCall(cudaGetLastError());
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
#endif
|
||||
{
|
||||
for(int32 r=0; r<tgt->NumRows(); r++) {
|
||||
for(int32 c=0; c<tgt->num_cols_; c++) {
|
||||
tgt->Mat()(r, c) = RandGauss();
|
||||
}
|
||||
}
|
||||
tgt->SetRandn();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Fills the vector *tgt with normally distributed random numbers.
/// With CUDA compiled in and the device enabled, the values are generated by
/// the cuda_vec_gauss_rand kernel from the z1_..z4_ state buffers; otherwise
/// the call is forwarded to SetRandn() on the underlying CPU vector.
template<typename Real> void CuRand<Real>::RandGaussian(CuVectorBase<Real> *tgt) {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;

    int32 tgt_size = tgt->Dim();
    // Nothing to generate for an empty vector.  Returning here (as the
    // CuMatrixBase overload does) avoids re-seeding with a zero size and
    // launching a kernel with an empty grid.
    if (tgt_size == 0)
      return;
    // The state buffers hold one generator state per element.
    if (tgt_size != state_size_) SeedGpu(tgt_size);

    int dimBlock(CU1DBLOCK);
    int dimGrid(n_blocks(tgt->Dim(), CU1DBLOCK));

    cuda_vec_gauss_rand(dimGrid, dimBlock, tgt->Data(), z1_, z2_, z3_, z4_, tgt->Dim());

    CU_SAFE_CALL(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());

  } else
#endif
  {
    tgt->Vec().SetRandn();
  }
}
|
||||
|
||||
|
||||
template<typename Real> void CuRand<Real>::BinarizeProbs(const CuMatrix<Real> &probs, CuMatrix<Real> *states) {
|
||||
#if HAVE_CUDA==1
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
|
@ -156,15 +159,15 @@ template<typename Real> void CuRand<Real>::BinarizeProbs(const CuMatrix<Real> &p
|
|||
RandUniform(&tmp_);
|
||||
|
||||
// use the uniform random numbers to compute discrete 0/1 states
|
||||
dim3 dimBlock(CUBLOCK, CUBLOCK);
|
||||
dim3 dimGrid(n_blocks(states->num_cols_, CUBLOCK), n_blocks(states->num_rows_, CUBLOCK));
|
||||
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
|
||||
dim3 dimGrid(n_blocks(states->num_cols_, CU2DBLOCK), n_blocks(states->num_rows_, CU2DBLOCK));
|
||||
|
||||
cuda_binarize_probs(dimGrid, dimBlock, states->data_, probs.data_, tmp_.data_, states->Dim());
|
||||
cuSafeCall(cudaGetLastError());
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
#endif
|
||||
{
|
||||
for(int32 r=0; r<states->num_rows_; r++) {
|
||||
for(int32 c=0; c<states->num_cols_; c++) {
|
||||
|
@ -182,10 +185,12 @@ template<typename Real> void CuRand<Real>::AddGaussNoise(CuMatrix<Real> *tgt, Re
|
|||
tgt->AddMat(gscale, tmp_, 1.0);
|
||||
}
|
||||
|
||||
|
||||
// Instantiate the class for float and double.
|
||||
template class CuRand<float>;
|
||||
template class CuRand<double>;
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
|
@ -24,7 +24,7 @@
|
|||
|
||||
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
|
||||
#include "base/kaldi-math.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
@ -33,25 +33,18 @@ template<typename Real>
|
|||
class CuRand {
|
||||
public:
|
||||
|
||||
CuRand()
|
||||
: z1_(NULL), z2_(NULL), z3_(NULL), z4_(NULL), state_size_(0),
|
||||
host_(NULL), host_size_(0)
|
||||
{ }
|
||||
|
||||
~CuRand() {
|
||||
#if HAVE_CUDA == 1
|
||||
cudaFree(z1_); cudaFree(z2_); cudaFree(z3_); cudaFree(z4_);
|
||||
#endif
|
||||
delete[] host_;
|
||||
}
|
||||
CuRand(): z1_(NULL), z2_(NULL), z3_(NULL), z4_(NULL), state_size_(0) { }
|
||||
|
||||
~CuRand();
|
||||
|
||||
/// on demand seeding of all the buffers
|
||||
void SeedGpu(MatrixIndexT state_size);
|
||||
|
||||
/// fill with uniform random numbers (0.0-1.0)
|
||||
void RandUniform(CuMatrix<Real> *tgt);
|
||||
/// fill with numbers drawn from uniform distribution on [0, 1]
|
||||
void RandUniform(CuMatrixBase<Real> *tgt);
|
||||
/// fill with normal random numbers
|
||||
void RandGaussian(CuMatrix<Real> *tgt);
|
||||
void RandGaussian(CuMatrixBase<Real> *tgt);
|
||||
void RandGaussian(CuVectorBase<Real> *tgt);
|
||||
|
||||
/// align probabilities to discrete 0/1 states (use uniform sampling)
|
||||
void BinarizeProbs(const CuMatrix<Real> &probs, CuMatrix<Real> *states);
|
||||
|
@ -59,8 +52,9 @@ class CuRand {
|
|||
void AddGaussNoise(CuMatrix<Real> *tgt, Real gscale = 1.0);
|
||||
|
||||
private:
|
||||
/// seed one buffer
|
||||
void SeedBuffer(uint32* *tgt, MatrixIndexT state_size);
|
||||
/// seed one buffer on the GPU. If state_size == 0, just frees any
|
||||
/// existing buffers.
|
||||
void SeedBuffer(MatrixIndexT state_size, uint32 **tgt);
|
||||
|
||||
private:
|
||||
|
||||
|
@ -75,19 +69,13 @@ class CuRand {
|
|||
/// Inner state of the ``grid-like'' random number generator
|
||||
uint32 *z1_, *z2_, *z3_, *z4_;
|
||||
int32 state_size_; ///< size of the buffers
|
||||
|
||||
uint32 *host_; ///< host bufer, used for initializing
|
||||
int32 host_size_; ///< size of the host buffer
|
||||
|
||||
|
||||
CuMatrix<Real> tmp_; ///< auxiliary matrix
|
||||
};
|
||||
|
||||
|
||||
|
||||
} // namespace
|
||||
|
||||
#include "cudamatrix/cu-rand-inl.h"
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
|
|
@ -25,7 +25,7 @@
|
|||
#include "cudamatrix/cu-matrixdim.h"
|
||||
#include "cudamatrix/cu-kernels-ansi.h"
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
#if HAVE_CUDA == 1
|
||||
|
||||
extern "C" {
|
||||
|
||||
|
@ -34,6 +34,7 @@ extern "C" {
|
|||
*/
|
||||
void cudaF_rand(dim3 Gr, dim3 Bl, float *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d);
|
||||
void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d);
|
||||
void cudaF_vec_gauss_rand(int Gr, int Bl, float *v, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, int dim);
|
||||
void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float *states, const float *probs, float *rand, MatrixDim d);
|
||||
|
||||
/*********************************************************
|
||||
|
@ -41,6 +42,7 @@ void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float *states, const float *probs, f
|
|||
*/
|
||||
void cudaD_rand(dim3 Gr, dim3 Bl, double *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d);
|
||||
void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d);
|
||||
void cudaD_vec_gauss_rand(int Gr, int Bl, double *v, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, int dim);
|
||||
void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double *states, const double *probs, double *rand, MatrixDim d);
|
||||
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
// cudamatrix/cu-randkernels.cu
|
||||
|
||||
// Copyright 2012 Karel Vesely
|
||||
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
|
@ -108,6 +109,20 @@ static void _gauss_rand(Real* mat, uint32_cuda* z1, uint32_cuda* z2, uint32_cuda
|
|||
|
||||
|
||||
|
||||
/// Writes one BoxMuller sample into each of the first `dim` elements of v,
/// using the i'th entry of each of the four state arrays for element i.
template<typename Real>
__global__
static void _vec_gauss_rand(Real* v, uint32_cuda* z1, uint32_cuda* z2, uint32_cuda* z3, uint32_cuda* z4, int dim) {
  // Blocks beyond the first grid row do no work.
  if (blockIdx.y > 0)
    return;

  const int32_cuda idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Threads past the end of the vector have nothing to write.
  if (idx >= dim)
    return;

  v[idx] = BoxMuller<Real>(z1[idx], z2[idx], z3[idx], z4[idx]);
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
__global__
|
||||
static void _binarize_probs(Real* states, const Real* probs, const Real* rand, MatrixDim d) {
|
||||
|
@ -136,6 +151,10 @@ void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, uint32_cuda* z1, uint32_cuda
|
|||
_gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d);
|
||||
}
|
||||
|
||||
// Launches the float instantiation of _vec_gauss_rand with Gr blocks of Bl
// threads, forwarding all remaining arguments unchanged.
void cudaF_vec_gauss_rand(int Gr, int Bl, float* v, uint32_cuda* z1, uint32_cuda* z2, uint32_cuda* z3, uint32_cuda* z4, int dim) {
|
||||
_vec_gauss_rand<<<Gr,Bl>>>(v,z1,z2,z3,z4,dim);
|
||||
}
|
||||
|
||||
void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d) {
|
||||
_binarize_probs<<<Gr,Bl>>>(states,probs,rand,d);
|
||||
}
|
||||
|
@ -153,6 +172,10 @@ void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, uint32_cuda* z1, uint32_cud
|
|||
_gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d);
|
||||
}
|
||||
|
||||
// Launches the double instantiation of _vec_gauss_rand with Gr blocks of Bl
// threads, forwarding all remaining arguments unchanged.
void cudaD_vec_gauss_rand(int Gr, int Bl, double* v, uint32_cuda* z1, uint32_cuda* z2, uint32_cuda* z3, uint32_cuda* z4, int dim) {
|
||||
_vec_gauss_rand<<<Gr,Bl>>>(v,z1,z2,z3,z4,dim);
|
||||
}
|
||||
|
||||
void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d) {
|
||||
_binarize_probs<<<Gr,Bl>>>(states,probs,rand,d);
|
||||
}
|
||||
|
|
|
@ -22,7 +22,7 @@
|
|||
#ifndef KALDI_CUDAMATRIX_CU_RANDKERNELS_H_
|
||||
#define KALDI_CUDAMATRIX_CU_RANDKERNELS_H_
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
#if HAVE_CUDA == 1
|
||||
|
||||
#include "base/kaldi-error.h"
|
||||
#include "cudamatrix/cu-randkernels-ansi.h"
|
||||
|
@ -38,6 +38,7 @@ namespace kaldi {
|
|||
*/
|
||||
template<typename Real> inline void cuda_rand(dim3 Gr, dim3 Bl, Real *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_gauss_rand(dim3 Gr, dim3 Bl, Real *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_vec_gauss_rand(int Gr, int Bl, Real *v, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, int dim) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
template<typename Real> inline void cuda_binarize_probs(dim3 Gr, dim3 Bl, Real *states, const Real *probs, Real *rand, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
|
||||
|
||||
/*********************************************************
|
||||
|
@ -45,6 +46,7 @@ template<typename Real> inline void cuda_binarize_probs(dim3 Gr, dim3 Bl, Real *
|
|||
*/
|
||||
template<> inline void cuda_rand<float>(dim3 Gr, dim3 Bl, float *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d) { cudaF_rand(Gr,Bl,mat,z1,z2,z3,z4,d); }
|
||||
template<> inline void cuda_gauss_rand<float>(dim3 Gr, dim3 Bl, float *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d) { cudaF_gauss_rand(Gr,Bl,mat,z1,z2,z3,z4,d); }
|
||||
template<> inline void cuda_vec_gauss_rand<float>(int Gr, int Bl, float *v, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, int dim) { cudaF_vec_gauss_rand(Gr,Bl,v,z1,z2,z3,z4,dim); }
|
||||
template<> inline void cuda_binarize_probs<float>(dim3 Gr, dim3 Bl, float *states, const float *probs, float *rand, MatrixDim d) { cudaF_binarize_probs(Gr,Bl,states,probs,rand,d); }
|
||||
|
||||
/*********************************************************
|
||||
|
@ -52,6 +54,7 @@ template<> inline void cuda_binarize_probs<float>(dim3 Gr, dim3 Bl, float *state
|
|||
*/
|
||||
template<> inline void cuda_rand<double>(dim3 Gr, dim3 Bl, double *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d) { cudaD_rand(Gr,Bl,mat,z1,z2,z3,z4,d); }
|
||||
template<> inline void cuda_gauss_rand<double>(dim3 Gr, dim3 Bl, double *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d) { cudaD_gauss_rand(Gr,Bl,mat,z1,z2,z3,z4,d); }
|
||||
template<> inline void cuda_vec_gauss_rand<double>(int Gr, int Bl, double *v, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, int dim) { cudaD_vec_gauss_rand(Gr,Bl,v,z1,z2,z3,z4,dim); }
|
||||
template<> inline void cuda_binarize_probs<double>(dim3 Gr, dim3 Bl, double *states, const double *probs, double *rand, MatrixDim d) { cudaD_binarize_probs(Gr,Bl,states,probs,rand,d); }
|
||||
|
||||
} // namespace
|
||||
|
|
|
@ -0,0 +1,187 @@
|
|||
// cudamatrix/cu-matrix-speed-test.cc
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
#include "cudamatrix/cu-vector.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
#include "cudamatrix/cu-sp-matrix.h"
|
||||
|
||||
using namespace kaldi;
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
/// Returns a printable tag for the template type, chosen purely by size:
/// "<double>" for 8-byte types and "<float>" for everything else.
template<typename Real>
std::string NameOf() {
  if (sizeof(Real) == 8)
    return "<double>";
  return "<float>";
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuSpMatrixInvert(int32 dim) {
|
||||
BaseFloat time_in_secs = 0.5;
|
||||
int32 iter = 0;
|
||||
Timer tim;
|
||||
CuSpMatrix<Real> A(dim);
|
||||
A.SetRandn();
|
||||
for (;tim.Elapsed() < time_in_secs; iter++) {
|
||||
KALDI_ASSERT(A.Trace() != 0.0); // true with probability 1...
|
||||
CuSpMatrix<Real> B(A);
|
||||
|
||||
if (iter > 0) {
|
||||
B.Invert();
|
||||
} else { // do some more testing...
|
||||
|
||||
CuMatrix<Real> D(A);
|
||||
A.AddMat2(1.0, D, kTrans, 1.0);
|
||||
A.AddToDiag(0.1 * dim);
|
||||
|
||||
CuMatrix<Real> C(B);
|
||||
B.AddMat2(1.0, C, kTrans, 1.0);
|
||||
B.AddToDiag(0.1 * dim);
|
||||
|
||||
A.Invert();
|
||||
B.Invert();
|
||||
|
||||
SpMatrix<Real> E(dim);
|
||||
B.CopyToSp(&E);
|
||||
|
||||
SpMatrix<Real> A2(A);
|
||||
AssertEqual(A2, E);
|
||||
}
|
||||
}
|
||||
BaseFloat fdim = dim;
|
||||
BaseFloat gflops = (fdim * fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
|
||||
KALDI_LOG << "For CuSpMatrix::Invert" << NameOf<Real>() << ", for dim = "
|
||||
<< dim << ", speed was " << gflops << " gigaflops.";
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuSpMatrixCopyFromMat(int32 dim, SpCopyType copy_type) {
|
||||
BaseFloat time_in_secs = 0.1;
|
||||
int32 iter = 0;
|
||||
Timer tim;
|
||||
CuMatrix<Real> A(dim, dim);
|
||||
CuSpMatrix<Real> S(dim);
|
||||
|
||||
for (;tim.Elapsed() < time_in_secs; iter++) {
|
||||
S.CopyFromMat(A, copy_type);
|
||||
}
|
||||
BaseFloat fdim = dim;
|
||||
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
|
||||
KALDI_LOG << "For CuSpMatrix::CopyFromMat" << NameOf<Real>()
|
||||
<< ", with copy-type "
|
||||
<<(copy_type == kTakeLower ? "kTakeLower" :
|
||||
(copy_type == kTakeUpper ? "kTakeUpper" :
|
||||
"kTakeMeanAndCheck")) << " and dim = "
|
||||
<< dim << ", speed was " << gflops << " gigaflops.";
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuMatrixApproxInvert(int32 dim) {
|
||||
BaseFloat time_in_secs = 0.5;
|
||||
int32 iter = 0;
|
||||
|
||||
// Get random orthogonal matrix.
|
||||
Matrix<Real> Q_cpu(dim, dim);
|
||||
|
||||
Q_cpu.SetRandn();
|
||||
for (int32 r = 0; r < dim; r++) {
|
||||
for (int32 s = 0; s < r; s++)
|
||||
Q_cpu.Row(r).AddVec(-1.0 * VecVec(Q_cpu.Row(r), Q_cpu.Row(s)), Q_cpu.Row(s));
|
||||
Q_cpu.Row(r).Scale(1.0 / Q_cpu.Row(r).Norm(2.0));
|
||||
}
|
||||
CuMatrix<Real> Q(Q_cpu);
|
||||
|
||||
CuVector<Real> s(dim);
|
||||
Real eig_range = 50.0; // factor of 50 on eigenvalues.. this affects the speed.
|
||||
Real first_eig = 0.001 + RandUniform() * 5.0;
|
||||
for (int32 r = 0; r < dim; r++)
|
||||
s(r) = first_eig * exp(r * log(eig_range) / dim);
|
||||
|
||||
s.ApplyPow(0.5);
|
||||
Q.MulColsVec(s);
|
||||
CuSpMatrix<Real> A(dim);
|
||||
A.AddMat2(1.0, Q, kNoTrans, 0.0);
|
||||
|
||||
|
||||
Timer tim;
|
||||
|
||||
for (;tim.Elapsed() < time_in_secs; iter++) {
|
||||
CuSpMatrix<Real> Atmp(A);
|
||||
Atmp.InvertPosDefApprox(0.1);
|
||||
}
|
||||
BaseFloat fdim = dim;
|
||||
BaseFloat gflops = (fdim * fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
|
||||
KALDI_LOG << "For CuSpMatrix::InvertPosDefApprox" << NameOf<Real>() << ", for dim = "
|
||||
<< dim << ", speed was " << gflops << " gigaflops.";
|
||||
}
|
||||
|
||||
template<typename Real> void CuSpMatrixSpeedTest() {
|
||||
std::vector<int32> sizes;
|
||||
sizes.push_back(16);
|
||||
sizes.push_back(128);
|
||||
sizes.push_back(256);
|
||||
sizes.push_back(1024);
|
||||
int32 ns = sizes.size();
|
||||
|
||||
for (int32 s = 0; s < ns; s++) {
|
||||
UnitTestCuSpMatrixInvert<Real>(sizes[s]);
|
||||
UnitTestCuMatrixApproxInvert<Real>(sizes[s]);
|
||||
UnitTestCuSpMatrixCopyFromMat<Real>(sizes[s], kTakeLower);
|
||||
UnitTestCuSpMatrixCopyFromMat<Real>(sizes[s], kTakeUpper);
|
||||
UnitTestCuSpMatrixCopyFromMat<Real>(sizes[s], kTakeMean);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
int main() {
#if HAVE_CUDA == 1
  // Select the GPU, passing "yes" to SelectGpuId.
  CuDevice::Instantiate().SelectGpuId("yes");
#endif

  kaldi::CuSpMatrixSpeedTest<float>();

#if HAVE_CUDA == 1
  // The double-precision tests are run only if the device reports support
  // for it; the profile is printed in either case.
  if (CuDevice::Instantiate().DoublePrecisionSupported()) {
    kaldi::CuSpMatrixSpeedTest<double>();
  } else {
    KALDI_WARN << "Double precision not supported";
  }
  CuDevice::Instantiate().PrintProfile();
#else
  kaldi::CuSpMatrixSpeedTest<double>();
#endif

  std::cout << "Tests succeeded.\n";
  return 0;
}
|
|
@ -0,0 +1,437 @@
|
|||
// cudamatrix/cu-sp-matrix-test.cc
|
||||
//
|
||||
// Copyright 2013 Ehsan Variani
|
||||
// Lucas Ondel
|
||||
// Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
//
|
||||
// UnitTests for testing cu-sp-matrix.h methods.
|
||||
//
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
#include "cudamatrix/cu-sp-matrix.h"
|
||||
#include "cudamatrix/cu-vector.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
|
||||
using namespace kaldi;
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
/*
|
||||
* Unit Tests
|
||||
*/
|
||||
template<typename Real>
|
||||
static void UnitTestCuSpMatrixConstructor() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 10 * i;
|
||||
|
||||
Matrix<Real> A(dim, dim);
|
||||
A.SetRandn();
|
||||
SpMatrix<Real> B(A, kTakeLower);
|
||||
|
||||
CuMatrix<Real> C(A);
|
||||
CuSpMatrix<Real> D(C, kTakeLower);
|
||||
|
||||
SpMatrix<Real> E(dim);
|
||||
D.CopyToSp(&E);
|
||||
|
||||
SpMatrix<Real> F(D);
|
||||
|
||||
AssertEqual(F, B);
|
||||
//added by hxu, to test copy from SpMatrix to CuSpMatrix
|
||||
|
||||
AssertEqual(B, E);
|
||||
|
||||
KALDI_ASSERT(!B.IsUnit());
|
||||
B.SetZero();
|
||||
B.SetDiag(1.0);
|
||||
KALDI_ASSERT(B.IsUnit());
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuSpMatrixApproxEqual() {
|
||||
|
||||
for (int32 i = 0; i < 10; i++) {
|
||||
int32 dim = 1 + rand() % 10;
|
||||
SpMatrix<Real> A(dim), B(dim);
|
||||
A.SetRandn();
|
||||
B.SetRandn();
|
||||
BaseFloat threshold = 0.01;
|
||||
for (int32 j = 0; j < 20; j++, threshold *= 1.3) {
|
||||
bool b1 = A.ApproxEqual(B, threshold);
|
||||
SpMatrix<Real> diff(A);
|
||||
diff.AddSp(-1.0, B);
|
||||
bool b2 = (diff.FrobeniusNorm() < threshold * std::max(A.FrobeniusNorm(),
|
||||
B.FrobeniusNorm()));
|
||||
KALDI_ASSERT(b1 == b2);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuSpMatrixOperator() {
|
||||
SpMatrix<Real> A(100);
|
||||
A.SetRandn();
|
||||
|
||||
CuSpMatrix<Real> B(100);
|
||||
B.CopyFromSp(A);
|
||||
|
||||
for (MatrixIndexT i = 0; i < A.NumRows(); i++) {
|
||||
for (MatrixIndexT j = 0; j <= i; j++)
|
||||
KALDI_ASSERT(std::abs(A(i, j) - B(i, j)) < 0.0001);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuSpMatrixAddToDiag() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 10*i;
|
||||
SpMatrix<Real> A(dim);
|
||||
A.SetRandn();
|
||||
CuSpMatrix<Real> B(A);
|
||||
|
||||
Matrix<Real> D(A);
|
||||
A.AddToDiag(i);
|
||||
|
||||
CuMatrix<Real> C(B);
|
||||
B.AddToDiag(i);
|
||||
|
||||
SpMatrix<Real> E(dim);
|
||||
B.CopyToSp(&E);
|
||||
|
||||
AssertEqual(A, E);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuSpMatrixCopyFromMat() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
SpCopyType copy_type = (i % 3 == 0 ? kTakeMean :
|
||||
(i % 3 == 1 ? kTakeLower : kTakeUpper));
|
||||
MatrixIndexT dim = 10 * i + rand() % 5;
|
||||
CuMatrix<Real> A(dim, dim);
|
||||
A.SetRandn();
|
||||
Matrix<Real> A2(A);
|
||||
|
||||
CuSpMatrix<Real> B(A, copy_type);
|
||||
SpMatrix<Real> B2(A2, copy_type);
|
||||
SpMatrix<Real> B3(B);
|
||||
if (!ApproxEqual(B2, B3) ) {
|
||||
KALDI_ERR << "Matrices differ, A = " << A << ", B2 = " << B2 << ", B3(CUDA) = " << B3;
|
||||
}
|
||||
KALDI_ASSERT(B3.Trace() != 0);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuSpMatrixApproxInvert(int32 dim) {
|
||||
// Get random orthogonal matrix.
|
||||
CuMatrix<Real> Q(dim, dim);
|
||||
|
||||
Q.SetRandn();
|
||||
for (int32 r = 0; r < dim; r++) {
|
||||
for (int32 s = 0; s < r; s++)
|
||||
Q.Row(r).AddVec(-1.0 * VecVec(Q.Row(r), Q.Row(s)), Q.Row(s));
|
||||
Q.Row(r).Scale(1.0 / Q.Row(r).Norm(2.0));
|
||||
}
|
||||
|
||||
CuVector<Real> s(dim); // factor of 10 on eigenvalues, evenly spaced in log.
|
||||
Real eig_range = 50.0;
|
||||
Real first_eig = 0.001 + RandUniform() * 5.0;
|
||||
for (int32 r = 0; r < dim; r++)
|
||||
s(r) = first_eig * exp(r * log(eig_range) / dim);
|
||||
|
||||
s.ApplyPow(0.5);
|
||||
Q.MulColsVec(s);
|
||||
CuSpMatrix<Real> A(dim);
|
||||
A.AddMat2(1.0, Q, kNoTrans, 0.0);
|
||||
CuMatrix<Real> A_orig(A);
|
||||
|
||||
BaseFloat max_error = 0.1;
|
||||
A.InvertPosDefApprox(max_error);
|
||||
|
||||
|
||||
CuMatrix<Real> prod(dim, dim);
|
||||
prod.AddSpMat(1.0, A, A_orig, kNoTrans, 0.0);
|
||||
KALDI_ASSERT(prod.IsUnit(max_error));
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuSpMatrixInvert() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 10*i + rand() % 5;
|
||||
CuSpMatrix<Real> A(dim);
|
||||
A.SetRandn();
|
||||
KALDI_ASSERT(A.Trace() != 0.0); // true with probability 1...
|
||||
SpMatrix<Real> B(A);
|
||||
|
||||
CuMatrix<Real> D(A);
|
||||
A.AddMat2(1.0, D, kTrans, 1.0);
|
||||
A.AddToDiag(i);
|
||||
|
||||
Matrix<Real> C(B);
|
||||
B.AddMat2(1.0, C, kTrans, 1.0);
|
||||
B.AddToDiag(i);
|
||||
|
||||
CuSpMatrix<Real> Acopy(A);
|
||||
A.Invert();
|
||||
B.Invert();
|
||||
|
||||
SpMatrix<Real> A2(A);
|
||||
AssertEqual(A2, B);
|
||||
|
||||
CuMatrix<Real> I(dim, dim);
|
||||
I.AddMatMat(1.0, CuMatrix<Real>(Acopy), kNoTrans, CuMatrix<Real>(A), kNoTrans, 0.0);
|
||||
KALDI_ASSERT(I.IsUnit(0.01));
|
||||
}
|
||||
}
|
||||
|
||||
// TODO (variani) : fails for dim = 0
|
||||
template<typename Real>
|
||||
static void UnitTestCuSpMatrixAddVec2() {
|
||||
for (int32 i = 0; i < 50; i++) {
|
||||
MatrixIndexT dim = 1 + rand() % 200;
|
||||
SpMatrix<Real> A(dim);
|
||||
A.SetRandn();
|
||||
CuSpMatrix<Real> B(A);
|
||||
|
||||
Vector<Real> C(dim);
|
||||
C.SetRandn();
|
||||
CuVector<Real> D(C);
|
||||
Real alpha = RandGauss();
|
||||
|
||||
A.AddVec2(alpha, C);
|
||||
B.AddVec2(alpha, D);
|
||||
|
||||
SpMatrix<Real> E(dim);
|
||||
B.CopyToSp(&E);
|
||||
|
||||
AssertEqual(A, E);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuSpMatrixAddMat2() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim_row = 15 * i + rand() % 10;
|
||||
MatrixIndexT dim_col = 7 *i + rand() % 10;
|
||||
Matrix<Real> A(dim_row, dim_col);
|
||||
A.SetRandn();
|
||||
CuMatrix<Real> B(A);
|
||||
|
||||
SpMatrix<Real> C(dim_col);
|
||||
C.SetRandn();
|
||||
CuSpMatrix<Real> D(C);
|
||||
|
||||
const Real alpha = 2.0;
|
||||
const Real beta = 3.0;
|
||||
|
||||
C.AddMat2(alpha, A, kTrans, beta);
|
||||
D.AddMat2(alpha, B, kTrans, beta);
|
||||
|
||||
SpMatrix<Real> E(dim_col);
|
||||
D.CopyToSp(&E);
|
||||
|
||||
AssertEqual(C, E);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuSpMatrixAddSp() {
|
||||
for (MatrixIndexT i = 1; i < 50; i++) {
|
||||
MatrixIndexT dim = 7 * i + rand() % 10;
|
||||
|
||||
SpMatrix<Real> A(dim);
|
||||
A.SetRandn();
|
||||
CuSpMatrix<Real> B(A);
|
||||
|
||||
SpMatrix<Real> C(dim);
|
||||
C.SetRandn();
|
||||
const CuSpMatrix<Real> D(C);
|
||||
|
||||
const Real alpha = 2.0;
|
||||
|
||||
A.AddSp(alpha, C);
|
||||
B.AddSp(alpha, D);
|
||||
|
||||
SpMatrix<Real> E(dim);
|
||||
B.CopyToSp(&E);
|
||||
|
||||
AssertEqual(A, E);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real, typename OtherReal>
|
||||
static void UnitTestCuSpMatrixTraceSpSp() {
|
||||
for (MatrixIndexT i = 1; i < 2; i++) {
|
||||
MatrixIndexT dim = 100 + rand() % 255;
|
||||
|
||||
SpMatrix<Real> A(dim);
|
||||
A.SetRandn();
|
||||
const CuSpMatrix<Real> B(A);
|
||||
SpMatrix<OtherReal> C(dim);
|
||||
C.SetRandn();
|
||||
const CuSpMatrix<OtherReal> D(C);
|
||||
|
||||
Real t1 = TraceSpSp(A, C), t2 = TraceSpSp(B, D);
|
||||
KALDI_ASSERT(ApproxEqual(t1, t2));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void UnitTestCuSpMatrixSetUnit() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 100 * i + rand() % 255;
|
||||
if (i % 5 == 0) dim = 0;
|
||||
CuSpMatrix<Real> S1(dim), S2(dim), S4(dim);
|
||||
S1.SetRandn();
|
||||
S2.SetRandn();
|
||||
S4.SetRandn();
|
||||
SpMatrix<Real> S3(dim);
|
||||
S3.SetUnit();
|
||||
S1.SetUnit();
|
||||
S2.SetZero();
|
||||
S2.SetDiag(1.0);
|
||||
S4.SetZero();
|
||||
S4.AddToDiag(0.4);
|
||||
S4.AddToDiag(0.6);
|
||||
CuSpMatrix<Real> cu_S3(S3);
|
||||
KALDI_LOG << "S1 norm is " << S1.FrobeniusNorm();
|
||||
KALDI_LOG << "S2 norm is " << S2.FrobeniusNorm();
|
||||
KALDI_LOG << "S3 norm is " << S3.FrobeniusNorm();
|
||||
AssertEqual(S1, cu_S3);
|
||||
AssertEqual(S2, cu_S3);
|
||||
AssertEqual(S4, cu_S3);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuSpMatrixIO() {
|
||||
for (int32 i = 0; i < 10; i++) {
|
||||
int32 dimM = rand() % 255;
|
||||
if (i % 5 == 0) { dimM = 0; }
|
||||
CuSpMatrix<Real> mat(dimM);
|
||||
mat.SetRandn();
|
||||
std::ostringstream os;
|
||||
bool binary = (i % 4 < 2);
|
||||
mat.Write(os, binary);
|
||||
|
||||
CuSpMatrix<Real> mat2;
|
||||
std::istringstream is(os.str());
|
||||
mat2.Read(is, binary);
|
||||
AssertEqual(mat, mat2);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Mixed-precision variant of the AddSp test: adds C to A on the CPU and D to
// B on the GPU, then compares A with B.
// NOTE(review): nothing in this file calls this template, so it is never
// instantiated.  As written it would not compile if it were: B is declared
// const but the AddSp() declared in cu-sp-matrix.h is non-const, and that
// AddSp() only accepts a CuSpMatrix<Real> argument, not CuSpMatrix<OtherReal>.
// NOTE(review): AssertEqual(A, B) mixes SpMatrix and CuSpMatrix; confirm a
// matching overload exists before enabling this test.
template<typename Real, typename OtherReal>
|
||||
static void UnitTestCuSpMatrixAddSp() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 100 * i + rand() % 255;
|
||||
|
||||
SpMatrix<Real> A(dim);
|
||||
A.SetRandn();
|
||||
const CuSpMatrix<Real> B(A);
|
||||
SpMatrix<OtherReal> C(dim);
|
||||
C.SetRandn();
|
||||
const CuSpMatrix<OtherReal> D(C);
|
||||
|
||||
A.AddSp(1.0, C);
|
||||
|
||||
B.AddSp(1.0, D);
|
||||
|
||||
|
||||
AssertEqual(A, B);
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> void CudaSpMatrixUnitTest() {
|
||||
UnitTestCuSpMatrixIO<Real>();
|
||||
UnitTestCuSpMatrixConstructor<Real>();
|
||||
UnitTestCuSpMatrixOperator<Real>();
|
||||
UnitTestCuSpMatrixApproxEqual<Real>();
|
||||
UnitTestCuSpMatrixInvert<Real>();
|
||||
UnitTestCuSpMatrixApproxInvert<Real>(300);
|
||||
UnitTestCuSpMatrixApproxInvert<Real>(100);
|
||||
UnitTestCuSpMatrixApproxInvert<Real>(10);
|
||||
UnitTestCuSpMatrixCopyFromMat<Real>();
|
||||
UnitTestCuSpMatrixAddVec2<Real>();
|
||||
UnitTestCuSpMatrixAddMat2<Real>();
|
||||
UnitTestCuSpMatrixAddSp<Real>();
|
||||
UnitTestCuSpMatrixAddToDiag<Real>();
|
||||
UnitTestCuSpMatrixSetUnit<Real>();
|
||||
}
|
||||
|
||||
// Runs the CuSpMatrix unit tests that involve two (possibly different)
// floating-point types.
template<typename Real, typename OtherReal>
void CudaSpMatrixUnitTest() {
  UnitTestCuSpMatrixTraceSpSp<Real, OtherReal>();
}
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
int main() {
|
||||
using namespace kaldi;
|
||||
|
||||
for (int32 loop = 0; loop < 2; loop++) {
|
||||
#if HAVE_CUDA == 1
|
||||
if (loop == 0)
|
||||
CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
|
||||
else
|
||||
CuDevice::Instantiate().SelectGpuId("yes"); // -2 .. automatic selection
|
||||
#endif
|
||||
|
||||
kaldi::CudaSpMatrixUnitTest<float>();
|
||||
kaldi::CudaSpMatrixUnitTest<float, float>();
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().DoublePrecisionSupported()) {
|
||||
kaldi::CudaSpMatrixUnitTest<double>();
|
||||
kaldi::CudaSpMatrixUnitTest<float, double>();
|
||||
kaldi::CudaSpMatrixUnitTest<double, float>();
|
||||
kaldi::CudaSpMatrixUnitTest<double, double>();
|
||||
} else {
|
||||
KALDI_WARN << "Double precision not supported";
|
||||
}
|
||||
#else
|
||||
kaldi::CudaSpMatrixUnitTest<float, double>();
|
||||
kaldi::CudaSpMatrixUnitTest<double, float>();
|
||||
kaldi::CudaSpMatrixUnitTest<double, double>();
|
||||
#endif
|
||||
|
||||
if (loop == 0)
|
||||
KALDI_LOG << "Tests without GPU use succeeded.\n";
|
||||
else
|
||||
KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
|
||||
}
|
||||
#if HAVE_CUDA == 1
|
||||
CuDevice::Instantiate().PrintProfile();
|
||||
#endif
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,361 @@
|
|||
#if HAVE_CUDA == 1
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <cublas.h>
|
||||
#endif
|
||||
|
||||
#include "util/timer.h"
|
||||
#include "cudamatrix/cu-common.h"
|
||||
#include "cudamatrix/cu-vector.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
#include "cudamatrix/cu-kernels.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
#include "cudamatrix/cu-sp-matrix.h"
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
#include "cudamatrix/cublas-wrappers.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
template<typename Real>
|
||||
void CuSpMatrix<Real>::CopyFromMat(const CuMatrixBase<Real> &M,
|
||||
SpCopyType copy_type) {
|
||||
KALDI_ASSERT(this->num_rows_ == M.NumRows() &&
|
||||
this->num_rows_ == M.NumCols());
|
||||
if (this->num_rows_ == 0)
|
||||
return;
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
MatrixIndexT D = this->NumRows();
|
||||
if (D == 0)
|
||||
return;
|
||||
switch (copy_type) {
|
||||
case kTakeMeanAndCheck:
|
||||
KALDI_ERR << "kTakeMeanAndCheck not supported!";
|
||||
// The grid/block dimensions have been very roughly tuned for the
|
||||
// individual cases.
|
||||
case kTakeMean:
|
||||
{
|
||||
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
|
||||
dim3 dimGrid(n_blocks(D, CU2DBLOCK), n_blocks(D, CU2DBLOCK));
|
||||
cuda_take_mean(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
}
|
||||
break;
|
||||
case kTakeLower:
|
||||
{
|
||||
dim3 dimBlock(1, CU1DBLOCK);
|
||||
dim3 dimGrid(D, n_blocks(D, CU1DBLOCK));
|
||||
cuda_take_lower(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
break;
|
||||
case kTakeUpper:
|
||||
{
|
||||
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
|
||||
dim3 dimGrid(n_blocks(D, CU2DBLOCK), n_blocks(D, CU2DBLOCK));
|
||||
cuda_take_upper(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
}
|
||||
break;
|
||||
default:
|
||||
KALDI_ASSERT("Invalid argument to CuSpMatrix::CopyFromMat");
|
||||
}
|
||||
CuDevice::Instantiate().AccuProfile("CuSpMatrix::CopyFromMat(from CuMatrixBase)", tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Mat().CopyFromMat(M.Mat(), copy_type);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuSpMatrix<Real>::Invert() {
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
CuMatrix<Real> mat(this->num_rows_, this->num_rows_);
|
||||
mat.CopyFromSp(*this);
|
||||
mat.SymInvertPosDef();
|
||||
this->CopyFromMat(mat);
|
||||
} else
|
||||
#endif
|
||||
{ // Use inversion of CPU-based SpMatrix.
|
||||
Mat().Invert();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void CuSpMatrix<Real>::InvertPosDefApprox(BaseFloat max_error) {
|
||||
if (this->num_rows_ == 0) return;
|
||||
MatrixIndexT dim = this->num_rows_;
|
||||
CuMatrix<Real> temp(dim * 5, dim);
|
||||
CuSubMatrix<Real> A(temp, 0, dim, 0, dim),
|
||||
AA(temp, dim, dim, 0, dim),
|
||||
AAA(temp, 2 * dim, dim, 0, dim),
|
||||
AAAA(temp, 3 * dim, dim, 0, dim);
|
||||
Real prescale = dim / this->Trace();
|
||||
this->Scale(prescale); // We'll compute the inverse of the prescaled A, and then
|
||||
// put that factor back later. This is useful since we
|
||||
// deal with high powers of A that could get large or small.
|
||||
A.CopyFromSp(*this);
|
||||
// use *this as a temporary SpMatrix; we've stored its contents in "A".
|
||||
this->AddMat2(1.0, A, kNoTrans, 0.0);
|
||||
AA.CopyFromSp(*this);
|
||||
{ // now create AAA and AAAA using a single multiplication.
|
||||
CuSubMatrix<Real> A_and_AA(temp, 0, dim * 2, 0, dim),
|
||||
AAA_and_AAAA(temp, dim * 2, dim * 2, 0, dim);
|
||||
// Note: below, the transpose-ness of AA is arbitrary since it's symmetric;
|
||||
// I guess that transposed may be faster.
|
||||
AAA_and_AAAA.AddMatMat(1.0, A_and_AA, kNoTrans, AA, kTrans, 0.0);
|
||||
}
|
||||
|
||||
// Note: below, trace_A equals dim because of the prescaling, we
|
||||
// ensured that.
|
||||
Vector<double> trace(8); // trace(i) is trace(A^(i+1))
|
||||
trace(0) = dim;
|
||||
{
|
||||
CuVector<Real> trace_vec(dim * 5);
|
||||
CuSubVector<Real> trace_lower4(trace_vec, 0, dim * 4),
|
||||
trace_lower3(trace_vec, 0, dim * 3),
|
||||
trace1(trace_vec, 0, dim), trace2(trace_vec, dim, dim),
|
||||
trace3(trace_vec, dim * 2, dim), trace4(trace_vec, dim * 3, dim),
|
||||
ones(trace_vec, dim * 4, dim);
|
||||
trace_lower4.AddDiagMat2(1.0, temp.Range(0, dim * 4, 0, dim),
|
||||
kNoTrans, 0.0);
|
||||
ones.Set(1.0);
|
||||
// TODO: can make these vecvec's faster as fake matrix multiplies.
|
||||
trace(1) = VecVec(trace1, ones);
|
||||
trace(3) = VecVec(trace2, ones);
|
||||
trace(5) = VecVec(trace3, ones);
|
||||
trace(7) = VecVec(trace4, ones);
|
||||
// Now we want to get odd-numbered trace quantities, so multiply the
|
||||
// rows of A through AAA with the rows of AA through AAA.
|
||||
CuSubMatrix<Real> lower_three(temp, 0, dim * 3, 0, dim),
|
||||
upper_three(temp, dim, dim * 3, 0, dim);
|
||||
trace_lower3.AddDiagMatMat(1.0, lower_three, kNoTrans, upper_three, kTrans, 0.0);
|
||||
trace(2) = VecVec(trace1, ones);
|
||||
trace(4) = VecVec(trace2, ones);
|
||||
trace(6) = VecVec(trace3, ones);
|
||||
}
|
||||
{ // Check the trace values.
|
||||
CuMatrix<Real> power(A);
|
||||
for (int32 i = 0; i < 8; i++) {
|
||||
double this_trace = power.Trace();
|
||||
AssertEqual(this_trace, trace(i));
|
||||
CuMatrix<Real> temp_power(power);
|
||||
power.AddMatMat(1.0, temp_power, kNoTrans, A, kNoTrans, 0.0);
|
||||
}
|
||||
}
|
||||
|
||||
// We'll use a and B to get the coefficients. These operations are in very
|
||||
// tiny dimensions -> faster and more convenient to use CPU.
|
||||
SubVector<double> a(trace, 0, 4);
|
||||
SpMatrix<double> B(4);
|
||||
for (int32 r = 0; r < 4; r++)
|
||||
for (int32 c = 0; c <= r; c++)
|
||||
B(r, c) = trace(r + c + 1);
|
||||
|
||||
TpMatrix<double> C(4);
|
||||
C.Cholesky(B);
|
||||
C.Invert();
|
||||
SpMatrix<double> Binv(4);
|
||||
Binv.AddTp2(1.0, C, kTrans, 0.0);
|
||||
Vector<double> v(4);
|
||||
v.AddSpVec(1.0, Binv, a, 0.0);
|
||||
|
||||
Real av = VecVec(a, v), vBv = VecSpVec(v, B, v),
|
||||
error = (vBv + dim) - 2.0 * av;
|
||||
|
||||
|
||||
KALDI_ASSERT(error >= 0.0); // note: error is a squared Frobenius
|
||||
// norm.
|
||||
|
||||
KALDI_VLOG(5) << "a is " << a << ", B is " << B;
|
||||
KALDI_VLOG(5) << "Dim is " << dim << ", error norm is " << sqrt(error);
|
||||
|
||||
if (error <= max_error) {
|
||||
// It's sufficient to return with the approximation up to A^3.
|
||||
A.Scale(v(1));
|
||||
A.AddToDiag(v(0));
|
||||
A.AddMat(v(2), AA);
|
||||
A.AddMat(v(3), AAA);
|
||||
this->CopyFromMat(A, kTakeLower);
|
||||
this->Scale(prescale);
|
||||
return;
|
||||
} else {
|
||||
// Let X be the approximate inverse of A: X = v(0) I + v(1) A + v(2) A^2 + v(3) A^3.
|
||||
// Let AX be A times X: AX = v(0) A + v(1) A^2 + v(2) A^3 + v(3) A^4.
|
||||
// We can construct both X and AX out of quantities we've already computed.
|
||||
|
||||
CuSubMatrix<Real> X(temp, dim * 4, dim, 0, dim),
|
||||
AX(temp, dim * 3, dim, 0, dim);
|
||||
|
||||
AX.Scale(v(3)); // AX re-uses memory of AAAA: scale that.
|
||||
AX.AddMat(v(2), AAA);
|
||||
AX.AddMat(v(1), AA);
|
||||
AX.AddMat(v(0), A);
|
||||
|
||||
X.AddMat(v(3), AAA); // X was zero before; space never used.
|
||||
X.AddMat(v(2), AA);
|
||||
X.AddMat(v(1), A);
|
||||
X.AddToDiag(v(0));
|
||||
|
||||
int32 num_iters = 10;
|
||||
for (int32 i = 0; i < num_iters; i++) {
|
||||
CuSubMatrix<Real> AX_and_X(temp, dim * 3, dim * 2, 0, dim),
|
||||
AAXX_and_AXX(temp, dim, dim * 2, 0, dim); // Note: in our variable-naming
|
||||
// conventions we put the A's first; since all quantities commute it doesn't
|
||||
// matter which order we put them in. Note: the transpose of AX below is
|
||||
// arbitrary (it's symmetric); I think it might be more efficient.`
|
||||
AAXX_and_AXX.AddMatMat(1.0, AX_and_X, kNoTrans, AX, kTrans, 0.0);
|
||||
|
||||
// The iteration now is X' <--- X (2I - AX). This is the iteration of
|
||||
// Schulz/Hoteling/whatever. To get the objf (and for the next iteration)
|
||||
// we also want AX'. Use X' <-- 2X - AXX, and AX' <-- 2AX - AAXX.
|
||||
// They go in the same place as before. For now on, forget about the dash
|
||||
// on the X, we'll just call it X.
|
||||
AX_and_X.Scale(2.0);
|
||||
AX_and_X.AddMat(-1.0, AAXX_and_AXX);
|
||||
|
||||
// The squared error is ||AX - I||^2 = tr((AX - I)(AX - I)) = tr(AX^T AX) + dim - 2 tr(AX)
|
||||
Real a = TraceMatMat(AX, AX, kTrans), b = AX.Trace();
|
||||
error = a + dim - 2 * b;
|
||||
|
||||
KALDI_VLOG(5) << "Better-inverse error is "
|
||||
<< sqrt(error);
|
||||
if (error <= max_error) {
|
||||
this->CopyFromMat(X, kTakeLower);
|
||||
this->Scale(prescale);
|
||||
return;
|
||||
}
|
||||
}
|
||||
KALDI_ASSERT("Error: max iters reached."); // TODO
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void CuSpMatrix<Real>::AddVec2(const Real alpha, const CuVectorBase<Real> &v) {
|
||||
KALDI_ASSERT(v.Dim() == this->NumRows());
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
size_t nr = this->num_rows_;
|
||||
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
|
||||
dim3 dimGrid(n_blocks(nr, CU2DBLOCK), n_blocks(nr, CU2DBLOCK));
|
||||
|
||||
cublas_spr('U', this->num_rows_, alpha, v.Data(),
|
||||
1, this->Data());
|
||||
CU_SAFE_CALL(cudaGetLastError());
|
||||
CuDevice::Instantiate().AccuProfile("CuSpMatrix::AddVec2", tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Mat().AddVec2(alpha, v.Vec());
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuSpMatrix<Real>::AddMat2(const Real alpha, const CuMatrixBase<Real> &M,
|
||||
MatrixTransposeType transM, const Real beta) {
|
||||
KALDI_ASSERT((transM == kNoTrans && this->NumRows() == M.NumRows())
|
||||
|| (transM == kTrans && this->NumRows() == M.NumCols()));
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
MatrixIndexT this_dim = this->NumRows(),
|
||||
m_other_dim = (transM == kNoTrans ? M.NumCols() : M.NumRows());
|
||||
|
||||
if (this_dim == 0) return;
|
||||
if (alpha == 0.0) {
|
||||
if (beta != 1.0) this->Scale(beta);
|
||||
return;
|
||||
}
|
||||
|
||||
char trans = (transM == kTrans ? 'N' : 'T');
|
||||
|
||||
CuMatrix<Real> tmp_mat(*this);
|
||||
cublas_syrk('U', trans, this_dim, m_other_dim, alpha, M.Data(),
|
||||
M.Stride(), beta, tmp_mat.Data(), tmp_mat.Stride());
|
||||
this->CopyFromMat(tmp_mat, kTakeLower);
|
||||
|
||||
CuDevice::Instantiate().AccuProfile("CuSpMatrix::AddMat2", tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Mat().AddMat2(alpha, M.Mat(), transM, beta);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
 * TraceSpSp: returns tr(A B) for two packed symmetric matrices, which may have different floating-point types.
|
||||
*/
|
||||
|
||||
template<typename Real, typename OtherReal>
|
||||
Real TraceSpSp(const CuSpMatrix<Real> &A, const CuSpMatrix<OtherReal> &B) {
|
||||
KALDI_ASSERT(A.NumRows() == B.NumRows());
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
MatrixIndexT nr = A.NumRows(), size = nr * (nr+1) / 2;
|
||||
CuVector<Real> Adiag(nr, kUndefined);
|
||||
CuVector<OtherReal> Bdiag(nr, kUndefined);
|
||||
Adiag.CopyDiagFromPacked(A);
|
||||
Bdiag.CopyDiagFromPacked(B);
|
||||
CuSubVector<Real> Aall(A.Data(), size);
|
||||
CuSubVector<OtherReal> Ball(B.Data(), size);
|
||||
// Below, we subtrace VecVec(Adiag, Bdiag) to remove double-counting
|
||||
// on the diagonal.
|
||||
return 2.0 * VecVec(Aall, Ball) - VecVec(Adiag, Bdiag);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
return TraceSpSp(A.Mat(), B.Mat());
|
||||
}
|
||||
}
|
||||
template
|
||||
float TraceSpSp(const CuSpMatrix<float> &A, const CuSpMatrix<float> &B);
|
||||
template
|
||||
float TraceSpSp(const CuSpMatrix<float> &A, const CuSpMatrix<double> &B);
|
||||
template
|
||||
double TraceSpSp(const CuSpMatrix<double> &A, const CuSpMatrix<float> &B);
|
||||
template
|
||||
double TraceSpSp(const CuSpMatrix<double> &A, const CuSpMatrix<double> &B);
|
||||
|
||||
|
||||
template<typename Real>
|
||||
bool CuSpMatrix<Real>::ApproxEqual(const CuSpMatrix<Real> &B, Real tol) const {
|
||||
KALDI_ASSERT(this->NumRows() == B.NumRows());
|
||||
CuSpMatrix<Real> diff(*this);
|
||||
diff.AddSp(-1.0, B);
|
||||
Real a = this->FrobeniusNorm(), b = B.FrobeniusNorm(),
|
||||
d = diff.FrobeniusNorm();
|
||||
return (d <= tol * std::max(a, b));
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
bool CuSpMatrix<Real>::IsUnit(Real tol) const {
|
||||
// want to return:
|
||||
//FrobeniusNorm(*this - I) <= tol * NumRows(), i.e.:
|
||||
//sqrt (trace((*this - I)(*this-I)) <= tol * NumRows()
|
||||
// trace((*this - I)(*this - I)) <= tol * NumRows()
|
||||
// trace(*this * *this) + trace(I) - 2 * trace(*this) <= tol * NumRows()
|
||||
// trace(*this * *this) + dim - 2*this.Trace() <= tol * NumRows()
|
||||
|
||||
// Note: we could do this more efficiently still, by slightly changing the
|
||||
// definition of IsUnit and getting rid of the extra stuff inside TraceSpSp
|
||||
// that corrects for the diagonal being counted twice.
|
||||
|
||||
return (TraceSpSp(*this, *this) + this->NumRows() - 2.0 * this->Trace() <=
|
||||
tol * this->NumRows());
|
||||
}
|
||||
|
||||
|
||||
template class CuSpMatrix<float>;
|
||||
template class CuSpMatrix<double>;
|
||||
|
||||
|
||||
|
||||
} // namespace
|
|
@ -0,0 +1,146 @@
|
|||
#ifndef KALDI_CUDAMATRIX_CU_SP_MATRIX_H_
|
||||
#define KALDI_CUDAMATRIX_CU_SP_MATRIX_H_
|
||||
|
||||
#include <sstream>
|
||||
|
||||
#include "cudamatrix/cu-common.h"
|
||||
#include "matrix/matrix-common.h"
|
||||
#include "matrix/sp-matrix.h"
|
||||
#include "cudamatrix/cu-array.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
#include "cudamatrix/cu-packed-matrix.h"
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
/// TraceSpSp returns tr(A B)
|
||||
template<typename Real, typename OtherReal>
|
||||
Real TraceSpSp(const CuSpMatrix<Real> &A, const CuSpMatrix<OtherReal> &B);
|
||||
|
||||
template<typename Real>
|
||||
class CuSpMatrix : public CuPackedMatrix<Real> {
|
||||
friend class CuMatrixBase<Real>;
|
||||
friend class CuVectorBase<Real>;
|
||||
friend class CuTpMatrix<Real>;
|
||||
friend class CuSubMatrix<Real>;
|
||||
friend class CuRand<Real>;
|
||||
|
||||
template<class R, class S>
|
||||
friend R TraceSpSp(const CuSpMatrix<R> &A, const CuSpMatrix<S> &B);
|
||||
public:
|
||||
|
||||
CuSpMatrix(): CuPackedMatrix<Real>() {}
|
||||
|
||||
explicit CuSpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero)
|
||||
: CuPackedMatrix<Real>(r, resize_type) {}
|
||||
|
||||
explicit CuSpMatrix(const SpMatrix<Real> &orig)
|
||||
: CuPackedMatrix<Real>(orig) {}
|
||||
|
||||
explicit CuSpMatrix(const CuSpMatrix<Real> &orig)
|
||||
: CuPackedMatrix<Real>(orig) {}
|
||||
|
||||
explicit CuSpMatrix(const CuMatrixBase<Real> &orig,
|
||||
SpCopyType copy_type = kTakeLower)
|
||||
: CuPackedMatrix<Real>(orig.NumRows(), kUndefined) {
|
||||
CopyFromMat(orig, copy_type);
|
||||
}
|
||||
|
||||
~CuSpMatrix() {}
|
||||
|
||||
inline void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero) {
|
||||
CuPackedMatrix<Real>::Resize(nRows, resize_type);
|
||||
}
|
||||
|
||||
Real FrobeniusNorm() const { return sqrt(TraceSpSp(*this, *this)); }
|
||||
|
||||
bool IsUnit(Real tol = 0.001) const;
|
||||
|
||||
bool ApproxEqual(const CuSpMatrix<Real> &other, Real tol = 0.001) const;
|
||||
|
||||
void CopyFromSp(const CuSpMatrix<Real> &other) {
|
||||
CuPackedMatrix<Real>::CopyFromPacked(other);
|
||||
}
|
||||
void CopyFromSp(const SpMatrix<Real> &other) {
|
||||
CuPackedMatrix<Real>::CopyFromPacked(other);
|
||||
}
|
||||
|
||||
void CopyFromMat(const CuMatrixBase<Real> &orig,
|
||||
SpCopyType copy_type = kTakeLower);
|
||||
|
||||
void CopyToSp(SpMatrix<Real> *dst) const { //added const by hxu
|
||||
CuPackedMatrix<Real>::CopyToPacked(dst);
|
||||
}
|
||||
|
||||
inline CuValue<Real> operator() (MatrixIndexT r, MatrixIndexT c) {
|
||||
if (static_cast<UnsignedMatrixIndexT>(c) >
|
||||
static_cast<UnsignedMatrixIndexT>(r))
|
||||
std::swap(c, r);
|
||||
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
|
||||
static_cast<UnsignedMatrixIndexT>(this->num_rows_));
|
||||
return CuValue<Real>(this->data_ + (r * (r+1)) / 2 + c);
|
||||
}
|
||||
|
||||
inline Real operator() (MatrixIndexT r, MatrixIndexT c) const {
|
||||
if (static_cast<UnsignedMatrixIndexT>(c) >
|
||||
static_cast<UnsignedMatrixIndexT>(r))
|
||||
std::swap(c, r);
|
||||
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
|
||||
static_cast<UnsignedMatrixIndexT>(this->num_rows_));
|
||||
return CuValue<Real>(this->data_ + (r * (r+1)) / 2 + c); // will be
|
||||
// casted to Real.
|
||||
}
|
||||
|
||||
/// Approximate inversion of positive definite matrices, using repeated
|
||||
/// multiplication. Limits the error by ensuring that
|
||||
/// || I - A Ainv ||^2 <= max_error, using Frobenius norm (so guarantees
|
||||
// that (I - A Ainv).IsUnit(max_error) == true).
|
||||
void InvertPosDefApprox(BaseFloat max_error = 0.1);
|
||||
|
||||
/// Note: the CuMatrix version of the Invert() function will only work for
|
||||
/// positive definite matrices; it is based on Cholesky.
|
||||
void Invert();
|
||||
|
||||
void AddVec2(const Real alpha, const CuVectorBase<Real> &v);
|
||||
|
||||
void AddMat2(const Real alpha, const CuMatrixBase<Real> &M,
|
||||
MatrixTransposeType transM, const Real beta);
|
||||
|
||||
void AddSp(const Real alpha, const CuSpMatrix<Real> &Ma) {
|
||||
this->AddPacked(alpha, Ma);
|
||||
}
|
||||
|
||||
protected:
|
||||
inline const SpMatrix<Real> &Mat() const {
|
||||
return *(reinterpret_cast<const SpMatrix<Real>* >(this));
|
||||
}
|
||||
inline SpMatrix<Real> &Mat() {
|
||||
return *(reinterpret_cast<SpMatrix<Real>* >(this));
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
template<typename Real>
|
||||
inline bool ApproxEqual(const CuSpMatrix<Real> &A,
|
||||
const CuSpMatrix<Real> &B, Real tol = 0.001) {
|
||||
return A.ApproxEqual(B, tol);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
inline void AssertEqual(const CuSpMatrix<Real> &A,
|
||||
const CuSpMatrix<Real> &B, Real tol = 0.001) {
|
||||
KALDI_ASSERT(ApproxEqual(A, B, tol));
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
SpMatrix<Real>::SpMatrix(const CuSpMatrix<Real> &cu) {
|
||||
Resize(cu.NumRows());
|
||||
cu.CopyToSp(this);
|
||||
}
|
||||
|
||||
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
|
@ -1,213 +0,0 @@
|
|||
// cudamatrix/cu-stlvector-inl.h
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
|
||||
#ifndef KALDI_CUDAMATRIX_CU_STLVECTOR_INL_H_
|
||||
#define KALDI_CUDAMATRIX_CU_STLVECTOR_INL_H_
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
#include <cuda_runtime_api.h>
|
||||
#include "cudamatrix/cu-common.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
#include "cudamatrix/cu-kernels.h"
|
||||
#endif
|
||||
|
||||
#include "util/timer.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
template<typename IntType>
|
||||
const IntType* CuStlVector<IntType>::Data() const {
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
return data_;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
return &vec_.front();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename IntType>
|
||||
IntType* CuStlVector<IntType>::Data() {
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
return data_;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
return &vec_.front();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename IntType>
|
||||
CuStlVector<IntType>& CuStlVector<IntType>::Resize(MatrixIndexT dim) {
|
||||
if (dim_ == dim) {
|
||||
// SetZero();
|
||||
return *this;
|
||||
}
|
||||
|
||||
Destroy();
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
cuSafeCall(cudaMalloc((void**)&data_, dim*sizeof(IntType)));
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
vec_.resize(dim);
|
||||
}
|
||||
|
||||
dim_ = dim;
|
||||
SetZero();
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename IntType>
|
||||
void CuStlVector<IntType>::Destroy() {
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
if (NULL != data_) {
|
||||
cuSafeCall(cudaFree(data_));
|
||||
data_ = NULL;
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
vec_.resize(0);
|
||||
}
|
||||
|
||||
dim_ = 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename IntType>
|
||||
CuStlVector<IntType>& CuStlVector<IntType>::CopyFromVec(const std::vector<IntType> &src) {
|
||||
Resize(src.size());
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
cuSafeCall(cudaMemcpy(data_, &src.front(), src.size()*sizeof(IntType), cudaMemcpyHostToDevice));
|
||||
|
||||
CuDevice::Instantiate().AccuProfile("CuStlVector::CopyFromVecH2D",tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
memcpy(&vec_.front(), &src.front(), src.size()*sizeof(IntType));
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename IntType>
|
||||
void CuStlVector<IntType>::CopyToVec(std::vector<IntType> *dst) const {
|
||||
if (static_cast<MatrixIndexT>(dst->size()) != dim_) {
|
||||
dst->resize(dim_);
|
||||
}
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
cuSafeCall(cudaMemcpy(&dst->front(), Data(), dim_*sizeof(IntType), cudaMemcpyDeviceToHost));
|
||||
CuDevice::Instantiate().AccuProfile("CuStlVector::CopyToVecD2H",tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
memcpy(&dst->front(), &vec_.front(), dim_*sizeof(IntType));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename IntType>
|
||||
void CuStlVector<IntType>::SetZero() {
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
cuSafeCall(cudaMemset(data_, 0, dim_*sizeof(IntType)));
|
||||
CuDevice::Instantiate().AccuProfile("CuStlVector::SetZero",tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
vec_.assign(dim_, 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Print the vector to stream
|
||||
*/
|
||||
template<typename IntType>
|
||||
std::ostream &operator << (std::ostream &out, const CuStlVector<IntType> &vec) {
|
||||
std::vector<IntType> tmp;
|
||||
vec.CopyToVec(&tmp);
|
||||
out << "[";
|
||||
for(int32 i=0; i<tmp.size(); i++) {
|
||||
out << " " << tmp[i];
|
||||
}
|
||||
out << " ]\n";
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Methods wrapping the ANSI-C CUDA kernels
|
||||
*/
|
||||
template<>
|
||||
inline void CuStlVector<int32>::Set(int32 value) {
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
dim3 dimBlock(CUBLOCK);
|
||||
dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
|
||||
::MatrixDim d = { 1, Dim(), Dim() };
|
||||
|
||||
cudaI32_set_const(dimGrid, dimBlock, data_, value, d);
|
||||
cuSafeCall(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
vec_.assign(vec_.size(), value);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
#endif
|
||||
|
||||
|
|
@ -1,109 +0,0 @@
|
|||
// cudamatrix/cu-stlvector.h
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
|
||||
#ifndef KALDI_CUDAMATRIX_CU_STLVECTOR_H_
|
||||
#define KALDI_CUDAMATRIX_CU_STLVECTOR_H_
|
||||
|
||||
#include "matrix/kaldi-vector.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
template<typename IntType> class CuMatrix;
|
||||
|
||||
/**
|
||||
* std::vector equivalent for CUDA computing
|
||||
*/
|
||||
template<typename IntType>
|
||||
class CuStlVector {
|
||||
typedef CuStlVector<IntType> ThisType;
|
||||
public:
|
||||
|
||||
/// Default Constructor
|
||||
CuStlVector<IntType>()
|
||||
: dim_(0), data_(NULL) {
|
||||
}
|
||||
/// Constructor with memory initialisation
|
||||
CuStlVector<IntType>(MatrixIndexT dim)
|
||||
: dim_(0), data_(NULL) {
|
||||
Resize(dim);
|
||||
}
|
||||
|
||||
/// Destructor
|
||||
~CuStlVector() {
|
||||
Destroy();
|
||||
}
|
||||
|
||||
/// Dimensions
|
||||
MatrixIndexT Dim() const {
|
||||
return dim_;
|
||||
}
|
||||
|
||||
/// Get raw pointer
|
||||
const IntType* Data() const;
|
||||
IntType* Data();
|
||||
|
||||
/// Allocate the memory
|
||||
ThisType& Resize(MatrixIndexT dim);
|
||||
|
||||
/// Deallocate the memory
|
||||
void Destroy();
|
||||
|
||||
/// Copy functions (reallocates when needed)
|
||||
ThisType& CopyFromVec(const std::vector<IntType> &src);
|
||||
void CopyToVec(std::vector<IntType> *dst) const;
|
||||
|
||||
/// Math operations
|
||||
void SetZero();
|
||||
void Set(IntType value);
|
||||
|
||||
/// Accessor to non-GPU vector
|
||||
const std::vector<IntType>& Vec() const {
|
||||
return vec_;
|
||||
}
|
||||
std::vector<IntType>& Vec() {
|
||||
return vec_;
|
||||
}
|
||||
|
||||
private:
|
||||
MatrixIndexT dim_; ///< dimension of the vector
|
||||
IntType *data_; ///< GPU data pointer
|
||||
std::vector<IntType> vec_; ///< non-GPU vector as back-up
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Signatures of general/specialized methods
|
||||
*/
|
||||
/// Generic Set(): not implemented for arbitrary element types, so calling
/// it reports an error through KALDI_ERR.
template<typename Real>
void CuStlVector<Real>::Set(Real value) {
  KALDI_ERR << __func__ << " Not implemented";
}
|
||||
template<> inline void CuStlVector<int32>::Set(int32 value);
|
||||
|
||||
|
||||
/// I/O
|
||||
template<typename IntType>
|
||||
std::ostream &operator << (std::ostream &out, const CuStlVector<IntType> &vec);
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
#include "cu-stlvector-inl.h"
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,582 @@
|
|||
// cudamatrix/cuda-test.cc
|
||||
//
|
||||
//
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdlib>
|
||||
#include <ctime>
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
#include "cudamatrix/cu-sp-matrix.h"
|
||||
#include "cudamatrix/cu-tp-matrix.h"
|
||||
#include "cudamatrix/cu-packed-matrix.h"
|
||||
#include "cudamatrix/cu-vector.h"
|
||||
#include <numeric>
|
||||
#include <time.h>
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
/*
|
||||
* INITIALIZERS
|
||||
*/
|
||||
template<typename Real>
|
||||
static void InitRand(SpMatrix<Real> *M) {
|
||||
do {
|
||||
for (MatrixIndexT i = 0; i < M->NumRows(); i++) {
|
||||
for (MatrixIndexT j = 0; j <= i; j++ ) {
|
||||
(*M)(i,j) = RandGauss();
|
||||
}
|
||||
}
|
||||
} while (M->NumRows() != 0 && M->Cond() > 100);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void InitRand(VectorBase<Real> *v) {
|
||||
for (MatrixIndexT i = 0; i < v->Dim(); i++) {
|
||||
(*v)(i) = RandGauss();
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestSetZeroUpperDiag() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 10 * i;
|
||||
Matrix<Real> A(dim,dim);
|
||||
A.SetRandn();
|
||||
CuMatrix<Real> B(A);
|
||||
|
||||
B.SetZeroUpperDiag();
|
||||
|
||||
Real sum = 0.0;
|
||||
for (MatrixIndexT i = 0; i < dim; i++) {
|
||||
for (MatrixIndexT j = i + 1; j < dim; j++)
|
||||
sum += A(i,j);
|
||||
}
|
||||
|
||||
KALDI_LOG << "the upper diaganoal sum for A is : " << sum;
|
||||
B.CopyToMat(&A);
|
||||
sum = 0.0;
|
||||
for (MatrixIndexT i = 0; i < dim; i++) {
|
||||
for (MatrixIndexT j = i + 1; j < dim; j++)
|
||||
sum += A(i,j);
|
||||
}
|
||||
KALDI_LOG << "the upper diaganoal sum for B is : " << sum;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real> static void UnitTestCholesky() {
|
||||
for (MatrixIndexT iter = 0; iter < 3; iter++) {
|
||||
MatrixIndexT dim = 300 + rand() % 200;
|
||||
// set dimension
|
||||
// computing the matrix for cholesky input
|
||||
// CuMatrix is cuda matrix class while Matrix is cpu matrix class
|
||||
CuMatrix<Real> A(dim,dim);
|
||||
Matrix<Real> B(dim,dim);
|
||||
Vector<Real> C(dim);
|
||||
for (MatrixIndexT i = 0; i < dim; i++) {
|
||||
B(i,i) = 1;
|
||||
C(i) = i + 1;
|
||||
}
|
||||
B.AddVecVec(1.0, C, C);
|
||||
// copy the matrix to cudamatrix object
|
||||
A.CopyFromMat(B);
|
||||
A.CopyToMat(&B);
|
||||
//KALDI_LOG << B << '\n';
|
||||
// doing cholesky
|
||||
A.Cholesky();
|
||||
|
||||
Matrix<Real> D(dim,dim);
|
||||
A.CopyToMat(&D);
|
||||
|
||||
//KALDI_LOG << "D is: " << D << '\n';
|
||||
Matrix<Real> E(dim,dim);
|
||||
E.AddMatMat(1.0, D, kNoTrans, D, kTrans, 0.0);
|
||||
// check if the D'D is equal to B or not!
|
||||
AssertEqual(B, E);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> static void UnitTestTrace() {
|
||||
for (MatrixIndexT iter = 1; iter < 18; iter++) {
|
||||
MatrixIndexT dim = iter;
|
||||
KALDI_LOG << "dim is : " << iter;
|
||||
SpMatrix<Real> A(dim);
|
||||
A.SetRandn();
|
||||
CuSpMatrix<Real> B(A);
|
||||
KALDI_LOG << "cpu trace is : " << A.Trace();
|
||||
KALDI_LOG << "gpu trace is : " << B.Trace();
|
||||
}
|
||||
/*
|
||||
Vector<Real> tim(100);
|
||||
Vector<Real> d(100);
|
||||
for (MatrixIndexT iter = 0; iter < 100; iter++) {
|
||||
MatrixIndexT dim = 10000 + rand() % 400;
|
||||
Matrix<Real> A(dim,dim);
|
||||
A.SetRandn();
|
||||
CuMatrix<Real> B(A);
|
||||
CuSpMatrix<Real> C(B,kTakeLower);
|
||||
clock_t t1 = clock();
|
||||
tim(iter) = C.Trace();
|
||||
clock_t t2 = clock();
|
||||
//tim(iter) = t2 - t1;
|
||||
d(iter) = dim;
|
||||
KALDI_LOG << tim(iter) << iter << '\n';
|
||||
KALDI_LOG << d(iter) << iter << '\n';
|
||||
}
|
||||
KALDI_LOG << "tim is " << tim << '\n';
|
||||
KALDI_LOG << "dim is " << d << '\n';
|
||||
*/
|
||||
}
|
||||
|
||||
template<typename Real> static void UnitInvert() {
|
||||
//MatrixIndexT dim = 15 + rand() % 40;;
|
||||
MatrixIndexT dim = 8;
|
||||
CuMatrix<Real> A(dim,dim);
|
||||
Matrix<Real> B(dim,dim);
|
||||
Vector<Real> C(dim);
|
||||
for (MatrixIndexT i = 0; i < dim; i++) {
|
||||
B(i,i) = 1;
|
||||
C(i) = i + 1;
|
||||
}
|
||||
B.AddVecVec(1.0,C,C);
|
||||
CuMatrix<Real> tmp(dim,dim);
|
||||
A.CopyFromMat(B);
|
||||
//A.Cholesky();
|
||||
A.CopyToMat(&B);
|
||||
KALDI_LOG << "B is : " << '\n';
|
||||
KALDI_LOG << B << '\n';
|
||||
A.SymInvertPosDef();
|
||||
Matrix<Real> D(dim,dim);
|
||||
A.CopyToMat(&D);
|
||||
KALDI_LOG << "D is : " << '\n';
|
||||
KALDI_LOG << D << '\n';
|
||||
Matrix<Real> X(dim,dim);
|
||||
X.AddMatMat(1,B,kNoTrans,D,kNoTrans,0);
|
||||
KALDI_LOG << X << '\n';
|
||||
//for (MatrixIndexT i = 0; i < dim; i++) {
|
||||
// for (MatrixIndexT j = i+1; j < dim; j++)
|
||||
// D(i,j) = 0;
|
||||
//}
|
||||
//Matrix<Real> E(dim,dim);
|
||||
//E.AddMatMat(1,D,kNoTrans,D,kTrans,0);
|
||||
//AssertEqual(B,E);
|
||||
}
|
||||
|
||||
template<typename Real> static void UnitTestInvert() {
|
||||
for (MatrixIndexT iter = 0; iter < 3; iter++) {
|
||||
MatrixIndexT dim = 500 + rand() % 400;
|
||||
|
||||
KALDI_LOG << "dim is : " << '\n';
|
||||
KALDI_LOG << dim << '\n';
|
||||
CuMatrix<Real> A(dim,dim);
|
||||
Matrix<Real> B(dim,dim);
|
||||
Vector<Real> C(dim);
|
||||
for (MatrixIndexT i = 0; i < dim; i++) {
|
||||
B(i,i) = 1;
|
||||
C(i) = (i/(1.0*dim)) + 1;
|
||||
}
|
||||
Matrix<Real> Identity(B);
|
||||
B.AddVecVec(1.0, C, C);
|
||||
// Now we have a positive-definite B (inversion would
|
||||
// fail if it were not positive definite).
|
||||
|
||||
A.CopyFromMat(B);
|
||||
|
||||
A.SymInvertPosDef();
|
||||
Matrix<Real> D(dim,dim);
|
||||
A.CopyToMat(&D);
|
||||
|
||||
Matrix<Real> X(dim,dim);
|
||||
X.AddMatMat(1.0, B, kNoTrans, D, kNoTrans, 0.0);
|
||||
// KALDI_LOG << "X is (should be identity): " << X << '\n';
|
||||
AssertEqual(Identity, X, (sizeof(Real) == 4 ? 0.1 : 0.001));
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> static void UnitTestConstructor() {
|
||||
MatrixIndexT dim = 8;
|
||||
CuMatrix<Real> A(dim,dim);
|
||||
Matrix<Real> B(dim,dim);
|
||||
for (MatrixIndexT i = 0; i < dim; i++) {
|
||||
for (MatrixIndexT j = 0; j <=i; j++)
|
||||
B(i,j) = i+j;
|
||||
for (MatrixIndexT j = i+1; j < dim; j++)
|
||||
B(i,j) = i+j+4;
|
||||
}
|
||||
KALDI_LOG << "A is : " << '\n';
|
||||
KALDI_LOG << B << '\n';
|
||||
A.CopyFromMat(B);
|
||||
//CuSpMatrix<Real> C(dim);
|
||||
//C.CopyFromMat(A,kTakeLower);
|
||||
CuSpMatrix<Real> C(A, kTakeLower);
|
||||
SpMatrix<Real> D(dim);
|
||||
C.CopyToSp(&D);
|
||||
KALDI_LOG << "C is : " << '\n';
|
||||
for (MatrixIndexT i = 0; i < dim; i++) {
|
||||
for (MatrixIndexT j = 0; j <= i; j++)
|
||||
std::cout << D(i,j) << " ";
|
||||
std::cout << '\n';
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> static void UnitTestCopySp() {
|
||||
// Checking that the various versions of copying
|
||||
// matrix to SpMatrix work the same in the symmetric case.
|
||||
for (MatrixIndexT iter = 0;iter < 5;iter++) {
|
||||
int32 dim = 5 + rand() % 10;
|
||||
SpMatrix<Real> A(dim), B(dim);
|
||||
A.SetRandn();
|
||||
Matrix<Real> C(A);
|
||||
//CuMatrix<Real> D(C);
|
||||
|
||||
{
|
||||
CuMatrix<Real> D2(dim,dim);
|
||||
D2.CopyFromMat(C);
|
||||
KALDI_LOG << "D2 is " << D2;
|
||||
CuSpMatrix<Real> E(D2.NumRows(), kUndefined);
|
||||
KALDI_LOG << "D2 is " << D2;
|
||||
E.CopyFromMat(D2, kTakeLower);
|
||||
KALDI_LOG << "D2 is " << D2;
|
||||
}
|
||||
|
||||
CuMatrix<Real> D(dim,dim);
|
||||
D.CopyFromMat(C);
|
||||
KALDI_LOG << "D stride is : " << D.Stride() <<'\n';
|
||||
|
||||
CuSpMatrix<Real> E(D,kTakeLower);
|
||||
///CuSpMatrix<Real> E(dim);
|
||||
//E.CopyFromMat(D,kTakeLower);
|
||||
/*
|
||||
KALDI_LOG << D.NumRows() << '\n';
|
||||
//E.CopyFromMat(D, kTakeMean);
|
||||
//E(D, kTakeMean);
|
||||
//KALDI_LOG << E.NumRows() << '\n';
|
||||
|
||||
E.CopyToMat(&B);
|
||||
AssertEqual(A, B);
|
||||
B.SetZero();
|
||||
//E.CopyFromMat(D, kTakeLower);
|
||||
CuSpMatrix<Real> F(D,kTakeLower);
|
||||
//F(D, kTakeLower);
|
||||
F.CopyToMat(&B);
|
||||
AssertEqual(A, B);
|
||||
B.SetZero();
|
||||
//E.CopyFromMat(D, kTakeUpper);
|
||||
//E(D, kTakeUpper);
|
||||
CuSpMatrix<Real> G(D, kTakeUpper);
|
||||
G.CopyToMat(&B);
|
||||
AssertEqual(A, B);
|
||||
*/
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template<typename Real> static void UnitTestCopyFromMat() {
|
||||
MatrixIndexT dim = 8;
|
||||
CuMatrix<Real> A(dim,dim);
|
||||
Matrix<Real> B(dim,dim);
|
||||
for (MatrixIndexT i = 0; i < dim; i++) {
|
||||
for (MatrixIndexT j = 0; j <=i; j++)
|
||||
B(i,j) = i+j;
|
||||
for (MatrixIndexT j = i+1; j < dim; j++)
|
||||
B(i,j) = i+j+4;
|
||||
}
|
||||
KALDI_LOG << "A is : " << '\n';
|
||||
KALDI_LOG << B << '\n';
|
||||
A.CopyFromMat(B);
|
||||
CuSpMatrix<Real> C(dim);
|
||||
C.CopyFromMat(A,kTakeLower);
|
||||
SpMatrix<Real> D(dim);
|
||||
C.CopyToSp(&D);
|
||||
KALDI_LOG << "C is : " << '\n';
|
||||
for (MatrixIndexT i = 0; i < dim; i++) {
|
||||
for (MatrixIndexT j = 0; j <= i; j++)
|
||||
std::cout << D(i,j) << " ";
|
||||
std::cout << '\n';
|
||||
}
|
||||
C.CopyFromMat(A,kTakeUpper);
|
||||
C.CopyToSp(&D);
|
||||
KALDI_LOG << "C is : " << '\n';
|
||||
for (MatrixIndexT i = 0; i < dim; i++) {
|
||||
for (MatrixIndexT j = 0; j <= i; j++)
|
||||
std::cout << D(i,j) << " ";
|
||||
std::cout << '\n';
|
||||
}
|
||||
|
||||
C.CopyFromMat(A,kTakeMean);
|
||||
C.CopyToSp(&D);
|
||||
KALDI_LOG << "C is : " << '\n';
|
||||
for (MatrixIndexT i = 0; i < dim; i++) {
|
||||
for (MatrixIndexT j = 0; j <= i; j++)
|
||||
std::cout << D(i,j) << " ";
|
||||
std::cout << '\n';
|
||||
}
|
||||
|
||||
//KALDI_LOG << D << '\n';
|
||||
}
|
||||
|
||||
template<typename Real> static void UnitTestMatrix() {
|
||||
//operator()
|
||||
for (MatrixIndexT iter = 0; iter < 2; iter++) {
|
||||
int32 dim1 = 6 + rand() % 10;
|
||||
int32 dim2 = 8 + rand() % 10;
|
||||
Matrix<Real> A(dim1,dim2);
|
||||
A.SetRandn();
|
||||
CuMatrix<Real> B(A);
|
||||
KALDI_ASSERT(A(3, 4) == B(3, 4));
|
||||
B(3, 4) = 2.0;
|
||||
A(3, 4) = B(3, 4);
|
||||
KALDI_ASSERT(A(3, 4) == B(3, 4));
|
||||
|
||||
SpMatrix<Real> As(dim1);
|
||||
CuSpMatrix<Real> Bs(As);
|
||||
KALDI_ASSERT(As(3, 4) == Bs(3, 4));
|
||||
Bs(3, 4) = 2.0;
|
||||
if (rand() % 2 == 0)
|
||||
As(3, 4) = Bs(3, 4);
|
||||
else
|
||||
As(3, 4) = (const_cast<const CuSpMatrix<Real>&>(Bs))(3, 4);
|
||||
|
||||
KALDI_ASSERT(As(3, 4) == Bs(3, 4));
|
||||
|
||||
Vector<Real> v(dim1);
|
||||
CuVector<Real> w(v);
|
||||
KALDI_ASSERT(w(2) == v(2));
|
||||
w(2) = 3.0;
|
||||
v(2) = w(2);
|
||||
KALDI_ASSERT(w(2) == v(2));
|
||||
}
|
||||
|
||||
//SetRandn
|
||||
for (MatrixIndexT iter = 0; iter < 10; iter++) {
|
||||
int32 dim1 = 15 + rand() % 10;
|
||||
int32 dim2 = dim1;//10 + rand() % 14;
|
||||
//KALDI_LOG << "dimension is " << dim1
|
||||
// << " " << dim2 << '\n';
|
||||
CuMatrix<Real> A(dim1,dim2);
|
||||
A.SetRandn();
|
||||
Matrix<Real> A1(dim1,dim2);
|
||||
A.CopyToMat(&A1);
|
||||
//KALDI_LOG << "gpu sum is: " << A.Sum() << '\n';
|
||||
//KALDI_LOG << "cpu sum is: " << A1.Sum() << '\n';
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> static void UnitTestMulTp() {
|
||||
for (MatrixIndexT iter = 0; iter < 10; iter++) {
|
||||
int32 dim = 1 + rand() % 30;
|
||||
Vector<Real> v(dim);
|
||||
v.SetRandn();
|
||||
TpMatrix<Real> M(dim);
|
||||
M.SetRandn();
|
||||
CuVector<Real> cv(v);
|
||||
CuTpMatrix<Real> cM(M);
|
||||
|
||||
Vector<Real> v2(dim);
|
||||
cv.CopyToVec(&v2);
|
||||
AssertEqual(v, v2);
|
||||
v.MulTp(M, iter % 2 == 0 ? kTrans:kNoTrans);
|
||||
cv.MulTp(cM, iter % 2 == 0 ? kTrans:kNoTrans);
|
||||
cv.CopyToVec(&v2);
|
||||
// KALDI_LOG << "v is " << v << ", v2 is " << v2;
|
||||
AssertEqual(v, v2);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> static void UnitTestVector() {
|
||||
// Scale
|
||||
for (MatrixIndexT iter = 0; iter < 10; iter++) {
|
||||
int32 dim = 24 + rand() % 10;
|
||||
Vector<Real> A(dim);
|
||||
A.SetRandn();
|
||||
CuVector<Real> B(A);
|
||||
Vector<Real> C(dim);
|
||||
Real r = 1.43;
|
||||
B.Scale(r);
|
||||
B.CopyToVec(&C);
|
||||
A.Scale(r);
|
||||
//KALDI_LOG << A;
|
||||
//KALDI_LOG << (A.Scale(r));
|
||||
//KALDI_LOG << C;
|
||||
AssertEqual(A, C);
|
||||
}
|
||||
|
||||
for (MatrixIndexT iter = 0; iter < 10; iter++) {
|
||||
int32 dim = 15 + rand() % 10;
|
||||
CuVector<Real> A(dim);
|
||||
CuVector<Real> B(dim);
|
||||
Vector<Real> A1(dim);
|
||||
Vector<Real> B1(dim);
|
||||
A.SetRandn();
|
||||
B.SetRandn();
|
||||
A.CopyToVec(&A1);
|
||||
B.CopyToVec(&B1);
|
||||
A.MulElements(B);
|
||||
A1.MulElements(B1);
|
||||
Vector<Real> A2(dim);
|
||||
A.CopyToVec(&A2);
|
||||
AssertEqual(A1,A2);
|
||||
}
|
||||
/*
|
||||
for (MatrixIndexT iter = 0; iter < 10; iter++) {
|
||||
int32 dim = 72;
|
||||
CuVector<Real> A(dim);
|
||||
Vector<Real> A1(dim);
|
||||
CuMatrix<Real> B(9,8);
|
||||
Matrix<Real> B1(9,8);
|
||||
B.SetRandn();
|
||||
B.CopyToMat(&B1);
|
||||
A.CopyRowsFromMat(B);
|
||||
A1.CopyRowsFromMat(B1);
|
||||
Vector<Real> A2(dim);
|
||||
A.CopyToVec(&A2);
|
||||
AssertEqual(A1,A2);
|
||||
}
|
||||
|
||||
for (MatrixIndexT iter = 0; iter < 10; iter++) {
|
||||
int32 dim = 15 + rand() % 10;
|
||||
CuVector<Real> A(dim);
|
||||
A.SetRandn();
|
||||
Vector<Real> A1(dim);
|
||||
A.CopyToVec(&A1);
|
||||
KALDI_LOG << "cpu min is : " << A1.Min() << '\n';
|
||||
KALDI_LOG << "gpu min is : " << A.Min() << '\n';
|
||||
}
|
||||
|
||||
for (MatrixIndexT iter = 0; iter < 10; iter++) {
|
||||
int32 dim = 15 + rand() % 10;
|
||||
CuVector<Real> A(dim);
|
||||
A.SetRandn();
|
||||
Vector<Real> A1(dim);
|
||||
A.CopyToVec(&A1);
|
||||
CuVector<Real> B(dim);
|
||||
B.SetRandn();
|
||||
Vector<Real> B1(dim);
|
||||
B.CopyToVec(&B1);
|
||||
CuVector<Real> C(dim);
|
||||
C.SetRandn();
|
||||
Vector<Real> C1(dim);
|
||||
C.CopyToVec(&C1);
|
||||
Real alpha = 2;
|
||||
Real beta = 3;
|
||||
A.AddVecVec(alpha, B, C, beta);
|
||||
A1.AddVecVec(alpha,B1,C1,beta);
|
||||
Vector<Real> D(dim);
|
||||
A.CopyToVec(&D);
|
||||
AssertEqual(D,A1);
|
||||
}
|
||||
|
||||
for (MatrixIndexT iter = 0; iter < 10; iter++) {
|
||||
int32 dim1 = 15 + rand() % 10;
|
||||
int32 dim2 = 10 + rand() % 10;
|
||||
Matrix<Real> A(dim1,dim2);
|
||||
for (MatrixIndexT i = 0; i < dim1; i++) {
|
||||
for (MatrixIndexT j = 0; j < dim2; j++)
|
||||
A(i,j) = i + 2 * j + 1;
|
||||
}
|
||||
KALDI_LOG << A;
|
||||
CuMatrix<Real> B(dim1,dim2);
|
||||
B.CopyFromMat(A);
|
||||
CuVector<Real> C(dim1);
|
||||
C.SetZero();
|
||||
Real alpha = 1;
|
||||
Real beta = 1;
|
||||
C.AddDiagMat2(alpha, B, kNoTrans, beta);
|
||||
Vector<Real> D(dim1);
|
||||
C.CopyToVec(&D);
|
||||
KALDI_LOG << D << '\n';
|
||||
Vector<Real> E(dim1);
|
||||
E.AddDiagMat2(alpha, A, kNoTrans, beta);
|
||||
KALDI_LOG << E;
|
||||
AssertEqual(D,E);
|
||||
}
|
||||
|
||||
for (MatrixIndexT iter = 0; iter < 10; iter++) {
|
||||
int32 dim1 = 15 + rand() % 10;
|
||||
int32 dim2 = 10 + rand() % 10;
|
||||
Matrix<Real> A(dim1,dim2);
|
||||
for (MatrixIndexT i = 0; i < dim1; i++) {
|
||||
for (MatrixIndexT j = 0; j < dim2; j++)
|
||||
A(i,j) = i + 2 * j + 1;
|
||||
}
|
||||
KALDI_LOG << A;
|
||||
CuMatrix<Real> B(dim1,dim2);
|
||||
B.CopyFromMat(A);
|
||||
CuSubVector<Real> C(B,1);
|
||||
Vector<Real> D(dim2);
|
||||
C.CopyToVec(&D);
|
||||
KALDI_LOG << D;
|
||||
}
|
||||
|
||||
for (MatrixIndexT iter = 0; iter < 10; iter++) {
|
||||
int32 dim = 15 + rand() % 10;
|
||||
CuVector<Real> A(dim);
|
||||
A.SetRandn();
|
||||
Vector<Real> A1(dim);
|
||||
A.CopyToVec(&A1);
|
||||
CuVector<Real> B(dim);
|
||||
B.SetRandn();
|
||||
Vector<Real> B1(dim);
|
||||
B.CopyToVec(&B1);
|
||||
Real dot = VecVec(A,B);
|
||||
KALDI_LOG << "dot product in gpu: " << dot << '\n';
|
||||
dot = VecVec(A1,B1);
|
||||
KALDI_LOG << "dot product in cpu: " << dot << '\n';
|
||||
}
|
||||
|
||||
for (MatrixIndexT iter = 0; iter < 10; iter++) {
|
||||
int32 dim = 15 + rand() % 10;
|
||||
CuVector<Real> A(dim);
|
||||
Vector<Real> A1(dim);
|
||||
for (MatrixIndexT i = 0; i < dim; i++)
|
||||
A1(i) = i;
|
||||
A.CopyFromVec(A1);
|
||||
KALDI_LOG << A(dim-2) << '\n';
|
||||
KALDI_LOG << A1(dim-2) << '\n';
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void CuMatrixUnitTest() {
|
||||
UnitTestTrace<Real>();
|
||||
UnitTestCholesky<Real>();
|
||||
UnitTestInvert<Real>();
|
||||
UnitInvert<Real>();
|
||||
UnitTestCopyFromMat<Real>();
|
||||
UnitTestCopySp<Real>();
|
||||
UnitTestConstructor<Real>();
|
||||
UnitTestVector<Real>();
|
||||
UnitTestMulTp<Real>();
|
||||
UnitTestMatrix<Real>();
|
||||
UnitTestSetZeroUpperDiag<Real>();
|
||||
}
|
||||
} //namespace
|
||||
|
||||
int main() {
|
||||
using namespace kaldi;
|
||||
#if HAVE_CUDA == 1
|
||||
kaldi::CuDevice::Instantiate().SelectGpuId("yes");
|
||||
#endif
|
||||
|
||||
kaldi::CuMatrixUnitTest<float>();
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
if (!kaldi::CuDevice::Instantiate().DoublePrecisionSupported()) {
|
||||
KALDI_WARN << "Double precision not supported, not testing that code";
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
kaldi::CuMatrixUnitTest<double>();
|
||||
}
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
kaldi::CuDevice::Instantiate().PrintProfile();
|
||||
#endif
|
||||
|
||||
|
||||
KALDI_LOG << "Tests succeeded.\n";
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,218 @@
|
|||
// cudamatrix/cu-tp-matrix-test.cc
|
||||
//
|
||||
// Copyright 2013 Ehsan Variani
|
||||
// Lucas Ondel
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// UnitTests for testing cu-tp-matrix.h methods.
|
||||
//
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
#include "cudamatrix/cu-tp-matrix.h"
|
||||
#include "cudamatrix/cu-vector.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
#include "cudamatrix/cu-sp-matrix.h"
|
||||
|
||||
using namespace kaldi;
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
template<typename Real>
|
||||
static void AssertEqual(const CuPackedMatrix<Real> &A,
|
||||
const CuPackedMatrix<Real> &B,
|
||||
float tol = 0.001) {
|
||||
KALDI_ASSERT(A.NumRows() == B.NumRows());
|
||||
for (MatrixIndexT i = 0; i < A.NumRows(); i++)
|
||||
for (MatrixIndexT j = 0; j <= i; j++)
|
||||
KALDI_ASSERT(std::abs(A(i, j) - B(i, j))
|
||||
< tol * std::max(1.0, (double) (std::abs(A(i, j)) + std::abs(B(i, j)))));
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void AssertEqual(const PackedMatrix<Real> &A,
|
||||
const PackedMatrix<Real> &B,
|
||||
float tol = 0.001) {
|
||||
KALDI_ASSERT(A.NumRows() == B.NumRows());
|
||||
for (MatrixIndexT i = 0; i < A.NumRows(); i++)
|
||||
for (MatrixIndexT j = 0; j <= i; j++)
|
||||
KALDI_ASSERT(std::abs(A(i, j) - B(i, j))
|
||||
< tol * std::max(1.0, (double) (std::abs(A(i, j)) + std::abs(B(i, j)))));
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void AssertEqual(const PackedMatrix<Real> &A,
|
||||
const CuPackedMatrix<Real> &B,
|
||||
float tol = 0.001) {
|
||||
KALDI_ASSERT(A.NumRows() == B.NumRows());
|
||||
for (MatrixIndexT i = 0; i < A.NumRows(); i++)
|
||||
for (MatrixIndexT j = 0; j <= i; j++)
|
||||
KALDI_ASSERT(std::abs(A(i, j) - B(i, j))
|
||||
< tol * std::max(1.0, (double) (std::abs(A(i, j)) + std::abs(B(i, j)))));
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Unit Tests
|
||||
*/
|
||||
template<typename Real>
|
||||
static void UnitTestCuTpMatrixInvert() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 5 * i + rand() % 10;
|
||||
|
||||
TpMatrix<Real> A(dim);
|
||||
A.SetRandn();
|
||||
CuTpMatrix<Real> B(A);
|
||||
|
||||
AssertEqual<Real>(A, B, 0.005);
|
||||
A.Invert();
|
||||
B.Invert();
|
||||
AssertEqual<Real>(A, B, 0.005);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuTpMatrixCopyFromTp() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 5 * i + rand() % 10;
|
||||
|
||||
TpMatrix<Real> A(dim);
|
||||
A.SetRandn();
|
||||
CuTpMatrix<Real> B(dim);
|
||||
B.CopyFromTp(A);
|
||||
CuTpMatrix<Real> C(dim);
|
||||
C.CopyFromTp(B);
|
||||
|
||||
AssertEqual<Real>(A, B);
|
||||
AssertEqual<Real>(B, C);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuTpMatrixCopyFromMat() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
MatrixTransposeType trans = (i % 2 == 0 ? kNoTrans : kTrans);
|
||||
|
||||
MatrixIndexT dim = 10*i + rand() % 5;
|
||||
CuMatrix<Real> A(dim, dim);
|
||||
A.SetRandn();
|
||||
Matrix<Real> A2(A);
|
||||
|
||||
CuTpMatrix<Real> B(dim);
|
||||
B.CopyFromMat(A, trans);
|
||||
TpMatrix<Real> B2(dim);
|
||||
B2.CopyFromMat(A2, trans);
|
||||
TpMatrix<Real> B3(B);
|
||||
AssertEqual(B2, B3);
|
||||
KALDI_ASSERT(B3.Trace() != 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuTpMatrixCholesky() {
|
||||
for (MatrixIndexT i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 1 + rand() % 10;
|
||||
if (i > 4) {
|
||||
dim += 32 * (rand() % 5);
|
||||
}
|
||||
|
||||
Matrix<Real> M(dim, dim + 2);
|
||||
M.SetRandn();
|
||||
SpMatrix<Real> A(dim);
|
||||
A.AddMat2(1.0, M, kNoTrans, 0.0); // sets A to random almost-surely +ve
|
||||
// definite matrix.
|
||||
CuSpMatrix<Real> B(A);
|
||||
|
||||
TpMatrix<Real> C(dim);
|
||||
C.SetRandn();
|
||||
CuTpMatrix<Real> D(C);
|
||||
C.Cholesky(A);
|
||||
D.Cholesky(B);
|
||||
|
||||
AssertEqual<Real>(C, D);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuTpMatrixIO() {
|
||||
for (int32 i = 0; i < 3; i++) {
|
||||
int32 dimM = rand() % 255 + 10;
|
||||
if (i % 5 == 0) { dimM = 0; }
|
||||
CuTpMatrix<Real> mat(dimM);
|
||||
mat.SetRandn();
|
||||
std::ostringstream os;
|
||||
bool binary = (i % 4 < 2);
|
||||
mat.Write(os, binary);
|
||||
|
||||
CuTpMatrix<Real> mat2;
|
||||
std::istringstream is(os.str());
|
||||
mat2.Read(is, binary);
|
||||
AssertEqual(mat, mat2);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> void CudaTpMatrixUnitTest() {
|
||||
UnitTestCuTpMatrixIO<Real>();
|
||||
UnitTestCuTpMatrixInvert<Real>();
|
||||
UnitTestCuTpMatrixCopyFromTp<Real>();
|
||||
UnitTestCuTpMatrixCholesky<Real>();
|
||||
UnitTestCuTpMatrixCopyFromMat<Real>();
|
||||
}
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
int main() {
|
||||
using namespace kaldi;
|
||||
|
||||
|
||||
for (int32 loop = 0; loop < 2; loop++) {
|
||||
#if HAVE_CUDA == 1
|
||||
if (loop == 0)
|
||||
CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
|
||||
else
|
||||
CuDevice::Instantiate().SelectGpuId("yes"); // -2 .. automatic selection
|
||||
#endif
|
||||
kaldi::CudaTpMatrixUnitTest<float>();
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().DoublePrecisionSupported()) {
|
||||
kaldi::CudaTpMatrixUnitTest<double>();
|
||||
} else {
|
||||
KALDI_WARN << "Double precision not supported";
|
||||
}
|
||||
#else
|
||||
kaldi::CudaTpMatrixUnitTest<double>();
|
||||
#endif
|
||||
|
||||
if (loop == 0)
|
||||
KALDI_LOG << "Tests without GPU use succeeded.\n";
|
||||
else
|
||||
KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
|
||||
}
|
||||
#if HAVE_CUDA == 1
|
||||
CuDevice::Instantiate().PrintProfile();
|
||||
#endif
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,112 @@
|
|||
#if HAVE_CUDA==1
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <cublas.h>
|
||||
#endif
|
||||
|
||||
#include "util/timer.h"
|
||||
#include "cudamatrix/cu-common.h"
|
||||
#include "cudamatrix/cu-vector.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
#include "cudamatrix/cu-kernels.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
#include "cudamatrix/cu-tp-matrix.h"
|
||||
#include "cudamatrix/cu-sp-matrix.h"
|
||||
#include "cudamatrix/cublas-wrappers.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
template<typename Real>
|
||||
CuTpMatrix<Real>::CuTpMatrix(const CuMatrixBase<Real> &orig, MatrixTransposeType trans):
|
||||
CuPackedMatrix<Real>(orig.NumRows(), kUndefined) {
|
||||
KALDI_ASSERT(orig.NumRows() == orig.NumCols());
|
||||
this->CopyFromMat(orig, trans);
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void CuTpMatrix<Real>::Cholesky(const CuSpMatrix<Real> &orig) {
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
CuMatrix<Real> tmp(orig);
|
||||
tmp.Cholesky();
|
||||
this->CopyFromMat(tmp, kNoTrans);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
this->Mat().Cholesky(orig.Mat());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void CuTpMatrix<Real>::Invert() {
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
int dimBlock(CU2DBLOCK);
|
||||
int dimGrid(n_blocks(this->NumRows(), CU2DBLOCK));
|
||||
CuMatrix<Real> tmp(this->NumRows(), this->NumRows());
|
||||
int dim = this->NumRows();
|
||||
Real alpha = 1.0;
|
||||
cuda_set_diag(dimGrid, dimBlock, tmp.Data(), alpha, tmp.Dim());
|
||||
//Matrix<Real> A(dim,dim);
|
||||
//tmp.CopyToMat(&A);
|
||||
CuMatrix<Real> tmp2(dim, dim);
|
||||
tmp2.CopyFromTp(*this);
|
||||
cublas_trsm(dim, dim, alpha, tmp2.Data(), tmp2.Dim().stride,
|
||||
tmp.Data(), tmp.Dim().stride);
|
||||
this->CopyFromMat(tmp, kNoTrans);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Mat().Invert();
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuTpMatrix<Real>::CopyFromMat(const CuMatrixBase<Real> &M,
|
||||
MatrixTransposeType Trans) {
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
MatrixIndexT num_rows = this->num_rows_;
|
||||
KALDI_ASSERT(num_rows == M.NumRows() && this->num_rows_ == M.NumCols());
|
||||
if (num_rows == 0)
|
||||
return;
|
||||
Timer tim;
|
||||
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
|
||||
dim3 dimGrid(n_blocks(num_rows, CU2DBLOCK), n_blocks(num_rows, CU2DBLOCK));
|
||||
if (Trans == kNoTrans) {
|
||||
cuda_take_lower(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
|
||||
cudaThreadSynchronize();
|
||||
} else {
|
||||
cuda_take_upper(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Mat().CopyFromMat(M.Mat(), Trans);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
TpMatrix<Real>::TpMatrix(const CuTpMatrix<Real> &cu) {
|
||||
this->Resize(cu.NumRows());
|
||||
this->CopyFromMat(cu);
|
||||
}
|
||||
template TpMatrix<float>::TpMatrix(const CuTpMatrix<float> &cu);
|
||||
template TpMatrix<double>::TpMatrix(const CuTpMatrix<double> &cu);
|
||||
|
||||
template<class Real>
|
||||
void TpMatrix<Real>::CopyFromMat(const CuTpMatrix<Real> &other) {
|
||||
other.CopyToPacked(this);
|
||||
}
|
||||
// instantiate the template above.
|
||||
template void TpMatrix<float>::CopyFromMat(const CuTpMatrix<float> &other);
|
||||
template void TpMatrix<double>::CopyFromMat(const CuTpMatrix<double> &other);
|
||||
|
||||
template class CuTpMatrix<float>;
|
||||
template class CuTpMatrix<double>;
|
||||
|
||||
} // namespace
|
|
@ -0,0 +1,83 @@
|
|||
// cudamatrix/cu-tp-matrix.h
|
||||
// Copyright 2013 Ehsan Variani
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#ifndef KALDI_CUDAMATRIX_CU_TP_MATRIX_H_
|
||||
#define KALDI_CUDAMATRIX_CU_TP_MATRIX_H_
|
||||
|
||||
#include <sstream>
|
||||
|
||||
#include "cudamatrix/cu-common.h"
|
||||
#include "matrix/matrix-common.h"
|
||||
#include "matrix/tp-matrix.h"
|
||||
#include "cudamatrix/cu-array.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
#include "cudamatrix/cu-packed-matrix.h"
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
template<typename Real> class CuTpMatrix;
|
||||
|
||||
template<typename Real>
|
||||
class CuTpMatrix : public CuPackedMatrix<Real> {
|
||||
friend class CuMatrixBase<float>;
|
||||
friend class CuMatrixBase<double>;
|
||||
friend class CuVectorBase<Real>;
|
||||
friend class CuSubMatrix<Real>;
|
||||
friend class CuRand<Real>;
|
||||
friend class CuTpMatrix<float>;
|
||||
friend class CuTpMatrix<double>;
|
||||
public:
|
||||
CuTpMatrix() : CuPackedMatrix<Real>() {}
|
||||
explicit CuTpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero)
|
||||
: CuPackedMatrix<Real>(r, resize_type) {}
|
||||
explicit CuTpMatrix<Real>(const TpMatrix<Real> &orig)
|
||||
: CuPackedMatrix<Real>(orig) {}
|
||||
explicit CuTpMatrix<Real>(const CuTpMatrix<Real> &orig)
|
||||
: CuPackedMatrix<Real>(orig) {}
|
||||
explicit CuTpMatrix<Real>(const CuMatrixBase<Real> &orig,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
|
||||
~CuTpMatrix() {}
|
||||
|
||||
void CopyFromMat(const CuMatrixBase<Real> &M,
|
||||
MatrixTransposeType Trans = kNoTrans);
|
||||
|
||||
void CopyFromTp(const CuTpMatrix<Real> &other) {
|
||||
CuPackedMatrix<Real>::CopyFromPacked(other);
|
||||
}
|
||||
void CopyFromTp(const TpMatrix<Real> &other) {
|
||||
CuPackedMatrix<Real>::CopyFromPacked(other);
|
||||
}
|
||||
void Cholesky(const CuSpMatrix<Real>& Orig);
|
||||
void Invert();
|
||||
|
||||
protected:
|
||||
inline const TpMatrix<Real> &Mat() const {
|
||||
return *(reinterpret_cast<const TpMatrix<Real>* >(this));
|
||||
}
|
||||
inline TpMatrix<Real> &Mat() {
|
||||
return *(reinterpret_cast<TpMatrix<Real>* >(this));
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
|
@ -0,0 +1,88 @@
|
|||
// cudamatrix/cu-value.h
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
|
||||
#ifndef KALDI_CUDAMATRIX_CU_VALUE_H_
|
||||
#define KALDI_CUDAMATRIX_CU_VALUE_H_
|
||||
|
||||
#include <cudamatrix/cu-device.h>
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
/// The following class is used to simulate non-const
|
||||
/// references to Real, e.g. as returned by the non-const operator ().
|
||||
/// This class is also used as a convenient way of
|
||||
/// reading a single Real value from the device.
|
||||
template<typename Real>
class CuValue {
 public:
  /// Wraps the location `data`; no copy of the value is made.
  CuValue(Real *data): data_(data) { }
  /// The new object refers to the same location as `other`.
  CuValue(const CuValue &other): data_(other.data_) { }

  /// Copies the value that `other` refers to into the location that *this
  /// refers to (the pointers themselves are not changed).
  inline CuValue operator = (const CuValue<Real> &other) {
#if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      CU_SAFE_CALL(cudaMemcpy(data_, other.data_, sizeof(Real), cudaMemcpyDeviceToDevice));
      return *this;
    }
#endif
    *data_ = *other.data_;
    return *this;
  }

  /// Stores r in the referenced location and returns r.
  inline Real operator = (Real r) {
#if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      CU_SAFE_CALL(cudaMemcpy(data_, &r, sizeof(Real), cudaMemcpyHostToDevice));
      return r;
    }
#endif
    *data_ = r;
    return r;
  }

  /// Adds r to the referenced value (read, add, write back) and returns
  /// the new value.
  inline Real operator += (Real r) {
    const Real updated = r + Real(*this);
    return (*this = updated);
  }

  /// Conversion to Real: reads the referenced value.
  inline operator Real () const {
#if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      Real value;
      CU_SAFE_CALL(cudaMemcpy(&value, data_,
                              sizeof(Real), cudaMemcpyDeviceToHost));
      return value;
    }
#endif
    return *data_;
  }
 private:
  Real *data_;  // location of the single value this object refers to.
}; // class CuValue<Real>
|
||||
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
|
||||
#endif // KALDI_CUDAMATRIX_CU_VALUE_H_
|
|
@ -1,462 +0,0 @@
|
|||
// cudamatrix/cu-vector-inl.h
|
||||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
|
||||
#ifndef KALDI_CUDAMATRIX_CU_VECTOR_INL_H_
|
||||
#define KALDI_CUDAMATRIX_CU_VECTOR_INL_H_
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
#include <cuda_runtime_api.h>
|
||||
#endif
|
||||
|
||||
#include "util/timer.h"
|
||||
#include "cudamatrix/cu-common.h"
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
#include "cudamatrix/cu-device.h"
|
||||
#include "cudamatrix/cu-kernels.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
|
||||
template<typename Real>
|
||||
CuVector<Real>::CuVector(const CuVector<Real> &v) {
|
||||
this->Resize(v.dim_);
|
||||
this->CopyFromVec(v);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
CuVector<Real>::CuVector(const CuVectorBase<Real> &v) {
|
||||
this->Resize(v.dim_);
|
||||
this->CopyFromVec(v);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
CuVector<Real>::CuVector(const VectorBase<Real> &v) {
|
||||
this->Resize(v.dim_);
|
||||
this->CopyFromVec(v);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuVector<Real>::Resize(MatrixIndexT dim, MatrixResizeType t) {
|
||||
KALDI_ASSERT(t == kSetZero || t == kUndefined); // Others not implemented
|
||||
// yet.
|
||||
if (this->dim_ == dim) {
|
||||
this->SetZero();
|
||||
return;
|
||||
}
|
||||
if (this->dim_ != 0)
|
||||
this->Destroy();
|
||||
if (dim == 0) return;
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
cuSafeCall(cudaMalloc(reinterpret_cast<void**>(&this->data_), dim * sizeof(Real)));
|
||||
this->dim_ = dim;
|
||||
if (t == kSetZero) this->SetZero();
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Vector<Real> vec(dim);
|
||||
this->Swap(&vec);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuVector<Real>::Swap(Vector<Real> *vec) {
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
if (this->dim_ == 0) {
|
||||
if (vec->dim_ != 0) {
|
||||
// *this is empty, but vec is nonempty.
|
||||
Resize(vec->dim_, kUndefined);
|
||||
this->CopyFromVec(*vec);
|
||||
vec->Resize(0);
|
||||
}
|
||||
// else both are empty.
|
||||
} else { // *this is nonempty.
|
||||
if (vec->dim_ != 0) {
|
||||
// Both *this and *vec are nonempty. Recurse to simpler cases.
|
||||
// this could be done more efficiently in the case where
|
||||
// the size does not change.
|
||||
Vector<Real> temp;
|
||||
this->Swap(&temp); // now temp is full, *this is empty.
|
||||
vec->Swap(&temp); // now vec has data from *this, temp has
|
||||
// data from vec.
|
||||
Swap(vec); // copy data in vec to *this, which is now empty.
|
||||
} else { // *this is full but *vec is empty.
|
||||
vec->Resize(this->dim_, kUndefined);
|
||||
this->CopyToVec(vec);
|
||||
this->Destroy();
|
||||
}
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
std::swap(vec->data_, this->data_);
|
||||
std::swap(vec->dim_, this->dim_);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void CuVector<Real>::Destroy() {
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
if (this->data_ != NULL) {
|
||||
cuSafeCall(cudaFree(this->data_));
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
if (this->data_ != NULL) KALDI_MEMALIGN_FREE(this->data_);
|
||||
}
|
||||
this->data_ = NULL;
|
||||
this->dim_ = 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void CuVectorBase<Real>::CopyFromVec(const CuVectorBase<Real> &src) {
|
||||
KALDI_ASSERT(src.Dim() == dim_);
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
cuSafeCall(cudaMemcpy(data_, src.data_, src.dim_ * sizeof(Real), cudaMemcpyDeviceToDevice));
|
||||
CuDevice::Instantiate().AccuProfile("CuVector::CopyFromVecD2D",tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
memcpy(static_cast<void*>(data_), static_cast<void*>(src.data_),
|
||||
dim_ * sizeof(Real));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void CuVectorBase<Real>::CopyFromVec(const VectorBase<Real> &src) {
|
||||
KALDI_ASSERT(src.Dim() == dim_);
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
cuSafeCall(cudaMemcpy(data_, src.Data(), src.Dim()*sizeof(Real), cudaMemcpyHostToDevice));
|
||||
|
||||
CuDevice::Instantiate().AccuProfile("CuVector::CopyFromVecH2D",tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
memcpy(static_cast<void*>(data_), static_cast<const void*>(src.Data()),
|
||||
dim_ * sizeof(Real));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void CuVectorBase<Real>::CopyToVec(VectorBase<Real> *dst) const {
|
||||
KALDI_ASSERT(dst->Dim() == dim_);
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
cuSafeCall(cudaMemcpy(dst->Data(), this->data_,
|
||||
dim_*sizeof(Real), cudaMemcpyDeviceToHost));
|
||||
CuDevice::Instantiate().AccuProfile("CuVector::CopyToVecD2H",tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
dst->CopyFromVec(Vec());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void CuVector<Real>::Read(std::istream &is, bool binary) {
|
||||
Vector<BaseFloat> temp;
|
||||
temp.Read(is, binary);
|
||||
Destroy();
|
||||
Swap(&temp);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void CuVector<Real>::Write(std::ostream &os, bool binary) const {
|
||||
Vector<BaseFloat> temp(this->dim_);
|
||||
this->CopyToVec(&temp);
|
||||
temp.Write(os, binary);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void CuVectorBase<Real>::SetZero() {
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
KALDI_ASSERT(dim_>0);
|
||||
KALDI_ASSERT(data_!=NULL);
|
||||
Timer tim;
|
||||
cuSafeCall(cudaMemset(data_, 0, dim_*sizeof(Real)));
|
||||
CuDevice::Instantiate().AccuProfile("CuVector::SetZero",tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Vec().SetZero();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Print the vector to stream
|
||||
*/
|
||||
template<typename Real>
|
||||
std::ostream &operator << (std::ostream &out, const CuVectorBase<Real> &vec) {
|
||||
Vector<Real> temp;
|
||||
vec.CopyToVec(&temp);
|
||||
out << temp;
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Methods wrapping the ANSI-C CUDA kernels
|
||||
*/
|
||||
template<typename Real>
|
||||
void CuVectorBase<Real>::Set(Real value) {
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
dim3 dimBlock(CUBLOCK);
|
||||
dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
|
||||
::MatrixDim d = { 1, Dim(), Dim() };
|
||||
|
||||
cuda_set_const(dimGrid, dimBlock, data_, value, d);
|
||||
cuSafeCall(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Vec().Set(value);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void CuVectorBase<Real>::Add(Real value) {
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
dim3 dimBlock(CUBLOCK);
|
||||
dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
|
||||
::MatrixDim d = { 1, Dim(), Dim() };
|
||||
|
||||
cuda_add(dimGrid, dimBlock, data_, value, d);
|
||||
cuSafeCall(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Vec().Add(value);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void CuVectorBase<Real>::Scale(Real value) {
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
dim3 dimBlock(CUBLOCK);
|
||||
dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
|
||||
::MatrixDim d = { 1, Dim(), Dim() };
|
||||
|
||||
cuda_scale(dimGrid, dimBlock, data_, value, d);
|
||||
cuSafeCall(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Vec().Scale(value);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
void CuVectorBase<Real>::AddVec(Real alpha, const CuVectorBase<Real> &vec,
|
||||
Real beta) {
|
||||
KALDI_ASSERT(vec.Dim() == Dim());
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
dim3 dimBlock(CUBLOCK);
|
||||
dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
|
||||
::MatrixDim d = { 1, Dim(), Dim() };
|
||||
|
||||
cuda_add_mat(dimGrid, dimBlock, alpha, vec.data_, beta, data_, d);
|
||||
cuSafeCall(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
if (beta != 1.0) Vec().Scale(beta);
|
||||
Vec().AddVec(alpha, vec.Vec());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void CuVectorBase<Real>::AddRowSumMat(Real alpha, const CuMatrixBase<Real> &mat,
|
||||
Real beta) {
|
||||
KALDI_ASSERT(mat.NumCols() == Dim());
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
CuVector<Real> temp(Dim()); // create a buffer
|
||||
temp.SetZero();
|
||||
|
||||
MatrixDim d = mat.Dim(); // only stride will be used!
|
||||
|
||||
// process per 256 row blocks
|
||||
for(int32 block=0; (block+1)*256 <= mat.NumRows(); block++) {
|
||||
// 1st dim ... rows, 2nd dim ... cols
|
||||
dim3 dimBlock(256, 1);
|
||||
dim3 dimGrid(1, mat.NumCols());
|
||||
int32 offset = block*256*d.stride;
|
||||
|
||||
cuda_add_row_sum_mat(dimGrid, dimBlock, mat.data_ + offset, temp.data_, d);
|
||||
}
|
||||
|
||||
// process the remainder
|
||||
int32 div = mat.NumRows() / 256;
|
||||
int32 mod = mat.NumRows() % 256;
|
||||
if (mod != 0) {
|
||||
// 1st dim ... rows, 2nd dim ... cols
|
||||
dim3 dimBlock(mod, 1);
|
||||
dim3 dimGrid(1, mat.NumCols());
|
||||
int32 offset = div*256*d.stride;
|
||||
|
||||
cuda_add_row_sum_mat(dimGrid, dimBlock, mat.data_ + offset, temp.data_, d);
|
||||
}
|
||||
// now we have the sum!
|
||||
|
||||
// add buffer rmp to this vector using alpha and beta
|
||||
this->AddVec(alpha,temp,beta);
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Vec().AddRowSumMat(alpha, mat.Mat(), beta);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void CuVectorBase<Real>::AddColSumMat(Real alpha,
|
||||
const CuMatrixBase<Real> &mat,
|
||||
Real beta) {
|
||||
KALDI_ASSERT(mat.NumRows() == Dim());
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
CuVector<Real> temp(Dim()); // create a buffer
|
||||
|
||||
MatrixDim d = mat.Dim(); // only stride will be used!
|
||||
|
||||
// process per 256 column blocks
|
||||
for(int32 block=0; (block+1)*256 <= mat.NumCols(); block++) {
|
||||
// 1st dim ... cols, 2nd dim ... rows
|
||||
dim3 dimBlock(256, 1);
|
||||
dim3 dimGrid(1, mat.NumRows());
|
||||
int32 offset = block*256;
|
||||
|
||||
cuda_add_col_sum_mat(dimGrid, dimBlock, mat.data_ + offset, temp.data_, d);
|
||||
}
|
||||
|
||||
// process the remainder
|
||||
int32 div = mat.NumCols() / 256;
|
||||
int32 mod = mat.NumCols() % 256;
|
||||
if (mod != 0) {
|
||||
// 1st dim ... cols, 2nd dim ... rows
|
||||
dim3 dimBlock(mod, 1);
|
||||
dim3 dimGrid(1, mat.NumRows());
|
||||
int32 offset=div*256;
|
||||
|
||||
cuda_add_col_sum_mat(dimGrid, dimBlock, mat.data_ +offset, temp.data_, d);
|
||||
}
|
||||
// now we have the sum!
|
||||
|
||||
// add buffer rmp to this vector using alpha and beta
|
||||
this->AddVec(alpha, temp, beta);
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Vec().AddColSumMat(alpha, mat.Mat(), beta);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void CuVectorBase<Real>::InvertElements() {
|
||||
#if HAVE_CUDA==1
|
||||
if (CuDevice::Instantiate().Enabled()) {
|
||||
Timer tim;
|
||||
|
||||
dim3 dimBlock(CUBLOCK*8, 1);
|
||||
dim3 dimGrid(n_blocks(dim_, CUBLOCK*8));
|
||||
MatrixDim d = {1, dim_, dim_};
|
||||
|
||||
cuda_invert_elements(dimGrid, dimBlock, data_, d);
|
||||
cuSafeCall(cudaGetLastError());
|
||||
|
||||
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Vec().InvertElements();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
#endif
|
||||
|
||||
|
|
@ -0,0 +1,169 @@
|
|||
// cudamatrix/cu-vector-speed-test.cc
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
#include "cudamatrix/cu-vector.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
|
||||
using namespace kaldi;
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
// Returns "<double>" when Real is 8 bytes wide and "<float>" otherwise;
// used to label log lines.
template<typename Real>
std::string NameOf() {
  if (sizeof(Real) == 8) {
    return "<double>";
  }
  return "<float>";
}
|
||||
|
||||
// Times repeated CuVector::ApplySoftMax() calls on a random vector of size
// `dim` for about 0.05 seconds and logs the resulting rate.
template<typename Real> void TestCuVectorSoftmax(int32 dim) {
  BaseFloat time_in_secs = 0.05;
  CuVector<Real> M(dim);
  M.SetRandn();

  Timer tim;
  int32 iter = 0;
  while (tim.Elapsed() < time_in_secs) {
    M.ApplySoftMax();
    iter++;
  }

  BaseFloat fdim = dim;
  BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09);
  KALDI_LOG << "For CuVector::Softmax" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
|
||||
|
||||
|
||||
// Times repeated CuVector::Sum() calls on a random vector of size `dim` for
// about 0.05 seconds and logs the resulting rate.
template<typename Real> void TestCuVectorSum(int32 dim) {
  BaseFloat time_in_secs = 0.05;
  CuVector<Real> M(dim);
  M.SetRandn();

  Timer tim;
  int32 iter = 0;
  while (tim.Elapsed() < time_in_secs) {
    M.Sum();  // the result is deliberately unused; only the time matters.
    iter++;
  }

  BaseFloat fdim = dim;
  BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09);
  KALDI_LOG << "For CuVector::Sum" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
|
||||
|
||||
|
||||
// Times summing a random vector through a dot product with a vector of
// ones.  Creating and filling the ones vector happens inside the timed loop.
template<typename Real> void TestCuVectorVecVecOne(int32 dim) {
  BaseFloat time_in_secs = 0.05;
  CuVector<Real> M(dim);
  M.SetRandn();

  Timer tim;
  int32 iter = 0;
  while (tim.Elapsed() < time_in_secs) {
    CuVector<Real> ones(dim);
    ones.Set(1.0);
    VecVec(M, ones);
    iter++;
  }

  BaseFloat fdim = dim;
  BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09);
  KALDI_LOG << "For CuVector::VecVecOne" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Times repeated CuVector::AddDiagMatMat() calls with two random dim x dim
// matrices for about 0.05 seconds and logs the resulting rate.
template<typename Real> void TestCuVectorAddDiagMatMat(int32 dim) {
  BaseFloat time_in_secs = 0.05;
  CuVector<Real> v(dim);
  v.SetRandn();
  CuMatrix<Real> N(dim, dim), O(dim, dim);
  N.SetRandn();
  O.SetRandn();

  Timer tim;
  int32 iter = 0;
  while (tim.Elapsed() < time_in_secs) {
    v.AddDiagMatMat(1.0, N, kNoTrans, O, kNoTrans, 1.0);
    iter++;
  }

  BaseFloat fdim = dim;
  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
  KALDI_LOG << "For CuVector::AddDiagMatMat" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
|
||||
|
||||
|
||||
|
||||
template<typename Real> void CudaVectorSpeedTest() {
|
||||
std::vector<int32> sizes;
|
||||
sizes.push_back(16);
|
||||
sizes.push_back(128);
|
||||
sizes.push_back(256);
|
||||
sizes.push_back(1024);
|
||||
int32 ns = sizes.size();
|
||||
for (int32 s = 0; s < ns; s++) {
|
||||
TestCuVectorSoftmax<Real>(sizes[s]);
|
||||
}
|
||||
|
||||
|
||||
for (int32 s = 0; s < ns; s++) {
|
||||
TestCuVectorSum<Real>(sizes[s]);
|
||||
}
|
||||
|
||||
for (int32 s = 0; s < ns; s++) {
|
||||
TestCuVectorVecVecOne<Real>(sizes[s]);
|
||||
}
|
||||
|
||||
for (int32 s = 0; s < ns; s++) {
|
||||
TestCuVectorAddDiagMatMat<Real>(sizes[s]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
// Runs the speed tests in single precision, and in double precision when it
// is available (always, when compiled without CUDA).
int main() {
#if HAVE_CUDA == 1
  // Select the GPU; "yes" is the mode string passed to SelectGpuId().
  CuDevice::Instantiate().SelectGpuId("yes");
#endif

  kaldi::CudaVectorSpeedTest<float>();
#if HAVE_CUDA == 1
  if (!CuDevice::Instantiate().DoublePrecisionSupported()) {
    KALDI_WARN << "Double precision not supported";
  } else {
    kaldi::CudaVectorSpeedTest<double>();
  }
#else
  kaldi::CudaVectorSpeedTest<double>();
#endif
  std::cout << "Tests succeeded.\n";
  return 0;
}
|
||||
|
|
@ -0,0 +1,751 @@
|
|||
// cudamatrix/cuda-vector-test.cc
|
||||
|
||||
// Copyright 2013 Lucas Ondel
|
||||
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "util/common-utils.h"
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
#include "cudamatrix/cu-vector.h"
|
||||
#include "cudamatrix/cu-tp-matrix.h"
|
||||
#include "cudamatrix/cu-sp-matrix.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
/*
|
||||
* INITIALIZERS
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* Unit tests
|
||||
*/
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuVectorIO() {
|
||||
for (int32 i = 0; i < 10; i++) {
|
||||
int32 dimM = rand() % 255;
|
||||
if (i % 5 == 0) { dimM = 0; }
|
||||
CuVector<Real> vec(dimM);
|
||||
vec.SetRandn();
|
||||
std::ostringstream os;
|
||||
bool binary = (i % 4 < 2);
|
||||
vec.Write(os, binary);
|
||||
|
||||
CuVector<Real> vec2;
|
||||
std::istringstream is(os.str());
|
||||
vec2.Read(is, binary);
|
||||
AssertEqual(vec, vec2);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real, typename OtherReal>
|
||||
static void UnitTestCuVectorCopyFromVec() {
|
||||
for (int32 i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 10 * i;
|
||||
Vector<Real> A(dim);
|
||||
A.SetRandn();
|
||||
CuVector<OtherReal> B(A);
|
||||
Vector<Real> C(B);
|
||||
CuVector<Real> D(dim);
|
||||
D.CopyFromVec(C);
|
||||
Vector<OtherReal> E(dim);
|
||||
E.CopyFromVec(D);
|
||||
CuVector<Real> F(E);
|
||||
CuVector<Real> A2(A);
|
||||
AssertEqual(F, A2);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuSubVector() {
|
||||
for (int32 iter = 0 ; iter < 10; iter++) {
|
||||
int32 M1 = 1 + rand () % 10, M2 = 1 + rand() % 1, M3 = 1 + rand() % 10, M = M1 + M2 + M3,
|
||||
m = rand() % M2;
|
||||
CuVector<Real> vec(M);
|
||||
vec.SetRandn();
|
||||
CuSubVector<Real> subvec1(vec, M1, M2),
|
||||
subvec2 = vec.Range(M1, M2);
|
||||
Real f1 = vec(M1 + m), f2 = subvec1(m), f3 = subvec2(m);
|
||||
KALDI_ASSERT(f1 == f2);
|
||||
KALDI_ASSERT(f2 == f3);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuVectorMulTp() {
|
||||
for (int32 i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 10 * i;
|
||||
Vector<Real> A(dim);
|
||||
A.SetRandn();
|
||||
TpMatrix<Real> B(dim);
|
||||
B.SetRandn();
|
||||
|
||||
CuVector<Real> C(A);
|
||||
CuTpMatrix<Real> D(B);
|
||||
|
||||
A.MulTp(B, kNoTrans);
|
||||
C.MulTp(D, kNoTrans);
|
||||
|
||||
CuVector<Real> E(A);
|
||||
AssertEqual(C, E);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void UnitTestCuVectorAddTp() {
|
||||
for (int32 i = 1; i < 10; i++) {
|
||||
MatrixIndexT dim = 10 * i;
|
||||
Vector<Real> A(dim);
|
||||
A.SetRandn();
|
||||
TpMatrix<Real> B(dim);
|
||||
B.SetRandn();
|
||||
Vector<Real> C(dim);
|
||||
C.SetRandn();
|
||||
|
||||
CuVector<Real> D(A);
|
||||
CuTpMatrix<Real> E(B);
|
||||
CuVector<Real> F(C);
|
||||
|
||||
A.AddTpVec(1.0, B, kNoTrans, C, 1.0);
|
||||
D.AddTpVec(1.0, E, kNoTrans, F, 1.0);
|
||||
|
||||
CuVector<Real> G(A);
|
||||
AssertEqual(D, G);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> void CuVectorUnitTestVecVec() {
|
||||
int32 M = 10 % rand() % 100;
|
||||
CuVector<Real> vec1(M), vec2(M);
|
||||
vec1.SetRandn();
|
||||
vec2.SetRandn();
|
||||
Real prod = 0.0;
|
||||
for (int32 i = 0; i < M; i++)
|
||||
prod += vec1(i) * vec2(i);
|
||||
AssertEqual(prod, VecVec(vec1, vec2));
|
||||
}
|
||||
|
||||
template<typename Real> void CuVectorUnitTestAddVec() {
|
||||
int32 M = 10 % rand() % 100;
|
||||
CuVector<Real> vec1(M);
|
||||
CuVector<Real> vec2(M);
|
||||
vec1.SetRandn();
|
||||
vec2.SetRandn();
|
||||
CuVector<Real> vec1_orig(vec1);
|
||||
BaseFloat alpha = 0.43243;
|
||||
vec1.AddVec(alpha, vec2);
|
||||
|
||||
for (int32 i = 0; i < M; i++)
|
||||
AssertEqual(vec1_orig(i) + alpha * vec2(i), vec1(i));
|
||||
}
|
||||
|
||||
template<typename Real> void CuVectorUnitTestAddVecCross() {
|
||||
for (int32 i = 0; i < 4; i++) {
|
||||
int32 M = 10 % rand() % 100;
|
||||
CuVector<float> vec1(M);
|
||||
CuVector<Real> vec2(M);
|
||||
vec1.SetRandn();
|
||||
vec2.SetRandn();
|
||||
|
||||
if (i == 0) {
|
||||
CuVector<Real> vec1_orig(vec1);
|
||||
Real alpha = 0.43243;
|
||||
vec1.AddVec(alpha, vec2);
|
||||
|
||||
for (int32 i = 0; i < M; i++)
|
||||
AssertEqual(vec1_orig(i) + alpha * vec2(i), vec1(i));
|
||||
} else {
|
||||
CuVector<Real> vec2_orig(vec2);
|
||||
Real alpha = 0.43243;
|
||||
vec2.AddVec(alpha, vec1);
|
||||
for (int32 i = 0; i < M; i++)
|
||||
AssertEqual(vec2_orig(i) + alpha * vec1(i), vec2(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> void CuVectorUnitTestAddVecExtra() {
|
||||
int32 M = 10 % rand() % 100;
|
||||
CuVector<Real> vec1(M), vec2(M);
|
||||
vec1.SetRandn();
|
||||
vec2.SetRandn();
|
||||
CuVector<Real> vec1_orig(vec1);
|
||||
BaseFloat alpha = 0.43243, beta = 1.4321;
|
||||
vec1.AddVec(alpha, vec2, beta);
|
||||
|
||||
for (int32 i = 0; i < M; i++)
|
||||
AssertEqual(beta * vec1_orig(i) + alpha * vec2(i), vec1(i));
|
||||
}
|
||||
|
||||
|
||||
// Checks CuVector::AddRowSumMat() against the host Vector::AddRowSumMat()
// on a random N x M matrix, starting from the same random vector.
template<typename Real> void CuVectorUnitTestAddRowSumMat() {
  int32 M = 10 + rand() % 280, N = 10 + rand() % 20;
  BaseFloat alpha = 10.0143432, beta = 43.4321;
  CuMatrix<Real> mat(N, M);
  mat.SetRandn();
  CuVector<Real> vec(M);
  // Randomize the destination as well (the old code randomized `mat` a
  // second time instead), so that the beta term is actually exercised.
  vec.SetRandn();
  Matrix<Real> mat2(mat);
  Vector<Real> vec2(vec);  // host copy of the same starting values.
  vec.AddRowSumMat(alpha, mat, beta);
  vec2.AddRowSumMat(alpha, mat2, beta);
  Vector<Real> vec3(vec);
  AssertEqual(vec2, vec3);
}
|
||||
|
||||
// Checks CuVector::AddColSumMat() against the host Vector::AddColSumMat()
// on a random M x N matrix, starting from the same random vector.
template<typename Real> void CuVectorUnitTestAddColSumMat() {
  int32 M = 10 + rand() % 280, N = 10 + rand() % 20;
  BaseFloat alpha = 10.0143432, beta = 43.4321;
  CuMatrix<Real> mat(M, N);
  mat.SetRandn();
  CuVector<Real> vec(M);
  // Randomize the destination as well (the old code randomized `mat` a
  // second time instead), so that the beta term is actually exercised.
  vec.SetRandn();
  Matrix<Real> mat2(mat);
  Vector<Real> vec2(vec);  // host copy of the same starting values.
  vec.AddColSumMat(alpha, mat, beta);
  vec2.AddColSumMat(alpha, mat2, beta);
  Vector<Real> vec3(vec);
  AssertEqual(vec2, vec3);
}
|
||||
|
||||
|
||||
template<typename Real> void CuVectorUnitTestApproxEqual() {
|
||||
int32 M = 10 + rand() % 100;
|
||||
CuVector<Real> vec1(M), vec2(M);
|
||||
vec1.SetRandn();
|
||||
vec2.SetRandn();
|
||||
Real tol = 0.5;
|
||||
for (int32 i = 0; i < 10; i++) {
|
||||
Real sumsq = 0.0, sumsq_orig = 0.0;
|
||||
for (int32 j = 0; j < M; j++) {
|
||||
sumsq += (vec1(j) - vec2(j)) * (vec1(j) - vec2(j));
|
||||
sumsq_orig += vec1(j) * vec1(j);
|
||||
}
|
||||
Real rms = sqrt(sumsq), rms_orig = sqrt(sumsq_orig);
|
||||
KALDI_ASSERT(vec1.ApproxEqual(vec2, tol) == (rms <= tol * rms_orig));
|
||||
tol *= 2.0;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> static void UnitTestCuVectorReplaceValue() {
|
||||
for (int32 i = 0; i < 5; i++) {
|
||||
int32 dim = 100 + rand() % 200;
|
||||
Real orig = 0.1 * (rand() % 100), changed = 0.1 * (rand() % 50);
|
||||
Vector<Real> vec(dim);
|
||||
vec.SetRandn();
|
||||
vec(dim / 2) = orig;
|
||||
CuVector<Real> vec1(vec);
|
||||
vec.ReplaceValue(orig, changed);
|
||||
vec1.ReplaceValue(orig, changed);
|
||||
Vector<Real> vec2(vec1);
|
||||
AssertEqual(vec, vec2);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> void CuVectorUnitTestInvertElements() {
|
||||
// Also tests MulElements();
|
||||
int32 M = 256 + rand() % 100;
|
||||
CuVector<Real> vec1(M);
|
||||
vec1.SetRandn();
|
||||
CuVector<Real> vec2(vec1);
|
||||
vec2.InvertElements();
|
||||
CuVector<Real> vec3(vec1);
|
||||
vec3.MulElements(vec2);
|
||||
// vec3 should be all ones.
|
||||
Real prod = VecVec(vec3, vec3);
|
||||
AssertEqual(prod, static_cast<Real>(M));
|
||||
}
|
||||
|
||||
template<typename Real> void CuVectorUnitTestSum() {
|
||||
for (int32 i =1; i < 10; i++) {
|
||||
MatrixIndexT dim = 2048 * i + 100 % rand();
|
||||
CuVector<Real> A(dim), ones(dim);
|
||||
A.SetRandn();
|
||||
ones.Set(1.0);
|
||||
|
||||
AssertEqual(VecVec(A, ones), A.Sum());
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> void CuVectorUnitTestScale() {
|
||||
for (int32 i = 0; i < 4; i++) {
|
||||
int32 dim = 100 + 400 % rand();
|
||||
CuVector<Real> cu_vec(dim);
|
||||
cu_vec.SetRandn();
|
||||
Vector<Real> vec(cu_vec);
|
||||
BaseFloat scale = 0.333;
|
||||
cu_vec.Scale(scale);
|
||||
vec.Scale(scale);
|
||||
Vector<Real> vec2(cu_vec);
|
||||
KALDI_ASSERT(ApproxEqual(vec, vec2));
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> void CuVectorUnitTestCopyFromMat() {
|
||||
int32 M = 100 + rand() % 255, N = 100 + rand() % 255;
|
||||
CuMatrix<Real> cu_matrix(M, N);
|
||||
cu_matrix.SetRandn();
|
||||
for(int32 i = 0; i < N; i++) {
|
||||
CuVector<Real> vector(M);
|
||||
vector.CopyColFromMat(cu_matrix, i);
|
||||
for(int32 j = 0; j < M; j++) {
|
||||
KALDI_ASSERT(vector(j)==cu_matrix(j, i));
|
||||
}
|
||||
}
|
||||
Matrix<Real> matrix(cu_matrix), matrix2(M, N);
|
||||
CuMatrix<Real> matrix3(M, N);
|
||||
|
||||
CuVector<Real> vector(M * N), vector2(M * N);
|
||||
vector.CopyRowsFromMat(cu_matrix);
|
||||
vector2.CopyRowsFromMat(matrix);
|
||||
matrix2.CopyRowsFromVec(vector2);
|
||||
matrix3.CopyRowsFromVec(Vector<Real>(vector2));
|
||||
Vector<Real> vector3(M * N);
|
||||
vector3.CopyRowsFromMat(cu_matrix);
|
||||
|
||||
|
||||
for(int32 j = 0; j < M*N; j++) {
|
||||
if (rand() % 500 == 0) { // random small subset (it was slow)
|
||||
KALDI_ASSERT(vector(j) == cu_matrix(j/N, j%N));
|
||||
KALDI_ASSERT(vector2(j) == cu_matrix(j/N, j%N));
|
||||
KALDI_ASSERT(vector2(j) == matrix2(j/N, j%N));
|
||||
KALDI_ASSERT(vector3(j) == matrix2(j/N, j%N));
|
||||
KALDI_ASSERT(vector3(j) == matrix3(j/N, j%N));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> void CuVectorUnitTestCopyDiagFromPacked() {
|
||||
for (int32 i = 0; i < 5; i++) {
|
||||
int32 N = 100 + rand() % 255;
|
||||
CuSpMatrix<Real> S(N);
|
||||
S.SetRandn();
|
||||
CuVector<Real> V(N, kUndefined);
|
||||
V.CopyDiagFromPacked(S);
|
||||
SpMatrix<Real> cpu_S(S);
|
||||
Vector<Real> cpu_V(N);
|
||||
cpu_V.CopyDiagFromPacked(cpu_S);
|
||||
Vector<Real> cpu_V2(V);
|
||||
KALDI_ASSERT(cpu_V.ApproxEqual(cpu_V2));
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> void CuVectorUnitTestCopyCross() {
|
||||
for (int32 i = 0; i < 10; i++) {
|
||||
int32 M = 100 + rand() % 255;
|
||||
if (rand() % 3 == 0) M = 0;
|
||||
CuVector<Real> v1(M);
|
||||
v1.SetRandn();
|
||||
CuVector<float> v2(M);
|
||||
v2.CopyFromVec(v1);
|
||||
CuVector<Real> v3(M);
|
||||
v3.CopyFromVec(v2);
|
||||
AssertEqual(v1, v3);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> void CuVectorUnitTestCopyCross2() {
|
||||
for (int32 i = 0; i < 10; i++) {
|
||||
int32 M = 100 + rand() % 255;
|
||||
if (rand() % 3 == 0) M = 0;
|
||||
CuVector<Real> v1(M);
|
||||
v1.SetRandn();
|
||||
Vector<float> v2(M);
|
||||
v2.CopyFromVec(v1);
|
||||
CuVector<Real> v3(M);
|
||||
v3.CopyFromVec(v2);
|
||||
AssertEqual(v1, v3);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> void CuVectorUnitTestCopyDiagFromMat() {
|
||||
for (int32 i = 0; i < 5; i++) {
|
||||
int32 M = 100 + rand() % 255, N = M + rand() % 2;
|
||||
Matrix<Real> matrix(M, N);
|
||||
if (i % 2 == 0) matrix.Transpose();
|
||||
matrix.SetRandn();
|
||||
Vector<Real> vector(M, kUndefined);
|
||||
vector.CopyDiagFromMat(matrix);
|
||||
|
||||
CuMatrix<Real> cuda_matrix(matrix);
|
||||
CuVector<Real> cuda_vector(M, kUndefined);
|
||||
cuda_vector.CopyDiagFromMat(cuda_matrix);
|
||||
Vector<Real> vector2(cuda_vector);
|
||||
AssertEqual(vector, vector2);
|
||||
AssertEqual(vector.Sum(), cuda_matrix.Trace(false));
|
||||
AssertEqual(cuda_vector.Sum(), matrix.Trace(false));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Tests CuVector::Norm() for p = 1 and p = 2 on the fixed vector (1, -2),
// whose 1-norm is 3 and whose 2-norm is sqrt(5).
template<typename Real> void CuVectorUnitTestNorm() {
  int32 dim = 2;
  CuVector<Real> cu_vector(dim);
  cu_vector(0) = 1.0;
  cu_vector(1) = -2.0;

  KALDI_ASSERT(ApproxEqual(cu_vector.Norm(1.0), 3.0));        // |1| + |-2|
  KALDI_ASSERT(ApproxEqual(cu_vector.Norm(2.0), sqrt(5.0)));  // sqrt(1 + 4)
}
|
||||
|
||||
|
||||
template<typename Real> void CuVectorUnitTestMin() {
|
||||
for (int32 p = 0; p < 5; p++) {
|
||||
int32 dim = 100 + rand() % 500;
|
||||
CuVector<Real> cu_vector(dim);
|
||||
cu_vector.SetRandn();
|
||||
Vector<Real> vector(cu_vector);
|
||||
Real min1 = cu_vector.Min(), min2 = vector.Min();
|
||||
KALDI_ASSERT(min1 == min2);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real> void CuVectorUnitTestMax() {
|
||||
for (int32 p = 0; p < 5; p++) {
|
||||
int32 dim = 100 + rand() % 500;
|
||||
CuVector<Real> cu_vector(dim);
|
||||
cu_vector.SetRandn();
|
||||
Vector<Real> vector(cu_vector);
|
||||
Real max1 = cu_vector.Max(), max2 = vector.Max();
|
||||
KALDI_ASSERT(max1 == max2);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real> void CuVectorUnitTestApplySoftMax() {
|
||||
for (int32 i = 0; i < 10; i++) {
|
||||
int32 dim = 100 + rand() % 300;
|
||||
//int32 dim = 1024;
|
||||
CuVector<Real> cu_vector(dim);
|
||||
cu_vector.SetRandn();
|
||||
Vector<Real> vector(cu_vector);
|
||||
|
||||
cu_vector.ApplySoftMax();
|
||||
vector.ApplySoftMax();
|
||||
CuVector<Real> cu_vector2(vector);
|
||||
//std::cout<<cu_vector <<"\n"<<cu_vector2<<std::endl;
|
||||
AssertEqual(cu_vector, cu_vector2);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> void CuVectorUnitTestApplyExp() {
|
||||
int32 dim = 100;
|
||||
CuVector<Real> vector(dim);
|
||||
vector.SetRandn();
|
||||
CuVector<Real> vector2(vector);
|
||||
|
||||
vector.ApplyExp();
|
||||
for(int32 j = 0; j < dim; j++) {
|
||||
//std::cout<<"diff is "<<exp(vector2(j))-vector(j)<<std::endl;;
|
||||
KALDI_ASSERT(abs(exp(vector2(j))-vector(j)) < 0.000001 )
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template<typename Real> void CuVectorUnitTestApplyLog() {
|
||||
int32 dim = 100;
|
||||
CuVector<Real> vector(dim);
|
||||
vector.SetRandn();
|
||||
for(int32 j = 0; j < dim; j++) {
|
||||
if(vector(j) <= 0.0)
|
||||
vector(j) = 1.0 - vector(j);
|
||||
}
|
||||
|
||||
CuVector<Real> vector2(vector);
|
||||
|
||||
vector.ApplyLog();
|
||||
for(int32 j = 0; j < dim; j++) {
|
||||
//std::cout<<"diff is "<<exp(vector2(j))-vector(j)<<std::endl;;
|
||||
KALDI_ASSERT(abs(log(vector2(j))-vector(j)) < 0.000001 )
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> void CuVectorUnitTestApplyFloor() {
|
||||
for (int32 l = 0; l < 10; l++) {
|
||||
int32 dim = 100 + rand() % 700;
|
||||
CuVector<Real> cu_vector(dim);
|
||||
cu_vector.SetRandn();
|
||||
|
||||
Vector<Real> vector(cu_vector);
|
||||
BaseFloat floor = 0.33 * (-5 + rand() % 10);
|
||||
int32 i = cu_vector.ApplyFloor(floor);
|
||||
int32 j = vector.ApplyFloor(floor);
|
||||
|
||||
CuVector<Real> cu2(vector);
|
||||
|
||||
AssertEqual(cu2, cu_vector);
|
||||
if (i != j) {
|
||||
KALDI_WARN << "ApplyFloor return code broken...";
|
||||
}
|
||||
KALDI_ASSERT(i==j);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> void CuVectorUnitTestApplyPow() {
|
||||
for (int32 l = 0; l < 10; l++) {
|
||||
int32 dim = 100 + rand() % 700;
|
||||
|
||||
CuVector<Real> cu_vector(dim);
|
||||
cu_vector.SetRandn();
|
||||
|
||||
Vector<Real> vector(cu_vector);
|
||||
|
||||
BaseFloat pow = -2 + (rand() % 5);
|
||||
cu_vector.ApplyPow(pow);
|
||||
vector.ApplyPow(pow);
|
||||
|
||||
CuVector<Real> cu2(vector);
|
||||
|
||||
AssertEqual(cu2, cu_vector);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> void CuVectorUnitTestAddVecVec() {
|
||||
int32 dim = 100;
|
||||
CuVector<Real> cu_vector(dim);
|
||||
cu_vector.SetRandn();
|
||||
Vector<Real> vector(cu_vector);
|
||||
|
||||
Real beta = rand();
|
||||
Real alpha = rand();
|
||||
Vector<Real> v(dim), r(dim);
|
||||
v.SetRandn(); r.SetRandn();
|
||||
CuVector<Real> cuV(v), cuR(r);
|
||||
|
||||
|
||||
cu_vector.AddVecVec(alpha, cuR, cuV, beta);
|
||||
vector.AddVecVec(alpha, r, v, beta);
|
||||
|
||||
CuVector<Real> cu2(vector);
|
||||
std::cout<<cu2(0)<<' '<<cu_vector(0)<<std::endl;
|
||||
AssertEqual(cu2, cu_vector);
|
||||
}
|
||||
|
||||
template<typename Real> void CuVectorUnitTestAddDiagMat2() {
|
||||
for (int p = 0; p < 4; p++) {
|
||||
int32 M = 230 + rand() % 100, N = 230 + rand() % 100;
|
||||
BaseFloat alpha = 0.2 + rand() % 3, beta = 0.3 + rand() % 2;
|
||||
CuVector<Real> cu_vector(M);
|
||||
cu_vector.SetRandn();
|
||||
|
||||
CuMatrix<Real> cu_mat_orig(M, N);
|
||||
cu_mat_orig.SetRandn();
|
||||
MatrixTransposeType trans = (p % 2 == 0 ? kNoTrans : kTrans);
|
||||
CuMatrix<Real> cu_mat(cu_mat_orig, trans);
|
||||
|
||||
Vector<Real> vector(cu_vector);
|
||||
Matrix<Real> mat(cu_mat);
|
||||
|
||||
vector.AddDiagMat2(alpha, mat, trans, beta);
|
||||
cu_vector.AddDiagMat2(alpha, cu_mat, trans, beta);
|
||||
|
||||
Vector<Real> vector2(cu_vector);
|
||||
AssertEqual(vector, vector2);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
static void CuVectorUnitTestAddDiagMatMat() {
|
||||
for (MatrixIndexT iter = 0; iter < 4; iter++) {
|
||||
BaseFloat alpha = 0.432 + rand() % 5, beta = 0.043 + rand() % 2;
|
||||
MatrixIndexT dimM = 10 + rand() % 300,
|
||||
dimN = 5 + rand() % 300;
|
||||
CuVector<Real> v(dimM);
|
||||
CuMatrix<Real> M_orig(dimM, dimN), N_orig(dimN, dimM);
|
||||
M_orig.SetRandn();
|
||||
N_orig.SetRandn();
|
||||
MatrixTransposeType transM = (iter % 2 == 0 ? kNoTrans : kTrans);
|
||||
MatrixTransposeType transN = ((iter/2) % 2 == 0 ? kNoTrans : kTrans);
|
||||
CuMatrix<Real> M(M_orig, transM), N(N_orig, transN);
|
||||
|
||||
v.SetRandn();
|
||||
CuVector<Real> w(v);
|
||||
|
||||
w.AddDiagMatMat(alpha, M, transM, N, transN, beta);
|
||||
|
||||
{
|
||||
CuVector<Real> w2(v);
|
||||
CuMatrix<Real> MN(dimM, dimM);
|
||||
MN.AddMatMat(1.0, M, transM, N, transN, 0.0);
|
||||
CuVector<Real> d(dimM);
|
||||
d.CopyDiagFromMat(MN);
|
||||
w2.Scale(beta);
|
||||
w2.AddVec(alpha, d);
|
||||
AssertEqual(w, w2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real> void CuVectorUnitTestAddMatVec() {
|
||||
for (int32 i = 0; i < 10; i++) {
|
||||
int32 M = 10 + rand() % 500, N = 10 + rand() % 400;
|
||||
|
||||
bool transpose = (i % 2 == 0);
|
||||
|
||||
CuVector<Real> src_cu(M);
|
||||
src_cu.SetRandn();
|
||||
Vector<Real> src(src_cu);
|
||||
|
||||
CuVector<Real> dst_cu(N);
|
||||
dst_cu.SetRandn();
|
||||
Vector<Real> dst(dst_cu);
|
||||
|
||||
CuMatrix<Real> mat_cu(transpose ? M : N, transpose ? N : M);
|
||||
mat_cu.SetRandn();
|
||||
Matrix<Real> mat(mat_cu);
|
||||
|
||||
BaseFloat alpha = 0.5 * (rand() % 10), beta = 0.5 * (rand() % 10);
|
||||
dst_cu.AddMatVec(alpha, mat_cu, transpose ? kTrans : kNoTrans,
|
||||
src_cu, beta);
|
||||
dst.AddMatVec(alpha, mat, transpose ? kTrans : kNoTrans,
|
||||
src, beta);
|
||||
Vector<Real> dst2(dst_cu);
|
||||
AssertEqual(dst, dst2);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real> void CuVectorUnitTestAddSpVec() {
|
||||
for (int32 i = 0; i < 5; i++) {
|
||||
int32 M = 100 + rand() % 256;
|
||||
|
||||
CuVector<Real> src_cu(M);
|
||||
src_cu.SetRandn();
|
||||
Vector<Real> src(src_cu);
|
||||
|
||||
CuVector<Real> dst_cu(M);
|
||||
dst_cu.SetRandn();
|
||||
Vector<Real> dst(dst_cu);
|
||||
|
||||
CuSpMatrix<Real> mat_cu(M);
|
||||
mat_cu.SetRandn();
|
||||
SpMatrix<Real> mat(mat_cu);
|
||||
|
||||
BaseFloat alpha = 0.5 * (rand() % 5), beta = 0.5 * (rand() % 5);
|
||||
dst_cu.AddSpVec(alpha, mat_cu, src_cu, beta);
|
||||
dst.AddSpVec(alpha, mat, src, beta);
|
||||
Vector<Real> dst2(dst_cu);
|
||||
AssertEqual(dst, dst2);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real> void CuVectorUnitTest() {
|
||||
UnitTestCuVectorCopyFromVec<Real, float>();
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().DoublePrecisionSupported())
|
||||
#endif
|
||||
UnitTestCuVectorCopyFromVec<Real, double>();
|
||||
UnitTestCuVectorIO<Real>();
|
||||
CuVectorUnitTestVecVec<Real>();
|
||||
CuVectorUnitTestAddVec<Real>();
|
||||
CuVectorUnitTestAddVecCross<Real>();
|
||||
CuVectorUnitTestAddVecExtra<Real>();
|
||||
CuVectorUnitTestApproxEqual<Real>();
|
||||
CuVectorUnitTestScale<Real>();
|
||||
CuVectorUnitTestSum<Real>();
|
||||
CuVectorUnitTestInvertElements<Real>();
|
||||
CuVectorUnitTestAddRowSumMat<Real>();
|
||||
CuVectorUnitTestAddColSumMat<Real>();
|
||||
UnitTestCuVectorReplaceValue<Real>();
|
||||
UnitTestCuVectorAddTp<Real>();
|
||||
UnitTestCuVectorMulTp<Real>();
|
||||
UnitTestCuSubVector<Real>();
|
||||
CuVectorUnitTestCopyFromMat<Real>();
|
||||
CuVectorUnitTestMin<Real>();
|
||||
CuVectorUnitTestMax<Real>();
|
||||
CuVectorUnitTestApplySoftMax<Real>();
|
||||
CuVectorUnitTestCopyDiagFromPacked<Real>();
|
||||
CuVectorUnitTestCopyDiagFromMat<Real>();
|
||||
CuVectorUnitTestCopyCross<Real>();
|
||||
CuVectorUnitTestCopyCross2<Real>();
|
||||
CuVectorUnitTestNorm<Real>();
|
||||
CuVectorUnitTestApplyExp<Real>();
|
||||
CuVectorUnitTestApplyLog<Real>();
|
||||
CuVectorUnitTestApplyFloor<Real>();
|
||||
CuVectorUnitTestApplyPow<Real>();
|
||||
CuVectorUnitTestAddMatVec<Real>();
|
||||
CuVectorUnitTestAddSpVec<Real>();
|
||||
CuVectorUnitTestAddVecVec<Real>();
|
||||
CuVectorUnitTestAddDiagMat2<Real>();
|
||||
CuVectorUnitTestAddDiagMatMat<Real>();
|
||||
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
//Select the GPU
|
||||
using namespace kaldi;
|
||||
const char *usage = "Usage: cu-vector-test [options]";
|
||||
|
||||
ParseOptions po(usage);
|
||||
std::string use_gpu = "yes";
|
||||
po.Register("use-gpu", &use_gpu, "yes|no|optional");
|
||||
po.Read(argc, argv);
|
||||
|
||||
if (po.NumArgs() != 0) {
|
||||
po.PrintUsage();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
for (int32 loop = 0; loop < 2; loop++) {
|
||||
#if HAVE_CUDA == 1
|
||||
if (loop == 0)
|
||||
CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
|
||||
else
|
||||
CuDevice::Instantiate().SelectGpuId(use_gpu);
|
||||
#endif
|
||||
|
||||
|
||||
kaldi::CuVectorUnitTest<float>();
|
||||
#if HAVE_CUDA == 1
|
||||
if (CuDevice::Instantiate().DoublePrecisionSupported()) {
|
||||
kaldi::CuVectorUnitTest<double>();
|
||||
} else {
|
||||
KALDI_WARN << "Double precision not supported";
|
||||
}
|
||||
#else
|
||||
kaldi::CuVectorUnitTest<double>();
|
||||
#endif
|
||||
|
||||
if (loop == 0)
|
||||
KALDI_LOG << "Tests without GPU use succeeded.\n";
|
||||
else
|
||||
KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
|
||||
}
|
||||
#if HAVE_CUDA == 1
|
||||
CuDevice::Instantiate().PrintProfile();
|
||||
#endif
|
||||
return 0;
|
||||
}
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -2,6 +2,8 @@
|
|||
|
||||
// Copyright 2009-2012 Karel Vesely
|
||||
// Johns Hopkins University (author: Daniel Povey)
|
||||
// Lucas Ondel
|
||||
// 2013 Xiaohui Zhang
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
|
@ -25,49 +27,170 @@
|
|||
|
||||
#include "matrix/kaldi-vector.h"
|
||||
#include "cudamatrix/cu-common.h"
|
||||
#include "cudamatrix/cu-value.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
template<typename Real> class CuMatrixBase;
|
||||
|
||||
template<typename Real>
|
||||
Real VecVec(const CuVectorBase<Real> &v1, const CuVectorBase<Real> &v2);
|
||||
|
||||
template<typename Real, typename OtherReal>
|
||||
Real VecVec(const CuVectorBase<Real> &v1, const CuVectorBase<OtherReal> &v2);
|
||||
|
||||
/**
|
||||
* Vector for CUDA computing
|
||||
*/
|
||||
template<typename Real>
|
||||
class CuVectorBase {
|
||||
public:
|
||||
friend class CuVectorBase<float>;
|
||||
friend class CuVectorBase<double>;
|
||||
friend class CuMatrixBase<Real>;
|
||||
friend class MatrixBase<Real>;
|
||||
friend class CuPackedMatrix<Real>;
|
||||
friend class CuSpMatrix<Real>;
|
||||
friend class CuTpMatrix<Real>;
|
||||
|
||||
template <typename OtherReal>
|
||||
friend OtherReal VecVec(const CuVectorBase<OtherReal> &v1,
|
||||
const CuVectorBase<OtherReal> &v2);
|
||||
friend void cu::Splice<Real>(const CuMatrix<Real> &src,
|
||||
const CuStlVector<int32> &frame_offsets,
|
||||
const CuArray<int32> &frame_offsets,
|
||||
CuMatrix<Real> *tgt);
|
||||
|
||||
friend class CuRand<Real>;
|
||||
|
||||
/// Dimensions
|
||||
MatrixIndexT Dim() const { return dim_; }
|
||||
|
||||
/// Returns a pointer to the start of the vector's data.
|
||||
inline Real* Data() { return data_; }
|
||||
/// Returns a pointer to the start of the vector's data (const).
|
||||
inline const Real* Data() const { return data_; }
|
||||
|
||||
/// Copy functions; these will crash if the dimensions
|
||||
/// do not match. The operator = in class CuVector will
|
||||
/// also change the sizes for you.
|
||||
void CopyFromVec(const CuVectorBase<Real> &src);
|
||||
void CopyFromVec(const VectorBase<Real> &src);
|
||||
void CopyToVec(VectorBase<Real> *dst) const;
|
||||
|
||||
template<typename OtherReal>
|
||||
void CopyFromVec(const CuVectorBase<OtherReal> &M);
|
||||
|
||||
template<typename OtherReal>
|
||||
void CopyFromVec(const VectorBase<OtherReal> &src);
|
||||
|
||||
|
||||
template<typename OtherReal>
|
||||
void CopyToVec(VectorBase<OtherReal> *dst) const;
|
||||
|
||||
void CopyRowsFromMat(const CuMatrixBase<Real> &M);
|
||||
|
||||
void CopyRowsFromMat(const MatrixBase<Real> &M);
|
||||
|
||||
/// Math operations
|
||||
void SetZero();
|
||||
void Set(Real value);
|
||||
void Add(Real value);
|
||||
void Scale(Real value);
|
||||
|
||||
void AddVec(Real alpha, const CuVectorBase<Real> &vec, Real beta = 1.0);
|
||||
|
||||
template<typename OtherReal>
|
||||
void AddVec(Real alpha, const CuVectorBase<OtherReal> &vec, Real beta = 1.0);
|
||||
|
||||
/// Sum the rows of the matrix, add to vector
|
||||
void AddRowSumMat(Real alpha, const CuMatrixBase<Real> &mat, Real beta = 1.0);
|
||||
/// Sum the columns of the matrix, add to vector
|
||||
void AddColSumMat(Real alpha, const CuMatrixBase<Real> &mat, Real beta = 1.0);
|
||||
|
||||
/// Add triangular matrix times vector: this <-- beta*this + alpha*M*v.
|
||||
/// Works even if rv == *this.
|
||||
void AddTpVec(const Real alpha, const CuTpMatrix<Real>&M,
|
||||
const MatrixTransposeType trans, const CuVectorBase<Real> &v,
|
||||
const Real beta); // **beta previously defaulted to 0.0**
|
||||
|
||||
/// Multiplies this vector by lower-triangular matrix: *this <-- *this *M
|
||||
void MulTp(const CuTpMatrix<Real> &M, const MatrixTransposeType trans);
|
||||
|
||||
bool ApproxEqual(const CuVectorBase<Real> &other, float tol = 0.01) const;
|
||||
|
||||
void InvertElements();
|
||||
|
||||
void ApplySoftMax();
|
||||
void ApplyExp();
|
||||
void ApplyLog();
|
||||
MatrixIndexT ApplyFloor(Real floor_val);
|
||||
void ApplyPow(Real power);
|
||||
Real Sum() const;
|
||||
void SetRandn();
|
||||
|
||||
CuSubVector<Real> Range(const MatrixIndexT o, const MatrixIndexT l) {
|
||||
return CuSubVector<Real>(*this, o, l);
|
||||
}
|
||||
|
||||
const CuSubVector<Real> Range(const MatrixIndexT o,
|
||||
const MatrixIndexT l) const {
|
||||
return CuSubVector<Real>(*this, o, l);
|
||||
}
|
||||
|
||||
void CopyColFromMat(const CuMatrixBase<Real> &mat, MatrixIndexT col);
|
||||
|
||||
template<typename OtherReal>
|
||||
void CopyColFromMat(const CuMatrixBase<OtherReal> &mat, MatrixIndexT col);
|
||||
|
||||
void AddMatVec(const Real alpha, const CuMatrixBase<Real> &M,
|
||||
MatrixTransposeType trans, const CuVectorBase<Real> &v,
|
||||
const Real beta);
|
||||
void AddVecVec(Real alpha, const CuVectorBase<Real> &v,
|
||||
const CuVectorBase<Real> &r, Real beta);
|
||||
|
||||
void AddSpVec(const Real alpha, const CuSpMatrix<Real> &S,
|
||||
const CuVectorBase<Real> &v, const Real beta);
|
||||
|
||||
/// Add the diagonal of a matrix times itself:
|
||||
/// *this = diag(M M^T) + beta * *this (if trans == kNoTrans), or
|
||||
/// *this = diag(M^T M) + beta * *this (if trans == kTrans).
|
||||
void AddDiagMat2(Real alpha, const CuMatrixBase<Real> &M,
|
||||
MatrixTransposeType trans, Real beta);
|
||||
|
||||
/// Add the diagonal of a matrix product: *this = diag(M N), assuming the
|
||||
/// "trans" arguments are both kNoTrans; for transpose arguments, it behaves
|
||||
/// as you would expect.
|
||||
void AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M, MatrixTransposeType transM,
|
||||
const CuMatrixBase<Real> &N, MatrixTransposeType transN,
|
||||
Real beta = 1.0);
|
||||
|
||||
inline CuValue<Real> operator() (MatrixIndexT i) {
|
||||
KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
|
||||
static_cast<UnsignedMatrixIndexT>(dim_));
|
||||
return CuValue<Real>(data_ + i);
|
||||
}
|
||||
|
||||
Real Norm(BaseFloat p); // Only works for p = 1 and p = 2.
|
||||
|
||||
inline Real operator() (MatrixIndexT i) const {
|
||||
KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
|
||||
static_cast<UnsignedMatrixIndexT>(dim_));
|
||||
return CuValue<Real>(data_ + i); // will be casted to Real.
|
||||
}
|
||||
|
||||
/// Extracts the diagonal of a packed matrix M; works for Sp or Tp.
|
||||
void CopyDiagFromPacked(const CuPackedMatrix<Real> &M);
|
||||
|
||||
/// Extracts the diagonal of a matrix.
|
||||
void CopyDiagFromMat(const CuMatrix<Real> &M);
|
||||
|
||||
Real Max() const;
|
||||
Real Min() const;
|
||||
|
||||
// Set each element to y = (x == orig ? changed : x).
|
||||
void ReplaceValue(Real orig, Real changed);
|
||||
|
||||
void MulElements(const CuVectorBase<Real> &v);
|
||||
protected:
|
||||
|
||||
protected:
|
||||
// The following two functions should only be called if we did not compile
|
||||
// with CUDA or could not get a CUDA card; in that case the contents are
|
||||
// interpreted the same as a regular vector.
|
||||
|
@ -78,7 +201,7 @@ protected:
|
|||
return *(reinterpret_cast<VectorBase<Real>* >(this));
|
||||
}
|
||||
|
||||
/// Default constructor: make it private so the user cannot
|
||||
/// Default constructor: make it protected so the user cannot
|
||||
/// instantiate this class.
|
||||
CuVectorBase<Real>(): data_(NULL), dim_(0) { }
|
||||
|
||||
|
@ -89,14 +212,38 @@ protected:
|
|||
KALDI_DISALLOW_COPY_AND_ASSIGN(CuVectorBase);
|
||||
};
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
class CuVector: public CuVectorBase<Real> {
|
||||
friend class CuVectorBase<float>;
|
||||
friend class CuVectorBase<double>;
|
||||
friend class CuMatrixBase<Real>;
|
||||
friend class CuPackedMatrix<Real>;
|
||||
friend class CuSpMatrix<Real>;
|
||||
friend class CuTpMatrix<Real>;
|
||||
|
||||
public:
|
||||
CuVector() { }
|
||||
CuVector(MatrixIndexT dim, MatrixResizeType t = kSetZero) { Resize(dim, t); }
|
||||
CuVector(const CuVector<Real> &v);
|
||||
|
||||
CuVector(const CuVectorBase<Real> &v);
|
||||
|
||||
CuVector(const VectorBase<Real> &v);
|
||||
explicit CuVector(const CuVector<Real> &v) : CuVectorBase<Real>() {
|
||||
Resize(v.Dim(), kUndefined);
|
||||
this->CopyFromVec(v);
|
||||
}
|
||||
|
||||
template<typename OtherReal>
|
||||
explicit CuVector(const CuVectorBase<OtherReal> &v) : CuVectorBase<Real>() {
|
||||
Resize(v.Dim(), kUndefined);
|
||||
this->CopyFromVec(v);
|
||||
}
|
||||
|
||||
template<typename OtherReal>
|
||||
explicit CuVector(const VectorBase<OtherReal> &v) : CuVectorBase<Real>() {
|
||||
Resize(v.Dim(), kUndefined);
|
||||
this->CopyFromVec(Vector<Real>(v));
|
||||
}
|
||||
|
||||
/// Allocate the memory
|
||||
void Resize(MatrixIndexT dim, MatrixResizeType t = kSetZero);
|
||||
|
@ -104,12 +251,20 @@ class CuVector: public CuVectorBase<Real> {
|
|||
~CuVector() { Destroy(); }
|
||||
|
||||
CuVector<Real> &operator = (const CuVectorBase<Real> &other) {
|
||||
Resize(other.Dim());
|
||||
CopyFromVec(other);
|
||||
Resize(other.Dim(), kUndefined);
|
||||
this->CopyFromVec(other);
|
||||
return *this;
|
||||
}
|
||||
|
||||
CuVector<Real> &operator = (const CuVector<Real> &other) {
|
||||
Resize(other.Dim(), kUndefined);
|
||||
this->CopyFromVec(other);
|
||||
return *this;
|
||||
}
|
||||
CuVector<Real> &operator = (const VectorBase<Real> &other) {
|
||||
Resize(other.Dim());
|
||||
CopyFromVec(other);
|
||||
this->CopyFromVec(other);
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
|
@ -118,27 +273,91 @@ class CuVector: public CuVectorBase<Real> {
|
|||
void Write(std::ostream &is, bool binary) const;
|
||||
|
||||
void Swap(Vector<Real> *vec);
|
||||
|
||||
private:
|
||||
void Destroy();
|
||||
};
|
||||
|
||||
// We'll fill out the following class if it's needed.
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
class CuSubVector: public CuVectorBase<Real> {
|
||||
public:
|
||||
private:
|
||||
public:
|
||||
CuSubVector(const CuVectorBase<Real> &t, const MatrixIndexT origin,
|
||||
const MatrixIndexT length) : CuVectorBase<Real>() {
|
||||
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(origin)+
|
||||
static_cast<UnsignedMatrixIndexT>(length) <=
|
||||
static_cast<UnsignedMatrixIndexT>(t.Dim()));
|
||||
CuVectorBase<Real>::data_ = const_cast<Real*>(t.Data()+origin);
|
||||
CuVectorBase<Real>::dim_ = length;
|
||||
}
|
||||
/// Copy constructor
|
||||
/// this constructor needed for Range() to work in base class.
|
||||
CuSubVector(const CuSubVector &other) : CuVectorBase<Real> () {
|
||||
CuVectorBase<Real>::data_ = other.data_;
|
||||
CuVectorBase<Real>::dim_ = other.dim_;
|
||||
}
|
||||
|
||||
CuSubVector(const Real* data, MatrixIndexT length) : CuVectorBase<Real> () {
|
||||
// Yes, we're evading C's restrictions on const here, and yes, it can be used
|
||||
// to do wrong stuff; unfortunately the workaround would be very difficult.
|
||||
CuVectorBase<Real>::data_ = const_cast<Real*>(data);
|
||||
CuVectorBase<Real>::dim_ = length;
|
||||
}
|
||||
|
||||
/// This operation does not preserve const-ness, so be careful.
|
||||
CuSubVector(const CuMatrixBase<Real> &matrix, MatrixIndexT row) {
|
||||
CuVectorBase<Real>::data_ = const_cast<Real*>(matrix.RowData(row));
|
||||
CuVectorBase<Real>::dim_ = matrix.NumCols();
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
/// I/O
|
||||
template<typename Real>
|
||||
std::ostream &operator << (std::ostream &out, const CuVectorBase<Real> &vec);
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
bool ApproxEqual(const CuVectorBase<Real> &a,
|
||||
const CuVectorBase<Real> &b, Real tol = 0.01) {
|
||||
return a.ApproxEqual(b, tol);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
inline void AssertEqual(CuVectorBase<Real> &a, CuVectorBase<Real> &b,
|
||||
float tol = 0.01) {
|
||||
KALDI_ASSERT(a.ApproxEqual(b, tol));
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
template<typename OtherReal>
|
||||
void CuVectorBase<Real>::CopyFromVec(const CuVectorBase<OtherReal> &v) {
|
||||
v.CopyToVec(&this);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
template<typename OtherReal>
|
||||
void VectorBase<Real>::CopyFromVec(const CuVectorBase<OtherReal> &cu) {
|
||||
cu.CopyToVec(this);
|
||||
}
|
||||
|
||||
// declare template specializations.
|
||||
template <>
|
||||
template <>
|
||||
void CuVectorBase<double>::CopyFromVec<float>(const CuVectorBase<float> &src);
|
||||
|
||||
template<>
|
||||
template <>
|
||||
void CuVectorBase<float>::CopyFromVec<double>(const CuVectorBase<double> &src);
|
||||
|
||||
template<typename Real>
|
||||
template<typename OtherReal>
|
||||
Vector<Real>::Vector(const CuVectorBase<OtherReal> &cu) {
|
||||
Init(cu.Dim());
|
||||
cu.CopyToVec(this);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
#include "cu-vector-inl.h"
|
||||
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,136 @@
|
|||
// cudamatrix/cublas-wrappers.h
|
||||
|
||||
// Copyright 2013 Johns Hopkins University (author: Daniel Povey);
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#ifndef KALDI_MATRIX_CUBLAS_WRAPPERS_H_
|
||||
#define KALDI_MATRIX_CUBLAS_WRAPPERS_H_ 1
|
||||
|
||||
// Do not include this file directly. It is to be included
|
||||
// by .cc files in this directory.
|
||||
|
||||
namespace kaldi {
#if HAVE_CUDA == 1

// Thin overloaded wrappers around the cuBLAS routines.  Each wrapper comes in
// a float and a double version that forward to the corresponding S- and
// D-prefixed cuBLAS function, so that templated code can call one name for
// either precision.

// ---- gemm ------------------------------------------------------------------

inline void cublas_gemm(char transa, char transb, int m, int n, int k,
                        float alpha, const float *A, int lda,
                        const float *B, int ldb,
                        float beta, float *C, int ldc) {
  cublasSgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}

inline void cublas_gemm(char transa, char transb, int m, int n, int k,
                        double alpha, const double *A, int lda,
                        const double *B, int ldb,
                        double beta, double *C, int ldc) {
  cublasDgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}

// ---- trsm ------------------------------------------------------------------
// The four leading character arguments are fixed to 'l', 'u', 'n', 'n'.

inline void cublas_trsm(int m, int n, float alpha, const float* A, int lda,
                        float* B, int ldb) {
  cublasStrsm('l', 'u', 'n', 'n', m, n, alpha, A, lda, B, ldb);
}

inline void cublas_trsm(int m, int n, double alpha, const double* A, int lda,
                        double* B, int ldb) {
  cublasDtrsm('l', 'u', 'n', 'n', m, n, alpha, A, lda, B, ldb);
}

// ---- syrk ------------------------------------------------------------------

inline void cublas_syrk(char uplo, char trans, int n, int k,
                        float alpha, const float *A, int lda,
                        float beta, float *C, int ldc) {
  cublasSsyrk(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
}

inline void cublas_syrk(char uplo, char trans, int n, int k,
                        double alpha, const double *A, int lda,
                        double beta, double *C, int ldc) {
  cublasDsyrk(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
}

// ---- dot, asum, nrm2 (these return a value) --------------------------------

inline float cublas_dot(int n, const float *x, int incx,
                        const float *y, int incy) {
  return cublasSdot(n, x, incx, y, incy);
}

inline double cublas_dot(int n, const double *x, int incx,
                         const double *y, int incy) {
  return cublasDdot(n, x, incx, y, incy);
}

inline float cublas_asum(int n, const float* x, int incx) {
  return cublasSasum(n, x, incx);
}

inline double cublas_asum(int n, const double* x, int incx) {
  return cublasDasum(n, x, incx);
}

inline float cublas_nrm2(int n, const float* x, int incx) {
  return cublasSnrm2(n, x, incx);
}

inline double cublas_nrm2(int n, const double* x, int incx) {
  return cublasDnrm2(n, x, incx);
}

// ---- copy, scal, axpy ------------------------------------------------------

inline void cublas_copy(int n, const float* x, int incx,
                        float* y, int incy) {
  cublasScopy(n, x, incx, y, incy);
}

inline void cublas_copy(int n, const double* x, int incx,
                        double* y, int incy) {
  cublasDcopy(n, x, incx, y, incy);
}

inline void cublas_scal(int n, float alpha, float* mat, int incx) {
  cublasSscal(n, alpha, mat, incx);
}

inline void cublas_scal(int n, double alpha, double* mat, int incx) {
  cublasDscal(n, alpha, mat, incx);
}

inline void cublas_axpy(int n, float alpha, const float* x, int incx,
                        float* y, int incy) {
  cublasSaxpy(n, alpha, x, incx, y, incy);
}

inline void cublas_axpy(int n, double alpha, const double* x, int incx,
                        double* y, int incy) {
  cublasDaxpy(n, alpha, x, incx, y, incy);
}

// ---- gemv, spmv ------------------------------------------------------------

inline void cublas_gemv(char trans, int m, int n, float alpha,
                        const float* A, int lda, const float* x,
                        int incx, float beta, float* y, int incy) {
  cublasSgemv(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
}

inline void cublas_gemv(char trans, int m, int n, double alpha,
                        const double* A, int lda, const double* x,
                        int incx, double beta, double* y, int incy) {
  cublasDgemv(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
}

inline void cublas_spmv(char uplo, int n, float alpha, const float *AP,
                        const float *x, int incx, float beta,
                        float *y, int incy) {
  cublasSspmv(uplo, n, alpha, AP, x, incx, beta, y, incy);
}

inline void cublas_spmv(char uplo, int n, double alpha, const double *AP,
                        const double *x, int incx, double beta,
                        double *y, int incy) {
  cublasDspmv(uplo, n, alpha, AP, x, incx, beta, y, incy);
}

// ---- tpmv ------------------------------------------------------------------
// Use caution with these, the 'transpose' argument is the opposite of what it
// should really be, due to CUDA storing things in column major order.  We also
// had to switch 'l' to 'u'; we view our packed matrices as lower-triangular,
// row-by-row, but CUDA views the same layout as upper-triangular,
// column-by-column.

inline void cublas_tpmv(char trans, int n,
                        const float* Ap, float* x, int incx) {
  cublasStpmv('u', trans, 'n', n, Ap, x, incx);
}

inline void cublas_tpmv(char trans, int n, const double* Ap,
                        double* x, int incx) {
  cublasDtpmv('u', trans, 'n', n, Ap, x, incx);
}

// ---- spr -------------------------------------------------------------------

inline void cublas_spr(char uplo, int n, float alpha, const float *x,
                       int incx, float *AP) {
  cublasSspr(uplo, n, alpha, x, incx, AP);
}

inline void cublas_spr(char uplo, int n, double alpha, const double *x,
                       int incx, double *AP) {
  cublasDspr(uplo, n, alpha, x, incx, AP);
}

#endif
}  // namespace kaldi
|
||||
|
||||
#endif
|
|
@ -1,713 +0,0 @@
|
|||
// cudamatrix/cuda-matrix-test.cc
|
||||
|
||||
// Copyright 2010 Karel Vesely
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
#include "cudamatrix/cu-vector.h"
|
||||
#include "cudamatrix/cu-math.h"
|
||||
|
||||
using namespace kaldi;
|
||||
|
||||
|
||||
namespace kaldi {
|
||||
|
||||
/*
|
||||
* INITIALIZERS
|
||||
*/
|
||||
template<class Real>
|
||||
static void InitRand(VectorBase<Real> *v) {
|
||||
for (MatrixIndexT i = 0;i < v->Dim();i++)
|
||||
(*v)(i) = RandGauss();
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void InitRand(MatrixBase<Real> *M) {
|
||||
do {
|
||||
for (MatrixIndexT i = 0;i < M->NumRows();i++)
|
||||
for (MatrixIndexT j = 0;j < M->NumCols();j++)
|
||||
(*M)(i, j) = RandGauss();
|
||||
} while (M->NumRows() != 0 && M->Cond() > 100);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void RandGaussMatrix(MatrixBase<Real>* mat) {
|
||||
for(int32 r=0; r<mat->NumRows(); r++)
|
||||
for(int32 c=0; c<mat->NumCols(); c++)
|
||||
(*mat)(r,c) = RandGauss();
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void RandZeroToOneMatrix(MatrixBase<Real>* mat) {
|
||||
for(int32 r=0; r<mat->NumRows(); r++)
|
||||
for(int32 c=0; c<mat->NumCols(); c++)
|
||||
(*mat)(r,c) = RandUniform();
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* ASSERTS
|
||||
*/
|
||||
template<class Real>
|
||||
static void AssertEqual(const MatrixBase<Real> &A,
|
||||
const MatrixBase<Real> &B,
|
||||
float tol = 0.001) {
|
||||
KALDI_ASSERT(A.NumRows() == B.NumRows()&&A.NumCols() == B.NumCols());
|
||||
for (MatrixIndexT i = 0;i < A.NumRows();i++) {
|
||||
for (MatrixIndexT j = 0;j < A.NumCols();j++) {
|
||||
KALDI_ASSERT(std::abs(A(i, j)-B(i, j)) < tol*std::max(1.0, (double) (std::abs(A(i, j))+std::abs(B(i, j)))));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static bool ApproxEqual(const MatrixBase<Real> &A,
|
||||
const MatrixBase<Real> &B, Real tol = 0.001) {
|
||||
KALDI_ASSERT(A.NumRows() == B.NumRows());
|
||||
MatrixBase<Real> diff(A);
|
||||
diff.AddSp(1.0, B);
|
||||
Real a = std::max(A.Max(), -A.Min()), b = std::max(B.Max(), -B.Min),
|
||||
d = std::max(diff.Max(), -diff.Min());
|
||||
return (d <= tol * std::max(a, b));
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void AssertEqual(VectorBase<Real> &A, VectorBase<Real> &B, float tol = 0.001) {
|
||||
KALDI_ASSERT(A.Dim() == B.Dim());
|
||||
for (MatrixIndexT i=0; i < A.Dim(); i++)
|
||||
KALDI_ASSERT(std::abs(A(i)-B(i)) < tol);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static bool ApproxEqual(VectorBase<Real> &A, VectorBase<Real> &B, float tol = 0.001) {
|
||||
KALDI_ASSERT(A.Dim() == B.Dim());
|
||||
for (MatrixIndexT i=0; i < A.Dim(); i++)
|
||||
if (std::abs(A(i)-B(i)) > tol) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void AssertEqual(std::vector<int32> &A, std::vector<int32> &B) {
|
||||
KALDI_ASSERT(A.size() == B.size());
|
||||
for (size_t i=0; i < A.size(); i++)
|
||||
KALDI_ASSERT(A[i] == B[i]);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Unit tests
|
||||
*/
|
||||
|
||||
/*
|
||||
* CuMatrix
|
||||
*/
|
||||
template<class Real>
|
||||
static void UnitTestCuMatrixApplyLog() {
|
||||
Matrix<Real> H(100,100);
|
||||
RandGaussMatrix(&H);
|
||||
H.MulElements(H); //make numbers positive
|
||||
|
||||
CuMatrix<Real> D(100,100);
|
||||
D.CopyFromMat(H);
|
||||
|
||||
D.ApplyLog();
|
||||
H.ApplyLog();
|
||||
|
||||
Matrix<Real> H2(100,100);
|
||||
D.CopyToMat(&H2);
|
||||
|
||||
AssertEqual(H,H2);
|
||||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuMatrixMulElements() {
|
||||
Matrix<Real> Ha(100,100);
|
||||
Matrix<Real> Hb(100,100);
|
||||
RandGaussMatrix(&Ha);
|
||||
RandGaussMatrix(&Hb);
|
||||
|
||||
CuMatrix<Real> Da(100,100);
|
||||
CuMatrix<Real> Db(100,100);
|
||||
Da.CopyFromMat(Ha);
|
||||
Db.CopyFromMat(Hb);
|
||||
|
||||
Da.MulElements(Db);
|
||||
Ha.MulElements(Hb);
|
||||
|
||||
Matrix<Real> Ha2(100,100);
|
||||
Da.CopyToMat(&Ha2);
|
||||
|
||||
AssertEqual(Ha,Ha2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuMatrixMulColsVec() {
|
||||
Matrix<Real> Hm(100,99);
|
||||
Vector<Real> Hv(99);
|
||||
RandGaussMatrix(&Hm);
|
||||
InitRand(&Hv);
|
||||
|
||||
CuMatrix<Real> Dm(100,99);
|
||||
CuVector<Real> Dv(99);
|
||||
Dm.CopyFromMat(Hm);
|
||||
Dv.CopyFromVec(Hv);
|
||||
|
||||
Dm.MulColsVec(Dv);
|
||||
Hm.MulColsVec(Hv);
|
||||
|
||||
Matrix<Real> Hm2(100,99);
|
||||
Dm.CopyToMat(&Hm2);
|
||||
|
||||
AssertEqual(Hm,Hm2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuMatrixMulRowsVec() {
|
||||
Matrix<Real> Hm(100,99);
|
||||
Vector<Real> Hv(100);
|
||||
RandGaussMatrix(&Hm);
|
||||
InitRand(&Hv);
|
||||
|
||||
CuMatrix<Real> Dm(100,99);
|
||||
CuVector<Real> Dv(100);
|
||||
Dm.CopyFromMat(Hm);
|
||||
Dv.CopyFromVec(Hv);
|
||||
|
||||
Dm.MulRowsVec(Dv);
|
||||
Hm.MulRowsVec(Hv);
|
||||
|
||||
Matrix<Real> Hm2(100,99);
|
||||
Dm.CopyToMat(&Hm2);
|
||||
|
||||
AssertEqual(Hm,Hm2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuMatrixDivRowsVec() {
|
||||
Matrix<Real> Hm(100,99);
|
||||
Vector<Real> Hv(100);
|
||||
RandGaussMatrix(&Hm);
|
||||
InitRand(&Hv);
|
||||
|
||||
CuMatrix<Real> Dm(100,99);
|
||||
CuVector<Real> Dv(100);
|
||||
Dm.CopyFromMat(Hm);
|
||||
Dv.CopyFromVec(Hv);
|
||||
|
||||
Dm.DivRowsVec(Dv);
|
||||
Hv.InvertElements();
|
||||
Hm.MulRowsVec(Hv);
|
||||
|
||||
Matrix<Real> Hm2(100,99);
|
||||
Dm.CopyToMat(&Hm2);
|
||||
|
||||
AssertEqual(Hm,Hm2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuMatrixAddMat() {
|
||||
Matrix<Real> Ha(100,100);
|
||||
Matrix<Real> Hb(100,100);
|
||||
RandGaussMatrix(&Ha);
|
||||
RandGaussMatrix(&Hb);
|
||||
|
||||
CuMatrix<Real> Da(100,100);
|
||||
CuMatrix<Real> Db(100,100);
|
||||
Da.CopyFromMat(Ha);
|
||||
Db.CopyFromMat(Hb);
|
||||
|
||||
Da.AddMat(0.5,Db);
|
||||
Ha.AddMat(0.5,Hb);
|
||||
|
||||
Matrix<Real> Ha2(100,100);
|
||||
Da.CopyToMat(&Ha2);
|
||||
|
||||
AssertEqual(Ha,Ha2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuMatrixAddVecToCols() {
|
||||
Matrix<Real> Hm(100,99);
|
||||
Vector<Real> Hv(100);
|
||||
RandGaussMatrix(&Hm);
|
||||
InitRand(&Hv);
|
||||
|
||||
CuMatrix<Real> Dm(100,99);
|
||||
CuVector<Real> Dv(100);
|
||||
Dm.CopyFromMat(Hm);
|
||||
Dv.CopyFromVec(Hv);
|
||||
|
||||
Dm.AddVecToCols(0.5,Dv);
|
||||
Hm.AddVecToCols(0.5,Hv);
|
||||
|
||||
Matrix<Real> Hm2(100,99);
|
||||
Dm.CopyToMat(&Hm2);
|
||||
|
||||
AssertEqual(Hm,Hm2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuMatrixAddVecToRows() {
|
||||
Matrix<Real> Hm(100,99);
|
||||
Vector<Real> Hv(99);
|
||||
RandGaussMatrix(&Hm);
|
||||
InitRand(&Hv);
|
||||
|
||||
CuMatrix<Real> Dm(100,99);
|
||||
CuVector<Real> Dv(99);
|
||||
Dm.CopyFromMat(Hm);
|
||||
Dv.CopyFromVec(Hv);
|
||||
|
||||
Dm.AddVecToRows(0.5,Dv);
|
||||
Hm.AddVecToRows(0.5,Hv);
|
||||
|
||||
Matrix<Real> Hm2(100,99);
|
||||
Dm.CopyToMat(&Hm2);
|
||||
|
||||
AssertEqual(Hm,Hm2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuMatrixAddMatMat() {
|
||||
Matrix<Real> Ha(200,100);
|
||||
Matrix<Real> Hb(100,200);
|
||||
Matrix<Real> Hc1(200,200);
|
||||
Matrix<Real> Hc2(100,100);
|
||||
RandGaussMatrix(&Ha);
|
||||
RandGaussMatrix(&Hb);
|
||||
|
||||
CuMatrix<Real> Da(200,100);
|
||||
CuMatrix<Real> Db(100,200);
|
||||
Da.CopyFromMat(Ha);
|
||||
Db.CopyFromMat(Hb);
|
||||
CuMatrix<Real> Dc1(200,200);
|
||||
CuMatrix<Real> Dc2(100,100);
|
||||
|
||||
Dc1.AddMatMat(0.5f,Da,kNoTrans,Db,kNoTrans,0.0f);
|
||||
Dc2.AddMatMat(0.5f,Da,kTrans,Db,kTrans,0.0f);
|
||||
Hc1.AddMatMat(0.5f,Ha,kNoTrans,Hb,kNoTrans,0.0f);
|
||||
Hc2.AddMatMat(0.5f,Ha,kTrans,Hb,kTrans,0.0f);
|
||||
|
||||
Matrix<Real> Hc1a(200,200);
|
||||
Matrix<Real> Hc2a(100,100);
|
||||
Dc1.CopyToMat(&Hc1a);
|
||||
Dc2.CopyToMat(&Hc2a);
|
||||
|
||||
AssertEqual(Hc1,Hc1a);
|
||||
AssertEqual(Hc2,Hc2a);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* CuVector unit tests
|
||||
*/
|
||||
template<class Real>
|
||||
static void UnitTestCuVectorAddVec() {
|
||||
Vector<Real> Hv(777);
|
||||
Vector<Real> Hw(777);
|
||||
InitRand(&Hv);
|
||||
InitRand(&Hw);
|
||||
|
||||
CuVector<Real> Dv(777);
|
||||
CuVector<Real> Dw(777);
|
||||
Dv.CopyFromVec(Hv);
|
||||
Dw.CopyFromVec(Hw);
|
||||
|
||||
Dv.AddVec(0.1,Dw,0.9);
|
||||
Hv.Scale(0.9);
|
||||
Hv.AddVec(0.1,Hw);
|
||||
|
||||
Vector<Real> Hv2(777);
|
||||
Dv.CopyToVec(&Hv2);
|
||||
|
||||
AssertEqual(Hv,Hv2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuVectorAddRowSumMat() {
|
||||
const int32 X=4321, Y=19;
|
||||
Real alpha=0.1, beta=0.7;
|
||||
|
||||
Matrix<Real> Hm(X,Y);
|
||||
Vector<Real> Hv(Y);
|
||||
Vector<Real> Hv_accu(Y);
|
||||
RandGaussMatrix(&Hm);
|
||||
InitRand(&Hv);
|
||||
|
||||
CuMatrix<Real> Dm(X,Y);
|
||||
CuVector<Real> Dv(Y);
|
||||
Dm.CopyFromMat(Hm);
|
||||
Dv.CopyFromVec(Hv);
|
||||
|
||||
Dv.AddRowSumMat(alpha,Dm,beta);
|
||||
|
||||
Hv_accu.SetZero();
|
||||
Hv_accu.AddRowSumMat(1.0, Hm);
|
||||
Hv.Scale(beta);
|
||||
Hv.AddVec(alpha,Hv_accu);
|
||||
|
||||
Vector<Real> Hv2(Y);
|
||||
Dv.CopyToVec(&Hv2);
|
||||
|
||||
AssertEqual(Hv,Hv2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuVectorAddRowSumMatLarge() {
|
||||
Matrix<Real> Hm(1000,990);
|
||||
Vector<Real> Hv(990);
|
||||
Vector<Real> Hv_accu(990);
|
||||
RandGaussMatrix(&Hm);
|
||||
InitRand(&Hv);
|
||||
|
||||
CuMatrix<Real> Dm(1000,990);
|
||||
CuVector<Real> Dv(990);
|
||||
Dm.CopyFromMat(Hm);
|
||||
Dv.CopyFromVec(Hv);
|
||||
|
||||
Dv.AddRowSumMat(0.5,Dm,0.7);
|
||||
|
||||
Hv_accu.SetZero();
|
||||
Hv_accu.AddRowSumMat(1.0, Hm);
|
||||
Hv.Scale(0.7);
|
||||
Hv.AddVec(0.5,Hv_accu);
|
||||
|
||||
Vector<Real> Hv2(990);
|
||||
Dv.CopyToVec(&Hv2);
|
||||
|
||||
AssertEqual(Hv,Hv2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuVectorAddColSumMat() {
|
||||
const int32 X=19, Y=4321;
|
||||
Real alpha=0.5, beta=0.7;
|
||||
|
||||
Matrix<Real> Hm(X,Y);
|
||||
Vector<Real> Hv(X);
|
||||
Vector<Real> Hv_accu(X);
|
||||
RandGaussMatrix(&Hm);
|
||||
InitRand(&Hv);
|
||||
|
||||
CuMatrix<Real> Dm(X,Y);
|
||||
CuVector<Real> Dv(X);
|
||||
Dm.CopyFromMat(Hm);
|
||||
Dv.CopyFromVec(Hv);
|
||||
|
||||
Dv.AddColSumMat(alpha,Dm,beta);
|
||||
|
||||
Hv_accu.SetZero();
|
||||
Hv_accu.AddColSumMat(1.0, Hm);
|
||||
Hv.Scale(beta);
|
||||
Hv.AddVec(alpha, Hv_accu);
|
||||
|
||||
Vector<Real> Hv2(X);
|
||||
Dv.CopyToVec(&Hv2);
|
||||
|
||||
AssertEqual(Hv,Hv2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuVectorAddColSumMatLarge() {
|
||||
Matrix<Real> Hm(1000,990);
|
||||
Vector<Real> Hv(1000);
|
||||
Vector<Real> Hv_accu(1000);
|
||||
RandGaussMatrix(&Hm);
|
||||
InitRand(&Hv);
|
||||
|
||||
CuMatrix<Real> Dm(1000,990);
|
||||
CuVector<Real> Dv(1000);
|
||||
Dm.CopyFromMat(Hm);
|
||||
Dv.CopyFromVec(Hv);
|
||||
|
||||
Dv.AddColSumMat(0.5, Dm, 0.7);
|
||||
|
||||
Hv_accu.SetZero();
|
||||
Hv_accu.AddColSumMat(1.0, Hm);
|
||||
Hv.Scale(0.7);
|
||||
Hv.AddVec(0.5,Hv_accu);
|
||||
|
||||
Vector<Real> Hv2(1000);
|
||||
Dv.CopyToVec(&Hv2);
|
||||
|
||||
AssertEqual(Hv,Hv2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuVectorInvertElements() {
|
||||
Vector<Real> Hv(777);
|
||||
InitRand(&Hv);
|
||||
|
||||
CuVector<Real> Dv(777);
|
||||
Dv.CopyFromVec(Hv);
|
||||
|
||||
Dv.InvertElements();
|
||||
Hv.InvertElements();
|
||||
|
||||
Vector<Real> Hv2(777);
|
||||
Dv.CopyToVec(&Hv2);
|
||||
|
||||
AssertEqual(Hv,Hv2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* cu:: unit tests
|
||||
*/
|
||||
template<class Real>
|
||||
static void UnitTestCuSigmoid() {
|
||||
Matrix<Real> Hi(100,111);
|
||||
Matrix<Real> Ho(100,111);
|
||||
RandGaussMatrix(&Hi);
|
||||
|
||||
CuMatrix<Real> Di(100,111);
|
||||
CuMatrix<Real> Do(100,111);
|
||||
Di.CopyFromMat(Hi);
|
||||
|
||||
//gpu
|
||||
Do.Sigmoid(Di);
|
||||
//cpu
|
||||
for(MatrixIndexT r=0; r<Hi.NumRows(); r++) {
|
||||
for(MatrixIndexT c=0; c<Hi.NumCols(); c++) {
|
||||
Ho(r, c) = 1.0/(1.0+exp(-Hi(r, c)));
|
||||
}
|
||||
}
|
||||
|
||||
Matrix<Real> Ho2(100,111);
|
||||
Do.CopyToMat(&Ho2);
|
||||
|
||||
AssertEqual(Ho,Ho2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuDiffSigmoid() {
|
||||
Matrix<Real> Hi(100,111);
|
||||
Matrix<Real> Ho(100,111);
|
||||
Matrix<Real> Hy(100,111);
|
||||
RandGaussMatrix(&Hi);
|
||||
RandZeroToOneMatrix(&Hy);
|
||||
|
||||
CuMatrix<Real> Di(100,111);
|
||||
CuMatrix<Real> Do(100,111);
|
||||
CuMatrix<Real> Dy(100,111);
|
||||
Di.CopyFromMat(Hi);
|
||||
Dy.CopyFromMat(Hy);
|
||||
|
||||
//gpu
|
||||
Do.DiffSigmoid(Dy, Di);
|
||||
//cpu
|
||||
for(MatrixIndexT r=0; r<Ho.NumRows(); r++) {
|
||||
for(MatrixIndexT c=0; c<Ho.NumCols(); c++) {
|
||||
Ho(r, c) = Hy(r, c)*(1.0 - Hy(r, c)) * Hi(r, c);
|
||||
}
|
||||
}
|
||||
|
||||
Matrix<Real> Ho2(100,111);
|
||||
Do.CopyToMat(&Ho2);
|
||||
|
||||
AssertEqual(Ho,Ho2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuSoftmax() {
|
||||
Matrix<Real> Hi(100,111);
|
||||
Matrix<Real> Ho(100,111);
|
||||
RandGaussMatrix(&Hi);
|
||||
|
||||
CuMatrix<Real> Di(100,111);
|
||||
CuMatrix<Real> Do(100,111);
|
||||
Di.CopyFromMat(Hi);
|
||||
|
||||
//gpu
|
||||
Do.Softmax(Di);
|
||||
//cpu
|
||||
Ho.CopyFromMat(Hi);
|
||||
for(MatrixIndexT r=0; r<Ho.NumRows(); r++) {
|
||||
Ho.Row(r).ApplySoftMax();
|
||||
}
|
||||
|
||||
Matrix<Real> Ho2(100,111);
|
||||
Do.CopyToMat(&Ho2);
|
||||
|
||||
AssertEqual(Ho,Ho2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuFindRowMaxId() {
|
||||
Matrix<Real> Hi(100,111);
|
||||
RandGaussMatrix(&Hi);
|
||||
|
||||
CuMatrix<Real> Di(100,111);
|
||||
Di.CopyFromMat(Hi);
|
||||
|
||||
std::vector<int32> Hmax(100);
|
||||
CuStlVector<int32> Dmax(100);
|
||||
|
||||
//gpu
|
||||
Di.FindRowMaxId(&Dmax);
|
||||
|
||||
//cpu
|
||||
for(MatrixIndexT r=0; r<Hi.NumRows(); r++) {
|
||||
Real max=-1e20; int32 idx=-1;
|
||||
for(MatrixIndexT c=0; c<Hi.NumCols(); c++) {
|
||||
if(Hi(r,c) > max) { idx=c; max=Hi(r,c); }
|
||||
}
|
||||
Hmax[r] = idx;
|
||||
}
|
||||
|
||||
std::vector<int32> Hmax2(100);
|
||||
Dmax.CopyToVec(&Hmax2);
|
||||
|
||||
AssertEqual(Hmax,Hmax2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real>
|
||||
static void UnitTestCuDiffXent() {
|
||||
int32 X=100, Y=111;
|
||||
//nnet output / diff
|
||||
Matrix<Real> Hi(X,Y);
|
||||
RandZeroToOneMatrix(&Hi);
|
||||
CuMatrix<Real> Di(X,Y);
|
||||
Di.CopyFromMat(Hi);
|
||||
//target vector
|
||||
std::vector<int32> Htgt(X);
|
||||
for(int32 i=0; i<X; i++) {
|
||||
Htgt[i] = rand()%Y;
|
||||
}
|
||||
CuStlVector<int32> Dtgt(X);
|
||||
Dtgt.CopyFromVec(Htgt);
|
||||
//logpost vector
|
||||
Vector<Real> Hlogpost(X);
|
||||
CuVector<Real> Dlogpost(X);
|
||||
|
||||
//gpu
|
||||
Di.DiffXent(Dtgt, &Dlogpost);
|
||||
//cpu
|
||||
for(MatrixIndexT r=0; r<Hi.NumRows(); r++) {
|
||||
int32 col_tgt = Htgt[r];
|
||||
Hlogpost(r) = log(Hi(r, col_tgt));
|
||||
Hi(r, col_tgt) -= 1.0;
|
||||
}
|
||||
|
||||
Matrix<Real> Hi2(X,Y);
|
||||
Di.CopyToMat(&Hi2);
|
||||
Vector<Real> Hlogpost2(X);
|
||||
Dlogpost.CopyToVec(&Hlogpost2);
|
||||
|
||||
AssertEqual(Hi,Hi2);
|
||||
AssertEqual(Hlogpost,Hlogpost2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Real> void CudaMatrixUnitTest() {
|
||||
//test CuMatrix<Real> methods by cross-check with Matrix
|
||||
UnitTestCuMatrixApplyLog<Real>();
|
||||
UnitTestCuMatrixMulElements<Real>();
|
||||
UnitTestCuMatrixMulColsVec<Real>();
|
||||
UnitTestCuMatrixMulRowsVec<Real>();
|
||||
UnitTestCuMatrixDivRowsVec<Real>();
|
||||
UnitTestCuMatrixAddMat<Real>();
|
||||
UnitTestCuMatrixAddVecToCols<Real>();
|
||||
UnitTestCuMatrixAddVecToRows<Real>();
|
||||
UnitTestCuMatrixAddMatMat<Real>();
|
||||
//test CuVector<Real> methods
|
||||
UnitTestCuVectorAddVec<Real>();
|
||||
UnitTestCuVectorAddRowSumMat<Real>();
|
||||
UnitTestCuVectorAddRowSumMatLarge<Real>();
|
||||
UnitTestCuVectorAddColSumMat<Real>();
|
||||
UnitTestCuVectorAddColSumMatLarge<Real>();
|
||||
UnitTestCuVectorInvertElements<Real>();
|
||||
|
||||
UnitTestCuSigmoid<Real>();
|
||||
UnitTestCuDiffSigmoid<Real>();
|
||||
UnitTestCuFindRowMaxId<Real>();
|
||||
UnitTestCuSoftmax<Real>();
|
||||
UnitTestCuDiffXent<Real>();
|
||||
}
|
||||
|
||||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
// Test driver: when built with HAVE_CUDA==1 it first asks CuDevice to pick a
// GPU, then runs the whole suite for float and for double.
int main() {
#if HAVE_CUDA==1
  // -2 requests automatic GPU selection.
  CuDevice::Instantiate().SelectGpuId(-2);
#endif

  kaldi::CudaMatrixUnitTest<float>();
  kaldi::CudaMatrixUnitTest<double>();

  std::cout << "Tests succeeded.\n";
  return 0;
}
|
|
@ -17,6 +17,9 @@
|
|||
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
// See the Apache 2 License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#ifndef KALDI_MATRIX_CBLAS_WRAPPERS_H_
|
||||
#define KALDI_MATRIX_CBLAS_WRAPPERS_H_ 1
|
||||
|
||||
|
||||
#include <limits>
|
||||
#include "matrix/sp-matrix.h"
|
||||
|
@ -235,6 +238,8 @@ inline void cblas_Xgemm(const double alpha,
|
|||
alpha, Adata, a_stride, Bdata, b_stride,
|
||||
beta, Mdata, stride);
|
||||
}
|
||||
|
||||
|
||||
inline void cblas_Xsymm(const float alpha,
|
||||
MatrixIndexT sz,
|
||||
const float *Adata,MatrixIndexT a_stride,
|
||||
|
@ -470,3 +475,5 @@ inline void clapack_Xgetri(MatrixIndexT num_rows, double *Mdata, MatrixIndexT st
|
|||
|
||||
}
|
||||
// namespace kaldi
|
||||
|
||||
#endif
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
|
||||
namespace kaldi {
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void CompressedMatrix::CopyFromMat(
|
||||
const MatrixBase<Real> &mat) {
|
||||
if (data_ != NULL) {
|
||||
|
@ -95,6 +95,20 @@ void CompressedMatrix::CopyFromMat(const MatrixBase<float> &mat);
|
|||
template
|
||||
void CompressedMatrix::CopyFromMat(const MatrixBase<double> &mat);
|
||||
|
||||
|
||||
template<typename Real>
|
||||
CompressedMatrix &CompressedMatrix::operator =(const MatrixBase<Real> &mat) {
|
||||
this->CopyFromMat(mat);
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Instantiate the template for float and double.
|
||||
template
|
||||
CompressedMatrix& CompressedMatrix::operator =(const MatrixBase<float> &mat);
|
||||
|
||||
template
|
||||
CompressedMatrix& CompressedMatrix::operator =(const MatrixBase<double> &mat);
|
||||
|
||||
inline uint16 CompressedMatrix::FloatToUint16(
|
||||
const GlobalHeader &global_header,
|
||||
float value) {
|
||||
|
@ -114,7 +128,7 @@ inline float CompressedMatrix::Uint16ToFloat(
|
|||
+ global_header.range * 1.52590218966964e-05 * value;
|
||||
}
|
||||
|
||||
template<class Real> // static
|
||||
template<typename Real> // static
|
||||
void CompressedMatrix::ComputeColHeader(
|
||||
const GlobalHeader &global_header,
|
||||
const Real *data, MatrixIndexT stride,
|
||||
|
@ -229,7 +243,7 @@ inline float CompressedMatrix::CharToFloat(
|
|||
}
|
||||
|
||||
|
||||
template<class Real> // static
|
||||
template<typename Real> // static
|
||||
void CompressedMatrix::CompressColumn(
|
||||
const GlobalHeader &global_header,
|
||||
const Real *data, MatrixIndexT stride,
|
||||
|
@ -383,7 +397,7 @@ void CompressedMatrix::Read(std::istream &is, bool binary) {
|
|||
KALDI_ERR << "Failed to read data.";
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void CompressedMatrix::CopyToMat(MatrixBase<Real> *mat) const {
|
||||
if (data_ == NULL) {
|
||||
KALDI_ASSERT(mat->NumRows() == 0);
|
||||
|
|
|
@ -46,20 +46,24 @@ class CompressedMatrix {
|
|||
|
||||
~CompressedMatrix() { Destroy(); }
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
CompressedMatrix(const MatrixBase<Real> &mat): data_(NULL) { CopyFromMat(mat); }
|
||||
|
||||
|
||||
/// This will resize *this and copy the contents of mat to *this.
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void CopyFromMat(const MatrixBase<Real> &mat);
|
||||
|
||||
CompressedMatrix(const CompressedMatrix &mat);
|
||||
|
||||
CompressedMatrix &operator = (const CompressedMatrix &mat); // assignment operator.
|
||||
|
||||
template<typename Real>
|
||||
CompressedMatrix &operator = (const MatrixBase<Real> &mat); // assignment operator.
|
||||
|
||||
// Note: mat must have the correct size, CopyToMat no longer attempts
|
||||
// to resize the matrix
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void CopyToMat(MatrixBase<Real> *mat) const;
|
||||
|
||||
void Write(std::ostream &os, bool binary) const;
|
||||
|
@ -122,12 +126,12 @@ class CompressedMatrix {
|
|||
uint16 percentile_100;
|
||||
};
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
static void CompressColumn(const GlobalHeader &global_header,
|
||||
const Real *data, MatrixIndexT stride,
|
||||
int32 num_rows, PerColHeader *header,
|
||||
unsigned char *byte_data);
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
static void ComputeColHeader(const GlobalHeader &global_header,
|
||||
const Real *data, MatrixIndexT stride,
|
||||
int32 num_rows, PerColHeader *header);
|
||||
|
|
|
@ -36,7 +36,7 @@ namespace kaldi {
|
|||
// This class is not to be used externally. See the Eig function in the Matrix
|
||||
// class in kaldi-matrix.h. This is the external interface.
|
||||
|
||||
template<class Real> class EigenvalueDecomposition {
|
||||
template<typename Real> class EigenvalueDecomposition {
|
||||
// This class is based on the EigenvalueDecomposition class from the JAMA
|
||||
// library (version 1.0.2).
|
||||
public:
|
||||
|
@ -110,7 +110,7 @@ template<class Real> class EigenvalueDecomposition {
|
|||
template class EigenvalueDecomposition<float>; // force instantiation.
|
||||
template class EigenvalueDecomposition<double>; // force instantiation.
|
||||
|
||||
template<class Real> void EigenvalueDecomposition<Real>::Tred2() {
|
||||
template<typename Real> void EigenvalueDecomposition<Real>::Tred2() {
|
||||
// This is derived from the Algol procedures tred2 by
|
||||
// Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
|
||||
// Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
|
||||
|
@ -224,7 +224,7 @@ template<class Real> void EigenvalueDecomposition<Real>::Tred2() {
|
|||
e_[0] = 0.0;
|
||||
}
|
||||
|
||||
template<class Real> void EigenvalueDecomposition<Real>::Tql2() {
|
||||
template<typename Real> void EigenvalueDecomposition<Real>::Tql2() {
|
||||
// This is derived from the Algol procedures tql2, by
|
||||
// Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
|
||||
// Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
|
||||
|
@ -341,7 +341,7 @@ template<class Real> void EigenvalueDecomposition<Real>::Tql2() {
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void EigenvalueDecomposition<Real>::Orthes() {
|
||||
|
||||
// This is derived from the Algol procedures orthes and ortran,
|
||||
|
@ -433,7 +433,7 @@ void EigenvalueDecomposition<Real>::Orthes() {
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real> void EigenvalueDecomposition<Real>::Hqr2() {
|
||||
template<typename Real> void EigenvalueDecomposition<Real>::Hqr2() {
|
||||
// This is derived from the Algol procedure hqr2,
|
||||
// by Martin and Wilkinson, Handbook for Auto. Comp.,
|
||||
// Vol.ii-Linear Algebra, and the corresponding
|
||||
|
@ -872,7 +872,7 @@ template<class Real> void EigenvalueDecomposition<Real>::Hqr2() {
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
EigenvalueDecomposition<Real>::EigenvalueDecomposition(const MatrixBase<Real> &A) {
|
||||
KALDI_ASSERT(A.NumCols() == A.NumRows() && A.NumCols() >= 1);
|
||||
n_ = A.NumRows();
|
||||
|
@ -907,7 +907,7 @@ EigenvalueDecomposition<Real>::EigenvalueDecomposition(const MatrixBase<Real> &A
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
EigenvalueDecomposition<Real>::~EigenvalueDecomposition() {
|
||||
delete [] d_;
|
||||
delete [] e_;
|
||||
|
|
|
@ -61,7 +61,7 @@ namespace kaldi {
|
|||
*/
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool MatrixBase<Real>::JamaSvd(VectorBase<Real> *s_in,
|
||||
MatrixBase<Real> *U_in,
|
||||
MatrixBase<Real> *V_in) { // Destructive!
|
||||
|
|
|
@ -27,12 +27,12 @@ namespace ut = kaldi::unittest;
|
|||
|
||||
namespace kaldi {
|
||||
|
||||
template<class Real> static void InitRand(VectorBase<Real> *v) {
|
||||
template<typename Real> static void InitRand(VectorBase<Real> *v) {
|
||||
for (MatrixIndexT i = 0;i < v->Dim();i++)
|
||||
(*v)(i) = RandGauss();
|
||||
}
|
||||
|
||||
template<class Real> static void InitRand(MatrixBase<Real> *M) {
|
||||
template<typename Real> static void InitRand(MatrixBase<Real> *M) {
|
||||
start:
|
||||
for (MatrixIndexT i = 0;i < M->NumRows();i++)
|
||||
for (MatrixIndexT j = 0;j < M->NumCols();j++)
|
||||
|
@ -44,7 +44,7 @@ template<class Real> static void InitRand(MatrixBase<Real> *M) {
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real> static void InitRand(SpMatrix<Real> *M) {
|
||||
template<typename Real> static void InitRand(SpMatrix<Real> *M) {
|
||||
start_sp:
|
||||
for (MatrixIndexT i = 0;i < M->NumRows();i++)
|
||||
for (MatrixIndexT j = 0;j<=i;j++)
|
||||
|
@ -56,7 +56,7 @@ template<class Real> static void InitRand(SpMatrix<Real> *M) {
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real> static void UnitTestGpsr() {
|
||||
template<typename Real> static void UnitTestGpsr() {
|
||||
for (int32 i = 0; i < 5; i++) {
|
||||
MatrixIndexT dim1 = (rand() % 10) + 10;
|
||||
MatrixIndexT dim2 = (rand() % 10) + 10;
|
||||
|
|
|
@ -5,7 +5,6 @@
|
|||
// Yanmin Qian; Petr Schwarz; Jan Silovsky;
|
||||
// Haihua Xu
|
||||
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
|
@ -30,9 +29,14 @@
|
|||
namespace kaldi {
|
||||
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::Invert(Real *LogDet, Real *DetSign,
|
||||
void MatrixBase<Real>::Invert(Real *log_det, Real *det_sign,
|
||||
bool inverse_needed) {
|
||||
KALDI_ASSERT(num_rows_ == num_cols_);
|
||||
if (num_rows_ == 0) {
|
||||
if (det_sign) *det_sign = 1;
|
||||
if (log_det) *log_det = 0.0;
|
||||
return;
|
||||
}
|
||||
#ifndef HAVE_ATLAS
|
||||
KaldiBlasInt *pivot = new KaldiBlasInt[num_rows_];
|
||||
KaldiBlasInt M = num_rows_;
|
||||
|
@ -60,26 +64,26 @@ void MatrixBase<Real>::Invert(Real *LogDet, Real *DetSign,
|
|||
if (inverse_needed) {
|
||||
KALDI_ERR << "Cannot invert: matrix is singular";
|
||||
} else {
|
||||
if (LogDet) *LogDet = -std::numeric_limits<Real>::infinity();
|
||||
if (DetSign) *DetSign = 0;
|
||||
if (log_det) *log_det = -std::numeric_limits<Real>::infinity();
|
||||
if (det_sign) *det_sign = 0;
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (DetSign != NULL) {
|
||||
if (det_sign != NULL) {
|
||||
int sign = 1;
|
||||
for (MatrixIndexT i = 0; i < num_rows_; i++)
|
||||
if (pivot[i] != static_cast<int>(i) + pivot_offset) sign *= -1;
|
||||
*DetSign = sign;
|
||||
*det_sign = sign;
|
||||
}
|
||||
if (LogDet != NULL || DetSign != NULL) { // Compute log determinant.
|
||||
if (LogDet != NULL) *LogDet = 0.0;
|
||||
if (log_det != NULL || det_sign != NULL) { // Compute log determinant.
|
||||
if (log_det != NULL) *log_det = 0.0;
|
||||
Real prod = 1.0;
|
||||
for (MatrixIndexT i = 0; i < num_rows_; i++) {
|
||||
prod *= (*this)(i, i);
|
||||
if (i == num_rows_ - 1 || std::fabs(prod) < 1.0e-10 ||
|
||||
std::fabs(prod) > 1.0e+10) {
|
||||
if (LogDet != NULL) *LogDet += log(fabs(prod));
|
||||
if (DetSign != NULL) *DetSign *= (prod > 0 ? 1.0 : -1.0);
|
||||
if (log_det != NULL) *log_det += log(fabs(prod));
|
||||
if (det_sign != NULL) *det_sign *= (prod > 0 ? 1.0 : -1.0);
|
||||
prod = 1.0;
|
||||
}
|
||||
}
|
||||
|
@ -108,8 +112,8 @@ void MatrixBase<float>::AddVecVec(const float alpha,
|
|||
1, data_, stride_);
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<class OtherReal>
|
||||
template<typename Real>
|
||||
template<typename OtherReal>
|
||||
void MatrixBase<Real>::AddVecVec(const Real alpha,
|
||||
const VectorBase<OtherReal> &a,
|
||||
const VectorBase<OtherReal> &b) {
|
||||
|
@ -146,6 +150,7 @@ void MatrixBase<double>::AddVecVec(const double alpha,
|
|||
const VectorBase<double> &a,
|
||||
const VectorBase<double> &rb) {
|
||||
KALDI_ASSERT(a.Dim() == num_rows_ && rb.Dim() == num_cols_);
|
||||
if (num_rows_ == 0) return;
|
||||
cblas_Xger(a.Dim(), rb.Dim(), alpha, a.Data(), 1, rb.Data(),
|
||||
1, data_, stride_);
|
||||
}
|
||||
|
@ -162,11 +167,50 @@ void MatrixBase<Real>::AddMatMat(const Real alpha,
|
|||
|| (transA == kNoTrans && transB == kTrans && A.num_cols_ == B.num_cols_ && A.num_rows_ == num_rows_ && B.num_rows_ == num_cols_)
|
||||
|| (transA == kTrans && transB == kTrans && A.num_rows_ == B.num_cols_ && A.num_cols_ == num_rows_ && B.num_rows_ == num_cols_));
|
||||
KALDI_ASSERT(&A != this && &B != this);
|
||||
if (num_rows_ == 0) return;
|
||||
cblas_Xgemm(alpha, transA, A.data_, A.num_rows_, A.num_cols_, A.stride_,
|
||||
transB, B.data_, B.stride_, beta, data_, num_rows_, num_cols_, stride_);
|
||||
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::CopyLowerToUpper() {
|
||||
KALDI_ASSERT(num_rows_ == num_cols_);
|
||||
Real *data = data_;
|
||||
MatrixIndexT num_rows = num_rows_, stride = stride_;
|
||||
for (int32 i = 0; i < num_rows; i++)
|
||||
for (int32 j = 0; j < i; j++)
|
||||
data[j * stride + i ] = data[i * stride + j];
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::CopyUpperToLower() {
|
||||
KALDI_ASSERT(num_rows_ == num_cols_);
|
||||
Real *data = data_;
|
||||
MatrixIndexT num_rows = num_rows_, stride = stride_;
|
||||
for (int32 i = 0; i < num_rows; i++)
|
||||
for (int32 j = 0; j < i; j++)
|
||||
data[i * stride + j] = data[j * stride + i];
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::SymAddMat2(const Real alpha,
|
||||
const MatrixBase<Real> &A,
|
||||
MatrixTransposeType transA,
|
||||
Real beta) {
|
||||
KALDI_ASSERT(num_rows_ == num_cols_ &&
|
||||
((transA == kNoTrans && A.num_rows_ == num_rows_) ||
|
||||
(transA == kTrans && A.num_cols_ == num_cols_)));
|
||||
KALDI_ASSERT(A.data_ != data_);
|
||||
if (num_rows_ == 0) return;
|
||||
MatrixIndexT A_other_dim = (transA == kNoTrans ? A.num_cols_ : A.num_rows_);
|
||||
|
||||
// This function call is hard-coded to update the lower triangle.
|
||||
cblas_Xsyrk(transA, num_rows_, A_other_dim, alpha, A.Data(),
|
||||
A.Stride(), beta, this->data_, this->stride_);
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::AddMatSmat(const Real alpha,
|
||||
|
@ -253,13 +297,14 @@ void MatrixBase<Real>::AddSpSp(const Real alpha, const SpMatrix<Real> &A_in,
|
|||
// CblasLower or CblasUpper would work below as symmetric matrix is copied
|
||||
// fully (to save work, we used the matrix constructor from SpMatrix).
|
||||
// CblasLeft means A is on the left: C <-- alpha A B + beta C
|
||||
if (sz == 0) return;
|
||||
cblas_Xsymm(alpha, sz, A.data_, A.stride_, B.data_, B.stride_, beta, data_, stride_);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::AddMat(const Real alpha, const MatrixBase<Real>& A,
|
||||
MatrixTransposeType transA) {
|
||||
if (&A == this) { // Make it work in this case.
|
||||
if (&A == this) {
|
||||
if (transA == kNoTrans) {
|
||||
Scale(alpha + 1.0);
|
||||
} else {
|
||||
|
@ -293,20 +338,22 @@ void MatrixBase<Real>::AddMat(const Real alpha, const MatrixBase<Real>& A,
|
|||
Real *adata = A.data_, *data = data_;
|
||||
if (transA == kNoTrans) {
|
||||
KALDI_ASSERT(A.num_rows_ == num_rows_ && A.num_cols_ == num_cols_);
|
||||
if (num_rows_ == 0) return;
|
||||
for (MatrixIndexT row = 0; row < num_rows_; row++, adata += aStride,
|
||||
data += stride) {
|
||||
cblas_Xaxpy(num_cols_, alpha, adata, 1, data, 1);
|
||||
}
|
||||
} else {
|
||||
KALDI_ASSERT(A.num_cols_ == num_rows_ && A.num_rows_ == num_cols_);
|
||||
if (num_rows_ == 0) return;
|
||||
for (MatrixIndexT row = 0; row < num_rows_; row++, adata++, data += stride)
|
||||
cblas_Xaxpy(num_cols_, alpha, adata, aStride, data, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<class OtherReal>
|
||||
template<typename Real>
|
||||
template<typename OtherReal>
|
||||
void MatrixBase<Real>::AddSp(const Real alpha, const SpMatrix<OtherReal> &S) {
|
||||
KALDI_ASSERT(S.NumRows() == NumRows() && S.NumRows() == NumCols());
|
||||
Real *data = data_; const OtherReal *sdata = S.Data();
|
||||
|
@ -331,6 +378,31 @@ template
|
|||
void MatrixBase<double>::AddSp(const double alpha, const SpMatrix<float> &S);
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::AddDiagVecMat(
|
||||
const Real alpha, VectorBase<Real> &v,
|
||||
const MatrixBase<Real> &M,
|
||||
MatrixTransposeType transM,
|
||||
Real beta) {
|
||||
if (beta != 1.0) this->Scale(beta);
|
||||
|
||||
if (transM == kNoTrans) {
|
||||
KALDI_ASSERT(SameDim(*this, M));
|
||||
} else {
|
||||
KALDI_ASSERT(M.NumRows() == NumCols() && M.NumCols() == NumRows());
|
||||
}
|
||||
KALDI_ASSERT(v.Dim() == this->NumRows());
|
||||
|
||||
MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1, stride = stride_,
|
||||
num_rows = num_rows_, num_cols = num_cols_;
|
||||
if (transM == kTrans) std::swap(M_row_stride, M_col_stride);
|
||||
Real *data = data_;
|
||||
const Real *Mdata = M.Data(), *vdata = v.Data();
|
||||
if (num_rows_ == 0) return;
|
||||
for (MatrixIndexT i = 0; i < num_rows; i++, data += stride, Mdata += M_row_stride, vdata++)
|
||||
cblas_Xaxpy(num_cols, alpha * *vdata, Mdata, M_col_stride, data, 1);
|
||||
}
|
||||
|
||||
#if !defined(HAVE_ATLAS) && !defined(USE_KALDI_SVD)
|
||||
// ****************************************************************************
|
||||
// ****************************************************************************
|
||||
|
@ -869,6 +941,7 @@ template<typename Real> void MatrixBase<Real>::Max(const MatrixBase<Real> &A) {
|
|||
|
||||
template<typename Real> void MatrixBase<Real>::Scale(Real alpha) {
|
||||
if (alpha == 1.0) return;
|
||||
if (num_rows_ == 0) return;
|
||||
if (num_cols_ == stride_) {
|
||||
cblas_Xscal(static_cast<size_t>(num_rows_) * static_cast<size_t>(num_cols_),
|
||||
alpha, data_,1);
|
||||
|
@ -893,6 +966,58 @@ void MatrixBase<Real>::MulRowsVec(const VectorBase<Real> &scale) {
|
|||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::MulRowsGroupMat(const MatrixBase<Real> &src) {
|
||||
KALDI_ASSERT(src.NumCols() > 0 && src.NumCols() <= this->NumCols());
|
||||
KALDI_ASSERT(this->NumCols() % src.NumCols() == 0 ||
|
||||
this->NumCols() % (src.NumCols() - 1) < this->NumCols() / (src.NumCols() - 1));
|
||||
int group_size = 0;
|
||||
if (this->NumCols() % src.NumCols() == 0) {
|
||||
group_size = this->NumCols() / src.NumCols();
|
||||
} else {
|
||||
group_size = this->NumCols() / src.NumCols() + 1;
|
||||
}
|
||||
MatrixIndexT M = num_rows_, N = num_cols_;
|
||||
|
||||
for (MatrixIndexT i = 0; i < M; i++)
|
||||
for (MatrixIndexT j = 0; j < N; j++)
|
||||
(*this)(i, j) *= src(i, j / group_size);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::GroupPnormDeriv(const MatrixBase<Real> &src1,
|
||||
const MatrixBase<Real> &src2,
|
||||
Real power) {
|
||||
KALDI_ASSERT(src2.NumCols() > 0 && src2.NumCols() <= this->NumCols());
|
||||
KALDI_ASSERT(this->NumCols() % src2.NumCols() == 0 ||
|
||||
this->NumCols() % (src2.NumCols() - 1) < this->NumCols() / (src2.NumCols() - 1));
|
||||
int group_size = 0;
|
||||
if (this->NumCols() % src2.NumCols() == 0) {
|
||||
group_size = this->NumCols() / src2.NumCols();
|
||||
} else {
|
||||
group_size = this->NumCols() / src2.NumCols() + 1;
|
||||
}
|
||||
MatrixIndexT M = this->NumRows(), N = this->NumCols();
|
||||
|
||||
if (power == 1.0) {
|
||||
for (MatrixIndexT i = 0; i < M; i++)
|
||||
for (MatrixIndexT j = 0; j < N; j++)
|
||||
(*this)(i, j) = (src1(i, j) == 0 ? 0 : (src1(i, j) > 0 ? 1 : -1));
|
||||
} else {
|
||||
for (MatrixIndexT i = 0; i < M; i++) {
|
||||
for (MatrixIndexT j = 0; j < N; j++) {
|
||||
if (src2(i, j / group_size) == 0) {
|
||||
(*this)(i, j) = 0;
|
||||
} else {
|
||||
(*this)(i, j) = pow(std::abs(src1(i, j)), power - 1) *
|
||||
(src2(i, j / group_size) > 0 ? pow(src2(i, j / group_size), 1 - power) : 1) *
|
||||
(src1(i, j) >= 0 ? 1 : -1) ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real> // scales each column by scale[i].
|
||||
void MatrixBase<Real>::MulColsVec(const VectorBase<Real> &scale) {
|
||||
KALDI_ASSERT(scale.Dim() == num_cols_);
|
||||
|
@ -932,8 +1057,19 @@ void MatrixBase<Real>::SetUnit() {
|
|||
template<typename Real>
|
||||
void MatrixBase<Real>::SetRandn() {
|
||||
for (MatrixIndexT row = 0; row < num_rows_; row++) {
|
||||
for (MatrixIndexT col = 0; col < num_cols_; col++) {
|
||||
(*this)(row, col) = static_cast<Real>(kaldi::RandGauss());
|
||||
Real *row_data = this->RowData(row);
|
||||
for (MatrixIndexT col = 0; col < num_cols_; col++, row_data++) {
|
||||
*row_data = static_cast<Real>(kaldi::RandGauss());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::SetRandUniform() {
|
||||
for (MatrixIndexT row = 0; row < num_rows_; row++) {
|
||||
Real *row_data = this->RowData(row);
|
||||
for (MatrixIndexT col = 0; col < num_cols_; col++, row_data++) {
|
||||
*row_data = static_cast<Real>(kaldi::RandUniform());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1218,7 +1354,7 @@ SubMatrix<Real>::SubMatrix(Real *data,
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::Add(const Real alpha) {
|
||||
Real *data = data_;
|
||||
MatrixIndexT stride = stride_;
|
||||
|
@ -1227,8 +1363,17 @@ void MatrixBase<Real>::Add(const Real alpha) {
|
|||
data[c + stride*r] += alpha;
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::AddToDiag(const Real alpha) {
|
||||
Real *data = data_;
|
||||
MatrixIndexT this_stride = stride_ + 1,
|
||||
num_to_add = std::min(num_rows_, num_cols_);
|
||||
for (MatrixIndexT r = 0; r < num_to_add; r++)
|
||||
data[r * this_stride] += alpha;
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
|
||||
template<typename Real>
|
||||
Real MatrixBase<Real>::Cond() const {
|
||||
KALDI_ASSERT(num_rows_ > 0&&num_cols_ > 0);
|
||||
Vector<Real> singular_values(std::min(num_rows_, num_cols_));
|
||||
|
@ -1241,7 +1386,7 @@ Real MatrixBase<Real>::Cond() const {
|
|||
else return 1.0e+100;
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
Real MatrixBase<Real>::Trace(bool check_square) const {
|
||||
KALDI_ASSERT(!check_square || num_rows_ == num_cols_);
|
||||
Real ans = 0.0;
|
||||
|
@ -1249,7 +1394,7 @@ Real MatrixBase<Real>::Trace(bool check_square) const {
|
|||
return ans;
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
Real MatrixBase<Real>::Max() const {
|
||||
KALDI_ASSERT(num_rows_ > 0 && num_cols_ > 0);
|
||||
Real ans= *data_;
|
||||
|
@ -1260,7 +1405,7 @@ Real MatrixBase<Real>::Max() const {
|
|||
return ans;
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
Real MatrixBase<Real>::Min() const {
|
||||
KALDI_ASSERT(num_rows_ > 0 && num_cols_ > 0);
|
||||
Real ans= *data_;
|
||||
|
@ -1273,7 +1418,7 @@ Real MatrixBase<Real>::Min() const {
|
|||
|
||||
|
||||
|
||||
template <class Real>
|
||||
template <typename Real>
|
||||
void MatrixBase<Real>::AddMatMatMat(Real alpha,
|
||||
const MatrixBase<Real> &A, MatrixTransposeType transA,
|
||||
const MatrixBase<Real> &B, MatrixTransposeType transB,
|
||||
|
@ -1313,7 +1458,7 @@ void MatrixBase<Real>::AddMatMatMat(Real alpha,
|
|||
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U, MatrixBase<Real> *Vt) {
|
||||
// Svd, *this = U*diag(s)*Vt.
|
||||
// With (*this).num_rows_ == m, (*this).num_cols_ == n,
|
||||
|
@ -1357,7 +1502,7 @@ void MatrixBase<Real>::DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
|
|||
if (prescale != 1.0) s->Scale(1.0/prescale);
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::Svd(VectorBase<Real> *s, MatrixBase<Real> *U, MatrixBase<Real> *Vt) const {
|
||||
try {
|
||||
if (num_rows_ >= num_cols_) {
|
||||
|
@ -1380,7 +1525,7 @@ void MatrixBase<Real>::Svd(VectorBase<Real> *s, MatrixBase<Real> *U, MatrixBase<
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool MatrixBase<Real>::IsSymmetric(Real cutoff) const {
|
||||
MatrixIndexT R = num_rows_, C = num_cols_;
|
||||
if (R != C) return false;
|
||||
|
@ -1396,7 +1541,7 @@ bool MatrixBase<Real>::IsSymmetric(Real cutoff) const {
|
|||
return true;
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool MatrixBase<Real>::IsDiagonal(Real cutoff) const{
|
||||
MatrixIndexT R = num_rows_, C = num_cols_;
|
||||
Real bad_sum = 0.0, good_sum = 0.0;
|
||||
|
@ -1422,7 +1567,7 @@ void MatrixBase<Real>::TestUninitialized() const {
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool MatrixBase<Real>::IsUnit(Real cutoff) const {
|
||||
MatrixIndexT R = num_rows_, C = num_cols_;
|
||||
// if (R != C) return false;
|
||||
|
@ -1433,7 +1578,7 @@ bool MatrixBase<Real>::IsUnit(Real cutoff) const {
|
|||
return (bad_max <= cutoff);
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool MatrixBase<Real>::IsZero(Real cutoff)const {
|
||||
MatrixIndexT R = num_rows_, C = num_cols_;
|
||||
Real bad_max = 0.0;
|
||||
|
@ -1443,16 +1588,9 @@ bool MatrixBase<Real>::IsZero(Real cutoff)const {
|
|||
return (bad_max <= cutoff);
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
Real MatrixBase<Real>::FrobeniusNorm() const{
|
||||
MatrixIndexT R = num_rows_, C = num_cols_;
|
||||
Real sum = 0.0;
|
||||
for (MatrixIndexT i = 0;i < R;i++)
|
||||
for (MatrixIndexT j = 0;j < C;j++) {
|
||||
Real tmp = (*this)(i, j);
|
||||
sum += tmp*tmp;
|
||||
}
|
||||
return sqrt(sum);
|
||||
return sqrt(TraceMatMat(*this, *this, kTrans));
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
|
@ -1477,7 +1615,7 @@ bool MatrixBase<Real>::Equal(const MatrixBase<Real> &other) const {
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
Real MatrixBase<Real>::LargestAbsElem() const{
|
||||
MatrixIndexT R = num_rows_, C = num_cols_;
|
||||
Real largest = 0.0;
|
||||
|
@ -1488,7 +1626,7 @@ Real MatrixBase<Real>::LargestAbsElem() const{
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::OrthogonalizeRows() {
|
||||
KALDI_ASSERT(NumRows() <= NumCols());
|
||||
MatrixIndexT num_rows = num_rows_;
|
||||
|
@ -1529,7 +1667,7 @@ void MatrixBase<Real>::OrthogonalizeRows() {
|
|||
// Throws exception if this failed to within supplied precision (typically because *this was not
|
||||
// symmetric positive definite).
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::SymPosSemiDefEig(VectorBase<Real> *rs, MatrixBase<Real> *rU, Real check_thresh) // e.g. check_thresh = 0.001
|
||||
{
|
||||
const MatrixIndexT D = num_rows_;
|
||||
|
@ -1571,7 +1709,7 @@ void MatrixBase<Real>::SymPosSemiDefEig(VectorBase<Real> *rs, MatrixBase<Real> *
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
Real MatrixBase<Real>::LogDet(Real *det_sign) const {
|
||||
Real log_det;
|
||||
Matrix<Real> tmp(*this);
|
||||
|
@ -1579,15 +1717,15 @@ Real MatrixBase<Real>::LogDet(Real *det_sign) const {
|
|||
return log_det;
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
void MatrixBase<Real>::InvertDouble(Real *LogDet, Real *DetSign,
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::InvertDouble(Real *log_det, Real *det_sign,
|
||||
bool inverse_needed) {
|
||||
double LogDet_tmp, DetSign_tmp;
|
||||
double log_det_tmp, det_sign_tmp;
|
||||
Matrix<double> dmat(*this);
|
||||
dmat.Invert(&LogDet_tmp, &DetSign_tmp, inverse_needed);
|
||||
dmat.Invert(&log_det_tmp, &det_sign_tmp, inverse_needed);
|
||||
if (inverse_needed) (*this).CopyFromMat(dmat);
|
||||
if (LogDet) *LogDet = LogDet_tmp;
|
||||
if (DetSign) *DetSign = DetSign_tmp;
|
||||
if (log_det) *log_det = log_det_tmp;
|
||||
if (det_sign) *det_sign = det_sign_tmp;
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
|
@ -1610,7 +1748,7 @@ void MatrixBase<Real>::InvertElements() {
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::Transpose() {
|
||||
KALDI_ASSERT(num_rows_ == num_cols_);
|
||||
MatrixIndexT M = num_rows_;
|
||||
|
@ -1622,7 +1760,7 @@ void MatrixBase<Real>::Transpose() {
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void Matrix<Real>::Transpose() {
|
||||
if (this->num_rows_ != this->num_cols_) {
|
||||
Matrix<Real> tmp(*this, kTrans);
|
||||
|
@ -1633,7 +1771,7 @@ void Matrix<Real>::Transpose() {
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::ApplyFloor(Real floor_val) {
|
||||
MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
|
||||
for (MatrixIndexT i = 0; i < num_rows; i++) {
|
||||
|
@ -1643,7 +1781,7 @@ void MatrixBase<Real>::ApplyFloor(Real floor_val) {
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::ApplyCeiling(Real ceiling_val) {
|
||||
MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
|
||||
for (MatrixIndexT i = 0; i < num_rows; i++) {
|
||||
|
@ -1653,28 +1791,28 @@ void MatrixBase<Real>::ApplyCeiling(Real ceiling_val) {
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::ApplyLog() {
|
||||
for (MatrixIndexT i = 0; i < num_rows_; i++) {
|
||||
Row(i).ApplyLog();
|
||||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::ApplyExp() {
|
||||
for (MatrixIndexT i = 0; i < num_rows_; i++) {
|
||||
Row(i).ApplyExp();
|
||||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::ApplyPow(Real power) {
|
||||
for (MatrixIndexT i = 0; i < num_rows_; i++) {
|
||||
Row(i).ApplyPow(power);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::ApplyHeaviside() {
|
||||
MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
|
||||
for (MatrixIndexT i = 0; i < num_rows; i++) {
|
||||
|
@ -1685,7 +1823,7 @@ void MatrixBase<Real>::ApplyHeaviside() {
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool MatrixBase<Real>::Power(Real power) {
|
||||
KALDI_ASSERT(num_rows_ > 0 && num_rows_ == num_cols_);
|
||||
MatrixIndexT n = num_rows_;
|
||||
|
@ -1708,7 +1846,7 @@ bool MatrixBase<Real>::Power(Real power) {
|
|||
return true;
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void Matrix<Real>::Swap(Matrix<Real> *other) {
|
||||
std::swap(this->data_, other->data_);
|
||||
std::swap(this->num_cols_, other->num_cols_);
|
||||
|
@ -1733,7 +1871,7 @@ void Matrix<Real>::Swap(Matrix<Real> *other) {
|
|||
// By making the pointer arguments non-NULL or NULL, the user can choose to take
|
||||
// not to take the eigenvalues directly, and/or the matrix D which is block-diagonal
|
||||
// with 2x2 blocks.
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::Eig(MatrixBase<Real> *P,
|
||||
VectorBase<Real> *r,
|
||||
VectorBase<Real> *i) const {
|
||||
|
@ -1756,7 +1894,7 @@ void MatrixBase<Real>::Eig(MatrixBase<Real> *P,
|
|||
// INT_32 mSampSize;
|
||||
// };
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool ReadHtk(std::istream &is, Matrix<Real> *M_ptr, HtkHeader *header_ptr)
|
||||
{
|
||||
// check instantiated with double or float.
|
||||
|
@ -1856,7 +1994,7 @@ bool ReadHtk(std::istream &is, Matrix<float> *M, HtkHeader *header_ptr);
|
|||
template
|
||||
bool ReadHtk(std::istream &is, Matrix<double> *M, HtkHeader *header_ptr);
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool WriteHtk(std::ostream &os, const MatrixBase<Real> &M, HtkHeader htk_hdr) // header may be derived from a previous call to ReadHtk. Must be in binary mode.
|
||||
{
|
||||
KALDI_ASSERT(M.NumRows() == static_cast<MatrixIndexT>(htk_hdr.mNSamples));
|
||||
|
@ -1910,7 +2048,7 @@ template
|
|||
bool WriteHtk(std::ostream &os, const MatrixBase<double> &M, HtkHeader htk_hdr);
|
||||
|
||||
|
||||
template <class Real>
|
||||
template <typename Real>
|
||||
Real TraceMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
|
||||
const MatrixBase<Real> &B, MatrixTransposeType transB,
|
||||
const MatrixBase<Real> &C, MatrixTransposeType transC) {
|
||||
|
@ -1946,7 +2084,7 @@ double TraceMatMatMat(const MatrixBase<double> &A, MatrixTransposeType transA,
|
|||
const MatrixBase<double> &C, MatrixTransposeType transC);
|
||||
|
||||
|
||||
template <class Real>
|
||||
template <typename Real>
|
||||
Real TraceMatMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
|
||||
const MatrixBase<Real> &B, MatrixTransposeType transB,
|
||||
const MatrixBase<Real> &C, MatrixTransposeType transC,
|
||||
|
@ -1989,7 +2127,7 @@ double TraceMatMatMatMat(const MatrixBase<double> &A, MatrixTransposeType transA
|
|||
const MatrixBase<double> &C, MatrixTransposeType transC,
|
||||
const MatrixBase<double> &D, MatrixTransposeType transD);
|
||||
|
||||
template<class Real> void SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
|
||||
template<typename Real> void SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
|
||||
MatrixBase<Real> *Vt, bool sort_on_absolute_value) {
|
||||
/// Makes sure the Svd is sorted (from greatest to least absolute value).
|
||||
MatrixIndexT num_singval = s->Dim();
|
||||
|
@ -2031,7 +2169,7 @@ template
|
|||
void SortSvd(VectorBase<double> *s, MatrixBase<double> *U,
|
||||
MatrixBase<double> *Vt, bool);
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void CreateEigenvalueMatrix(const VectorBase<Real> &re, const VectorBase<Real> &im,
|
||||
MatrixBase<Real> *D) {
|
||||
MatrixIndexT n = re.Dim();
|
||||
|
@ -2067,7 +2205,7 @@ void CreateEigenvalueMatrix(const VectorBase<double> &re, const VectorBase<doubl
|
|||
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool AttemptComplexPower(Real *x_re, Real *x_im, Real power) {
|
||||
// Used in Matrix<Real>::Power().
|
||||
// Attempts to take the complex value x to the power "power",
|
||||
|
@ -2100,7 +2238,7 @@ bool AttemptComplexPower(double *x_re, double *x_im, double power);
|
|||
|
||||
|
||||
|
||||
template <class Real>
|
||||
template <typename Real>
|
||||
Real TraceMatMat(const MatrixBase<Real> &A,
|
||||
const MatrixBase<Real> &B,
|
||||
MatrixTransposeType trans) { // tr(A B), equivalent to sum of each element of A times same element in B'
|
||||
|
@ -2186,6 +2324,75 @@ void MatrixBase<Real>::Tanh(const MatrixBase<Real> &src) {
|
|||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::SoftHinge(const MatrixBase<Real> &src) {
|
||||
KALDI_ASSERT(SameDim(*this, src));
|
||||
int32 num_rows = num_rows_, num_cols = num_cols_;
|
||||
for (MatrixIndexT r = 0; r < num_rows; r++) {
|
||||
Real *row_data = this->RowData(r);
|
||||
const Real *src_row_data = src.RowData(r);
|
||||
for (MatrixIndexT c = 0; c < num_cols; c++) {
|
||||
Real x = src_row_data[c], y;
|
||||
if (x > 10.0) y = x; // avoid exponentiating large numbers; function
|
||||
// approaches y=x.
|
||||
else y = log1p(exp(x));
|
||||
row_data[c] = y;
|
||||
}
|
||||
}
|
||||
}
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::GroupPnorm(const MatrixBase<Real> &src, Real power) {
|
||||
int group_size = src.NumCols() / this->NumCols();
|
||||
KALDI_ASSERT(src.NumCols() == this->NumCols() * group_size);
|
||||
for (MatrixIndexT i = 0; i < src.NumRows(); i++)
|
||||
for (MatrixIndexT j = 0; j < this->NumCols(); j++)
|
||||
(*this)(i, j) = src.Row(i).Range(j * group_size, group_size).Norm(power);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
|
||||
const std::vector<MatrixIndexT> &indices) {
|
||||
KALDI_ASSERT(NumRows() == src.NumRows());
|
||||
KALDI_ASSERT(NumCols() == static_cast<MatrixIndexT>(indices.size()));
|
||||
MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
|
||||
this_stride = stride_, src_stride = src.stride_;
|
||||
Real *this_data = this->data_;
|
||||
const Real *src_data = src.data_;
|
||||
#ifdef KALDI_PARANOID
|
||||
MatrixIndexT src_cols = src.NumCols();
|
||||
for (std::vector<MatrixIndexT>::const_iterator iter = indices.begin();
|
||||
iter != indices.end(); ++iter)
|
||||
KALDI_ASSERT(*iter >= -1 && *iter < src_cols);
|
||||
#endif
|
||||
|
||||
// For the sake of memory locality we do this row by row, rather
|
||||
// than doing it column-wise using cublas_Xcopy
|
||||
for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride, src_data += src_stride) {
|
||||
const MatrixIndexT *index_ptr = &(indices[0]);
|
||||
for (MatrixIndexT c = 0; c < num_cols; c++, index_ptr++) {
|
||||
if (*index_ptr < 0) this_data[c] = 0;
|
||||
else this_data[c] = src_data[*index_ptr];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::CopyRows(const MatrixBase<Real> &src,
|
||||
const std::vector<MatrixIndexT> &indices) {
|
||||
KALDI_ASSERT(NumCols() == src.NumCols());
|
||||
KALDI_ASSERT(NumRows() == static_cast<MatrixIndexT>(indices.size()));
|
||||
MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
|
||||
this_stride = stride_;
|
||||
Real *this_data = this->data_;
|
||||
|
||||
for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride) {
|
||||
MatrixIndexT index = indices[r];
|
||||
if (index < 0) memset(this_data, 0, sizeof(Real) * num_cols_);
|
||||
else cblas_Xcopy(num_cols, src.RowData(index), 1, this_data, 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void MatrixBase<Real>::Sigmoid(const MatrixBase<Real> &src) {
|
||||
KALDI_ASSERT(SameDim(*this, src));
|
||||
|
@ -2237,8 +2444,8 @@ void MatrixBase<Real>::DiffTanh(const MatrixBase<Real> &value,
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<class OtherReal>
|
||||
template<typename Real>
|
||||
template<typename OtherReal>
|
||||
void MatrixBase<Real>::AddVecToRows(const Real alpha, const VectorBase<OtherReal> &v) {
|
||||
const MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
|
||||
stride = stride_;
|
||||
|
@ -2262,8 +2469,8 @@ template void MatrixBase<double>::AddVecToRows(const double alpha,
|
|||
const VectorBase<double> &v);
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<class OtherReal>
|
||||
template<typename Real>
|
||||
template<typename OtherReal>
|
||||
void MatrixBase<Real>::AddVecToCols(const Real alpha, const VectorBase<OtherReal> &v) {
|
||||
const MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
|
||||
stride = stride_;
|
||||
|
|
|
@ -41,7 +41,7 @@ Real TraceMatMat(const MatrixBase<Real> &A, const MatrixBase<Real> &B,
|
|||
/// Base class which provides matrix operations not involving resizing
|
||||
/// or allocation. Classes Matrix and SubMatrix inherit from it and take care
|
||||
/// of allocation and resizing.
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
class MatrixBase {
|
||||
public:
|
||||
// so this child can access protected members of other instances.
|
||||
|
@ -50,6 +50,9 @@ class MatrixBase {
|
|||
friend class CuMatrixBase<Real>;
|
||||
friend class CuMatrix<Real>;
|
||||
friend class CuSubMatrix<Real>;
|
||||
friend class CuPackedMatrix<Real>;
|
||||
|
||||
friend class PackedMatrix<Real>;
|
||||
|
||||
/// Returns number of rows (or zero for empty matrix).
|
||||
inline MatrixIndexT NumRows() const { return num_rows_; }
|
||||
|
@ -121,13 +124,16 @@ class MatrixBase {
|
|||
void SetUnit();
|
||||
/// Sets to random values of a normal distribution
|
||||
void SetRandn();
|
||||
/// Sets to numbers uniformly distributed on (0, 1)
|
||||
void SetRandUniform();
|
||||
|
||||
/* Copying functions. These do not resize the matrix! */
|
||||
|
||||
|
||||
/// Copy given matrix. (no resize is done).
|
||||
template<typename OtherReal>
|
||||
void CopyFromMat(const MatrixBase<OtherReal> & M,
|
||||
MatrixTransposeType Trans = kNoTrans);
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
/// Copy from compressed matrix.
|
||||
void CopyFromMat(const CompressedMatrix &M);
|
||||
|
@ -139,12 +145,21 @@ class MatrixBase {
|
|||
/// Copy given tpmatrix. (no resize is done).
|
||||
template<typename OtherReal>
|
||||
void CopyFromTp(const TpMatrix<OtherReal> &M,
|
||||
MatrixTransposeType Trans = kNoTrans);
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
/// Copy from CUDA matrix. Implemented in ../cudamatrix/cu-matrix.h
|
||||
template<typename OtherReal>
|
||||
void CopyFromMat(const CuMatrixBase<OtherReal> &M,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
/// Inverse of vec() operator. Copies vector into matrix, row-by-row.
|
||||
/// Note that rv.Dim() must either equal NumRows()*NumCols() or
|
||||
/// NumCols()-- this has two modes of operation.
|
||||
void CopyRowsFromVec(const VectorBase<Real> &v);
|
||||
|
||||
/// This version of CopyRowsFromVec is implemented in ../cudamatrix/cu-vector.cc
|
||||
void CopyRowsFromVec(const CuVectorBase<Real> &v);
|
||||
|
||||
template<typename OtherReal>
|
||||
void CopyRowsFromVec(const VectorBase<OtherReal> &v);
|
||||
|
||||
|
@ -225,6 +240,10 @@ class MatrixBase {
|
|||
/// each row by a scalar taken from that dimension of the vector.
|
||||
void MulRowsVec(const VectorBase<Real> &scale);
|
||||
|
||||
/// divide each row into src.NumCols() groups,
|
||||
/// and then scale i'th row's jth group of elements by src[i, j].
|
||||
void MulRowsGroupMat(const MatrixBase<Real> &src);
|
||||
|
||||
/// Returns logdet of matrix.
|
||||
Real LogDet(Real *det_sign = NULL) const;
|
||||
|
||||
|
@ -248,6 +267,22 @@ class MatrixBase {
|
|||
/// Matrix child class works also for non-square.
|
||||
void Transpose();
|
||||
|
||||
/// Copies column r from column indices[r] of src.
|
||||
/// As a special case, if indices[i] == -1, sets column i to zero.
|
||||
/// indices.size() must equal this->NumCols(),
|
||||
/// all elements of "indices" must be in [-1, src.NumCols()-1],
|
||||
/// and src.NumRows() must equal this.NumRows()
|
||||
void CopyCols(const MatrixBase<Real> &src,
|
||||
const std::vector<MatrixIndexT> &indices);
|
||||
|
||||
/// Copies row r from row indices[r] of src.
|
||||
/// As a special case, if indices[i] == -1, sets row i to zero.
|
||||
/// indices.size() must equal this->NumRows(),
|
||||
/// all elements of "indices" must be in [-1, src.NumRows()-1],
|
||||
/// and src.NumCols() must equal this.NumCols()
|
||||
void CopyRows(const MatrixBase<Real> &src,
|
||||
const std::vector<MatrixIndexT> &indices);
|
||||
|
||||
/// Applies floor to all matrix elements
|
||||
void ApplyFloor(Real floor_val);
|
||||
|
||||
|
@ -374,6 +409,24 @@ class MatrixBase {
|
|||
/// Set each element to the sigmoid of the corresponding element of "src".
|
||||
void Sigmoid(const MatrixBase<Real> &src);
|
||||
|
||||
/// Set each element to y = log(1 + exp(x))
|
||||
void SoftHinge(const MatrixBase<Real> &src);
|
||||
|
||||
/// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j ^ (power)) ^ (1 / power)
|
||||
/// where G = x.NumCols() / y.NumCols() must be an integer.
|
||||
void GroupPnorm(const MatrixBase<Real> &src, Real power);
|
||||
|
||||
|
||||
/// Calculate derivatives for the GroupPnorm function above...
|
||||
/// if "input" is the input to the GroupPnorm function above (i.e. the "src" variable),
|
||||
/// and "output" is the result of the computation (i.e. the "this" of that function
|
||||
/// call), and *this has the same dimension as "input", then it sets each element
|
||||
/// of *this to the derivative d(output-elem)/d(input-elem) for each element of "input", where
|
||||
/// "output-elem" is whichever element of output depends on that input element.
|
||||
void GroupPnormDeriv(const MatrixBase<Real> &input, const MatrixBase<Real> &output,
|
||||
Real power);
|
||||
|
||||
|
||||
/// Set each element to the tanh of the corresponding element of "src".
|
||||
void Tanh(const MatrixBase<Real> &src);
|
||||
|
||||
|
@ -406,25 +459,40 @@ class MatrixBase {
|
|||
/// Add a scalar to each element
|
||||
void Add(const Real alpha);
|
||||
|
||||
/// Add a scalar to each diagonal element.
|
||||
void AddToDiag(const Real alpha);
|
||||
|
||||
/// *this += alpha * a * b^T
|
||||
template<class OtherReal>
|
||||
template<typename OtherReal>
|
||||
void AddVecVec(const Real alpha, const VectorBase<OtherReal> &a,
|
||||
const VectorBase<OtherReal> &b);
|
||||
|
||||
/// [each row of *this] += alpha * v
|
||||
template<class OtherReal>
|
||||
template<typename OtherReal>
|
||||
void AddVecToRows(const Real alpha, const VectorBase<OtherReal> &v);
|
||||
|
||||
/// [each col of *this] += alpha * v
|
||||
template<class OtherReal>
|
||||
template<typename OtherReal>
|
||||
void AddVecToCols(const Real alpha, const VectorBase<OtherReal> &v);
|
||||
|
||||
/// *this += alpha * M [or M^T]
|
||||
void AddMat(const Real alpha, const MatrixBase<Real> &M,
|
||||
MatrixTransposeType transA = kNoTrans);
|
||||
|
||||
/// *this = beta * *this + alpha * M M^T, for symmetric matrices. It only
|
||||
/// updates the lower triangle of *this. It will leave the matrix asymmetric;
|
||||
/// if you need it symmetric as a regular matrix, do CopyLowerToUpper().
|
||||
void SymAddMat2(const Real alpha, const MatrixBase<Real> &M,
|
||||
MatrixTransposeType transA, Real beta);
|
||||
|
||||
/// *this = beta * *this + alpha * diag(v) * M [or M^T].
|
||||
/// The same as adding M but scaling each row M_i by v(i).
|
||||
void AddDiagVecMat(const Real alpha, VectorBase<Real> &v,
|
||||
const MatrixBase<Real> &M, MatrixTransposeType transM,
|
||||
Real beta = 1.0);
|
||||
|
||||
/// *this += alpha * S
|
||||
template<class OtherReal>
|
||||
template<typename OtherReal>
|
||||
void AddSp(const Real alpha, const SpMatrix<OtherReal> &S);
|
||||
|
||||
void AddMatMat(const Real alpha,
|
||||
|
@ -512,6 +580,12 @@ class MatrixBase {
|
|||
const SpMatrix<Real>& A, const SpMatrix<Real>& B,
|
||||
const Real beta);
|
||||
|
||||
/// Copy lower triangle to upper triangle (symmetrize)
|
||||
void CopyLowerToUpper();
|
||||
|
||||
/// Copy upper triangle to lower triangle (symmetrize)
|
||||
void CopyUpperToLower();
|
||||
|
||||
/// This function orthogonalizes the rows of a matrix using the Gram-Schmidt
|
||||
/// process. It is only applicable if NumRows() <= NumCols(). It will use
|
||||
/// random number generation to fill in rows with something nonzero, in cases
|
||||
|
@ -580,7 +654,7 @@ class MatrixBase {
|
|||
};
|
||||
|
||||
/// A class for storing matrices.
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
class Matrix : public MatrixBase<Real> {
|
||||
public:
|
||||
|
||||
|
@ -589,12 +663,23 @@ class Matrix : public MatrixBase<Real> {
|
|||
|
||||
/// Basic constructor. Sets to zero by default.
|
||||
/// If resize_type == kUndefined, memory contents are undefined.
|
||||
Matrix(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type = kSetZero):
|
||||
Matrix(const MatrixIndexT r, const MatrixIndexT c,
|
||||
MatrixResizeType resize_type = kSetZero):
|
||||
MatrixBase<Real>() { Resize(r, c, resize_type); }
|
||||
|
||||
/// Copy constructor from CUDA matrix
|
||||
/// This is defined in ../cudamatrix/cu-matrix.h
|
||||
template<typename OtherReal>
|
||||
explicit Matrix(const CuMatrixBase<OtherReal> &cu,
|
||||
MatrixTransposeType trans = kNoTrans);
|
||||
|
||||
|
||||
/// Swaps the contents of *this and *other. Shallow swap.
|
||||
void Swap(Matrix<Real> *other);
|
||||
|
||||
/// Defined in ../cudamatrix/cu-matrix.cc
|
||||
void Swap(CuMatrix<Real> *mat);
|
||||
|
||||
/// Constructor from any MatrixBase. Can also copy with transpose.
|
||||
/// Allocates new memory.
|
||||
explicit Matrix(const MatrixBase<Real> & M,
|
||||
|
@ -707,11 +792,11 @@ struct HtkHeader {
|
|||
};
|
||||
|
||||
// Read HTK formatted features from file into matrix.
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool ReadHtk(std::istream &is, Matrix<Real> *M, HtkHeader *header_ptr);
|
||||
|
||||
// Write (HTK format) features to file from matrix.
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool WriteHtk(std::ostream &os, const MatrixBase<Real> &M, HtkHeader htk_hdr);
|
||||
|
||||
|
||||
|
@ -764,19 +849,32 @@ class SubMatrix : public MatrixBase<Real> {
|
|||
|
||||
// Some declarations. These are traces of products.
|
||||
|
||||
|
||||
template<typename Real>
|
||||
bool ApproxEqual(const MatrixBase<Real> &A,
|
||||
const MatrixBase<Real> &B, Real tol = 0.01) {
|
||||
return A.ApproxEqual(B, tol);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
inline void AssertEqual(MatrixBase<Real> &A, MatrixBase<Real> &B,
|
||||
float tol = 0.01) {
|
||||
KALDI_ASSERT(A.ApproxEqual(B, tol));
|
||||
}
|
||||
|
||||
/// Returns trace of matrix.
|
||||
template <class Real>
|
||||
template <typename Real>
|
||||
double TraceMat(const MatrixBase<Real> &A) { return A.Trace(); }
|
||||
|
||||
|
||||
/// Returns tr(A B C)
|
||||
template <class Real>
|
||||
template <typename Real>
|
||||
Real TraceMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
|
||||
const MatrixBase<Real> &B, MatrixTransposeType transB,
|
||||
const MatrixBase<Real> &C, MatrixTransposeType transC);
|
||||
|
||||
/// Returns tr(A B C D)
|
||||
template <class Real>
|
||||
template <typename Real>
|
||||
Real TraceMatMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
|
||||
const MatrixBase<Real> &B, MatrixTransposeType transB,
|
||||
const MatrixBase<Real> &C, MatrixTransposeType transC,
|
||||
|
@ -796,7 +894,7 @@ Real TraceMatMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
|
|||
/// otherwise, moving the columns of U, if it exists, and the rows of Vt, if it
|
||||
/// exists around in the same way. Note: the "absolute value" part won't matter
|
||||
/// if this is an actual SVD, since singular values are non-negative.
|
||||
template<class Real> void SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
|
||||
template<typename Real> void SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
|
||||
MatrixBase<Real>* Vt = NULL,
|
||||
bool sort_on_absolute_value = true);
|
||||
|
||||
|
@ -806,7 +904,7 @@ template<class Real> void SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
|
|||
/// 2x2 block [lambda, mu; -mu, lambda].
|
||||
/// This function will throw if any complex eigenvalues are not in complex conjugate
|
||||
/// pairs (or the members of such pairs are not consecutively numbered).
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void CreateEigenvalueMatrix(const VectorBase<Real> &real, const VectorBase<Real> &imag,
|
||||
MatrixBase<Real> *D);
|
||||
|
||||
|
@ -814,7 +912,7 @@ void CreateEigenvalueMatrix(const VectorBase<Real> &real, const VectorBase<Real>
|
|||
/// declare it here mainly for the testing code to see. It takes a complex value to
|
||||
/// a power using a method that will work for noninteger powers (but will fail if the
|
||||
/// complex value is real and negative).
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool AttemptComplexPower(Real *x_re, Real *x_im, Real power);
|
||||
|
||||
|
||||
|
@ -834,7 +932,7 @@ template<typename Real>
|
|||
std::istream & operator >> (std::istream & In, Matrix<Real> & M);
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool SameDim(const MatrixBase<Real> &M, const MatrixBase<Real> &N) {
|
||||
return (M.NumRows() == N.NumRows() && M.NumCols() == N.NumCols());
|
||||
}
|
||||
|
|
|
@ -45,7 +45,7 @@ template
|
|||
double VecVec<>(const VectorBase<double> &a,
|
||||
const VectorBase<double> &b);
|
||||
|
||||
template<class Real, class OtherReal>
|
||||
template<typename Real, typename OtherReal>
|
||||
Real VecVec(const VectorBase<Real> &ra,
|
||||
const VectorBase<OtherReal> &rb) {
|
||||
MatrixIndexT adim = ra.Dim();
|
||||
|
@ -470,20 +470,25 @@ Real VectorBase<Real>::Norm(Real p) const {
|
|||
return sqrt(sum);
|
||||
} else {
|
||||
Real tmp;
|
||||
bool ok = true;
|
||||
for (MatrixIndexT i = 0; i < dim_; i++) {
|
||||
tmp = pow(std::abs(data_[i]), p);
|
||||
if (tmp == HUGE_VAL) { // HUGE_VAL is what pow returns on error.
|
||||
KALDI_ERR << "Could not raise element " << i << "to power " << p
|
||||
<< ": returned value = " << tmp;
|
||||
}
|
||||
if (tmp == HUGE_VAL) // HUGE_VAL is what pow returns on error.
|
||||
ok = false;
|
||||
sum += tmp;
|
||||
}
|
||||
tmp = pow(sum, static_cast<Real>(1.0/p));
|
||||
if (tmp == HUGE_VAL) { // HUGE_VAL is what errno returns on error.
|
||||
KALDI_ERR << "Could not take the " << p << "-th root of " << sum
|
||||
<< "; returned value = " << tmp;
|
||||
}
|
||||
return tmp;
|
||||
KALDI_ASSERT(tmp != HUGE_VAL); // should not happen here.
|
||||
if (ok) {
|
||||
return tmp;
|
||||
} else {
|
||||
Real maximum = this->Max(), minimum = this->Min(),
|
||||
max_abs = std::max(maximum, -minimum);
|
||||
KALDI_ASSERT(max_abs > 0); // Or should not have reached here.
|
||||
Vector<Real> tmp(*this);
|
||||
tmp.Scale(1.0 / max_abs);
|
||||
return tmp.Norm(p) * max_abs;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -612,9 +617,7 @@ void VectorBase<double>::CopyColFromMat(const MatrixBase<double> &mat, MatrixInd
|
|||
template<typename Real>
|
||||
void VectorBase<Real>::CopyDiagFromMat(const MatrixBase<Real> &M) {
|
||||
KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols()));
|
||||
for (MatrixIndexT i = 0; i < dim_; i++)
|
||||
data_[i] = M(i, i);
|
||||
// could make this more efficient.
|
||||
cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, 1);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
|
@ -774,12 +777,13 @@ MatrixIndexT VectorBase<Real>::ApplyFloor(const VectorBase<Real> &floor_vec) {
|
|||
|
||||
template<typename Real>
|
||||
Real VectorBase<Real>::ApplySoftMax() {
|
||||
Real max = this->Max(), sum = 0.0;
|
||||
Real max = this->Max(), sum = 0.0;
|
||||
for (MatrixIndexT i = 0; i < dim_; i++) {
|
||||
sum += (data_[i] = exp(data_[i] - max));
|
||||
}
|
||||
this->Scale(1.0 / sum);
|
||||
return max + log(sum);
|
||||
|
||||
}
|
||||
|
||||
#ifdef HAVE_MKL
|
||||
|
@ -868,7 +872,12 @@ void VectorBase<Real>::MulElements(const VectorBase<Real> &v) {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename Real> // Set each element to y = (x == orig ? changed : x).
|
||||
void VectorBase<Real>::ReplaceValue(Real orig, Real changed) {
|
||||
Real *data = data_;
|
||||
for (MatrixIndexT i = 0; i < dim_; i++)
|
||||
if (data[i] == orig) data[i] = changed;
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
|
@ -1136,7 +1145,7 @@ void VectorBase<Real>::Write(std::ostream & os, bool binary) const {
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void VectorBase<Real>::AddVec2(const Real alpha, const VectorBase<Real> &v) {
|
||||
KALDI_ASSERT(dim_ == v.dim_);
|
||||
for (MatrixIndexT i = 0; i < dim_; i++)
|
||||
|
@ -1144,7 +1153,7 @@ void VectorBase<Real>::AddVec2(const Real alpha, const VectorBase<Real> &v) {
|
|||
}
|
||||
|
||||
// this <-- beta*this + alpha*M*v.
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void VectorBase<Real>::AddTpVec(const Real alpha, const TpMatrix<Real> &M,
|
||||
const MatrixTransposeType trans,
|
||||
const VectorBase<Real> &v,
|
||||
|
@ -1162,7 +1171,7 @@ void VectorBase<Real>::AddTpVec(const Real alpha, const TpMatrix<Real> &M,
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
Real VecMatVec(const VectorBase<Real> &v1, const MatrixBase<Real> &M,
|
||||
const VectorBase<Real> &v2) {
|
||||
KALDI_ASSERT(v1.Dim() == M.NumRows() && v2.Dim() == M.NumCols());
|
||||
|
@ -1178,7 +1187,7 @@ template
|
|||
double VecMatVec(const VectorBase<double> &v1, const MatrixBase<double> &M,
|
||||
const VectorBase<double> &v2);
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void Vector<Real>::Swap(Vector<Real> *other) {
|
||||
std::swap(this->data_, other->data_);
|
||||
std::swap(this->dim_, other->dim_);
|
||||
|
@ -1209,6 +1218,29 @@ void VectorBase<Real>::AddDiagMat2(
|
|||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void VectorBase<Real>::AddDiagMatMat(
|
||||
Real alpha,
|
||||
const MatrixBase<Real> &M, MatrixTransposeType transM,
|
||||
const MatrixBase<Real> &N, MatrixTransposeType transN,
|
||||
Real beta) {
|
||||
MatrixIndexT dim = this->dim_,
|
||||
M_col_dim = (transM == kTrans ? M.NumRows() : M.NumCols()),
|
||||
N_row_dim = (transN == kTrans ? N.NumCols() : N.NumRows());
|
||||
KALDI_ASSERT(M_col_dim == N_row_dim); // this is the dimension we sum over
|
||||
MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1;
|
||||
if (transM == kTrans) std::swap(M_row_stride, M_col_stride);
|
||||
MatrixIndexT N_row_stride = N.Stride(), N_col_stride = 1;
|
||||
if (transN == kTrans) std::swap(N_row_stride, N_col_stride);
|
||||
|
||||
Real *data = this->data_;
|
||||
const Real *Mdata = M.Data(), *Ndata = N.Data();
|
||||
for (MatrixIndexT i = 0; i < dim; i++, Mdata += M_row_stride, Ndata += N_col_stride, data++) {
|
||||
*data = beta * *data + alpha * cblas_Xdot(M_col_dim, Mdata, M_col_stride, Ndata, N_row_stride);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template class Vector<float>;
|
||||
template class Vector<double>;
|
||||
template class VectorBase<float>;
|
||||
|
@ -1216,5 +1248,3 @@ template class VectorBase<double>;
|
|||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -109,6 +109,11 @@ class VectorBase {
|
|||
template<typename OtherReal>
|
||||
void CopyFromVec(const VectorBase<OtherReal> &v);
|
||||
|
||||
/// Copy from CuVector. This is defined in ../cudamatrix/cu-vector.h
|
||||
template<typename OtherReal>
|
||||
void CopyFromVec(const CuVectorBase<OtherReal> &v);
|
||||
|
||||
|
||||
/// Apply natural log to all elements. Throw if any element of
|
||||
/// the vector is negative (but doesn't complain about zero; the
|
||||
/// log will be -infinity
|
||||
|
@ -157,7 +162,7 @@ class VectorBase {
|
|||
|
||||
/// Add vector : *this = *this + alpha * rv (with casting between floats and
|
||||
/// doubles)
|
||||
template<class OtherReal>
|
||||
template<typename OtherReal>
|
||||
void AddVec(const Real alpha, const VectorBase<OtherReal> &v);
|
||||
|
||||
/// Add vector : *this = *this + alpha * rv^2 [element-wise squaring].
|
||||
|
@ -165,7 +170,7 @@ class VectorBase {
|
|||
|
||||
/// Add vector : *this = *this + alpha * rv^2 [element-wise squaring],
|
||||
/// with casting between floats and doubles.
|
||||
template<class OtherReal>
|
||||
template<typename OtherReal>
|
||||
void AddVec2(const Real alpha, const VectorBase<OtherReal> &v);
|
||||
|
||||
/// Add matrix times vector : this <-- beta*this + alpha*M*v.
|
||||
|
@ -192,6 +197,9 @@ class VectorBase {
|
|||
const MatrixTransposeType trans, const VectorBase<Real> &v,
|
||||
const Real beta); // **beta previously defaulted to 0.0**
|
||||
|
||||
/// Set each element to y = (x == orig ? changed : x).
|
||||
void ReplaceValue(Real orig, Real changed);
|
||||
|
||||
/// Multiply element-by-element by another vector.
|
||||
void MulElements(const VectorBase<Real> &v);
|
||||
/// Multiply element-by-element by another vector of different type.
|
||||
|
@ -228,6 +236,8 @@ class VectorBase {
|
|||
template<typename OtherReal>
|
||||
void CopyRowsFromMat(const MatrixBase<OtherReal> &M);
|
||||
|
||||
/// The following is implemented in ../cudamatrix/cu-matrix.cc
|
||||
void CopyRowsFromMat(const CuMatrixBase<Real> &M);
|
||||
|
||||
/// Performs a column stack of the matrix M
|
||||
void CopyColsFromMat(const MatrixBase<Real> &M);
|
||||
|
@ -292,6 +302,13 @@ class VectorBase {
|
|||
void AddDiagMat2(Real alpha, const MatrixBase<Real> &M,
|
||||
MatrixTransposeType trans = kNoTrans, Real beta = 1.0);
|
||||
|
||||
/// Add the diagonal of a matrix product: *this = beta * *this + alpha * diag(M N), assuming the
|
||||
/// "trans" arguments are both kNoTrans; for transpose arguments, it behaves
|
||||
/// as you would expect.
|
||||
void AddDiagMatMat(Real alpha, const MatrixBase<Real> &M, MatrixTransposeType transM,
|
||||
const MatrixBase<Real> &N, MatrixTransposeType transN,
|
||||
Real beta = 1.0);
|
||||
|
||||
/// Returns log(sum(exp())) without exp overflow
|
||||
/// If prune > 0.0, ignores terms less than the max - prune.
|
||||
/// [Note: in future, if prune = 0.0, it will take the max.
|
||||
|
@ -354,6 +371,11 @@ class Vector: public VectorBase<Real> {
|
|||
MatrixResizeType resize_type = kSetZero)
|
||||
: VectorBase<Real>() { Resize(s, resize_type); }
|
||||
|
||||
/// Copy constructor from CUDA vector
|
||||
/// This is defined in ../cudamatrix/cu-vector.h
|
||||
template<typename OtherReal>
|
||||
explicit Vector(const CuVectorBase<OtherReal> &cu);
|
||||
|
||||
/// Copy constructor. The need for this is controversial.
|
||||
Vector(const Vector<Real> &v) : VectorBase<Real>() { // (cannot be explicit)
|
||||
Resize(v.Dim(), kUndefined);
|
||||
|
@ -432,7 +454,7 @@ class Vector: public VectorBase<Real> {
|
|||
|
||||
/// Represents a non-allocating general vector which can be defined
|
||||
/// as a sub-vector of higher-level vector [or as the row of a matrix].
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
class SubVector : public VectorBase<Real> {
|
||||
public:
|
||||
/// Constructor from a Vector or SubVector.
|
||||
|
@ -506,6 +528,20 @@ std::istream & operator >> (std::istream & in, Vector<Real> & v);
|
|||
/// \addtogroup matrix_funcs_scalar
|
||||
/// @{
|
||||
|
||||
|
||||
template<typename Real>
|
||||
bool ApproxEqual(const VectorBase<Real> &a,
|
||||
const VectorBase<Real> &b, Real tol = 0.01) {
|
||||
return a.ApproxEqual(b, tol);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
inline void AssertEqual(VectorBase<Real> &a, VectorBase<Real> &b,
|
||||
float tol = 0.01) {
|
||||
KALDI_ASSERT(a.ApproxEqual(b, tol));
|
||||
}
|
||||
|
||||
|
||||
/// Returns dot product between v1 and v2.
|
||||
template<typename Real>
|
||||
Real VecVec(const VectorBase<Real> &v1, const VectorBase<Real> &v2);
|
||||
|
@ -516,7 +552,7 @@ Real VecVec(const VectorBase<Real> &v1, const VectorBase<OtherReal> &v2);
|
|||
|
||||
/// Returns \f$ v_1^T M v_2 \f$ .
|
||||
/// Not as efficient as it could be where v1 == v2.
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
Real VecMatVec(const VectorBase<Real> &v1, const MatrixBase<Real> &M,
|
||||
const VectorBase<Real> &v2);
|
||||
|
||||
|
|
|
@ -38,6 +38,12 @@ typedef enum {
|
|||
kCopyData
|
||||
} MatrixResizeType;
|
||||
|
||||
typedef enum {
|
||||
kTakeLower,
|
||||
kTakeUpper,
|
||||
kTakeMean,
|
||||
kTakeMeanAndCheck
|
||||
} SpCopyType;
|
||||
|
||||
template<typename Real> class VectorBase;
|
||||
template<typename Real> class Vector;
|
||||
|
@ -57,6 +63,9 @@ template<typename Real> class CuMatrix;
|
|||
template<typename Real> class CuVectorBase;
|
||||
template<typename Real> class CuSubVector;
|
||||
template<typename Real> class CuVector;
|
||||
template<typename Real> class CuPackedMatrix;
|
||||
template<typename Real> class CuSpMatrix;
|
||||
template<typename Real> class CuTpMatrix;
|
||||
|
||||
class CompressedMatrix;
|
||||
|
||||
|
|
|
@ -28,14 +28,14 @@
|
|||
namespace kaldi {
|
||||
|
||||
//! ComplexMul implements, inline, the complex multiplication b *= a.
|
||||
template<class Real> inline void ComplexMul(const Real &a_re, const Real &a_im,
|
||||
template<typename Real> inline void ComplexMul(const Real &a_re, const Real &a_im,
|
||||
Real *b_re, Real *b_im) {
|
||||
Real tmp_re = (*b_re * a_re) - (*b_im * a_im);
|
||||
*b_im = *b_re * a_im + *b_im * a_re;
|
||||
*b_re = tmp_re;
|
||||
}
|
||||
|
||||
template<class Real> inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
|
||||
template<typename Real> inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
|
||||
const Real &b_re, const Real &b_im,
|
||||
Real *c_re, Real *c_im) {
|
||||
*c_re += b_re*a_re - b_im*a_im;
|
||||
|
@ -43,7 +43,7 @@ template<class Real> inline void ComplexAddProduct(const Real &a_re, const Real
|
|||
}
|
||||
|
||||
|
||||
template<class Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im) {
|
||||
template<typename Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im) {
|
||||
*a_re = std::cos(x);
|
||||
*a_im = std::sin(x);
|
||||
}
|
||||
|
|
|
@ -26,7 +26,7 @@
|
|||
|
||||
namespace kaldi {
|
||||
|
||||
template<class Real> void ComplexFt (const VectorBase<Real> &in,
|
||||
template<typename Real> void ComplexFt (const VectorBase<Real> &in,
|
||||
VectorBase<Real> *out, bool forward) {
|
||||
int exp_sign = (forward ? -1 : 1);
|
||||
KALDI_ASSERT(out != NULL);
|
||||
|
@ -93,7 +93,7 @@ void ComplexFt (const VectorBase<double> &in,
|
|||
//! of the recursion.
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void ComplexFftRecursive (Real *data, int nffts, int N,
|
||||
const int *factor_begin,
|
||||
const int *factor_end, bool forward,
|
||||
|
@ -331,7 +331,7 @@ void ComplexFftRecursive (Real *data, int nffts, int N,
|
|||
|
||||
// This is the outer-layer calling code for ComplexFftRecursive.
|
||||
// It factorizes the dimension and then calls the FFT routine.
|
||||
template<class Real> void ComplexFft(VectorBase<Real> *v, bool forward, Vector<Real> *tmp_in) {
|
||||
template<typename Real> void ComplexFft(VectorBase<Real> *v, bool forward, Vector<Real> *tmp_in) {
|
||||
KALDI_ASSERT(v != NULL);
|
||||
|
||||
if (v->Dim()<=1) return;
|
||||
|
@ -347,7 +347,7 @@ template<class Real> void ComplexFft(VectorBase<Real> *v, bool forward, Vector<R
|
|||
}
|
||||
|
||||
//! Inefficient version of Fourier transform, for testing purposes.
|
||||
template<class Real> void RealFftInefficient (VectorBase<Real> *v, bool forward) {
|
||||
template<typename Real> void RealFftInefficient (VectorBase<Real> *v, bool forward) {
|
||||
KALDI_ASSERT(v != NULL);
|
||||
MatrixIndexT N = v->Dim();
|
||||
KALDI_ASSERT(N%2 == 0);
|
||||
|
@ -388,7 +388,7 @@ void ComplexFft(VectorBase<double> *v, bool forward, Vector<double> *tmp_in);
|
|||
|
||||
|
||||
// See the long comment below for the math behind this.
|
||||
template<class Real> void RealFft (VectorBase<Real> *v, bool forward) {
|
||||
template<typename Real> void RealFft (VectorBase<Real> *v, bool forward) {
|
||||
KALDI_ASSERT(v != NULL);
|
||||
MatrixIndexT N = v->Dim(), N2 = N/2;
|
||||
KALDI_ASSERT(N%2 == 0);
|
||||
|
@ -589,7 +589,7 @@ so: re(D_k)= 1/2 (im(B_k) + im(B_{N/2-k})) (z2)
|
|||
|
||||
*/
|
||||
|
||||
template<class Real> void ComputeDctMatrix(Matrix<Real> *M) {
|
||||
template<typename Real> void ComputeDctMatrix(Matrix<Real> *M) {
|
||||
//KALDI_ASSERT(M->NumRows() == M->NumCols());
|
||||
MatrixIndexT K = M->NumRows();
|
||||
MatrixIndexT N = M->NumCols();
|
||||
|
@ -612,7 +612,7 @@ template void ComputeDctMatrix(Matrix<float> *M);
|
|||
template void ComputeDctMatrix(Matrix<double> *M);
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixExponential<Real>::Clear() {
|
||||
N_ = 0;
|
||||
P_.Resize(0, 0);
|
||||
|
@ -620,7 +620,7 @@ void MatrixExponential<Real>::Clear() {
|
|||
powers_.clear();
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixExponential<Real>::Compute(const MatrixBase<Real> &M,
|
||||
MatrixBase<Real> *X) {
|
||||
// does *X = exp(M)
|
||||
|
@ -650,7 +650,7 @@ void MatrixExponential<Real>::Compute(const MatrixBase<Real> &M,
|
|||
(*X)(i, i) += 1.0;
|
||||
};
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixExponential<Real>::Compute(const SpMatrix<Real> &M,
|
||||
SpMatrix<Real> *X) {
|
||||
Matrix<Real> Mfull(M), Xfull(M.NumRows(), M.NumCols());
|
||||
|
@ -659,7 +659,7 @@ void MatrixExponential<Real>::Compute(const SpMatrix<Real> &M,
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
MatrixIndexT MatrixExponential<Real>::ComputeN(const MatrixBase<Real> &M) {
|
||||
// Computes the power of two we want to use. Aim to get
|
||||
// AScaled.FrobeniusNorm() < 1/10.
|
||||
|
@ -674,7 +674,7 @@ MatrixIndexT MatrixExponential<Real>::ComputeN(const MatrixBase<Real> &M) {
|
|||
return N;
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixExponential<Real>::ComputeTaylor(const MatrixBase<Real> &P, MatrixBase<Real> *B0) {
|
||||
KALDI_ASSERT(P.FrobeniusNorm() < 1.001); // should actually be << 1
|
||||
// for this to work fast enough.
|
||||
|
@ -710,7 +710,7 @@ void MatrixExponential<Real>::ComputeTaylor(const MatrixBase<Real> &P, MatrixBas
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixExponential<Real>::Backprop(const MatrixBase<Real> &hX,
|
||||
MatrixBase<Real> *hM) const {
|
||||
MatrixIndexT dim = P_.NumRows();
|
||||
|
@ -747,7 +747,7 @@ void MatrixExponential<Real>::Backprop(const MatrixBase<Real> &hX,
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixExponential<Real>::Backprop(const SpMatrix<Real> &hX,
|
||||
SpMatrix<Real> *hM) const {
|
||||
Matrix<Real> hXfull(hX), hMfull(hX.NumRows(), hX.NumCols());
|
||||
|
@ -756,7 +756,7 @@ void MatrixExponential<Real>::Backprop(const SpMatrix<Real> &hX,
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void MatrixExponential<Real>::BackpropTaylor(const MatrixBase<Real> &hB0,
|
||||
MatrixBase<Real> *hP) const {
|
||||
// Backprop through the Taylor-series computation.
|
||||
|
@ -819,7 +819,7 @@ template class MatrixExponential<float>;
|
|||
template class MatrixExponential<double>;
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void ComputePca(const MatrixBase<Real> &X,
|
||||
MatrixBase<Real> *U,
|
||||
MatrixBase<Real> *A,
|
||||
|
@ -861,7 +861,7 @@ void ComputePca(const MatrixBase<Real> &X,
|
|||
A->AddMatMat(1.0, X, kNoTrans, *U, kTrans, 0.0);
|
||||
} else { // Do inner-product PCA.
|
||||
SpMatrix<Real> Nsp(N); // Matrix of inner products.
|
||||
Nsp.AddMat2(1.0, X, kNoTrans); // M <-- X X^T
|
||||
Nsp.AddMat2(1.0, X, kNoTrans, 0.0); // M <-- X X^T
|
||||
|
||||
Matrix<Real> Vtmp;
|
||||
Vector<Real> l;
|
||||
|
@ -929,7 +929,7 @@ void ComputePca(const MatrixBase<double> &X,
|
|||
// Added by Dan, Feb. 13 2012.
|
||||
// This function does: *plus += max(0, a b^T),
|
||||
// *minus += max(0, -(a b^T)).
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void AddOuterProductPlusMinus(Real alpha,
|
||||
const VectorBase<Real> &a,
|
||||
const VectorBase<Real> &b,
|
||||
|
|
|
@ -59,12 +59,12 @@ namespace kaldi {
|
|||
in some contexts, the transform is made symmetric by multiplying
|
||||
by sqrt(N) in both passes. The user can do this by themselves.
|
||||
*/
|
||||
template<class Real> void ComplexFft (VectorBase<Real> *v, bool forward, Vector<Real> *tmp_work = NULL);
|
||||
template<typename Real> void ComplexFft (VectorBase<Real> *v, bool forward, Vector<Real> *tmp_work = NULL);
|
||||
|
||||
/// ComplexFt is the same as ComplexFft but it implements the Fourier
|
||||
/// transform in an inefficient way. It is mainly included for testing purposes.
|
||||
/// See comment for ComplexFft to describe the input and outputs and what it does.
|
||||
template<class Real> void ComplexFt (const VectorBase<Real> &in,
|
||||
template<typename Real> void ComplexFt (const VectorBase<Real> &in,
|
||||
VectorBase<Real> *out, bool forward);
|
||||
|
||||
/// RealFft is a fourier transform of real inputs. Internally it uses
|
||||
|
@ -76,12 +76,12 @@ template<class Real> void ComplexFt (const VectorBase<Real> &in,
|
|||
/// The interpretation of the complex-FFT data is as follows: the array
|
||||
/// is a sequence of complex numbers C_n of length N/2 with (real, im) format,
|
||||
/// i.e. [real0, real_{N/2}, real1, im1, real2, im2, real3, im3, ...].
|
||||
template<class Real> void RealFft (VectorBase<Real> *v, bool forward);
|
||||
template<typename Real> void RealFft (VectorBase<Real> *v, bool forward);
|
||||
|
||||
|
||||
/// RealFftInefficient has the same input and output format as RealFft above, but it is
|
||||
/// an inefficient implementation included for testing purposes.
|
||||
template<class Real> void RealFftInefficient (VectorBase<Real> *v, bool forward);
|
||||
template<typename Real> void RealFftInefficient (VectorBase<Real> *v, bool forward);
|
||||
|
||||
/// ComputeDctMatrix computes a matrix corresponding to the DCT, such that
|
||||
/// M * v equals the DCT of vector v. M must be square at input.
|
||||
|
@ -97,21 +97,21 @@ template<class Real> void RealFftInefficient (VectorBase<Real> *v, bool forward)
|
|||
/// because it was this way from the start and changing it would affect the
|
||||
/// feature generation.
|
||||
|
||||
template<class Real> void ComputeDctMatrix(Matrix<Real> *M);
|
||||
template<typename Real> void ComputeDctMatrix(Matrix<Real> *M);
|
||||
|
||||
|
||||
/// ComplexMul implements, inline, the complex multiplication b *= a.
|
||||
template<class Real> inline void ComplexMul(const Real &a_re, const Real &a_im,
|
||||
template<typename Real> inline void ComplexMul(const Real &a_re, const Real &a_im,
|
||||
Real *b_re, Real *b_im);
|
||||
|
||||
/// ComplexAddProduct implements, inline, the complex operation c += (a * b).
|
||||
template<class Real> inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
|
||||
template<typename Real> inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
|
||||
const Real &b_re, const Real &b_im,
|
||||
Real *c_re, Real *c_im);
|
||||
|
||||
|
||||
/// ComplexImExp implements a <-- exp(i x), inline.
|
||||
template<class Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im);
|
||||
template<typename Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im);
|
||||
|
||||
|
||||
// This class allows you to compute the matrix exponential function
|
||||
|
@ -122,7 +122,7 @@ template<class Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im);
|
|||
// It also provides a function that allows you do back-propagate the
|
||||
// derivative of a scalar function through this calculation.
|
||||
// The
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
class MatrixExponential {
|
||||
public:
|
||||
MatrixExponential() { }
|
||||
|
@ -194,7 +194,7 @@ class MatrixExponential {
|
|||
method.
|
||||
*/
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void ComputePca(const MatrixBase<Real> &X,
|
||||
MatrixBase<Real> *U,
|
||||
MatrixBase<Real> *A,
|
||||
|
@ -205,14 +205,14 @@ void ComputePca(const MatrixBase<Real> &X,
|
|||
|
||||
// This function does: *plus += max(0, a b^T),
|
||||
// *minus += max(0, -(a b^T)).
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void AddOuterProductPlusMinus(Real alpha,
|
||||
const VectorBase<Real> &a,
|
||||
const VectorBase<Real> &b,
|
||||
MatrixBase<Real> *plus,
|
||||
MatrixBase<Real> *minus);
|
||||
|
||||
template<class Real1, class Real2>
|
||||
template<typename Real1, typename Real2>
|
||||
inline void AssertSameDim(const MatrixBase<Real1> &mat1, const MatrixBase<Real2> &mat2) {
|
||||
KALDI_ASSERT(mat1.NumRows() == mat2.NumRows()
|
||||
&& mat1.NumCols() == mat2.NumCols());
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -28,7 +28,7 @@ namespace kaldi {
|
|||
|
||||
// Below, N&W refers to Nocedal and Wright, "Numerical Optimization", 2nd Ed.
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
OptimizeLbfgs<Real>::OptimizeLbfgs(const VectorBase<Real> &x,
|
||||
const LbfgsOptions &opts):
|
||||
opts_(opts), k_(0), computation_state_(kBeforeStep), H_was_set_(false) {
|
||||
|
@ -48,7 +48,7 @@ OptimizeLbfgs<Real>::OptimizeLbfgs(const VectorBase<Real> &x,
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
Real OptimizeLbfgs<Real>::RecentStepLength() const {
|
||||
size_t n = step_lengths_.size();
|
||||
if (n == 0) return std::numeric_limits<Real>::infinity();
|
||||
|
@ -63,7 +63,7 @@ Real OptimizeLbfgs<Real>::RecentStepLength() const {
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void OptimizeLbfgs<Real>::ComputeHifNeeded(const VectorBase<Real> &gradient) {
|
||||
if (k_ == 0) {
|
||||
if (H_.Dim() == 0) {
|
||||
|
@ -107,7 +107,7 @@ void OptimizeLbfgs<Real>::ComputeHifNeeded(const VectorBase<Real> &gradient) {
|
|||
// This represents the first 2 lines of Algorithm 7.5 (N&W), which
|
||||
// in fact is mostly a call to Algorithm 7.4.
|
||||
// Note: this is valid whether we are minimizing or maximizing.
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void OptimizeLbfgs<Real>::ComputeNewDirection(Real function_value,
|
||||
const VectorBase<Real> &gradient) {
|
||||
KALDI_ASSERT(computation_state_ == kBeforeStep);
|
||||
|
@ -166,7 +166,7 @@ void OptimizeLbfgs<Real>::ComputeNewDirection(Real function_value,
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool OptimizeLbfgs<Real>::AcceptStep(Real function_value,
|
||||
const VectorBase<Real> &gradient) {
|
||||
// Save s_k = x_{k+1} - x_{k}, and y_k = \nabla f_{k+1} - \nabla f_k.
|
||||
|
@ -200,7 +200,7 @@ bool OptimizeLbfgs<Real>::AcceptStep(Real function_value,
|
|||
return true; // We successfully accepted the step.
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void OptimizeLbfgs<Real>::RecordStepLength(Real s) {
|
||||
step_lengths_.push_back(s);
|
||||
if (step_lengths_.size() > static_cast<size_t>(opts_.avg_step_length))
|
||||
|
@ -208,7 +208,7 @@ void OptimizeLbfgs<Real>::RecordStepLength(Real s) {
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void OptimizeLbfgs<Real>::Restart(const VectorBase<Real> &x,
|
||||
Real f,
|
||||
const VectorBase<Real> &gradient) {
|
||||
|
@ -231,7 +231,7 @@ void OptimizeLbfgs<Real>::Restart(const VectorBase<Real> &x,
|
|||
ComputeNewDirection(f, gradient);
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void OptimizeLbfgs<Real>::StepSizeIteration(Real function_value,
|
||||
const VectorBase<Real> &gradient) {
|
||||
KALDI_VLOG(3) << "In step size iteration, function value changed "
|
||||
|
@ -376,7 +376,7 @@ void OptimizeLbfgs<Real>::StepSizeIteration(Real function_value,
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void OptimizeLbfgs<Real>::DoStep(Real function_value,
|
||||
const VectorBase<Real> &gradient) {
|
||||
if (opts_.minimize ? function_value < best_f_ : function_value > best_f_) {
|
||||
|
@ -389,7 +389,7 @@ void OptimizeLbfgs<Real>::DoStep(Real function_value,
|
|||
StepSizeIteration(function_value, gradient);
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void OptimizeLbfgs<Real>::DoStep(Real function_value,
|
||||
const VectorBase<Real> &gradient,
|
||||
const VectorBase<Real> &diag_approx_2nd_deriv) {
|
||||
|
@ -408,7 +408,7 @@ void OptimizeLbfgs<Real>::DoStep(Real function_value,
|
|||
DoStep(function_value, gradient);
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
const VectorBase<Real>&
|
||||
OptimizeLbfgs<Real>::GetValue(Real *objf_value) const {
|
||||
if (objf_value != NULL) *objf_value = best_f_;
|
||||
|
|
|
@ -83,7 +83,7 @@ struct LbfgsOptions {
|
|||
avg_step_length(4) { }
|
||||
};
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
class OptimizeLbfgs {
|
||||
public:
|
||||
/// Initializer takes the starting value of x.
|
||||
|
|
|
@ -36,12 +36,6 @@ void PackedMatrix<Real>::Scale(Real alpha) {
|
|||
cblas_Xscal(sz, alpha, data_, 1);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void PackedMatrix<Real>::AddVec2(const Real alpha, const Vector<Real> &rv) {
|
||||
KALDI_ASSERT(rv.Dim() == num_rows_);
|
||||
cblas_Xspr(rv.Dim(), alpha, rv.Data(), 1, data_);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void PackedMatrix<Real>::AddPacked(const Real alpha, const PackedMatrix<Real> &rMa) {
|
||||
KALDI_ASSERT(num_rows_ == rMa.NumRows());
|
||||
|
@ -50,7 +44,7 @@ void PackedMatrix<Real>::AddPacked(const Real alpha, const PackedMatrix<Real> &r
|
|||
cblas_Xaxpy(sz, alpha, rMa.Data(), 1, data_, 1);
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void PackedMatrix<Real>::SetRandn() {
|
||||
Real *data = data_;
|
||||
size_t dim = num_rows_, size = ((dim*(dim+1))/2);
|
||||
|
@ -89,6 +83,12 @@ void PackedMatrix<Real>::Swap(PackedMatrix<Real> *other) {
|
|||
std::swap(num_rows_, other->num_rows_);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void PackedMatrix<Real>::Swap(Matrix<Real> *other) {
|
||||
std::swap(data_, other->data_);
|
||||
std::swap(num_rows_, other->num_rows_);
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void PackedMatrix<Real>::Resize(MatrixIndexT r, MatrixResizeType resize_type) {
|
||||
|
@ -119,6 +119,15 @@ void PackedMatrix<Real>::Resize(MatrixIndexT r, MatrixResizeType resize_type) {
|
|||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void PackedMatrix<Real>::AddToDiag(Real r) {
|
||||
Real *ptr = data_;
|
||||
for (MatrixIndexT i = 2; i <= num_rows_+1; i++) {
|
||||
*ptr += r;
|
||||
ptr += i;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
void PackedMatrix<Real>::ScaleDiag(Real alpha) {
|
||||
Real *ptr = data_;
|
||||
|
@ -138,6 +147,7 @@ void PackedMatrix<Real>::SetDiag(Real alpha) {
|
|||
}
|
||||
|
||||
|
||||
|
||||
template<typename Real>
|
||||
template<typename OtherReal>
|
||||
void PackedMatrix<Real>::CopyFromPacked(const PackedMatrix<OtherReal> &orig) {
|
||||
|
@ -221,35 +231,45 @@ void PackedMatrix<Real>::Destroy() {
|
|||
num_rows_ = 0;
|
||||
}
|
||||
|
||||
|
||||
template<typename Real>
|
||||
void PackedMatrix<Real>::Write(std::ostream &os, bool binary) const {
|
||||
if (!os.good()) {
|
||||
KALDI_ERR << "Failed to write vector to stream: stream not good";
|
||||
}
|
||||
std::string my_token = (sizeof(Real) == 4 ? "FP" : "DP");
|
||||
|
||||
WriteToken(os, binary, my_token);
|
||||
|
||||
int32 size = this->NumRows(); // make the size 32-bit on disk.
|
||||
KALDI_ASSERT(this->NumRows() == (MatrixIndexT) size);
|
||||
WriteBasicType(os, binary, size);
|
||||
MatrixIndexT num_elems = ((size+1)*(MatrixIndexT)size)/2;
|
||||
|
||||
if(binary) {
|
||||
std::string my_token = (sizeof(Real) == 4 ? "FP" : "DP");
|
||||
WriteToken(os, binary, my_token);
|
||||
WriteBasicType(os, binary, size);
|
||||
// We don't use the built-in Kaldi write routines for the floats, as they are
|
||||
// not efficient enough.
|
||||
MatrixIndexT num_elems = ((size+1)*(MatrixIndexT)size)/2;
|
||||
if (!binary) {
|
||||
for (MatrixIndexT i = 0; i < num_elems; i++)
|
||||
WriteBasicType(os, binary, data_[i]);
|
||||
os << '\n';
|
||||
} else {
|
||||
os.write((const char*) data_, sizeof(Real) * num_elems);
|
||||
}
|
||||
else {
|
||||
if(size == 0)
|
||||
os<<"[ ]\n";
|
||||
else {
|
||||
os<<"[\n";
|
||||
MatrixIndexT i = 0;
|
||||
for (int32 j = 0; j < size; j++) {
|
||||
for (int32 k = 0; k < j + 1; k++) {
|
||||
WriteBasicType(os, binary, data_[i++]);
|
||||
}
|
||||
os << ( (j==size-1)? "]\n" : "\n");
|
||||
}
|
||||
KALDI_ASSERT(i == num_elems);
|
||||
}
|
||||
}
|
||||
if (os.fail()) {
|
||||
KALDI_ERR << "Failed to write packed matrix to stream";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// template<typename Real>
|
||||
// void Save (std::ostream & os, const PackedMatrix<Real>& rM)
|
||||
// {
|
||||
|
@ -275,7 +295,7 @@ void PackedMatrix<Real>::Write(std::ostream &os, bool binary) const {
|
|||
|
||||
|
||||
template<typename Real>
|
||||
void PackedMatrix<Real>::Read(std::istream & is, bool binary, bool add) {
|
||||
void PackedMatrix<Real>::Read(std::istream& is, bool binary, bool add) {
|
||||
if (add) {
|
||||
PackedMatrix<Real> tmp;
|
||||
tmp.Read(is, binary, false); // read without adding.
|
||||
|
@ -295,6 +315,8 @@ void PackedMatrix<Real>::Read(std::istream & is, bool binary, bool add) {
|
|||
MatrixIndexT pos_at_start = is.tellg();
|
||||
int peekval = Peek(is, binary);
|
||||
const char *my_token = (sizeof(Real) == 4 ? "FP" : "DP");
|
||||
const char *new_format_token = "[";
|
||||
bool is_new_format = false;//added by hxu
|
||||
char other_token_start = (sizeof(Real) == 4 ? 'D' : 'F');
|
||||
int32 size;
|
||||
MatrixIndexT num_elems;
|
||||
|
@ -310,25 +332,93 @@ void PackedMatrix<Real>::Read(std::istream & is, bool binary, bool add) {
|
|||
std::string token;
|
||||
ReadToken(is, binary, &token);
|
||||
if (token != my_token) {
|
||||
specific_error << ": Expected token " << my_token << ", got " << token;
|
||||
goto bad;
|
||||
}
|
||||
ReadBasicType(is, binary, &size); // throws on error.
|
||||
if ((MatrixIndexT)size != this->NumRows()) {
|
||||
KALDI_ASSERT(size>=0);
|
||||
this->Resize(size);
|
||||
}
|
||||
num_elems = ((size+1)*(MatrixIndexT)size)/2;
|
||||
if (!binary) {
|
||||
for (MatrixIndexT i = 0; i < num_elems; i++) {
|
||||
ReadBasicType(is, false, data_+i); // will throw on error.
|
||||
if(token != new_format_token) {
|
||||
specific_error << ": Expected token " << my_token << ", got " << token;
|
||||
goto bad;
|
||||
}
|
||||
} else {
|
||||
if (num_elems)
|
||||
is.read(reinterpret_cast<char*>(data_), sizeof(Real)*num_elems);
|
||||
//new format it is
|
||||
is_new_format = true;
|
||||
}
|
||||
if(!is_new_format) {
|
||||
ReadBasicType(is, binary, &size); // throws on error.
|
||||
if ((MatrixIndexT)size != this->NumRows()) {
|
||||
KALDI_ASSERT(size>=0);
|
||||
this->Resize(size);
|
||||
}
|
||||
num_elems = ((size+1)*(MatrixIndexT)size)/2;
|
||||
if (!binary) {
|
||||
for (MatrixIndexT i = 0; i < num_elems; i++) {
|
||||
ReadBasicType(is, false, data_+i); // will throw on error.
|
||||
}
|
||||
} else {
|
||||
if (num_elems)
|
||||
is.read(reinterpret_cast<char*>(data_), sizeof(Real)*num_elems);
|
||||
}
|
||||
if (is.fail()) goto bad;
|
||||
return;
|
||||
}
|
||||
else {
|
||||
std::vector<Real> data;
|
||||
while(1) {
|
||||
int32 num_lines = 0;
|
||||
int i = is.peek();
|
||||
if (i == -1) { specific_error << "Got EOF while reading matrix data"; goto bad; }
|
||||
else if (static_cast<char>(i) == ']') { // Finished reading matrix.
|
||||
is.get(); // eat the "]".
|
||||
i = is.peek();
|
||||
if (static_cast<char>(i) == '\r') {
|
||||
is.get();
|
||||
is.get(); // get \r\n (must eat what we wrote)
|
||||
}// I don't actually understand what it's doing here
|
||||
else if (static_cast<char>(i) == '\n') { is.get(); } // get \n (must eat what we wrote)
|
||||
|
||||
if (is.fail()) {
|
||||
KALDI_WARN << "After end of matrix data, read error.";
|
||||
// we got the data we needed, so just warn for this error.
|
||||
}
|
||||
//now process the data:
|
||||
num_lines = int32(sqrt(data.size()*2));
|
||||
|
||||
KALDI_ASSERT(data.size() == num_lines*(num_lines+1)/2);
|
||||
|
||||
this->Resize(num_lines);
|
||||
|
||||
//std::cout<<data.size()<<' '<<num_lines<<'\n';
|
||||
|
||||
for(int32 i = 0; i < data.size(); i++) {
|
||||
data_[i] = data[i];
|
||||
}
|
||||
return;
|
||||
//std::cout<<"here!!!!!hxu!!!!!"<<std::endl;
|
||||
}
|
||||
else if ( (i >= '0' && i <= '9') || i == '-' ) { // A number...
|
||||
Real r;
|
||||
is >> r;
|
||||
if (is.fail()) {
|
||||
specific_error << "Stream failure/EOF while reading matrix data.";
|
||||
goto bad;
|
||||
}
|
||||
data.push_back(r);
|
||||
}
|
||||
else if (isspace(i)) {
|
||||
is.get(); // eat the space and do nothing.
|
||||
} else { // NaN or inf or error.
|
||||
std::string str;
|
||||
is >> str;
|
||||
if (!KALDI_STRCASECMP(str.c_str(), "inf") ||
|
||||
!KALDI_STRCASECMP(str.c_str(), "infinity")) {
|
||||
data.push_back(std::numeric_limits<Real>::infinity());
|
||||
KALDI_WARN << "Reading infinite value into matrix.";
|
||||
} else if (!KALDI_STRCASECMP(str.c_str(), "nan")) {
|
||||
data.push_back(std::numeric_limits<Real>::quiet_NaN());
|
||||
KALDI_WARN << "Reading NaN value into matrix.";
|
||||
} else {
|
||||
specific_error << "Expecting numeric matrix data, got " << str;
|
||||
goto bad;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (is.fail()) goto bad;
|
||||
return;
|
||||
bad:
|
||||
KALDI_ERR << "Failed to read packed matrix from stream. " << specific_error
|
||||
<< " File position at start is "
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
// matrix/packed-matrix.h
|
||||
|
||||
// Copyright 2009-2012 Ondrej Glembek; Lukas Burget; Microsoft Corporation;
|
||||
// Saarland University; Yanmin Qian; Johns Hopkins University (Author: Daniel Povey)
|
||||
// Copyright 2009-2013 Ondrej Glembek; Lukas Burget; Microsoft Corporation;
|
||||
// Saarland University; Yanmin Qian;
|
||||
// Johns Hopkins University (Author: Daniel Povey)
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
//
|
||||
|
@ -37,28 +38,29 @@ std::ostream & operator <<(std::ostream & out, const PackedMatrix<Real>& M);
|
|||
|
||||
/// @brief Packed matrix: base class for triangular and symmetric matrices.
|
||||
template<typename Real> class PackedMatrix {
|
||||
friend class CuPackedMatrix<Real>;
|
||||
public:
|
||||
//friend class CuPackedMatrix<Real>;
|
||||
|
||||
PackedMatrix() : data_(NULL), num_rows_(0) {}
|
||||
|
||||
explicit PackedMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero):
|
||||
data_(NULL) { Resize(r, resize_type); }
|
||||
|
||||
explicit PackedMatrix(const PackedMatrix<Real> &orig) : data_(NULL) {
|
||||
Resize(orig.num_rows_);
|
||||
Resize(orig.num_rows_, kUndefined);
|
||||
CopyFromPacked(orig);
|
||||
}
|
||||
|
||||
template<class OtherReal>
|
||||
template<typename OtherReal>
|
||||
explicit PackedMatrix(const PackedMatrix<OtherReal> &orig) : data_(NULL) {
|
||||
Resize(orig.NumRows());
|
||||
Resize(orig.NumRows(), kUndefined);
|
||||
CopyFromPacked(orig);
|
||||
}
|
||||
|
||||
void SetZero();
|
||||
void SetZero(); /// < Set to zero
|
||||
void SetUnit(); /// < Set to unit matrix.
|
||||
|
||||
/// Sets to random values of a normal distribution
|
||||
void SetRandn();
|
||||
void SetRandn(); /// < Set to random values of a normal distribution
|
||||
|
||||
Real Trace() const;
|
||||
|
||||
|
@ -82,17 +84,19 @@ template<typename Real> class PackedMatrix {
|
|||
/// This function takes time proportional to the number of data elements.
|
||||
void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero);
|
||||
|
||||
void AddToDiag(const Real r); // Adds r to diagonal
|
||||
|
||||
void ScaleDiag(const Real alpha); // Scales diagonal by alpha.
|
||||
|
||||
void SetDiag(const Real alpha); // Sets diagonal to this value.
|
||||
|
||||
template<class OtherReal>
|
||||
template<typename OtherReal>
|
||||
void CopyFromPacked(const PackedMatrix<OtherReal> &orig);
|
||||
|
||||
|
||||
/// CopyFromVec just interprets the vector as having the same layout
|
||||
/// as the packed matrix. Must have the same dimension, i.e.
|
||||
/// orig.Dim() == (NumRows()*(NumRows()+1)) / 2;
|
||||
template<class OtherReal>
|
||||
template<typename OtherReal>
|
||||
void CopyFromVec(const SubVector<OtherReal> &orig);
|
||||
|
||||
Real* Data() { return data_; }
|
||||
|
@ -104,6 +108,8 @@ template<typename Real> class PackedMatrix {
|
|||
return ((nr * (nr+1)) / 2) * sizeof(Real);
|
||||
}
|
||||
|
||||
//MatrixIndexT Stride() const { return stride_; }
|
||||
|
||||
// This code is duplicated in child classes to avoid extra levels of calls.
|
||||
Real operator() (MatrixIndexT r, MatrixIndexT c) const {
|
||||
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
|
||||
|
@ -134,10 +140,6 @@ template<typename Real> class PackedMatrix {
|
|||
return * (std::min_element(data_, data_ + ((num_rows_*(num_rows_+1))/2) ));
|
||||
}
|
||||
|
||||
|
||||
// *this <-- *this + alpha* rV * rV^T.
|
||||
// The "2" in the name is because the argument is repeated.
|
||||
void AddVec2(const Real alpha, const Vector<Real> &rv);
|
||||
void Scale(Real c);
|
||||
|
||||
friend std::ostream & operator << <> (std::ostream & out,
|
||||
|
@ -147,18 +149,20 @@ template<typename Real> class PackedMatrix {
|
|||
void Read(std::istream &in, bool binary, bool add = false);
|
||||
|
||||
void Write(std::ostream &out, bool binary) const;
|
||||
// binary = true is not yet supported.
|
||||
|
||||
|
||||
void Destroy();
|
||||
|
||||
/// Swaps the contents of *this and *other. Shallow swap.
|
||||
void Swap(PackedMatrix<Real> *other);
|
||||
void Swap(Matrix<Real> *other);
|
||||
|
||||
|
||||
protected:
|
||||
// Will only be called from this class or derived classes.
|
||||
void AddPacked(const Real alpha, const PackedMatrix<Real>& M);
|
||||
Real *data_;
|
||||
MatrixIndexT num_rows_;
|
||||
//MatrixIndexT stride_;
|
||||
private:
|
||||
/// Init assumes the current contents of the class are invalid (i.e. junk or
|
||||
/// has already been freed), and it sets the matrix to newly allocated memory
|
||||
|
@ -189,9 +193,5 @@ std::istream & operator >> (std::istream &is, PackedMatrix<Real> &M) {
|
|||
|
||||
} // namespace kaldi
|
||||
|
||||
|
||||
// Including the implementation
|
||||
#include "matrix/packed-matrix-inl.h"
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -37,7 +37,7 @@ namespace kaldi {
|
|||
x is the input of dimension dim, v is the output of dimension
|
||||
dim, and beta is a scalar. Note: we use zero-based
|
||||
not one-based indexing. */
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void House(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
|
||||
KALDI_ASSERT(dim > 0);
|
||||
// To avoid overflow, we first compute the max of x_ (or
|
||||
|
@ -84,7 +84,7 @@ void House(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
|
|||
// the vector that is "special". This is convenient in
|
||||
// the Tridiagonalize routine that uses reversed indexes for
|
||||
// compatibility with the packed lower triangular format.
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void HouseBackward(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
|
||||
KALDI_ASSERT(dim > 0);
|
||||
// To avoid overflow, we first compute the max of x_ (or
|
||||
|
@ -138,7 +138,7 @@ void HouseBackward(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
|
|||
Caution: Q is transposed vs. Golub and Van Loan.
|
||||
If Q != NULL it outputs Q.
|
||||
*/
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void SpMatrix<Real>::Tridiagonalize(MatrixBase<Real> *Q) {
|
||||
MatrixIndexT n = this->NumRows();
|
||||
KALDI_ASSERT(Q == NULL || (Q->NumRows() == n &&
|
||||
|
@ -194,7 +194,7 @@ template
|
|||
void SpMatrix<double>::Tridiagonalize(MatrixBase<double> *Q);
|
||||
|
||||
/// Create Givens rotations, as in Golub and Van Loan 3rd ed., page 216.
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
inline void Givens(Real a, Real b, Real *c, Real *s) {
|
||||
if (b == 0) {
|
||||
*c = 1;
|
||||
|
@ -218,7 +218,7 @@ inline void Givens(Real a, Real b, Real *c, Real *s) {
|
|||
// with Wilkinson shift." A couple of differences: this code is
|
||||
// in zero based arithmetic, and we represent Q transposed from
|
||||
// their Q for memory locality with row-major-indexed matrices.
|
||||
template <class Real>
|
||||
template <typename Real>
|
||||
void QrStep(MatrixIndexT n,
|
||||
Real *diag,
|
||||
Real *off_diag,
|
||||
|
@ -294,7 +294,7 @@ void QrStep(MatrixIndexT n,
|
|||
// Internal code for the QR algorithm, where the diagonal
|
||||
// and off-diagonal of the symmetric matrix are represented as
|
||||
// vectors of length n and n-1.
|
||||
template <class Real>
|
||||
template <typename Real>
|
||||
void QrInternal(MatrixIndexT n,
|
||||
Real *diag,
|
||||
Real *off_diag,
|
||||
|
@ -372,7 +372,7 @@ void QrInternal(MatrixIndexT n,
|
|||
This is the symmetric QR algorithm, from Golub and Van Loan 3rd ed., Algorithm
|
||||
8.3.3. Q is transposed w.r.t. there, though.
|
||||
*/
|
||||
template <class Real>
|
||||
template <typename Real>
|
||||
void SpMatrix<Real>::Qr(MatrixBase<Real> *Q) {
|
||||
KALDI_ASSERT(this->IsTridiagonal());
|
||||
// We envisage that Q would be square but we don't check for this,
|
||||
|
@ -396,7 +396,7 @@ void SpMatrix<Real>::Qr(MatrixBase<Real> *Q) {
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void SpMatrix<Real>::Eig(VectorBase<Real> *s, MatrixBase<Real> *P) const {
|
||||
MatrixIndexT dim = this->NumRows();
|
||||
KALDI_ASSERT(s->Dim() == dim);
|
||||
|
@ -417,7 +417,7 @@ void SpMatrix<Real>::Eig(VectorBase<Real> *s, MatrixBase<Real> *P) const {
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void SpMatrix<Real>::TopEigs(VectorBase<Real> *s, MatrixBase<Real> *P,
|
||||
MatrixIndexT lanczos_dim) const {
|
||||
const SpMatrix<Real> &S(*this); // call this "S" for easy notation.
|
||||
|
|
|
@ -193,7 +193,7 @@ void SpMatrix<Real>::CopyFromMat(const MatrixBase<Real> &M,
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
Real SpMatrix<Real>::Trace() const {
|
||||
const Real *data = this->data_;
|
||||
MatrixIndexT num_rows = this->num_rows_;
|
||||
|
@ -204,8 +204,8 @@ Real SpMatrix<Real>::Trace() const {
|
|||
}
|
||||
|
||||
// diagonal update, this <-- this + diag(v)
|
||||
template<class Real>
|
||||
template<class OtherReal>
|
||||
template<typename Real>
|
||||
template<typename OtherReal>
|
||||
void SpMatrix<Real>::AddVec(const Real alpha, const VectorBase<OtherReal> &v) {
|
||||
int32 num_rows = this->num_rows_;
|
||||
KALDI_ASSERT(num_rows == v.Dim() && num_rows > 0);
|
||||
|
@ -316,7 +316,7 @@ void SpMatrix<Real>::Invert(Real *logdet, Real *det_sign, bool need_inverse) {
|
|||
}
|
||||
#else
|
||||
// in the ATLAS case, these are not implemented using a library and we back off to something else.
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void SpMatrix<Real>::Invert(Real *logdet, Real *det_sign, bool need_inverse) {
|
||||
Matrix<Real> M(this->NumRows(), this->NumCols());
|
||||
M.CopyFromSp(*this);
|
||||
|
@ -481,7 +481,7 @@ double TraceMatSpMatSp(const MatrixBase<double> &A, MatrixTransposeType transA,
|
|||
MatrixTransposeType transC, const SpMatrix<double> &D);
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool SpMatrix<Real>::IsDiagonal(Real cutoff) const {
|
||||
MatrixIndexT R = this->NumRows();
|
||||
Real bad_sum = 0.0, good_sum = 0.0;
|
||||
|
@ -496,7 +496,7 @@ bool SpMatrix<Real>::IsDiagonal(Real cutoff) const {
|
|||
return (!(bad_sum > good_sum * cutoff));
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool SpMatrix<Real>::IsUnit(Real cutoff) const {
|
||||
MatrixIndexT R = this->NumRows();
|
||||
Real max = 0.0; // max error
|
||||
|
@ -507,7 +507,7 @@ bool SpMatrix<Real>::IsUnit(Real cutoff) const {
|
|||
return (max <= cutoff);
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool SpMatrix<Real>::IsTridiagonal(Real cutoff) const {
|
||||
MatrixIndexT R = this->NumRows();
|
||||
Real max_abs_2diag = 0.0, max_abs_offdiag = 0.0;
|
||||
|
@ -523,13 +523,13 @@ bool SpMatrix<Real>::IsTridiagonal(Real cutoff) const {
|
|||
return (max_abs_offdiag <= cutoff * max_abs_2diag);
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool SpMatrix<Real>::IsZero(Real cutoff) const {
|
||||
if (this->num_rows_ == 0) return true;
|
||||
return (this->Max() <= cutoff && this->Min() >= -cutoff);
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
Real SpMatrix<Real>::FrobeniusNorm() const {
|
||||
Real sum = 0.0;
|
||||
MatrixIndexT R = this->NumRows();
|
||||
|
@ -541,14 +541,14 @@ Real SpMatrix<Real>::FrobeniusNorm() const {
|
|||
return sqrt(sum);
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
bool SpMatrix<Real>::ApproxEqual(const SpMatrix<Real> &other, float tol) const {
|
||||
if (this->NumRows() != other.NumRows())
|
||||
KALDI_ERR << "SpMatrix::AproxEqual, size mismatch, "
|
||||
<< this->NumRows() << " vs. " << other.NumRows();
|
||||
SpMatrix<Real> tmp(*this);
|
||||
tmp.AddSp(-1.0, other);
|
||||
return (tmp.FrobeniusNorm() <= tol * this->FrobeniusNorm());
|
||||
return (tmp.FrobeniusNorm() <= tol * std::max(this->FrobeniusNorm(), other.FrobeniusNorm()));
|
||||
}
|
||||
|
||||
// function Floor: A = Floor(B, alpha * C) ... see tutorial document.
|
||||
|
@ -600,7 +600,7 @@ int SpMatrix<Real>::ApplyFloor(const SpMatrix<Real> &C, Real alpha,
|
|||
return nfloored;
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
Real SpMatrix<Real>::LogDet(Real *det_sign) const {
|
||||
Real log_det;
|
||||
SpMatrix<Real> tmp(*this);
|
||||
|
@ -648,7 +648,7 @@ MatrixIndexT SpMatrix<Real>::LimitCond(Real maxCond, bool invert) { // e.g. max
|
|||
s(i) = sqrt(std::max(s(i), floor));
|
||||
}
|
||||
P.MulColsVec(s);
|
||||
(*this).AddMat2(1.0, P, kNoTrans); // (*this) = P*P^T. ... (*this) = P * floor(s) * P^T ... if P was original P.
|
||||
(*this).AddMat2(1.0, P, kNoTrans, 0.0); // (*this) = P*P^T. ... (*this) = P * floor(s) * P^T ... if P was original P.
|
||||
return nfloored;
|
||||
}
|
||||
|
||||
|
@ -965,8 +965,8 @@ void SpMatrix<double>::AddVec2(const double alpha, const VectorBase<double> &v)
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<class OtherReal>
|
||||
template<typename Real>
|
||||
template<typename OtherReal>
|
||||
void SpMatrix<Real>::AddVec2(const Real alpha, const VectorBase<OtherReal> &v) {
|
||||
KALDI_ASSERT(v.Dim() == this->NumRows());
|
||||
Real *data = this->data_;
|
||||
|
@ -984,7 +984,7 @@ template
|
|||
void SpMatrix<double>::AddVec2(const double alpha, const VectorBase<float> &v);
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
Real VecSpVec(const VectorBase<Real> &v1, const SpMatrix<Real> &M,
|
||||
const VectorBase<Real> &v2) {
|
||||
MatrixIndexT D = M.NumRows();
|
||||
|
@ -1002,7 +1002,7 @@ double VecSpVec(const VectorBase<double> &v1, const SpMatrix<double> &M,
|
|||
const VectorBase<double> &v2);
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void SpMatrix<Real>::AddMat2Sp(
|
||||
const Real alpha, const MatrixBase<Real> &M,
|
||||
MatrixTransposeType transM, const SpMatrix<Real> &A, const Real beta) {
|
||||
|
@ -1046,7 +1046,7 @@ void SpMatrix<Real>::AddMat2Sp(
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void SpMatrix<Real>::AddSmat2Sp(
|
||||
const Real alpha, const MatrixBase<Real> &M,
|
||||
MatrixTransposeType transM, const SpMatrix<Real> &A,
|
||||
|
@ -1101,7 +1101,7 @@ void SpMatrix<Real>::AddSmat2Sp(
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void SpMatrix<Real>::AddMat2Vec(const Real alpha,
|
||||
const MatrixBase<Real> &M,
|
||||
MatrixTransposeType transM,
|
||||
|
@ -1130,7 +1130,7 @@ void SpMatrix<Real>::AddMat2Vec(const Real alpha,
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void SpMatrix<Real>::AddMat2(const Real alpha, const MatrixBase<Real> &M,
|
||||
MatrixTransposeType transM, const Real beta) {
|
||||
KALDI_ASSERT((transM == kNoTrans && this->NumRows() == M.NumRows())
|
||||
|
@ -1159,7 +1159,7 @@ void SpMatrix<Real>::AddMat2(const Real alpha, const MatrixBase<Real> &M,
|
|||
this->CopyFromMat(temp_mat, kTakeLower);
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void SpMatrix<Real>::AddTp2Sp(const Real alpha, const TpMatrix<Real> &T,
|
||||
MatrixTransposeType transM, const SpMatrix<Real> &A,
|
||||
const Real beta) {
|
||||
|
@ -1167,7 +1167,7 @@ void SpMatrix<Real>::AddTp2Sp(const Real alpha, const TpMatrix<Real> &T,
|
|||
AddMat2Sp(alpha, Tmat, transM, A, beta);
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void SpMatrix<Real>::AddVecVec(const Real alpha, const VectorBase<Real> &v,
|
||||
const VectorBase<Real> &w) {
|
||||
int32 dim = this->NumRows();
|
||||
|
@ -1176,7 +1176,7 @@ void SpMatrix<Real>::AddVecVec(const Real alpha, const VectorBase<Real> &v,
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void SpMatrix<Real>::AddTp2(const Real alpha, const TpMatrix<Real> &T,
|
||||
MatrixTransposeType transM, const Real beta) {
|
||||
Matrix<Real> Tmat(T);
|
||||
|
@ -1191,7 +1191,7 @@ template class SpMatrix<float>;
|
|||
template class SpMatrix<double>;
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
Real TraceSpSpLower(const SpMatrix<Real> &A, const SpMatrix<Real> &B) {
|
||||
MatrixIndexT adim = A.NumRows();
|
||||
KALDI_ASSERT(adim == B.NumRows());
|
||||
|
|
|
@ -28,14 +28,6 @@
|
|||
|
||||
namespace kaldi {
|
||||
|
||||
/// \weakgroup matrix_funcs_misc
|
||||
typedef enum {
|
||||
kTakeLower,
|
||||
kTakeUpper,
|
||||
kTakeMean,
|
||||
kTakeMeanAndCheck
|
||||
} SpCopyType;
|
||||
|
||||
|
||||
/// \addtogroup matrix_group
|
||||
/// @{
|
||||
|
@ -47,19 +39,25 @@ template<typename Real> class SpMatrix;
|
|||
*/
|
||||
template<typename Real>
|
||||
class SpMatrix : public PackedMatrix<Real> {
|
||||
friend class CuSpMatrix<Real>;
|
||||
public:
|
||||
// so it can use our assignment operator.
|
||||
friend class std::vector<Matrix<Real> >;
|
||||
|
||||
SpMatrix(): PackedMatrix<Real>() {}
|
||||
|
||||
/// Copy constructor from CUDA version of SpMatrix
|
||||
/// This is defined in ../cudamatrix/cu-sp-matrix.h
|
||||
|
||||
explicit SpMatrix(const CuSpMatrix<Real> &cu);
|
||||
|
||||
explicit SpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero)
|
||||
: PackedMatrix<Real>(r, resize_type) {}
|
||||
|
||||
SpMatrix(const SpMatrix<Real> &orig)
|
||||
: PackedMatrix<Real>(orig) {}
|
||||
|
||||
template<class OtherReal>
|
||||
template<typename OtherReal>
|
||||
explicit SpMatrix(const SpMatrix<OtherReal> &orig)
|
||||
: PackedMatrix<Real>(orig) {}
|
||||
|
||||
|
@ -77,8 +75,6 @@ class SpMatrix : public PackedMatrix<Real> {
|
|||
}
|
||||
#endif
|
||||
|
||||
~SpMatrix() {}
|
||||
|
||||
/// Shallow swap.
|
||||
void Swap(SpMatrix *other);
|
||||
|
||||
|
@ -90,7 +86,7 @@ class SpMatrix : public PackedMatrix<Real> {
|
|||
PackedMatrix<Real>::CopyFromPacked(other);
|
||||
}
|
||||
|
||||
template<class OtherReal>
|
||||
template<typename OtherReal>
|
||||
void CopyFromSp(const SpMatrix<OtherReal> &other) {
|
||||
PackedMatrix<Real>::CopyFromPacked(other);
|
||||
}
|
||||
|
@ -231,7 +227,7 @@ class SpMatrix : public PackedMatrix<Real> {
|
|||
Real LogDet(Real *det_sign = NULL) const;
|
||||
|
||||
/// rank-one update, this <-- this + alpha v v'
|
||||
template<class OtherReal>
|
||||
template<typename OtherReal>
|
||||
void AddVec2(const Real alpha, const VectorBase<OtherReal> &v);
|
||||
|
||||
/// rank-two update, this <-- this + alpha (v w' + w v').
|
||||
|
@ -243,7 +239,7 @@ class SpMatrix : public PackedMatrix<Real> {
|
|||
const SpMatrix<Real> &S, const Real beta);
|
||||
|
||||
/// diagonal update, this <-- this + diag(v)
|
||||
template<class OtherReal>
|
||||
template<typename OtherReal>
|
||||
void AddVec(const Real alpha, const VectorBase<OtherReal> &v);
|
||||
|
||||
/// rank-N update:
|
||||
|
@ -251,8 +247,9 @@ class SpMatrix : public PackedMatrix<Real> {
|
|||
/// (*this) = beta*(*this) + alpha * M * M^T,
|
||||
/// or (if transM == kTrans)
|
||||
/// (*this) = beta*(*this) + alpha * M^T * M
|
||||
/// Note: beta used to default to 0.0.
|
||||
void AddMat2(const Real alpha, const MatrixBase<Real> &M,
|
||||
MatrixTransposeType transM, const Real beta = 0.0);
|
||||
MatrixTransposeType transM, const Real beta);
|
||||
|
||||
/// Extension of rank-N update:
|
||||
/// this <-- beta*this + alpha * M * A * M^T.
|
||||
|
@ -286,8 +283,7 @@ class SpMatrix : public PackedMatrix<Real> {
|
|||
/// can implement it more efficiently.
|
||||
void AddTp2(const Real alpha, const TpMatrix<Real> &T,
|
||||
MatrixTransposeType transM, const Real beta = 0.0);
|
||||
|
||||
|
||||
|
||||
/// Extension of rank-N update:
|
||||
/// this <-- beta*this + alpha * M * diag(v) * M^T.
|
||||
/// if transM == kTrans, then
|
||||
|
@ -381,6 +377,20 @@ float TraceSpSp(const SpMatrix<float> &A, const SpMatrix<float> &B);
|
|||
double TraceSpSp(const SpMatrix<double> &A, const SpMatrix<double> &B);
|
||||
|
||||
|
||||
template<typename Real>
|
||||
inline bool ApproxEqual(const SpMatrix<Real> &A,
|
||||
const SpMatrix<Real> &B, Real tol = 0.01) {
|
||||
return A.ApproxEqual(B, tol);
|
||||
}
|
||||
|
||||
template<typename Real>
|
||||
inline void AssertEqual(const SpMatrix<Real> &A,
|
||||
const SpMatrix<Real> &B, Real tol = 0.01) {
|
||||
KALDI_ASSERT(ApproxEqual(A, B, tol));
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Returns tr(A B).
|
||||
template<typename Real, typename OtherReal>
|
||||
Real TraceSpSp(const SpMatrix<Real> &A, const SpMatrix<OtherReal> &B);
|
||||
|
@ -419,7 +429,7 @@ Real TraceMatSpMatSp(const MatrixBase<Real> &A, MatrixTransposeType transA,
|
|||
|
||||
/// Returns \f$ v_1^T M v_2 \f$
|
||||
/// Not as efficient as it could be where v1 == v2.
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
Real VecSpVec(const VectorBase<Real> &v1, const SpMatrix<Real> &M,
|
||||
const VectorBase<Real> &v2);
|
||||
|
||||
|
@ -461,7 +471,7 @@ struct SolverOptions {
|
|||
/// Assumes H positive semidefinite.
|
||||
/// Returns the objective-function change.
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
Real SolveQuadraticProblem(const SpMatrix<Real> &H,
|
||||
const VectorBase<Real> &g,
|
||||
const SolverOptions &opts,
|
||||
|
@ -479,7 +489,7 @@ Real SolveQuadraticProblem(const SpMatrix<Real> &H,
|
|||
/// diagonal_precondition option is newly added, to handle problems
|
||||
/// where different dimensions have very different scaling (we recommend to use
|
||||
/// the option but it's set false for back compatibility).
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
Real SolveQuadraticMatrixProblem(const SpMatrix<Real> &Q,
|
||||
const MatrixBase<Real> &Y,
|
||||
const SpMatrix<Real> &P,
|
||||
|
@ -490,7 +500,7 @@ Real SolveQuadraticMatrixProblem(const SpMatrix<Real> &Q,
|
|||
/// \f[ Q(M) = tr(M^T G) -0.5 tr(P_1 M Q_1 M^T) -0.5 tr(P_2 M Q_2 M^T). \f]
|
||||
/// Encountered in matrix update with a prior. We also apply a limit on the
|
||||
/// condition but it should be less frequently necessary, and can be set larger.
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
Real SolveDoubleQuadraticMatrixProblem(const MatrixBase<Real> &G,
|
||||
const SpMatrix<Real> &P1,
|
||||
const SpMatrix<Real> &P2,
|
||||
|
|
|
@ -31,7 +31,7 @@
|
|||
namespace kaldi {
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
SplitRadixComplexFft<Real>::SplitRadixComplexFft(MatrixIndexT N) {
|
||||
if ( (N & (N-1)) != 0 || N <= 1)
|
||||
KALDI_ERR << "SplitRadixComplexFft called with invalid number of points "
|
||||
|
@ -46,7 +46,7 @@ SplitRadixComplexFft<Real>::SplitRadixComplexFft(MatrixIndexT N) {
|
|||
temp_buffer = NULL;
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void SplitRadixComplexFft<Real>::ComputeTables() {
|
||||
MatrixIndexT imax, lg2, i, j;
|
||||
MatrixIndexT m, m2, m4, m8, nel, n;
|
||||
|
@ -97,7 +97,7 @@ void SplitRadixComplexFft<Real>::ComputeTables() {
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
SplitRadixComplexFft<Real>::~SplitRadixComplexFft() {
|
||||
delete [] brseed;
|
||||
if (tab != NULL) {
|
||||
|
@ -109,7 +109,7 @@ SplitRadixComplexFft<Real>::~SplitRadixComplexFft() {
|
|||
delete [] temp_buffer;
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void SplitRadixComplexFft<Real>::Compute(Real *xr, Real *xi, bool forward) const {
|
||||
if (!forward) { // reverse real and imaginary parts for complex FFT.
|
||||
Real *tmp = xr;
|
||||
|
@ -123,7 +123,7 @@ void SplitRadixComplexFft<Real>::Compute(Real *xr, Real *xi, bool forward) const
|
|||
}
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void SplitRadixComplexFft<Real>::Compute(Real *x, bool forward) {
|
||||
if (temp_buffer == NULL)
|
||||
temp_buffer = new Real[N_];
|
||||
|
@ -150,7 +150,7 @@ void SplitRadixComplexFft<Real>::Compute(Real *x, bool forward) {
|
|||
x[1] = temp_buffer[0]; // special case of i = 0.
|
||||
}
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void SplitRadixComplexFft<Real>::BitReversePermute(Real *x, MatrixIndexT logm) const {
|
||||
MatrixIndexT i, j, lg2, n;
|
||||
MatrixIndexT off, fj, gno, *brp;
|
||||
|
@ -176,7 +176,7 @@ void SplitRadixComplexFft<Real>::BitReversePermute(Real *x, MatrixIndexT logm) c
|
|||
}
|
||||
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void SplitRadixComplexFft<Real>::ComputeRecursive(Real *xr, Real *xi, MatrixIndexT logm) const {
|
||||
|
||||
MatrixIndexT m, m2, m4, m8, nel, n;
|
||||
|
@ -321,7 +321,7 @@ void SplitRadixComplexFft<Real>::ComputeRecursive(Real *xr, Real *xi, MatrixInde
|
|||
|
||||
// This code is mostly the same as the RealFft function. It would be
|
||||
// possible to replace it with more efficient code from Rico's book.
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void SplitRadixRealFft<Real>::Compute(Real *data, bool forward) {
|
||||
MatrixIndexT N = N_, N2 = N/2;
|
||||
KALDI_ASSERT(N%2 == 0);
|
||||
|
|
|
@ -41,7 +41,7 @@ namespace kaldi {
|
|||
// Microsoft Corporation
|
||||
// This is a more efficient way of doing the complex FFT than ComplexFft
|
||||
// above, but it only works for powers of 2.
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
class SplitRadixComplexFft {
|
||||
public:
|
||||
typedef MatrixIndexT Integer;
|
||||
|
@ -83,7 +83,7 @@ class SplitRadixComplexFft {
|
|||
// data.
|
||||
};
|
||||
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
class SplitRadixRealFft: private SplitRadixComplexFft<Real> {
|
||||
public:
|
||||
SplitRadixRealFft(MatrixIndexT N): // will fail unless N>=4 and N is a power of 2.
|
||||
|
|
|
@ -69,7 +69,7 @@ void TpMatrix<Real>::Invert() {
|
|||
}
|
||||
|
||||
/*
|
||||
template<class Real>
|
||||
template<typename Real>
|
||||
void TpMatrix<Real>::Invert() {
|
||||
Matrix<Real> tmp(*this);
|
||||
tmp.Invert();
|
||||
|
@ -127,7 +127,7 @@ void TpMatrix<Real>::Cholesky(const SpMatrix<Real> &orig) {
|
|||
}
|
||||
|
||||
template<typename Real>
|
||||
void TpMatrix<Real>::CopyFromMat(MatrixBase<Real> &M,
|
||||
void TpMatrix<Real>::CopyFromMat(const MatrixBase<Real> &M,
|
||||
MatrixTransposeType Trans) {
|
||||
if (Trans == kNoTrans) {
|
||||
KALDI_ASSERT(this->NumRows() == M.NumRows() && M.NumRows() == M.NumCols());
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
// Copyright 2009-2011 Ondrej Glembek; Lukas Burget; Microsoft Corporation;
|
||||
// Saarland University; Yanmin Qian; Haihua Xu
|
||||
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
|
||||
// See ../../COPYING for clarification regarding multiple authors
|
||||
|
@ -33,15 +34,22 @@ template<typename Real> class TpMatrix;
|
|||
/// @brief Packed triangular matrix class
|
||||
template<typename Real>
|
||||
class TpMatrix : public PackedMatrix<Real> {
|
||||
friend class CuTpMatrix<float>;
|
||||
friend class CuTpMatrix<double>;
|
||||
public:
|
||||
TpMatrix() : PackedMatrix<Real>() {}
|
||||
explicit TpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero)
|
||||
: PackedMatrix<Real>(r, resize_type) {}
|
||||
TpMatrix(const TpMatrix<Real>& Orig) : PackedMatrix<Real>(Orig) {}
|
||||
template<class OtherReal> explicit TpMatrix(const TpMatrix<OtherReal>& Orig)
|
||||
: PackedMatrix<Real>(Orig) {}
|
||||
~TpMatrix() {}
|
||||
|
||||
/// Copy constructor from CUDA TpMatrix
|
||||
/// This is defined in ../cudamatrix/cu-tp-matrix.cc
|
||||
explicit TpMatrix(const CuTpMatrix<Real> &cu);
|
||||
|
||||
|
||||
template<typename OtherReal> explicit TpMatrix(const TpMatrix<OtherReal>& Orig)
|
||||
: PackedMatrix<Real>(Orig) {}
|
||||
|
||||
Real operator() (MatrixIndexT r, MatrixIndexT c) const {
|
||||
if (static_cast<UnsignedMatrixIndexT>(c) >
|
||||
static_cast<UnsignedMatrixIndexT>(r)) {
|
||||
|
@ -85,15 +93,18 @@ class TpMatrix : public PackedMatrix<Real> {
|
|||
|
||||
/// CopyFromMat copies the lower triangle of M into *this
|
||||
/// (or the upper triangle, if Trans == kTrans).
|
||||
void CopyFromMat(MatrixBase<Real> &M,
|
||||
void CopyFromMat(const MatrixBase<Real> &M,
|
||||
MatrixTransposeType Trans = kNoTrans);
|
||||
|
||||
/// CopyFromMat copies another triangular matrix (a CuTpMatrix) into this one.
|
||||
/// This is implemented in ../cudamatrix/cu-tp-matrix.cc
|
||||
void CopyFromMat(const CuTpMatrix<Real> &other);
|
||||
|
||||
/// CopyFromTp copies another triangular matrix into this one.
|
||||
void CopyFromTp(const TpMatrix<Real> &other) {
|
||||
PackedMatrix<Real>::CopyFromPacked(other);
|
||||
}
|
||||
|
||||
template<class OtherReal> void CopyFromTp(const TpMatrix<OtherReal> &other) {
|
||||
template<typename OtherReal> void CopyFromTp(const TpMatrix<OtherReal> &other) {
|
||||
PackedMatrix<Real>::CopyFromPacked(other);
|
||||
}
|
||||
|
||||
|
|
|
@ -5,6 +5,7 @@ all:
|
|||
include ../kaldi.mk
|
||||
|
||||
LDFLAGS += $(CUDA_LDFLAGS)
|
||||
LDLIBS += $(CUDA_LDLIBS)
|
||||
|
||||
TESTFILES = nnet-test nnet-randomizer-test
|
||||
|
||||
|
|
|
@ -41,7 +41,7 @@ class Softmax : public Component {
|
|||
|
||||
void PropagateFnc(const CuMatrix<BaseFloat> &in, CuMatrix<BaseFloat> *out) {
|
||||
// y_j = e^{x_j} / sum_k(e^{x_k})
|
||||
out->Softmax(in);
|
||||
out->ApplySoftMaxPerRow(in);
|
||||
}
|
||||
|
||||
void BackpropagateFnc(const CuMatrix<BaseFloat> &in, const CuMatrix<BaseFloat> &out,
|
||||
|
|
|
@ -96,7 +96,7 @@ class CacheConf {
|
|||
Vector<BaseFloat> confidence_leftover_;
|
||||
|
||||
std::vector<int32> randmask_;
|
||||
CuStlVector<int32> randmask_device_;
|
||||
CuArray<int32> randmask_device_;
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -94,7 +94,7 @@ class CacheTgtMat {
|
|||
CuMatrix<BaseFloat> targets_leftover_; ///< Desired vector cache
|
||||
|
||||
std::vector<int32> randmask_;
|
||||
CuStlVector<int32> randmask_device_;
|
||||
CuArray<int32> randmask_device_;
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -94,7 +94,7 @@ class Cache {
|
|||
std::vector<int32> targets_leftover_; ///< Desired vector cache
|
||||
|
||||
std::vector<int32> randmask_;
|
||||
CuStlVector<int32> randmask_device_;
|
||||
CuArray<int32> randmask_device_;
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
#include "base/kaldi-common.h"
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
#include "cudamatrix/cu-vector.h"
|
||||
#include "cudamatrix/cu-stlvector.h"
|
||||
#include "cudamatrix/cu-array.h"
|
||||
|
||||
namespace kaldi {
|
||||
namespace nnet1 {
|
||||
|
@ -67,10 +67,10 @@ class XentPrior {
|
|||
double frames_scaled_nosil_;
|
||||
double correct_scaled_nosil_;
|
||||
|
||||
CuStlVector<int32> max_id_;
|
||||
CuArray<int32> max_id_;
|
||||
std::vector<int32> max_id_host_;
|
||||
|
||||
CuStlVector<int32> target_device_;
|
||||
CuArray<int32> target_device_;
|
||||
CuVector<BaseFloat> log_post_tgt_;
|
||||
Vector<BaseFloat> log_post_tgt_host_;
|
||||
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
#include "util/kaldi-holder.h"
|
||||
#include "cudamatrix/cu-matrix.h"
|
||||
#include "cudamatrix/cu-vector.h"
|
||||
#include "cudamatrix/cu-stlvector.h"
|
||||
#include "cudamatrix/cu-array.h"
|
||||
|
||||
namespace kaldi {
|
||||
namespace nnet1 {
|
||||
|
@ -61,16 +61,17 @@ class Xent {
|
|||
std::vector<float> loss_vec_;
|
||||
|
||||
// loss computation buffers
|
||||
CuStlVector<int32> target_device_;
|
||||
CuArray<int32> target_device_;
|
||||
|
||||
CuVector<BaseFloat> log_post_tgt_;
|
||||
Vector<BaseFloat> log_post_tgt_host_;
|
||||
CuMatrix<BaseFloat> tgt_mat_device_;
|
||||
CuMatrix<BaseFloat> xentropy_aux_;
|
||||
|
||||
// frame classification buffers
|
||||
CuStlVector<int32> max_id_out_;
|
||||
CuArray<int32> max_id_out_;
|
||||
std::vector<int32> max_id_out_host_;
|
||||
CuStlVector<int32> max_id_tgt_;
|
||||
CuArray<int32> max_id_tgt_;
|
||||
std::vector<int32> max_id_tgt_host_;
|
||||
|
||||
};
|
||||
|
|
|
@ -76,7 +76,7 @@ void MatrixRandomizer::Randomize(const std::vector<int32>& mask) {
|
|||
// Use auxiliary buffer for unshuffled data
|
||||
CuMatrix<BaseFloat> data_aux(data_);
|
||||
// Put the mask to GPU
|
||||
CuStlVector<int32> mask_in_gpu(mask.size());
|
||||
CuArray<int32> mask_in_gpu(mask.size());
|
||||
mask_in_gpu.CopyFromVec(mask);
|
||||
// randomize the data, mask is used to index rows in source matrix
|
||||
cu::Randomize(data_aux, mask_in_gpu, &data_);
|
||||
|
|
|
@ -155,7 +155,7 @@ class Splice : public Component {
|
|||
}
|
||||
|
||||
protected:
|
||||
CuStlVector<int32> frame_offsets_;
|
||||
CuArray<int32> frame_offsets_;
|
||||
};
|
||||
|
||||
|
||||
|
@ -218,7 +218,7 @@ class CopyComponent: public Component {
|
|||
}
|
||||
|
||||
protected:
|
||||
CuStlVector<int32> copy_from_indices_;
|
||||
CuArray<int32> copy_from_indices_;
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ EXTRA_CXXFLAGS = -Wno-sign-compare
|
|||
include ../kaldi.mk
|
||||
|
||||
LDFLAGS += $(CUDA_LDFLAGS)
|
||||
LDLIBS += $(CUDA_LDLIBS)
|
||||
|
||||
BINFILES = nnet-train-frmshuff \
|
||||
nnet-train-xent-hardlab-perutt \
|
||||
|
|
|
@ -51,13 +51,8 @@ int main(int argc, char *argv[]) {
|
|||
bool apply_log = false;
|
||||
po.Register("apply-log", &apply_log, "Transform MLP output to logscale");
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
int32 use_gpu_id=-2;
|
||||
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic selection, -1 disable GPU, 0..N select GPU)");
|
||||
#else
|
||||
int32 use_gpu_id=0;
|
||||
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
|
||||
#endif
|
||||
std::string use_gpu="no";
|
||||
po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA");
|
||||
|
||||
po.Read(argc, argv);
|
||||
|
||||
|
@ -76,7 +71,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
//Select the GPU
|
||||
#if HAVE_CUDA==1
|
||||
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
|
||||
CuDevice::Instantiate().SelectGpuId(use_gpu);
|
||||
#endif
|
||||
|
||||
Nnet nnet_transf;
|
||||
|
|
|
@ -60,13 +60,8 @@ int main(int argc, char *argv[]) {
|
|||
std::string frame_weights;
|
||||
po.Register("frame-weights", &frame_weights, "Per-frame weights to scale gradients (frame selection/weighting).");
|
||||
|
||||
#if HAVE_CUDA==1
|
||||
int32 use_gpu_id=-2;
|
||||
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic selection, -1 disable GPU, 0..N select GPU)");
|
||||
#else
|
||||
int32 use_gpu_id=0;
|
||||
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
|
||||
#endif
|
||||
std::string use_gpu="yes";
|
||||
po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA");
|
||||
|
||||
po.Read(argc, argv);
|
||||
|
||||
|
@ -90,7 +85,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
//Select the GPU
|
||||
#if HAVE_CUDA==1
|
||||
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
|
||||
CuDevice::Instantiate().SelectGpuId(use_gpu);
|
||||
#endif
|
||||
|
||||
Nnet nnet_transf;
|
||||
|
|
|
@ -128,16 +128,9 @@ int main(int argc, char *argv[]) {
|
|||
po.Register("drop-frames", &drop_frames,
|
||||
"Drop frames, where is zero den-posterior under numerator path "
|
||||
"(ie. path not in lattice)");
|
||||
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
kaldi::int32 use_gpu_id=-2;
|
||||
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID "
|
||||
"(-2 automatic selection, -1 disable GPU, 0..N select GPU)");
|
||||
#else
|
||||
int32 use_gpu_id=0;
|
||||
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
|
||||
#endif
|
||||
std::string use_gpu="yes";
|
||||
po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA");
|
||||
|
||||
po.Read(argc, argv);
|
||||
|
||||
|
@ -162,7 +155,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// Select the GPU
|
||||
#if HAVE_CUDA == 1
|
||||
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
|
||||
CuDevice::Instantiate().SelectGpuId(use_gpu);
|
||||
#endif
|
||||
|
||||
Nnet nnet_transf;
|
||||
|
@ -257,7 +250,7 @@ int main(int argc, char *argv[]) {
|
|||
if (old_acoustic_scale != 1.0) {
|
||||
fst::ScaleLattice(fst::AcousticLatticeScale(old_acoustic_scale), &den_lat);
|
||||
}
|
||||
// optionaly sort it topologically
|
||||
// optionally sort it topologically
|
||||
kaldi::uint64 props = den_lat.Properties(fst::kFstProperties, false);
|
||||
if (!(props & fst::kTopSorted)) {
|
||||
if (fst::TopSort(&den_lat) == false)
|
||||
|
|
|
@ -129,15 +129,9 @@ int main(int argc, char *argv[]) {
|
|||
po.Register("do-smbr", &do_smbr, "Use state-level accuracies instead of "
|
||||
"phone accuracies.");
|
||||
|
||||
#if HAVE_CUDA == 1
|
||||
kaldi::int32 use_gpu_id=-2;
|
||||
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID "
|
||||
"(-2 automatic selection, -1 disable GPU, 0..N select GPU)");
|
||||
#else
|
||||
int32 use_gpu_id=0;
|
||||
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
|
||||
#endif
|
||||
|
||||
std::string use_gpu="yes";
|
||||
po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA");
|
||||
|
||||
po.Read(argc, argv);
|
||||
|
||||
if (po.NumArgs() != 6) {
|
||||
|
@ -164,7 +158,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// Select the GPU
|
||||
#if HAVE_CUDA == 1
|
||||
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
|
||||
CuDevice::Instantiate().SelectGpuId(use_gpu);
|
||||
#endif
|
||||
|
||||
Nnet nnet_transf;
|
||||
|
@ -248,7 +242,7 @@ int main(int argc, char *argv[]) {
|
|||
fst::ScaleLattice(fst::AcousticLatticeScale(old_acoustic_scale),
|
||||
&den_lat);
|
||||
}
|
||||
// optionaly sort it topologically
|
||||
// optionally sort it topologically
|
||||
kaldi::uint64 props = den_lat.Properties(fst::kFstProperties, false);
|
||||
if (!(props & fst::kTopSorted)) {
|
||||
if (fst::TopSort(&den_lat) == false)
|
||||
|
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
Ссылка в новой задаче