syncing 'sandbox/dan2->trunk' of src/matrix,src/cudamatrix

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@3194 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
This commit is contained in:
Karel Vesely 2013-11-21 21:24:54 +00:00
Родитель c5bba859e4
Коммит c3a5fa2187
104 изменённых файлов: 15879 добавлений и 2935 удалений

Просмотреть файл

@ -18,7 +18,7 @@ align_to_lats=false # optionally produce alignment in lattice format
lats_decode_opts="--acoustic-scale=0.1 --beam=20 --latbeam=10"
lats_graph_scales="--transition-scale=1.0 --self-loop-scale=0.1"
use_gpu_id=-1 # disable gpu
use_gpu="no" # yes|no|optional
# End configuration options.
[ $# -gt 0 ] && echo "$0 $@" # Print the command line for logging
@ -76,7 +76,7 @@ if [ -f $srcdir/delta_order ]; then
feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
fi
# Finally add feature_transform and the MLP
feats="$feats nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet ark:- ark:- |"
feats="$feats nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $nnet ark:- ark:- |"
echo "$0: aligning data '$data' using nnet/model '$srcdir', putting alignments in '$dir'"

Просмотреть файл

@ -25,7 +25,7 @@ scoring_opts="--min-lmwt 4 --max-lmwt 15"
num_threads=1 # if >1, will use latgen-faster-parallel
parallel_opts="-pe smp $((num_threads+1))" # use 2 CPUs (1 DNN-forward, 1 decoder)
use_gpu_id=-1 # -1 disable gpu
use_gpu="no" # yes|no|optional
# End configuration section.
echo "$0 $@" # Print the command line for logging
@ -104,7 +104,7 @@ fi
# Run the decoding in the queue
if [ $stage -le 0 ]; then
$cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu-id=$use_gpu_id $nnet "$feats" ark:- \| \
nnet-forward --feature-transform=$feature_transform --no-softmax=true --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $nnet "$feats" ark:- \| \
latgen-faster-mapped$thread_string --max-active=$max_active --max-mem=$max_mem --beam=$beam \
--lattice-beam=$latbeam --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
$model $graphdir/HCLG.fst ark:- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;

Просмотреть файл

@ -50,8 +50,6 @@ splice_step=1 # Stepsize of the splicing (1 is consecutive splice,
# value 2 would do [ -10 -8 -6 -4 -2 0 2 4 6 8 10 ] splicing)
# misc.
verbose=1 # enable per-cache reports
# gpu config
use_gpu_id= # manually select GPU id to run on, (-1 disables GPU)
# End configuration.
echo "$0 $@" # Print the command line for logging
@ -172,7 +170,7 @@ else
feature_transform_old=$feature_transform
feature_transform=${feature_transform%.nnet}_cmvn-g.nnet
echo "Renormalizing MLP input features into $feature_transform"
nnet-forward ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
nnet-forward --use-gpu=yes \
$feature_transform_old "$(echo $feats | sed 's|train.scp|train.scp.10k|')" \
ark:- 2>$dir/log/cmvn_glob_fwd.log |\
compute-cmvn-stats ark:- - | cmvn-to-nnet - - |\
@ -186,7 +184,7 @@ fi
###### GET THE DIMENSIONS ######
num_fea=$(feat-to-dim --print-args=false "$feats nnet-forward --use-gpu-id=-1 $feature_transform ark:- ark:- |" - 2>/dev/null)
num_fea=$(feat-to-dim --print-args=false "$feats nnet-forward --use-gpu=no $feature_transform ark:- ark:- |" - 2>/dev/null)
num_hid=$hid_dim
@ -208,14 +206,14 @@ for depth in $(seq 1 $nn_depth); do
rbm-train-cd1-frmshuff --learn-rate=$rbm_lrate_low --l2-penalty=$rbm_l2penalty \
--num-iters=$((2*$rbm_iter)) --drop-data=$rbm_drop_data --verbose=$verbose \
--feature-transform=$feature_transform \
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} $rbm_extra_opts \
$rbm_extra_opts \
$RBM.init "$feats" $RBM 2>$dir/log/rbm.$depth.log || exit 1
else
#This is Bernoulli-Bernoulli RBM
#cmvn stats for init
echo "Computing cmvn stats '$dir/$depth.cmvn' for RBM initialization"
if [ ! -f $dir/$depth.cmvn ]; then
nnet-forward ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
nnet-forward --use-gpu=yes \
"nnet-concat $feature_transform $dir/$((depth-1)).dbn - |" \
"$(echo $feats | sed 's|train.scp|train.scp.10k|')" \
ark:- 2>$dir/log/cmvn_fwd.$depth.log | \
@ -232,7 +230,7 @@ for depth in $(seq 1 $nn_depth); do
rbm-train-cd1-frmshuff --learn-rate=$rbm_lrate --l2-penalty=$rbm_l2penalty \
--num-iters=$rbm_iter --drop-data=$rbm_drop_data --verbose=$verbose \
--feature-transform="nnet-concat $feature_transform $dir/$((depth-1)).dbn - |" \
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} $rbm_extra_opts \
$rbm_extra_opts \
$RBM.init "$feats" $RBM 2>$dir/log/rbm.$depth.log || exit 1
fi

Просмотреть файл

@ -46,7 +46,6 @@ train_opts= # options, passed to the training script
train_tool= # optionally change the training tool
# OTHER
use_gpu_id= # manually select GPU id to run on, (-1 disables GPU)
analyze_alignments=true # run the alignment analysis script
seed=777 # seed value used for training data shuffling and initialization
# End configuration.
@ -258,7 +257,7 @@ else
feature_transform_old=$feature_transform
feature_transform=${feature_transform%.nnet}_cmvn-g.nnet
echo "Renormalizing MLP input features into $feature_transform"
nnet-forward ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
nnet-forward --use-gpu=yes \
$feature_transform_old "$(echo $feats_tr | sed 's|train.scp|train.scp.10k|')" \
ark:- 2>$dir/log/nnet-forward-cmvn.log |\
compute-cmvn-stats ark:- - | cmvn-to-nnet - - |\
@ -315,7 +314,6 @@ steps/train_nnet_scheduler.sh \
${train_opts} \
${train_tool:+ --train-tool "$train_tool"} \
${config:+ --config $config} \
${use_gpu_id:+ --use-gpu-id $use_gpu_id} \
$mlp_init "$feats_tr" "$feats_cv" "$labels_tr" "$labels_cv" $dir || exit 1

Просмотреть файл

@ -21,7 +21,6 @@ learn_rate=0.00001
halving_factor=1.0 #ie. disable halving
drop_frames=true
verbose=1
use_gpu_id=
seed=777 # seed value used for training data shuffling
# End configuration section
@ -168,7 +167,6 @@ while [ $x -le $num_iters ]; do
--learn-rate=$learn_rate \
--drop-frames=$drop_frames \
--verbose=$verbose \
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
$cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet || exit 1
fi
cur_mdl=$dir/$x.nnet

Просмотреть файл

@ -21,7 +21,6 @@ halving_factor=1.0 #ie. disable halving
do_smbr=true
use_silphones=false #setting this to something will enable giving silphones to nnet-mpe
verbose=1
use_gpu_id=
seed=777 # seed value used for training data shuffling
# End configuration section
@ -151,7 +150,6 @@ while [ $x -le $num_iters ]; do
--do-smbr=$do_smbr \
--verbose=$verbose \
$mpe_silphones_arg \
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
$cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet || exit 1
fi
cur_mdl=$dir/$x.nnet

Просмотреть файл

@ -25,8 +25,6 @@ end_halving_inc=0.1
halving_factor=0.5
# misc.
verbose=1
# gpu
use_gpu_id=
# tool
train_tool="nnet-train-xent-hardlab-frmshuff"
@ -73,7 +71,6 @@ mlp_base=${mlp_init##*/}; mlp_base=${mlp_base%.*}
$train_tool --cross-validate=true \
--bunchsize=$bunch_size --cachesize=$cache_size --verbose=$verbose \
${feature_transform:+ --feature-transform=$feature_transform} \
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
$mlp_best "$feats_cv" "$labels_cv" \
2> $dir/log/prerun.log || exit 1;
@ -97,7 +94,6 @@ for iter in $(seq -w $max_iters); do
--learn-rate=$learn_rate --momentum=$momentum --l1-penalty=$l1_penalty --l2-penalty=$l2_penalty \
--bunchsize=$bunch_size --cachesize=$cache_size --randomize=true --verbose=$verbose \
${feature_transform:+ --feature-transform=$feature_transform} \
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
${seed:+ --seed=$seed} \
$mlp_best "$feats_tr" "$labels_tr" $mlp_next \
2> $dir/log/iter$iter.log || exit 1;
@ -110,7 +106,6 @@ for iter in $(seq -w $max_iters); do
$train_tool --cross-validate=true \
--bunchsize=$bunch_size --cachesize=$cache_size --verbose=$verbose \
${feature_transform:+ --feature-transform=$feature_transform} \
${use_gpu_id:+ --use-gpu-id=$use_gpu_id} \
$mlp_next "$feats_cv" "$labels_cv" \
2>>$dir/log/iter$iter.log || exit 1;

Просмотреть файл

@ -9,12 +9,16 @@ OPENFST_LDLIBS =
include ../kaldi.mk
LDFLAGS += $(CUDA_LDFLAGS)
LDLIBS += $(CUDA_LDLIBS)
TESTFILES = cuda-matrix-test
TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test cu-packed-matrix-test cu-tp-matrix-test \
cu-block-matrix-test cu-matrix-speed-test cu-vector-speed-test cu-sp-matrix-speed-test cu-array-test
OBJFILES = cu-device.o cu-math.o cu-matrix.o
OBJFILES = cu-device.o cu-math.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \
cu-vector.o cu-common.o cu-tp-matrix.o cu-rand.o cu-block-matrix.o
ifeq ($(CUDA), true)
OBJFILES += cu-kernels.o cu-randkernels.o
OBJFILES += cu-kernels.o cu-randkernels.o cu-choleskykernels.o
endif
LIBNAME = kaldi-cudamatrix

Просмотреть файл

@ -0,0 +1,208 @@
// cudamatrix/cu-array-inl.h
// Copyright 2009-2012 Karel Vesely
// 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_CUDAMATRIX_CU_ARRAY_INL_H_
#define KALDI_CUDAMATRIX_CU_ARRAY_INL_H_
#if HAVE_CUDA == 1
#include <cuda_runtime_api.h>
#include "cudamatrix/cu-common.h"
#include "cudamatrix/cu-device.h"
#include "cudamatrix/cu-kernels.h"
#endif
#include "util/timer.h"
namespace kaldi {
/// Resizes the array so that it holds "dim" elements.
/// @param dim          New number of elements; must be >= 0.
/// @param resize_type  kSetZero to zero the memory, kUndefined to leave it
///                     uninitialized.  Other values fail the assertion.
/// If the dimension is unchanged the existing buffer is kept (and zeroed if
/// requested); otherwise the old buffer is released and a new one allocated
/// with cudaMalloc when the GPU is enabled, or with malloc otherwise.
template<typename T>
void CuArray<T>::Resize(MatrixIndexT dim, MatrixResizeType resize_type) {
  KALDI_ASSERT((resize_type == kSetZero || resize_type == kUndefined) && dim >= 0);
  if (dim_ == dim) {
    // Same size: reuse the current allocation.
    if (resize_type == kSetZero)
      SetZero();
    return;
  }
  Destroy();
  if (dim == 0) return;
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    CU_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&data_), dim * sizeof(T)));
  } else
#endif
  {
    data_ = static_cast<T*>(malloc(dim * sizeof(T)));
    // We allocate with malloc because we don't want constructors being called.
    // We basically ignore memory alignment issues here-- we assume the malloc
    // implementation is forgiving enough that it will automatically align on
    // sensible boundaries.
    if (data_ == 0)
      KALDI_ERR << "Memory allocation failed when initializing CuArray "
                << "with dimension " << dim << " object size in bytes: "
                << sizeof(T);
  }
  dim_ = dim;
  if (resize_type == kSetZero)
    SetZero();
}
/// Releases the buffer (if any) and resets the object to the empty state
/// (dim_ == 0, data_ == NULL).  The buffer is released with cudaFree when the
/// GPU is enabled and with free() otherwise; no element destructors are run.
template<typename T>
void CuArray<T>::Destroy() {
#if HAVE_CUDA == 1
  const bool gpu_enabled = CuDevice::Instantiate().Enabled();
#else
  const bool gpu_enabled = false;
#endif
  if (data_ != NULL) {
    if (gpu_enabled) {
#if HAVE_CUDA == 1
      CU_SAFE_CALL(cudaFree(data_));
#endif
    } else {
      free(data_);
    }
  }
  data_ = NULL;
  dim_ = 0;
}
/// Resizes this array to src.size() and copies the elements of "src" into it
/// as raw bytes (cudaMemcpy host-to-device when the GPU is enabled, memcpy
/// otherwise).  No constructors or assignment operators are invoked.
template<typename T>
void CuArray<T>::CopyFromVec(const std::vector<T> &src) {
  Resize(src.size(), kUndefined);
  // Nothing to copy for an empty source; also avoids calling front() on an
  // empty vector, which is undefined behaviour.
  if (src.empty()) return;
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    CU_SAFE_CALL(cudaMemcpy(data_, &src.front(), src.size()*sizeof(T), cudaMemcpyHostToDevice));
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
#endif
  {
    memcpy(data_, &src.front(), src.size()*sizeof(T));
  }
}
/// Resizes *dst to Dim() elements if needed and copies the array contents
/// into it as raw bytes (cudaMemcpy device-to-host when the GPU is enabled,
/// memcpy otherwise).
template<typename T>
void CuArray<T>::CopyToVec(std::vector<T> *dst) const {
  if (static_cast<MatrixIndexT>(dst->size()) != dim_) {
    dst->resize(dim_);
  }
  // Nothing to copy for an empty array; also avoids calling front() on an
  // empty vector, which is undefined behaviour.
  if (dim_ == 0) return;
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    CU_SAFE_CALL(cudaMemcpy(&dst->front(), Data(), dim_*sizeof(T), cudaMemcpyDeviceToHost));
    CuDevice::Instantiate().AccuProfile("CuArray::CopyToVecD2H", tim.Elapsed());
  } else
#endif
  {
    memcpy(&dst->front(), data_, dim_*sizeof(T));
  }
}
/// Fills the whole buffer with zero bytes (cudaMemset when the GPU is
/// enabled, memset otherwise).  Does nothing for an empty array.
template<typename T>
void CuArray<T>::SetZero() {
  if (dim_ == 0)
    return;
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    Timer timer;
    CU_SAFE_CALL(cudaMemset(data_, 0, dim_ * sizeof(T)));
    CuDevice::Instantiate().AccuProfile("CuArray::SetZero", timer.Elapsed());
  } else
#endif
  {
    void *raw = static_cast<void*>(data_);
    memset(raw, 0, dim_ * sizeof(T));
  }
}
/**
 * Print the array to a stream as "[ e0 e1 ... ]" followed by a newline.
 * The contents are first copied to a std::vector via CopyToVec().
 */
template<typename T>
std::ostream &operator << (std::ostream &out, const CuArray<T> &vec) {
  std::vector<T> tmp;
  vec.CopyToVec(&tmp);
  out << "[";
  // size_t index: avoids comparing a signed index against the unsigned size().
  for (size_t i = 0; i < tmp.size(); i++) {
    out << " " << tmp[i];
  }
  out << " ]\n";
  return out;
}
// Generic version of Set(): not implemented, so it only raises an error via
// KALDI_ERR.  The argument "value" is not used.  A working specialization
// for int32 is defined separately in this file.
template<class T>
inline void CuArray<T>::Set(const T &value) {
// This is not implemented yet, we'll do so if it's needed.
KALDI_ERR << "CuArray<T>::Set not implemented yet for this type.";
}
/// Specialization of Set() for int32: assigns "value" to every element.
/// With the GPU enabled this launches cudaI32_set_const over the array viewed
/// as a 1 x Dim() matrix; otherwise it fills the buffer with a plain loop.
template<>
inline void CuArray<int32>::Set(const int32 &value) {
  if (dim_ == 0)
    return;
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    Timer timer;
    dim3 block_dim(CU2DBLOCK);
    dim3 grid_dim(n_blocks(Dim(), CU2DBLOCK));
    // One row, Dim() columns, stride Dim().
    ::MatrixDim as_matrix = { 1, Dim(), Dim() };
    cudaI32_set_const(grid_dim, block_dim, data_, value, as_matrix);
    CU_SAFE_CALL(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile(__func__, timer.Elapsed());
  } else
#endif
  {
    int32 *const end = data_ + dim_;
    for (int32 *p = data_; p != end; ++p)
      *p = value;
  }
}
/// Resizes this array to src.Dim() and copies the contents of "src" into it
/// as raw bytes (cudaMemcpy device-to-device when the GPU is enabled, memcpy
/// otherwise).
template<typename T>
void CuArray<T>::CopyFromArray(const CuArray<T> &src) {
  // Self-copy is a no-op.  Without this guard memcpy would be called with
  // identical source and destination, which is undefined for overlapping
  // regions.
  if (this == &src) return;
  this->Resize(src.Dim(), kUndefined);
  if (dim_ == 0) return;
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    CU_SAFE_CALL(cudaMemcpy(this->data_, src.data_, dim_ * sizeof(T),
                            cudaMemcpyDeviceToDevice));
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
#endif
  {
    memcpy(this->data_, src.data_, dim_ * sizeof(T));
  }
}
} // namespace kaldi
#endif

Просмотреть файл

@ -0,0 +1,124 @@
// cudamatrix/cu-array-test.cc
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <vector>
#include <cstdlib>
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "cudamatrix/cu-array.h"
using namespace kaldi;
namespace kaldi {
// Returns true if the two vectors have the same length and byte-identical
// contents.  The test fills its data with random bit patterns, so for
// floating-point T some elements may be NaN; operator== would then report a
// mismatch even though the copy was exact.  A byte-wise comparison checks
// what CuArray actually promises (memcpy-style copies).
template<class T>
static bool SameBytes(const std::vector<T> &a, const std::vector<T> &b) {
  if (a.size() != b.size()) return false;
  if (a.empty()) return true;
  return std::memcmp(&(a[0]), &(b[0]), a.size() * sizeof(T)) == 0;
}

// Exercises CuArray<T>: construction from a std::vector, CopyToVec,
// assignment from another CuArray, Resize with kSetZero, and (for types the
// same size as int32) Set.
template<class T>
static void UnitTestCuArray() {
  for (int32 i = 0; i < 30; i++) {
    int32 size = rand() % 5;
    size = size * size * size;  // Sizes are cubes: 0, 1, 8, 27 or 64.
    int32 size2 = rand() % 4;
    std::vector<T> vec(size);
    std::vector<T> garbage_vec(size2);  // We just use garbage_vec to make sure
                                        // we sometimes resize from empty,
                                        // sometimes not.
    const size_t byte_size = static_cast<size_t>(size) * sizeof(T);
    std::vector<char> rand_c(byte_size);
    for (size_t j = 0; j < byte_size; j++)
      rand_c[j] = static_cast<char>(rand() % 256);
    if (!vec.empty()) {
      std::memcpy((void*)&(vec[0]), (void*)&(rand_c[0]),
                  byte_size);
    }
    {  // test constructor from vector and CopyToVec.
      CuArray<T> cu_vec(vec);
      std::vector<T> vec2;
      cu_vec.CopyToVec(&vec2);
      KALDI_ASSERT(SameBytes(vec2, vec));
    }
    {  // test assignment operator from CuArray.
      CuArray<T> cu_vec(vec);
      CuArray<T> cu_vec2(garbage_vec);
      cu_vec2 = cu_vec;
      std::vector<T> vec2;
      cu_vec2.CopyToVec(&vec2);
      KALDI_ASSERT(SameBytes(vec2, vec));
      KALDI_ASSERT(cu_vec2.Dim() == int32(vec2.size()));  // test Dim()
    }
    {  // test resize with resize_type = kSetZero.
      CuArray<T> cu_vec(vec);
      cu_vec.Resize(size, kSetZero);
      std::vector<T> vec2(vec);
      if (!vec2.empty())
        std::memset(&(vec2[0]), 0, vec2.size() * sizeof(T));
      std::vector<T> vec3;
      cu_vec.CopyToVec(&vec3);
      KALDI_ASSERT(SameBytes(vec2, vec3));  // testing equality of zero arrays.
    }
    if (sizeof(T) == sizeof(int32) && size > 0) {  // test Set for type int32, or same size.
      CuArray<T> cu_vec(vec);
      cu_vec.Set(vec[0]);
      for (size_t j = 1; j < vec.size(); j++) vec[j] = vec[0];
      std::vector<T> vec2;
      cu_vec.CopyToVec(&vec2);
      KALDI_ASSERT(SameBytes(vec2, vec));
    }
  }
}
} // namespace kaldi
// Runs the CuArray tests twice: first with the GPU disabled ("no"), then
// with it requested ("yes").  In CUDA builds the profile is printed at the
// end.
int main() {
  for (int32 loop = 0; loop < 2; loop++) {
    const bool want_gpu = (loop != 0);
#if HAVE_CUDA == 1
    CuDevice::Instantiate().SelectGpuId(want_gpu ? "yes" : "no");
#endif
    //kaldi::UnitTestCuArray<float>();
    kaldi::UnitTestCuArray<double>();
    kaldi::UnitTestCuArray<int32>();
    kaldi::UnitTestCuArray<std::pair<int32, int32> >();
    if (want_gpu) {
      KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
    } else {
      KALDI_LOG << "Tests without GPU use succeeded.\n";
    }
  }
#if HAVE_CUDA == 1
  CuDevice::Instantiate().PrintProfile();
#endif
  return 0;
}

123
src/cudamatrix/cu-array.h Normal file
Просмотреть файл

@ -0,0 +1,123 @@
// cudamatrix/cu-array.h
// Copyright 2009-2012 Karel Vesely
// 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_CUDAMATRIX_CU_ARRAY_H_
#define KALDI_CUDAMATRIX_CU_ARRAY_H_
#include "matrix/kaldi-vector.h"
namespace kaldi {
/**
 * std::vector equivalent for CUDA computing.  This class is mostly intended as
 * a CUDA-based mirror of a std::vector object that lives on the CPU.  We don't
 * call constructors, initializers, etc., on the GPU.
 *
 * Note: constructors are declared with the plain injected class name
 * (CuArray, not CuArray<T>); using a template-id as a constructor name is
 * not permitted from C++20 onwards.
 */
template<typename T>
class CuArray {
  typedef CuArray<T> ThisType;
 public:

  /// Default Constructor: creates an empty array.
  CuArray() : dim_(0), data_(NULL) { }

  /// Constructor with memory initialisation.  resize_type may be kSetZero or
  /// kUndefined.
  explicit CuArray(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero):
    dim_(0), data_(NULL) { Resize(dim, resize_type); }

  /// Constructor from a CPU-based std::vector<T>.
  explicit CuArray(const std::vector<T> &src):
    dim_(0), data_(NULL) { CopyFromVec(src); }

  /// Copy constructor (explicit, so copies must be spelled out at the call
  /// site).
  explicit CuArray(const CuArray<T> &src):
    dim_(0), data_(NULL) { CopyFromArray(src); }

  /// Destructor
  ~CuArray() { Destroy(); }

  /// Return the vector dimension
  MatrixIndexT Dim() const { return dim_; }

  /// Get raw pointer
  const T* Data() const { return data_; }

  T* Data() { return data_; }

  /// Allocate the memory.  resize_type may be kSetZero or kUndefined.
  /// kCopyData not yet supported (can be implemented if needed).
  void Resize(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero);

  /// Deallocate the memory and set dim_ and data_ to zero.  Does not call any
  /// destructors of the objects stored.
  void Destroy();

  /// This function resizes if needed.  Note: copying to GPU is done via memcpy,
  /// and any constructors or assignment operators are not called.
  void CopyFromVec(const std::vector<T> &src);

  /// This function resizes if needed.
  void CopyFromArray(const CuArray<T> &src);

  /// This function resizes *dst if needed.  On resize of "dst", the STL vector
  /// may call copy-constructors, initializers, and assignment operators for
  /// existing objects (which will be overwritten), but the copy from GPU to CPU
  /// is done via memcpy.  So be very careful calling this function if your
  /// objects are more than plain structs.
  void CopyToVec(std::vector<T> *dst) const;

  /// Sets the memory for the object to zero, via memset.  You should verify
  /// that this makes sense for type T.
  void SetZero();

  /// Set to a constant value.  Note: any copying is done as if using memcpy, and
  /// assignment operators or destructors are not called.  This is NOT IMPLEMENTED
  /// YET except for T == int32 (the current implementation will just crash).
  void Set(const T &value);

  /// Assignment from another CuArray; resizes if needed.
  CuArray<T> &operator= (const CuArray<T> &in) {
    this->CopyFromArray(in); return *this;
  }

  /// Assignment from a CPU-based std::vector<T>; resizes if needed.
  CuArray<T> &operator= (const std::vector<T> &in) {
    this->CopyFromVec(in); return *this;
  }

 private:
  MatrixIndexT dim_;  ///< dimension of the vector
  T *data_;  ///< GPU data pointer (if GPU not available,
             ///< will point to CPU memory).
};
/// I/O: stream-output operator for CuArray (declaration only; the definition
/// is expected to come from cudamatrix/cu-array-inl.h, included below).
template<typename T>
std::ostream &operator << (std::ostream &out, const CuArray<T> &vec);
} // namespace
#include "cudamatrix/cu-array-inl.h"
#endif

Просмотреть файл

@ -0,0 +1,239 @@
// cudamatrix/cu-block-matrix-test.cc
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <vector>
#include <cstdlib>
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "cudamatrix/cu-matrix-lib.h"
using namespace kaldi;
namespace kaldi {
/*
* ASSERTS
*/
template<typename Real>
static void AssertEqual(const MatrixBase<Real> &A,
const MatrixBase<Real> &B,
float tol = 0.001) {
KALDI_ASSERT(A.NumRows() == B.NumRows()&&A.NumCols() == B.NumCols());
for (MatrixIndexT i = 0;i < A.NumRows();i++) {
for (MatrixIndexT j = 0;j < A.NumCols();j++) {
KALDI_ASSERT(std::abs(A(i, j)-B(i, j)) <= tol*std::max(1.0, (double) (std::abs(A(i, j))+std::abs(B(i, j)))));
}
}
}
template<typename Real>
static void AssertEqual(const CuMatrixBase<Real> &A,
const CuMatrixBase<Real> &B,
float tol = 0.001) {
Real Anorm = A.FrobeniusNorm(), Bnorm = B.FrobeniusNorm();
CuMatrix<Real> diff(A);
diff.AddMat(-1.0, B);
Real diff_norm = diff.FrobeniusNorm();
if (diff_norm > tol * 0.5 * (Anorm + Bnorm)) {
KALDI_LOG << "A = " << A;
KALDI_LOG << "B = " << B;
KALDI_ERR << "Matrices differ, " << diff_norm << " > " << tol << " * 0.5 * ( "
<< Anorm << " + " << Bnorm << " ). ";
}
}
template<typename Real>
static void AssertEqual(const CuBlockMatrix<Real> &A,
const CuBlockMatrix<Real> &B,
float tol = 0.001) {
CuMatrix<Real> Acopy(A), Bcopy(B);
AssertEqual(Acopy, Bcopy, tol);
}
template<class Real>
static void UnitTestCuBlockMatrixIO() {
for (int32 i = 0; i < 10; i++) {
int32 num_blocks = rand() % 5;
std::vector<CuMatrix<Real> > data(num_blocks);
for (int32 b = 0; b < num_blocks; b++) {
int32 dimM = 100 + rand() % 255, dimN = 10 + rand() % 20;
if (b % 2 == 0) std::swap(dimM, dimN);
data[b].Resize(dimM, dimN);
data[b].SetRandn();
}
CuBlockMatrix<Real> B(data);
std::ostringstream os;
bool binary = (i % 4 < 2);
B.Write(os, binary);
CuBlockMatrix<Real> B2;
std::istringstream is(os.str());
B2.Read(is, binary);
CuMatrix<Real> mat(B), mat2(B2);
AssertEqual(mat, mat2);
if (!data.empty())
KALDI_ASSERT(mat.Sum() != 0.0);
}
}
template<class Real>
static void UnitTestCuBlockMatrixAddMatBlock() {
for (int32 i = 0; i < 20; i++) {
int32 num_blocks = rand() % 5;
std::vector<CuMatrix<Real> > data(num_blocks);
for (int32 b = 0; b < num_blocks; b++) {
int32 dimM = 100 + rand() % 255, dimN = 10 + rand() % 20;
// early failures will have small dim for easier eyeballing.
if (b % 2 == 0) std::swap(dimM, dimN);
data[b].Resize(dimM, dimN);
data[b].SetRandn();
}
CuBlockMatrix<Real> B(data);
int32 B_num_rows = B.NumRows(), B_num_cols = B.NumCols();
// will do X += A B
MatrixTransposeType transB = (i % 2 == 1 ? kTrans : kNoTrans),
transA = (i % 3 == 1 ? kTrans : kNoTrans);
if (transB == kTrans) std::swap(B_num_rows, B_num_cols);
int32 X_num_rows = 100 + rand() % 255, X_num_cols = B_num_cols,
A_num_rows = X_num_rows, A_num_cols = B_num_rows;
if (data.size() == 0) { X_num_rows = 0; A_num_rows = 0; }
if (transA == kTrans) std::swap(A_num_rows, A_num_cols);
Real alpha = 2.0, beta = -1.0;
CuMatrix<Real> X(X_num_rows, X_num_cols);
X.SetRandn();
CuMatrix<Real> A(A_num_rows, A_num_cols);
A.SetRandn();
CuMatrix<Real> Xcopy(X), Bcopy(B), Xorig(X), Aorig(A);
Xcopy.AddMatMat(alpha, A, transA, Bcopy, transB, beta);
X.AddMatBlock(alpha, A, transA, B, transB, beta);
AssertEqual(X, Xcopy);
}
}
template<class Real>
static void UnitTestCuBlockMatrixAddMatMat() {
for (int32 i = 0; i < 20; i++) {
int32 num_blocks = rand() % 5;
std::vector<CuMatrix<Real> > data(num_blocks);
for (int32 b = 0; b < num_blocks; b++) {
int32 dimM = 100 + rand() % 255, dimN = 10 + rand() % 20;
if (i == 0) { dimM = 1; dimN = 1; }
// early failures will have small dim for easier eyeballing.
if (b % 2 == 0) std::swap(dimM, dimN);
data[b].Resize(dimM, dimN);
data[b].SetRandn();
}
CuBlockMatrix<Real> B(data);
int32 B_num_rows = B.NumRows(), B_num_cols = B.NumCols();
// will do B += C D
int32 C_num_rows = B_num_rows, C_num_cols = 100 + rand() % 255;
if (C_num_rows == 0) C_num_cols = 0;
int32 D_num_rows = C_num_cols, D_num_cols = B_num_cols;
MatrixTransposeType transC = (i % 2 == 1 ? kTrans : kNoTrans),
transD = (i % 3 == 1 ? kTrans : kNoTrans);
if (transC == kTrans) std::swap(C_num_rows, C_num_cols);
if (transD == kTrans) std::swap(D_num_rows, D_num_cols);
CuMatrix<Real> C(C_num_rows, C_num_cols), D(D_num_rows, D_num_cols);
C.SetRandn();
D.SetRandn();
CuMatrix<Real> Bmat(B);
Real alpha = 2.0, beta = -1.0;
CuBlockMatrix<Real> Bcopy(B);
B.AddMatMat(alpha, C, transC, D, transD, beta);
Bmat.AddMatMat(alpha, C, transC, D, transD, beta);
// Now check that the block-structured part of Bmat is the
// same as B.
Bcopy.CopyFromMat(Bmat); // copy block-structured part from Bmat to Bcopy.
AssertEqual(B, Bcopy);
KALDI_ASSERT(Bmat.Sum() != 0 || B_num_rows == 0);
}
}
// Runs all CuBlockMatrix unit tests in this file (I/O, AddMatBlock and
// AddMatMat) for the given floating-point type Real.
template<typename Real> void CuBlockMatrixUnitTest() {
UnitTestCuBlockMatrixIO<Real>();
UnitTestCuBlockMatrixAddMatBlock<Real>();
UnitTestCuBlockMatrixAddMatMat<Real>();
}
} // namespace kaldi
// Runs the CuBlockMatrix tests twice: first with the GPU disabled ("no"),
// then with it requested ("yes").  In CUDA builds the double-precision tests
// are only run if the device reports support for double precision.
int main() {
  for (int32 loop = 0; loop < 2; loop++) {
    const bool want_gpu = (loop != 0);
#if HAVE_CUDA == 1
    CuDevice::Instantiate().SelectGpuId(want_gpu ? "yes" : "no");
#endif
    kaldi::CuBlockMatrixUnitTest<float>();
#if HAVE_CUDA == 1
    if (!CuDevice::Instantiate().DoublePrecisionSupported()) {
      KALDI_WARN << "Double precision not supported";
    } else {
      kaldi::CuBlockMatrixUnitTest<double>();
    }
#else
    kaldi::CuBlockMatrixUnitTest<double>();
#endif
    if (want_gpu) {
      KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
    } else {
      KALDI_LOG << "Tests without GPU use succeeded.\n";
    }
  }
#if HAVE_CUDA == 1
  CuDevice::Instantiate().PrintProfile();
#endif
  return 0;
}

Просмотреть файл

@ -0,0 +1,330 @@
// cudamatrix/cu-block-matrix.cc
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#if HAVE_CUDA == 1
#include <cuda_runtime_api.h>
#include <cublas.h>
#endif
#include "util/timer.h"
#include "cudamatrix/cu-block-matrix.h"
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-device.h"
namespace kaldi {
// Default constructor: creates an empty block matrix.  num_rows_ is set to
// zero explicitly; it was previously left uninitialized, unlike in every
// other place in this file that sets up the object.
template<class Real>
CuBlockMatrix<Real>::CuBlockMatrix(): num_rows_(0) {
#if HAVE_CUDA == 1
  cu_data_ = NULL;
#endif
}
// Constructs a block-diagonal matrix from a list of non-empty blocks.
// Each block is assigned a (row_offset, col_offset) on the diagonal of the
// conceptual matrix.  The block values are stored side by side in data_,
// which has as many rows as the tallest block and as many columns as the
// sum of all block widths.
template<class Real>
CuBlockMatrix<Real>::CuBlockMatrix(const std::vector<CuMatrix<Real> >&data) {
#if HAVE_CUDA == 1
  cu_data_ = NULL;
#endif
  block_data_.resize(data.size());
  MatrixIndexT total_rows = 0, total_cols = 0, tallest = 0;
  for (size_t b = 0; b < data.size(); b++) {
    const MatrixIndexT num_rows = data[b].NumRows(),
        num_cols = data[b].NumCols();
    KALDI_ASSERT(num_rows > 0 && num_cols > 0);
    BlockMatrixData &info = block_data_[b];
    info.num_rows = num_rows;
    info.num_cols = num_cols;
    info.row_offset = total_rows;
    info.col_offset = total_cols;
    total_rows += num_rows;
    total_cols += num_cols;
    if (num_rows > tallest) tallest = num_rows;
  }
  num_rows_ = total_rows;
  data_.Resize(tallest, total_cols);
  const int32 num_blocks = NumBlocks();
  for (int32 b = 0; b < num_blocks; b++) {
    Block(b).CopyFromMat(data[b]);
  }
  SetCudaData();
}
// Returns a read-only sub-matrix view of block "b" inside data_: rows
// [0, num_rows) and columns [col_offset, col_offset + num_cols).
template<class Real>
const CuSubMatrix<Real> CuBlockMatrix<Real>::Block(int32 b) const {
  KALDI_ASSERT(static_cast<size_t>(b) < block_data_.size());
  const BlockMatrixData &info = block_data_[b];
  const MatrixIndexT first_row = 0;
  return CuSubMatrix<Real>(data_,
                           first_row, info.num_rows,
                           info.col_offset, info.num_cols);
}
// Returns a writable sub-matrix view of block "b" inside data_: rows
// [0, num_rows) and columns [col_offset, col_offset + num_cols).
template<class Real>
CuSubMatrix<Real> CuBlockMatrix<Real>::Block(int32 b) {
  KALDI_ASSERT(static_cast<size_t>(b) < block_data_.size());
  BlockMatrixData &info = block_data_[b];
  const MatrixIndexT first_row = 0;
  return CuSubMatrix<Real>(data_,
                           first_row, info.num_rows,
                           info.col_offset, info.num_cols);
}
// Copy constructor: copies data_, block_data_ and num_rows_ from "other".
// cu_data_ is not copied; it starts as NULL and SetCudaData() then builds
// this object's own table from its own data_.
template<class Real>
CuBlockMatrix<Real>::CuBlockMatrix(const CuBlockMatrix<Real> &other):
data_(other.data_), block_data_(other.block_data_), num_rows_(other.num_rows_) {
#if HAVE_CUDA == 1
cu_data_ = NULL;
#endif
SetCudaData();
}
// Assignment operator: replaces the contents of *this with a copy of
// "other" and rebuilds cu_data_ for the new data_.
template<class Real>
CuBlockMatrix<Real> &CuBlockMatrix<Real>::operator =(const CuBlockMatrix<Real> &other) {
  // Self-assignment: nothing to do.  Avoids freeing and rebuilding the CUDA
  // table and relying on self-assignment of the underlying matrix.
  if (this == &other) return *this;
  FreeCudaData();
  data_ = other.data_;
  block_data_ = other.block_data_;
  num_rows_ = other.num_rows_;
  SetCudaData();
  return *this;
}
// Releases cu_data_ through CuDevice and resets it to NULL.  It is an error
// to hold a non-NULL cu_data_ while no GPU is enabled.  Does nothing in
// builds without CUDA or when cu_data_ is already NULL.
template<class Real>
void CuBlockMatrix<Real>::FreeCudaData() {
#if HAVE_CUDA == 1
  if (cu_data_ == NULL)
    return;
  if (!CuDevice::Instantiate().Enabled()) {
    KALDI_ERR << "CuBlockMatrix: you have CUDA data pointer but "
              << "no GPU is enabled: likely code error.";
  } else {
    CuDevice::Instantiate().Free(cu_data_);
    cu_data_ = NULL;
  }
#endif
}
// Builds cu_data_: an array with one CuBlockMatrixData entry per block
// (diagonal offsets, dimensions and data pointer of the block), assembled on
// the host and then copied with cudaMemcpy (host-to-device) into memory
// obtained from CuDevice::Malloc.  Requires cu_data_ == NULL on entry.
// Does nothing without CUDA, with no blocks, or when the GPU is not enabled.
template<class Real>
void CuBlockMatrix<Real>::SetCudaData() {
#if HAVE_CUDA == 1
  KALDI_ASSERT(cu_data_ == NULL);
  if (block_data_.size() == 0) return;  // Nothing to do.
  if (!CuDevice::Instantiate().Enabled()) return;

  Timer timer;
  const int32 num_blocks = NumBlocks();
  std::vector<CuBlockMatrixData> host_table(num_blocks);
  int32 next_row = 0, next_col = 0;
  for (int32 b = 0; b < num_blocks; b++) {
    CuSubMatrix<Real> block = Block(b);
    CuBlockMatrixData &entry = host_table[b];
    entry.row_offset = next_row;
    entry.col_offset = next_col;
    entry.matrix_dim = block.Dim();
    entry.matrix_data = static_cast<void*>(block.Data());
    next_row += block.NumRows();
    next_col += block.NumCols();
  }
  const size_t num_bytes = NumBlocks() * sizeof(CuBlockMatrixData);
  cu_data_ = static_cast<CuBlockMatrixData*>(
      CuDevice::Instantiate().Malloc(num_bytes));
  CU_SAFE_CALL(cudaMemcpy(cu_data_, &(host_table[0]), num_bytes,
                          cudaMemcpyHostToDevice));
  CuDevice::Instantiate().AccuProfile(__func__, timer.Elapsed());
#endif
}
// Exchanges the contents of *this and *other by swapping each member:
// data_, block_data_, num_rows_ and (in CUDA builds) cu_data_.
template<class Real>
void CuBlockMatrix<Real>::Swap(CuBlockMatrix<Real> *other) {
data_.Swap(&other->data_);
block_data_.swap(other->block_data_);
std::swap(num_rows_, other->num_rows_);
#if HAVE_CUDA == 1
std::swap(cu_data_, other->cu_data_);
#endif
}
// Writes the block matrix to "os" (binary or text according to "binary").
// Format: "<CuBlockMatrix>" token, the number of blocks as int32, each block
// written via its own Write(), then the "</CuBlockMatrix>" token.
template<class Real>
void CuBlockMatrix<Real>::Write(std::ostream &os, bool binary) const {
WriteToken(os, binary, "<CuBlockMatrix>");
int32 num_blocks = NumBlocks();
WriteBasicType(os, binary, num_blocks);
for (int32 b = 0; b < num_blocks; b++)
this->Block(b).Write(os, binary);
WriteToken(os, binary, "</CuBlockMatrix>");
}
/// Replaces the contents of *this with a block matrix read from "is".
/// Two layouts are accepted: the current one, wrapped in "<CuBlockMatrix>" /
/// "</CuBlockMatrix>" tokens, and an older one (back-compatibility with the
/// older format of MixtureProbComponent) that has no tokens and starts
/// directly with the block count.  The back-compatibility path should be
/// deleted eventually.
template<class Real>
void CuBlockMatrix<Real>::Read(std::istream &is, bool binary) {
  Destroy();
  // The current format is recognized by its leading '<'.
  const bool has_tokens = (Peek(is, binary) == static_cast<int>('<'));
  if (has_tokens)
    ExpectToken(is, binary, "<CuBlockMatrix>");

  int32 num_blocks;
  ReadBasicType(is, binary, &num_blocks);
  KALDI_ASSERT(num_blocks >= 0);
  std::vector<CuMatrix<Real> > blocks;
  blocks.resize(num_blocks);
  for (int32 b = 0; b < num_blocks; b++)
    blocks[b].Read(is, binary);

  if (has_tokens)
    ExpectToken(is, binary, "</CuBlockMatrix>");

  // The constructor from std::vector<CuMatrix<Real> > does the main job of
  // initialization; we then take over its state.
  CuBlockMatrix<Real> assembled(blocks);
  this->Swap(&assembled);
}
/// Frees and deinitializes everything: empties the packed data matrix, clears
/// the per-block bookkeeping, zeroes the row count and finally releases the
/// CUDA-side descriptor table (if any).
template<class Real>
void CuBlockMatrix<Real>::Destroy() {
data_.Resize(0, 0);
block_data_.clear();
num_rows_ = 0;
// Done last: FreeCudaData() reports an error if a CUDA pointer is held while
// no GPU is enabled, and by then the host-side state is already reset.
FreeCudaData();
}
// Does *this = alpha A B + beta * *this, discarding elements outside
// the block structure of the *this matrix.
//
// transA / transB select whether A / B are used as given (kNoTrans) or
// transposed (kTrans).  After accounting for the transposes, A must have
// NumRows() rows, B must have NumCols() columns, and the inner dimensions
// must agree.  An empty block matrix is a no-op.
// NOTE(review): alpha and beta are declared as BaseFloat rather than Real.
template<class Real>
void CuBlockMatrix<Real>::AddMatMat(
BaseFloat alpha,
const CuMatrix<Real> &A, MatrixTransposeType transA,
const CuMatrix<Real> &B, MatrixTransposeType transB,
BaseFloat beta) {
// Effective dimensions and strides of A and B.  A transpose is expressed
// by swapping the dimensions and swapping the row/column strides.
MatrixIndexT A_num_rows = A.NumRows(), A_num_cols = A.NumCols(),
A_row_stride = A.Stride(), A_col_stride = 1,
B_num_rows = B.NumRows(), B_num_cols = B.NumCols(),
B_row_stride = B.Stride(), B_col_stride = 1;
if (transA == kTrans) {
std::swap(A_num_rows, A_num_cols);
std::swap(A_row_stride, A_col_stride);
}
if (transB == kTrans) {
std::swap(B_num_rows, B_num_cols);
std::swap(B_row_stride, B_col_stride);
}
KALDI_ASSERT(A_num_rows == NumRows() && B_num_cols == NumCols()
&& A_num_cols == B_num_rows);
if (NumBlocks() == 0) return; // empty matrix.
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
// (x,y,z) dimensions are (block-id, row-of-block, col-of-block)
// First some logic to choose block dims...
// we assume (which we can, safely) that CU1DBLOCK is <= the max threads per block.
int32 x_blocksize = std::min(CU1DBLOCK, NumBlocks()); // x dim corresponds to block-idx.
int32 max_block_rows = MaxBlockRows(), max_block_cols = MaxBlockCols();
// Shrink y (then z) until the thread-block holds at most CU1DBLOCK threads
// in total and neither y nor z exceeds CU2DBLOCK.
int32 y_blocksize = max_block_rows;
while (y_blocksize * x_blocksize > CU1DBLOCK || y_blocksize > CU2DBLOCK)
y_blocksize--;
int32 z_blocksize = max_block_cols;
while (z_blocksize * x_blocksize * y_blocksize > CU1DBLOCK || z_blocksize > CU2DBLOCK)
z_blocksize--;
dim3 dimBlock(x_blocksize, y_blocksize, z_blocksize);
// Enough thread-blocks to cover every block index and every row / column
// of the largest block.
dim3 dimGrid(n_blocks(NumBlocks(), x_blocksize),
n_blocks(max_block_rows, y_blocksize),
n_blocks(max_block_cols, z_blocksize));
cuda_block_add_mat_mat(dimGrid, dimBlock, cu_data_, NumBlocks(),
A.Data(), A_num_cols, A_row_stride, A_col_stride,
B.Data(), B_row_stride, B_col_stride, alpha, beta);
CU_SAFE_CALL(cudaGetLastError());
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
{
// No-GPU path: handle the blocks one at a time, multiplying only the
// parts of A and B that contribute to each block.
int32 row_offset = 0, col_offset = 0;
for (MatrixIndexT b = 0; b < NumBlocks(); b++) {
CuSubMatrix<Real> this_block = Block(b);
MatrixIndexT this_num_rows = this_block.NumRows(),
this_num_cols = this_block.NumCols();
// A_part: the rows of op(A) that line up with this block's rows.
// B_part: the columns of op(B) that line up with this block's columns.
// When the operand is transposed, the range is taken along the other
// axis of the stored matrix.
CuSubMatrix<Real> A_part = (transA == kNoTrans ?
A.Range(row_offset, this_num_rows,
0, A.NumCols()) :
A.Range(0, A.NumRows(),
row_offset, this_num_rows)),
B_part = (transB == kNoTrans ?
B.Range(0, B.NumRows(),
col_offset, this_num_cols) :
B.Range(col_offset, this_num_cols,
0, B.NumCols()));
this_block.AddMatMat(alpha, A_part, transA, B_part, transB, beta);
row_offset += this_num_rows;
col_offset += this_num_cols;
}
// The blocks must tile the diagonal exactly.
KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
}
}
/// Returns the largest column count found among the blocks, or 0 if there
/// are no blocks.
template<class Real>
MatrixIndexT CuBlockMatrix<Real>::MaxBlockCols() const {
  MatrixIndexT widest = 0;
  for (size_t idx = 0; idx < block_data_.size(); ++idx) {
    const MatrixIndexT cols = block_data_[idx].num_cols;
    if (widest < cols)
      widest = cols;
  }
  return widest;
}
/// Returns the number of rows of the packed storage matrix data_.
/// NOTE(review): the header documents this as the max num-rows of any block;
/// presumably data_ is sized to the tallest block -- confirm against the
/// constructor, which is not visible here.
template<class Real>
MatrixIndexT CuBlockMatrix<Real>::MaxBlockRows() const {
return data_.NumRows();
}
/// Copies into each block the sub-matrix of M that lies at the block's
/// position on the block diagonal; elements of M outside the block structure
/// are ignored.  M must have the same overall dimensions as *this.
template<class Real>
void CuBlockMatrix<Real>::CopyFromMat(const CuMatrix<Real> &M) {
  KALDI_ASSERT(NumRows() == M.NumRows() && NumCols() == M.NumCols());
  // Top-left corner, within M, of the block currently being copied.
  MatrixIndexT top = 0, left = 0;
  for (MatrixIndexT b = 0; b < NumBlocks(); b++) {
    CuSubMatrix<Real> dst = Block(b);
    MatrixIndexT rows = dst.NumRows(), cols = dst.NumCols();
    const CuSubMatrix<Real> src(M, top, rows, left, cols);
    dst.CopyFromMat(src);
    top += rows;
    left += cols;
  }
  // The blocks must cover the diagonal of M exactly.
  KALDI_ASSERT(top == NumRows() && left == NumCols());
}
/**
 * Stream-insertion operator: writes the matrix to "out" in text
 * (non-binary) mode via CuBlockMatrix::Write() and returns the stream.
 */
template<typename Real>
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<Real> &mat) {
  mat.Write(out, false);  // false == text mode
  return out;
}
// Explicitly instantiate the stream-insertion operator for float and double,
// since its definition lives in this source file rather than in the header.
template
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<float> &mat);
template
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<double> &mat);
// Instantiate the class for float and double.
template class CuBlockMatrix<float>;
template class CuBlockMatrix<double>;
} // namespace kaldi

Просмотреть файл

@ -0,0 +1,150 @@
// cudamatrix/cu-block-matrix.h
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_CUDAMATRIX_CU_BLOCK_MATRIX_H_
#define KALDI_CUDAMATRIX_CU_BLOCK_MATRIX_H_
#include <sstream>
#include <vector>
#include "cudamatrix/cu-common.h"
namespace kaldi {
/**
The class CuBlockMatrix holds a vector of objects of type CuMatrix,
say, M_1, M_2, .. M_N
and it represents the matrix diag(M_1, M_2, ... M_N). Note:
the individual matrices do not have to be square. The reason the
class is needed is mostly so that we can efficiently multiply by this
block-diagonal structure in a parallel way.
If we have a GPU available, CuBlockMatrix will also store on the GPU a
copy of the small per-block descriptor objects (pointers and dimensions)
for M_1 .. M_N.  The "primary" version of those descriptor objects
remains on the CPU, even though the matrix data they point to is on the
GPU; we just copy the descriptors over to the GPU whenever they change.
*/
template<typename Real>
class CuBlockMatrix {
public:
friend class CuMatrixBase<Real>;
/// Default constructor.
CuBlockMatrix();
/// Constructs the block-diagonal matrix diag(data[0], data[1], ...).
CuBlockMatrix(const std::vector<CuMatrix<Real> > &data);
~CuBlockMatrix() { Destroy(); }
/// Copy constructor
CuBlockMatrix(const CuBlockMatrix &other);
/// Assignment operator
CuBlockMatrix &operator= (const CuBlockMatrix &other);
/// Writes the matrix to a stream, in binary or text mode.
void Write(std::ostream &os, bool binary) const;
/// Reads the matrix from a stream, replacing the current contents.
void Read(std::istream &is, bool binary);
/// Total number of rows (sum over all blocks).
MatrixIndexT NumRows() const { return num_rows_; }
/// Total number of columns; taken from the packed storage matrix data_.
MatrixIndexT NumCols() const { return data_.num_cols_; }
/// Number of blocks on the diagonal.
MatrixIndexT NumBlocks() const { return block_data_.size(); }
// Returns max num-columns of any block
MatrixIndexT MaxBlockCols() const ;
// Returns max num-rows of any block
MatrixIndexT MaxBlockRows() const;
/// Returns block number b (const version).
const CuSubMatrix<Real> Block(MatrixIndexT b) const;
CuSubMatrix<Real> Block(MatrixIndexT b); // returns a CuSubMatrix to disallow resizes.
/// Does *this = alpha A B + beta * *this, discarding elements of the product outside
/// the block structure of the *this matrix. The transA and transB parameters
/// can be used to substitute A^T for A and B^T for B, respectively.
void AddMatMat(BaseFloat alpha,
const CuMatrix<Real> &A, MatrixTransposeType transA,
const CuMatrix<Real> &B, MatrixTransposeType transB,
BaseFloat beta);
/// Copies elements within the block structure from matrix M, discarding others.
/// Note: this has not been implemented in a very efficient way, it's used only
/// for testing.
void CopyFromMat(const CuMatrix<Real> &M);
/// Normalizes the columns of *this so that each one sums to one.
/// On error (e.g. inf's), will set the column to a constant value that
/// sums to one.
void NormalizeColumns();
/// Swaps the contents of *this and *other.
void Swap(CuBlockMatrix *other);
protected:
CuMatrix<Real> data_; // This is a single matrix into which
// we pack all the blocks (possibly with spaces left over)
// Host-side bookkeeping for one block: its dimensions, and the position of
// its top-left corner on the block diagonal of the full matrix.
struct BlockMatrixData{
MatrixIndexT num_rows;
MatrixIndexT num_cols;
MatrixIndexT row_offset;
MatrixIndexT col_offset;
};
#if HAVE_CUDA == 1
// Pointer to the CUDA-side descriptor table; NULL if none has been set up.
const CuBlockMatrixData* CuData() const { return cu_data_; }
#endif
private:
/// If using GPU and cu_data_ != NULL, free cu_data_ and set it to NULL
void FreeCudaData();
/// If using GPU, allocate and set cu_data_ on the GPU to reflect "data_".
void SetCudaData();
/// Frees and deinitializes everything.
void Destroy();
std::vector<BlockMatrixData> block_data_; // one entry per block.
MatrixIndexT num_rows_; // sum of num_rows of elements of block_data_.
#if HAVE_CUDA == 1
CuBlockMatrixData *cu_data_; // We store the pointers and some additional info
// on the GPU card in a form more suited to
// use by CUDA kernels.
#endif
}; // class CuBlockMatrix
template<typename Real>
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<Real> &mat);
} // namespace Kaldi
#endif

Просмотреть файл

@ -0,0 +1,53 @@
// cudamatrix/cu-choleskykernels-ansi.h
// Copyright 2010-2013 Dr. Stephan Kramer
// Institut für Numerische und Angewandte Mathematik
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_CUDAMATRIX_CU_CHOLESKYKERNELS_ANSI_H_
#define KALDI_CUDAMATRIX_CU_CHOLESKYKERNELS_ANSI_H_
#include <stdlib.h>
#include <stdio.h>
#include "cudamatrix/cu-matrixdim.h"
#if HAVE_CUDA == 1
// C-linkage entry points for the tiled Cholesky CUDA kernels.  In every
// function, A is the matrix data the kernels operate on in place, d gives its
// dimensions / stride, and block_offset is the index (in tiles) of the
// diagonal tile currently being processed.
extern "C" {
/*********************************************************
* float CUDA kernel calls
*/
// Factorizes the diagonal tile at block_offset.
void cudaF_factorize_diagonal_block(float* A, int block_offset, MatrixDim d);
// Updates the tiles in the tile-column below the diagonal tile.
void cudaF_strip_update(float* A, int block_offset, int n_remaining_blocks, MatrixDim d);
// Updates the later diagonal tiles using the tiles in that tile-column.
void cudaF_diag_update(float* A, int block_offset, int n_remaining_blocks, MatrixDim d);
// Updates the remaining off-diagonal tiles below the later diagonal tiles.
void cudaF_lo_update(float* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d);
/*********************************************************
* double CUDA kernel calls
*/
// Double-precision counterparts of the functions above.
void cudaD_factorize_diagonal_block(double* A, int block_offset, MatrixDim d);
void cudaD_strip_update(double* A, int block_offset, int n_remaining_blocks, MatrixDim d);
void cudaD_diag_update(double* A, int block_offset, int n_remaining_blocks, MatrixDim d);
void cudaD_lo_update(double* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d);
}
#endif // HAVE_CUDA
#endif

Просмотреть файл

@ -0,0 +1,359 @@
// cudamatrix/cu-choleskykernel.cu
// Copyright 2010-2013 Dr. Stephan Kramer
// Institut fur Numerische und Angewandte Mathematik
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "cudamatrix/cu-choleskykernels-ansi.h"
#include <stdio.h>
#define TILE_SIZE 16
/***********************************************************************
* CUDA kernels
* some functions are templated to have the float/double operations
*/
// Maps a (row, column) position to its linear index in a row-major array
// whose rows are row_length elements apart.
__device__ int lex_index_2D (int r, int c, int row_length) {
  const int row_start = r * row_length;
  return row_start + c;
}
// Converts a position inside a tile (t_pos) into a position in the whole
// matrix, given the index of the tile (block_offset) along that axis.
__device__ int global_pos(int t_pos, int block_offset) {
  const int tile_origin = TILE_SIZE*block_offset;
  return t_pos + tile_origin;
}
// Reciprocal square root, 1/sqrt(x).  Overloaded so that the templated
// kernels below pick the single-precision intrinsic (rsqrtf) for float ...
__device__ float inv_sqrt(float x) {
return rsqrtf(x);
}
// ... and the double-precision one (rsqrt) for double.
__device__ double inv_sqrt(double x) {
return rsqrt(x);
}
// Cholesky-style factorization of one diagonal tile, done in place.
//   A            - matrix data, addressed row-major with row stride d.stride
//   block_offset - index (in tiles) of the diagonal tile to factorize
//   d            - matrix dimensions; d.cols is used as the bound for both
//                  rows and columns
// The host wrappers launch this with a single thread block of
// TILE_SIZE x TILE_SIZE threads; thread (x, y) handles tile column x, row y.
// Only the lower triangle (row >= col) of the tile is written back.
// NOTE(review): some threads `return` before later __syncthreads() calls
// (out-of-range threads below, and threads outside k_max further down), so
// the barriers are not reached by every thread of the block -- confirm this
// is acceptable on the targeted hardware.
template<typename T>
__global__
void __factorize_diagonal_block(T* A, int block_offset, MatrixDim d) {
int global_row_length = d.stride;
int col = threadIdx.x;
int row = threadIdx.y;
int global_row = global_pos(row,block_offset);
int global_col = global_pos(col,block_offset);
// Threads that fall outside the matrix have nothing to do.
if ((global_row >= d.cols) || (global_col >= d.cols))
return;
// k_max: number of valid rows/columns in this tile (less than TILE_SIZE
// only when the tile overhangs the matrix edge).
int k_max = TILE_SIZE;
if (d.cols - global_pos(0,block_offset) < TILE_SIZE)
k_max = d.cols % TILE_SIZE;
int idx = lex_index_2D(global_row, global_col, global_row_length);
// Tile copy in shared memory; one extra column per row
// (presumably padding against bank conflicts -- TODO confirm).
__shared__ T L[TILE_SIZE][TILE_SIZE+1];
L[row][col] = 0;
L[row][col] = A[idx];
__syncthreads();
if ((row >= k_max) || (col >= k_max))
return;
T fac;
for (int k = 0; k < k_max; k++) {
__syncthreads();
// Every thread reads the current pivot before it gets overwritten below.
fac = inv_sqrt(L[k][k]);
__syncthreads();
// Threads of row k scale column k (from the diagonal downwards) by
// 1/sqrt(pivot); note the transposed indexing L[col][row].
if ((row==k)&&(col>=k))
L[col][row] = (L[col][row])*fac;
__syncthreads();
// Subtract the contribution of column k from the lower-triangular
// elements in the columns to its right.
if ((row>=col)&&(col>k))
L[row][col] = L[row][col] - L[col][k]*L[row][k];
}
__syncthreads();
// Write the lower triangle back to global memory.
if (row >= col) {
A[idx] = L[row][col];
// NOTE(review): values above 100000 are replaced by 1; the reason for this
// threshold is not visible here -- confirm it is intended.
if (A[idx] > 100000)
A[idx] = 1;
}
}
// Updates the tiles located below the diagonal tile `block_offset`, in the
// same tile-column.  Each thread block handles one such tile: tile-row
// blockIdx.x + block_offset + 1.  For every column of the (transposed) working
// tile, a forward substitution against the diagonal tile is performed, and the
// result is written back to A in place.
// NOTE(review): threads that are out of range `return` before the
// __syncthreads() calls below, so the barriers are not reached by all threads
// of the block -- confirm this is acceptable on the targeted hardware.
template<typename T>
__global__
void __strip_update(T* A, int block_offset, MatrixDim d) {
int global_row_length = d.stride;
int boffy = block_offset;
int boffx = blockIdx.x + boffy + 1;
int col = threadIdx.x;
int row = threadIdx.y;
// topleft: copy of the diagonal tile; workingmat: the tile being updated,
// stored transposed.  Both have one extra column per row.
__shared__ T topleft[TILE_SIZE][TILE_SIZE+1];
__shared__ T workingmat[TILE_SIZE][TILE_SIZE+1];
int global_row = global_pos(row,block_offset);
int global_col = global_pos(col,block_offset);
if ((global_row >= d.cols) || (global_col >= d.cols))
return;
int idx = lex_index_2D(global_row, global_col, global_row_length);
topleft[row][col] = 0;
topleft[row][col] = A[idx];
//__syncthreads();
// Switch to the row of the tile this thread block is responsible for.
global_row = global_pos(row,boffx);
if (global_row >= d.cols)
return;
int idx_w = lex_index_2D(global_row, global_col, global_row_length);
//int row2 = row + block_offset * TILE_SIZE;
//int idx_w = row2 + col*global_row_length;
// Load transposed: element (row, col) of the tile goes to workingmat[col][row].
workingmat[col][row]=0;
workingmat[col][row]=A[idx_w];
__syncthreads();
// Only the threads with row == 0 do the substitution, one per column `col`
// of workingmat.  The loops always run over the full TILE_SIZE.
if (row==0) {
for (int k = 0; k < TILE_SIZE; k++) {
T sum=0.0;
for (int m = 0; m < k; m++)
sum = sum + topleft[k][m]*workingmat[m][col];
workingmat[k][col] = (workingmat[k][col] - sum) / topleft[k][k];
}
}
__syncthreads();
// Write the updated element back (undoing the transposed storage).
A[idx_w] = workingmat[col][row];
// NOTE(review): values above 100000 are replaced by 1; the reason for this
// threshold is not visible here -- confirm it is intended.
if (A[idx_w] > 100000)
A[idx_w] = 1;
//A[idx_w] = 1;
}
// Updates the lower triangle of a later diagonal tile.  Each thread block
// handles the diagonal tile with index boffx = blockIdx.x + block_offset + 1:
// it loads the tile at (tile-row boffx, tile-column block_offset) into shared
// memory and subtracts, from element (row, col) of the diagonal tile boffx,
// the dot product of rows `row` and `col` of the loaded tile.
// NOTE(review): out-of-range threads `return` before the __syncthreads()
// below -- confirm this is acceptable on the targeted hardware.
template<typename T>
__global__
void __diag_update(T* A, int block_offset, MatrixDim d) {
int global_row_length = d.stride;
int boffx = blockIdx.x + block_offset + 1;
int col = threadIdx.x;
int row = threadIdx.y;
int global_row = global_pos(row,boffx);
int global_col = global_pos(col,block_offset);
if ((global_row >= d.cols) || (global_col >= d.cols))
return;
int idx = lex_index_2D(global_row, global_col, global_row_length);
__shared__ T left[TILE_SIZE][TILE_SIZE+1];
left[row][col] = 0;
left[row][col] = A[idx];
__syncthreads();
T sum = 0.0;
// Only the lower triangle (row >= col) of the diagonal tile is updated.
if (row >= col) {
for (int kk = 0; kk < TILE_SIZE; kk++)
sum = sum + left[row][kk]*left[col][kk];
//__syncthreads();
// Re-target the column to the diagonal tile boffx before writing.
global_col = global_pos(col, boffx);
if (global_col >= d.cols)
return;
idx = lex_index_2D(global_row, global_col, global_row_length);
A[idx] = A[idx] - sum;
}
}
// Updates the off-diagonal tiles below the later diagonal tiles.  Each thread
// block (indexed by blockIdx.y) owns one tile-column boffy =
// blockIdx.y + block_offset + 1.  It loads the tile at (tile-row boffy,
// tile-column block_offset) once into `upt`, then walks over the tile-rows
// boffx = boffy+1 .. n_blocks-1: for each, it loads the tile at (boffx,
// block_offset) into `left` and subtracts from element (row, col) of the tile
// at (boffx, boffy) the dot product of row `row` of `left` with row `col` of
// `upt`.
// NOTE(review): several `return` statements sit before / between the
// __syncthreads() calls, including inside the loop, so the barriers are not
// reached by all threads of the block -- confirm this is acceptable on the
// targeted hardware.
template<typename T>
__global__
void __lo_update(T* A, int block_offset, int n_blocks, MatrixDim d) {
int global_row_length = d.stride;
int col = threadIdx.x;
int row = threadIdx.y;
int boffy = blockIdx.y + block_offset + 1;
//int boffx = boffy + 1;
int boffx = boffy + 1;
// Note: `left` has TILE_SIZE columns, while `upt` has one extra column.
__shared__ T left[TILE_SIZE][TILE_SIZE];
__shared__ T upt[TILE_SIZE][TILE_SIZE + 1];
int global_row = global_pos(row,boffy);
int global_col_src = global_pos(col,block_offset);
if ((global_row >= d.cols) || (global_col_src >= d.cols))
return;
int idx = lex_index_2D(global_row, global_col_src, global_row_length);
upt[row][col] = 0;
upt[row][col] = A[idx];
__syncthreads();
for (; boffx < n_blocks; boffx++) {
global_row = global_pos(row,boffx);
if (global_row >= d.cols)
return;
idx = lex_index_2D(global_row, global_col_src, global_row_length);
left[row][col] = 0;
left[row][col] = A[idx];
__syncthreads();
// NOTE(review): same condition as the check a few lines above, and
// global_row has not changed in between, so this can never trigger.
if (global_row >= d.cols)
return;
T matrixprod = 0.0;
for (int kk = 0; kk < TILE_SIZE; kk++)
matrixprod += left[row][kk]*upt[col][kk];
// Make sure everyone is done reading `left` before the next iteration
// overwrites it.
__syncthreads();
int global_col = global_pos(col,boffy);
if (global_col >= d.cols)
return;
idx = lex_index_2D(global_row, global_col, global_row_length);
A[idx] = A[idx] - matrixprod;
}
}
/***********************************************************************
* ANSI-C wrappers of CUDA kernels
*/
/*
* float
*/
void cudaF_factorize_diagonal_block(float* A, int block_offset, MatrixDim d) {
dim3 threads(TILE_SIZE,TILE_SIZE);
__factorize_diagonal_block<<<1,threads>>>(A,block_offset,d);
cudaThreadSynchronize();
}
void cudaF_strip_update(float* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
dim3 threads(TILE_SIZE,TILE_SIZE);
if (n_remaining_blocks >= 2) {
dim3 stripgrid(n_remaining_blocks-1);
__strip_update<<<stripgrid,threads>>>(A,block_offset,d);
cudaThreadSynchronize();
} else {
int stripgrid = 1;
__strip_update<<<stripgrid,threads>>>(A,block_offset,d);
cudaThreadSynchronize();
}
}
void cudaF_diag_update(float* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
dim3 threads(TILE_SIZE,TILE_SIZE);
if (n_remaining_blocks >= 2) {
dim3 diaggrid(n_remaining_blocks-1);
__diag_update<<<diaggrid,threads>>>(A,block_offset,d);
cudaThreadSynchronize();
} else {
int diaggrid = 1;
__diag_update<<<diaggrid,threads>>>(A,block_offset,d);
cudaThreadSynchronize();
}
}
void cudaF_lo_update(float* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) {
dim3 logrid;
logrid.x = 1;
logrid.y = n_remaining_blocks-2;
dim3 threads(TILE_SIZE,TILE_SIZE);
__lo_update<<<logrid,threads>>>(A,block_offset,n_blocks,d);
cudaThreadSynchronize();
}
/*
* double
*/
void cudaD_factorize_diagonal_block(double* A, int block_offset, MatrixDim d) {
dim3 threads(TILE_SIZE,TILE_SIZE);
__factorize_diagonal_block<<<1,threads>>>(A,block_offset,d);
cudaThreadSynchronize();
}
void cudaD_strip_update(double* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
dim3 threads(TILE_SIZE,TILE_SIZE);
if (n_remaining_blocks >= 2) {
dim3 stripgrid(n_remaining_blocks-1);
__strip_update<<<stripgrid,threads>>>(A,block_offset,d);
cudaThreadSynchronize();
} else {
int stripgrid = 1;
__strip_update<<<stripgrid,threads>>>(A,block_offset,d);
cudaThreadSynchronize();
}
}
void cudaD_diag_update(double* A, int block_offset, int n_remaining_blocks, MatrixDim d) {
dim3 threads(TILE_SIZE,TILE_SIZE);
if (n_remaining_blocks >= 2) {
dim3 diaggrid(n_remaining_blocks-1);
__diag_update<<<diaggrid,threads>>>(A,block_offset,d);
cudaThreadSynchronize();
} else {
int diaggrid = 1;
__diag_update<<<diaggrid,threads>>>(A,block_offset,d);
cudaThreadSynchronize();
}
}
void cudaD_lo_update(double* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) {
dim3 logrid;
logrid.x = 1;
logrid.y = n_remaining_blocks-2;
dim3 threads(TILE_SIZE,TILE_SIZE);
__lo_update<<<logrid,threads>>>(A,block_offset,n_blocks,d);
cudaThreadSynchronize();
}

Просмотреть файл

@ -0,0 +1,62 @@
// cudamatrix/cu-choleskykernel.h
// Copyright 2010-2013 Dr. Stephan Kramer
// Institut für Numerische und Angewandte Mathematik
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_CUDAMATRIX_CU_CHOLESKYKERNELS_H_
#define KALDI_CUDAMATRIX_CU_CHOLESKYKERNELS_H_
#if HAVE_CUDA == 1
#include "base/kaldi-error.h"
#include "cudamatrix/cu-choleskykernels-ansi.h"
/*
* In this file are C++ templated wrappers
* of the ANSI-C CUDA kernels
*/
namespace kaldi {
/*********************************************************
* base templates
*/
template<typename Real> inline void cuda_factorize_diagonal_block(Real* A, int block_offset, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_strip_update(Real* A, int block_offset, int n_remaining_blocks, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_diag_update(Real* A, int block_offset, int n_remaining_blocks, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_lo_update(Real* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
/*********************************************************
* float specialization
*/
template<> inline void cuda_factorize_diagonal_block<float>(float* A, int block_offset, MatrixDim d) { cudaF_factorize_diagonal_block(A,block_offset,d); }
template<> inline void cuda_strip_update<float>(float* A, int block_offset, int n_remaining_blocks, MatrixDim d) { cudaF_strip_update(A,block_offset,n_remaining_blocks,d); }
template<> inline void cuda_diag_update<float>(float* A, int block_offset, int n_remaining_blocks, MatrixDim d) { cudaF_diag_update(A,block_offset,n_remaining_blocks,d); }
template<> inline void cuda_lo_update<float>(float* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) { cudaF_lo_update(A,block_offset,n_blocks,n_remaining_blocks,d); }
/*********************************************************
* double specialization
*/
template<> inline void cuda_factorize_diagonal_block<double>(double* A, int block_offset, MatrixDim d) { cudaD_factorize_diagonal_block(A,block_offset,d); }
template<> inline void cuda_strip_update<double>(double* A, int block_offset, int n_remaining_blocks, MatrixDim d) { cudaD_strip_update(A,block_offset,n_remaining_blocks,d); }
template<> inline void cuda_diag_update<double>(double* A, int block_offset, int n_remaining_blocks, MatrixDim d) { cudaD_diag_update(A,block_offset,n_remaining_blocks,d); }
template<> inline void cuda_lo_update<double>(double* A, int block_offset, int n_blocks, int n_remaining_blocks, MatrixDim d) { cudaD_lo_update(A,block_offset,n_blocks,n_remaining_blocks,d); }
} // namespace
#endif // HAVE_CUDA
#endif

Просмотреть файл

@ -0,0 +1,32 @@
#ifndef KALDI_CUDAMATRIX_COMMON_H_
#define KALDI_CUDAMATRIX_COMMON_H_
// This file contains some #includes, forward declarations
// and typedefs that are needed by all the main header
// files in this directory.
#include "base/kaldi-common.h"
#include "matrix/kaldi-blas.h"
#include "cudamatrix/cu-device.h"
#include "cudamatrix/cu-common.h"
namespace kaldi {
#if HAVE_CUDA == 1
// Translates a Kaldi transpose flag into the corresponding cuBLAS operation:
// kNoTrans -> CUBLAS_OP_N, kTrans -> CUBLAS_OP_T, anything else -> CUBLAS_OP_C.
cublasOperation_t KaldiTransToCuTrans(MatrixTransposeType kaldi_trans) {
  if (kaldi_trans == kNoTrans)
    return CUBLAS_OP_N;
  if (kaldi_trans == kTrans)
    return CUBLAS_OP_T;
  return CUBLAS_OP_C;
}
#endif
} // namespace
#endif // KALDI_CUDAMATRIX_COMMON_H_

Просмотреть файл

@ -22,20 +22,20 @@
#ifndef KALDI_CUDAMATRIX_CU_COMMON_H_
#define KALDI_CUDAMATRIX_CU_COMMON_H_
#if HAVE_CUDA==1
#include "cudamatrix/cu-matrixdim.h" // for CU1DBLOCK and CU2DBLOCK
#include <iostream>
#include <sstream>
#include "base/kaldi-error.h"
#include "matrix/matrix-common.h"
#if HAVE_CUDA == 1
#include <cublas.h>
#include <cuda_runtime_api.h>
#include "base/kaldi-error.h"
#define cuSafeCall(fun) \
#define CU_SAFE_CALL(fun) \
{ \
int32 ret; \
if ((ret = (fun)) != 0) { \
@ -47,19 +47,19 @@
namespace kaldi {
/** The size of edge of CUDA square block **/
static const int32 CUBLOCK = 16;
/** Number of blocks into which a task of size 'size' is split when each
 *  block holds 'block_size' items; a partial final block counts as one. **/
inline int32 n_blocks(int32 size, int32 block_size) {
  int32 count = size / block_size;
  if (size % block_size != 0)
    count += 1;
  return count;
}
/** Number of blocks into which a task of size 'size' is split when each
 *  block holds 'block_size' items; a partial final block counts as one. **/
inline int32 n_blocks(int32 size, int32 block_size) {
  int32 count = size / block_size;
  if (size % block_size != 0)
    count += 1;
  return count;
}
cublasOperation_t KaldiTransToCuTrans(MatrixTransposeType kaldi_trans);
}
#endif // HAVE_CUDA
namespace kaldi {
// Some forward declarations, frequently needed
// Some forward declarations, needed for friend declarations.
template<typename Real> class CuVectorBase;
template<typename Real> class CuVector;
template<typename Real> class CuSubVector;
@ -67,7 +67,13 @@ template<typename Real> class CuRand;
template<typename Real> class CuMatrixBase;
template<typename Real> class CuMatrix;
template<typename Real> class CuSubMatrix;
template<typename Real> class CuRand;
template<typename Real> class CuPackedMatrix;
template<typename Real> class CuSpMatrix;
template<typename Real> class CuTpMatrix;
template<typename Real> class CuBlockMatrix; // this has no non-CU counterpart.
}

Просмотреть файл

@ -1,6 +1,8 @@
// cudamatrix/cu-device.cc
// Copyright 2009-2012 Karel Vesely
// 2013 Lucas Ondel
// 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
@ -19,140 +21,137 @@
#if HAVE_CUDA==1
#if HAVE_CUDA == 1
#include <cublas.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <string>
#include <vector>
#include <algorithm>
#include <dlfcn.h>
#include <unistd.h> // for sleep
#include "cudamatrix/cu-common.h"
#include "cudamatrix/cu-device.h"
#include "base/kaldi-error.h"
#include "util/common-utils.h"
namespace kaldi {
// Constructor: starts with verbose output switched on and with
// active_gpu_id_ set to -3.
// NOTE(review): -3 presumably means "no GPU selection has been attempted
// yet" (SelectGpuId() uses -1 and -2 for other states) -- confirm against
// Enabled() in the header.
CuDevice::CuDevice()
: active_gpu_id_(-3), verbose_(true)
{ }
// Destructor: shuts CUBLAS down if a GPU was in use; otherwise, if
// active_gpu_id_ is -2, warns that CUDA was not used because no GPU was
// detected.
CuDevice::~CuDevice() {
if (Enabled()) {
cuSafeCall(cublasShutdown());
} else if (active_gpu_id_ == -2) {
KALDI_WARN << "CUDA was NOT used! No CUDA GPU detected!";
}
}
/**
* SelectGpuId(gpu_id)
* SelectGpuId(use_gpu)
*
* The argument 'gpu_id' meaning: 0..N selects a GPU,
* -1 disables CUDA, -2 performs GPU auto-detection.
* There are 3 'use_gpu' modes for GPU selection:
* "yes" -- Select GPU automatically (or get one by exclusive mode)
* and die if this fails.
* "optional" -- Do as above, but if it fails, back off to CPU.
* "no" -- Run on CPU.
*
* If there is no GPU in the system, and we have GPU auto-detection,
* or GPU is manually disabled the computation will run on CPU.
* In other cases it is an error (manual selection).
* In case of Compute exclusive mode, the GPU is selected by OS.
*
* In case of Compute exclusive mode, the GPU is selected by OS,
* this has priority over manual/auto selection of GPU.
* Otherwise GPU selection is based on largest proportion of free memory.
* This can eventually lead to multiple processes computing on single GPU,
* which is slow. More practical is to use "compute exclusive mode".
*
* Since the autoselection of GPU is not perfect, it may still
* happen that two processes compute on single GPU, which is slow.
* The users are advised to use manual selection or exclusive mode.
*
* This method must be called at the very beginning of the program
* (before the cudamatrix objects allocate memory for the data),
* or not at all (when we intentionally want to run on the CPU).
* This method is to be called at the very beginning of the program
* (before first allocation in cudamatrix), or not at all (default to CPU).
*
*/
void CuDevice::SelectGpuId(int32 gpu_id) {
void CuDevice::SelectGpuId(std::string use_gpu) {
// Possible modes
if (use_gpu != "yes" && use_gpu != "no" && use_gpu != "optional") {
KALDI_ERR << "Please choose : --use-gpu=yes|no|optional, passed '" << use_gpu << "'";
}
// Make sure this function is not called twice!
if(Enabled()) {
if (Enabled()) {
KALDI_ERR << "There is already an active GPU " << active_gpu_id_
<< ", cannot change it on the fly!";
}
// Allow the GPU to stay disabled
if(!Enabled() && gpu_id == -1) {
KALDI_LOG << "Selected device: " << gpu_id
<< ", we don't even try to get a GPU. We run on CPU.";
active_gpu_id_ = -1;
if(!Enabled() && use_gpu == "no") {
KALDI_LOG << "Manually selected to compute on CPU.";
return;
}
// Check that we have a gpu available
int32 n_gpu = 0;
cudaGetDeviceCount(&n_gpu);
if(n_gpu == 0 && gpu_id == -2) {
// If we do automatic selection and no GPU is found, we run on a CPU
KALDI_WARN << "CUDA will NOT be used!!! No CUDA capable GPU detected...";
active_gpu_id_ = -2;
return;
}
// In other cases it is an error, no GPU is an error
if(n_gpu == 0) {
KALDI_ERR << "No CUDA capable GPU detected, while explicitly asked for gpu-id '"
<< gpu_id << "'.";
}
//Now we know that there is a GPU in the system,
//and we don't want to have it disabled.
//
//For the GPU selection there are 3 possibilities,
//with priorities according to the order:
//
//1.) We have compute exclusive mode on (GPU is selected by OS)
//2.) User did not specify the GPU-id (default value -2),
// we will do automatic selection.
//3.) User specified the GPU to run on, so we select it.
if(IsComputeExclusive()) {
//we have the GPU context now...
;
} else if(gpu_id == -2) {
SelectGpuIdAuto();
} else {
//try to select the desired GPU
int32 ret = cudaSetDevice(gpu_id);
//handle the possible errors (no recovery!!!)
switch(ret) {
case cudaSuccess : {
//create the GPU context
cudaError_t e;
e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
if(e != cudaSuccess) {
KALDI_ERR << "Failed to create CUDA context on a GPU.";
}
//this was okay, so we are done!
KALDI_LOG << "Selected device: " << gpu_id << " (manually)";
break;
}
case cudaErrorInvalidDevice : {
int32 n_gpu = 0;
cudaGetDeviceCount(&n_gpu);
KALDI_ERR << "cudaSetDevice(" << gpu_id << "):"
<< " '" << gpu_id << "' is not a VALID CUDA device! "
<< " (system has " << n_gpu << " GPUs,"
<< " valid IDs 0.." << n_gpu-1 << ")";
break;
}
default :
KALDI_ERR << "cudaSetDevice(" << gpu_id << "): "
<< "returned " << ret << ", "
<< cudaGetErrorString((cudaError_t)ret);
if (use_gpu == "yes") {
KALDI_ERR << "No CUDA GPU detected!";
}
if (use_gpu == "optional") {
KALDI_WARN << "Running on CPU!!! No CUDA GPU detected...";
return;
}
}
// Now the we should have active GPU,
// so we can query its name and memory stats
// and notify user which GPU is finally used.
//
// Create a CUDA context : in case of compute-exclusive mode OS selects gpu_id,
// or default gpu_id=0. In the case with no free GPUs a context cannot be created
// (compute-exclusive mode).
//
cudaError_t e;
e = cudaThreadSynchronize(); //<< CUDA context gets created here.
if (e != cudaSuccess) {
// We don't have a context yet; sleep a bit and retry.
int32 sec_sleep = 2;
KALDI_WARN << "Will try again to get a GPU after " << sec_sleep
<< " seconds.";
sleep(sec_sleep);
//
e = cudaThreadSynchronize(); //<< 2nd trial to get CUDA context.
if (e != cudaSuccess) {
if (use_gpu == "yes") {
KALDI_ERR << "Failed to create CUDA context, no more unused GPUs?";
}
if (use_gpu == "optional") {
KALDI_WARN << "Running on CPU!!! No more unused CUDA GPUs?";
return;
}
}
}
// Re-assure we have the context
KALDI_ASSERT(cudaSuccess == cudaThreadSynchronize());
// Check if the machine use compute exclusive mode
if (IsComputeExclusive()) {
FinalizeActiveGpu();
return;
} else {
// Or suggest to use compute exclusive mode
if(n_gpu > 1) {
KALDI_WARN << "Hint: It is practical to set the GPUs into ``compute exclusive mode''."
<< " Selection of free GPUs would be done by OS automatically.";
}
// And select the GPU according to proportion of free memory
if(SelectGpuIdAuto()) {
FinalizeActiveGpu();
return;
} else {
// Could not get GPU, after previously having the CUDA context?
// Strange but not impossible...
if (use_gpu == "yes") {
KALDI_ERR << "Error acquiring GPU.";
}
if (use_gpu == "optional") {
KALDI_WARN << "Running on CPU!!! Error acquiring GPU.";
return;
}
}
}
}
void CuDevice::FinalizeActiveGpu() {
// The device at this point should have active GPU, so we can query its name
// and memory stats and notify user which GPU is finally used.
// Get the device-id of active device:
{
int32 act_gpu_id;
@ -164,44 +163,38 @@ void CuDevice::SelectGpuId(int32 gpu_id) {
// Remember the id of active GPU
active_gpu_id_ = act_gpu_id; //CuDevice::Enabled() is true from now on
// Initialize the CUBLAS
cuSafeCall(cublasInit());
CU_SAFE_CALL(cublasInit());
// Notify user which GPU is finally used
char name[128];
DeviceGetName(name,128,act_gpu_id);
KALDI_LOG << "The active GPU is [" << act_gpu_id << "]: "
<< name << "\t" << GetFreeMemory(NULL, NULL);
}
CU_SAFE_CALL(cudaGetDeviceProperties(&properties_, act_gpu_id));
KALDI_LOG << "The active GPU is [" << act_gpu_id << "]: " << name << "\t"
<< GetFreeMemory(&free_memory_at_startup_, NULL) << " version "
<< properties_.major << "." << properties_.minor;
if (verbose_) PrintMemoryUsage();
}
return;
}
// Reports whether double-precision arithmetic can be used.  Returns true when
// no GPU is enabled; otherwise the answer depends on the device version stored
// in properties_.
bool CuDevice::DoublePrecisionSupported() {
  if (Enabled()) {
    // Double precision is supported from device version 1.3 onwards.
    const bool major_above_one = properties_.major > 1;
    const bool one_point_three_or_later =
        (properties_.major == 1 && properties_.minor >= 3);
    return major_above_one || one_point_three_or_later;
  }
  return true;
}
bool CuDevice::IsComputeExclusive() {
// check that we have a gpu
int32 n_gpu = 0;
cudaGetDeviceCount(&n_gpu);
if(n_gpu == 0) {
KALDI_LOG << "No CUDA devices found";
return false;
}
// Create a GPU context
// This will be kept if we detect compute exclusive mode
// or released in the other case.
//
// It does not harm if the function gets called twice,
// and the context is already created.
cudaError_t e;
e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
if(e != cudaSuccess) {
KALDI_ERR << "Failed to create CUDA context on a GPU. No more unused GPUs in compute exclusive mode?";
}
// assume we already have an CUDA context created
KALDI_ASSERT(cudaSuccess == cudaThreadSynchronize());
// get the device-id and its device-properties
int32 gpu_id = -1;
e = cudaGetDevice(&gpu_id);
cudaError_t e = cudaGetDevice(&gpu_id);
if(e != cudaSuccess) {
KALDI_ERR << "Failed to get current device";
}
@ -216,12 +209,12 @@ bool CuDevice::IsComputeExclusive() {
KALDI_LOG << "CUDA setup operating under Compute Exclusive Mode.";
return true;
break;
#if (CUDA_VERSION >= 4000)
#if (CUDA_VERSION >= 4000)
case cudaComputeModeExclusiveProcess :
KALDI_LOG << "CUDA setup operating under Compute Exclusive Process Mode.";
return true;
break;
#endif
#endif
default :
// The computation mode is not compute-exclusive,
// in this case we release the GPU context...
@ -234,21 +227,20 @@ bool CuDevice::IsComputeExclusive() {
}
void CuDevice::SelectGpuIdAuto() {
// check that we have at least one gpu
bool CuDevice::SelectGpuIdAuto() {
// Check that we have at least one gpu
int32 n_gpu = 0;
cudaGetDeviceCount(&n_gpu);
if(n_gpu == 0) {
KALDI_ERR << "No CUDA devices found";
return;
KALDI_WARN << "No CUDA devices found";
return false;
}
// The GPU is selected according to maximal free memory ratio
std::vector<float> free_mem_ratio(n_gpu+1, 0.0);
//get ratios of memory use, if possible
// Get ratios of memory use, if possible
KALDI_LOG << "Selecting from " << n_gpu << " GPUs";
for(int32 n=0; n<n_gpu; n++) {
for(int32 n = 0; n < n_gpu; n++) {
int32 ret = cudaSetDevice(n);
switch(ret) {
case cudaSuccess : {
@ -292,23 +284,22 @@ void CuDevice::SelectGpuIdAuto() {
if(free_mem_ratio[n] > free_mem_ratio[max_id]) max_id=n;
}
//the free_mem_ratio should be bigger than zero
if(!free_mem_ratio[max_id] > 0.0) {
KALDI_ERR << "No device could be selected (this should never happen)";
}
KALDI_ASSERT(free_mem_ratio[max_id] > 0.0);
//finally select the GPU
KALDI_LOG << "Selected device: " << max_id << " (automatically)";
cuSafeCall(cudaSetDevice(max_id));
CU_SAFE_CALL(cudaSetDevice(max_id));
//create the context
cudaError_t e;
e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
if(e != cudaSuccess) {
KALDI_ERR << "Failed to create CUDA context on a GPU.";
KALDI_WARN << "Failed to create CUDA context on a GPU.";
return false;
}
return true;
}
void CuDevice::AccuProfile(const std::string &key, double time) {
if (profile_map_.find(key) == profile_map_.end()) {
profile_map_[key] = 0.0;
@ -316,23 +307,35 @@ void CuDevice::AccuProfile(const std::string &key, double time) {
profile_map_[key] += time;
}
void CuDevice::PrintMemoryUsage() const {
if (Enabled()) {
int64 free_memory_now;
GetFreeMemory(&free_memory_now, NULL);
KALDI_LOG << "Memory used: " << (free_memory_at_startup_ - free_memory_now) << " bytes.";
}
}
// Logs the accumulated profiling information followed by the memory usage.
// Only active when verbose_ is set and a GPU is enabled.
//
// At most the 15 entries with the largest accumulated time are reported,
// in ascending order of time.  (Previously every entry was also dumped
// unsorted before the sorted listing, so each key showed up twice and the
// 15-entry cap had no effect on the size of the log message.)
void CuDevice::PrintProfile() {
  if (!(verbose_ && Enabled()))
    return;

  std::ostringstream os;
  os << "-----\n[cudevice profile]\n";

  // Re-key the entries by time so that std::sort orders them by duration.
  std::vector<std::pair<double, std::string> > pairs;
  pairs.reserve(profile_map_.size());
  std::map<std::string, double>::iterator it;
  for (it = profile_map_.begin(); it != profile_map_.end(); ++it)
    pairs.push_back(std::make_pair(it->second, it->first));
  std::sort(pairs.begin(), pairs.end());

  // Keep only the tail of the sorted list, i.e. the most expensive entries.
  const size_t max_print = 15;
  const size_t start_pos = (pairs.size() <= max_print ?
                            0 : pairs.size() - max_print);
  for (size_t i = start_pos; i < pairs.size(); i++)
    os << pairs[i].second << "\t" << pairs[i].first << "s\n";

  os << "-----";
  KALDI_LOG << os.str();
  PrintMemoryUsage();
}
std::string CuDevice::GetFreeMemory(int64* free, int64* total) {
std::string CuDevice::GetFreeMemory(int64* free, int64* total) const {
// WARNING! the CUDA API is inconsistent across versions!
#if (CUDA_VERSION >= 3020)
//define the function signature type
@ -406,14 +409,354 @@ void CuDevice::DeviceGetName(char* name, int32 len, int32 dev) {
}
////////////////////////////////////////////////
// The instance of the static singleton
//
CuDevice CuDevice::msDevice;
//
////////////////////////////////////////////////
struct CuAllocatorOptions {
int32 count; // Number of times we free and delete a particular size before we
// start to cache it.
int32 cleanup_interval_bytes;
CuAllocatorOptions(): count(1), cleanup_interval_bytes(1000000) { }
};
/// We define class CuAllocator inside the .cc file, because we don't want to
/// expose it in the header.  Its purpose is to hang on to memory that we have
/// freed, so that we don't waste time in cudaMalloc and cudaMallocPitch().
/// For some reason, they are sometimes very slow.
///
/// The object owns heap-allocated bookkeeping structures through raw
/// pointers, so copying it is disallowed.
class CuAllocator {
 public:
  CuAllocator(const CuAllocatorOptions &opts, CuDevice *device):
      device_(device), opts_(opts),
      cleanup_countdown_bytes_(opts.cleanup_interval_bytes) { }

  /// Linear allocation of "size" bytes (replacement for cudaMalloc).
  inline void *Malloc(size_t size);

  /// 2-d allocation (replacement for cudaMallocPitch); the pitch is written
  /// to *pitch.
  inline void *MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch);

  /// Gives back memory obtained from Malloc() or MallocPitch(); the block is
  /// either really freed or kept in a cache for later re-use.
  inline void Free(void *ptr);

  ~CuAllocator();
 private:
  CuAllocator(const CuAllocator&);  // Disallow (declared, never defined).
  CuAllocator &operator=(const CuAllocator&);  // Disallow.

  inline void *MallocInternal(size_t row_bytes, size_t num_rows, size_t *pitch);

  // struct MemInfoForSize stores information associated with a particular size
  // of allocated memory.  The row_bytes and num_rows refer to the arguments of
  // a cudaMallocPitch call; for regular, non-pitch allocations with cudaMalloc,
  // we make "row_bytes" zero and the size in bytes is "num_rows"... there is a
  // reason why we do it this way round (make num_rows contain the size in
  // bytes); it relates to the ordering of the map, and the behavior when
  // we didn't find the exact size and want to find larger match.
  struct MemInfoForSize {
    size_t row_bytes;  // Bytes per row for cudaMallocPitch; zero for a
                       // regular cudaMalloc.
    size_t num_rows;   // Number of rows for cudaMallocPitch; for a regular
                       // cudaMalloc this is the size in bytes.
    size_t pitch;      // If cudaMallocPitch, the pitch it returned; this
                       // code assumes (and checks) that it's a deterministic
                       // function of row_bytes and num_rows.  Zero until the
                       // first pitched allocation of this size.
    size_t countdown;  // Number of blocks of this size that will still be
                       // really freed before we start caching them.
    size_t currently_used;  // Number that are "in the wild".. kept for
                            // diagnostics and error detection.
    std::vector<void*> freed;  // Freed and cached...

    MemInfoForSize(size_t row_bytes,
                   size_t num_rows,
                   int32 count):
        row_bytes(row_bytes),
        num_rows(num_rows),
        pitch(0),
        countdown(count),
        currently_used(0) { }
  };

  // FindMemInfo returns the MemInfoForSize object for this (row_bytes,
  // num_rows) combination if it exists; otherwise...
  // if there is a MemInfoForSize object with the same row_bytes and larger (but
  // not more than twice larger) num_rows that has freed memory waiting, it
  // returns that; otherwise, it creates, registers and returns a new
  // MemInfoForSize object for the requested size.
  // num_rows must be nonzero.
  inline MemInfoForSize *FindMemInfo(size_t row_bytes,
                                     size_t num_rows) {
    if (row_bytes >= size_to_list_.size())
      size_to_list_.resize(row_bytes + 1, NULL);

    // note: we set row_bytes to 0 for regular, linear allocation.
    KALDI_ASSERT(num_rows != 0);

    if (size_to_list_[row_bytes] == NULL)
      size_to_list_[row_bytes] = new std::map<size_t, MemInfoForSize*>;

    std::map<size_t, MemInfoForSize*> &size_to_list = *(size_to_list_[row_bytes]);

    typedef std::map<size_t, MemInfoForSize* >::iterator IterType;

    // Get an iterator to the requested object or the next-larger one, i.e. the
    // first entry whose key is >= num_rows.
    IterType iter = size_to_list.lower_bound(num_rows);

    if (iter != size_to_list.end() && iter->first == num_rows) {
      // Found a MemInfoForSize object
      // with the requested size -> return it.
      KALDI_ASSERT(iter->second->row_bytes == row_bytes &&
                   iter->second->num_rows == num_rows);
      return iter->second;
    } else if (iter != size_to_list.end() &&
               iter->second->num_rows <= 2 * num_rows &&
               !iter->second->freed.empty()) {
      // Return the non-matching one with freed memory, which is larger than
      // this one but not more than twice larger.
      KALDI_ASSERT(iter->second->row_bytes == row_bytes &&
                   iter->second->num_rows > num_rows);  // confirm expectations.
      return iter->second;
    } else {
      // There was no such object, and the next-larger object either did not
      // exist, had more than twice the num-rows requested, or had no free
      // memory -> create an object with the requested size.
      return (size_to_list[num_rows] = new MemInfoForSize(row_bytes, num_rows,
                                                          opts_.count));
    }
  }

  // Counts down num_bytes against the cleanup interval and calls Cleanup()
  // when the interval has elapsed.
  void PossiblyCleanup(size_t num_bytes);

  // A periodic housekeeping task..
  void Cleanup();

  // Frees all memory in the "freed" vectors; memory that the
  // user freed but we held on to.  If destroy == true, also
  // clean up all memory held in the size_to_list_ object (i.e.
  // allocated maps and MemInfoForSize objects).
  void ReleaseAllCachedMemory(bool destroy = false);

  CuDevice *device_;  // device this is attached to...
  CuAllocatorOptions opts_;

  // Maps each address currently handed out to the user to its size record.
  unordered_map<void*, MemInfoForSize*> addr_to_list_;

  // size_to_list_ is indexed first by row_bytes (which is zero for linear
  // mallocs) and then by num_rows (which for linear mallocs, is the actual size
  // in bytes).
  std::vector<std::map<size_t, MemInfoForSize*>* > size_to_list_;

  int32 cleanup_countdown_bytes_;  // countdown in bytes, until the next time we check
                                   // whether we should do cleanup
};
// Linear allocation of "size" bytes.  Forwards to MallocInternal() using the
// convention row_bytes == 0, num_rows == size, and no pitch output.
void* CuAllocator::Malloc(size_t size) {
  KALDI_ASSERT(size > 0);
  const size_t linear_row_bytes = 0;  // marks a non-pitched request
  size_t *no_pitch = NULL;
  return MallocInternal(linear_row_bytes, size, no_pitch);
}
// 2-d allocation of num_rows rows of row_bytes bytes each; the resulting pitch
// is written to *pitch.  All three arguments must be nonzero / non-NULL.
//
// The parameter names now match the in-class declaration and the argument
// order of MallocInternal(row_bytes, num_rows, pitch).  The earlier definition
// named them the other way round; it forwarded positionally, so the values
// passed along are unchanged.
void* CuAllocator::MallocPitch(size_t row_bytes, size_t num_rows,
                               size_t *pitch) {
  KALDI_ASSERT(num_rows > 0 && row_bytes > 0 && pitch != NULL);
  return MallocInternal(row_bytes, num_rows, pitch);
}
// Shared implementation of Malloc() and MallocPitch().
//
//  row_bytes  bytes per row for a pitched request; 0 for a linear request.
//  num_rows   number of rows for a pitched request; for a linear request this
//             is the size in bytes.  Must be nonzero.
//  pitch_out  receives the pitch for a pitched request; must be NULL exactly
//             when row_bytes == 0.
//
// Returns a device pointer, taken from the cache of previously freed blocks
// when possible.  If the CUDA allocation fails, all cached memory is released
// and the allocation is retried once; if the retry also fails, memory usage is
// printed and KALDI_ERR is raised.
void* CuAllocator::MallocInternal(size_t row_bytes,
                                  size_t num_rows,
                                  size_t *pitch_out) {
  // we share the code for standard cudaMalloc and cudaMallocPitch
  // because most of it is the same.  for cudaMalloc, we'll have
  // row_bytes == 0, and num_rows is just the size to be allocated.
  KALDI_ASSERT(num_rows != 0 && (row_bytes != 0) == (pitch_out != NULL));

  MemInfoForSize *info = FindMemInfo(row_bytes, num_rows);
  if (!info->freed.empty()) {  // We can satisfy the request with cached,
                               // previously-allocated memory.
    void *ans = info->freed.back();
    info->freed.pop_back();
    info->currently_used++;
    addr_to_list_[ans] = info;
    if (pitch_out) *pitch_out = info->pitch;
    return ans;
  }

  PossiblyCleanup(row_bytes == 0 ? num_rows : row_bytes * num_rows);
  void *ans = NULL;
  if (row_bytes == 0) {  // Simple malloc request, not "MallocPitch".
    size_t size = num_rows;
    int32 ret = cudaMalloc(&ans, size);
    if (ret != 0) {
      KALDI_WARN << "Allocation of memory block of " << size << " bytes "
                 << "failed, releasing cached memory and retrying.";
      ReleaseAllCachedMemory();
      ret = cudaMalloc(&ans, size);
      // The braces matter: only a *second* failure is fatal.  Without them
      // the error below was raised even when the retry succeeded.
      if (ret != 0) {
        KALDI_WARN << "Allocation failed for the second time.    Printing "
                   << "device memory usage and exiting";
        device_->PrintMemoryUsage();
        KALDI_ERR << "Memory allocation failure";
      }
    }
  } else {
    size_t pitch;
    int32 ret = cudaMallocPitch(&ans, &pitch, row_bytes, num_rows);
    if (ret != 0) {  // allocation failed...
      KALDI_WARN << "Allocation of " << num_rows << " rows, each of size "
                 << row_bytes << " bytes failed,  releasing cached "
                 << "memory and retrying.";
      ReleaseAllCachedMemory();
      ret = cudaMallocPitch(&ans, &pitch, row_bytes, num_rows);
      if (ret != 0) {
        KALDI_WARN << "Allocation failed for the second time.    Printing "
                   << "device memory usage and exiting";
        device_->PrintMemoryUsage();
        KALDI_ERR << "Memory allocation failure";
      }
    }
    KALDI_ASSERT(pitch > 0);
    if (info->pitch == 0) {  // First allocation; have not set info->pitch yet.
      info->pitch = pitch;
    } else if (pitch != info->pitch) {
      KALDI_ERR << "Pitch differs between multiple calls with the same "
                << "parameters: " << pitch << " vs. " << info->pitch;
    }
    *pitch_out = info->pitch;
  }
  addr_to_list_[ans] = info;
  info->currently_used++;
  return ans;
}
void CuAllocator::Free(void *addr) {
unordered_map<void*, MemInfoForSize*>::iterator iter
= addr_to_list_.find(addr);
if (iter == addr_to_list_.end()) {
KALDI_ERR << "Attempt to free address " << addr << " that was not allocated "
<< "by CuDevice::Malloc() (or was previously freed);";
}
MemInfoForSize *info = iter->second;
addr_to_list_.erase(addr); // Erase this element in the addr_to_list_ map.
info->currently_used--;
if (info->countdown == 0) { // We have freed [i.e. actually freed with
// CudaFree()] enough of these that we think
// we're wasting too much time this way and
// need to start caching them.
info->freed.push_back(addr);
} else { // Actually free the address, and decrease "countdown".
info->countdown--;
CU_SAFE_CALL(cudaFree(addr)); // This is how we free, even if allocated with
// cudaMallocPitch().
}
}
void CuAllocator::ReleaseAllCachedMemory(bool destroy) {
KALDI_VLOG(2) << "Releasing all cached memory.";
for (size_t i = 0; i < size_to_list_.size(); i++) {
if (size_to_list_[i] == NULL)
continue;
typedef std::map<size_t, MemInfoForSize*>::iterator IterType;
for (IterType iter = size_to_list_[i]->begin();
iter != size_to_list_[i]->end(); ++iter) {
MemInfoForSize *info = iter->second;
if (destroy && !info->freed.empty()) {
// When called from the destructor at program end, if verbose level is
// high, say the sizes we had.
if (info->row_bytes == 0) {
KALDI_VLOG(3) << "Releasing " << info->freed.size() << " blocks of "
<< info->num_rows << " bytes.";
} else {
KALDI_VLOG(3) << "Releasing " << info->freed.size()
<< " 2-d blocks of " << info->num_rows << " rows of "
<< info->row_bytes << " bytes each.";
}
}
if (!destroy) {
// We only do this freeing part when we're *not* called from the
// destuctor (destroy = false). This leads to a crash when called from
// the destructor, with cudaFree returning "unload of CUDA runtime
// failed". Presumably this has to do with the destruction order of
// C++, which we can't really control.
while (!info->freed.empty()) {
CU_SAFE_CALL(cudaFree(info->freed.back()));
info->freed.pop_back();
}
}
if (destroy)
delete info;
}
if (destroy) {
delete size_to_list_[i];
size_to_list_[i] = NULL;
}
}
}
// Periodic housekeeping hook, invoked from PossiblyCleanup().
// Currently a deliberate no-op.
void CuAllocator::Cleanup() {
// TODO: implement this or remove it (and also PossiblyCleanup).
// Actually we may never implement this, as just calling
// ReleaseAllCachedMemory whenever an allocation fails is probably
// sufficient.
}
// Accounts for an allocation of num_bytes against the cleanup countdown.
// If the countdown would reach zero (or below), Cleanup() is run and the
// countdown is reset to opts_.cleanup_interval_bytes; otherwise the countdown
// is simply reduced by num_bytes.
void CuAllocator::PossiblyCleanup(size_t num_bytes) {
  const size_t bytes_remaining = static_cast<size_t>(cleanup_countdown_bytes_);
  if (num_bytes < bytes_remaining) {
    // Interval not yet used up; num_bytes is below the int32 countdown here.
    cleanup_countdown_bytes_ -= static_cast<int32>(num_bytes);
    return;
  }
  Cleanup();
  cleanup_countdown_bytes_ = opts_.cleanup_interval_bytes;
}
// Warns about every block size that still has memory handed out to the user,
// then destroys the allocator's bookkeeping structures.
CuAllocator::~CuAllocator() {
  // Check that nothing was allocated by the user and not freed.
  // Several outstanding addresses may share one size record, so collect the
  // distinct records first in order to warn once per size.
  std::set<MemInfoForSize*> unfreed_set;
  typedef unordered_map<void*, MemInfoForSize *>::iterator IterType;
  for (IterType iter = addr_to_list_.begin(); iter != addr_to_list_.end();
       ++iter)
    unfreed_set.insert(iter->second);

  for (std::set<MemInfoForSize*>::iterator iter = unfreed_set.begin();
       iter != unfreed_set.end(); ++iter) {
    MemInfoForSize *info = *iter;
    KALDI_ASSERT(info->currently_used > 0);  // Or should not be in this set
                                             // (code error or memory corruption)
    // Linear allocations are stored with row_bytes == 0 and the size in bytes
    // in num_rows (num_rows itself is never zero), so that is the field to
    // test and the field to report as the size.
    if (info->row_bytes == 0) {
      KALDI_WARN << info->currently_used << " memory chunks of size "
                 << info->num_rows << " were allocated and not freed.";
    } else {
      KALDI_WARN << info->currently_used << " memory chunks of size "
                 << info->row_bytes << " per row, and " << info->num_rows
                 << " rows, were allocated and not freed.";
    }
  }

  bool destroy = true;
  ReleaseAllCachedMemory(destroy);
}
// Replacement for cudaFree(): forwards the pointer to the caching allocator.
void CuDevice::Free(void *ptr) { allocator_->Free(ptr); }
// Replacement for cudaMallocPitch(): forwards to the caching allocator, which
// writes the pitch to *pitch and returns the allocated pointer.
void* CuDevice::MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) {
return allocator_->MallocPitch(row_bytes, num_rows, pitch);
}
// Replacement for cudaMalloc(): forwards a linear allocation of "size" bytes
// to the caching allocator and returns the allocated pointer.
void* CuDevice::Malloc(size_t size) {
return allocator_->Malloc(size);
}
// Constructs the device object with no GPU selected (active_gpu_id_ == -1),
// verbose output switched on, and a freshly created caching allocator that is
// attached to this device and uses the default CuAllocatorOptions.
CuDevice::CuDevice(): active_gpu_id_(-1), verbose_(true),
allocator_(new CuAllocator(CuAllocatorOptions(), this))
{ }
// Destroys the caching allocator first and then, if a GPU was in use, shuts
// down CUBLAS.
CuDevice::~CuDevice() {
  // Deleting a NULL pointer is a no-op, so no explicit check is required.
  delete allocator_;
  if (Enabled()) {
    CU_SAFE_CALL(cublasShutdown());
  }
}
// The instance of the static singleton
CuDevice CuDevice::global_device_;
}

Просмотреть файл

@ -22,75 +22,105 @@
#ifndef KALDI_CUDAMATRIX_CU_DEVICE_H_
#define KALDI_CUDAMATRIX_CU_DEVICE_H_
#if HAVE_CUDA==1
#if HAVE_CUDA == 1
#include <map>
#include <string>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime_api.h>
namespace kaldi {
class CuAllocator; // Forward declaration.
/**
* Singleton object which represents CUDA device
* responsible for CUBLAS initialisation, collects profiling info
*/
class CuDevice {
// Singleton interface...
private:
CuDevice();
CuDevice(CuDevice&);
CuDevice &operator=(CuDevice&);
// Singleton object (there should only be one instantiated per program)
public:
~CuDevice();
static CuDevice& Instantiate() {
return msDevice;
}
static inline CuDevice& Instantiate() { return global_device_; }
private:
static CuDevice msDevice;
// We provide functions Malloc, MallocPitch and Free which replace cudaMalloc,
// cudaMallocPitch and cudaFree. Their function is to cache the results of
// previous allocations to avoid the very large overhead that CUDA's
// allocation seems to give for some setups.
void* Malloc(size_t size);
void* MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch);
void Free(void *ptr);
/// Select a GPU for computation, the 'use_gpu' modes are:
/// "yes" -- Select GPU automatically and die if this fails.
/// "optional" -- Do as above, but if it fails, back off to CPU.
/// "no" -- Run on CPU.
/// (more comments in cu-device.cc)
void SelectGpuId(std::string use_gpu);
/**********************************/
// Instance interface
public:
/// Check if the CUDA device is selected for use
bool Enabled() {
/// Check if the CUDA GPU is selected for use
bool Enabled() const {
return (active_gpu_id_ > -1);
}
/// Manually select GPU by id (more comments in cu-device.cc)
void SelectGpuId(int32 gpu_id);
/// Get the active GPU id
int32 ActiveGpuId() {
return active_gpu_id_;
}
void Verbose(bool verbose) {
verbose_ = verbose;
}
/// Returns true if either we have no GPU, or we have a GPU
/// and it supports double precision.
bool DoublePrecisionSupported();
void SetVerbose(bool verbose) { verbose_ = verbose; }
/// Sum the IO time
void AccuProfile(const std::string &key, double time);
void PrintProfile();
void PrintMemoryUsage() const;
void ResetProfile() {
profile_map_.clear();
}
/// Get the actual GPU memory use stats
std::string GetFreeMemory(int64* free = NULL, int64* total = NULL);
std::string GetFreeMemory(int64* free = NULL, int64* total = NULL) const;
/// Get the name of the GPU
void DeviceGetName(char* name, int32 len, int32 dev);
private:
/// Check if the GPU run in compute exclusive mode
bool IsComputeExclusive();
/// Automatically select GPU
void SelectGpuIdAuto();
CuDevice();
CuDevice(CuDevice&); // Disallow.
CuDevice &operator=(CuDevice&); // Disallow.
static CuDevice global_device_;
/// Check if the GPU run in compute exclusive mode Returns true if it is
/// running in compute exclusive mode and we have a GPU. Returns false
/// otherwise. Sets error to true if there was some error, such as that we
/// were running in compute exclusive modes but no GPUs available; otherwise
/// sets it to false.
bool IsComputeExclusive();
/// Automatically select GPU and get CUDA context. Returns true on success.
bool SelectGpuIdAuto();
/// Try to get CUDA context on manually selected GPU. Return true on success.
bool SelectGpuIdManual(int32 gpu_id);
void FinalizeActiveGpu();
/// Should only be called if Enabled() == true.
int32 MajorDeviceVersion();
/// Should only be called if Enabled() == true.
int32 MinorDeviceVersion();
private:
std::map<std::string, double> profile_map_;
/// active_gpu_id_ values:
@ -99,14 +129,20 @@ class CuDevice {
/// -1 SelectGpuId was called, but the GPU was manually disabled
/// 0..N Normal GPU IDs
int32 active_gpu_id_;
///
int64 free_memory_at_startup_;
cudaDeviceProp properties_;
bool verbose_;
CuAllocator *allocator_;
}; // class CuDevice
}// namespace
} // namespace
#endif // HAVE_CUDA

Просмотреть файл

@ -1,6 +1,10 @@
// cudamatrix/cu-kernels-ansi.h
// Copyright 2009-2012 Karel Vesely
// 2013 Johns Hopkins University (author: Daniel Povey)
// 2013 Hainan Xu
// 2013 Xiaohui Zhang
// 2013 Johns Hopkins University (author: Guoguo Chen)
// See ../../COPYING for clarification regarding multiple authors
//
@ -25,8 +29,7 @@
#include "cudamatrix/cu-matrixdim.h"
#if HAVE_CUDA==1
#if HAVE_CUDA == 1
extern "C" {
/*********************************************************
@ -43,13 +46,39 @@ void cudaI32_set_const(dim3 Gr, dim3 Bl, int32_cuda *mat, int32_cuda value, Matr
/*
* CuMatrix
*/
void cudaF_copy_upp_low(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA);
void cudaF_copy_low_upp(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA);
void cudaF_add_diag_vec_mat(dim3 Gr, dim3 Bl, float alpha, float *mat, MatrixDim mat_dim,
const float *vec, const float *mat2, int mat2_row_stride,
int mat2_col_stride, float beta);
void cudaF_copy_from_tp_trans(int Gr, int Bl, float* A, const float* B, MatrixDim dmat);
void cudaFD_copy_from_tp_trans(int Gr, int Bl, float* A, const double* B, MatrixDim dmat);
void cudaF_copy_from_tp(int Gr, int Bl, float* A, const float* B, MatrixDim dmat);
void cudaFD_copy_from_tp(int Gr, int Bl, float* A, const double* B, MatrixDim dmat);
void cudaF_copy_col_from_vec(int Gr, int Bl, float* mat, const float* v, int col, MatrixDim d);
void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d);
void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim d);
void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaF_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaF_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim d);
void cudaF_set_diag(int Gr, int Bl, float* mat, float value, MatrixDim d);
void cudaF_set_diag_packed(int Gr, int Bl, float* mat, float value, int dim);
void cudaF_add_diag_packed(int Gr, int Bl, float* mat, float value, int dim);
void cudaF_set_const(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d);
void cudaF_set_zero_above_diag(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
void cudaF_add(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d);
void cudaF_add_vec2(dim3 Gr, dim3 Bl, float* mat, const float* vec, const float alpha, int dim);
void cudaF_scale_diag(int Gr, int Bl, float* mat, float value, int dim);
void cudaF_scale(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d);
void cudaF_apply_log(dim3 Gr, dim3 Bl, float *mat, MatrixDim d);
void cudaF_mul_elements(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim d);
void cudaF_mul_elements(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim dst_d, int src_stride);
void cudaF_max(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim dst_d, int src_stride);
void cudaF_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d);
void cudaF_mul_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d);
void cudaF_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size);
void cudaF_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size, float power);
void cudaF_div_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *vec_div, MatrixDim d);
void cudaF_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *A, float beta, float *dst, MatrixDim d);
void cudaF_add_vec_to_cols(dim3 Gr, dim3 Bl, float alpha, const float *col, float beta, float *dst, MatrixDim d);
@ -58,29 +87,82 @@ void cudaF_add_vec_to_rows(dim3 Gr, dim3 Bl, float alpha, const float *row, floa
/*
* CuVector
*/
void cudaF_replace_value(int Gr, int Bl, float *v, int dim, float orig, float changed);
void cudaF_set_bias_params(int Gr, int Bl, float* v, const float* a, float param_1, float param_2, float param_3, int* flag, int dim);
void cudaF_copy_from_vec_df(int Gr, int Bl, double* v_out, const float* v_in, int dim);
void cudaF_copy_from_vec_fd(int Gr, int Bl, float* v_out, const float* v_in, int dim);
void cudaF_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim);
void cudaF_vec_soft_max(int Gr, int Bl, float* v, int dim);
void cudaF_vec_min(const float* v, float* value, int dim);
void cudaF_vec_max(const float* v, float* value, int dim);
void cudaF_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value);
void cudaF_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value);
void cudaF_add_diag_mat_trans(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim);
void cudaF_add_diag_mat(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim);
void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M,
int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride,
int N_col_stride, int threads_per_element, float beta);
void cudaF_add_vec_vec(int Gr, int Bl, float alpha, float* v, const float* x, const float* y, float beta, int dim);
void cudaF_copy_col_from_mat(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim);
void cudaF_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const float* mat, MatrixDim dmat, int dim);
void cudaF_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim);
void cudaF_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc);
void cudaF_pvec_sum(int Gr, int Bl, float* vec, float* pvec_sum, int dim, int size);
void cudaF_vec_copy_diag_from_packed(int Gr, int Bl, float *dst, const float *src, int dim);
void cudaF_vec_apply_floor(int Gr, int Bl, float* v, float floor_val, float* num, int dim);
void cudaF_vec_apply_exp(int Gr, int Bl, float* v, int dim);
void cudaF_vec_apply_log(int Gr, int Bl, float* v, float* flag, int dim);
void cudaF_trace(int Gr, int Bl, float* mat, float* value, int dim);
void cudaF_add_row_sum_mat(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d);
void cudaF_add_col_sum_mat(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d);
void cudaF_invert_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim d);
// Note: B_trans is nonzero if B is transposed.
void cudaF_add_mat_blockmat(dim3 Gr, dim3 Bl, float *data, MatrixDim d, const float *Adata,
int A_num_rows, int A_num_cols, int A_row_stride, int A_col_stride,
const CuBlockMatrixData *B_cu_data, int B_num_blocks,
float alpha, float beta, int B_trans);
void cudaF_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int num_blocks,
const float *C_data, int C_num_cols, int C_row_stride, int C_col_stride,
const float *D_data, int D_row_stride, int D_col_stride,
float alpha, float beta);
/*
* cu::
*/
void cudaF_softmax(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d);
// Declarations of single-precision entry points (cudaF_ prefix, float operands).
// Each name below has a cudaD_ counterpart with double operands declared later in this header.
// Every function takes Gr and Bl first (dim3, size_t or int, depending on the function);
// presumably these are the grid and block launch dimensions -- confirm against the .cu file.
// NOTE(review): cudaF_sigmoid, cudaF_diff_sigmoid and cudaF_tanh are each declared twice,
// with and without a trailing src_stride argument. This looks like the old and new lines
// of the change shown together; confirm which signature is current.
void cudaF_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d, int src_stride);
void cudaF_softmax_part(dim3 Gr, dim3 Bl, const float *X, const int32_cuda *vec_ids, float* Y, MatrixDim d);
void cudaF_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d);
void cudaF_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d);
void cudaF_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d);
void cudaF_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride);
void cudaF_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power);
void cudaF_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride);
void cudaF_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int src_stride);
void cudaF_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride);
void cudaF_diff_tanh(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d);
void cudaF_regularize_l1(dim3 Gr, dim3 Bl, float *wei, float *grad, float l1, float lr, MatrixDim d);
void cudaF_find_row_max_id(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d);
void cudaF_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, float *mat_net_out, float *vec_log_post, MatrixDim d);
void cudaF_copy_rows_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, const float *v_in);
void cudaF_randomize(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in);
void cudaF_splice(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in);
void cudaF_one(int Gr, int Bl, float* x, int dim);
void cudaF_copy(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in);
void cudaF_copy_from_sp(int Gr, int Bl, const float* x, float* y, int d_in, MatrixDim d_out);
void cudaF_take_lower(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in);
void cudaF_take_upper(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in);
void cudaF_take_mean(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in);
void cudaF_comp_obj_deriv(dim3 Gr,dim3 Bl, MatrixElement<float>* x, int s, const float* z, MatrixDim d, float* z2, MatrixDim d2, float* t);
void cudaF_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
// T and its dimensions are inputs (const); S is the non-const output buffer.
void cudaF_sy_add_tr2(dim3 Gr, dim3 Bl, float alpha, float beta, const float* T, MatrixDim tdim,
float *S, MatrixDim sdim);
// data/dim describe the writable destination; src_data/src_dim and indices are read-only.
void cudaF_sum_column_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim,
const float *src_data, MatrixDim src_dim,
const Int32Pair *indices);
// Reads from data using indices_size Int32Pair entries and writes into output
// (presumably one value per index pair -- confirm against the kernel).
void cudaF_matrix_lookup(dim3 Gr, dim3 Bl, const float *data, MatrixDim dim,
const Int32Pair *indices, int indices_size,
float *output);
/*********************************************************
* double CUDA kernel calls
*/
@ -88,13 +170,39 @@ void cudaF_copy(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *co
/*
* CuMatrix
*/
// Double-precision (cudaD_ prefix) declarations listed under the CuMatrix heading.
// The cudaDF_ variants write into a double matrix A from a float source B.
// NOTE(review): cudaD_mul_elements is declared twice (with and without src_stride);
// this looks like old and new diff lines shown together -- confirm which is current.
// NOTE(review): cudaD_copy_upp_low names its MatrixDim parameter dimB although the
// matrix argument is A; cudaD_copy_low_upp uses dimA.
void cudaD_copy_upp_low(dim3 Gr, dim3 Bl, double* A, MatrixDim dimB);
void cudaD_copy_low_upp(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA);
void cudaD_add_diag_vec_mat(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim,
const double *vec, const double *mat2, int mat2_row_stride,
int mat2_col_stride, double beta);
void cudaD_copy_from_tp_trans(int Gr, int Bl, double* A, const double* B, MatrixDim dmat);
void cudaDF_copy_from_tp_trans(int Gr, int Bl, double* A, const float* B, MatrixDim dmat);
void cudaD_copy_from_tp(int Gr, int Bl, double* A, const double* B, MatrixDim dmat);
void cudaDF_copy_from_tp(int Gr, int Bl, double* A, const float* B, MatrixDim dmat);
void cudaD_copy_col_from_vec(int Gr, int Bl, double* mat, const double* v, int col, MatrixDim d);
void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d);
void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim d);
void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaD_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaD_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val, MatrixDim d);
void cudaD_set_diag(int Gr, int Bl, double* mat, double value, MatrixDim d);
void cudaD_set_diag_packed(int Gr, int Bl, double* mat, double value, int dim);
void cudaD_add_diag_packed(int Gr, int Bl, double* mat, double value, int dim);
void cudaD_set_const(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d);
void cudaD_set_zero_above_diag(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
void cudaD_add(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d);
void cudaD_add_vec2(dim3 Gr, dim3 Bl, double *mat, const double *vec, const double alpha, int dim);
void cudaD_scale_diag(int Gr, int Bl, double* mat, double value, int dim);
void cudaD_scale(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d);
void cudaD_apply_log(dim3 Gr, dim3 Bl, double *mat, MatrixDim d);
void cudaD_mul_elements(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim d);
void cudaD_mul_elements(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride);
void cudaD_max(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride);
void cudaD_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d);
void cudaD_mul_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d);
void cudaD_mul_rows_group_mat(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size);
void cudaD_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size, double power);
void cudaD_div_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *vec_div, MatrixDim d);
void cudaD_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *A, double beta, double *dst, MatrixDim d);
void cudaD_add_vec_to_cols(dim3 Gr, dim3 Bl, double alpha, const double *col, double beta, double *dst, MatrixDim d);
@ -103,31 +211,101 @@ void cudaD_add_vec_to_rows(dim3 Gr, dim3 Bl, double alpha, const double *row, do
/*
* CuVector
*/
// Double-precision (cudaD_ prefix) declarations listed under the CuVector heading.
// Most take int Gr/Bl; cudaD_vec_min, cudaD_vec_max and the two cudaD_trace_mat_mat*
// functions take no Gr/Bl arguments at all.
// NOTE(review): cudaD_copy_from_vec_df takes a double source and a double destination,
// and cudaD_vec_apply_floor takes a float* num in an otherwise double signature;
// confirm both are intended.
void cudaD_replace_value(int Gr, int Bl, double *v, int dim, double orig, double changed);
void cudaD_set_bias_params(int Gr, int Bl, double* v, const double* a, double param_1, double param_2, double param_3, int* flag, int dim);
void cudaD_copy_from_vec_df(int Gr, int Bl, double* v_out, const double* v_in, int dim);
void cudaD_copy_from_vec_fd(int Gr, int Bl, float* v_out, const double* v_in, int dim);
void cudaD_vec_mul_elements(int Gr, int Bl, double* v, const double* a, int dim);
void cudaD_vec_soft_max(int Gr, int Bl, double* v, int dim);
void cudaD_vec_min(const double* v, double* value, int dim);
void cudaD_vec_max(const double* v, double* value, int dim);
void cudaD_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value);
void cudaD_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value);
void cudaD_add_diag_mat_trans(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim);
void cudaD_add_diag_mat(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim);
void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M,
int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride,
int N_col_stride, int threads_per_element, double beta);
void cudaD_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, const double* y, double beta, int dim);
void cudaD_copy_col_from_mat(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim);
void cudaD_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim);
void cudaD_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const double* mat, MatrixDim dmat, int dim);
void cudaD_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc);
void cudaD_pvec_sum(int Gr, int Bl, double* vec, double* pvec_sum, int dim, int size);
void cudaD_vec_copy_diag_from_packed(int Gr, int Bl, double *dst, const double *src, int dim);
void cudaD_vec_apply_floor(int Gr, int Bl, double* v, double floor_val, float* num, int dim);
void cudaD_vec_apply_exp(int Gr, int Bl, double* v, int dim);
void cudaD_vec_apply_log(int Gr, int Bl, double* v, double* flag, int dim);
void cudaD_trace(int Gr, int Bl, double* mat, double* value, int dim);
void cudaD_add_row_sum_mat(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d);
void cudaD_add_col_sum_mat(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d);
void cudaD_invert_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim d);
// Note: B_trans is nonzero if B is transposed.
// Double-precision declaration. data/d is the writable output; Adata and B_cu_data are
// read-only inputs, with A described by explicit row/column counts and strides.
// B_trans is an int flag (nonzero if B is transposed).
void cudaD_add_mat_blockmat(dim3 Gr, dim3 Bl, double *data, MatrixDim d, const double *Adata,
int A_num_rows, int A_num_cols, int A_row_stride, int A_col_stride,
const CuBlockMatrixData *B_cu_data, int B_num_blocks,
double alpha, double beta, int B_trans);
// Double-precision declaration. B_cu_data is non-const here (unlike in
// cudaD_add_mat_blockmat), while C_data and D_data are read-only and are described
// by explicit row/column strides.
void cudaD_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int num_blocks,
const double *C_data, int C_num_cols, int C_row_stride, int C_col_stride,
const double *D_data, int D_row_stride, int D_col_stride,
double alpha, double beta);
/*
* cu::
*/
// Double-precision (cudaD_ prefix) declarations listed under the "cu::" heading.
// cudaD_softmax and cudaD_softmax_reduce take size_t Gr/Bl; the others take dim3 or int.
// NOTE(review): cudaD_sigmoid, cudaD_diff_sigmoid and cudaD_tanh are each declared twice,
// with and without a trailing src_stride argument. This looks like the old and new lines
// of the change shown together; confirm which signature is current.
void cudaD_softmax(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d);
void cudaD_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d, int src_stride);
void cudaD_softmax_part(dim3 Gr, dim3 Bl, const double *X, const int32_cuda *vec_ids, double* Y, MatrixDim d);
void cudaD_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d);
void cudaD_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d);
void cudaD_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d);
void cudaD_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride);
void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power);
void cudaD_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride);
void cudaD_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int src_stride);
void cudaD_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride);
void cudaD_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d);
void cudaD_regularize_l1(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1, double lr, MatrixDim d);
void cudaD_find_row_max_id(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d);
void cudaD_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, double *mat_net_out, double *vec_log_post, MatrixDim d);
void cudaD_copy_rows_from_vec(dim3 Gr, dim3 Bl, double *mat_out, MatrixDim d_out, const double *v_in);
void cudaD_randomize(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in);
void cudaD_splice(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in);
void cudaD_one(int Gr, int Bl, double* x, int dim);
void cudaD_copy(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in);
void cudaD_copy_from_sp(int Gr, int Bl, const double* x, double* y, int d_in, MatrixDim d_out);
void cudaD_take_lower(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in);
void cudaD_take_upper(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in);
void cudaD_take_mean(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in);
// some mostly mixed-type kernels.
// Matrix copy declarations for every float/double combination. The two-letter suffix
// encodes <type of mat_out><type of mat_in>, with f = float and d = double
// (e.g. _df writes double from a float source). Each has a _trans variant with the same
// parameter list.
void cuda_copy_from_mat_df(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in);
void cuda_copy_from_mat_ff(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in);
void cuda_copy_from_mat_fd(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in);
void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in);
void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in);
void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in);
void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in);
void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in);
// Double-precision counterparts of cudaF_comp_obj_deriv, cudaF_transpose_matrix,
// cudaF_sy_add_tr2, cudaF_sum_column_ranges and cudaF_matrix_lookup; the parameter
// lists match the float versions with float replaced by double.
void cudaD_comp_obj_deriv(dim3 Gr,dim3 Bl, MatrixElement<double>* x, int s, const double* z, MatrixDim d, double* z2, MatrixDim d2, double* t);
void cudaD_transpose_matrix(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
// T and its dimensions are inputs (const); S is the non-const output buffer.
void cudaD_sy_add_tr2(dim3 Gr, dim3 Bl, double alpha, double beta, const double* T, MatrixDim tdim,
double *S, MatrixDim sdim);
// data/dim describe the writable destination; src_data/src_dim and indices are read-only.
void cudaD_sum_column_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim,
const double *src_data, MatrixDim src_dim,
const Int32Pair *indices);
// Reads from data using indices_size Int32Pair entries and writes into output.
void cudaD_matrix_lookup(dim3 Gr, dim3 Bl, const double *data, MatrixDim dim,
const Int32Pair *indices, int indices_size,
double *output);
} // extern "C"
#endif // HAVE_CUDA
#endif

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,6 +1,11 @@
// cudamatrix/cu-kernels.h
// Copyright 2009-2012 Karel Vesely
// 2013 Ehsan Variani
// 2014 Johns Hopkins University (author: Daniel Povey)
// 2013 Hainan Xu
// 2013 Xiaohui Zhang
// 2013 Johns Hopkins University (author: Guoguo Chen)
// See ../../COPYING for clarification regarding multiple authors
//
@ -22,7 +27,7 @@
#ifndef KALDI_CUDAMATRIX_CU_KERNELS_H_
#define KALDI_CUDAMATRIX_CU_KERNELS_H_
#if HAVE_CUDA==1
#if HAVE_CUDA == 1
#include "base/kaldi-error.h"
#include "cudamatrix/cu-kernels-ansi.h"
@ -34,147 +39,366 @@
namespace kaldi {
/*********************************************************
* base templates
*/
/*
* CuMatrix
*/
// Generic (unspecialized) templates for the CuMatrix wrappers. Every body only streams
// the function name (__func__) and " Not implemented!" into KALDI_ERR, so these are
// placeholders: the usable versions are explicit specializations such as the <float> ones.
// NOTE(review): presumably KALDI_ERR aborts or throws at run time -- confirm in
// base/kaldi-error.h.
template<typename Real> inline void cuda_set_const(dim3 Gr, dim3 Bl, Real *mat, Real value, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_add(dim3 Gr, dim3 Bl, Real *mat, Real value, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_scale(dim3 Gr, dim3 Bl, Real *mat, Real value, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_apply_log(dim3 Gr, dim3 Bl, Real *mat, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_mul_elements(dim3 Gr, dim3 Bl, Real *mat, const Real *A, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, Real *mat, const Real *scale, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, Real *mat, const Real *scale, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_div_rows_vec(dim3 Gr, dim3 Bl, Real *mat, const Real *vec_div, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_add_mat(dim3 Gr, dim3 Bl, Real alpha, const Real *A, Real beta, Real *dst, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, Real alpha, const Real *col, Real beta, Real *dst, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, Real alpha, const Real *row, Real beta, Real *dst, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
/*
* CuVector
*/
// Generic (unspecialized) templates for the remaining wrappers (row/column sums,
// element inversion, activation functions, softmax, regularization, row max, xent,
// randomize/splice/copy). As with the CuMatrix templates, every body only streams
// __func__ and " Not implemented!" into KALDI_ERR; real work is done by the explicit
// specializations.
template<typename Real> inline void cuda_add_row_sum_mat(dim3 Gr, dim3 Bl, const Real *mat, Real *vec_sum, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_add_col_sum_mat(dim3 Gr, dim3 Bl, const Real *mat, Real *vec_sum, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_invert_elements(dim3 Gr, dim3 Bl, Real *data, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_sigmoid(dim3 Gr, dim3 Bl, Real *y, const Real *x, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, Real *eout, const Real *e, const Real *y, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_tanh(dim3 Gr, dim3 Bl, Real *y, const Real *x, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, Real *eout, const Real *e, const Real *y, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_softmax(size_t Gr, size_t Bl, Real *y, const Real *x, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_softmax_part(dim3 Gr, dim3 Bl, const Real *X, const int32_cuda *vec_ids, Real* Y, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, Real *wei, Real *grad, Real l1, Real lr, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const Real *mat, Real *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, Real *mat_net_out, Real *vec_log_post, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_randomize(dim3 Gr, dim3 Bl, Real *y, const Real *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_splice(dim3 Gr, dim3 Bl, Real *y, const Real *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_copy(dim3 Gr, dim3 Bl, Real *y, const Real *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { KALDI_ERR << __func__ << " Not implemented!"; }
/*********************************************************
* float specializations
*/
/*
* CuMatrix
*/
// Explicit <float> specializations of the CuMatrix templates. Each one forwards its
// arguments, unchanged and in the same order, to the cudaF_ function of the same name.
template<> inline void cuda_set_const<float>(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_set_const(Gr,Bl,mat,value,d); }
template<> inline void cuda_add<float>(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_add(Gr,Bl,mat,value,d); }
template<> inline void cuda_scale<float>(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_scale(Gr,Bl,mat,value,d); }
template<> inline void cuda_apply_log<float>(dim3 Gr, dim3 Bl, float *mat, MatrixDim d) { cudaF_apply_log(Gr,Bl,mat,d); }
template<> inline void cuda_mul_elements<float>(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim d) { cudaF_mul_elements(Gr,Bl,mat,A,d); }
template<> inline void cuda_mul_cols_vec<float>(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_cols_vec(Gr,Bl,mat,scale,d); }
template<> inline void cuda_mul_rows_vec<float>(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_rows_vec(Gr,Bl,mat,scale,d); }
template<> inline void cuda_div_rows_vec<float>(dim3 Gr, dim3 Bl, float *mat, const float *vec_div, MatrixDim d) { cudaF_div_rows_vec(Gr,Bl,mat,vec_div,d); }
template<> inline void cuda_add_mat<float>(dim3 Gr, dim3 Bl, float alpha, const float *A, float beta, float *dst, MatrixDim d) { cudaF_add_mat(Gr,Bl,alpha,A,beta,dst,d); }
template<> inline void cuda_add_vec_to_cols<float>(dim3 Gr, dim3 Bl, float alpha, const float *col, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); }
template<> inline void cuda_add_vec_to_rows<float>(dim3 Gr, dim3 Bl, float alpha, const float *row, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_rows(Gr,Bl,alpha,row,beta,dst,d); }
// Non-template float overloads: forward (Gr, Bl, A, dimA) unchanged to
// cudaF_copy_upp_low and cudaF_copy_low_upp respectively.
inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA) { cudaF_copy_upp_low(Gr, Bl, A, dimA); }
inline void cuda_copy_low_upp(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA) { cudaF_copy_low_upp(Gr, Bl, A, dimA); }
// float overload of cuda_add_diag_vec_mat: hands every argument, unchanged and in the
// same order, to cudaF_add_diag_vec_mat.
inline void cuda_add_diag_vec_mat(dim3 Gr,
                                  dim3 Bl,
                                  float alpha,
                                  float *mat,
                                  MatrixDim mat_dim,
                                  const float *vec,
                                  const float *mat2,
                                  int mat2_row_stride,
                                  int mat2_col_stride,
                                  float beta) {
  cudaF_add_diag_vec_mat(Gr, Bl,
                         alpha, mat, mat_dim,
                         vec,
                         mat2, mat2_row_stride, mat2_col_stride,
                         beta);
}
// Overloads writing into a float matrix A, selected by the element type of B:
// a float source forwards to the cudaF_ function, a double source to the cudaFD_ one.
inline void cuda_copy_from_tp_trans(int Gr, int Bl, float* A, const float* B, MatrixDim dmat) { cudaF_copy_from_tp_trans(Gr,Bl,A,B,dmat); }
inline void cuda_copy_from_tp_trans(int Gr, int Bl, float* A, const double* B, MatrixDim dmat) { cudaFD_copy_from_tp_trans(Gr,Bl,A,B,dmat); }
inline void cuda_copy_from_tp(int Gr, int Bl, float* A, const float* B, MatrixDim dmat) { cudaF_copy_from_tp(Gr,Bl,A,B,dmat); }
inline void cuda_copy_from_tp(int Gr, int Bl, float* A, const double* B, MatrixDim dmat) { cudaFD_copy_from_tp(Gr,Bl,A,B,dmat); }
// cuda_copy_from_mat: four overloads chosen by the element types of mat_out and mat_in.
// Each forwards its arguments unchanged to cuda_copy_from_mat_XY, where X is the
// destination type and Y the source type (f = float, d = double).

// float <- double
inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl,
                               float* mat_out, const double* mat_in,
                               MatrixDim d_out, MatrixDim d_in) {
  cuda_copy_from_mat_fd(Gr, Bl, mat_out, mat_in, d_out, d_in);
}

// float <- float
inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl,
                               float* mat_out, const float* mat_in,
                               MatrixDim d_out, MatrixDim d_in) {
  cuda_copy_from_mat_ff(Gr, Bl, mat_out, mat_in, d_out, d_in);
}

// double <- double
inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl,
                               double* mat_out, const double* mat_in,
                               MatrixDim d_out, MatrixDim d_in) {
  cuda_copy_from_mat_dd(Gr, Bl, mat_out, mat_in, d_out, d_in);
}

// double <- float
inline void cuda_copy_from_mat(dim3 Gr, dim3 Bl,
                               double* mat_out, const float* mat_in,
                               MatrixDim d_out, MatrixDim d_in) {
  cuda_copy_from_mat_df(Gr, Bl, mat_out, mat_in, d_out, d_in);
}
// cuda_copy_from_mat_trans: four overloads chosen by the element types of mat_out and
// mat_in. Each forwards its arguments unchanged to cuda_copy_from_mat_XY_trans, where X is
// the destination type and Y the source type (f = float, d = double).

// float <- double
inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl,
                                     float* mat_out, const double* mat_in,
                                     MatrixDim d_out, MatrixDim d_in) {
  cuda_copy_from_mat_fd_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
}

// float <- float
inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl,
                                     float* mat_out, const float* mat_in,
                                     MatrixDim d_out, MatrixDim d_in) {
  cuda_copy_from_mat_ff_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
}

// double <- double
inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl,
                                     double* mat_out, const double* mat_in,
                                     MatrixDim d_out, MatrixDim d_in) {
  cuda_copy_from_mat_dd_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
}

// double <- float
inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl,
                                     double* mat_out, const float* mat_in,
                                     MatrixDim d_out, MatrixDim d_in) {
  cuda_copy_from_mat_df_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
}
// Non-template float overloads. Each forwards its arguments, unchanged and in the same
// order, to the cudaF_ function of the same name.
inline void cuda_copy_col_from_vec(int Gr, int Bl, float* mat, const float* v, int col, MatrixDim d) { cudaF_copy_col_from_vec(Gr,Bl,mat,v,col,d); }
inline void cuda_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_apply_exp(Gr,Bl,mat,d); }
inline void cuda_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim dim) { cudaF_apply_pow(Gr,Bl,mat,power,dim); }
inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim dim) { cudaF_apply_heaviside(Gr,Bl,mat,dim); }
inline void cuda_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim dim) { cudaF_apply_floor(Gr,Bl,mat,floor_val,dim); }
inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim dim) { cudaF_apply_ceiling(Gr,Bl,mat,ceiling_val,dim); }
// float overload of cuda_copy_cols: forwards all arguments unchanged to cudaF_copy_cols.
// dst is described by dst_dim; for src only a stride (src_stride) is passed.
inline void cuda_copy_cols(dim3 Gr, dim3 Bl,
                           float* dst,
                           const float* src,
                           const MatrixIndexT_cuda* reorder,
                           MatrixDim dst_dim,
                           int src_stride) {
  cudaF_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}

// float overload of cuda_copy_rows: same parameter list as cuda_copy_cols, forwarded
// unchanged to cudaF_copy_rows.
inline void cuda_copy_rows(dim3 Gr, dim3 Bl,
                           float* dst,
                           const float* src,
                           const MatrixIndexT_cuda* reorder,
                           MatrixDim dst_dim,
                           int src_stride) {
  cudaF_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
// Non-template float overloads. Each forwards its arguments, unchanged and in the same
// order, to the cudaF_ function of the same name.
// NOTE(review): cuda_set_const, cuda_add, cuda_scale and cuda_apply_log also exist in this
// text as template<> <float> specializations with identical bodies; presumably those are
// the removed lines of the change -- confirm.
inline void cuda_trace(int Gr, int Bl, float* mat, float* value, int dim) { cudaF_trace(Gr,Bl,mat,value,dim); }
inline void cuda_set_diag(int Gr, int Bl, float* mat, float value, MatrixDim d) { cudaF_set_diag(Gr,Bl,mat,value,d); }
inline void cuda_set_diag_packed(int Gr, int Bl, float* mat, float value, int dim) { cudaF_set_diag_packed(Gr,Bl,mat,value,dim); }
inline void cuda_add_diag_packed(int Gr, int Bl, float* mat, float value, int dim) { cudaF_add_diag_packed(Gr,Bl,mat,value,dim); }
inline void cuda_set_const(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_set_const(Gr,Bl,mat,value,d); }
inline void cuda_set_zero_above_diag(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_set_zero_above_diag(Gr,Bl,mat,d); }
inline void cuda_add(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_add(Gr,Bl,mat,value,d); }
inline void cuda_add_vec2(dim3 Gr, dim3 Bl, float *mat, const float *vec, const float alpha, int dim) { cudaF_add_vec2(Gr,Bl,mat,vec,alpha,dim); }
inline void cuda_scale_diag(int Gr, int Bl, float* mat, float value, int dim) { cudaF_scale_diag(Gr,Bl,mat,value,dim); }
inline void cuda_scale(dim3 Gr, dim3 Bl, float *mat, float value, MatrixDim d) { cudaF_scale(Gr,Bl,mat,value,d); }
inline void cuda_apply_log(dim3 Gr, dim3 Bl, float *mat, MatrixDim d) { cudaF_apply_log(Gr,Bl,mat,d); }
// float overload of cuda_mul_elements taking a separate stride for the source A:
// forwards all arguments unchanged to cudaF_mul_elements.
inline void cuda_mul_elements(dim3 Gr, dim3 Bl,
                              float *mat,
                              const float *A,
                              MatrixDim dst_d,
                              int src_stride) {
  cudaF_mul_elements(Gr, Bl, mat, A, dst_d, src_stride);
}

// float overload of cuda_max: same parameter list as cuda_mul_elements, forwarded
// unchanged to cudaF_max.
inline void cuda_max(dim3 Gr, dim3 Bl,
                     float *mat,
                     const float *A,
                     MatrixDim dst_d,
                     int src_stride) {
  cudaF_max(Gr, Bl, mat, A, dst_d, src_stride);
}
// Non-template float overloads. Each forwards its arguments, unchanged and in the same
// order, to the cudaF_ function of the same name.
inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_cols_vec(Gr,Bl,mat,scale,d); }
inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_rows_vec(Gr,Bl,mat,scale,d); }
inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size) { cudaF_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size); }
inline void cuda_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size, float power) {cudaF_calc_pnorm_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size, power); }
inline void cuda_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *A, float beta, float *dst, MatrixDim d) { cudaF_add_mat(Gr,Bl,alpha,A,beta,dst,d); }
inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, float alpha, const float *col, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); }
inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, float alpha, const float *row, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_rows(Gr,Bl,alpha,row,beta,dst,d); }
inline void cuda_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_transpose_matrix(Gr, Bl, mat, d); }
// float overload of cuda_sy_add_tr2: forwards all arguments unchanged to cudaF_sy_add_tr2.
// T (with tdim) is read-only; S (with sdim) is the writable buffer.
inline void cuda_sy_add_tr2(dim3 Gr, dim3 Bl,
                            float alpha, float beta,
                            const float* T, MatrixDim tdim,
                            float *S, MatrixDim sdim) {
  cudaF_sy_add_tr2(Gr, Bl,
                   alpha, beta,
                   T, tdim,
                   S, sdim);
}
/*
* CuVector
*/
// float wrappers listed under the CuVector heading: three template<> <float>
// specializations followed by non-template overloads. Each forwards its arguments,
// unchanged and in the same order, to the cudaF_ function of the same name.
// cuda_vec_min, cuda_vec_max and the cuda_trace_mat_mat* wrappers take no Gr/Bl arguments.
// NOTE(review): cuda_copy_from_vec_fd takes a float source and a float destination, whereas
// the "fd" suffix on cuda_copy_from_mat_fd pairs a float destination with a double source;
// confirm the float source here is intended.
template<> inline void cuda_add_row_sum_mat<float>(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d) { cudaF_add_row_sum_mat(Gr,Bl,mat,vec_sum,d); }
template<> inline void cuda_add_col_sum_mat<float>(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d) { cudaF_add_col_sum_mat(Gr,Bl,mat,vec_sum,d); }
template<> inline void cuda_invert_elements<float>(dim3 Gr, dim3 Bl, float *data, MatrixDim d) { cudaF_invert_elements(Gr,Bl,data,d); }
inline void cuda_replace_value(int Gr, int Bl, float *v, int dim, float orig, float changed) {cudaF_replace_value(Gr, Bl, v, dim, orig, changed); }
inline void cuda_div_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *vec_div, MatrixDim d) { cudaF_div_rows_vec(Gr,Bl,mat,vec_div,d); }
inline void cuda_set_bias_params(int Gr, int Bl, float* v, const float* a, float param_1, float param_2, float param_3, int* flag, int dim) { cudaF_set_bias_params(Gr,Bl,v,a,param_1,param_2,param_3,flag,dim); }
inline void cuda_copy_from_vec_df(int Gr, int Bl, double* v_out, const float* v_in, int dim) { cudaF_copy_from_vec_df(Gr,Bl,v_out,v_in,dim); }
inline void cuda_copy_from_vec_fd(int Gr, int Bl, float* v_out, const float* v_in, int dim) { cudaF_copy_from_vec_fd(Gr,Bl,v_out,v_in,dim); }
inline void cuda_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim) { cudaF_vec_mul_elements(Gr,Bl,v,a,dim); }
inline void cuda_vec_soft_max(int Gr, int Bl, float* v, int dim) { cudaF_vec_soft_max(Gr,Bl,v,dim); }
inline void cuda_vec_min(const float* v, float* value, int dim) { cudaF_vec_min(v,value,dim); }
inline void cuda_vec_max(const float* v, float* value, int dim) { cudaF_vec_max(v,value,dim); }
inline void cuda_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat_trans(A,B,dA,B_stride,value); }
inline void cuda_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat(A,B,dA,B_stride,value); }
inline void cuda_add_diag_mat_trans(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim) { cudaF_add_diag_mat_trans(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
// float overload of cuda_add_diag_mat_mat: forwards every argument, unchanged and in the
// same order, to cudaF_add_diag_mat_mat. M and N are read-only and described by explicit
// row/column strides; v (length v_dim) is the writable vector.
inline void cuda_add_diag_mat_mat(int Gr, int Bl,
                                  float alpha,
                                  float* v, int v_dim,
                                  const float* M, int M_cols,
                                  int M_row_stride, int M_col_stride,
                                  const float *N,
                                  int N_row_stride, int N_col_stride,
                                  int threads_per_element,
                                  float beta) {
  cudaF_add_diag_mat_mat(Gr, Bl,
                         alpha,
                         v, v_dim,
                         M, M_cols, M_row_stride, M_col_stride,
                         N, N_row_stride, N_col_stride,
                         threads_per_element,
                         beta);
}
// Non-template float overloads. Each forwards its arguments, unchanged and in the same
// order, to the cudaF_ function of the same name.
// NOTE(review): cuda_copy_col_from_mat_fd takes a float destination and a float source;
// confirm that matches the intended meaning of the "fd" suffix.
// NOTE(review): cuda_add_row_sum_mat, cuda_add_col_sum_mat and cuda_invert_elements also
// exist in this text as template<> <float> specializations with identical bodies;
// presumably those are the removed lines of the change -- confirm.
inline void cuda_add_diag_mat(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim) { cudaF_add_diag_mat(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
inline void cuda_add_vec_vec(int Gr, int Bl, float alpha, float* v, const float* x, const float* y, float beta, int dim) { cudaF_add_vec_vec(Gr,Bl,alpha,v,x,y,beta,dim); }
inline void cuda_copy_col_from_mat(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat(Gr,Bl,v,col,mat,dmat,dim); }
inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat_df(Gr,Bl,v,col,mat,dmat,dim); }
inline void cuda_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat_fd(Gr,Bl,v,col,mat,dmat,dim); }
inline void cuda_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc) { cudaF_vec_sum(Gr,Bl,v,value,dim,inc); }
inline void cuda_pvec_sum(int Gr, int Bl, float* vec, float* pvec_sum, int dim, int size) { cudaF_pvec_sum(Gr, Bl, vec, pvec_sum, dim, size); }
inline void cuda_vec_copy_diag_from_packed(int Gr, int Bl, float *dst, const float *src, int dim) { cudaF_vec_copy_diag_from_packed(Gr,Bl,dst,src,dim); }
inline void cuda_vec_apply_floor(int Gr, int Bl, float* v, float floor_val, float* num, int dim) { cudaF_vec_apply_floor(Gr,Bl,v,floor_val,num,dim); }
inline void cuda_vec_apply_exp(int Gr, int Bl, float* v, int dim) { cudaF_vec_apply_exp(Gr,Bl,v,dim); }
inline void cuda_vec_apply_log(int Gr, int Bl, float* v, float* flag, int dim) { cudaF_vec_apply_log(Gr,Bl,v,flag,dim); }
inline void cuda_add_row_sum_mat(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d) { cudaF_add_row_sum_mat(Gr,Bl,mat,vec_sum,d); }
inline void cuda_add_col_sum_mat(dim3 Gr, dim3 Bl, const float *mat, float *vec_sum, MatrixDim d) { cudaF_add_col_sum_mat(Gr,Bl,mat,vec_sum,d); }
inline void cuda_invert_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim d) { cudaF_invert_elements(Gr,Bl,data,d); }
// B_trans nonzero if B transposed.
// float overload of cuda_add_mat_blockmat: forwards every argument, unchanged and in the
// same order, to cudaF_add_mat_blockmat. data/d is the writable output; Adata and
// B_cu_data are read-only. B_trans is an int flag (nonzero if B is transposed).
inline void cuda_add_mat_blockmat(dim3 Gr, dim3 Bl,
                                  float *data, MatrixDim d,
                                  const float *Adata,
                                  int A_num_rows, int A_num_cols,
                                  int A_row_stride, int A_col_stride,
                                  const CuBlockMatrixData *B_cu_data,
                                  int B_num_blocks,
                                  float alpha, float beta,
                                  int B_trans) {
  cudaF_add_mat_blockmat(Gr, Bl,
                         data, d,
                         Adata, A_num_rows, A_num_cols, A_row_stride, A_col_stride,
                         B_cu_data, B_num_blocks,
                         alpha, beta,
                         B_trans);
}

// float overload of cuda_block_add_mat_mat: forwards every argument, unchanged and in the
// same order, to cudaF_block_add_mat_mat. B_cu_data is non-const here; C_data and D_data
// are read-only and described by explicit row/column strides.
inline void cuda_block_add_mat_mat(dim3 Gr, dim3 Bl,
                                   CuBlockMatrixData *B_cu_data,
                                   int num_blocks,
                                   const float *C_data, int C_num_cols,
                                   int C_row_stride, int C_col_stride,
                                   const float *D_data,
                                   int D_row_stride, int D_col_stride,
                                   float alpha, float beta) {
  cudaF_block_add_mat_mat(Gr, Bl,
                          B_cu_data, num_blocks,
                          C_data, C_num_cols, C_row_stride, C_col_stride,
                          D_data, D_row_stride, D_col_stride,
                          alpha, beta);
}
/*
* cu::
*/
// float wrappers listed under the "cu::" heading. Each forwards its arguments, unchanged
// and in the same order, to the cudaF_ function of the same name.
// The template<> <float> specializations of cuda_sigmoid, cuda_diff_sigmoid and cuda_tanh
// take no stride, while the non-template overloads of the same names add a trailing
// src_stride argument.
// NOTE(review): presumably the specializations are the removed lines of the change and the
// non-template overloads the added ones -- confirm.
template<> inline void cuda_sigmoid<float>(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d) { cudaF_sigmoid(Gr,Bl,y,x,d); }
template<> inline void cuda_diff_sigmoid<float>(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d) { cudaF_diff_sigmoid(Gr,Bl,eout,e,y,d); }
template<> inline void cuda_tanh<float>(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d) { cudaF_tanh(Gr,Bl,y,x,d); }
template<> inline void cuda_diff_tanh<float>(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d) { cudaF_diff_tanh(Gr,Bl,eout,e,y,d); }
template<> inline void cuda_softmax<float>(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d) { cudaF_softmax(Gr,Bl,y,x,d); }
template<> inline void cuda_softmax_part<float>(dim3 Gr, dim3 Bl, const float *X, const int32_cuda *vec_ids, float* Y, MatrixDim d) { cudaF_softmax_part(Gr,Bl,X,vec_ids,Y,d); }
inline void cuda_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_soft_hinge(Gr,Bl,y,x,d,src_stride); }
inline void cuda_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power) { cudaF_group_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power);}
inline void cuda_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_sigmoid(Gr,Bl,y,x,d,src_stride); }
inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int src_stride) { cudaF_diff_sigmoid(Gr,Bl,eout,e,y,d,src_stride); }
inline void cuda_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_tanh(Gr,Bl,y,x,d,src_stride); }
inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d) { cudaF_diff_tanh(Gr,Bl,eout,e,y,d); }
inline void cuda_softmax(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d) { cudaF_softmax(Gr,Bl,y,x,d); }
/*
Launch configuration expected by cuda_softmax_reduce:
Bl: the dimBlock value is fixed at min(number of columns in d, CU1DBLOCK), i.e. up to
    CU1DBLOCK threads reduce one row at the same time.
Gr: the number of rows
*/
inline void cuda_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_softmax_reduce(Gr,Bl,y,x,d,src_stride); }
// Forwards unchanged to cudaF_softmax_part; takes dim3 launch sizes, unlike cuda_softmax_reduce.
inline void cuda_softmax_part(dim3 Gr, dim3 Bl, const float *X, const int32_cuda *vec_ids, float* Y, MatrixDim d) { cudaF_softmax_part(Gr,Bl,X,vec_ids,Y,d); }
// Single-precision wrappers for L1 regularization, per-row maximum search, cross-entropy
// derivative, and the randomize/splice/copy helpers. `template<>` lines are explicit float
// specializations; plain `inline` lines are overloads. All forward to the same-named cudaF_* function.
template<> inline void cuda_regularize_l1<float>(dim3 Gr, dim3 Bl, float *wei, float *grad, float l1, float lr, MatrixDim d) { cudaF_regularize_l1(Gr,Bl,wei,grad,l1,lr,d); }
template<> inline void cuda_find_row_max_id<float>(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaF_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); }
template<> inline void cuda_diff_xent<float>(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, float *mat_net_out, float *vec_log_post, MatrixDim d) { cudaF_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d); }
template<> inline void cuda_randomize<float>(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaF_randomize(Gr,Bl,y,x,copy_from,d_out,d_in); }
template<> inline void cuda_splice<float>(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { cudaF_splice(Gr,Bl,y,x,off,d_out,d_in); }
template<> inline void cuda_copy<float>(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaF_copy(Gr,Bl,y,x,copy_from,d_out,d_in); }
inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, float *wei, float *grad, float l1, float lr, MatrixDim d) { cudaF_regularize_l1(Gr,Bl,wei,grad,l1,lr,d); }
inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaF_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); }
inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, float *mat_net_out, float *vec_log_post, MatrixDim d) { cudaF_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d); }
inline void cuda_copy_rows_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, const float *v_in) {
cudaF_copy_rows_from_vec(Gr, Bl, mat_out, d_out, v_in);
}
/*********************************************************
* float versions (continued); the double versions begin further below
*/
// Single-precision overloads; each forwards its arguments unchanged to the same-named cudaF_* function.
inline void cuda_randomize(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaF_randomize(Gr,Bl,y,x,copy_from,d_out,d_in); }
inline void cuda_splice(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { cudaF_splice(Gr,Bl,y,x,off,d_out,d_in); }
inline void cuda_one(int Gr,int Bl,float* x,int dim) { cudaF_one(Gr,Bl,x,dim); }
inline void cuda_copy(dim3 Gr, dim3 Bl, float *y, const float *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaF_copy(Gr,Bl,y,x,copy_from,d_out,d_in); }
// In the next four wrappers x is the const input and y the output pointer.
inline void cuda_copy_from_sp(int Gr, int Bl, const float* x, float* y, int d_in, MatrixDim d_out) { cudaF_copy_from_sp(Gr,Bl,x,y,d_in,d_out); }
inline void cuda_take_lower(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in) { cudaF_take_lower(Gr,Bl,x,y,d_in); }
inline void cuda_take_upper(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in) { cudaF_take_upper(Gr,Bl,x,y,d_in); }
inline void cuda_take_mean(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in) { cudaF_take_mean(Gr,Bl,x,y,d_in); }
inline void cuda_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement<float>* x, int32 size, const float* z, MatrixDim d, float* z2, MatrixDim d2, float* t) {cudaF_comp_obj_deriv(Gr,Bl,x,size,z,d,z2,d2,t); }
// Single-precision overload: forwards unchanged to cudaF_sum_column_ranges.
// `indices` is an array of Int32Pair; no element count is passed to this wrapper.
inline void cuda_sum_column_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim,
const float *src_data, MatrixDim src_dim,
const Int32Pair *indices) {
cudaF_sum_column_ranges(Gr, Bl, data, dim, src_data, src_dim, indices);
}
// Single-precision overload: forwards unchanged to cudaF_matrix_lookup.
// Here the number of Int32Pair entries is passed explicitly as indices_size.
inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl, const float *data,
MatrixDim dim, const Int32Pair *indices,
int indices_size, float *output) {
cudaF_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output);
}
// double versions
/*
* CuMatrix
*/
// Double-precision matrix wrappers. `template<>` lines are explicit specializations for
// double; plain `inline` lines are overloads. All forward to the same-named cudaD_* function.
template<> inline void cuda_set_const<double>(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_set_const(Gr,Bl,mat,value,d); }
template<> inline void cuda_add<double>(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_add(Gr,Bl,mat,value,d); }
template<> inline void cuda_scale<double>(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_scale(Gr,Bl,mat,value,d); }
template<> inline void cuda_apply_log<double>(dim3 Gr, dim3 Bl, double *mat, MatrixDim d) { cudaD_apply_log(Gr,Bl,mat,d); }
template<> inline void cuda_mul_elements<double>(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim d) { cudaD_mul_elements(Gr,Bl,mat,A,d); }
template<> inline void cuda_mul_cols_vec<double>(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d) { cudaD_mul_cols_vec(Gr,Bl,mat,scale,d); }
template<> inline void cuda_mul_rows_vec<double>(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d) { cudaD_mul_rows_vec(Gr,Bl,mat,scale,d); }
template<> inline void cuda_div_rows_vec<double>(dim3 Gr, dim3 Bl, double *mat, const double *vec_div, MatrixDim d) { cudaD_div_rows_vec(Gr,Bl,mat,vec_div,d); }
template<> inline void cuda_add_mat<double>(dim3 Gr, dim3 Bl, double alpha, const double *A, double beta, double *dst, MatrixDim d) { cudaD_add_mat(Gr,Bl,alpha,A,beta,dst,d); }
template<> inline void cuda_add_vec_to_cols<double>(dim3 Gr, dim3 Bl, double alpha, const double *col, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); }
template<> inline void cuda_add_vec_to_rows<double>(dim3 Gr, dim3 Bl, double alpha, const double *row, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_rows(Gr,Bl,alpha,row,beta,dst,d); }
// Both of these operate in place on A.
inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA) { cudaD_copy_upp_low(Gr, Bl, A, dimA); }
inline void cuda_copy_low_upp(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA) { cudaD_copy_low_upp(Gr, Bl, A, dimA); }
// Double-precision overload: forwards every argument unchanged to cudaD_add_diag_vec_mat.
// mat is the only non-const pointer (the output); mat2 is addressed through explicit
// row and column strides.
inline void cuda_add_diag_vec_mat(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim,
const double *vec, const double *mat2, int mat2_row_stride,
int mat2_col_stride, double beta) {
cudaD_add_diag_vec_mat(Gr, Bl, alpha, mat, mat_dim, vec, mat2,
mat2_row_stride, mat2_col_stride, beta);
}
// Double-precision destination overloads. The variants whose source B is `const float*`
// dispatch to the mixed-precision cudaDF_* functions; those with a double source use cudaD_*.
inline void cuda_copy_from_tp_trans(int Gr, int Bl, double* A, const double* B, MatrixDim dmat) { cudaD_copy_from_tp_trans(Gr,Bl,A,B,dmat); }
inline void cuda_copy_from_tp_trans(int Gr, int Bl, double* A, const float* B, MatrixDim dmat) { cudaDF_copy_from_tp_trans(Gr,Bl,A,B,dmat); }
inline void cuda_copy_from_tp(int Gr, int Bl, double* A, const double* B, MatrixDim dmat) { cudaD_copy_from_tp(Gr,Bl,A,B,dmat); }
inline void cuda_copy_from_tp(int Gr, int Bl, double* A, const float* B, MatrixDim dmat) { cudaDF_copy_from_tp(Gr,Bl,A,B,dmat); }
inline void cuda_copy_col_from_vec(int Gr, int Bl, double* mat, const double* v, int col, MatrixDim d) { cudaD_copy_col_from_vec(Gr,Bl,mat,v,col,d); }
// The cuda_apply_* wrappers below take only a non-const mat pointer, i.e. they work in place.
inline void cuda_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { cudaD_apply_exp(Gr,Bl,mat,d); }
inline void cuda_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim dim) { cudaD_apply_pow(Gr,Bl,mat,power,dim); }
inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim dim) { cudaD_apply_heaviside(Gr,Bl,mat,dim); }
inline void cuda_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim dim) { cudaD_apply_floor(Gr,Bl,mat,floor_val,dim); }
inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val, MatrixDim dim) { cudaD_apply_ceiling(Gr,Bl,mat,ceiling_val,dim); }
// Double-precision overload: forwards unchanged to cudaD_copy_cols.
// `reorder` is an index array of MatrixIndexT_cuda; only the source stride (not a full
// MatrixDim) is passed for src.
inline void cuda_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
cudaD_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
// Double-precision overload: forwards unchanged to cudaD_copy_rows (same parameter list as cuda_copy_cols).
inline void cuda_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
cudaD_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
// Double-precision overloads; each forwards its arguments unchanged to the same-named
// cudaD_* function. Wrappers taking `int Gr, int Bl` use scalar launch sizes, the others dim3.
inline void cuda_trace(int Gr, int Bl, double* mat, double* value, int dim) { cudaD_trace(Gr,Bl,mat,value,dim); }
inline void cuda_set_diag(int Gr, int Bl, double* mat, double value, MatrixDim d) { cudaD_set_diag(Gr,Bl,mat,value,d); }
inline void cuda_set_diag_packed(int Gr, int Bl, double* mat, double value, int dim) { cudaD_set_diag_packed(Gr,Bl,mat,value,dim); }
inline void cuda_add_diag_packed(int Gr, int Bl, double* mat, double value, int dim) { cudaD_add_diag_packed(Gr,Bl,mat,value,dim); }
inline void cuda_set_const(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_set_const(Gr,Bl,mat,value,d); }
inline void cuda_set_zero_above_diag(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { cudaD_set_zero_above_diag(Gr,Bl,mat,d); }
inline void cuda_add(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_add(Gr,Bl,mat,value,d); }
inline void cuda_add_vec2(dim3 Gr, dim3 Bl, double *mat, const double *vec, const double alpha, int dim) { cudaD_add_vec2(Gr,Bl,mat,vec,alpha,dim); }
inline void cuda_scale_diag(int Gr, int Bl, double* mat, double value, int dim) { cudaD_scale_diag(Gr,Bl,mat,value,dim); }
inline void cuda_scale(dim3 Gr, dim3 Bl, double *mat, double value, MatrixDim d) { cudaD_scale(Gr,Bl,mat,value,d); }
inline void cuda_apply_log(dim3 Gr, dim3 Bl, double *mat, MatrixDim d) { cudaD_apply_log(Gr,Bl,mat,d); }
// Double-precision overload: forwards unchanged to cudaD_mul_elements.
// mat is the in/out destination described by dst_d; A is read with its own stride src_stride.
inline void cuda_mul_elements(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride) {
cudaD_mul_elements(Gr,Bl,mat,A,dst_d,src_stride);
}
// Double-precision overload: forwards unchanged to cudaD_max (same parameter list as cuda_mul_elements).
inline void cuda_max(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride) {
cudaD_max(Gr,Bl,mat,A,dst_d,src_stride);
}
// Double-precision overloads; each forwards its arguments unchanged to the same-named cudaD_* function.
inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d) { cudaD_mul_cols_vec(Gr,Bl,mat,scale,d); }
inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d) { cudaD_mul_rows_vec(Gr,Bl,mat,scale,d); }
inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size) { cudaD_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size); }
inline void cuda_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size, double power) {cudaD_calc_pnorm_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size, power); }
inline void cuda_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *A, double beta, double *dst, MatrixDim d) { cudaD_add_mat(Gr,Bl,alpha,A,beta,dst,d); }
inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, double alpha, const double *col, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); }
inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, double alpha, const double *row, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_rows(Gr,Bl,alpha,row,beta,dst,d); }
// Takes a single non-const matrix pointer, i.e. works in place on mat.
inline void cuda_transpose_matrix(dim3 Gr, dim3 Bl, double *mat, MatrixDim d) { cudaD_transpose_matrix(Gr, Bl, mat, d); }
// Double-precision overload: forwards unchanged to cudaD_sy_add_tr2.
// T (const, dimensions tdim) is the input; S (dimensions sdim) is the output.
inline void cuda_sy_add_tr2(dim3 Gr, dim3 Bl, double alpha, double beta, const double* T, MatrixDim tdim,
double *S, MatrixDim sdim) {
cudaD_sy_add_tr2(Gr, Bl, alpha, beta, T, tdim, S, sdim);
}
/*
* CuVector
*/
// Double-precision vector wrappers. `template<>` lines are explicit specializations for
// double; plain `inline` lines are overloads. All forward to the same-named cudaD_* function.
template<> inline void cuda_add_row_sum_mat<double>(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d) { cudaD_add_row_sum_mat(Gr,Bl,mat,vec_sum,d); }
template<> inline void cuda_add_col_sum_mat<double>(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d) { cudaD_add_col_sum_mat(Gr,Bl,mat,vec_sum,d); }
template<> inline void cuda_invert_elements<double>(dim3 Gr, dim3 Bl, double *data, MatrixDim d) { cudaD_invert_elements(Gr,Bl,data,d); }
inline void cuda_replace_value(int Gr, int Bl, double *v, int dim, double orig, double changed) {cudaD_replace_value(Gr, Bl, v, dim, orig, changed); }
inline void cuda_div_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *vec_div, MatrixDim d) { cudaD_div_rows_vec(Gr,Bl,mat,vec_div,d); }
inline void cuda_set_bias_params(int Gr, int Bl, double* v, const double* a, double param_1, double param_2, double param_3, int* flag, int dim) { cudaD_set_bias_params(Gr,Bl,v,a,param_1,param_2,param_3,flag,dim); }
// NOTE(review): the _df variant takes a double source here; the name suggests a
// mixed-precision copy -- confirm against the cudaD_copy_from_vec_df declaration.
inline void cuda_copy_from_vec_df(int Gr, int Bl, double* v_out, const double* v_in, int dim) { cudaD_copy_from_vec_df(Gr,Bl,v_out,v_in,dim); }
// The _fd variant writes a float output from a double input.
inline void cuda_copy_from_vec_fd(int Gr, int Bl, float* v_out, const double* v_in, int dim) { cudaD_copy_from_vec_fd(Gr,Bl,v_out,v_in,dim); }
inline void cuda_vec_mul_elements(int Gr, int Bl, double* v, const double* a, int dim) { cudaD_vec_mul_elements(Gr,Bl,v,a,dim); }
inline void cuda_vec_soft_max(int Gr, int Bl, double* v, int dim) { cudaD_vec_soft_max(Gr,Bl,v,dim); }
// The next four wrappers take no Gr/Bl arguments: the caller does not supply launch sizes.
inline void cuda_vec_min(const double* v, double* value, int dim) { cudaD_vec_min(v,value,dim); }
inline void cuda_vec_max(const double* v, double* value, int dim) { cudaD_vec_max(v,value,dim); }
inline void cuda_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat_trans(A,B,dA,B_stride,value); }
inline void cuda_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat(A,B,dA,B_stride,value); }
inline void cuda_add_diag_mat_trans(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) { cudaD_add_diag_mat_trans(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
// Double-precision overload: forwards every argument unchanged to cudaD_add_diag_mat_mat.
// v (length v_dim) is the output vector; M and N are const inputs addressed through explicit
// row/column strides. threads_per_element is passed straight through to the kernel launcher.
inline void cuda_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M,
int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride,
int N_col_stride, int threads_per_element, double beta) {
cudaD_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride,
N_col_stride, threads_per_element, beta);
}
// Double-precision overloads; each forwards its arguments unchanged to the same-named cudaD_* function.
inline void cuda_add_diag_mat(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) { cudaD_add_diag_mat(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
inline void cuda_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, const double* y, double beta, int dim) { cudaD_add_vec_vec(Gr,Bl,alpha,v,x,y,beta,dim); }
inline void cuda_copy_col_from_mat(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat(Gr,Bl,v,col,mat,dmat,dim); }
inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat_df(Gr,Bl,v,col,mat,dmat,dim); }
// The _fd variant writes a float vector from a double matrix.
inline void cuda_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat_fd(Gr,Bl,v,col,mat,dmat,dim); }
inline void cuda_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc) { cudaD_vec_sum(Gr,Bl,v,value,dim,inc); }
inline void cuda_pvec_sum(int Gr, int Bl, double* vec, double* pvec_sum, int dim, int size) { cudaD_pvec_sum(Gr,Bl,vec,pvec_sum,dim,size); }
inline void cuda_vec_copy_diag_from_packed(int Gr, int Bl, double *dst, const double *src, int dim) { cudaD_vec_copy_diag_from_packed(Gr,Bl,dst,src,dim); }
// NOTE(review): `num` is float* even in this double overload -- confirm this matches the
// cudaD_vec_apply_floor declaration.
inline void cuda_vec_apply_floor(int Gr, int Bl, double* v, double floor_val, float* num, int dim) { cudaD_vec_apply_floor(Gr,Bl,v,floor_val,num,dim); }
inline void cuda_vec_apply_exp(int Gr, int Bl, double* v, int dim) { cudaD_vec_apply_exp(Gr,Bl,v,dim); }
inline void cuda_vec_apply_log(int Gr, int Bl, double* v, double* flag, int dim) { cudaD_vec_apply_log(Gr,Bl,v,flag,dim); }
inline void cuda_add_row_sum_mat(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d) { cudaD_add_row_sum_mat(Gr,Bl,mat,vec_sum,d); }
inline void cuda_add_col_sum_mat(dim3 Gr, dim3 Bl, const double *mat, double *vec_sum, MatrixDim d) { cudaD_add_col_sum_mat(Gr,Bl,mat,vec_sum,d); }
inline void cuda_invert_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim d) { cudaD_invert_elements(Gr,Bl,data,d); }
// Double-precision overload: forwards every argument unchanged to cudaD_add_mat_blockmat.
// The matrix A is described by a raw pointer plus explicit row/column counts and strides;
// B is passed as CuBlockMatrixData together with a block count.
// B_trans nonzero if B transposed.
inline void cuda_add_mat_blockmat(dim3 Gr, dim3 Bl, double *data, MatrixDim d, const double *Adata,
int A_num_rows, int A_num_cols, int A_row_stride, int A_col_stride,
const CuBlockMatrixData *B_cu_data, int B_num_blocks,
double alpha, double beta, int B_trans) {
cudaD_add_mat_blockmat(Gr, Bl, data, d, Adata, A_num_rows, A_num_cols, A_row_stride, A_col_stride,
B_cu_data, B_num_blocks, alpha, beta, B_trans);
}
// Double-precision overload: forwards every argument unchanged to cudaD_block_add_mat_mat.
// B_cu_data is non-const here; C and D are const inputs passed as raw pointers with explicit
// row/column strides.
inline void cuda_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int num_blocks,
const double *C_data, int C_num_cols, int C_row_stride, int C_col_stride,
const double *D_data, int D_row_stride, int D_col_stride,
double alpha, double beta) {
cudaD_block_add_mat_mat(Gr, Bl, B_cu_data, num_blocks, C_data, C_num_cols,
C_row_stride, C_col_stride, D_data, D_row_stride,
D_col_stride, alpha, beta);
}
/*
* cu::
*/
// Double-precision wrappers for the cu:: math functions. The `template<>` lines are explicit
// specializations for double; the plain `inline` lines are ordinary overloads, several of
// which take an additional src_stride argument. Every wrapper forwards to the cudaD_*
// function of the same name. The softmax wrappers take size_t (not dim3) launch sizes.
template<> inline void cuda_sigmoid<double>(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d) { cudaD_sigmoid(Gr,Bl,y,x,d); }
template<> inline void cuda_diff_sigmoid<double>(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d) { cudaD_diff_sigmoid(Gr,Bl,eout,e,y,d); }
template<> inline void cuda_tanh<double>(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d) { cudaD_tanh(Gr,Bl,y,x,d); }
template<> inline void cuda_diff_tanh<double>(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d) { cudaD_diff_tanh(Gr,Bl,eout,e,y,d); }
template<> inline void cuda_softmax<double>(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d) { cudaD_softmax(Gr,Bl,y,x,d); }
template<> inline void cuda_softmax_part<double>(dim3 Gr, dim3 Bl, const double *X, const int32_cuda *vec_ids, double* Y, MatrixDim d) { cudaD_softmax_part(Gr,Bl,X,vec_ids,Y,d); }
inline void cuda_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_soft_hinge(Gr,Bl,y,x,d,src_stride); }
inline void cuda_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power) { cudaD_group_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power); }
inline void cuda_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_sigmoid(Gr,Bl,y,x,d,src_stride); }
inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int src_stride) { cudaD_diff_sigmoid(Gr,Bl,eout,e,y,d,src_stride); }
inline void cuda_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_tanh(Gr,Bl,y,x,d,src_stride); }
// Unlike cuda_diff_sigmoid above, this overload takes no src_stride.
inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d) { cudaD_diff_tanh(Gr,Bl,eout,e,y,d); }
inline void cuda_softmax(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d) { cudaD_softmax(Gr,Bl,y,x,d); }
inline void cuda_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_softmax_reduce(Gr,Bl,y,x,d,src_stride); }
inline void cuda_softmax_part(dim3 Gr, dim3 Bl, const double *X, const int32_cuda *vec_ids, double* Y, MatrixDim d) { cudaD_softmax_part(Gr,Bl,X,vec_ids,Y,d); }
// Double-precision wrappers for L1 regularization, per-row maximum search and the
// cross-entropy derivative. `template<>` lines are explicit double specializations; plain
// `inline` lines are overloads. All forward to the same-named cudaD_* function.
template<> inline void cuda_regularize_l1<double>(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1, double lr, MatrixDim d) { cudaD_regularize_l1(Gr,Bl,wei,grad,l1,lr,d); }
template<> inline void cuda_find_row_max_id<double>(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaD_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); }
template<> inline void cuda_diff_xent<double>(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, double *mat_net_out, double *vec_log_post, MatrixDim d) { cudaD_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d); }
inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1, double lr, MatrixDim d) { cudaD_regularize_l1(Gr,Bl,wei,grad,l1,lr,d); }
inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaD_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); }
inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, double *mat_net_out, double *vec_log_post, MatrixDim d) {
cudaD_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d);
}
inline void cuda_copy_rows_from_vec(dim3 Gr, dim3 Bl, double *mat_out, MatrixDim d_out, const double *v_in) {
cudaD_copy_rows_from_vec(Gr, Bl, mat_out, d_out, v_in);
}
// Double-precision randomize/splice/copy and related helpers. `template<>` lines are explicit
// double specializations; plain `inline` lines are overloads. All forward to the same-named
// cudaD_* function.
template<> inline void cuda_randomize<double>(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaD_randomize(Gr,Bl,y,x,copy_from,d_out,d_in); }
template<> inline void cuda_splice<double>(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { cudaD_splice(Gr,Bl,y,x,off,d_out,d_in); }
template<> inline void cuda_copy<double>(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaD_copy(Gr,Bl,y,x,copy_from,d_out,d_in); }
inline void cuda_randomize(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaD_randomize(Gr,Bl,y,x,copy_from,d_out,d_in); }
inline void cuda_splice(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *off, MatrixDim d_out, MatrixDim d_in) { cudaD_splice(Gr,Bl,y,x,off,d_out,d_in); }
inline void cuda_one(int Gr,int Bl,double* x,int dim) { cudaD_one(Gr,Bl,x,dim); }
inline void cuda_copy(dim3 Gr, dim3 Bl, double *y, const double *x, const int32_cuda *copy_from, MatrixDim d_out, MatrixDim d_in) { cudaD_copy(Gr,Bl,y,x,copy_from,d_out,d_in); }
// In the next four wrappers x is the const input and y the output pointer.
inline void cuda_copy_from_sp(int Gr, int Bl, const double* x, double* y, int d_in, MatrixDim d_out) { cudaD_copy_from_sp(Gr,Bl,x,y,d_in,d_out); }
inline void cuda_take_lower(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in) { cudaD_take_lower(Gr,Bl,x,y,d_in); }
inline void cuda_take_upper(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in) { cudaD_take_upper(Gr,Bl,x,y,d_in); }
inline void cuda_take_mean(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in) { cudaD_take_mean(Gr,Bl,x,y,d_in); }
inline void cuda_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement<double>* x, int32 size, const double* z, MatrixDim d, double* z2, MatrixDim d2, double* t) {cudaD_comp_obj_deriv(Gr,Bl,x,size,z,d,z2,d2,t); }
// Double-precision overload: forwards unchanged to cudaD_sum_column_ranges.
// `indices` is an array of Int32Pair; no element count is passed to this wrapper.
inline void cuda_sum_column_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim,
const double *src_data, MatrixDim src_dim, const Int32Pair *indices) {
cudaD_sum_column_ranges(Gr, Bl, data, dim, src_data, src_dim, indices);
}
// Double-precision overload: forwards unchanged to cudaD_matrix_lookup.
// Here the number of Int32Pair entries is passed explicitly as indices_size.
inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl, const double *data,
MatrixDim dim, const Int32Pair *indices,
int indices_size, double *output) {
cudaD_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output);
}
} // namespace
// Also include some template-friendly wrappers of cublas functions:
// The float and double overloads share one name, so templated code can call cuda_axpy /
// cuda_scal without spelling out the precision-specific cuBLAS entry point.
// cuda_axpy: forwards to cublasSaxpy (float) or cublasDaxpy (double).
inline void cuda_axpy(int n, float alpha, const float *x, int incx, float *y, int incy) {
cublasSaxpy(n, alpha, x, incx, y, incy);
}
inline void cuda_axpy(int n, double alpha, const double *x, int incx, double *y, int incy) {
cublasDaxpy(n, alpha, x, incx, y, incy);
}
// cuda_scal: forwards to cublasSscal (float) or cublasDscal (double); x is modified in place.
inline void cuda_scal(int n, float alpha, float *x, int incx) {
cublasSscal(n, alpha, x, incx);
}
inline void cuda_scal(int n, double alpha, double *x, int incx) {
cublasDscal(n, alpha, x, incx);
}
} // namespace kaldi
#endif // HAVE_CUDA
#endif

Просмотреть файл

@ -0,0 +1,181 @@
// cudamatrix/cuda-math-test.cc
// Copyright 2013 Johns Hopkins University (Author: David Snyder)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <vector>
#include <cstdlib>
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "cudamatrix/cu-matrix-lib.h"
#include "cudamatrix/cu-math.h"
#include "cudamatrix/cu-array.h"
using namespace kaldi;
namespace kaldi {
/*
* Unit tests
*/
template<typename Real>
static void UnitTestCuMathRandomize() {
int32 M = 100 + rand() % 200, N = 100 + rand() % 200;
CuMatrix<Real> src(M, N);
CuMatrix<Real> tgt(M, N);
CuArray<int32> copy_from_idx;
src.SetRandn();
int32 n_rows = src.NumRows();
int32 n_columns = src.NumCols();
std::vector<int32> copy_from_idx_vec;
for (int32 i = 0; i < n_rows; i++) {
copy_from_idx_vec.push_back(rand() % n_rows);
}
copy_from_idx.CopyFromVec(copy_from_idx_vec);
cu::Randomize(src, copy_from_idx, &tgt);
for (int32 i = 0; i < n_rows; i++) {
for (int32 j = 0; j < n_columns; j++) {
Real src_val = src(copy_from_idx_vec.at(i), j);
Real tgt_val = tgt(i, j);
AssertEqual(src_val, tgt_val);
}
}
}
template<typename Real>
static void UnitTestCuMathCopy() {
int32 M = 100 + rand() % 200, N = 100 + rand() % 200;
CuMatrix<Real> src(M, N);
CuMatrix<Real> tgt(M, N);
CuArray<int32> copy_from_idx;
src.SetRandn();
int32 n_rows = src.NumRows();
int32 n_columns = src.NumCols();
std::vector<int32> copy_from_idx_vec;
for (int32 i = 0; i < n_columns; i++) {
copy_from_idx_vec.push_back(rand() % n_columns);
}
copy_from_idx.CopyFromVec(copy_from_idx_vec);
cu::Copy(src, copy_from_idx, &tgt);
for (int32 i = 0; i < n_rows; i++) {
for (int32 j = 0; j < n_columns; j++) {
Real src_val = src(i, copy_from_idx_vec.at(j));
Real tgt_val = tgt(i, j);
AssertEqual(src_val, tgt_val);
}
}
}
template<typename Real>
static void UnitTestCuMathSplice() {
int32 M = 100 + rand() % 200, N = 100 + rand() % 200;
CuMatrix<Real> src(M, N);
CuArray<int32> frame_offsets;
src.SetRandn();
int32 n_rows = src.NumRows();
int32 n_columns = src.NumCols();
std::vector<int32> frame_offsets_vec;
// The number of columns of tgt is rows(src)
// times n_frame_offsets, so we keep n_frame_offsets
// reasonably small (2 <= n <= 6).
int32 n_frame_offsets = rand() % 7 + 2;
for (int32 i = 0; i < n_frame_offsets; i++) {
frame_offsets_vec.push_back(rand() % 2 * n_columns - n_columns);
}
CuMatrix<Real> tgt(M, N * n_frame_offsets);
frame_offsets.CopyFromVec(frame_offsets_vec);
cu::Splice(src, frame_offsets, &tgt);
for (int32 i = 0; i < n_rows; i++) {
for (int32 k = 0; k < n_frame_offsets; k++) {
for (int32 j = 0; j < n_columns; j++) {
Real src_val;
if (i + frame_offsets_vec.at(k) >= n_rows) {
src_val = src(n_rows-1, j);
} else if (i + frame_offsets_vec.at(k) <= 0) {
src_val = src(0, j);
} else {
src_val = src(i + frame_offsets_vec.at(k), j);
}
Real tgt_val = tgt(i, k * n_columns + j);
AssertEqual(src_val, tgt_val);
}
}
}
}
// Runs all cu-math unit tests for the given floating-point type.
//
// The original code had a brace-less `if (DoublePrecisionSupported())` that
// guarded only the first test call: the float Randomize test was skipped on
// devices without double support, while the other two tests ran unguarded.
// The check now applies to the whole suite and only when Real is double.
template<typename Real> void CudaMathUnitTest() {
#if HAVE_CUDA == 1
  if (sizeof(Real) == sizeof(double) &&
      !CuDevice::Instantiate().DoublePrecisionSupported()) {
    return;  // double-precision tests cannot run on this device
  }
#endif
  UnitTestCuMathRandomize<Real>();
  UnitTestCuMathSplice<Real>();
  UnitTestCuMathCopy<Real>();
}
} // namespace kaldi
int main() {
for (int32 loop = 0; loop < 2; loop++) {
#if HAVE_CUDA == 1
if (loop == 0)
CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
else
CuDevice::Instantiate().SelectGpuId("yes"); // -2 .. automatic selection
#endif
srand(time(NULL));
kaldi::CudaMathUnitTest<float>();
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().DoublePrecisionSupported()) {
kaldi::CudaMathUnitTest<double>();
} else {
KALDI_WARN << "Double precision not supported";
}
#else
kaldi::CudaMathUnitTest<float>();
#endif
if (loop == 0)
KALDI_LOG << "Tests without GPU use succeeded.\n";
else
KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
}
#if HAVE_CUDA == 1
CuDevice::Instantiate().PrintProfile();
#endif
return 0;
}

Просмотреть файл

@ -36,15 +36,15 @@ namespace cu {
template<typename Real>
void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1, Real lr) {
KALDI_ASSERT(SameDim(*weight, *grad));
#if HAVE_CUDA==1
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
dim3 dimBlock(CUBLOCK, CUBLOCK);
dim3 dimGrid(n_blocks(weight->NumCols(), CUBLOCK), n_blocks(weight->NumRows(), CUBLOCK));
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
dim3 dimGrid(n_blocks(weight->NumCols(), CU2DBLOCK), n_blocks(weight->NumRows(), CU2DBLOCK));
cuda_regularize_l1(dimGrid, dimBlock, weight->data_, grad->data_, l1, lr, weight->Dim());
cuSafeCall(cudaGetLastError());
CU_SAFE_CALL(cudaGetLastError());
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
@ -77,21 +77,21 @@ void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1,
template<typename Real>
void Randomize(const CuMatrixBase<Real> &src,
const CuStlVector<int32> &copy_from_idx,
const CuArray<int32> &copy_from_idx,
CuMatrixBase<Real> *tgt) {
KALDI_ASSERT(src.NumCols() == tgt->NumCols());
KALDI_ASSERT(src.NumRows() == tgt->NumRows());
KALDI_ASSERT(copy_from_idx.Dim() <= tgt->NumRows());
#if HAVE_CUDA==1
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
/*
Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535
dim3 dimBlock(CUBLOCK, CUBLOCK);
dim3 dimGrid(n_blocks(tgt->NumCols(), CUBLOCK), n_blocks(copy_from_idx.Dim(), CUBLOCK));
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK));
*/
/*
@ -108,7 +108,7 @@ void Randomize(const CuMatrixBase<Real> &src,
MatrixDim dimtgt = tgt->Dim(); dimtgt.rows=copy_from_idx.Dim();
cuda_randomize(dimGrid, dimBlock, tgt->data_, src.data_, copy_from_idx.Data(), dimtgt, dimsrc);
cuSafeCall(cudaGetLastError());
CU_SAFE_CALL(cudaGetLastError());
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
@ -116,7 +116,7 @@ void Randomize(const CuMatrixBase<Real> &src,
{
// randomize in CPU
const MatrixBase<Real> &srcmat = src.Mat();
const std::vector<int32> &copy_from_idxvec = copy_from_idx.Vec();
const int32 *copy_from_idxvec = copy_from_idx.Data();
MatrixBase<Real> &tgtmat = tgt->Mat();
for(int32 i=0; i<copy_from_idx.Dim(); i++) {
tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i]));
@ -127,20 +127,20 @@ void Randomize(const CuMatrixBase<Real> &src,
template<typename Real>
void Splice(const CuMatrix<Real> &src, const CuStlVector<int32> &frame_offsets, CuMatrix<Real> *tgt) {
void Splice(const CuMatrix<Real> &src, const CuArray<int32> &frame_offsets, CuMatrix<Real> *tgt) {
KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols());
KALDI_ASSERT(src.NumRows() == tgt->NumRows());
#if HAVE_CUDA==1
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
dim3 dimBlock(CUBLOCK, CUBLOCK);
dim3 dimGrid(n_blocks(tgt->NumCols(), CUBLOCK), n_blocks(tgt->NumRows(), CUBLOCK));
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
cuda_splice(dimGrid, dimBlock, tgt->data_, src.data_, frame_offsets.Data(), tgt->Dim(), src.Dim());
cuSafeCall(cudaGetLastError());
CU_SAFE_CALL(cudaGetLastError());
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
@ -148,11 +148,12 @@ void Splice(const CuMatrix<Real> &src, const CuStlVector<int32> &frame_offsets,
{
// expand in CPU
const MatrixBase<Real> &srcmat = src.Mat();
const std::vector<int32> &frame_offsetvec = frame_offsets.Vec();
const int32 *frame_offsetvec = frame_offsets.Data();
int32 dim = frame_offsets.Dim();
MatrixBase<Real> &tgtmat = tgt->Mat();
//
for(int32 r=0; r < tgtmat.NumRows(); r++) {
for(int32 off=0; off < static_cast<int32>(frame_offsetvec.size()); off++) {
for(int32 off=0; off < dim; off++) {
int32 r_off = r + frame_offsetvec[off];
if(r_off < 0) r_off = 0;
if(r_off >= srcmat.NumRows()) r_off = srcmat.NumRows()-1;
@ -165,20 +166,20 @@ void Splice(const CuMatrix<Real> &src, const CuStlVector<int32> &frame_offsets,
template<typename Real>
void Copy(const CuMatrix<Real> &src, const CuStlVector<int32> &copy_from_indices, CuMatrix<Real> *tgt) {
void Copy(const CuMatrix<Real> &src, const CuArray<int32> &copy_from_indices, CuMatrix<Real> *tgt) {
KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols());
KALDI_ASSERT(src.NumRows() == tgt->NumRows());
#if HAVE_CUDA==1
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
dim3 dimBlock(CUBLOCK, CUBLOCK);
dim3 dimGrid(n_blocks(tgt->NumCols(), CUBLOCK), n_blocks(tgt->NumRows(), CUBLOCK));
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
cuda_copy(dimGrid, dimBlock, tgt->data_, src.data_, copy_from_indices.Data(), tgt->Dim(), src.Dim());
cuSafeCall(cudaGetLastError());
CU_SAFE_CALL(cudaGetLastError());
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
@ -186,11 +187,12 @@ void Copy(const CuMatrix<Real> &src, const CuStlVector<int32> &copy_from_indices
{
// expand in CPU
const MatrixBase<Real> &srcmat = src.Mat();
const std::vector<int32> &copy_from_indicesvec = copy_from_indices.Vec();
const int32 *copy_from_indicesvec = copy_from_indices.Data();
int32 dim = copy_from_indices.Dim();
MatrixBase<Real> &tgtmat = tgt->Mat();
//
for(int32 r=0; r < tgtmat.NumRows(); r++) {
for(int32 c=0; c < static_cast<int32>(copy_from_indicesvec.size()); c++) {
for(int32 r = 0; r < tgtmat.NumRows(); r++) {
for(int32 c = 0; c < dim; c++) {
tgtmat(r,c) = srcmat(r,copy_from_indicesvec[c]);
}
}
@ -204,21 +206,21 @@ template
void RegularizeL1(CuMatrixBase<double> *weight, CuMatrixBase<double> *grad, double l1, double lr);
template
void Splice(const CuMatrix<float> &src, const CuStlVector<int32> &frame_offsets, CuMatrix<float> *tgt);
void Splice(const CuMatrix<float> &src, const CuArray<int32> &frame_offsets, CuMatrix<float> *tgt);
template
void Splice(const CuMatrix<double> &src, const CuStlVector<int32> &frame_offsets, CuMatrix<double> *tgt);
void Splice(const CuMatrix<double> &src, const CuArray<int32> &frame_offsets, CuMatrix<double> *tgt);
template
void Copy(const CuMatrix<float> &src, const CuStlVector<int32> &copy_from_indices, CuMatrix<float> *tgt);
void Copy(const CuMatrix<float> &src, const CuArray<int32> &copy_from_indices, CuMatrix<float> *tgt);
template
void Copy(const CuMatrix<double> &src, const CuStlVector<int32> &copy_from_indices, CuMatrix<double> *tgt);
void Copy(const CuMatrix<double> &src, const CuArray<int32> &copy_from_indices, CuMatrix<double> *tgt);
template
void Randomize(const CuMatrixBase<float> &src,
const CuStlVector<int32> &copy_from_idx,
const CuArray<int32> &copy_from_idx,
CuMatrixBase<float> *tgt);
template
void Randomize(const CuMatrixBase<double> &src,
const CuStlVector<int32> &copy_from_idx,
const CuArray<int32> &copy_from_idx,
CuMatrixBase<double> *tgt);

Просмотреть файл

@ -1,6 +1,7 @@
// cudamatrix/cu-math.h
// Copyright 2009-2012 Karel Vesely
// 2013 Johns Hopkins University (Author: David Snyder)
// See ../../COPYING for clarification regarding multiple authors
//
@ -22,7 +23,7 @@
#ifndef KALDI_CUDAMATRIX_CU_MATH_H_
#define KALDI_CUDAMATRIX_CU_MATH_H_
#include "cudamatrix/cu-common.h"
#include "cudamatrix/cu-stlvector.h"
#include "cudamatrix/cu-array.h"
#include "cudamatrix/cu-device.h"
#include "util/timer.h"
@ -38,21 +39,38 @@ template<typename Real>
void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *gradient,
Real l1_penalty, Real learning_rate);
/// ie. switch rows according to copy_from_idx
/// Copies a row permutation of src into tgt. The permutation is specified in
/// copy_from_idx such that tgt.Row(r) == src.Row(copy_from_idx[r]) for each r
/// in [0, copy_from_idx.Dim()-1]. src and tgt must have the same dimensions,
/// copy_from_idx.Dim() must not exceed tgt.NumRows(), and all elements of
/// copy_from_idx must be in [0, src.NumRows()-1].
template<typename Real>
void Randomize(const CuMatrixBase<Real> &src,
const CuStlVector<int32> &copy_from_idx,
const CuArray<int32> &copy_from_idx,
CuMatrixBase<Real> *tgt);
/// ie. concatenate the frames with offsets from frame_offsets
/// Splice concatenates frames of src as specified in frame_offsets into tgt.
/// tgt must have the same number of rows as src, and it must be that
/// tgt.NumCols() == src.NumCols() * frame_offsets.Dim().
/// As a result, tgt(i, k*n_cols + j) == src(i + frame_offsets[k], j) for the
/// general case where i in [0..src.NumRows()-1],
/// k in [0..frame_offsets.Dim()-1], j in [0..src.NumCols()-1]
/// and n_cols = src.NumCols(). If i + frame_offsets[k] is greater than or
/// equal to the number of rows in src, or less than 0, then the right side of
/// the equation is replaced by src(src.NumRows()-1, j) or src(0, j)
/// respectively, to avoid an index out of bounds.
template<typename Real>
void Splice(const CuMatrix<Real> &src,
const CuStlVector<int32> &frame_offsets,
const CuArray<int32> &frame_offsets,
CuMatrix<Real> *tgt);
/// Copies elements from src into tgt as given by copy_from_indices.
/// The matrices src and tgt must have the same number of rows, and
/// the dimension of copy_from_indices must equal the number of columns
/// in the tgt matrix. As a result, tgt(i, j) == src(i, copy_from_indices[j]).
/// Also see CuMatrixBase::CopyCols(), which is more general.
template<typename Real>
void Copy(const CuMatrix<Real> &src,
const CuStlVector<int32> &copy_from_indices,
const CuArray<int32> &copy_from_indices,
CuMatrix<Real> *tgt);

Просмотреть файл

@ -24,7 +24,7 @@
namespace kaldi {
template<class Real>
template<typename Real>
inline CuSubMatrix<Real>::CuSubMatrix(const CuMatrixBase<Real> &mat,
const MatrixIndexT row_offset,
const MatrixIndexT num_rows,

Просмотреть файл

@ -1,31 +1,32 @@
// matrix/packed-matrix-inl.h
// cudamatrix/cu-matrix-lib.h
// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; Lukas Burget;
// Saarland University; Yanmin Qian; Jan Silovsky;
// Haihua Xu
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_MATRIX_PACKED_MATRIX_INL_H_
#define KALDI_MATRIX_PACKED_MATRIX_INL_H_
namespace kaldi {
} // namespace kaldi
#ifndef KALDI_CUDAMATRIX_CU_MATRIX_LIB_H_
#define KALDI_CUDAMATRIX_CU_MATRIX_LIB_H_
#include "cudamatrix/cu-vector.h"
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-sp-matrix.h"
#include "cudamatrix/cu-tp-matrix.h"
#include "cudamatrix/cu-block-matrix.h"
#include "cudamatrix/cu-rand.h"
#endif

Просмотреть файл

@ -0,0 +1,196 @@
// cudamatrix/cu-matrix-speed-test.cc
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <vector>
#include <cstdlib>
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-vector.h"
#include "cudamatrix/cu-math.h"
using namespace kaldi;
namespace kaldi {
// Returns a printable tag for the floating-point type Real, used to label the
// benchmark log lines: "<double>" when Real is 8 bytes wide, "<float>" otherwise.
template<typename Real>
std::string NameOf() {
  if (sizeof(Real) == 8) {
    return "<double>";
  }
  return "<float>";
}
// Benchmarks CuMatrix::AddMatMat on square dim x dim matrices filled with
// random values.  The product is recomputed until about 0.05 seconds have
// elapsed, and the rate is logged counting dim^3 operations per product.
template<typename Real> void TestCuMatrixMatMat(int32 dim) {
  const BaseFloat time_budget_secs = 0.05;
  CuMatrix<Real> M(dim, dim);
  CuMatrix<Real> N(dim, dim);
  CuMatrix<Real> O(dim, dim);
  M.SetRandn();
  N.SetRandn();

  Timer tim;
  int32 num_iters = 0;
  while (tim.Elapsed() < time_budget_secs) {
    // beta == 0.0, so O is overwritten with M * N on every pass.
    O.AddMatMat(1.0, M, kNoTrans, N, kNoTrans, 0.0);
    num_iters++;
  }

  const BaseFloat fdim = dim;
  const BaseFloat total_ops = fdim * fdim * fdim * num_iters;
  BaseFloat gflops = total_ops / (tim.Elapsed() * 1.0e+09);
  KALDI_LOG << "For CuMatrix::AddMatMat" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
// Benchmarks CuMatrix::Sigmoid on a square dim x dim matrix of random values.
// The call is repeated until about 0.05 seconds have elapsed, and the rate is
// logged counting one operation per matrix element (dim^2 per call).
template<typename Real> void TestCuMatrixSigmoid(int32 dim) {
  const BaseFloat time_budget_secs = 0.05;
  CuMatrix<Real> M(dim, dim);
  CuMatrix<Real> N(dim, dim);
  M.SetRandn();
  N.SetRandn();

  Timer tim;
  int32 num_iters = 0;
  while (tim.Elapsed() < time_budget_secs) {
    N.Sigmoid(M);  // N receives the element-wise sigmoid of M.
    num_iters++;
  }

  const BaseFloat fdim = dim;
  const BaseFloat total_ops = fdim * fdim * num_iters;
  BaseFloat gflops = total_ops / (tim.Elapsed() * 1.0e+09);
  KALDI_LOG << "For CuMatrix::Sigmoid" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
// Benchmarks CuMatrix::ApplySoftMaxPerRow on a matrix with a fixed 256 rows and
// "dim" columns, filled with random values.  The call is repeated until about
// 0.05 seconds have elapsed and the rate is logged, counting one operation per
// matrix element.
template<typename Real> void TestCuMatrixSoftmax(int32 dim) {
  BaseFloat time_in_secs = 0.05;
  const int32 num_rows = 256;  // fixed row count; only the row length varies.
  CuMatrix<Real> M(num_rows, dim), N(num_rows, dim);
  M.SetRandn();
  N.SetRandn();
  Timer tim;
  int32 iter = 0;
  for (; tim.Elapsed() < time_in_secs; iter++) {
    N.ApplySoftMaxPerRow(M);
  }
  // The matrix is num_rows x dim, not dim x dim, so the per-call element count
  // is num_rows * dim; using dim * dim would misreport the rate by dim / 256.
  BaseFloat num_elements = static_cast<BaseFloat>(num_rows) * dim;
  BaseFloat gflops = (num_elements * iter) / (tim.Elapsed() * 1.0e+09);
  KALDI_LOG << "For CuMatrix::Softmax" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
// Benchmarks TraceMatMat(M, N, trans) on square dim x dim random matrices,
// first with trans == kNoTrans and then with trans == kTrans.  Each variant is
// repeated until about 0.08 seconds have elapsed, and its rate is logged
// counting dim^2 operations per call.
template<typename Real> void TestCuMatrixTraceMatMat(int32 dim) {
  const MatrixTransposeType trans_modes[2] = { kNoTrans, kTrans };
  for (int32 mode = 0; mode < 2; mode++) {
    const MatrixTransposeType trans = trans_modes[mode];
    const BaseFloat time_budget_secs = 0.08;
    CuMatrix<Real> M(dim, dim);
    CuMatrix<Real> N(dim, dim);
    M.SetRandn();
    N.SetRandn();

    Timer tim;
    int32 num_iters = 0;
    while (tim.Elapsed() < time_budget_secs) {
      TraceMatMat(M, N, trans);  // result intentionally unused; only timing matters.
      num_iters++;
    }

    const BaseFloat fdim = dim;
    const BaseFloat total_ops = fdim * fdim * num_iters;
    BaseFloat gflops = total_ops / (tim.Elapsed() * 1.0e+09);
    KALDI_LOG << "For CuMatrix::TraceMatMat" << NameOf<Real>()
              << (trans == kTrans ? " [transposed]" : "") << ", for dim = "
              << dim << ", speed was " << gflops << " gigaflops.";
  }
}
// Benchmarks CuMatrix::CopyLowerToUpper on a square dim x dim random matrix.
// The call is repeated until about 0.05 seconds have elapsed; afterwards the
// result is checked to be symmetric (equal to its own transpose) and the rate
// is logged, counting dim^2 operations per call.
template<typename Real> void TestCuMatrixCopyLowerToUpper(int32 dim) {
  BaseFloat time_in_secs = 0.05;
  CuMatrix<Real> M(dim, dim);
  M.SetRandn();
  Timer tim;
  int32 iter = 0;
  for (; tim.Elapsed() < time_in_secs; iter++) {
    M.CopyLowerToUpper();
  }
  // Read the clock before the correctness check, so that the transposed copy
  // and the comparison below are not charged to the benchmarked operation.
  const double elapsed_secs = tim.Elapsed();
  CuMatrix<Real> M2(M, kTrans);
  AssertEqual(M, M2);
  BaseFloat fdim = dim;
  BaseFloat gflops = (fdim * fdim * iter) / (elapsed_secs * 1.0e+09);
  KALDI_LOG << "For CuMatrix::CopyLowerToUpper" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
// Benchmarks CuMatrix::CopyUpperToLower on a square dim x dim random matrix.
// The call is repeated until about 0.05 seconds have elapsed; afterwards the
// result is checked to be symmetric (equal to its own transpose) and the rate
// is logged, counting dim^2 operations per call.
template<typename Real> void TestCuMatrixCopyUpperToLower(int32 dim) {
  BaseFloat time_in_secs = 0.05;
  CuMatrix<Real> M(dim, dim);
  M.SetRandn();
  Timer tim;
  int32 iter = 0;
  for (; tim.Elapsed() < time_in_secs; iter++) {
    M.CopyUpperToLower();
  }
  // Read the clock before the correctness check, so that the transposed copy
  // and the comparison below are not charged to the benchmarked operation.
  const double elapsed_secs = tim.Elapsed();
  CuMatrix<Real> M2(M, kTrans);
  AssertEqual(M, M2);
  BaseFloat fdim = dim;
  BaseFloat gflops = (fdim * fdim * iter) / (elapsed_secs * 1.0e+09);
  KALDI_LOG << "For CuMatrix::CopyUpperToLower" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
template<typename Real> void CudaMatrixSpeedTest() {
std::vector<int32> sizes;
sizes.push_back(16);
sizes.push_back(128);
sizes.push_back(256);
sizes.push_back(1024);
int32 ns = sizes.size();
for (int32 s = 0; s < ns; s++)
TestCuMatrixMatMat<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++)
TestCuMatrixSigmoid<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++)
TestCuMatrixSoftmax<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++)
TestCuMatrixTraceMatMat<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++)
TestCuMatrixCopyLowerToUpper<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++)
TestCuMatrixCopyUpperToLower<Real>(sizes[s]);
}
} // namespace kaldi
// Entry point: runs the speed tests in single precision, then in double
// precision (skipped with a warning when CUDA is compiled in and the device
// reports no double-precision support), and prints the profile when CUDA is
// compiled in.
int main() {
#if HAVE_CUDA == 1
  // Select the GPU.  NOTE(review): "yes" is presumably the yes|no|optional
  // GPU-usage policy string -- confirm against CuDevice::SelectGpuId.
  CuDevice::Instantiate().SelectGpuId("yes");
#endif

  kaldi::CudaMatrixSpeedTest<float>();

#if HAVE_CUDA == 1
  if (!CuDevice::Instantiate().DoublePrecisionSupported()) {
    KALDI_WARN << "Double precision not supported";
  } else {
    kaldi::CudaMatrixSpeedTest<double>();
  }
  CuDevice::Instantiate().PrintProfile();
#else
  kaldi::CudaMatrixSpeedTest<double>();
#endif

  std::cout << "Tests succeeded.\n";
  return 0;
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,7 +1,10 @@
// cudamatrix/cu-matrix.h
// Copyright 2009-2012 Karel Vesely
// Johns Hopkins University (author: Daniel Povey)
// 2013 Johns Hopkins University (author: Daniel Povey)
// 2013 Hainan Xu
// 2013 Xiaohui Zhang
// 2013 Johns Hopkins University (author: Guoguo Chen)
// See ../../COPYING for clarification regarding multiple authors
//
@ -27,14 +30,18 @@
#include "cudamatrix/cu-matrixdim.h"
#include "cudamatrix/cu-common.h"
#include "cudamatrix/cu-value.h"
#include "matrix/matrix-common.h"
#include "matrix/kaldi-matrix.h"
#include "cudamatrix/cu-stlvector.h"
#include "cudamatrix/cu-array.h"
#include "cudamatrix/cu-math.h"
#include "cudamatrix/cu-rand.h"
namespace kaldi {
template<typename Real>
Real TraceMatMat(const CuMatrixBase<Real> &A, const CuMatrixBase<Real> &B,
MatrixTransposeType trans = kNoTrans);
/**
* Matrix for CUDA computing.
* Does the computation on the CUDA card when CUDA is compiled in and
@ -42,23 +49,77 @@ namespace kaldi {
* otherwise, does it on the CPU.
*/
/*
template<typename Real>
struct MatrixElement {
int row;
int column;
Real weight;
};
// */
template<typename Real>
class CuMatrixBase {
public:
friend class CuMatrixBase<float>;
friend class CuMatrixBase<double>;
friend class CuVectorBase<float>;
friend class CuVectorBase<double>;
friend class VectorBase<Real>;
friend class CuSpMatrix<Real>;
friend class CuTpMatrix<float>;
friend class CuTpMatrix<double>;
friend class CuVectorBase<Real>;
friend class CuSubMatrix<Real>;
friend class CuRand<Real>;
friend class CuSubVector<Real>;
friend class CuBlockMatrix<Real>;
friend void cu::RegularizeL1<Real>(CuMatrixBase<Real> *weight,
CuMatrixBase<Real> *grad, Real l1, Real lr);
friend void cu::Splice<Real>(const CuMatrix<Real> &src,
const CuStlVector<int32> &frame_offsets,
const CuArray<int32> &frame_offsets,
CuMatrix<Real> *tgt);
friend void cu::Copy<Real>(const CuMatrix<Real> &src,
const CuStlVector<int32> &copy_from_indices,
const CuArray<int32> &copy_from_indices,
CuMatrix<Real> *tgt);
friend void cu::Randomize<Real>(const CuMatrixBase<Real> &src,
const CuStlVector<int32> &copy_from_idx,
const CuArray<int32> &copy_from_idx,
CuMatrixBase<Real> *tgt);
/// Copies column r from column indices[r] of src.
/// As a special case, if indices[r] == -1, sets column r to zero.
/// indices.size() must equal this->NumCols(),
/// all elements of "indices" must be in [-1, src.NumCols()-1],
/// and src.NumRows() must equal this->NumRows().
void CopyCols(const CuMatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices);
/// Version of CopyCols that takes CuArray argument.
void CopyCols(const CuMatrixBase<Real> &src,
const CuArray<MatrixIndexT> &indices);
/// Copies row r from row indices[r] of src.
/// As a special case, if indices[r] == -1, sets row r to zero.
/// indices.size() must equal this->NumRows(),
/// all elements of "indices" must be in [-1, src.NumRows()-1],
/// and src.NumCols() must equal this->NumCols().
void CopyRows(const CuMatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices);
/// For each row r of this and for each column c, sets (*this)(r, c) to the
/// sum \sum_j src(r, j), where j ranges from indices[c].first through
/// indices[c].second - 1.
void SumColumnRanges(const CuMatrixBase<Real> &src,
const CuArray<Int32Pair> &indices);
friend Real TraceMatMat<Real>(const CuMatrixBase<Real> &A,
const CuMatrixBase<Real> &B,
MatrixTransposeType trans);
void AddToDiag(Real value);
/// Dimensions
MatrixIndexT NumRows() const { return num_rows_; }
@ -72,26 +133,66 @@ class CuMatrixBase {
return d;
}
Real FrobeniusNorm() const { return sqrt(TraceMatMat(*this, *this, kTrans)); }
bool IsUnit(Real tol = 0.001) const;
bool ApproxEqual(const CuMatrixBase<Real> &other, float tol = 0.01) const;
/// Get size of matrix in bytes
MatrixIndexT SizeInBytes() const { return num_rows_*stride_*sizeof(Real); }
/// Get size of matrix row in bytes
MatrixIndexT RowSizeInBytes() const { return num_cols_*sizeof(Real); }
/// Get size of matrix stride in bytes
MatrixIndexT StrideSizeInBytes() const { return stride_*sizeof(Real); }
// Copy functions. These do not resize.
template<typename OtherReal>
void CopyFromMat(const MatrixBase<OtherReal> &src,
MatrixTransposeType trans = kNoTrans);
void CopyFromMat(const MatrixBase<Real> &src,
MatrixTransposeType trans = kNoTrans);
/// Copy functions (reallocates when needed, but note from Dan: eventually
/// I'll change it to just die if the sizes don't match, like the Matrix class.)
void CopyFromMat(const CuMatrixBase<Real> &src);
void CopyFromMat(const MatrixBase<Real> &src);
void CopyToMat(MatrixBase<Real> *dst) const;
void CopyFromSp(const CuSpMatrix<Real> &M);
template<typename OtherReal>
void CopyFromTp(const CuTpMatrix<OtherReal> &M,
MatrixTransposeType trans = kNoTrans);
template<typename OtherReal>
void CopyFromMat(const CuMatrixBase<OtherReal> &M,
MatrixTransposeType trans = kNoTrans);
template<typename OtherReal>
void CopyToMat(MatrixBase<OtherReal> *dst,
MatrixTransposeType trans = kNoTrans) const;
void CopyRowsFromVec(const CuVectorBase<Real> &v);
void CopyRowsFromVec(const VectorBase<Real> &v);
/// Copy vector into specific column of matrix.
void CopyColFromVec(const CuVectorBase<Real> &v, const MatrixIndexT col);
/// Set each element to the sigmoid of the corresponding element of "src":
/// element by element, *this = 1 / (1 + exp(-src)).
/// element by element, x = 1 / (1 + exp(-x))
void Sigmoid(const CuMatrixBase<Real> &src);
/// Apply the function y = log(1 + exp(x)), to each element.
/// Note: the derivative of this function is the sigmoid function.
/// This is like a soft ReLU.
void SoftHinge(const CuMatrixBase<Real> &src);
/// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j ^ pow) ^ (1 / pow),
/// where G = x.NumCols() / y.NumCols() must be an integer.
void GroupPnorm(const CuMatrixBase<Real> &src, Real pow);
/// Calculate derivatives for the GroupPnorm function above...
/// if "input" is the input to the GroupPnorm function above (i.e. the "src" variable),
/// and "output" is the result of the computation (i.e. the "this" of that function
/// call), and *this has the same dimension as "input", then it sets each element
/// of *this to the derivative d(output-elem)/d(input-elem) for each element of "input", where
/// "output-elem" is whichever element of output depends on that input element.
void GroupPnormDeriv(const CuMatrixBase<Real> &input,
const CuMatrixBase<Real> &output, Real power);
/// Compute the hyperbolic tangent (tanh) function; element by element,
/// *this = tanh(src).
void Tanh(const CuMatrixBase<Real> &src);
@ -105,7 +206,7 @@ class CuMatrixBase {
/// tanh output. Does, element-by-element, *this = diff * (1 - value^2).
void DiffTanh(const CuMatrixBase<Real> &value,
const CuMatrixBase<Real> &diff);
/// Differentiate the block [softmax+cross-entropy] :
/// dE/da = posterior_mat - target_mat,
/// 'E' is error function, 'a' is activation on softmax input
@ -115,16 +216,30 @@ class CuMatrixBase {
/// net_out_or_diff ... before invocation net output, after diff dE/da
/// log_post_tgt ... per-frame statistics for cross-entropy computations :
/// log(sum_row(posterior_mat .* target_mat))
void DiffXent(const CuStlVector<int32> &tgt,
void DiffXent(const CuArray<int32> &tgt,
CuVector<Real> *log_post_tgt);
/// This method may be only called for symmetric matrices (it accesses the
/// upper as well as lower triangle). The result is put in the lower
/// triangle, and the upper triangle zeroed.
void Cholesky();
void SymInvertPosDef(); ///< Inversion for positive definite symmetric matrices.
///< Requires that the input is symmetric (we do not check this).
///< The output is symmetric.
void ApplyPow(Real power);
void ApplyHeaviside(); ///< For each element, sets x = (x > 0 ? 1.0 : 0.0)
void ApplyFloor(Real floor_val);
void ApplyCeiling(Real ceiling_val);
void ApplyExp();
/// Softmax nonlinearity
/// Y = Softmax(X) : Yij = e^Xij / sum_k(e^Xik)
/// Y = Softmax(X) : Yij = e^Xij / sum_k(e^Xik), done to each row
/// for each row, the max value is first subtracted for good numerical stability
void Softmax(const CuMatrixBase<Real> &src);
void ApplySoftMaxPerRow(const CuMatrixBase<Real> &src);
/// Find the id of the maximal element for each row
void FindRowMaxId(CuStlVector<int32> *id) const;
void FindRowMaxId(CuArray<int32> *id) const;
/*
// Copy row interval from matrix
@ -139,27 +254,90 @@ class CuMatrixBase {
void SetZero();
void Set(Real value);
void Add(Real value);
void SetZeroUpperDiag();
void Scale(Real value);
void ApplyLog();
/// Multiply two matrices elementhwise: C = A .* C
void MulElements(const CuMatrixBase<Real>& A);
/// Multiply two matrices elementwise: C = A .* C
void MulElements(const CuMatrixBase<Real> &A);
/// Do, elementwise, *this = max(*this, A).
void Max(const CuMatrixBase<Real> &A);
/// scale i'th column by scale[i]
void MulColsVec(const CuVectorBase<Real> &scale);
/// scale i'th row by scale[i]
void MulRowsVec(const CuVectorBase<Real> &scale);
void MulRowsVec(const CuVectorBase<Real> &scale);
/// divide each row into src.NumCols() groups, and then scale i'th row's jth group of elements by src[i, j].
void MulRowsGroupMat(const CuMatrixBase<Real> &src);
/// divide i'th row by scale[i]
void DivRowsVec(const CuVectorBase<Real> &div);
/// B = alpha * A + beta * B
void AddMat(Real alpha, const CuMatrixBase<Real>& A, Real beta=1.0);
void AddMat(Real alpha, const CuMatrixBase<Real> &A, Real beta=1.0);
/// B = alpha * col + beta * B
void AddVecToCols(Real alpha, const CuVectorBase<Real> &col, Real beta=1.0);
void AddVecToCols(Real alpha, const CuVectorBase<Real> &col, Real beta = 1.0);
/// B = alpha * row + beta * B
void AddVecToRows(Real alpha, const CuVectorBase<Real> &row, Real beta=1.0);
void AddVecToRows(Real alpha, const CuVectorBase<Real> &row, Real beta = 1.0);
/// C = alpha * A(^T)*B(^T) + beta * C
void AddMatMat(Real alpha, const CuMatrixBase<Real>& A, MatrixTransposeType transA,
const CuMatrixBase<Real>& B, MatrixTransposeType transB, Real beta);
void AddMatMat(Real alpha, const CuMatrixBase<Real> &A, MatrixTransposeType transA,
const CuMatrixBase<Real> &B, MatrixTransposeType transB, Real beta);
/// *this = beta * *this + alpha * M M^T, for symmetric matrices. It only
/// updates the lower triangle of *this. It will leave the matrix asymmetric;
/// if you need it symmetric as a regular matrix, do CopyLowerToUpper().
void SymAddMat2(const Real alpha, const CuMatrixBase<Real> &M,
MatrixTransposeType transA, Real beta);
/// This function is like AddMatMat but for where the second argument is of
/// type CuBlockMatrix (a block-diagonal matrix of blocks).
void AddMatBlock(Real alpha, const CuMatrixBase<Real> &A, MatrixTransposeType transA,
const CuBlockMatrix<Real> &B, MatrixTransposeType transB, Real beta);
/// *this = beta * *this + alpha * diag(v) * M [or M^T].
/// The same as adding M but scaling each row M_i by v(i).
void AddDiagVecMat(const Real alpha, CuVectorBase<Real> &v,
const CuMatrixBase<Real> &M, MatrixTransposeType transM,
Real beta = 1.0);
/// this <-- beta*this + alpha*A*B
void AddMatSp(const Real alpha,
const CuMatrixBase<Real> &A, MatrixTransposeType transA,
const CuSpMatrix<Real> &B,
const Real beta) {
CuMatrix<Real> M(B);
return AddMatMat(alpha, A, transA, M, kNoTrans, beta);
}
/// this <-- beta*this + alpha*SpA*B
void AddSpMat(const Real alpha,
const CuSpMatrix<Real> &A,
const CuMatrixBase<Real> &B, MatrixTransposeType transB,
const Real beta) {
CuMatrix<Real> M(A);
return AddMatMat(alpha, M, kNoTrans, B, transB, beta);
}
/// this <-- beta*this + alpha*A*B.
void AddTpMat(const Real alpha,
const CuTpMatrix<Real> &A, MatrixTransposeType transA,
const CuMatrixBase<Real> &B, MatrixTransposeType transB,
const Real beta) {
CuMatrix<Real> M(A);
return AddMatMat(alpha, M, transA, B, transB, beta);
}
/// this <-- beta*this + alpha*A*B.
void AddMatTp(const Real alpha,
const CuMatrixBase<Real> &A, MatrixTransposeType transA,
const CuTpMatrix<Real> &B, MatrixTransposeType transB,
const Real beta) {
CuMatrix<Real> M(B);
return AddMatMat(alpha, A, transA, M, transB, beta);
}
void CopyFromBlock(const CuBlockMatrix<Real> &B,
MatrixTransposeType trans = kNoTrans);
void CopyLowerToUpper();
void CopyUpperToLower();
inline CuSubMatrix<Real> Range(const MatrixIndexT row_offset,
const MatrixIndexT num_rows,
const MatrixIndexT col_offset,
@ -177,11 +355,67 @@ class CuMatrixBase {
return CuSubMatrix<Real>(*this, 0, num_rows_, col_offset, num_cols);
}
inline const CuSubVector<Real> Row(MatrixIndexT i) const {
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
static_cast<UnsignedMatrixIndexT>(num_rows_));
return CuSubVector<Real>(data_ + (i * stride_), NumCols());
}
inline CuSubVector<Real> Row(MatrixIndexT i) {
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
static_cast<UnsignedMatrixIndexT>(num_rows_));
return CuSubVector<Real>(data_ + (i * stride_), NumCols());
}
inline CuValue<Real> operator() (MatrixIndexT r, MatrixIndexT c) {
KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
static_cast<UnsignedMatrixIndexT>(num_rows_) &&
static_cast<UnsignedMatrixIndexT>(c) <
static_cast<UnsignedMatrixIndexT>(num_cols_));
return CuValue<Real>(data_ + r * stride_ + c);
}
inline Real operator() (MatrixIndexT r, MatrixIndexT c) const {
KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
static_cast<UnsignedMatrixIndexT>(num_rows_) &&
static_cast<UnsignedMatrixIndexT>(c) <
static_cast<UnsignedMatrixIndexT>(num_cols_));
return CuValue<Real>(data_ + r * stride_ + c); // will be casted to Real.
}
Real Sum() const;
/// Return the trace. If check_square = true, will crash if matrix is not square.
Real Trace(bool check_square = true) const;
void SetRandn();
void SetRandUniform();
void Write(std::ostream &os, bool binary) const;
// This function resizes the output to indices.size(), and for each element of
// "indices" it interprets it as a (row, column) index into *this, and puts
// (*this)(row, column) into the corresponding element of "output".
void Lookup(const std::vector<Int32Pair> &indices,
std::vector<Real> *output) const;
protected:
// The following two functions should only be called if we did not compile with CUDA
// or could not get a CUDA card; in that case the contents are interpreted the
// same as a regular matrix.
inline const MatrixBase<Real> &Mat() const {
return *(reinterpret_cast<const MatrixBase<Real>* >(this));
}
inline MatrixBase<Real> &Mat() {
return *(reinterpret_cast<MatrixBase<Real>* >(this));
}
/// Get raw row pointer
inline const Real* RowData(MatrixIndexT r) const { return data_ + r * stride_; }
inline Real* RowData(MatrixIndexT r) { return data_ + r * stride_; }
inline const Real *Data() const { return data_; }
inline Real *Data() { return data_; }
// The constructors are protected to prevent the user creating an instance of
@ -198,19 +432,9 @@ class CuMatrixBase {
MatrixIndexT stride):
data_(data), num_cols_(num_cols), num_rows_(num_rows), stride_(stride) { }
// The following two functions should only be called if we did not compile with CUDA
// or could not get a CUDA card; in that case the contents are interpreted the
// same as a regular matrix.
inline const MatrixBase<Real> &Mat() const {
return *(reinterpret_cast<const MatrixBase<Real>* >(this));
}
inline MatrixBase<Real> &Mat() {
return *(reinterpret_cast<MatrixBase<Real>* >(this));
}
Real *data_; ///< GPU data pointer (or regular matrix data pointer,
///< if either CUDA was not compiled in or we could not
///< acquire the device).
///< if either CUDA was not compiled in or we could not
///< acquire the device).
// Note: it might seem a bit backwards that we have the number of columns
// first here; it's necessary because we need the data to be laid out the same
// as for MatrixBase so the Mat() function call will work. We don't want to
@ -239,15 +463,34 @@ class CuMatrix: public CuMatrixBase<Real> {
// Note: we had to remove the "explicit" keyword due
// to problems with STL vectors of CuMatrixBase.
CuMatrix(const CuMatrix<Real> &other) {
this->Resize(other.NumRows(), other.NumCols(), kUndefined);
this->CopyFromMat(other);
CuMatrix(const CuMatrix<Real> &other,
MatrixTransposeType trans = kNoTrans);
explicit CuMatrix(const CuBlockMatrix<Real> &other,
MatrixTransposeType trans = kNoTrans);
explicit CuMatrix(const CuMatrixBase<Real> &other,
MatrixTransposeType trans = kNoTrans);
template<typename OtherReal>
explicit CuMatrix(const MatrixBase<OtherReal> &other,
MatrixTransposeType trans = kNoTrans);
/// Copy constructor taking SpMatrix...
explicit CuMatrix(const CuSpMatrix<Real> &M) : CuMatrixBase<Real>() {
Resize(M.NumRows(), M.NumRows(), kUndefined);
this->CopyFromSp(M);
}
explicit CuMatrix(const MatrixBase<Real> &other) {
this->Resize(other.NumRows(), other.NumCols(), kUndefined);
this->CopyFromMat(other);
}
/// Copy constructor taking TpMatrix...
template <typename OtherReal>
explicit CuMatrix(const CuTpMatrix<OtherReal> & M,
MatrixTransposeType trans = kNoTrans);
/// Copy constructor: as above, but from another type.
template<typename OtherReal>
explicit CuMatrix(const CuMatrixBase<OtherReal> &M,
MatrixTransposeType trans = kNoTrans);
CuMatrix<Real> &operator = (const CuMatrixBase<Real> &other) {
this->Resize(other.NumRows(), other.NumCols(), kUndefined);
@ -265,21 +508,45 @@ class CuMatrix: public CuMatrixBase<Real> {
this->Resize(other.NumRows(), other.NumCols(), kUndefined);
this->CopyFromMat(other);
return *this;
}
}
void Transpose();
/// Allocate the memory
void Resize(MatrixIndexT rows, MatrixIndexT cols,
MatrixResizeType resize_type = kSetZero);
void Swap(Matrix<Real> *mat);
void Swap(CuMatrix<Real> *mat);
template<typename OtherReal>
void Swap(CuMatrix<OtherReal> *mat);
/// I/O functions
void Read(std::istream &is, bool binary);
void Write(std::ostream &os, bool binary) const;
/// Destructor
~CuMatrix() { Destroy(); }
inline const Matrix<Real> &Mat() const {
return *(reinterpret_cast<const Matrix<Real>* >(this));
}
inline Matrix<Real> &Mat() {
return *(reinterpret_cast<Matrix<Real>* >(this));
}
/// This function does: for each element { row, column, weight } indexed i in
/// the vector "elements", let x(i) = A(row(i), column(i)); then it does
/// (*this)(row(i), column(i)) += weight(i) / x(i), and
/// *tot_objf = \sum_i weight(i) * log(x(i)), and
/// *tot_weight = \sum_i weight(i)
/// Preconditions: A must be strictly positive, and no (row, column) pair
/// may be repeated within "elements"
void CompObjfAndDeriv(const std::vector<MatrixElement<Real> > &elements,
const CuMatrix<Real> &A,
Real *tot_objf,
Real* tot_weight);
private:
void Destroy();
};
@ -305,27 +572,55 @@ class CuSubMatrix: public CuMatrixBase<Real> {
CuSubMatrix<Real> &operator = (const CuSubMatrix<Real> &other);
};
template<class Real>
/// Free-function form of CuMatrixBase::ApproxEqual: returns whatever
/// A.ApproxEqual(B, tol) returns, so callers can write ApproxEqual(A, B).
template<typename Real>
bool ApproxEqual(const CuMatrixBase<Real> &A,
                 const CuMatrixBase<Real> &B, Real tol = 0.01) {
  const bool matrices_close = A.ApproxEqual(B, tol);
  return matrices_close;
}
template<typename Real>
inline void AssertEqual(CuMatrixBase<Real> &A, CuMatrixBase<Real> &B,
float tol = 0.01) {
KALDI_ASSERT(A.ApproxEqual(B, tol));
}
/// Returns true iff M and N have the same number of rows and of columns.
template<typename Real>
bool SameDim(const CuMatrixBase<Real> &M, const CuMatrixBase<Real> &N) {
  if (M.NumRows() != N.NumRows()) return false;
  return M.NumCols() == N.NumCols();
}
template<class Real>
/// Returns true iff M and N agree in number of rows, number of columns
/// and stride.
template<typename Real>
bool SameDimAndStride(const CuMatrixBase<Real> &M, const CuMatrixBase<Real> &N) {
  if (M.NumRows() != N.NumRows()) return false;
  if (M.NumCols() != N.NumCols()) return false;
  return M.Stride() == N.Stride();
}
/// I/O
template<typename Real>
std::ostream &operator << (std::ostream &out, const CuMatrixBase<Real> &mat);
} // namespace
/// Constructs a Matrix from a CuMatrixBase (possibly of another
/// floating-point type).  The new matrix has the dimensions of M, or the
/// swapped dimensions if trans is anything other than kNoTrans; the contents
/// are then filled in by M.CopyToMat(this, trans).
template<typename Real>
template<typename OtherReal>
Matrix<Real>::Matrix(const CuMatrixBase<OtherReal> &M,
                     MatrixTransposeType trans) {
  if (trans != kNoTrans) {
    Init(M.NumCols(), M.NumRows());
  } else {
    Init(M.NumRows(), M.NumCols());
  }
  M.CopyToMat(this, trans);
}
/// Copies the contents of the CuMatrixBase "cu" into *this by forwarding to
/// cu.CopyToMat(this, trans); "trans" is passed through unchanged.
/// No resizing is done here.
template<typename Real>
template<typename OtherReal>
void MatrixBase<Real>::CopyFromMat(const CuMatrixBase<OtherReal> &cu,
MatrixTransposeType trans) {
cu.CopyToMat(this, trans);
}
#include "cu-matrix-inl.h"
} // namespace
#include "cudamatrix/cu-matrix-inl.h"
#endif

Просмотреть файл

@ -1,6 +1,7 @@
// cudamatrix/cu-matrixdim.h
// Copyright 2009-2012 Karel Vesely
// 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
@ -28,12 +29,20 @@
#ifdef _MSC_VER
typedef unsigned __int32 uint32_cuda;
typedef __int32 int32_cuda;
typedef __int32 MatrixIndexT_cuda; // you'd have to change this if you changed MatrixIndexT from int32.
#else
#include <stdint.h>
typedef uint32_t uint32_cuda;
typedef int32_t int32_cuda;
typedef int32_t MatrixIndexT_cuda; // you'd have to change this if you changed MatrixIndexT from int32.
#endif
// One weighted (row, column) entry of a matrix.  Vectors of these are taken
// by CuMatrix::CompObjfAndDeriv.  The member order is part of the memory
// layout, so do not reorder the fields.
template<typename Real>
struct MatrixElement {
int32_cuda row;     // row index of the element
int32_cuda column;  // column index of the element
Real weight;        // weight attached to this (row, column) position
};
extern "C" {
/**
@ -45,8 +54,37 @@ extern "C" {
int32_cuda cols;
int32_cuda stride;
} MatrixDim;
// we define the following constants here because this file is included
// both by the C++ code and also CUDA code.
// The size of a CUDA 1-d block, e.g. for vector operations..
#define CU1DBLOCK 256
// The size of edge of CUDA square block, e.g. for matrix operations.
// Must be defined the same in cu-kernels-ansi.h
#define CU2DBLOCK 16
/** This structure is used in cu-block-matrix.h to store information
about one block M_i of a block-diagonal matrix. We declare it here
because this file is included both by the C++ code and by the CUDA code,
so the definition is visible to both.
*/
typedef struct CuBlockMatrixData_ {
int32_cuda row_offset; // sum of #rows of previous M_i
int32_cuda col_offset; // sum of #cols of previous M_i
MatrixDim matrix_dim; // dimension of this M_i
void *matrix_data; // data for M_i. This is a pointer to either float* or
// double*. Because C doesn't support templates and to
// avoid extra coding to support the two cases, we
// decided to make this a void* pointer.
} CuBlockMatrixData;
// A plain pair of 32-bit signed integers.  Declared inside the extern "C"
// section of this header, which is included by both C++ and CUDA code.
typedef struct Int32Pair {
int32_cuda first;
int32_cuda second;
} Int32Pair;
}
#endif

Просмотреть файл

@ -0,0 +1,265 @@
// cudamatrix/cu-sp-matrix-test.cc
//
// Copyright 2013 Ehsan Variani
// Lucas Ondel
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
// UnitTests for testing cu-sp-matrix.h methods.
//
#include <iostream>
#include <vector>
#include <cstdlib>
#include "base/kaldi-common.h"
#include "cudamatrix/cu-device.h"
#include "cudamatrix/cu-sp-matrix.h"
#include "cudamatrix/cu-vector.h"
#include "cudamatrix/cu-math.h"
using namespace kaldi;
namespace kaldi {
/*
* INITIALIZERS
*/
/*
* ASSERTS
*/
template<typename Real>
static void AssertEqual(const CuPackedMatrix<Real> &A,
const CuPackedMatrix<Real> &B,
float tol = 0.001) {
KALDI_ASSERT(A.NumRows() == B.NumRows());
for (MatrixIndexT i = 0; i < A.NumRows(); i++)
for (MatrixIndexT j = 0; j <= i; j++)
KALDI_ASSERT(std::abs(A(i, j) - B(i, j))
< tol * std::max(1.0, (double) (std::abs(A(i, j)) + std::abs(B(i, j)))));
}
template<typename Real>
static void AssertEqual(const PackedMatrix<Real> &A,
const PackedMatrix<Real> &B,
float tol = 0.001) {
KALDI_ASSERT(A.NumRows() == B.NumRows());
for (MatrixIndexT i = 0; i < A.NumRows(); i++)
for (MatrixIndexT j = 0; j <= i; j++)
KALDI_ASSERT(std::abs(A(i, j) - B(i, j))
< tol * std::max(1.0, (double) (std::abs(A(i, j)) + std::abs(B(i, j)))));
}
template<typename Real>
static void AssertDiagEqual(const PackedMatrix<Real> &A,
const CuPackedMatrix<Real> &B,
float value,
float tol = 0.001) {
for (MatrixIndexT i = 0; i < A.NumRows(); i++) {
KALDI_ASSERT(std::abs((A(i, i)+value) - B(i, i))
< tol * std::max(1.0, (double) (std::abs(A(i, i)) + std::abs(B(i, i) + value))));
}
}
template<typename Real>
static void AssertDiagEqual(const PackedMatrix<Real> &A,
const PackedMatrix<Real> &B,
float value,
float tol = 0.001) {
for (MatrixIndexT i = 0; i < A.NumRows(); i++) {
KALDI_ASSERT(std::abs((A(i, i)+value) - B(i, i))
< tol * std::max(1.0, (double) (std::abs(A(i, i)) + std::abs(B(i, i) + value))));
}
}
template<typename Real>
static void AssertEqual(const PackedMatrix<Real> &A,
const CuPackedMatrix<Real> &B,
float tol = 0.001) {
KALDI_ASSERT(A.NumRows() == B.NumRows());
for (MatrixIndexT i = 0; i < A.NumRows(); i++)
for (MatrixIndexT j = 0; j <= i; j++)
KALDI_ASSERT(std::abs(A(i, j) - B(i, j))
< tol * std::max(1.0, (double) (std::abs(A(i, j)) + std::abs(B(i, j)))));
}
// Returns true if the largest absolute element of (A - B) is at most tol
// times the larger of the largest absolute elements of A and of B.
// A and B must have the same number of rows.
template<typename Real>
static bool ApproxEqual(const PackedMatrix<Real> &A,
                        const PackedMatrix<Real> &B, Real tol = 0.001) {
  KALDI_ASSERT(A.NumRows() == B.NumRows());
  PackedMatrix<Real> diff(A);
  // diff = A - B.  (The coefficient must be -1.0: with +1.0 this would
  // measure the size of A + B rather than the difference.)
  diff.AddPacked(-1.0, B);
  Real a = std::max(A.Max(), -A.Min()), b = std::max(B.Max(), -B.Min()),
      d = std::max(diff.Max(), -diff.Min());
  return (d <= tol * std::max(a, b));
}
/*
* Unit Tests
*/
// Checks the two copying constructors of CuPackedMatrix: construction from a
// host PackedMatrix and construction from another CuPackedMatrix.
template<typename Real>
static void UnitTestCuPackedMatrixConstructor() {
  for (MatrixIndexT i = 1; i < 10; i++) {
    MatrixIndexT dim = 10 * i;

    PackedMatrix<Real> A(dim);
    A.SetRandn();
    CuPackedMatrix<Real> B(A);  // constructed from the host matrix
    CuPackedMatrix<Real> C(B);  // constructed from another CuPackedMatrix

    // Compare against the host source as well; comparing only B with C
    // would not notice a constructor that copies the wrong data from A.
    AssertEqual(A, B);
    AssertEqual(B, C);
  }
}
template<typename Real>
static void UnitTestCuPackedMatrixCopy() {
for (MatrixIndexT i = 1; i < 10; i++) {
MatrixIndexT dim = 10 * i;
PackedMatrix<Real> A(dim);
A.SetRandn();
CuPackedMatrix<Real> B(A);
CuPackedMatrix<Real> C(dim);
C.CopyFromPacked(A);
CuPackedMatrix<Real> D(dim);
D.CopyFromPacked(B);
AssertEqual(C, D);
PackedMatrix<Real> E(dim);
D.CopyToPacked(&E);
AssertEqual(A, E);
}
}
template<typename Real>
static void UnitTestCuPackedMatrixTrace() {
for (MatrixIndexT i = 1; i < 10; i++) {
MatrixIndexT dim = 5 * i + rand() % 10;
PackedMatrix<Real> A(dim);
A.SetRandn();
CuPackedMatrix<Real> B(A);
AssertEqual(A.Trace(), B.Trace());
}
}
template<typename Real>
static void UnitTestCuPackedMatrixScale() {
for (MatrixIndexT i = 1; i < 10; i++) {
MatrixIndexT dim = 5 * i + rand() % 10;
PackedMatrix<Real> A(dim);
A.SetRandn();
CuPackedMatrix<Real> B(A);
Real scale_factor = 23.5896223;
A.Scale(scale_factor);
B.Scale(scale_factor);
AssertEqual(A, B);
}
}
template<typename Real>
static void UnitTestCuPackedMatrixScaleDiag() {
for (MatrixIndexT i = 1; i < 10; i++) {
MatrixIndexT dim = 5 * i + rand() % 10;
PackedMatrix<Real> A(dim);
A.SetRandn();
CuPackedMatrix<Real> B(A);
Real scale_factor = 23.5896223;
A.ScaleDiag(scale_factor);
B.ScaleDiag(scale_factor);
AssertEqual(A, B);
}
}
template<typename Real>
static void UnitTestCuPackedMatrixAddToDiag() {
for (MatrixIndexT i = 1; i < 10; i++) {
MatrixIndexT dim = 5 * i + rand() % 10;
PackedMatrix<Real> A(dim);
A.SetRandn();
CuPackedMatrix<Real> B(A);
Real value = rand() % 50;
B.AddToDiag(value);
AssertDiagEqual(A, B, value);
}
}
template<typename Real>
static void UnitTestCuPackedMatrixSetUnit() {
for (MatrixIndexT i = 1; i < 10; i++) {
MatrixIndexT dim = 5 * i + rand() % 10;
CuPackedMatrix<Real> A(dim);
A.SetUnit();
for (MatrixIndexT i = 0; i < A.NumRows(); i++) {
for (MatrixIndexT j = 0; j < A.NumRows(); j++) {
if (i != j) {
KALDI_ASSERT(A(i, j) == 0);
} else {
KALDI_ASSERT(A(i, j) == 1.0);
}
}
}
}
}
// Runs the CuPackedMatrix unit tests for one floating-point type.
// The tests draw from rand(), so the order of the calls below determines
// which random sizes/values each test sees.
// NOTE(review): UnitTestCuPackedMatrixCopy is commented out and
// UnitTestCuPackedMatrixScaleDiag is defined above but never called here;
// the reason is not visible in this file -- confirm before enabling them.
template<typename Real> void CudaPackedMatrixUnitTest() {
UnitTestCuPackedMatrixConstructor<Real>();
//UnitTestCuPackedMatrixCopy<Real>();
UnitTestCuPackedMatrixTrace<Real>();
UnitTestCuPackedMatrixScale<Real>();
UnitTestCuPackedMatrixAddToDiag<Real>();
UnitTestCuPackedMatrixSetUnit<Real>();
}
} // namespace kaldi
// Test driver: runs the float tests always, and the double tests either
// unconditionally (build without CUDA) or only if the selected device
// reports double-precision support (build with CUDA).
int main() {
using namespace kaldi;
#if HAVE_CUDA == 1
// Select the GPU
// This has to happen before any of the tests run.
CuDevice::Instantiate().SelectGpuId("yes");
#endif
kaldi::CudaPackedMatrixUnitTest<float>();
#if HAVE_CUDA == 1
// With CUDA compiled in, skip the double tests (with a warning) when the
// device says it cannot do double precision.
if (CuDevice::Instantiate().DoublePrecisionSupported()) {
kaldi::CudaPackedMatrixUnitTest<double>();
} else {
KALDI_WARN << "Double precision not supported";
}
#else
kaldi::CudaPackedMatrixUnitTest<double>();
#endif
KALDI_LOG << "Tests succeeded";
#if HAVE_CUDA == 1
// Print the profiling information accumulated through AccuProfile().
CuDevice::Instantiate().PrintProfile();
#endif
return 0;
}

Просмотреть файл

@ -0,0 +1,400 @@
// cudamatrix/cu-packed-matrix.cc
// Copyright 2009-2013 Johns Hopkins University (author: Daniel Povey)
// Karel Vesely
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#if HAVE_CUDA == 1
#include <cuda_runtime_api.h>
#include <cublas.h>
#endif
#include "util/timer.h"
#include "cudamatrix/cu-common.h"
#include "cudamatrix/cu-vector.h"
#include "cudamatrix/cu-device.h"
#include "cudamatrix/cu-kernels.h"
#include "cudamatrix/cu-math.h"
#include "cudamatrix/cu-packed-matrix.h"
#include "cudamatrix/cublas-wrappers.h"
namespace kaldi {
// Resizes the matrix to "rows" rows (zero is allowed).  Only kSetZero and
// kUndefined are accepted for resize_type.  If the size does not change the
// existing storage is kept (and zeroed for kSetZero); otherwise the old
// storage is released and new storage is obtained.
template<typename Real>
void CuPackedMatrix<Real>::Resize(MatrixIndexT rows,
                                  MatrixResizeType resize_type) {
  // The remaining resize_type options are not implemented here.
  KALDI_ASSERT(resize_type == kSetZero || resize_type == kUndefined);
  const bool zero_fill = (resize_type == kSetZero);

  if (rows == this->num_rows_) {  // same size: reuse the storage
    if (zero_fill) this->SetZero();
    return;
  }

  if (this->num_rows_ != 0) this->Destroy();
  if (rows == 0) return;

#if HAVE_CUDA == 1
  CuDevice &device = CuDevice::Instantiate();
  if (device.Enabled()) {
    Timer tim;
    this->num_rows_ = rows;
    // SizeInBytes() is rows*(rows+1)/2 elements, computed in size_t.
    this->data_ = static_cast<Real*>(device.Malloc(this->SizeInBytes()));
    if (zero_fill) this->SetZero();
    device.AccuProfile("CuPackedMatrix::Resize", tim.Elapsed());
  } else
#endif
  {
    // Have an SpMatrix do the allocation (and any zeroing), then take over
    // its pointer with Swap.  Slightly wasteful, but simple.
    SpMatrix<Real> mat(rows, resize_type);
    this->Swap(&mat);
  }
}
// Fills all stored elements with Gaussian random values by viewing the
// packed data as one vector of num_rows_*(num_rows_+1)/2 elements and
// handing it to CuRand::RandGaussian.  Does nothing for an empty matrix.
template<typename Real>
void CuPackedMatrix<Real>::SetRandn() {
  if (num_rows_ == 0) return;
  MatrixIndexT num_elements = num_rows_ * (num_rows_ + 1) / 2;
  CuSubVector<Real> packed_view(data_, num_elements);
  CuRand<Real> generator;
  generator.RandGaussian(&packed_view);
}
// Releases the storage (if any) and leaves the matrix empty.  The memory is
// returned through CuDevice::Free when the device is enabled, and through
// KALDI_MEMALIGN_FREE otherwise.
template<typename Real>
void CuPackedMatrix<Real>::Destroy() {
#if HAVE_CUDA == 1
  CuDevice &device = CuDevice::Instantiate();
  if (device.Enabled()) {
    if (this->data_ != NULL)
      device.Free(this->data_);
  } else
#endif
  {
    if (this->data_ != NULL)
      KALDI_MEMALIGN_FREE(this->data_);
  }
  this->num_rows_ = 0;
  this->data_ = NULL;
}
// Swaps the contents of *this with another CuPackedMatrix by exchanging the
// data pointers and sizes; no element is copied.  (This overload is declared
// in the class; this is its definition.)
template<typename Real>
void CuPackedMatrix<Real>::Swap(CuPackedMatrix<Real> *mat) {
  std::swap(mat->data_, this->data_);
  std::swap(mat->num_rows_, this->num_rows_);
}

// Swaps the contents of *this with the host matrix *mat.
// When the device is enabled the two objects own different kinds of
// memory, so pointers cannot simply be exchanged: the data is copied across
// and the source is emptied.  Otherwise the pointers and sizes are swapped.
template<typename Real>
void CuPackedMatrix<Real>::Swap(PackedMatrix<Real> *mat) {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    if (this->num_rows_ == 0) {
      if (mat->num_rows_ != 0) {
        // *this is empty, but mat is nonempty: move mat's data into *this.
        Resize(mat->num_rows_, kUndefined);
        CopyFromPacked(*mat);
        mat->Resize(0);
      }
      // else both are empty: nothing to do.
    } else {  // *this is nonempty.
      if (mat->num_rows_ != 0) {
        // Both *this and *mat are nonempty.  Recurse to the simpler cases
        // via a temporary host matrix.  This could be done more efficiently
        // when the size does not change.
        PackedMatrix<Real> temp;
        this->Swap(&temp);  // now temp holds our data, *this is empty.
        mat->Swap(&temp);   // now mat holds our old data, temp holds mat's.
        this->Swap(&temp);  // move mat's old data into *this (temp -> empty).
      } else {  // *this is full but *mat is empty: move our data into mat.
        mat->Resize(this->num_rows_, kUndefined);
        this->CopyToPacked(mat);
        this->Destroy();
      }
    }
  } else
#endif
  {
    std::swap(mat->data_, this->data_);
    std::swap(mat->num_rows_, this->num_rows_);
  }
}
// Copies the contents of another CuPackedMatrix of the same size into
// *this.  Does not resize.
template<typename Real>
void CuPackedMatrix<Real>::CopyFromPacked(const CuPackedMatrix<Real> &src) {
  KALDI_ASSERT(src.NumRows() == num_rows_);
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    if (num_rows_ == 0) return;  // empty: nothing to copy
    Timer tim;
    // SizeInBytes() is num_rows_*(num_rows_+1)/2 elements of type Real.
    const size_t num_bytes = this->SizeInBytes();
    CU_SAFE_CALL(cudaMemcpy(data_, src.data_, num_bytes,
                            cudaMemcpyDeviceToDevice));
    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::CopyFromPacked1",tim.Elapsed());
  } else
#endif
  {
    Mat().CopyFromPacked(src.Mat());
  }
}
// Copies the contents of a host PackedMatrix of the same size into *this.
// Does not resize.
template<typename Real>
void CuPackedMatrix<Real>::CopyFromPacked(const PackedMatrix<Real> &src) {
  KALDI_ASSERT(src.NumRows() == num_rows_);
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    if (num_rows_ == 0) return;  // empty: nothing to copy
    Timer tim;
    const size_t num_bytes = src.SizeInBytes();
    CU_SAFE_CALL(cudaMemcpy(data_, src.data_, num_bytes,
                            cudaMemcpyHostToDevice));
    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::CopyFromPacked2",tim.Elapsed());
  } else
#endif
  {
    // Without a device the data lives in host memory; let PackedMatrix
    // do the copy.
    Mat().CopyFromPacked(src);
  }
}
// Copies the contents of *this into the host matrix *dst, which must
// already have the same number of rows.  Does not resize.
template<typename Real>
void CuPackedMatrix<Real>::CopyToPacked(PackedMatrix<Real> *dst) const {
  KALDI_ASSERT(dst->NumRows() == NumRows());
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    if (num_rows_ == 0) return;  // Nothing to do.
    Timer tim;
    size_t nr = static_cast<size_t>(num_rows_),
        num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real);
    CU_SAFE_CALL(cudaMemcpy(dst->data_, data_, num_bytes,
                            cudaMemcpyDeviceToHost));
    // Profiling key fixed: it previously read "CuPackedMatrixMatrix::...",
    // which does not match the class name used by the other keys.
    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::CopyToPackedD2H",tim.Elapsed());
  } else
#endif
  {
    // Without a device the data lives in host memory; let PackedMatrix
    // do the copy.
    dst->CopyFromPacked(Mat());
  }
}
/*
template<typename Real>
void CuPackedMatrix<Real>::CopyRowsFromPacked(int32 r, const CuPackedMatrix<Real> &src, int32 src_ro, int32 dst_ro) {
KALDI_ASSERT(r+src_ro <= src.NumRows());
KALDI_ASSERT(r+dst_ro <= NumRows());
KALDI_ASSERT(NumCols() == src.NumCols());
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
MatrixIndexT dst_pitch = stride_*sizeof(Real);
MatrixIndexT src_pitch = src.Stride()*sizeof(Real);
MatrixIndexT width = src.NumCols()*sizeof(Real);
const Real *p_src = src.Data() + src_ro*src.Stride();
Real *p_dst = data_ + dst_ro*stride_;
CU_SAFE_CALL(cudaMemcpy2D(p_dst, dst_pitch, p_src, src_pitch, width, r, cudaMemcpyDeviceToDevice));
CuDevice::Instantiate().AccuProfile("CuMatrix::CopyRowsD2D",tim.Elapsed());
} else
#endif
{
memcpy(Data()+dst_ro*stride_, src.Data()+src_ro*src.Stride(), r*stride_*sizeof(Real));
}
} */
// Reads the matrix from a stream: the data is first read into a host
// PackedMatrix, then the current contents are released and the freshly read
// data is swapped in.
template<typename Real>
void CuPackedMatrix<Real>::Read(std::istream &is, bool binary) {
  PackedMatrix<Real> host_copy;
  host_copy.Read(is, binary);
  this->Destroy();
  this->Swap(&host_copy);
}
// Writes the matrix to a stream by copying it into a host PackedMatrix of
// the same size and writing that.
template<typename Real>
void CuPackedMatrix<Real>::Write(std::ostream &os, bool binary) const {
  PackedMatrix<Real> host_copy(this->num_rows_, kUndefined);
  CopyToPacked(&host_copy);
  host_copy.Write(os, binary);
}
// Sets all stored elements to zero.
template<typename Real>
void CuPackedMatrix<Real>::SetZero() {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    // An empty matrix has data_ == NULL; return early, as the other methods
    // of this class do, instead of issuing a zero-byte cudaMemset on NULL.
    if (num_rows_ == 0) return;
    Timer tim;
    size_t nr = static_cast<size_t>(num_rows_),
        num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real);
    CU_SAFE_CALL(cudaMemset(reinterpret_cast<void*>(this->data_), 0, num_bytes));
    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::SetZero", tim.Elapsed());
  } else
#endif
  {
    Mat().SetZero();
  }
}
// Returns the sum of the diagonal elements.  With the device enabled the
// diagonal is copied into a temporary CuVector and summed there; an empty
// matrix gives 0.
template<typename Real>
Real CuPackedMatrix<Real>::Trace() const {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    if (num_rows_ == 0) return 0.0;
    CuVector<Real> diag(num_rows_, kUndefined);
    diag.CopyDiagFromPacked(*this);
    return diag.Sum();
  }
#endif
  return Mat().Trace();
}
// Sets every diagonal element to alpha; off-diagonal elements are not
// touched by this call.
template<typename Real>
void CuPackedMatrix<Real>::SetDiag(Real alpha) {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    if (num_rows_ == 0) return;  // empty: nothing to set
    Timer tim;
    // One thread per diagonal element, in 1-d blocks of CU1DBLOCK threads.
    const int threads_per_block = CU1DBLOCK;
    const int num_blocks = n_blocks(NumRows(), CU1DBLOCK);
    cuda_set_diag_packed(num_blocks, threads_per_block, data_, alpha, num_rows_);
    CU_SAFE_CALL(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::SetDiag", tim.Elapsed());
  } else
#endif
  {
    Mat().SetDiag(alpha);
  }
}
// Multiplies every stored element by alpha.
template<typename Real>
void CuPackedMatrix<Real>::Scale(Real alpha) {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    // Treat the packed storage as one contiguous vector and scale it.
    const size_t dim = static_cast<size_t>(num_rows_);
    const size_t num_elements = (dim * (dim + 1)) / 2;
    cublas_scal(num_elements, alpha, data_, 1);
    CU_SAFE_CALL(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::Scale", tim.Elapsed());
  } else
#endif
  {
    Mat().Scale(alpha);
  }
}
// Multiplies every diagonal element by alpha.
template<typename Real>
void CuPackedMatrix<Real>::ScaleDiag(Real alpha) {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    // Return early for an empty matrix, as SetDiag() and AddToDiag() do;
    // otherwise the kernel would be launched with n_blocks(0, CU1DBLOCK)
    // blocks (presumably zero, which is not a valid grid size).
    if (num_rows_ == 0) return;
    Timer tim;
    int dimBlock(CU1DBLOCK);
    int dimGrid(n_blocks(NumRows(),CU1DBLOCK));
    // (The extra pre-launch error check that was marked "TEMP" has been
    // dropped; the check below still reports any pending error.)
    cuda_scale_diag(dimGrid,dimBlock,data_,alpha,num_rows_);
    CU_SAFE_CALL(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::ScaleDiag", tim.Elapsed());
  } else
#endif
  {
    Mat().ScaleDiag(alpha);
  }
}
// Does *this += alpha * M, element by element over the packed storage.
// M must have the same number of rows as *this.
template<typename Real>
void CuPackedMatrix<Real>::AddPacked(const Real alpha, const CuPackedMatrix<Real> &M) {
  KALDI_ASSERT(num_rows_ == M.NumRows());
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    if (num_rows_ == 0) return;  // empty: nothing to add
    Timer tim;
    // Treat both packed buffers as contiguous vectors of this length.
    const size_t dim = num_rows_;
    const size_t num_elements = (dim * (dim + 1)) / 2;
    cublas_axpy(num_elements, alpha, M.Data(), 1, data_, 1);
    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::AddPacked", tim.Elapsed());
  } else
#endif
  {
    Mat().AddPacked(alpha, M.Mat());
  }
}
// Adds r to every diagonal element.
template<typename Real>
void CuPackedMatrix<Real>::AddToDiag(Real r) {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    if (num_rows_ == 0) return;
    Timer tim;
    int dimBlock(CU1DBLOCK);
    int dimGrid(n_blocks(NumRows(),CU1DBLOCK));
    cuda_add_diag_packed(dimGrid,dimBlock,data_,r,num_rows_);
    // Check for launch errors, as SetDiag() and ScaleDiag() do after their
    // kernel launches; without this a failed launch would go unnoticed here.
    CU_SAFE_CALL(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile("CuPackedMatrix::AddToDiag", tim.Elapsed());
  } else
#endif
  {
    Mat().AddToDiag(r);
  }
}
// Sets the matrix to the unit matrix: with the device enabled this is
// SetZero() followed by SetDiag(1.0); otherwise PackedMatrix::SetUnit().
template<typename Real>
void CuPackedMatrix<Real>::SetUnit() {
#if HAVE_CUDA == 1
  if (!CuDevice::Instantiate().Enabled()) {
    Mat().SetUnit();
    return;
  }
  this->SetZero();
  this->SetDiag(1.0);
#else
  Mat().SetUnit();
#endif
}
/**
* Print the matrix to stream
*/
template<typename Real>
std::ostream &operator << (std::ostream &out, const CuPackedMatrix<Real> &mat) {
PackedMatrix<Real> temp(mat.NumRows());
mat.CopyToPacked(&temp);
out << temp;
return out;
}
// Explicitly instantiate operator << for float and double, since the
// template is defined in this .cc file rather than in the header.
template
std::ostream &operator << (std::ostream &out, const CuPackedMatrix<float> &mat);
template
std::ostream &operator << (std::ostream &out, const CuPackedMatrix<double> &mat);
// Instantiate class CuPackedMatrix for float and double.
// (Its member functions are likewise defined in this file.)
template class CuPackedMatrix<float>;
template class CuPackedMatrix<double>;
} // namespace kaldi

Просмотреть файл

@ -0,0 +1,176 @@
// cudamatrix/cu-packed-matrix.h
// Copyright 2009-2013 Johns Hopkins University (author: Daniel Povey)
// Karel Vesely
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_CUDAMATRIX_CU_PACKED_MATRIX_H_
#define KALDI_CUDAMATRIX_CU_PACKED_MATRIX_H_
#include <sstream>
#include "cudamatrix/cu-common.h"
#include "cudamatrix/cu-value.h"
#include "matrix/matrix-common.h"
#include "matrix/kaldi-matrix.h"
#include "matrix/packed-matrix.h"
#include "matrix/sp-matrix.h"
#include "cudamatrix/cu-array.h"
#include "cudamatrix/cu-math.h"
#include "cudamatrix/cu-matrix.h"
namespace kaldi {
/**
* Matrix for CUDA computing. This is a base class for packed
* triangular and symmetric matrices.
* Does the computation on the CUDA card when CUDA is compiled in and
* we have a suitable GPU (CuDevice::Instantiate().Enabled() == true);
* otherwise, does it on the CPU.
*/
/// @brief Packed CUDA matrix: base class for triangular and symmetric matrices on
/// a GPU card.
///
/// Only the elements (r, c) with c <= r are stored, row by row, so element
/// (r, c) lives at offset r*(r+1)/2 + c in the data array.
template<typename Real>
class CuPackedMatrix {
 public:
  friend class CuMatrixBase<Real>;
  friend class CuVectorBase<Real>;
  friend class CuSubMatrix<Real>;
  friend class CuRand<Real>;

  /// Constructs an empty matrix (no storage).
  CuPackedMatrix() : data_(NULL), num_rows_(0) {}

  /// Constructs a matrix with r rows; see Resize() for resize_type.
  explicit CuPackedMatrix(MatrixIndexT r,
                          MatrixResizeType resize_type = kSetZero):
      data_(NULL), num_rows_(0) { Resize(r, resize_type); }

  /// Constructs a copy of a host PackedMatrix.
  explicit CuPackedMatrix(const PackedMatrix<Real> &orig) : data_(NULL), num_rows_(0) {
    Resize(orig.num_rows_, kUndefined);
    CopyFromPacked(orig);
  }

  /// Copy constructor (deep copy).
  explicit CuPackedMatrix(const CuPackedMatrix<Real> &orig) : data_(NULL), num_rows_(0) {
    Resize(orig.NumRows(), kUndefined);
    CopyFromPacked(orig);
  }

  void SetZero();  ///< Set to zero
  void SetUnit();  ///< Set to unit matrix.
  void SetRandn();  ///< Set to random values of a normal distribution
  void SetDiag(Real alpha);  ///< Set the diagonal value to alpha
  void AddToDiag(Real r);  ///< Add this quantity to the diagonal of the matrix.

  void Scale(Real alpha);  ///< Multiply all stored elements by alpha.
  void ScaleDiag(Real alpha);  ///< Multiply the diagonal elements by alpha.
  Real Trace() const;  ///< Sum of the diagonal elements.

  ~CuPackedMatrix() { Destroy(); }

  /// Set packed matrix to a specified size (can be zero).
  /// The value of the new data depends on resize_type:
  ///   -if kSetZero, the new data will be zero
  ///   -if kUndefined, the new data will be undefined
  /// Other values of resize_type (e.g. kCopyData) are not supported: the
  /// implementation asserts that resize_type is kSetZero or kUndefined.
  /// This function takes time proportional to the number of data elements.
  void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero);

  // Copy functions (do not resize).
  void CopyFromPacked(const CuPackedMatrix<Real> &src);
  void CopyFromPacked(const PackedMatrix<Real> &src);
  void CopyToPacked(PackedMatrix<Real> *dst) const;

  void Read(std::istream &in, bool binary);

  void Write(std::ostream &out, bool binary) const;

  /// Releases the storage and leaves the matrix empty.
  void Destroy();

  /// Swaps the contents of *this and *other.  Shallow swap.
  void Swap(CuPackedMatrix<Real> *other);

  /// Swaps the contents of *this and *other.
  void Swap(PackedMatrix<Real> *other);

  Real* Data() { return data_; }
  const Real* Data() const { return data_; }

  /// Returns element (r, c).  The indices are swapped if c > r, so either
  /// triangle may be addressed.  When the device is enabled each call does
  /// its own cudaMemcpy of a single element to the host, so this is meant
  /// for testing and occasional access rather than inner loops.
  inline Real operator() (MatrixIndexT r, MatrixIndexT c) const {
    if (static_cast<UnsignedMatrixIndexT>(c) >
        static_cast<UnsignedMatrixIndexT>(r))
      std::swap(c, r);
    // After the swap r >= c, so checking r alone covers both indices; the
    // unsigned casts make negative indices fail the check as well.
    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
                 static_cast<UnsignedMatrixIndexT>(this->num_rows_));
#if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      Real value;
      CU_SAFE_CALL(cudaMemcpy(&value, this->data_ + (r * (r+1)) / 2 + c,
                              sizeof(Real), cudaMemcpyDeviceToHost));
      return value;
    } else
#endif
    return this->data_[(r * (r+1)) / 2 + c];
  }

  inline MatrixIndexT NumRows() const { return num_rows_; }
  inline MatrixIndexT NumCols() const { return num_rows_; }

  /// Returns size in bytes of the data held by the matrix.
  size_t SizeInBytes() const {
    size_t nr = static_cast<size_t>(num_rows_),
        num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real);
    return num_bytes;
  }

 protected:
  // The following two functions should only be called if we did not compile with CUDA
  // or could not get a CUDA card; in that case the contents are interpreted the
  // same as a regular matrix.
  inline const PackedMatrix<Real> &Mat() const {
    return *(reinterpret_cast<const PackedMatrix<Real>* >(this));
  }
  inline PackedMatrix<Real> &Mat() {
    return *(reinterpret_cast<PackedMatrix<Real>* >(this));
  }

  // Will only be called from this class or derived classes.
  // NOTE: Mat() reinterprets this object as a PackedMatrix, so the order
  // and types of these two members must not be changed.
  Real *data_;
  MatrixIndexT num_rows_;

  void AddPacked(const Real alpha, const CuPackedMatrix<Real> &M);

 private:
  // Disallow assignment.  The copy-assignment operator has to be declared
  // for CuPackedMatrix itself: otherwise the compiler generates one that
  // copies data_ shallowly, and both objects would later free the same
  // storage.  Declared but intentionally not defined.
  CuPackedMatrix<Real> & operator=(const CuPackedMatrix<Real> &other);
  // Kept from the original declaration (assignment from a host matrix is
  // likewise not allowed).
  PackedMatrix<Real> & operator=(const PackedMatrix<Real> &other);
};  // class CuPackedMatrix
/// I/O
template<typename Real>
std::ostream &operator << (std::ostream &out, const CuPackedMatrix<Real> &mat);
} // namespace
#endif

Просмотреть файл

@ -1,6 +1,7 @@
// cudamatrix/cu-rand-inl.h
// cudamatrix/cu-rand.cc
// Copyright 2012 Karel Vesely
// 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
@ -18,14 +19,8 @@
// limitations under the License.
#ifndef KALDI_CUDAMATRIX_CU_RAND_INL_H_
#define KALDI_CUDAMATRIX_CU_RAND_INL_H_
#include "base/kaldi-math.h"
#include "cudamatrix/cu-common.h"
#include "cudamatrix/cu-rand.h"
#include "cudamatrix/cu-matrix-lib.h"
#include "cudamatrix/cu-randkernels.h"
@ -34,112 +29,120 @@ namespace kaldi {
template<typename Real>
void CuRand<Real>::SeedGpu(MatrixIndexT state_size) {
if(NULL != host_) delete[] host_;
host_ = new uint32[state_size];
host_size_ = state_size;
SeedBuffer(&z1_, state_size);
SeedBuffer(&z2_, state_size);
SeedBuffer(&z3_, state_size);
SeedBuffer(&z4_, state_size);
KALDI_ASSERT(state_size >= 0);
state_size_ = state_size;
delete[] host_;
host_ = NULL;
host_size_ = 0;
SeedBuffer(state_size, &z1_);
SeedBuffer(state_size, &z2_);
SeedBuffer(state_size, &z3_);
SeedBuffer(state_size, &z4_);
}
template<typename Real>
void CuRand<Real>::SeedBuffer(uint32* *tgt, MatrixIndexT state_size) {
// generate random state
for(MatrixIndexT i=0; i<host_size_; i++) {
host_[i] = RandInt(128, RAND_MAX);
}
#if HAVE_CUDA==1
// push it to the GPU
if (CuDevice::Instantiate().Enabled()) {
int32 state_size_in_bytes = state_size*sizeof(uint32);
// resize the GPU buffer
if (state_size_ != state_size) {
cudaFree(*tgt);
cudaMalloc((void**)tgt, state_size_in_bytes);
void CuRand<Real>::SeedBuffer(MatrixIndexT state_size, uint32 **tgt) {
#if HAVE_CUDA == 1
CuDevice &device = CuDevice::Instantiate();
if (device.Enabled()) {
if (*tgt != NULL) {
device.Free(*tgt);
*tgt = NULL;
}
// copy the values
cudaMemcpy(*tgt, host_, state_size_in_bytes, cudaMemcpyHostToDevice);
} else
#endif
{ // use back-off host buffer
if (state_size_ != state_size) {
delete[] (*tgt);
*tgt = new uint32[state_size];
}
int32 state_size_in_bytes = state_size*sizeof(uint32);
memcpy(*tgt, host_, state_size_in_bytes);
if (state_size == 0) return; // Nothing to do.
std::vector<uint32> temp_rand_data(state_size);
for(MatrixIndexT i = 0; i < state_size; i++)
temp_rand_data[i] = RandInt(128, RAND_MAX);
int32 state_size_in_bytes = state_size * sizeof(uint32);
*tgt = static_cast<uint32*>(device.Malloc(state_size_in_bytes));
CU_SAFE_CALL(cudaMemcpy(*tgt, &(temp_rand_data[0]),
state_size_in_bytes, cudaMemcpyHostToDevice));
}
#endif
}
// Releases the four state buffers by calling SeedBuffer with a state size
// of zero on each of them.
template<typename Real>
CuRand<Real>::~CuRand() {
  uint32 **state_buffers[] = { &z1_, &z2_, &z3_, &z4_ };
  for (int32 i = 0; i < 4; i++)
    SeedBuffer(0, state_buffers[i]);
}
template<typename Real> void CuRand<Real>::RandUniform(CuMatrix<Real> *tgt) {
#if HAVE_CUDA==1
template<typename Real> void CuRand<Real>::RandUniform(CuMatrixBase<Real> *tgt) {
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
int32 tgt_size = tgt->NumRows()*tgt->Stride();
int32 tgt_size = tgt->NumRows() * tgt->Stride();
if (tgt_size != state_size_) SeedGpu(tgt_size);
dim3 dimBlock(CUBLOCK, CUBLOCK);
dim3 dimGrid(n_blocks(tgt->num_cols_, CUBLOCK), n_blocks(tgt->num_rows_, CUBLOCK));
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
dim3 dimGrid(n_blocks(tgt->num_cols_, CU2DBLOCK), n_blocks(tgt->num_rows_, CU2DBLOCK));
cuda_rand(dimGrid, dimBlock, tgt->data_, z1_, z2_, z3_, z4_, tgt->Dim());
cuSafeCall(cudaGetLastError());
CU_SAFE_CALL(cudaGetLastError());
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
#endif
{
for(int32 r=0; r<tgt->NumRows(); r++) {
for(int32 c=0; c<tgt->num_cols_; c++) {
tgt->Mat()(r, c) = kaldi::RandUniform();
}
}
tgt->SetRandUniform();
}
}
template<typename Real> void CuRand<Real>::RandGaussian(CuMatrix<Real> *tgt) {
#if HAVE_CUDA==1
template<typename Real> void CuRand<Real>::RandGaussian(CuMatrixBase<Real> *tgt) {
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
int32 tgt_size = tgt->NumRows()*tgt->Stride();
if (tgt_size != state_size_) SeedGpu(tgt_size);
dim3 dimBlock(CUBLOCK, CUBLOCK);
dim3 dimGrid(n_blocks(tgt->num_cols_, CUBLOCK), n_blocks(tgt->NumRows(), CUBLOCK));
int32 tgt_size = tgt->NumRows() * tgt->Stride();
if (tgt_size == 0)
return;
if (tgt_size > state_size_) SeedGpu(tgt_size);
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
dim3 dimGrid(n_blocks(tgt->num_cols_, CU2DBLOCK), n_blocks(tgt->num_rows_, CU2DBLOCK));
cuda_gauss_rand(dimGrid, dimBlock, tgt->data_, z1_, z2_, z3_, z4_, tgt->Dim());
cuSafeCall(cudaGetLastError());
CU_SAFE_CALL(cudaGetLastError());
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
#endif
{
for(int32 r=0; r<tgt->NumRows(); r++) {
for(int32 c=0; c<tgt->num_cols_; c++) {
tgt->Mat()(r, c) = RandGauss();
}
}
tgt->SetRandn();
}
}
// Fills the vector *tgt with normally distributed random numbers.
// With the device enabled, the generator state is re-seeded whenever the
// vector dimension differs from the current state size.
template<typename Real> void CuRand<Real>::RandGaussian(CuVectorBase<Real> *tgt) {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    int32 tgt_size = tgt->Dim();
    // Nothing to fill for an empty vector.  Return before launching, as the
    // matrix overload does: n_blocks(0, CU1DBLOCK) would presumably give a
    // grid of zero blocks, which is not a valid launch configuration.
    if (tgt_size == 0)
      return;
    if (tgt_size != state_size_) SeedGpu(tgt_size);

    int dimBlock(CU1DBLOCK);
    int dimGrid(n_blocks(tgt->Dim(), CU1DBLOCK));

    cuda_vec_gauss_rand(dimGrid, dimBlock, tgt->Data(), z1_, z2_, z3_, z4_, tgt->Dim());
    CU_SAFE_CALL(cudaGetLastError());

    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
#endif
  {
    tgt->Vec().SetRandn();
  }
}
template<typename Real> void CuRand<Real>::BinarizeProbs(const CuMatrix<Real> &probs, CuMatrix<Real> *states) {
#if HAVE_CUDA==1
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
@ -156,15 +159,15 @@ template<typename Real> void CuRand<Real>::BinarizeProbs(const CuMatrix<Real> &p
RandUniform(&tmp_);
// use the uniform random numbers to compute discrete 0/1 states
dim3 dimBlock(CUBLOCK, CUBLOCK);
dim3 dimGrid(n_blocks(states->num_cols_, CUBLOCK), n_blocks(states->num_rows_, CUBLOCK));
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
dim3 dimGrid(n_blocks(states->num_cols_, CU2DBLOCK), n_blocks(states->num_rows_, CU2DBLOCK));
cuda_binarize_probs(dimGrid, dimBlock, states->data_, probs.data_, tmp_.data_, states->Dim());
cuSafeCall(cudaGetLastError());
CU_SAFE_CALL(cudaGetLastError());
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
#endif
{
for(int32 r=0; r<states->num_rows_; r++) {
for(int32 c=0; c<states->num_cols_; c++) {
@ -182,10 +185,12 @@ template<typename Real> void CuRand<Real>::AddGaussNoise(CuMatrix<Real> *tgt, Re
tgt->AddMat(gscale, tmp_, 1.0);
}
// Instantiate the class for float and double.
template class CuRand<float>;
template class CuRand<double>;
} // namespace
#endif

Просмотреть файл

@ -24,7 +24,7 @@
#include "cudamatrix/cu-matrix.h"
#include "base/kaldi-math.h"
namespace kaldi {
@ -33,25 +33,18 @@ template<typename Real>
class CuRand {
public:
CuRand()
: z1_(NULL), z2_(NULL), z3_(NULL), z4_(NULL), state_size_(0),
host_(NULL), host_size_(0)
{ }
~CuRand() {
#if HAVE_CUDA == 1
cudaFree(z1_); cudaFree(z2_); cudaFree(z3_); cudaFree(z4_);
#endif
delete[] host_;
}
CuRand(): z1_(NULL), z2_(NULL), z3_(NULL), z4_(NULL), state_size_(0) { }
~CuRand();
/// on demand seeding of all the buffers
void SeedGpu(MatrixIndexT state_size);
/// fill with uniform random numbers (0.0-1.0)
void RandUniform(CuMatrix<Real> *tgt);
/// fill with numbers drawn from uniform distribution on [0, 1]
void RandUniform(CuMatrixBase<Real> *tgt);
/// fill with normal random numbers
void RandGaussian(CuMatrix<Real> *tgt);
void RandGaussian(CuMatrixBase<Real> *tgt);
void RandGaussian(CuVectorBase<Real> *tgt);
/// align probabilities to discrete 0/1 states (use uniform sampling)
void BinarizeProbs(const CuMatrix<Real> &probs, CuMatrix<Real> *states);
@ -59,8 +52,9 @@ class CuRand {
void AddGaussNoise(CuMatrix<Real> *tgt, Real gscale = 1.0);
private:
/// seed one buffer
void SeedBuffer(uint32* *tgt, MatrixIndexT state_size);
/// seed one buffer on the GPU. If state_size == 0, just frees any
/// existing buffers.
void SeedBuffer(MatrixIndexT state_size, uint32 **tgt);
private:
@ -75,19 +69,13 @@ class CuRand {
/// Inner state of the ``grid-like'' random number generator
uint32 *z1_, *z2_, *z3_, *z4_;
int32 state_size_; ///< size of the buffers
uint32 *host_; ///< host bufer, used for initializing
int32 host_size_; ///< size of the host buffer
CuMatrix<Real> tmp_; ///< auxiliary matrix
};
} // namsepace
#include "cudamatrix/cu-rand-inl.h"
#endif

Просмотреть файл

@ -25,7 +25,7 @@
#include "cudamatrix/cu-matrixdim.h"
#include "cudamatrix/cu-kernels-ansi.h"
#if HAVE_CUDA==1
#if HAVE_CUDA == 1
extern "C" {
@ -34,6 +34,7 @@ extern "C" {
*/
void cudaF_rand(dim3 Gr, dim3 Bl, float *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d);
void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d);
void cudaF_vec_gauss_rand(int Gr, int Bl, float *v, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, int dim);
void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float *states, const float *probs, float *rand, MatrixDim d);
/*********************************************************
@ -41,6 +42,7 @@ void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float *states, const float *probs, f
*/
void cudaD_rand(dim3 Gr, dim3 Bl, double *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d);
void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d);
void cudaD_vec_gauss_rand(int Gr, int Bl, double *v, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, int dim);
void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double *states, const double *probs, double *rand, MatrixDim d);
}

Просмотреть файл

@ -1,6 +1,7 @@
// cudamatrix/cu-randkernels.cu
// Copyright 2012 Karel Vesely
// 2013 Johns Hopkins University (author: Daniel Povey)
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -108,6 +109,20 @@ static void _gauss_rand(Real* mat, uint32_cuda* z1, uint32_cuda* z2, uint32_cuda
// Kernel: writes one BoxMuller<Real>() sample into each of the first `dim`
// elements of v.  Element idx uses the idx'th entry of each of the four
// state arrays z1..z4.  Threads belonging to blocks with blockIdx.y > 0, or
// whose index falls at or beyond `dim`, do nothing.
template<typename Real>
__global__
static void _vec_gauss_rand(Real* v, uint32_cuda* z1, uint32_cuda* z2, uint32_cuda* z3, uint32_cuda* z4, int dim) {
  int32_cuda idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (blockIdx.y > 0 || !(idx < dim))
    return;
  v[idx] = BoxMuller<Real>(z1[idx], z2[idx], z3[idx], z4[idx]);
}
template<typename Real>
__global__
static void _binarize_probs(Real* states, const Real* probs, const Real* rand, MatrixDim d) {
@ -136,6 +151,10 @@ void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, uint32_cuda* z1, uint32_cuda
_gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d);
}
// Single-precision wrapper: launches the _vec_gauss_rand kernel with grid
// size Gr and block size Bl to fill the first `dim` elements of v, using the
// generator state arrays z1..z4.
void cudaF_vec_gauss_rand(int Gr, int Bl, float* v, uint32_cuda* z1, uint32_cuda* z2, uint32_cuda* z3, uint32_cuda* z4, int dim) {
_vec_gauss_rand<<<Gr,Bl>>>(v,z1,z2,z3,z4,dim);
}
void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d) {
_binarize_probs<<<Gr,Bl>>>(states,probs,rand,d);
}
@ -153,6 +172,10 @@ void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, uint32_cuda* z1, uint32_cud
_gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d);
}
// Double-precision wrapper: launches the _vec_gauss_rand kernel with grid
// size Gr and block size Bl to fill the first `dim` elements of v, using the
// generator state arrays z1..z4.
void cudaD_vec_gauss_rand(int Gr, int Bl, double* v, uint32_cuda* z1, uint32_cuda* z2, uint32_cuda* z3, uint32_cuda* z4, int dim) {
_vec_gauss_rand<<<Gr,Bl>>>(v,z1,z2,z3,z4,dim);
}
void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d) {
_binarize_probs<<<Gr,Bl>>>(states,probs,rand,d);
}

Просмотреть файл

@ -22,7 +22,7 @@
#ifndef KALDI_CUDAMATRIX_CU_RANDKERNELS_H_
#define KALDI_CUDAMATRIX_CU_RANDKERNELS_H_
#if HAVE_CUDA==1
#if HAVE_CUDA == 1
#include "base/kaldi-error.h"
#include "cudamatrix/cu-randkernels-ansi.h"
@ -38,6 +38,7 @@ namespace kaldi {
*/
template<typename Real> inline void cuda_rand(dim3 Gr, dim3 Bl, Real *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_gauss_rand(dim3 Gr, dim3 Bl, Real *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_vec_gauss_rand(int Gr, int Bl, Real *v, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, int dim) { KALDI_ERR << __func__ << " Not implemented!"; }
template<typename Real> inline void cuda_binarize_probs(dim3 Gr, dim3 Bl, Real *states, const Real *probs, Real *rand, MatrixDim d) { KALDI_ERR << __func__ << " Not implemented!"; }
/*********************************************************
@ -45,6 +46,7 @@ template<typename Real> inline void cuda_binarize_probs(dim3 Gr, dim3 Bl, Real *
*/
template<> inline void cuda_rand<float>(dim3 Gr, dim3 Bl, float *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d) { cudaF_rand(Gr,Bl,mat,z1,z2,z3,z4,d); }
template<> inline void cuda_gauss_rand<float>(dim3 Gr, dim3 Bl, float *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d) { cudaF_gauss_rand(Gr,Bl,mat,z1,z2,z3,z4,d); }
template<> inline void cuda_vec_gauss_rand<float>(int Gr, int Bl, float *v, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, int dim) { cudaF_vec_gauss_rand(Gr,Bl,v,z1,z2,z3,z4,dim); }
template<> inline void cuda_binarize_probs<float>(dim3 Gr, dim3 Bl, float *states, const float *probs, float *rand, MatrixDim d) { cudaF_binarize_probs(Gr,Bl,states,probs,rand,d); }
/*********************************************************
@ -52,6 +54,7 @@ template<> inline void cuda_binarize_probs<float>(dim3 Gr, dim3 Bl, float *state
*/
template<> inline void cuda_rand<double>(dim3 Gr, dim3 Bl, double *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d) { cudaD_rand(Gr,Bl,mat,z1,z2,z3,z4,d); }
template<> inline void cuda_gauss_rand<double>(dim3 Gr, dim3 Bl, double *mat, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, MatrixDim d) { cudaD_gauss_rand(Gr,Bl,mat,z1,z2,z3,z4,d); }
template<> inline void cuda_vec_gauss_rand<double>(int Gr, int Bl, double *v, uint32_cuda *z1, uint32_cuda *z2, uint32_cuda *z3, uint32_cuda *z4, int dim) { cudaD_vec_gauss_rand(Gr,Bl,v,z1,z2,z3,z4,dim); }
template<> inline void cuda_binarize_probs<double>(dim3 Gr, dim3 Bl, double *states, const double *probs, double *rand, MatrixDim d) { cudaD_binarize_probs(Gr,Bl,states,probs,rand,d); }
} // namespace

Просмотреть файл

@ -0,0 +1,187 @@
// cudamatrix/cu-matrix-speed-test.cc
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <vector>
#include <cstdlib>
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-vector.h"
#include "cudamatrix/cu-math.h"
#include "cudamatrix/cu-sp-matrix.h"
using namespace kaldi;
namespace kaldi {
// Returns a printable tag for the template type, used in the log messages of
// this file: "<double>" when Real is 8 bytes wide, "<float>" otherwise.
template<typename Real>
std::string NameOf() {
  if (sizeof(Real) == 8)
    return "<double>";
  return "<float>";
}
// Measures the speed of CuSpMatrix::Invert() on a dim x dim matrix by
// repeating it for roughly half a second, and logs the result in gigaflops
// (counting dim^3 operations per iteration).  The very first pass is not a
// plain inversion: it makes two identical copies better conditioned (adding
// M^T M and a diagonal term), inverts both, and checks that the two results
// agree.
template<typename Real>
static void UnitTestCuSpMatrixInvert(int32 dim) {
  const BaseFloat time_in_secs = 0.5;
  int32 num_iters = 0;
  Timer timer;
  CuSpMatrix<Real> A(dim);
  A.SetRandn();
  while (timer.Elapsed() < time_in_secs) {
    KALDI_ASSERT(A.Trace() != 0.0);  // true with probability 1...
    CuSpMatrix<Real> B(A);
    if (num_iters == 0) {
      // Extra consistency testing, done once.  Note that this modifies A, so
      // all later iterations invert copies of the matrix produced here.
      CuMatrix<Real> A_full(A);
      A.AddMat2(1.0, A_full, kTrans, 1.0);
      A.AddToDiag(0.1 * dim);

      CuMatrix<Real> B_full(B);
      B.AddMat2(1.0, B_full, kTrans, 1.0);
      B.AddToDiag(0.1 * dim);

      A.Invert();
      B.Invert();

      SpMatrix<Real> B_host(dim);
      B.CopyToSp(&B_host);
      SpMatrix<Real> A_host(A);
      AssertEqual(A_host, B_host);
    } else {
      B.Invert();
    }
    num_iters++;
  }

  BaseFloat fdim = dim;
  BaseFloat gflops = (fdim * fdim * fdim * num_iters) / (timer.Elapsed() * 1.0e+09);
  KALDI_LOG << "For CuSpMatrix::Invert" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
// Measures the speed of CuSpMatrix::CopyFromMat() for the given copy type on
// a dim x dim matrix, repeating the copy for roughly 0.1 seconds, and logs
// the result (counting dim^2 operations per iteration).
template<typename Real>
static void UnitTestCuSpMatrixCopyFromMat(int32 dim, SpCopyType copy_type) {
  BaseFloat time_in_secs = 0.1;
  int32 iter = 0;
  Timer tim;
  CuMatrix<Real> A(dim, dim);
  CuSpMatrix<Real> S(dim);

  for (; tim.Elapsed() < time_in_secs; iter++) {
    S.CopyFromMat(A, copy_type);
  }

  // Name of the copy type for the log line.  kTakeMean has to be listed
  // explicitly: it is one of the modes this test is run with, and would
  // otherwise be reported as kTakeMeanAndCheck.
  const char *copy_type_name =
      (copy_type == kTakeLower ? "kTakeLower" :
       (copy_type == kTakeUpper ? "kTakeUpper" :
        (copy_type == kTakeMean ? "kTakeMean" : "kTakeMeanAndCheck")));

  BaseFloat fdim = dim;
  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
  KALDI_LOG << "For CuSpMatrix::CopyFromMat" << NameOf<Real>()
            << ", with copy-type " << copy_type_name << " and dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
// Measures the speed of CuSpMatrix::InvertPosDefApprox(0.1) on a dim x dim
// matrix, repeating it for roughly half a second (setup time excluded), and
// logs the result counting dim^3 operations per iteration.  The input is
// A = Q Q^T where Q has orthonormalized random rows with its columns scaled
// by the square roots of values spread, evenly in the log domain, over a
// factor of 50.
template<typename Real>
static void UnitTestCuMatrixApproxInvert(int32 dim) {
  const BaseFloat time_in_secs = 0.5;
  int32 num_iters = 0;

  // Random matrix with orthonormal rows, built on the CPU by Gram-Schmidt.
  Matrix<Real> Q_cpu(dim, dim);
  Q_cpu.SetRandn();
  for (int32 row = 0; row < dim; row++) {
    for (int32 prev = 0; prev < row; prev++)
      Q_cpu.Row(row).AddVec(-1.0 * VecVec(Q_cpu.Row(row), Q_cpu.Row(prev)), Q_cpu.Row(prev));
    Q_cpu.Row(row).Scale(1.0 / Q_cpu.Row(row).Norm(2.0));
  }
  CuMatrix<Real> Q(Q_cpu);

  CuVector<Real> scales(dim);
  Real eig_range = 50.0;  // factor of 50 on eigenvalues.. this affects the speed.
  Real first_eig = 0.001 + RandUniform() * 5.0;
  for (int32 k = 0; k < dim; k++)
    scales(k) = first_eig * exp(k * log(eig_range) / dim);
  scales.ApplyPow(0.5);
  Q.MulColsVec(scales);

  CuSpMatrix<Real> A(dim);
  A.AddMat2(1.0, Q, kNoTrans, 0.0);

  // Only the repeated approximate inversions below are timed.
  Timer timer;
  while (timer.Elapsed() < time_in_secs) {
    CuSpMatrix<Real> Atmp(A);
    Atmp.InvertPosDefApprox(0.1);
    num_iters++;
  }

  BaseFloat fdim = dim;
  BaseFloat gflops = (fdim * fdim * fdim * num_iters) / (timer.Elapsed() * 1.0e+09);
  KALDI_LOG << "For CuSpMatrix::InvertPosDefApprox" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
template<typename Real> void CuSpMatrixSpeedTest() {
std::vector<int32> sizes;
sizes.push_back(16);
sizes.push_back(128);
sizes.push_back(256);
sizes.push_back(1024);
int32 ns = sizes.size();
for (int32 s = 0; s < ns; s++) {
UnitTestCuSpMatrixInvert<Real>(sizes[s]);
UnitTestCuMatrixApproxInvert<Real>(sizes[s]);
UnitTestCuSpMatrixCopyFromMat<Real>(sizes[s], kTakeLower);
UnitTestCuSpMatrixCopyFromMat<Real>(sizes[s], kTakeUpper);
UnitTestCuSpMatrixCopyFromMat<Real>(sizes[s], kTakeMean);
}
}
} // namespace kaldi
// Entry point of the CuSpMatrix speed test: runs the float tests, then the
// double tests (on CUDA builds only if the device reports double-precision
// support), and finally prints the accumulated device profile on CUDA builds.
int main() {
// Select the GPU (CUDA builds only).
#if HAVE_CUDA == 1
CuDevice::Instantiate().SelectGpuId("yes"); // "yes": ask for a GPU
#endif
kaldi::CuSpMatrixSpeedTest<float>();
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().DoublePrecisionSupported()) {
kaldi::CuSpMatrixSpeedTest<double>();
} else {
KALDI_WARN << "Double precision not supported";
}
#else
// Without CUDA everything runs on the CPU, so double is always exercised.
kaldi::CuSpMatrixSpeedTest<double>();
#endif
#if HAVE_CUDA == 1
CuDevice::Instantiate().PrintProfile();
#endif
std::cout << "Tests succeeded.\n";
}

Просмотреть файл

@ -0,0 +1,437 @@
// cudamatrix/cu-sp-matrix-test.cc
//
// Copyright 2013 Ehsan Variani
// Lucas Ondel
// Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
//
//
// UnitTests for testing cu-sp-matrix.h methods.
//
#include <iostream>
#include <vector>
#include <cstdlib>
#include "base/kaldi-common.h"
#include "cudamatrix/cu-device.h"
#include "cudamatrix/cu-sp-matrix.h"
#include "cudamatrix/cu-vector.h"
#include "cudamatrix/cu-math.h"
using namespace kaldi;
namespace kaldi {
/*
* Unit Tests
*/
template<typename Real>
static void UnitTestCuSpMatrixConstructor() {
for (MatrixIndexT i = 1; i < 10; i++) {
MatrixIndexT dim = 10 * i;
Matrix<Real> A(dim, dim);
A.SetRandn();
SpMatrix<Real> B(A, kTakeLower);
CuMatrix<Real> C(A);
CuSpMatrix<Real> D(C, kTakeLower);
SpMatrix<Real> E(dim);
D.CopyToSp(&E);
SpMatrix<Real> F(D);
AssertEqual(F, B);
//added by hxu, to test copy from SpMatrix to CuSpMatrix
AssertEqual(B, E);
KALDI_ASSERT(!B.IsUnit());
B.SetZero();
B.SetDiag(1.0);
KALDI_ASSERT(B.IsUnit());
}
}
template<typename Real>
static void UnitTestCuSpMatrixApproxEqual() {
for (int32 i = 0; i < 10; i++) {
int32 dim = 1 + rand() % 10;
SpMatrix<Real> A(dim), B(dim);
A.SetRandn();
B.SetRandn();
BaseFloat threshold = 0.01;
for (int32 j = 0; j < 20; j++, threshold *= 1.3) {
bool b1 = A.ApproxEqual(B, threshold);
SpMatrix<Real> diff(A);
diff.AddSp(-1.0, B);
bool b2 = (diff.FrobeniusNorm() < threshold * std::max(A.FrobeniusNorm(),
B.FrobeniusNorm()));
KALDI_ASSERT(b1 == b2);
}
}
}
template<typename Real>
static void UnitTestCuSpMatrixOperator() {
SpMatrix<Real> A(100);
A.SetRandn();
CuSpMatrix<Real> B(100);
B.CopyFromSp(A);
for (MatrixIndexT i = 0; i < A.NumRows(); i++) {
for (MatrixIndexT j = 0; j <= i; j++)
KALDI_ASSERT(std::abs(A(i, j) - B(i, j)) < 0.0001);
}
}
template<typename Real>
static void UnitTestCuSpMatrixAddToDiag() {
for (MatrixIndexT i = 1; i < 10; i++) {
MatrixIndexT dim = 10*i;
SpMatrix<Real> A(dim);
A.SetRandn();
CuSpMatrix<Real> B(A);
Matrix<Real> D(A);
A.AddToDiag(i);
CuMatrix<Real> C(B);
B.AddToDiag(i);
SpMatrix<Real> E(dim);
B.CopyToSp(&E);
AssertEqual(A, E);
}
}
template<typename Real>
static void UnitTestCuSpMatrixCopyFromMat() {
for (MatrixIndexT i = 1; i < 10; i++) {
SpCopyType copy_type = (i % 3 == 0 ? kTakeMean :
(i % 3 == 1 ? kTakeLower : kTakeUpper));
MatrixIndexT dim = 10 * i + rand() % 5;
CuMatrix<Real> A(dim, dim);
A.SetRandn();
Matrix<Real> A2(A);
CuSpMatrix<Real> B(A, copy_type);
SpMatrix<Real> B2(A2, copy_type);
SpMatrix<Real> B3(B);
if (!ApproxEqual(B2, B3) ) {
KALDI_ERR << "Matrices differ, A = " << A << ", B2 = " << B2 << ", B3(CUDA) = " << B3;
}
KALDI_ASSERT(B3.Trace() != 0);
}
}
// Tests CuSpMatrix::InvertPosDefApprox() on a dim x dim matrix A = Q Q^T,
// where Q has orthonormalized random rows with its columns scaled by the
// square roots of values spread, evenly in the log domain, over a factor of
// 50.  Passes if (approximate inverse) * A is the unit matrix to within
// max_error, as judged by CuMatrix::IsUnit().
template<typename Real>
static void UnitTestCuSpMatrixApproxInvert(int32 dim) {
  // Random matrix with orthonormal rows (Gram-Schmidt on the rows).
  CuMatrix<Real> Q(dim, dim);
  Q.SetRandn();
  for (int32 row = 0; row < dim; row++) {
    for (int32 prev = 0; prev < row; prev++)
      Q.Row(row).AddVec(-1.0 * VecVec(Q.Row(row), Q.Row(prev)), Q.Row(prev));
    Q.Row(row).Scale(1.0 / Q.Row(row).Norm(2.0));
  }

  // Scaling values: evenly spaced in log, covering a factor of eig_range.
  CuVector<Real> scales(dim);
  Real eig_range = 50.0;
  Real first_eig = 0.001 + RandUniform() * 5.0;
  for (int32 k = 0; k < dim; k++)
    scales(k) = first_eig * exp(k * log(eig_range) / dim);
  scales.ApplyPow(0.5);
  Q.MulColsVec(scales);

  CuSpMatrix<Real> A(dim);
  A.AddMat2(1.0, Q, kNoTrans, 0.0);
  CuMatrix<Real> A_orig(A);  // keep the un-inverted matrix for the check

  BaseFloat max_error = 0.1;
  A.InvertPosDefApprox(max_error);

  CuMatrix<Real> prod(dim, dim);
  prod.AddSpMat(1.0, A, A_orig, kNoTrans, 0.0);
  KALDI_ASSERT(prod.IsUnit(max_error));
}
template<typename Real>
static void UnitTestCuSpMatrixInvert() {
for (MatrixIndexT i = 1; i < 10; i++) {
MatrixIndexT dim = 10*i + rand() % 5;
CuSpMatrix<Real> A(dim);
A.SetRandn();
KALDI_ASSERT(A.Trace() != 0.0); // true with probability 1...
SpMatrix<Real> B(A);
CuMatrix<Real> D(A);
A.AddMat2(1.0, D, kTrans, 1.0);
A.AddToDiag(i);
Matrix<Real> C(B);
B.AddMat2(1.0, C, kTrans, 1.0);
B.AddToDiag(i);
CuSpMatrix<Real> Acopy(A);
A.Invert();
B.Invert();
SpMatrix<Real> A2(A);
AssertEqual(A2, B);
CuMatrix<Real> I(dim, dim);
I.AddMatMat(1.0, CuMatrix<Real>(Acopy), kNoTrans, CuMatrix<Real>(A), kNoTrans, 0.0);
KALDI_ASSERT(I.IsUnit(0.01));
}
}
// TODO (variani) : fails for dim = 0
template<typename Real>
static void UnitTestCuSpMatrixAddVec2() {
for (int32 i = 0; i < 50; i++) {
MatrixIndexT dim = 1 + rand() % 200;
SpMatrix<Real> A(dim);
A.SetRandn();
CuSpMatrix<Real> B(A);
Vector<Real> C(dim);
C.SetRandn();
CuVector<Real> D(C);
Real alpha = RandGauss();
A.AddVec2(alpha, C);
B.AddVec2(alpha, D);
SpMatrix<Real> E(dim);
B.CopyToSp(&E);
AssertEqual(A, E);
}
}
template<typename Real>
static void UnitTestCuSpMatrixAddMat2() {
for (MatrixIndexT i = 1; i < 10; i++) {
MatrixIndexT dim_row = 15 * i + rand() % 10;
MatrixIndexT dim_col = 7 *i + rand() % 10;
Matrix<Real> A(dim_row, dim_col);
A.SetRandn();
CuMatrix<Real> B(A);
SpMatrix<Real> C(dim_col);
C.SetRandn();
CuSpMatrix<Real> D(C);
const Real alpha = 2.0;
const Real beta = 3.0;
C.AddMat2(alpha, A, kTrans, beta);
D.AddMat2(alpha, B, kTrans, beta);
SpMatrix<Real> E(dim_col);
D.CopyToSp(&E);
AssertEqual(C, E);
}
}
template<typename Real>
static void UnitTestCuSpMatrixAddSp() {
for (MatrixIndexT i = 1; i < 50; i++) {
MatrixIndexT dim = 7 * i + rand() % 10;
SpMatrix<Real> A(dim);
A.SetRandn();
CuSpMatrix<Real> B(A);
SpMatrix<Real> C(dim);
C.SetRandn();
const CuSpMatrix<Real> D(C);
const Real alpha = 2.0;
A.AddSp(alpha, C);
B.AddSp(alpha, D);
SpMatrix<Real> E(dim);
B.CopyToSp(&E);
AssertEqual(A, E);
}
}
template<typename Real, typename OtherReal>
static void UnitTestCuSpMatrixTraceSpSp() {
for (MatrixIndexT i = 1; i < 2; i++) {
MatrixIndexT dim = 100 + rand() % 255;
SpMatrix<Real> A(dim);
A.SetRandn();
const CuSpMatrix<Real> B(A);
SpMatrix<OtherReal> C(dim);
C.SetRandn();
const CuSpMatrix<OtherReal> D(C);
Real t1 = TraceSpSp(A, C), t2 = TraceSpSp(B, D);
KALDI_ASSERT(ApproxEqual(t1, t2));
}
}
// Checks three ways of producing a unit CuSpMatrix -- SetUnit(), SetZero()
// followed by SetDiag(1.0), and SetZero() followed by two AddToDiag() calls
// summing to 1 -- against a CPU SpMatrix set with SetUnit().  Every fifth
// iteration uses dimension 0.
template<typename Real>
void UnitTestCuSpMatrixSetUnit() {
  for (MatrixIndexT i = 1; i < 10; i++) {
    MatrixIndexT dim = 100 * i + rand() % 255;
    if (i % 5 == 0)
      dim = 0;  // also cover the empty matrix

    CuSpMatrix<Real> S1(dim), S2(dim), S4(dim);
    S1.SetRandn();
    S2.SetRandn();
    S4.SetRandn();

    // CPU reference.
    SpMatrix<Real> S3(dim);
    S3.SetUnit();

    // Variant 1: direct.
    S1.SetUnit();
    // Variant 2: zero, then set the diagonal.
    S2.SetZero();
    S2.SetDiag(1.0);
    // Variant 3: zero, then add to the diagonal in two steps.
    S4.SetZero();
    S4.AddToDiag(0.4);
    S4.AddToDiag(0.6);

    CuSpMatrix<Real> cu_S3(S3);
    KALDI_LOG << "S1 norm is " << S1.FrobeniusNorm();
    KALDI_LOG << "S2 norm is " << S2.FrobeniusNorm();
    KALDI_LOG << "S3 norm is " << S3.FrobeniusNorm();
    AssertEqual(S1, cu_S3);
    AssertEqual(S2, cu_S3);
    AssertEqual(S4, cu_S3);
  }
}
template<class Real>
static void UnitTestCuSpMatrixIO() {
for (int32 i = 0; i < 10; i++) {
int32 dimM = rand() % 255;
if (i % 5 == 0) { dimM = 0; }
CuSpMatrix<Real> mat(dimM);
mat.SetRandn();
std::ostringstream os;
bool binary = (i % 4 < 2);
mat.Write(os, binary);
CuSpMatrix<Real> mat2;
std::istringstream is(os.str());
mat2.Read(is, binary);
AssertEqual(mat, mat2);
}
}
// Mixed-precision variant of the AddSp test: adds a matrix of element type
// OtherReal to one of element type Real, on the CPU (A += C) and on the
// CuSpMatrix copies (B += D), then compares A with B.
// NOTE(review): neither CudaSpMatrixUnitTest driver in this file calls this
// two-type overload, so the template is never instantiated here.  B is
// declared const yet AddSp() is called on it, and AssertEqual() is given an
// SpMatrix and a CuSpMatrix; confirm these compile before enabling the test.
template<typename Real, typename OtherReal>
static void UnitTestCuSpMatrixAddSp() {
for (MatrixIndexT i = 1; i < 10; i++) {
MatrixIndexT dim = 100 * i + rand() % 255;
SpMatrix<Real> A(dim);
A.SetRandn();
const CuSpMatrix<Real> B(A);
SpMatrix<OtherReal> C(dim);
C.SetRandn();
const CuSpMatrix<OtherReal> D(C);
A.AddSp(1.0, C);
B.AddSp(1.0, D);
AssertEqual(A, B);
}
}
// Runs all single-precision-type CuSpMatrix unit tests of this file for the
// element type Real.  The tests draw from the global random number
// generators, so the order of the calls determines the data each one sees.
template<typename Real> void CudaSpMatrixUnitTest() {
UnitTestCuSpMatrixIO<Real>();
UnitTestCuSpMatrixConstructor<Real>();
UnitTestCuSpMatrixOperator<Real>();
UnitTestCuSpMatrixApproxEqual<Real>();
UnitTestCuSpMatrixInvert<Real>();
// Approximate inversion is tried at three different dimensions.
UnitTestCuSpMatrixApproxInvert<Real>(300);
UnitTestCuSpMatrixApproxInvert<Real>(100);
UnitTestCuSpMatrixApproxInvert<Real>(10);
UnitTestCuSpMatrixCopyFromMat<Real>();
UnitTestCuSpMatrixAddVec2<Real>();
UnitTestCuSpMatrixAddMat2<Real>();
UnitTestCuSpMatrixAddSp<Real>();
UnitTestCuSpMatrixAddToDiag<Real>();
UnitTestCuSpMatrixSetUnit<Real>();
}
// Runs the unit tests that involve two element types (Real and OtherReal);
// currently that is only the TraceSpSp test.
template<typename Real, typename OtherReal> void CudaSpMatrixUnitTest() {
UnitTestCuSpMatrixTraceSpSp<Real, OtherReal>();
}
} // namespace kaldi
// Entry point of the CuSpMatrix unit tests.  The whole suite is run twice:
// on CUDA builds the first pass selects no GPU and the second pass asks for
// one; on non-CUDA builds both passes run on the CPU.  Double-precision
// tests are skipped on CUDA builds when the device reports no
// double-precision support.
int main() {
  using namespace kaldi;

  for (int32 loop = 0; loop < 2; loop++) {
#if HAVE_CUDA == 1
    if (loop == 0)
      CuDevice::Instantiate().SelectGpuId("no");   // run without a GPU
    else
      CuDevice::Instantiate().SelectGpuId("yes");  // ask for a GPU
#endif

    kaldi::CudaSpMatrixUnitTest<float>();
    kaldi::CudaSpMatrixUnitTest<float, float>();
#if HAVE_CUDA == 1
    if (CuDevice::Instantiate().DoublePrecisionSupported()) {
      kaldi::CudaSpMatrixUnitTest<double>();
      kaldi::CudaSpMatrixUnitTest<float, double>();
      kaldi::CudaSpMatrixUnitTest<double, float>();
      kaldi::CudaSpMatrixUnitTest<double, double>();
    } else {
      KALDI_WARN << "Double precision not supported";
    }
#else
    // No CUDA: everything runs on the CPU, so run the same set of
    // double-precision tests as the CUDA branch does, including the
    // single-type suite.
    kaldi::CudaSpMatrixUnitTest<double>();
    kaldi::CudaSpMatrixUnitTest<float, double>();
    kaldi::CudaSpMatrixUnitTest<double, float>();
    kaldi::CudaSpMatrixUnitTest<double, double>();
#endif

    if (loop == 0)
      KALDI_LOG << "Tests without GPU use succeeded.\n";
    else
      KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
  }
#if HAVE_CUDA == 1
  CuDevice::Instantiate().PrintProfile();
#endif
  return 0;
}

Просмотреть файл

@ -0,0 +1,361 @@
#if HAVE_CUDA == 1
#include <cuda_runtime_api.h>
#include <cublas.h>
#endif
#include "util/timer.h"
#include "cudamatrix/cu-common.h"
#include "cudamatrix/cu-vector.h"
#include "cudamatrix/cu-device.h"
#include "cudamatrix/cu-kernels.h"
#include "cudamatrix/cu-math.h"
#include "cudamatrix/cu-sp-matrix.h"
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cublas-wrappers.h"
namespace kaldi {
/// Sets *this from the square matrix M, which must have the same dimension
/// as *this.  copy_type selects which part of M is used: kTakeLower,
/// kTakeUpper or kTakeMean.  kTakeMeanAndCheck is not supported on the GPU
/// and any other value is reported as an error there; without the GPU the
/// call is forwarded to the CPU SpMatrix::CopyFromMat().
template<typename Real>
void CuSpMatrix<Real>::CopyFromMat(const CuMatrixBase<Real> &M,
                                   SpCopyType copy_type) {
  KALDI_ASSERT(this->num_rows_ == M.NumRows() &&
               this->num_rows_ == M.NumCols());
  if (this->num_rows_ == 0)
    return;
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    MatrixIndexT D = this->NumRows();  // nonzero: checked above.
    switch (copy_type) {
      case kTakeMeanAndCheck:
        // NOTE(review): relies on KALDI_ERR not returning (presumably it
        // throws); otherwise control would fall through into kTakeMean.
        KALDI_ERR << "kTakeMeanAndCheck not supported!";
      // The grid/block dimensions have been very roughly tuned for the
      // individual cases.
      case kTakeMean:
        {
          dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
          dim3 dimGrid(n_blocks(D, CU2DBLOCK), n_blocks(D, CU2DBLOCK));
          cuda_take_mean(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
          CU_SAFE_CALL(cudaGetLastError());
        }
        break;
      case kTakeLower:
        {
          dim3 dimBlock(1, CU1DBLOCK);
          dim3 dimGrid(D, n_blocks(D, CU1DBLOCK));
          cuda_take_lower(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
          CU_SAFE_CALL(cudaGetLastError());
          cudaThreadSynchronize();
        }
        break;
      case kTakeUpper:
        {
          dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
          dim3 dimGrid(n_blocks(D, CU2DBLOCK), n_blocks(D, CU2DBLOCK));
          cuda_take_upper(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
          CU_SAFE_CALL(cudaGetLastError());
        }
        break;
      default:
        // An assertion on a string literal can never fail, so report the
        // invalid copy type as an error instead.
        KALDI_ERR << "Invalid argument to CuSpMatrix::CopyFromMat";
    }
    CuDevice::Instantiate().AccuProfile("CuSpMatrix::CopyFromMat(from CuMatrixBase)", tim.Elapsed());
  } else
#endif
  {
    Mat().CopyFromMat(M.Mat(), copy_type);
  }
}
/// Inverts *this in place.  On the GPU the matrix is expanded to a full
/// square CuMatrix, inverted there with SymInvertPosDef() and copied back;
/// without the GPU the CPU SpMatrix::Invert() is used.
/// NOTE(review): judging by its name, SymInvertPosDef presumably requires a
/// positive definite input, which would make the GPU path stricter than the
/// CPU path -- confirm.
template<typename Real>
void CuSpMatrix<Real>::Invert() {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    const MatrixIndexT dim = this->num_rows_;
    CuMatrix<Real> full(dim, dim);
    full.CopyFromSp(*this);
    full.SymInvertPosDef();
    this->CopyFromMat(full);
  } else
#endif
  {
    // No GPU: fall back on the inversion of the CPU-based SpMatrix.
    Mat().Invert();
  }
}
/// Replaces *this by an approximation X of its inverse, built mostly out of
/// matrix multiplications.  The matrix is first scaled so that its trace
/// equals its dimension.  A cubic polynomial in the scaled matrix A is then
/// fitted using traces of powers of A; if the squared Frobenius error
/// ||AX - I||^2 of that fit is <= max_error the polynomial is returned,
/// otherwise X is refined with up to 10 iterations of X <- X (2I - AX)
/// until the error is <= max_error.  If that does not happen within the
/// iteration limit an error is raised, because at that point *this only
/// holds intermediate data.  An empty matrix is left untouched.
template<typename Real>
void CuSpMatrix<Real>::InvertPosDefApprox(BaseFloat max_error) {
  if (this->num_rows_ == 0) return;
  MatrixIndexT dim = this->num_rows_;
  // temp holds five dim x dim blocks stacked vertically: initially A, A^2,
  // A^3, A^4 and one spare block that is later used for X.
  CuMatrix<Real> temp(dim * 5, dim);
  CuSubMatrix<Real> A(temp, 0, dim, 0, dim),
      AA(temp, dim, dim, 0, dim),
      AAA(temp, 2 * dim, dim, 0, dim),
      AAAA(temp, 3 * dim, dim, 0, dim);
  Real prescale = dim / this->Trace();
  this->Scale(prescale); // We'll compute the inverse of the prescaled A, and then
                         // put that factor back later.  This is useful since we
                         // deal with high powers of A that could get large or small.
  A.CopyFromSp(*this);
  // use *this as a temporary SpMatrix; we've stored its contents in "A".
  this->AddMat2(1.0, A, kNoTrans, 0.0);
  AA.CopyFromSp(*this);
  { // now create AAA and AAAA using a single multiplication.
    CuSubMatrix<Real> A_and_AA(temp, 0, dim * 2, 0, dim),
        AAA_and_AAAA(temp, dim * 2, dim * 2, 0, dim);
    // Note: below, the transpose-ness of AA is arbitrary since it's symmetric;
    // I guess that transposed may be faster.
    AAA_and_AAAA.AddMatMat(1.0, A_and_AA, kNoTrans, AA, kTrans, 0.0);
  }

  // Note: below, trace_A equals dim because of the prescaling, we
  // ensured that.
  Vector<double> trace(8); // trace(i) is trace(A^(i+1))
  trace(0) = dim;
  {
    CuVector<Real> trace_vec(dim * 5);
    CuSubVector<Real> trace_lower4(trace_vec, 0, dim * 4),
        trace_lower3(trace_vec, 0, dim * 3),
        trace1(trace_vec, 0, dim), trace2(trace_vec, dim, dim),
        trace3(trace_vec, dim * 2, dim), trace4(trace_vec, dim * 3, dim),
        ones(trace_vec, dim * 4, dim);
    trace_lower4.AddDiagMat2(1.0, temp.Range(0, dim * 4, 0, dim),
                             kNoTrans, 0.0);
    ones.Set(1.0);
    // TODO: can make these vecvec's faster as fake matrix multiplies.
    trace(1) = VecVec(trace1, ones);
    trace(3) = VecVec(trace2, ones);
    trace(5) = VecVec(trace3, ones);
    trace(7) = VecVec(trace4, ones);

    // Now we want to get odd-numbered trace quantities, so multiply the
    // rows of A through AAA with the rows of AA through AAAA.
    CuSubMatrix<Real> lower_three(temp, 0, dim * 3, 0, dim),
        upper_three(temp, dim, dim * 3, 0, dim);
    trace_lower3.AddDiagMatMat(1.0, lower_three, kNoTrans, upper_three, kTrans, 0.0);
    trace(2) = VecVec(trace1, ones);
    trace(4) = VecVec(trace2, ones);
    trace(6) = VecVec(trace3, ones);
  }
  { // Check the trace values against directly computed powers of A.
    CuMatrix<Real> power(A);
    for (int32 i = 0; i < 8; i++) {
      double this_trace = power.Trace();
      AssertEqual(this_trace, trace(i));
      CuMatrix<Real> temp_power(power);
      power.AddMatMat(1.0, temp_power, kNoTrans, A, kNoTrans, 0.0);
    }
  }

  // We'll use a and B to get the coefficients.  These operations are in very
  // tiny dimensions -> faster and more convenient to use CPU.
  SubVector<double> a(trace, 0, 4);
  SpMatrix<double> B(4);
  for (int32 r = 0; r < 4; r++)
    for (int32 c = 0; c <= r; c++)
      B(r, c) = trace(r + c + 1);

  // v = B^{-1} a, with B^{-1} obtained through the Cholesky factor of B.
  TpMatrix<double> C(4);
  C.Cholesky(B);
  C.Invert();
  SpMatrix<double> Binv(4);
  Binv.AddTp2(1.0, C, kTrans, 0.0);
  Vector<double> v(4);
  v.AddSpVec(1.0, Binv, a, 0.0);
  Real av = VecVec(a, v), vBv = VecSpVec(v, B, v),
      error = (vBv + dim) - 2.0 * av;
  KALDI_ASSERT(error >= 0.0); // note: error is a squared Frobenius
                              // norm.
  KALDI_VLOG(5) << "a is " << a << ", B is " << B;
  KALDI_VLOG(5) << "Dim is " << dim << ", error norm is " << sqrt(error);

  if (error <= max_error) {
    // It's sufficient to return with the approximation up to A^3.
    A.Scale(v(1));
    A.AddToDiag(v(0));
    A.AddMat(v(2), AA);
    A.AddMat(v(3), AAA);
    this->CopyFromMat(A, kTakeLower);
    this->Scale(prescale);
    return;
  } else {
    // Let X be the approximate inverse of A: X = v(0) I + v(1) A + v(2) A^2 + v(3) A^3.
    // Let AX be A times X: AX = v(0) A + v(1) A^2 + v(2) A^3 + v(3) A^4.
    // We can construct both X and AX out of quantities we've already computed.
    CuSubMatrix<Real> X(temp, dim * 4, dim, 0, dim),
        AX(temp, dim * 3, dim, 0, dim);
    AX.Scale(v(3)); // AX re-uses memory of AAAA: scale that.
    AX.AddMat(v(2), AAA);
    AX.AddMat(v(1), AA);
    AX.AddMat(v(0), A);
    X.AddMat(v(3), AAA); // X was zero before; space never used.
    X.AddMat(v(2), AA);
    X.AddMat(v(1), A);
    X.AddToDiag(v(0));

    int32 num_iters = 10;
    for (int32 i = 0; i < num_iters; i++) {
      CuSubMatrix<Real> AX_and_X(temp, dim * 3, dim * 2, 0, dim),
          AAXX_and_AXX(temp, dim, dim * 2, 0, dim);  // Note: in our variable-naming
      // conventions we put the A's first; since all quantities commute it doesn't
      // matter which order we put them in.  Note: the transpose of AX below is
      // arbitrary (it's symmetric); I think it might be more efficient.
      AAXX_and_AXX.AddMatMat(1.0, AX_and_X, kNoTrans, AX, kTrans, 0.0);
      // The iteration now is X' <--- X (2I - AX).  This is the iteration of
      // Schulz/Hotelling/whatever.  To get the objf (and for the next iteration)
      // we also want AX'.  Use X' <-- 2X - AXX, and AX' <-- 2AX - AAXX.
      // They go in the same place as before.  For now on, forget about the dash
      // on the X, we'll just call it X.
      AX_and_X.Scale(2.0);
      AX_and_X.AddMat(-1.0, AAXX_and_AXX);

      // The squared error is ||AX - I||^2 = tr((AX - I)(AX - I)) = tr(AX^T AX) + dim - 2 tr(AX)
      Real tr_axax = TraceMatMat(AX, AX, kTrans), tr_ax = AX.Trace();
      error = tr_axax + dim - 2 * tr_ax;
      KALDI_VLOG(5) << "Better-inverse error is "
                    << sqrt(error);
      if (error <= max_error) {
        this->CopyFromMat(X, kTakeLower);
        this->Scale(prescale);
        return;
      }
    }
    // Not converged.  *this still holds the scratch value written by the
    // AddMat2() call near the top, so returning silently would hand garbage
    // to the caller.  (An assertion on a string literal, as used here
    // before, can never fail.)
    KALDI_ERR << "Error: max iters reached.";
  }
}
/// Rank-one update of *this using the vector v scaled by alpha (cublas_spr
/// on the GPU, SpMatrix::AddVec2() on the CPU).  The dimension of v must
/// equal the number of rows of *this.
template<typename Real>
void CuSpMatrix<Real>::AddVec2(const Real alpha, const CuVectorBase<Real> &v) {
  KALDI_ASSERT(v.Dim() == this->NumRows());
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    // The update is a single CUBLAS call on the packed data, so no kernel
    // grid/block dimensions are needed here.
    cublas_spr('U', this->num_rows_, alpha, v.Data(),
               1, this->Data());
    CU_SAFE_CALL(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile("CuSpMatrix::AddVec2", tim.Elapsed());
  } else
#endif
  {
    Mat().AddVec2(alpha, v.Vec());
  }
}
/// Symmetric rank-k update of *this from the matrix M, with weight alpha on
/// the product involving M and weight beta on the existing contents of
/// *this.  With transM == kNoTrans the number of rows of M must equal the
/// dimension of *this; with kTrans the number of columns must.  On the GPU
/// the update is done with cublas_syrk on a full-matrix copy of *this, whose
/// lower triangle is then copied back.
template<typename Real>
void CuSpMatrix<Real>::AddMat2(const Real alpha, const CuMatrixBase<Real> &M,
                               MatrixTransposeType transM, const Real beta) {
  KALDI_ASSERT((transM == kNoTrans && this->NumRows() == M.NumRows())
               || (transM == kTrans && this->NumRows() == M.NumCols()));

#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    const MatrixIndexT out_dim = this->NumRows();
    MatrixIndexT inner_dim;
    if (transM == kNoTrans)
      inner_dim = M.NumCols();
    else
      inner_dim = M.NumRows();

    if (out_dim == 0)
      return;
    if (alpha == 0.0) {
      // Nothing is added from M; only the scaling by beta remains.
      if (beta != 1.0)
        this->Scale(beta);
      return;
    }

    // Note the flip: kTrans maps to 'N' and kNoTrans to 'T'.
    char trans;
    if (transM == kTrans)
      trans = 'N';
    else
      trans = 'T';

    CuMatrix<Real> full(*this);
    cublas_syrk('U', trans, out_dim, inner_dim, alpha, M.Data(),
                M.Stride(), beta, full.Data(), full.Stride());
    this->CopyFromMat(full, kTakeLower);

    CuDevice::Instantiate().AccuProfile("CuSpMatrix::AddMat2", tim.Elapsed());
  } else
#endif
  {
    Mat().AddMat2(alpha, M.Mat(), transM, beta);
  }
}
/**
 * Returns tr(A B) for two packed symmetric matrices of equal dimension
 * (the dimensions are asserted to match).
 *
 * On the GPU the trace is computed from dot products over the packed data:
 * the packed arrays hold each off-diagonal element once, so the full dot
 * product is doubled and the diagonal contribution, which would otherwise be
 * counted twice, is subtracted.  Without a GPU the host TraceSpSp is used.
 */
template<typename Real, typename OtherReal>
Real TraceSpSp(const CuSpMatrix<Real> &A, const CuSpMatrix<OtherReal> &B) {
KALDI_ASSERT(A.NumRows() == B.NumRows());
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
// size is the number of elements in the packed representation.
MatrixIndexT nr = A.NumRows(), size = nr * (nr+1) / 2;
CuVector<Real> Adiag(nr, kUndefined);
CuVector<OtherReal> Bdiag(nr, kUndefined);
Adiag.CopyDiagFromPacked(A);
Bdiag.CopyDiagFromPacked(B);
// View the packed data of A and B as plain vectors of length size.
CuSubVector<Real> Aall(A.Data(), size);
CuSubVector<OtherReal> Ball(B.Data(), size);
// Below, we subtract VecVec(Adiag, Bdiag) to remove double-counting
// on the diagonal.
return 2.0 * VecVec(Aall, Ball) - VecVec(Adiag, Bdiag);
} else
#endif
{
return TraceSpSp(A.Mat(), B.Mat());
}
}
// Explicit instantiations of TraceSpSp for every float/double combination
// of the two argument types; the return type follows the first argument.
template
float TraceSpSp(const CuSpMatrix<float> &A, const CuSpMatrix<float> &B);
template
float TraceSpSp(const CuSpMatrix<float> &A, const CuSpMatrix<double> &B);
template
double TraceSpSp(const CuSpMatrix<double> &A, const CuSpMatrix<float> &B);
template
double TraceSpSp(const CuSpMatrix<double> &A, const CuSpMatrix<double> &B);
/// Returns true if the Frobenius norm of (*this - B) is at most tol times
/// the larger of the Frobenius norms of *this and B.
/// The two matrices must have the same dimension (asserted).
template<typename Real>
bool CuSpMatrix<Real>::ApproxEqual(const CuSpMatrix<Real> &B, Real tol) const {
  KALDI_ASSERT(this->NumRows() == B.NumRows());
  // delta = *this - B
  CuSpMatrix<Real> delta(*this);
  delta.AddSp(-1.0, B);
  const Real norm_this = this->FrobeniusNorm();
  const Real norm_other = B.FrobeniusNorm();
  const Real norm_delta = delta.FrobeniusNorm();
  const Real reference = std::max(norm_this, norm_other);
  return norm_delta <= tol * reference;
}
/// Tests whether this matrix is close to the identity: returns true when
/// trace(*this * *this) + NumRows() - 2 * Trace() <= tol * NumRows().
template<typename Real>
bool CuSpMatrix<Real>::IsUnit(Real tol) const {
// want to return:
//FrobeniusNorm(*this - I) <= tol * NumRows(), i.e.:
//sqrt (trace((*this - I)(*this-I)) <= tol * NumRows()
// trace((*this - I)(*this - I)) <= tol * NumRows()
// trace(*this * *this) + trace(I) - 2 * trace(*this) <= tol * NumRows()
// trace(*this * *this) + dim - 2*this.Trace() <= tol * NumRows()
// NOTE(review): the step above that drops the sqrt does not square the
// right-hand side, so what is actually tested is the *squared* Frobenius
// norm of (*this - I) against tol * NumRows().  Confirm this is the intended
// definition before relying on tol as a bound on the norm itself.
// Note: we could do this more efficiently still, by slightly changing the
// definition of IsUnit and getting rid of the extra stuff inside TraceSpSp
// that corrects for the diagonal being counted twice.
return (TraceSpSp(*this, *this) + this->NumRows() - 2.0 * this->Trace() <=
tol * this->NumRows());
}
// Explicit instantiations of the class for the two supported element types.
template class CuSpMatrix<float>;
template class CuSpMatrix<double>;
} // namespace

Просмотреть файл

@ -0,0 +1,146 @@
#ifndef KALDI_CUDAMATRIX_CU_SP_MATRIX_H_
#define KALDI_CUDAMATRIX_CU_SP_MATRIX_H_
#include <sstream>
#include "cudamatrix/cu-common.h"
#include "matrix/matrix-common.h"
#include "matrix/sp-matrix.h"
#include "cudamatrix/cu-array.h"
#include "cudamatrix/cu-math.h"
#include "cudamatrix/cu-packed-matrix.h"
#include "cudamatrix/cu-matrix.h"
namespace kaldi {
/// TraceSpSp returns tr(A B).  The two matrices may use different element
/// types; the result has the element type of the first argument.
template<typename Real, typename OtherReal>
Real TraceSpSp(const CuSpMatrix<Real> &A, const CuSpMatrix<OtherReal> &B);
/// Symmetric matrix in packed storage, derived from CuPackedMatrix<Real>.
/// Element (r, c) with c <= r lives at offset r*(r+1)/2 + c of the packed
/// data (see operator()).
template<typename Real>
class CuSpMatrix : public CuPackedMatrix<Real> {
friend class CuMatrixBase<Real>;
friend class CuVectorBase<Real>;
friend class CuTpMatrix<Real>;
friend class CuSubMatrix<Real>;
friend class CuRand<Real>;
template<class R, class S>
friend R TraceSpSp(const CuSpMatrix<R> &A, const CuSpMatrix<S> &B);
public:
/// Creates an empty matrix.
CuSpMatrix(): CuPackedMatrix<Real>() {}
/// Creates an r x r matrix; resize_type selects how the storage is
/// initialised (zeroed by default).
explicit CuSpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero)
: CuPackedMatrix<Real>(r, resize_type) {}
/// Constructs from a host SpMatrix.
explicit CuSpMatrix(const SpMatrix<Real> &orig)
: CuPackedMatrix<Real>(orig) {}
/// Copy constructor.
/// NOTE(review): being `explicit`, it cannot be used for copy-initialisation
/// (e.g. `CuSpMatrix<Real> a = b;`) -- confirm that is intended.
explicit CuSpMatrix(const CuSpMatrix<Real> &orig)
: CuPackedMatrix<Real>(orig) {}
/// Constructs from a general matrix; copy_type selects which part of orig is
/// used (see CopyFromMat).  The dimension is taken from orig.NumRows().
explicit CuSpMatrix(const CuMatrixBase<Real> &orig,
SpCopyType copy_type = kTakeLower)
: CuPackedMatrix<Real>(orig.NumRows(), kUndefined) {
CopyFromMat(orig, copy_type);
}
~CuSpMatrix() {}
/// Resizes to nRows x nRows.
inline void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero) {
CuPackedMatrix<Real>::Resize(nRows, resize_type);
}
/// Frobenius norm, computed as sqrt(tr(*this * *this)).
Real FrobeniusNorm() const { return sqrt(TraceSpSp(*this, *this)); }
/// Tests closeness to the identity matrix within tolerance tol.
bool IsUnit(Real tol = 0.001) const;
/// Tests closeness to another matrix of the same dimension within tol.
bool ApproxEqual(const CuSpMatrix<Real> &other, Real tol = 0.001) const;
/// Copies from another CuSpMatrix.
void CopyFromSp(const CuSpMatrix<Real> &other) {
CuPackedMatrix<Real>::CopyFromPacked(other);
}
/// Copies from a host SpMatrix.
void CopyFromSp(const SpMatrix<Real> &other) {
CuPackedMatrix<Real>::CopyFromPacked(other);
}
/// Copies from a general matrix, using the part selected by copy_type.
void CopyFromMat(const CuMatrixBase<Real> &orig,
SpCopyType copy_type = kTakeLower);
/// Copies the contents into the host matrix *dst.
void CopyToSp(SpMatrix<Real> *dst) const { //added const by hxu
CuPackedMatrix<Real>::CopyToPacked(dst);
}
/// Element access returning a CuValue proxy.  The indices are swapped if
/// necessary so that c <= r, then r is range-checked.
inline CuValue<Real> operator() (MatrixIndexT r, MatrixIndexT c) {
if (static_cast<UnsignedMatrixIndexT>(c) >
static_cast<UnsignedMatrixIndexT>(r))
std::swap(c, r);
// The unsigned comparison also rejects negative indices.
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
static_cast<UnsignedMatrixIndexT>(this->num_rows_));
return CuValue<Real>(this->data_ + (r * (r+1)) / 2 + c);
}
/// Read-only element access; same indexing rules as the non-const overload.
inline Real operator() (MatrixIndexT r, MatrixIndexT c) const {
if (static_cast<UnsignedMatrixIndexT>(c) >
static_cast<UnsignedMatrixIndexT>(r))
std::swap(c, r);
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
static_cast<UnsignedMatrixIndexT>(this->num_rows_));
return CuValue<Real>(this->data_ + (r * (r+1)) / 2 + c); // will be
// casted to Real.
}
/// Approximate inversion of positive definite matrices, using repeated
/// multiplication. Limits the error by ensuring that
/// || I - A Ainv ||^2 <= max_error, using Frobenius norm (so guarantees
/// that (I - A Ainv).IsUnit(max_error) == true).
void InvertPosDefApprox(BaseFloat max_error = 0.1);
/// Note: the CuMatrix version of the Invert() function will only work for
/// positive definite matrices; it is based on Cholesky.
void Invert();
/// Adds alpha times the outer product of v with itself.
void AddVec2(const Real alpha, const CuVectorBase<Real> &v);
/// Scales this matrix by beta and adds alpha times the product of M with its
/// transpose; transM selects which factor is transposed.
void AddMat2(const Real alpha, const CuMatrixBase<Real> &M,
MatrixTransposeType transM, const Real beta);
/// Adds alpha * Ma to this matrix.
void AddSp(const Real alpha, const CuSpMatrix<Real> &Ma) {
this->AddPacked(alpha, Ma);
}
protected:
/// Views this object as a host SpMatrix.
/// NOTE(review): this reinterpret_cast presumably relies on CuSpMatrix and
/// SpMatrix sharing a compatible layout -- confirm against both classes.
inline const SpMatrix<Real> &Mat() const {
return *(reinterpret_cast<const SpMatrix<Real>* >(this));
}
inline SpMatrix<Real> &Mat() {
return *(reinterpret_cast<SpMatrix<Real>* >(this));
}
};
/// Free-function form of CuSpMatrix::ApproxEqual: true if A and B are equal
/// within the relative tolerance tol.
template<typename Real>
inline bool ApproxEqual(const CuSpMatrix<Real> &A,
                        const CuSpMatrix<Real> &B, Real tol = 0.001) {
  const bool close_enough = A.ApproxEqual(B, tol);
  return close_enough;
}
/// Asserts (via KALDI_ASSERT) that A and B are approximately equal within the
/// relative tolerance tol, as defined by ApproxEqual.
template<typename Real>
inline void AssertEqual(const CuSpMatrix<Real> &A,
const CuSpMatrix<Real> &B, Real tol = 0.001) {
KALDI_ASSERT(ApproxEqual(A, B, tol));
}
/// Constructs a host SpMatrix from a CuSpMatrix: sizes this matrix to match
/// cu, then copies cu's contents into it.
template<typename Real>
SpMatrix<Real>::SpMatrix(const CuSpMatrix<Real> &cu) {
  const MatrixIndexT dim = cu.NumRows();
  this->Resize(dim);
  cu.CopyToSp(this);
}
} // namespace
#endif

Просмотреть файл

@ -1,213 +0,0 @@
// cudamatrix/cu-stlvector-inl.h
// Copyright 2009-2012 Karel Vesely
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_CUDAMATRIX_CU_STLVECTOR_INL_H_
#define KALDI_CUDAMATRIX_CU_STLVECTOR_INL_H_
#if HAVE_CUDA==1
#include <cuda_runtime_api.h>
#include "cudamatrix/cu-common.h"
#include "cudamatrix/cu-device.h"
#include "cudamatrix/cu-kernels.h"
#endif
#include "util/timer.h"
namespace kaldi {
/// Returns a read-only raw pointer to the storage: the GPU buffer when the
/// CuDevice is enabled, otherwise the first element of the host vector
/// (NULL if the host vector is empty).
template<typename IntType>
const IntType* CuStlVector<IntType>::Data() const {
#if HAVE_CUDA==1
  if (CuDevice::Instantiate().Enabled()) {
    return data_;
  } else
#endif
  {
    // front() is undefined on an empty vector, so return NULL in that case.
    return vec_.empty() ? NULL : &vec_.front();
  }
}
/// Returns a mutable raw pointer to the storage: the GPU buffer when the
/// CuDevice is enabled, otherwise the first element of the host vector
/// (NULL if the host vector is empty).
template<typename IntType>
IntType* CuStlVector<IntType>::Data() {
#if HAVE_CUDA==1
  if (CuDevice::Instantiate().Enabled()) {
    return data_;
  } else
#endif
  {
    // front() is undefined on an empty vector, so return NULL in that case.
    return vec_.empty() ? NULL : &vec_.front();
  }
}
/// Resizes the container to dim elements and returns *this.
/// If the size is unchanged the existing contents are left untouched;
/// otherwise the old storage is released, new storage is allocated and
/// zeroed.
template<typename IntType>
CuStlVector<IntType>& CuStlVector<IntType>::Resize(MatrixIndexT dim) {
  if (dim != dim_) {
    Destroy();
#if HAVE_CUDA==1
    if (CuDevice::Instantiate().Enabled()) {
      cuSafeCall(cudaMalloc((void**)&data_, dim*sizeof(IntType)));
    } else
#endif
    {
      vec_.resize(dim);
    }
    dim_ = dim;
    SetZero();
  }
  return *this;
}
/// Releases the storage and resets the dimension to zero.
template<typename IntType>
void CuStlVector<IntType>::Destroy() {
#if HAVE_CUDA==1
if (CuDevice::Instantiate().Enabled()) {
// Free the GPU buffer only if one is held, and clear the pointer so a
// second call does not free it again.
if (NULL != data_) {
cuSafeCall(cudaFree(data_));
data_ = NULL;
}
} else
#endif
{
vec_.resize(0);
}
dim_ = 0;
}
/// Copies the host vector src into this container, resizing it to
/// src.size() first.  Returns *this.
template<typename IntType>
CuStlVector<IntType>& CuStlVector<IntType>::CopyFromVec(const std::vector<IntType> &src) {
  Resize(src.size());
  // front() is undefined on an empty vector; once the container has been
  // resized to zero there is nothing left to copy.
  if (src.empty()) return *this;
#if HAVE_CUDA==1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    cuSafeCall(cudaMemcpy(data_, &src.front(), src.size()*sizeof(IntType), cudaMemcpyHostToDevice));
    CuDevice::Instantiate().AccuProfile("CuStlVector::CopyFromVecH2D",tim.Elapsed());
  } else
#endif
  {
    memcpy(&vec_.front(), &src.front(), src.size()*sizeof(IntType));
  }
  return *this;
}
/// Copies the contents of this container into the host vector *dst,
/// resizing *dst to Dim() elements if its size differs.
template<typename IntType>
void CuStlVector<IntType>::CopyToVec(std::vector<IntType> *dst) const {
  if (static_cast<MatrixIndexT>(dst->size()) != dim_) {
    dst->resize(dim_);
  }
  // front() is undefined on an empty vector; with zero elements there is
  // nothing to copy.
  if (dim_ == 0) return;
#if HAVE_CUDA==1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    cuSafeCall(cudaMemcpy(&dst->front(), Data(), dim_*sizeof(IntType), cudaMemcpyDeviceToHost));
    CuDevice::Instantiate().AccuProfile("CuStlVector::CopyToVecD2H",tim.Elapsed());
  } else
#endif
  {
    memcpy(&dst->front(), &vec_.front(), dim_*sizeof(IntType));
  }
}
/// Sets all dim_ elements to zero.
template<typename IntType>
void CuStlVector<IntType>::SetZero() {
#if HAVE_CUDA==1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
// Byte-wise zero fill of the GPU buffer.
cuSafeCall(cudaMemset(data_, 0, dim_*sizeof(IntType)));
CuDevice::Instantiate().AccuProfile("CuStlVector::SetZero",tim.Elapsed());
} else
#endif
{
// assign() also forces the host vector to hold exactly dim_ elements.
vec_.assign(dim_, 0);
}
}
/**
 * Print the vector to stream.
 *
 * The contents are first copied to a host std::vector, then written as
 * "[ e0 e1 ... ]" followed by a newline.  Returns the stream.
 */
template<typename IntType>
std::ostream &operator << (std::ostream &out, const CuStlVector<IntType> &vec) {
  std::vector<IntType> tmp;
  vec.CopyToVec(&tmp);
  out << "[";
  // Index with size_t to match tmp.size() and avoid a signed/unsigned
  // comparison.
  for (size_t i = 0; i < tmp.size(); i++) {
    out << " " << tmp[i];
  }
  out << " ]\n";
  return out;
}
/*
 * Methods wrapping the ANSI-C CUDA kernels
 */
/// Specialization for int32: sets every element to value.
template<>
inline void CuStlVector<int32>::Set(int32 value) {
#if HAVE_CUDA==1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
dim3 dimBlock(CUBLOCK);
dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
// Describe the vector to the kernel as a MatrixDim built from 1, Dim(), Dim().
// NOTE(review): presumably {rows, cols, stride} -- confirm against MatrixDim.
::MatrixDim d = { 1, Dim(), Dim() };
cudaI32_set_const(dimGrid, dimBlock, data_, value, d);
cuSafeCall(cudaGetLastError());
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
{
vec_.assign(vec_.size(), value);
}
}
} // namespace kaldi
#endif

Просмотреть файл

@ -1,109 +0,0 @@
// cudamatrix/cu-stlvector.h
// Copyright 2009-2012 Karel Vesely
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_CUDAMATRIX_CU_STLVECTOR_H_
#define KALDI_CUDAMATRIX_CU_STLVECTOR_H_
#include "matrix/kaldi-vector.h"
namespace kaldi {
template<typename IntType> class CuMatrix;
/**
 * std::vector equivalent for CUDA computing
 *
 * Holds a raw GPU buffer (data_) and a host std::vector (vec_) as back-up.
 *
 * NOTE(review): no copy constructor or assignment operator is declared, so
 * the compiler-generated ones copy the raw data_ pointer while the
 * destructor calls Destroy().  Copying an instance that holds GPU memory
 * looks unsafe -- confirm no caller copies these objects when a GPU is used.
 */
template<typename IntType>
class CuStlVector {
  typedef CuStlVector<IntType> ThisType;
 public:
  /// Default Constructor
  // Declared with the plain class name: naming a constructor with a
  // template-id (CuStlVector<IntType>()) is rejected in C++20.
  CuStlVector()
      : dim_(0), data_(NULL) {
  }
  /// Constructor with memory initialisation
  CuStlVector(MatrixIndexT dim)
      : dim_(0), data_(NULL) {
    Resize(dim);
  }
  /// Destructor
  ~CuStlVector() {
    Destroy();
  }
  /// Dimensions
  MatrixIndexT Dim() const {
    return dim_;
  }
  /// Get raw pointer
  const IntType* Data() const;
  IntType* Data();
  /// Allocate the memory
  ThisType& Resize(MatrixIndexT dim);
  /// Deallocate the memory
  void Destroy();
  /// Copy functions (reallocates when needed)
  ThisType& CopyFromVec(const std::vector<IntType> &src);
  void CopyToVec(std::vector<IntType> *dst) const;
  /// Math operations
  void SetZero();
  void Set(IntType value);
  /// Accessor to non-GPU vector
  const std::vector<IntType>& Vec() const {
    return vec_;
  }
  std::vector<IntType>& Vec() {
    return vec_;
  }
 private:
  MatrixIndexT dim_; ///< dimension of the vector
  IntType *data_; ///< GPU data pointer
  std::vector<IntType> vec_; ///< non-GPU vector as back-up
};
/*
 * Signatures of general/specialized methods
 */
// Generic Set(): not implemented, reports an error through KALDI_ERR.
template<typename Real> void CuStlVector<Real>::Set(Real value) { KALDI_ERR << __func__ << " Not implemented"; }
// Declaration of the int32 specialization; its definition is in the
// cu-stlvector-inl.h header included below.
template<> inline void CuStlVector<int32>::Set(int32 value);
/// I/O
/// Writes the vector to the stream and returns the stream.
template<typename IntType>
std::ostream &operator << (std::ostream &out, const CuStlVector<IntType> &vec);
} // namespace
#include "cu-stlvector-inl.h"
#endif

582
src/cudamatrix/cu-test.cc Normal file
Просмотреть файл

@ -0,0 +1,582 @@
// cudamatrix/cu-test.cc
//
//
#include <iostream>
#include <vector>
#include <cstdlib>
#include <ctime>
#include "base/kaldi-common.h"
#include "cudamatrix/cu-device.h"
#include "cudamatrix/cu-sp-matrix.h"
#include "cudamatrix/cu-tp-matrix.h"
#include "cudamatrix/cu-packed-matrix.h"
#include "cudamatrix/cu-vector.h"
#include <numeric>
#include <time.h>
namespace kaldi {
/*
 * INITIALIZERS
 */
/// Fills the stored triangle of *M with Gaussian random values, redrawing
/// until the condition number is no greater than 100.  An empty matrix is
/// accepted immediately.
template<typename Real>
static void InitRand(SpMatrix<Real> *M) {
  const MatrixIndexT dim = M->NumRows();
  bool accepted = false;
  while (!accepted) {
    for (MatrixIndexT row = 0; row < dim; row++) {
      for (MatrixIndexT col = 0; col <= row; col++) {
        (*M)(row, col) = RandGauss();
      }
    }
    // Written as !(Cond() > 100) so a NaN condition number is accepted,
    // exactly as the loop condition "Cond() > 100" would stop on it.
    accepted = (dim == 0) || !(M->Cond() > 100);
  }
}
/// Fills every element of *v with a Gaussian random value.
template<typename Real>
static void InitRand(VectorBase<Real> *v) {
  const MatrixIndexT dim = v->Dim();
  MatrixIndexT idx = 0;
  while (idx < dim) {
    (*v)(idx) = RandGauss();
    ++idx;
  }
}
template<typename Real>
static void UnitTestSetZeroUpperDiag() {
for (MatrixIndexT i = 1; i < 10; i++) {
MatrixIndexT dim = 10 * i;
Matrix<Real> A(dim,dim);
A.SetRandn();
CuMatrix<Real> B(A);
B.SetZeroUpperDiag();
Real sum = 0.0;
for (MatrixIndexT i = 0; i < dim; i++) {
for (MatrixIndexT j = i + 1; j < dim; j++)
sum += A(i,j);
}
KALDI_LOG << "the upper diaganoal sum for A is : " << sum;
B.CopyToMat(&A);
sum = 0.0;
for (MatrixIndexT i = 0; i < dim; i++) {
for (MatrixIndexT j = i + 1; j < dim; j++)
sum += A(i,j);
}
KALDI_LOG << "the upper diaganoal sum for B is : " << sum;
}
}
/// Tests CuMatrix::Cholesky: builds B = I + C C^T, factorises it on the
/// device, and checks that the factor D satisfies D D^T == B.
template<typename Real> static void UnitTestCholesky() {
for (MatrixIndexT iter = 0; iter < 3; iter++) {
MatrixIndexT dim = 300 + rand() % 200;
// set dimension
// computing the matrix for cholesky input
// CuMatrix is cuda matrix class while Matrix is cpu matrix class
CuMatrix<Real> A(dim,dim);
Matrix<Real> B(dim,dim);
Vector<Real> C(dim);
// B starts as the identity; C holds 1, 2, ..., dim.
for (MatrixIndexT i = 0; i < dim; i++) {
B(i,i) = 1;
C(i) = i + 1;
}
// B <- B + C C^T
B.AddVecVec(1.0, C, C);
// copy the matrix to cudamatrix object
A.CopyFromMat(B);
A.CopyToMat(&B);
//KALDI_LOG << B << '\n';
// doing cholesky
A.Cholesky();
Matrix<Real> D(dim,dim);
A.CopyToMat(&D);
//KALDI_LOG << "D is: " << D << '\n';
Matrix<Real> E(dim,dim);
// E <- D D^T
E.AddMatMat(1.0, D, kNoTrans, D, kTrans, 0.0);
// check that D D^T is equal to B
AssertEqual(B, E);
}
}
/// Logs the trace of random symmetric matrices of dimension 1..17 computed
/// on the host (SpMatrix) and via CuSpMatrix.  The two values are only
/// logged, not compared.
template<typename Real> static void UnitTestTrace() {
for (MatrixIndexT iter = 1; iter < 18; iter++) {
MatrixIndexT dim = iter;
KALDI_LOG << "dim is : " << iter;
SpMatrix<Real> A(dim);
A.SetRandn();
CuSpMatrix<Real> B(A);
KALDI_LOG << "cpu trace is : " << A.Trace();
KALDI_LOG << "gpu trace is : " << B.Trace();
}
// Disabled timing experiment, kept for reference.
/*
Vector<Real> tim(100);
Vector<Real> d(100);
for (MatrixIndexT iter = 0; iter < 100; iter++) {
MatrixIndexT dim = 10000 + rand() % 400;
Matrix<Real> A(dim,dim);
A.SetRandn();
CuMatrix<Real> B(A);
CuSpMatrix<Real> C(B,kTakeLower);
clock_t t1 = clock();
tim(iter) = C.Trace();
clock_t t2 = clock();
//tim(iter) = t2 - t1;
d(iter) = dim;
KALDI_LOG << tim(iter) << iter << '\n';
KALDI_LOG << d(iter) << iter << '\n';
}
KALDI_LOG << "tim is " << tim << '\n';
KALDI_LOG << "dim is " << d << '\n';
*/
}
/// Small (8x8) demonstration of CuMatrix::SymInvertPosDef: builds
/// B = I + C C^T, inverts it on the device and logs B, the inverse D and
/// their product X.  Nothing is asserted here; see UnitTestInvert for the
/// checked version.
template<typename Real> static void UnitInvert() {
  //MatrixIndexT dim = 15 + rand() % 40;;
  MatrixIndexT dim = 8;
  CuMatrix<Real> A(dim,dim);
  Matrix<Real> B(dim,dim);
  Vector<Real> C(dim);
  // B starts as the identity; C holds 1, 2, ..., dim.
  for (MatrixIndexT i = 0; i < dim; i++) {
    B(i,i) = 1;
    C(i) = i + 1;
  }
  // B <- B + C C^T
  B.AddVecVec(1.0,C,C);
  A.CopyFromMat(B);
  //A.Cholesky();
  A.CopyToMat(&B);
  KALDI_LOG << "B is : " << '\n';
  KALDI_LOG << B << '\n';
  A.SymInvertPosDef();
  Matrix<Real> D(dim,dim);
  A.CopyToMat(&D);
  KALDI_LOG << "D is : " << '\n';
  KALDI_LOG << D << '\n';
  // X <- B * D
  Matrix<Real> X(dim,dim);
  X.AddMatMat(1,B,kNoTrans,D,kNoTrans,0);
  KALDI_LOG << X << '\n';
  //for (MatrixIndexT i = 0; i < dim; i++) {
  // for (MatrixIndexT j = i+1; j < dim; j++)
  // D(i,j) = 0;
  //}
  //Matrix<Real> E(dim,dim);
  //E.AddMatMat(1,D,kNoTrans,D,kTrans,0);
  //AssertEqual(B,E);
}
/// Tests CuMatrix::SymInvertPosDef on random-sized matrices: builds
/// B = I + C C^T, inverts it on the device and asserts that B times the
/// inverse is close to the identity.
template<typename Real> static void UnitTestInvert() {
for (MatrixIndexT iter = 0; iter < 3; iter++) {
MatrixIndexT dim = 500 + rand() % 400;
KALDI_LOG << "dim is : " << '\n';
KALDI_LOG << dim << '\n';
CuMatrix<Real> A(dim,dim);
Matrix<Real> B(dim,dim);
Vector<Real> C(dim);
for (MatrixIndexT i = 0; i < dim; i++) {
B(i,i) = 1;
C(i) = (i/(1.0*dim)) + 1;
}
// B is still the identity at this point; keep a copy for the final check.
Matrix<Real> Identity(B);
B.AddVecVec(1.0, C, C);
// Now we have a positive-definite B (inversion would
// fail if it were not positive definite).
A.CopyFromMat(B);
A.SymInvertPosDef();
Matrix<Real> D(dim,dim);
A.CopyToMat(&D);
Matrix<Real> X(dim,dim);
X.AddMatMat(1.0, B, kNoTrans, D, kNoTrans, 0.0);
// KALDI_LOG << "X is (should be identity): " << X << '\n';
// Looser tolerance when Real is 4 bytes wide (single precision).
AssertEqual(Identity, X, (sizeof(Real) == 4 ? 0.1 : 0.001));
}
}
/// Exercises the CuSpMatrix(CuMatrixBase, SpCopyType) constructor with
/// kTakeLower on an 8x8 non-symmetric matrix and prints the result.
/// Nothing is asserted.
template<typename Real> static void UnitTestConstructor() {
MatrixIndexT dim = 8;
CuMatrix<Real> A(dim,dim);
Matrix<Real> B(dim,dim);
// Lower triangle (including diagonal) gets i+j, strict upper gets i+j+4,
// so the two halves are distinguishable.
for (MatrixIndexT i = 0; i < dim; i++) {
for (MatrixIndexT j = 0; j <=i; j++)
B(i,j) = i+j;
for (MatrixIndexT j = i+1; j < dim; j++)
B(i,j) = i+j+4;
}
KALDI_LOG << "A is : " << '\n';
KALDI_LOG << B << '\n';
A.CopyFromMat(B);
//CuSpMatrix<Real> C(dim);
//C.CopyFromMat(A,kTakeLower);
CuSpMatrix<Real> C(A, kTakeLower);
SpMatrix<Real> D(dim);
C.CopyToSp(&D);
KALDI_LOG << "C is : " << '\n';
// Print the lower triangle row by row.
for (MatrixIndexT i = 0; i < dim; i++) {
for (MatrixIndexT j = 0; j <= i; j++)
std::cout << D(i,j) << " ";
std::cout << '\n';
}
}
/// Exercises copying a symmetric CuMatrix into a CuSpMatrix, both through
/// CopyFromMat and through the converting constructor.
/// NOTE(review): the comparisons are all inside the commented-out section
/// below, so this test currently only logs; B is unused outside that section.
template<typename Real> static void UnitTestCopySp() {
// Checking that the various versions of copying
// matrix to SpMatrix work the same in the symmetric case.
for (MatrixIndexT iter = 0;iter < 5;iter++) {
int32 dim = 5 + rand() % 10;
SpMatrix<Real> A(dim), B(dim);
A.SetRandn();
Matrix<Real> C(A);
//CuMatrix<Real> D(C);
{
// Log D2 before and after building/filling E from it.
CuMatrix<Real> D2(dim,dim);
D2.CopyFromMat(C);
KALDI_LOG << "D2 is " << D2;
CuSpMatrix<Real> E(D2.NumRows(), kUndefined);
KALDI_LOG << "D2 is " << D2;
E.CopyFromMat(D2, kTakeLower);
KALDI_LOG << "D2 is " << D2;
}
CuMatrix<Real> D(dim,dim);
D.CopyFromMat(C);
KALDI_LOG << "D stride is : " << D.Stride() <<'\n';
CuSpMatrix<Real> E(D,kTakeLower);
///CuSpMatrix<Real> E(dim);
//E.CopyFromMat(D,kTakeLower);
/*
KALDI_LOG << D.NumRows() << '\n';
//E.CopyFromMat(D, kTakeMean);
//E(D, kTakeMean);
//KALDI_LOG << E.NumRows() << '\n';
E.CopyToMat(&B);
AssertEqual(A, B);
B.SetZero();
//E.CopyFromMat(D, kTakeLower);
CuSpMatrix<Real> F(D,kTakeLower);
//F(D, kTakeLower);
F.CopyToMat(&B);
AssertEqual(A, B);
B.SetZero();
//E.CopyFromMat(D, kTakeUpper);
//E(D, kTakeUpper);
CuSpMatrix<Real> G(D, kTakeUpper);
G.CopyToMat(&B);
AssertEqual(A, B);
*/
}
}
/// Exercises CuSpMatrix::CopyFromMat with kTakeLower, kTakeUpper and
/// kTakeMean on an 8x8 non-symmetric matrix, printing the resulting lower
/// triangle each time.  Nothing is asserted.
template<typename Real> static void UnitTestCopyFromMat() {
MatrixIndexT dim = 8;
CuMatrix<Real> A(dim,dim);
Matrix<Real> B(dim,dim);
// Lower triangle (including diagonal) gets i+j, strict upper gets i+j+4,
// so the three copy modes give visibly different results.
for (MatrixIndexT i = 0; i < dim; i++) {
for (MatrixIndexT j = 0; j <=i; j++)
B(i,j) = i+j;
for (MatrixIndexT j = i+1; j < dim; j++)
B(i,j) = i+j+4;
}
KALDI_LOG << "A is : " << '\n';
KALDI_LOG << B << '\n';
A.CopyFromMat(B);
CuSpMatrix<Real> C(dim);
// Mode 1: kTakeLower
C.CopyFromMat(A,kTakeLower);
SpMatrix<Real> D(dim);
C.CopyToSp(&D);
KALDI_LOG << "C is : " << '\n';
for (MatrixIndexT i = 0; i < dim; i++) {
for (MatrixIndexT j = 0; j <= i; j++)
std::cout << D(i,j) << " ";
std::cout << '\n';
}
// Mode 2: kTakeUpper
C.CopyFromMat(A,kTakeUpper);
C.CopyToSp(&D);
KALDI_LOG << "C is : " << '\n';
for (MatrixIndexT i = 0; i < dim; i++) {
for (MatrixIndexT j = 0; j <= i; j++)
std::cout << D(i,j) << " ";
std::cout << '\n';
}
// Mode 3: kTakeMean
C.CopyFromMat(A,kTakeMean);
C.CopyToSp(&D);
KALDI_LOG << "C is : " << '\n';
for (MatrixIndexT i = 0; i < dim; i++) {
for (MatrixIndexT j = 0; j <= i; j++)
std::cout << D(i,j) << " ";
std::cout << '\n';
}
//KALDI_LOG << D << '\n';
}
/// Tests element access through operator() on CuMatrix, CuSpMatrix and
/// CuVector (reads and writes compared against the host types), then runs
/// CuMatrix::SetRandn followed by a copy to the host.
template<typename Real> static void UnitTestMatrix() {
//operator()
for (MatrixIndexT iter = 0; iter < 2; iter++) {
// dim1 >= 6 and dim2 >= 8, so the (3, 4) and (2) accesses below are in range.
int32 dim1 = 6 + rand() % 10;
int32 dim2 = 8 + rand() % 10;
Matrix<Real> A(dim1,dim2);
A.SetRandn();
CuMatrix<Real> B(A);
KALDI_ASSERT(A(3, 4) == B(3, 4));
B(3, 4) = 2.0;
A(3, 4) = B(3, 4);
KALDI_ASSERT(A(3, 4) == B(3, 4));
SpMatrix<Real> As(dim1);
CuSpMatrix<Real> Bs(As);
KALDI_ASSERT(As(3, 4) == Bs(3, 4));
Bs(3, 4) = 2.0;
// Randomly read the value back through either the non-const or the const
// operator() of CuSpMatrix.
if (rand() % 2 == 0)
As(3, 4) = Bs(3, 4);
else
As(3, 4) = (const_cast<const CuSpMatrix<Real>&>(Bs))(3, 4);
KALDI_ASSERT(As(3, 4) == Bs(3, 4));
Vector<Real> v(dim1);
CuVector<Real> w(v);
KALDI_ASSERT(w(2) == v(2));
w(2) = 3.0;
v(2) = w(2);
KALDI_ASSERT(w(2) == v(2));
}
//SetRandn
// Only checks that SetRandn and CopyToMat run; the sums are not compared.
for (MatrixIndexT iter = 0; iter < 10; iter++) {
int32 dim1 = 15 + rand() % 10;
int32 dim2 = dim1;//10 + rand() % 14;
//KALDI_LOG << "dimension is " << dim1
// << " " << dim2 << '\n';
CuMatrix<Real> A(dim1,dim2);
A.SetRandn();
Matrix<Real> A1(dim1,dim2);
A.CopyToMat(&A1);
//KALDI_LOG << "gpu sum is: " << A.Sum() << '\n';
//KALDI_LOG << "cpu sum is: " << A1.Sum() << '\n';
}
}
template<typename Real> static void UnitTestMulTp() {
for (MatrixIndexT iter = 0; iter < 10; iter++) {
int32 dim = 1 + rand() % 30;
Vector<Real> v(dim);
v.SetRandn();
TpMatrix<Real> M(dim);
M.SetRandn();
CuVector<Real> cv(v);
CuTpMatrix<Real> cM(M);
Vector<Real> v2(dim);
cv.CopyToVec(&v2);
AssertEqual(v, v2);
v.MulTp(M, iter % 2 == 0 ? kTrans:kNoTrans);
cv.MulTp(cM, iter % 2 == 0 ? kTrans:kNoTrans);
cv.CopyToVec(&v2);
// KALDI_LOG << "v is " << v << ", v2 is " << v2;
AssertEqual(v, v2);
}
}
/// Tests CuVector::Scale and CuVector::MulElements against their host
/// Vector counterparts on random inputs.  Further tests are kept, disabled,
/// in the comment block at the end.
template<typename Real> static void UnitTestVector() {
// Scale
for (MatrixIndexT iter = 0; iter < 10; iter++) {
int32 dim = 24 + rand() % 10;
Vector<Real> A(dim);
A.SetRandn();
CuVector<Real> B(A);
Vector<Real> C(dim);
Real r = 1.43;
B.Scale(r);
B.CopyToVec(&C);
A.Scale(r);
//KALDI_LOG << A;
//KALDI_LOG << (A.Scale(r));
//KALDI_LOG << C;
AssertEqual(A, C);
}
// MulElements
for (MatrixIndexT iter = 0; iter < 10; iter++) {
int32 dim = 15 + rand() % 10;
CuVector<Real> A(dim);
CuVector<Real> B(dim);
Vector<Real> A1(dim);
Vector<Real> B1(dim);
A.SetRandn();
B.SetRandn();
A.CopyToVec(&A1);
B.CopyToVec(&B1);
A.MulElements(B);
A1.MulElements(B1);
Vector<Real> A2(dim);
A.CopyToVec(&A2);
AssertEqual(A1,A2);
}
// Disabled tests, kept for reference.
/*
for (MatrixIndexT iter = 0; iter < 10; iter++) {
int32 dim = 72;
CuVector<Real> A(dim);
Vector<Real> A1(dim);
CuMatrix<Real> B(9,8);
Matrix<Real> B1(9,8);
B.SetRandn();
B.CopyToMat(&B1);
A.CopyRowsFromMat(B);
A1.CopyRowsFromMat(B1);
Vector<Real> A2(dim);
A.CopyToVec(&A2);
AssertEqual(A1,A2);
}
for (MatrixIndexT iter = 0; iter < 10; iter++) {
int32 dim = 15 + rand() % 10;
CuVector<Real> A(dim);
A.SetRandn();
Vector<Real> A1(dim);
A.CopyToVec(&A1);
KALDI_LOG << "cpu min is : " << A1.Min() << '\n';
KALDI_LOG << "gpu min is : " << A.Min() << '\n';
}
for (MatrixIndexT iter = 0; iter < 10; iter++) {
int32 dim = 15 + rand() % 10;
CuVector<Real> A(dim);
A.SetRandn();
Vector<Real> A1(dim);
A.CopyToVec(&A1);
CuVector<Real> B(dim);
B.SetRandn();
Vector<Real> B1(dim);
B.CopyToVec(&B1);
CuVector<Real> C(dim);
C.SetRandn();
Vector<Real> C1(dim);
C.CopyToVec(&C1);
Real alpha = 2;
Real beta = 3;
A.AddVecVec(alpha, B, C, beta);
A1.AddVecVec(alpha,B1,C1,beta);
Vector<Real> D(dim);
A.CopyToVec(&D);
AssertEqual(D,A1);
}
for (MatrixIndexT iter = 0; iter < 10; iter++) {
int32 dim1 = 15 + rand() % 10;
int32 dim2 = 10 + rand() % 10;
Matrix<Real> A(dim1,dim2);
for (MatrixIndexT i = 0; i < dim1; i++) {
for (MatrixIndexT j = 0; j < dim2; j++)
A(i,j) = i + 2 * j + 1;
}
KALDI_LOG << A;
CuMatrix<Real> B(dim1,dim2);
B.CopyFromMat(A);
CuVector<Real> C(dim1);
C.SetZero();
Real alpha = 1;
Real beta = 1;
C.AddDiagMat2(alpha, B, kNoTrans, beta);
Vector<Real> D(dim1);
C.CopyToVec(&D);
KALDI_LOG << D << '\n';
Vector<Real> E(dim1);
E.AddDiagMat2(alpha, A, kNoTrans, beta);
KALDI_LOG << E;
AssertEqual(D,E);
}
for (MatrixIndexT iter = 0; iter < 10; iter++) {
int32 dim1 = 15 + rand() % 10;
int32 dim2 = 10 + rand() % 10;
Matrix<Real> A(dim1,dim2);
for (MatrixIndexT i = 0; i < dim1; i++) {
for (MatrixIndexT j = 0; j < dim2; j++)
A(i,j) = i + 2 * j + 1;
}
KALDI_LOG << A;
CuMatrix<Real> B(dim1,dim2);
B.CopyFromMat(A);
CuSubVector<Real> C(B,1);
Vector<Real> D(dim2);
C.CopyToVec(&D);
KALDI_LOG << D;
}
for (MatrixIndexT iter = 0; iter < 10; iter++) {
int32 dim = 15 + rand() % 10;
CuVector<Real> A(dim);
A.SetRandn();
Vector<Real> A1(dim);
A.CopyToVec(&A1);
CuVector<Real> B(dim);
B.SetRandn();
Vector<Real> B1(dim);
B.CopyToVec(&B1);
Real dot = VecVec(A,B);
KALDI_LOG << "dot product in gpu: " << dot << '\n';
dot = VecVec(A1,B1);
KALDI_LOG << "dot product in cpu: " << dot << '\n';
}
for (MatrixIndexT iter = 0; iter < 10; iter++) {
int32 dim = 15 + rand() % 10;
CuVector<Real> A(dim);
Vector<Real> A1(dim);
for (MatrixIndexT i = 0; i < dim; i++)
A1(i) = i;
A.CopyFromVec(A1);
KALDI_LOG << A(dim-2) << '\n';
KALDI_LOG << A1(dim-2) << '\n';
}
*/
}
/// Runs every unit test in this file for the element type Real.
/// Several tests draw from rand(), so the call order affects the random
/// dimensions each of them sees.
template<typename Real>
static void CuMatrixUnitTest() {
UnitTestTrace<Real>();
UnitTestCholesky<Real>();
UnitTestInvert<Real>();
UnitInvert<Real>();
UnitTestCopyFromMat<Real>();
UnitTestCopySp<Real>();
UnitTestConstructor<Real>();
UnitTestVector<Real>();
UnitTestMulTp<Real>();
UnitTestMatrix<Real>();
UnitTestSetZeroUpperDiag<Real>();
}
} //namespace
// Test driver: selects a GPU (when built with CUDA), runs the float tests,
// runs the double tests unless the device reports no double-precision
// support, then prints the device profile.
int main() {
using namespace kaldi;
#if HAVE_CUDA == 1
kaldi::CuDevice::Instantiate().SelectGpuId("yes");
#endif
kaldi::CuMatrixUnitTest<float>();
#if HAVE_CUDA == 1
if (!kaldi::CuDevice::Instantiate().DoublePrecisionSupported()) {
KALDI_WARN << "Double precision not supported, not testing that code";
} else
#endif
{
// Without CUDA this block is unconditional.
kaldi::CuMatrixUnitTest<double>();
}
#if HAVE_CUDA == 1
kaldi::CuDevice::Instantiate().PrintProfile();
#endif
KALDI_LOG << "Tests succeeded.\n";
return 0;
}

Просмотреть файл

@ -0,0 +1,218 @@
// cudamatrix/cu-sp-matrix-test.cc
//
// Copyright 2013 Ehsan Variani
// Lucas Ondel
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
//
// UnitTests for testing cu-tp-matrix.h methods.
//
#include <iostream>
#include <vector>
#include <cstdlib>
#include "base/kaldi-common.h"
#include "cudamatrix/cu-device.h"
#include "cudamatrix/cu-tp-matrix.h"
#include "cudamatrix/cu-vector.h"
#include "cudamatrix/cu-math.h"
#include "cudamatrix/cu-sp-matrix.h"
using namespace kaldi;
namespace kaldi {
/// Asserts that two CuPackedMatrix objects have the same dimension and that
/// every stored element (row i, column j <= i) agrees within tol, scaled by
/// max(1, |a| + |b|) for the pair of elements being compared.
template<typename Real>
static void AssertEqual(const CuPackedMatrix<Real> &A,
const CuPackedMatrix<Real> &B,
float tol = 0.001) {
KALDI_ASSERT(A.NumRows() == B.NumRows());
for (MatrixIndexT i = 0; i < A.NumRows(); i++)
for (MatrixIndexT j = 0; j <= i; j++)
KALDI_ASSERT(std::abs(A(i, j) - B(i, j))
< tol * std::max(1.0, (double) (std::abs(A(i, j)) + std::abs(B(i, j)))));
}
/// Asserts that two host PackedMatrix objects have the same dimension and
/// that every stored element (row i, column j <= i) agrees within tol,
/// scaled by max(1, |a| + |b|) for the pair of elements being compared.
template<typename Real>
static void AssertEqual(const PackedMatrix<Real> &A,
const PackedMatrix<Real> &B,
float tol = 0.001) {
KALDI_ASSERT(A.NumRows() == B.NumRows());
for (MatrixIndexT i = 0; i < A.NumRows(); i++)
for (MatrixIndexT j = 0; j <= i; j++)
KALDI_ASSERT(std::abs(A(i, j) - B(i, j))
< tol * std::max(1.0, (double) (std::abs(A(i, j)) + std::abs(B(i, j)))));
}
/// Asserts that a host PackedMatrix and a CuPackedMatrix have the same
/// dimension and that every stored element (row i, column j <= i) agrees
/// within tol, scaled by max(1, |a| + |b|) for the pair being compared.
template<typename Real>
static void AssertEqual(const PackedMatrix<Real> &A,
const CuPackedMatrix<Real> &B,
float tol = 0.001) {
KALDI_ASSERT(A.NumRows() == B.NumRows());
for (MatrixIndexT i = 0; i < A.NumRows(); i++)
for (MatrixIndexT j = 0; j <= i; j++)
KALDI_ASSERT(std::abs(A(i, j) - B(i, j))
< tol * std::max(1.0, (double) (std::abs(A(i, j)) + std::abs(B(i, j)))));
}
/*
* Unit Tests
*/
template<typename Real>
static void UnitTestCuTpMatrixInvert() {
for (MatrixIndexT i = 1; i < 10; i++) {
MatrixIndexT dim = 5 * i + rand() % 10;
TpMatrix<Real> A(dim);
A.SetRandn();
CuTpMatrix<Real> B(A);
AssertEqual<Real>(A, B, 0.005);
A.Invert();
B.Invert();
AssertEqual<Real>(A, B, 0.005);
}
}
template<typename Real>
static void UnitTestCuTpMatrixCopyFromTp() {
for (MatrixIndexT i = 1; i < 10; i++) {
MatrixIndexT dim = 5 * i + rand() % 10;
TpMatrix<Real> A(dim);
A.SetRandn();
CuTpMatrix<Real> B(dim);
B.CopyFromTp(A);
CuTpMatrix<Real> C(dim);
C.CopyFromTp(B);
AssertEqual<Real>(A, B);
AssertEqual<Real>(B, C);
}
}
/// Checks that CuTpMatrix::CopyFromMat gives the same result as
/// TpMatrix::CopyFromMat for both transpose settings, and that the result is
/// not trivially zero.
template<typename Real>
static void UnitTestCuTpMatrixCopyFromMat() {
for (MatrixIndexT i = 1; i < 10; i++) {
// Alternate between the two transpose settings.
MatrixTransposeType trans = (i % 2 == 0 ? kNoTrans : kTrans);
MatrixIndexT dim = 10*i + rand() % 5;
CuMatrix<Real> A(dim, dim);
A.SetRandn();
// Host copy of the same random matrix, used for the reference result.
Matrix<Real> A2(A);
CuTpMatrix<Real> B(dim);
B.CopyFromMat(A, trans);
TpMatrix<Real> B2(dim);
B2.CopyFromMat(A2, trans);
// Bring the device result back to the host for comparison.
TpMatrix<Real> B3(B);
AssertEqual(B2, B3);
KALDI_ASSERT(B3.Trace() != 0);
}
}
template<typename Real>
static void UnitTestCuTpMatrixCholesky() {
for (MatrixIndexT i = 1; i < 10; i++) {
MatrixIndexT dim = 1 + rand() % 10;
if (i > 4) {
dim += 32 * (rand() % 5);
}
Matrix<Real> M(dim, dim + 2);
M.SetRandn();
SpMatrix<Real> A(dim);
A.AddMat2(1.0, M, kNoTrans, 0.0); // sets A to random almost-surely +ve
// definite matrix.
CuSpMatrix<Real> B(A);
TpMatrix<Real> C(dim);
C.SetRandn();
CuTpMatrix<Real> D(C);
C.Cholesky(A);
D.Cholesky(B);
AssertEqual<Real>(C, D);
}
}
template<class Real>
static void UnitTestCuTpMatrixIO() {
for (int32 i = 0; i < 3; i++) {
int32 dimM = rand() % 255 + 10;
if (i % 5 == 0) { dimM = 0; }
CuTpMatrix<Real> mat(dimM);
mat.SetRandn();
std::ostringstream os;
bool binary = (i % 4 < 2);
mat.Write(os, binary);
CuTpMatrix<Real> mat2;
std::istringstream is(os.str());
mat2.Read(is, binary);
AssertEqual(mat, mat2);
}
}
/// Runs all CuTpMatrix unit tests in this file for the element type Real.
/// The tests draw from rand(), so the call order affects the random
/// dimensions each of them sees.
template<typename Real> void CudaTpMatrixUnitTest() {
UnitTestCuTpMatrixIO<Real>();
UnitTestCuTpMatrixInvert<Real>();
UnitTestCuTpMatrixCopyFromTp<Real>();
UnitTestCuTpMatrixCholesky<Real>();
UnitTestCuTpMatrixCopyFromMat<Real>();
}
} // namespace kaldi
// Test driver: runs the whole suite twice, first with the GPU disabled and
// then with it requested, for float and (where supported) double.
int main() {
using namespace kaldi;
for (int32 loop = 0; loop < 2; loop++) {
#if HAVE_CUDA == 1
if (loop == 0)
CuDevice::Instantiate().SelectGpuId("no"); // "no": run without a GPU
else
CuDevice::Instantiate().SelectGpuId("yes"); // "yes": request a GPU
#endif
kaldi::CudaTpMatrixUnitTest<float>();
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().DoublePrecisionSupported()) {
kaldi::CudaTpMatrixUnitTest<double>();
} else {
KALDI_WARN << "Double precision not supported";
}
#else
kaldi::CudaTpMatrixUnitTest<double>();
#endif
if (loop == 0)
KALDI_LOG << "Tests without GPU use succeeded.\n";
else
KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
}
#if HAVE_CUDA == 1
CuDevice::Instantiate().PrintProfile();
#endif
return 0;
}

Просмотреть файл

@ -0,0 +1,112 @@
#if HAVE_CUDA==1
#include <cuda_runtime_api.h>
#include <cublas.h>
#endif
#include "util/timer.h"
#include "cudamatrix/cu-common.h"
#include "cudamatrix/cu-vector.h"
#include "cudamatrix/cu-device.h"
#include "cudamatrix/cu-kernels.h"
#include "cudamatrix/cu-math.h"
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-tp-matrix.h"
#include "cudamatrix/cu-sp-matrix.h"
#include "cudamatrix/cublas-wrappers.h"
namespace kaldi {
// Builds a triangular matrix from the square matrix `orig`.  Storage is
// allocated uninitialized (kUndefined) because CopyFromMat() fills it right
// away; `trans` is forwarded to CopyFromMat() unchanged.
template<typename Real>
CuTpMatrix<Real>::CuTpMatrix(const CuMatrixBase<Real> &orig,
                             MatrixTransposeType trans)
    : CuPackedMatrix<Real>(orig.NumRows(), kUndefined) {
  // Only a square matrix is accepted.
  KALDI_ASSERT(orig.NumRows() == orig.NumCols());
  this->CopyFromMat(orig, trans);
}
// Sets *this to the Cholesky factor of the symmetric matrix `orig`.
// With a GPU enabled the factorization is done on a full-matrix copy of
// `orig` and its triangle is then copied into *this; otherwise the CPU
// TpMatrix implementation is used.
template<typename Real>
void CuTpMatrix<Real>::Cholesky(const CuSpMatrix<Real> &orig) {
#if HAVE_CUDA==1
  if (CuDevice::Instantiate().Enabled()) {
    CuMatrix<Real> full(orig);
    full.Cholesky();
    this->CopyFromMat(full, kNoTrans);
    return;
  }
#endif
  this->Mat().Cholesky(orig.Mat());
}
// Inverts this triangular matrix in place.
//
// GPU path: a full dim x dim matrix `tmp` gets its diagonal set to 1 by
// cuda_set_diag (NOTE(review): this relies on CuMatrix(rows, cols) zeroing
// its storage so that tmp is the identity -- confirm), *this is expanded into
// the full matrix `tmp2`, cublas_trsm is called on the pair (presumably
// solving the triangular system so that tmp ends up holding the inverse --
// confirm against cublas-wrappers.h), and the result is copied back.
// CPU path: defers to TpMatrix::Invert().
template<typename Real>
void CuTpMatrix<Real>::Invert() {
#if HAVE_CUDA==1
  if (CuDevice::Instantiate().Enabled()) {
    const int dim = this->NumRows();
    // An empty matrix is its own inverse.  Returning here also avoids
    // launching a kernel with a zero-block grid, mirroring the guard in
    // CopyFromMat().
    if (dim == 0) return;
    int dimBlock(CU2DBLOCK);
    int dimGrid(n_blocks(dim, CU2DBLOCK));
    CuMatrix<Real> tmp(dim, dim);
    Real alpha = 1.0;
    cuda_set_diag(dimGrid, dimBlock, tmp.Data(), alpha, tmp.Dim());
    CuMatrix<Real> tmp2(dim, dim);
    tmp2.CopyFromTp(*this);
    cublas_trsm(dim, dim, alpha, tmp2.Data(), tmp2.Dim().stride,
                tmp.Data(), tmp.Dim().stride);
    this->CopyFromMat(tmp, kNoTrans);
  } else
#endif
  {
    Mat().Invert();
  }
}
// Copies one triangle of the square matrix M into *this.
// On the GPU, kNoTrans uses the cuda_take_lower kernel and any other value
// uses cuda_take_upper; the call blocks until the kernel has finished.  On
// the CPU the work is delegated to TpMatrix::CopyFromMat().
template<typename Real>
void CuTpMatrix<Real>::CopyFromMat(const CuMatrixBase<Real> &M,
                                   MatrixTransposeType Trans) {
#if HAVE_CUDA==1
  if (CuDevice::Instantiate().Enabled()) {
    MatrixIndexT dim = this->num_rows_;
    KALDI_ASSERT(dim == M.NumRows() && this->num_rows_ == M.NumCols());
    if (dim == 0)
      return;  // nothing to copy
    Timer tim;
    // One 2-D thread block per CU2DBLOCK x CU2DBLOCK tile of the matrix.
    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
    dim3 dimGrid(n_blocks(dim, CU2DBLOCK), n_blocks(dim, CU2DBLOCK));
    if (Trans == kNoTrans)
      cuda_take_lower(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
    else
      cuda_take_upper(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
    cudaThreadSynchronize();
  } else
#endif
  {
    Mat().CopyFromMat(M.Mat(), Trans);
  }
}
// Constructs a CPU TpMatrix from a CuTpMatrix: resizes *this to cu.NumRows()
// and then copies the contents over with CopyFromMat().
template<class Real>
TpMatrix<Real>::TpMatrix(const CuTpMatrix<Real> &cu) {
this->Resize(cu.NumRows());
this->CopyFromMat(cu);
}
// Explicit instantiations of the constructor above for float and double.
template TpMatrix<float>::TpMatrix(const CuTpMatrix<float> &cu);
template TpMatrix<double>::TpMatrix(const CuTpMatrix<double> &cu);
// Copies the contents of the CuTpMatrix `other` into this CPU matrix by
// calling other.CopyToPacked(this).  No resizing is done here.
template<class Real>
void TpMatrix<Real>::CopyFromMat(const CuTpMatrix<Real> &other) {
other.CopyToPacked(this);
}
// instantiate the template above.
template void TpMatrix<float>::CopyFromMat(const CuTpMatrix<float> &other);
template void TpMatrix<double>::CopyFromMat(const CuTpMatrix<double> &other);
// Explicit instantiations of the whole CuTpMatrix class for the two supported
// floating-point types.
template class CuTpMatrix<float>;
template class CuTpMatrix<double>;
} // namespace

Просмотреть файл

@ -0,0 +1,83 @@
// cudamatrix/cu-tp-matrix.h
// Copyright 2013 Ehsan Variani
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
//
#ifndef KALDI_CUDAMATRIX_CU_TP_MATRIX_H_
#define KALDI_CUDAMATRIX_CU_TP_MATRIX_H_
#include <sstream>
#include "cudamatrix/cu-common.h"
#include "matrix/matrix-common.h"
#include "matrix/tp-matrix.h"
#include "cudamatrix/cu-array.h"
#include "cudamatrix/cu-math.h"
#include "cudamatrix/cu-packed-matrix.h"
#include "cudamatrix/cu-matrix.h"
namespace kaldi {
template<typename Real> class CuTpMatrix;

/// Triangular matrix stored in packed form, built on CuPackedMatrix.
/// Each operation runs on the GPU when CuDevice reports it as enabled and
/// otherwise falls back to the CPU TpMatrix implementation (see the .cc file).
template<typename Real>
class CuTpMatrix : public CuPackedMatrix<Real> {
  friend class CuMatrixBase<float>;
  friend class CuMatrixBase<double>;
  friend class CuVectorBase<Real>;
  friend class CuSubMatrix<Real>;
  friend class CuRand<Real>;
  friend class CuTpMatrix<float>;
  friend class CuTpMatrix<double>;
 public:
  /// Creates an empty matrix.
  CuTpMatrix() : CuPackedMatrix<Real>() {}

  /// Creates an r x r matrix; resize_type is forwarded to CuPackedMatrix.
  explicit CuTpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero)
      : CuPackedMatrix<Real>(r, resize_type) {}

  // The constructors below are declared with the plain injected class name.
  // Writing "CuTpMatrix<Real>(...)" as the constructor name is rejected by
  // C++20 compilers.

  /// Constructs from a CPU triangular matrix.
  explicit CuTpMatrix(const TpMatrix<Real> &orig)
      : CuPackedMatrix<Real>(orig) {}

  /// Copy constructor.  Deliberately not "explicit": an explicit copy
  /// constructor forbids copy-initialization and prevents objects of this
  /// class from being stored in standard containers such as std::vector.
  CuTpMatrix(const CuTpMatrix<Real> &orig)
      : CuPackedMatrix<Real>(orig) {}

  /// Constructs from one triangle of a square matrix; see CopyFromMat().
  explicit CuTpMatrix(const CuMatrixBase<Real> &orig,
                      MatrixTransposeType trans = kNoTrans);

  ~CuTpMatrix() {}

  /// Copies one triangle of the square matrix M into *this.
  void CopyFromMat(const CuMatrixBase<Real> &M,
                   MatrixTransposeType Trans = kNoTrans);

  /// Copies from another triangular matrix held in a CuTpMatrix.
  void CopyFromTp(const CuTpMatrix<Real> &other) {
    CuPackedMatrix<Real>::CopyFromPacked(other);
  }

  /// Copies from a CPU triangular matrix.
  void CopyFromTp(const TpMatrix<Real> &other) {
    CuPackedMatrix<Real>::CopyFromPacked(other);
  }

  /// Sets *this to the Cholesky factor of Orig.
  void Cholesky(const CuSpMatrix<Real>& Orig);

  /// Inverts this matrix in place.
  void Invert();

 protected:
  // Views this object as a CPU TpMatrix.  Used by the CPU fallback paths.
  // NOTE(review): the reinterpret_cast assumes CuTpMatrix and TpMatrix have
  // compatible object layouts -- confirm against CuPackedMatrix/PackedMatrix.
  inline const TpMatrix<Real> &Mat() const {
    return *(reinterpret_cast<const TpMatrix<Real>* >(this));
  }
  inline TpMatrix<Real> &Mat() {
    return *(reinterpret_cast<TpMatrix<Real>* >(this));
  }
};
} // namespace
#endif

88
src/cudamatrix/cu-value.h Normal file
Просмотреть файл

@ -0,0 +1,88 @@
// cudamatrix/cu-value.h
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_CUDAMATRIX_CU_VALUE_H_
#define KALDI_CUDAMATRIX_CU_VALUE_H_
#include <cudamatrix/cu-device.h>
namespace kaldi {
/// Proxy object that stands in for a non-const reference to one Real, e.g.
/// as returned by a non-const operator ().  It holds only a pointer to the
/// element; reading and writing go through cudaMemcpy when the GPU is
/// enabled and through a plain dereference otherwise.  It is also a
/// convenient way of reading a single Real value from the device.
template<typename Real>
class CuValue {
 public:
  /// Refers to the element stored at `data`.
  CuValue(Real *data) : data_(data) { }

  /// Makes a second handle to the same element (the value is not copied).
  CuValue(const CuValue &other) : data_(other.data_) { }

  /// Copies the value that `other` refers to into the element that *this
  /// refers to.  Returns a handle (by value) to this element.
  inline CuValue operator = (const CuValue<Real> &other) {
#if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      CU_SAFE_CALL(cudaMemcpy(data_, other.data_, sizeof(Real),
                              cudaMemcpyDeviceToDevice));
    } else
#endif
    {
      *data_ = *other.data_;
    }
    return *this;
  }

  /// Stores r into the referenced element and returns r.
  inline Real operator = (Real r) {
#if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      CU_SAFE_CALL(cudaMemcpy(data_, &r, sizeof(Real),
                              cudaMemcpyHostToDevice));
    } else
#endif
    {
      *data_ = r;
    }
    return r;
  }

  /// Adds r to the referenced element and returns the new value.
  inline Real operator += (Real r) {
    const Real current = Real(*this);
    return (*this = r + current);
  }

  /// Reads the referenced element (conversion to Real).
  inline operator Real () const {
#if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      Real host_copy;
      CU_SAFE_CALL(cudaMemcpy(&host_copy, data_,
                              sizeof(Real), cudaMemcpyDeviceToHost));
      return host_copy;
    }
#endif
    return *data_;
  }

 private:
  Real *data_;  // not owned
};  // class CuValue<Real>
} // namespace
#endif // KALDI_CUDAMATRIX_CU_VALUE_H_

Просмотреть файл

@ -1,462 +0,0 @@
// cudamatrix/cu-vector-inl.h
// Copyright 2009-2012 Karel Vesely
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_CUDAMATRIX_CU_VECTOR_INL_H_
#define KALDI_CUDAMATRIX_CU_VECTOR_INL_H_
#if HAVE_CUDA==1
#include <cuda_runtime_api.h>
#endif
#include "util/timer.h"
#include "cudamatrix/cu-common.h"
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-device.h"
#include "cudamatrix/cu-kernels.h"
namespace kaldi {
// Copy constructor: resizes *this to v's dimension and copies v's elements.
template<typename Real>
CuVector<Real>::CuVector(const CuVector<Real> &v) {
this->Resize(v.dim_);
this->CopyFromVec(v);
}
// Constructs from any CuVectorBase: resizes *this to v's dimension and copies
// v's elements.
template<typename Real>
CuVector<Real>::CuVector(const CuVectorBase<Real> &v) {
this->Resize(v.dim_);
this->CopyFromVec(v);
}
// Constructs from a CPU vector: resizes *this to v's dimension and copies v's
// elements.
template<typename Real>
CuVector<Real>::CuVector(const VectorBase<Real> &v) {
this->Resize(v.dim_);
this->CopyFromVec(v);
}
// Changes the dimension of the vector to `dim`.
//
// Only kSetZero and kUndefined are accepted for `t`.  Existing contents are
// never preserved: if the size is unchanged the vector is zeroed, otherwise
// the old storage is released and new storage is allocated (zeroed on the
// GPU only when t == kSetZero; the CPU path always takes a fresh
// Vector<Real>(dim)).
template<typename Real>
void CuVector<Real>::Resize(MatrixIndexT dim, MatrixResizeType t) {
  KALDI_ASSERT(t == kSetZero || t == kUndefined);  // Others not implemented
                                                   // yet.
  if (this->dim_ == dim) {
    // Same size, so the storage is reused.  An empty vector has nothing to
    // zero, and SetZero() asserts dim_ > 0 on the GPU path, so resizing an
    // empty vector to 0 (e.g. when copy-constructing from an empty vector)
    // must not call it.
    if (dim != 0)
      this->SetZero();
    return;
  }
  if (this->dim_ != 0)
    this->Destroy();
  if (dim == 0) return;
#if HAVE_CUDA==1
  if (CuDevice::Instantiate().Enabled()) {
    cuSafeCall(cudaMalloc(reinterpret_cast<void**>(&this->data_), dim * sizeof(Real)));
    this->dim_ = dim;
    if (t == kSetZero) this->SetZero();
  } else
#endif
  {
    // CPU: let Vector allocate the memory and then take it over.
    Vector<Real> vec(dim);
    this->Swap(&vec);
  }
}
// Exchanges the contents of *this with the CPU vector *vec.
//
// Without a GPU the two objects simply swap their data pointers and
// dimensions.  With a GPU the data has to be copied between host and device,
// so the four empty/non-empty combinations are handled separately; the case
// where both are non-empty is reduced to the simpler cases via a temporary.
template<typename Real>
void CuVector<Real>::Swap(Vector<Real> *vec) {
#if HAVE_CUDA==1
if (CuDevice::Instantiate().Enabled()) {
if (this->dim_ == 0) {
if (vec->dim_ != 0) {
// *this is empty, but vec is nonempty.
Resize(vec->dim_, kUndefined);
this->CopyFromVec(*vec);
vec->Resize(0);
}
// else both are empty.
} else { // *this is nonempty.
if (vec->dim_ != 0) {
// Both *this and *vec are nonempty. Recurse to simpler cases.
// this could be done more efficiently in the case where
// the size does not change.
Vector<Real> temp;
this->Swap(&temp); // now temp is full, *this is empty.
vec->Swap(&temp); // now vec has data from *this, temp has
// data from vec.
Swap(vec); // copy data in vec to *this, which is now empty.
} else { // *this is full but *vec is empty.
vec->Resize(this->dim_, kUndefined);
this->CopyToVec(vec);
this->Destroy();
}
}
} else
#endif
{
std::swap(vec->data_, this->data_);
std::swap(vec->dim_, this->dim_);
}
}
// Releases the vector's storage, if any, and resets it to the empty state
// (data_ == NULL, dim_ == 0).  The memory is freed with cudaFree when the GPU
// is enabled and with KALDI_MEMALIGN_FREE otherwise.
template<typename Real>
void CuVector<Real>::Destroy() {
#if HAVE_CUDA==1
if (CuDevice::Instantiate().Enabled()) {
if (this->data_ != NULL) {
cuSafeCall(cudaFree(this->data_));
}
} else
#endif
{
if (this->data_ != NULL) KALDI_MEMALIGN_FREE(this->data_);
}
this->data_ = NULL;
this->dim_ = 0;
}
// Copies the elements of `src` into *this; both must have the same
// dimension.  Uses a device-to-device cudaMemcpy when the GPU is enabled and
// memcpy otherwise.
template<typename Real>
void CuVectorBase<Real>::CopyFromVec(const CuVectorBase<Real> &src) {
  KALDI_ASSERT(src.Dim() == dim_);
#if HAVE_CUDA==1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    cuSafeCall(cudaMemcpy(data_, src.data_, src.dim_ * sizeof(Real),
                          cudaMemcpyDeviceToDevice));
    CuDevice::Instantiate().AccuProfile("CuVector::CopyFromVecD2D",tim.Elapsed());
    return;
  }
#endif
  memcpy(static_cast<void*>(data_), static_cast<void*>(src.data_),
         dim_ * sizeof(Real));
}
// Copies the elements of the CPU vector `src` into *this; both must have the
// same dimension.  Uses a host-to-device cudaMemcpy when the GPU is enabled
// and memcpy otherwise.
template<typename Real>
void CuVectorBase<Real>::CopyFromVec(const VectorBase<Real> &src) {
  KALDI_ASSERT(src.Dim() == dim_);
#if HAVE_CUDA==1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    cuSafeCall(cudaMemcpy(data_, src.Data(), src.Dim()*sizeof(Real),
                          cudaMemcpyHostToDevice));
    CuDevice::Instantiate().AccuProfile("CuVector::CopyFromVecH2D",tim.Elapsed());
    return;
  }
#endif
  memcpy(static_cast<void*>(data_), static_cast<const void*>(src.Data()),
         dim_ * sizeof(Real));
}
// Copies the elements of *this into the CPU vector *dst, which must already
// have the same dimension.  Uses a device-to-host cudaMemcpy when the GPU is
// enabled; otherwise copies through the CPU view of *this.
template<typename Real>
void CuVectorBase<Real>::CopyToVec(VectorBase<Real> *dst) const {
  KALDI_ASSERT(dst->Dim() == dim_);
#if HAVE_CUDA==1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    cuSafeCall(cudaMemcpy(dst->Data(), this->data_,
                          dim_*sizeof(Real), cudaMemcpyDeviceToHost));
    CuDevice::Instantiate().AccuProfile("CuVector::CopyToVecD2H",tim.Elapsed());
    return;
  }
#endif
  dst->CopyFromVec(Vec());
}
// Reads the vector from `is` (binary or text, as selected by `binary`).
// The data is first read into a CPU vector and then swapped into *this,
// replacing whatever *this held before.
template<typename Real>
void CuVector<Real>::Read(std::istream &is, bool binary) {
  // The temporary must be Vector<Real>, not Vector<BaseFloat>: Swap() takes
  // a Vector<Real>*, so Vector<BaseFloat> is only usable when Real happens
  // to be BaseFloat.
  Vector<Real> temp;
  temp.Read(is, binary);
  Destroy();
  Swap(&temp);
}
// Writes the vector to `os` (binary or text, as selected by `binary`) by
// copying it to a CPU vector and writing that.
template<typename Real>
void CuVector<Real>::Write(std::ostream &os, bool binary) const {
  // The temporary must be Vector<Real>, not Vector<BaseFloat>: CopyToVec()
  // takes a VectorBase<Real>*, so Vector<BaseFloat> is only usable when Real
  // happens to be BaseFloat.
  Vector<Real> temp(this->dim_);
  this->CopyToVec(&temp);
  temp.Write(os, binary);
}
// Sets every element to zero.  On the GPU path the vector must be non-empty
// (this is asserted) and the memory is cleared with cudaMemset.
template<typename Real>
void CuVectorBase<Real>::SetZero() {
#if HAVE_CUDA==1
  if (CuDevice::Instantiate().Enabled()) {
    KALDI_ASSERT(dim_>0);
    KALDI_ASSERT(data_!=NULL);
    Timer tim;
    cuSafeCall(cudaMemset(data_, 0, dim_*sizeof(Real)));
    CuDevice::Instantiate().AccuProfile("CuVector::SetZero",tim.Elapsed());
    return;
  }
#endif
  Vec().SetZero();
}
/**
* Print the vector to stream
*/
template<typename Real>
std::ostream &operator << (std::ostream &out, const CuVectorBase<Real> &vec) {
Vector<Real> temp;
vec.CopyToVec(&temp);
out << temp;
return out;
}
/*
* Methods wrapping the ANSI-C CUDA kernels
*/
// Sets every element of the vector to `value`.
template<typename Real>
void CuVectorBase<Real>::Set(Real value) {
#if HAVE_CUDA==1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    // The matrix kernel is reused by describing the vector with a MatrixDim
    // (presumably {rows, cols, stride}, i.e. one row of Dim() elements).
    ::MatrixDim d = { 1, Dim(), Dim() };
    dim3 dimBlock(CUBLOCK);
    dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
    cuda_set_const(dimGrid, dimBlock, data_, value, d);
    cuSafeCall(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
    return;
  }
#endif
  Vec().Set(value);
}
// Adds the constant `value` to every element of the vector.
template<typename Real>
void CuVectorBase<Real>::Add(Real value) {
#if HAVE_CUDA==1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    // The matrix kernel is reused by describing the vector with a MatrixDim
    // (presumably {rows, cols, stride}, i.e. one row of Dim() elements).
    ::MatrixDim d = { 1, Dim(), Dim() };
    dim3 dimBlock(CUBLOCK);
    dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
    cuda_add(dimGrid, dimBlock, data_, value, d);
    cuSafeCall(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
    return;
  }
#endif
  Vec().Add(value);
}
// Multiplies every element of the vector by `value`.
template<typename Real>
void CuVectorBase<Real>::Scale(Real value) {
#if HAVE_CUDA==1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    // The matrix kernel is reused by describing the vector with a MatrixDim
    // (presumably {rows, cols, stride}, i.e. one row of Dim() elements).
    ::MatrixDim d = { 1, Dim(), Dim() };
    dim3 dimBlock(CUBLOCK);
    dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
    cuda_scale(dimGrid, dimBlock, data_, value, d);
    cuSafeCall(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
    return;
  }
#endif
  Vec().Scale(value);
}
// Computes *this = beta * (*this) + alpha * vec.  `vec` must have the same
// dimension as *this.  On the CPU the scaling by beta is skipped when beta
// is exactly 1.
template<class Real>
void CuVectorBase<Real>::AddVec(Real alpha, const CuVectorBase<Real> &vec,
                                Real beta) {
  KALDI_ASSERT(vec.Dim() == Dim());
#if HAVE_CUDA==1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    // The matrix kernel is reused by describing the vector with a MatrixDim
    // (presumably {rows, cols, stride}, i.e. one row of Dim() elements).
    ::MatrixDim d = { 1, Dim(), Dim() };
    dim3 dimBlock(CUBLOCK);
    dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
    cuda_add_mat(dimGrid, dimBlock, alpha, vec.data_, beta, data_, d);
    cuSafeCall(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
    return;
  }
#endif
  if (beta != 1.0) Vec().Scale(beta);
  Vec().AddVec(alpha, vec.Vec());
}
// Computes *this = beta * (*this) + alpha * (sum of the rows of mat).
// mat must have as many columns as *this has elements.
//
// On the GPU the row sum is accumulated into a temporary vector by launching
// cuda_add_row_sum_mat on chunks of 256 rows plus one final launch for the
// remaining (NumRows() % 256) rows; the temporary is then combined with
// *this through AddVec().  On the CPU the call is forwarded to
// Vector::AddRowSumMat().
template<typename Real>
void CuVectorBase<Real>::AddRowSumMat(Real alpha, const CuMatrixBase<Real> &mat,
Real beta) {
KALDI_ASSERT(mat.NumCols() == Dim());
#if HAVE_CUDA==1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
CuVector<Real> temp(Dim()); // create a buffer
temp.SetZero();
MatrixDim d = mat.Dim(); // only stride will be used!
// process per 256 row blocks
for(int32 block=0; (block+1)*256 <= mat.NumRows(); block++) {
// 1st dim ... rows, 2nd dim ... cols
dim3 dimBlock(256, 1);
dim3 dimGrid(1, mat.NumCols());
// element offset of the first row of this chunk
int32 offset = block*256*d.stride;
cuda_add_row_sum_mat(dimGrid, dimBlock, mat.data_ + offset, temp.data_, d);
}
// process the remainder
int32 div = mat.NumRows() / 256;
int32 mod = mat.NumRows() % 256;
if (mod != 0) {
// 1st dim ... rows, 2nd dim ... cols
dim3 dimBlock(mod, 1);
dim3 dimGrid(1, mat.NumCols());
int32 offset = div*256*d.stride;
cuda_add_row_sum_mat(dimGrid, dimBlock, mat.data_ + offset, temp.data_, d);
}
// now we have the sum!
// add the buffer temp to this vector using alpha and beta
this->AddVec(alpha,temp,beta);
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
{
Vec().AddRowSumMat(alpha, mat.Mat(), beta);
}
}
// Computes *this = beta * (*this) + alpha * (sum of the columns of mat).
// mat must have as many rows as *this has elements.
//
// On the GPU the column sum is accumulated into a temporary vector by
// launching cuda_add_col_sum_mat on chunks of 256 columns plus one final
// launch for the remaining (NumCols() % 256) columns; the temporary is then
// combined with *this through AddVec().  On the CPU the call is forwarded to
// Vector::AddColSumMat().
template<typename Real>
void CuVectorBase<Real>::AddColSumMat(Real alpha,
const CuMatrixBase<Real> &mat,
Real beta) {
KALDI_ASSERT(mat.NumRows() == Dim());
#if HAVE_CUDA==1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
CuVector<Real> temp(Dim()); // create a buffer
MatrixDim d = mat.Dim(); // only stride will be used!
// process per 256 column blocks
for(int32 block=0; (block+1)*256 <= mat.NumCols(); block++) {
// 1st dim ... cols, 2nd dim ... rows
dim3 dimBlock(256, 1);
dim3 dimGrid(1, mat.NumRows());
// element offset of the first column of this chunk
int32 offset = block*256;
cuda_add_col_sum_mat(dimGrid, dimBlock, mat.data_ + offset, temp.data_, d);
}
// process the remainder
int32 div = mat.NumCols() / 256;
int32 mod = mat.NumCols() % 256;
if (mod != 0) {
// 1st dim ... cols, 2nd dim ... rows
dim3 dimBlock(mod, 1);
dim3 dimGrid(1, mat.NumRows());
int32 offset=div*256;
cuda_add_col_sum_mat(dimGrid, dimBlock, mat.data_ +offset, temp.data_, d);
}
// now we have the sum!
// add the buffer temp to this vector using alpha and beta
this->AddVec(alpha, temp, beta);
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
{
Vec().AddColSumMat(alpha, mat.Mat(), beta);
}
}
// Replaces every element of the vector by its reciprocal.
template<typename Real>
void CuVectorBase<Real>::InvertElements() {
#if HAVE_CUDA==1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    // The matrix kernel is reused by describing the vector with a MatrixDim
    // (presumably {rows, cols, stride}, i.e. one row of dim_ elements).
    MatrixDim d = {1, dim_, dim_};
    dim3 dimBlock(CUBLOCK*8, 1);
    dim3 dimGrid(n_blocks(dim_, CUBLOCK*8));
    cuda_invert_elements(dimGrid, dimBlock, data_, d);
    cuSafeCall(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
    return;
  }
#endif
  Vec().InvertElements();
}
} // namespace kaldi
#endif

Просмотреть файл

@ -0,0 +1,169 @@
// cudamatrix/cu-vector-speed-test.cc
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <vector>
#include <cstdlib>
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-vector.h"
#include "cudamatrix/cu-math.h"
using namespace kaldi;
namespace kaldi {
// Returns the tag used in the benchmark log lines for the type Real:
// "<double>" for 8-byte types and "<float>" for everything else.
template<typename Real>
std::string NameOf() {
  if (sizeof(Real) == 8)
    return "<double>";
  return "<float>";
}
// Benchmarks CuVector::ApplySoftMax() on a random vector of dimension `dim`:
// the call is repeated for about 0.05 seconds and the throughput, counted as
// dim operations per call, is logged.
template<typename Real> void TestCuVectorSoftmax(int32 dim) {
  BaseFloat time_in_secs = 0.05;
  CuVector<Real> vec(dim);
  vec.SetRandn();

  Timer tim;
  int32 iter = 0;
  while (tim.Elapsed() < time_in_secs) {
    vec.ApplySoftMax();
    iter++;
  }

  BaseFloat fdim = dim;
  BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09);
  KALDI_LOG << "For CuVector::Softmax" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
// Benchmarks CuVector::Sum() on a random vector of dimension `dim`: the call
// is repeated for about 0.05 seconds and the throughput, counted as dim
// operations per call, is logged.
template<typename Real> void TestCuVectorSum(int32 dim) {
  BaseFloat time_in_secs = 0.05;
  CuVector<Real> vec(dim);
  vec.SetRandn();

  Timer tim;
  int32 iter = 0;
  while (tim.Elapsed() < time_in_secs) {
    vec.Sum();
    iter++;
  }

  BaseFloat fdim = dim;
  BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09);
  KALDI_LOG << "For CuVector::Sum" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
// Benchmarks summing a vector via a dot product with a vector of ones.  The
// ones vector is created and filled inside the timed loop, so its setup cost
// is part of the measurement.  Runs for about 0.05 seconds and logs the
// throughput, counted as dim operations per iteration.
template<typename Real> void TestCuVectorVecVecOne(int32 dim) {
  BaseFloat time_in_secs = 0.05;
  CuVector<Real> vec(dim);
  vec.SetRandn();

  Timer tim;
  int32 iter = 0;
  while (tim.Elapsed() < time_in_secs) {
    CuVector<Real> ones(dim);
    ones.Set(1.0);
    VecVec(vec, ones);
    iter++;
  }

  BaseFloat fdim = dim;
  BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09);
  KALDI_LOG << "For CuVector::VecVecOne" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
// Benchmarks CuVector::AddDiagMatMat() with two random dim x dim matrices.
// The call is repeated for about 0.05 seconds and the throughput, counted as
// dim * dim operations per call, is logged.
template<typename Real> void TestCuVectorAddDiagMatMat(int32 dim) {
  BaseFloat time_in_secs = 0.05;
  CuVector<Real> diag(dim);
  diag.SetRandn();
  CuMatrix<Real> left(dim, dim), right(dim, dim);
  left.SetRandn();
  right.SetRandn();

  Timer tim;
  int32 iter = 0;
  while (tim.Elapsed() < time_in_secs) {
    diag.AddDiagMatMat(1.0, left, kNoTrans, right, kNoTrans, 1.0);
    iter++;
  }

  BaseFloat fdim = dim;
  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
  KALDI_LOG << "For CuVector::AddDiagMatMat" << NameOf<Real>() << ", for dim = "
            << dim << ", speed was " << gflops << " gigaflops.";
}
template<typename Real> void CudaVectorSpeedTest() {
std::vector<int32> sizes;
sizes.push_back(16);
sizes.push_back(128);
sizes.push_back(256);
sizes.push_back(1024);
int32 ns = sizes.size();
for (int32 s = 0; s < ns; s++) {
TestCuVectorSoftmax<Real>(sizes[s]);
}
for (int32 s = 0; s < ns; s++) {
TestCuVectorSum<Real>(sizes[s]);
}
for (int32 s = 0; s < ns; s++) {
TestCuVectorVecVecOne<Real>(sizes[s]);
}
for (int32 s = 0; s < ns; s++) {
TestCuVectorAddDiagMatMat<Real>(sizes[s]);
}
}
} // namespace kaldi
// Benchmark driver: requests a GPU (when built with CUDA) and runs the vector
// speed tests for float and, when supported, double.
int main() {
//Select the GPU
#if HAVE_CUDA == 1
CuDevice::Instantiate().SelectGpuId("yes"); // "yes": request a GPU
#endif
kaldi::CudaVectorSpeedTest<float>();
#if HAVE_CUDA == 1
// Only run the double-precision benchmarks when the selected device reports
// support for them.
if (CuDevice::Instantiate().DoublePrecisionSupported()) {
kaldi::CudaVectorSpeedTest<double>();
} else {
KALDI_WARN << "Double precision not supported";
}
#else
kaldi::CudaVectorSpeedTest<double>();
#endif
std::cout << "Tests succeeded.\n";
}

Просмотреть файл

@ -0,0 +1,751 @@
// cudamatrix/cuda-vector-test.cc
// Copyright 2013 Lucas Ondel
// 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-vector.h"
#include "cudamatrix/cu-tp-matrix.h"
#include "cudamatrix/cu-sp-matrix.h"
#include "cudamatrix/cu-math.h"
namespace kaldi {
/*
* INITIALIZERS
*/
/*
* Unit tests
*/
template<class Real>
static void UnitTestCuVectorIO() {
for (int32 i = 0; i < 10; i++) {
int32 dimM = rand() % 255;
if (i % 5 == 0) { dimM = 0; }
CuVector<Real> vec(dimM);
vec.SetRandn();
std::ostringstream os;
bool binary = (i % 4 < 2);
vec.Write(os, binary);
CuVector<Real> vec2;
std::istringstream is(os.str());
vec2.Read(is, binary);
AssertEqual(vec, vec2);
}
}
template<typename Real, typename OtherReal>
static void UnitTestCuVectorCopyFromVec() {
for (int32 i = 1; i < 10; i++) {
MatrixIndexT dim = 10 * i;
Vector<Real> A(dim);
A.SetRandn();
CuVector<OtherReal> B(A);
Vector<Real> C(B);
CuVector<Real> D(dim);
D.CopyFromVec(C);
Vector<OtherReal> E(dim);
E.CopyFromVec(D);
CuVector<Real> F(E);
CuVector<Real> A2(A);
AssertEqual(F, A2);
}
}
// Checks that a CuSubVector built with the constructor and one built with
// CuVector::Range() both refer to the expected element of the parent vector.
template<typename Real>
static void UnitTestCuSubVector() {
  for (int32 iter = 0 ; iter < 10; iter++) {
    // M1: offset of the sub-vector, M2: its length, M3: elements after it.
    // M2 used to be "1 + rand() % 1", which is always 1, so only sub-vectors
    // of length one (and index m == 0) were ever tested.
    int32 M1 = 1 + rand() % 10, M2 = 1 + rand() % 10, M3 = 1 + rand() % 10,
        M = M1 + M2 + M3,
        m = rand() % M2;  // index inside the sub-vector
    CuVector<Real> vec(M);
    vec.SetRandn();
    CuSubVector<Real> subvec1(vec, M1, M2),
        subvec2 = vec.Range(M1, M2);
    Real f1 = vec(M1 + m), f2 = subvec1(m), f3 = subvec2(m);
    KALDI_ASSERT(f1 == f2);
    KALDI_ASSERT(f2 == f3);
  }
}
template<typename Real>
static void UnitTestCuVectorMulTp() {
for (int32 i = 1; i < 10; i++) {
MatrixIndexT dim = 10 * i;
Vector<Real> A(dim);
A.SetRandn();
TpMatrix<Real> B(dim);
B.SetRandn();
CuVector<Real> C(A);
CuTpMatrix<Real> D(B);
A.MulTp(B, kNoTrans);
C.MulTp(D, kNoTrans);
CuVector<Real> E(A);
AssertEqual(C, E);
}
}
template<typename Real>
static void UnitTestCuVectorAddTp() {
for (int32 i = 1; i < 10; i++) {
MatrixIndexT dim = 10 * i;
Vector<Real> A(dim);
A.SetRandn();
TpMatrix<Real> B(dim);
B.SetRandn();
Vector<Real> C(dim);
C.SetRandn();
CuVector<Real> D(A);
CuTpMatrix<Real> E(B);
CuVector<Real> F(C);
A.AddTpVec(1.0, B, kNoTrans, C, 1.0);
D.AddTpVec(1.0, E, kNoTrans, F, 1.0);
CuVector<Real> G(A);
AssertEqual(D, G);
}
}
// Checks VecVec() on two random CuVectors against a dot product accumulated
// element by element.
template<typename Real> void CuVectorUnitTestVecVec() {
  // Random dimension in [10, 109].  The previous "10 % rand() % 100" divided
  // by zero when rand() returned 0 and otherwise almost always gave 10.
  int32 M = 10 + rand() % 100;
  CuVector<Real> vec1(M), vec2(M);
  vec1.SetRandn();
  vec2.SetRandn();
  Real prod = 0.0;
  for (int32 i = 0; i < M; i++)
    prod += vec1(i) * vec2(i);
  AssertEqual(prod, VecVec(vec1, vec2));
}
// Checks CuVector::AddVec(alpha, vec) element by element against
// orig(i) + alpha * vec(i).
template<typename Real> void CuVectorUnitTestAddVec() {
  // Random dimension in [10, 109].  The previous "10 % rand() % 100" divided
  // by zero when rand() returned 0 and otherwise almost always gave 10.
  int32 M = 10 + rand() % 100;
  CuVector<Real> vec1(M);
  CuVector<Real> vec2(M);
  vec1.SetRandn();
  vec2.SetRandn();
  CuVector<Real> vec1_orig(vec1);
  BaseFloat alpha = 0.43243;
  vec1.AddVec(alpha, vec2);
  for (int32 i = 0; i < M; i++)
    AssertEqual(vec1_orig(i) + alpha * vec2(i), vec1(i));
}
template<typename Real> void CuVectorUnitTestAddVecCross() {
for (int32 i = 0; i < 4; i++) {
int32 M = 10 % rand() % 100;
CuVector<float> vec1(M);
CuVector<Real> vec2(M);
vec1.SetRandn();
vec2.SetRandn();
if (i == 0) {
CuVector<Real> vec1_orig(vec1);
Real alpha = 0.43243;
vec1.AddVec(alpha, vec2);
for (int32 i = 0; i < M; i++)
AssertEqual(vec1_orig(i) + alpha * vec2(i), vec1(i));
} else {
CuVector<Real> vec2_orig(vec2);
Real alpha = 0.43243;
vec2.AddVec(alpha, vec1);
for (int32 i = 0; i < M; i++)
AssertEqual(vec2_orig(i) + alpha * vec1(i), vec2(i));
}
}
}
// Checks CuVector::AddVec(alpha, vec, beta) element by element against
// beta * orig(i) + alpha * vec(i).
template<typename Real> void CuVectorUnitTestAddVecExtra() {
  // Random dimension in [10, 109].  The previous "10 % rand() % 100" divided
  // by zero when rand() returned 0 and otherwise almost always gave 10.
  int32 M = 10 + rand() % 100;
  CuVector<Real> vec1(M), vec2(M);
  vec1.SetRandn();
  vec2.SetRandn();
  CuVector<Real> vec1_orig(vec1);
  BaseFloat alpha = 0.43243, beta = 1.4321;
  vec1.AddVec(alpha, vec2, beta);
  for (int32 i = 0; i < M; i++)
    AssertEqual(beta * vec1_orig(i) + alpha * vec2(i), vec1(i));
}
// Compares CuVector::AddRowSumMat() against the CPU Vector::AddRowSumMat()
// on a random N x M matrix and a random starting vector.
template<typename Real> void CuVectorUnitTestAddRowSumMat() {
  int32 M = 10 + rand() % 280, N = 10 + rand() % 20;
  BaseFloat alpha = 10.0143432, beta = 43.4321;
  CuMatrix<Real> mat(N, M);
  mat.SetRandn();
  CuVector<Real> vec(M);
  // Start from a random vector (the matrix used to be randomized a second
  // time here instead), so that the beta * (*this) term is exercised too;
  // with an all-zero vector beta had no effect on the result.
  vec.SetRandn();
  Matrix<Real> mat2(mat);
  Vector<Real> vec2(vec);  // CPU copy of the same starting vector
  vec.AddRowSumMat(alpha, mat, beta);
  vec2.AddRowSumMat(alpha, mat2, beta);
  Vector<Real> vec3(vec);
  AssertEqual(vec2, vec3);
}
// Compares CuVector::AddColSumMat() against the CPU Vector::AddColSumMat()
// on a random M x N matrix and a random starting vector.
template<typename Real> void CuVectorUnitTestAddColSumMat() {
  int32 M = 10 + rand() % 280, N = 10 + rand() % 20;
  BaseFloat alpha = 10.0143432, beta = 43.4321;
  CuMatrix<Real> mat(M, N);
  mat.SetRandn();
  CuVector<Real> vec(M);
  // Start from a random vector (the matrix used to be randomized a second
  // time here instead), so that the beta * (*this) term is exercised too;
  // with an all-zero vector beta had no effect on the result.
  vec.SetRandn();
  Matrix<Real> mat2(mat);
  Vector<Real> vec2(vec);  // CPU copy of the same starting vector
  vec.AddColSumMat(alpha, mat, beta);
  vec2.AddColSumMat(alpha, mat2, beta);
  Vector<Real> vec3(vec);
  AssertEqual(vec2, vec3);
}
template<typename Real> void CuVectorUnitTestApproxEqual() {
int32 M = 10 + rand() % 100;
CuVector<Real> vec1(M), vec2(M);
vec1.SetRandn();
vec2.SetRandn();
Real tol = 0.5;
for (int32 i = 0; i < 10; i++) {
Real sumsq = 0.0, sumsq_orig = 0.0;
for (int32 j = 0; j < M; j++) {
sumsq += (vec1(j) - vec2(j)) * (vec1(j) - vec2(j));
sumsq_orig += vec1(j) * vec1(j);
}
Real rms = sqrt(sumsq), rms_orig = sqrt(sumsq_orig);
KALDI_ASSERT(vec1.ApproxEqual(vec2, tol) == (rms <= tol * rms_orig));
tol *= 2.0;
}
}
template<typename Real> static void UnitTestCuVectorReplaceValue() {
for (int32 i = 0; i < 5; i++) {
int32 dim = 100 + rand() % 200;
Real orig = 0.1 * (rand() % 100), changed = 0.1 * (rand() % 50);
Vector<Real> vec(dim);
vec.SetRandn();
vec(dim / 2) = orig;
CuVector<Real> vec1(vec);
vec.ReplaceValue(orig, changed);
vec1.ReplaceValue(orig, changed);
Vector<Real> vec2(vec1);
AssertEqual(vec, vec2);
}
}
template<typename Real> void CuVectorUnitTestInvertElements() {
// Also tests MulElements();
int32 M = 256 + rand() % 100;
CuVector<Real> vec1(M);
vec1.SetRandn();
CuVector<Real> vec2(vec1);
vec2.InvertElements();
CuVector<Real> vec3(vec1);
vec3.MulElements(vec2);
// vec3 should be all ones.
Real prod = VecVec(vec3, vec3);
AssertEqual(prod, static_cast<Real>(M));
}
// Checks CuVector::Sum() against a dot product with a vector of ones, for
// several large dimensions.
template<typename Real> void CuVectorUnitTestSum() {
  for (int32 i =1; i < 10; i++) {
    // A multiple of 2048 plus a random offset in [0, 99].  The previous
    // "100 % rand()" divided by zero when rand() returned 0 and otherwise
    // almost always gave exactly 100.
    MatrixIndexT dim = 2048 * i + rand() % 100;
    CuVector<Real> A(dim), ones(dim);
    A.SetRandn();
    ones.Set(1.0);
    AssertEqual(VecVec(A, ones), A.Sum());
  }
}
// Compares CuVector::Scale() against the CPU Vector::Scale() on random data.
template<typename Real> void CuVectorUnitTestScale() {
  for (int32 i = 0; i < 4; i++) {
    // Random dimension in [100, 499].  The previous "400 % rand()" divided
    // by zero when rand() returned 0 and otherwise almost always gave
    // exactly 400.
    int32 dim = 100 + rand() % 400;
    CuVector<Real> cu_vec(dim);
    cu_vec.SetRandn();
    Vector<Real> vec(cu_vec);
    BaseFloat scale = 0.333;
    cu_vec.Scale(scale);
    vec.Scale(scale);
    Vector<Real> vec2(cu_vec);
    KALDI_ASSERT(ApproxEqual(vec, vec2));
  }
}
// Tests copying between vectors and matrices:
//  - CuVector::CopyColFromMat() is checked element by element for every
//    column of a random M x N CuMatrix;
//  - CopyRowsFromMat() (CuVector from CuMatrix, CuVector from Matrix, Vector
//    from CuMatrix) and CopyRowsFromVec() (Matrix and CuMatrix) are checked
//    on a random subset of elements, using row-major indexing
//    (element j of the vector <-> matrix element (j / N, j % N)).
template<typename Real> void CuVectorUnitTestCopyFromMat() {
int32 M = 100 + rand() % 255, N = 100 + rand() % 255;
CuMatrix<Real> cu_matrix(M, N);
cu_matrix.SetRandn();
for(int32 i = 0; i < N; i++) {
CuVector<Real> vector(M);
vector.CopyColFromMat(cu_matrix, i);
for(int32 j = 0; j < M; j++) {
KALDI_ASSERT(vector(j)==cu_matrix(j, i));
}
}
Matrix<Real> matrix(cu_matrix), matrix2(M, N);
CuMatrix<Real> matrix3(M, N);
CuVector<Real> vector(M * N), vector2(M * N);
vector.CopyRowsFromMat(cu_matrix);
vector2.CopyRowsFromMat(matrix);
matrix2.CopyRowsFromVec(vector2);
matrix3.CopyRowsFromVec(Vector<Real>(vector2));
Vector<Real> vector3(M * N);
vector3.CopyRowsFromMat(cu_matrix);
for(int32 j = 0; j < M*N; j++) {
if (rand() % 500 == 0) { // random small subset (it was slow)
KALDI_ASSERT(vector(j) == cu_matrix(j/N, j%N));
KALDI_ASSERT(vector2(j) == cu_matrix(j/N, j%N));
KALDI_ASSERT(vector2(j) == matrix2(j/N, j%N));
KALDI_ASSERT(vector3(j) == matrix2(j/N, j%N));
KALDI_ASSERT(vector3(j) == matrix3(j/N, j%N));
}
}
}
template<typename Real> void CuVectorUnitTestCopyDiagFromPacked() {
for (int32 i = 0; i < 5; i++) {
int32 N = 100 + rand() % 255;
CuSpMatrix<Real> S(N);
S.SetRandn();
CuVector<Real> V(N, kUndefined);
V.CopyDiagFromPacked(S);
SpMatrix<Real> cpu_S(S);
Vector<Real> cpu_V(N);
cpu_V.CopyDiagFromPacked(cpu_S);
Vector<Real> cpu_V2(V);
KALDI_ASSERT(cpu_V.ApproxEqual(cpu_V2));
}
}
template<typename Real> void CuVectorUnitTestCopyCross() {
for (int32 i = 0; i < 10; i++) {
int32 M = 100 + rand() % 255;
if (rand() % 3 == 0) M = 0;
CuVector<Real> v1(M);
v1.SetRandn();
CuVector<float> v2(M);
v2.CopyFromVec(v1);
CuVector<Real> v3(M);
v3.CopyFromVec(v2);
AssertEqual(v1, v3);
}
}
template<typename Real> void CuVectorUnitTestCopyCross2() {
for (int32 i = 0; i < 10; i++) {
int32 M = 100 + rand() % 255;
if (rand() % 3 == 0) M = 0;
CuVector<Real> v1(M);
v1.SetRandn();
Vector<float> v2(M);
v2.CopyFromVec(v1);
CuVector<Real> v3(M);
v3.CopyFromVec(v2);
AssertEqual(v1, v3);
}
}
template<typename Real> void CuVectorUnitTestCopyDiagFromMat() {
for (int32 i = 0; i < 5; i++) {
int32 M = 100 + rand() % 255, N = M + rand() % 2;
Matrix<Real> matrix(M, N);
if (i % 2 == 0) matrix.Transpose();
matrix.SetRandn();
Vector<Real> vector(M, kUndefined);
vector.CopyDiagFromMat(matrix);
CuMatrix<Real> cuda_matrix(matrix);
CuVector<Real> cuda_vector(M, kUndefined);
cuda_vector.CopyDiagFromMat(cuda_matrix);
Vector<Real> vector2(cuda_vector);
AssertEqual(vector, vector2);
AssertEqual(vector.Sum(), cuda_matrix.Trace(false));
AssertEqual(cuda_vector.Sum(), matrix.Trace(false));
}
}
template<typename Real> void CuVectorUnitTestNorm() {
int32 dim = 2;
CuVector<Real> cu_vector(dim);
cu_vector(0) = 1.0;
cu_vector(1) = -2.0;
KALDI_ASSERT(ApproxEqual(cu_vector.Norm(1.0), 3.0));
KALDI_ASSERT(ApproxEqual(cu_vector.Norm(2.0), sqrt(5.0)));
}
template<typename Real> void CuVectorUnitTestMin() {
for (int32 p = 0; p < 5; p++) {
int32 dim = 100 + rand() % 500;
CuVector<Real> cu_vector(dim);
cu_vector.SetRandn();
Vector<Real> vector(cu_vector);
Real min1 = cu_vector.Min(), min2 = vector.Min();
KALDI_ASSERT(min1 == min2);
}
}
template<typename Real> void CuVectorUnitTestMax() {
for (int32 p = 0; p < 5; p++) {
int32 dim = 100 + rand() % 500;
CuVector<Real> cu_vector(dim);
cu_vector.SetRandn();
Vector<Real> vector(cu_vector);
Real max1 = cu_vector.Max(), max2 = vector.Max();
KALDI_ASSERT(max1 == max2);
}
}
template<typename Real> void CuVectorUnitTestApplySoftMax() {
for (int32 i = 0; i < 10; i++) {
int32 dim = 100 + rand() % 300;
//int32 dim = 1024;
CuVector<Real> cu_vector(dim);
cu_vector.SetRandn();
Vector<Real> vector(cu_vector);
cu_vector.ApplySoftMax();
vector.ApplySoftMax();
CuVector<Real> cu_vector2(vector);
//std::cout<<cu_vector <<"\n"<<cu_vector2<<std::endl;
AssertEqual(cu_vector, cu_vector2);
}
}
/// Checks CuVector::ApplyExp element by element against exp() evaluated on a
/// copy of the input taken before the call.
template<typename Real> void CuVectorUnitTestApplyExp() {
  int32 dim = 100;
  CuVector<Real> vector(dim);
  vector.SetRandn();
  CuVector<Real> vector2(vector);  // untouched copy of the input
  vector.ApplyExp();
  for (int32 j = 0; j < dim; j++) {
    // Pull both values into plain Real variables so that std::abs selects the
    // floating-point overload.  The unqualified abs() used previously could
    // bind to the integer ::abs, truncating the error to zero and making the
    // assertion vacuous.
    const Real expected = exp(vector2(j));
    const Real actual = vector(j);
    // NOTE(review): the tolerance is absolute; if single-precision GPU and
    // host exp() differ in the last bit for large outputs it may need to
    // become relative -- confirm on real hardware.
    KALDI_ASSERT(std::abs(expected - actual) < 0.000001);
  }
}
/// Checks CuVector::ApplyLog element by element against log() evaluated on a
/// copy of the (strictly positive) input taken before the call.
template<typename Real> void CuVectorUnitTestApplyLog() {
  int32 dim = 100;
  CuVector<Real> vector(dim);
  vector.SetRandn();
  // Map every non-positive entry x to 1 - x (>= 1) so the log is defined.
  for (int32 j = 0; j < dim; j++) {
    if (vector(j) <= 0.0)
      vector(j) = 1.0 - vector(j);
  }
  CuVector<Real> vector2(vector);  // untouched copy of the input
  vector.ApplyLog();
  for (int32 j = 0; j < dim; j++) {
    // Pull both values into plain Real variables so that std::abs selects the
    // floating-point overload.  The unqualified abs() used previously could
    // bind to the integer ::abs, truncating the error to zero and making the
    // assertion vacuous.
    const Real expected = log(vector2(j));
    const Real actual = vector(j);
    KALDI_ASSERT(std::abs(expected - actual) < 0.000001);
  }
}
template<typename Real> void CuVectorUnitTestApplyFloor() {
for (int32 l = 0; l < 10; l++) {
int32 dim = 100 + rand() % 700;
CuVector<Real> cu_vector(dim);
cu_vector.SetRandn();
Vector<Real> vector(cu_vector);
BaseFloat floor = 0.33 * (-5 + rand() % 10);
int32 i = cu_vector.ApplyFloor(floor);
int32 j = vector.ApplyFloor(floor);
CuVector<Real> cu2(vector);
AssertEqual(cu2, cu_vector);
if (i != j) {
KALDI_WARN << "ApplyFloor return code broken...";
}
KALDI_ASSERT(i==j);
}
}
template<typename Real> void CuVectorUnitTestApplyPow() {
for (int32 l = 0; l < 10; l++) {
int32 dim = 100 + rand() % 700;
CuVector<Real> cu_vector(dim);
cu_vector.SetRandn();
Vector<Real> vector(cu_vector);
BaseFloat pow = -2 + (rand() % 5);
cu_vector.ApplyPow(pow);
vector.ApplyPow(pow);
CuVector<Real> cu2(vector);
AssertEqual(cu2, cu_vector);
}
}
template<typename Real> void CuVectorUnitTestAddVecVec() {
int32 dim = 100;
CuVector<Real> cu_vector(dim);
cu_vector.SetRandn();
Vector<Real> vector(cu_vector);
Real beta = rand();
Real alpha = rand();
Vector<Real> v(dim), r(dim);
v.SetRandn(); r.SetRandn();
CuVector<Real> cuV(v), cuR(r);
cu_vector.AddVecVec(alpha, cuR, cuV, beta);
vector.AddVecVec(alpha, r, v, beta);
CuVector<Real> cu2(vector);
std::cout<<cu2(0)<<' '<<cu_vector(0)<<std::endl;
AssertEqual(cu2, cu_vector);
}
template<typename Real> void CuVectorUnitTestAddDiagMat2() {
for (int p = 0; p < 4; p++) {
int32 M = 230 + rand() % 100, N = 230 + rand() % 100;
BaseFloat alpha = 0.2 + rand() % 3, beta = 0.3 + rand() % 2;
CuVector<Real> cu_vector(M);
cu_vector.SetRandn();
CuMatrix<Real> cu_mat_orig(M, N);
cu_mat_orig.SetRandn();
MatrixTransposeType trans = (p % 2 == 0 ? kNoTrans : kTrans);
CuMatrix<Real> cu_mat(cu_mat_orig, trans);
Vector<Real> vector(cu_vector);
Matrix<Real> mat(cu_mat);
vector.AddDiagMat2(alpha, mat, trans, beta);
cu_vector.AddDiagMat2(alpha, cu_mat, trans, beta);
Vector<Real> vector2(cu_vector);
AssertEqual(vector, vector2);
}
}
template<typename Real>
static void CuVectorUnitTestAddDiagMatMat() {
for (MatrixIndexT iter = 0; iter < 4; iter++) {
BaseFloat alpha = 0.432 + rand() % 5, beta = 0.043 + rand() % 2;
MatrixIndexT dimM = 10 + rand() % 300,
dimN = 5 + rand() % 300;
CuVector<Real> v(dimM);
CuMatrix<Real> M_orig(dimM, dimN), N_orig(dimN, dimM);
M_orig.SetRandn();
N_orig.SetRandn();
MatrixTransposeType transM = (iter % 2 == 0 ? kNoTrans : kTrans);
MatrixTransposeType transN = ((iter/2) % 2 == 0 ? kNoTrans : kTrans);
CuMatrix<Real> M(M_orig, transM), N(N_orig, transN);
v.SetRandn();
CuVector<Real> w(v);
w.AddDiagMatMat(alpha, M, transM, N, transN, beta);
{
CuVector<Real> w2(v);
CuMatrix<Real> MN(dimM, dimM);
MN.AddMatMat(1.0, M, transM, N, transN, 0.0);
CuVector<Real> d(dimM);
d.CopyDiagFromMat(MN);
w2.Scale(beta);
w2.AddVec(alpha, d);
AssertEqual(w, w2);
}
}
}
template<typename Real> void CuVectorUnitTestAddMatVec() {
for (int32 i = 0; i < 10; i++) {
int32 M = 10 + rand() % 500, N = 10 + rand() % 400;
bool transpose = (i % 2 == 0);
CuVector<Real> src_cu(M);
src_cu.SetRandn();
Vector<Real> src(src_cu);
CuVector<Real> dst_cu(N);
dst_cu.SetRandn();
Vector<Real> dst(dst_cu);
CuMatrix<Real> mat_cu(transpose ? M : N, transpose ? N : M);
mat_cu.SetRandn();
Matrix<Real> mat(mat_cu);
BaseFloat alpha = 0.5 * (rand() % 10), beta = 0.5 * (rand() % 10);
dst_cu.AddMatVec(alpha, mat_cu, transpose ? kTrans : kNoTrans,
src_cu, beta);
dst.AddMatVec(alpha, mat, transpose ? kTrans : kNoTrans,
src, beta);
Vector<Real> dst2(dst_cu);
AssertEqual(dst, dst2);
}
}
template<typename Real> void CuVectorUnitTestAddSpVec() {
for (int32 i = 0; i < 5; i++) {
int32 M = 100 + rand() % 256;
CuVector<Real> src_cu(M);
src_cu.SetRandn();
Vector<Real> src(src_cu);
CuVector<Real> dst_cu(M);
dst_cu.SetRandn();
Vector<Real> dst(dst_cu);
CuSpMatrix<Real> mat_cu(M);
mat_cu.SetRandn();
SpMatrix<Real> mat(mat_cu);
BaseFloat alpha = 0.5 * (rand() % 5), beta = 0.5 * (rand() % 5);
dst_cu.AddSpVec(alpha, mat_cu, src_cu, beta);
dst.AddSpVec(alpha, mat, src, beta);
Vector<Real> dst2(dst_cu);
AssertEqual(dst, dst2);
}
}
/// Runs every CuVector unit test in this file for the given precision Real.
/// The calls consume the global rand() stream, so their order determines the
/// random data each test sees.
template<typename Real> void CuVectorUnitTest() {
UnitTestCuVectorCopyFromVec<Real, float>();
// When compiled with CUDA, the following `if` guards the double-precision
// copy test so it only runs if the device reports double-precision support;
// without CUDA the test always runs.
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().DoublePrecisionSupported())
#endif
UnitTestCuVectorCopyFromVec<Real, double>();
UnitTestCuVectorIO<Real>();
CuVectorUnitTestVecVec<Real>();
CuVectorUnitTestAddVec<Real>();
CuVectorUnitTestAddVecCross<Real>();
CuVectorUnitTestAddVecExtra<Real>();
CuVectorUnitTestApproxEqual<Real>();
CuVectorUnitTestScale<Real>();
CuVectorUnitTestSum<Real>();
CuVectorUnitTestInvertElements<Real>();
CuVectorUnitTestAddRowSumMat<Real>();
CuVectorUnitTestAddColSumMat<Real>();
UnitTestCuVectorReplaceValue<Real>();
UnitTestCuVectorAddTp<Real>();
UnitTestCuVectorMulTp<Real>();
UnitTestCuSubVector<Real>();
CuVectorUnitTestCopyFromMat<Real>();
CuVectorUnitTestMin<Real>();
CuVectorUnitTestMax<Real>();
CuVectorUnitTestApplySoftMax<Real>();
CuVectorUnitTestCopyDiagFromPacked<Real>();
CuVectorUnitTestCopyDiagFromMat<Real>();
CuVectorUnitTestCopyCross<Real>();
CuVectorUnitTestCopyCross2<Real>();
CuVectorUnitTestNorm<Real>();
CuVectorUnitTestApplyExp<Real>();
CuVectorUnitTestApplyLog<Real>();
CuVectorUnitTestApplyFloor<Real>();
CuVectorUnitTestApplyPow<Real>();
CuVectorUnitTestAddMatVec<Real>();
CuVectorUnitTestAddSpVec<Real>();
CuVectorUnitTestAddVecVec<Real>();
CuVectorUnitTestAddDiagMat2<Real>();
CuVectorUnitTestAddDiagMatMat<Real>();
}
} // namespace kaldi
/// Test driver: runs the whole CuVector suite twice, first with the GPU
/// explicitly disabled and then with the --use-gpu policy given on the
/// command line (default "yes").
int main(int argc, char *argv[]) {
  using namespace kaldi;

  const char *usage = "Usage: cu-vector-test [options]";
  ParseOptions po(usage);
  std::string use_gpu = "yes";
  po.Register("use-gpu", &use_gpu, "yes|no|optional");
  po.Read(argc, argv);
  if (po.NumArgs() != 0) {
    // No positional arguments are accepted.
    po.PrintUsage();
    exit(1);
  }

  for (int32 loop = 0; loop < 2; loop++) {
    const bool first_pass = (loop == 0);
#if HAVE_CUDA == 1
    if (!first_pass)
      CuDevice::Instantiate().SelectGpuId(use_gpu);
    else
      CuDevice::Instantiate().SelectGpuId("no");  // first pass: GPU disabled
#endif
    kaldi::CuVectorUnitTest<float>();
#if HAVE_CUDA == 1
    if (!CuDevice::Instantiate().DoublePrecisionSupported()) {
      KALDI_WARN << "Double precision not supported";
    } else {
      kaldi::CuVectorUnitTest<double>();
    }
#else
    kaldi::CuVectorUnitTest<double>();
#endif
    if (first_pass)
      KALDI_LOG << "Tests without GPU use succeeded.\n";
    else
      KALDI_LOG << "Tests with GPU use (if available) succeeded.\n";
  }
#if HAVE_CUDA == 1
  CuDevice::Instantiate().PrintProfile();
#endif
  return 0;
}

1176
src/cudamatrix/cu-vector.cc Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -2,6 +2,8 @@
// Copyright 2009-2012 Karel Vesely
// Johns Hopkins University (author: Daniel Povey)
// Lucas Ondel
// 2013 Xiaohui Zhang
// See ../../COPYING for clarification regarding multiple authors
//
@ -25,49 +27,170 @@
#include "matrix/kaldi-vector.h"
#include "cudamatrix/cu-common.h"
#include "cudamatrix/cu-value.h"
#include "cudamatrix/cu-math.h"
namespace kaldi {
template<typename Real> class CuMatrixBase;
template<typename Real>
Real VecVec(const CuVectorBase<Real> &v1, const CuVectorBase<Real> &v2);
template<typename Real, typename OtherReal>
Real VecVec(const CuVectorBase<Real> &v1, const CuVectorBase<OtherReal> &v2);
/**
* Vector for CUDA computing
*/
template<typename Real>
class CuVectorBase {
public:
friend class CuVectorBase<float>;
friend class CuVectorBase<double>;
friend class CuMatrixBase<Real>;
friend class MatrixBase<Real>;
friend class CuPackedMatrix<Real>;
friend class CuSpMatrix<Real>;
friend class CuTpMatrix<Real>;
template <typename OtherReal>
friend OtherReal VecVec(const CuVectorBase<OtherReal> &v1,
const CuVectorBase<OtherReal> &v2);
friend void cu::Splice<Real>(const CuMatrix<Real> &src,
const CuStlVector<int32> &frame_offsets,
const CuArray<int32> &frame_offsets,
CuMatrix<Real> *tgt);
friend class CuRand<Real>;
/// Dimensions
MatrixIndexT Dim() const { return dim_; }
/// Returns a pointer to the start of the vector's data.
inline Real* Data() { return data_; }
/// Returns a pointer to the start of the vector's data (const).
inline const Real* Data() const { return data_; }
/// Copy functions; these will crash if the dimension
/// do not match. The operator = in class CuVector will
/// also change the sizes for you.
void CopyFromVec(const CuVectorBase<Real> &src);
void CopyFromVec(const VectorBase<Real> &src);
void CopyToVec(VectorBase<Real> *dst) const;
template<typename OtherReal>
void CopyFromVec(const CuVectorBase<OtherReal> &M);
template<typename OtherReal>
void CopyFromVec(const VectorBase<OtherReal> &src);
template<typename OtherReal>
void CopyToVec(VectorBase<OtherReal> *dst) const;
void CopyRowsFromMat(const CuMatrixBase<Real> &M);
void CopyRowsFromMat(const MatrixBase<Real> &M);
/// Math operations
void SetZero();
void Set(Real value);
void Add(Real value);
void Scale(Real value);
void AddVec(Real alpha, const CuVectorBase<Real> &vec, Real beta = 1.0);
template<typename OtherReal>
void AddVec(Real alpha, const CuVectorBase<OtherReal> &vec, Real beta = 1.0);
/// Sum the rows of the matrix, add to vector
void AddRowSumMat(Real alpha, const CuMatrixBase<Real> &mat, Real beta = 1.0);
/// Sum the columns of the matrix, add to vector
void AddColSumMat(Real alpha, const CuMatrixBase<Real> &mat, Real beta = 1.0);
/// Add triangular matrix times vector: this <-- beta*this + alpha*M*v.
/// Works even if rv == *this.
void AddTpVec(const Real alpha, const CuTpMatrix<Real>&M,
const MatrixTransposeType trans, const CuVectorBase<Real> &v,
const Real beta); // **beta previously defaulted to 0.0**
/// Multiplies this vector by lower-triangular marix: *this <-- *this *M
void MulTp(const CuTpMatrix<Real> &M, const MatrixTransposeType trans);
bool ApproxEqual(const CuVectorBase<Real> &other, float tol = 0.01) const;
void InvertElements();
void ApplySoftMax();
void ApplyExp();
void ApplyLog();
MatrixIndexT ApplyFloor(Real floor_val);
void ApplyPow(Real power);
Real Sum() const;
void SetRandn();
CuSubVector<Real> Range(const MatrixIndexT o, const MatrixIndexT l) {
return CuSubVector<Real>(*this, o, l);
}
const CuSubVector<Real> Range(const MatrixIndexT o,
const MatrixIndexT l) const {
return CuSubVector<Real>(*this, o, l);
}
void CopyColFromMat(const CuMatrixBase<Real> &mat, MatrixIndexT col);
template<typename OtherReal>
void CopyColFromMat(const CuMatrixBase<OtherReal> &mat, MatrixIndexT col);
void AddMatVec(const Real alpha, const CuMatrixBase<Real> &M,
MatrixTransposeType trans, const CuVectorBase<Real> &v,
const Real beta);
void AddVecVec(Real alpha, const CuVectorBase<Real> &v,
const CuVectorBase<Real> &r, Real beta);
void AddSpVec(const Real alpha, const CuSpMatrix<Real> &S,
const CuVectorBase<Real> &v, const Real beta);
/// Add the diagonal of a matrix times itself:
/// *this = diag(M M^T) + beta * *this (if trans == kNoTrans), or
/// *this = diag(M^T M) + beta * *this (if trans == kTrans).
void AddDiagMat2(Real alpha, const CuMatrixBase<Real> &M,
MatrixTransposeType trans, Real beta);
/// Add the diagonal of a matrix product: *this = diag(M N), assuming the
/// "trans" arguments are both kNoTrans; for transpose arguments, it behaves
/// as you would expect.
void AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M, MatrixTransposeType transM,
const CuMatrixBase<Real> &N, MatrixTransposeType transN,
Real beta = 1.0);
inline CuValue<Real> operator() (MatrixIndexT i) {
KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
static_cast<UnsignedMatrixIndexT>(dim_));
return CuValue<Real>(data_ + i);
}
Real Norm(BaseFloat p); // Only works for p = 1 and p = 2.
inline Real operator() (MatrixIndexT i) const {
KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
static_cast<UnsignedMatrixIndexT>(dim_));
return CuValue<Real>(data_ + i); // will be casted to Real.
}
/// Extracts the diagonal of a packed matrix M; works for Sp or Tp.
void CopyDiagFromPacked(const CuPackedMatrix<Real> &M);
/// Extracts the diagonal of a matrix.
void CopyDiagFromMat(const CuMatrix<Real> &M);
Real Max() const;
Real Min() const;
// Set each element to y = (x == orig ? changed : x).
void ReplaceValue(Real orig, Real changed);
void MulElements(const CuVectorBase<Real> &v);
protected:
protected:
// The following two functions should only be called if we did not compile
// with CUDA or could not get a CUDA card; in that case the contents are
// interpreted the same as a regular vector.
@ -78,7 +201,7 @@ protected:
return *(reinterpret_cast<VectorBase<Real>* >(this));
}
/// Default constructor: make it private so the user cannot
/// Default constructor: make it protected so the user cannot
/// instantiate this class.
CuVectorBase<Real>(): data_(NULL), dim_(0) { }
@ -89,14 +212,38 @@ protected:
KALDI_DISALLOW_COPY_AND_ASSIGN(CuVectorBase);
};
template<class Real>
template<typename Real>
class CuVector: public CuVectorBase<Real> {
friend class CuVectorBase<float>;
friend class CuVectorBase<double>;
friend class CuMatrixBase<Real>;
friend class CuPackedMatrix<Real>;
friend class CuSpMatrix<Real>;
friend class CuTpMatrix<Real>;
public:
CuVector() { }
CuVector(MatrixIndexT dim, MatrixResizeType t = kSetZero) { Resize(dim, t); }
CuVector(const CuVector<Real> &v);
CuVector(const CuVectorBase<Real> &v);
CuVector(const VectorBase<Real> &v);
explicit CuVector(const CuVector<Real> &v) : CuVectorBase<Real>() {
Resize(v.Dim(), kUndefined);
this->CopyFromVec(v);
}
template<typename OtherReal>
explicit CuVector(const CuVectorBase<OtherReal> &v) : CuVectorBase<Real>() {
Resize(v.Dim(), kUndefined);
this->CopyFromVec(v);
}
template<typename OtherReal>
explicit CuVector(const VectorBase<OtherReal> &v) : CuVectorBase<Real>() {
Resize(v.Dim(), kUndefined);
this->CopyFromVec(Vector<Real>(v));
}
/// Allocate the memory
void Resize(MatrixIndexT dim, MatrixResizeType t = kSetZero);
@ -104,12 +251,20 @@ class CuVector: public CuVectorBase<Real> {
~CuVector() { Destroy(); }
CuVector<Real> &operator = (const CuVectorBase<Real> &other) {
Resize(other.Dim());
CopyFromVec(other);
Resize(other.Dim(), kUndefined);
this->CopyFromVec(other);
return *this;
}
CuVector<Real> &operator = (const CuVector<Real> &other) {
Resize(other.Dim(), kUndefined);
this->CopyFromVec(other);
return *this;
}
CuVector<Real> &operator = (const VectorBase<Real> &other) {
Resize(other.Dim());
CopyFromVec(other);
this->CopyFromVec(other);
return *this;
}
@ -118,27 +273,91 @@ class CuVector: public CuVectorBase<Real> {
void Write(std::ostream &is, bool binary) const;
void Swap(Vector<Real> *vec);
private:
void Destroy();
};
// We'll fill out the following class if it's needed.
template<class Real>
template<typename Real>
class CuSubVector: public CuVectorBase<Real> {
public:
private:
public:
/// Views `length` elements of `t` starting at index `origin`.  No data is
/// copied; the range is checked with KALDI_ASSERT.  The const on `t` is cast
/// away, so the view allows writing through to `t`'s data.
CuSubVector(const CuVectorBase<Real> &t, const MatrixIndexT origin,
            const MatrixIndexT length) : CuVectorBase<Real>() {
  KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(origin)+
               static_cast<UnsignedMatrixIndexT>(length) <=
               static_cast<UnsignedMatrixIndexT>(t.Dim()));
  this->data_ = const_cast<Real*>(t.Data()+origin);
  this->dim_ = length;
}
/// Copy constructor: the new object refers to the same data as `other`
/// (pointer and length are copied, not the elements).  It is required for
/// Range() in the base class, which returns a CuSubVector by value.
CuSubVector(const CuSubVector &other) : CuVectorBase<Real> () {
  this->data_ = other.data_;
  this->dim_ = other.dim_;
}
/// Wraps `length` elements starting at the raw pointer `data`, without
/// copying.
CuSubVector(const Real* data, MatrixIndexT length) : CuVectorBase<Real> () {
  // The const qualifier is deliberately dropped here.  That makes it possible
  // to modify data that was passed in as const; a const-correct alternative
  // would be very difficult to provide.
  this->data_ = const_cast<Real*>(data);
  this->dim_ = length;
}
/// This operation does not preserve const-ness, so be careful.
CuSubVector(const CuMatrixBase<Real> &matrix, MatrixIndexT row) {
CuVectorBase<Real>::data_ = const_cast<Real*>(matrix.RowData(row));
CuVectorBase<Real>::dim_ = matrix.NumCols();
}
};
/// I/O
template<typename Real>
std::ostream &operator << (std::ostream &out, const CuVectorBase<Real> &vec);
/// Free-function form of CuVectorBase::ApproxEqual: returns the result of
/// a.ApproxEqual(b, tol).
template<typename Real>
bool ApproxEqual(const CuVectorBase<Real> &a,
                 const CuVectorBase<Real> &b, Real tol = 0.01) {
  const bool close_enough = a.ApproxEqual(b, tol);
  return close_enough;
}
template<typename Real>
inline void AssertEqual(CuVectorBase<Real> &a, CuVectorBase<Real> &b,
float tol = 0.01) {
KALDI_ASSERT(a.ApproxEqual(b, tol));
}
/// Generic cross-precision copy from another CuVectorBase.
/// The previous body passed `&this` (ill-formed: `this` is not an lvalue) to
/// CopyToVec, which in any case expects a host VectorBase pointer.  This
/// version goes through a temporary host vector instead.
/// NOTE(review): explicit specializations for the float<->double cases are
/// declared for this member, so this fallback is presumably never
/// instantiated in practice -- confirm.
template<typename Real>
template<typename OtherReal>
void CuVectorBase<Real>::CopyFromVec(const CuVectorBase<OtherReal> &v) {
  Vector<OtherReal> host_copy(v.Dim(), kUndefined);
  v.CopyToVec(&host_copy);
  this->CopyFromVec(host_copy);
}
/// Copies the contents of a CuVectorBase (possibly of a different precision)
/// into this host vector by delegating to CuVectorBase::CopyToVec.
template<typename Real>
template<typename OtherReal>
void VectorBase<Real>::CopyFromVec(const CuVectorBase<OtherReal> &cu) {
  VectorBase<Real> *self = this;
  cu.CopyToVec(self);
}
// declare template specializations.
template <>
template <>
void CuVectorBase<double>::CopyFromVec<float>(const CuVectorBase<float> &src);
template<>
template <>
void CuVectorBase<float>::CopyFromVec<double>(const CuVectorBase<double> &src);
/// Constructs a host Vector from a CuVectorBase (possibly of a different
/// precision): allocates cu.Dim() elements, then copies via CopyToVec.
template<typename Real>
template<typename OtherReal>
Vector<Real>::Vector(const CuVectorBase<OtherReal> &cu) {
  const MatrixIndexT dim = cu.Dim();
  Init(dim);
  cu.CopyToVec(this);
}
} // namespace
#include "cu-vector-inl.h"
#endif

Просмотреть файл

@ -0,0 +1,136 @@
// cudamatrix/cublas-wrappers.h
// Copyright 2013 Johns Hopkins University (author: Daniel Povey);
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_MATRIX_CUBLAS_WRAPPERS_H_
#define KALDI_MATRIX_CUBLAS_WRAPPERS_H_ 1
// Do not include this file directly. It is to be included
// by .cc files in this directory.
namespace kaldi {
#if HAVE_CUDA == 1
/// Precision-overloaded wrappers for matrix-matrix multiply: the float
/// version forwards all arguments unchanged to cublasSgemm, the double
/// version to cublasDgemm.
inline void cublas_gemm(char transa, char transb, int m, int n, int k,
                        float alpha, const float *A, int lda,
                        const float *B, int ldb,
                        float beta, float *C, int ldc) {
  cublasSgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}
inline void cublas_gemm(char transa, char transb, int m, int n, int k,
                        double alpha, const double *A, int lda,
                        const double *B, int ldb,
                        double beta, double *C, int ldc) {
  cublasDgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}
/// Precision-overloaded wrappers for cublasStrsm / cublasDtrsm.  The four
/// leading character flags are fixed to 'l', 'u', 'n', 'n'; callers can only
/// choose the sizes, the scale factor and the matrices.
inline void cublas_trsm(int m, int n, float alpha,
                        const float* A, int lda,
                        float* B, int ldb) {
  cublasStrsm('l', 'u', 'n', 'n', m, n, alpha, A, lda, B, ldb);
}
inline void cublas_trsm(int m, int n, double alpha,
                        const double* A, int lda,
                        double* B, int ldb) {
  cublasDtrsm('l', 'u', 'n', 'n', m, n, alpha, A, lda, B, ldb);
}
/// Precision-overloaded wrappers for the symmetric rank-k update: forward
/// all arguments unchanged to cublasSsyrk (float) or cublasDsyrk (double).
inline void cublas_syrk(char uplo, char trans, int n, int k, float alpha,
                        const float *A, int lda, float beta,
                        float *C, int ldc) {
  cublasSsyrk(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
}
inline void cublas_syrk(char uplo, char trans, int n, int k, double alpha,
                        const double *A, int lda, double beta,
                        double *C, int ldc) {
  cublasDsyrk(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
}
/// Precision-overloaded wrappers for the dot product: return the result of
/// cublasSdot (float) or cublasDdot (double).
inline float cublas_dot(int n, const float *x, int incx,
                        const float *y, int incy) {
  return cublasSdot(n, x, incx, y, incy);
}
inline double cublas_dot(int n, const double *x, int incx,
                         const double *y, int incy) {
  return cublasDdot(n, x, incx, y, incy);
}
/// Precision-overloaded wrappers returning the result of cublasSasum (float)
/// or cublasDasum (double).
inline float cublas_asum(int n, const float* x, int incx) {
  const float total = cublasSasum(n, x, incx);
  return total;
}
inline double cublas_asum(int n, const double* x, int incx) {
  const double total = cublasDasum(n, x, incx);
  return total;
}
/// Precision-overloaded wrappers returning the result of cublasSnrm2 (float)
/// or cublasDnrm2 (double).
inline float cublas_nrm2(int n, const float* x, int incx) {
  const float norm = cublasSnrm2(n, x, incx);
  return norm;
}
inline double cublas_nrm2(int n, const double* x, int incx) {
  const double norm = cublasDnrm2(n, x, incx);
  return norm;
}
/// Precision-overloaded wrappers for strided vector copy: forward all
/// arguments unchanged to cublasScopy (float) or cublasDcopy (double).
inline void cublas_copy(int n, const float* x, int incx, float* y, int incy) {
  cublasScopy(n, x, incx, y, incy);
}
inline void cublas_copy(int n, const double* x, int incx, double* y, int incy) {
  cublasDcopy(n, x, incx, y, incy);
}
/// Precision-overloaded wrappers for in-place scaling: forward all arguments
/// unchanged to cublasSscal (float) or cublasDscal (double).
inline void cublas_scal(int n, float alpha,
                        float* mat, int incx) {
  cublasSscal(n, alpha, mat, incx);
}
inline void cublas_scal(int n, double alpha,
                        double* mat, int incx) {
  cublasDscal(n, alpha, mat, incx);
}
/// Precision-overloaded wrappers for the axpy update: forward all arguments
/// unchanged to cublasSaxpy (float) or cublasDaxpy (double).
inline void cublas_axpy(int n, float alpha,
                        const float* x, int incx,
                        float* y, int incy) {
  cublasSaxpy(n, alpha, x, incx, y, incy);
}
inline void cublas_axpy(int n, double alpha,
                        const double* x, int incx,
                        double* y, int incy) {
  cublasDaxpy(n, alpha, x, incx, y, incy);
}
/// Precision-overloaded wrappers for matrix-vector multiply: forward all
/// arguments unchanged to cublasSgemv (float) or cublasDgemv (double).
inline void cublas_gemv(char trans, int m, int n,
                        float alpha, const float* A, int lda,
                        const float* x, int incx,
                        float beta, float* y, int incy) {
  cublasSgemv(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
}
inline void cublas_gemv(char trans, int m, int n,
                        double alpha, const double* A, int lda,
                        const double* x, int incx,
                        double beta, double* y, int incy) {
  cublasDgemv(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
}
/// Precision-overloaded wrappers for the packed symmetric matrix-vector
/// multiply: forward all arguments unchanged to cublasSspmv (float) or
/// cublasDspmv (double).
inline void cublas_spmv(char uplo, int n, float alpha,
                        const float *AP, const float *x, int incx,
                        float beta, float *y, int incy) {
  cublasSspmv(uplo, n, alpha, AP, x, incx, beta, y, incy);
}
inline void cublas_spmv(char uplo, int n, double alpha,
                        const double *AP, const double *x, int incx,
                        double beta, double *y, int incy) {
  cublasDspmv(uplo, n, alpha, AP, x, incx, beta, y, incy);
}
// Caution: for these wrappers the 'trans' argument means the opposite of
// what one would expect, because CUDA stores matrices in column-major order.
// For the same reason 'u' is passed where 'l' might be expected: our packed
// matrices are regarded as lower-triangular stored row by row, and CUDA sees
// that same memory layout as upper-triangular stored column by column.
// The remaining flag is fixed to 'n'.
inline void cublas_tpmv(char trans, int n, const float* Ap,
                        float* x, int incx) {
  cublasStpmv('u', trans, 'n', n, Ap, x, incx);
}
inline void cublas_tpmv(char trans, int n, const double* Ap,
                        double* x, int incx) {
  cublasDtpmv('u', trans, 'n', n, Ap, x, incx);
}
/// Precision-overloaded wrappers for the packed symmetric rank-1 update:
/// forward all arguments unchanged to cublasSspr (float) or cublasDspr
/// (double).
inline void cublas_spr(char uplo, int n, float alpha,
                       const float *x, int incx, float *AP) {
  cublasSspr(uplo, n, alpha, x, incx, AP);
}
inline void cublas_spr(char uplo, int n, double alpha,
                       const double *x, int incx, double *AP) {
  cublasDspr(uplo, n, alpha, x, incx, AP);
}
#endif
}
// namespace kaldi
#endif

Просмотреть файл

@ -1,713 +0,0 @@
// cudamatrix/cuda-matrix-test.cc
// Copyright 2010 Karel Vesely
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <vector>
#include <cstdlib>
#include "base/kaldi-common.h"
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-vector.h"
#include "cudamatrix/cu-math.h"
using namespace kaldi;
namespace kaldi {
/*
* INITIALIZERS
*/
/// Overwrites every element of *v with a fresh RandGauss() draw.
template<class Real>
static void InitRand(VectorBase<Real> *v) {
  const MatrixIndexT dim = v->Dim();
  for (MatrixIndexT idx = 0; idx < dim; idx++) {
    (*v)(idx) = RandGauss();
  }
}
/// Fills *M with RandGauss() draws, redrawing the whole matrix for as long
/// as it is non-empty and its Cond() value exceeds 100.
template<class Real>
static void InitRand(MatrixBase<Real> *M) {
  for (;;) {
    for (MatrixIndexT row = 0; row < M->NumRows(); row++) {
      for (MatrixIndexT col = 0; col < M->NumCols(); col++) {
        (*M)(row, col) = RandGauss();
      }
    }
    // Accept the draw for an empty matrix or once Cond() is not above 100.
    if (M->NumRows() == 0 || !(M->Cond() > 100)) break;
  }
}
/// Overwrites every element of *mat, in row-major order, with a RandGauss()
/// draw.
template<class Real>
static void RandGaussMatrix(MatrixBase<Real>* mat) {
  for (int32 row = 0; row < mat->NumRows(); row++) {
    for (int32 col = 0; col < mat->NumCols(); col++) {
      (*mat)(row, col) = RandGauss();
    }
  }
}
/// Overwrites every element of *mat, in row-major order, with a
/// RandUniform() draw.
template<class Real>
static void RandZeroToOneMatrix(MatrixBase<Real>* mat) {
  for (int32 row = 0; row < mat->NumRows(); row++) {
    for (int32 col = 0; col < mat->NumCols(); col++) {
      (*mat)(row, col) = RandUniform();
    }
  }
}
/*
* ASSERTS
*/
template<class Real>
static void AssertEqual(const MatrixBase<Real> &A,
const MatrixBase<Real> &B,
float tol = 0.001) {
KALDI_ASSERT(A.NumRows() == B.NumRows()&&A.NumCols() == B.NumCols());
for (MatrixIndexT i = 0;i < A.NumRows();i++) {
for (MatrixIndexT j = 0;j < A.NumCols();j++) {
KALDI_ASSERT(std::abs(A(i, j)-B(i, j)) < tol*std::max(1.0, (double) (std::abs(A(i, j))+std::abs(B(i, j)))));
}
}
}
template<class Real>
static bool ApproxEqual(const MatrixBase<Real> &A,
const MatrixBase<Real> &B, Real tol = 0.001) {
KALDI_ASSERT(A.NumRows() == B.NumRows());
MatrixBase<Real> diff(A);
diff.AddSp(1.0, B);
Real a = std::max(A.Max(), -A.Min()), b = std::max(B.Max(), -B.Min),
d = std::max(diff.Max(), -diff.Min());
return (d <= tol * std::max(a, b));
}
/// Asserts that A and B have the same dimension and that every pair of
/// elements differs by less than the absolute tolerance `tol`.
template<class Real>
static void AssertEqual(VectorBase<Real> &A, VectorBase<Real> &B, float tol = 0.001) {
  KALDI_ASSERT(A.Dim() == B.Dim());
  const MatrixIndexT dim = A.Dim();
  for (MatrixIndexT i = 0; i < dim; i++) {
    KALDI_ASSERT(std::abs(A(i)-B(i)) < tol);
  }
}
/// Returns false as soon as some pair of elements differs by more than the
/// absolute tolerance `tol`, true otherwise.  Asserts equal dimensions.
template<class Real>
static bool ApproxEqual(VectorBase<Real> &A, VectorBase<Real> &B, float tol = 0.001) {
  KALDI_ASSERT(A.Dim() == B.Dim());
  const MatrixIndexT dim = A.Dim();
  for (MatrixIndexT i = 0; i < dim; i++) {
    const bool too_far = (std::abs(A(i)-B(i)) > tol);
    if (too_far) return false;
  }
  return true;
}
/// Asserts that the two integer vectors have the same size and identical
/// contents, element by element.
static void AssertEqual(std::vector<int32> &A, std::vector<int32> &B) {
  KALDI_ASSERT(A.size() == B.size());
  const size_t count = A.size();
  for (size_t i = 0; i < count; i++) {
    KALDI_ASSERT(A[i] == B[i]);
  }
}
/*
* Unit tests
*/
/*
* CuMatrix
*/
template<class Real>
static void UnitTestCuMatrixApplyLog() {
Matrix<Real> H(100,100);
RandGaussMatrix(&H);
H.MulElements(H); //make numbers positive
CuMatrix<Real> D(100,100);
D.CopyFromMat(H);
D.ApplyLog();
H.ApplyLog();
Matrix<Real> H2(100,100);
D.CopyToMat(&H2);
AssertEqual(H,H2);
}
template<class Real>
static void UnitTestCuMatrixMulElements() {
Matrix<Real> Ha(100,100);
Matrix<Real> Hb(100,100);
RandGaussMatrix(&Ha);
RandGaussMatrix(&Hb);
CuMatrix<Real> Da(100,100);
CuMatrix<Real> Db(100,100);
Da.CopyFromMat(Ha);
Db.CopyFromMat(Hb);
Da.MulElements(Db);
Ha.MulElements(Hb);
Matrix<Real> Ha2(100,100);
Da.CopyToMat(&Ha2);
AssertEqual(Ha,Ha2);
}
template<class Real>
static void UnitTestCuMatrixMulColsVec() {
Matrix<Real> Hm(100,99);
Vector<Real> Hv(99);
RandGaussMatrix(&Hm);
InitRand(&Hv);
CuMatrix<Real> Dm(100,99);
CuVector<Real> Dv(99);
Dm.CopyFromMat(Hm);
Dv.CopyFromVec(Hv);
Dm.MulColsVec(Dv);
Hm.MulColsVec(Hv);
Matrix<Real> Hm2(100,99);
Dm.CopyToMat(&Hm2);
AssertEqual(Hm,Hm2);
}
template<class Real>
static void UnitTestCuMatrixMulRowsVec() {
Matrix<Real> Hm(100,99);
Vector<Real> Hv(100);
RandGaussMatrix(&Hm);
InitRand(&Hv);
CuMatrix<Real> Dm(100,99);
CuVector<Real> Dv(100);
Dm.CopyFromMat(Hm);
Dv.CopyFromVec(Hv);
Dm.MulRowsVec(Dv);
Hm.MulRowsVec(Hv);
Matrix<Real> Hm2(100,99);
Dm.CopyToMat(&Hm2);
AssertEqual(Hm,Hm2);
}
template<class Real>
static void UnitTestCuMatrixDivRowsVec() {
Matrix<Real> Hm(100,99);
Vector<Real> Hv(100);
RandGaussMatrix(&Hm);
InitRand(&Hv);
CuMatrix<Real> Dm(100,99);
CuVector<Real> Dv(100);
Dm.CopyFromMat(Hm);
Dv.CopyFromVec(Hv);
Dm.DivRowsVec(Dv);
Hv.InvertElements();
Hm.MulRowsVec(Hv);
Matrix<Real> Hm2(100,99);
Dm.CopyToMat(&Hm2);
AssertEqual(Hm,Hm2);
}
template<class Real>
static void UnitTestCuMatrixAddMat() {
Matrix<Real> Ha(100,100);
Matrix<Real> Hb(100,100);
RandGaussMatrix(&Ha);
RandGaussMatrix(&Hb);
CuMatrix<Real> Da(100,100);
CuMatrix<Real> Db(100,100);
Da.CopyFromMat(Ha);
Db.CopyFromMat(Hb);
Da.AddMat(0.5,Db);
Ha.AddMat(0.5,Hb);
Matrix<Real> Ha2(100,100);
Da.CopyToMat(&Ha2);
AssertEqual(Ha,Ha2);
}
template<class Real>
static void UnitTestCuMatrixAddVecToCols() {
Matrix<Real> Hm(100,99);
Vector<Real> Hv(100);
RandGaussMatrix(&Hm);
InitRand(&Hv);
CuMatrix<Real> Dm(100,99);
CuVector<Real> Dv(100);
Dm.CopyFromMat(Hm);
Dv.CopyFromVec(Hv);
Dm.AddVecToCols(0.5,Dv);
Hm.AddVecToCols(0.5,Hv);
Matrix<Real> Hm2(100,99);
Dm.CopyToMat(&Hm2);
AssertEqual(Hm,Hm2);
}
template<class Real>
static void UnitTestCuMatrixAddVecToRows() {
Matrix<Real> Hm(100,99);
Vector<Real> Hv(99);
RandGaussMatrix(&Hm);
InitRand(&Hv);
CuMatrix<Real> Dm(100,99);
CuVector<Real> Dv(99);
Dm.CopyFromMat(Hm);
Dv.CopyFromVec(Hv);
Dm.AddVecToRows(0.5,Dv);
Hm.AddVecToRows(0.5,Hv);
Matrix<Real> Hm2(100,99);
Dm.CopyToMat(&Hm2);
AssertEqual(Hm,Hm2);
}
template<class Real>
static void UnitTestCuMatrixAddMatMat() {
Matrix<Real> Ha(200,100);
Matrix<Real> Hb(100,200);
Matrix<Real> Hc1(200,200);
Matrix<Real> Hc2(100,100);
RandGaussMatrix(&Ha);
RandGaussMatrix(&Hb);
CuMatrix<Real> Da(200,100);
CuMatrix<Real> Db(100,200);
Da.CopyFromMat(Ha);
Db.CopyFromMat(Hb);
CuMatrix<Real> Dc1(200,200);
CuMatrix<Real> Dc2(100,100);
Dc1.AddMatMat(0.5f,Da,kNoTrans,Db,kNoTrans,0.0f);
Dc2.AddMatMat(0.5f,Da,kTrans,Db,kTrans,0.0f);
Hc1.AddMatMat(0.5f,Ha,kNoTrans,Hb,kNoTrans,0.0f);
Hc2.AddMatMat(0.5f,Ha,kTrans,Hb,kTrans,0.0f);
Matrix<Real> Hc1a(200,200);
Matrix<Real> Hc2a(100,100);
Dc1.CopyToMat(&Hc1a);
Dc2.CopyToMat(&Hc2a);
AssertEqual(Hc1,Hc1a);
AssertEqual(Hc2,Hc2a);
}
/*
* CuVector unit tests
*/
template<class Real>
static void UnitTestCuVectorAddVec() {
Vector<Real> Hv(777);
Vector<Real> Hw(777);
InitRand(&Hv);
InitRand(&Hw);
CuVector<Real> Dv(777);
CuVector<Real> Dw(777);
Dv.CopyFromVec(Hv);
Dw.CopyFromVec(Hw);
Dv.AddVec(0.1,Dw,0.9);
Hv.Scale(0.9);
Hv.AddVec(0.1,Hw);
Vector<Real> Hv2(777);
Dv.CopyToVec(&Hv2);
AssertEqual(Hv,Hv2);
}
template<class Real>
static void UnitTestCuVectorAddRowSumMat() {
const int32 X=4321, Y=19;
Real alpha=0.1, beta=0.7;
Matrix<Real> Hm(X,Y);
Vector<Real> Hv(Y);
Vector<Real> Hv_accu(Y);
RandGaussMatrix(&Hm);
InitRand(&Hv);
CuMatrix<Real> Dm(X,Y);
CuVector<Real> Dv(Y);
Dm.CopyFromMat(Hm);
Dv.CopyFromVec(Hv);
Dv.AddRowSumMat(alpha,Dm,beta);
Hv_accu.SetZero();
Hv_accu.AddRowSumMat(1.0, Hm);
Hv.Scale(beta);
Hv.AddVec(alpha,Hv_accu);
Vector<Real> Hv2(Y);
Dv.CopyToVec(&Hv2);
AssertEqual(Hv,Hv2);
}
template<class Real>
static void UnitTestCuVectorAddRowSumMatLarge() {
Matrix<Real> Hm(1000,990);
Vector<Real> Hv(990);
Vector<Real> Hv_accu(990);
RandGaussMatrix(&Hm);
InitRand(&Hv);
CuMatrix<Real> Dm(1000,990);
CuVector<Real> Dv(990);
Dm.CopyFromMat(Hm);
Dv.CopyFromVec(Hv);
Dv.AddRowSumMat(0.5,Dm,0.7);
Hv_accu.SetZero();
Hv_accu.AddRowSumMat(1.0, Hm);
Hv.Scale(0.7);
Hv.AddVec(0.5,Hv_accu);
Vector<Real> Hv2(990);
Dv.CopyToVec(&Hv2);
AssertEqual(Hv,Hv2);
}
template<class Real>
static void UnitTestCuVectorAddColSumMat() {
const int32 X=19, Y=4321;
Real alpha=0.5, beta=0.7;
Matrix<Real> Hm(X,Y);
Vector<Real> Hv(X);
Vector<Real> Hv_accu(X);
RandGaussMatrix(&Hm);
InitRand(&Hv);
CuMatrix<Real> Dm(X,Y);
CuVector<Real> Dv(X);
Dm.CopyFromMat(Hm);
Dv.CopyFromVec(Hv);
Dv.AddColSumMat(alpha,Dm,beta);
Hv_accu.SetZero();
Hv_accu.AddColSumMat(1.0, Hm);
Hv.Scale(beta);
Hv.AddVec(alpha, Hv_accu);
Vector<Real> Hv2(X);
Dv.CopyToVec(&Hv2);
AssertEqual(Hv,Hv2);
}
template<class Real>
static void UnitTestCuVectorAddColSumMatLarge() {
Matrix<Real> Hm(1000,990);
Vector<Real> Hv(1000);
Vector<Real> Hv_accu(1000);
RandGaussMatrix(&Hm);
InitRand(&Hv);
CuMatrix<Real> Dm(1000,990);
CuVector<Real> Dv(1000);
Dm.CopyFromMat(Hm);
Dv.CopyFromVec(Hv);
Dv.AddColSumMat(0.5, Dm, 0.7);
Hv_accu.SetZero();
Hv_accu.AddColSumMat(1.0, Hm);
Hv.Scale(0.7);
Hv.AddVec(0.5,Hv_accu);
Vector<Real> Hv2(1000);
Dv.CopyToVec(&Hv2);
AssertEqual(Hv,Hv2);
}
template<class Real>
static void UnitTestCuVectorInvertElements() {
Vector<Real> Hv(777);
InitRand(&Hv);
CuVector<Real> Dv(777);
Dv.CopyFromVec(Hv);
Dv.InvertElements();
Hv.InvertElements();
Vector<Real> Hv2(777);
Dv.CopyToVec(&Hv2);
AssertEqual(Hv,Hv2);
}
/*
* cu:: unit tests
*/
template<class Real>
static void UnitTestCuSigmoid() {
Matrix<Real> Hi(100,111);
Matrix<Real> Ho(100,111);
RandGaussMatrix(&Hi);
CuMatrix<Real> Di(100,111);
CuMatrix<Real> Do(100,111);
Di.CopyFromMat(Hi);
//gpu
Do.Sigmoid(Di);
//cpu
for(MatrixIndexT r=0; r<Hi.NumRows(); r++) {
for(MatrixIndexT c=0; c<Hi.NumCols(); c++) {
Ho(r, c) = 1.0/(1.0+exp(-Hi(r, c)));
}
}
Matrix<Real> Ho2(100,111);
Do.CopyToMat(&Ho2);
AssertEqual(Ho,Ho2);
}
template<class Real>
static void UnitTestCuDiffSigmoid() {
Matrix<Real> Hi(100,111);
Matrix<Real> Ho(100,111);
Matrix<Real> Hy(100,111);
RandGaussMatrix(&Hi);
RandZeroToOneMatrix(&Hy);
CuMatrix<Real> Di(100,111);
CuMatrix<Real> Do(100,111);
CuMatrix<Real> Dy(100,111);
Di.CopyFromMat(Hi);
Dy.CopyFromMat(Hy);
//gpu
Do.DiffSigmoid(Dy, Di);
//cpu
for(MatrixIndexT r=0; r<Ho.NumRows(); r++) {
for(MatrixIndexT c=0; c<Ho.NumCols(); c++) {
Ho(r, c) = Hy(r, c)*(1.0 - Hy(r, c)) * Hi(r, c);
}
}
Matrix<Real> Ho2(100,111);
Do.CopyToMat(&Ho2);
AssertEqual(Ho,Ho2);
}
template<class Real>
static void UnitTestCuSoftmax() {
Matrix<Real> Hi(100,111);
Matrix<Real> Ho(100,111);
RandGaussMatrix(&Hi);
CuMatrix<Real> Di(100,111);
CuMatrix<Real> Do(100,111);
Di.CopyFromMat(Hi);
//gpu
Do.Softmax(Di);
//cpu
Ho.CopyFromMat(Hi);
for(MatrixIndexT r=0; r<Ho.NumRows(); r++) {
Ho.Row(r).ApplySoftMax();
}
Matrix<Real> Ho2(100,111);
Do.CopyToMat(&Ho2);
AssertEqual(Ho,Ho2);
}
template<class Real>
static void UnitTestCuFindRowMaxId() {
Matrix<Real> Hi(100,111);
RandGaussMatrix(&Hi);
CuMatrix<Real> Di(100,111);
Di.CopyFromMat(Hi);
std::vector<int32> Hmax(100);
CuStlVector<int32> Dmax(100);
//gpu
Di.FindRowMaxId(&Dmax);
//cpu
for(MatrixIndexT r=0; r<Hi.NumRows(); r++) {
Real max=-1e20; int32 idx=-1;
for(MatrixIndexT c=0; c<Hi.NumCols(); c++) {
if(Hi(r,c) > max) { idx=c; max=Hi(r,c); }
}
Hmax[r] = idx;
}
std::vector<int32> Hmax2(100);
Dmax.CopyToVec(&Hmax2);
AssertEqual(Hmax,Hmax2);
}
template<class Real>
static void UnitTestCuDiffXent() {
int32 X=100, Y=111;
//nnet output / diff
Matrix<Real> Hi(X,Y);
RandZeroToOneMatrix(&Hi);
CuMatrix<Real> Di(X,Y);
Di.CopyFromMat(Hi);
//target vector
std::vector<int32> Htgt(X);
for(int32 i=0; i<X; i++) {
Htgt[i] = rand()%Y;
}
CuStlVector<int32> Dtgt(X);
Dtgt.CopyFromVec(Htgt);
//logpost vector
Vector<Real> Hlogpost(X);
CuVector<Real> Dlogpost(X);
//gpu
Di.DiffXent(Dtgt, &Dlogpost);
//cpu
for(MatrixIndexT r=0; r<Hi.NumRows(); r++) {
int32 col_tgt = Htgt[r];
Hlogpost(r) = log(Hi(r, col_tgt));
Hi(r, col_tgt) -= 1.0;
}
Matrix<Real> Hi2(X,Y);
Di.CopyToMat(&Hi2);
Vector<Real> Hlogpost2(X);
Dlogpost.CopyToVec(&Hlogpost2);
AssertEqual(Hi,Hi2);
AssertEqual(Hlogpost,Hlogpost2);
}
// Runs every unit test in this file for the given floating-point type Real.
// The tests are called one after another in a fixed order; each one reports
// a mismatch through AssertEqual.
template<class Real> void CudaMatrixUnitTest() {
//test CuMatrix<Real> methods by cross-check with Matrix
UnitTestCuMatrixApplyLog<Real>();
UnitTestCuMatrixMulElements<Real>();
UnitTestCuMatrixMulColsVec<Real>();
UnitTestCuMatrixMulRowsVec<Real>();
UnitTestCuMatrixDivRowsVec<Real>();
UnitTestCuMatrixAddMat<Real>();
UnitTestCuMatrixAddVecToCols<Real>();
UnitTestCuMatrixAddVecToRows<Real>();
UnitTestCuMatrixAddMatMat<Real>();
//test CuVector<Real> methods
UnitTestCuVectorAddVec<Real>();
UnitTestCuVectorAddRowSumMat<Real>();
UnitTestCuVectorAddRowSumMatLarge<Real>();
UnitTestCuVectorAddColSumMat<Real>();
UnitTestCuVectorAddColSumMatLarge<Real>();
UnitTestCuVectorInvertElements<Real>();
// tests from the "cu:: unit tests" section (sigmoid, softmax, row max, xent)
UnitTestCuSigmoid<Real>();
UnitTestCuDiffSigmoid<Real>();
UnitTestCuFindRowMaxId<Real>();
UnitTestCuSoftmax<Real>();
UnitTestCuDiffXent<Real>();
}
} // namespace kaldi
/// Test driver: runs the full suite for float and then for double, and
/// prints a success message once both have completed.
int main() {
#if HAVE_CUDA==1
  // Select the GPU before any test runs; -2 requests automatic selection.
  CuDevice::Instantiate().SelectGpuId(-2);
#endif

  kaldi::CudaMatrixUnitTest<float>();
  kaldi::CudaMatrixUnitTest<double>();

  std::cout << "Tests succeeded.\n";
  return 0;
}

Просмотреть файл

@ -17,6 +17,9 @@
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_MATRIX_CBLAS_WRAPPERS_H_
#define KALDI_MATRIX_CBLAS_WRAPPERS_H_ 1
#include <limits>
#include "matrix/sp-matrix.h"
@ -235,6 +238,8 @@ inline void cblas_Xgemm(const double alpha,
alpha, Adata, a_stride, Bdata, b_stride,
beta, Mdata, stride);
}
inline void cblas_Xsymm(const float alpha,
MatrixIndexT sz,
const float *Adata,MatrixIndexT a_stride,
@ -470,3 +475,5 @@ inline void clapack_Xgetri(MatrixIndexT num_rows, double *Mdata, MatrixIndexT st
}
// namespace kaldi
#endif

Просмотреть файл

@ -23,7 +23,7 @@
namespace kaldi {
template<class Real>
template<typename Real>
void CompressedMatrix::CopyFromMat(
const MatrixBase<Real> &mat) {
if (data_ != NULL) {
@ -95,6 +95,20 @@ void CompressedMatrix::CopyFromMat(const MatrixBase<float> &mat);
template
void CompressedMatrix::CopyFromMat(const MatrixBase<double> &mat);
/// Assignment from an uncompressed matrix: delegates to CopyFromMat(mat)
/// and returns *this.
template<typename Real>
CompressedMatrix &CompressedMatrix::operator =(const MatrixBase<Real> &mat) {
  CopyFromMat(mat);
  return *this;
}
// Instantiate the template for float and double.
template
CompressedMatrix& CompressedMatrix::operator =(const MatrixBase<float> &mat);
template
CompressedMatrix& CompressedMatrix::operator =(const MatrixBase<double> &mat);
inline uint16 CompressedMatrix::FloatToUint16(
const GlobalHeader &global_header,
float value) {
@ -114,7 +128,7 @@ inline float CompressedMatrix::Uint16ToFloat(
+ global_header.range * 1.52590218966964e-05 * value;
}
template<class Real> // static
template<typename Real> // static
void CompressedMatrix::ComputeColHeader(
const GlobalHeader &global_header,
const Real *data, MatrixIndexT stride,
@ -229,7 +243,7 @@ inline float CompressedMatrix::CharToFloat(
}
template<class Real> // static
template<typename Real> // static
void CompressedMatrix::CompressColumn(
const GlobalHeader &global_header,
const Real *data, MatrixIndexT stride,
@ -383,7 +397,7 @@ void CompressedMatrix::Read(std::istream &is, bool binary) {
KALDI_ERR << "Failed to read data.";
}
template<class Real>
template<typename Real>
void CompressedMatrix::CopyToMat(MatrixBase<Real> *mat) const {
if (data_ == NULL) {
KALDI_ASSERT(mat->NumRows() == 0);

Просмотреть файл

@ -46,20 +46,24 @@ class CompressedMatrix {
~CompressedMatrix() { Destroy(); }
template<class Real>
template<typename Real>
CompressedMatrix(const MatrixBase<Real> &mat): data_(NULL) { CopyFromMat(mat); }
/// This will resize *this and copy the contents of mat to *this.
template<class Real>
template<typename Real>
void CopyFromMat(const MatrixBase<Real> &mat);
CompressedMatrix(const CompressedMatrix &mat);
CompressedMatrix &operator = (const CompressedMatrix &mat); // assignment operator.
template<typename Real>
CompressedMatrix &operator = (const MatrixBase<Real> &mat); // assignment operator.
// Note: mat must have the correct size, CopyToMat no longer attempts
// to resize the matrix
template<class Real>
template<typename Real>
void CopyToMat(MatrixBase<Real> *mat) const;
void Write(std::ostream &os, bool binary) const;
@ -122,12 +126,12 @@ class CompressedMatrix {
uint16 percentile_100;
};
template<class Real>
template<typename Real>
static void CompressColumn(const GlobalHeader &global_header,
const Real *data, MatrixIndexT stride,
int32 num_rows, PerColHeader *header,
unsigned char *byte_data);
template<class Real>
template<typename Real>
static void ComputeColHeader(const GlobalHeader &global_header,
const Real *data, MatrixIndexT stride,
int32 num_rows, PerColHeader *header);

Просмотреть файл

@ -36,7 +36,7 @@ namespace kaldi {
// This class is not to be used externally. See the Eig function in the Matrix
// class in kaldi-matrix.h. This is the external interface.
template<class Real> class EigenvalueDecomposition {
template<typename Real> class EigenvalueDecomposition {
// This class is based on the EigenvalueDecomposition class from the JAMA
// library (version 1.0.2).
public:
@ -110,7 +110,7 @@ template<class Real> class EigenvalueDecomposition {
template class EigenvalueDecomposition<float>; // force instantiation.
template class EigenvalueDecomposition<double>; // force instantiation.
template<class Real> void EigenvalueDecomposition<Real>::Tred2() {
template<typename Real> void EigenvalueDecomposition<Real>::Tred2() {
// This is derived from the Algol procedures tred2 by
// Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
// Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
@ -224,7 +224,7 @@ template<class Real> void EigenvalueDecomposition<Real>::Tred2() {
e_[0] = 0.0;
}
template<class Real> void EigenvalueDecomposition<Real>::Tql2() {
template<typename Real> void EigenvalueDecomposition<Real>::Tql2() {
// This is derived from the Algol procedures tql2, by
// Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
// Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
@ -341,7 +341,7 @@ template<class Real> void EigenvalueDecomposition<Real>::Tql2() {
}
}
template<class Real>
template<typename Real>
void EigenvalueDecomposition<Real>::Orthes() {
// This is derived from the Algol procedures orthes and ortran,
@ -433,7 +433,7 @@ void EigenvalueDecomposition<Real>::Orthes() {
}
}
template<class Real> void EigenvalueDecomposition<Real>::Hqr2() {
template<typename Real> void EigenvalueDecomposition<Real>::Hqr2() {
// This is derived from the Algol procedure hqr2,
// by Martin and Wilkinson, Handbook for Auto. Comp.,
// Vol.ii-Linear Algebra, and the corresponding
@ -872,7 +872,7 @@ template<class Real> void EigenvalueDecomposition<Real>::Hqr2() {
}
}
template<class Real>
template<typename Real>
EigenvalueDecomposition<Real>::EigenvalueDecomposition(const MatrixBase<Real> &A) {
KALDI_ASSERT(A.NumCols() == A.NumRows() && A.NumCols() >= 1);
n_ = A.NumRows();
@ -907,7 +907,7 @@ EigenvalueDecomposition<Real>::EigenvalueDecomposition(const MatrixBase<Real> &A
}
}
template<class Real>
template<typename Real>
EigenvalueDecomposition<Real>::~EigenvalueDecomposition() {
delete [] d_;
delete [] e_;

Просмотреть файл

@ -61,7 +61,7 @@ namespace kaldi {
*/
template<class Real>
template<typename Real>
bool MatrixBase<Real>::JamaSvd(VectorBase<Real> *s_in,
MatrixBase<Real> *U_in,
MatrixBase<Real> *V_in) { // Destructive!

Просмотреть файл

@ -27,12 +27,12 @@ namespace ut = kaldi::unittest;
namespace kaldi {
template<class Real> static void InitRand(VectorBase<Real> *v) {
template<typename Real> static void InitRand(VectorBase<Real> *v) {
for (MatrixIndexT i = 0;i < v->Dim();i++)
(*v)(i) = RandGauss();
}
template<class Real> static void InitRand(MatrixBase<Real> *M) {
template<typename Real> static void InitRand(MatrixBase<Real> *M) {
start:
for (MatrixIndexT i = 0;i < M->NumRows();i++)
for (MatrixIndexT j = 0;j < M->NumCols();j++)
@ -44,7 +44,7 @@ template<class Real> static void InitRand(MatrixBase<Real> *M) {
}
}
template<class Real> static void InitRand(SpMatrix<Real> *M) {
template<typename Real> static void InitRand(SpMatrix<Real> *M) {
start_sp:
for (MatrixIndexT i = 0;i < M->NumRows();i++)
for (MatrixIndexT j = 0;j<=i;j++)
@ -56,7 +56,7 @@ template<class Real> static void InitRand(SpMatrix<Real> *M) {
}
}
template<class Real> static void UnitTestGpsr() {
template<typename Real> static void UnitTestGpsr() {
for (int32 i = 0; i < 5; i++) {
MatrixIndexT dim1 = (rand() % 10) + 10;
MatrixIndexT dim2 = (rand() % 10) + 10;

Просмотреть файл

@ -5,7 +5,6 @@
// Yanmin Qian; Petr Schwarz; Jan Silovsky;
// Haihua Xu
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
@ -30,9 +29,14 @@
namespace kaldi {
template<typename Real>
void MatrixBase<Real>::Invert(Real *LogDet, Real *DetSign,
void MatrixBase<Real>::Invert(Real *log_det, Real *det_sign,
bool inverse_needed) {
KALDI_ASSERT(num_rows_ == num_cols_);
if (num_rows_ == 0) {
if (det_sign) *det_sign = 1;
if (log_det) *log_det = 0.0;
return;
}
#ifndef HAVE_ATLAS
KaldiBlasInt *pivot = new KaldiBlasInt[num_rows_];
KaldiBlasInt M = num_rows_;
@ -60,26 +64,26 @@ void MatrixBase<Real>::Invert(Real *LogDet, Real *DetSign,
if (inverse_needed) {
KALDI_ERR << "Cannot invert: matrix is singular";
} else {
if (LogDet) *LogDet = -std::numeric_limits<Real>::infinity();
if (DetSign) *DetSign = 0;
if (log_det) *log_det = -std::numeric_limits<Real>::infinity();
if (det_sign) *det_sign = 0;
return;
}
}
if (DetSign != NULL) {
if (det_sign != NULL) {
int sign = 1;
for (MatrixIndexT i = 0; i < num_rows_; i++)
if (pivot[i] != static_cast<int>(i) + pivot_offset) sign *= -1;
*DetSign = sign;
*det_sign = sign;
}
if (LogDet != NULL || DetSign != NULL) { // Compute log determinant.
if (LogDet != NULL) *LogDet = 0.0;
if (log_det != NULL || det_sign != NULL) { // Compute log determinant.
if (log_det != NULL) *log_det = 0.0;
Real prod = 1.0;
for (MatrixIndexT i = 0; i < num_rows_; i++) {
prod *= (*this)(i, i);
if (i == num_rows_ - 1 || std::fabs(prod) < 1.0e-10 ||
std::fabs(prod) > 1.0e+10) {
if (LogDet != NULL) *LogDet += log(fabs(prod));
if (DetSign != NULL) *DetSign *= (prod > 0 ? 1.0 : -1.0);
if (log_det != NULL) *log_det += log(fabs(prod));
if (det_sign != NULL) *det_sign *= (prod > 0 ? 1.0 : -1.0);
prod = 1.0;
}
}
@ -108,8 +112,8 @@ void MatrixBase<float>::AddVecVec(const float alpha,
1, data_, stride_);
}
template<class Real>
template<class OtherReal>
template<typename Real>
template<typename OtherReal>
void MatrixBase<Real>::AddVecVec(const Real alpha,
const VectorBase<OtherReal> &a,
const VectorBase<OtherReal> &b) {
@ -146,6 +150,7 @@ void MatrixBase<double>::AddVecVec(const double alpha,
const VectorBase<double> &a,
const VectorBase<double> &rb) {
KALDI_ASSERT(a.Dim() == num_rows_ && rb.Dim() == num_cols_);
if (num_rows_ == 0) return;
cblas_Xger(a.Dim(), rb.Dim(), alpha, a.Data(), 1, rb.Data(),
1, data_, stride_);
}
@ -162,11 +167,50 @@ void MatrixBase<Real>::AddMatMat(const Real alpha,
|| (transA == kNoTrans && transB == kTrans && A.num_cols_ == B.num_cols_ && A.num_rows_ == num_rows_ && B.num_rows_ == num_cols_)
|| (transA == kTrans && transB == kTrans && A.num_rows_ == B.num_cols_ && A.num_cols_ == num_rows_ && B.num_rows_ == num_cols_));
KALDI_ASSERT(&A != this && &B != this);
if (num_rows_ == 0) return;
cblas_Xgemm(alpha, transA, A.data_, A.num_rows_, A.num_cols_, A.stride_,
transB, B.data_, B.stride_, beta, data_, num_rows_, num_cols_, stride_);
}
/// Copies every element below the diagonal, (i, j) with j < i, onto its
/// mirror position (j, i) above the diagonal. The matrix must be square;
/// the diagonal and the lower triangle are left untouched.
template<typename Real>
void MatrixBase<Real>::CopyLowerToUpper() {
  KALDI_ASSERT(num_rows_ == num_cols_);
  Real *data = data_;
  const MatrixIndexT dim = num_rows_;
  // Element offsets are formed in size_t rather than in the (signed) index
  // type, so that row * stride cannot overflow for very large matrices.
  const size_t stride = static_cast<size_t>(stride_);
  for (MatrixIndexT i = 0; i < dim; i++) {
    const size_t row_offset = static_cast<size_t>(i) * stride;  // start of row i
    for (MatrixIndexT j = 0; j < i; j++)
      data[static_cast<size_t>(j) * stride + i] = data[row_offset + j];
  }
}
/// Copies every element above the diagonal, (j, i) with j < i, onto its
/// mirror position (i, j) below the diagonal. The matrix must be square;
/// the diagonal and the upper triangle are left untouched.
template<typename Real>
void MatrixBase<Real>::CopyUpperToLower() {
  KALDI_ASSERT(num_rows_ == num_cols_);
  Real *data = data_;
  const MatrixIndexT dim = num_rows_;
  // Element offsets are formed in size_t rather than in the (signed) index
  // type, so that row * stride cannot overflow for very large matrices.
  const size_t stride = static_cast<size_t>(stride_);
  for (MatrixIndexT i = 0; i < dim; i++) {
    const size_t row_offset = static_cast<size_t>(i) * stride;  // start of row i
    for (MatrixIndexT j = 0; j < i; j++)
      data[row_offset + j] = data[static_cast<size_t>(j) * stride + i];
  }
}
/// Symmetric rank-k style update of a square matrix, done through
/// cblas_Xsyrk with the given alpha, A (optionally transposed) and beta.
/// Only the lower triangle of *this is updated. A must not share storage
/// with *this, and its row count (kNoTrans) or column count (kTrans) must
/// match the dimension of *this.
template<typename Real>
void MatrixBase<Real>::SymAddMat2(const Real alpha,
                                  const MatrixBase<Real> &A,
                                  MatrixTransposeType transA,
                                  Real beta) {
  KALDI_ASSERT(num_rows_ == num_cols_ &&
               ((transA == kNoTrans && A.num_rows_ == num_rows_) ||
                (transA == kTrans && A.num_cols_ == num_cols_)));
  KALDI_ASSERT(A.data_ != data_);
  if (num_rows_ == 0) return;  // nothing to update

  // The dimension of A that is summed over.
  MatrixIndexT A_other_dim;
  if (transA == kNoTrans) {
    A_other_dim = A.num_cols_;
  } else {
    A_other_dim = A.num_rows_;
  }

  // This function call is hard-coded to update the lower triangle.
  cblas_Xsyrk(transA, num_rows_, A_other_dim, alpha, A.Data(),
              A.Stride(), beta, this->data_, this->stride_);
}
template<typename Real>
void MatrixBase<Real>::AddMatSmat(const Real alpha,
@ -253,13 +297,14 @@ void MatrixBase<Real>::AddSpSp(const Real alpha, const SpMatrix<Real> &A_in,
// CblasLower or CblasUpper would work below as symmetric matrix is copied
// fully (to save work, we used the matrix constructor from SpMatrix).
// CblasLeft means A is on the left: C <-- alpha A B + beta C
if (sz == 0) return;
cblas_Xsymm(alpha, sz, A.data_, A.stride_, B.data_, B.stride_, beta, data_, stride_);
}
template<typename Real>
void MatrixBase<Real>::AddMat(const Real alpha, const MatrixBase<Real>& A,
MatrixTransposeType transA) {
if (&A == this) { // Make it work in this case.
if (&A == this) {
if (transA == kNoTrans) {
Scale(alpha + 1.0);
} else {
@ -293,20 +338,22 @@ void MatrixBase<Real>::AddMat(const Real alpha, const MatrixBase<Real>& A,
Real *adata = A.data_, *data = data_;
if (transA == kNoTrans) {
KALDI_ASSERT(A.num_rows_ == num_rows_ && A.num_cols_ == num_cols_);
if (num_rows_ == 0) return;
for (MatrixIndexT row = 0; row < num_rows_; row++, adata += aStride,
data += stride) {
cblas_Xaxpy(num_cols_, alpha, adata, 1, data, 1);
}
} else {
KALDI_ASSERT(A.num_cols_ == num_rows_ && A.num_rows_ == num_cols_);
if (num_rows_ == 0) return;
for (MatrixIndexT row = 0; row < num_rows_; row++, adata++, data += stride)
cblas_Xaxpy(num_cols_, alpha, adata, aStride, data, 1);
}
}
}
template<class Real>
template<class OtherReal>
template<typename Real>
template<typename OtherReal>
void MatrixBase<Real>::AddSp(const Real alpha, const SpMatrix<OtherReal> &S) {
KALDI_ASSERT(S.NumRows() == NumRows() && S.NumRows() == NumCols());
Real *data = data_; const OtherReal *sdata = S.Data();
@ -331,6 +378,31 @@ template
void MatrixBase<double>::AddSp(const double alpha, const SpMatrix<float> &S);
/// Scales *this by beta, then adds alpha * v(i) times row i of M (or of M
/// transposed, when transM == kTrans) to row i of *this, for every row i.
/// v must have one entry per row of *this, and M (after the optional
/// transpose) must have the same dimensions as *this.
template<typename Real>
void MatrixBase<Real>::AddDiagVecMat(
    const Real alpha, VectorBase<Real> &v,
    const MatrixBase<Real> &M,
    MatrixTransposeType transM,
    Real beta) {
  if (beta != 1.0) this->Scale(beta);

  if (transM == kNoTrans) {
    KALDI_ASSERT(SameDim(*this, M));
  } else {
    KALDI_ASSERT(M.NumRows() == NumCols() && M.NumCols() == NumRows());
  }
  KALDI_ASSERT(v.Dim() == this->NumRows());

  // Strides for walking M as if it had the same layout as *this: for a
  // transposed M the roles of row stride and column stride are exchanged.
  const bool transposed = (transM == kTrans);
  const MatrixIndexT M_row_stride = (transposed ? 1 : M.Stride()),
      M_col_stride = (transposed ? M.Stride() : 1),
      this_stride = stride_,
      rows = num_rows_, cols = num_cols_;

  Real *dest_row = data_;
  const Real *M_row = M.Data(), *v_elem = v.Data();
  if (num_rows_ == 0) return;

  // One axpy per row: dest_row += (alpha * v(i)) * M_row.
  for (MatrixIndexT i = 0; i < rows; i++) {
    cblas_Xaxpy(cols, alpha * *v_elem, M_row, M_col_stride, dest_row, 1);
    dest_row += this_stride;
    M_row += M_row_stride;
    v_elem++;
  }
}
#if !defined(HAVE_ATLAS) && !defined(USE_KALDI_SVD)
// ****************************************************************************
// ****************************************************************************
@ -869,6 +941,7 @@ template<typename Real> void MatrixBase<Real>::Max(const MatrixBase<Real> &A) {
template<typename Real> void MatrixBase<Real>::Scale(Real alpha) {
if (alpha == 1.0) return;
if (num_rows_ == 0) return;
if (num_cols_ == stride_) {
cblas_Xscal(static_cast<size_t>(num_rows_) * static_cast<size_t>(num_cols_),
alpha, data_,1);
@ -893,6 +966,58 @@ void MatrixBase<Real>::MulRowsVec(const VectorBase<Real> &scale) {
}
}
/// Multiplies element (i, j) of *this by src(i, j / group_size), so each
/// column of src scales a run of group_size consecutive columns of *this.
/// group_size is NumCols() / src.NumCols(), rounded up when the division
/// is not exact.
template<typename Real>
void MatrixBase<Real>::MulRowsGroupMat(const MatrixBase<Real> &src) {
  KALDI_ASSERT(src.NumCols() > 0 && src.NumCols() <= this->NumCols());
  KALDI_ASSERT(this->NumCols() % src.NumCols() == 0 ||
               this->NumCols() % (src.NumCols() - 1) < this->NumCols() / (src.NumCols() - 1));

  // Integer quotient, bumped by one if there is a remainder.
  int group_size = this->NumCols() / src.NumCols();
  if (this->NumCols() % src.NumCols() != 0) {
    group_size += 1;
  }

  const MatrixIndexT rows = num_rows_, cols = num_cols_;
  for (MatrixIndexT r = 0; r < rows; r++) {
    for (MatrixIndexT c = 0; c < cols; c++) {
      (*this)(r, c) *= src(r, c / group_size);
    }
  }
}
/// Fills *this element-wise from src1 (same column count as *this) and src2
/// (one column per group of group_size columns of *this):
///  - power == 1.0: (*this)(i, j) is the sign of src1(i, j) (0, 1 or -1);
///  - otherwise: 0 where the group's src2 entry is 0, else
///    |src1(i, j)|^(power - 1) * sign(src1(i, j)), additionally multiplied
///    by src2^(1 - power) when the group's src2 entry is positive.
/// group_size is NumCols() / src2.NumCols(), rounded up when not exact.
template<typename Real>
void MatrixBase<Real>::GroupPnormDeriv(const MatrixBase<Real> &src1,
                                       const MatrixBase<Real> &src2,
                                       Real power) {
  KALDI_ASSERT(src2.NumCols() > 0 && src2.NumCols() <= this->NumCols());
  KALDI_ASSERT(this->NumCols() % src2.NumCols() == 0 ||
               this->NumCols() % (src2.NumCols() - 1) < this->NumCols() / (src2.NumCols() - 1));

  // Integer quotient, bumped by one if there is a remainder.
  int group_size = this->NumCols() / src2.NumCols();
  if (this->NumCols() % src2.NumCols() != 0) {
    group_size += 1;
  }

  const MatrixIndexT rows = this->NumRows(), cols = this->NumCols();

  if (power == 1.0) {
    // Only the sign of the input matters in this case.
    for (MatrixIndexT r = 0; r < rows; r++) {
      for (MatrixIndexT c = 0; c < cols; c++) {
        const Real in = src1(r, c);
        (*this)(r, c) = (in == 0 ? 0 : (in > 0 ? 1 : -1));
      }
    }
    return;
  }

  for (MatrixIndexT r = 0; r < rows; r++) {
    for (MatrixIndexT c = 0; c < cols; c++) {
      const Real group_val = src2(r, c / group_size);
      if (group_val == 0) {
        (*this)(r, c) = 0;
        continue;
      }
      const Real in = src1(r, c);
      // Kept as one expression so intermediate precision is unchanged.
      (*this)(r, c) = pow(std::abs(in), power - 1) *
          (group_val > 0 ? pow(group_val, 1 - power) : 1) *
          (in >= 0 ? 1 : -1) ;
    }
  }
}
template<typename Real> // scales each column by scale[i].
void MatrixBase<Real>::MulColsVec(const VectorBase<Real> &scale) {
KALDI_ASSERT(scale.Dim() == num_cols_);
@ -932,8 +1057,19 @@ void MatrixBase<Real>::SetUnit() {
template<typename Real>
void MatrixBase<Real>::SetRandn() {
for (MatrixIndexT row = 0; row < num_rows_; row++) {
for (MatrixIndexT col = 0; col < num_cols_; col++) {
(*this)(row, col) = static_cast<Real>(kaldi::RandGauss());
Real *row_data = this->RowData(row);
for (MatrixIndexT col = 0; col < num_cols_; col++, row_data++) {
*row_data = static_cast<Real>(kaldi::RandGauss());
}
}
}
/// Overwrites every element with a fresh value from kaldi::RandUniform(),
/// visiting rows in order and, within each row, columns in order.
template<typename Real>
void MatrixBase<Real>::SetRandUniform() {
  for (MatrixIndexT r = 0; r < num_rows_; r++) {
    Real *elem = this->RowData(r);
    for (MatrixIndexT c = 0; c < num_cols_; c++) {
      elem[c] = static_cast<Real>(kaldi::RandUniform());
    }
  }
}
@ -1218,7 +1354,7 @@ SubMatrix<Real>::SubMatrix(Real *data,
}
template<class Real>
template<typename Real>
void MatrixBase<Real>::Add(const Real alpha) {
Real *data = data_;
MatrixIndexT stride = stride_;
@ -1227,8 +1363,17 @@ void MatrixBase<Real>::Add(const Real alpha) {
data[c + stride*r] += alpha;
}
/// Adds alpha to every diagonal element (r, r). The matrix need not be
/// square: only the first min(num_rows_, num_cols_) diagonal entries exist
/// and are updated.
template<typename Real>
void MatrixBase<Real>::AddToDiag(const Real alpha) {
  Real *data = data_;
  // Consecutive diagonal elements are stride_ + 1 apart. The offset is formed
  // in size_t rather than in the (signed) index type, so that r * step cannot
  // overflow for very large matrices.
  const size_t diag_step = static_cast<size_t>(stride_) + 1;
  const MatrixIndexT num_to_add = std::min(num_rows_, num_cols_);
  for (MatrixIndexT r = 0; r < num_to_add; r++)
    data[static_cast<size_t>(r) * diag_step] += alpha;
}
template<class Real>
template<typename Real>
Real MatrixBase<Real>::Cond() const {
KALDI_ASSERT(num_rows_ > 0&&num_cols_ > 0);
Vector<Real> singular_values(std::min(num_rows_, num_cols_));
@ -1241,7 +1386,7 @@ Real MatrixBase<Real>::Cond() const {
else return 1.0e+100;
}
template<class Real>
template<typename Real>
Real MatrixBase<Real>::Trace(bool check_square) const {
KALDI_ASSERT(!check_square || num_rows_ == num_cols_);
Real ans = 0.0;
@ -1249,7 +1394,7 @@ Real MatrixBase<Real>::Trace(bool check_square) const {
return ans;
}
template<class Real>
template<typename Real>
Real MatrixBase<Real>::Max() const {
KALDI_ASSERT(num_rows_ > 0 && num_cols_ > 0);
Real ans= *data_;
@ -1260,7 +1405,7 @@ Real MatrixBase<Real>::Max() const {
return ans;
}
template<class Real>
template<typename Real>
Real MatrixBase<Real>::Min() const {
KALDI_ASSERT(num_rows_ > 0 && num_cols_ > 0);
Real ans= *data_;
@ -1273,7 +1418,7 @@ Real MatrixBase<Real>::Min() const {
template <class Real>
template <typename Real>
void MatrixBase<Real>::AddMatMatMat(Real alpha,
const MatrixBase<Real> &A, MatrixTransposeType transA,
const MatrixBase<Real> &B, MatrixTransposeType transB,
@ -1313,7 +1458,7 @@ void MatrixBase<Real>::AddMatMatMat(Real alpha,
template<class Real>
template<typename Real>
void MatrixBase<Real>::DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U, MatrixBase<Real> *Vt) {
// Svd, *this = U*diag(s)*Vt.
// With (*this).num_rows_ == m, (*this).num_cols_ == n,
@ -1357,7 +1502,7 @@ void MatrixBase<Real>::DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
if (prescale != 1.0) s->Scale(1.0/prescale);
}
template<class Real>
template<typename Real>
void MatrixBase<Real>::Svd(VectorBase<Real> *s, MatrixBase<Real> *U, MatrixBase<Real> *Vt) const {
try {
if (num_rows_ >= num_cols_) {
@ -1380,7 +1525,7 @@ void MatrixBase<Real>::Svd(VectorBase<Real> *s, MatrixBase<Real> *U, MatrixBase<
}
}
template<class Real>
template<typename Real>
bool MatrixBase<Real>::IsSymmetric(Real cutoff) const {
MatrixIndexT R = num_rows_, C = num_cols_;
if (R != C) return false;
@ -1396,7 +1541,7 @@ bool MatrixBase<Real>::IsSymmetric(Real cutoff) const {
return true;
}
template<class Real>
template<typename Real>
bool MatrixBase<Real>::IsDiagonal(Real cutoff) const{
MatrixIndexT R = num_rows_, C = num_cols_;
Real bad_sum = 0.0, good_sum = 0.0;
@ -1422,7 +1567,7 @@ void MatrixBase<Real>::TestUninitialized() const {
}
template<class Real>
template<typename Real>
bool MatrixBase<Real>::IsUnit(Real cutoff) const {
MatrixIndexT R = num_rows_, C = num_cols_;
// if (R != C) return false;
@ -1433,7 +1578,7 @@ bool MatrixBase<Real>::IsUnit(Real cutoff) const {
return (bad_max <= cutoff);
}
template<class Real>
template<typename Real>
bool MatrixBase<Real>::IsZero(Real cutoff)const {
MatrixIndexT R = num_rows_, C = num_cols_;
Real bad_max = 0.0;
@ -1443,16 +1588,9 @@ bool MatrixBase<Real>::IsZero(Real cutoff)const {
return (bad_max <= cutoff);
}
template<class Real>
template<typename Real>
Real MatrixBase<Real>::FrobeniusNorm() const{
MatrixIndexT R = num_rows_, C = num_cols_;
Real sum = 0.0;
for (MatrixIndexT i = 0;i < R;i++)
for (MatrixIndexT j = 0;j < C;j++) {
Real tmp = (*this)(i, j);
sum += tmp*tmp;
}
return sqrt(sum);
return sqrt(TraceMatMat(*this, *this, kTrans));
}
template<typename Real>
@ -1477,7 +1615,7 @@ bool MatrixBase<Real>::Equal(const MatrixBase<Real> &other) const {
}
template<class Real>
template<typename Real>
Real MatrixBase<Real>::LargestAbsElem() const{
MatrixIndexT R = num_rows_, C = num_cols_;
Real largest = 0.0;
@ -1488,7 +1626,7 @@ Real MatrixBase<Real>::LargestAbsElem() const{
}
template<class Real>
template<typename Real>
void MatrixBase<Real>::OrthogonalizeRows() {
KALDI_ASSERT(NumRows() <= NumCols());
MatrixIndexT num_rows = num_rows_;
@ -1529,7 +1667,7 @@ void MatrixBase<Real>::OrthogonalizeRows() {
// Throws exception if this failed to within supplied precision (typically because *this was not
// symmetric positive definite).
template<class Real>
template<typename Real>
void MatrixBase<Real>::SymPosSemiDefEig(VectorBase<Real> *rs, MatrixBase<Real> *rU, Real check_thresh) // e.g. check_thresh = 0.001
{
const MatrixIndexT D = num_rows_;
@ -1571,7 +1709,7 @@ void MatrixBase<Real>::SymPosSemiDefEig(VectorBase<Real> *rs, MatrixBase<Real> *
}
template<class Real>
template<typename Real>
Real MatrixBase<Real>::LogDet(Real *det_sign) const {
Real log_det;
Matrix<Real> tmp(*this);
@ -1579,15 +1717,15 @@ Real MatrixBase<Real>::LogDet(Real *det_sign) const {
return log_det;
}
template<class Real>
void MatrixBase<Real>::InvertDouble(Real *LogDet, Real *DetSign,
template<typename Real>
void MatrixBase<Real>::InvertDouble(Real *log_det, Real *det_sign,
bool inverse_needed) {
double LogDet_tmp, DetSign_tmp;
double log_det_tmp, det_sign_tmp;
Matrix<double> dmat(*this);
dmat.Invert(&LogDet_tmp, &DetSign_tmp, inverse_needed);
dmat.Invert(&log_det_tmp, &det_sign_tmp, inverse_needed);
if (inverse_needed) (*this).CopyFromMat(dmat);
if (LogDet) *LogDet = LogDet_tmp;
if (DetSign) *DetSign = DetSign_tmp;
if (log_det) *log_det = log_det_tmp;
if (det_sign) *det_sign = det_sign_tmp;
}
template<class Real>
@ -1610,7 +1748,7 @@ void MatrixBase<Real>::InvertElements() {
}
}
template<class Real>
template<typename Real>
void MatrixBase<Real>::Transpose() {
KALDI_ASSERT(num_rows_ == num_cols_);
MatrixIndexT M = num_rows_;
@ -1622,7 +1760,7 @@ void MatrixBase<Real>::Transpose() {
}
template<class Real>
template<typename Real>
void Matrix<Real>::Transpose() {
if (this->num_rows_ != this->num_cols_) {
Matrix<Real> tmp(*this, kTrans);
@ -1633,7 +1771,7 @@ void Matrix<Real>::Transpose() {
}
}
template<class Real>
template<typename Real>
void MatrixBase<Real>::ApplyFloor(Real floor_val) {
MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
for (MatrixIndexT i = 0; i < num_rows; i++) {
@ -1643,7 +1781,7 @@ void MatrixBase<Real>::ApplyFloor(Real floor_val) {
}
}
template<class Real>
template<typename Real>
void MatrixBase<Real>::ApplyCeiling(Real ceiling_val) {
MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
for (MatrixIndexT i = 0; i < num_rows; i++) {
@ -1653,28 +1791,28 @@ void MatrixBase<Real>::ApplyCeiling(Real ceiling_val) {
}
}
template<class Real>
template<typename Real>
void MatrixBase<Real>::ApplyLog() {
for (MatrixIndexT i = 0; i < num_rows_; i++) {
Row(i).ApplyLog();
}
}
template<class Real>
template<typename Real>
void MatrixBase<Real>::ApplyExp() {
for (MatrixIndexT i = 0; i < num_rows_; i++) {
Row(i).ApplyExp();
}
}
template<class Real>
template<typename Real>
void MatrixBase<Real>::ApplyPow(Real power) {
for (MatrixIndexT i = 0; i < num_rows_; i++) {
Row(i).ApplyPow(power);
}
}
template<class Real>
template<typename Real>
void MatrixBase<Real>::ApplyHeaviside() {
MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
for (MatrixIndexT i = 0; i < num_rows; i++) {
@ -1685,7 +1823,7 @@ void MatrixBase<Real>::ApplyHeaviside() {
}
template<class Real>
template<typename Real>
bool MatrixBase<Real>::Power(Real power) {
KALDI_ASSERT(num_rows_ > 0 && num_rows_ == num_cols_);
MatrixIndexT n = num_rows_;
@ -1708,7 +1846,7 @@ bool MatrixBase<Real>::Power(Real power) {
return true;
}
template<class Real>
template<typename Real>
void Matrix<Real>::Swap(Matrix<Real> *other) {
std::swap(this->data_, other->data_);
std::swap(this->num_cols_, other->num_cols_);
@ -1733,7 +1871,7 @@ void Matrix<Real>::Swap(Matrix<Real> *other) {
// By making the pointer arguments non-NULL or NULL, the user can choose whether
// or not to take the eigenvalues directly, and/or the matrix D which is
// block-diagonal with 2x2 blocks.
template<class Real>
template<typename Real>
void MatrixBase<Real>::Eig(MatrixBase<Real> *P,
VectorBase<Real> *r,
VectorBase<Real> *i) const {
@ -1756,7 +1894,7 @@ void MatrixBase<Real>::Eig(MatrixBase<Real> *P,
// INT_32 mSampSize;
// };
template<class Real>
template<typename Real>
bool ReadHtk(std::istream &is, Matrix<Real> *M_ptr, HtkHeader *header_ptr)
{
// check instantiated with double or float.
@ -1856,7 +1994,7 @@ bool ReadHtk(std::istream &is, Matrix<float> *M, HtkHeader *header_ptr);
template
bool ReadHtk(std::istream &is, Matrix<double> *M, HtkHeader *header_ptr);
template<class Real>
template<typename Real>
bool WriteHtk(std::ostream &os, const MatrixBase<Real> &M, HtkHeader htk_hdr) // header may be derived from a previous call to ReadHtk. Must be in binary mode.
{
KALDI_ASSERT(M.NumRows() == static_cast<MatrixIndexT>(htk_hdr.mNSamples));
@ -1910,7 +2048,7 @@ template
bool WriteHtk(std::ostream &os, const MatrixBase<double> &M, HtkHeader htk_hdr);
template <class Real>
template <typename Real>
Real TraceMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
const MatrixBase<Real> &B, MatrixTransposeType transB,
const MatrixBase<Real> &C, MatrixTransposeType transC) {
@ -1946,7 +2084,7 @@ double TraceMatMatMat(const MatrixBase<double> &A, MatrixTransposeType transA,
const MatrixBase<double> &C, MatrixTransposeType transC);
template <class Real>
template <typename Real>
Real TraceMatMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
const MatrixBase<Real> &B, MatrixTransposeType transB,
const MatrixBase<Real> &C, MatrixTransposeType transC,
@ -1989,7 +2127,7 @@ double TraceMatMatMatMat(const MatrixBase<double> &A, MatrixTransposeType transA
const MatrixBase<double> &C, MatrixTransposeType transC,
const MatrixBase<double> &D, MatrixTransposeType transD);
template<class Real> void SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
template<typename Real> void SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
MatrixBase<Real> *Vt, bool sort_on_absolute_value) {
/// Makes sure the Svd is sorted (from greatest to least absolute value).
MatrixIndexT num_singval = s->Dim();
@ -2031,7 +2169,7 @@ template
void SortSvd(VectorBase<double> *s, MatrixBase<double> *U,
MatrixBase<double> *Vt, bool);
template<class Real>
template<typename Real>
void CreateEigenvalueMatrix(const VectorBase<Real> &re, const VectorBase<Real> &im,
MatrixBase<Real> *D) {
MatrixIndexT n = re.Dim();
@ -2067,7 +2205,7 @@ void CreateEigenvalueMatrix(const VectorBase<double> &re, const VectorBase<doubl
template<class Real>
template<typename Real>
bool AttemptComplexPower(Real *x_re, Real *x_im, Real power) {
// Used in Matrix<Real>::Power().
// Attempts to take the complex value x to the power "power",
@ -2100,7 +2238,7 @@ bool AttemptComplexPower(double *x_re, double *x_im, double power);
template <class Real>
template <typename Real>
Real TraceMatMat(const MatrixBase<Real> &A,
const MatrixBase<Real> &B,
MatrixTransposeType trans) { // tr(A B), equivalent to sum of each element of A times same element in B'
@ -2186,6 +2324,75 @@ void MatrixBase<Real>::Tanh(const MatrixBase<Real> &src) {
}
}
// Sets each element of *this to the soft-hinge function y = log(1 + exp(x))
// of the corresponding element x of "src".  "src" must have the same
// dimensions as *this (checked with KALDI_ASSERT).
template<typename Real>
void MatrixBase<Real>::SoftHinge(const MatrixBase<Real> &src) {
  KALDI_ASSERT(SameDim(*this, src));
  const int32 rows = num_rows_, cols = num_cols_;
  for (MatrixIndexT row = 0; row < rows; row++) {
    Real *out = this->RowData(row);
    const Real *in = src.RowData(row);
    for (MatrixIndexT col = 0; col < cols; col++) {
      const Real x = in[col];
      Real y;
      if (x > 10.0) {
        // Above this threshold we skip the exponential altogether and use the
        // asymptote y = x that the function approaches.
        y = x;
      } else {
        y = log1p(exp(x));
      }
      out[col] = y;
    }
  }
}
// Sets each element (i, j) of *this to the p-norm (with p = "power") of the
// j'th group of consecutive elements in row i of "src".  The group size is
// src.NumCols() / this->NumCols(), which must be an exact integer, and "src"
// must have the same number of rows as *this.
template<typename Real>
void MatrixBase<Real>::GroupPnorm(const MatrixBase<Real> &src, Real power) {
  // Each row of "src" produces one row of output; without this check a
  // shorter *this would be written out of bounds by the loop below.
  KALDI_ASSERT(src.NumRows() == this->NumRows());
  if (this->NumCols() == 0) {
    // Nothing to compute; also avoids an integer division by zero below.
    KALDI_ASSERT(src.NumCols() == 0);
    return;
  }
  int group_size = src.NumCols() / this->NumCols();
  KALDI_ASSERT(src.NumCols() == this->NumCols() * group_size);
  const MatrixIndexT num_rows = this->NumRows(), num_cols = this->NumCols();
  for (MatrixIndexT i = 0; i < num_rows; i++)
    for (MatrixIndexT j = 0; j < num_cols; j++)
      (*this)(i, j) = src.Row(i).Range(j * group_size, group_size).Norm(power);
}
// Fills column c of *this with column indices[c] of "src"; a negative index
// makes the whole column zero.  Requires (via KALDI_ASSERT) that the two
// matrices have the same number of rows and that indices.size() equals
// NumCols().  The range of the individual indices is only verified when
// KALDI_PARANOID is defined.
template<typename Real>
void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
                                const std::vector<MatrixIndexT> &indices) {
  KALDI_ASSERT(NumRows() == src.NumRows());
  KALDI_ASSERT(NumCols() == static_cast<MatrixIndexT>(indices.size()));
#ifdef KALDI_PARANOID
  MatrixIndexT src_cols = src.NumCols();
  for (std::vector<MatrixIndexT>::const_iterator iter = indices.begin();
       iter != indices.end(); ++iter)
    KALDI_ASSERT(*iter >= -1 && *iter < src_cols);
#endif
  const MatrixIndexT rows = num_rows_, cols = num_cols_;
  Real *dst_row = this->data_;
  const Real *src_row = src.data_;
  // Work one row at a time (instead of column-by-column with cblas_Xcopy)
  // so that memory is accessed with good locality.
  for (MatrixIndexT r = 0; r < rows; r++) {
    for (MatrixIndexT c = 0; c < cols; c++) {
      const MatrixIndexT src_col = indices[c];
      if (src_col < 0) {
        dst_row[c] = 0;
      } else {
        dst_row[c] = src_row[src_col];
      }
    }
    dst_row += stride_;
    src_row += src.stride_;
  }
}
// Fills row r of *this with row indices[r] of "src"; a negative index makes
// the whole row zero.  Requires (via KALDI_ASSERT) that the two matrices have
// the same number of columns and that indices.size() equals NumRows().  This
// function itself does not check the upper bound of the individual indices.
template<typename Real>
void MatrixBase<Real>::CopyRows(const MatrixBase<Real> &src,
                                const std::vector<MatrixIndexT> &indices) {
  KALDI_ASSERT(NumCols() == src.NumCols());
  KALDI_ASSERT(NumRows() == static_cast<MatrixIndexT>(indices.size()));
  const MatrixIndexT rows = num_rows_, cols = num_cols_;
  Real *dst_row = this->data_;
  for (MatrixIndexT r = 0; r < rows; r++) {
    const MatrixIndexT src_row = indices[r];
    if (src_row >= 0) {
      cblas_Xcopy(cols, src.RowData(src_row), 1, dst_row, 1);
    } else {
      memset(dst_row, 0, sizeof(Real) * cols);
    }
    dst_row += stride_;
  }
}
template<typename Real>
void MatrixBase<Real>::Sigmoid(const MatrixBase<Real> &src) {
KALDI_ASSERT(SameDim(*this, src));
@ -2237,8 +2444,8 @@ void MatrixBase<Real>::DiffTanh(const MatrixBase<Real> &value,
}
template<class Real>
template<class OtherReal>
template<typename Real>
template<typename OtherReal>
void MatrixBase<Real>::AddVecToRows(const Real alpha, const VectorBase<OtherReal> &v) {
const MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
stride = stride_;
@ -2262,8 +2469,8 @@ template void MatrixBase<double>::AddVecToRows(const double alpha,
const VectorBase<double> &v);
template<class Real>
template<class OtherReal>
template<typename Real>
template<typename OtherReal>
void MatrixBase<Real>::AddVecToCols(const Real alpha, const VectorBase<OtherReal> &v) {
const MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
stride = stride_;

Просмотреть файл

@ -41,7 +41,7 @@ Real TraceMatMat(const MatrixBase<Real> &A, const MatrixBase<Real> &B,
/// Base class which provides matrix operations not involving resizing
/// or allocation. Classes Matrix and SubMatrix inherit from it and take care
/// of allocation and resizing.
template<class Real>
template<typename Real>
class MatrixBase {
public:
// so this child can access protected members of other instances.
@ -50,6 +50,9 @@ class MatrixBase {
friend class CuMatrixBase<Real>;
friend class CuMatrix<Real>;
friend class CuSubMatrix<Real>;
friend class CuPackedMatrix<Real>;
friend class PackedMatrix<Real>;
/// Returns number of rows (or zero for empty matrix).
inline MatrixIndexT NumRows() const { return num_rows_; }
@ -121,13 +124,16 @@ class MatrixBase {
void SetUnit();
/// Sets to random values of a normal distribution
void SetRandn();
/// Sets to numbers uniformly distributed on (0, 1)
void SetRandUniform();
/* Copying functions. These do not resize the matrix! */
/// Copy given matrix. (no resize is done).
template<typename OtherReal>
void CopyFromMat(const MatrixBase<OtherReal> & M,
MatrixTransposeType Trans = kNoTrans);
MatrixTransposeType trans = kNoTrans);
/// Copy from compressed matrix.
void CopyFromMat(const CompressedMatrix &M);
@ -139,12 +145,21 @@ class MatrixBase {
/// Copy given tpmatrix. (no resize is done).
template<typename OtherReal>
void CopyFromTp(const TpMatrix<OtherReal> &M,
MatrixTransposeType Trans = kNoTrans);
MatrixTransposeType trans = kNoTrans);
/// Copy from CUDA matrix. Implemented in ../cudamatrix/cu-matrix.h
template<typename OtherReal>
void CopyFromMat(const CuMatrixBase<OtherReal> &M,
MatrixTransposeType trans = kNoTrans);
/// Inverse of vec() operator. Copies vector into matrix, row-by-row.
/// Note that v.Dim() must either equal NumRows()*NumCols() or
/// NumCols()-- this has two modes of operation.
void CopyRowsFromVec(const VectorBase<Real> &v);
/// This version of CopyRowsFromVec is implemented in ../cudamatrix/cu-vector.cc
void CopyRowsFromVec(const CuVectorBase<Real> &v);
template<typename OtherReal>
void CopyRowsFromVec(const VectorBase<OtherReal> &v);
@ -225,6 +240,10 @@ class MatrixBase {
/// each row by a scalar taken from that dimension of the vector.
void MulRowsVec(const VectorBase<Real> &scale);
/// divide each row into src.NumCols() groups,
/// and then scale i'th row's jth group of elements by src[i, j].
void MulRowsGroupMat(const MatrixBase<Real> &src);
/// Returns logdet of matrix.
Real LogDet(Real *det_sign = NULL) const;
@ -248,6 +267,22 @@ class MatrixBase {
/// Matrix child class works also for non-square.
void Transpose();
/// Copies column r from column indices[r] of src.
/// As a special case, if indices[r] == -1, sets column r to zero.
/// indices.size() must equal this->NumCols(),
/// all elements of "indices" must be in [-1, src.NumCols()-1],
/// and src.NumRows() must equal this->NumRows().
void CopyCols(const MatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices);
/// Copies row r from row indices[r] of src.
/// As a special case, if indices[r] == -1, sets row r to zero.
/// indices.size() must equal this->NumRows(),
/// all elements of "indices" must be in [-1, src.NumRows()-1],
/// and src.NumCols() must equal this->NumCols().
void CopyRows(const MatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices);
/// Applies floor to all matrix elements
void ApplyFloor(Real floor_val);
@ -374,6 +409,24 @@ class MatrixBase {
/// Set each element to the sigmoid of the corresponding element of "src".
void Sigmoid(const MatrixBase<Real> &src);
/// Set each element to y = log(1 + exp(x))
void SoftHinge(const MatrixBase<Real> &src);
/// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} |x_j| ^ (power)) ^ (1 / power)
/// where G = x.NumCols() / y.NumCols() must be an integer.
void GroupPnorm(const MatrixBase<Real> &src, Real power);
/// Calculate derivatives for the GroupPnorm function above...
/// if "input" is the input to the GroupPnorm function above (i.e. the "src" variable),
/// and "output" is the result of the computation (i.e. the "this" of that function
/// call), and *this has the same dimension as "input", then it sets each element
/// of *this to the derivative d(output-elem)/d(input-elem) for each element of "input", where
/// "output-elem" is whichever element of output depends on that input element.
void GroupPnormDeriv(const MatrixBase<Real> &input, const MatrixBase<Real> &output,
Real power);
/// Set each element to the tanh of the corresponding element of "src".
void Tanh(const MatrixBase<Real> &src);
@ -406,25 +459,40 @@ class MatrixBase {
/// Add a scalar to each element
void Add(const Real alpha);
/// Add a scalar to each diagonal element.
void AddToDiag(const Real alpha);
/// *this += alpha * a * b^T
template<class OtherReal>
template<typename OtherReal>
void AddVecVec(const Real alpha, const VectorBase<OtherReal> &a,
const VectorBase<OtherReal> &b);
/// [each row of *this] += alpha * v
template<class OtherReal>
template<typename OtherReal>
void AddVecToRows(const Real alpha, const VectorBase<OtherReal> &v);
/// [each col of *this] += alpha * v
template<class OtherReal>
template<typename OtherReal>
void AddVecToCols(const Real alpha, const VectorBase<OtherReal> &v);
/// *this += alpha * M [or M^T]
void AddMat(const Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType transA = kNoTrans);
/// *this = beta * *this + alpha * M M^T, for symmetric matrices. It only
/// updates the lower triangle of *this. It will leave the matrix asymmetric;
/// if you need it symmetric as a regular matrix, do CopyLowerToUpper().
void SymAddMat2(const Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType transA, Real beta);
/// *this = beta * *this + alpha * diag(v) * M [or M^T].
/// The same as adding M but scaling each row M_i by v(i).
void AddDiagVecMat(const Real alpha, VectorBase<Real> &v,
const MatrixBase<Real> &M, MatrixTransposeType transM,
Real beta = 1.0);
/// *this += alpha * S
template<class OtherReal>
template<typename OtherReal>
void AddSp(const Real alpha, const SpMatrix<OtherReal> &S);
void AddMatMat(const Real alpha,
@ -512,6 +580,12 @@ class MatrixBase {
const SpMatrix<Real>& A, const SpMatrix<Real>& B,
const Real beta);
/// Copy lower triangle to upper triangle (symmetrize)
void CopyLowerToUpper();
/// Copy upper triangle to lower triangle (symmetrize)
void CopyUpperToLower();
/// This function orthogonalizes the rows of a matrix using the Gram-Schmidt
/// process. It is only applicable if NumRows() <= NumCols(). It will use
/// random number generation to fill in rows with something nonzero, in cases
@ -580,7 +654,7 @@ class MatrixBase {
};
/// A class for storing matrices.
template<class Real>
template<typename Real>
class Matrix : public MatrixBase<Real> {
public:
@ -589,12 +663,23 @@ class Matrix : public MatrixBase<Real> {
/// Basic constructor. Sets to zero by default.
/// if set_zero == false, memory contents are undefined.
Matrix(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type = kSetZero):
Matrix(const MatrixIndexT r, const MatrixIndexT c,
MatrixResizeType resize_type = kSetZero):
MatrixBase<Real>() { Resize(r, c, resize_type); }
/// Copy constructor from CUDA matrix
/// This is defined in ../cudamatrix/cu-matrix.h
template<typename OtherReal>
explicit Matrix(const CuMatrixBase<OtherReal> &cu,
MatrixTransposeType trans = kNoTrans);
/// Swaps the contents of *this and *other. Shallow swap.
void Swap(Matrix<Real> *other);
/// Defined in ../cudamatrix/cu-matrix.cc
void Swap(CuMatrix<Real> *mat);
/// Constructor from any MatrixBase. Can also copy with transpose.
/// Allocates new memory.
explicit Matrix(const MatrixBase<Real> & M,
@ -707,11 +792,11 @@ struct HtkHeader {
};
// Read HTK formatted features from file into matrix.
template<class Real>
template<typename Real>
bool ReadHtk(std::istream &is, Matrix<Real> *M, HtkHeader *header_ptr);
// Write (HTK format) features to file from matrix.
template<class Real>
template<typename Real>
bool WriteHtk(std::ostream &os, const MatrixBase<Real> &M, HtkHeader htk_hdr);
@ -764,19 +849,32 @@ class SubMatrix : public MatrixBase<Real> {
// Some declarations. These are traces of products.
/// Free-function form of MatrixBase::ApproxEqual: returns the result of
/// A.ApproxEqual(B, tol).  The tolerance defaults to 0.01.
template<typename Real>
bool ApproxEqual(const MatrixBase<Real> &A,
                 const MatrixBase<Real> &B, Real tol = 0.01) {
  const bool approx_equal = A.ApproxEqual(B, tol);
  return approx_equal;
}
/// Asserts, via KALDI_ASSERT, that A.ApproxEqual(B, tol) holds.
/// Note that "tol" is declared as float (default 0.01) whatever Real is, and
/// that both matrices are taken by non-const reference.
template<typename Real>
inline void AssertEqual(MatrixBase<Real> &A, MatrixBase<Real> &B,
float tol = 0.01) {
KALDI_ASSERT(A.ApproxEqual(B, tol));
}
/// Returns trace of matrix.
template <class Real>
// Thin wrapper around A.Trace().  Note that the result is returned as a
// double whatever the element type Real of the matrix is.
template <typename Real>
double TraceMat(const MatrixBase<Real> &A) {
  const double trace = A.Trace();
  return trace;
}
/// Returns tr(A B C)
template <class Real>
template <typename Real>
Real TraceMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
const MatrixBase<Real> &B, MatrixTransposeType transB,
const MatrixBase<Real> &C, MatrixTransposeType transC);
/// Returns tr(A B C D)
template <class Real>
template <typename Real>
Real TraceMatMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
const MatrixBase<Real> &B, MatrixTransposeType transB,
const MatrixBase<Real> &C, MatrixTransposeType transC,
@ -796,7 +894,7 @@ Real TraceMatMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
/// otherwise, moving the columns of U, if it exists, and the rows of Vt, if it
/// exists around in the same way. Note: the "absolute value" part won't matter
/// if this is an actual SVD, since singular values are non-negative.
template<class Real> void SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
template<typename Real> void SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
MatrixBase<Real>* Vt = NULL,
bool sort_on_absolute_value = true);
@ -806,7 +904,7 @@ template<class Real> void SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
/// 2x2 block [lambda, mu; -mu, lambda].
/// This function will throw if any complex eigenvalues are not in complex conjugate
/// pairs (or the members of such pairs are not consecutively numbered).
template<class Real>
template<typename Real>
void CreateEigenvalueMatrix(const VectorBase<Real> &real, const VectorBase<Real> &imag,
MatrixBase<Real> *D);
@ -814,7 +912,7 @@ void CreateEigenvalueMatrix(const VectorBase<Real> &real, const VectorBase<Real>
/// declare it here mainly for the testing code to see. It takes a complex value to
/// a power using a method that will work for noninteger powers (but will fail if the
/// complex value is real and negative).
template<class Real>
template<typename Real>
bool AttemptComplexPower(Real *x_re, Real *x_im, Real power);
@ -834,7 +932,7 @@ template<typename Real>
std::istream & operator >> (std::istream & In, Matrix<Real> & M);
template<class Real>
/// Returns true if M and N have the same number of rows and the same number
/// of columns.
template<typename Real>
bool SameDim(const MatrixBase<Real> &M, const MatrixBase<Real> &N) {
  if (M.NumRows() != N.NumRows())
    return false;
  return M.NumCols() == N.NumCols();
}

Просмотреть файл

@ -45,7 +45,7 @@ template
double VecVec<>(const VectorBase<double> &a,
const VectorBase<double> &b);
template<class Real, class OtherReal>
template<typename Real, typename OtherReal>
Real VecVec(const VectorBase<Real> &ra,
const VectorBase<OtherReal> &rb) {
MatrixIndexT adim = ra.Dim();
@ -470,20 +470,25 @@ Real VectorBase<Real>::Norm(Real p) const {
return sqrt(sum);
} else {
Real tmp;
bool ok = true;
for (MatrixIndexT i = 0; i < dim_; i++) {
tmp = pow(std::abs(data_[i]), p);
if (tmp == HUGE_VAL) { // HUGE_VAL is what pow returns on error.
KALDI_ERR << "Could not raise element " << i << "to power " << p
<< ": returned value = " << tmp;
}
if (tmp == HUGE_VAL) // HUGE_VAL is what pow returns on error.
ok = false;
sum += tmp;
}
tmp = pow(sum, static_cast<Real>(1.0/p));
if (tmp == HUGE_VAL) { // HUGE_VAL is what errno returns on error.
KALDI_ERR << "Could not take the " << p << "-th root of " << sum
<< "; returned value = " << tmp;
}
return tmp;
KALDI_ASSERT(tmp != HUGE_VAL); // should not happen here.
if (ok) {
return tmp;
} else {
Real maximum = this->Max(), minimum = this->Min(),
max_abs = std::max(maximum, -minimum);
KALDI_ASSERT(max_abs > 0); // Or should not have reached here.
Vector<Real> tmp(*this);
tmp.Scale(1.0 / max_abs);
return tmp.Norm(p) * max_abs;
}
}
}
@ -612,9 +617,7 @@ void VectorBase<double>::CopyColFromMat(const MatrixBase<double> &mat, MatrixInd
template<typename Real>
void VectorBase<Real>::CopyDiagFromMat(const MatrixBase<Real> &M) {
KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols()));
for (MatrixIndexT i = 0; i < dim_; i++)
data_[i] = M(i, i);
// could make this more efficient.
cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, 1);
}
template<typename Real>
@ -774,12 +777,13 @@ MatrixIndexT VectorBase<Real>::ApplyFloor(const VectorBase<Real> &floor_vec) {
template<typename Real>
Real VectorBase<Real>::ApplySoftMax() {
Real max = this->Max(), sum = 0.0;
Real max = this->Max(), sum = 0.0;
for (MatrixIndexT i = 0; i < dim_; i++) {
sum += (data_[i] = exp(data_[i] - max));
}
this->Scale(1.0 / sum);
return max + log(sum);
}
#ifdef HAVE_MKL
@ -868,7 +872,12 @@ void VectorBase<Real>::MulElements(const VectorBase<Real> &v) {
}
}
// Set each element to y = (x == orig ? changed : x).
// The comparison is an exact ==, so only elements that compare equal to
// "orig" are overwritten; all other elements are left untouched.
template<typename Real>
void VectorBase<Real>::ReplaceValue(Real orig, Real changed) {
  Real *elems = data_;
  for (MatrixIndexT i = 0; i < dim_; i++) {
    Real &elem = elems[i];
    if (elem == orig)
      elem = changed;
  }
}
template<typename Real>
@ -1136,7 +1145,7 @@ void VectorBase<Real>::Write(std::ostream & os, bool binary) const {
}
template<class Real>
template<typename Real>
void VectorBase<Real>::AddVec2(const Real alpha, const VectorBase<Real> &v) {
KALDI_ASSERT(dim_ == v.dim_);
for (MatrixIndexT i = 0; i < dim_; i++)
@ -1144,7 +1153,7 @@ void VectorBase<Real>::AddVec2(const Real alpha, const VectorBase<Real> &v) {
}
// this <-- beta*this + alpha*M*v.
template<class Real>
template<typename Real>
void VectorBase<Real>::AddTpVec(const Real alpha, const TpMatrix<Real> &M,
const MatrixTransposeType trans,
const VectorBase<Real> &v,
@ -1162,7 +1171,7 @@ void VectorBase<Real>::AddTpVec(const Real alpha, const TpMatrix<Real> &M,
}
}
template<class Real>
template<typename Real>
Real VecMatVec(const VectorBase<Real> &v1, const MatrixBase<Real> &M,
const VectorBase<Real> &v2) {
KALDI_ASSERT(v1.Dim() == M.NumRows() && v2.Dim() == M.NumCols());
@ -1178,7 +1187,7 @@ template
double VecMatVec(const VectorBase<double> &v1, const MatrixBase<double> &M,
const VectorBase<double> &v2);
template<class Real>
template<typename Real>
void Vector<Real>::Swap(Vector<Real> *other) {
std::swap(this->data_, other->data_);
std::swap(this->dim_, other->dim_);
@ -1209,6 +1218,29 @@ void VectorBase<Real>::AddDiagMat2(
}
}
// Does *this = beta * *this + alpha * diag(op(M) op(N)), where op(X) is X or
// its transpose according to the corresponding "trans" argument.  Element i
// is updated with the dot product of row i of op(M) and column i of op(N).
// Only the summed-over dimension is checked here (with KALDI_ASSERT).
template<typename Real>
void VectorBase<Real>::AddDiagMatMat(
    Real alpha,
    const MatrixBase<Real> &M, MatrixTransposeType transM,
    const MatrixBase<Real> &N, MatrixTransposeType transN,
    Real beta) {
  const bool m_is_trans = (transM == kTrans),
      n_is_trans = (transN == kTrans);
  MatrixIndexT M_col_dim = (m_is_trans ? M.NumRows() : M.NumCols()),
      N_row_dim = (n_is_trans ? N.NumCols() : N.NumRows());
  KALDI_ASSERT(M_col_dim == N_row_dim); // this is the dimension we sum over

  // Distance in memory between consecutive rows / consecutive columns of
  // op(M) and op(N); transposing simply exchanges the two.
  MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1;
  MatrixIndexT N_row_stride = N.Stride(), N_col_stride = 1;
  if (m_is_trans) std::swap(M_row_stride, M_col_stride);
  if (n_is_trans) std::swap(N_row_stride, N_col_stride);

  const MatrixIndexT num_elems = this->dim_;
  Real *out = this->data_;
  const Real *m_row = M.Data(),  // start of row i of op(M)
      *n_col = N.Data();         // start of column i of op(N)
  for (MatrixIndexT i = 0; i < num_elems; i++) {
    *out = beta * *out + alpha * cblas_Xdot(M_col_dim, m_row, M_col_stride, n_col, N_row_stride);
    m_row += M_row_stride;
    n_col += N_col_stride;
    out++;
  }
}
template class Vector<float>;
template class Vector<double>;
template class VectorBase<float>;
@ -1216,5 +1248,3 @@ template class VectorBase<double>;
} // namespace kaldi

Просмотреть файл

@ -109,6 +109,11 @@ class VectorBase {
template<typename OtherReal>
void CopyFromVec(const VectorBase<OtherReal> &v);
/// Copy from CuVector. This is defined in ../cudamatrix/cu-vector.h
template<typename OtherReal>
void CopyFromVec(const CuVectorBase<OtherReal> &v);
/// Apply natural log to all elements. Throw if any element of
/// the vector is negative (but doesn't complain about zero; the
/// log will be -infinity
@ -157,7 +162,7 @@ class VectorBase {
/// Add vector : *this = *this + alpha * rv (with casting between floats and
/// doubles)
template<class OtherReal>
template<typename OtherReal>
void AddVec(const Real alpha, const VectorBase<OtherReal> &v);
/// Add vector : *this = *this + alpha * rv^2 [element-wise squaring].
@ -165,7 +170,7 @@ class VectorBase {
/// Add vector : *this = *this + alpha * rv^2 [element-wise squaring],
/// with casting between floats and doubles.
template<class OtherReal>
template<typename OtherReal>
void AddVec2(const Real alpha, const VectorBase<OtherReal> &v);
/// Add matrix times vector : this <-- beta*this + alpha*M*v.
@ -192,6 +197,9 @@ class VectorBase {
const MatrixTransposeType trans, const VectorBase<Real> &v,
const Real beta); // **beta previously defaulted to 0.0**
/// Set each element to y = (x == orig ? changed : x).
void ReplaceValue(Real orig, Real changed);
/// Multiply element-by-element by another vector.
void MulElements(const VectorBase<Real> &v);
/// Multiply element-by-element by another vector of different type.
@ -228,6 +236,8 @@ class VectorBase {
template<typename OtherReal>
void CopyRowsFromMat(const MatrixBase<OtherReal> &M);
/// The following is implemented in ../cudamatrix/cu-matrix.cc
void CopyRowsFromMat(const CuMatrixBase<Real> &M);
/// Performs a column stack of the matrix M
void CopyColsFromMat(const MatrixBase<Real> &M);
@ -292,6 +302,13 @@ class VectorBase {
void AddDiagMat2(Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType trans = kNoTrans, Real beta = 1.0);
/// Add the diagonal of a matrix product:
/// *this = beta * *this + alpha * diag(M N), assuming the
/// "trans" arguments are both kNoTrans; for transpose arguments, it behaves
/// as you would expect.
void AddDiagMatMat(Real alpha, const MatrixBase<Real> &M, MatrixTransposeType transM,
const MatrixBase<Real> &N, MatrixTransposeType transN,
Real beta = 1.0);
/// Returns log(sum(exp())) without exp overflow
/// If prune > 0.0, ignores terms less than the max - prune.
/// [Note: in future, if prune = 0.0, it will take the max.
@ -354,6 +371,11 @@ class Vector: public VectorBase<Real> {
MatrixResizeType resize_type = kSetZero)
: VectorBase<Real>() { Resize(s, resize_type); }
/// Copy constructor from CUDA vector
/// This is defined in ../cudamatrix/cu-vector.h
template<typename OtherReal>
explicit Vector(const CuVectorBase<OtherReal> &cu);
/// Copy constructor. The need for this is controversial.
Vector(const Vector<Real> &v) : VectorBase<Real>() { // (cannot be explicit)
Resize(v.Dim(), kUndefined);
@ -432,7 +454,7 @@ class Vector: public VectorBase<Real> {
/// Represents a non-allocating general vector which can be defined
/// as a sub-vector of higher-level vector [or as the row of a matrix].
template<class Real>
template<typename Real>
class SubVector : public VectorBase<Real> {
public:
/// Constructor from a Vector or SubVector.
@ -506,6 +528,20 @@ std::istream & operator >> (std::istream & in, Vector<Real> & v);
/// \addtogroup matrix_funcs_scalar
/// @{
/// Free-function form of VectorBase::ApproxEqual: returns the result of
/// a.ApproxEqual(b, tol).  The tolerance defaults to 0.01.
template<typename Real>
bool ApproxEqual(const VectorBase<Real> &a,
                 const VectorBase<Real> &b, Real tol = 0.01) {
  const bool approx_equal = a.ApproxEqual(b, tol);
  return approx_equal;
}
/// Asserts, via KALDI_ASSERT, that a.ApproxEqual(b, tol) holds.
/// Note that "tol" is declared as float (default 0.01) whatever Real is, and
/// that both vectors are taken by non-const reference.
template<typename Real>
inline void AssertEqual(VectorBase<Real> &a, VectorBase<Real> &b,
float tol = 0.01) {
KALDI_ASSERT(a.ApproxEqual(b, tol));
}
/// Returns dot product between v1 and v2.
template<typename Real>
Real VecVec(const VectorBase<Real> &v1, const VectorBase<Real> &v2);
@ -516,7 +552,7 @@ Real VecVec(const VectorBase<Real> &v1, const VectorBase<OtherReal> &v2);
/// Returns \f$ v_1^T M v_2 \f$ .
/// Not as efficient as it could be where v1 == v2.
template<class Real>
template<typename Real>
Real VecMatVec(const VectorBase<Real> &v1, const MatrixBase<Real> &M,
const VectorBase<Real> &v2);

Просмотреть файл

@ -38,6 +38,12 @@ typedef enum {
kCopyData
} MatrixResizeType;
// Enumerates four "take" modes: lower, upper, mean, and mean-with-check.
// NOTE(review): judging by the names only, these presumably select which
// triangle (or the mean of both) is used when a general matrix is copied into
// a symmetric/packed one -- confirm against the code that consumes SpCopyType,
// which is not visible here.
typedef enum {
kTakeLower,
kTakeUpper,
kTakeMean,
kTakeMeanAndCheck
} SpCopyType;
template<typename Real> class VectorBase;
template<typename Real> class Vector;
@ -57,6 +63,9 @@ template<typename Real> class CuMatrix;
template<typename Real> class CuVectorBase;
template<typename Real> class CuSubVector;
template<typename Real> class CuVector;
template<typename Real> class CuPackedMatrix;
template<typename Real> class CuSpMatrix;
template<typename Real> class CuTpMatrix;
class CompressedMatrix;

Просмотреть файл

@ -28,14 +28,14 @@
namespace kaldi {
//! ComplexMul implements, inline, the complex multiplication b *= a.
template<class Real> inline void ComplexMul(const Real &a_re, const Real &a_im,
// Complex multiplication in place: (b_re + i b_im) *= (a_re + i a_im).
// Both components of the product are computed before anything is stored, so
// the result is also correct when the inputs refer to the same variables as
// the outputs.
template<typename Real> inline void ComplexMul(const Real &a_re, const Real &a_im,
                                               Real *b_re, Real *b_im) {
  const Real product_re = (*b_re * a_re) - (*b_im * a_im);
  const Real product_im = *b_re * a_im + *b_im * a_re;
  *b_im = product_im;
  *b_re = product_re;
}
template<class Real> inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
template<typename Real> inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
const Real &b_re, const Real &b_im,
Real *c_re, Real *c_im) {
*c_re += b_re*a_re - b_im*a_im;
@ -43,7 +43,7 @@ template<class Real> inline void ComplexAddProduct(const Real &a_re, const Real
}
template<class Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im) {
// Computes a = exp(i x), i.e. stores cos(x) in *a_re and sin(x) in *a_im.
template<typename Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im) {
  const Real cosine = std::cos(x);
  const Real sine = std::sin(x);
  *a_re = cosine;
  *a_im = sine;
}

Просмотреть файл

@ -26,7 +26,7 @@
namespace kaldi {
template<class Real> void ComplexFt (const VectorBase<Real> &in,
template<typename Real> void ComplexFt (const VectorBase<Real> &in,
VectorBase<Real> *out, bool forward) {
int exp_sign = (forward ? -1 : 1);
KALDI_ASSERT(out != NULL);
@ -93,7 +93,7 @@ void ComplexFt (const VectorBase<double> &in,
//! of the recursion.
template<class Real>
template<typename Real>
void ComplexFftRecursive (Real *data, int nffts, int N,
const int *factor_begin,
const int *factor_end, bool forward,
@ -331,7 +331,7 @@ void ComplexFftRecursive (Real *data, int nffts, int N,
// This is the outer-layer calling code for ComplexFftRecursive.
// It factorizes the dimension and then calls the FFT routine.
template<class Real> void ComplexFft(VectorBase<Real> *v, bool forward, Vector<Real> *tmp_in) {
template<typename Real> void ComplexFft(VectorBase<Real> *v, bool forward, Vector<Real> *tmp_in) {
KALDI_ASSERT(v != NULL);
if (v->Dim()<=1) return;
@ -347,7 +347,7 @@ template<class Real> void ComplexFft(VectorBase<Real> *v, bool forward, Vector<R
}
//! Inefficient version of Fourier transform, for testing purposes.
template<class Real> void RealFftInefficient (VectorBase<Real> *v, bool forward) {
template<typename Real> void RealFftInefficient (VectorBase<Real> *v, bool forward) {
KALDI_ASSERT(v != NULL);
MatrixIndexT N = v->Dim();
KALDI_ASSERT(N%2 == 0);
@ -388,7 +388,7 @@ void ComplexFft(VectorBase<double> *v, bool forward, Vector<double> *tmp_in);
// See the long comment below for the math behind this.
template<class Real> void RealFft (VectorBase<Real> *v, bool forward) {
template<typename Real> void RealFft (VectorBase<Real> *v, bool forward) {
KALDI_ASSERT(v != NULL);
MatrixIndexT N = v->Dim(), N2 = N/2;
KALDI_ASSERT(N%2 == 0);
@ -589,7 +589,7 @@ so: re(D_k)= 1/2 (im(B_k) + im(B_{N/2-k})) (z2)
*/
template<class Real> void ComputeDctMatrix(Matrix<Real> *M) {
template<typename Real> void ComputeDctMatrix(Matrix<Real> *M) {
//KALDI_ASSERT(M->NumRows() == M->NumCols());
MatrixIndexT K = M->NumRows();
MatrixIndexT N = M->NumCols();
@ -612,7 +612,7 @@ template void ComputeDctMatrix(Matrix<float> *M);
template void ComputeDctMatrix(Matrix<double> *M);
template<class Real>
template<typename Real>
void MatrixExponential<Real>::Clear() {
N_ = 0;
P_.Resize(0, 0);
@ -620,7 +620,7 @@ void MatrixExponential<Real>::Clear() {
powers_.clear();
}
template<class Real>
template<typename Real>
void MatrixExponential<Real>::Compute(const MatrixBase<Real> &M,
MatrixBase<Real> *X) {
// does *X = exp(M)
@ -650,7 +650,7 @@ void MatrixExponential<Real>::Compute(const MatrixBase<Real> &M,
(*X)(i, i) += 1.0;
};
template<class Real>
template<typename Real>
void MatrixExponential<Real>::Compute(const SpMatrix<Real> &M,
SpMatrix<Real> *X) {
Matrix<Real> Mfull(M), Xfull(M.NumRows(), M.NumCols());
@ -659,7 +659,7 @@ void MatrixExponential<Real>::Compute(const SpMatrix<Real> &M,
}
template<class Real>
template<typename Real>
MatrixIndexT MatrixExponential<Real>::ComputeN(const MatrixBase<Real> &M) {
// Computes the power of two we want to use. Aim to get
// AScaled.FrobeniusNorm() < 1/10.
@ -674,7 +674,7 @@ MatrixIndexT MatrixExponential<Real>::ComputeN(const MatrixBase<Real> &M) {
return N;
}
template<class Real>
template<typename Real>
void MatrixExponential<Real>::ComputeTaylor(const MatrixBase<Real> &P, MatrixBase<Real> *B0) {
KALDI_ASSERT(P.FrobeniusNorm() < 1.001); // should actually be << 1
// for this to work fast enough.
@ -710,7 +710,7 @@ void MatrixExponential<Real>::ComputeTaylor(const MatrixBase<Real> &P, MatrixBas
}
}
template<class Real>
template<typename Real>
void MatrixExponential<Real>::Backprop(const MatrixBase<Real> &hX,
MatrixBase<Real> *hM) const {
MatrixIndexT dim = P_.NumRows();
@ -747,7 +747,7 @@ void MatrixExponential<Real>::Backprop(const MatrixBase<Real> &hX,
}
template<class Real>
template<typename Real>
void MatrixExponential<Real>::Backprop(const SpMatrix<Real> &hX,
SpMatrix<Real> *hM) const {
Matrix<Real> hXfull(hX), hMfull(hX.NumRows(), hX.NumCols());
@ -756,7 +756,7 @@ void MatrixExponential<Real>::Backprop(const SpMatrix<Real> &hX,
}
template<class Real>
template<typename Real>
void MatrixExponential<Real>::BackpropTaylor(const MatrixBase<Real> &hB0,
MatrixBase<Real> *hP) const {
// Backprop through the Taylor-series computation.
@ -819,7 +819,7 @@ template class MatrixExponential<float>;
template class MatrixExponential<double>;
template<class Real>
template<typename Real>
void ComputePca(const MatrixBase<Real> &X,
MatrixBase<Real> *U,
MatrixBase<Real> *A,
@ -861,7 +861,7 @@ void ComputePca(const MatrixBase<Real> &X,
A->AddMatMat(1.0, X, kNoTrans, *U, kTrans, 0.0);
} else { // Do inner-product PCA.
SpMatrix<Real> Nsp(N); // Matrix of inner products.
Nsp.AddMat2(1.0, X, kNoTrans); // M <-- X X^T
Nsp.AddMat2(1.0, X, kNoTrans, 0.0); // M <-- X X^T
Matrix<Real> Vtmp;
Vector<Real> l;
@ -929,7 +929,7 @@ void ComputePca(const MatrixBase<double> &X,
// Added by Dan, Feb. 13 2012.
// This function does: *plus += max(0, a b^T),
// *minus += max(0, -(a b^T)).
template<class Real>
template<typename Real>
void AddOuterProductPlusMinus(Real alpha,
const VectorBase<Real> &a,
const VectorBase<Real> &b,

Просмотреть файл

@ -59,12 +59,12 @@ namespace kaldi {
in some contexts, the transform is made symmetric by multiplying
by sqrt(N) in both passes. The user can do this by themselves.
*/
template<class Real> void ComplexFft (VectorBase<Real> *v, bool forward, Vector<Real> *tmp_work = NULL);
template<typename Real> void ComplexFft (VectorBase<Real> *v, bool forward, Vector<Real> *tmp_work = NULL);
/// ComplexFt is the same as ComplexFft but it implements the Fourier
/// transform in an inefficient way. It is mainly included for testing purposes.
/// See comment for ComplexFft to describe the input and outputs and what it does.
template<class Real> void ComplexFt (const VectorBase<Real> &in,
template<typename Real> void ComplexFt (const VectorBase<Real> &in,
VectorBase<Real> *out, bool forward);
/// RealFft is a fourier transform of real inputs. Internally it uses
@ -76,12 +76,12 @@ template<class Real> void ComplexFt (const VectorBase<Real> &in,
/// The interpretation of the complex-FFT data is as follows: the array
/// is a sequence of complex numbers C_n of length N/2 with (real, im) format,
/// i.e. [real0, real_{N/2}, real1, im1, real2, im2, real3, im3, ...].
template<class Real> void RealFft (VectorBase<Real> *v, bool forward);
template<typename Real> void RealFft (VectorBase<Real> *v, bool forward);
/// RealFftInefficient has the same input and output format as RealFft above, but it is
/// an inefficient implementation included for testing purposes.
template<class Real> void RealFftInefficient (VectorBase<Real> *v, bool forward);
template<typename Real> void RealFftInefficient (VectorBase<Real> *v, bool forward);
/// ComputeDctMatrix computes a matrix corresponding to the DCT, such that
/// M * v equals the DCT of vector v. M must be square at input.
@ -97,21 +97,21 @@ template<class Real> void RealFftInefficient (VectorBase<Real> *v, bool forward)
/// because it was this way from the start and changing it would affect the
/// feature generation.
template<class Real> void ComputeDctMatrix(Matrix<Real> *M);
template<typename Real> void ComputeDctMatrix(Matrix<Real> *M);
/// ComplexMul implements, inline, the complex multiplication b *= a.
template<class Real> inline void ComplexMul(const Real &a_re, const Real &a_im,
template<typename Real> inline void ComplexMul(const Real &a_re, const Real &a_im,
Real *b_re, Real *b_im);
/// ComplexAddProduct implements, inline, the complex operation c += (a * b).
template<class Real> inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
template<typename Real> inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
const Real &b_re, const Real &b_im,
Real *c_re, Real *c_im);
/// ComplexImExp implements a <-- exp(i x), inline.
template<class Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im);
template<typename Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im);
// This class allows you to compute the matrix exponential function
@ -122,7 +122,7 @@ template<class Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im);
// It also provides a function that allows you do back-propagate the
// derivative of a scalar function through this calculation.
// The
template<class Real>
template<typename Real>
class MatrixExponential {
public:
MatrixExponential() { }
@ -194,7 +194,7 @@ class MatrixExponential {
method.
*/
template<class Real>
template<typename Real>
void ComputePca(const MatrixBase<Real> &X,
MatrixBase<Real> *U,
MatrixBase<Real> *A,
@ -205,14 +205,14 @@ void ComputePca(const MatrixBase<Real> &X,
// This function does: *plus += max(0, a b^T),
// *minus += max(0, -(a b^T)).
template<class Real>
template<typename Real>
void AddOuterProductPlusMinus(Real alpha,
const VectorBase<Real> &a,
const VectorBase<Real> &b,
MatrixBase<Real> *plus,
MatrixBase<Real> *minus);
template<class Real1, class Real2>
template<typename Real1, typename Real2>
inline void AssertSameDim(const MatrixBase<Real1> &mat1, const MatrixBase<Real2> &mat2) {
KALDI_ASSERT(mat1.NumRows() == mat2.NumRows()
&& mat1.NumCols() == mat2.NumCols());

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -28,7 +28,7 @@ namespace kaldi {
// Below, N&W refers to Nocedal and Wright, "Numerical Optimization", 2nd Ed.
template<class Real>
template<typename Real>
OptimizeLbfgs<Real>::OptimizeLbfgs(const VectorBase<Real> &x,
const LbfgsOptions &opts):
opts_(opts), k_(0), computation_state_(kBeforeStep), H_was_set_(false) {
@ -48,7 +48,7 @@ OptimizeLbfgs<Real>::OptimizeLbfgs(const VectorBase<Real> &x,
}
template<class Real>
template<typename Real>
Real OptimizeLbfgs<Real>::RecentStepLength() const {
size_t n = step_lengths_.size();
if (n == 0) return std::numeric_limits<Real>::infinity();
@ -63,7 +63,7 @@ Real OptimizeLbfgs<Real>::RecentStepLength() const {
}
}
template<class Real>
template<typename Real>
void OptimizeLbfgs<Real>::ComputeHifNeeded(const VectorBase<Real> &gradient) {
if (k_ == 0) {
if (H_.Dim() == 0) {
@ -107,7 +107,7 @@ void OptimizeLbfgs<Real>::ComputeHifNeeded(const VectorBase<Real> &gradient) {
// This represents the first 2 lines of Algorithm 7.5 (N&W), which
// in fact is mostly a call to Algorithm 7.4.
// Note: this is valid whether we are minimizing or maximizing.
template<class Real>
template<typename Real>
void OptimizeLbfgs<Real>::ComputeNewDirection(Real function_value,
const VectorBase<Real> &gradient) {
KALDI_ASSERT(computation_state_ == kBeforeStep);
@ -166,7 +166,7 @@ void OptimizeLbfgs<Real>::ComputeNewDirection(Real function_value,
}
template<class Real>
template<typename Real>
bool OptimizeLbfgs<Real>::AcceptStep(Real function_value,
const VectorBase<Real> &gradient) {
// Save s_k = x_{k+1} - x_{k}, and y_k = \nabla f_{k+1} - \nabla f_k.
@ -200,7 +200,7 @@ bool OptimizeLbfgs<Real>::AcceptStep(Real function_value,
return true; // We successfully accepted the step.
}
template<class Real>
template<typename Real>
void OptimizeLbfgs<Real>::RecordStepLength(Real s) {
step_lengths_.push_back(s);
if (step_lengths_.size() > static_cast<size_t>(opts_.avg_step_length))
@ -208,7 +208,7 @@ void OptimizeLbfgs<Real>::RecordStepLength(Real s) {
}
template<class Real>
template<typename Real>
void OptimizeLbfgs<Real>::Restart(const VectorBase<Real> &x,
Real f,
const VectorBase<Real> &gradient) {
@ -231,7 +231,7 @@ void OptimizeLbfgs<Real>::Restart(const VectorBase<Real> &x,
ComputeNewDirection(f, gradient);
}
template<class Real>
template<typename Real>
void OptimizeLbfgs<Real>::StepSizeIteration(Real function_value,
const VectorBase<Real> &gradient) {
KALDI_VLOG(3) << "In step size iteration, function value changed "
@ -376,7 +376,7 @@ void OptimizeLbfgs<Real>::StepSizeIteration(Real function_value,
}
}
template<class Real>
template<typename Real>
void OptimizeLbfgs<Real>::DoStep(Real function_value,
const VectorBase<Real> &gradient) {
if (opts_.minimize ? function_value < best_f_ : function_value > best_f_) {
@ -389,7 +389,7 @@ void OptimizeLbfgs<Real>::DoStep(Real function_value,
StepSizeIteration(function_value, gradient);
}
template<class Real>
template<typename Real>
void OptimizeLbfgs<Real>::DoStep(Real function_value,
const VectorBase<Real> &gradient,
const VectorBase<Real> &diag_approx_2nd_deriv) {
@ -408,7 +408,7 @@ void OptimizeLbfgs<Real>::DoStep(Real function_value,
DoStep(function_value, gradient);
}
template<class Real>
template<typename Real>
const VectorBase<Real>&
OptimizeLbfgs<Real>::GetValue(Real *objf_value) const {
if (objf_value != NULL) *objf_value = best_f_;

Просмотреть файл

@ -83,7 +83,7 @@ struct LbfgsOptions {
avg_step_length(4) { }
};
template<class Real>
template<typename Real>
class OptimizeLbfgs {
public:
/// Initializer takes the starting value of x.

Просмотреть файл

@ -36,12 +36,6 @@ void PackedMatrix<Real>::Scale(Real alpha) {
cblas_Xscal(sz, alpha, data_, 1);
}
// Rank-one style update of the packed data: *this <-- *this + alpha * rv * rv^T
// (the "2" in the name is because the vector argument is used twice).
// alpha: scaling factor applied to the outer product.
// rv:    vector whose dimension must equal the number of rows of this matrix.
template<typename Real>
void PackedMatrix<Real>::AddVec2(const Real alpha, const Vector<Real> &rv) {
// The vector must have exactly one entry per row of this matrix.
KALDI_ASSERT(rv.Dim() == num_rows_);
// Forward to the cblas_Xspr wrapper with unit stride on rv and data_ as the
// packed destination. NOTE(review): presumably the BLAS packed symmetric
// rank-one update (spr) -- confirm against the wrapper's definition.
cblas_Xspr(rv.Dim(), alpha, rv.Data(), 1, data_);
}
template<typename Real>
void PackedMatrix<Real>::AddPacked(const Real alpha, const PackedMatrix<Real> &rMa) {
KALDI_ASSERT(num_rows_ == rMa.NumRows());
@ -50,7 +44,7 @@ void PackedMatrix<Real>::AddPacked(const Real alpha, const PackedMatrix<Real> &r
cblas_Xaxpy(sz, alpha, rMa.Data(), 1, data_, 1);
}
template<class Real>
template<typename Real>
void PackedMatrix<Real>::SetRandn() {
Real *data = data_;
size_t dim = num_rows_, size = ((dim*(dim+1))/2);
@ -89,6 +83,12 @@ void PackedMatrix<Real>::Swap(PackedMatrix<Real> *other) {
std::swap(num_rows_, other->num_rows_);
}
/// Shallow swap with a Matrix: exchanges the row count and the data pointer
/// of *this with those of *other. No element data is copied.
/// NOTE(review): only num_rows_ and data_ are exchanged here; any further
/// dimension bookkeeping held by *other is not touched by this function --
/// confirm that callers account for this.
template<typename Real>
void PackedMatrix<Real>::Swap(Matrix<Real> *other) {
  // The two exchanges are independent of each other.
  std::swap(num_rows_, other->num_rows_);
  std::swap(data_, other->data_);
}
template<typename Real>
void PackedMatrix<Real>::Resize(MatrixIndexT r, MatrixResizeType resize_type) {
@ -119,6 +119,15 @@ void PackedMatrix<Real>::Resize(MatrixIndexT r, MatrixResizeType resize_type) {
/// Adds r to every diagonal element of the packed matrix.
/// The walk starts at data_ and advances by 2, 3, 4, ... elements, so the
/// offsets visited are 0, 2, 5, 9, ...; one element is updated per row.
template<typename Real>
void PackedMatrix<Real>::AddToDiag(Real r) {
  Real *diag_elem = data_;
  MatrixIndexT stride_to_next = 2;  // distance from one visited element to the next
  while (stride_to_next <= num_rows_ + 1) {
    *diag_elem += r;
    diag_elem += stride_to_next;
    ++stride_to_next;
  }
}
template<typename Real>
void PackedMatrix<Real>::ScaleDiag(Real alpha) {
Real *ptr = data_;
@ -138,6 +147,7 @@ void PackedMatrix<Real>::SetDiag(Real alpha) {
}
template<typename Real>
template<typename OtherReal>
void PackedMatrix<Real>::CopyFromPacked(const PackedMatrix<OtherReal> &orig) {
@ -221,35 +231,45 @@ void PackedMatrix<Real>::Destroy() {
num_rows_ = 0;
}
template<typename Real>
void PackedMatrix<Real>::Write(std::ostream &os, bool binary) const {
if (!os.good()) {
KALDI_ERR << "Failed to write vector to stream: stream not good";
}
std::string my_token = (sizeof(Real) == 4 ? "FP" : "DP");
WriteToken(os, binary, my_token);
int32 size = this->NumRows(); // make the size 32-bit on disk.
KALDI_ASSERT(this->NumRows() == (MatrixIndexT) size);
WriteBasicType(os, binary, size);
MatrixIndexT num_elems = ((size+1)*(MatrixIndexT)size)/2;
if(binary) {
std::string my_token = (sizeof(Real) == 4 ? "FP" : "DP");
WriteToken(os, binary, my_token);
WriteBasicType(os, binary, size);
// We don't use the built-in Kaldi write routines for the floats, as they are
// not efficient enough.
MatrixIndexT num_elems = ((size+1)*(MatrixIndexT)size)/2;
if (!binary) {
for (MatrixIndexT i = 0; i < num_elems; i++)
WriteBasicType(os, binary, data_[i]);
os << '\n';
} else {
os.write((const char*) data_, sizeof(Real) * num_elems);
}
else {
if(size == 0)
os<<"[ ]\n";
else {
os<<"[\n";
MatrixIndexT i = 0;
for (int32 j = 0; j < size; j++) {
for (int32 k = 0; k < j + 1; k++) {
WriteBasicType(os, binary, data_[i++]);
}
os << ( (j==size-1)? "]\n" : "\n");
}
KALDI_ASSERT(i == num_elems);
}
}
if (os.fail()) {
KALDI_ERR << "Failed to write packed matrix to stream";
}
}
// template<typename Real>
// void Save (std::ostream & os, const PackedMatrix<Real>& rM)
// {
@ -275,7 +295,7 @@ void PackedMatrix<Real>::Write(std::ostream &os, bool binary) const {
template<typename Real>
void PackedMatrix<Real>::Read(std::istream & is, bool binary, bool add) {
void PackedMatrix<Real>::Read(std::istream& is, bool binary, bool add) {
if (add) {
PackedMatrix<Real> tmp;
tmp.Read(is, binary, false); // read without adding.
@ -295,6 +315,8 @@ void PackedMatrix<Real>::Read(std::istream & is, bool binary, bool add) {
MatrixIndexT pos_at_start = is.tellg();
int peekval = Peek(is, binary);
const char *my_token = (sizeof(Real) == 4 ? "FP" : "DP");
const char *new_format_token = "[";
bool is_new_format = false;//added by hxu
char other_token_start = (sizeof(Real) == 4 ? 'D' : 'F');
int32 size;
MatrixIndexT num_elems;
@ -310,25 +332,93 @@ void PackedMatrix<Real>::Read(std::istream & is, bool binary, bool add) {
std::string token;
ReadToken(is, binary, &token);
if (token != my_token) {
specific_error << ": Expected token " << my_token << ", got " << token;
goto bad;
}
ReadBasicType(is, binary, &size); // throws on error.
if ((MatrixIndexT)size != this->NumRows()) {
KALDI_ASSERT(size>=0);
this->Resize(size);
}
num_elems = ((size+1)*(MatrixIndexT)size)/2;
if (!binary) {
for (MatrixIndexT i = 0; i < num_elems; i++) {
ReadBasicType(is, false, data_+i); // will throw on error.
if(token != new_format_token) {
specific_error << ": Expected token " << my_token << ", got " << token;
goto bad;
}
} else {
if (num_elems)
is.read(reinterpret_cast<char*>(data_), sizeof(Real)*num_elems);
//new format it is
is_new_format = true;
}
if(!is_new_format) {
ReadBasicType(is, binary, &size); // throws on error.
if ((MatrixIndexT)size != this->NumRows()) {
KALDI_ASSERT(size>=0);
this->Resize(size);
}
num_elems = ((size+1)*(MatrixIndexT)size)/2;
if (!binary) {
for (MatrixIndexT i = 0; i < num_elems; i++) {
ReadBasicType(is, false, data_+i); // will throw on error.
}
} else {
if (num_elems)
is.read(reinterpret_cast<char*>(data_), sizeof(Real)*num_elems);
}
if (is.fail()) goto bad;
return;
}
else {
std::vector<Real> data;
while(1) {
int32 num_lines = 0;
int i = is.peek();
if (i == -1) { specific_error << "Got EOF while reading matrix data"; goto bad; }
else if (static_cast<char>(i) == ']') { // Finished reading matrix.
is.get(); // eat the "]".
i = is.peek();
if (static_cast<char>(i) == '\r') {
is.get();
is.get(); // get \r\n (must eat what we wrote)
}// I don't actually understand what it's doing here
else if (static_cast<char>(i) == '\n') { is.get(); } // get \n (must eat what we wrote)
if (is.fail()) {
KALDI_WARN << "After end of matrix data, read error.";
// we got the data we needed, so just warn for this error.
}
//now process the data:
num_lines = int32(sqrt(data.size()*2));
KALDI_ASSERT(data.size() == num_lines*(num_lines+1)/2);
this->Resize(num_lines);
//std::cout<<data.size()<<' '<<num_lines<<'\n';
for(int32 i = 0; i < data.size(); i++) {
data_[i] = data[i];
}
return;
//std::cout<<"here!!!!!hxu!!!!!"<<std::endl;
}
else if ( (i >= '0' && i <= '9') || i == '-' ) { // A number...
Real r;
is >> r;
if (is.fail()) {
specific_error << "Stream failure/EOF while reading matrix data.";
goto bad;
}
data.push_back(r);
}
else if (isspace(i)) {
is.get(); // eat the space and do nothing.
} else { // NaN or inf or error.
std::string str;
is >> str;
if (!KALDI_STRCASECMP(str.c_str(), "inf") ||
!KALDI_STRCASECMP(str.c_str(), "infinity")) {
data.push_back(std::numeric_limits<Real>::infinity());
KALDI_WARN << "Reading infinite value into matrix.";
} else if (!KALDI_STRCASECMP(str.c_str(), "nan")) {
data.push_back(std::numeric_limits<Real>::quiet_NaN());
KALDI_WARN << "Reading NaN value into matrix.";
} else {
specific_error << "Expecting numeric matrix data, got " << str;
goto bad;
}
}
}
}
if (is.fail()) goto bad;
return;
bad:
KALDI_ERR << "Failed to read packed matrix from stream. " << specific_error
<< " File position at start is "

Просмотреть файл

@ -1,7 +1,8 @@
// matrix/packed-matrix.h
// Copyright 2009-2012 Ondrej Glembek; Lukas Burget; Microsoft Corporation;
// Saarland University; Yanmin Qian; Johns Hopkins University (Author: Daniel Povey)
// Copyright 2009-2013 Ondrej Glembek; Lukas Burget; Microsoft Corporation;
// Saarland University; Yanmin Qian;
// Johns Hopkins University (Author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
@ -37,28 +38,29 @@ std::ostream & operator <<(std::ostream & out, const PackedMatrix<Real>& M);
/// @brief Packed matrix: base class for triangular and symmetric matrices.
template<typename Real> class PackedMatrix {
friend class CuPackedMatrix<Real>;
public:
//friend class CuPackedMatrix<Real>;
PackedMatrix() : data_(NULL), num_rows_(0) {}
explicit PackedMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero):
data_(NULL) { Resize(r, resize_type); }
explicit PackedMatrix(const PackedMatrix<Real> &orig) : data_(NULL) {
Resize(orig.num_rows_);
Resize(orig.num_rows_, kUndefined);
CopyFromPacked(orig);
}
template<class OtherReal>
template<typename OtherReal>
explicit PackedMatrix(const PackedMatrix<OtherReal> &orig) : data_(NULL) {
Resize(orig.NumRows());
Resize(orig.NumRows(), kUndefined);
CopyFromPacked(orig);
}
void SetZero();
void SetZero(); /// < Set to zero
void SetUnit(); /// < Set to unit matrix.
/// Sets to random values of a normal distribution
void SetRandn();
void SetRandn(); /// < Set to random values of a normal distribution
Real Trace() const;
@ -82,17 +84,19 @@ template<typename Real> class PackedMatrix {
/// This function takes time proportional to the number of data elements.
void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero);
void AddToDiag(const Real r); // Adds r to diagonal
void ScaleDiag(const Real alpha); // Scales diagonal by alpha.
void SetDiag(const Real alpha); // Sets diagonal to this value.
template<class OtherReal>
template<typename OtherReal>
void CopyFromPacked(const PackedMatrix<OtherReal> &orig);
/// CopyFromVec just interprets the vector as having the same layout
/// as the packed matrix. Must have the same dimension, i.e.
/// orig.Dim() == (NumRows()*(NumRows()+1)) / 2;
template<class OtherReal>
template<typename OtherReal>
void CopyFromVec(const SubVector<OtherReal> &orig);
Real* Data() { return data_; }
@ -104,6 +108,8 @@ template<typename Real> class PackedMatrix {
return ((nr * (nr+1)) / 2) * sizeof(Real);
}
//MatrixIndexT Stride() const { return stride_; }
// This code is duplicated in child classes to avoid extra levels of calls.
Real operator() (MatrixIndexT r, MatrixIndexT c) const {
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
@ -134,10 +140,6 @@ template<typename Real> class PackedMatrix {
return * (std::min_element(data_, data_ + ((num_rows_*(num_rows_+1))/2) ));
}
// *this <-- *this + alpha* rV * rV^T.
// The "2" in the name is because the argument is repeated.
void AddVec2(const Real alpha, const Vector<Real> &rv);
void Scale(Real c);
friend std::ostream & operator << <> (std::ostream & out,
@ -147,18 +149,20 @@ template<typename Real> class PackedMatrix {
void Read(std::istream &in, bool binary, bool add = false);
void Write(std::ostream &out, bool binary) const;
// binary = true is not yet supported.
void Destroy();
/// Swaps the contents of *this and *other. Shallow swap.
void Swap(PackedMatrix<Real> *other);
void Swap(Matrix<Real> *other);
protected:
// Will only be called from this class or derived classes.
void AddPacked(const Real alpha, const PackedMatrix<Real>& M);
Real *data_;
MatrixIndexT num_rows_;
//MatrixIndexT stride_;
private:
/// Init assumes the current contents of the class are invalid (i.e. junk or
/// has already been freed), and it sets the matrix to newly allocated memory
@ -189,9 +193,5 @@ std::istream & operator >> (std::istream &is, PackedMatrix<Real> &M) {
} // namespace kaldi
// Including the implementation
#include "matrix/packed-matrix-inl.h"
#endif

Просмотреть файл

@ -37,7 +37,7 @@ namespace kaldi {
x is the input of dimension dim, v is the output of dimension
dim, and beta is a scalar. Note: we use zero-based
not one-based indexing. */
template<class Real>
template<typename Real>
void House(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
KALDI_ASSERT(dim > 0);
// To avoid overflow, we first compute the max of x_ (or
@ -84,7 +84,7 @@ void House(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
// the vector that is "special". This is convenient in
// the Tridiagonalize routine that uses reversed indexes for
// compatibility with the packed lower triangular format.
template<class Real>
template<typename Real>
void HouseBackward(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
KALDI_ASSERT(dim > 0);
// To avoid overflow, we first compute the max of x_ (or
@ -138,7 +138,7 @@ void HouseBackward(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
Caution: Q is transposed vs. Golub and Van Loan.
If Q != NULL it outputs Q.
*/
template<class Real>
template<typename Real>
void SpMatrix<Real>::Tridiagonalize(MatrixBase<Real> *Q) {
MatrixIndexT n = this->NumRows();
KALDI_ASSERT(Q == NULL || (Q->NumRows() == n &&
@ -194,7 +194,7 @@ template
void SpMatrix<double>::Tridiagonalize(MatrixBase<double> *Q);
/// Create Givens rotations, as in Golub and Van Loan 3rd ed., page 216.
template<class Real>
template<typename Real>
inline void Givens(Real a, Real b, Real *c, Real *s) {
if (b == 0) {
*c = 1;
@ -218,7 +218,7 @@ inline void Givens(Real a, Real b, Real *c, Real *s) {
// with Wilkinson shift." A couple of differences: this code is
// in zero based arithmetic, and we represent Q transposed from
// their Q for memory locality with row-major-indexed matrices.
template <class Real>
template <typename Real>
void QrStep(MatrixIndexT n,
Real *diag,
Real *off_diag,
@ -294,7 +294,7 @@ void QrStep(MatrixIndexT n,
// Internal code for the QR algorithm, where the diagonal
// and off-diagonal of the symmetric matrix are represented as
// vectors of length n and n-1.
template <class Real>
template <typename Real>
void QrInternal(MatrixIndexT n,
Real *diag,
Real *off_diag,
@ -372,7 +372,7 @@ void QrInternal(MatrixIndexT n,
This is the symmetric QR algorithm, from Golub and Van Loan 3rd ed., Algorithm
8.3.3. Q is transposed w.r.t. there, though.
*/
template <class Real>
template <typename Real>
void SpMatrix<Real>::Qr(MatrixBase<Real> *Q) {
KALDI_ASSERT(this->IsTridiagonal());
// We envisage that Q would be square but we don't check for this,
@ -396,7 +396,7 @@ void SpMatrix<Real>::Qr(MatrixBase<Real> *Q) {
}
}
template<class Real>
template<typename Real>
void SpMatrix<Real>::Eig(VectorBase<Real> *s, MatrixBase<Real> *P) const {
MatrixIndexT dim = this->NumRows();
KALDI_ASSERT(s->Dim() == dim);
@ -417,7 +417,7 @@ void SpMatrix<Real>::Eig(VectorBase<Real> *s, MatrixBase<Real> *P) const {
}
template<class Real>
template<typename Real>
void SpMatrix<Real>::TopEigs(VectorBase<Real> *s, MatrixBase<Real> *P,
MatrixIndexT lanczos_dim) const {
const SpMatrix<Real> &S(*this); // call this "S" for easy notation.

Просмотреть файл

@ -193,7 +193,7 @@ void SpMatrix<Real>::CopyFromMat(const MatrixBase<Real> &M,
}
}
template<class Real>
template<typename Real>
Real SpMatrix<Real>::Trace() const {
const Real *data = this->data_;
MatrixIndexT num_rows = this->num_rows_;
@ -204,8 +204,8 @@ Real SpMatrix<Real>::Trace() const {
}
// diagonal update, this <-- this + diag(v)
template<class Real>
template<class OtherReal>
template<typename Real>
template<typename OtherReal>
void SpMatrix<Real>::AddVec(const Real alpha, const VectorBase<OtherReal> &v) {
int32 num_rows = this->num_rows_;
KALDI_ASSERT(num_rows == v.Dim() && num_rows > 0);
@ -316,7 +316,7 @@ void SpMatrix<Real>::Invert(Real *logdet, Real *det_sign, bool need_inverse) {
}
#else
// in the ATLAS case, these are not implemented using a library and we back off to something else.
template<class Real>
template<typename Real>
void SpMatrix<Real>::Invert(Real *logdet, Real *det_sign, bool need_inverse) {
Matrix<Real> M(this->NumRows(), this->NumCols());
M.CopyFromSp(*this);
@ -481,7 +481,7 @@ double TraceMatSpMatSp(const MatrixBase<double> &A, MatrixTransposeType transA,
MatrixTransposeType transC, const SpMatrix<double> &D);
template<class Real>
template<typename Real>
bool SpMatrix<Real>::IsDiagonal(Real cutoff) const {
MatrixIndexT R = this->NumRows();
Real bad_sum = 0.0, good_sum = 0.0;
@ -496,7 +496,7 @@ bool SpMatrix<Real>::IsDiagonal(Real cutoff) const {
return (!(bad_sum > good_sum * cutoff));
}
template<class Real>
template<typename Real>
bool SpMatrix<Real>::IsUnit(Real cutoff) const {
MatrixIndexT R = this->NumRows();
Real max = 0.0; // max error
@ -507,7 +507,7 @@ bool SpMatrix<Real>::IsUnit(Real cutoff) const {
return (max <= cutoff);
}
template<class Real>
template<typename Real>
bool SpMatrix<Real>::IsTridiagonal(Real cutoff) const {
MatrixIndexT R = this->NumRows();
Real max_abs_2diag = 0.0, max_abs_offdiag = 0.0;
@ -523,13 +523,13 @@ bool SpMatrix<Real>::IsTridiagonal(Real cutoff) const {
return (max_abs_offdiag <= cutoff * max_abs_2diag);
}
template<class Real>
template<typename Real>
bool SpMatrix<Real>::IsZero(Real cutoff) const {
if (this->num_rows_ == 0) return true;
return (this->Max() <= cutoff && this->Min() >= -cutoff);
}
template<class Real>
template<typename Real>
Real SpMatrix<Real>::FrobeniusNorm() const {
Real sum = 0.0;
MatrixIndexT R = this->NumRows();
@ -541,14 +541,14 @@ Real SpMatrix<Real>::FrobeniusNorm() const {
return sqrt(sum);
}
template<class Real>
template<typename Real>
bool SpMatrix<Real>::ApproxEqual(const SpMatrix<Real> &other, float tol) const {
if (this->NumRows() != other.NumRows())
KALDI_ERR << "SpMatrix::AproxEqual, size mismatch, "
<< this->NumRows() << " vs. " << other.NumRows();
SpMatrix<Real> tmp(*this);
tmp.AddSp(-1.0, other);
return (tmp.FrobeniusNorm() <= tol * this->FrobeniusNorm());
return (tmp.FrobeniusNorm() <= tol * std::max(this->FrobeniusNorm(), other.FrobeniusNorm()));
}
// function Floor: A = Floor(B, alpha * C) ... see tutorial document.
@ -600,7 +600,7 @@ int SpMatrix<Real>::ApplyFloor(const SpMatrix<Real> &C, Real alpha,
return nfloored;
}
template<class Real>
template<typename Real>
Real SpMatrix<Real>::LogDet(Real *det_sign) const {
Real log_det;
SpMatrix<Real> tmp(*this);
@ -648,7 +648,7 @@ MatrixIndexT SpMatrix<Real>::LimitCond(Real maxCond, bool invert) { // e.g. max
s(i) = sqrt(std::max(s(i), floor));
}
P.MulColsVec(s);
(*this).AddMat2(1.0, P, kNoTrans); // (*this) = P*P^T. ... (*this) = P * floor(s) * P^T ... if P was original P.
(*this).AddMat2(1.0, P, kNoTrans, 0.0); // (*this) = P*P^T. ... (*this) = P * floor(s) * P^T ... if P was original P.
return nfloored;
}
@ -965,8 +965,8 @@ void SpMatrix<double>::AddVec2(const double alpha, const VectorBase<double> &v)
}
template<class Real>
template<class OtherReal>
template<typename Real>
template<typename OtherReal>
void SpMatrix<Real>::AddVec2(const Real alpha, const VectorBase<OtherReal> &v) {
KALDI_ASSERT(v.Dim() == this->NumRows());
Real *data = this->data_;
@ -984,7 +984,7 @@ template
void SpMatrix<double>::AddVec2(const double alpha, const VectorBase<float> &v);
template<class Real>
template<typename Real>
Real VecSpVec(const VectorBase<Real> &v1, const SpMatrix<Real> &M,
const VectorBase<Real> &v2) {
MatrixIndexT D = M.NumRows();
@ -1002,7 +1002,7 @@ double VecSpVec(const VectorBase<double> &v1, const SpMatrix<double> &M,
const VectorBase<double> &v2);
template<class Real>
template<typename Real>
void SpMatrix<Real>::AddMat2Sp(
const Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType transM, const SpMatrix<Real> &A, const Real beta) {
@ -1046,7 +1046,7 @@ void SpMatrix<Real>::AddMat2Sp(
}
}
template<class Real>
template<typename Real>
void SpMatrix<Real>::AddSmat2Sp(
const Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType transM, const SpMatrix<Real> &A,
@ -1101,7 +1101,7 @@ void SpMatrix<Real>::AddSmat2Sp(
}
}
template<class Real>
template<typename Real>
void SpMatrix<Real>::AddMat2Vec(const Real alpha,
const MatrixBase<Real> &M,
MatrixTransposeType transM,
@ -1130,7 +1130,7 @@ void SpMatrix<Real>::AddMat2Vec(const Real alpha,
}
}
template<class Real>
template<typename Real>
void SpMatrix<Real>::AddMat2(const Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType transM, const Real beta) {
KALDI_ASSERT((transM == kNoTrans && this->NumRows() == M.NumRows())
@ -1159,7 +1159,7 @@ void SpMatrix<Real>::AddMat2(const Real alpha, const MatrixBase<Real> &M,
this->CopyFromMat(temp_mat, kTakeLower);
}
template<class Real>
template<typename Real>
void SpMatrix<Real>::AddTp2Sp(const Real alpha, const TpMatrix<Real> &T,
MatrixTransposeType transM, const SpMatrix<Real> &A,
const Real beta) {
@ -1167,7 +1167,7 @@ void SpMatrix<Real>::AddTp2Sp(const Real alpha, const TpMatrix<Real> &T,
AddMat2Sp(alpha, Tmat, transM, A, beta);
}
template<class Real>
template<typename Real>
void SpMatrix<Real>::AddVecVec(const Real alpha, const VectorBase<Real> &v,
const VectorBase<Real> &w) {
int32 dim = this->NumRows();
@ -1176,7 +1176,7 @@ void SpMatrix<Real>::AddVecVec(const Real alpha, const VectorBase<Real> &v,
}
template<class Real>
template<typename Real>
void SpMatrix<Real>::AddTp2(const Real alpha, const TpMatrix<Real> &T,
MatrixTransposeType transM, const Real beta) {
Matrix<Real> Tmat(T);
@ -1191,7 +1191,7 @@ template class SpMatrix<float>;
template class SpMatrix<double>;
template<class Real>
template<typename Real>
Real TraceSpSpLower(const SpMatrix<Real> &A, const SpMatrix<Real> &B) {
MatrixIndexT adim = A.NumRows();
KALDI_ASSERT(adim == B.NumRows());

Просмотреть файл

@ -28,14 +28,6 @@
namespace kaldi {
/// \weakgroup matrix_funcs_misc
typedef enum {
kTakeLower,
kTakeUpper,
kTakeMean,
kTakeMeanAndCheck
} SpCopyType;
/// \addtogroup matrix_group
/// @{
@ -47,19 +39,25 @@ template<typename Real> class SpMatrix;
*/
template<typename Real>
class SpMatrix : public PackedMatrix<Real> {
friend class CuSpMatrix<Real>;
public:
// so it can use our assignment operator.
friend class std::vector<Matrix<Real> >;
SpMatrix(): PackedMatrix<Real>() {}
/// Copy constructor from CUDA version of SpMatrix
/// This is defined in ../cudamatrix/cu-sp-matrix.h
explicit SpMatrix(const CuSpMatrix<Real> &cu);
explicit SpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero)
: PackedMatrix<Real>(r, resize_type) {}
SpMatrix(const SpMatrix<Real> &orig)
: PackedMatrix<Real>(orig) {}
template<class OtherReal>
template<typename OtherReal>
explicit SpMatrix(const SpMatrix<OtherReal> &orig)
: PackedMatrix<Real>(orig) {}
@ -77,8 +75,6 @@ class SpMatrix : public PackedMatrix<Real> {
}
#endif
~SpMatrix() {}
/// Shallow swap.
void Swap(SpMatrix *other);
@ -90,7 +86,7 @@ class SpMatrix : public PackedMatrix<Real> {
PackedMatrix<Real>::CopyFromPacked(other);
}
template<class OtherReal>
template<typename OtherReal>
void CopyFromSp(const SpMatrix<OtherReal> &other) {
PackedMatrix<Real>::CopyFromPacked(other);
}
@ -231,7 +227,7 @@ class SpMatrix : public PackedMatrix<Real> {
Real LogDet(Real *det_sign = NULL) const;
/// rank-one update, this <-- this + alpha v v'
template<class OtherReal>
template<typename OtherReal>
void AddVec2(const Real alpha, const VectorBase<OtherReal> &v);
/// rank-two update, this <-- this + alpha (v w' + w v').
@ -243,7 +239,7 @@ class SpMatrix : public PackedMatrix<Real> {
const SpMatrix<Real> &S, const Real beta);
/// diagonal update, this <-- this + diag(v)
template<class OtherReal>
template<typename OtherReal>
void AddVec(const Real alpha, const VectorBase<OtherReal> &v);
/// rank-N update:
@ -251,8 +247,9 @@ class SpMatrix : public PackedMatrix<Real> {
/// (*this) = beta*(*this) + alpha * M * M^T,
/// or (if transM == kTrans)
/// (*this) = beta*(*this) + alpha * M^T * M
/// Note: beta used to default to 0.0.
void AddMat2(const Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType transM, const Real beta = 0.0);
MatrixTransposeType transM, const Real beta);
/// Extension of rank-N update:
/// this <-- beta*this + alpha * M * A * M^T.
@ -286,8 +283,7 @@ class SpMatrix : public PackedMatrix<Real> {
/// can implement it more efficiently.
void AddTp2(const Real alpha, const TpMatrix<Real> &T,
MatrixTransposeType transM, const Real beta = 0.0);
/// Extension of rank-N update:
/// this <-- beta*this + alpha * M * diag(v) * M^T.
/// if transM == kTrans, then
@ -381,6 +377,20 @@ float TraceSpSp(const SpMatrix<float> &A, const SpMatrix<float> &B);
double TraceSpSp(const SpMatrix<double> &A, const SpMatrix<double> &B);
/// Free-function form of the SpMatrix member ApproxEqual: forwards to
/// A.ApproxEqual(B, tol) and returns its result.
/// tol is the tolerance handed to the member function; it defaults to 0.01.
template<typename Real>
inline bool ApproxEqual(const SpMatrix<Real> &A,
                        const SpMatrix<Real> &B, Real tol = 0.01) {
  const bool approximately_equal = A.ApproxEqual(B, tol);
  return approximately_equal;
}
/// Asserts, via KALDI_ASSERT, that ApproxEqual(A, B, tol) holds for the two
/// symmetric matrices. tol is passed through unchanged and defaults to 0.01.
template<typename Real>
inline void AssertEqual(const SpMatrix<Real> &A,
const SpMatrix<Real> &B, Real tol = 0.01) {
// The comparison itself is delegated to the free function ApproxEqual.
KALDI_ASSERT(ApproxEqual(A, B, tol));
}
/// Returns tr(A B).
template<typename Real, typename OtherReal>
Real TraceSpSp(const SpMatrix<Real> &A, const SpMatrix<OtherReal> &B);
@ -419,7 +429,7 @@ Real TraceMatSpMatSp(const MatrixBase<Real> &A, MatrixTransposeType transA,
/// Returns \f$ v_1^T M v_2 \f$
/// Not as efficient as it could be where v1 == v2.
template<class Real>
template<typename Real>
Real VecSpVec(const VectorBase<Real> &v1, const SpMatrix<Real> &M,
const VectorBase<Real> &v2);
@ -461,7 +471,7 @@ struct SolverOptions {
/// Assumes H positive semidefinite.
/// Returns the objective-function change.
template<class Real>
template<typename Real>
Real SolveQuadraticProblem(const SpMatrix<Real> &H,
const VectorBase<Real> &g,
const SolverOptions &opts,
@ -479,7 +489,7 @@ Real SolveQuadraticProblem(const SpMatrix<Real> &H,
/// diagonal_precondition option is newly added, to handle problems
/// where different dimensions have very different scaling (we recommend to use
/// the option but it's set false for back compatibility).
template<class Real>
template<typename Real>
Real SolveQuadraticMatrixProblem(const SpMatrix<Real> &Q,
const MatrixBase<Real> &Y,
const SpMatrix<Real> &P,
@ -490,7 +500,7 @@ Real SolveQuadraticMatrixProblem(const SpMatrix<Real> &Q,
/// \f[ Q(M) = tr(M^T G) -0.5 tr(P_1 M Q_1 M^T) -0.5 tr(P_2 M Q_2 M^T). \f]
/// Encountered in matrix update with a prior. We also apply a limit on the
/// condition but it should be less frequently necessary, and can be set larger.
template<class Real>
template<typename Real>
Real SolveDoubleQuadraticMatrixProblem(const MatrixBase<Real> &G,
const SpMatrix<Real> &P1,
const SpMatrix<Real> &P2,

Просмотреть файл

@ -31,7 +31,7 @@
namespace kaldi {
template<class Real>
template<typename Real>
SplitRadixComplexFft<Real>::SplitRadixComplexFft(MatrixIndexT N) {
if ( (N & (N-1)) != 0 || N <= 1)
KALDI_ERR << "SplitRadixComplexFft called with invalid number of points "
@ -46,7 +46,7 @@ SplitRadixComplexFft<Real>::SplitRadixComplexFft(MatrixIndexT N) {
temp_buffer = NULL;
}
template<class Real>
template<typename Real>
void SplitRadixComplexFft<Real>::ComputeTables() {
MatrixIndexT imax, lg2, i, j;
MatrixIndexT m, m2, m4, m8, nel, n;
@ -97,7 +97,7 @@ void SplitRadixComplexFft<Real>::ComputeTables() {
}
}
template<class Real>
template<typename Real>
SplitRadixComplexFft<Real>::~SplitRadixComplexFft() {
delete [] brseed;
if (tab != NULL) {
@ -109,7 +109,7 @@ SplitRadixComplexFft<Real>::~SplitRadixComplexFft() {
delete [] temp_buffer;
}
template<class Real>
template<typename Real>
void SplitRadixComplexFft<Real>::Compute(Real *xr, Real *xi, bool forward) const {
if (!forward) { // reverse real and imaginary parts for complex FFT.
Real *tmp = xr;
@ -123,7 +123,7 @@ void SplitRadixComplexFft<Real>::Compute(Real *xr, Real *xi, bool forward) const
}
}
template<class Real>
template<typename Real>
void SplitRadixComplexFft<Real>::Compute(Real *x, bool forward) {
if (temp_buffer == NULL)
temp_buffer = new Real[N_];
@ -150,7 +150,7 @@ void SplitRadixComplexFft<Real>::Compute(Real *x, bool forward) {
x[1] = temp_buffer[0]; // special case of i = 0.
}
template<class Real>
template<typename Real>
void SplitRadixComplexFft<Real>::BitReversePermute(Real *x, MatrixIndexT logm) const {
MatrixIndexT i, j, lg2, n;
MatrixIndexT off, fj, gno, *brp;
@ -176,7 +176,7 @@ void SplitRadixComplexFft<Real>::BitReversePermute(Real *x, MatrixIndexT logm) c
}
template<class Real>
template<typename Real>
void SplitRadixComplexFft<Real>::ComputeRecursive(Real *xr, Real *xi, MatrixIndexT logm) const {
MatrixIndexT m, m2, m4, m8, nel, n;
@ -321,7 +321,7 @@ void SplitRadixComplexFft<Real>::ComputeRecursive(Real *xr, Real *xi, MatrixInde
// This code is mostly the same as the RealFft function. It would be
// possible to replace it with more efficient code from Rico's book.
template<class Real>
template<typename Real>
void SplitRadixRealFft<Real>::Compute(Real *data, bool forward) {
MatrixIndexT N = N_, N2 = N/2;
KALDI_ASSERT(N%2 == 0);

Просмотреть файл

@ -41,7 +41,7 @@ namespace kaldi {
// Microsoft Corporation
// This is a more efficient way of doing the complex FFT than ComplexFft
// above, but it only works for powers of 2.
template<class Real>
template<typename Real>
class SplitRadixComplexFft {
public:
typedef MatrixIndexT Integer;
@ -83,7 +83,7 @@ class SplitRadixComplexFft {
// data.
};
template<class Real>
template<typename Real>
class SplitRadixRealFft: private SplitRadixComplexFft<Real> {
public:
SplitRadixRealFft(MatrixIndexT N): // will fail unless N>=4 and N is a power of 2.

Просмотреть файл

@ -69,7 +69,7 @@ void TpMatrix<Real>::Invert() {
}
/*
template<class Real>
template<typename Real>
void TpMatrix<Real>::Invert() {
Matrix<Real> tmp(*this);
tmp.Invert();
@ -127,7 +127,7 @@ void TpMatrix<Real>::Cholesky(const SpMatrix<Real> &orig) {
}
template<typename Real>
void TpMatrix<Real>::CopyFromMat(MatrixBase<Real> &M,
void TpMatrix<Real>::CopyFromMat(const MatrixBase<Real> &M,
MatrixTransposeType Trans) {
if (Trans == kNoTrans) {
KALDI_ASSERT(this->NumRows() == M.NumRows() && M.NumRows() == M.NumCols());

Просмотреть файл

@ -2,6 +2,7 @@
// Copyright 2009-2011 Ondrej Glembek; Lukas Burget; Microsoft Corporation;
// Saarland University; Yanmin Qian; Haihua Xu
// 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
@ -33,15 +34,22 @@ template<typename Real> class TpMatrix;
/// @brief Packed triangular matrix class
template<typename Real>
class TpMatrix : public PackedMatrix<Real> {
friend class CuTpMatrix<float>;
friend class CuTpMatrix<double>;
public:
TpMatrix() : PackedMatrix<Real>() {}
explicit TpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero)
: PackedMatrix<Real>(r, resize_type) {}
TpMatrix(const TpMatrix<Real>& Orig) : PackedMatrix<Real>(Orig) {}
template<class OtherReal> explicit TpMatrix(const TpMatrix<OtherReal>& Orig)
: PackedMatrix<Real>(Orig) {}
~TpMatrix() {}
/// Copy constructor from CUDA TpMatrix
/// This is defined in ../cudamatrix/cu-tp-matrix.cc
explicit TpMatrix(const CuTpMatrix<Real> &cu);
template<typename OtherReal> explicit TpMatrix(const TpMatrix<OtherReal>& Orig)
: PackedMatrix<Real>(Orig) {}
Real operator() (MatrixIndexT r, MatrixIndexT c) const {
if (static_cast<UnsignedMatrixIndexT>(c) >
static_cast<UnsignedMatrixIndexT>(r)) {
@ -85,15 +93,18 @@ class TpMatrix : public PackedMatrix<Real> {
/// CopyFromMat copies the lower triangle of M into *this
/// (or the upper triangle, if Trans == kTrans).
void CopyFromMat(MatrixBase<Real> &M,
void CopyFromMat(const MatrixBase<Real> &M,
MatrixTransposeType Trans = kNoTrans);
/// CopyFromTp copies andother triangular matrix into this one.
/// This is implemented in ../cudamatrix/cu-tp-matrix.cc
void CopyFromMat(const CuTpMatrix<Real> &other);
/// CopyFromTp copies another triangular matrix into this one.
/// This overload takes a TpMatrix with the same element type (Real) and
/// forwards to PackedMatrix<Real>::CopyFromPacked.
void CopyFromTp(const TpMatrix<Real> &other) {
PackedMatrix<Real>::CopyFromPacked(other);
}
template<class OtherReal> void CopyFromTp(const TpMatrix<OtherReal> &other) {
template<typename OtherReal> void CopyFromTp(const TpMatrix<OtherReal> &other) {
PackedMatrix<Real>::CopyFromPacked(other);
}

Просмотреть файл

@ -5,6 +5,7 @@ all:
include ../kaldi.mk
LDFLAGS += $(CUDA_LDFLAGS)
LDLIBS += $(CUDA_LDLIBS)
TESTFILES = nnet-test nnet-randomizer-test

Просмотреть файл

@ -41,7 +41,7 @@ class Softmax : public Component {
void PropagateFnc(const CuMatrix<BaseFloat> &in, CuMatrix<BaseFloat> *out) {
// y = e^x_j/sum_j(e^x_j)
out->Softmax(in);
out->ApplySoftMaxPerRow(in);
}
void BackpropagateFnc(const CuMatrix<BaseFloat> &in, const CuMatrix<BaseFloat> &out,

Просмотреть файл

@ -96,7 +96,7 @@ class CacheConf {
Vector<BaseFloat> confidence_leftover_;
std::vector<int32> randmask_;
CuStlVector<int32> randmask_device_;
CuArray<int32> randmask_device_;
};

Просмотреть файл

@ -94,7 +94,7 @@ class CacheTgtMat {
CuMatrix<BaseFloat> targets_leftover_; ///< Desired vector cache
std::vector<int32> randmask_;
CuStlVector<int32> randmask_device_;
CuArray<int32> randmask_device_;
};

Просмотреть файл

@ -94,7 +94,7 @@ class Cache {
std::vector<int32> targets_leftover_; ///< Desired vector cache
std::vector<int32> randmask_;
CuStlVector<int32> randmask_device_;
CuArray<int32> randmask_device_;
};

Просмотреть файл

@ -23,7 +23,7 @@
#include "base/kaldi-common.h"
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-vector.h"
#include "cudamatrix/cu-stlvector.h"
#include "cudamatrix/cu-array.h"
namespace kaldi {
namespace nnet1 {
@ -67,10 +67,10 @@ class XentPrior {
double frames_scaled_nosil_;
double correct_scaled_nosil_;
CuStlVector<int32> max_id_;
CuArray<int32> max_id_;
std::vector<int32> max_id_host_;
CuStlVector<int32> target_device_;
CuArray<int32> target_device_;
CuVector<BaseFloat> log_post_tgt_;
Vector<BaseFloat> log_post_tgt_host_;

Просмотреть файл

@ -24,7 +24,7 @@
#include "util/kaldi-holder.h"
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-vector.h"
#include "cudamatrix/cu-stlvector.h"
#include "cudamatrix/cu-array.h"
namespace kaldi {
namespace nnet1 {
@ -61,16 +61,17 @@ class Xent {
std::vector<float> loss_vec_;
// loss computation buffers
CuStlVector<int32> target_device_;
CuArray<int32> target_device_;
CuVector<BaseFloat> log_post_tgt_;
Vector<BaseFloat> log_post_tgt_host_;
CuMatrix<BaseFloat> tgt_mat_device_;
CuMatrix<BaseFloat> xentropy_aux_;
// frame classification buffers
CuStlVector<int32> max_id_out_;
CuArray<int32> max_id_out_;
std::vector<int32> max_id_out_host_;
CuStlVector<int32> max_id_tgt_;
CuArray<int32> max_id_tgt_;
std::vector<int32> max_id_tgt_host_;
};

Просмотреть файл

@ -76,7 +76,7 @@ void MatrixRandomizer::Randomize(const std::vector<int32>& mask) {
// Use auxiliary buffer for unshuffled data
CuMatrix<BaseFloat> data_aux(data_);
// Put the mask to GPU
CuStlVector<int32> mask_in_gpu(mask.size());
CuArray<int32> mask_in_gpu(mask.size());
mask_in_gpu.CopyFromVec(mask);
// randomize the data, mask is used to index rows in source matrix
cu::Randomize(data_aux, mask_in_gpu, &data_);

Просмотреть файл

@ -155,7 +155,7 @@ class Splice : public Component {
}
protected:
CuStlVector<int32> frame_offsets_;
CuArray<int32> frame_offsets_;
};
@ -218,7 +218,7 @@ class CopyComponent: public Component {
}
protected:
CuStlVector<int32> copy_from_indices_;
CuArray<int32> copy_from_indices_;
};

Просмотреть файл

@ -4,6 +4,7 @@ EXTRA_CXXFLAGS = -Wno-sign-compare
include ../kaldi.mk
LDFLAGS += $(CUDA_LDFLAGS)
LDLIBS += $(CUDA_LDLIBS)
BINFILES = nnet-train-frmshuff \
nnet-train-xent-hardlab-perutt \

Просмотреть файл

@ -51,13 +51,8 @@ int main(int argc, char *argv[]) {
bool apply_log = false;
po.Register("apply-log", &apply_log, "Transform MLP output to logscale");
#if HAVE_CUDA==1
int32 use_gpu_id=-2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#else
int32 use_gpu_id=0;
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
#endif
std::string use_gpu="no";
po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA");
po.Read(argc, argv);
@ -76,7 +71,7 @@ int main(int argc, char *argv[]) {
//Select the GPU
#if HAVE_CUDA==1
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
Nnet nnet_transf;

Просмотреть файл

@ -60,13 +60,8 @@ int main(int argc, char *argv[]) {
std::string frame_weights;
po.Register("frame-weights", &frame_weights, "Per-frame weights to scale gradients (frame selection/weighting).");
#if HAVE_CUDA==1
int32 use_gpu_id=-2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID (-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#else
int32 use_gpu_id=0;
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
#endif
std::string use_gpu="yes";
po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA");
po.Read(argc, argv);
@ -90,7 +85,7 @@ int main(int argc, char *argv[]) {
//Select the GPU
#if HAVE_CUDA==1
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
Nnet nnet_transf;

Просмотреть файл

@ -128,16 +128,9 @@ int main(int argc, char *argv[]) {
po.Register("drop-frames", &drop_frames,
"Drop frames, where is zero den-posterior under numerator path "
"(ie. path not in lattice)");
#if HAVE_CUDA == 1
kaldi::int32 use_gpu_id=-2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID "
"(-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#else
int32 use_gpu_id=0;
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
#endif
std::string use_gpu="yes";
po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA");
po.Read(argc, argv);
@ -162,7 +155,7 @@ int main(int argc, char *argv[]) {
// Select the GPU
#if HAVE_CUDA == 1
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
Nnet nnet_transf;
@ -257,7 +250,7 @@ int main(int argc, char *argv[]) {
if (old_acoustic_scale != 1.0) {
fst::ScaleLattice(fst::AcousticLatticeScale(old_acoustic_scale), &den_lat);
}
// optionaly sort it topologically
// optionally sort it topologically
kaldi::uint64 props = den_lat.Properties(fst::kFstProperties, false);
if (!(props & fst::kTopSorted)) {
if (fst::TopSort(&den_lat) == false)

Просмотреть файл

@ -129,15 +129,9 @@ int main(int argc, char *argv[]) {
po.Register("do-smbr", &do_smbr, "Use state-level accuracies instead of "
"phone accuracies.");
#if HAVE_CUDA == 1
kaldi::int32 use_gpu_id=-2;
po.Register("use-gpu-id", &use_gpu_id, "Manually select GPU by its ID "
"(-2 automatic selection, -1 disable GPU, 0..N select GPU)");
#else
int32 use_gpu_id=0;
po.Register("use-gpu-id", &use_gpu_id, "Unused, kaldi is compiled w/o CUDA");
#endif
std::string use_gpu="yes";
po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA");
po.Read(argc, argv);
if (po.NumArgs() != 6) {
@ -164,7 +158,7 @@ int main(int argc, char *argv[]) {
// Select the GPU
#if HAVE_CUDA == 1
CuDevice::Instantiate().SelectGpuId(use_gpu_id);
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
Nnet nnet_transf;
@ -248,7 +242,7 @@ int main(int argc, char *argv[]) {
fst::ScaleLattice(fst::AcousticLatticeScale(old_acoustic_scale),
&den_lat);
}
// optionaly sort it topologically
// optionally sort it topologically
kaldi::uint64 props = den_lat.Properties(fst::kFstProperties, false);
if (!(props & fst::kTopSorted)) {
if (fst::TopSort(&den_lat) == false)

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше