From ff72d5696f07c4365c0272df63d212841a441caf Mon Sep 17 00:00:00 2001 From: adame Date: Mon, 20 Oct 2014 14:50:30 -0700 Subject: [PATCH] Modify port code to support CPUONLY build. To use this, define CPUONLY in the CN project, remove *.cu from the math project, and add GPUDummy.cpp instead. This allows Cygwin to be used to compile both the Windows and Linux builds on the same machine. --- Common/BestGpu.cpp | 21 +- Common/Include/BestGpu.h | 5 +- Common/Include/basetypes.h | 43 +- Common/Include/fileutil.h | 252 +---- Common/fileutil.cpp | 22 +- MachineLearning/cn/cn.vcxproj | 4 +- Math/Math/CPUMatrix.cpp | 4 +- Math/Math/GPUDummy.cpp | 1667 +++++++++++++++++++++++++++++ Math/Math/GPUMatrix.cu | 70 +- Math/Math/GPUMatrixCUDAKernels.cu | 3 + Math/Math/GPUSparseMatrix.cu | 32 +- Math/Math/Math.vcxproj.filters | 77 ++ Math/Math/Matrix.cpp | 22 +- 13 files changed, 1911 insertions(+), 311 deletions(-) create mode 100644 Math/Math/GPUDummy.cpp create mode 100644 Math/Math/Math.vcxproj.filters diff --git a/Common/BestGpu.cpp b/Common/BestGpu.cpp index 3204e4e8c..426342c8b 100644 --- a/Common/BestGpu.cpp +++ b/Common/BestGpu.cpp @@ -6,14 +6,28 @@ #define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings +#include "BestGpu.h" +#include "CommonMatrix.h" // for CPUDEVICE and AUTOPLACEMATRIX + +#ifdef CPUONLY +namespace Microsoft { + namespace MSR { + namespace CNTK { + short DeviceFromConfig(const ConfigParameters& config) + { + return CPUDEVICE; + } + } + } +} +#else + // CUDA-C includes #include #include #include #include #include -#include "BestGpu.h" -#include "CommonMatrix.h" // for CPUDEVICE and AUTOPLACEMATRIX // The "notify hook" gets called for every call to the // delay load helper. This allows a user to hook every call and @@ -507,4 +521,5 @@ void BestGpu::QueryNvmlData() m_nvmlData = true; return; } -}}} \ No newline at end of file +}}} +#endif diff --git a/Common/Include/BestGpu.h b/Common/Include/BestGpu.h index 3c04c4ab2..849b43ac6 100644 --- a/Common/Include/BestGpu.h +++ b/Common/Include/BestGpu.h @@ -4,15 +4,18 @@ // // #pragma once +#ifndef CPUONLY #include #include #include +#endif #include "commandArgUtil.h" namespace Microsoft { namespace MSR { namespace CNTK { short DeviceFromConfig(const ConfigParameters& config); +#ifndef CPUONLY struct ProcessorData { int cores; @@ -68,5 +71,5 @@ public: std::vector GetDevices(int number=AllDevices, BestGpuFlags flags=bestGpuNormal); // get multiple devices }; extern BestGpu* g_bestGpu; - +#endif }}} \ No newline at end of file diff --git a/Common/Include/basetypes.h b/Common/Include/basetypes.h index d3145570a..95819f825 100644 --- a/Common/Include/basetypes.h +++ b/Common/Include/basetypes.h @@ -13,7 +13,13 @@ typedef char16_t TCHAR; #define vsprintf_s vsprintf /* Not sure this is right... Malcolm */ #include #include -#endif /* LINUX */ +#include +#include +#define Linux(a) a +#else +#include +#endif /* LINUX */ +#include // for HUGE_VAL // Remove for a test by Malcolm because of double isnan definition... #ifndef UNDER_CE // fixed-buffer overloads not available for wince #ifdef _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES // fixed-buffer overloads for strcpy() etc.
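The CPUONLY stub added to Common/BestGpu.cpp above means call sites never change between build flavors; the preprocessor simply selects which definition of DeviceFromConfig() gets compiled. A minimal sketch of the calling pattern, assuming a hypothetical already-parsed ConfigParameters instance named config (CPUDEVICE, i.e. -1, comes from CommonMatrix.h):

    ConfigParameters config;                   // hypothetical; normally parsed from the command line
    short deviceId = DeviceFromConfig(config); // CPUONLY build: always CPUDEVICE (-1);
                                               // otherwise: a GPU ordinal chosen by the BestGpu logic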
@@ -80,11 +86,13 @@ OACR_WARNING_DISABLE(POTENTIAL_ARGUMENT_TYPE_MISMATCH, "Not level1 or level2_sec #include #include #include -#include // for HUGE_VAL // potential double isnan definition +#include // for HUGE_VAL // potential double isnan definition #include #include #include #include +#include // std::wstring_convert +#include // std::codecvt_utf8 #ifdef _MSC_VER #include // for CRITICAL_SECTION and Unicode conversion functions --TODO: is there a portable alternative? #endif @@ -578,6 +586,9 @@ typedef strfun::_strprintf wstrprintf; // wchar_t version #ifdef _WIN32 struct utf8 : std::string { utf8 (const std::wstring & p) // utf-16 to -8 { + //TODO: confirm it builds on VS2013 + std::wstring_convert> cv; + (*(std::string*)this) = cv.to_bytes(p); #ifdef MALCOLM size_t len = p.length(); if (len == 0) { return;} // empty string @@ -592,16 +603,19 @@ struct utf8 : std::string { utf8 (const std::wstring & p) // utf-16 to -8 }}; struct utf16 : std::wstring { utf16 (const std::string & p) // utf-8 to -16 { -#ifdef MALCOLM + std::wstring_convert> cv; + (*(std::wstring*)this) = cv.from_bytes(p); + +#ifdef OLD size_t len = p.length(); if (len == 0) { return;} // empty string msra::basetypes::fixed_vector buf (len + 1); // ... TODO: this fill() should be unnecessary (a 0 is appended)--but verify - std::fill (buf.begin (), buf.end (), (wchar_t) 0); - int rc = MultiByteToWideChar (CP_UTF8, 0, p.c_str(), (int) len, - &buf[0], (int) buf.size()); - if (rc == 0) throw std::runtime_error ("MultiByteToWideChar"); - ASSERT (rc < buf.size ()); + std::fill(buf.begin(), buf.end(), (wchar_t)0); + int rc = MultiByteToWideChar(CP_UTF8, 0, p.c_str(), (int)len, + &buf[0], (int)buf.size()); + if (rc == 0) throw std::runtime_error("MultiByteToWideChar"); + ASSERT(rc < buf.size()); (*(std::wstring*)this) = &buf[0]; #endif /* Malcolm */ }}; @@ -641,12 +655,8 @@ static inline std::string wcstombs (const std::wstring & p) // output: MBCS } static inline std::wstring mbstowcs (const std::string & p) // input: MBCS { - size_t len = p.length(); - msra::basetypes::fixed_vector buf (len + 1); // max: >1 mb chars => 1 wchar - std::fill (buf.begin (), buf.end (), (wchar_t) 0); - OACR_WARNING_SUPPRESS(UNSAFE_STRING_FUNCTION, "Reviewed OK. size checked. [rogeryu 2006/03/21]"); - ::mbstowcs (&buf[0], p.c_str(), len + 1); - return std::wstring (&buf[0]); + std::wstring ret = utf16(p); + return ret; } #pragma warning(pop) @@ -769,8 +779,6 @@ static inline FILE* _wfopen(const wchar_t * path, const wchar_t * mode) { return namespace msra { namespace basetypes { -#ifdef MALCOLM - // FILE* with auto-close; use auto_file_ptr instead of FILE*. // Warning: do not pass an auto_file_ptr to a function that calls fclose(), // except for fclose() itself. 
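The utf8/utf16 helpers above swap the Win32 MultiByteToWideChar path for the portable codecvt facilities, which is part of what lets the same sources build under Cygwin. A self-contained sketch of that conversion pattern (std::codecvt_utf8 pairs UTF-8 with UCS-2 or UCS-4 depending on the platform's wchar_t width; it is deprecated since C++17 but current for the VS2013 toolset named in the TODO; ToUtf8/FromUtf8 are illustrative names, not part of the patch):

    #include <codecvt> // std::codecvt_utf8
    #include <locale>  // std::wstring_convert
    #include <string>

    std::string ToUtf8(const std::wstring& w)   // wide -> UTF-8, as in struct utf8
    {
        std::wstring_convert<std::codecvt_utf8<wchar_t>> cv;
        return cv.to_bytes(w);
    }

    std::wstring FromUtf8(const std::string& s) // UTF-8 -> wide, as in struct utf16
    {
        std::wstring_convert<std::codecvt_utf8<wchar_t>> cv;
        return cv.from_bytes(s);
    }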
@@ -789,7 +797,7 @@ public: auto_file_ptr() : f (NULL) { } ~auto_file_ptr() { close(); } auto_file_ptr (const char * path, const char * mode) { f = fopen (path, mode); if (f == NULL) openfailed (path); } - auto_file_ptr (const wchar_t * path, const char * mode) { f = _wfopen (path, msra::strfun::utf16 (mode).c_str()); if (f == NULL) openfailed (msra::strfun::utf8 (path)); } + auto_file_ptr (const wchar_t * wpath, const char * mode) {string path = msra::strfun::utf8(wpath); f = fopen (path.c_str(), mode); if (f == NULL) openfailed (path); } FILE * operator= (FILE * other) { close(); f = other; return f; } auto_file_ptr (FILE * other) : f (other) { } operator FILE * () const { return f; } @@ -797,7 +805,6 @@ public: void swap (auto_file_ptr & other) throw() { std::swap (f, other.f); } }; inline int fclose (auto_file_ptr & af) { return af.fclose(); } -#endif /* MALCOLM */ #ifdef _MSC_VER // auto-closing container for Win32 handles. diff --git a/Common/Include/fileutil.h b/Common/Include/fileutil.h index 51ef8f82a..8371de1b9 100644 --- a/Common/Include/fileutil.h +++ b/Common/Include/fileutil.h @@ -3,232 +3,19 @@ // // Copyright (c) Microsoft Corporation. All rights reserved. // -// $Log: /Speech_To_Speech_Translation/dbn/dbn/fileutil.h $ -// -// 71 1/03/13 8:53p Kaisheny -// Asynchronous SGD using data pipe. -// -// 70 9/30/12 10:46a Fseide -// new optional parameter to fuptodate()--caller can now choose whether a -// missing input file, with target file present, will cause a failure or -// considers the target up-to-date -// -// 69 11/09/11 10:01 Fseide -// added a new overload for fgetfilelines() that returns an array of char* -// instead of strings, to avoid mem alloc -// -// 68 6/10/11 9:50 Fseide -// (fixed a missing 'inline') -// -// 67 6/10/11 9:49 Fseide -// new function fgetfilelines() for reading text files -// -// 66 6/09/11 15:18 Fseide -// added overloads to fexists() that accept STL strings -// -// 65 3/07/11 12:13 Fseide -// actually implemented unlinkOrDie() (was a dummy) -// -// 64 11/17/10 15:00 Fseide -// new function fuptodate(); -// make_intermediate_dirs() moved to namespace msra::files (all new -// functions should be put in there) -// -// 63 11/15/10 7:04p Fseide -// added an overload for freadOrDie (vector) that takes size as a size_t -// instead of an int, to pleasr the x64 compiler -// -// 62 11/08/10 17:07 Fseide -// new function make_intermediate_dirs() -// -// 61 11/08/10 11:43 Fseide -// (minor cleanup) -// -// 60 2/05/09 19:06 Fseide -// fgetline() now returns a non-const pointer, because user may want to -// post-process the line, and the returned value is a user-specified -// buffer anyway -// -// 59 1/16/09 17:34 Fseide -// relpath() and splitpath() moved to fileutil.h -// -// 58 1/16/09 8:59 Fseide -// exported fskipspace() -// -// 57 1/15/09 7:38 Fseide -// some magic to unify fgetstring() for char and wchar_t to a single -// template function -// -// 56 1/15/09 7:26 Fseide -// corrected the #include order of basetypes.h -// -// 55 1/14/09 19:26 Fseide -// new functions fsetpos() and fgetpos(); -// new fixed-buffer size overload for fgetstring() and fgettoken() -// -// 54 1/08/09 16:14 Fseide -// fopenOrDie() now supports "-" as the pathname, referring to stdin or -// stdout -// -// 53 1/08/09 15:32 Fseide -// new funtion expand_wildcards() -// -// 52 1/05/09 8:44 Fseide -// (added comments) -// -// 51 11/11/08 6:04p Qiluo -// recover the old fputstring functions -// -// 50 10/31/08 5:09p Qiluo -// remove banned APIs -// -// 49 7/17/08 7:22p V-spwang -// 
undid changes - back to version 47 -// -// 47 6/24/08 19:03 Fseide -// added fgetwstring() and fputstring() for wstrings; -// added templates for freadOrDie() and fwriteOrDie() for STL vectors -// -// 46 6/18/08 11:41 Fseide -// added #pragma once -// -// 45 08-05-29 18:18 Llu -// fix the interface of fputwav -// -// 44 08-05-29 13:54 Llu -// add fputwav revise fgetwav using stl instead of short * -// -// 43 11/27/06 11:40 Fseide -// new methods fgetwfx() and fputwfx() for direct access to simple PCM WAV -// files -// -// 42 10/14/06 18:31 Fseide -// added char* version of fexists() -// -// 41 5/22/06 9:34 Fseide -// (experimental auto_file class checked in) -// -// 40 5/14/06 19:59 Fseide -// new function fsetmode() -// -// 39 3/29/06 15:36 Fseide -// changed to reading entire file instead of line-by-line, not changing -// newlines anymore -// -// 38 2/21/06 12:39p Kit -// Added filesize64 function -// -// 37 1/09/06 7:12p Rogeryu -// wide version of fgetline -// -// 36 12/19/05 21:52 Fseide -// fputfile() added in 8-bit string version -// -// 35 12/15/05 20:25 Fseide -// added getfiletime(), setfiletime(), and fputfile() for strings -// -// 34 9/27/05 12:22 Fseide -// added wstring version of renameOrDie() -// -// 33 9/22/05 12:26 Fseide -// new method fexists() -// -// 32 9/15/05 11:33 Fseide -// new version of fgetline() that avoids buffer allocations, since this -// seems very expensive esp. when reading a file line by line with -// fgetline() -// -// 31 9/05/05 4:57p F-xyzhao -// added #include for #include -- ugh -// -// 30 9/05/05 11:00 Fseide -// new method renameOrDie() -// -// 29 8/24/05 5:45p Kjchen -// merge changes in OneNote -// -// 28 8/19/05 17:56 Fseide -// extended WAVEHEADER with write() and update() -// -// 27 8/13/05 15:37 Fseide -// added new version of fgetline that takes a buffer -// -// 26 7/26/05 18:54 Fseide -// new functions fgetint24() and fputint24() -// -// 25 2/12/05 15:21 Fseide -// fgetdouble() and fputdouble() added -// -// 24 2/05/05 12:38 Fseide -// new methods fputfile(), fgetfile(); -// new overload for filesize() -// -// 23 2/03/05 22:34 Fseide -// added new version of fgetline() that returns an STL string -// -// 22 5/31/04 10:06 Fseide -// new methods fseekOrDie(), ftellOrDie(), unlinkOrDie(), renameOrDie() -// -// 21 3/19/04 4:01p Fseide -// fwriteOrDie(): first argument changed to const -// -// 20 2/27/04 10:04a V-xlshi -// -// 19 2/19/04 3:45p V-xlshi -// fgetraw function is added. 
-// -// 18 2/19/04 1:49p V-xlshi -// -// 17 2/03/04 8:17p V-xlshi -// -// 16 2/03/04 6:20p V-xlshi -// WAVEHEADER.prepare() added -// -// 15 2/03/04 5:58p V-xlshi -// WAVEHEADER structure added -// -// 14 8/15/03 15:40 Fseide -// new method filesize() -// -// 13 8/13/03 21:06 Fseide -// new function fputbyte() -// -// 12 8/13/03 15:37 Fseide -// prototype of fOpenOrDie() Unicode version changed -// -// 11 8/07/03 22:04 Fseide -// fprintfOrDie() now really dies in case of error -// -// 10 03-07-30 12:06 I-rogery -// enable both unicode and non-unicode version -// -// 9 7/25/03 6:07p Fseide -// new functions fgetbyte() and fgetwav() -// -// 8 7/03/02 9:25p Fseide -// fcompareTag() now uses string type for both of its arguments (before, -// it used const char * for one of them) -// -// 7 6/10/02 3:14p Fseide -// new functions fgettoken(), fgetfloat_ascii(), fskipNewline() -// -// 6 6/07/02 7:26p Fseide -// new functions fcheckTag_ascii() and fgetint_ascii() -// -// 5 4/15/02 1:12p Fseide -// void fputstring (FILE * f, const TSTRING & str) and fpad() added -// -// 4 4/03/02 3:58p Fseide -// VSS keyword and copyright added -// -// F. Seide 5 Mar 2002 -// - #pragma once #ifndef _FILEUTIL_ #define _FILEUTIL_ #include +#ifdef _WIN32 +#define isfinite(x) _finite(x) +#define isnan(x) _isnan(x) +#endif +#ifdef __unix__ +#include +#include +#endif #include // for std::find #include #include @@ -695,6 +482,29 @@ namespace msra { namespace files { // simple support for WAV file I/O // ---------------------------------------------------------------------------- +// define the header if we haven't seen it yet +#ifndef _WAVEFORMATEX_ +#define _WAVEFORMATEX_ + +/* + * extended waveform format structure used for all non-PCM formats. this + * structure is common to all non-PCM formats. + */ +typedef unsigned short WORD; // in case not defined yet (i.e. linux) +typedef struct tWAVEFORMATEX +{ + WORD wFormatTag; /* format type */ + WORD nChannels; /* number of channels (i.e. mono, stereo...) 
*/ + DWORD nSamplesPerSec; /* sample rate */ + DWORD nAvgBytesPerSec; /* for buffer estimation */ + WORD nBlockAlign; /* block size of data */ + WORD wBitsPerSample; /* number of bits per sample of mono data */ + WORD cbSize; /* the count in bytes of the size of */ + /* extra information (after cbSize) */ +} WAVEFORMATEX, *PWAVEFORMATEX; + +#endif /* _WAVEFORMATEX_ */ + typedef struct wavehder{ char riffchar[4]; unsigned int RiffLength; diff --git a/Common/fileutil.cpp b/Common/fileutil.cpp index 675c3d3c7..eca7ce6af 100644 --- a/Common/fileutil.cpp +++ b/Common/fileutil.cpp @@ -298,7 +298,7 @@ size_t filesize (const wchar_t * pathname) // filesize64(): determine size of the file in bytes (with pathname) int64_t filesize64 (const wchar_t * pathname) { - __stat64 fileinfo; + struct _stat64 fileinfo; if (_wstat64 (pathname,&fileinfo) == -1) return 0; else @@ -1375,6 +1375,21 @@ vector msra::files::fgetfilelines (const wstring & path, vector & b bool getfiletime (const wstring & path, FILETIME & time) { // return file modification time, false if cannot be determined + struct _stat buf; + int result; + + // Get data associated with "crt_stat.c": + result = _wstat(path.c_str(), &buf); + // Check if statistics are valid: + if( result != 0 ) + { + return false; + } + + (*(time_t*)(&time))= buf.st_mtime; + return true; + +#ifdef OLD WIN32_FIND_DATAW findFileData; auto_handle hFind (FindFirstFileW (path.c_str(), &findFileData), ::FindClose); if (hFind != INVALID_HANDLE_VALUE) @@ -1386,10 +1401,14 @@ bool getfiletime (const wstring & path, FILETIME & time) { return false; } +#endif } void setfiletime (const wstring & path, const FILETIME & time) { // update the file modification time of an existing file +#ifdef LINUX + throw new logic_error("setfiletime has not been converted to linux yet..."); +#else auto_handle h (CreateFileW (path.c_str(), FILE_WRITE_ATTRIBUTES, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL)); @@ -1402,6 +1421,7 @@ void setfiletime (const wstring & path, const FILETIME & time) { RuntimeError ("setfiletime: error setting file time information: %d", GetLastError()); } +#endif } #if 0 diff --git a/MachineLearning/cn/cn.vcxproj b/MachineLearning/cn/cn.vcxproj index ebd270b41..d84aa6994 100644 --- a/MachineLearning/cn/cn.vcxproj +++ b/MachineLearning/cn/cn.vcxproj @@ -124,7 +124,7 @@ true Delayimp.lib;nvml.lib;cudart.lib;cntkMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\ - CNTKMath.dll;nvml.dll + CNTKMath.dll;nvml.dll;nvcuda.dll @@ -186,7 +186,7 @@ true Delayimp.lib;nvml.lib;cudart.lib;cntkMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) true - CNTKMath.dll;nvml.dll + CNTKMath.dll;nvml.dll;nvcuda.dll copy $(SolutionDir)..\Common\PTask\bin\*.dll $(TargetDir) diff --git a/Math/Math/CPUMatrix.cpp b/Math/Math/CPUMatrix.cpp index d76320f66..9a3df0ee8 100644 --- a/Math/Math/CPUMatrix.cpp +++ b/Math/Math/CPUMatrix.cpp @@ -21,13 +21,15 @@ #ifndef LINUX #include +#define Linux(x) #else +#define Linux(x) x #ifndef max #define max(a,b) (((a) > (b)) ? 
(a) : (b)) #endif -#include +#include #endif /* LINUX */ #ifdef LEAKDETECT diff --git a/Math/Math/GPUDummy.cpp b/Math/Math/GPUDummy.cpp new file mode 100644 index 000000000..5528816e5 --- /dev/null +++ b/Math/Math/GPUDummy.cpp @@ -0,0 +1,1667 @@ +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +#include "GPUMatrix.cuh" +#include "GPUSparseMatrix.cuh" + +namespace Microsoft { namespace MSR { namespace CNTK { + void PrepareDevice(short deviceId); + +#pragma region Constructors and Destructor + + template + GPUSparseMatrix::GPUSparseMatrix() + {} + + template + void GPUSparseMatrix::ZeroInit() + {} + + template + GPUSparseMatrix::GPUSparseMatrix(const GPUMatrix& deepCopy) + {} + + + template + GPUSparseMatrix::GPUSparseMatrix(const GPUSparseMatrix& deepCopy) + {} + + template + GPUSparseMatrix::GPUSparseMatrix(const size_t numRows, const size_t numCols, const size_t nz, ElemType* pArray, + const size_t matrixFlags /*=matrixFormatSparseCSR*/, int deviceId /*=MANAGEDEXTERN*/, const size_t elemSizeAllocated /*=0*/) + {} + + // PrepareDevice - Setup the correct cuda context for an operation + // deviceId - the device on which the operation will take place + // defaults to -1, which means use matrices current device + template + void GPUSparseMatrix::PrepareDevice(short deviceId /*=-1*/) const + {} + + template + void GPUSparseMatrix::DeepCopy(const GPUSparseMatrix& deepCopy) + {} + + template + void GPUSparseMatrix::SetValue(const GPUSparseMatrix& deepCopy) + {} + + template + GPUMatrix GPUSparseMatrix::CopyToDenseMatrix() + { + GPUMatrix res; + return res; + } + + template + void GPUSparseMatrix::SetValue(const GPUMatrix& denseMatrix) + {} + + template + GPUSparseMatrix& GPUSparseMatrix::operator=(const GPUSparseMatrix& deepCopy) + { + return *this; + } + +#ifndef LINUX + template + GPUSparseMatrix::GPUSparseMatrix(GPUSparseMatrix&& moveFrom) + {} + + template + GPUSparseMatrix& GPUSparseMatrix::operator=(GPUSparseMatrix&& moveFrom) + { + return *this; + } +#endif /* LINUX */ + + template + GPUSparseMatrix::~GPUSparseMatrix() + {} + + template + void GPUSparseMatrix::ClearNew() + {} + + + template + void GPUSparseMatrix::Clear() + {} + + //ResizeAs - Resize this sparse matrix to have the same element structure as the passed matrix + // a - sparse matrix whose structure we want to clone + // remark: this was done for element wise operations where the structure will be identical after an operation + template + void GPUSparseMatrix::ResizeAs(const GPUSparseMatrix& a) + {} + + //------------------------------------------------------------------------- + // Start of new GPU Sparse Matrix code + //------------------------------------------------------------------------- + + template + void GPUSparseMatrix::Init() + {} + + template + GPUSparseMatrix::GPUSparseMatrix(const MatrixFormat format, const int deviceId) + {} + + template + ElemType* GPUSparseMatrix::BufferPointer() const + { + return this->m_blockVal; + } + + template + void GPUSparseMatrix::Resize(const size_t numRows, const size_t numCols, int size) + {} + + //Reset matrix so it can be reused + template + void GPUSparseMatrix::Reset() + {} + +#pragma endregion Constructors and Destructor + +#pragma region Static BLAS Functions + + // copy features to GPU matrix + template + void GPUSparseMatrix::SetMatrixFromCSCFormat(size_t *h_row, size_t *h_rowIdx, size_t size, size_t blockSize) + {} + + template + void GPUSparseMatrix::SetMatrixFromLabelAndClass(size_t *h_row, size_t *h_block2Id, size_t *h_block2UniqId, size_t 
labelSize, size_t expandedSize, size_t blockSize) + {} + + // forward pass from feature to hidden layer + template + void GPUSparseMatrix::MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix& lhs, const bool transposeA, + const GPUSparseMatrix& rhs, const bool transposeB, ElemType beta, GPUMatrix& c) + + {} + + // backward pass from hidden layer to feature weight + template + void GPUSparseMatrix::MultiplyAndAdd(ElemType alpha, const GPUMatrix& lhs, const bool transposeA, + const GPUSparseMatrix& rhs, const bool transposeB, GPUSparseMatrix& c) + {} + + // used for gradients update + template + void GPUSparseMatrix::ScaleAndAdd(const ElemType alpha, const GPUSparseMatrix& lhs, GPUMatrix& rhs) + {} + + // a: H x No: H is hidden layer size and No is mini-batch size + // weight: V x H, V is vocab size + // label: V x No + // cls: 2 x Nc, Nc is number of classes, each col is start and end word ids of a class + // idx2cls: V x 1, mapping from word to class id + // etp: V x No, stores predicted values + template + void GPUSparseMatrix::ClassEntropy(const GPUMatrix& a, const GPUMatrix& weight, + const GPUSparseMatrix & label, const GPUMatrix& cls, + const GPUMatrix& idx2cls, GPUSparseMatrix& etp, GPUMatrix& entropyScore) + {} + + template + void GPUSparseMatrix::ClassEntropyError(GPUSparseMatrix& a) + {} + + template + void GPUSparseMatrix::ClassEntropyGradientOfInput(const GPUSparseMatrix& error, const GPUMatrix& weight, GPUMatrix& grd) + {} + + template + void GPUSparseMatrix::ClassEntropyGradientOfWeight(const GPUSparseMatrix& error, const GPUMatrix& input, const GPUSparseMatrix & label, const GPUMatrix& cls, + const GPUMatrix& idx2cls, GPUSparseMatrix& grd) + {} + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceTruncate (const ElemType threshold) + { + return *this; + } + + // normal update for smoothed gradients c and current gradients (this) + template + void GPUSparseMatrix::NormalGrad(GPUMatrix& c, const ElemType momentum) + {} + + //------------------------------------------------------------------------- + // End of new GPU Sparse Matrix code + //------------------------------------------------------------------------- + + template + void GPUSparseMatrix::MultiplyAndWeightedAdd(ElemType alpha, const GPUSparseMatrix& a, const bool transposeA, + const GPUMatrix& b, ElemType beta, GPUMatrix& c) + {} + + + template + void GPUSparseMatrix::Multiply(const GPUSparseMatrix& S, const GPUMatrix& D, GPUMatrix& C) + {} + + template + void GPUSparseMatrix::Multiply(const GPUMatrix& D, const GPUSparseMatrix& S, GPUMatrix& C) + {} + + // ElemCountFromBufferSize - Return the elemCountAllocated for a particular buffersize + // totalBufferSize - total buffer we have to use + // return: size of allocated elements/index slots available + template + size_t GPUSparseMatrix::ElemCountFromBufferSize(size_t totalBufferSize) + { + return 0; + } + + // PrepareBuffer - Get the dimensions start buffer, computes the starting row/column of each value + // m - rows in the source + // n - cols in the source + // canReuseBuffer - target matrix can be reused for temporary space + // func - function to call to count elements in the result (returns count, and fills csrRowPtr array) + template +#ifndef LINUX + void GPUSparseMatrix::PrepareBuffer(size_t m, size_t n, bool canReuseBuffer, std::function func) +#else + void GPUSparseMatrix::PrepareBuffer(size_t m, size_t n, bool canReuseBuffer, size_t (*func)(int *csRowPtrC)) +#endif /* LINUX */ + {} + +#ifdef LINUX + size_t PrepareBufferMultiply(int* csrRowPtrC) + { + 
int nnzTotal = -1; + return nnzTotal; + } +#endif + + // Multiply - multiply one sparse matrix by another sparse matrix + // S1 - first sparse matrix + // transposeS1 - transpose first matrix? + // S2 - second sparse matrix + // transposeS2 - transpose second matrix? + // c - result matrix + // NOTE: if c has enough space allocated, it will be reused, otherwise it will be freed and a new memory block used + template + void GPUSparseMatrix::Multiply(const GPUSparseMatrix& S1, bool transposeS1, const GPUSparseMatrix& S2, bool transposeS2, GPUSparseMatrix &c) + {} + + template + GPUSparseMatrix& GPUSparseMatrix::AssignProductOf(const GPUSparseMatrix& a, const bool transposeA, const GPUSparseMatrix& b, const bool transposeB) + { + return *this; + } + + template + void GPUSparseMatrix::ScaleAndAdd(ElemType alpha,const GPUSparseMatrix& a, ElemType beta, const GPUSparseMatrix& b, GPUSparseMatrix& c) + {} + + template + void GPUSparseMatrix::ScaleAndAdd(ElemType alpha,const GPUSparseMatrix& a, ElemType beta, const GPUMatrix& b, GPUMatrix& c) + {} + + template + void GPUSparseMatrix::ScaleAndAdd(ElemType alpha,const GPUMatrix& a, ElemType beta, const GPUSparseMatrix& b, GPUMatrix& c) + {} + + template + void GPUSparseMatrix::Scale(ElemType alpha, GPUSparseMatrix& a) + {} + + template + void GPUSparseMatrix::ElementWisePower (ElemType alpha, const GPUSparseMatrix& a, GPUSparseMatrix& c) + {} + + template + ElemType GPUSparseMatrix::InnerProductOfMatrices(const GPUSparseMatrix& a, const GPUMatrix& b) + { + return ElemType(0); + } + + template + ElemType GPUSparseMatrix::InnerProductOfMatrices(const GPUMatrix& a, const GPUSparseMatrix& b) + { + return ElemType(0); + } + + template + bool GPUSparseMatrix::AreEqual(const GPUSparseMatrix& a, const GPUSparseMatrix& b, + const ElemType threshold) + { + return false; + } + + template + bool GPUSparseMatrix::AreEqual(const GPUMatrix& a, const GPUSparseMatrix& b, + const ElemType threshold) + { + return false; + } + + template + bool GPUSparseMatrix::AreEqual(const GPUSparseMatrix& a, const GPUMatrix& b, + const ElemType threshold) + { + return false; + } + + template + bool GPUSparseMatrix::IsEqualTo(const GPUSparseMatrix& a, const ElemType threshold) const + { + return false; + } + + template + bool GPUSparseMatrix::IsEqualTo(const GPUMatrix& a, const ElemType threshold) const + { + return false; + } +#pragma endregion Static BLAS Functions + +#pragma region Member BLAS Functions + + template + int GPUSparseMatrix::GetComputeDeviceId() const + { + return -1; + } + + template + GPUMatrix GPUSparseMatrix::ElementProductOf (const GPUSparseMatrix& a, const GPUMatrix& b) + { + GPUMatrix c; + return c; + } + + template + GPUMatrix GPUSparseMatrix::ElementProductOf (const GPUMatrix& a, const GPUSparseMatrix& b) + { + return GPUSparseMatrix::ElementProductOf(b,a); + } + + template + GPUSparseMatrix GPUSparseMatrix::operator+ (const GPUSparseMatrix& a) const + { + return *this; + } + + template + GPUSparseMatrix GPUSparseMatrix::operator- (const GPUSparseMatrix& a) const + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::operator^=(ElemType alpha) + { + return *this; + } + + template + GPUSparseMatrix GPUSparseMatrix::operator^ (ElemType alpha) const + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::operator*=(ElemType alpha) + { + return *this; + } + + template + GPUSparseMatrix GPUSparseMatrix::operator* (ElemType alpha) const + { + return *this; + } + + template + GPUSparseMatrix&
GPUSparseMatrix::AssignElementPowerOf(const GPUSparseMatrix& a, const ElemType power) + { + return *this; + } + + template + GPUSparseMatrix GPUSparseMatrix::Transpose() const + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignTransposeOf(const GPUSparseMatrix& a) + { + return *this; + } + + template + void GPUSparseMatrix::InplaceTranspose() + {} + + template + ElemType GPUSparseMatrix::SumOfAbsElements() const + { + return ElemType(0); + } + + template + ElemType GPUSparseMatrix::SumOfElements() const + { + return ElemType(0); + } + + + template + ElemType GPUSparseMatrix::FrobeniusNorm() const + { + return ElemType(0); + } + + template + ElemType GPUSparseMatrix::MatrixNormInf() const + { + return ElemType(0); + } + + template + ElemType GPUSparseMatrix::MatrixNorm1() const + { + return ElemType(0); + } + +#pragma endregion Member BLAS Functions + +#pragma region Other Functions + + template + GPUSparseMatrix& GPUSparseMatrix::ElementInverse () + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignElementInverseOf (const GPUSparseMatrix& a) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceSigmoid() + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignSigmoidOf (const GPUSparseMatrix& a) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceLinearRectifierDerivative() + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignLinearRectifierDerivativeOf (const GPUSparseMatrix& a) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceTanh() + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignTanhOf (const GPUSparseMatrix& a) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceSqrt() + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignSqrtOf (const GPUSparseMatrix& a) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceExp() + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignExpOf (const GPUSparseMatrix& a) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceLog() + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignLogOf (const GPUSparseMatrix& a) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceAbs() + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignAbsOf (const GPUSparseMatrix& a) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceTruncateBottom (const ElemType threshold) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignTruncateBottomOf (const GPUSparseMatrix& a, const ElemType threshold) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceTruncateTop (const ElemType threshold) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignTruncateTopOf (const GPUSparseMatrix& a, const ElemType threshold) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::SetToZeroIfAbsLessThan (const ElemType threshold) + { + return *this; + } + template + void GPUSparseMatrix::Unrolling (//GPUSparseMatrix& debugMatrix, + GPUMatrix& UnrolledMatrix, const GPUMatrix& InMatrix, GPUSparseMatrix& UnrollMapping, + const int inputWidth, const int inputHeight, const int inputChannelNum, + const int FltWidth,const int FltHeight, const int FltChannel, + const int 
FltStepW, const int FltStepH) + { + } + +#pragma endregion + +#pragma region Helper Functions + + template + void GPUSparseMatrix::performInplaceFunction(int kind) + {} + + template + void GPUSparseMatrix::SetMatrixFromCSRFormat(int *h_CSRRow, int *h_Col, ElemType *h_Val, size_t nz, size_t numRows, size_t numCols, bool IsOnDevice, int devId) + {} + + // NOTE: we should change this to just use a single buffer, and return pointers into it + template + void GPUSparseMatrix::GetMatrixFromCSRFormat(int*& h_CSRRow, int*& h_Col, ElemType*& h_Val, size_t &nz, size_t &numRows, size_t &numCols) const + {} + +#pragma endregion Helper Functions + + template class GPUSparseMatrix; + template class GPUSparseMatrix; + + template + MATH_API File& operator>>(File& stream, GPUSparseMatrix& us) + { + return stream; + } + + template MATH_API File& operator>>(File& stream, GPUSparseMatrix& us); + template MATH_API File& operator>>(File& stream, GPUSparseMatrix& us); + + template + MATH_API File& operator<<(File& stream, const GPUSparseMatrix& us) + { + return stream; + } + template MATH_API File& operator<<(File& stream, const GPUSparseMatrix& us); + template MATH_API File& operator<<(File& stream, const GPUSparseMatrix& us); + + +#pragma region DeviceBoundNumber class + + template + DeviceBoundNumber::DeviceBoundNumber(const DeviceBoundNumber &deepCopy) + { + NOT_IMPLEMENTED; + } + +#ifndef LINUX + template + DeviceBoundNumber::DeviceBoundNumber(DeviceBoundNumber &&shallowCopy) + { + this->ShallowCopyFrom(shallowCopy.m_data,shallowCopy.m_computeDevice); + shallowCopy.m_data=NULL; + } +#endif + + template + void DeviceBoundNumber::ShallowCopyFrom(ElemType* newVal,int newValsDevceId) + {} + + template + DeviceBoundNumber::~DeviceBoundNumber() + {} + +#pragma endregion DeviceBoundNumber class + +#pragma region Helper functions + + // GetBestGPUDeviceId - Get the best GPU DeviceId, based on cuda information + // TODO: should be replaced by BestGpu class instead, it's much better + template + int GPUMatrix::GetBestGPUDeviceId() //returns -1 if no GPUs can be used + { + return -1; // CPU + } + + // PrepareDevice - Setup the correct cuda context for an operation + // deviceId - the device on which the operation will take place + // defaults to -1, which means use matrices current device + template + void GPUMatrix::PrepareDevice(short deviceId /*=-1*/) const + {} + + template + ElemType* GPUMatrix::CopyToArray() const + { + return NULL; + } + + //memory will be allocated by the callee if not enough but need to be deleted by the caller after it's done + //return number of elements copied + template + size_t GPUMatrix::CopyToArray(ElemType*& arrayCopyTo, size_t& currentArraySize) const + { + return 0; + } + + template + void GPUMatrix::ChangeDeviceTo(int to_id) + {} + + template + void GPUMatrix::performInplaceFunction(int kind) + {} + + +#pragma endregion Helper functions + +#pragma region Constructors and Destructor + + //should only be used by constructors. + template + void GPUMatrix::ZeroInit(int deviceId) + {} + + template + GPUMatrix::GPUMatrix(int deviceId) + {}; + + //matrixName is used to verify that correct matrix is read. 
+ template + GPUMatrix::GPUMatrix(FILE* f, const char * matrixName, int deviceId) + {} + + template + GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols,int deviceId) + {}; + + template + GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols, ElemType *pArray, const size_t matrixFlags, int deviceId) + {}; + + template + GPUMatrix::GPUMatrix(const GPUMatrix& deepCopyFrom) + {} + +#ifndef LINUX + template + GPUMatrix::GPUMatrix(GPUMatrix&& moveFrom) + {} +#endif + + //assignment operator, deep copy + template + GPUMatrix& GPUMatrix::operator=(const GPUMatrix& deepCopyFrom) + { + return *this; + } + +#ifndef LINUX + //move assignment operator, shallow copy + template + GPUMatrix& GPUMatrix::operator=(GPUMatrix&& moveFrom) + { + return *this; + } +#endif /* LINUX */ + + template + GPUMatrix::~GPUMatrix(void) + { + } + + template + void GPUMatrix::Clear() + {} +#pragma endregion Constructors and Destructor + + template + int GPUMatrix::GetComputeDeviceId() const + { + return -1; + } + +#pragma region Basic Operators + template + GPUMatrix GPUMatrix::ColumnSlice(size_t startColumn, size_t numCols) const + { + GPUMatrix slice; + + return slice; + } + + template + GPUMatrix& GPUMatrix::AssignColumnSlice(const GPUMatrix& fromMatrix, size_t startColumn, size_t numCols) + { + return *this; + } + + + //for each column of a, we assign numRows starting from startIndex to this + template + GPUMatrix& GPUMatrix::AssignRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) + { + return *this; + } + + //for each column of a, we add all rows of a to this starting from startIndex + template + GPUMatrix& GPUMatrix::AddToRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) + { + return *this; + } + + template + GPUMatrix GPUMatrix::Transpose() const + { + return *this; + } + + // GetCublasHandle - get a cublas handle for the given GPU, should only need one per GPU + // computeDevice - The compute device for which the cublas handle is desired + // returns: cublas handle + // NOTE: we currently don't bother to ever free the CUBLAS handle, it will be freed automatically by CUDA when the process ends + template + cublasHandle_t GPUMatrix::GetCublasHandle(int computeDevice/*=-1*/) + { + cublasHandle_t cuHandle = 0; + return cuHandle; + } + + template + GPUMatrix& GPUMatrix::AssignTransposeOf (const GPUMatrix& a) + { + return *this; + } + + template + void GPUMatrix::SetValue(const ElemType v) + {} + + template + void GPUMatrix::SetValue(const ElemType* d_v) //d_v is pointer to the the value in GPU memory + {} + + template + void GPUMatrix::SetColumn(const ElemType* colPointer, size_t colInd) + {} + + template + void GPUMatrix::SetValue(const GPUMatrix& deepCopyFrom) + {} + + template + void GPUMatrix::SetValue(const size_t numRows, const size_t numCols, ElemType *pArray, size_t matrixFlags, int deviceId) + {} + + + template + void GPUMatrix::SetDiagonalValue(const ElemType v) + {} + + template + void GPUMatrix::SetDiagonalValue(GPUMatrix& vector) + {} + + template + void GPUMatrix::SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed) + {} + + template + void GPUMatrix::SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed) + {} + + //maskRate: percentage of values masked out (similar to dropout rate) + //scaleValue: which scale value to set to the left ones (unmasked items). 
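+ // A CPU-side sketch of the contract documented above for SetUniformRandomMask
+ // (the real work happens only in the CUDA build; this dummy does nothing):
+ // each element is zeroed with probability maskRate and each surviving element
+ // is multiplied by scaleValue. ApplyUniformRandomMask is a hypothetical helper
+ // shown only to pin down the semantics; it is not part of this patch.
+ #include <random>
+ template <class ElemType>
+ static void ApplyUniformRandomMask(ElemType* v, size_t n, ElemType maskRate, ElemType scaleValue, unsigned long seed)
+ {
+     std::mt19937_64 rng(seed);                          // reproducible for a fixed seed
+     std::uniform_real_distribution<double> u(0.0, 1.0); // one draw per element
+     for (size_t i = 0; i < n; i++)
+         v[i] = (u(rng) < (double)maskRate) ? ElemType(0) : v[i] * scaleValue;
+ }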
+ template + void GPUMatrix::SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed) + {} + + template + void GPUMatrix::Adagrad(GPUMatrix& gradients) + {} + + template + void GPUMatrix::Reshape(const size_t numRows, const size_t numCols) + {} + + template + void GPUMatrix::Resize(const size_t numRows, const size_t numCols, bool growOnly) + {} + + template + size_t GPUMatrix::LocateElement (const size_t row, const size_t col) const + { + return 0; + } + + template + size_t GPUMatrix::LocateColumn (const size_t col) const + { + return 0; + } + + template + ElemType GPUMatrix::Get00Element() const + { + ElemType res=0; + return res; + } +#pragma endregion Basic Operators + +#pragma region Member BLAS Functions + template + GPUMatrix& GPUMatrix::operator+= (ElemType alpha) + { + return *this; + } + + template + GPUMatrix GPUMatrix::operator+ (ElemType alpha) const + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignSumOf(const ElemType alpha, const GPUMatrix& a) + { + return (*this); + } + + + template + GPUMatrix& GPUMatrix::operator+= (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix GPUMatrix::operator+ (const GPUMatrix& a) const + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignSumOf(const GPUMatrix& a, const GPUMatrix& b) + { + return (*this); + } + + template + GPUMatrix& GPUMatrix::operator-= (ElemType alpha) + { + return *this; + } + + template + GPUMatrix GPUMatrix::operator- (ElemType alpha) const + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignDifferenceOf(const ElemType alpha, const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignDifferenceOf(const GPUMatrix& a, const ElemType alpha) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::operator-= (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix GPUMatrix::operator- (const GPUMatrix& a) const + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignDifferenceOf(const GPUMatrix& a, const GPUMatrix& b) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::operator*= (ElemType alpha) + { + return *this; + } + + template + GPUMatrix GPUMatrix::operator* (ElemType alpha) const + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignProductOf(const ElemType alpha, const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignProductOf (const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB) + { + return *this; + } + + template + GPUMatrix GPUMatrix::operator* (const GPUMatrix& a) const + { + return *this; + } + + template + GPUMatrix& GPUMatrix::operator/= (ElemType alpha) + { + return (*this); + } + + template + GPUMatrix GPUMatrix::operator/ (ElemType alpha) const + { + return *this; + } + + //element-wise power + template + GPUMatrix& GPUMatrix::operator^= (ElemType alpha) + { + return *this; + } + + template + GPUMatrix GPUMatrix::operator^ (ElemType alpha) const + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignElementPowerOf(const GPUMatrix& a, const ElemType power) + { + return *this; + } + + + template + GPUMatrix& GPUMatrix::AddElementProductOf (const GPUMatrix& a, const GPUMatrix& b) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::ColumnElementMultiplyWith(const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::RowElementMultiplyWith(const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& 
GPUMatrix::ElementInverse () + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignElementInverseOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceSigmoid() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignSigmoidOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceSigmoidDerivative() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignSigmoidDerivativeOf (const GPUMatrix& a) + { + return *this; + } + + + template + GPUMatrix& GPUMatrix::InplaceTanh() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignTanhOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceSoftmax (const bool isColWise) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignSoftmaxOf (const GPUMatrix& a, const bool isColWise) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceSqrt() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignSqrtOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceExp() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignExpOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceLog() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignLogOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceAbs() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignAbsOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceLinearRectifierDerivative() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignLinearRectifierDerivativeOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceCosine() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignCosineOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceNegativeSine() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignNegativeSineOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceTruncateBottom (const ElemType threshold) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignTruncateBottomOf (const GPUMatrix& a, const ElemType threshold) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceTruncateTop (const ElemType threshold) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignTruncateTopOf (const GPUMatrix& a, const ElemType threshold) + { + return *this; + } + template + GPUMatrix& GPUMatrix::SetToZeroIfAbsLessThan (const ElemType threshold) + { + return *this; + } + + template + ElemType GPUMatrix::SumOfAbsElements() const + { + return ElemType(0); + } + + template + ElemType GPUMatrix::SumOfElements() const + { + return ElemType(0); + } + + + template + GPUMatrix& GPUMatrix::AssignSumOfElements(const GPUMatrix& a) + { + return (*this); + } + + template + DeviceBoundNumber GPUMatrix::Sum_AsDeviceBoundNum() const + { + DeviceBoundNumber result; + return result; + } + + template + ElemType GPUMatrix::Max() const + { + return ElemType(0); + } + + + template + GPUMatrix& GPUMatrix::ElementMultiplyWith (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignElementProductOf (const GPUMatrix& a, const GPUMatrix& b) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignElementDivisionOf (const GPUMatrix& a, const 
GPUMatrix& b) + { + return *this; + } + + template + bool GPUMatrix::IsEqualTo(const GPUMatrix& a, const ElemType threshold /*= 1e-8*/) const + { + return AreEqual(*this, a, threshold); + } + + template + void GPUMatrix::VectorNorm1(GPUMatrix& c, const bool isColWise) const + { + } + + template + GPUMatrix& GPUMatrix::AssignVectorNorm1Of(GPUMatrix& a, const bool isColWise) + { + return *this; + } + + template + void GPUMatrix::VectorNorm2(GPUMatrix& c, const bool isColWise) const + {} + + template + GPUMatrix& GPUMatrix::AssignVectorNorm2Of(GPUMatrix& a, const bool isColWise) + { + return *this; + } + + template + void GPUMatrix::VectorNormInf(GPUMatrix& c, const bool isColWise) const + {} + + template + GPUMatrix& GPUMatrix::AssignVectorNormInfOf(GPUMatrix& a, const bool isColWise) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignInnerProductOf(const GPUMatrix& a, const GPUMatrix& b, const bool isColWise) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignKhatriRaoProductOf(const GPUMatrix& a, const GPUMatrix& b) + { + return *this; + } + + //column-wise reshaped product. Used to compute KhatriRaoProduct Gradient + // this = reshape each column of a from (K1xK2,1) to (K1, K2) + // if each column of a is not transposed, each (K1, K2) times each column of b (K2, frames). + // the output is a (K1, frames) matrix + // if each column of a is transposed, each (K1, K2)^T times each column of b (K1, frames) and output is (K2, frames) + template + GPUMatrix& GPUMatrix::AddColumnReshapeProductOf(const GPUMatrix& a, const GPUMatrix& b, const bool transposeAColumn) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AddWithScaleOf(ElemType alpha, const GPUMatrix& a) + { + return *this; + } + + template + ElemType GPUMatrix::FrobeniusNorm() const + { + ElemType h_sum=0; + return (h_sum); + } + + template + GPUMatrix& GPUMatrix::AssignFrobeniusNormOf (const GPUMatrix& a) + { + return *this; + } + + template + ElemType GPUMatrix::MatrixNormInf() const + { + ElemType h_maxAbs=0; + return h_maxAbs; + } + + template + ElemType GPUMatrix::MatrixNorm1() const + { + return ElemType(0); + } + + template + ElemType GPUMatrix::MatrixNorm0() const + { + return ElemType(0); + } + + template + GPUMatrix& GPUMatrix::AssignSignOf(const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AddSignOf(const GPUMatrix& a) + { + return *this; + } + + template + void GPUMatrix::VectorMax(GPUMatrix& maxIndexes, GPUMatrix& maxValues, const bool isColWise) const + {} + + template + void GPUMatrix::VectorMin(GPUMatrix& minIndexes, GPUMatrix& minValues, const bool isColWise) const + {} + + template + GPUMatrix& GPUMatrix::AssignNumOfDiff(const GPUMatrix& a, const GPUMatrix& b) + { + return *this; + } + +#pragma endregion Member BLAS Functions + +#pragma region Other helper functions + template + void GPUMatrix::Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd) const + {} + + template + void GPUMatrix::Print(const char* matrixName /*=nullptr*/) const + {} + + // file I/O + //matrixName is used to verify that correct matrix is read. + template + void GPUMatrix::ReadFromFile(FILE* f, const char * matrixName) + {} + + //matrixName is used to verify that correct matrix is read.
+ template + void GPUMatrix::WriteToFile(FILE* f, const char * matrixName) + {} + + //helper function used for convolutional neural networks + template + GPUMatrix& GPUMatrix::AssignPackedConvolutionInput(const GPUMatrix& inputSubBatch, + const size_t inputWidth, const size_t inputHeight, const size_t inputChannels, + const size_t outputWidth, const size_t outputHeight, const size_t outputChannels, + const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample, + const bool zeroPadding) + { + return *this; + } + + //helper function used for convolutional neural networks + template + GPUMatrix& GPUMatrix::UnpackConvolutionInput(GPUMatrix& inputSubBatch, + const size_t inputWidth, const size_t inputHeight, const size_t inputChannels, + const size_t outputWidth, const size_t outputHeight, const size_t outputChannels, + const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample, + const bool zeroPadding) const + { + GPUMatrix mat; + return mat; + } + + template + GPUMatrix& GPUMatrix::AssignMaxPoolingResult(const GPUMatrix& inputBatch, const size_t channels, + const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, + const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, + const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AddMaxPoolingGradient(const GPUMatrix& outputGradientBatch, const GPUMatrix& inputBatch, const GPUMatrix& outputBatch, + const size_t channels, + const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, + const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, + const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignAveragePoolingResult(const GPUMatrix& inputBatch, const size_t channels, + const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, + const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, + const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AddAveragePoolingGradient(const GPUMatrix& outputGradientBatch, + const size_t channels, + const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, + const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, + const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + return *this; + } + +#pragma endregion Other helper functions + +#pragma region Static BLAS Functions + template + void GPUMatrix::MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, + ElemType beta, GPUMatrix& c) + { + } + + template + void GPUMatrix::MultiplyAndAdd(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, GPUMatrix& c) + {} + + template + void GPUMatrix::Multiply(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, GPUMatrix& c) + {} + + template + void GPUMatrix::Multiply(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + 
{} + + /// Matrix-scalar multiply with col-major matrices: c = alpha * a + c + /// if a is a column vector, add to all columns of c + /// if a is a row vector, add to all rows of c + /// if a is a scalar, add to all elements of c + /// Scalar + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::ScaleAndAdd(ElemType alpha,const GPUMatrix& a, GPUMatrix& c) + {} + + /// c += alpha * (a-b) + /// a, b, and c must have the same dimensions + /// Scalar + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::AddScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + {} + + /// c = alpha * (a-b) + /// a, b, and c must have the same dimensions + /// Scalar + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::AssignScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + {} + + /// c += alpha * (a-b) + /// a, b, and c must have the same dimensions + /// 1X1 matrix + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::AddScaledDifference(const GPUMatrix& alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + {} + + /// c = alpha * (a-b) + /// a, b, and c must have the same dimensions + /// Scalar + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::AssignScaledDifference(const GPUMatrix& alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + {} + + //c[ci,cj] += a[ai,aj] + template + void GPUMatrix::AddElementToElement(const GPUMatrix& a, const size_t ai, const size_t aj, GPUMatrix& c, const size_t ci, const size_t cj) + {} + + template + void GPUMatrix::Scale(ElemType alpha, GPUMatrix& a) + {} + + + template + void GPUMatrix::Scale(GPUMatrix& alpha, GPUMatrix& a) + {} + + template //c = alpha * a + void GPUMatrix::Scale(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) + {} + + + template + void GPUMatrix::InnerProduct (const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c, const bool isColWise) + {} + + template + ElemType GPUMatrix::InnerProductOfMatrices(const GPUMatrix& a, const GPUMatrix& b) + { + return ElemType(0); + } + + + template + GPUMatrix& GPUMatrix::AssignInnerProductOfMatrices(const GPUMatrix& a, const GPUMatrix& b) + { + return *this; + } + + + template + void GPUMatrix::ElementWisePower(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) + {} + + template + bool GPUMatrix::AreEqual(const GPUMatrix& a, const GPUMatrix& b, const ElemType threshold /*= 1e-8*/) + { + return false; + } + + template + GPUMatrix GPUMatrix::Ones(const size_t rows, const size_t cols) + { + GPUMatrix mat; + return mat; + } + + template + GPUMatrix GPUMatrix::Zeros(const size_t rows, const size_t cols) + { + GPUMatrix mat; + return mat; + } + + template + GPUMatrix GPUMatrix::Eye(const size_t rows) + { + GPUMatrix mat; + return mat; + } + + template + GPUMatrix GPUMatrix::RandomUniform(const size_t rows, const size_t cols, const ElemType low, const ElemType high, unsigned long seed) + { + GPUMatrix mat; + return mat; + } + + template + GPUMatrix GPUMatrix::RandomGaussian(const size_t rows, const size_t cols, const ElemType mean, const ElemType sigma, unsigned long seed) + { + GPUMatrix mat; + return mat; + } + + template + ElemType GPUMatrix::GetLearnRateForBlock_Helper(const GPUMatrix &Gradients, const
GPUMatrix &SmoothedGradients) + { + return ElemType(0); + } + +#pragma endregion Static BLAS Functions + + template class GPUMatrix; + template class GPUMatrix; + template class DeviceBoundNumber; + template class DeviceBoundNumber; + + template + cublasHandle_t GPUMatrix::s_cuHandle[GPUMatrix::MaxGpus]={0}; + + template + void* GPUMatrix::s_curandGenerator=NULL; +}}} + +// define a dummy GPUWatcher class too +#include "GPUWatcher.cuh" + +int GPUWatcher::GetGPUIdWithTheMostFreeMemory() +{ + return 0; +} + + +size_t GPUWatcher::GetFreeMemoryOnCUDADevice(int devId) +{ + return 0; +} + +GPUWatcher::GPUWatcher(void) +{ +} + +GPUWatcher::~GPUWatcher(void) +{ +} + + + + + diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu index 9d690f0e4..692b059a3 100644 --- a/Math/Math/GPUMatrix.cu +++ b/Math/Math/GPUMatrix.cu @@ -324,22 +324,22 @@ namespace Microsoft { namespace MSR { namespace CNTK { switch (kind) { case 0: - _inplaceSigmoidOnCuda<<>>(this->m_pArray,N); + _inplaceSigmoidOnCuda<<>>(this->m_pArray, N); break; case 1: - _inplaceTanhOnCuda<<>>(this->m_pArray,N); + _inplaceTanhOnCuda<<>>(this->m_pArray, N); break; case 2: - _inplaceSqrtOnCuda<<>>(this->m_pArray,N); + _inplaceSqrtOnCuda<<>>(this->m_pArray, N); break; case 3: - _inplaceExpOnCuda<<>>(this->m_pArray,N); + _inplaceExpOnCuda<<>>(this->m_pArray,N); break; case 4: - _inplaceLogOnCuda<<>>(this->m_pArray,N); + _inplaceLogOnCuda<<>>(this->m_pArray,N); break; case 5: - _inplaceAbsOnCuda<<>>(this->m_pArray,N); + _inplaceAbsOnCuda<<>>(this->m_pArray,N); break; case 6: _inplaceLinRectDerivative<<>>(this->m_pArray,N); @@ -1205,7 +1205,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); // cudaEvent_t done = nullptr; // if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - // _addValue<<>>(m_pArray,a.m_pArray,N); + // _addValue<<>>(m_pArray,a.m_pArray,N); // if (do_sync) CUDA_CALL(cudaEventRecord(done)); // if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); // if (do_sync) CUDA_CALL(cudaEventDestroy(done)); @@ -1458,7 +1458,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { a.PrepareDevice(); cudaEvent_t done = nullptr; if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _addElementProductOf<<>>(this->m_pArray,a.m_pArray,b.m_pArray,N); + _addElementProductOf<<>>(this->m_pArray,a.m_pArray,b.m_pArray,N); if (do_sync) CUDA_CALL(cudaEventRecord(done)); if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); if (do_sync) CUDA_CALL(cudaEventDestroy(done)); @@ -1480,7 +1480,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { a.PrepareDevice(); cudaEvent_t done = nullptr; if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _columnElementMultiplyWith<<>>(this->m_pArray,a.m_pArray,N,M); + _columnElementMultiplyWith<<>>(this->m_pArray,a.m_pArray,N,M); if (do_sync) CUDA_CALL(cudaEventRecord(done)); if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); if (do_sync) CUDA_CALL(cudaEventDestroy(done)); @@ -1503,7 +1503,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { a.PrepareDevice(); cudaEvent_t done = nullptr; if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _rowElementMultiplyWith<<>>(this->m_pArray,a.m_pArray,N,M); + _rowElementMultiplyWith<<>>(this->m_pArray,a.m_pArray,N,M); if (do_sync) CUDA_CALL(cudaEventRecord(done)); if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); if (do_sync) CUDA_CALL(cudaEventDestroy(done)); @@ -1568,7 +1568,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { PrepareDevice(); cudaEvent_t done = nullptr; if (do_sync) 
diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu
index 9d690f0e4..692b059a3 100644
--- a/Math/Math/GPUMatrix.cu
+++ b/Math/Math/GPUMatrix.cu
@@ -324,22 +324,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         switch (kind)
         {
         case 0:
-            _inplaceSigmoidOnCuda<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
+            _inplaceSigmoidOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, N);
             break;
         case 1:
-            _inplaceTanhOnCuda<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
+            _inplaceTanhOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, N);
             break;
         case 2:
-            _inplaceSqrtOnCuda<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
+            _inplaceSqrtOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, N);
             break;
         case 3:
-            _inplaceExpOnCuda<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
+            _inplaceExpOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
             break;
         case 4:
-            _inplaceLogOnCuda<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
+            _inplaceLogOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
             break;
         case 5:
-            _inplaceAbsOnCuda<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
+            _inplaceAbsOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
             break;
         case 6:
             _inplaceLinRectDerivative<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
@@ -1205,7 +1205,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     //    int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);
     //    cudaEvent_t done = nullptr;
     //    if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-    //    _addValue<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,N);
+    //    _addValue<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,N);
     //    if (do_sync) CUDA_CALL(cudaEventRecord(done));
     //    if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
     //    if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1458,7 +1458,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _addElementProductOf<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
+        _addElementProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1480,7 +1480,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _columnElementMultiplyWith<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N,M);
+        _columnElementMultiplyWith<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N,M);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1503,7 +1503,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _rowElementMultiplyWith<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N,M);
+        _rowElementMultiplyWith<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N,M);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1568,7 +1568,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _elemInverse<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
+        _elemInverse<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1825,7 +1825,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _inplaceTruncateBottom<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
+        _inplaceTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1848,7 +1848,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignTruncateBottom<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);
+        _assignTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1866,7 +1866,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _inplaceTruncateTop<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
+        _inplaceTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1889,7 +1889,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignTruncateTop<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);
+        _assignTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1906,7 +1906,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _setToZeroIfAbsLessThan<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
+        _setToZeroIfAbsLessThan<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1964,7 +1964,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
         //WARNING: THIS kernel is not the most efficient way!
-        _reductionSumAndAssign<<<1,1024>>>(this->m_pArray,a.m_pArray,(LONG64)a.GetNumElements(),(LONG64)this->GetNumElements());
+        _reductionSumAndAssign<ElemType><<<1,1024>>>(this->m_pArray,a.m_pArray,(LONG64)a.GetNumElements(),(LONG64)this->GetNumElements());
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2026,7 +2026,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _elemMul<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N);
+        _elemMul<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2049,7 +2049,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignElementProductOf<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
+        _assignElementProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2078,7 +2078,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignElementDivisionOf<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
+        _assignElementDivisionOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2119,7 +2119,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
 
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _vectorNorm1<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);
+        _vectorNorm1<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2160,7 +2160,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
 
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _vectorNorm2<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);
+        _vectorNorm2<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2219,7 +2219,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignKhatriRaoProductOf<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,rowsA, rowsB, cols);
+        _assignKhatriRaoProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,rowsA, rowsB, cols);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2257,7 +2257,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _addColumnReshapeProductOf<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray, rowsB, rowsC, cols, transposeAColumn);
+        _addColumnReshapeProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray, rowsB, rowsC, cols, transposeAColumn);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2360,7 +2360,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         cudaEvent_t done = nullptr;
         int blocksPerGrid=(int)ceil(1.0*this->GetNumElements()/threadsPerBlock);
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignSignOf<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (long)this->GetNumElements());
+        _assignSignOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (long)this->GetNumElements());
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2380,7 +2380,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         cudaEvent_t done = nullptr;
         int blocksPerGrid=(int)ceil(1.0*this->GetNumElements()/threadsPerBlock);
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _addSignOf<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (LONG64)this->GetNumElements());
+        _addSignOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (LONG64)this->GetNumElements());
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2473,8 +2473,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         cudaEvent_t done = nullptr;
         //int blocksPerGrid=(int)ceil(1.0*a.GetNumElements()/threadsPerBlock);
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        //_assignNumOfDiff<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, a.GetNumElements());
-        _assignNumOfDiff<<<1,1024,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, (LONG64)a.GetNumElements());
+        //_assignNumOfDiff<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, a.GetNumElements());
+        _assignNumOfDiff<ElemType><<<1,1024,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, (LONG64)a.GetNumElements());
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2926,7 +2926,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         LONG64 n=(LONG64)a.GetNumElements();
         int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _addScaledDifference<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);
+        _addScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2967,7 +2967,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         LONG64 n=(LONG64)a.GetNumElements();
         int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignScaledDifference<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);
+        _assignScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -3011,7 +3011,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         LONG64 n=(LONG64)a.GetNumElements();
         int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _addScaledDifference<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);
+        _addScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -3055,7 +3055,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         LONG64 n=(LONG64)a.GetNumElements();
         int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignScaledDifference<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);
+        _assignScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -3074,7 +3074,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         cudaEvent_t done = nullptr;
         int blocksPerGrid=1; //only one element
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _addElementToElement<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, (LONG64)a.LocateElement(ai, aj), c.m_pArray, (LONG64)c.LocateElement(ci, cj));
+        _addElementToElement<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, (LONG64)a.LocateElement(ai, aj), c.m_pArray, (LONG64)c.LocateElement(ci, cj));
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -3195,7 +3195,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
 
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _innerProduct<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, a.m_pArray,b.m_pArray,m,n,isColWise);
+        _innerProduct<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, a.m_pArray,b.m_pArray,m,n,isColWise);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -3288,7 +3288,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
         LONG64 N=(LONG64)a.GetNumElements();
         int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);
-        _elementWisePowerOnCuda<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha,a.m_pArray,c.m_pArray,N);
+        _elementWisePowerOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha,a.m_pArray,c.m_pArray,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
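Every hunk in this file touches the same launch-and-synchronize idiom, so it is worth seeing once in isolation. The sketch below distills it under stated assumptions: _scaleKernel, CHECK_CUDA and the 1024-thread block size are illustrative stand-ins for the file's kernels, its CUDA_CALL macro and its threadsPerBlock constant.

    // Distilled form of the recurring pattern above: size the grid so every
    // element gets a thread, launch on t_stream, then optionally block the
    // host on a CUDA event until the kernel finishes.
    #include <cuda_runtime.h>
    #include <math.h>

    #define CHECK_CUDA(expr) do { cudaError_t e_ = (expr); if (e_ != cudaSuccess) throw e_; } while (0)
    static const int threadsPerBlock = 1024;   // illustrative block size

    template <class ElemType>
    __global__ void _scaleKernel(ElemType* p, ElemType alpha, long N)
    {
        long id = blockDim.x * blockIdx.x + threadIdx.x;
        if (id < N)
            p[id] *= alpha;                    // one element per thread
    }

    template <class ElemType>
    void ScaleOnDevice(ElemType* d_p, ElemType alpha, long N, cudaStream_t t_stream, bool do_sync)
    {
        int blocksPerGrid = (int)ceil(1.0 * N / threadsPerBlock);   // round up to cover all N elements
        cudaEvent_t done = nullptr;
        if (do_sync) CHECK_CUDA(cudaEventCreate(&done));
        _scaleKernel<ElemType><<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(d_p, alpha, N);
        if (do_sync) CHECK_CUDA(cudaEventRecord(done));        // enqueue a marker after the kernel
        if (do_sync) CHECK_CUDA(cudaEventSynchronize(done));   // host waits for the marker
        if (do_sync) CHECK_CUDA(cudaEventDestroy(done));
    }

With do_sync off, the launch is fully asynchronous on t_stream; the event dance exists purely so callers can opt into blocking semantics without a device-wide cudaDeviceSynchronize.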
diff --git a/Math/Math/GPUMatrixCUDAKernels.cu b/Math/Math/GPUMatrixCUDAKernels.cu
index 55af99884..7a116df3a 100644
--- a/Math/Math/GPUMatrixCUDAKernels.cu
+++ b/Math/Math/GPUMatrixCUDAKernels.cu
@@ -3,6 +3,7 @@ //
 // Copyright (c) Microsoft Corporation. All rights reserved.
 //
 //
+#ifndef CPUONLY
 #include <float.h>
 #include <cuda_runtime.h>
 #include "CommonMatrix.h"
@@ -3235,3 +3236,5 @@ d_tmp[0] = max((ElemType)0, d_tmp[0]/max((ElemType)1.0e-10,sqrt(d_tmp[1]))/max((ElemType)1.0e-10,sqrt(d_tmp[2])));
     }
 }
 */
+
+#endif /*!CPUONLY*/
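Note the guard spelling: the project define is CPUONLY, so the guard here must use the same token (the originally submitted CPU_ONLY would never be defined and the kernels would still be compiled). The sparse-matrix hunks below repeatedly exercise threshold kernels such as _inplaceTruncateBottom; the sketch shows the per-element semantics those names plausibly imply, inferred from the call sites rather than taken from GPUMatrixCUDAKernels.cu itself:

    // Plausible per-element behavior of _inplaceTruncateBottom, inferred from
    // its name and its (array, threshold, N) call sites; illustration only.
    template <class ElemType>
    __global__ void _inplaceTruncateBottomSketch(ElemType* a, const ElemType threshold, const long N)
    {
        long id = blockDim.x * blockIdx.x + threadIdx.x;
        if (id >= N)
            return;
        if (a[id] < threshold)   // clamp from below; a TruncateTop kernel would clamp from above
            a[id] = threshold;
    }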
diff --git a/Math/Math/GPUSparseMatrix.cu b/Math/Math/GPUSparseMatrix.cu
index e0e908292..1882798dc 100644
--- a/Math/Math/GPUSparseMatrix.cu
+++ b/Math/Math/GPUSparseMatrix.cu
@@ -909,7 +909,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
         cudaEvent_t done = nullptr;
         CUDACALL(cudaEventCreate(&done));
-        _inplaceTruncate<<<blocksPerGrid,threadsPerBlock>>>(this->m_blockVal,threshold,N);
+        _inplaceTruncate<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_blockVal,threshold,N);
         CUDACALL(cudaEventRecord(done));
         CUDACALL(cudaEventSynchronize(done));
         CUDACALL(cudaEventDestroy(done));
@@ -1310,7 +1310,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         long N=(long)a.GetNZElements();
         int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);
-        _elementWisePowerOnCuda<<<blocksPerGrid,threadsPerBlock>>>(alpha,a.NzLocation(),c.NzLocation(),N);
+        _elementWisePowerOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(alpha,a.NzLocation(),c.NzLocation(),N);
         CUDACALL(cudaEventRecord(done));
         CUDACALL(cudaEventSynchronize(done));
     }
@@ -1360,7 +1360,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         //int* h_vectArray= new int[a.m_nz];
         int blocksPerGrid =(int)ceil(1.0*M/threadsPerBlock);
         CUDACALL(cudaEventCreate(&done));
-        _getSparseVectorRepresntationForMatrix<<<blocksPerGrid,threadsPerBlock>>>(cscColPtrA,cscRowIndA,vectArray,M,N);
+        _getSparseVectorRepresntationForMatrix<ElemType><<<blocksPerGrid,threadsPerBlock>>>(cscColPtrA,cscRowIndA,vectArray,M,N);
         CUDACALL(cudaEventRecord(done));
         CUDACALL(cudaEventSynchronize(done));
         CUDACALL(cudaEventDestroy(done));
@@ -1411,7 +1411,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         CUDACALL(cudaMemcpy(d_res,res,sizeof(long)*3,cudaMemcpyHostToDevice));
 
         int blocksPerGrid =(int)ceil(1.0*a.GetNZElements()/threadsPerBlock);
-        _areEqual<<<blocksPerGrid,threadsPerBlock>>>(a.NzLocation(),b.NzLocation(),(long)a.GetNZElements(),threshold,d_res);
+        _areEqual<ElemType><<<blocksPerGrid,threadsPerBlock>>>(a.NzLocation(),b.NzLocation(),(long)a.GetNZElements(),threshold,d_res);
         _areEqual<<<blocksPerGrid,threadsPerBlock>>>(a.ColLocation(),b.ColLocation(),(long)a.GetNZElements(),(int)threshold,d_res+1);
         blocksPerGrid =(int)ceil((1.0*a.GetNumRows()+1.0)/threadsPerBlock);
         _areEqual<<<blocksPerGrid,threadsPerBlock>>>(a.RowLocation(),b.RowLocation(),(long)a.GetNumRows()+1,(int)threshold,d_res+2);
@@ -1719,7 +1719,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);
         cudaEvent_t done = nullptr;
         CUDACALL(cudaEventCreate(&done));
-        _elemInverse<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
+        _elemInverse<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
         CUDACALL(cudaEventRecord(done));
         CUDACALL(cudaEventSynchronize(done));
         return *this;
@@ -1846,7 +1846,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
         cudaEvent_t done = nullptr;
         CUDACALL(cudaEventCreate(&done));
-        _inplaceTruncateBottom<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
+        _inplaceTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
         CUDACALL(cudaEventRecord(done));
         CUDACALL(cudaEventSynchronize(done));
         return *this;
@@ -1867,7 +1867,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
         cudaEvent_t done = nullptr;
         CUDACALL(cudaEventCreate(&done));
-        _assignTruncateBottom<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);
+        _assignTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);
         CUDACALL(cudaEventRecord(done));
         CUDACALL(cudaEventSynchronize(done));
         return *this;
@@ -1882,7 +1882,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
         cudaEvent_t done = nullptr;
         CUDACALL(cudaEventCreate(&done));
-        _inplaceTruncateTop<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
+        _inplaceTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
         CUDACALL(cudaEventRecord(done));
         CUDACALL(cudaEventSynchronize(done));
         return *this;
@@ -1903,7 +1903,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
         cudaEvent_t done = nullptr;
         CUDACALL(cudaEventCreate(&done));
-        _assignTruncateTop<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);
+        _assignTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);
         CUDACALL(cudaEventRecord(done));
         CUDACALL(cudaEventSynchronize(done));
         return *this;
@@ -1918,7 +1918,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
         cudaEvent_t done = nullptr;
         CUDACALL(cudaEventCreate(&done));
-        _setToZeroIfAbsLessThan<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
+        _setToZeroIfAbsLessThan<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
         CUDACALL(cudaEventRecord(done));
         CUDACALL(cudaEventSynchronize(done));
         return *this;
@@ -2012,22 +2012,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         switch (kind)
         {
         case 0:
-            _inplaceSigmoidOnCuda<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
+            _inplaceSigmoidOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
             break;
         case 1:
-            _inplaceTanhOnCuda<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
+            _inplaceTanhOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
             break;
         case 2:
-            _inplaceSqrtOnCuda<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
+            _inplaceSqrtOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
             break;
         case 3:
-            _inplaceExpOnCuda<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
+            _inplaceExpOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
             break;
         case 4:
-            _inplaceLogOnCuda<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
+            _inplaceLogOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
             break;
         case 5:
-            _inplaceAbsOnCuda<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
+            _inplaceAbsOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
             break;
         case 6:
             _inplaceLinRectDerivative<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
diff --git a/Math/Math/Math.vcxproj.filters b/Math/Math/Math.vcxproj.filters
new file mode 100644
index 000000000..4846433c6
--- /dev/null
+++ b/Math/Math/Math.vcxproj.filters
@@ -0,0 +1,77 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="Source Files">
+      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+    </Filter>
+    <Filter Include="Header Files">
+      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+      <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
+    </Filter>
+    <Filter Include="Resource Files">
+      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <!-- seven ClInclude items, each filtered under "Header Files" -->
+  </ItemGroup>
+  <ItemGroup>
+    <!-- eight ClCompile items, each filtered under "Source Files" -->
+  </ItemGroup>
+  <ItemGroup>
+    <!-- three further items, each filtered under "Header Files" -->
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp
index cdbd2526d..eecc95d96 100644
--- a/Math/Math/Matrix.cpp
+++ b/Math/Math/Matrix.cpp
@@ -288,15 +288,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     {
         if (m_preferredDeviceId == CPUDEVICE)
         {
-            m_CPUMatrix = new CPUMatrix<ElemType>(numRows,numCols);
+            m_CPUMatrix = new CPUMatrix<ElemType>(numRows,numCols);
             SetDataLocation(CPU, DENSE);
-            }
-            else
-            {
-                m_GPUMatrix = new GPUMatrix<ElemType>(numRows,numCols,m_preferredDeviceId);
-                SetDataLocation(GPU, DENSE);
-            }
-        }
+        }
+        else
+        {
+            m_GPUMatrix = new GPUMatrix<ElemType>(numRows,numCols,m_preferredDeviceId);
+            SetDataLocation(GPU, DENSE);
+        }
+    }
 }
 
 template<class ElemType>
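The hunk above re-indents the central dispatch rule of the Matrix facade: a preferred device id of CPUDEVICE allocates a CPUMatrix, anything else a GPUMatrix bound to that device. A standalone sketch of that rule; CpuImpl, GpuImpl and MatrixFacade are hypothetical stand-ins, and -1 mirrors the CPUDEVICE sentinel from CommonMatrix.h (an assumption about its value):

    // Standalone sketch of the CPU/GPU dispatch shown in the hunk above.
    #include <cstddef>
    #include <memory>

    struct CpuImpl { CpuImpl(std::size_t, std::size_t) {} };
    struct GpuImpl { GpuImpl(std::size_t, std::size_t, int /*deviceId*/) {} };

    struct MatrixFacade
    {
        std::unique_ptr<CpuImpl> cpu;   // exactly one of the two is allocated
        std::unique_ptr<GpuImpl> gpu;

        MatrixFacade(std::size_t rows, std::size_t cols, int preferredDeviceId)
        {
            if (preferredDeviceId == -1)            // CPUDEVICE: stay on the host
                cpu.reset(new CpuImpl(rows, cols));
            else                                    // any non-negative id names a GPU
                gpu.reset(new GpuImpl(rows, cols, preferredDeviceId));
        }
    };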
@@ -840,11 +840,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             m_CPUMatrix->SetValue(*db_number.ExposePointer2Value()),
             if (GetDeviceId()!=db_number.GetDeviceId())
             {
-#ifndef LINUX
-                throw std::exception("Matrix and device bound number must be on the same device");
-#else
-                throw std::exception();
-#endif /* LINUX */
+                throw std::runtime_error("Matrix and device bound number must be on the same device");
             }
             m_GPUMatrix->SetValue(db_number.ExposePointer2Value()),
             NOT_IMPLEMENTED,