Modify port code to support CPUONLY build

To use this, define CPUONLY in the CN project,
then remove the *.cu files from the Math project and add GPUDummy.cpp in their place.
This allows Cygwin to be used to compile both the Windows and Linux builds on the same machine.
This commit is contained in:
adame 2014-10-20 14:50:30 -07:00 коммит произвёл unknown
Родитель a863c7746f
Коммит ff72d5696f
13 изменённых файлов: 1911 добавлений и 311 удалений

Просмотреть файл

@ -6,14 +6,28 @@
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#include "BestGpu.h"
#include "CommonMatrix.h" // for CPUDEVICE and AUTOPLACEMATRIX
#ifdef CPUONLY
namespace Microsoft { namespace MSR { namespace CNTK {

// CPUONLY definition of DeviceFromConfig().
//   config - configuration parameters; not consulted in this build.
//   returns CPUDEVICE unconditionally.
short DeviceFromConfig(const ConfigParameters& config)
{
    return CPUDEVICE;
}

}}}
#else
// CUDA-C includes
#include <cuda.h>
#include <windows.h>
#include <delayimp.h>
#include <Shlobj.h>
#include <stdio.h>
#include "BestGpu.h"
#include "CommonMatrix.h" // for CPUDEVICE and AUTOPLACEMATRIX
// The "notify hook" gets called for every call to the
// delay load helper. This allows a user to hook every call and
@ -507,4 +521,5 @@ void BestGpu::QueryNvmlData()
m_nvmlData = true;
return;
}
}}}
}}}
#endif

Просмотреть файл

@ -4,15 +4,18 @@
// </copyright>
//
#pragma once
#ifndef CPUONLY
#include <cuda_runtime.h>
#include <nvml.h>
#include <vector>
#endif
#include "commandArgUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {
short DeviceFromConfig(const ConfigParameters& config);
#ifndef CPUONLY
struct ProcessorData
{
int cores;
@ -68,5 +71,5 @@ public:
std::vector<int> GetDevices(int number=AllDevices, BestGpuFlags flags=bestGpuNormal); // get multiple devices
};
extern BestGpu* g_bestGpu;
#endif
}}}

Просмотреть файл

@ -13,7 +13,13 @@ typedef char16_t TCHAR;
#define vsprintf_s vsprintf /* Not sure this is right... Malcolm */
#include <chrono>
#include <thread>
#endif /* LINUX */
#include <cstdlib>
#include <cerrno>
#define Linux(a) a
#else
#include <tchar.h>
#endif /* LINUX */
#include <cmath> // for HUGE_VAL // Remove for a test by Malcolm because of double isnan definition...
#ifndef UNDER_CE // fixed-buffer overloads not available for wince
#ifdef _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES // fixed-buffer overloads for strcpy() etc.
@ -80,11 +86,13 @@ OACR_WARNING_DISABLE(POTENTIAL_ARGUMENT_TYPE_MISMATCH, "Not level1 or level2_sec
#include <errno.h>
#include <string>
#include <vector>
#include <cmath> // for HUGE_VAL // potential double isnan definition
#include <math.h> // for HUGE_VAL // potential double isnan definition
#include <assert.h>
#include <stdarg.h>
#include <map>
#include <stdexcept>
#include <locale> // std::wstring_convert
#include <codecvt> // std::codecvt_utf8
#ifdef _MSC_VER
#include <windows.h> // for CRITICAL_SECTION and Unicode conversion functions --TODO: is there a portable alternative?
#endif
@ -578,6 +586,9 @@ typedef strfun::_strprintf<wchar_t> wstrprintf; // wchar_t version
#ifdef _WIN32
struct utf8 : std::string { utf8 (const std::wstring & p) // utf-16 to -8
{
//TODO: confirm it builds on VS2013
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> cv;
(*(std::string*)this) = cv.to_bytes(p);
#ifdef MALCOLM
size_t len = p.length();
if (len == 0) { return;} // empty string
@ -592,16 +603,19 @@ struct utf8 : std::string { utf8 (const std::wstring & p) // utf-16 to -8
}};
struct utf16 : std::wstring { utf16 (const std::string & p) // utf-8 to -16
{
#ifdef MALCOLM
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> cv;
(*(std::wstring*)this) = cv.from_bytes(p);
#ifdef OLD
size_t len = p.length();
if (len == 0) { return;} // empty string
msra::basetypes::fixed_vector<wchar_t> buf (len + 1);
// ... TODO: this fill() should be unnecessary (a 0 is appended)--but verify
std::fill (buf.begin (), buf.end (), (wchar_t) 0);
int rc = MultiByteToWideChar (CP_UTF8, 0, p.c_str(), (int) len,
&buf[0], (int) buf.size());
if (rc == 0) throw std::runtime_error ("MultiByteToWideChar");
ASSERT (rc < buf.size ());
std::fill(buf.begin(), buf.end(), (wchar_t)0);
int rc = MultiByteToWideChar(CP_UTF8, 0, p.c_str(), (int)len,
&buf[0], (int)buf.size());
if (rc == 0) throw std::runtime_error("MultiByteToWideChar");
ASSERT(rc < buf.size());
(*(std::wstring*)this) = &buf[0];
#endif /* Malcolm */
}};
@ -641,12 +655,8 @@ static inline std::string wcstombs (const std::wstring & p) // output: MBCS
}
static inline std::wstring mbstowcs (const std::string & p) // input: MBCS
{
size_t len = p.length();
msra::basetypes::fixed_vector<wchar_t> buf (len + 1); // max: >1 mb chars => 1 wchar
std::fill (buf.begin (), buf.end (), (wchar_t) 0);
OACR_WARNING_SUPPRESS(UNSAFE_STRING_FUNCTION, "Reviewed OK. size checked. [rogeryu 2006/03/21]");
::mbstowcs (&buf[0], p.c_str(), len + 1);
return std::wstring (&buf[0]);
std::wstring ret = utf16(p);
return ret;
}
#pragma warning(pop)
@ -769,8 +779,6 @@ static inline FILE* _wfopen(const wchar_t * path, const wchar_t * mode) { return
namespace msra { namespace basetypes {
#ifdef MALCOLM
// FILE* with auto-close; use auto_file_ptr instead of FILE*.
// Warning: do not pass an auto_file_ptr to a function that calls fclose(),
// except for fclose() itself.
@ -789,7 +797,7 @@ public:
auto_file_ptr() : f (NULL) { }
~auto_file_ptr() { close(); }
auto_file_ptr (const char * path, const char * mode) { f = fopen (path, mode); if (f == NULL) openfailed (path); }
auto_file_ptr (const wchar_t * path, const char * mode) { f = _wfopen (path, msra::strfun::utf16 (mode).c_str()); if (f == NULL) openfailed (msra::strfun::utf8 (path)); }
auto_file_ptr (const wchar_t * wpath, const char * mode) {string path = msra::strfun::utf8(wpath); f = fopen (path.c_str(), mode); if (f == NULL) openfailed (path); }
FILE * operator= (FILE * other) { close(); f = other; return f; }
auto_file_ptr (FILE * other) : f (other) { }
operator FILE * () const { return f; }
@ -797,7 +805,6 @@ public:
void swap (auto_file_ptr & other) throw() { std::swap (f, other.f); }
};
inline int fclose (auto_file_ptr & af) { return af.fclose(); }
#endif /* MALCOLM */
#ifdef _MSC_VER
// auto-closing container for Win32 handles.

Просмотреть файл

@ -3,232 +3,19 @@
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
// $Log: /Speech_To_Speech_Translation/dbn/dbn/fileutil.h $
//
// 71 1/03/13 8:53p Kaisheny
// Asynchronous SGD using data pipe.
//
// 70 9/30/12 10:46a Fseide
// new optional parameter to fuptodate()--caller can now choose whether a
// missing input file, with target file present, will cause a failure or
// considers the target up-to-date
//
// 69 11/09/11 10:01 Fseide
// added a new overload for fgetfilelines() that returns an array of char*
// instead of strings, to avoid mem alloc
//
// 68 6/10/11 9:50 Fseide
// (fixed a missing 'inline')
//
// 67 6/10/11 9:49 Fseide
// new function fgetfilelines() for reading text files
//
// 66 6/09/11 15:18 Fseide
// added overloads to fexists() that accept STL strings
//
// 65 3/07/11 12:13 Fseide
// actually implemented unlinkOrDie() (was a dummy)
//
// 64 11/17/10 15:00 Fseide
// new function fuptodate();
// make_intermediate_dirs() moved to namespace msra::files (all new
// functions should be put in there)
//
// 63 11/15/10 7:04p Fseide
// added an overload for freadOrDie (vector) that takes size as a size_t
// instead of an int, to pleasr the x64 compiler
//
// 62 11/08/10 17:07 Fseide
// new function make_intermediate_dirs()
//
// 61 11/08/10 11:43 Fseide
// (minor cleanup)
//
// 60 2/05/09 19:06 Fseide
// fgetline() now returns a non-const pointer, because user may want to
// post-process the line, and the returned value is a user-specified
// buffer anyway
//
// 59 1/16/09 17:34 Fseide
// relpath() and splitpath() moved to fileutil.h
//
// 58 1/16/09 8:59 Fseide
// exported fskipspace()
//
// 57 1/15/09 7:38 Fseide
// some magic to unify fgetstring() for char and wchar_t to a single
// template function
//
// 56 1/15/09 7:26 Fseide
// corrected the #include order of basetypes.h
//
// 55 1/14/09 19:26 Fseide
// new functions fsetpos() and fgetpos();
// new fixed-buffer size overload for fgetstring() and fgettoken()
//
// 54 1/08/09 16:14 Fseide
// fopenOrDie() now supports "-" as the pathname, referring to stdin or
// stdout
//
// 53 1/08/09 15:32 Fseide
// new funtion expand_wildcards()
//
// 52 1/05/09 8:44 Fseide
// (added comments)
//
// 51 11/11/08 6:04p Qiluo
// recover the old fputstring functions
//
// 50 10/31/08 5:09p Qiluo
// remove banned APIs
//
// 49 7/17/08 7:22p V-spwang
// undid changes - back to version 47
//
// 47 6/24/08 19:03 Fseide
// added fgetwstring() and fputstring() for wstrings;
// added templates for freadOrDie() and fwriteOrDie() for STL vectors
//
// 46 6/18/08 11:41 Fseide
// added #pragma once
//
// 45 08-05-29 18:18 Llu
// fix the interface of fputwav
//
// 44 08-05-29 13:54 Llu
// add fputwav revise fgetwav using stl instead of short *
//
// 43 11/27/06 11:40 Fseide
// new methods fgetwfx() and fputwfx() for direct access to simple PCM WAV
// files
//
// 42 10/14/06 18:31 Fseide
// added char* version of fexists()
//
// 41 5/22/06 9:34 Fseide
// (experimental auto_file class checked in)
//
// 40 5/14/06 19:59 Fseide
// new function fsetmode()
//
// 39 3/29/06 15:36 Fseide
// changed to reading entire file instead of line-by-line, not changing
// newlines anymore
//
// 38 2/21/06 12:39p Kit
// Added filesize64 function
//
// 37 1/09/06 7:12p Rogeryu
// wide version of fgetline
//
// 36 12/19/05 21:52 Fseide
// fputfile() added in 8-bit string version
//
// 35 12/15/05 20:25 Fseide
// added getfiletime(), setfiletime(), and fputfile() for strings
//
// 34 9/27/05 12:22 Fseide
// added wstring version of renameOrDie()
//
// 33 9/22/05 12:26 Fseide
// new method fexists()
//
// 32 9/15/05 11:33 Fseide
// new version of fgetline() that avoids buffer allocations, since this
// seems very expensive esp. when reading a file line by line with
// fgetline()
//
// 31 9/05/05 4:57p F-xyzhao
// added #include <windows.h> for #include <mmreg.h> -- ugh
//
// 30 9/05/05 11:00 Fseide
// new method renameOrDie()
//
// 29 8/24/05 5:45p Kjchen
// merge changes in OneNote
//
// 28 8/19/05 17:56 Fseide
// extended WAVEHEADER with write() and update()
//
// 27 8/13/05 15:37 Fseide
// added new version of fgetline that takes a buffer
//
// 26 7/26/05 18:54 Fseide
// new functions fgetint24() and fputint24()
//
// 25 2/12/05 15:21 Fseide
// fgetdouble() and fputdouble() added
//
// 24 2/05/05 12:38 Fseide
// new methods fputfile(), fgetfile();
// new overload for filesize()
//
// 23 2/03/05 22:34 Fseide
// added new version of fgetline() that returns an STL string
//
// 22 5/31/04 10:06 Fseide
// new methods fseekOrDie(), ftellOrDie(), unlinkOrDie(), renameOrDie()
//
// 21 3/19/04 4:01p Fseide
// fwriteOrDie(): first argument changed to const
//
// 20 2/27/04 10:04a V-xlshi
//
// 19 2/19/04 3:45p V-xlshi
// fgetraw function is added.
//
// 18 2/19/04 1:49p V-xlshi
//
// 17 2/03/04 8:17p V-xlshi
//
// 16 2/03/04 6:20p V-xlshi
// WAVEHEADER.prepare() added
//
// 15 2/03/04 5:58p V-xlshi
// WAVEHEADER structure added
//
// 14 8/15/03 15:40 Fseide
// new method filesize()
//
// 13 8/13/03 21:06 Fseide
// new function fputbyte()
//
// 12 8/13/03 15:37 Fseide
// prototype of fOpenOrDie() Unicode version changed
//
// 11 8/07/03 22:04 Fseide
// fprintfOrDie() now really dies in case of error
//
// 10 03-07-30 12:06 I-rogery
// enable both unicode and non-unicode version
//
// 9 7/25/03 6:07p Fseide
// new functions fgetbyte() and fgetwav()
//
// 8 7/03/02 9:25p Fseide
// fcompareTag() now uses string type for both of its arguments (before,
// it used const char * for one of them)
//
// 7 6/10/02 3:14p Fseide
// new functions fgettoken(), fgetfloat_ascii(), fskipNewline()
//
// 6 6/07/02 7:26p Fseide
// new functions fcheckTag_ascii() and fgetint_ascii()
//
// 5 4/15/02 1:12p Fseide
// void fputstring (FILE * f, const TSTRING & str) and fpad() added
//
// 4 4/03/02 3:58p Fseide
// VSS keyword and copyright added
//
// F. Seide 5 Mar 2002
//
#pragma once
#ifndef _FILEUTIL_
#define _FILEUTIL_
#include <stdio.h>
#ifdef _WIN32
#define isfinite(x) _finite(x)
#define isnan(x) _isnan(x)
#endif
#ifdef __unix__
#include <sys/types.h>
#include <sys/stat.h>
#endif
#include <algorithm> // for std::find
#include <vector>
#include <map>
@ -695,6 +482,29 @@ namespace msra { namespace files {
// simple support for WAV file I/O
// ----------------------------------------------------------------------------
// define the header if we haven't seen it yet
#ifndef _WAVEFORMATEX_
#define _WAVEFORMATEX_
// Fallback declaration, compiled only when _WAVEFORMATEX_ has not been defined yet.
// Extended waveform format structure used for all non-PCM formats; this
// structure is common to all non-PCM formats.
// NOTE(review): DWORD is used below but, unlike WORD, is not typedef'd here;
// it must already be declared before this point -- confirm for non-Windows builds.
typedef unsigned short WORD;    // in case not defined yet (i.e. linux)
struct tWAVEFORMATEX
{
    WORD  wFormatTag;       // format type
    WORD  nChannels;        // number of channels (i.e. mono, stereo...)
    DWORD nSamplesPerSec;   // sample rate
    DWORD nAvgBytesPerSec;  // for buffer estimation
    WORD  nBlockAlign;      // block size of data
    WORD  wBitsPerSample;   // number of bits per sample of mono data
    WORD  cbSize;           // the count in bytes of the size of
                            // extra information (after cbSize)
};
typedef struct tWAVEFORMATEX WAVEFORMATEX;
typedef struct tWAVEFORMATEX *PWAVEFORMATEX;
#endif /* _WAVEFORMATEX_ */
typedef struct wavehder{
char riffchar[4];
unsigned int RiffLength;

Просмотреть файл

@ -298,7 +298,7 @@ size_t filesize (const wchar_t * pathname)
// filesize64(): determine size of the file in bytes (with pathname)
int64_t filesize64 (const wchar_t * pathname)
{
__stat64 fileinfo;
struct _stat64 fileinfo;
if (_wstat64 (pathname,&fileinfo) == -1)
return 0;
else
@ -1375,6 +1375,21 @@ vector<char*> msra::files::fgetfilelines (const wstring & path, vector<char> & b
bool getfiletime (const wstring & path, FILETIME & time)
{ // return file modification time, false if cannot be determined
struct _stat buf;
int result;
// Get data associated with "crt_stat.c":
result = _wstat(path.c_str(), &buf);
// Check if statistics are valid:
if( result != 0 )
{
return false;
}
(*(time_t*)(&time))= buf.st_mtime;
return true;
#ifdef OLD
WIN32_FIND_DATAW findFileData;
auto_handle hFind (FindFirstFileW (path.c_str(), &findFileData), ::FindClose);
if (hFind != INVALID_HANDLE_VALUE)
@ -1386,10 +1401,14 @@ bool getfiletime (const wstring & path, FILETIME & time)
{
return false;
}
#endif
}
void setfiletime (const wstring & path, const FILETIME & time)
{ // update the file modification time of an existing file
#ifdef LINUX
// Not implemented on Linux yet. Throw by value rather than `throw new ...`:
// a thrown pointer leaks the heap object and is not caught by handlers
// written for `const std::exception&`.
throw logic_error("setfiletime has not been converted to linux yet...");
#else
auto_handle h (CreateFileW (path.c_str(), FILE_WRITE_ATTRIBUTES,
FILE_SHARE_READ|FILE_SHARE_WRITE, NULL,
OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL));
@ -1402,6 +1421,7 @@ void setfiletime (const wstring & path, const FILETIME & time)
{
RuntimeError ("setfiletime: error setting file time information: %d", GetLastError());
}
#endif
}
#if 0

Просмотреть файл

@ -124,7 +124,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Delayimp.lib;nvml.lib;cudart.lib;cntkMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
<DelayLoadDLLs>CNTKMath.dll;nvml.dll</DelayLoadDLLs>
<DelayLoadDLLs>CNTKMath.dll;nvml.dll;nvcuda.dll</DelayLoadDLLs>
</Link>
<PostBuildEvent>
<Command>
@ -186,7 +186,7 @@
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>Delayimp.lib;nvml.lib;cudart.lib;cntkMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<DelayLoadDLLs>CNTKMath.dll;nvml.dll</DelayLoadDLLs>
<DelayLoadDLLs>CNTKMath.dll;nvml.dll;nvcuda.dll</DelayLoadDLLs>
</Link>
<PostBuildEvent>
<Command>copy $(SolutionDir)..\Common\PTask\bin\*.dll $(TargetDir)</Command>

Просмотреть файл

@ -21,13 +21,15 @@
#ifndef LINUX
#include <Windows.h>
#define Linux(x)
#else
#define Linux(x) x
#ifndef max
#define max(a,b) (((a) > (b)) ? (a) : (b))
#endif
#include <values.h>
#include <cfloat>
#endif /* LINUX */
#ifdef LEAKDETECT

1667
Math/Math/GPUDummy.cpp Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -324,22 +324,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
switch (kind)
{
case 0:
_inplaceSigmoidOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
_inplaceSigmoidOnCuda<ElemType><<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(this->m_pArray, N);
break;
case 1:
_inplaceTanhOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
_inplaceTanhOnCuda<ElemType><<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(this->m_pArray, N);
break;
case 2:
_inplaceSqrtOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
_inplaceSqrtOnCuda<ElemType><<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(this->m_pArray, N);
break;
case 3:
_inplaceExpOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
_inplaceExpOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
break;
case 4:
_inplaceLogOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
_inplaceLogOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
break;
case 5:
_inplaceAbsOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
_inplaceAbsOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
break;
case 6:
_inplaceLinRectDerivative<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
@ -1205,7 +1205,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);
// cudaEvent_t done = nullptr;
// if (do_sync) CUDA_CALL(cudaEventCreate(&done));
// _addValue<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,N);
// _addValue<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,N);
// if (do_sync) CUDA_CALL(cudaEventRecord(done));
// if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
// if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -1458,7 +1458,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
a.PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_addElementProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
_addElementProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -1480,7 +1480,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
a.PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_columnElementMultiplyWith<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N,M);
_columnElementMultiplyWith<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N,M);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -1503,7 +1503,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
a.PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_rowElementMultiplyWith<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.m_pArray,N,M);
_rowElementMultiplyWith<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.m_pArray,N,M);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -1568,7 +1568,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_elemInverse<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
_elemInverse<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -1825,7 +1825,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_inplaceTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
_inplaceTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -1848,7 +1848,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_assignTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);
_assignTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -1866,7 +1866,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_inplaceTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
_inplaceTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -1889,7 +1889,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
a.PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_assignTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);
_assignTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -1906,7 +1906,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_setToZeroIfAbsLessThan<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
_setToZeroIfAbsLessThan<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -1964,7 +1964,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
//WARNING: THIS kernel is not the most efficient way!
_reductionSumAndAssign<ElemType><<<1,1024>>>(this->m_pArray,a.m_pArray,(LONG64)a.GetNumElements(),(LONG64)this->GetNumElements());
_reductionSumAndAssign<ElemType><<<1,1024>>>(this->m_pArray,a.m_pArray,(LONG64)a.GetNumElements(),(LONG64)this->GetNumElements());
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -2026,7 +2026,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
a.PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_elemMul<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N);
_elemMul<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -2049,7 +2049,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
a.PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_assignElementProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
_assignElementProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -2078,7 +2078,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
a.PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_assignElementDivisionOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
_assignElementDivisionOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -2119,7 +2119,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_vectorNorm1<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);
_vectorNorm1<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -2160,7 +2160,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_vectorNorm2<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);
_vectorNorm2<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -2219,7 +2219,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
a.PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_assignKhatriRaoProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,rowsA, rowsB, cols);
_assignKhatriRaoProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,rowsA, rowsB, cols);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -2257,7 +2257,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
a.PrepareDevice();
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_addColumnReshapeProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray, rowsB, rowsC, cols, transposeAColumn);
_addColumnReshapeProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray, rowsB, rowsC, cols, transposeAColumn);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -2360,7 +2360,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
cudaEvent_t done = nullptr;
int blocksPerGrid=(int)ceil(1.0*this->GetNumElements()/threadsPerBlock);
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_assignSignOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (long)this->GetNumElements());
_assignSignOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (long)this->GetNumElements());
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -2380,7 +2380,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
cudaEvent_t done = nullptr;
int blocksPerGrid=(int)ceil(1.0*this->GetNumElements()/threadsPerBlock);
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_addSignOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (LONG64)this->GetNumElements());
_addSignOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (LONG64)this->GetNumElements());
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -2473,8 +2473,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
cudaEvent_t done = nullptr;
//int blocksPerGrid=(int)ceil(1.0*a.GetNumElements()/threadsPerBlock);
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
//_assignNumOfDiff<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, a.GetNumElements());
_assignNumOfDiff<ElemType><<<1,1024,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, (LONG64)a.GetNumElements());
//_assignNumOfDiff<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, a.GetNumElements());
_assignNumOfDiff<ElemType><<<1,1024,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, (LONG64)a.GetNumElements());
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -2926,7 +2926,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
LONG64 n=(LONG64)a.GetNumElements();
int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_addScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);
_addScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -2967,7 +2967,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
LONG64 n=(LONG64)a.GetNumElements();
int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_assignScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);
_assignScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -3011,7 +3011,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
LONG64 n=(LONG64)a.GetNumElements();
int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_addScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);
_addScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -3055,7 +3055,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
LONG64 n=(LONG64)a.GetNumElements();
int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_assignScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);
_assignScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -3074,7 +3074,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
cudaEvent_t done = nullptr;
int blocksPerGrid=1; //only one element
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_addElementToElement<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, (LONG64)a.LocateElement(ai, aj), c.m_pArray, (LONG64)c.LocateElement(ci, cj));
_addElementToElement<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, (LONG64)a.LocateElement(ai, aj), c.m_pArray, (LONG64)c.LocateElement(ci, cj));
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -3195,7 +3195,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
_innerProduct<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, a.m_pArray,b.m_pArray,m,n,isColWise);
_innerProduct<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, a.m_pArray,b.m_pArray,m,n,isColWise);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -3288,7 +3288,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
LONG64 N=(LONG64)a.GetNumElements();
int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);
_elementWisePowerOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha,a.m_pArray,c.m_pArray,N);
_elementWisePowerOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha,a.m_pArray,c.m_pArray,N);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));

Просмотреть файл

@ -3,6 +3,7 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#ifndef CPUONLY // same macro tested by BestGpu.h/BestGpu.cpp; a CPUONLY build compiles out this whole file
#include <float.h>
#include <cuda_runtime.h>
#include "CommonMatrix.h"
@ -3235,3 +3236,5 @@ d_tmp[0] = max((ElemType)0, d_tmp[0]/max((ElemType)1.0e-10,sqrt(d_tmp[1]))/max((
}
}
*/
#endif /*!CPUONLY*/

Просмотреть файл

@ -909,7 +909,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
cudaEvent_t done = nullptr;
CUDACALL(cudaEventCreate(&done));
_inplaceTruncate<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_blockVal,threshold,N);
_inplaceTruncate<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_blockVal,threshold,N);
CUDACALL(cudaEventRecord(done));
CUDACALL(cudaEventSynchronize(done));
CUDACALL(cudaEventDestroy(done));
@ -1310,7 +1310,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
a.PrepareDevice();
long N=(long)a.GetNZElements();
int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);
_elementWisePowerOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(alpha,a.NzLocation(),c.NzLocation(),N);
_elementWisePowerOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(alpha,a.NzLocation(),c.NzLocation(),N);
CUDACALL(cudaEventRecord(done));
CUDACALL(cudaEventSynchronize(done));
}
@ -1360,7 +1360,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//int* h_vectArray= new int[a.m_nz];
int blocksPerGrid =(int)ceil(1.0*M/threadsPerBlock);
CUDACALL(cudaEventCreate(&done));
_getSparseVectorRepresntationForMatrix<ElemType><<<blocksPerGrid,threadsPerBlock>>>(cscColPtrA,cscRowIndA,vectArray,M,N);
_getSparseVectorRepresntationForMatrix<ElemType><<<blocksPerGrid,threadsPerBlock>>>(cscColPtrA,cscRowIndA,vectArray,M,N);
CUDACALL(cudaEventRecord(done));
CUDACALL(cudaEventSynchronize(done));
CUDACALL(cudaEventDestroy(done));
@ -1411,7 +1411,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CUDACALL(cudaMemcpy(d_res,res,sizeof(long)*3,cudaMemcpyHostToDevice));
int blocksPerGrid =(int)ceil(1.0*a.GetNZElements()/threadsPerBlock);
_areEqual<ElemType><<<blocksPerGrid,threadsPerBlock>>>(a.NzLocation(),b.NzLocation(),(long)a.GetNZElements(),threshold,d_res);
_areEqual<ElemType><<<blocksPerGrid,threadsPerBlock>>>(a.NzLocation(),b.NzLocation(),(long)a.GetNZElements(),threshold,d_res);
_areEqual<int><<<blocksPerGrid,threadsPerBlock>>>(a.ColLocation(),b.ColLocation(),(long)a.GetNZElements(),(int)threshold,d_res+1);
blocksPerGrid =(int)ceil((1.0*a.GetNumRows()+1.0)/threadsPerBlock);
_areEqual<int><<<blocksPerGrid,threadsPerBlock>>>(a.RowLocation(),b.RowLocation(),(long)a.GetNumRows()+1,(int)threshold,d_res+2);
@ -1719,7 +1719,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);
cudaEvent_t done = nullptr;
CUDACALL(cudaEventCreate(&done));
_elemInverse<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
_elemInverse<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
CUDACALL(cudaEventRecord(done));
CUDACALL(cudaEventSynchronize(done));
return *this;
@ -1846,7 +1846,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
cudaEvent_t done = nullptr;
CUDACALL(cudaEventCreate(&done));
_inplaceTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
_inplaceTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
CUDACALL(cudaEventRecord(done));
CUDACALL(cudaEventSynchronize(done));
return *this;
@ -1867,7 +1867,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
cudaEvent_t done = nullptr;
CUDACALL(cudaEventCreate(&done));
_assignTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);
_assignTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);
CUDACALL(cudaEventRecord(done));
CUDACALL(cudaEventSynchronize(done));
return *this;
@ -1882,7 +1882,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
cudaEvent_t done = nullptr;
CUDACALL(cudaEventCreate(&done));
_inplaceTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
_inplaceTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
CUDACALL(cudaEventRecord(done));
CUDACALL(cudaEventSynchronize(done));
return *this;
@ -1903,7 +1903,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
cudaEvent_t done = nullptr;
CUDACALL(cudaEventCreate(&done));
_assignTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);
_assignTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);
CUDACALL(cudaEventRecord(done));
CUDACALL(cudaEventSynchronize(done));
return *this;
@ -1918,7 +1918,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
cudaEvent_t done = nullptr;
CUDACALL(cudaEventCreate(&done));
_setToZeroIfAbsLessThan<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
_setToZeroIfAbsLessThan<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
CUDACALL(cudaEventRecord(done));
CUDACALL(cudaEventSynchronize(done));
return *this;
@ -2012,22 +2012,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
switch (kind)
{
case 0:
_inplaceSigmoidOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
_inplaceSigmoidOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
break;
case 1:
_inplaceTanhOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
_inplaceTanhOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
break;
case 2:
_inplaceSqrtOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
_inplaceSqrtOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
break;
case 3:
_inplaceExpOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
_inplaceExpOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
break;
case 4:
_inplaceLogOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
_inplaceLogOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
break;
case 5:
_inplaceAbsOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
_inplaceAbsOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
break;
case 6:
_inplaceLinRectDerivative<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);

Просмотреть файл

@ -0,0 +1,77 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<!-- Filter layout (Solution Explorer grouping) for a math project variant that lists
     GPUDummy.cpp and no *.cu sources. NOTE(review): presumably the CPUONLY build of the
     math library described in the commit message; confirm against the matching .vcxproj. -->
<!-- Filter definitions: the three virtual folders and the file extensions each one claims. -->
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
<Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<!-- C++ headers (ClInclude items), all placed in the "Header Files" filter. -->
<ItemGroup>
<ClInclude Include="stdafx.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="targetver.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="CPUMatrix.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="Matrix.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="Helpers.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="CommonMatrix.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="CPUSparseMatrix.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<!-- Compiled translation units (ClCompile items), all placed in "Source Files".
     File.cpp and fileutil.cpp are pulled in from ..\..\Common; GPUDummy.cpp is listed
     here and no *.cu file is. -->
<ItemGroup>
<ClCompile Include="stdafx.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="dllmain.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="CPUMatrix.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="Matrix.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\File.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="CPUSparseMatrix.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GPUDummy.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<!-- GPU .cuh headers are listed as None items (not ClInclude) and shown under "Header Files". -->
<ItemGroup>
<None Include="GPUMatrix.cuh">
<Filter>Header Files</Filter>
</None>
<None Include="GPUSparseMatrix.cuh">
<Filter>Header Files</Filter>
</None>
<None Include="GPUWatcher.cuh">
<Filter>Header Files</Filter>
</None>
</ItemGroup>
</Project>

Просмотреть файл

@ -288,15 +288,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (m_preferredDeviceId == CPUDEVICE)
{
m_CPUMatrix = new CPUMatrix<ElemType>(numRows,numCols);
m_CPUMatrix = new CPUMatrix<ElemType>(numRows,numCols);
SetDataLocation(CPU, DENSE);
}
else
{
m_GPUMatrix = new GPUMatrix<ElemType>(numRows,numCols,m_preferredDeviceId);
SetDataLocation(GPU, DENSE);
}
}
}
else
{
m_GPUMatrix = new GPUMatrix<ElemType>(numRows,numCols,m_preferredDeviceId);
SetDataLocation(GPU, DENSE);
}
}
}
template<class ElemType>
@ -840,11 +840,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_CPUMatrix->SetValue(*db_number.ExposePointer2Value()),
if (GetDeviceId()!=db_number.GetDeviceId())
{
#ifndef LINUX
throw std::exception("Matrix and device bound number must be on the same device");
#else
throw std::exception();
#endif /* LINUX */
throw std::runtime_error("Matrix and device bound number must be on the same device");
}
m_GPUMatrix->SetValue(db_number.ExposePointer2Value()),
NOT_IMPLEMENTED,