Modify port code to support CPUONLY build
To use this, define CPUONLY in the CN project, remove *.cu from the math project, and add GPUDummy.cpp instead. This allows Cygwin to be used to compile for both Windows and Linux on the same machine.
This commit is contained in:
Родитель
a863c7746f
Коммит
ff72d5696f
|
@ -6,14 +6,28 @@
|
|||
|
||||
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
|
||||
|
||||
#include "BestGpu.h"
|
||||
#include "CommonMatrix.h" // for CPUDEVICE and AUTOPLACEMATRIX
|
||||
|
||||
#ifdef CPUONLY
|
||||
namespace Microsoft {
|
||||
namespace MSR {
|
||||
namespace CNTK {
|
||||
// CPUONLY variant of DeviceFromConfig(): the `config` argument is not read
// and the function unconditionally returns CPUDEVICE.
short DeviceFromConfig(const ConfigParameters& config)
|
||||
{
|
||||
return CPUDEVICE; // no GPU selection is attempted in this branch
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
|
||||
// CUDA-C includes
|
||||
#include <cuda.h>
|
||||
#include <windows.h>
|
||||
#include <delayimp.h>
|
||||
#include <Shlobj.h>
|
||||
#include <stdio.h>
|
||||
#include "BestGpu.h"
|
||||
#include "CommonMatrix.h" // for CPUDEVICE and AUTOPLACEMATRIX
|
||||
|
||||
// The "notify hook" gets called for every call to the
|
||||
// delay load helper. This allows a user to hook every call and
|
||||
|
@ -507,4 +521,5 @@ void BestGpu::QueryNvmlData()
|
|||
m_nvmlData = true;
|
||||
return;
|
||||
}
|
||||
}}}
|
||||
}}}
|
||||
#endif
|
||||
|
|
|
@ -4,15 +4,18 @@
|
|||
// </copyright>
|
||||
//
|
||||
#pragma once
|
||||
#ifndef CPUONLY
|
||||
#include <cuda_runtime.h>
|
||||
#include <nvml.h>
|
||||
#include <vector>
|
||||
#endif
|
||||
#include "commandArgUtil.h"
|
||||
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
short DeviceFromConfig(const ConfigParameters& config);
|
||||
|
||||
#ifndef CPUONLY
|
||||
struct ProcessorData
|
||||
{
|
||||
int cores;
|
||||
|
@ -68,5 +71,5 @@ public:
|
|||
std::vector<int> GetDevices(int number=AllDevices, BestGpuFlags flags=bestGpuNormal); // get multiple devices
|
||||
};
|
||||
extern BestGpu* g_bestGpu;
|
||||
|
||||
#endif
|
||||
}}}
|
|
@ -13,7 +13,13 @@ typedef char16_t TCHAR;
|
|||
#define vsprintf_s vsprintf /* Not sure this is right... Malcolm */
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
#endif /* LINUX */
|
||||
#include <cstdlib>
|
||||
#include <cerrno>
|
||||
#define Linux(a) a
|
||||
#else
|
||||
#include <tchar.h>
|
||||
#endif /* LINUX */
|
||||
#include <cmath> // for HUGE_VAL // Remove for a test by Malcolm because of double isnan definition...
|
||||
|
||||
#ifndef UNDER_CE // fixed-buffer overloads not available for wince
|
||||
#ifdef _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES // fixed-buffer overloads for strcpy() etc.
|
||||
|
@ -80,11 +86,13 @@ OACR_WARNING_DISABLE(POTENTIAL_ARGUMENT_TYPE_MISMATCH, "Not level1 or level2_sec
|
|||
#include <errno.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <cmath> // for HUGE_VAL // potential double isnan definition
|
||||
#include <math.h> // for HUGE_VAL // potential double isnan definition
|
||||
#include <assert.h>
|
||||
#include <stdarg.h>
|
||||
#include <map>
|
||||
#include <stdexcept>
|
||||
#include <locale> // std::wstring_convert
|
||||
#include <codecvt> // std::codecvt_utf8
|
||||
#ifdef _MSC_VER
|
||||
#include <windows.h> // for CRITICAL_SECTION and Unicode conversion functions --TODO: is there a portable alternative?
|
||||
#endif
|
||||
|
@ -578,6 +586,9 @@ typedef strfun::_strprintf<wchar_t> wstrprintf; // wchar_t version
|
|||
#ifdef _WIN32
|
||||
struct utf8 : std::string { utf8 (const std::wstring & p) // utf-16 to -8
|
||||
{
|
||||
//TODO: confirm it builds on VS2013
|
||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> cv;
|
||||
(*(std::string*)this) = cv.to_bytes(p);
|
||||
#ifdef MALCOLM
|
||||
size_t len = p.length();
|
||||
if (len == 0) { return;} // empty string
|
||||
|
@ -592,16 +603,19 @@ struct utf8 : std::string { utf8 (const std::wstring & p) // utf-16 to -8
|
|||
}};
|
||||
struct utf16 : std::wstring { utf16 (const std::string & p) // utf-8 to -16
|
||||
{
|
||||
#ifdef MALCOLM
|
||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> cv;
|
||||
(*(std::wstring*)this) = cv.from_bytes(p);
|
||||
|
||||
#ifdef OLD
|
||||
size_t len = p.length();
|
||||
if (len == 0) { return;} // empty string
|
||||
msra::basetypes::fixed_vector<wchar_t> buf (len + 1);
|
||||
// ... TODO: this fill() should be unnecessary (a 0 is appended)--but verify
|
||||
std::fill (buf.begin (), buf.end (), (wchar_t) 0);
|
||||
int rc = MultiByteToWideChar (CP_UTF8, 0, p.c_str(), (int) len,
|
||||
&buf[0], (int) buf.size());
|
||||
if (rc == 0) throw std::runtime_error ("MultiByteToWideChar");
|
||||
ASSERT (rc < buf.size ());
|
||||
std::fill(buf.begin(), buf.end(), (wchar_t)0);
|
||||
int rc = MultiByteToWideChar(CP_UTF8, 0, p.c_str(), (int)len,
|
||||
&buf[0], (int)buf.size());
|
||||
if (rc == 0) throw std::runtime_error("MultiByteToWideChar");
|
||||
ASSERT(rc < buf.size());
|
||||
(*(std::wstring*)this) = &buf[0];
|
||||
#endif /* Malcolm */
|
||||
}};
|
||||
|
@ -641,12 +655,8 @@ static inline std::string wcstombs (const std::wstring & p) // output: MBCS
|
|||
}
|
||||
static inline std::wstring mbstowcs (const std::string & p) // input: MBCS
|
||||
{
|
||||
size_t len = p.length();
|
||||
msra::basetypes::fixed_vector<wchar_t> buf (len + 1); // max: >1 mb chars => 1 wchar
|
||||
std::fill (buf.begin (), buf.end (), (wchar_t) 0);
|
||||
OACR_WARNING_SUPPRESS(UNSAFE_STRING_FUNCTION, "Reviewed OK. size checked. [rogeryu 2006/03/21]");
|
||||
::mbstowcs (&buf[0], p.c_str(), len + 1);
|
||||
return std::wstring (&buf[0]);
|
||||
std::wstring ret = utf16(p);
|
||||
return ret;
|
||||
}
|
||||
#pragma warning(pop)
|
||||
|
||||
|
@ -769,8 +779,6 @@ static inline FILE* _wfopen(const wchar_t * path, const wchar_t * mode) { return
|
|||
|
||||
namespace msra { namespace basetypes {
|
||||
|
||||
#ifdef MALCOLM
|
||||
|
||||
// FILE* with auto-close; use auto_file_ptr instead of FILE*.
|
||||
// Warning: do not pass an auto_file_ptr to a function that calls fclose(),
|
||||
// except for fclose() itself.
|
||||
|
@ -789,7 +797,7 @@ public:
|
|||
auto_file_ptr() : f (NULL) { }
|
||||
~auto_file_ptr() { close(); }
|
||||
auto_file_ptr (const char * path, const char * mode) { f = fopen (path, mode); if (f == NULL) openfailed (path); }
|
||||
auto_file_ptr (const wchar_t * path, const char * mode) { f = _wfopen (path, msra::strfun::utf16 (mode).c_str()); if (f == NULL) openfailed (msra::strfun::utf8 (path)); }
|
||||
auto_file_ptr (const wchar_t * wpath, const char * mode) {string path = msra::strfun::utf8(wpath); f = fopen (path.c_str(), mode); if (f == NULL) openfailed (path); }
|
||||
FILE * operator= (FILE * other) { close(); f = other; return f; }
|
||||
auto_file_ptr (FILE * other) : f (other) { }
|
||||
operator FILE * () const { return f; }
|
||||
|
@ -797,7 +805,6 @@ public:
|
|||
void swap (auto_file_ptr & other) throw() { std::swap (f, other.f); }
|
||||
};
|
||||
inline int fclose (auto_file_ptr & af) { return af.fclose(); }
|
||||
#endif /* MALCOLM */
|
||||
|
||||
#ifdef _MSC_VER
|
||||
// auto-closing container for Win32 handles.
|
||||
|
|
|
@ -3,232 +3,19 @@
|
|||
//
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
//
|
||||
// $Log: /Speech_To_Speech_Translation/dbn/dbn/fileutil.h $
|
||||
//
|
||||
// 71 1/03/13 8:53p Kaisheny
|
||||
// Asynchronous SGD using data pipe.
|
||||
//
|
||||
// 70 9/30/12 10:46a Fseide
|
||||
// new optional parameter to fuptodate()--caller can now choose whether a
|
||||
// missing input file, with target file present, will cause a failure or
|
||||
// considers the target up-to-date
|
||||
//
|
||||
// 69 11/09/11 10:01 Fseide
|
||||
// added a new overload for fgetfilelines() that returns an array of char*
|
||||
// instead of strings, to avoid mem alloc
|
||||
//
|
||||
// 68 6/10/11 9:50 Fseide
|
||||
// (fixed a missing 'inline')
|
||||
//
|
||||
// 67 6/10/11 9:49 Fseide
|
||||
// new function fgetfilelines() for reading text files
|
||||
//
|
||||
// 66 6/09/11 15:18 Fseide
|
||||
// added overloads to fexists() that accept STL strings
|
||||
//
|
||||
// 65 3/07/11 12:13 Fseide
|
||||
// actually implemented unlinkOrDie() (was a dummy)
|
||||
//
|
||||
// 64 11/17/10 15:00 Fseide
|
||||
// new function fuptodate();
|
||||
// make_intermediate_dirs() moved to namespace msra::files (all new
|
||||
// functions should be put in there)
|
||||
//
|
||||
// 63 11/15/10 7:04p Fseide
|
||||
// added an overload for freadOrDie (vector) that takes size as a size_t
|
||||
// instead of an int, to please the x64 compiler
|
||||
//
|
||||
// 62 11/08/10 17:07 Fseide
|
||||
// new function make_intermediate_dirs()
|
||||
//
|
||||
// 61 11/08/10 11:43 Fseide
|
||||
// (minor cleanup)
|
||||
//
|
||||
// 60 2/05/09 19:06 Fseide
|
||||
// fgetline() now returns a non-const pointer, because user may want to
|
||||
// post-process the line, and the returned value is a user-specified
|
||||
// buffer anyway
|
||||
//
|
||||
// 59 1/16/09 17:34 Fseide
|
||||
// relpath() and splitpath() moved to fileutil.h
|
||||
//
|
||||
// 58 1/16/09 8:59 Fseide
|
||||
// exported fskipspace()
|
||||
//
|
||||
// 57 1/15/09 7:38 Fseide
|
||||
// some magic to unify fgetstring() for char and wchar_t to a single
|
||||
// template function
|
||||
//
|
||||
// 56 1/15/09 7:26 Fseide
|
||||
// corrected the #include order of basetypes.h
|
||||
//
|
||||
// 55 1/14/09 19:26 Fseide
|
||||
// new functions fsetpos() and fgetpos();
|
||||
// new fixed-buffer size overload for fgetstring() and fgettoken()
|
||||
//
|
||||
// 54 1/08/09 16:14 Fseide
|
||||
// fopenOrDie() now supports "-" as the pathname, referring to stdin or
|
||||
// stdout
|
||||
//
|
||||
// 53 1/08/09 15:32 Fseide
|
||||
// new function expand_wildcards()
|
||||
//
|
||||
// 52 1/05/09 8:44 Fseide
|
||||
// (added comments)
|
||||
//
|
||||
// 51 11/11/08 6:04p Qiluo
|
||||
// recover the old fputstring functions
|
||||
//
|
||||
// 50 10/31/08 5:09p Qiluo
|
||||
// remove banned APIs
|
||||
//
|
||||
// 49 7/17/08 7:22p V-spwang
|
||||
// undid changes - back to version 47
|
||||
//
|
||||
// 47 6/24/08 19:03 Fseide
|
||||
// added fgetwstring() and fputstring() for wstrings;
|
||||
// added templates for freadOrDie() and fwriteOrDie() for STL vectors
|
||||
//
|
||||
// 46 6/18/08 11:41 Fseide
|
||||
// added #pragma once
|
||||
//
|
||||
// 45 08-05-29 18:18 Llu
|
||||
// fix the interface of fputwav
|
||||
//
|
||||
// 44 08-05-29 13:54 Llu
|
||||
// add fputwav revise fgetwav using stl instead of short *
|
||||
//
|
||||
// 43 11/27/06 11:40 Fseide
|
||||
// new methods fgetwfx() and fputwfx() for direct access to simple PCM WAV
|
||||
// files
|
||||
//
|
||||
// 42 10/14/06 18:31 Fseide
|
||||
// added char* version of fexists()
|
||||
//
|
||||
// 41 5/22/06 9:34 Fseide
|
||||
// (experimental auto_file class checked in)
|
||||
//
|
||||
// 40 5/14/06 19:59 Fseide
|
||||
// new function fsetmode()
|
||||
//
|
||||
// 39 3/29/06 15:36 Fseide
|
||||
// changed to reading entire file instead of line-by-line, not changing
|
||||
// newlines anymore
|
||||
//
|
||||
// 38 2/21/06 12:39p Kit
|
||||
// Added filesize64 function
|
||||
//
|
||||
// 37 1/09/06 7:12p Rogeryu
|
||||
// wide version of fgetline
|
||||
//
|
||||
// 36 12/19/05 21:52 Fseide
|
||||
// fputfile() added in 8-bit string version
|
||||
//
|
||||
// 35 12/15/05 20:25 Fseide
|
||||
// added getfiletime(), setfiletime(), and fputfile() for strings
|
||||
//
|
||||
// 34 9/27/05 12:22 Fseide
|
||||
// added wstring version of renameOrDie()
|
||||
//
|
||||
// 33 9/22/05 12:26 Fseide
|
||||
// new method fexists()
|
||||
//
|
||||
// 32 9/15/05 11:33 Fseide
|
||||
// new version of fgetline() that avoids buffer allocations, since this
|
||||
// seems very expensive esp. when reading a file line by line with
|
||||
// fgetline()
|
||||
//
|
||||
// 31 9/05/05 4:57p F-xyzhao
|
||||
// added #include <windows.h> for #include <mmreg.h> -- ugh
|
||||
//
|
||||
// 30 9/05/05 11:00 Fseide
|
||||
// new method renameOrDie()
|
||||
//
|
||||
// 29 8/24/05 5:45p Kjchen
|
||||
// merge changes in OneNote
|
||||
//
|
||||
// 28 8/19/05 17:56 Fseide
|
||||
// extended WAVEHEADER with write() and update()
|
||||
//
|
||||
// 27 8/13/05 15:37 Fseide
|
||||
// added new version of fgetline that takes a buffer
|
||||
//
|
||||
// 26 7/26/05 18:54 Fseide
|
||||
// new functions fgetint24() and fputint24()
|
||||
//
|
||||
// 25 2/12/05 15:21 Fseide
|
||||
// fgetdouble() and fputdouble() added
|
||||
//
|
||||
// 24 2/05/05 12:38 Fseide
|
||||
// new methods fputfile(), fgetfile();
|
||||
// new overload for filesize()
|
||||
//
|
||||
// 23 2/03/05 22:34 Fseide
|
||||
// added new version of fgetline() that returns an STL string
|
||||
//
|
||||
// 22 5/31/04 10:06 Fseide
|
||||
// new methods fseekOrDie(), ftellOrDie(), unlinkOrDie(), renameOrDie()
|
||||
//
|
||||
// 21 3/19/04 4:01p Fseide
|
||||
// fwriteOrDie(): first argument changed to const
|
||||
//
|
||||
// 20 2/27/04 10:04a V-xlshi
|
||||
//
|
||||
// 19 2/19/04 3:45p V-xlshi
|
||||
// fgetraw function is added.
|
||||
//
|
||||
// 18 2/19/04 1:49p V-xlshi
|
||||
//
|
||||
// 17 2/03/04 8:17p V-xlshi
|
||||
//
|
||||
// 16 2/03/04 6:20p V-xlshi
|
||||
// WAVEHEADER.prepare() added
|
||||
//
|
||||
// 15 2/03/04 5:58p V-xlshi
|
||||
// WAVEHEADER structure added
|
||||
//
|
||||
// 14 8/15/03 15:40 Fseide
|
||||
// new method filesize()
|
||||
//
|
||||
// 13 8/13/03 21:06 Fseide
|
||||
// new function fputbyte()
|
||||
//
|
||||
// 12 8/13/03 15:37 Fseide
|
||||
// prototype of fOpenOrDie() Unicode version changed
|
||||
//
|
||||
// 11 8/07/03 22:04 Fseide
|
||||
// fprintfOrDie() now really dies in case of error
|
||||
//
|
||||
// 10 03-07-30 12:06 I-rogery
|
||||
// enable both unicode and non-unicode version
|
||||
//
|
||||
// 9 7/25/03 6:07p Fseide
|
||||
// new functions fgetbyte() and fgetwav()
|
||||
//
|
||||
// 8 7/03/02 9:25p Fseide
|
||||
// fcompareTag() now uses string type for both of its arguments (before,
|
||||
// it used const char * for one of them)
|
||||
//
|
||||
// 7 6/10/02 3:14p Fseide
|
||||
// new functions fgettoken(), fgetfloat_ascii(), fskipNewline()
|
||||
//
|
||||
// 6 6/07/02 7:26p Fseide
|
||||
// new functions fcheckTag_ascii() and fgetint_ascii()
|
||||
//
|
||||
// 5 4/15/02 1:12p Fseide
|
||||
// void fputstring (FILE * f, const TSTRING & str) and fpad() added
|
||||
//
|
||||
// 4 4/03/02 3:58p Fseide
|
||||
// VSS keyword and copyright added
|
||||
//
|
||||
// F. Seide 5 Mar 2002
|
||||
//
|
||||
|
||||
#pragma once
|
||||
#ifndef _FILEUTIL_
|
||||
#define _FILEUTIL_
|
||||
|
||||
#include <stdio.h>
|
||||
#ifdef _WIN32
|
||||
#define isfinite(x) _finite(x)
|
||||
#define isnan(x) _isnan(x)
|
||||
#endif
|
||||
#ifdef __unix__
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#endif
|
||||
#include <algorithm> // for std::find
|
||||
#include <vector>
|
||||
#include <map>
|
||||
|
@ -695,6 +482,29 @@ namespace msra { namespace files {
|
|||
// simple support for WAV file I/O
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
// define the header if we haven't seen it yet
|
||||
#ifndef _WAVEFORMATEX_
|
||||
#define _WAVEFORMATEX_
|
||||
|
||||
/*
|
||||
* extended waveform format structure used for all non-PCM formats. this
|
||||
* structure is common to all non-PCM formats.
|
||||
*/
|
||||
typedef unsigned short WORD; // in case not defined yet (e.g. on Linux)
// NOTE(review): DWORD is used by the struct below but is not typedef'd in this
// guard -- confirm it is defined wherever WORD is not (e.g. non-Windows builds).
|
||||
typedef struct tWAVEFORMATEX
|
||||
{
|
||||
WORD wFormatTag; /* format type */
|
||||
WORD nChannels; /* number of channels (i.e. mono, stereo...) */
|
||||
DWORD nSamplesPerSec; /* sample rate */
|
||||
DWORD nAvgBytesPerSec; /* for buffer estimation */
|
||||
WORD nBlockAlign; /* block size of data */
|
||||
WORD wBitsPerSample; /* number of bits per sample of mono data */
|
||||
WORD cbSize; /* the count in bytes of the size of */
|
||||
/* extra information (after cbSize) */
|
||||
} WAVEFORMATEX, *PWAVEFORMATEX;
|
||||
|
||||
#endif /* _WAVEFORMATEX_ */
|
||||
|
||||
typedef struct wavehder{
|
||||
char riffchar[4];
|
||||
unsigned int RiffLength;
|
||||
|
|
|
@ -298,7 +298,7 @@ size_t filesize (const wchar_t * pathname)
|
|||
// filesize64(): determine size of the file in bytes (with pathname)
|
||||
int64_t filesize64 (const wchar_t * pathname)
|
||||
{
|
||||
__stat64 fileinfo;
|
||||
struct _stat64 fileinfo;
|
||||
if (_wstat64 (pathname,&fileinfo) == -1)
|
||||
return 0;
|
||||
else
|
||||
|
@ -1375,6 +1375,21 @@ vector<char*> msra::files::fgetfilelines (const wstring & path, vector<char> & b
|
|||
|
||||
bool getfiletime (const wstring & path, FILETIME & time)
|
||||
{ // return file modification time, false if cannot be determined
|
||||
struct _stat buf;
|
||||
int result;
|
||||
|
||||
// Get file status data for the file named by path:
|
||||
result = _wstat(path.c_str(), &buf);
|
||||
// Check if statistics are valid:
|
||||
if( result != 0 )
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
(*(time_t*)(&time))= buf.st_mtime;
|
||||
return true;
|
||||
|
||||
#ifdef OLD
|
||||
WIN32_FIND_DATAW findFileData;
|
||||
auto_handle hFind (FindFirstFileW (path.c_str(), &findFileData), ::FindClose);
|
||||
if (hFind != INVALID_HANDLE_VALUE)
|
||||
|
@ -1386,10 +1401,14 @@ bool getfiletime (const wstring & path, FILETIME & time)
|
|||
{
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void setfiletime (const wstring & path, const FILETIME & time)
|
||||
{ // update the file modification time of an existing file
|
||||
#ifdef LINUX
|
||||
throw new logic_error("setfiletime has not been converted to linux yet...");
|
||||
#else
|
||||
auto_handle h (CreateFileW (path.c_str(), FILE_WRITE_ATTRIBUTES,
|
||||
FILE_SHARE_READ|FILE_SHARE_WRITE, NULL,
|
||||
OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL));
|
||||
|
@ -1402,6 +1421,7 @@ void setfiletime (const wstring & path, const FILETIME & time)
|
|||
{
|
||||
RuntimeError ("setfiletime: error setting file time information: %d", GetLastError());
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if 0
|
||||
|
|
|
@ -124,7 +124,7 @@
|
|||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<AdditionalDependencies>Delayimp.lib;nvml.lib;cudart.lib;cntkMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
|
||||
<DelayLoadDLLs>CNTKMath.dll;nvml.dll</DelayLoadDLLs>
|
||||
<DelayLoadDLLs>CNTKMath.dll;nvml.dll;nvcuda.dll</DelayLoadDLLs>
|
||||
</Link>
|
||||
<PostBuildEvent>
|
||||
<Command>
|
||||
|
@ -186,7 +186,7 @@
|
|||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<AdditionalDependencies>Delayimp.lib;nvml.lib;cudart.lib;cntkMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<Profile>true</Profile>
|
||||
<DelayLoadDLLs>CNTKMath.dll;nvml.dll</DelayLoadDLLs>
|
||||
<DelayLoadDLLs>CNTKMath.dll;nvml.dll;nvcuda.dll</DelayLoadDLLs>
|
||||
</Link>
|
||||
<PostBuildEvent>
|
||||
<Command>copy $(SolutionDir)..\Common\PTask\bin\*.dll $(TargetDir)</Command>
|
||||
|
|
|
@ -21,13 +21,15 @@
|
|||
|
||||
#ifndef LINUX
|
||||
#include <Windows.h>
|
||||
#define Linux(x)
|
||||
#else
|
||||
#define Linux(x) x
|
||||
|
||||
#ifndef max
|
||||
#define max(a,b) (((a) > (b)) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
#include <values.h>
|
||||
#include <cfloat>
|
||||
#endif /* LINUX */
|
||||
|
||||
#ifdef LEAKDETECT
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -324,22 +324,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
switch (kind)
|
||||
{
|
||||
case 0:
|
||||
_inplaceSigmoidOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
|
||||
_inplaceSigmoidOnCuda<ElemType><<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(this->m_pArray, N);
|
||||
break;
|
||||
case 1:
|
||||
_inplaceTanhOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
|
||||
_inplaceTanhOnCuda<ElemType><<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(this->m_pArray, N);
|
||||
break;
|
||||
case 2:
|
||||
_inplaceSqrtOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
|
||||
_inplaceSqrtOnCuda<ElemType><<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(this->m_pArray, N);
|
||||
break;
|
||||
case 3:
|
||||
_inplaceExpOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
|
||||
_inplaceExpOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
|
||||
break;
|
||||
case 4:
|
||||
_inplaceLogOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
|
||||
_inplaceLogOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
|
||||
break;
|
||||
case 5:
|
||||
_inplaceAbsOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
|
||||
_inplaceAbsOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
|
||||
break;
|
||||
case 6:
|
||||
_inplaceLinRectDerivative<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
|
||||
|
@ -1205,7 +1205,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
// int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);
|
||||
// cudaEvent_t done = nullptr;
|
||||
// if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
// _addValue<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,N);
|
||||
// _addValue<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,N);
|
||||
// if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
// if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
// if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -1458,7 +1458,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
a.PrepareDevice();
|
||||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_addElementProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
|
||||
_addElementProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -1480,7 +1480,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
a.PrepareDevice();
|
||||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_columnElementMultiplyWith<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N,M);
|
||||
_columnElementMultiplyWith<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N,M);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -1503,7 +1503,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
a.PrepareDevice();
|
||||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_rowElementMultiplyWith<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.m_pArray,N,M);
|
||||
_rowElementMultiplyWith<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.m_pArray,N,M);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -1568,7 +1568,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
PrepareDevice();
|
||||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_elemInverse<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
|
||||
_elemInverse<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -1825,7 +1825,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
PrepareDevice();
|
||||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_inplaceTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
|
||||
_inplaceTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -1848,7 +1848,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
PrepareDevice();
|
||||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_assignTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);
|
||||
_assignTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -1866,7 +1866,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
PrepareDevice();
|
||||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_inplaceTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
|
||||
_inplaceTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -1889,7 +1889,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
a.PrepareDevice();
|
||||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_assignTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);
|
||||
_assignTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -1906,7 +1906,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
PrepareDevice();
|
||||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_setToZeroIfAbsLessThan<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
|
||||
_setToZeroIfAbsLessThan<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -1964,7 +1964,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
//WARNING: THIS kernel is not the most efficient way!
|
||||
_reductionSumAndAssign<ElemType><<<1,1024>>>(this->m_pArray,a.m_pArray,(LONG64)a.GetNumElements(),(LONG64)this->GetNumElements());
|
||||
_reductionSumAndAssign<ElemType><<<1,1024>>>(this->m_pArray,a.m_pArray,(LONG64)a.GetNumElements(),(LONG64)this->GetNumElements());
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -2026,7 +2026,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
a.PrepareDevice();
|
||||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_elemMul<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N);
|
||||
_elemMul<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -2049,7 +2049,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
a.PrepareDevice();
|
||||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_assignElementProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
|
||||
_assignElementProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -2078,7 +2078,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
a.PrepareDevice();
|
||||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_assignElementDivisionOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
|
||||
_assignElementDivisionOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -2119,7 +2119,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_vectorNorm1<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);
|
||||
_vectorNorm1<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -2160,7 +2160,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_vectorNorm2<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);
|
||||
_vectorNorm2<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -2219,7 +2219,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
a.PrepareDevice();
|
||||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_assignKhatriRaoProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,rowsA, rowsB, cols);
|
||||
_assignKhatriRaoProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,rowsA, rowsB, cols);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -2257,7 +2257,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
a.PrepareDevice();
|
||||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_addColumnReshapeProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray, rowsB, rowsC, cols, transposeAColumn);
|
||||
_addColumnReshapeProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray, rowsB, rowsC, cols, transposeAColumn);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -2360,7 +2360,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
cudaEvent_t done = nullptr;
|
||||
int blocksPerGrid=(int)ceil(1.0*this->GetNumElements()/threadsPerBlock);
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_assignSignOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (long)this->GetNumElements());
|
||||
_assignSignOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (long)this->GetNumElements());
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -2380,7 +2380,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
cudaEvent_t done = nullptr;
|
||||
int blocksPerGrid=(int)ceil(1.0*this->GetNumElements()/threadsPerBlock);
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_addSignOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (LONG64)this->GetNumElements());
|
||||
_addSignOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (LONG64)this->GetNumElements());
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -2473,8 +2473,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
cudaEvent_t done = nullptr;
|
||||
//int blocksPerGrid=(int)ceil(1.0*a.GetNumElements()/threadsPerBlock);
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
//_assignNumOfDiff<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, a.GetNumElements());
|
||||
_assignNumOfDiff<ElemType><<<1,1024,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, (LONG64)a.GetNumElements());
|
||||
//_assignNumOfDiff<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, a.GetNumElements());
|
||||
_assignNumOfDiff<ElemType><<<1,1024,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, (LONG64)a.GetNumElements());
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -2926,7 +2926,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
LONG64 n=(LONG64)a.GetNumElements();
|
||||
int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_addScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);
|
||||
_addScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -2967,7 +2967,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
LONG64 n=(LONG64)a.GetNumElements();
|
||||
int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_assignScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);
|
||||
_assignScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -3011,7 +3011,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
LONG64 n=(LONG64)a.GetNumElements();
|
||||
int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_addScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);
|
||||
_addScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -3055,7 +3055,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
LONG64 n=(LONG64)a.GetNumElements();
|
||||
int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_assignScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);
|
||||
_assignScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -3074,7 +3074,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
cudaEvent_t done = nullptr;
|
||||
int blocksPerGrid=1; //only one element
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_addElementToElement<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, (LONG64)a.LocateElement(ai, aj), c.m_pArray, (LONG64)c.LocateElement(ci, cj));
|
||||
_addElementToElement<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, (LONG64)a.LocateElement(ai, aj), c.m_pArray, (LONG64)c.LocateElement(ci, cj));
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -3195,7 +3195,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
_innerProduct<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, a.m_pArray,b.m_pArray,m,n,isColWise);
|
||||
_innerProduct<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, a.m_pArray,b.m_pArray,m,n,isColWise);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -3288,7 +3288,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
LONG64 N=(LONG64)a.GetNumElements();
|
||||
int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);
|
||||
_elementWisePowerOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha,a.m_pArray,c.m_pArray,N);
|
||||
_elementWisePowerOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha,a.m_pArray,c.m_pArray,N);
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// </copyright>
|
||||
//
|
||||
// Exclude this whole file from CPU-only builds. The guard tests CPUONLY, the
// same macro the other guards introduced by this change test; the previous
// CPU_ONLY spelling did not match them, so defining CPUONLY had no effect here.
#ifndef CPUONLY
|
||||
#include <float.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include "CommonMatrix.h"
|
||||
|
@ -3235,3 +3236,5 @@ d_tmp[0] = max((ElemType)0, d_tmp[0]/max((ElemType)1.0e-10,sqrt(d_tmp[1]))/max((
|
|||
}
|
||||
}
|
||||
*/
|
||||
|
||||
#endif /*!CPUONLY*/
|
||||
|
|
|
@ -909,7 +909,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
|
||||
cudaEvent_t done = nullptr;
|
||||
CUDACALL(cudaEventCreate(&done));
|
||||
_inplaceTruncate<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_blockVal,threshold,N);
|
||||
_inplaceTruncate<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_blockVal,threshold,N);
|
||||
CUDACALL(cudaEventRecord(done));
|
||||
CUDACALL(cudaEventSynchronize(done));
|
||||
CUDACALL(cudaEventDestroy(done));
|
||||
|
@ -1310,7 +1310,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
a.PrepareDevice();
|
||||
long N=(long)a.GetNZElements();
|
||||
int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);
|
||||
_elementWisePowerOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(alpha,a.NzLocation(),c.NzLocation(),N);
|
||||
_elementWisePowerOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(alpha,a.NzLocation(),c.NzLocation(),N);
|
||||
CUDACALL(cudaEventRecord(done));
|
||||
CUDACALL(cudaEventSynchronize(done));
|
||||
}
|
||||
|
@ -1360,7 +1360,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
//int* h_vectArray= new int[a.m_nz];
|
||||
int blocksPerGrid =(int)ceil(1.0*M/threadsPerBlock);
|
||||
CUDACALL(cudaEventCreate(&done));
|
||||
_getSparseVectorRepresntationForMatrix<ElemType><<<blocksPerGrid,threadsPerBlock>>>(cscColPtrA,cscRowIndA,vectArray,M,N);
|
||||
_getSparseVectorRepresntationForMatrix<ElemType><<<blocksPerGrid,threadsPerBlock>>>(cscColPtrA,cscRowIndA,vectArray,M,N);
|
||||
CUDACALL(cudaEventRecord(done));
|
||||
CUDACALL(cudaEventSynchronize(done));
|
||||
CUDACALL(cudaEventDestroy(done));
|
||||
|
@ -1411,7 +1411,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
CUDACALL(cudaMemcpy(d_res,res,sizeof(long)*3,cudaMemcpyHostToDevice));
|
||||
|
||||
int blocksPerGrid =(int)ceil(1.0*a.GetNZElements()/threadsPerBlock);
|
||||
_areEqual<ElemType><<<blocksPerGrid,threadsPerBlock>>>(a.NzLocation(),b.NzLocation(),(long)a.GetNZElements(),threshold,d_res);
|
||||
_areEqual<ElemType><<<blocksPerGrid,threadsPerBlock>>>(a.NzLocation(),b.NzLocation(),(long)a.GetNZElements(),threshold,d_res);
|
||||
_areEqual<int><<<blocksPerGrid,threadsPerBlock>>>(a.ColLocation(),b.ColLocation(),(long)a.GetNZElements(),(int)threshold,d_res+1);
|
||||
blocksPerGrid =(int)ceil((1.0*a.GetNumRows()+1.0)/threadsPerBlock);
|
||||
_areEqual<int><<<blocksPerGrid,threadsPerBlock>>>(a.RowLocation(),b.RowLocation(),(long)a.GetNumRows()+1,(int)threshold,d_res+2);
|
||||
|
@ -1719,7 +1719,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);
|
||||
cudaEvent_t done = nullptr;
|
||||
CUDACALL(cudaEventCreate(&done));
|
||||
_elemInverse<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
|
||||
_elemInverse<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
|
||||
CUDACALL(cudaEventRecord(done));
|
||||
CUDACALL(cudaEventSynchronize(done));
|
||||
return *this;
|
||||
|
@ -1846,7 +1846,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
|
||||
cudaEvent_t done = nullptr;
|
||||
CUDACALL(cudaEventCreate(&done));
|
||||
_inplaceTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
|
||||
_inplaceTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
|
||||
CUDACALL(cudaEventRecord(done));
|
||||
CUDACALL(cudaEventSynchronize(done));
|
||||
return *this;
|
||||
|
@ -1867,7 +1867,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
|
||||
cudaEvent_t done = nullptr;
|
||||
CUDACALL(cudaEventCreate(&done));
|
||||
_assignTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);
|
||||
_assignTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);
|
||||
CUDACALL(cudaEventRecord(done));
|
||||
CUDACALL(cudaEventSynchronize(done));
|
||||
return *this;
|
||||
|
@ -1882,7 +1882,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
|
||||
cudaEvent_t done = nullptr;
|
||||
CUDACALL(cudaEventCreate(&done));
|
||||
_inplaceTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
|
||||
_inplaceTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
|
||||
CUDACALL(cudaEventRecord(done));
|
||||
CUDACALL(cudaEventSynchronize(done));
|
||||
return *this;
|
||||
|
@ -1903,7 +1903,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
|
||||
cudaEvent_t done = nullptr;
|
||||
CUDACALL(cudaEventCreate(&done));
|
||||
_assignTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);
|
||||
_assignTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);
|
||||
CUDACALL(cudaEventRecord(done));
|
||||
CUDACALL(cudaEventSynchronize(done));
|
||||
return *this;
|
||||
|
@ -1918,7 +1918,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
|
||||
cudaEvent_t done = nullptr;
|
||||
CUDACALL(cudaEventCreate(&done));
|
||||
_setToZeroIfAbsLessThan<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
|
||||
_setToZeroIfAbsLessThan<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
|
||||
CUDACALL(cudaEventRecord(done));
|
||||
CUDACALL(cudaEventSynchronize(done));
|
||||
return *this;
|
||||
|
@ -2012,22 +2012,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
switch (kind)
|
||||
{
|
||||
case 0:
|
||||
_inplaceSigmoidOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
|
||||
_inplaceSigmoidOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
|
||||
break;
|
||||
case 1:
|
||||
_inplaceTanhOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
|
||||
_inplaceTanhOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
|
||||
break;
|
||||
case 2:
|
||||
_inplaceSqrtOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
|
||||
_inplaceSqrtOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
|
||||
break;
|
||||
case 3:
|
||||
_inplaceExpOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
|
||||
_inplaceExpOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
|
||||
break;
|
||||
case 4:
|
||||
_inplaceLogOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
|
||||
_inplaceLogOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
|
||||
break;
|
||||
case 5:
|
||||
_inplaceAbsOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
|
||||
_inplaceAbsOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
|
||||
break;
|
||||
case 6:
|
||||
_inplaceLinRectDerivative<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
|
||||
|
|
|
@ -0,0 +1,77 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<!-- Filter definitions: declares the three filters ("Source Files", "Header Files",
     "Resource Files") that the item groups below refer to by name. Each filter carries
     a unique identifier and the list of file extensions associated with it. -->
<ItemGroup>
|
||||
<Filter Include="Source Files">
|
||||
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
|
||||
<Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
|
||||
</Filter>
|
||||
<Filter Include="Header Files">
|
||||
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
|
||||
<Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
|
||||
</Filter>
|
||||
<Filter Include="Resource Files">
|
||||
<UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
|
||||
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
|
||||
</Filter>
|
||||
</ItemGroup>
|
||||
<!-- Header files of the project (ClInclude items), all assigned to the
     "Header Files" filter. Only CPU-side and shared matrix headers appear here. -->
<ItemGroup>
|
||||
<ClInclude Include="stdafx.h">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="targetver.h">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="CPUMatrix.h">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="Matrix.h">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="Helpers.h">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="CommonMatrix.h">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="CPUSparseMatrix.h">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<!-- Source files of the project (ClCompile items), all assigned to the
     "Source Files" filter. The list contains GPUDummy.cpp and no .cu files, matching
     the CPU-only configuration this change describes; File.cpp and fileutil.cpp are
     referenced from ..\..\Common rather than from this project's directory. -->
<ItemGroup>
|
||||
<ClCompile Include="stdafx.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="dllmain.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="CPUMatrix.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="Matrix.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\Common\File.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\Common\fileutil.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="CPUSparseMatrix.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GPUDummy.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<!-- GPU .cuh headers are listed as None items (rather than ClInclude/ClCompile)
     and are grouped under the "Header Files" filter. -->
<ItemGroup>
|
||||
<None Include="GPUMatrix.cuh">
|
||||
<Filter>Header Files</Filter>
|
||||
</None>
|
||||
<None Include="GPUSparseMatrix.cuh">
|
||||
<Filter>Header Files</Filter>
|
||||
</None>
|
||||
<None Include="GPUWatcher.cuh">
|
||||
<Filter>Header Files</Filter>
|
||||
</None>
|
||||
</ItemGroup>
|
||||
</Project>
|
|
@ -288,15 +288,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
{
|
||||
if (m_preferredDeviceId == CPUDEVICE)
|
||||
{
|
||||
m_CPUMatrix = new CPUMatrix<ElemType>(numRows,numCols);
|
||||
m_CPUMatrix = new CPUMatrix<ElemType>(numRows,numCols);
|
||||
SetDataLocation(CPU, DENSE);
|
||||
}
|
||||
else
|
||||
{
|
||||
m_GPUMatrix = new GPUMatrix<ElemType>(numRows,numCols,m_preferredDeviceId);
|
||||
SetDataLocation(GPU, DENSE);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
m_GPUMatrix = new GPUMatrix<ElemType>(numRows,numCols,m_preferredDeviceId);
|
||||
SetDataLocation(GPU, DENSE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
|
@ -840,11 +840,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
m_CPUMatrix->SetValue(*db_number.ExposePointer2Value()),
|
||||
if (GetDeviceId()!=db_number.GetDeviceId())
|
||||
{
|
||||
#ifndef LINUX
|
||||
throw std::exception("Matrix and device bound number must be on the same device");
|
||||
#else
|
||||
throw std::exception();
|
||||
#endif /* LINUX */
|
||||
throw std::runtime_error("Matrix and device bound number must be on the same device");
|
||||
}
|
||||
m_GPUMatrix->SetValue(db_number.ExposePointer2Value()),
|
||||
NOT_IMPLEMENTED,
|
||||
|
|
Загрузка…
Ссылка в новой задаче