Modify port code to support CPUONLY build

To use this, define CPUONLY in the CN project, remove the *.cu files from the Math project, and add GPUDummy.cpp instead. This allows Cygwin to be used to compile both the Windows and Linux builds on the same machine.
2014-10-20 14:50:30 -07:00 · 2014-10-20 14:50:30 -07:00 · ff72d5696f
--- a/Common/BestGpu.cpp
+++ b/Common/BestGpu.cpp
@ -6,14 +6,28 @@

 #define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms  --add this at the top of all CPP files that give "function or variable may be unsafe" warnings

+#include "BestGpu.h"
+#include "CommonMatrix.h" // for CPUDEVICE and AUTOPLACEMATRIX
+
+#ifdef CPUONLY
+namespace Microsoft {
+    namespace MSR {
+        namespace CNTK {
+            short DeviceFromConfig(const ConfigParameters& config) // CPUONLY variant: 'config' is accepted for signature compatibility but is not read
+            {
+                return CPUDEVICE; // unconditionally select the CPU device; no GPU query is made in this build
+            }
+        }
+    }
+}
+#else
+
 // CUDA-C includes
 #include <cuda.h>
 #include <windows.h>
 #include <delayimp.h>
 #include <Shlobj.h>
 #include <stdio.h>
-#include "BestGpu.h"
-#include "CommonMatrix.h" // for CPUDEVICE and AUTOPLACEMATRIX

 // The "notify hook" gets called for every call to the
 // delay load helper.  This allows a user to hook every call and
@ -507,4 +521,5 @@ void BestGpu::QueryNvmlData()
 	m_nvmlData = true;
 	return;
 }
-}}}
+}}}
+#endif
--- a/Common/Include/BestGpu.h
+++ b/Common/Include/BestGpu.h
@ -4,15 +4,18 @@
 // </copyright>
 //
 #pragma once
+#ifndef CPUONLY
 #include <cuda_runtime.h>
 #include <nvml.h>
 #include <vector>
+#endif
 #include "commandArgUtil.h"


 namespace Microsoft { namespace MSR { namespace CNTK {
 short DeviceFromConfig(const ConfigParameters& config);

+#ifndef CPUONLY
 struct ProcessorData
 {
 	int cores;
@ -68,5 +71,5 @@ public:
    std::vector<int> GetDevices(int number=AllDevices, BestGpuFlags flags=bestGpuNormal); // get multiple devices
 };
 extern BestGpu* g_bestGpu;
-
+#endif
 }}}
--- a/Common/Include/basetypes.h
+++ b/Common/Include/basetypes.h
@ -13,7 +13,13 @@ typedef char16_t TCHAR;
 #define	vsprintf_s vsprintf		/* Not sure this is right... Malcolm */
 #include <chrono>
 #include <thread>
-#endif	 /* LINUX */
+#include <cstdlib> 
+#include <cerrno>
+#define Linux(a) a
+#else
+#include <tchar.h>
+#endif	/* LINUX */
+#include <cmath>        // for HUGE_VAL  // Remove for a test by Malcolm because of double isnan definition...

 #ifndef UNDER_CE    // fixed-buffer overloads not available for wince
 #ifdef _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES  // fixed-buffer overloads for strcpy() etc.
@ -80,11 +86,13 @@ OACR_WARNING_DISABLE(POTENTIAL_ARGUMENT_TYPE_MISMATCH, "Not level1 or level2_sec
 #include <errno.h>
 #include <string>
 #include <vector>
-#include <cmath>        // for HUGE_VAL // potential double isnan definition
+#include <math.h>        // for HUGE_VAL // potential double isnan definition
 #include <assert.h>
 #include <stdarg.h>
 #include <map>
 #include <stdexcept>
+#include <locale>         // std::wstring_convert
+#include <codecvt>        // std::codecvt_utf8
 #ifdef _MSC_VER
 #include <windows.h>    // for CRITICAL_SECTION and Unicode conversion functions   --TODO: is there a portable alternative?
 #endif
@ -578,6 +586,9 @@ typedef strfun::_strprintf<wchar_t> wstrprintf; // wchar_t version
 #ifdef _WIN32
 struct utf8 : std::string { utf8 (const std::wstring & p)    // utf-16 to -8
 {
+ //TODO: confirm it builds on VS2013
+       std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> cv;
+        (*(std::string*)this) = cv.to_bytes(p);
 #ifdef	MALCOLM
    size_t len = p.length();
    if (len == 0) { return;}    // empty string
@ -592,16 +603,19 @@ struct utf8 : std::string { utf8 (const std::wstring & p)    // utf-16 to -8
 }};
 struct utf16 : std::wstring { utf16 (const std::string & p)  // utf-8 to -16
 {
-#ifdef	MALCOLM
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> cv;
+    (*(std::wstring*)this) = cv.from_bytes(p);
+
+#ifdef OLD
    size_t len = p.length();
    if (len == 0) { return;}    // empty string
    msra::basetypes::fixed_vector<wchar_t> buf (len + 1);
    // ... TODO: this fill() should be unnecessary (a 0 is appended)--but verify
-    std::fill (buf.begin (), buf.end (), (wchar_t) 0);
-    int rc = MultiByteToWideChar (CP_UTF8, 0, p.c_str(), (int) len,
-                                  &buf[0], (int) buf.size());
-    if (rc == 0) throw std::runtime_error ("MultiByteToWideChar");
-    ASSERT (rc < buf.size ());
+    std::fill(buf.begin(), buf.end(), (wchar_t)0);
+    int rc = MultiByteToWideChar(CP_UTF8, 0, p.c_str(), (int)len,
+        &buf[0], (int)buf.size());
+    if (rc == 0) throw std::runtime_error("MultiByteToWideChar");
+    ASSERT(rc < buf.size());
    (*(std::wstring*)this) = &buf[0];
 #endif	/* Malcolm */
 }};
@ -641,12 +655,8 @@ static inline std::string wcstombs (const std::wstring & p)  // output: MBCS
 }
 static inline std::wstring mbstowcs (const std::string & p)  // input: MBCS
 {
-    size_t len = p.length();
-    msra::basetypes::fixed_vector<wchar_t> buf (len + 1); // max: >1 mb chars => 1 wchar
-    std::fill (buf.begin (), buf.end (), (wchar_t) 0);
-    OACR_WARNING_SUPPRESS(UNSAFE_STRING_FUNCTION, "Reviewed OK. size checked. [rogeryu 2006/03/21]");
-    ::mbstowcs (&buf[0], p.c_str(), len + 1);
-    return std::wstring (&buf[0]);
+	std::wstring ret = utf16(p); // delegates to the utf16 helper instead of calling ::mbstowcs on a temporary buffer
+    return ret; // NOTE(review): utf16's own comment says "utf-8 to -16" while this input is labeled MBCS -- confirm callers only pass UTF-8/ASCII text
 }
 #pragma warning(pop)

@ -769,8 +779,6 @@ static inline FILE* _wfopen(const wchar_t * path, const wchar_t * mode) { return

 namespace msra { namespace basetypes {

-#ifdef	MALCOLM
-
 // FILE* with auto-close; use auto_file_ptr instead of FILE*.
 // Warning: do not pass an auto_file_ptr to a function that calls fclose(),
 // except for fclose() itself.
@ -789,7 +797,7 @@ public:
    auto_file_ptr() : f (NULL) { }
    ~auto_file_ptr() { close(); }
    auto_file_ptr (const char * path, const char * mode) { f = fopen (path, mode); if (f == NULL) openfailed (path); }
-    auto_file_ptr (const wchar_t * path, const char * mode) { f = _wfopen (path, msra::strfun::utf16 (mode).c_str()); if (f == NULL) openfailed (msra::strfun::utf8 (path)); }
+    auto_file_ptr (const wchar_t * wpath, const char * mode) {string path = msra::strfun::utf8(wpath); f = fopen (path.c_str(), mode); if (f == NULL) openfailed (path); }
    FILE * operator= (FILE * other) { close(); f = other; return f; }
    auto_file_ptr (FILE * other) : f (other) { }
    operator FILE * () const { return f; }
@ -797,7 +805,6 @@ public:
    void swap (auto_file_ptr & other)  throw() { std::swap (f, other.f); }
 };
 inline int fclose (auto_file_ptr & af) { return af.fclose(); }
-#endif	/* MALCOLM */

 #ifdef _MSC_VER
 // auto-closing container for Win32 handles.
--- a/Common/Include/fileutil.h
+++ b/Common/Include/fileutil.h
@ -3,232 +3,19 @@
 //
 //     Copyright (c) Microsoft Corporation.  All rights reserved.
 //
-// $Log: /Speech_To_Speech_Translation/dbn/dbn/fileutil.h $
-// 
-// 71    1/03/13 8:53p Kaisheny
-// Asynchronous SGD using data pipe.
-// 
-// 70    9/30/12 10:46a Fseide
-// new optional parameter to fuptodate()--caller can now choose whether a
-// missing input file, with target file present, will cause a failure or
-// considers the target up-to-date
-// 
-// 69    11/09/11 10:01 Fseide
-// added a new overload for fgetfilelines() that returns an array of char*
-// instead of strings, to avoid mem alloc
-// 
-// 68    6/10/11 9:50 Fseide
-// (fixed a missing 'inline')
-// 
-// 67    6/10/11 9:49 Fseide
-// new function fgetfilelines() for reading text files
-// 
-// 66    6/09/11 15:18 Fseide
-// added overloads to fexists() that accept STL strings
-// 
-// 65    3/07/11 12:13 Fseide
-// actually implemented unlinkOrDie() (was a dummy)
-// 
-// 64    11/17/10 15:00 Fseide
-// new function fuptodate();
-// make_intermediate_dirs() moved to namespace msra::files (all new
-// functions should be put in there)
-// 
-// 63    11/15/10 7:04p Fseide
-// added an overload for freadOrDie (vector) that takes size as a size_t
-// instead of an int, to pleasr the x64 compiler
-// 
-// 62    11/08/10 17:07 Fseide
-// new function make_intermediate_dirs()
-// 
-// 61    11/08/10 11:43 Fseide
-// (minor cleanup)
-// 
-// 60    2/05/09 19:06 Fseide
-// fgetline() now returns a non-const pointer, because user may want to
-// post-process the line, and the returned value is a user-specified
-// buffer anyway
-// 
-// 59    1/16/09 17:34 Fseide
-// relpath() and splitpath() moved to fileutil.h
-// 
-// 58    1/16/09 8:59 Fseide
-// exported fskipspace()
-// 
-// 57    1/15/09 7:38 Fseide
-// some magic to unify fgetstring() for char and wchar_t to a single
-// template function
-// 
-// 56    1/15/09 7:26 Fseide
-// corrected the #include order of basetypes.h
-// 
-// 55    1/14/09 19:26 Fseide
-// new functions fsetpos() and fgetpos();
-// new fixed-buffer size overload for fgetstring() and fgettoken()
-// 
-// 54    1/08/09 16:14 Fseide
-// fopenOrDie() now supports "-" as the pathname, referring to stdin or
-// stdout
-// 
-// 53    1/08/09 15:32 Fseide
-// new funtion expand_wildcards()
-// 
-// 52    1/05/09 8:44 Fseide
-// (added comments)
-// 
-// 51    11/11/08 6:04p Qiluo
-// recover the old fputstring functions
-// 
-// 50    10/31/08 5:09p Qiluo
-// remove banned APIs
-// 
-// 49    7/17/08 7:22p V-spwang
-// undid changes - back to version 47
-// 
-// 47    6/24/08 19:03 Fseide
-// added fgetwstring() and fputstring() for wstrings;
-// added templates for freadOrDie() and fwriteOrDie() for STL vectors
-// 
-// 46    6/18/08 11:41 Fseide
-// added #pragma once
-// 
-// 45    08-05-29 18:18 Llu
-// fix the interface of fputwav
-// 
-// 44    08-05-29 13:54 Llu
-// add fputwav revise fgetwav using stl instead of short *
-// 
-// 43    11/27/06 11:40 Fseide
-// new methods fgetwfx() and fputwfx() for direct access to simple PCM WAV
-// files
-// 
-// 42    10/14/06 18:31 Fseide
-// added char* version of fexists()
-// 
-// 41    5/22/06 9:34 Fseide
-// (experimental auto_file class checked in)
-// 
-// 40    5/14/06 19:59 Fseide
-// new function fsetmode()
-// 
-// 39    3/29/06 15:36 Fseide
-// changed to reading entire file instead of line-by-line, not changing
-// newlines anymore
-// 
-// 38    2/21/06 12:39p Kit
-// Added filesize64 function
-// 
-// 37    1/09/06 7:12p Rogeryu
-// wide version of fgetline
-// 
-// 36    12/19/05 21:52 Fseide
-// fputfile() added in 8-bit string version
-// 
-// 35    12/15/05 20:25 Fseide
-// added getfiletime(), setfiletime(), and fputfile() for strings
-// 
-// 34    9/27/05 12:22 Fseide
-// added wstring version of renameOrDie()
-// 
-// 33    9/22/05 12:26 Fseide
-// new method fexists()
-// 
-// 32    9/15/05 11:33 Fseide
-// new version of fgetline() that avoids buffer allocations, since this
-// seems very expensive esp. when reading a file line by line with
-// fgetline()
-// 
-// 31    9/05/05 4:57p F-xyzhao
-// added #include <windows.h> for #include <mmreg.h> -- ugh
-// 
-// 30    9/05/05 11:00 Fseide
-// new method renameOrDie()
-// 
-// 29    8/24/05 5:45p Kjchen
-// merge changes in OneNote
-// 
-// 28    8/19/05 17:56 Fseide
-// extended WAVEHEADER with write() and update()
-// 
-// 27    8/13/05 15:37 Fseide
-// added new version of fgetline that takes a buffer
-// 
-// 26    7/26/05 18:54 Fseide
-// new functions fgetint24() and fputint24()
-// 
-// 25    2/12/05 15:21 Fseide
-// fgetdouble() and fputdouble() added
-// 
-// 24    2/05/05 12:38 Fseide
-// new methods fputfile(), fgetfile();
-// new overload for filesize()
-// 
-// 23    2/03/05 22:34 Fseide
-// added new version of fgetline() that returns an STL string
-// 
-// 22    5/31/04 10:06 Fseide
-// new methods fseekOrDie(), ftellOrDie(), unlinkOrDie(), renameOrDie()
-// 
-// 21    3/19/04 4:01p Fseide
-// fwriteOrDie(): first argument changed to const
-// 
-// 20    2/27/04 10:04a V-xlshi
-// 
-// 19    2/19/04 3:45p V-xlshi
-// fgetraw function is added.
-// 
-// 18    2/19/04 1:49p V-xlshi
-// 
-// 17    2/03/04 8:17p V-xlshi
-// 
-// 16    2/03/04 6:20p V-xlshi
-// WAVEHEADER.prepare() added
-// 
-// 15    2/03/04 5:58p V-xlshi
-// WAVEHEADER structure added
-// 
-// 14    8/15/03 15:40 Fseide
-// new method filesize()
-// 
-// 13    8/13/03 21:06 Fseide
-// new function fputbyte()
-// 
-// 12    8/13/03 15:37 Fseide
-// prototype of fOpenOrDie() Unicode version changed
-// 
-// 11    8/07/03 22:04 Fseide
-// fprintfOrDie() now really dies in case of error
-// 
-// 10    03-07-30 12:06 I-rogery
-// enable both unicode and non-unicode version
-// 
-// 9     7/25/03 6:07p Fseide
-// new functions fgetbyte() and fgetwav()
-// 
-// 8     7/03/02 9:25p Fseide
-// fcompareTag() now uses string type for both of its arguments (before,
-// it used const char * for one of them)
-// 
-// 7     6/10/02 3:14p Fseide
-// new functions fgettoken(), fgetfloat_ascii(), fskipNewline()
-// 
-// 6     6/07/02 7:26p Fseide
-// new functions fcheckTag_ascii() and fgetint_ascii()
-// 
-// 5     4/15/02 1:12p Fseide
-// void fputstring (FILE * f, const TSTRING & str) and fpad() added
-// 
-// 4     4/03/02 3:58p Fseide
-// VSS keyword and copyright added
-//
-// F. Seide 5 Mar 2002
-//
-
 #pragma once
 #ifndef _FILEUTIL_
 #define _FILEUTIL_

 #include <stdio.h>
+#ifdef	_WIN32
+#define isfinite(x) _finite(x)
+#define isnan(x) _isnan(x)
+#endif
+#ifdef __unix__
+#include <sys/types.h>
+#include <sys/stat.h>
+#endif
 #include <algorithm>    // for std::find
 #include <vector>
 #include <map>
@ -695,6 +482,29 @@ namespace msra { namespace files {
 // simple support for WAV file I/O
 // ----------------------------------------------------------------------------

+// define the header if we haven't seen it yet
+#ifndef _WAVEFORMATEX_
+#define _WAVEFORMATEX_
+
+/*
+ *  extended waveform format structure used for all non-PCM formats. this
+ *  structure is common to all non-PCM formats.
+ */
+typedef unsigned short WORD;  // in case not defined yet (e.g. on Linux)
+typedef struct tWAVEFORMATEX
+{
+    WORD        wFormatTag;         /* format type */
+    WORD        nChannels;          /* number of channels (i.e. mono, stereo...) */
+    DWORD       nSamplesPerSec;     /* sample rate */ // NOTE(review): DWORD has no fallback typedef next to WORD above -- confirm it is defined elsewhere on non-Windows builds
+    DWORD       nAvgBytesPerSec;    /* for buffer estimation */
+    WORD        nBlockAlign;        /* block size of data */
+    WORD        wBitsPerSample;     /* number of bits per sample of mono data */
+    WORD        cbSize;             /* the count in bytes of the size of */
+                                    /* extra information (after cbSize) */
+} WAVEFORMATEX, *PWAVEFORMATEX;
+
+#endif /* _WAVEFORMATEX_ */
+
 typedef struct wavehder{
    char          riffchar[4];
    unsigned int  RiffLength;
--- a/Common/fileutil.cpp
+++ b/Common/fileutil.cpp
@ -298,7 +298,7 @@ size_t filesize (const wchar_t * pathname)
 // filesize64(): determine size of the file in bytes (with pathname)
 int64_t filesize64 (const wchar_t * pathname)
 {
-    __stat64 fileinfo;
+    struct _stat64 fileinfo;
    if (_wstat64 (pathname,&fileinfo) == -1) 
        return 0;
    else
@ -1375,6 +1375,21 @@ vector<char*> msra::files::fgetfilelines (const wstring & path, vector<char> & b

 bool getfiletime (const wstring & path, FILETIME & time)
 {   // return file modification time, false if cannot be determined
+	struct _stat buf;
+	int result;
+
+	// Get the file status information for "path":
+	result = _wstat(path.c_str(), &buf);
+	// Check if statistics are valid: 
+	if( result != 0 )
+	{
+		return false;
+	}
+
+	(*(time_t*)(&time))= buf.st_mtime;
+	return true;
+
+#ifdef OLD
    WIN32_FIND_DATAW findFileData;
    auto_handle hFind (FindFirstFileW (path.c_str(), &findFileData), ::FindClose);
    if (hFind != INVALID_HANDLE_VALUE)
@ -1386,10 +1401,14 @@ bool getfiletime (const wstring & path, FILETIME & time)
    {
        return false;
    }
+#endif
 }

 void setfiletime (const wstring & path, const FILETIME & time)
 {   // update the file modification time of an existing file
+#ifdef LINUX
+	throw new logic_error("setfiletime has not been converted to linux yet...");
+#else
    auto_handle h (CreateFileW (path.c_str(), FILE_WRITE_ATTRIBUTES,
                                FILE_SHARE_READ|FILE_SHARE_WRITE, NULL,
                                OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL));
@ -1402,6 +1421,7 @@ void setfiletime (const wstring & path, const FILETIME & time)
    {
        RuntimeError ("setfiletime: error setting file time information: %d", GetLastError());
    }
+#endif
 }

 #if 0
--- a/MachineLearning/cn/cn.vcxproj
+++ b/MachineLearning/cn/cn.vcxproj
@ -124,7 +124,7 @@
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <AdditionalDependencies>Delayimp.lib;nvml.lib;cudart.lib;cntkMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
-      <DelayLoadDLLs>CNTKMath.dll;nvml.dll</DelayLoadDLLs>
+      <DelayLoadDLLs>CNTKMath.dll;nvml.dll;nvcuda.dll</DelayLoadDLLs>
    </Link>
    <PostBuildEvent>
      <Command>
@ -186,7 +186,7 @@
      <OptimizeReferences>true</OptimizeReferences>
      <AdditionalDependencies>Delayimp.lib;nvml.lib;cudart.lib;cntkMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <Profile>true</Profile>
-      <DelayLoadDLLs>CNTKMath.dll;nvml.dll</DelayLoadDLLs>
+      <DelayLoadDLLs>CNTKMath.dll;nvml.dll;nvcuda.dll</DelayLoadDLLs>
    </Link>
    <PostBuildEvent>
      <Command>copy $(SolutionDir)..\Common\PTask\bin\*.dll $(TargetDir)</Command>
--- a/Math/Math/CPUMatrix.cpp
+++ b/Math/Math/CPUMatrix.cpp
@ -21,13 +21,15 @@

 #ifndef	 LINUX
 #include <Windows.h>
+#define Linux(x)
 #else
+#define Linux(x) x

 #ifndef max
 #define max(a,b)            (((a) > (b)) ? (a) : (b))
 #endif

-#include <values.h>
+#include <cfloat> 
 #endif	/* LINUX */

 #ifdef LEAKDETECT
--- a/Math/Math/GPUDummy.cpp
+++ b/Math/Math/GPUDummy.cpp
--- a/Math/Math/GPUMatrix.cu
+++ b/Math/Math/GPUMatrix.cu
@ -324,22 +324,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        switch (kind)
        {
        case 0:
-            _inplaceSigmoidOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);            
+			_inplaceSigmoidOnCuda<ElemType><<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(this->m_pArray, N);
            break;
        case 1:
-            _inplaceTanhOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);   
+			_inplaceTanhOnCuda<ElemType><<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(this->m_pArray, N);
            break;
        case 2:
-            _inplaceSqrtOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);   
+			_inplaceSqrtOnCuda<ElemType><<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(this->m_pArray, N);
            break;
        case 3:
-            _inplaceExpOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);   
+            _inplaceExpOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
            break;
        case 4:
-            _inplaceLogOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);   
+            _inplaceLogOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
            break;
        case 5:
-            _inplaceAbsOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);   
+            _inplaceAbsOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
            break;
        case 6:
            _inplaceLinRectDerivative<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
@ -1205,7 +1205,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        //    int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);
        //    cudaEvent_t done = nullptr;
        //    if (do_sync)    CUDA_CALL(cudaEventCreate(&done));
-        //    _addValue<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,N); 
+        //    _addValue<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,N);
        //    if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        //    if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));
        //    if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -1458,7 +1458,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        a.PrepareDevice();
        cudaEvent_t done = nullptr;
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-        _addElementProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);                        
+        _addElementProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));      
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -1480,7 +1480,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        a.PrepareDevice();
        cudaEvent_t done = nullptr;
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-        _columnElementMultiplyWith<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N,M);                        
+        _columnElementMultiplyWith<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N,M);
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));      
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -1503,7 +1503,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        a.PrepareDevice();
        cudaEvent_t done = nullptr;
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-        _rowElementMultiplyWith<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.m_pArray,N,M);                        
+        _rowElementMultiplyWith<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.m_pArray,N,M);
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));      
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -1568,7 +1568,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        PrepareDevice();
        cudaEvent_t done = nullptr;
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-        _elemInverse<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);                        
+        _elemInverse<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));     
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -1825,7 +1825,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        PrepareDevice();
        cudaEvent_t done = nullptr;
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-        _inplaceTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);                        
+        _inplaceTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done)); 
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -1848,7 +1848,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        PrepareDevice();
        cudaEvent_t done = nullptr;
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-        _assignTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);                        
+        _assignTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -1866,7 +1866,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        PrepareDevice();
        cudaEvent_t done = nullptr;
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-        _inplaceTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);                        
+        _inplaceTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done)); 
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -1889,7 +1889,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        a.PrepareDevice();
        cudaEvent_t done = nullptr;
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-        _assignTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);                        
+        _assignTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -1906,7 +1906,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        PrepareDevice();
        cudaEvent_t done = nullptr;
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-        _setToZeroIfAbsLessThan<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);                        
+        _setToZeroIfAbsLessThan<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done)); 
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -1964,7 +1964,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        cudaEvent_t done = nullptr;
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));
        //WARNING: THIS kernel is not the most efficient way!
-        _reductionSumAndAssign<ElemType><<<1,1024>>>(this->m_pArray,a.m_pArray,(LONG64)a.GetNumElements(),(LONG64)this->GetNumElements());       
+        _reductionSumAndAssign<ElemType><<<1,1024>>>(this->m_pArray,a.m_pArray,(LONG64)a.GetNumElements(),(LONG64)this->GetNumElements());
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -2026,7 +2026,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        a.PrepareDevice();
        cudaEvent_t done = nullptr;
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-        _elemMul<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N);                        
+        _elemMul<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N);
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done)); 
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -2049,7 +2049,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        a.PrepareDevice();
        cudaEvent_t done = nullptr;
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-        _assignElementProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);                        
+        _assignElementProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done)); 
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -2078,7 +2078,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        a.PrepareDevice();
        cudaEvent_t done = nullptr;
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-        _assignElementDivisionOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);                        
+        _assignElementDivisionOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done)); 
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -2119,7 +2119,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        }       

        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));  
-        _vectorNorm1<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);  
+        _vectorNorm1<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -2160,7 +2160,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        }       

        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));  
-        _vectorNorm2<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);  
+        _vectorNorm2<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -2219,7 +2219,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        a.PrepareDevice();
        cudaEvent_t done = nullptr;
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-        _assignKhatriRaoProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,rowsA, rowsB, cols);                        
+        _assignKhatriRaoProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,rowsA, rowsB, cols);
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done)); 
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -2257,7 +2257,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        a.PrepareDevice();
        cudaEvent_t done = nullptr;
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-        _addColumnReshapeProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray, rowsB, rowsC, cols, transposeAColumn);                        
+        _addColumnReshapeProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray, rowsB, rowsC, cols, transposeAColumn);
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done)); 
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -2360,7 +2360,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        cudaEvent_t done = nullptr;
        int blocksPerGrid=(int)ceil(1.0*this->GetNumElements()/threadsPerBlock);  
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-        _assignSignOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (long)this->GetNumElements());                        
+        _assignSignOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (long)this->GetNumElements());
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));    
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -2380,7 +2380,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        cudaEvent_t done = nullptr;
        int blocksPerGrid=(int)ceil(1.0*this->GetNumElements()/threadsPerBlock);  
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-        _addSignOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (LONG64)this->GetNumElements());                        
+        _addSignOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (LONG64)this->GetNumElements());
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));    
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -2473,8 +2473,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        cudaEvent_t done = nullptr;
        //int blocksPerGrid=(int)ceil(1.0*a.GetNumElements()/threadsPerBlock);  
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-        //_assignNumOfDiff<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, a.GetNumElements());                        
-        _assignNumOfDiff<ElemType><<<1,1024,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, (LONG64)a.GetNumElements());                        
+        //_assignNumOfDiff<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, a.GetNumElements());
+        _assignNumOfDiff<ElemType><<<1,1024,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, (LONG64)a.GetNumElements());
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));  
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -2926,7 +2926,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            LONG64 n=(LONG64)a.GetNumElements();            
            int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);  
            if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-            _addScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);                        
+            _addScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);
            if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
            if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));   
            if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -2967,7 +2967,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            LONG64 n=(LONG64)a.GetNumElements();            
            int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);  
            if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-            _assignScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);                        
+            _assignScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);
            if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
            if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));   
            if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -3011,7 +3011,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            LONG64 n=(LONG64)a.GetNumElements();            
            int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);  
            if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-            _addScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);                        
+            _addScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);
            if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
            if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));   
            if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -3055,7 +3055,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            LONG64 n=(LONG64)a.GetNumElements();            
            int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);  
            if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-            _assignScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);                        
+            _assignScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);
            if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
            if (do_sync)    CUDA_CALL(cudaEventSynchronize(done)); 
            if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -3074,7 +3074,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        cudaEvent_t done = nullptr;
        int blocksPerGrid=1;  //only one element
        if (do_sync)    CUDA_CALL(cudaEventCreate(&done));        
-        _addElementToElement<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, (LONG64)a.LocateElement(ai, aj), c.m_pArray, (LONG64)c.LocateElement(ci, cj));                        
+        _addElementToElement<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, (LONG64)a.LocateElement(ai, aj), c.m_pArray, (LONG64)c.LocateElement(ci, cj));
        if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
        if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));  
        if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -3195,7 +3195,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            }       

            if (do_sync)    CUDA_CALL(cudaEventCreate(&done));  
-            _innerProduct<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, a.m_pArray,b.m_pArray,m,n,isColWise);  
+            _innerProduct<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, a.m_pArray,b.m_pArray,m,n,isColWise);
            if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
            if (do_sync)    CUDA_CALL(cudaEventSynchronize(done));
            if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
@ -3288,7 +3288,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            if (do_sync)    CUDA_CALL(cudaEventCreate(&done));            
            LONG64 N=(LONG64)a.GetNumElements();
            int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);                
-            _elementWisePowerOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha,a.m_pArray,c.m_pArray,N);             
+            _elementWisePowerOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha,a.m_pArray,c.m_pArray,N);
            if (do_sync)    CUDA_CALL(cudaEventRecord(done));        
            if (do_sync)    CUDA_CALL(cudaEventSynchronize(done)); 
            if (do_sync)    CUDA_CALL(cudaEventDestroy(done));
--- a/Math/Math/GPUMatrixCUDAKernels.cu
+++ b/Math/Math/GPUMatrixCUDAKernels.cu
@ -3,6 +3,7 @@
 //     Copyright (c) Microsoft Corporation.  All rights reserved.
 // </copyright>
 //
+#ifndef CPUONLY // same macro as BestGpu.cpp/BestGpu.h: compile this file out of CPU-only builds
 #include <float.h>
 #include <cuda_runtime.h>
 #include "CommonMatrix.h"
@ -3235,3 +3236,5 @@ d_tmp[0] = max((ElemType)0, d_tmp[0]/max((ElemType)1.0e-10,sqrt(d_tmp[1]))/max((
 }
 }
 */
+
+#endif /* !CPUONLY */
--- a/Math/Math/GPUSparseMatrix.cu
+++ b/Math/Math/GPUSparseMatrix.cu
@ -909,7 +909,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);                
            cudaEvent_t done = nullptr;
            CUDACALL(cudaEventCreate(&done));        
-            _inplaceTruncate<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_blockVal,threshold,N);                        
+            _inplaceTruncate<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_blockVal,threshold,N);
            CUDACALL(cudaEventRecord(done));        
            CUDACALL(cudaEventSynchronize(done));   
            CUDACALL(cudaEventDestroy(done));
@ -1310,7 +1310,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            a.PrepareDevice();
            long N=(long)a.GetNZElements();
            int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);                
-            _elementWisePowerOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(alpha,a.NzLocation(),c.NzLocation(),N);             
+            _elementWisePowerOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(alpha,a.NzLocation(),c.NzLocation(),N);
            CUDACALL(cudaEventRecord(done));        
            CUDACALL(cudaEventSynchronize(done));   
        }
@ -1360,7 +1360,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        //int* h_vectArray= new int[a.m_nz];
        int blocksPerGrid =(int)ceil(1.0*M/threadsPerBlock);   
        CUDACALL(cudaEventCreate(&done));
-        _getSparseVectorRepresntationForMatrix<ElemType><<<blocksPerGrid,threadsPerBlock>>>(cscColPtrA,cscRowIndA,vectArray,M,N);        
+        _getSparseVectorRepresntationForMatrix<ElemType><<<blocksPerGrid,threadsPerBlock>>>(cscColPtrA,cscRowIndA,vectArray,M,N);
        CUDACALL(cudaEventRecord(done));        
        CUDACALL(cudaEventSynchronize(done));
        CUDACALL(cudaEventDestroy(done));
@ -1411,7 +1411,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        CUDACALL(cudaMemcpy(d_res,res,sizeof(long)*3,cudaMemcpyHostToDevice));

        int blocksPerGrid =(int)ceil(1.0*a.GetNZElements()/threadsPerBlock); 
-        _areEqual<ElemType><<<blocksPerGrid,threadsPerBlock>>>(a.NzLocation(),b.NzLocation(),(long)a.GetNZElements(),threshold,d_res);        
+        _areEqual<ElemType><<<blocksPerGrid,threadsPerBlock>>>(a.NzLocation(),b.NzLocation(),(long)a.GetNZElements(),threshold,d_res);
        _areEqual<int><<<blocksPerGrid,threadsPerBlock>>>(a.ColLocation(),b.ColLocation(),(long)a.GetNZElements(),(int)threshold,d_res+1);
        blocksPerGrid =(int)ceil((1.0*a.GetNumRows()+1.0)/threadsPerBlock); 
        _areEqual<int><<<blocksPerGrid,threadsPerBlock>>>(a.RowLocation(),b.RowLocation(),(long)a.GetNumRows()+1,(int)threshold,d_res+2);
@ -1719,7 +1719,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);                
        cudaEvent_t done = nullptr;
        CUDACALL(cudaEventCreate(&done));        
-        _elemInverse<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);                        
+        _elemInverse<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
        CUDACALL(cudaEventRecord(done));        
        CUDACALL(cudaEventSynchronize(done));        
        return *this;
@ -1846,7 +1846,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);                
        cudaEvent_t done = nullptr;
        CUDACALL(cudaEventCreate(&done));        
-        _inplaceTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);                        
+        _inplaceTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
        CUDACALL(cudaEventRecord(done));        
        CUDACALL(cudaEventSynchronize(done)); 
        return *this;
@ -1867,7 +1867,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);                
        cudaEvent_t done = nullptr;
        CUDACALL(cudaEventCreate(&done));        
-        _assignTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);                        
+        _assignTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);
        CUDACALL(cudaEventRecord(done));        
        CUDACALL(cudaEventSynchronize(done));
        return *this;
@ -1882,7 +1882,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);                
        cudaEvent_t done = nullptr;
        CUDACALL(cudaEventCreate(&done));        
-        _inplaceTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);                        
+        _inplaceTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
        CUDACALL(cudaEventRecord(done));        
        CUDACALL(cudaEventSynchronize(done)); 
        return *this;        
@ -1903,7 +1903,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);                
        cudaEvent_t done = nullptr;
        CUDACALL(cudaEventCreate(&done));        
-        _assignTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);                        
+        _assignTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);
        CUDACALL(cudaEventRecord(done));        
        CUDACALL(cudaEventSynchronize(done));
        return *this;        
@ -1918,7 +1918,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);                
        cudaEvent_t done = nullptr;
        CUDACALL(cudaEventCreate(&done));        
-        _setToZeroIfAbsLessThan<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);                        
+        _setToZeroIfAbsLessThan<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
        CUDACALL(cudaEventRecord(done));        
        CUDACALL(cudaEventSynchronize(done)); 
        return *this;  
@ -2012,22 +2012,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        switch (kind)
        {
        case 0:
-            _inplaceSigmoidOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);            
+            _inplaceSigmoidOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
            break;
        case 1:
-            _inplaceTanhOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);   
+            _inplaceTanhOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
            break;
        case 2:
-            _inplaceSqrtOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);   
+            _inplaceSqrtOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
            break;
        case 3:
-            _inplaceExpOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);   
+            _inplaceExpOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
            break;
        case 4:
-            _inplaceLogOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);   
+            _inplaceLogOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
            break;
        case 5:
-            _inplaceAbsOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);   
+            _inplaceAbsOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
            break;
        case 6:
            _inplaceLinRectDerivative<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
--- a/Math/Math/Math.vcxproj.filters
+++ b/Math/Math/Math.vcxproj.filters
@ -0,0 +1,77 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="Source Files">
+      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+    </Filter>
+    <Filter Include="Header Files">
+      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+      <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
+    </Filter>
+    <Filter Include="Resource Files">
+      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="stdafx.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="targetver.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="CPUMatrix.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="Matrix.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="Helpers.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="CommonMatrix.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="CPUSparseMatrix.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="stdafx.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="dllmain.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="CPUMatrix.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="Matrix.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\Common\File.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\Common\fileutil.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="CPUSparseMatrix.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GPUDummy.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="GPUMatrix.cuh">
+      <Filter>Header Files</Filter>
+    </None>
+    <None Include="GPUSparseMatrix.cuh">
+      <Filter>Header Files</Filter>
+    </None>
+    <None Include="GPUWatcher.cuh">
+      <Filter>Header Files</Filter>
+    </None>
+  </ItemGroup>
+</Project>
--- a/Math/Math/Matrix.cpp
+++ b/Math/Math/Matrix.cpp
@ -288,15 +288,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        {
            if (m_preferredDeviceId == CPUDEVICE)
            {
-            m_CPUMatrix = new CPUMatrix<ElemType>(numRows,numCols);
+                m_CPUMatrix = new CPUMatrix<ElemType>(numRows,numCols);
                SetDataLocation(CPU, DENSE);
-        }
-        else
-        {
-                m_GPUMatrix = new GPUMatrix<ElemType>(numRows,numCols,m_preferredDeviceId);
-                SetDataLocation(GPU, DENSE);
-        }
-    }
+            }
+            else
+            {
+                m_GPUMatrix = new GPUMatrix<ElemType>(numRows,numCols,m_preferredDeviceId);
+                SetDataLocation(GPU, DENSE);
+            }
+        }
    }

    template<class ElemType>
@ -840,11 +840,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            m_CPUMatrix->SetValue(*db_number.ExposePointer2Value()), 
            if (GetDeviceId()!=db_number.GetDeviceId()) 
            {
-#ifndef	LINUX
-                throw std::exception("Matrix and device bound number must be on the same device");
-#else
-                throw std::exception();
-#endif	/* LINUX */
+                throw std::runtime_error("Matrix and device bound number must be on the same device");
            }
            m_GPUMatrix->SetValue(db_number.ExposePointer2Value()), 
            NOT_IMPLEMENTED,