From ff72d5696f07c4365c0272df63d212841a441caf Mon Sep 17 00:00:00 2001 From: adame Date: Mon, 20 Oct 2014 14:50:30 -0700 Subject: [PATCH] Modify port code to support CPUONLY build. To use this, define CPUONLY in the CN project, remove *.cu from the math project, and add GPUDummy.cpp instead. This allows Cygwin to be used to compile both the Windows and Linux builds on the same machine. --- Common/BestGpu.cpp | 21 +- Common/Include/BestGpu.h | 5 +- Common/Include/basetypes.h | 43 +- Common/Include/fileutil.h | 252 +---- Common/fileutil.cpp | 22 +- MachineLearning/cn/cn.vcxproj | 4 +- Math/Math/CPUMatrix.cpp | 4 +- Math/Math/GPUDummy.cpp | 1667 +++++++++++++++++++++++++++++ Math/Math/GPUMatrix.cu | 70 +- Math/Math/GPUMatrixCUDAKernels.cu | 3 + Math/Math/GPUSparseMatrix.cu | 32 +- Math/Math/Math.vcxproj.filters | 77 ++ Math/Math/Matrix.cpp | 22 +- 13 files changed, 1911 insertions(+), 311 deletions(-) create mode 100644 Math/Math/GPUDummy.cpp create mode 100644 Math/Math/Math.vcxproj.filters diff --git a/Common/BestGpu.cpp b/Common/BestGpu.cpp index 3204e4e8c..426342c8b 100644 --- a/Common/BestGpu.cpp +++ b/Common/BestGpu.cpp @@ -6,14 +6,28 @@ #define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings +#include "BestGpu.h" +#include "CommonMatrix.h" // for CPUDEVICE and AUTOPLACEMATRIX + +#ifdef CPUONLY +namespace Microsoft { + namespace MSR { + namespace CNTK { + short DeviceFromConfig(const ConfigParameters& config) + { + return CPUDEVICE; + } + } + } +} +#else + // CUDA-C includes #include #include #include #include #include -#include "BestGpu.h" -#include "CommonMatrix.h" // for CPUDEVICE and AUTOPLACEMATRIX // The "notify hook" gets called for every call to the // delay load helper. This allows a user to hook every call and @@ -507,4 +521,5 @@ void BestGpu::QueryNvmlData() m_nvmlData = true; return; } -}}} \ No newline at end of file +}}} +#endif diff --git a/Common/Include/BestGpu.h b/Common/Include/BestGpu.h index 3c04c4ab2..849b43ac6 100644 --- a/Common/Include/BestGpu.h +++ b/Common/Include/BestGpu.h @@ -4,15 +4,18 @@ // // #pragma once +#ifndef CPUONLY #include #include #include +#endif #include "commandArgUtil.h" namespace Microsoft { namespace MSR { namespace CNTK { short DeviceFromConfig(const ConfigParameters& config); +#ifndef CPUONLY struct ProcessorData { int cores; @@ -68,5 +71,5 @@ public: std::vector GetDevices(int number=AllDevices, BestGpuFlags flags=bestGpuNormal); // get multiple devices }; extern BestGpu* g_bestGpu; - +#endif }}} \ No newline at end of file diff --git a/Common/Include/basetypes.h b/Common/Include/basetypes.h index d3145570a..95819f825 100644 --- a/Common/Include/basetypes.h +++ b/Common/Include/basetypes.h @@ -13,7 +13,13 @@ typedef char16_t TCHAR; #define vsprintf_s vsprintf /* Not sure this is right... Malcolm */ #include #include -#endif /* LINUX */ +#include +#include +#define Linux(a) a +#else +#include +#endif /* LINUX */ +#include // for HUGE_VAL // Remove for a test by Malcolm because of double isnan definition... #ifndef UNDER_CE // fixed-buffer overloads not available for wince #ifdef _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES // fixed-buffer overloads for strcpy() etc.
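The CPUONLY stub added to Common/BestGpu.cpp above means call sites never change between build flavors; the preprocessor simply selects which definition of DeviceFromConfig() gets compiled. A minimal sketch of the calling pattern, assuming a hypothetical already-parsed ConfigParameters instance named config (CPUDEVICE, i.e. -1, comes from CommonMatrix.h):

    ConfigParameters config;                   // hypothetical; normally parsed from the command line
    short deviceId = DeviceFromConfig(config); // CPUONLY build: always CPUDEVICE (-1);
                                               // otherwise: a GPU ordinal chosen by the BestGpu logic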
@@ -80,11 +86,13 @@ OACR_WARNING_DISABLE(POTENTIAL_ARGUMENT_TYPE_MISMATCH, "Not level1 or level2_sec #include #include #include -#include // for HUGE_VAL // potential double isnan definition +#include // for HUGE_VAL // potential double isnan definition #include #include #include #include +#include // std::wstring_convert +#include // std::codecvt_utf8 #ifdef _MSC_VER #include // for CRITICAL_SECTION and Unicode conversion functions --TODO: is there a portable alternative? #endif @@ -578,6 +586,9 @@ typedef strfun::_strprintf wstrprintf; // wchar_t version #ifdef _WIN32 struct utf8 : std::string { utf8 (const std::wstring & p) // utf-16 to -8 { + //TODO: confirm it builds on VS2013 + std::wstring_convert> cv; + (*(std::string*)this) = cv.to_bytes(p); #ifdef MALCOLM size_t len = p.length(); if (len == 0) { return;} // empty string @@ -592,16 +603,19 @@ struct utf8 : std::string { utf8 (const std::wstring & p) // utf-16 to -8 }}; struct utf16 : std::wstring { utf16 (const std::string & p) // utf-8 to -16 { -#ifdef MALCOLM + std::wstring_convert> cv; + (*(std::wstring*)this) = cv.from_bytes(p); + +#ifdef OLD size_t len = p.length(); if (len == 0) { return;} // empty string msra::basetypes::fixed_vector buf (len + 1); // ... TODO: this fill() should be unnecessary (a 0 is appended)--but verify - std::fill (buf.begin (), buf.end (), (wchar_t) 0); - int rc = MultiByteToWideChar (CP_UTF8, 0, p.c_str(), (int) len, - &buf[0], (int) buf.size()); - if (rc == 0) throw std::runtime_error ("MultiByteToWideChar"); - ASSERT (rc < buf.size ()); + std::fill(buf.begin(), buf.end(), (wchar_t)0); + int rc = MultiByteToWideChar(CP_UTF8, 0, p.c_str(), (int)len, + &buf[0], (int)buf.size()); + if (rc == 0) throw std::runtime_error("MultiByteToWideChar"); + ASSERT(rc < buf.size()); (*(std::wstring*)this) = &buf[0]; #endif /* Malcolm */ }}; @@ -641,12 +655,8 @@ static inline std::string wcstombs (const std::wstring & p) // output: MBCS } static inline std::wstring mbstowcs (const std::string & p) // input: MBCS { - size_t len = p.length(); - msra::basetypes::fixed_vector buf (len + 1); // max: >1 mb chars => 1 wchar - std::fill (buf.begin (), buf.end (), (wchar_t) 0); - OACR_WARNING_SUPPRESS(UNSAFE_STRING_FUNCTION, "Reviewed OK. size checked. [rogeryu 2006/03/21]"); - ::mbstowcs (&buf[0], p.c_str(), len + 1); - return std::wstring (&buf[0]); + std::wstring ret = utf16(p); + return ret; } #pragma warning(pop) @@ -769,8 +779,6 @@ static inline FILE* _wfopen(const wchar_t * path, const wchar_t * mode) { return namespace msra { namespace basetypes { -#ifdef MALCOLM - // FILE* with auto-close; use auto_file_ptr instead of FILE*. // Warning: do not pass an auto_file_ptr to a function that calls fclose(), // except for fclose() itself. 
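The utf8/utf16 helpers above swap the Win32 MultiByteToWideChar path for the portable codecvt facilities, which is part of what lets the same sources build under Cygwin. A self-contained sketch of that conversion pattern (std::codecvt_utf8 pairs UTF-8 with UCS-2 or UCS-4 depending on the platform's wchar_t width; it is deprecated since C++17 but current for the VS2013 toolset named in the TODO; ToUtf8/FromUtf8 are illustrative names, not part of the patch):

    #include <codecvt> // std::codecvt_utf8
    #include <locale>  // std::wstring_convert
    #include <string>

    std::string ToUtf8(const std::wstring& w)   // wide -> UTF-8, as in struct utf8
    {
        std::wstring_convert<std::codecvt_utf8<wchar_t>> cv;
        return cv.to_bytes(w);
    }

    std::wstring FromUtf8(const std::string& s) // UTF-8 -> wide, as in struct utf16
    {
        std::wstring_convert<std::codecvt_utf8<wchar_t>> cv;
        return cv.from_bytes(s);
    }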
@@ -789,7 +797,7 @@ public: auto_file_ptr() : f (NULL) { } ~auto_file_ptr() { close(); } auto_file_ptr (const char * path, const char * mode) { f = fopen (path, mode); if (f == NULL) openfailed (path); } - auto_file_ptr (const wchar_t * path, const char * mode) { f = _wfopen (path, msra::strfun::utf16 (mode).c_str()); if (f == NULL) openfailed (msra::strfun::utf8 (path)); } + auto_file_ptr (const wchar_t * wpath, const char * mode) {string path = msra::strfun::utf8(wpath); f = fopen (path.c_str(), mode); if (f == NULL) openfailed (path); } FILE * operator= (FILE * other) { close(); f = other; return f; } auto_file_ptr (FILE * other) : f (other) { } operator FILE * () const { return f; } @@ -797,7 +805,6 @@ public: void swap (auto_file_ptr & other) throw() { std::swap (f, other.f); } }; inline int fclose (auto_file_ptr & af) { return af.fclose(); } -#endif /* MALCOLM */ #ifdef _MSC_VER // auto-closing container for Win32 handles. diff --git a/Common/Include/fileutil.h b/Common/Include/fileutil.h index 51ef8f82a..8371de1b9 100644 --- a/Common/Include/fileutil.h +++ b/Common/Include/fileutil.h @@ -3,232 +3,19 @@ // // Copyright (c) Microsoft Corporation. All rights reserved. // -// $Log: /Speech_To_Speech_Translation/dbn/dbn/fileutil.h $ -// -// 71 1/03/13 8:53p Kaisheny -// Asynchronous SGD using data pipe. -// -// 70 9/30/12 10:46a Fseide -// new optional parameter to fuptodate()--caller can now choose whether a -// missing input file, with target file present, will cause a failure or -// considers the target up-to-date -// -// 69 11/09/11 10:01 Fseide -// added a new overload for fgetfilelines() that returns an array of char* -// instead of strings, to avoid mem alloc -// -// 68 6/10/11 9:50 Fseide -// (fixed a missing 'inline') -// -// 67 6/10/11 9:49 Fseide -// new function fgetfilelines() for reading text files -// -// 66 6/09/11 15:18 Fseide -// added overloads to fexists() that accept STL strings -// -// 65 3/07/11 12:13 Fseide -// actually implemented unlinkOrDie() (was a dummy) -// -// 64 11/17/10 15:00 Fseide -// new function fuptodate(); -// make_intermediate_dirs() moved to namespace msra::files (all new -// functions should be put in there) -// -// 63 11/15/10 7:04p Fseide -// added an overload for freadOrDie (vector) that takes size as a size_t -// instead of an int, to pleasr the x64 compiler -// -// 62 11/08/10 17:07 Fseide -// new function make_intermediate_dirs() -// -// 61 11/08/10 11:43 Fseide -// (minor cleanup) -// -// 60 2/05/09 19:06 Fseide -// fgetline() now returns a non-const pointer, because user may want to -// post-process the line, and the returned value is a user-specified -// buffer anyway -// -// 59 1/16/09 17:34 Fseide -// relpath() and splitpath() moved to fileutil.h -// -// 58 1/16/09 8:59 Fseide -// exported fskipspace() -// -// 57 1/15/09 7:38 Fseide -// some magic to unify fgetstring() for char and wchar_t to a single -// template function -// -// 56 1/15/09 7:26 Fseide -// corrected the #include order of basetypes.h -// -// 55 1/14/09 19:26 Fseide -// new functions fsetpos() and fgetpos(); -// new fixed-buffer size overload for fgetstring() and fgettoken() -// -// 54 1/08/09 16:14 Fseide -// fopenOrDie() now supports "-" as the pathname, referring to stdin or -// stdout -// -// 53 1/08/09 15:32 Fseide -// new funtion expand_wildcards() -// -// 52 1/05/09 8:44 Fseide -// (added comments) -// -// 51 11/11/08 6:04p Qiluo -// recover the old fputstring functions -// -// 50 10/31/08 5:09p Qiluo -// remove banned APIs -// -// 49 7/17/08 7:22p V-spwang -// 
undid changes - back to version 47 -// -// 47 6/24/08 19:03 Fseide -// added fgetwstring() and fputstring() for wstrings; -// added templates for freadOrDie() and fwriteOrDie() for STL vectors -// -// 46 6/18/08 11:41 Fseide -// added #pragma once -// -// 45 08-05-29 18:18 Llu -// fix the interface of fputwav -// -// 44 08-05-29 13:54 Llu -// add fputwav revise fgetwav using stl instead of short * -// -// 43 11/27/06 11:40 Fseide -// new methods fgetwfx() and fputwfx() for direct access to simple PCM WAV -// files -// -// 42 10/14/06 18:31 Fseide -// added char* version of fexists() -// -// 41 5/22/06 9:34 Fseide -// (experimental auto_file class checked in) -// -// 40 5/14/06 19:59 Fseide -// new function fsetmode() -// -// 39 3/29/06 15:36 Fseide -// changed to reading entire file instead of line-by-line, not changing -// newlines anymore -// -// 38 2/21/06 12:39p Kit -// Added filesize64 function -// -// 37 1/09/06 7:12p Rogeryu -// wide version of fgetline -// -// 36 12/19/05 21:52 Fseide -// fputfile() added in 8-bit string version -// -// 35 12/15/05 20:25 Fseide -// added getfiletime(), setfiletime(), and fputfile() for strings -// -// 34 9/27/05 12:22 Fseide -// added wstring version of renameOrDie() -// -// 33 9/22/05 12:26 Fseide -// new method fexists() -// -// 32 9/15/05 11:33 Fseide -// new version of fgetline() that avoids buffer allocations, since this -// seems very expensive esp. when reading a file line by line with -// fgetline() -// -// 31 9/05/05 4:57p F-xyzhao -// added #include for #include -- ugh -// -// 30 9/05/05 11:00 Fseide -// new method renameOrDie() -// -// 29 8/24/05 5:45p Kjchen -// merge changes in OneNote -// -// 28 8/19/05 17:56 Fseide -// extended WAVEHEADER with write() and update() -// -// 27 8/13/05 15:37 Fseide -// added new version of fgetline that takes a buffer -// -// 26 7/26/05 18:54 Fseide -// new functions fgetint24() and fputint24() -// -// 25 2/12/05 15:21 Fseide -// fgetdouble() and fputdouble() added -// -// 24 2/05/05 12:38 Fseide -// new methods fputfile(), fgetfile(); -// new overload for filesize() -// -// 23 2/03/05 22:34 Fseide -// added new version of fgetline() that returns an STL string -// -// 22 5/31/04 10:06 Fseide -// new methods fseekOrDie(), ftellOrDie(), unlinkOrDie(), renameOrDie() -// -// 21 3/19/04 4:01p Fseide -// fwriteOrDie(): first argument changed to const -// -// 20 2/27/04 10:04a V-xlshi -// -// 19 2/19/04 3:45p V-xlshi -// fgetraw function is added. 
-// -// 18 2/19/04 1:49p V-xlshi -// -// 17 2/03/04 8:17p V-xlshi -// -// 16 2/03/04 6:20p V-xlshi -// WAVEHEADER.prepare() added -// -// 15 2/03/04 5:58p V-xlshi -// WAVEHEADER structure added -// -// 14 8/15/03 15:40 Fseide -// new method filesize() -// -// 13 8/13/03 21:06 Fseide -// new function fputbyte() -// -// 12 8/13/03 15:37 Fseide -// prototype of fOpenOrDie() Unicode version changed -// -// 11 8/07/03 22:04 Fseide -// fprintfOrDie() now really dies in case of error -// -// 10 03-07-30 12:06 I-rogery -// enable both unicode and non-unicode version -// -// 9 7/25/03 6:07p Fseide -// new functions fgetbyte() and fgetwav() -// -// 8 7/03/02 9:25p Fseide -// fcompareTag() now uses string type for both of its arguments (before, -// it used const char * for one of them) -// -// 7 6/10/02 3:14p Fseide -// new functions fgettoken(), fgetfloat_ascii(), fskipNewline() -// -// 6 6/07/02 7:26p Fseide -// new functions fcheckTag_ascii() and fgetint_ascii() -// -// 5 4/15/02 1:12p Fseide -// void fputstring (FILE * f, const TSTRING & str) and fpad() added -// -// 4 4/03/02 3:58p Fseide -// VSS keyword and copyright added -// -// F. Seide 5 Mar 2002 -// - #pragma once #ifndef _FILEUTIL_ #define _FILEUTIL_ #include +#ifdef _WIN32 +#define isfinite(x) _finite(x) +#define isnan(x) _isnan(x) +#endif +#ifdef __unix__ +#include +#include +#endif #include // for std::find #include #include @@ -695,6 +482,29 @@ namespace msra { namespace files { // simple support for WAV file I/O // ---------------------------------------------------------------------------- +// define the header if we haven't seen it yet +#ifndef _WAVEFORMATEX_ +#define _WAVEFORMATEX_ + +/* + * extended waveform format structure used for all non-PCM formats. this + * structure is common to all non-PCM formats. + */ +typedef unsigned short WORD; // in case not defined yet (i.e. linux) +typedef struct tWAVEFORMATEX +{ + WORD wFormatTag; /* format type */ + WORD nChannels; /* number of channels (i.e. mono, stereo...) 
*/ + DWORD nSamplesPerSec; /* sample rate */ + DWORD nAvgBytesPerSec; /* for buffer estimation */ + WORD nBlockAlign; /* block size of data */ + WORD wBitsPerSample; /* number of bits per sample of mono data */ + WORD cbSize; /* the count in bytes of the size of */ + /* extra information (after cbSize) */ +} WAVEFORMATEX, *PWAVEFORMATEX; + +#endif /* _WAVEFORMATEX_ */ + typedef struct wavehder{ char riffchar[4]; unsigned int RiffLength; diff --git a/Common/fileutil.cpp b/Common/fileutil.cpp index 675c3d3c7..eca7ce6af 100644 --- a/Common/fileutil.cpp +++ b/Common/fileutil.cpp @@ -298,7 +298,7 @@ size_t filesize (const wchar_t * pathname) // filesize64(): determine size of the file in bytes (with pathname) int64_t filesize64 (const wchar_t * pathname) { - __stat64 fileinfo; + struct _stat64 fileinfo; if (_wstat64 (pathname,&fileinfo) == -1) return 0; else @@ -1375,6 +1375,21 @@ vector msra::files::fgetfilelines (const wstring & path, vector & b bool getfiletime (const wstring & path, FILETIME & time) { // return file modification time, false if cannot be determined + struct _stat buf; + int result; + + // Get data associated with "crt_stat.c": + result = _wstat(path.c_str(), &buf); + // Check if statistics are valid: + if( result != 0 ) + { + return false; + } + + (*(time_t*)(&time))= buf.st_mtime; + return true; + +#ifdef OLD WIN32_FIND_DATAW findFileData; auto_handle hFind (FindFirstFileW (path.c_str(), &findFileData), ::FindClose); if (hFind != INVALID_HANDLE_VALUE) @@ -1386,10 +1401,14 @@ bool getfiletime (const wstring & path, FILETIME & time) { return false; } +#endif } void setfiletime (const wstring & path, const FILETIME & time) { // update the file modification time of an existing file +#ifdef LINUX + throw new logic_error("setfiletime has not been converted to linux yet..."); +#else auto_handle h (CreateFileW (path.c_str(), FILE_WRITE_ATTRIBUTES, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL)); @@ -1402,6 +1421,7 @@ void setfiletime (const wstring & path, const FILETIME & time) { RuntimeError ("setfiletime: error setting file time information: %d", GetLastError()); } +#endif } #if 0 diff --git a/MachineLearning/cn/cn.vcxproj b/MachineLearning/cn/cn.vcxproj index ebd270b41..d84aa6994 100644 --- a/MachineLearning/cn/cn.vcxproj +++ b/MachineLearning/cn/cn.vcxproj @@ -124,7 +124,7 @@ true Delayimp.lib;nvml.lib;cudart.lib;cntkMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\ - CNTKMath.dll;nvml.dll + CNTKMath.dll;nvml.dll;nvcuda.dll @@ -186,7 +186,7 @@ true Delayimp.lib;nvml.lib;cudart.lib;cntkMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) true - CNTKMath.dll;nvml.dll + CNTKMath.dll;nvml.dll;nvcuda.dll copy $(SolutionDir)..\Common\PTask\bin\*.dll $(TargetDir) diff --git a/Math/Math/CPUMatrix.cpp b/Math/Math/CPUMatrix.cpp index d76320f66..9a3df0ee8 100644 --- a/Math/Math/CPUMatrix.cpp +++ b/Math/Math/CPUMatrix.cpp @@ -21,13 +21,15 @@ #ifndef LINUX #include +#define Linux(x) #else +#define Linux(x) x #ifndef max #define max(a,b) (((a) > (b)) ? 
(a) : (b)) #endif -#include +#include #endif /* LINUX */ #ifdef LEAKDETECT diff --git a/Math/Math/GPUDummy.cpp b/Math/Math/GPUDummy.cpp new file mode 100644 index 000000000..5528816e5 --- /dev/null +++ b/Math/Math/GPUDummy.cpp @@ -0,0 +1,1667 @@ +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// +#include "GPUMatrix.cuh" +#include "GPUSparseMatrix.cuh" + +namespace Microsoft { namespace MSR { namespace CNTK { + void PrepareDevice(short deviceId); + +#pragma region Constructors and Destructor + + template + GPUSparseMatrix::GPUSparseMatrix() + {} + + template + void GPUSparseMatrix::ZeroInit() + {} + + template + GPUSparseMatrix::GPUSparseMatrix(const GPUMatrix& deepCopy) + {} + + + template + GPUSparseMatrix::GPUSparseMatrix(const GPUSparseMatrix& deepCopy) + {} + + template + GPUSparseMatrix::GPUSparseMatrix(const size_t numRows, const size_t numCols, const size_t nz, ElemType* pArray, + const size_t matrixFlags /*=matrixFormatSparseCSR*/, int deviceId /*=MANAGEDEXTERN*/, const size_t elemSizeAllocated /*=0*/) + {} + + // PrepareDevice - Setup the correct cuda context for an operation + // deviceId - the device on which the operation will take place + // defaults to -1, which means use matrices current device + template + void GPUSparseMatrix::PrepareDevice(short deviceId /*=-1*/) const + {} + + template + void GPUSparseMatrix::DeepCopy(const GPUSparseMatrix& deepCopy) + {} + + template + void GPUSparseMatrix::SetValue(const GPUSparseMatrix& deepCopy) + {} + + template + GPUMatrix GPUSparseMatrix::CopyToDenseMatrix() + { + GPUMatrix res; + return res; + } + + template + void GPUSparseMatrix::SetValue(const GPUMatrix& denseMatrix) + {} + + template + GPUSparseMatrix& GPUSparseMatrix::operator=(const GPUSparseMatrix& deepCopy) + { + return *this; + } + +#ifndef LINUX + template + GPUSparseMatrix::GPUSparseMatrix(GPUSparseMatrix&& moveFrom) + {} + + template + GPUSparseMatrix& GPUSparseMatrix::operator=(GPUSparseMatrix&& moveFrom) + { + return *this; + } +#endif /* LINUX */ + + template + GPUSparseMatrix::~GPUSparseMatrix() + {} + + template + void GPUSparseMatrix::ClearNew() + {} + + + template + void GPUSparseMatrix::Clear() + {} + + //ResizeAs - Resize this sparse matrix to have the same element structure as the passed matrix + // a - sparse matrix whose structure we want to clone + // remark: this was done for element wise operations where the structure will be identical after an operation + template + void GPUSparseMatrix::ResizeAs(const GPUSparseMatrix& a) + {} + + //------------------------------------------------------------------------- + // Start of new GPU Sparse Matrix code + //------------------------------------------------------------------------- + + template + void GPUSparseMatrix::Init() + {} + + template + GPUSparseMatrix::GPUSparseMatrix(const MatrixFormat format, const int deviceId) + {} + + template + ElemType* GPUSparseMatrix::BufferPointer() const + { + return this->m_blockVal; + } + + template + void GPUSparseMatrix::Resize(const size_t numRows, const size_t numCols, int size) + {} + + //Reset matrix so it can be reused + template + void GPUSparseMatrix::Reset() + {} + +#pragma endregion Constructors and Destructor + +#pragma region Static BLAS Functions + + // copy features to GPU matrix + template + void GPUSparseMatrix::SetMatrixFromCSCFormat(size_t *h_row, size_t *h_rowIdx, size_t size, size_t blockSize) + {} + + template + void GPUSparseMatrix::SetMatrixFromLabelAndClass(size_t *h_row, size_t *h_block2Id, size_t *h_block2UniqId, size_t 
labelSize, size_t expandedSize, size_t blockSize) + {} + + // forward pass from feature to hidden layer + template + void GPUSparseMatrix::MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix& lhs, const bool transposeA, + const GPUSparseMatrix& rhs, const bool transposeB, ElemType beta, GPUMatrix& c) + + {} + + // backward pass from hidden layer to feature weight + template + void GPUSparseMatrix::MultiplyAndAdd(ElemType alpha, const GPUMatrix& lhs, const bool transposeA, + const GPUSparseMatrix& rhs, const bool transposeB, GPUSparseMatrix& c) + {} + + // used for gradients update + template + void GPUSparseMatrix::ScaleAndAdd(const ElemType alpha, const GPUSparseMatrix& lhs, GPUMatrix& rhs) + {} + + // a: H x No: H is hidden layer size and No is mini-batch size + // weight: V x H, V is vocab size + // label: V x No + // cls: 2 x Nc, Nc is number of classes, each col is start and end word ids of a class + // idx2cls: V x 1, mapping from word to class id + // etp: V x No, stores predicted values + template + void GPUSparseMatrix::ClassEntropy(const GPUMatrix& a, const GPUMatrix& weight, + const GPUSparseMatrix & label, const GPUMatrix& cls, + const GPUMatrix& idx2cls, GPUSparseMatrix& etp, GPUMatrix& entropyScore) + {} + + template + void GPUSparseMatrix::ClassEntropyError(GPUSparseMatrix& a) + {} + + template + void GPUSparseMatrix::ClassEntropyGradientOfInput(const GPUSparseMatrix& error, const GPUMatrix& weight, GPUMatrix& grd) + {} + + template + void GPUSparseMatrix::ClassEntropyGradientOfWeight(const GPUSparseMatrix& error, const GPUMatrix& input, const GPUSparseMatrix & label, const GPUMatrix& cls, + const GPUMatrix& idx2cls, GPUSparseMatrix& grd) + {} + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceTruncate (const ElemType threshold) + { + return *this; + } + + // normal update for smoothed gradients c and current gradients (this) + template + void GPUSparseMatrix::NormalGrad(GPUMatrix& c, const ElemType momentum) + {} + + //------------------------------------------------------------------------- + // End of new GPU Sparse Matrix code + //------------------------------------------------------------------------- + + template + void GPUSparseMatrix::MultiplyAndWeightedAdd(ElemType alpha, const GPUSparseMatrix& a, const bool transposeA, + const GPUMatrix& b, ElemType beta, GPUMatrix& c) + {} + + + template + void GPUSparseMatrix::Multiply(const GPUSparseMatrix& S, const GPUMatrix& D, GPUMatrix& C) + {} + + template + void GPUSparseMatrix::Multiply(const GPUMatrix& D, const GPUSparseMatrix& S, GPUMatrix& C) + {} + + // ElemCountFromBufferSize - Return the elemCountAllocated for a particular buffersize + // totalBufferSize - total buffer we have to use + // return: size of allocated elements/index slots available + template + size_t GPUSparseMatrix::ElemCountFromBufferSize(size_t totalBufferSize) + { + return 0; + } + + // PrepareBuffer - Get the dimensions start buffer, computes the starting row/column of each value + // m - rows in the source + // n - cols in the source + // canReuseBuffer - target matrix can be reused for temporary space + // func - function to call to count elements in the result (returns count, and fills csrRowPtr array) + template +#ifndef LINUX + void GPUSparseMatrix::PrepareBuffer(size_t m, size_t n, bool canReuseBuffer, std::function func) +#else + void GPUSparseMatrix::PrepareBuffer(size_t m, size_t n, bool canReuseBuffer, size_t (*func)(int *csRowPtrC)) +#endif /* LINUX */ + {} + +#ifdef LINUX + size_t PrepareBufferMultiply(int* csrRowPtrC) + { + 
int nnzTotal = -1; + return nnzTotal; + } +#endif + + // Multiply - multiply one sparse matrix by another sparse matrix + // S1 - first sparse matrix + // transposeS1 - transpose first matrix? + // S2 - second sparse matrix + // transposeS2 - transpose second matrix? + // c - result matrix + // NOTE: if c has enough space allocated, it will be reused, otherwise it will be freed and a new memory block used + template + void GPUSparseMatrix::Multiply(const GPUSparseMatrix& S1, bool transposeS1, const GPUSparseMatrix& S2, bool transposeS2, GPUSparseMatrix &c) + {} + + template + GPUSparseMatrix& GPUSparseMatrix::AssignProductOf(const GPUSparseMatrix& a, const bool transposeA, const GPUSparseMatrix& b, const bool transposeB) + { + return *this; + } + + template + void GPUSparseMatrix::ScaleAndAdd(ElemType alpha,const GPUSparseMatrix& a, ElemType beta, const GPUSparseMatrix& b, GPUSparseMatrix& c) + {} + + template + void GPUSparseMatrix::ScaleAndAdd(ElemType alpha,const GPUSparseMatrix& a, ElemType beta, const GPUMatrix& b, GPUMatrix& c) + {} + + template + void GPUSparseMatrix::ScaleAndAdd(ElemType alpha,const GPUMatrix& a, ElemType beta, const GPUSparseMatrix& b, GPUMatrix& c) + {} + + template + void GPUSparseMatrix::Scale(ElemType alpha, GPUSparseMatrix& a) + {} + + template + void GPUSparseMatrix::ElementWisePower (ElemType alpha, const GPUSparseMatrix& a, GPUSparseMatrix& c) + {} + + template + ElemType GPUSparseMatrix::InnerProductOfMatrices(const GPUSparseMatrix& a, const GPUMatrix& b) + { + return ElemType(0); + } + + template + ElemType GPUSparseMatrix::InnerProductOfMatrices(const GPUMatrix& a, const GPUSparseMatrix& b) + { + return ElemType(0); + } + + template + bool GPUSparseMatrix::AreEqual(const GPUSparseMatrix& a, const GPUSparseMatrix& b, + const ElemType threshold) + { + return false; + } + + template + bool GPUSparseMatrix::AreEqual(const GPUMatrix& a, const GPUSparseMatrix& b, + const ElemType threshold) + { + return false; + } + + template + bool GPUSparseMatrix::AreEqual(const GPUSparseMatrix& a, const GPUMatrix& b, + const ElemType threshold) + { + return false; + } + + template + bool GPUSparseMatrix::IsEqualTo(const GPUSparseMatrix& a, const ElemType threshold) const + { + return false; + } + + template + bool GPUSparseMatrix::IsEqualTo(const GPUMatrix& a, const ElemType threshold) const + { + return false; + } +#pragma endregion Static BLAS Functions + +#pragma region Member BLAS Functions + + template + int GPUSparseMatrix::GetComputeDeviceId() const + { + return -1; + } + + template + GPUMatrix GPUSparseMatrix::ElementProductOf (const GPUSparseMatrix& a, const GPUMatrix& b) + { + GPUMatrix c; + return c; + } + + template + GPUMatrix GPUSparseMatrix::ElementProductOf (const GPUMatrix& a, const GPUSparseMatrix& b) + { + return GPUSparseMatrix::ElementProductOf(b,a); + } + + template + GPUSparseMatrix GPUSparseMatrix::operator+ (const GPUSparseMatrix& a) const + { + return *this; + } + + template + GPUSparseMatrix GPUSparseMatrix::operator- (const GPUSparseMatrix& a) const + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::operator^=(ElemType alpha) + { + return *this; + } + + template + GPUSparseMatrix GPUSparseMatrix::operator^ (ElemType alpha) const + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::operator*=(ElemType alpha) + { + return *this; + } + + template + GPUSparseMatrix GPUSparseMatrix::operator* (ElemType alpha) const + { + return *this; + } + + template + GPUSparseMatrix&
GPUSparseMatrix::AssignElementPowerOf(const GPUSparseMatrix& a, const ElemType power) + { + return *this; + } + + template + GPUSparseMatrix GPUSparseMatrix::Transpose() const + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignTransposeOf(const GPUSparseMatrix& a) + { + return *this; + } + + template + void GPUSparseMatrix::InplaceTranspose() + {} + + template + ElemType GPUSparseMatrix::SumOfAbsElements() const + { + return ElemType(0); + } + + template + ElemType GPUSparseMatrix::SumOfElements() const + { + return ElemType(0); + } + + + template + ElemType GPUSparseMatrix::FrobeniusNorm() const + { + return ElemType(0); + } + + template + ElemType GPUSparseMatrix::MatrixNormInf() const + { + return ElemType(0); + } + + template + ElemType GPUSparseMatrix::MatrixNorm1() const + { + return ElemType(0); + } + +#pragma endregion Member BLAS Functions + +#pragma region Other Functions + + template + GPUSparseMatrix& GPUSparseMatrix::ElementInverse () + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignElementInverseOf (const GPUSparseMatrix& a) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceSigmoid() + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignSigmoidOf (const GPUSparseMatrix& a) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceLinearRectifierDerivative() + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignLinearRectifierDerivativeOf (const GPUSparseMatrix& a) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceTanh() + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignTanhOf (const GPUSparseMatrix& a) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceSqrt() + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignSqrtOf (const GPUSparseMatrix& a) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceExp() + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignExpOf (const GPUSparseMatrix& a) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceLog() + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignLogOf (const GPUSparseMatrix& a) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceAbs() + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignAbsOf (const GPUSparseMatrix& a) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceTruncateBottom (const ElemType threshold) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignTruncateBottomOf (const GPUSparseMatrix& a, const ElemType threshold) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::InplaceTruncateTop (const ElemType threshold) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::AssignTruncateTopOf (const GPUSparseMatrix& a, const ElemType threshold) + { + return *this; + } + + template + GPUSparseMatrix& GPUSparseMatrix::SetToZeroIfAbsLessThan (const ElemType threshold) + { + return *this; + } + template + void GPUSparseMatrix::Unrolling (//GPUSparseMatrix& debugMatrix, + GPUMatrix& UnrolledMatrix, const GPUMatrix& InMatrix, GPUSparseMatrix& UnrollMapping, + const int inputWidth, const int inputHeight, const int inputChannelNum, + const int FltWidth,const int FltHeight, const int FltChannel, + const int 
FltStepW, const int FltStepH) + { + } + +#pragma endregion + +#pragma region Helper Functions + + template + void GPUSparseMatrix::performInplaceFunction(int kind) + {} + + template + void GPUSparseMatrix::SetMatrixFromCSRFormat(int *h_CSRRow, int *h_Col, ElemType *h_Val, size_t nz, size_t numRows, size_t numCols, bool IsOnDevice, int devId) + {} + + // NOTE: we should change this to just use a single buffer, and return pointers into it + template + void GPUSparseMatrix::GetMatrixFromCSRFormat(int*& h_CSRRow, int*& h_Col, ElemType*& h_Val, size_t &nz, size_t &numRows, size_t &numCols) const + {} + +#pragma endregion Helper Functions + + template class GPUSparseMatrix; + template class GPUSparseMatrix; + + template + MATH_API File& operator>>(File& stream, GPUSparseMatrix& us) + { + return stream; + } + + template MATH_API File& operator>>(File& stream, GPUSparseMatrix& us); + template MATH_API File& operator>>(File& stream, GPUSparseMatrix& us); + + template + MATH_API File& operator<<(File& stream, const GPUSparseMatrix& us) + { + return stream; + } + template MATH_API File& operator<<(File& stream, const GPUSparseMatrix& us); + template MATH_API File& operator<<(File& stream, const GPUSparseMatrix& us); + + +#pragma region DeviceBoundNumber class + + template + DeviceBoundNumber::DeviceBoundNumber(const DeviceBoundNumber &deepCopy) + { + NOT_IMPLEMENTED; + } + +#ifndef LINUX + template + DeviceBoundNumber::DeviceBoundNumber(DeviceBoundNumber &&shallowCopy) + { + this->ShallowCopyFrom(shallowCopy.m_data,shallowCopy.m_computeDevice); + shallowCopy.m_data=NULL; + } +#endif + + template + void DeviceBoundNumber::ShallowCopyFrom(ElemType* newVal,int newValsDevceId) + {} + + template + DeviceBoundNumber::~DeviceBoundNumber() + {} + +#pragma endregion DeviceBoundNumber class + +#pragma region Helper functions + + // GetBestGPUDeviceId - Get the best GPU DeviceId, based on cuda information + // TODO: should be replaced by BestGpu class instead, it's much better + template + int GPUMatrix::GetBestGPUDeviceId() //returns -1 if no GPUs can be used + { + return -1; // CPU + } + + // PrepareDevice - Setup the correct cuda context for an operation + // deviceId - the device on which the operation will take place + // defaults to -1, which means use matrices current device + template + void GPUMatrix::PrepareDevice(short deviceId /*=-1*/) const + {} + + template + ElemType* GPUMatrix::CopyToArray() const + { + return NULL; + } + + //memory will be allocated by the callee if not enough but need to be deleted by the caller after it's done + //return number of elements copied + template + size_t GPUMatrix::CopyToArray(ElemType*& arrayCopyTo, size_t& currentArraySize) const + { + return 0; + } + + template + void GPUMatrix::ChangeDeviceTo(int to_id) + {} + + template + void GPUMatrix::performInplaceFunction(int kind) + {} + + +#pragma endregion Helper functions + +#pragma region Constructors and Destructor + + //should only be used by constructors. + template + void GPUMatrix::ZeroInit(int deviceId) + {} + + template + GPUMatrix::GPUMatrix(int deviceId) + {}; + + //matrixName is used to verify that correct matrix is read. 
+ template + GPUMatrix::GPUMatrix(FILE* f, const char * matrixName, int deviceId) + {} + + template + GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols,int deviceId) + {}; + + template + GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols, ElemType *pArray, const size_t matrixFlags, int deviceId) + {}; + + template + GPUMatrix::GPUMatrix(const GPUMatrix& deepCopyFrom) + {} + +#ifndef LINUX + template + GPUMatrix::GPUMatrix(GPUMatrix&& moveFrom) + {} +#endif + + //assignment operator, deep copy + template + GPUMatrix& GPUMatrix::operator=(const GPUMatrix& deepCopyFrom) + { + return *this; + } + +#ifndef LINUX + //move assignment operator, shallow copy + template + GPUMatrix& GPUMatrix::operator=(GPUMatrix&& moveFrom) + { + return *this; + } +#endif /* LINUX */ + + template + GPUMatrix::~GPUMatrix(void) + { + } + + template + void GPUMatrix::Clear() + {} +#pragma endregion Constructors and Destructor + + template + int GPUMatrix::GetComputeDeviceId() const + { + return -1; + } + +#pragma region Basic Operators + template + GPUMatrix GPUMatrix::ColumnSlice(size_t startColumn, size_t numCols) const + { + GPUMatrix slice; + + return slice; + } + + template + GPUMatrix& GPUMatrix::AssignColumnSlice(const GPUMatrix& fromMatrix, size_t startColumn, size_t numCols) + { + return *this; + } + + + //for each column of a, we assign numRows starting from startIndex to this + template + GPUMatrix& GPUMatrix::AssignRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) + { + return *this; + } + + //for each column of a, we add all rows of a to this starting from startIndex + template + GPUMatrix& GPUMatrix::AddToRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) + { + return *this; + } + + template + GPUMatrix GPUMatrix::Transpose() const + { + return *this; + } + + // GetCublasHandle - get a cublas handle for the given GPU, should only need one per GPU + // computeDevice - The compute device for which the cublas handle is desired + // returns: cublas handle + // NOTE: we currently don't bother to ever free the CUBLAS handle, it will be freed automatically by CUDA when the process ends + template + cublasHandle_t GPUMatrix::GetCublasHandle(int computeDevice/*=-1*/) + { + cublasHandle_t cuHandle = 0; + return cuHandle; + } + + template + GPUMatrix& GPUMatrix::AssignTransposeOf (const GPUMatrix& a) + { + return *this; + } + + template + void GPUMatrix::SetValue(const ElemType v) + {} + + template + void GPUMatrix::SetValue(const ElemType* d_v) //d_v is pointer to the the value in GPU memory + {} + + template + void GPUMatrix::SetColumn(const ElemType* colPointer, size_t colInd) + {} + + template + void GPUMatrix::SetValue(const GPUMatrix& deepCopyFrom) + {} + + template + void GPUMatrix::SetValue(const size_t numRows, const size_t numCols, ElemType *pArray, size_t matrixFlags, int deviceId) + {} + + + template + void GPUMatrix::SetDiagonalValue(const ElemType v) + {} + + template + void GPUMatrix::SetDiagonalValue(GPUMatrix& vector) + {} + + template + void GPUMatrix::SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed) + {} + + template + void GPUMatrix::SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed) + {} + + //maskRate: percentage of values masked out (similar to dropout rate) + //scaleValue: which scale value to set to the left ones (unmasked items). 
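+ // A CPU-side sketch of the contract documented above for SetUniformRandomMask
+ // (the real work happens only in the CUDA build; this dummy does nothing):
+ // each element is zeroed with probability maskRate and each surviving element
+ // is multiplied by scaleValue. ApplyUniformRandomMask is a hypothetical helper
+ // shown only to pin down the semantics; it is not part of this patch.
+ #include <random>
+ template <class ElemType>
+ static void ApplyUniformRandomMask(ElemType* v, size_t n, ElemType maskRate, ElemType scaleValue, unsigned long seed)
+ {
+     std::mt19937_64 rng(seed);                          // reproducible for a fixed seed
+     std::uniform_real_distribution<double> u(0.0, 1.0); // one draw per element
+     for (size_t i = 0; i < n; i++)
+         v[i] = (u(rng) < (double)maskRate) ? ElemType(0) : v[i] * scaleValue;
+ }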
+ template + void GPUMatrix::SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed) + {} + + template + void GPUMatrix::Adagrad(GPUMatrix& gradients) + {} + + template + void GPUMatrix::Reshape(const size_t numRows, const size_t numCols) + {} + + template + void GPUMatrix::Resize(const size_t numRows, const size_t numCols, bool growOnly) + {} + + template + size_t GPUMatrix::LocateElement (const size_t row, const size_t col) const + { + return 0; + } + + template + size_t GPUMatrix::LocateColumn (const size_t col) const + { + return 0; + } + + template + ElemType GPUMatrix::Get00Element() const + { + ElemType res=0; + return res; + } +#pragma endregion Basic Operators + +#pragma region Member BLAS Functions + template + GPUMatrix& GPUMatrix::operator+= (ElemType alpha) + { + return *this; + } + + template + GPUMatrix GPUMatrix::operator+ (ElemType alpha) const + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignSumOf(const ElemType alpha, const GPUMatrix& a) + { + return (*this); + } + + + template + GPUMatrix& GPUMatrix::operator+= (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix GPUMatrix::operator+ (const GPUMatrix& a) const + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignSumOf(const GPUMatrix& a, const GPUMatrix& b) + { + return (*this); + } + + template + GPUMatrix& GPUMatrix::operator-= (ElemType alpha) + { + return *this; + } + + template + GPUMatrix GPUMatrix::operator- (ElemType alpha) const + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignDifferenceOf(const ElemType alpha, const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignDifferenceOf(const GPUMatrix& a, const ElemType alpha) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::operator-= (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix GPUMatrix::operator- (const GPUMatrix& a) const + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignDifferenceOf(const GPUMatrix& a, const GPUMatrix& b) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::operator*= (ElemType alpha) + { + return *this; + } + + template + GPUMatrix GPUMatrix::operator* (ElemType alpha) const + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignProductOf(const ElemType alpha, const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignProductOf (const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB) + { + return *this; + } + + template + GPUMatrix GPUMatrix::operator* (const GPUMatrix& a) const + { + return *this; + } + + template + GPUMatrix& GPUMatrix::operator/= (ElemType alpha) + { + return (*this); + } + + template + GPUMatrix GPUMatrix::operator/ (ElemType alpha) const + { + return *this; + } + + //element-wise power + template + GPUMatrix& GPUMatrix::operator^= (ElemType alpha) + { + return *this; + } + + template + GPUMatrix GPUMatrix::operator^ (ElemType alpha) const + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignElementPowerOf(const GPUMatrix& a, const ElemType power) + { + return *this; + } + + + template + GPUMatrix& GPUMatrix::AddElementProductOf (const GPUMatrix& a, const GPUMatrix& b) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::ColumnElementMultiplyWith(const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::RowElementMultiplyWith(const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& 
GPUMatrix::ElementInverse () + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignElementInverseOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceSigmoid() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignSigmoidOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceSigmoidDerivative() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignSigmoidDerivativeOf (const GPUMatrix& a) + { + return *this; + } + + + template + GPUMatrix& GPUMatrix::InplaceTanh() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignTanhOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceSoftmax (const bool isColWise) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignSoftmaxOf (const GPUMatrix& a, const bool isColWise) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceSqrt() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignSqrtOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceExp() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignExpOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceLog() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignLogOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceAbs() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignAbsOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceLinearRectifierDerivative() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignLinearRectifierDerivativeOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceCosine() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignCosineOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceNegativeSine() + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignNegativeSineOf (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceTruncateBottom (const ElemType threshold) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignTruncateBottomOf (const GPUMatrix& a, const ElemType threshold) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::InplaceTruncateTop (const ElemType threshold) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignTruncateTopOf (const GPUMatrix& a, const ElemType threshold) + { + return *this; + } + template + GPUMatrix& GPUMatrix::SetToZeroIfAbsLessThan (const ElemType threshold) + { + return *this; + } + + template + ElemType GPUMatrix::SumOfAbsElements() const + { + return ElemType(0); + } + + template + ElemType GPUMatrix::SumOfElements() const + { + return ElemType(0); + } + + + template + GPUMatrix& GPUMatrix::AssignSumOfElements(const GPUMatrix& a) + { + return (*this); + } + + template + DeviceBoundNumber GPUMatrix::Sum_AsDeviceBoundNum() const + { + DeviceBoundNumber result; + return result; + } + + template + ElemType GPUMatrix::Max() const + { + return ElemType(0); + } + + + template + GPUMatrix& GPUMatrix::ElementMultiplyWith (const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignElementProductOf (const GPUMatrix& a, const GPUMatrix& b) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignElementDivisionOf (const GPUMatrix& a, const 
GPUMatrix& b) + { + return *this; + } + + template + bool GPUMatrix::IsEqualTo(const GPUMatrix& a, const ElemType threshold /*= 1e-8*/) const + { + return AreEqual(*this, a, threshold); + } + + template + void GPUMatrix::VectorNorm1(GPUMatrix& c, const bool isColWise) const + { + } + + template + GPUMatrix& GPUMatrix::AssignVectorNorm1Of(GPUMatrix& a, const bool isColWise) + { + return *this; + } + + template + void GPUMatrix::VectorNorm2(GPUMatrix& c, const bool isColWise) const + {} + + template + GPUMatrix& GPUMatrix::AssignVectorNorm2Of(GPUMatrix& a, const bool isColWise) + { + return *this; + } + + template + void GPUMatrix::VectorNormInf(GPUMatrix& c, const bool isColWise) const + {} + + template + GPUMatrix& GPUMatrix::AssignVectorNormInfOf(GPUMatrix& a, const bool isColWise) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignInnerProductOf(const GPUMatrix& a, const GPUMatrix& b, const bool isColWise) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignKhatriRaoProductOf(const GPUMatrix& a, const GPUMatrix& b) + { + return *this; + } + + //column-wise reshaped product. Used to compute KhatriRaoProduct Gradient + // this = reshape each column of a from (K1xK2,1) to (K1, K2) + // if each column of a is not transposed, each (K1, K2) times each column of b (K2, frames). + // the output is a (K1, frames) matrix + // if each column of a is transposed, each (K1, K2)^T times each column of b (K1, frames) and output is (K2, frames) + template + GPUMatrix& GPUMatrix::AddColumnReshapeProductOf(const GPUMatrix& a, const GPUMatrix& b, const bool transposeAColumn) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AddWithScaleOf(ElemType alpha, const GPUMatrix& a) + { + return *this; + } + + template + ElemType GPUMatrix::FrobeniusNorm() const + { + ElemType h_sum=0; + return (h_sum); + } + + template + GPUMatrix& GPUMatrix::AssignFrobeniusNormOf (const GPUMatrix& a) + { + return *this; + } + + template + ElemType GPUMatrix::MatrixNormInf() const + { + ElemType h_maxAbs=0; + return h_maxAbs; + } + + template + ElemType GPUMatrix::MatrixNorm1() const + { + return ElemType(0); + } + + template + ElemType GPUMatrix::MatrixNorm0() const + { + return ElemType(0); + } + + template + GPUMatrix& GPUMatrix::AssignSignOf(const GPUMatrix& a) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AddSignOf(const GPUMatrix& a) + { + return *this; + } + + template + void GPUMatrix::VectorMax(GPUMatrix& maxIndexes, GPUMatrix& maxValues, const bool isColWise) const + {} + + template + void GPUMatrix::VectorMin(GPUMatrix& minIndexes, GPUMatrix& minValues, const bool isColWise) const + {} + + template + GPUMatrix& GPUMatrix::AssignNumOfDiff(const GPUMatrix& a, const GPUMatrix& b) + { + return *this; + } + +#pragma endregion Member BLAS Functions + +#pragma region Other helper functions + template + void GPUMatrix::Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd) const + {} + + template + void GPUMatrix::Print(const char* matrixName /*=nullptr*/) const + {} + + // file I/O + //matrixName is used to verify that correct matrix is read. + template + void GPUMatrix::ReadFromFile(FILE* f, const char * matrixName) + {} + + //matrixName is used to verify that correct matrix is read.
+ template + void GPUMatrix::WriteToFile(FILE* f, const char * matrixName) + {} + + //helper function used for convolutional neural networks + template + GPUMatrix& GPUMatrix::AssignPackedConvolutionInput(const GPUMatrix& inputSubBatch, + const size_t inputWidth, const size_t inputHeight, const size_t inputChannels, + const size_t outputWidth, const size_t outputHeight, const size_t outputChannels, + const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample, + const bool zeroPadding) + { + return *this; + } + + //helper function used for convolutional neural networks + template + GPUMatrix& GPUMatrix::UnpackConvolutionInput(GPUMatrix& inputSubBatch, + const size_t inputWidth, const size_t inputHeight, const size_t inputChannels, + const size_t outputWidth, const size_t outputHeight, const size_t outputChannels, + const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample, + const bool zeroPadding) const + { + GPUMatrix mat; + return mat; + } + + template + GPUMatrix& GPUMatrix::AssignMaxPoolingResult(const GPUMatrix& inputBatch, const size_t channels, + const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, + const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, + const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AddMaxPoolingGradient(const GPUMatrix& outputGradientBatch, const GPUMatrix& inputBatch, const GPUMatrix& outputBatch, + const size_t channels, + const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, + const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, + const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AssignAveragePoolingResult(const GPUMatrix& inputBatch, const size_t channels, + const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, + const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, + const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + return *this; + } + + template + GPUMatrix& GPUMatrix::AddAveragePoolingGradient(const GPUMatrix& outputGradientBatch, + const size_t channels, + const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, + const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, + const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + return *this; + } + +#pragma endregion Other helper functions + +#pragma region Static BLAS Functions + template + void GPUMatrix::MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, + ElemType beta, GPUMatrix& c) + { + } + + template + void GPUMatrix::MultiplyAndAdd(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, GPUMatrix& c) + {} + + template + void GPUMatrix::Multiply(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, GPUMatrix& c) + {} + + template + void GPUMatrix::Multiply(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + 
{} + + /// Matrix-scalar multiply with col-major matrices: c = alpha * a + c + /// if a is a column vector, add to all columns of c + /// if a is a row vector, add to all rows of c + /// if a is a scalar, add to all elements of c + /// Scalar + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::ScaleAndAdd(ElemType alpha,const GPUMatrix& a, GPUMatrix& c) + {} + + /// c += alpha * (a-b) + /// a, b, and c must have the same dimensions + /// Scalar + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::AddScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + {} + + /// c = alpha * (a-b) + /// a, b, and c must have the same dimensions + /// Scalar + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::AssignScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + {} + + /// c += alpha * (a-b) + /// a, b, and c must have the same dimensions + /// 1X1 matrix + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::AddScaledDifference(const GPUMatrix& alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + {} + + /// c = alpha * (a-b) + /// a, b, and c must have the same dimensions + /// Scalar + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void GPUMatrix::AssignScaledDifference(const GPUMatrix& alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) + {} + + //c[ci,cj] += a[ai,aj] + template + void GPUMatrix::AddElementToElement(const GPUMatrix& a, const size_t ai, const size_t aj, GPUMatrix& c, const size_t ci, const size_t cj) + {} + + template + void GPUMatrix::Scale(ElemType alpha, GPUMatrix& a) + {} + + + template + void GPUMatrix::Scale(GPUMatrix& alpha, GPUMatrix& a) + {} + + template //c = alpha * a + void GPUMatrix::Scale(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) + {} + + + template + void GPUMatrix::InnerProduct (const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c, const bool isColWise) + {} + + template + ElemType GPUMatrix::InnerProductOfMatrices(const GPUMatrix& a, const GPUMatrix& b) + { + return ElemType(0); + } + + + template + GPUMatrix& GPUMatrix::AssignInnerProductOfMatrices(const GPUMatrix& a, const GPUMatrix& b) + { + return *this; + } + + + template + void GPUMatrix::ElementWisePower(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) + {} + + template + bool GPUMatrix::AreEqual(const GPUMatrix& a, const GPUMatrix& b, const ElemType threshold /*= 1e-8*/) + { + return false; + } + + template + GPUMatrix GPUMatrix::Ones(const size_t rows, const size_t cols) + { + GPUMatrix mat; + return mat; + } + + template + GPUMatrix GPUMatrix::Zeros(const size_t rows, const size_t cols) + { + GPUMatrix mat; + return mat; + } + + template + GPUMatrix GPUMatrix::Eye(const size_t rows) + { + GPUMatrix mat; + return mat; + } + + template + GPUMatrix GPUMatrix::RandomUniform(const size_t rows, const size_t cols, const ElemType low, const ElemType high, unsigned long seed) + { + GPUMatrix mat; + return mat; + } + + template + GPUMatrix GPUMatrix::RandomGaussian(const size_t rows, const size_t cols, const ElemType mean, const ElemType sigma, unsigned long seed) + { + GPUMatrix mat; + return mat; + } + + template + ElemType GPUMatrix::GetLearnRateForBlock_Helper(const GPUMatrix &Gradients, const
GPUMatrix &SmoothedGradients) + { + return ElemType(0); + } + +#pragma endregion Static BLAS Functions + + template class GPUMatrix; + template class GPUMatrix; + template class DeviceBoundNumber; + template class DeviceBoundNumber; + + template + cublasHandle_t GPUMatrix::s_cuHandle[GPUMatrix::MaxGpus]={0}; + + template + void* GPUMatrix::s_curandGenerator=NULL; +}}} + +// define a dummy GPUWatcher class too +#include "GPUWatcher.cuh" + +int GPUWatcher::GetGPUIdWithTheMostFreeMemory() +{ + return 0; +} + + +size_t GPUWatcher::GetFreeMemoryOnCUDADevice(int devId) +{ + return 0; +} + +GPUWatcher::GPUWatcher(void) +{ +} + +GPUWatcher::~GPUWatcher(void) +{ +} + + + + + diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu index 9d690f0e4..692b059a3 100644 --- a/Math/Math/GPUMatrix.cu +++ b/Math/Math/GPUMatrix.cu @@ -324,22 +324,22 @@ namespace Microsoft { namespace MSR { namespace CNTK { switch (kind) { case 0: - _inplaceSigmoidOnCuda<<>>(this->m_pArray,N); + _inplaceSigmoidOnCuda<<>>(this->m_pArray, N); break; case 1: - _inplaceTanhOnCuda<<>>(this->m_pArray,N); + _inplaceTanhOnCuda<<>>(this->m_pArray, N); break; case 2: - _inplaceSqrtOnCuda<<>>(this->m_pArray,N); + _inplaceSqrtOnCuda<<>>(this->m_pArray, N); break; case 3: - _inplaceExpOnCuda<<>>(this->m_pArray,N); + _inplaceExpOnCuda<<>>(this->m_pArray,N); break; case 4: - _inplaceLogOnCuda<<>>(this->m_pArray,N); + _inplaceLogOnCuda<<>>(this->m_pArray,N); break; case 5: - _inplaceAbsOnCuda<<>>(this->m_pArray,N); + _inplaceAbsOnCuda<<>>(this->m_pArray,N); break; case 6: _inplaceLinRectDerivative<<>>(this->m_pArray,N); @@ -1205,7 +1205,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock); // cudaEvent_t done = nullptr; // if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - // _addValue<<>>(m_pArray,a.m_pArray,N); + // _addValue<<>>(m_pArray,a.m_pArray,N); // if (do_sync) CUDA_CALL(cudaEventRecord(done)); // if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); // if (do_sync) CUDA_CALL(cudaEventDestroy(done)); @@ -1458,7 +1458,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { a.PrepareDevice(); cudaEvent_t done = nullptr; if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _addElementProductOf<<>>(this->m_pArray,a.m_pArray,b.m_pArray,N); + _addElementProductOf<<>>(this->m_pArray,a.m_pArray,b.m_pArray,N); if (do_sync) CUDA_CALL(cudaEventRecord(done)); if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); if (do_sync) CUDA_CALL(cudaEventDestroy(done)); @@ -1480,7 +1480,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { a.PrepareDevice(); cudaEvent_t done = nullptr; if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _columnElementMultiplyWith<<>>(this->m_pArray,a.m_pArray,N,M); + _columnElementMultiplyWith<<>>(this->m_pArray,a.m_pArray,N,M); if (do_sync) CUDA_CALL(cudaEventRecord(done)); if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); if (do_sync) CUDA_CALL(cudaEventDestroy(done)); @@ -1503,7 +1503,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { a.PrepareDevice(); cudaEvent_t done = nullptr; if (do_sync) CUDA_CALL(cudaEventCreate(&done)); - _rowElementMultiplyWith<<>>(this->m_pArray,a.m_pArray,N,M); + _rowElementMultiplyWith<<>>(this->m_pArray,a.m_pArray,N,M); if (do_sync) CUDA_CALL(cudaEventRecord(done)); if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); if (do_sync) CUDA_CALL(cudaEventDestroy(done)); @@ -1568,7 +1568,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { PrepareDevice(); cudaEvent_t done = nullptr; if (do_sync) 
diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu
index 9d690f0e4..692b059a3 100644
--- a/Math/Math/GPUMatrix.cu
+++ b/Math/Math/GPUMatrix.cu
@@ -324,22 +324,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         switch (kind)
         {
         case 0:
-            _inplaceSigmoidOnCuda<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
+            _inplaceSigmoidOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, N);
             break;
         case 1:
-            _inplaceTanhOnCuda<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
+            _inplaceTanhOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, N);
             break;
         case 2:
-            _inplaceSqrtOnCuda<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
+            _inplaceSqrtOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, N);
             break;
         case 3:
-            _inplaceExpOnCuda<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
+            _inplaceExpOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
             break;
         case 4:
-            _inplaceLogOnCuda<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
+            _inplaceLogOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
             break;
         case 5:
-            _inplaceAbsOnCuda<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
+            _inplaceAbsOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
             break;
         case 6:
             _inplaceLinRectDerivative<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
@@ -1205,7 +1205,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     //    int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);
     //    cudaEvent_t done = nullptr;
     //    if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-    //    _addValue<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,N);
+    //    _addValue<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(m_pArray,a.m_pArray,N);
     //    if (do_sync) CUDA_CALL(cudaEventRecord(done));
     //    if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
     //    if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1458,7 +1458,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _addElementProductOf<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
+        _addElementProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1480,7 +1480,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _columnElementMultiplyWith<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N,M);
+        _columnElementMultiplyWith<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N,M);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1503,7 +1503,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _rowElementMultiplyWith<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N,M);
+        _rowElementMultiplyWith<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N,M);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1568,7 +1568,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _elemInverse<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
+        _elemInverse<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1825,7 +1825,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _inplaceTruncateBottom<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
+        _inplaceTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1848,7 +1848,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignTruncateBottom<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);
+        _assignTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1866,7 +1866,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _inplaceTruncateTop<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
+        _inplaceTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1889,7 +1889,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignTruncateTop<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);
+        _assignTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,threshold,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1906,7 +1906,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _setToZeroIfAbsLessThan<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
+        _setToZeroIfAbsLessThan<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,threshold,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -1964,7 +1964,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
         //WARNING: THIS kernel is not the most efficient way!
-        _reductionSumAndAssign<<<1,1024>>>(this->m_pArray,a.m_pArray,(LONG64)a.GetNumElements(),(LONG64)this->GetNumElements());
+        _reductionSumAndAssign<ElemType><<<1,1024>>>(this->m_pArray,a.m_pArray,(LONG64)a.GetNumElements(),(LONG64)this->GetNumElements());
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2026,7 +2026,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _elemMul<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N);
+        _elemMul<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2049,7 +2049,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignElementProductOf<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
+        _assignElementProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2078,7 +2078,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignElementDivisionOf<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
+        _assignElementDivisionOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2119,7 +2119,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
 
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _vectorNorm1<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);
+        _vectorNorm1<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2160,7 +2160,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
 
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _vectorNorm2<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);
+        _vectorNorm2<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, this->m_pArray,n,m,isColWise);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2219,7 +2219,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignKhatriRaoProductOf<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,rowsA, rowsB, cols);
+        _assignKhatriRaoProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray,rowsA, rowsB, cols);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2257,7 +2257,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _addColumnReshapeProductOf<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray, rowsB, rowsC, cols, transposeAColumn);
+        _addColumnReshapeProductOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray,a.m_pArray,b.m_pArray, rowsB, rowsC, cols, transposeAColumn);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2360,7 +2360,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         cudaEvent_t done = nullptr;
         int blocksPerGrid=(int)ceil(1.0*this->GetNumElements()/threadsPerBlock);
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignSignOf<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (long)this->GetNumElements());
+        _assignSignOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (long)this->GetNumElements());
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2380,7 +2380,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         cudaEvent_t done = nullptr;
         int blocksPerGrid=(int)ceil(1.0*this->GetNumElements()/threadsPerBlock);
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _addSignOf<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (LONG64)this->GetNumElements());
+        _addSignOf<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(this->m_pArray, a.m_pArray, (LONG64)this->GetNumElements());
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2473,8 +2473,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         cudaEvent_t done = nullptr;
         //int blocksPerGrid=(int)ceil(1.0*a.GetNumElements()/threadsPerBlock);
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        //_assignNumOfDiff<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, a.GetNumElements());
-        _assignNumOfDiff<<<1,1024,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, (LONG64)a.GetNumElements());
+        //_assignNumOfDiff<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, a.GetNumElements());
+        _assignNumOfDiff<ElemType><<<1,1024,0,t_stream>>>(a.m_pArray, b.m_pArray, this->m_pArray, (LONG64)a.GetNumElements());
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2926,7 +2926,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         LONG64 n=(LONG64)a.GetNumElements();
         int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _addScaledDifference<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);
+        _addScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2967,7 +2967,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         LONG64 n=(LONG64)a.GetNumElements();
         int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignScaledDifference<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);
+        _assignScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha, a.m_pArray, b.m_pArray, c.m_pArray, n);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -3011,7 +3011,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         LONG64 n=(LONG64)a.GetNumElements();
         int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _addScaledDifference<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);
+        _addScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -3055,7 +3055,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         LONG64 n=(LONG64)a.GetNumElements();
         int blocksPerGrid=(int)ceil(1.0*n/threadsPerBlock);
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _assignScaledDifference<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);
+        _assignScaledDifference<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha.m_pArray, a.m_pArray, b.m_pArray, c.m_pArray, n);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -3074,7 +3074,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         cudaEvent_t done = nullptr;
         int blocksPerGrid=1; //only one element
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _addElementToElement<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, (LONG64)a.LocateElement(ai, aj), c.m_pArray, (LONG64)c.LocateElement(ci, cj));
+        _addElementToElement<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(a.m_pArray, (LONG64)a.LocateElement(ai, aj), c.m_pArray, (LONG64)c.LocateElement(ci, cj));
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -3195,7 +3195,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
 
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _innerProduct<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, a.m_pArray,b.m_pArray,m,n,isColWise);
+        _innerProduct<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(c.m_pArray, a.m_pArray,b.m_pArray,m,n,isColWise);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -3288,7 +3288,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
         LONG64 N=(LONG64)a.GetNumElements();
         int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);
-        _elementWisePowerOnCuda<<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha,a.m_pArray,c.m_pArray,N);
+        _elementWisePowerOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock,0,t_stream>>>(alpha,a.m_pArray,c.m_pArray,N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
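Every hunk in this file touches the same launch-and-synchronize idiom, so it is worth seeing once in isolation. The sketch below distills it under stated assumptions: _scaleKernel, CHECK_CUDA and the 1024-thread block size are illustrative stand-ins for the file's kernels, its CUDA_CALL macro and its threadsPerBlock constant.

    // Distilled form of the recurring pattern above: size the grid so every
    // element gets a thread, launch on t_stream, then optionally block the
    // host on a CUDA event until the kernel finishes.
    #include <cuda_runtime.h>
    #include <math.h>

    #define CHECK_CUDA(expr) do { cudaError_t e_ = (expr); if (e_ != cudaSuccess) throw e_; } while (0)
    static const int threadsPerBlock = 1024;   // illustrative block size

    template <class ElemType>
    __global__ void _scaleKernel(ElemType* p, ElemType alpha, long N)
    {
        long id = blockDim.x * blockIdx.x + threadIdx.x;
        if (id < N)
            p[id] *= alpha;                    // one element per thread
    }

    template <class ElemType>
    void ScaleOnDevice(ElemType* d_p, ElemType alpha, long N, cudaStream_t t_stream, bool do_sync)
    {
        int blocksPerGrid = (int)ceil(1.0 * N / threadsPerBlock);   // round up to cover all N elements
        cudaEvent_t done = nullptr;
        if (do_sync) CHECK_CUDA(cudaEventCreate(&done));
        _scaleKernel<ElemType><<<blocksPerGrid, threadsPerBlock, 0, t_stream>>>(d_p, alpha, N);
        if (do_sync) CHECK_CUDA(cudaEventRecord(done));        // enqueue a marker after the kernel
        if (do_sync) CHECK_CUDA(cudaEventSynchronize(done));   // host waits for the marker
        if (do_sync) CHECK_CUDA(cudaEventDestroy(done));
    }

With do_sync off, the launch is fully asynchronous on t_stream; the event dance exists purely so callers can opt into blocking semantics without a device-wide cudaDeviceSynchronize.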
diff --git a/Math/Math/GPUMatrixCUDAKernels.cu b/Math/Math/GPUMatrixCUDAKernels.cu
index 55af99884..7a116df3a 100644
--- a/Math/Math/GPUMatrixCUDAKernels.cu
+++ b/Math/Math/GPUMatrixCUDAKernels.cu
@@ -3,6 +3,7 @@ //
 // Copyright (c) Microsoft Corporation. All rights reserved.
 //
 //
+#ifndef CPUONLY
 #include <float.h>
 #include <cuda_runtime.h>
 #include "CommonMatrix.h"
@@ -3235,3 +3236,5 @@ d_tmp[0] = max((ElemType)0, d_tmp[0]/max((ElemType)1.0e-10,sqrt(d_tmp[1]))/max((ElemType)1.0e-10,sqrt(d_tmp[2])));
     }
 }
 */
+
+#endif /*!CPUONLY*/
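Note the guard spelling: the project define is CPUONLY, so the guard here must use the same token (the originally submitted CPU_ONLY would never be defined and the kernels would still be compiled). The sparse-matrix hunks below repeatedly exercise threshold kernels such as _inplaceTruncateBottom; the sketch shows the per-element semantics those names plausibly imply, inferred from the call sites rather than taken from GPUMatrixCUDAKernels.cu itself:

    // Plausible per-element behavior of _inplaceTruncateBottom, inferred from
    // its name and its (array, threshold, N) call sites; illustration only.
    template <class ElemType>
    __global__ void _inplaceTruncateBottomSketch(ElemType* a, const ElemType threshold, const long N)
    {
        long id = blockDim.x * blockIdx.x + threadIdx.x;
        if (id >= N)
            return;
        if (a[id] < threshold)   // clamp from below; a TruncateTop kernel would clamp from above
            a[id] = threshold;
    }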
diff --git a/Math/Math/GPUSparseMatrix.cu b/Math/Math/GPUSparseMatrix.cu
index e0e908292..1882798dc 100644
--- a/Math/Math/GPUSparseMatrix.cu
+++ b/Math/Math/GPUSparseMatrix.cu
@@ -909,7 +909,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
         cudaEvent_t done = nullptr;
         CUDACALL(cudaEventCreate(&done));
-        _inplaceTruncate<<<blocksPerGrid,threadsPerBlock>>>(this->m_blockVal,threshold,N);
+        _inplaceTruncate<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_blockVal,threshold,N);
         CUDACALL(cudaEventRecord(done));
         CUDACALL(cudaEventSynchronize(done));
         CUDACALL(cudaEventDestroy(done));
@@ -1310,7 +1310,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         a.PrepareDevice();
         long N=(long)a.GetNZElements();
         int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);
-        _elementWisePowerOnCuda<<<blocksPerGrid,threadsPerBlock>>>(alpha,a.NzLocation(),c.NzLocation(),N);
+        _elementWisePowerOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(alpha,a.NzLocation(),c.NzLocation(),N);
         CUDACALL(cudaEventRecord(done));
         CUDACALL(cudaEventSynchronize(done));
     }
@@ -1360,7 +1360,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         //int* h_vectArray= new int[a.m_nz];
         int blocksPerGrid =(int)ceil(1.0*M/threadsPerBlock);
         CUDACALL(cudaEventCreate(&done));
-        _getSparseVectorRepresntationForMatrix<<<blocksPerGrid,threadsPerBlock>>>(cscColPtrA,cscRowIndA,vectArray,M,N);
+        _getSparseVectorRepresntationForMatrix<ElemType><<<blocksPerGrid,threadsPerBlock>>>(cscColPtrA,cscRowIndA,vectArray,M,N);
         CUDACALL(cudaEventRecord(done));
         CUDACALL(cudaEventSynchronize(done));
         CUDACALL(cudaEventDestroy(done));
@@ -1411,7 +1411,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         CUDACALL(cudaMemcpy(d_res,res,sizeof(long)*3,cudaMemcpyHostToDevice));
 
         int blocksPerGrid =(int)ceil(1.0*a.GetNZElements()/threadsPerBlock);
-        _areEqual<<<blocksPerGrid,threadsPerBlock>>>(a.NzLocation(),b.NzLocation(),(long)a.GetNZElements(),threshold,d_res);
+        _areEqual<ElemType><<<blocksPerGrid,threadsPerBlock>>>(a.NzLocation(),b.NzLocation(),(long)a.GetNZElements(),threshold,d_res);
         _areEqual<<<blocksPerGrid,threadsPerBlock>>>(a.ColLocation(),b.ColLocation(),(long)a.GetNZElements(),(int)threshold,d_res+1);
         blocksPerGrid =(int)ceil((1.0*a.GetNumRows()+1.0)/threadsPerBlock);
         _areEqual<<<blocksPerGrid,threadsPerBlock>>>(a.RowLocation(),b.RowLocation(),(long)a.GetNumRows()+1,(int)threshold,d_res+2);
@@ -1719,7 +1719,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int blocksPerGrid =(int)ceil(1.0*N/threadsPerBlock);
         cudaEvent_t done = nullptr;
         CUDACALL(cudaEventCreate(&done));
-        _elemInverse<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
+        _elemInverse<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
         CUDACALL(cudaEventRecord(done));
         CUDACALL(cudaEventSynchronize(done));
         return *this;
@@ -1846,7 +1846,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
         cudaEvent_t done = nullptr;
         CUDACALL(cudaEventCreate(&done));
-        _inplaceTruncateBottom<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
+        _inplaceTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
         CUDACALL(cudaEventRecord(done));
         CUDACALL(cudaEventSynchronize(done));
         return *this;
@@ -1867,7 +1867,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
         cudaEvent_t done = nullptr;
         CUDACALL(cudaEventCreate(&done));
-        _assignTruncateBottom<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);
+        _assignTruncateBottom<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);
         CUDACALL(cudaEventRecord(done));
         CUDACALL(cudaEventSynchronize(done));
         return *this;
@@ -1882,7 +1882,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
         cudaEvent_t done = nullptr;
         CUDACALL(cudaEventCreate(&done));
-        _inplaceTruncateTop<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
+        _inplaceTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
         CUDACALL(cudaEventRecord(done));
         CUDACALL(cudaEventSynchronize(done));
         return *this;
@@ -1903,7 +1903,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
         cudaEvent_t done = nullptr;
         CUDACALL(cudaEventCreate(&done));
-        _assignTruncateTop<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);
+        _assignTruncateTop<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,a.NzLocation(),threshold,N);
         CUDACALL(cudaEventRecord(done));
         CUDACALL(cudaEventSynchronize(done));
         return *this;
@@ -1918,7 +1918,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int blocksPerGrid =(int)ceil(N*1.0/threadsPerBlock);
         cudaEvent_t done = nullptr;
         CUDACALL(cudaEventCreate(&done));
-        _setToZeroIfAbsLessThan<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
+        _setToZeroIfAbsLessThan<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,threshold,N);
         CUDACALL(cudaEventRecord(done));
         CUDACALL(cudaEventSynchronize(done));
         return *this;
@@ -2012,22 +2012,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         switch (kind)
         {
         case 0:
-            _inplaceSigmoidOnCuda<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
+            _inplaceSigmoidOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
             break;
         case 1:
-            _inplaceTanhOnCuda<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
+            _inplaceTanhOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
             break;
         case 2:
-            _inplaceSqrtOnCuda<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
+            _inplaceSqrtOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
             break;
         case 3:
-            _inplaceExpOnCuda<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
+            _inplaceExpOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
             break;
         case 4:
-            _inplaceLogOnCuda<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
+            _inplaceLogOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
             break;
         case 5:
-            _inplaceAbsOnCuda<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
+            _inplaceAbsOnCuda<ElemType><<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
             break;
         case 6:
             _inplaceLinRectDerivative<<<blocksPerGrid,threadsPerBlock>>>(this->m_pArray,N);
diff --git a/Math/Math/Math.vcxproj.filters b/Math/Math/Math.vcxproj.filters
new file mode 100644
index 000000000..4846433c6
--- /dev/null
+++ b/Math/Math/Math.vcxproj.filters
@@ -0,0 +1,77 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="Source Files">
+      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+    </Filter>
+    <Filter Include="Header Files">
+      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+      <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
+    </Filter>
+    <Filter Include="Resource Files">
+      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <!-- seven ClInclude items, each filtered under "Header Files" -->
+  </ItemGroup>
+  <ItemGroup>
+    <!-- eight ClCompile items, each filtered under "Source Files" -->
+  </ItemGroup>
+  <ItemGroup>
+    <!-- three further items, each filtered under "Header Files" -->
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp
index cdbd2526d..eecc95d96 100644
--- a/Math/Math/Matrix.cpp
+++ b/Math/Math/Matrix.cpp
@@ -288,15 +288,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     {
         if (m_preferredDeviceId == CPUDEVICE)
         {
-            m_CPUMatrix = new CPUMatrix<ElemType>(numRows,numCols);
+            m_CPUMatrix = new CPUMatrix<ElemType>(numRows,numCols);
             SetDataLocation(CPU, DENSE);
-            }
-            else
-            {
-                m_GPUMatrix = new GPUMatrix<ElemType>(numRows,numCols,m_preferredDeviceId);
-                SetDataLocation(GPU, DENSE);
-            }
-        }
+        }
+        else
+        {
+            m_GPUMatrix = new GPUMatrix<ElemType>(numRows,numCols,m_preferredDeviceId);
+            SetDataLocation(GPU, DENSE);
+        }
+    }
 }
 
 template<class ElemType>
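The hunk above re-indents the central dispatch rule of the Matrix facade: a preferred device id of CPUDEVICE allocates a CPUMatrix, anything else a GPUMatrix bound to that device. A standalone sketch of that rule; CpuImpl, GpuImpl and MatrixFacade are hypothetical stand-ins, and -1 mirrors the CPUDEVICE sentinel from CommonMatrix.h (an assumption about its value):

    // Standalone sketch of the CPU/GPU dispatch shown in the hunk above.
    #include <cstddef>
    #include <memory>

    struct CpuImpl { CpuImpl(std::size_t, std::size_t) {} };
    struct GpuImpl { GpuImpl(std::size_t, std::size_t, int /*deviceId*/) {} };

    struct MatrixFacade
    {
        std::unique_ptr<CpuImpl> cpu;   // exactly one of the two is allocated
        std::unique_ptr<GpuImpl> gpu;

        MatrixFacade(std::size_t rows, std::size_t cols, int preferredDeviceId)
        {
            if (preferredDeviceId == -1)            // CPUDEVICE: stay on the host
                cpu.reset(new CpuImpl(rows, cols));
            else                                    // any non-negative id names a GPU
                gpu.reset(new GpuImpl(rows, cols, preferredDeviceId));
        }
    };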
@@ -840,11 +840,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             m_CPUMatrix->SetValue(*db_number.ExposePointer2Value()),
             if (GetDeviceId()!=db_number.GetDeviceId())
             {
-#ifndef LINUX
-                throw std::exception("Matrix and device bound number must be on the same device");
-#else
-                throw std::exception();
-#endif /* LINUX */
+                throw std::runtime_error("Matrix and device bound number must be on the same device");
             }
             m_GPUMatrix->SetValue(db_number.ExposePointer2Value()),
             NOT_IMPLEMENTED,