From a863c7746fc0364eab82212902a8ef47bd5b6895 Mon Sep 17 00:00:00 2001
From: Malcolm Slaney <mslaney@microsoft.com>
Date: Thu, 9 Oct 2014 10:44:27 -0700
Subject: [PATCH] More changes to get it compiling.. but not finished.

---
 Common/Include/basetypes.h        | 11 ++++++++--
 Math/Math/CommonMatrix.h          |  1 +
 Math/Math/GPUMatrix.cu            |  6 +++---
 Math/Math/GPUMatrixCUDAKernels.cu |  7 +++++++
 Math/Math/GPUSparseMatrix.cu      | 35 ++++++++++++++++++++++++++++---
 Math/Math/GPUSparseMatrix.cuh     |  4 +++-
 Math/Math/Makefile                |  9 ++++----
 Math/Math/Matrix.cpp              |  4 ++--
 Math/Math/Matrix.h                |  4 ++++
 9 files changed, 66 insertions(+), 15 deletions(-)
diff --git a/Common/Include/basetypes.h b/Common/Include/basetypes.h
index 7b424502c..d3145570a 100644
--- a/Common/Include/basetypes.h
+++ b/Common/Include/basetypes.h
@@ -11,6 +11,8 @@
 typedef char16_t TCHAR;
 #include <stdarg.h>
 #define	vsprintf_s vsprintf		/* Not sure this is right... Malcolm */
+#include <chrono>
+#include <thread>
 #endif	 /* LINUX */
 
 #ifndef UNDER_CE    // fixed-buffer overloads not available for wince
@@ -108,6 +110,7 @@ using namespace std;
 #define __inout_cap(x)
 #define __inout_cap_c(x)
 #endif
+#endif	// LINUX 
 #ifndef __out_z_cap    // non-VS2005 annotations
 #define __out_cap(x)
 #define __out_z_cap(x)
@@ -321,7 +324,6 @@ public:
 #endif
 };
 
-#ifndef	LINUX
 
 // locks a critical section, and unlocks it automatically
 // when the lock goes out of scope
@@ -447,7 +449,11 @@ public:
 #include <xlocale>      // uses strlen()
 #endif
 #define strlen strlen_
+#ifndef	LINUX
 template<typename _T> inline __declspec(deprecated("Dummy general template, cannot be used directly")) 
+#else
+template<typename _T> inline 
+#endif	// LINUX
 size_t strlen_(_T &s) { return strnlen_s(static_cast<const char *>(s), SIZE_MAX); } // never be called but needed to keep compiler happy
 template<typename _T> inline size_t strlen_(const _T &s)     { return strnlen_s(static_cast<const char *>(s), SIZE_MAX); }
 template<> inline size_t strlen_(char * &s)                  { return strnlen_s(s, SIZE_MAX); }
@@ -980,7 +986,8 @@ template<typename FUNCTION> static void attempt (int retries, const FUNCTION & b
 #ifndef	LINUX
             ::Sleep (1000); // wait a little, then try again
 #else
-            sleep(1);
+            std::chrono::milliseconds dura(1000);
+            std::this_thread::sleep_for(dura);
 #endif	/* LINUX */
         }
     }
diff --git a/Math/Math/CommonMatrix.h b/Math/Math/CommonMatrix.h
index f0dd57d46..da4a501a4 100644
--- a/Math/Math/CommonMatrix.h
+++ b/Math/Math/CommonMatrix.h
@@ -10,6 +10,7 @@
 
 #ifdef	LINUX
 #define	wcsnlen_s	wcsnlen			/* Not sure if this is best replacement... Malcolm */
+// typedef	char wchar_t;
 #endif	
 
 #define AUTOPLACEMATRIX 1000 // used in parameters only
diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu
index 7374aee54..9d690f0e4 100644
--- a/Math/Math/GPUMatrix.cu
+++ b/Math/Math/GPUMatrix.cu
@@ -424,7 +424,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     {
         ZeroInit(deepCopyFrom.m_computeDevice);
         SetValue(deepCopyFrom);
-        SetMatrixName(deepCopyFrom.m_matrixName);       
+        this->SetMatrixName(deepCopyFrom.m_matrixName);       
     }
 
 #ifndef	LINUX
@@ -452,7 +452,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         if (this != &deepCopyFrom)
         {
             SetValue(deepCopyFrom);
-            SetMatrixName(deepCopyFrom.m_matrixName);       
+            this->SetMatrixName(deepCopyFrom.m_matrixName);       
         }
         return *this;
     }
@@ -464,7 +464,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     {
         if (this != &moveFrom)
         {
-            if (OwnBuffer() && this->m_pArray!=NULL)
+            if (this->OwnBuffer() && this->m_pArray!=NULL)
             {
                 CUDA_CALL(cudaFree(this->m_pArray));  
             }
diff --git a/Math/Math/GPUMatrixCUDAKernels.cu b/Math/Math/GPUMatrixCUDAKernels.cu
index 6c035c2db..55af99884 100644
--- a/Math/Math/GPUMatrixCUDAKernels.cu
+++ b/Math/Math/GPUMatrixCUDAKernels.cu
@@ -18,6 +18,13 @@
 #define MINLOGEXP -9.2103
 #define LSMALL -0.5E10
 
+// Predefine this for later.
+#ifndef LINUX
+static __inline__ __device__ double atomicAdd(double* address, double val);
+#else
+static  __device__ double atomicAdd(double* address, double val);
+#endif
+
 //CUDA Kernels code
 template<class ElemType>
 __global__ void _elementWisePowerOnCuda(
diff --git a/Math/Math/GPUSparseMatrix.cu b/Math/Math/GPUSparseMatrix.cu
index 31bf6bb8f..e0e908292 100644
--- a/Math/Math/GPUSparseMatrix.cu
+++ b/Math/Math/GPUSparseMatrix.cu
@@ -1044,7 +1044,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     // canReuseBuffer - target matrix can be reused for temporary space
     // func - function to call to count elements in the result (returns count, and fills csrRowPtr array)
     template<class ElemType>
+#ifndef	LINUX
     void GPUSparseMatrix<ElemType>::PrepareBuffer(size_t m, size_t n, bool canReuseBuffer, std::function<size_t (int* csrRowPtrC)> func)
+#else
+    void GPUSparseMatrix<ElemType>::PrepareBuffer(size_t m, size_t n, bool canReuseBuffer, size_t (*func)(int *csRowPtrC))
+#endif	/* LINUX */
     {
         int* csrRowPtrC=NULL;
         GPUSparseMatrix<ElemType>& c = *this;
@@ -1099,6 +1103,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             CUDACALL(cudaFree(csrRowPtrC));
     }
 
+#ifdef	LINUXxx
+    size_t PrepareBufferMultiply(int* csrRowPtrC)
+        {
+            int nnzTotal = -1; 
+            CUSPARSECALL(cusparseXcsrgemmNnz(cusparseHandle,operA,operB,m,n,k,descrA,nnzA,S1.RowLocation(),S1.ColLocation(),descrB,nnzB,
+                S2.RowLocation(),S2.ColLocation(),descrC,csrRowPtrC,&nnzTotal));
+            return nnzTotal;
+        }
+#endif
+
     // Multiply - multiply one spares matrix by another sparse matrix
     // S1 - first sparse matrix
     // transposeS1 - transpose first matrix?
@@ -1136,13 +1150,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         CUDACALL(cudaEventCreate(&done));
         //Step 1 
         c.PrepareBuffer(m, n, true, // true means we can reuse the "c" buffer if it exists for temporaries
+#ifndef	LINUX
             [&](int* csrRowPtrC) -> size_t
         {
             int nnzTotal = -1; 
             CUSPARSECALL(cusparseXcsrgemmNnz(cusparseHandle,operA,operB,m,n,k,descrA,nnzA,S1.RowLocation(),S1.ColLocation(),descrB,nnzB,
                 S2.RowLocation(),S2.ColLocation(),descrC,csrRowPtrC,&nnzTotal));
             return nnzTotal;
-        });
+        }
+#else
+	NULL		// PrepareBufferMultiply
+#endif
+	);
 
 
         //Step 2
@@ -1196,12 +1215,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         CUDACALL(cudaEventCreate(&done));
         //Step 1 
         bool inOutParameter = (&b == &c);
-        c.PrepareBuffer(m, n, !inOutParameter, [&] (int* csrRowPtrC) -> size_t
+        c.PrepareBuffer(m, n, !inOutParameter, 
+#ifndef	LINUX
+	[&] (int* csrRowPtrC) -> size_t
         {
             int nnzTotal = -1;
             CUSPARSECALL(cusparseXcsrgeamNnz(cusparseHandle,m,n,descrA,nnzA,a.RowLocation(),a.ColLocation(),descrB,nnzB,b.RowLocation(),b.ColLocation(),descrC,csrRowPtrC,&nnzTotal));
             return nnzTotal;
-        });
+        }
+#else
+	NULL
+#endif	// Linux
+	);
 
         //Step 2
         if (sizeof(ElemType)==sizeof(float))
@@ -1588,7 +1613,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         if (this->IsEmpty())
             return;
         // transfer converted block over to this pointer
+#ifndef	LINUX
         *this = std::move(this->Transpose());
+#else	
+	std::cerr << "Not sure how to do the InplaceTranspose()";
+#endif
     }
 
     template<class ElemType>
diff --git a/Math/Math/GPUSparseMatrix.cuh b/Math/Math/GPUSparseMatrix.cuh
index 6691071ca..8b1f959a2 100644
--- a/Math/Math/GPUSparseMatrix.cuh
+++ b/Math/Math/GPUSparseMatrix.cuh
@@ -29,7 +29,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         void Clear();
 #ifndef	LINUX
         void PrepareBuffer(size_t m, size_t n, bool canReuseBuffer, std::function<size_t (int* csrRowPtrC)> func);
-#endif
+#else
+        void PrepareBuffer(size_t m, size_t n, bool canReuseBuffer, size_t (*func)(int *csRowPtrC));
+#endif	
         size_t ElemCountFromBufferSize(size_t totalBufferSize);
         void PrepareDevice(short deviceId=-1) const;
 
diff --git a/Math/Math/Makefile b/Math/Math/Makefile
index 154f2ad83..87c9e1f35 100644
--- a/Math/Math/Makefile
+++ b/Math/Math/Makefile
@@ -1,8 +1,8 @@
 CSOURCES = CPUMatrix.cpp CPUSparseMatrix.cpp Matrix.cpp 
 
 OBJECTS = CPUMatrix.o CPUSparseMatrix.o Matrix.o \
-	GPUSparseMatrix.o GPUWatcher.o \
-	GPUMatrixCUDAKernels.o GPUMatrix.o 
+	GPUMatrixCUDAKernels.o GPUMatrix.o \
+	GPUWatcher.o GPUSparseMatrix.o  
 
 INCLUDES = -I../../Common/Include -I/opt/acml5.3.1/gfortran64_mp_int64/include 
 
@@ -10,9 +10,10 @@ DEPS =
 
 CFLAGS =  $(INCLUDES)  \
 	-D BASETYPES_NO_UNSAFECRTOVERLOAD -DBASETYPES_NO_STRPRINTF \
-	-DLINUX -D_FILEUTIL_  -Wnon-template-friend  -std=c++11
+	-DLINUX -Wnon-template-friend  -std=c++11 # -D_FILEUTIL_  
 
-NVCFLAGS = -DLINUX -I../../Common/Include -D_FILEUTIL_ -arch sm_11
+NVCFLAGS = -DLINUX -D BASETYPES_NO_UNSAFECRTOVERLOAD -DBASETYPES_NO_STRPRINTF \
+	-I../../Common/Include -arch=compute_20  -std=c++11 # -D_FILEUTIL_ 
 
 CXX = gcc
 NVCC = nvcc
diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp
index 6b2fc5e74..cdbd2526d 100644
--- a/Math/Math/Matrix.cpp
+++ b/Math/Math/Matrix.cpp
@@ -3435,9 +3435,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 else 
                 {
                     GPUMatrix<ElemType> firstDummy = transposeA ? a.m_GPUMatrix->Transpose()*alpha : (*a.m_GPUMatrix)*alpha;
-                    GPUMatrix<ElemType> & first= firstDummy;				// By Malcolm.. gcc doesn't support auto
+                    GPUMatrix<ElemType> & first= firstDummy;				// By Malcolm.. gcc doesn't support auto like original
                     GPUSparseMatrix<ElemType> secondDummy = transposeB ? b.m_GPUSparseMatrix->Transpose() : *b.m_GPUSparseMatrix;
-                    GPUSparseMatrix<ElemType> & second = secondDummy;			// By Malcolm.. gcc doesn't support auto
+                    GPUSparseMatrix<ElemType> & second = secondDummy;			// By Malcolm.. gcc doesn't support auto like original
                     if (beta==0)
                     {
                         GPUSparseMatrix<ElemType>::Multiply(first,second,*c.m_GPUMatrix);
diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h
index 2fb72901c..3ae9e9b54 100644
--- a/Math/Math/Matrix.h
+++ b/Math/Math/Matrix.h
@@ -9,6 +9,10 @@
 #include "GPUMatrix.cuh"
 #include "GPUSparseMatrix.cuh"
 
+#ifdef	LINUX
+// typedef char wchar_t;
+#endif
+
 // This class is exported from the Math.dll
 namespace Microsoft { namespace MSR { namespace CNTK {
     enum CurrentDataLocation