From a863c7746fc0364eab82212902a8ef47bd5b6895 Mon Sep 17 00:00:00 2001 From: Malcolm Slaney Date: Thu, 9 Oct 2014 10:44:27 -0700 Subject: [PATCH] More changes to get it compiling.. but not finished. --- Common/Include/basetypes.h | 11 ++++++++-- Math/Math/CommonMatrix.h | 1 + Math/Math/GPUMatrix.cu | 6 +++--- Math/Math/GPUMatrixCUDAKernels.cu | 7 +++++++ Math/Math/GPUSparseMatrix.cu | 35 ++++++++++++++++++++++++++++--- Math/Math/GPUSparseMatrix.cuh | 4 +++- Math/Math/Makefile | 9 ++++---- Math/Math/Matrix.cpp | 4 ++-- Math/Math/Matrix.h | 4 ++++ 9 files changed, 66 insertions(+), 15 deletions(-) diff --git a/Common/Include/basetypes.h b/Common/Include/basetypes.h index 7b424502c..d3145570a 100644 --- a/Common/Include/basetypes.h +++ b/Common/Include/basetypes.h @@ -11,6 +11,8 @@ typedef char16_t TCHAR; #include #define vsprintf_s vsprintf /* Not sure this is right... Malcolm */ +#include +#include #endif /* LINUX */ #ifndef UNDER_CE // fixed-buffer overloads not available for wince @@ -108,6 +110,7 @@ using namespace std; #define __inout_cap(x) #define __inout_cap_c(x) #endif +#endif // LINUX #ifndef __out_z_cap // non-VS2005 annotations #define __out_cap(x) #define __out_z_cap(x) @@ -321,7 +324,6 @@ public: #endif }; -#ifndef LINUX // locks a critical section, and unlocks it automatically // when the lock goes out of scope @@ -447,7 +449,11 @@ public: #include // uses strlen() #endif #define strlen strlen_ +#ifndef LINUX template inline __declspec(deprecated("Dummy general template, cannot be used directly")) +#else +template inline +#endif // LINUX size_t strlen_(_T &s) { return strnlen_s(static_cast(s), SIZE_MAX); } // never be called but needed to keep compiler happy template inline size_t strlen_(const _T &s) { return strnlen_s(static_cast(s), SIZE_MAX); } template<> inline size_t strlen_(char * &s) { return strnlen_s(s, SIZE_MAX); } @@ -980,7 +986,8 @@ template static void attempt (int retries, const FUNCTION & b #ifndef LINUX ::Sleep (1000); // wait a little, then try again #else - sleep(1); + std::chrono::milliseconds dura(1000); + std::this_thread::sleep_for(dura); #endif /* LINUX */ } } diff --git a/Math/Math/CommonMatrix.h b/Math/Math/CommonMatrix.h index f0dd57d46..da4a501a4 100644 --- a/Math/Math/CommonMatrix.h +++ b/Math/Math/CommonMatrix.h @@ -10,6 +10,7 @@ #ifdef LINUX #define wcsnlen_s wcsnlen /* Not sure if this is best replacement... Malcolm */ +// typedef char wchar_t; #endif #define AUTOPLACEMATRIX 1000 // used in parameters only diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu index 7374aee54..9d690f0e4 100644 --- a/Math/Math/GPUMatrix.cu +++ b/Math/Math/GPUMatrix.cu @@ -424,7 +424,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { ZeroInit(deepCopyFrom.m_computeDevice); SetValue(deepCopyFrom); - SetMatrixName(deepCopyFrom.m_matrixName); + this->SetMatrixName(deepCopyFrom.m_matrixName); } #ifndef LINUX @@ -452,7 +452,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (this != &deepCopyFrom) { SetValue(deepCopyFrom); - SetMatrixName(deepCopyFrom.m_matrixName); + this->SetMatrixName(deepCopyFrom.m_matrixName); } return *this; } @@ -464,7 +464,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (this != &moveFrom) { - if (OwnBuffer() && this->m_pArray!=NULL) + if (this->OwnBuffer() && this->m_pArray!=NULL) { CUDA_CALL(cudaFree(this->m_pArray)); } diff --git a/Math/Math/GPUMatrixCUDAKernels.cu b/Math/Math/GPUMatrixCUDAKernels.cu index 6c035c2db..55af99884 100644 --- a/Math/Math/GPUMatrixCUDAKernels.cu +++ b/Math/Math/GPUMatrixCUDAKernels.cu @@ -18,6 +18,13 @@ #define MINLOGEXP -9.2103 #define LSMALL -0.5E10 +// Predefine this for later. +#ifndef LINUX +static __inline__ __device__ double atomicAdd(double* address, double val); +#else +static __device__ double atomicAdd(double* address, double val); +#endif + //CUDA Kernels code template __global__ void _elementWisePowerOnCuda( diff --git a/Math/Math/GPUSparseMatrix.cu b/Math/Math/GPUSparseMatrix.cu index 31bf6bb8f..e0e908292 100644 --- a/Math/Math/GPUSparseMatrix.cu +++ b/Math/Math/GPUSparseMatrix.cu @@ -1044,7 +1044,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { // canReuseBuffer - target matrix can be reused for temporary space // func - function to call to count elements in the result (returns count, and fills csrRowPtr array) template +#ifndef LINUX void GPUSparseMatrix::PrepareBuffer(size_t m, size_t n, bool canReuseBuffer, std::function func) +#else + void GPUSparseMatrix::PrepareBuffer(size_t m, size_t n, bool canReuseBuffer, size_t (*func)(int *csRowPtrC)) +#endif /* LINUX */ { int* csrRowPtrC=NULL; GPUSparseMatrix& c = *this; @@ -1099,6 +1103,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { CUDACALL(cudaFree(csrRowPtrC)); } +#ifdef LINUXxx + size_t PrepareBufferMultiply(int* csrRowPtrC) + { + int nnzTotal = -1; + CUSPARSECALL(cusparseXcsrgemmNnz(cusparseHandle,operA,operB,m,n,k,descrA,nnzA,S1.RowLocation(),S1.ColLocation(),descrB,nnzB, + S2.RowLocation(),S2.ColLocation(),descrC,csrRowPtrC,&nnzTotal)); + return nnzTotal; + } +#endif + // Multiply - multiply one spares matrix by another sparse matrix // S1 - first sparse matrix // transposeS1 - transpose first matrix? @@ -1136,13 +1150,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { CUDACALL(cudaEventCreate(&done)); //Step 1 c.PrepareBuffer(m, n, true, // true means we can reuse the "c" buffer if it exists for temporaries +#ifndef LINUX [&](int* csrRowPtrC) -> size_t { int nnzTotal = -1; CUSPARSECALL(cusparseXcsrgemmNnz(cusparseHandle,operA,operB,m,n,k,descrA,nnzA,S1.RowLocation(),S1.ColLocation(),descrB,nnzB, S2.RowLocation(),S2.ColLocation(),descrC,csrRowPtrC,&nnzTotal)); return nnzTotal; - }); + } +#else + NULL // PrepareBufferMultiply +#endif + ); //Step 2 @@ -1196,12 +1215,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { CUDACALL(cudaEventCreate(&done)); //Step 1 bool inOutParameter = (&b == &c); - c.PrepareBuffer(m, n, !inOutParameter, [&] (int* csrRowPtrC) -> size_t + c.PrepareBuffer(m, n, !inOutParameter, +#ifndef LINUX + [&] (int* csrRowPtrC) -> size_t { int nnzTotal = -1; CUSPARSECALL(cusparseXcsrgeamNnz(cusparseHandle,m,n,descrA,nnzA,a.RowLocation(),a.ColLocation(),descrB,nnzB,b.RowLocation(),b.ColLocation(),descrC,csrRowPtrC,&nnzTotal)); return nnzTotal; - }); + } +#else + NULL +#endif // Linux + ); //Step 2 if (sizeof(ElemType)==sizeof(float)) @@ -1588,7 +1613,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (this->IsEmpty()) return; // transfer converted block over to this pointer +#ifndef LINUX *this = std::move(this->Transpose()); +#else + std::cerr << "Not sure how to do the InplaceTranspose()"; +#endif } template diff --git a/Math/Math/GPUSparseMatrix.cuh b/Math/Math/GPUSparseMatrix.cuh index 6691071ca..8b1f959a2 100644 --- a/Math/Math/GPUSparseMatrix.cuh +++ b/Math/Math/GPUSparseMatrix.cuh @@ -29,7 +29,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Clear(); #ifndef LINUX void PrepareBuffer(size_t m, size_t n, bool canReuseBuffer, std::function func); -#endif +#else + void PrepareBuffer(size_t m, size_t n, bool canReuseBuffer, size_t (*func)(int *csRowPtrC)); +#endif size_t ElemCountFromBufferSize(size_t totalBufferSize); void PrepareDevice(short deviceId=-1) const; diff --git a/Math/Math/Makefile b/Math/Math/Makefile index 154f2ad83..87c9e1f35 100644 --- a/Math/Math/Makefile +++ b/Math/Math/Makefile @@ -1,8 +1,8 @@ CSOURCES = CPUMatrix.cpp CPUSparseMatrix.cpp Matrix.cpp OBJECTS = CPUMatrix.o CPUSparseMatrix.o Matrix.o \ - GPUSparseMatrix.o GPUWatcher.o \ - GPUMatrixCUDAKernels.o GPUMatrix.o + GPUMatrixCUDAKernels.o GPUMatrix.o \ + GPUWatcher.o GPUSparseMatrix.o INCLUDES = -I../../Common/Include -I/opt/acml5.3.1/gfortran64_mp_int64/include @@ -10,9 +10,10 @@ DEPS = CFLAGS = $(INCLUDES) \ -D BASETYPES_NO_UNSAFECRTOVERLOAD -DBASETYPES_NO_STRPRINTF \ - -DLINUX -D_FILEUTIL_ -Wnon-template-friend -std=c++11 + -DLINUX -Wnon-template-friend -std=c++11 # -D_FILEUTIL_ -NVCFLAGS = -DLINUX -I../../Common/Include -D_FILEUTIL_ -arch sm_11 +NVCFLAGS = -DLINUX -D BASETYPES_NO_UNSAFECRTOVERLOAD -DBASETYPES_NO_STRPRINTF \ + -I../../Common/Include -arch=compute_20 -std=c++11 # -D_FILEUTIL_ CXX = gcc NVCC = nvcc diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp index 6b2fc5e74..cdbd2526d 100644 --- a/Math/Math/Matrix.cpp +++ b/Math/Math/Matrix.cpp @@ -3435,9 +3435,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { else { GPUMatrix firstDummy = transposeA ? a.m_GPUMatrix->Transpose()*alpha : (*a.m_GPUMatrix)*alpha; - GPUMatrix & first= firstDummy; // By Malcolm.. gcc doesn't support auto + GPUMatrix & first= firstDummy; // By Malcolm.. gcc doesn't support auto like original GPUSparseMatrix secondDummy = transposeB ? b.m_GPUSparseMatrix->Transpose() : *b.m_GPUSparseMatrix; - GPUSparseMatrix & second = secondDummy; // By Malcolm.. gcc doesn't support auto + GPUSparseMatrix & second = secondDummy; // By Malcolm.. gcc doesn't support auto like original if (beta==0) { GPUSparseMatrix::Multiply(first,second,*c.m_GPUMatrix); diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h index 2fb72901c..3ae9e9b54 100644 --- a/Math/Math/Matrix.h +++ b/Math/Math/Matrix.h @@ -9,6 +9,10 @@ #include "GPUMatrix.cuh" #include "GPUSparseMatrix.cuh" +#ifdef LINUX +// typedef char wchar_t; +#endif + // This class is exported from the Math.dll namespace Microsoft { namespace MSR { namespace CNTK { enum CurrentDataLocation