Rewrite all BlockCol- and BlockRow-format-related GPU kernels. The original versions were all incorrect and slow.

Revert to redirecting stderr even in debug builds, since users can simply comment out the stderr line in the config to stop redirection.
Dong Yu 2015-02-06 10:02:07 -08:00
Parent 8973de96ee
Commit b7fb0ddc87
7 changed files with 280 additions and 163 deletions
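The rewritten BlockCol/BlockRow kernels in GPUMatrixCUDAKernels.cu below all share one indexing scheme: launch one thread per stored value, derive the block slot and in-block offset from the flat thread index, and map the slot back to its original column (BlockCol) or row (BlockRow) through blockIds. A minimal sketch of that pattern follows; the kernel name and the accumulate operation are illustrative only, and IDX2C/LONG64 are assumed to match the column-major helper macro and 64-bit index type the CNTK sources use.

#define IDX2C(i, j, ld) (((j) * (ld)) + (i))   // assumed column-major indexing macro, as used in these kernels
typedef long long LONG64;                      // stand-in for the LONG64 type used in the kernels

template<class ElemType>
__global__ void _blockSparseElementOpSketch(
    const bool blockCol,          // true: blocks are columns (BlockCol); false: blocks are rows (BlockRow)
    const size_t numRows,
    const size_t numCols,
    const size_t numBlocks,
    const ElemType* blockValues,  // numBlocks blocks of (numRows or numCols) values, stored contiguously
    const size_t* blockIds,       // block slot -> original column (BlockCol) or row (BlockRow) index
    ElemType* dense)              // numRows x numCols dense target, column-major
{
    const LONG64 index = blockIdx.x * blockDim.x + threadIdx.x;   // one thread per stored value
    const LONG64 valuesPerBlock = blockCol ? numRows : numCols;
    const LONG64 blockId = index / valuesPerBlock;
    if (blockId >= numBlocks)
        return;
    const LONG64 offset = index - valuesPerBlock * blockId;
    const LONG64 row = blockCol ? offset : (LONG64)blockIds[blockId];
    const LONG64 col = blockCol ? (LONG64)blockIds[blockId] : offset;
    dense[IDX2C(row, col, numRows)] += blockValues[index];        // example op: accumulate into the dense target
}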

View file

@@ -5,8 +5,9 @@
//
// ConfigFile.cpp : Defines the configuration file loader.
//
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#endif
#include "File.h"
#include "commandArgUtil.h"

View file

@@ -4,7 +4,9 @@
// </copyright>
//
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#endif
#include "basetypes.h"
#define FORMAT_SPECIALIZE // to get the specialized version of the format routines

View file

@@ -4,7 +4,10 @@
// </copyright>
//
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#endif
#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _
#pragma warning (disable: 4996) // ^^ this does not seem to work--TODO: make it work
#define _FILE_OFFSET_BITS 64 // to force fseeko() and ftello() 64 bit in Linux

View file

@@ -684,11 +684,8 @@ int wmain(int argc, wchar_t* argv[])
oss << myRank;
logpath += L"rank" + oss.str();
}
#ifndef _DEBUG
RedirectStdErr(logpath);
#else
printf("INFO: in debug mode, do not redirect output\n");
#endif
}
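The hunk above drops the #ifndef _DEBUG guard, so stderr is redirected in debug builds as well; per the commit message, console output can be restored by commenting out the stderr line in the config. For orientation only, here is a minimal sketch of how such a redirect is commonly implemented; it is an assumption, not CNTK's actual RedirectStdErr, which takes the wide-string logpath built above.

#include <cstdio>
#include <string>

// Hypothetical stand-in for RedirectStdErr: reopen stderr onto a log file so every
// later fprintf(stderr, ...) lands in the log instead of the console.
static void RedirectStdErrSketch(const std::string& logpath)
{
    if (freopen(logpath.c_str(), "w", stderr) == nullptr)            // nullptr on failure
        fprintf(stdout, "WARNING: could not redirect stderr to %s\n", logpath.c_str());
    else
        setvbuf(stderr, nullptr, _IONBF, 0);                         // unbuffered so log lines appear immediately
}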

View file

@@ -3356,45 +3356,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#pragma endregion Static BLAS Functions
//#pragma region File << and >> operators
// template<class ElemType>
// File& operator>>(File& stream, GPUMatrix<ElemType> &us)
// {
// //auto& us = *this;
//
// stream.GetMarker(fileMarkerBeginSection, std::string("BMAT"));
// size_t elsize;
// stream>>elsize;
// if (sizeof(ElemType)!=elsize)
// throw std::runtime_error("Template argument size doesn't match those in file");
// std::wstring matrixName;
// size_t numRows, numCols;
// stream>>matrixName>>numRows>>numCols;
// ElemType* d_array = new ElemType[numRows*numCols];
// for (long i=0;i<numRows*numCols;++i)
// stream>>d_array[i];
// stream.GetMarker(fileMarkerEndSection, std::string("EMAT"));
// us.SetValue(numRows,numCols,d_array, matrixFlagNormal);
// us.m_matrixName = matrixName;
// return stream;
// }
//
// template<class ElemType>
// File& operator<<(File& stream, GPUMatrix<ElemType> &us)
// {
// //auto& us = *this;
//
// stream.PutMarker(fileMarkerBeginSection, std::string("BMAT"));
// stream<<sizeof(ElemType)<<us.m_matrixName<<us.m_numRows<<us.m_numCols;
// ElemType *d_array = us.CopyToArray();
// for (long i=0;i<us.GetNumElements();++i)
// stream<<d_array[i];
// stream.PutMarker(fileMarkerEndSection, std::string("EMAT"));
// return stream;
// }
//
//#pragma endregion File << and >> operators
template class GPUMatrix<float>;
template class GPUMatrix<double>;
template class DeviceBoundNumber<float>;

View file

@@ -2250,37 +2250,12 @@ __global__ void _sparseCSRElemMulDense(
}
}
// forward pass from feature to hidden layer
/*template<class ElemType>
__global__ void _denseMulSparseCSCToDense(
ElemType alpha,
const ElemType* lhs,
int numrows,
int numcols,
const GPUSPARSE_INDEX_TYPE* row,
ElemType* c)
{
int loadPerThread = (numrows+blockDim.x-1)/blockDim.x;
int tStart = loadPerThread * threadIdx.x;
int tEnd = min(numrows, loadPerThread + tStart);
int p = blockIdx.x;
int i = row[p];
int j = blockIdx.x;
for (int h = tStart; h < tEnd; h++)
{
ElemType res = alpha * lhs[IDX2C(h, i, numrows)];
atomicAdd(&c[IDX2C(h,j,numrows)], res);
}
}*/
//c = alpha * op(a) * op(b) + beta*c
//this function can be further improved by using shared memory
template<class ElemType>
__global__ void _denseMultSparseCSCAndWeightedAddToDense(
int m, //rowDense
int k, //colDense = rowSparse
int n, //colSparse
ElemType alpha,
const ElemType* a, //dense
@@ -2311,62 +2286,77 @@ __global__ void _denseMultSparseCSCAndWeightedAddToDense(
}
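// For reference (illustrative CPU sketch, not part of this diff): the kernel above computes
// c = alpha * a * b + beta * c, with a dense (m x k, column-major) and b sparse CSC (k x n);
// bRows/bCols/bVals stand for the usual CSC row-index / column-pointer / value arrays.
template<class ElemType>
void DenseTimesSparseCSCRefSketch(int m, int k, int n, ElemType alpha,
    const ElemType* a, const GPUSPARSE_INDEX_TYPE* bRows, const GPUSPARSE_INDEX_TYPE* bCols,
    const ElemType* bVals, ElemType beta, ElemType* c)   // k documents the shared inner dimension
{
    for (int col = 0; col < n; col++)                     // one sparse column of b at a time
    {
        for (int i = 0; i < m; i++)
            c[IDX2C(i, col, m)] *= beta;
        for (int p = bCols[col]; p < bCols[col + 1]; p++) // nonzeros b(bRows[p], col) = bVals[p]
            for (int i = 0; i < m; i++)
                c[IDX2C(i, col, m)] += alpha * a[IDX2C(i, bRows[p], m)] * bVals[p];
    }
}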
// backward pass from hidden layer to feature weight
//result (sparse BlockCol)= alpha * (lhs (dense) X rhs^T (sparse CSC)
//assume resultValues are 0-initialized
template<class ElemType>
__global__ void _denseMulSparseCSCTransposeToSparseBlockCol(
ElemType alpha,
ElemType* lhs,
size_t nrs,
ElemType* rhsNZValues,
const GPUSPARSE_INDEX_TYPE* row,
const size_t* rowIdx,
ElemType* blockVal,
size_t* blockIds)
__global__ void _denseMulSparseCSCTransposeToSparseBlockCol(
ElemType alpha,
ElemType* lhsValues,
size_t numRowsLhs,
size_t numColsRhs,
ElemType* rhsNZValues,
const GPUSPARSE_INDEX_TYPE* rhsRows,
const GPUSPARSE_INDEX_TYPE* rhsCols,
const size_t* rhsRowIdx,
ElemType* resultValues,
size_t* resultBlockIds)
{
int p = blockIdx.x;
int i = row[p];
int ii = rowIdx[p];
int j = blockIdx.x;
const LONG64 index = blockIdx.x * blockDim.x + threadIdx.x;
const LONG64 lhsCol = index / numRowsLhs; //rhsCol == lhsCol
if (lhsCol >= numColsRhs)
return;
const LONG64 lhsRow = index - numRowsLhs*lhsCol; //resultRow == lhsRow
int load = (nrs+blockDim.x-1)/blockDim.x;
int pStart = load * threadIdx.x;
int pEnd = min((int)nrs, load + pStart);
//each thread handles one [row, col] combination
ElemType lhsValue = alpha*lhsValues[IDX2C(lhsRow, lhsCol, numRowsLhs)];
for(int h = pStart; h < pEnd; h++)
{
ElemType temp = alpha*lhs[IDX2C(h, j, nrs)]*rhsNZValues[p];
atomicAdd(&blockVal[ii*nrs+h], temp);
blockIds[ii] = i;
LONG64 start = rhsCols[lhsCol]; //rhsCol == lhsCol
LONG64 end = rhsCols[lhsCol + 1];
for (LONG64 p = start; p < end; p++)
{
LONG64 rhsRow = rhsRows[p];
ElemType rhsVal = rhsNZValues[p];
LONG64 resultCol = rhsRowIdx[p]; //resultCol == rhsRow maps to columnid
resultBlockIds[resultCol] = rhsRow; //indicate which column it actually points to
//assume resultValues are 0-initialized
atomicAdd(&resultValues[IDX2C(lhsRow, resultCol, numRowsLhs)], lhsValue * rhsVal);
}
}
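// Layout reminder for the BlockCol result written above (illustrative sketch, not part of this diff):
// block slot 'b' stores numRows contiguous values for dense column blockIds[b], so expanding
// back to dense is a straight copy per block.
template<class ElemType>
void BlockColToDenseSketch(const ElemType* blockValues, const size_t* blockIds,
    size_t numRows, size_t numBlocks, ElemType* dense /* numRows x numCols, zero-initialized */)
{
    for (size_t b = 0; b < numBlocks; b++)
    {
        const size_t col = blockIds[b];                   // which dense column this block represents
        for (size_t row = 0; row < numRows; row++)
            dense[IDX2C(row, col, numRows)] = blockValues[IDX2C(row, b, numRows)];
    }
}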
// gradients update
template<class ElemType>
__global__ void _scaleSparseAndAddToDense(
ElemType alpha,
bool blockCol,
ElemType* blockVal,
size_t* blockIds,
size_t len,
ElemType* rhs,
size_t numrows)
__global__ void _scaleSparseBlockAndAddToDense(
const ElemType alpha,
const bool blockCol, //true if blockRow
const size_t numRows,
const size_t numCols,
const size_t numBlocks,
const ElemType* lhsValues, //lhs is blockCol or blockRow
const size_t* blockIds,
ElemType* rhs)
{
int ii = blockIdx.x;
int i = blockIds[ii];
int load = (len+blockDim.x-1)/blockDim.x;
int pStart = load * threadIdx.x;
int pEnd = min((int)len, load + pStart);
for(int h = pStart; h < pEnd; h++)
{ ElemType temp = alpha*blockVal[ii*len + h];
if(blockCol)
{
atomicAdd(&rhs[IDX2C(h, i, numrows)], temp);
}
else
{
atomicAdd(&rhs[IDX2C(i, h, numrows)], temp);
}
const LONG64 index = blockIdx.x * blockDim.x + threadIdx.x;
LONG64 row, col;
if (blockCol)
{
const LONG64 blockId = index / numRows;
if (blockId >= numBlocks)
return;
row = index - numRows* blockId;
col = blockIds[blockId];
}
else
{
const LONG64 blockId = index / numCols;
if (blockId >= numBlocks)
return;
col = index - numCols* blockId;
row = blockIds[blockId];
}
rhs[IDX2C(row, col, numRows)] += alpha * lhsValues[index];
}
// compute predictions in cross entropy node
@@ -2638,30 +2628,36 @@ __global__ void _inplaceTruncate(
}
template<class ElemType>
__global__ void _normalGrad(
bool isBlockCol,
size_t len,
__global__ void _normalGradForSparseBlock(
const ElemType momentum,
size_t* blockIds,
ElemType* blockVal,
ElemType* c,
size_t numrows)
const bool blockCol, //true if blockRow
const size_t numRows,
const size_t numCols,
const size_t numBlocks,
ElemType* lhsValues, //lhs is blockCol or blockRow
const size_t* blockIds,
ElemType* rhs)
{
int j = blockIdx.x;
int i = blockIds[j];
int start = j * len;
int load = (len+blockDim.x-1)/blockDim.x;
int pStart = load * threadIdx.x;
int pLen = min((int)len, load + pStart);
for(int p = start+pStart; p < start+pLen; p++)
const LONG64 index = blockIdx.x * blockDim.x + threadIdx.x;
LONG64 row, col;
if (blockCol)
{
int row = isBlockCol ? (p - start) : i;
int col = isBlockCol ? i: (p - start);
c[IDX2C(row, col, numrows)] = (1-momentum)*blockVal[p] + momentum*c[IDX2C(row, col, numrows)];
blockVal[p] = c[IDX2C(row, col, numrows)];
const LONG64 blockId = index / numRows;
if (blockId >= numBlocks)
return;
row = index - numRows* blockId;
col = blockIds[blockId];
}
else
{
const LONG64 blockId = index / numCols;
if (blockId >= numBlocks)
return;
col = index - numCols* blockId;
row = blockIds[blockId];
}
rhs[IDX2C(row, col, numRows)] = (1 - momentum)*lhsValues[index] + momentum*rhs[IDX2C(row, col, numRows)];
lhsValues[index] = rhs[IDX2C(row, col, numRows)];
}
static __inline__ __device__ double atomicAdd(double* address, double val)
@@ -3263,4 +3259,147 @@ d_tmp[0] = max((ElemType)0, d_tmp[0]/max((ElemType)1.0e-10,sqrt(d_tmp[1]))/max((
}
*/
template<class ElemType>
__global__ void _assignElementProductOfWithShiftNeg(
ElemType* us,
const ElemType* a,
const ElemType* b,
const int shift,
const int NTPlusOne,
const int BS)
{
LONG64 idx = blockDim.x * blockIdx.x + threadIdx.x;
LONG64 idy = blockDim.y * blockIdx.y + threadIdx.y;
if (idx >= NTPlusOne || idy >= BS)
return;
if (idx == 0)
{
// this is row-0. No need to shift
us[IDX2C(idx, idy, NTPlusOne)] = a[idy] * b[idy];
}
else
{
int cs = shift + idx - 1;
int tmpidy = (idy + cs) % BS;
us[IDX2C(idx, idy, NTPlusOne)] = a[idy] * b[tmpidy];
}
}
template<class ElemType>
__global__ void _innerProductWithShiftNeg(
ElemType* c,
const ElemType* a,
const ElemType* b,
const long N, //a.GetNumRows();
const long M, //a.GetNumCols();
const long shift,
const long NTPlusOne
)
{
LONG64 idx = blockDim.x * blockIdx.x + threadIdx.x;
LONG64 idy = blockDim.y * blockIdx.y + threadIdx.y;
if (idx >= NTPlusOne || idy >= M)
return;
ElemType sum = 0;
long index_a = 0;
long index_b = 0;
long col_a = 0;
long col_b = 0;
if (idx == 0)
{
// this is row 0. No need to shift
// the product of a(:,idy) dot b(:,idy)
col_a = idy;
for (long i = 0; i < N; ++i)
{
index_a = IDX2C(i, col_a, N);
sum += a[index_a] * b[index_a];
}
}
else
{
int cs = shift + idx - 1;
col_a = idy;
col_b = (idy + cs) % M;
for (int i = 0; i < N; ++i)
{
index_a = IDX2C(i, col_a, N);
index_b = IDX2C(i, col_b, N);
sum += a[index_a] * b[index_b];
}
}
c[IDX2C(idx, idy, NTPlusOne)] = sum;
}
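// Worked example for the shifted pairing above (illustrative, not part of this diff):
// output row 0 pairs each column with itself; output row idx > 0 pairs column idy with
// column (idy + shift + idx - 1) % M. E.g. M = 4, shift = 1, idx = 2, idy = 1 -> partner column 3.
static int ShiftedPartnerColumnSketch(int idx, int idy, int shift, int M)
{
    return idx == 0 ? idy : (idy + shift + idx - 1) % M;  // mirrors the kernel's cs/col_b computation
}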
template<class ElemType>
__global__ void _getARowByIndex(
ElemType* us,
const ElemType* a,
const int O, // a's rows
const int P, // a's cols
const int m // the m-th row of a
)
{
LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
if (id >= P)
return;
// us[id] = a[id] * b[id];
us[id] = a[IDX2C(m, id, O)];
}
template<class ElemType>
__global__ void _conductRowElementMultiplyWithShift(
ElemType* us,
const ElemType* a,
const ElemType* b,
const int O, // b's rows
const int P, // b's cols
const int shift,
const bool isafixed)
{
LONG64 idx = blockDim.x * blockIdx.x + threadIdx.x;
LONG64 idy = blockDim.y * blockIdx.y + threadIdx.y;
if (idx >= O || idy >= P)
return;
int tmpidy = (idy + shift) % P;
if (isafixed)
{
// we fix a, and shift b
us[IDX2C(idx, idy, O)] = a[idy] * b[IDX2C(idx, tmpidy, O)];
}
else
{
// we fix b, but shift a
us[IDX2C(idx, idy, O)] = a[tmpidy] * b[IDX2C(idx, idy, O)];
}
}
template<class ElemType>
__global__ void _assignElementProductOfWithShift(
ElemType* us,
const ElemType* a,
const ElemType* b,
const int shift,
const LONG64 N)
{
LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
if (id >= N)
return;
int tmpidb = (id + shift) % N;
us[id] = a[id] * b[tmpidb];
}
#endif // !CPUONLY

View file

@@ -828,7 +828,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
rowToId[i] = indexer[row];
}
m_blockSize = indexer.size();
CUDACALL(cudaMemcpy(m_rowToId, rowToId, sizeof(size_t)*nz, cudaMemcpyHostToDevice));
}
@@ -928,7 +928,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (do_sync) CUDACALL(cudaEventCreate(&done));
_denseMultSparseCSCAndWeightedAddToDense<ElemType> <<< blocksPerGrid, threadsPerBlock >>> (
m, //rowDense
k, //colDense = rowSparse
n, //colSparse
alpha,
reinterpret_cast<const ElemType*>(lhs.BufferPointer()), //dense
@@ -991,25 +990,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (rhs.GetFormat() != matrixFormatSparseCSC)
NOT_IMPLEMENTED;
lhs.PrepareDevice();
c.SetFormat(matrixFormatSparseBlockCol);
c.m_blockSize = n < rhs.m_nz ? n : rhs.m_nz;
//c.m_blockSize = n < rhs.m_nz ? n : rhs.m_nz;
c.m_blockSize = rhs.m_blockSize;
c.m_nz = m*c.m_blockSize;
c.Resize(m, n, c.m_nz);
CUDACALL(cudaMemset(c.m_blockVal, 0, sizeof(ElemType)*(c.m_nz)));
CUDACALL(cudaMemset(c.m_blockIds, 0, sizeof(size_t)*(c.m_blockSize)));
LONG64 N = (LONG64)lhs.GetNumElements(); //here we process for each row in lhs and each column in rhs (==columns in lhs)
int blocksPerGrid = (int)ceil(((double)N) / threadsPerBlock);
cudaEvent_t done = nullptr;
if (do_sync) CUDACALL(cudaEventCreate(&done));
int blocksPerGrid = rhs.GetNumNZElements();
_denseMulSparseCSCTransposeToSparseBlockCol<ElemType> << <blocksPerGrid, threadsPerBlock >> >(
alpha,
lhs.BufferPointer(),
m,
rhs.BufferPointer(),
rhs.RowLocation(),
rhs.m_rowToId,
c.m_blockVal,
c.m_blockIds);
_denseMulSparseCSCTransposeToSparseBlockCol<ElemType> << <blocksPerGrid, threadsPerBlock, 0, t_stream >> >(
alpha,
lhs.BufferPointer(),
m,
l,
rhs.BufferPointer(),
rhs.RowLocation(),
rhs.ColLocation(),
rhs.m_rowToId,
c.BufferPointer(),
c.m_blockIds);
if (do_sync) CUDACALL(cudaEventRecord(done));
if (do_sync) CUDACALL(cudaEventSynchronize(done));
if (do_sync) CUDACALL(cudaEventDestroy(done));
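// Sizing above, for reference (illustrative, not part of this diff): since the result is
// lhs * rhs^T, the BlockCol output keeps one m-row block per distinct nonzero row of rhs,
// so c.m_blockSize = rhs.m_blockSize and c.m_nz = m * c.m_blockSize, while the launch covers
// every element of lhs: N = m * l threads. Assuming threadsPerBlock = 512 for the arithmetic,
// m = 2048 and l = 1024 give N = 2,097,152 threads in (N + 511) / 512 = 4096 blocks.
static size_t BlocksPerGridSketch(size_t numElements, size_t threadsPerBlock)
{
    return (numElements + threadsPerBlock - 1) / threadsPerBlock;   // same ceil() as the host code above
}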
@@ -1028,25 +1034,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
void GPUSparseMatrix<ElemType>::ScaleAndAdd(const ElemType alpha, const GPUSparseMatrix<ElemType>& lhs, GPUMatrix<ElemType>& rhs)
{
if (lhs.GetComputeDeviceId()!=rhs.GetComputeDeviceId())
if (lhs.GetNumRows() != rhs.GetNumRows() || lhs.GetNumCols() != rhs.GetNumCols())
throw std::logic_error("ScaleAndAdd: dimension mismatch");
if (lhs.GetComputeDeviceId() != rhs.GetComputeDeviceId())
throw std::runtime_error("GPUSparseMatrix::ScaleAndAdd: All matrices must be on the same GPU");
if (lhs.m_format == matrixFormatSparseBlockCol || lhs.m_format == matrixFormatSparseBlockRow)
{
size_t len = (lhs.m_format == matrixFormatSparseBlockCol) ? lhs.GetNumRows(): lhs.GetNumCols();
bool blockCol = (lhs.m_format == matrixFormatSparseBlockCol);
cudaEvent_t done = nullptr;
if (do_sync) CUDACALL(cudaEventCreate(&done));
size_t blocksPerGrid = lhs.m_blockSize;
_scaleSparseAndAddToDense<ElemType> << <blocksPerGrid, threadsPerBlock >> >(
LONG64 N = (LONG64)lhs.GetNumNZElements();
int blocksPerGrid = (int)ceil(((double)N) / threadsPerBlock);
_scaleSparseBlockAndAddToDense<ElemType> << <blocksPerGrid, threadsPerBlock >> >(
alpha,
blockCol,
lhs.GetNumRows(),
lhs.GetNumCols(),
lhs.m_blockSize,
lhs.m_blockVal,
lhs.m_blockIds,
len,
rhs.BufferPointer(),
rhs.GetNumRows());
rhs.BufferPointer());
if (do_sync) CUDACALL(cudaEventRecord(done));
if (do_sync) CUDACALL(cudaEventSynchronize(done));
if (do_sync) CUDACALL(cudaEventDestroy(done));
@@ -1247,19 +1258,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if(m_format == matrixFormatSparseBlockCol || m_format == matrixFormatSparseBlockRow)
{
size_t blocksPerGrid = m_blockSize;
bool isBlockCol = (m_format == MatrixFormat::matrixFormatSparseBlockCol);
size_t len = isBlockCol ? GetNumRows(): GetNumCols();
cudaEvent_t done = nullptr;
if (do_sync) CUDACALL(cudaEventCreate(&done));
_normalGrad<ElemType><<<blocksPerGrid,threadsPerBlock>>>(
isBlockCol,
len,
LONG64 N = (LONG64)GetNumNZElements();
int blocksPerGrid = (int)ceil(((double)N) / threadsPerBlock);
_normalGradForSparseBlock<ElemType> << <blocksPerGrid, threadsPerBlock >> >(
momentum,
isBlockCol,
GetNumRows(),
GetNumCols(),
m_blockSize,
BufferPointer(),
m_blockIds,
m_blockVal,
c.BufferPointer(),
c.GetNumRows());
c.BufferPointer());
if (do_sync) CUDACALL(cudaEventRecord(done));
if (do_sync) CUDACALL(cudaEventSynchronize(done));
if (do_sync) CUDACALL(cudaEventDestroy(done));