Fix for GPU Sparse ColumnSlice

bmitra 2015-11-28 17:16:38 -08:00
Parent 925f28e689
Commit d5387bd49b
6 changed files with 112 additions and 94 deletions
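The fix in a nutshell: ColumnSlice on a GPU sparse CSC matrix used to hand back a view that kept the parent's nonzero count and an unadjusted value pointer, so the slice exposed the wrong elements. This commit derives the slice's m_nz from the compressed column index and makes NzValues() honor the slice offset. A minimal host-side sketch of the CSC arithmetic the fix relies on (cscColPtr, values, startColumn, and numCols are illustrative names, not CNTK API):

#include <cstddef>

// In CSC storage the compressed column index has numCols + 1 entries; column j's
// nonzeros occupy positions cscColPtr[j] .. cscColPtr[j + 1] - 1 of the value
// buffer, so a column range's nonzero count is a difference of two offsets.
int SliceNzCount(const int* cscColPtr, std::size_t startColumn, std::size_t numCols)
{
    return cscColPtr[startColumn + numCols] - cscColPtr[startColumn];
}

// The range's first value sits cscColPtr[startColumn] elements into the buffer.
const float* SliceValues(const float* values, const int* cscColPtr, std::size_t startColumn)
{
    return values + cscColPtr[startColumn];
}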

View file

@ -54,6 +54,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t BufferSize() const {return m_elemSizeAllocated*sizeof(ElemType);}
ElemType* BufferPointer() const;
inline size_t GetNumElemAllocated() const { return m_elemSizeAllocated; }
CPUSparseMatrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const;
CPUMatrix<ElemType> CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;

View file

@ -1256,6 +1256,7 @@ __global__ void _tensorShuffleScaleAndAddRowSparse(
int start = aColCSCIndex[col];
int end = aColCSCIndex[col + 1];
int current = start;
for (size_t nc = 0; nc < N; nc++)
{
// recover the 5 indices from the loop counter
@ -1278,6 +1279,9 @@ __global__ void _tensorShuffleScaleAndAddRowSparse(
}
}
}
cColCSCIndex[col] = start;
cColCSCIndex[col + 1] = end;
}
template<class ElemType>

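The two stores added at the end of _tensorShuffleScaleAndAddRowSparse also populate the target's compressed column index, so the kernel's output is a self-consistent CSC matrix rather than a bare value array. A host-side analogue of what each per-column thread writes, using the kernel's parameter names:

// A valid shuffle preserves each column's nonzero count, so the source
// column's start/end offsets carry over to the target unchanged.
void CopyColumnOffsets(const int* aColCSCIndex, int* cColCSCIndex, int numCols)
{
    for (int col = 0; col < numCols; col++)
    {
        cColCSCIndex[col]     = aColCSCIndex[col];
        cColCSCIndex[col + 1] = aColCSCIndex[col + 1];
    }
}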
View file

@ -128,14 +128,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ChangeDeviceTo(deepCopy.m_computeDevice);
deepCopy.PrepareDevice();
Resize(deepCopy.m_numRows, deepCopy.m_numCols, deepCopy.m_nz, deepCopy.m_format, true, false);
Resize(deepCopy.m_numRows, deepCopy.m_numCols, deepCopy.m_elemSizeAllocated, deepCopy.m_format, true, false);
m_nz = deepCopy.m_nz;
CUDA_CALL(cudaMemcpy(NzValues(), deepCopy.NzValues(), NzSize(), cudaMemcpyDeviceToDevice));
m_sliceViewOffset = deepCopy.m_sliceViewOffset;
CUDA_CALL(cudaMemcpy(BufferPointer(), deepCopy.BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToDevice));
CUDA_CALL(cudaMemcpy(MajorIndexLocation(), deepCopy.MajorIndexLocation(), MajorIndexSize(), cudaMemcpyDeviceToDevice));
CUDA_CALL(cudaMemcpy(SecondaryIndexLocation(), deepCopy.SecondaryIndexLocation(), SecondaryIndexSize(), cudaMemcpyDeviceToDevice));
m_externalBuffer = false;
m_sliceViewOffset = 0;
SetMatrixName(deepCopy.m_matrixName);
//TODO: to copy other variables used only for class based LM
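Deep-copying now mirrors the whole allocation rather than just the nonzeros in use: a slice view's values can begin mid-buffer, so copying NzSize() bytes from NzValues() would desynchronize the value buffer from the index arrays, and the copied m_sliceViewOffset is what re-establishes the view on the destination. The shape of that change as a hedged sketch (the parameter names are illustrative, not the CNTK members):

#include <cuda_runtime.h>
#include <cstddef>

template <class ElemType>
void CopyValueBuffer(ElemType* dstBuffer, const ElemType* srcBuffer,
                     std::size_t srcElemAllocated,
                     std::size_t& dstSliceViewOffset, std::size_t srcSliceViewOffset)
{
    // Copy the full allocation so values stay aligned with the index arrays...
    cudaMemcpy(dstBuffer, srcBuffer, srcElemAllocated * sizeof(ElemType),
               cudaMemcpyDeviceToDevice);
    // ...and carry the column offset that says where the view begins.
    dstSliceViewOffset = srcSliceViewOffset;
}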
@ -165,12 +166,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (deepCopy.GetFormat() == matrixFormatSparseCSR)
{
SetMatrixFromCSRFormat(deepCopy.RowLocation(), deepCopy.ColLocation(), deepCopy.NzValues(), deepCopy.NzCount(), deepCopy.GetNumRows(), deepCopy.GetNumCols());
SetMatrixFromCSRFormat(deepCopy.RowLocation(), deepCopy.ColLocation(), deepCopy.BufferPointer(), deepCopy.GetNumElemAllocated(), deepCopy.GetNumRows(), deepCopy.GetNumCols());
}
else if (deepCopy.GetFormat() == matrixFormatSparseCSC)
{
SetMatrixFromCSCFormat(deepCopy.ColLocation(), deepCopy.RowLocation(), deepCopy.NzValues(), deepCopy.NzCount(), deepCopy.GetNumRows(), deepCopy.GetNumCols());
SetMatrixFromCSCFormat(deepCopy.ColLocation(), deepCopy.RowLocation(), deepCopy.BufferPointer(), deepCopy.GetNumElemAllocated(), deepCopy.GetNumRows(), deepCopy.GetNumCols());
}
else
NOT_IMPLEMENTED;
@ -189,7 +189,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (this->GetFormat() == matrixFormatSparseCSR)
{
//we need to do conversion because CPUSparseMatrix uses size_t for indexes while GPUSparseMatrix uses int
cpuSparseMatrix.Resize(GetNumRows(), GetNumCols(), GetNumNZElements(), true, false);
cpuSparseMatrix.Resize(GetNumRows(), GetNumCols(), GetNumElemAllocated(), true, false);
cpuSparseMatrix.SetNzCount(GetNumNZElements());
PrepareDevice();
@ -210,7 +210,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CopyBuffer(cpuSparseMatrix.ColLocation(), h_Col, MajorIndexCount());
}
CUDA_CALL(cudaMemcpy(cpuSparseMatrix.NzValues(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
CUDA_CALL(cudaMemcpy(cpuSparseMatrix.BufferPointer(), BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToHost));
}
else if (this->GetFormat() == matrixFormatSparseCSC)
@ -236,7 +236,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CopyBuffer(cpuSparseMatrix.RowLocation(), h_Row, MajorIndexCount());
}
CUDA_CALL(cudaMemcpy(cpuSparseMatrix.NzValues(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
CUDA_CALL(cudaMemcpy(cpuSparseMatrix.BufferPointer(), BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToHost));
}
else
NOT_IMPLEMENTED;
@ -269,22 +269,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseScsr2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (float*)NzValues(), RowLocation(), ColLocation(), (float*)denseMatrix.BufferPointer(), int(m_numRows)));
CUSPARSE_CALL(cusparseScsr2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (float*)BufferPointer(), RowLocation(), ColLocation(), (float*)denseMatrix.BufferPointer(), int(m_numRows)));
}
else
{
CUSPARSE_CALL(cusparseDcsr2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (double*)NzValues(), RowLocation(), ColLocation(), (double*)denseMatrix.BufferPointer(), int(m_numRows)));
CUSPARSE_CALL(cusparseDcsr2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (double*)BufferPointer(), RowLocation(), ColLocation(), (double*)denseMatrix.BufferPointer(), int(m_numRows)));
}
}
else if (m_format == MatrixFormat::matrixFormatSparseCSC)
{
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseScsc2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (float*)NzValues(), RowLocation(), ColLocation(), (float*)denseMatrix.BufferPointer(), int(m_numRows)));
CUSPARSE_CALL(cusparseScsc2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (float*)BufferPointer(), RowLocation(), ColLocation(), (float*)denseMatrix.BufferPointer(), int(m_numRows)));
}
else
{
CUSPARSE_CALL(cusparseDcsc2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (double*)NzValues(), RowLocation(), ColLocation(), (double*)denseMatrix.BufferPointer(), int(m_numRows)));
CUSPARSE_CALL(cusparseDcsc2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (double*)BufferPointer(), RowLocation(), ColLocation(), (double*)denseMatrix.BufferPointer(), int(m_numRows)));
}
}
else
@ -333,14 +333,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, int(m_numRows), int(m_numCols), int(m_nz),
(float*)NzValues(), RowLocation(), ColLocation(), (float*)outMatrix.NzValues(),
CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, int(m_numRows), int(m_numCols), int(m_elemSizeAllocated),
(float*)BufferPointer(), RowLocation(), ColLocation(), (float*)outMatrix.BufferPointer(),
outMatrix.RowLocation(), outMatrix.ColLocation(), CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO));
}
else
{
CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, int(m_numRows), int(m_numCols), int(m_nz),
(double*)NzValues(), RowLocation(), ColLocation(), (double*)outMatrix.NzValues(),
CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, int(m_numRows), int(m_numCols), int(m_elemSizeAllocated),
(double*)BufferPointer(), RowLocation(), ColLocation(), (double*)outMatrix.BufferPointer(),
outMatrix.RowLocation(), outMatrix.ColLocation(), CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO));
}
}
@ -499,12 +499,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseSdense2csr(cusparseHandle, (int)m_numRows, (int)m_numCols, descr, reinterpret_cast<float*>(denseMatrix.BufferPointer()),
(int)m_numRows, nnzPerRowOrCol, reinterpret_cast<float*>(NzValues()), RowLocation(), ColLocation()));
(int)m_numRows, nnzPerRowOrCol, reinterpret_cast<float*>(BufferPointer()), RowLocation(), ColLocation()));
}
else
{
CUSPARSE_CALL(cusparseDdense2csr(cusparseHandle, (int)m_numRows, (int)m_numCols, descr, reinterpret_cast<double*>(denseMatrix.BufferPointer()),
(int)m_numRows, nnzPerRowOrCol, reinterpret_cast<double*>(NzValues()), RowLocation(), ColLocation()));
(int)m_numRows, nnzPerRowOrCol, reinterpret_cast<double*>(BufferPointer()), RowLocation(), ColLocation()));
}
}
else if (m_format == MatrixFormat::matrixFormatSparseCSC)
@ -512,12 +512,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseSdense2csc(cusparseHandle, (int)m_numRows, (int)m_numCols, descr, reinterpret_cast<float*>(denseMatrix.BufferPointer()),
(int)m_numRows, nnzPerRowOrCol, reinterpret_cast<float*>(NzValues()), RowLocation(), ColLocation()));
(int)m_numRows, nnzPerRowOrCol, reinterpret_cast<float*>(BufferPointer()), RowLocation(), ColLocation()));
}
else
{
CUSPARSE_CALL(cusparseDdense2csc(cusparseHandle, (int)m_numRows, (int)m_numCols, descr, reinterpret_cast<double*>(denseMatrix.BufferPointer()),
(int)m_numRows, nnzPerRowOrCol, reinterpret_cast<double*>(NzValues()), RowLocation(), ColLocation()));
(int)m_numRows, nnzPerRowOrCol, reinterpret_cast<double*>(BufferPointer()), RowLocation(), ColLocation()));
}
}
if (do_sync) CUDA_CALL(cudaEventRecord(done));
@ -529,11 +529,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator=(const GPUSparseMatrix<ElemType>& deepCopy)
{
Clear();
if (this != &deepCopy)
{
SetValue(deepCopy);
}
return *this;
SetMatrixName(deepCopy.m_matrixName);
return *this;
}
template<class ElemType>
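The reworked assignment operator is the canonical self-assignment-guarded form: when source and destination alias, the deep copy is skipped entirely, and the old up-front Clear() — which in that case would have destroyed the source's data too — is gone. The pattern in isolation (Widget and DeepCopyFrom are illustrative stand-ins for the matrix class and SetValue):

struct Widget
{
    Widget& operator=(const Widget& rhs)
    {
        if (this != &rhs)       // x = x must be a harmless no-op
            DeepCopyFrom(rhs);  // stand-in for SetValue(deepCopy)
        return *this;
    }
    void DeepCopyFrom(const Widget&) { /* copy buffers, name, flags */ }
};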
@ -566,7 +567,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (this != &moveFrom)
{
if (OwnBuffer())
Clear(); //always delete the data pointer since we will use the pointer from moveFrom
ReleaseMemory(); //always delete the data pointer since we will use the pointer from moveFrom
m_computeDevice = moveFrom.m_computeDevice;
m_numRows = moveFrom.m_numRows;
m_numCols = moveFrom.m_numCols;
@ -595,13 +596,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
GPUSparseMatrix<ElemType>::~GPUSparseMatrix()
{
Clear();
ReleaseMemory();
}
template<class ElemType>
void GPUSparseMatrix<ElemType>::Clear()
void GPUSparseMatrix<ElemType>::ReleaseMemory()
{
// If m_externalBuffer is true then this matrix
// If OwnBuffer() is false then this matrix
// is simply a view over another matrix. In that
// case we shouldn't free anything.
if (OwnBuffer())
@ -617,9 +618,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CUDA_CALL(cudaFree(m_rowToId));
m_rowToId = nullptr;
ZeroInit(m_format, m_computeDevice);
}
ZeroInit(m_format, m_computeDevice);
}
//ResizeAsAndCopyIndexFrom - Resize this sparse matrix to have the same element structure as the passed matrix
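Renaming Clear() to ReleaseMemory() also splits its two jobs: freeing device storage, which is only legal when the matrix owns its buffer, and resetting the object's bookkeeping, which the relocated ZeroInit call now performs for views and owners alike. As a sketch (the member names besides OwnBuffer's role are illustrative):

struct SparseBufferSketch
{
    bool ownBuffer;
    void ReleaseMemory()
    {
        if (ownBuffer)
            FreeDeviceAllocations();  // cudaFree each owned pointer
        ResetBookkeeping();           // corresponds to the relocated ZeroInit
    }
    void FreeDeviceAllocations() { /* cudaFree(m_pArray), cudaFree(m_rowToId), ... */ }
    void ResetBookkeeping()      { /* zero sizes, counts, offsets */ }
};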
@ -662,7 +663,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_pArray != nullptr)
{
CUDA_CALL(cudaMemcpy(pArray, NzValues(), NzSize(), cudaMemcpyDeviceToDevice));
CUDA_CALL(cudaMemcpy(pArray, BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToDevice));
GPUSPARSE_INDEX_TYPE* majorIndexInNewBuffer = (GPUSPARSE_INDEX_TYPE*)(pArray + m_elemSizeAllocated);
GPUSPARSE_INDEX_TYPE* secondaryIndexInNewBuffer = majorIndexInNewBuffer + MajorIndexCount(numRows, numCols, m_elemSizeAllocated, m_format);
@ -734,7 +735,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_nz > numNZElemToReserve || m_totalBufferSizeAllocated > bufferSizeNeeded)
LogicError("Resize: To keep values m_nz should <= numNZElemToReserve.");
CUDA_CALL(cudaMemcpy(pArray, NzValues(), NzSize(), cudaMemcpyDeviceToDevice));
CUDA_CALL(cudaMemcpy(pArray, BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToDevice));
GPUSPARSE_INDEX_TYPE* majorIndexInNewBuffer = (GPUSPARSE_INDEX_TYPE*)(pArray + numNZElemToReserve);
@ -795,7 +796,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
SetNzCount(nz);
cudaMemcpyKind kind = IsOnDevice ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
CUDA_CALL(cudaMemcpy(NzValues(), h_Val, NzSize(), kind));
CUDA_CALL(cudaMemcpy(BufferPointer(), h_Val, NzSize(), kind));
if (sizeof(CPUSPARSE_INDEX_TYPE) == sizeof(GPUSPARSE_INDEX_TYPE))
{
@ -817,7 +818,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// this function will allocate memory while the caller needs to release it
template<class ElemType>
void GPUSparseMatrix<ElemType>::GetMatrixFromCSRFormat(CPUSPARSE_INDEX_TYPE*& h_CSRRow, CPUSPARSE_INDEX_TYPE*& h_Col, ElemType*& h_Val, size_t &nz, size_t &numRows, size_t &numCols) const
void GPUSparseMatrix<ElemType>::GetMatrixFromCSRFormat(CPUSPARSE_INDEX_TYPE*& h_CSRRow, CPUSPARSE_INDEX_TYPE*& h_Col, ElemType*& h_Val, size_t &numElemAllocated, size_t &nz, size_t &numRows, size_t &numCols) const
{
if (!OwnBuffer())
LogicError("Cannot Set since the buffer is managed externally.");
@ -825,6 +826,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (h_CSRRow != nullptr || h_Col != nullptr || h_Val != nullptr)
LogicError("GetMatrixFromCSRFormat: Passed pointers must be nullptr");
numElemAllocated = GetNumElemAllocated();
nz = GetNumNZElements();
numRows = GetNumRows();
numCols = GetNumCols();
@ -833,12 +835,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return;
else
{
h_Val = new ElemType[nz];
h_Val = new ElemType[numElemAllocated];
h_CSRRow = new CPUSPARSE_INDEX_TYPE[m_numRows + 1];
h_Col = new CPUSPARSE_INDEX_TYPE[nz];
PrepareDevice();
CUDA_CALL(cudaMemcpy(h_Val, NzValues(), NzSize(), cudaMemcpyDeviceToHost));
CUDA_CALL(cudaMemcpy(h_Val, BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToHost));
if (sizeof(CPUSPARSE_INDEX_TYPE) == sizeof(GPUSPARSE_INDEX_TYPE))
{
@ -875,7 +877,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
SetNzCount(nz);
cudaMemcpyKind kind = IsOnDevice ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
CUDA_CALL(cudaMemcpy(NzValues(), h_Val, NzSize(), kind));
CUDA_CALL(cudaMemcpy(BufferPointer(), h_Val, NzSize(), kind));
if (sizeof(CPUSPARSE_INDEX_TYPE) == sizeof(GPUSPARSE_INDEX_TYPE))
{
@ -897,11 +899,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// this function will allocate memory while the caller needs to release it
template<class ElemType>
void GPUSparseMatrix<ElemType>::GetMatrixFromCSCFormat(GPUSPARSE_INDEX_TYPE*& h_CSCCol, GPUSPARSE_INDEX_TYPE*& h_Row, ElemType*& h_Val, size_t &nz, size_t &numRows, size_t &numCols) const
void GPUSparseMatrix<ElemType>::GetMatrixFromCSCFormat(GPUSPARSE_INDEX_TYPE*& h_CSCCol, GPUSPARSE_INDEX_TYPE*& h_Row, ElemType*& h_Val, size_t &numElemAllocated, size_t &nz, size_t &numRows, size_t &numCols) const
{
if (h_CSCCol != nullptr || h_Row != nullptr || h_Val != nullptr)
LogicError("GetMatrixFromCSCFormat: Passed pointers must be nullptr");
numElemAllocated = GetNumElemAllocated();
nz = GetNumNZElements();
numRows = GetNumRows();
numCols = GetNumCols();
@ -910,12 +913,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return;
else
{
h_Val = new ElemType[nz];
h_Val = new ElemType[numElemAllocated];
h_CSCCol = new GPUSPARSE_INDEX_TYPE[m_numRows + 1];
h_Row = new GPUSPARSE_INDEX_TYPE[nz];
PrepareDevice();
CUDA_CALL(cudaMemcpy(h_Val, NzValues(), NzSize(), cudaMemcpyDeviceToHost));
CUDA_CALL(cudaMemcpy(h_Val, BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToHost));
if (sizeof(CPUSPARSE_INDEX_TYPE) == sizeof(GPUSPARSE_INDEX_TYPE))
{
@ -1038,7 +1041,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
alpha,
reinterpret_cast<const ElemType*>(lhs.BufferPointer()), //dense
transposeA,
reinterpret_cast<const ElemType*>(rhs.NzValues()), //sparse nz values
reinterpret_cast<const ElemType*>(rhs.BufferPointer()), //sparse nz values
rhs.RowLocation(),
rhs.ColLocation(),
beta,
@ -1072,7 +1075,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
alpha,
reinterpret_cast<const ElemType*>(lhs.BufferPointer()), //dense
transposeA,
reinterpret_cast<const ElemType*>(rhs.NzValues()), //sparse nz values
reinterpret_cast<const ElemType*>(rhs.BufferPointer()), //sparse nz values
rhs.RowLocation(),
rhs.ColLocation(),
reinterpret_cast<ElemType*> (c.BufferPointer()) //dense target
@ -1121,10 +1124,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CUDA_LONG N = (CUDA_LONG)c.GetNumCols();
int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock);
_tensorShuffleScaleAndAddRowSparse<ElemType> << <blocksPerGrid, threadsPerBlock, 0, t_stream >> >(
reinterpret_cast<const ElemType*>(a.NzValues()), // source nz values
reinterpret_cast<const ElemType*>(a.BufferPointer()), // source nz values
a.RowLocation(),
a.ColLocation(),
reinterpret_cast<ElemType*>(c.NzValues()), // target nz values
reinterpret_cast<ElemType*>(c.BufferPointer()), // target nz values
c.RowLocation(),
c.ColLocation(),
D, S, M, K, T);
@ -1204,7 +1207,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t nnz = m*c.m_blockSize;
c.Resize(m, n, nnz, true, true); //we need to keep the col2blockid and blockid2col info when resizing.
c.m_nz = nnz;
CUDA_CALL(cudaMemset(c.NzValues(), 0, sizeof(ElemType)*(c.m_nz)));
CUDA_CALL(cudaMemset(c.BufferPointer(), 0, sizeof(ElemType)*(c.m_elemSizeAllocated)));
LONG64 N = (LONG64)lhs.GetNumElements(); //here we process for each row in lhs and each column in rhs (==columns in lhs)
blocksPerGrid = (int)ceil(((double)N) / threadsPerBlock);
@ -1213,11 +1216,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
lhs.BufferPointer(),
m,
l,
rhs.NzValues(),
rhs.BufferPointer(),
rhs.RowLocation(),
rhs.ColLocation(),
c.ColOrRow2BlockId(),
c.NzValues());
c.BufferPointer());
}
else
{
@ -1225,7 +1228,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t nnz = m*c.m_blockSize;
c.Resize(m, n, nnz, true, false);
c.m_nz = nnz;
CUDA_CALL(cudaMemset(c.NzValues(), 0, sizeof(ElemType)*(c.m_nz)));
CUDA_CALL(cudaMemset(c.BufferPointer(), 0, sizeof(ElemType)*(c.m_elemSizeAllocated)));
CUDA_CALL(cudaMemset(c.BlockId2ColOrRow(), 0, sizeof(GPUSPARSE_INDEX_TYPE)*(c.m_blockSize)));
LONG64 N = (LONG64)lhs.GetNumElements(); //here we process for each row in lhs and each column in rhs (==columns in lhs)
@ -1235,11 +1238,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
lhs.BufferPointer(),
m,
l,
rhs.NzValues(),
rhs.BufferPointer(),
rhs.RowLocation(),
rhs.ColLocation(),
rhs.m_rowToId,
c.NzValues(),
c.BufferPointer(),
c.BlockId2ColOrRow());
}
@ -1307,7 +1310,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
lhs.GetNumRows(),
lhs.GetNumCols(),
lhs.m_blockSize,
lhs.NzValues(),
lhs.BufferPointer(),
lhs.BlockId2ColOrRow(),
rhs.BufferPointer());
@ -1388,7 +1391,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
GetNumRows(),
GetNumCols(),
m_blockSize,
NzValues(),
BufferPointer(),
BlockId2ColOrRow(),
c.BufferPointer());
@ -1435,7 +1438,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int blocksPerGrid = (m_nz + threadsPerBlock - 1) / threadsPerBlock;
bool colMajor = (m_format == MatrixFormat::matrixFormatSparseBlockCol ? true : false);
size_t len = colMajor ? GetNumRows() : GetNumCols();
_adagrad4BlockSparse<ElemType> << <blocksPerGrid, threadsPerBlock >> >(c.GetArray(), c.GetNumRows(), NzValues(), BlockId2ColOrRow(), multipliers, colMajor, len, m_nz);
_adagrad4BlockSparse<ElemType> << <blocksPerGrid, threadsPerBlock >> >(c.GetArray(), c.GetNumRows(), BufferPointer(), BlockId2ColOrRow(), multipliers, colMajor, len, m_nz);
}
else
NOT_IMPLEMENTED;
@ -1494,13 +1497,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
if (sizeof(ElemType)==sizeof(float))
{
CUSPARSE_CALL(cusparseScsrmm(cusparseHandle,oper,m,n,k,(int)a.GetNumNZElements(),reinterpret_cast <float*>(&alpha),descr,reinterpret_cast <const float*>(a.NzValues()),
CUSPARSE_CALL(cusparseScsrmm(cusparseHandle,oper,m,n,k,(int)a.GetNumElemAllocated(),reinterpret_cast <float*>(&alpha),descr,reinterpret_cast <const float*>(a.BufferPointer()),
a.RowLocation(), a.ColLocation(), reinterpret_cast <float*>(b.BufferPointer()),
(int)b.GetNumRows(),reinterpret_cast <float*>(&beta),reinterpret_cast <float*>(c.BufferPointer()),(int)c.GetNumRows()));
}
else
{
CUSPARSE_CALL(cusparseDcsrmm(cusparseHandle,oper,m,n,k,(int)a.GetNumNZElements(),reinterpret_cast <double*>(&alpha),descr,reinterpret_cast <const double*>(a.NzValues()),
CUSPARSE_CALL(cusparseDcsrmm(cusparseHandle, oper, m, n, k, (int)a.GetNumElemAllocated(), reinterpret_cast <double*>(&alpha), descr, reinterpret_cast <const double*>(a.BufferPointer()),
a.RowLocation(), a.ColLocation(), reinterpret_cast <double*>(b.BufferPointer()),
(int)b.GetNumRows(),reinterpret_cast <double*>(&beta),reinterpret_cast <double*>(c.BufferPointer()),(int)c.GetNumRows()));
}
@ -1586,9 +1589,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool allocatedBuffer = false;
// do we have enough memory to store just the row buffer?
if (cSize >= rowBufferRequired && c.NzValues() != nullptr && canReuseBuffer)
if (cSize >= rowBufferRequired && c.BufferPointer() != nullptr && canReuseBuffer)
{
csrRowPtrC = (GPUSPARSE_INDEX_TYPE*)c.NzValues();
csrRowPtrC = (GPUSPARSE_INDEX_TYPE*)c.BufferPointer();
}
else
{
@ -1665,15 +1668,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//Step 2
if (sizeof(float)==sizeof(ElemType))
{
CUSPARSE_CALL(cusparseScsrgemm(cusparseHandle,operA,operB,m,n,k,descrA,nnzA,(const float*)S1.NzValues(),S1.RowLocation(),S1.ColLocation(),
descrB,nnzB,(const float*)S2.NzValues(),S2.RowLocation(),S2.ColLocation(),
descrC,(float*)c.NzValues(),c.RowLocation(),c.ColLocation()));
CUSPARSE_CALL(cusparseScsrgemm(cusparseHandle, operA, operB, m, n, k, descrA, nnzA, (const float*)S1.BufferPointer(), S1.RowLocation(), S1.ColLocation(),
descrB, nnzB, (const float*)S2.BufferPointer(), S2.RowLocation(), S2.ColLocation(),
descrC, (float*)c.BufferPointer(), c.RowLocation(), c.ColLocation()));
}
else
{
CUSPARSE_CALL(cusparseDcsrgemm(cusparseHandle,operA,operB,m,n,k,descrA,nnzA,(const double*)S1.NzValues(),S1.RowLocation(),S1.ColLocation(),
descrB,nnzB,(const double*)S2.NzValues(),S2.RowLocation(),S2.ColLocation(),
descrC,(double*)c.NzValues(),c.RowLocation(),c.ColLocation()));
CUSPARSE_CALL(cusparseDcsrgemm(cusparseHandle, operA, operB, m, n, k, descrA, nnzA, (const double*)S1.BufferPointer(), S1.RowLocation(), S1.ColLocation(),
descrB, nnzB, (const double*)S2.BufferPointer(), S2.RowLocation(), S2.ColLocation(),
descrC, (double*)c.BufferPointer(), c.RowLocation(), c.ColLocation()));
}
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
@ -1731,13 +1734,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//Step 2
if (sizeof(ElemType)==sizeof(float))
{
CUSPARSE_CALL(cusparseScsrgeam(cusparseHandle,m,n,reinterpret_cast <const float*>(&alpha),descrA,nnzA,reinterpret_cast <const float*>(a.NzValues()),a.RowLocation(),a.ColLocation(),
reinterpret_cast <const float*>(&beta),descrB,nnzB,reinterpret_cast <const float*>(b.NzValues()),b.RowLocation(),b.ColLocation(),descrC,reinterpret_cast <float*>(c.NzValues()),c.RowLocation(),c.ColLocation()));
CUSPARSE_CALL(cusparseScsrgeam(cusparseHandle, m, n, reinterpret_cast <const float*>(&alpha), descrA, nnzA, reinterpret_cast <const float*>(a.BufferPointer()), a.RowLocation(), a.ColLocation(),
reinterpret_cast <const float*>(&beta), descrB, nnzB, reinterpret_cast <const float*>(b.BufferPointer()), b.RowLocation(), b.ColLocation(), descrC, reinterpret_cast <float*>(c.BufferPointer()), c.RowLocation(), c.ColLocation()));
}
else
{
CUSPARSE_CALL(cusparseDcsrgeam(cusparseHandle,m,n,reinterpret_cast <const double*>(&alpha),descrA,nnzA,reinterpret_cast <const double*>(a.NzValues()),a.RowLocation(),a.ColLocation(),
reinterpret_cast <const double*>(&beta),descrB,nnzB,reinterpret_cast <const double*>(b.NzValues()),b.RowLocation(),b.ColLocation(),descrC,reinterpret_cast <double*>(c.NzValues()),c.RowLocation(),c.ColLocation()));
CUSPARSE_CALL(cusparseDcsrgeam(cusparseHandle, m, n, reinterpret_cast <const double*>(&alpha), descrA, nnzA, reinterpret_cast <const double*>(a.BufferPointer()), a.RowLocation(), a.ColLocation(),
reinterpret_cast <const double*>(&beta), descrB, nnzB, reinterpret_cast <const double*>(b.BufferPointer()), b.RowLocation(), b.ColLocation(), descrC, reinterpret_cast <double*>(c.BufferPointer()), c.RowLocation(), c.ColLocation()));
}
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
@ -1766,7 +1769,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
CUDA_LONG M=(CUDA_LONG)a.GetNumRows();
int blocksPerGrid =(int)ceil(1.0*M/threadsPerBlock);
_sparseCSRPlusDense<ElemType><<<blocksPerGrid,threadsPerBlock>>>(alpha,a.NzValues(),a.RowLocation(),a.ColLocation(),c.BufferPointer(),M);
_sparseCSRPlusDense<ElemType><<<blocksPerGrid,threadsPerBlock>>>(alpha,a.BufferPointer(),a.RowLocation(),a.ColLocation(),c.BufferPointer(),M);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -1859,11 +1862,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const float*>(a.NzValues()), a.RowLocation(), a.ColLocation(), reinterpret_cast<float*>(cscValA), cscRowIndA, cscColPtrA, cpVals, idxBase));
CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const float*>(a.BufferPointer()), a.RowLocation(), a.ColLocation(), reinterpret_cast<float*>(cscValA), cscRowIndA, cscColPtrA, cpVals, idxBase));
}
else
{
CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const double*>(a.NzValues()), a.RowLocation(), a.ColLocation(), reinterpret_cast<double*>(cscValA), cscRowIndA, cscColPtrA, cpVals, idxBase));
CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const double*>(a.BufferPointer()), a.RowLocation(), a.ColLocation(), reinterpret_cast<double*>(cscValA), cscRowIndA, cscColPtrA, cpVals, idxBase));
}
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
@ -1871,7 +1874,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else if (a.m_format == matrixFormatSparseCSC)
{
cscValA = (ElemType*)a.NzValues();
cscValA = (ElemType*)a.BufferPointer();
cscRowIndA = a.RowLocation();
cscColPtrA = a.ColLocation();
}
@ -2023,7 +2026,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
CUDA_LONG M=(CUDA_LONG)a.GetNumRows();
int blocksPerGrid =(int)ceil(1.0*M/threadsPerBlock);
_sparseCSRElemMulDense<ElemType><<<blocksPerGrid,threadsPerBlock>>>(a.NzValues(),a.RowLocation(),a.ColLocation(),b.BufferPointer(),c.BufferPointer(),M);
_sparseCSRElemMulDense<ElemType> << <blocksPerGrid, threadsPerBlock >> >(a.BufferPointer(), a.RowLocation(), a.ColLocation(), b.BufferPointer(), c.BufferPointer(), M);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -2120,26 +2123,26 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const float*>(this->NzValues()), this->RowLocation(), this->ColLocation(),
reinterpret_cast<float*>(c.NzValues()), c.ColLocation(), c.RowLocation(), cpVals, idxBase));
CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const float*>(this->BufferPointer()), this->RowLocation(), this->ColLocation(),
reinterpret_cast<float*>(c.BufferPointer()), c.ColLocation(), c.RowLocation(), cpVals, idxBase));
}
else
{
CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const double*>(this->NzValues()), this->RowLocation(), this->ColLocation(),
reinterpret_cast<double*>(c.NzValues()), c.ColLocation(), c.RowLocation(), cpVals, idxBase));
CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const double*>(this->BufferPointer()), this->RowLocation(), this->ColLocation(),
reinterpret_cast<double*>(c.BufferPointer()), c.ColLocation(), c.RowLocation(), cpVals, idxBase));
}
}
else if (m_format == matrixFormatSparseCSC)
{
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const float*>(this->NzValues()), this->ColLocation(), this->RowLocation(),
reinterpret_cast<float*>(c.NzValues()), c.RowLocation(), c.ColLocation(), cpVals, idxBase));
CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const float*>(this->BufferPointer()), this->ColLocation(), this->RowLocation(),
reinterpret_cast<float*>(c.BufferPointer()), c.RowLocation(), c.ColLocation(), cpVals, idxBase));
}
else
{
CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const double*>(this->NzValues()), this->ColLocation(), this->RowLocation(),
reinterpret_cast<double*>(c.NzValues()), c.RowLocation(), c.ColLocation(), cpVals, idxBase));
CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const double*>(this->BufferPointer()), this->ColLocation(), this->RowLocation(),
reinterpret_cast<double*>(c.BufferPointer()), c.RowLocation(), c.ColLocation(), cpVals, idxBase));
}
}
else
@ -2191,7 +2194,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
slice.m_computeDevice = m_computeDevice;
slice.m_numRows = m_numRows;
slice.m_numCols = numCols;
slice.m_nz = m_nz;
slice.m_nz = SecondaryIndexValueAt(startColumn + numCols) - SecondaryIndexValueAt(startColumn);
slice.m_elemSizeAllocated = m_elemSizeAllocated;
slice.m_totalBufferSizeAllocated = m_totalBufferSizeAllocated;
slice.m_pArray = m_pArray;
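A worked instance of the new m_nz computation: for a 4-column CSC matrix with compressed column index {0, 2, 5, 9, 10}, a 2-column slice starting at column 1 now reports 9 - 2 = 7 nonzeros, where the old code reported the parent's full count. Checking that arithmetic in plain host code (the names are illustrative):

#include <cassert>
#include <cstddef>

int main()
{
    const int colPtr[] = { 0, 2, 5, 9, 10 };  // CSC compressed column index
    const std::size_t startColumn = 1, numCols = 2;
    // Mirrors slice.m_nz = SecondaryIndexValueAt(startColumn + numCols)
    //                    - SecondaryIndexValueAt(startColumn) above.
    const int sliceNz = colPtr[startColumn + numCols] - colPtr[startColumn];
    assert(sliceNz == 7);  // columns 1 and 2 hold 3 + 4 nonzeros
    return 0;
}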
@ -2237,11 +2240,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CUSPARSE_CALL(cusparseSetStream(cusparseHandle, t_stream));
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseScsc2dense(cusparseHandle, m, numCols, descr, (float*)NzValues(), RowLocation(), ColLocation() + startColumn, (float*)slice.BufferPointer(), m));
CUSPARSE_CALL(cusparseScsc2dense(cusparseHandle, m, numCols, descr, (float*)BufferPointer(), RowLocation(), ColLocation() + startColumn, (float*)slice.BufferPointer(), m));
}
else
{
CUSPARSE_CALL(cusparseDcsc2dense(cusparseHandle, m, numCols, descr, (double*)NzValues(), RowLocation(), ColLocation() + startColumn, (double*)slice.BufferPointer(), m));
CUSPARSE_CALL(cusparseDcsc2dense(cusparseHandle, m, numCols, descr, (double*)BufferPointer(), RowLocation(), ColLocation() + startColumn, (double*)slice.BufferPointer(), m));
}
if (do_sync) CUDA_CALL(cudaEventRecord(done));
@ -2787,7 +2790,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
stream<<us.GetMatrixName();
}
size_t nz = us.GetNumNZElements(), numRows=us.GetNumRows(), numCols=us.GetNumCols();
size_t nz = us.GetNumNZElements(), numElemAllocated = us.GetNumElemAllocated(), numRows = us.GetNumRows(), numCols = us.GetNumCols();
size_t compressedSize = us.SecondaryIndexCount();
int format = us.GetFormat();
@ -2800,9 +2803,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CPUSPARSE_INDEX_TYPE* unCompressedIndex = nullptr;
if (us.m_format == matrixFormatSparseCSC)
us.GetMatrixFromCSCFormat(compressedIndex, unCompressedIndex, dataBuffer, nz, numRows, numCols);
us.GetMatrixFromCSCFormat(compressedIndex, unCompressedIndex, dataBuffer, numElemAllocated, nz, numRows, numCols);
else if (us.m_format == matrixFormatSparseCSR)
us.GetMatrixFromCSRFormat(compressedIndex, unCompressedIndex, dataBuffer, nz, numRows, numCols);
us.GetMatrixFromCSRFormat(compressedIndex, unCompressedIndex, dataBuffer, numElemAllocated, nz, numRows, numCols);
else
NOT_IMPLEMENTED;

View file

@ -69,8 +69,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// in memory format is always in the following order:
// Non-zero data elements, Full index locations, compressed index locations
// In CSR row data is compressed, in CSC col data is compressed
inline const ElemType* NzValues() const {return m_pArray;}
inline ElemType* NzValues() {return m_pArray;}
inline const ElemType* NzValues() const { return m_format != matrixFormatSparseCSC ? m_pArray : m_pArray + SecondaryIndexValueAt(m_sliceViewOffset); }
inline ElemType* NzValues() { return m_format != matrixFormatSparseCSC ? m_pArray : m_pArray + SecondaryIndexValueAt(m_sliceViewOffset); }
inline size_t NzSize() const {return sizeof(ElemType)*m_nz;} // actual number of element bytes in use
GPUSPARSE_INDEX_TYPE* MajorIndexLocation() const //row/col ids in CSC/CSR format, blockId2col/blockId2row in BlockCol/BlockRow format
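For a CSC slice view, m_sliceViewOffset is a column position, so NzValues() now translates it into an element position through the compressed column index; SecondaryIndexValueAt (added further down) performs that translation as a synchronous single-index device-to-host copy. The two steps fused into one hedged standalone helper (the names are illustrative, not CNTK API):

#include <cuda_runtime.h>
#include <cstddef>

template <class ElemType>
const ElemType* CscSliceNzValues(const ElemType* pArray,
                                 const int* deviceColPtr, std::size_t sliceViewOffset)
{
    // One 4-byte device-to-host read of the slice's starting offset...
    int startOffset = 0;
    cudaMemcpy(&startOffset, deviceColPtr + sliceViewOffset, sizeof(int),
               cudaMemcpyDeviceToHost);
    // ...then advance the value pointer past the preceding columns' nonzeros.
    return pArray + startOffset;
}

Since each call costs a blocking transfer, callers that need the pointer repeatedly are better off reading it once and reusing it.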
@ -134,6 +134,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
inline size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
inline ElemType* BufferPointer() const { return m_pArray; }
inline size_t GetNumElemAllocated() const { return m_elemSizeAllocated; }
inline size_t GetSizeElemAllocated() const { return sizeof(ElemType)*m_elemSizeAllocated; }
// the column and row locations will swap based on what format we are in. Full index always follows the data array
GPUSPARSE_INDEX_TYPE* RowLocation() const
@ -163,7 +165,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
assert(m_format == matrixFormatSparseCSC || m_format == matrixFormatSparseCSR);
return (m_format&matrixFormatRowMajor) ? MajorIndexSize() : SecondaryIndexSize();
}
}
GPUSPARSE_INDEX_TYPE SecondaryIndexValueAt(size_t idx) const
{
GPUSPARSE_INDEX_TYPE value;
CUDA_CALL(cudaMemcpy(&value, SecondaryIndexLocation() + idx, sizeof(GPUSPARSE_INDEX_TYPE), cudaMemcpyDeviceToHost));
return value;
}
GPUSPARSE_INDEX_TYPE* BlockId2ColOrRow() const
{
//not a valid function for other formats
@ -225,9 +234,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const size_t nz, const size_t numRows, const size_t numCols, const bool IsOnDevice = false, const DEVICEID_TYPE devId = -1);
//Gets sparse matrix in CSR format. this acts as deep copy. All passed pointers must be NULL. the function will allocate memory itself.
void GetMatrixFromCSRFormat(CPUSPARSE_INDEX_TYPE*& h_CSRRow, CPUSPARSE_INDEX_TYPE*& h_Col, ElemType*& h_Val, size_t &nz, size_t &numRows, size_t &numCols) const;
void GetMatrixFromCSRFormat(CPUSPARSE_INDEX_TYPE*& h_CSRRow, CPUSPARSE_INDEX_TYPE*& h_Col, ElemType*& h_Val, size_t &numElemAllocated, size_t &nz, size_t &numRows, size_t &numCols) const;
void GetMatrixFromCSCFormat(CPUSPARSE_INDEX_TYPE*& h_CSCCol, CPUSPARSE_INDEX_TYPE*& h_Row, ElemType*& h_Val, size_t &nz, size_t &numRows, size_t &numCols) const;
void GetMatrixFromCSCFormat(CPUSPARSE_INDEX_TYPE*& h_CSCCol, CPUSPARSE_INDEX_TYPE*& h_Row, ElemType*& h_Val, size_t &numElemAllocated, size_t &nz, size_t &numRows, size_t &numCols) const;
void ConvertToSparseFormat(MatrixFormat newFormat);
void ConvertToSparseFormat(MatrixFormat newFormat, GPUSparseMatrix<ElemType>& outMatrix) const;
@ -326,7 +335,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
private:
void performInplaceFunction(const int kind);
void DeepCopy(const GPUSparseMatrix<ElemType>& deepCopyFrom);
void Clear();
void ReleaseMemory();
void PrepareBuffer(const size_t numRows, const size_t numCols, const bool canReuseBuffer, std::function<size_t(GPUSPARSE_INDEX_TYPE* csrRowPtrC)> func);
size_t ElemCountFromBufferSize(const size_t numRows, const size_t numCols, const MatrixFormat format, const size_t totalBufferSize) const;

View file

@ -222,8 +222,8 @@ namespace Microsoft
float *arr = nullptr;
int *ii = nullptr;
int *jj = nullptr;
size_t nz, nr, nc;
poweredMatrix.GetMatrixFromCSRFormat(ii, jj, arr, nz, nr, nc);
size_t ea, nz, nr, nc;
poweredMatrix.GetMatrixFromCSRFormat(ii, jj, arr, ea, nz, nr, nc);
for (int index = 0; index < c_size; ++index)
{

View file

@ -180,6 +180,7 @@ namespace Microsoft
bool transposeA = false, transposeB = false;
float alpha = 2.4f;
float beta = 0.0f;
Matrix<float>::MultiplyAndWeightedAdd(alpha, mAdense, transposeA, mBdense, transposeB, beta, mCdense);
Matrix<float>::MultiplyAndWeightedAdd(alpha, mAsparse, transposeA, mBsparse, transposeB, beta, mCsparse);
mCsparse.SwitchToMatrixType(MatrixType::DENSE, matrixFormatDense, true);
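The added dense multiply gives the test an oracle: the same MultiplyAndWeightedAdd runs through both the dense and sparse code paths, and once mCsparse is switched back to DENSE the two results can be compared elementwise. A plausible closing assertion, hedged (the assertion macro and tolerance are assumptions about the surrounding test framework; IsEqualTo with a threshold is the usual Matrix comparison idiom):

// Assumed check: sparse-path result matches the dense reference within 1e-4.
Assert::IsTrue(mCsparse.IsEqualTo(mCdense, 0.0001f));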