diff --git a/MachineLearning/cn/TrainingCriterionNode.h b/MachineLearning/cn/TrainingCriterionNode.h
index 16709ec71..39ca907b9 100644
--- a/MachineLearning/cn/TrainingCriterionNode.h
+++ b/MachineLearning/cn/TrainingCriterionNode.h
@@ -1143,8 +1143,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (m_children.size() != 3)
                 throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires three inputs.");
 
-            if (Inputs(0)->OperationName() != L"SparseInputValue"
-                && Inputs(0)->OperationName() != L"InputValue")
+            if (Inputs(0)->OperationName() != SparseInputValue<ElemType>::TypeName()
+                && Inputs(0)->OperationName() != InputValue<ElemType>::TypeName())
                 throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires the first input to be the label.");
 
             if (!(Inputs(1)->FunctionValues().GetNumRows() == Inputs(2)->FunctionValues().GetNumCols() &&  // input and matrix can be timed
diff --git a/Math/Math/CPUSparseMatrix.h b/Math/Math/CPUSparseMatrix.h
index 56831159a..aac886ec1 100644
--- a/Math/Math/CPUSparseMatrix.h
+++ b/Math/Math/CPUSparseMatrix.h
@@ -141,7 +141,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
     public:
         const ElemType* NzValues() const { return m_pArray; }
-        ElemType* NzValues() { return m_pArray; }
+        inline ElemType* NzValues() { return m_pArray; }
         size_t NzSize() const { return sizeof(ElemType)*m_nz; } // actual number of element bytes in use
 
         CPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return m_unCompIndex; } //this is the major index, row/col ids in CSC/CSR format
diff --git a/Math/Math/GPUMatrixCUDAKernels.cu b/Math/Math/GPUMatrixCUDAKernels.cu
index 7351819fb..9030c7c15 100644
--- a/Math/Math/GPUMatrixCUDAKernels.cu
+++ b/Math/Math/GPUMatrixCUDAKernels.cu
@@ -2290,11 +2290,11 @@ __global__ void _denseMultSparseCSCAndWeightedAddToDense(
 //assume resultValues are 0-initialized
 template<class ElemType>
 __global__ void _denseMulSparseCSCTransposeToSparseBlockCol(
-    ElemType alpha,
-    ElemType* lhsValues,
-    size_t numRowsLhs,
-    size_t numColsRhs,
-    ElemType* rhsNZValues,
+    const ElemType alpha,
+    const ElemType* lhsValues,
+    const size_t numRowsLhs,
+    const size_t numColsRhs,
+    const ElemType* rhsNZValues,
     const GPUSPARSE_INDEX_TYPE* rhsRows,
     const GPUSPARSE_INDEX_TYPE* rhsCols,
     const size_t* rhsRowIdx,
diff --git a/Math/Math/GPUSparseMatrix.cu b/Math/Math/GPUSparseMatrix.cu
index 970e4ca35..74fda4f6b 100644
--- a/Math/Math/GPUSparseMatrix.cu
+++ b/Math/Math/GPUSparseMatrix.cu
@@ -79,7 +79,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         m_matrixName=nullptr;
 
         m_blockSize = 0;
-        m_blockVal = nullptr;
         m_blockIds = nullptr;
 
         m_expandedSize = 0;
@@ -241,7 +240,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 CopyBuffer(cpuSparseMatrix.ColLocation(), h_Col, MajorIndexCount());
             }
 
-            CUDACALL(cudaMemcpy(cpuSparseMatrix.BufferPointer(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
+            CUDACALL(cudaMemcpy(cpuSparseMatrix.NzValues(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
         }
         else if (this->GetFormat() == matrixFormatSparseCSC)
         {
@@ -267,7 +266,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 CopyBuffer(cpuSparseMatrix.RowLocation(), h_Row, MajorIndexCount());
             }
 
-            CUDACALL(cudaMemcpy(cpuSparseMatrix.BufferPointer(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
+            CUDACALL(cudaMemcpy(cpuSparseMatrix.NzValues(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
         }
         else
             NOT_IMPLEMENTED;
@@ -571,7 +570,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         m_matrixName=moveFrom.m_matrixName;
 
         m_blockSize = moveFrom.m_blockSize;
-        m_blockVal = moveFrom.m_blockVal;
         m_blockIds = moveFrom.m_blockIds;
 
         m_expandedSize = moveFrom.m_expandedSize;
@@ -602,7 +600,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             m_matrixName=moveFrom.m_matrixName;
 
             m_blockSize = moveFrom.m_blockSize;
-            m_blockVal = moveFrom.m_blockVal;
             m_blockIds = moveFrom.m_blockIds;
 
             m_expandedSize = moveFrom.m_expandedSize;
@@ -636,8 +633,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if(m_pArray != nullptr)
                 CUDACALL(cudaFree(m_pArray));
-            if(m_blockVal != nullptr)
-                CUDACALL(cudaFree(m_blockVal));
             if(m_blockIds != nullptr)
                 CUDACALL(cudaFree(m_blockIds));
             if (m_rowToId != nullptr)
@@ -669,22 +664,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     //-------------------------------------------------------------------------
     // Start of new GPU Sparse Matrix code
     //-------------------------------------------------------------------------
-
-        template<class ElemType>
-        ElemType* GPUSparseMatrix<ElemType>::BufferPointer() const
-        {
-            if(m_format == matrixFormatSparseCSC || m_format == matrixFormatSparseCSR)
-            {
-                return m_pArray;
-            }
-            else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
-            {
-                return m_blockVal;
-            }
-            else
-                NOT_IMPLEMENTED;
-        }
-
         template<class ElemType>
         void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly)
         {
@@ -728,13 +707,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                     m_totalBufferSizeAllocated = bufferSizeNeeded;
                     m_elemSizeAllocated = numNZElemToReserve;
                 }
+                else
+                {
+                    m_elemSizeAllocated = ElemCountFromBufferSize();
+                }
             }
             else if (matrixFormat == MatrixFormat::matrixFormatSparseBlockCol || matrixFormat == MatrixFormat::matrixFormatSparseBlockRow)
             {
                 if (m_elemSizeAllocated < numNZElemToReserve || (m_elemSizeAllocated > numNZElemToReserve && !growOnly))
                 {
-                    if (m_blockVal != nullptr)
-                        CUDACALL(cudaFree(m_blockVal));
+                    if (m_pArray != nullptr)
+                        CUDACALL(cudaFree(m_pArray));
                     if (m_blockIds != nullptr)
                         CUDACALL(cudaFree(m_blockIds));
                     if (m_block2UniqId != nullptr)
@@ -742,7 +725,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                     PrepareDevice();
                     size_t newCompIndexSize = max(numRows, numCols) + 1;
 
-                    CUDACALL(cudaMalloc((void **)&m_blockVal, sizeof(ElemType)*numNZElemToReserve));
+                    CUDACALL(cudaMalloc((void **)&m_pArray, sizeof(ElemType)*numNZElemToReserve));
                     CUDACALL(cudaMalloc((void **)&m_blockIds, sizeof(size_t)*newCompIndexSize));
                     CUDACALL(cudaMalloc((void **)&m_block2UniqId, sizeof(size_t)*newCompIndexSize));
@@ -997,7 +980,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             c.m_blockSize = rhs.m_blockSize;
             c.m_nz = m*c.m_blockSize;
             c.Resize(m, n, c.m_nz);
-            CUDACALL(cudaMemset(c.m_blockVal, 0, sizeof(ElemType)*(c.m_nz)));
+            CUDACALL(cudaMemset(c.NzValues(), 0, sizeof(ElemType)*(c.m_nz)));
             CUDACALL(cudaMemset(c.m_blockIds, 0, sizeof(size_t)*(c.m_blockSize)));
 
             LONG64 N = (LONG64)lhs.GetNumElements();  //here we process for each row in lhs and each column in rhs (==columns in lhs)
@@ -1009,11 +992,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 lhs.BufferPointer(),
                 m,
                 l,
-                rhs.BufferPointer(),
+                rhs.NzValues(),
                 rhs.RowLocation(),
                 rhs.ColLocation(),
                 rhs.m_rowToId,
-                c.BufferPointer(),
+                c.NzValues(),
                 c.m_blockIds);
 
             if (do_sync)    CUDACALL(cudaEventRecord(done));
@@ -1054,7 +1037,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 lhs.GetNumRows(),
                 lhs.GetNumCols(),
                 lhs.m_blockSize,
-                lhs.m_blockVal,
+                lhs.NzValues(),
                 lhs.m_blockIds,
                 rhs.BufferPointer());
 
@@ -1115,7 +1098,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 label.m_block2Id,
                 cls.BufferPointer(),
                 idx2cls.BufferPointer(),
-                etp.m_pArray,
+                etp.NzValues(),
                 etp.MajorIndexLocation(),
                 etp.SecondaryIndexLocation());
 
@@ -1195,7 +1178,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             }
             grd.m_blockSize = label.m_blockSize;
             grd.m_nz = nz;
-            CUDACALL(cudaMemset(grd.m_blockVal,0,sizeof(ElemType)*(grd.m_nz)));
+            CUDACALL(cudaMemset(grd.BufferPointer(),0,sizeof(ElemType)*(grd.m_nz)));
             CUDACALL(cudaMemset(grd.m_blockIds,0,sizeof(size_t)*(grd.m_blockSize)));
 
             cudaEvent_t done = nullptr;
@@ -1214,7 +1197,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 idx2cls.BufferPointer(),
                 input.BufferPointer(),
                 input.GetNumRows(),
-                grd.m_blockVal,
+                grd.BufferPointer(),
                 grd.m_blockIds);
             if (do_sync)    CUDACALL(cudaEventRecord(done));
             if (do_sync)    CUDACALL(cudaEventSynchronize(done));
@@ -1232,8 +1215,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             cudaEvent_t done = nullptr;
             if (do_sync)    CUDACALL(cudaEventCreate(&done));
             ElemType * values = NzValues();
-            if (m_format == matrixFormatSparseBlockCol || m_format == matrixFormatSparseBlockRow)
-                values = m_blockVal;
             _inplaceTruncate<<<blocksPerGrid,threadsPerBlock>>>(values,threshold,N);
             if (do_sync)    CUDACALL(cudaEventRecord(done));
             if (do_sync)    CUDACALL(cudaEventSynchronize(done));
@@ -1270,7 +1251,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 GetNumRows(),
                 GetNumCols(),
                 m_blockSize,
-                BufferPointer(),
+                NzValues(),
                 m_blockIds,
                 c.BufferPointer());
 
diff --git a/Math/Math/GPUSparseMatrix.h b/Math/Math/GPUSparseMatrix.h
index d7b6a2269..b68ef9783 100644
--- a/Math/Math/GPUSparseMatrix.h
+++ b/Math/Math/GPUSparseMatrix.h
@@ -49,9 +49,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // in memory format is always in the following order:
         // Non-zero data elements, Full index locations, compressed index locations
         // In CSR row data is compressed, in CSC col data is compressed
-        const ElemType* NzValues() const {return m_pArray;}
-        ElemType* NzValues() {return m_pArray;}
-        size_t NzSize() const {return sizeof(ElemType)*m_nz;} // actual number of element bytes in use
+        inline const ElemType* NzValues() const {return m_pArray;}
+        inline ElemType* NzValues() {return m_pArray;}
+        inline size_t NzSize() const {return sizeof(ElemType)*m_nz;} // actual number of element bytes in use
 
         GPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return (GPUSPARSE_INDEX_TYPE*)(m_pArray + m_elemSizeAllocated); } //this is the major index, row/col ids in CSC/CSR format
         size_t MajorIndexCount() const { return m_nz; }
@@ -82,8 +82,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         size_t BufferSizeNeeded(const size_t numNZ) const
         { return sizeof(ElemType)*numNZ + sizeof(GPUSPARSE_INDEX_TYPE)*(numNZ + SecondaryIndexCount(numNZ)); }
 
-        size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
-        ElemType* BufferPointer() const;
+        inline size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
+        inline ElemType* BufferPointer() const { return m_pArray; }
 
         // the column and row locations will swap based on what format we are in. Full index always follows the data array
         GPUSPARSE_INDEX_TYPE* RowLocation() const { return (m_format&matrixFormatRowMajor) ? SecondaryIndexLocation() : MajorIndexLocation(); }
@@ -125,7 +125,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         bool IsEqualTo(const GPUMatrix<ElemType>& a, const ElemType threshold = 1e-8) const;
     public:
         virtual DEVICEID_TYPE GetComputeDeviceId(void) const;
-        size_t GetNumNZElements() const {return m_nz;}
+        inline size_t GetNumNZElements() const {return m_nz;}
 
         //Sets sparse matrix in CSR format. this acts as deep copy
         void SetMatrixFromCSRFormat(const GPUSPARSE_INDEX_TYPE *h_CSRRow, const GPUSPARSE_INDEX_TYPE *h_Col, const ElemType *h_Val,
@@ -249,7 +249,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         size_t m_totalBufferSizeAllocated;
 
         size_t m_blockSize; //block size
-        ElemType *m_blockVal; //block values
        size_t *m_blockIds; //block ids
        size_t *m_rowToId; //the id showing the order row number is observed in the nnz values.
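
The net effect of this change: block-sparse matrices no longer keep their values in a separate m_blockVal buffer. Every format stores its non-zero values in m_pArray, so BufferPointer() and NzValues() reduce to trivial inline accessors, and call sites that previously branched on the format (for example the m_blockVal fallback in InplaceTruncate) can use NzValues() unconditionally. A minimal sketch of the resulting accessor pattern follows; ToySparseMatrix is an illustrative stand-in for GPUSparseMatrix<ElemType>, not the real interface.

    // Sketch only: one value buffer backs CSR/CSC and the block-sparse
    // formats alike, so the accessors no longer branch on the format.
    #include <cstddef>

    template <class ElemType>
    class ToySparseMatrix            // illustrative stand-in, not the real class
    {
    public:
        inline ElemType*       BufferPointer() const { return m_pArray; }              // raw value buffer
        inline const ElemType* NzValues() const      { return m_pArray; }              // same buffer for all formats
        inline ElemType*       NzValues()            { return m_pArray; }
        inline size_t          NzSize() const        { return sizeof(ElemType)*m_nz; } // bytes of values in use

    private:
        ElemType* m_pArray   = nullptr; // non-zero values (all sparse formats)
        size_t    m_nz       = 0;       // number of stored non-zeros
        size_t*   m_blockIds = nullptr; // block ids for block formats (unchanged here)
    };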