Merge branch 'master' of https://git.codeplex.com/cntk into fseide/reshaping

This commit is contained in:
Commit 5f7e6cfbea

@@ -22,6 +22,7 @@ printValues=true
 devtest=[action=devtest]
 
 train=[
     modelPath=$ExpFolder$\modelRnnCNTK
     action=train
     minibatchSize=10
     traceLevel=1

@@ -67,7 +68,7 @@ numMBsToShowResult=2000
     # gradUpdateType=AdaGrad
     gradUpdateType=None
 
     modelPath=$ExpFolder$\modelRnnCNTK
 
     loadBestModel=true
 
     # settings for Auto Adjust Learning Rate

@@ -1102,7 +1102,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         pLeft->FunctionValues() = redU;
         pRight->FunctionValues() = redVT;
 
-        shared_ptr<ComputationNode<ElemType>> pTimes = AddNodeToNetAndAttachInputs(New<TimesNode<ElemType>>(m_deviceId, name + L"-SVD", true /*createOutputMatrix*/), pLeft, pRight);
+        shared_ptr<ComputationNode<ElemType>> pTimes = AddNodeToNetAndAttachInputs(New<TimesNode<ElemType>>(m_deviceId, name + L"-SVD"), pLeft, pRight);
 
         //========================================
         // Step 3. remove old node

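The hunk above rejoins the two SVD factors with a plain TimesNode: redU and redVT come from the truncated decomposition of the original weight matrix. A self-contained sketch of why the rewrite pays off — a dense m x n matrix W is approximated by pLeft (m x k) times pRight (k x n), cutting the parameter count from m*n to k*(m + n). The concrete sizes are made up for illustration, not taken from this commit:

#include <cstddef>
#include <cstdio>

int main()
{
    const std::size_t m = 2048, n = 2048, k = 256;
    std::printf("dense W      : %zu parameters\n", m * n);       // 4,194,304
    std::printf("pLeft*pRight : %zu parameters\n", k * (m + n)); // 1,048,576
    return 0;
}
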
@@ -899,7 +899,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (m_gradientValues != nullptr && m_gradientValues->GetMatrixType() != SPARSE) //since we don't have a sparse pool yet
                 ReleaseMatrixToPool(m_gradientValues, matrixPool);
 
-            ReleaseMatrixToPool(m_functionValues, matrixPool);
+            if (m_functionValues->GetMatrixType() != SPARSE)
+                ReleaseMatrixToPool(m_functionValues, matrixPool);
         }
     }
 
     virtual void DumpNodeInfo(const bool /*printValues*/, File& fstream) const;

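The guard added above is the whole pooling contract for node buffers: dense matrices go back to the pool for reuse, sparse ones do not, because only dense buffers are pooled. A self-contained sketch of the idiom with stand-in types (not CNTK's real Matrix/MatrixPool):

#include <memory>

enum class MatType { DENSE, SPARSE };

struct MiniMatrix { MatType type = MatType::DENSE; };

struct MiniPool
{
    // Stand-in for pooling: take ownership of the released buffer.
    void Release(std::shared_ptr<MiniMatrix>& m) { m.reset(); }
};

// Release a matrix back to the pool only if it exists and is dense;
// sparse buffers stay put because there is no sparse pool.
void ReleaseIfDense(std::shared_ptr<MiniMatrix>& mat, MiniPool& pool)
{
    if (mat != nullptr && mat->type != MatType::SPARSE)
        pool.Release(mat);
}
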
@@ -950,6 +951,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         auto f_numCols = m_functionValues->GetNumCols();
         if (f_numRows != m_numRows || f_numCols != m_numCols)
             LogicError("UpdateFunctionMBSize: m_functionValues out of sync with m_numRows/m_numCols");
+
+#ifdef SHOW_MATRIX_TYPE
+        fprintf(stderr, "MatrixType %ls: %ls(%ls %ls)\n",
+                NodeName().c_str(),
+                OperationName().c_str(),
+                FunctionValues().GetMatrixType() == MatrixType::DENSE ? L"Dense" : L"Sparse",
+                FunctionValues().GetCurrentMatrixLocation() == GPU ? L"GPU" :
+                FunctionValues().GetCurrentMatrixLocation() == CPU ? L"CPU" : L"BOTH");
+#endif
     }
 
     void ValidateInferChildDims(size_t i, size_t rows, size_t cols) override final;

@@ -1047,9 +1057,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         const Matrix<ElemType>& FunctionValues() const { return *m_functionValues; }
         Matrix<ElemType>& FunctionValues() { return *m_functionValues; }
+        shared_ptr<Matrix<ElemType>>& FunctionValuesPtr() { return m_functionValues; }
 
         const Matrix<ElemType>& GradientValues() const { return *m_gradientValues; }
         Matrix<ElemType>& GradientValues() { return *m_gradientValues; }
+        shared_ptr<Matrix<ElemType>>& GradientValuesPtr() { return m_gradientValues; }
 
         // function to access any input and output, value and gradient, whole batch or single frame
         // Note: This returns a reference into 'data' in the form of a column slice, i.e. a small matrix object that just points into 'data'.

@@ -1291,6 +1303,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             matrixPtr = make_shared<Matrix<ElemType>>(m_deviceId);
         }
     }
 
     //to be called by derived classes if that class needs to print node values
     void PrintNodeValuesToFile(const bool printValues, File& fstream) const
     {

@@ -1479,7 +1492,7 @@ protected: \
     using Base::SetDims; /*using Base::NotifyFunctionValuesMBSizeModified;*/ using Base::GetNumRows; using Base::GetNumCols; using Base::UpdateFunctionValuesSize; using Base::LoadFunctionValues; \
     using Base::m_pMBLayout; using Base::GetNumTimeSteps; using Base::GetNumParallelSequences; \
     using Base::MaskMissingColumnsToZero; using Base::MaskMissingValuesColumnsToZero; using Base::MaskMissingGradientColumnsToZero; using Base::InvalidateMissingValuesColumns; using Base::InvalidateMissingGradientColumns; \
-    using Base::DataSlice; using Base::ValueSlice; using Base::GradientValues; using Base::GradientSlice; using Base::MaskedValueSlice; using Base::MaskedGradientSlice; \
+    using Base::DataSlice; using Base::ValueSlice; using Base::GradientValues; using Base::GradientValuesPtr; using Base::GradientSlice; using Base::MaskedValueSlice; using Base::MaskedGradientSlice; \
     using Base::EvaluateThisNode; using Base::ComputeInputPartial; \
     using Base::m_children; using Base::m_deviceId; using Base::m_functionValues; using Base::m_gradientValues; \
     using Base::m_inputImageLayout; using Base::m_imageLayout; \

@@ -1503,7 +1516,7 @@ protected: \
 public: \
     using Base::RequiresPreCompute; \
     using Base::AttachInputs; using Base::NodeName; \
-    using Base::FunctionValues
+    using Base::FunctionValues; using Base::FunctionValuesPtr
 
 #define ComputationNodeBoilerplate \
 protected: /* some boilerplate goes here */ \

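These macros re-introduce every inherited member by name because of C++ two-phase lookup: inside a class template, unqualified names from a dependent base class are not found, so each one must be pulled in with a using-declaration (which is why adding FunctionValuesPtr/GradientValuesPtr to the base also means adding them here). A minimal illustration:

template <class T>
struct Base { void FunctionValues() {} };

template <class T>
struct Node : Base<T>
{
    using Base<T>::FunctionValues; // without this, the call below fails to compile
    void Run() { FunctionValues(); }
};
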
@@ -392,18 +392,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         static const std::wstring TypeName() { return L"Times"; }
     public:
 
-        // TODO: The createOutputMatrix parameter here is temporarily added to allow creating the function values
-        // matrix for the times node added during SVD decomposition. Since ValidateSubNetwork is called after addition
-        // of the times node, the validation crashes if the function values matrix has not yet been allocated
-        // This can be removed after the Validation has been fixed to not access the function values matrix at all
         DeclareConstructorFromConfigWithNumInputs(TimesNode);
-        TimesNode(DEVICEID_TYPE deviceId, const wstring & name, bool createOutputMatrix = false) :
+        TimesNode(DEVICEID_TYPE deviceId, const wstring & name) :
             Base(deviceId, name)
         {
-            if (createOutputMatrix)
-            {
-                CreateMatrixIfNull(m_functionValues);
-            }
         }
 
         virtual void /*ComputationNode::*/ComputeInputPartial(const size_t inputIndex, const FrameRange & frameRange) override

@@ -488,6 +480,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             //after multiplication the structure is lost
             m_imageLayout = ImageLayoutWHC(1, Inputs(0)->GetNumRows(), 1);
         }
+
+        virtual void AllocateGradientMatricesForChildren(MatrixPool& matrixPool) override
+        {
+            //this is a special handling case. We need to allocate sparse matrix directly instead of from pool.
+            if (m_children[0]->NeedGradient() && Inputs(1)->FunctionValues().GetMatrixType() == SPARSE)
+            {
+                CreateMatrixIfNull(Inputs(0)->GradientValuesPtr());
+                Inputs(0)->GradientValues().SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false);
+            }
+
+            //we need to call base allocation at end since we will need to allocate special ones first
+            //so that the default allocator will not allocate it again.
+            Base::AllocateGradientMatricesForChildren(matrixPool);
+        }
     };
 
     template class TimesNode<float>;

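The ordering comment above is the key invariant: the special-format gradient is created before Base::AllocateGradientMatricesForChildren runs, so the default allocator sees it as already allocated and skips it. A stand-alone sketch of that ordering with hypothetical mini-types (not CNTK's API):

#include <memory>

struct Buf { bool allocated = false; bool sparse = false; };

struct Pool
{
    // Default allocator: only fills in buffers nobody claimed yet.
    void Request(Buf& b) { if (!b.allocated) b.allocated = true; }
};

void AllocateGradients(Buf& denseInputGrad, bool rhsIsSparse, Pool& pool)
{
    // Step 1: claim the special case first. For dense * sparse, the gradient
    // w.r.t. the dense input is itself sparse (block-column), so it must be
    // created directly, never handed out by the dense pool.
    if (rhsIsSparse && !denseInputGrad.allocated)
    {
        denseInputGrad.allocated = true;
        denseInputGrad.sparse = true; // stands in for SwitchToMatrixType(SPARSE, ...)
    }
    // Step 2: only then run the default allocator; it skips anything already
    // allocated, so the sparse choice survives.
    pool.Request(denseInputGrad);
}
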
@@ -30,8 +30,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     void Release(shared_ptr<Matrix<ElemType>> freeMatrix)
     {
         vector<shared_ptr<Matrix<ElemType>>>& releasedMatrices = GetReleasedMatrices<ElemType>();
-        if (freeMatrix == nullptr)
-            RuntimeError("MatrixPool::Release: freeMatrix should not be null.");
+        if (freeMatrix == nullptr || freeMatrix->GetMatrixType() == SPARSE)
+            RuntimeError("MatrixPool::Release: freeMatrix should not be null or sparse.");
 #ifdef _DEBUG
         for (int i = 0; i < releasedMatrices.size(); i++)
         {

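A simplified sketch of the pool contract this check now enforces: Release() parks a dense buffer for reuse and Request() hands one back out, so sparse matrices must never enter. MiniPool is illustrative only, not CNTK's MatrixPool:

#include <memory>
#include <vector>

template <class T>
class MiniPool
{
    std::vector<std::shared_ptr<T>> m_free;
public:
    void Release(std::shared_ptr<T> m) { m_free.push_back(std::move(m)); }

    std::shared_ptr<T> Request()
    {
        if (m_free.empty())
            return std::make_shared<T>(); // pool empty: allocate fresh
        auto m = m_free.back();           // reuse most recently released buffer
        m_free.pop_back();
        return m;
    }
};

If a sparse matrix slipped in, Request() could hand a sparse buffer to a caller expecting dense storage, which is why Release() rejects it up front.
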
@@ -113,9 +113,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     //else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
     {
         m_blockSize = 0;
+        m_blockIdShift = 0;
         m_pArray = NULL;
         m_blockIds = NULL;
     }
+    m_nzValues = NULL;
 }
 
 //should only be used by constructors.

@@ -166,34 +168,109 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         return *this;
     }
 
+    //move constructor, shallow copy
+    template<class ElemType>
+    CPUSparseMatrix<ElemType>::CPUSparseMatrix(CPUSparseMatrix<ElemType>&& moveFrom)
+    {
+        m_format = moveFrom.m_format;
+        m_numRows = moveFrom.m_numRows;
+        m_numCols = moveFrom.m_numCols;
+        m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
+        m_compIndexSize = moveFrom.m_compIndexSize;
+        m_externalBuffer = moveFrom.m_externalBuffer;
+        m_computeDevice = moveFrom.m_computeDevice;
+        m_nz = moveFrom.m_nz;
+        m_matrixName = moveFrom.m_matrixName;
+
+        m_colIdx = moveFrom.m_colIdx;
+        m_pArray = moveFrom.m_pArray;
+        m_nzValues = moveFrom.m_nzValues;
+        m_unCompIndex = moveFrom.m_unCompIndex;
+        m_compIndex = moveFrom.m_compIndex;
+
+        m_blockSize = moveFrom.m_blockSize;
+        m_blockIdShift = moveFrom.m_blockIdShift;
+        m_blockIds = moveFrom.m_blockIds;
+
+        //release the pointer from the source object so that the destructor won't release it twice
+        moveFrom.ZeroInit();
+    }
+
+    //move assignment operator, shallow copy
+    template<class ElemType>
+    CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::operator=(CPUSparseMatrix<ElemType>&& moveFrom)
+    {
+        if (this != &moveFrom)
+        {
+            if (OwnBuffer())
+                ReleaseMemory(); //always delete the data pointer since we will use the pointer from moveFrom
+
+            m_format = moveFrom.m_format;
+            m_numRows = moveFrom.m_numRows;
+            m_numCols = moveFrom.m_numCols;
+            m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
+            m_compIndexSize = moveFrom.m_compIndexSize;
+            m_externalBuffer = moveFrom.m_externalBuffer;
+            m_computeDevice = moveFrom.m_computeDevice;
+            m_nz = moveFrom.m_nz;
+            m_matrixName = moveFrom.m_matrixName;
+
+            m_colIdx = moveFrom.m_colIdx;
+            m_pArray = moveFrom.m_pArray;
+            m_nzValues = moveFrom.m_nzValues;
+            m_unCompIndex = moveFrom.m_unCompIndex;
+            m_compIndex = moveFrom.m_compIndex;
+
+            m_blockSize = moveFrom.m_blockSize;
+            m_blockIdShift = moveFrom.m_blockIdShift;
+            m_blockIds = moveFrom.m_blockIds;
+
+            //release the pointer from the source object so that the destructor won't release it twice
+            moveFrom.ZeroInit();
+        }
+        return *this;
+    }
+
     template<class ElemType>
     CPUSparseMatrix<ElemType>::~CPUSparseMatrix()
     {
-        if (m_matrixName != NULL)
-            delete[] m_matrixName;
-
-        if (m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
-        {
-            delete[] m_pArray;
-            delete[] m_unCompIndex;
-            delete[] m_compIndex;
-        }
-        else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
-        {
-            delete[] m_pArray;
-            delete[] m_blockIds;
-        }
+        ReleaseMemory();
     }
+
+    template<class ElemType>
+    void CPUSparseMatrix<ElemType>::ReleaseMemory()
+    {
+        // If m_externalBuffer is true then this matrix
+        // is simply a view over another matrix. In that
+        // case we shouldn't free anything.
+        if (!m_externalBuffer)
+        {
+            if (m_matrixName != NULL)
+                delete[] m_matrixName;
+
+            if (m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
+            {
+                delete[] m_pArray;
+                m_pArray = nullptr;
+                m_nzValues = nullptr;
+
+                delete[] m_unCompIndex;
+                m_unCompIndex = nullptr;
+
+                delete[] m_compIndex;
+                m_compIndex = nullptr;
+            }
+            else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
+            {
+                delete[] m_pArray;
+                m_pArray = nullptr;
+                m_nzValues = nullptr;
+
+                delete[] m_blockIds;
+                m_blockIds = nullptr;
+            }
+        }
+    }
 
 #pragma endregion Constructors and Destructor
 
 #pragma region Basic Operators

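Both new functions follow the same move idiom: copy the raw pointers shallowly, then ZeroInit() the source so its destructor has nothing left to free. A self-contained illustration of why that ZeroInit() call is load-bearing (the struct is a stand-in, not CPUSparseMatrix):

#include <cstddef>

struct SparseBuf
{
    float* m_pArray = nullptr;
    std::size_t m_nz = 0;

    void ZeroInit() { m_pArray = nullptr; m_nz = 0; }

    SparseBuf() = default;
    SparseBuf(SparseBuf&& moveFrom)
    {
        m_pArray = moveFrom.m_pArray; // shallow copy: take ownership
        m_nz = moveFrom.m_nz;
        moveFrom.ZeroInit();          // source must forget the pointer, or
                                      // ~SparseBuf would delete it twice
    }
    ~SparseBuf() { delete[] m_pArray; }
};

The OwnBuffer()/ReleaseMemory() call in the move assignment serves the mirror purpose: the destination frees its own buffer before adopting the source's, so nothing leaks.
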
@@ -307,15 +384,64 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         if (startColumn + numCols > m_numCols)
             InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int)startColumn, (int)numCols, (int)m_numCols);
 
-        if (m_format != MatrixFormat::matrixFormatSparseCSC)
+        if (m_format != MatrixFormat::matrixFormatSparseCSC && m_format != MatrixFormat::matrixFormatSparseBlockCol)
             NOT_IMPLEMENTED;
 
-        CPUSparseMatrix<ElemType> slice(m_format, m_numRows, numCols, m_elemSizeAllocated);
-        slice.m_pArray = m_pArray;
-        slice.m_unCompIndex = m_unCompIndex;
-        slice.m_compIndex = m_compIndex + startColumn; // Just shift the compressed index location to the new startColumn - that's it!
-        slice.m_externalBuffer = true;
-        slice.m_nz = m_nz;
+        CPUSparseMatrix<ElemType> slice(m_format);
+        slice.m_numRows = m_numRows;
+        slice.m_numCols = numCols;
+
+        if (m_format == MatrixFormat::matrixFormatSparseCSC)
+        {
+            slice.m_pArray = m_pArray;
+            slice.m_nzValues = m_pArray + m_compIndex[startColumn]; //note: m_compIndex is always against m_pArray
+            slice.m_unCompIndex = m_unCompIndex;
+            slice.m_compIndex = m_compIndex + startColumn; // Just shift the compressed index location to the new startColumn - that's it!
+            slice.m_externalBuffer = true;
+            slice.m_nz = m_compIndex[startColumn + numCols] - m_compIndex[startColumn];
+            slice.m_elemSizeAllocated = slice.m_nz;
+            slice.m_compIndexSize = numCols + 1;
+        }
+        else if (m_format == MatrixFormat::matrixFormatSparseBlockCol)
+        {
+            long long startColBlock = 0, endColBlock = 0;
+            bool foundStart = false, foundEnd = false;
+            for (size_t j = 0; j < m_blockSize; j++)
+            {
+                if (j > 0)
+                {
+                    assert(m_blockIds[j] > m_blockIds[j - 1]); //assume ids are increasing. Is this valid?
+                }
+
+                if (!foundStart && (long long)m_blockIds[j] - (long long)m_blockIdShift >= (long long)startColumn) // start column with values
+                {
+                    startColBlock = j;
+                    foundStart = true;
+                }
+                else if ((long long)m_blockIds[j] - (long long)m_blockIdShift >= (long long)(startColumn + numCols)) //end column with values
+                {
+                    endColBlock = j;
+                    foundEnd = true;
+                    break;
+                }
+            }
+            if (!foundStart)
+            {
+                startColBlock = (long long)m_blockSize;
+            }
+            if (!foundEnd)
+            {
+                endColBlock = (long long)m_blockSize;
+            }
+
+            slice.m_pArray = m_pArray + startColBlock * m_numRows;
+            slice.m_nzValues = slice.m_pArray;
+            slice.m_blockIds = m_blockIds + startColBlock; //the value stored in the block id is based on the original column numbers
+            slice.m_blockSize = (size_t)max((long long)0, endColBlock - startColBlock);
+            slice.m_blockIdShift = m_blockIdShift + startColumn;
+            slice.m_externalBuffer = true;
+            slice.m_nz = slice.m_blockSize * m_numRows;
+        }
+
+        return slice;
     }

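A worked example of the CSC branch's arithmetic, with a hypothetical compressed index (not data from this commit). In CSC, column c's nonzeros occupy [compIndex[c], compIndex[c+1]) of the value array, so slicing is pure pointer shifting:

#include <cassert>

int main()
{
    int compIndex[] = { 0, 2, 3, 5, 6 }; // 4 columns, 6 nonzeros total
    int startColumn = 1, numCols = 2;    // view over columns 1..2

    // The slice just points into the parent arrays:
    //   slice.m_compIndex = m_compIndex + startColumn   -> {2, 3, 5}
    //   slice.m_nzValues  = m_pArray + compIndex[1]     -> skips column 0's 2 entries
    int nz = compIndex[startColumn + numCols] - compIndex[startColumn];
    assert(nz == 3); // columns 1 and 2 hold 1 + 2 nonzeros
    return 0;
}

Note the slice sets m_externalBuffer = true: it is a view, so ReleaseMemory() above will deliberately free nothing when the slice is destroyed.
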
@@ -439,22 +565,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         if (keepExistingValues && m_nz > 0)
         {
             assert(m_compIndexSize > 0 && m_nz < numNZElemToReserve);
-            memcpy(pArray, m_pArray, NzSize());
+            memcpy(pArray, m_nzValues, NzSize());
             memcpy(unCompIndex, m_unCompIndex, MajorIndexSize());
             memcpy(compIndex, m_compIndex, SecondaryIndexSize());
         }
 
-        if (m_pArray != NULL)
-            delete [] m_pArray;
-        if (m_unCompIndex != NULL)
-            delete [] m_unCompIndex;
-        if (m_compIndex != NULL)
-            delete [] m_compIndex;
-        m_pArray = NULL;
-        m_unCompIndex = NULL;
-        m_compIndex = NULL;
+        delete [] m_pArray;
+        delete [] m_unCompIndex;
+        delete [] m_compIndex;
 
         m_pArray = pArray;
+        m_nzValues = m_pArray;
         m_unCompIndex = unCompIndex;
         m_compIndex = compIndex;
     }

@@ -469,18 +590,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         if (keepExistingValues && m_elemSizeAllocated > 0)
        {
             assert(m_compIndexSize > 0 && m_elemSizeAllocated < numNZElemToReserve);
-            memcpy(blockVal, m_pArray, NzSize());
+            memcpy(blockVal, m_nzValues, NzSize());
             memcpy(blockIds, m_blockIds, sizeof(size_t)*m_compIndexSize);
         }
 
-        if (m_pArray != NULL)
-            delete[] m_pArray;
-        if (m_blockIds != NULL)
-            delete[] m_blockIds;
-        m_pArray = NULL;
-        m_blockIds = NULL;
+        delete[] m_pArray;
+        delete[] m_blockIds;
 
         m_pArray = blockVal;
+        m_nzValues = m_pArray;
         m_blockIds = blockIds;
     }

@@ -496,6 +614,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         m_nz = 0;
         m_colIdx = -1;
         m_blockSize = 0;
+        m_blockIdShift = 0;
     }
 
     //c = alpha*op(lhs) * op(rhs) + beta*c

@@ -712,7 +831,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     {
         for (size_t j = 0; j < lhs.m_blockSize; j++)
         {
-            size_t i = lhs.m_blockIds[j];
+            size_t i = lhs.m_blockIds[j] - lhs.m_blockIdShift;
             size_t len = (lhs.m_format == MatrixFormat::matrixFormatSparseBlockCol) ? lhs.GetNumRows() : lhs.GetNumCols();
             size_t start = j * len;
             for (size_t p = start; p < start+len; p++)

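m_blockIds stores original column numbers, so after a slice, every consumer must recover the local index by subtracting m_blockIdShift — which is exactly the one-line change in this hunk and the two below it. A tiny worked example with made-up ids:

#include <cassert>
#include <cstddef>

int main()
{
    std::size_t blockIds[] = { 6, 8, 9 }; // original columns holding values
    std::size_t blockIdShift = 5;         // slice starts at original column 5

    std::size_t localCol0 = blockIds[0] - blockIdShift;
    assert(localCol0 == 1);               // original column 6 is column 1 inside the slice
    return 0;
}
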
@@ -771,7 +890,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     {
         for (size_t j = 0; j < m_blockSize; j++)
         {
-            size_t i = m_blockIds[j];
+            size_t i = m_blockIds[j] - m_blockIdShift;
             size_t len = (m_format == MatrixFormat::matrixFormatSparseBlockCol) ? GetNumRows() : GetNumCols();
             size_t start = j * len;
             for (size_t p = start; p < start+len; p++)

@@ -834,7 +953,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         size_t p = 0;
         for (long j = 0; j < m_blockSize; j++)
         {
-            size_t colOrRow = m_blockIds[j];
+            size_t colOrRow = m_blockIds[j] - m_blockIdShift;
             for (long i = 0; i < len; i++, p++)
             {
                 ElemType val = m_pArray[p];

@@ -1063,18 +1182,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         if (sizeof(ElemType) == sizeof(double))
         {
 #ifndef USE_MKL
-            return (ElemType)dasum((int)this->NzCount(), reinterpret_cast<double*>(m_pArray), 1);
+            return (ElemType)dasum((int)this->NzCount(), reinterpret_cast<double*>(m_nzValues), 1);
 #else
-            return (ElemType)cblas_dasum((int)this->NzCount(), reinterpret_cast<double*>(m_pArray), 1);
+            return (ElemType)cblas_dasum((int)this->NzCount(), reinterpret_cast<double*>(m_nzValues), 1);
 #endif
         }
         else
         {
 #pragma warning (suppress: 4244)
 #ifndef USE_MKL
-            return sasum((int)this->NzCount(), reinterpret_cast<float*>(m_pArray), 1);
+            return sasum((int)this->NzCount(), reinterpret_cast<float*>(m_nzValues), 1);
 #else
-            return cblas_sasum((int)this->NzCount(), reinterpret_cast<float*>(m_pArray), 1);
+            return cblas_sasum((int)this->NzCount(), reinterpret_cast<float*>(m_nzValues), 1);
 #endif
         }
     }

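The switch from m_pArray to m_nzValues matters precisely for column slices, where the nonzeros start partway into the shared buffer and NzCount() covers only the slice's own window. What the asum calls compute, spelled out without BLAS:

#include <cmath>
#include <cstddef>

// Equivalent of cblas_dasum(n, x, 1) over the slice's nonzero window.
double SumOfAbsSketch(const double* nzValues, std::size_t nzCount)
{
    double sum = 0.0;
    for (std::size_t i = 0; i < nzCount; i++)
        sum += std::fabs(nzValues[i]);
    return sum;
}

Passing m_pArray here would sum the parent's leading nonzeros instead of the slice's, silently returning the wrong norm.
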
@@ -1217,6 +1336,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     template CPUSparseMatrix<char>::CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size);
     template CPUSparseMatrix<char>::CPUSparseMatrix(MatrixFormat);
     template CPUSparseMatrix<char>::CPUSparseMatrix(CPUSparseMatrix<char> const &);
+    template CPUSparseMatrix<char>::CPUSparseMatrix(CPUSparseMatrix<char> &&);
+    template CPUSparseMatrix<char>& CPUSparseMatrix<char>::operator=(CPUSparseMatrix<char>&& moveFrom);
     template void CPUSparseMatrix<char>::SetValue(size_t, size_t, char);
     template char* CPUSparseMatrix<char>::BufferPointer() const;
     template void CPUSparseMatrix<char>::Reset(void);

@@ -33,14 +33,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     private:
         void ZeroInit();
         void CheckInit(const MatrixFormat format);
+        void ReleaseMemory();
 
     public:
         CPUSparseMatrix(const MatrixFormat format);
         CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size);
         CPUSparseMatrix(const CPUSparseMatrix<ElemType>& deepCopyFrom);                      //copy constructor, deep copy
         CPUSparseMatrix<ElemType>& operator=(const CPUSparseMatrix<ElemType>& deepCopyFrom); //assignment operator, deep copy
 
         CPUSparseMatrix(CPUSparseMatrix<ElemType>&& moveFrom);                               //move constructor, shallow copy
         CPUSparseMatrix<ElemType>& operator=(CPUSparseMatrix<ElemType>&& moveFrom);          //move assignment operator, shallow copy
         ~CPUSparseMatrix();
 
     public:

@@ -137,8 +138,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
     public:
-        const ElemType* NzValues() const { return m_pArray; }
-        inline ElemType* NzValues() { return m_pArray; }
+        const ElemType* NzValues() const { return m_nzValues; }
+        inline ElemType* NzValues() { return m_nzValues; }
         size_t NzSize() const { return sizeof(ElemType)*m_nz; } // actual number of element bytes in use
 
         CPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return m_unCompIndex; } //this is the major index, row/col ids in CSC/CSR format

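A sketch of the pointer relationship behind the NzValues() change, using a hypothetical buffer: a full matrix has m_nzValues == m_pArray, but a slice shares m_pArray and exposes only its own window through m_nzValues:

#include <cassert>
#include <cstddef>

int main()
{
    float buffer[6] = { 1, 2, 3, 4, 5, 6 };  // all nonzeros, 4 columns' worth
    float* m_pArray = buffer;                // full shared storage
    float* m_nzValues = buffer + 2;          // slice starts after 2 entries
    std::size_t m_nz = 3;                    // slice holds 3 nonzeros

    assert(m_nzValues[0] == 3.0f);           // first nonzero of the slice
    assert(m_nzValues + m_nz <= m_pArray + 6);
    return 0;
}
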
@@ -169,6 +170,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     private:
         int m_colIdx; //used to SetValue()
         size_t m_compIndexSize;
+        ElemType* m_nzValues;
 
         //non-zero values are stored in m_pArray
         CPUSPARSE_INDEX_TYPE *m_unCompIndex; //row/col ids in CSC/CSR format

@@ -176,6 +178,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         size_t m_blockSize; //block size
         size_t *m_blockIds; //block ids
+        size_t m_blockIdShift; //used to get efficient slice, actual col = blockIds[j] - m_blockIdShift
     };
 
     typedef CPUSparseMatrix<float> CPUSingleSparseMatrix;

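Taken together, these members describe the sparse block-column layout the commit keeps threading m_blockIdShift through. A simplified summary struct, for orientation only (assumed layout, not the real class):

#include <cstddef>

// Each nonempty column is stored as one dense block of numRows values;
// m_blockIds maps block j back to its (original) column number.
struct BlockColLayoutSketch
{
    float*       m_pArray;       // m_blockSize dense blocks, each numRows long
    std::size_t* m_blockIds;     // original column number of block j
    std::size_t  m_blockSize;    // number of stored (nonempty) columns
    std::size_t  m_blockIdShift; // slice offset: local col = m_blockIds[j] - m_blockIdShift

    // Pointer to the dense data of block j.
    float* BlockData(std::size_t j, std::size_t numRows) { return m_pArray + j * numRows; }
};
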
@@ -893,8 +893,8 @@ already there from last epoch
 Starting minibatch loop.
 randomordering: 21 retries for 100 elements (21.0%) to ensure window condition
 randomordering: recached sequence for seed 11: 6, 31, ...
-Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 0.37213734; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.65604s; TotalTimePerSample = 6.56038ms; SamplesPerSecond = 152
-Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.37213734; EvalErrPerSample = 0; Ave LearnRatePerSample = 0.004999999888; EpochTime=0.656382
+Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 0.37077690; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.65604s; TotalTimePerSample = 6.56038ms; SamplesPerSecond = 152
+Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.37077689; EvalErrPerSample = 0; Ave LearnRatePerSample = 0.004999999888; EpochTime=0.656382
 CNTKCommandTrainEnd: Train

@@ -2269,8 +2269,8 @@ reading from record 0 to 100 to be positioned properly for epoch
 Starting minibatch loop.
 randomordering: 21 retries for 100 elements (21.0%) to ensure window condition
 randomordering: recached sequence for seed 11: 6, 31, ...
-Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 0.37792297; EvalErr[0]PerSample = 0.00000000; TotalTime = 1.34518s; TotalTimePerSample = 13.45185ms; SamplesPerSecond = 74
-Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.37792295; EvalErrPerSample = 0; Ave LearnRatePerSample = 0.004999999888; EpochTime=1.371377
+Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 0.37650299; EvalErr[0]PerSample = 0.00000000; TotalTime = 1.34518s; TotalTimePerSample = 13.45185ms; SamplesPerSecond = 74
+Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.37650299; EvalErrPerSample = 0; Ave LearnRatePerSample = 0.004999999888; EpochTime=1.371377
 CNTKCommandTrainEnd: Train

@@ -864,8 +864,8 @@ already there from last epoch
 Starting minibatch loop.
 randomordering: 21 retries for 100 elements (21.0%) to ensure window condition
 randomordering: recached sequence for seed 11: 6, 31, ...
-Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 0.37213734; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.08724s; TotalTimePerSample = 0.87241ms; SamplesPerSecond = 1146
-Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.37213734; EvalErrPerSample = 0; Ave LearnRatePerSample = 0.004999999888; EpochTime=0.087336
+Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 0.37077690; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.08724s; TotalTimePerSample = 0.87241ms; SamplesPerSecond = 1146
+Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.37077689; EvalErrPerSample = 0; Ave LearnRatePerSample = 0.004999999888; EpochTime=0.087336
 CNTKCommandTrainEnd: Train

@@ -2182,8 +2182,8 @@ reading from record 0 to 100 to be positioned properly for epoch
 Starting minibatch loop.
 randomordering: 21 retries for 100 elements (21.0%) to ensure window condition
 randomordering: recached sequence for seed 11: 6, 31, ...
-Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 0.37792297; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.89367s; TotalTimePerSample = 8.93670ms; SamplesPerSecond = 111
-Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.37792295; EvalErrPerSample = 0; Ave LearnRatePerSample = 0.004999999888; EpochTime=0.908817
+Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 0.37650299; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.89367s; TotalTimePerSample = 8.93670ms; SamplesPerSecond = 111
+Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.37650299; EvalErrPerSample = 0; Ave LearnRatePerSample = 0.004999999888; EpochTime=0.908817
 CNTKCommandTrainEnd: Train

@@ -5,6 +5,7 @@ deviceId=$DeviceId$
 ndlMacros=$ConfigDir$/Macros.ndl
 
 parallelTrain=false
+NumCPUThreads=8
 
 Train=[
     action=train

@@ -1,9 +1,9 @@
 dataDir: ../Data
 tags:
   # running on every BVT job in 'I' (Image) leg:
-  - bvt-i os=='windows' or device=='gpu'
+  - bvt-i device=='gpu'
   # running every Nightly job in 'I' leg
-  - nightly-i os=='windows' or device=='gpu'
+  - nightly-i device=='gpu'
 
 testCases:
   CNTK Run must be completed: