U-FAREAST\fseide 2016-04-29 22:08:16 -07:00
Parents 013733a41d 73a8cd8eed
Commit 2d15aa116f
21 changed files with 273 additions and 92 deletions

View file

@ -524,7 +524,7 @@ Parameters =
RNNs =
[
# LSTMP -- LSTM function with projection and self-stabilization
# Projection it enabled by passing different values for outputDim and cellDim.
# Projection is enabled by passing different values for outputDim and cellDim.
# This is the stateless version that takes the previous state as an input.
# It returns a dictionary with three members: h, c, and dim=h.dim (for convenience). prevState must have h and c.
LSTMP (outputDim, cellDim=outputDim, x, inputDim=x.dim, prevState, enableSelfStabilization=false) =

View file

@ -938,7 +938,7 @@ public:
if (m_value)
{
node->CreateValueMatrixIfNull();
node->m_value->SetValue(*m_value);
node->m_value->SetValue(*m_value);
}
else
node->m_value = nullptr;
@ -1549,6 +1549,7 @@ public:
void Trace()
{
//DebugLogMinibatch();
#if 0
static const std::set<std::wstring> toLog{
L"labelSentenceStartEmbedded",

View file

@ -320,6 +320,7 @@ public:
{
// currently we only support one combination when the input is sparse
// If input data is sparse, then gradient is block sparse.
// BUGBUG: This does not accumulate into the Input(0)->Gradient, which might cause problems elsewhere.
if (Input(1)->Value().GetMatrixType() == SPARSE && Input(0)->Gradient().GetMatrixType() == DENSE && Gradient().GetMatrixType() == DENSE)
Input(0)->Gradient().SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false);
auto input0Gradient = OneSampleTensorFor(0, /*gradient=*/true, fr.AllowBroadcast());
@ -433,9 +434,11 @@ public:
std::swap(dimsA[0], dimsA[1]);
// update if LearnableParameter
Input(0)->ValidateInferInputDimsFrom(TensorShape(dimsA));
#if 0 // Removed this, because the check is just wrong.
// and verify once again
if (isFinalValidationPass && Input(0)->GetSampleLayout().GetDims() != dimsA)
InvalidArgument("%ls %ls operation: Left [%s] and right [%s] operands' shapes are not compatible.", NodeName().c_str(), OperationName().c_str(), dimsAstring.c_str(), dimsBstring.c_str());
#endif
}
}
@ -575,7 +578,7 @@ public:
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
sliceOutputValue.SetValue(sliceInput1Value);
sliceOutputValue.AssignValuesOf(sliceInput1Value);
sliceOutputValue.ColumnElementMultiplyWith(Input(0)->ValueAsMatrix());
}

View file

@ -321,7 +321,7 @@ public:
inp = Input(0)->ValueFor(frDelayed.Sequence(id));
// inp = Input(0)->ValueFor(FrameRange(m_pMBLayout, t_delayed).Sequence(id));
out.SetValue(inp);
out.AssignValuesOf(inp);
}
}
}
@ -358,7 +358,7 @@ public:
inp = Input(0)->ValueFor(frDelayed);
// inp = Input(0)->ValueFor(FrameRange(m_pMBLayout, t_delayed));
out.SetValue(inp);
out.AssignValuesOf(inp);
}
}

View file

@ -150,12 +150,12 @@ public:
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
ValueFor(fr).SetValue(Input(0)->ValueFor(fr));
ValueFor(fr).AssignValuesOf(Input(0)->ValueFor(fr));
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
Input(inputIndex)->GradientFor(fr).SetValue(GradientFor(fr));
Input(inputIndex)->GradientFor(fr).AssignValuesOf(GradientFor(fr));
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
@ -255,7 +255,7 @@ public:
Input(1)->NodeName().c_str(), Input(1)->OperationName().c_str());
// copy the data from 'dataInput'
ValueFor(fr).SetValue(Input(0)->ValueFor(fr.WithLayout(Input(0)->GetMBLayout()))); // just propagate through
ValueFor(fr).AssignValuesOf(Input(0)->ValueFor(fr.WithLayout(Input(0)->GetMBLayout()))); // just propagate through
// TODO: Once we do in-place, the above must include a copy-to-self check (either here or inside the matrix lib).
}
@ -1080,7 +1080,7 @@ public:
// (We still need to copy the values since there is currently no way to point to an input function value while reshaping at the same time.)
if (!m_pMBLayout || factor() == 1)
{
Value().Reshaped(newCols * m_numTargetRows, 1).SetValue(Input(0)->Value().Reshaped(cols * rows, 1)); // copy the values as one long vector
Value().Reshaped(newCols * m_numTargetRows, 1).AssignValuesOf(Input(0)->Value().Reshaped(cols * rows, 1)); // copy the values as one long vector
}
// layout case: reshape semantics happen across parallel sequences, i.e. they require data shuffling
else

View file

@ -177,6 +177,7 @@ public:
// first compute the softmax (column-wise)
// Note that we need both log and non-log for gradient computation.
m_logSoftmaxOfRight->AssignLogSoftmaxOf(Input(1)->ValueFor(fr), true);
// BUGBUG: No need to compute m_softmaxOfRight in ForwardProp, should be moved to BackpropTo().
m_softmaxOfRight->SetValue(*m_logSoftmaxOfRight);
m_softmaxOfRight->InplaceExp();
// flatten all gaps to zero, such that gaps will contribute zero to the sum
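The pattern above computes a numerically stable log-softmax once and then derives the softmax by exponentiating it. A self-contained toy sketch of that derivation (single column, hypothetical values; not CNTK code):

#include <cmath>
#include <cstdio>

// Toy single-column version of the AssignLogSoftmaxOf + InplaceExp pattern:
// compute log-softmax with the max-shift trick, then exponentiate in place.
int main()
{
    double v[3] = { 1.0, 2.0, 3.0 };
    double mx = v[0];
    for (double x : v)
        mx = x > mx ? x : mx;               // shift by the max for stability
    double sum = 0.0;
    for (double x : v)
        sum += std::exp(x - mx);
    double logZ = mx + std::log(sum);       // log of the partition function
    for (double& x : v)
        x = std::exp(x - logZ);             // softmax = exp(log-softmax)
    printf("%f %f %f\n", v[0], v[1], v[2]); // 0.090031 0.244728 0.665241
    return 0;
}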
@ -780,7 +781,7 @@ private:
case 3:
{
Matrix<ElemType> grd_t = Input(CLASSPROBINDATA)->GradientFor(fr);
grd_t.SetValue(Input(CLASSPROBINDATA)->DataFor(m_clsSoftmax, fr));
grd_t.AssignValuesOf(Input(CLASSPROBINDATA)->DataFor(m_clsSoftmax, fr));
ComputeCEPartialToSoftmaxInputs(grd_t, Gradient(), c_t);
break;
}
@ -811,7 +812,7 @@ private:
size_t idx_in_class = y_t - lft_bnd;
ComputeCEPartialToSoftmaxInputs(softMax, Gradient(), idx_in_class);
m_grdToSoftMaxInput.ColumnSlice(sz, nbr_wrd).SetValue(softMax);
m_grdToSoftMaxInput.ColumnSlice(sz, nbr_wrd).AssignValuesOf(softMax);
});
m_needRecomputeGradientToSoftmaxInput = false;
@ -1768,7 +1769,7 @@ public:
sliceOutputValue, m_epsilon, *m_saveMean, *m_saveInvStdDev);
m_mbCount++;
}
}
void Validate(bool isFinalValidationPass) override
{

View file

@ -852,6 +852,24 @@ void CPUMatrix<ElemType>::SetValue(const CPUMatrix<ElemType>& deepCopyFrom)
SetValue(deepCopyFrom.GetNumRows(), deepCopyFrom.GetNumCols(), deepCopyFrom.Data(), 0);
}
template <class ElemType>
void CPUMatrix<ElemType>::SetValue(const GPUMatrix<ElemType>& /*deepCopyFrom*/)
{
NOT_IMPLEMENTED;
}
template <class ElemType>
void CPUMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& deepCopyFrom)
{
deepCopyFrom.AssignColumnSliceToDense(*this, 0, deepCopyFrom.GetNumCols());
}
template <class ElemType>
void CPUMatrix<ElemType>::SetValue(const GPUSparseMatrix<ElemType>& /*deepCopyFrom*/)
{
NOT_IMPLEMENTED;
}
template <class ElemType>
void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, ElemType* pArray, const size_t matrixFlags)
{
@ -6290,6 +6308,9 @@ template CPUMatrix<char>& CPUMatrix<char>::operator=(CPUMatrix<char>&&);
template void CPUMatrix<char>::SetValue(const char);
template void CPUMatrix<char>::SetValue(const size_t numRows, const size_t numCols, char* pArray, size_t matrixFlags);
template void CPUMatrix<char>::SetValue(CPUMatrix<char> const&);
template void CPUMatrix<char>::SetValue(GPUMatrix<char> const&);
template void CPUMatrix<char>::SetValue(CPUSparseMatrix<char> const&);
template void CPUMatrix<char>::SetValue(GPUSparseMatrix<char> const&);
template void CPUMatrix<char>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly);
template void CPUMatrix<char>::Resize(const size_t numRows, const size_t numCols, bool growOnly);

View file

@ -13,6 +13,10 @@
#include <ctime>
#include <limits.h>
#include "GPUMatrix.h"
#include "CPUSparseMatrix.h"
#include "GPUSparseMatrix.h"
// NOTE NOTE NOTE:
// use CPUSingleMatrix and CPUDoubleMatrix instead of using the template directly
///////////////////////////////////////////////
@ -127,6 +131,9 @@ public:
void SetValue(const ElemType v);
void SetValue(const CPUMatrix<ElemType>& deepCopyFrom);
void SetValue(const GPUMatrix<ElemType>& deepCopyFrom);
void SetValue(const CPUSparseMatrix<ElemType>& deepCopyFrom);
void SetValue(const GPUSparseMatrix<ElemType>& deepCopyFrom);
void SetValue(const size_t numRows, const size_t numCols, ElemType* pArray, size_t matrixFlags = matrixFlagNormal);
void MaskColumnsValue(const CPUMatrix<char>& columnsMask, ElemType val);

View file

@ -264,6 +264,32 @@ void CPUSparseMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& v)
memcpy(RowLocation(), v.RowLocation(), v.RowSize());
memcpy(ColLocation(), v.ColLocation(), v.ColSize());
}
if (v.m_sliceViewOffset > 0)
{
CPUSPARSE_INDEX_TYPE* loc = (GetFormat() == matrixFormatSparseCSC) ? ColLocation() : RowLocation();
size_t len = (GetFormat() == matrixFormatSparseCSC) ? ColSize() : RowSize();
CPUSPARSE_INDEX_TYPE offset = loc[0];
for (size_t c = 0; c < len; c++)
loc[c] -= offset;
}
}
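The loop above rebases the slice view's compressed index array: when the source is a slice, its column (or row) pointers still hold absolute offsets into the parent matrix's nonzero buffer, so subtracting the first entry renormalizes them to start at zero. A standalone toy sketch of just that step (hypothetical CSC column pointers; not CNTK code):

#include <cstdio>

int main()
{
    // Column pointers of a 3-column CSC slice, still carrying absolute
    // offsets into the parent's nonzero buffer.
    int colPtr[] = { 5, 7, 10, 12 };
    const size_t len = sizeof(colPtr) / sizeof(colPtr[0]);
    int offset = colPtr[0];          // first nonzero of the slice
    for (size_t c = 0; c < len; c++)
        colPtr[c] -= offset;         // rebase so the slice starts at 0
    for (size_t c = 0; c < len; c++)
        printf("%d ", colPtr[c]);    // prints: 0 2 5 7
    return 0;
}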
template <class ElemType>
void CPUSparseMatrix<ElemType>::SetValue(const CPUMatrix<ElemType>& /*v*/)
{
NOT_IMPLEMENTED;
}
template <class ElemType>
void CPUSparseMatrix<ElemType>::SetValue(const GPUMatrix<ElemType>& /*v*/)
{
NOT_IMPLEMENTED;
}
template <class ElemType>
void CPUSparseMatrix<ElemType>::SetValue(const GPUSparseMatrix<ElemType>& /*v*/)
{
NOT_IMPLEMENTED;
}
template <class ElemType>
@ -403,6 +429,7 @@ void CPUSparseMatrix<ElemType>::AssignColumnSliceToDense(CPUMatrix<ElemType>& sl
// We can either error out or RequireSize. Because RequireSize will error out if it's not allowed, I think this makes more sense.
slice.RequireSize(m_numRows, numCols);
memset(slice.Data(), 0, sizeof(ElemType) * slice.GetNumElements());
#pragma omp parallel for
for (long j = 0; j < numCols; j++)
{
@ -1348,7 +1375,10 @@ template CPUSparseMatrix<char>::CPUSparseMatrix(CPUSparseMatrix<char> const&);
template CPUSparseMatrix<char>::CPUSparseMatrix(CPUSparseMatrix<char>&&);
template CPUSparseMatrix<char>& CPUSparseMatrix<char>::operator=(CPUSparseMatrix<char>&& moveFrom);
template void CPUSparseMatrix<char>::SetValue(size_t, size_t, char);
template void CPUSparseMatrix<char>::SetValue(CPUMatrix<char> const&);
template void CPUSparseMatrix<char>::SetValue(GPUMatrix<char> const&);
template void CPUSparseMatrix<char>::SetValue(CPUSparseMatrix<char> const&);
template void CPUSparseMatrix<char>::SetValue(GPUSparseMatrix<char> const&);
template char* CPUSparseMatrix<char>::Data() const;
template char* CPUSparseMatrix<char>::Data();
template void CPUSparseMatrix<char>::Reset(void);

View file

@ -6,6 +6,8 @@
#include <stdio.h>
#include "CPUMatrix.h"
#include "GPUMatrix.h"
#include "GPUSparseMatrix.h"
#include <map>
#include <unordered_map>
@ -82,7 +84,11 @@ public:
public:
void SetValue(const size_t row, const size_t col, ElemType val);
void SetValue(const CPUMatrix<ElemType>& /*val*/);
void SetValue(const GPUMatrix<ElemType>& /*val*/);
void SetValue(const CPUSparseMatrix<ElemType>& /*val*/);
void SetValue(const GPUSparseMatrix<ElemType>& /*val*/);
void MaskColumnsValue(const CPUMatrix<char>& columnsMask, ElemType val);
size_t BufferSize() const

View file

@ -312,7 +312,7 @@ protected:
if (in.GetMatrixType() == MatrixType::DENSE || m_gpuSparse1D)
inputSubBatch = in.ColumnSlice(startSampleId, smallBatchSize);
else
inputSubBatch.SetValue(in.ColumnSlice(startSampleId, smallBatchSize), in.GetFormat());
inputSubBatch.SetValue(in.ColumnSlice(startSampleId, smallBatchSize));
if (m_gpuSparseOpt)
{

View file

@ -1093,6 +1093,25 @@ void GPUMatrix<ElemType>::SetValue(const GPUMatrix<ElemType>& deepCopyFrom)
SetValue(deepCopyFrom.GetNumRows(), deepCopyFrom.GetNumCols(), deepCopyFrom.GetComputeDeviceId(), deepCopyFrom.Data(), matrixFlagSetValueOnDevice);
}
template <class ElemType>
void GPUMatrix<ElemType>::SetValue(const CPUMatrix<ElemType>& /*deepCopyFrom*/)
{
NOT_IMPLEMENTED;
}
template <class ElemType>
void GPUMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& /*deepCopyFrom*/)
{
NOT_IMPLEMENTED;
}
template <class ElemType>
void GPUMatrix<ElemType>::SetValue(const GPUSparseMatrix<ElemType>& deepCopyFrom)
{
deepCopyFrom.CopyToDenseMatrix(*this);
}
template <class ElemType>
void GPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, int deviceId, ElemType* pArray, size_t matrixFlags)
{
@ -4358,7 +4377,10 @@ template GPUMatrix<char>& GPUMatrix<char>::operator=(GPUMatrix<char>&&);
template GPUMatrix<char>::GPUMatrix(int);
template void GPUMatrix<char>::SetValue(const char);
template void GPUMatrix<char>::SetValue(const size_t numRows, const size_t numCols, int deviceId, char* pArray, size_t matrixFlags);
template void GPUMatrix<char>::SetValue(CPUMatrix<char> const&);
template void GPUMatrix<char>::SetValue(GPUMatrix<char> const&);
template void GPUMatrix<char>::SetValue(CPUSparseMatrix<char> const&);
template void GPUMatrix<char>::SetValue(GPUSparseMatrix<char> const&);
template GPUMatrix<int>::GPUMatrix(const size_t, const size_t, int, int*, const size_t);
template GPUMatrix<int>::~GPUMatrix();

View file

@ -19,6 +19,10 @@
#include <memory> // for unique_ptr
#include <limits.h> // for ULONG_MAX
#include "CPUMatrix.h"
#include "CPUSparseMatrix.h"
#include "GPUSparseMatrix.h"
#ifndef _WIN32
#include <unistd.h>
#endif
@ -223,7 +227,10 @@ public:
void MaskColumnsValue(const GPUMatrix<char>& columnsMask, ElemType val);
void SetValue(const CPUMatrix<ElemType>& deepCopyFrom);
void SetValue(const GPUMatrix<ElemType>& deepCopyFrom);
void SetValue(const CPUSparseMatrix<ElemType>& deepCopyFrom);
void SetValue(const GPUSparseMatrix<ElemType>& deepCopyFrom);
void SetValue(const size_t numRows, const size_t numCols, int deviceId, ElemType* pArray, size_t matrixFlags = matrixFlagNormal);
void SetDiagonalValue(const ElemType v);

View file

@ -260,22 +260,22 @@ void GPUSparseMatrix<ElemType>::CopyToDenseMatrix(GPUMatrix<ElemType>& denseMatr
{
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseScsr2dense(cusparseHandle, int(GetNumRows()), int(GetNumCols()), descr, (float*) Data(), RowLocation(), ColLocation(), (float*) denseMatrix.Data(), int(GetNumRows())));
CUSPARSE_CALL(cusparseScsr2dense(cusparseHandle, int(GetNumRows()), int(GetNumCols()), descr, (float*) Buffer(), RowLocation(), ColLocation(), (float*) denseMatrix.Data(), int(GetNumRows())));
}
else
{
CUSPARSE_CALL(cusparseDcsr2dense(cusparseHandle, int(GetNumRows()), int(GetNumCols()), descr, (double*) Data(), RowLocation(), ColLocation(), (double*) denseMatrix.Data(), int(GetNumRows())));
CUSPARSE_CALL(cusparseDcsr2dense(cusparseHandle, int(GetNumRows()), int(GetNumCols()), descr, (double*) Buffer(), RowLocation(), ColLocation(), (double*) denseMatrix.Data(), int(GetNumRows())));
}
}
else if (GetFormat() == MatrixFormat::matrixFormatSparseCSC)
{
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseScsc2dense(cusparseHandle, int(GetNumRows()), int(GetNumCols()), descr, (float*) Data(), RowLocation(), ColLocation(), (float*) denseMatrix.Data(), int(GetNumRows())));
CUSPARSE_CALL(cusparseScsc2dense(cusparseHandle, int(GetNumRows()), int(GetNumCols()), descr, (float*) Buffer(), RowLocation(), ColLocation(), (float*) denseMatrix.Data(), int(GetNumRows())));
}
else
{
CUSPARSE_CALL(cusparseDcsc2dense(cusparseHandle, int(GetNumRows()), int(GetNumCols()), descr, (double*) Data(), RowLocation(), ColLocation(), (double*) denseMatrix.Data(), int(GetNumRows())));
CUSPARSE_CALL(cusparseDcsc2dense(cusparseHandle, int(GetNumRows()), int(GetNumCols()), descr, (double*) Buffer(), RowLocation(), ColLocation(), (double*) denseMatrix.Data(), int(GetNumRows())));
}
}
else
@ -414,6 +414,12 @@ void GPUSparseMatrix<ElemType>::ChangeDeviceTo(DEVICEID_TYPE to_id)
SetComputeDeviceId(PrepareDevice(to_id));
}
template <class ElemType>
void GPUSparseMatrix<ElemType>::SetValue(const CPUMatrix<ElemType>& /*denseMatrix*/)
{
NOT_IMPLEMENTED;
}
template <class ElemType>
void GPUSparseMatrix<ElemType>::SetValue(const GPUMatrix<ElemType>& denseMatrix)
{
@ -2656,6 +2662,7 @@ template GPUSparseMatrix<char>::GPUSparseMatrix(GPUSparseMatrix<char>&&);
template void GPUSparseMatrix<char>::SetValue(CPUSparseMatrix<char> const&);
template void GPUSparseMatrix<char>::SetValue(GPUSparseMatrix<char> const&);
template void GPUSparseMatrix<char>::SetValue(GPUMatrix<char> const&);
template void GPUSparseMatrix<char>::SetValue(CPUMatrix<char> const&);
template void GPUSparseMatrix<char>::CopyToDenseMatrix(GPUMatrix<char>&) const;
template void GPUSparseMatrix<char>::CopyToCPUSparseMatrix(CPUSparseMatrix<char>&) const;
template void GPUSparseMatrix<char>::ChangeDeviceTo(int);

View file

@ -5,6 +5,7 @@
#pragma once
#include "CPUMatrix.h"
#include "GPUMatrix.h"
#include "CPUSparseMatrix.h"
#include <functional>
@ -281,10 +282,11 @@ public:
return SecondaryIndexLocation();
}
void SetValue(const GPUSparseMatrix<ElemType>& deepCopyFrom);
void SetValue(const CPUSparseMatrix<ElemType>& deepCopyFrom);
void SetValue(const GPUMatrix<ElemType>& denseMatrix, const MatrixFormat matrixFormat);
void SetValue(const CPUMatrix<ElemType>& denseMatrix);
void SetValue(const GPUMatrix<ElemType>& denseMatrix);
void SetValue(const CPUSparseMatrix<ElemType>& deepCopyFrom);
void SetValue(const GPUSparseMatrix<ElemType>& deepCopyFrom);
void SetValue(const GPUMatrix<ElemType>& denseMatrix, const MatrixFormat matrixFormat);
GPUSPARSE_INDEX_TYPE* GetCondensedVector() const;
void MaskColumnsValue(const GPUMatrix<char>& columnsMask, ElemType val);

View file

@ -1198,14 +1198,14 @@ void Matrix<ElemType>::SetColumn(const Matrix<ElemType>& colMat, size_t colInd)
}
template <class ElemType>
void Matrix<ElemType>::SetValue(const Matrix<ElemType>& deepCopyFrom, const MatrixFormat format /*= matrixFormatSparseCSR*/)
void Matrix<ElemType>::SetValue(const Matrix<ElemType>& deepCopyFrom)
{
if (this == &deepCopyFrom)
return;
m_preferredDeviceId = deepCopyFrom.m_preferredDeviceId;
DecideAndMoveToRightDevice(deepCopyFrom, *this);
SwitchToMatrixType(deepCopyFrom.GetMatrixType(), format, false);
SwitchToMatrixType(deepCopyFrom.GetMatrixType(), deepCopyFrom.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&deepCopyFrom,
this,
@ -1215,6 +1215,48 @@ void Matrix<ElemType>::SetValue(const Matrix<ElemType>& deepCopyFrom, const Matr
m_GPUSparseMatrix->SetValue(*deepCopyFrom.m_GPUSparseMatrix));
}
template <class ElemType>
void Matrix<ElemType>::AssignValuesOf(const Matrix<ElemType>& deepCopyFrom)
{
if (this == &deepCopyFrom)
return;
DISPATCH_MATRIX_ON_FLAG(this, this,
{
// Set CPUMatrix from:
DISPATCH_MATRIX_ON_FLAG(&deepCopyFrom, &deepCopyFrom,
{ m_CPUMatrix->SetValue(*deepCopyFrom.m_CPUMatrix); },
{ m_CPUMatrix->SetValue(*deepCopyFrom.m_GPUMatrix); },
{ m_CPUMatrix->SetValue(*deepCopyFrom.m_CPUSparseMatrix); },
{ m_CPUMatrix->SetValue(*deepCopyFrom.m_GPUSparseMatrix); });
},
{
// Set GPUMatrix from:
DISPATCH_MATRIX_ON_FLAG(&deepCopyFrom, &deepCopyFrom,
{ m_GPUMatrix->SetValue(*deepCopyFrom.m_CPUMatrix); },
{ m_GPUMatrix->SetValue(*deepCopyFrom.m_GPUMatrix); },
{ m_GPUMatrix->SetValue(*deepCopyFrom.m_CPUSparseMatrix); },
{ m_GPUMatrix->SetValue(*deepCopyFrom.m_GPUSparseMatrix); });
},
{
// Set CPUSparseMatrix from:
DISPATCH_MATRIX_ON_FLAG(&deepCopyFrom, &deepCopyFrom,
{ m_CPUSparseMatrix->SetValue(*deepCopyFrom.m_CPUMatrix); },
{ m_CPUSparseMatrix->SetValue(*deepCopyFrom.m_GPUMatrix); },
{ m_CPUSparseMatrix->SetValue(*deepCopyFrom.m_CPUSparseMatrix); },
{ m_CPUSparseMatrix->SetValue(*deepCopyFrom.m_GPUSparseMatrix); });
},
{
// Set GPUSparseMatrix from:
DISPATCH_MATRIX_ON_FLAG(&deepCopyFrom, &deepCopyFrom,
{ m_GPUSparseMatrix->SetValue(*deepCopyFrom.m_CPUMatrix); },
{ m_GPUSparseMatrix->SetValue(*deepCopyFrom.m_GPUMatrix); },
{ m_GPUSparseMatrix->SetValue(*deepCopyFrom.m_CPUSparseMatrix); },
{ m_GPUSparseMatrix->SetValue(*deepCopyFrom.m_GPUSparseMatrix); });
});
}
template <class ElemType>
void Matrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, int deviceId, ElemType* pArray, const size_t matrixFlags)
{
@ -1248,18 +1290,18 @@ void Matrix<ElemType>::SetMatrixFromCSCFormat(const CPUSPARSE_INDEX_TYPE* h_CSCC
// Note: The current implementation uses the xPUSparseMatrix as temporary space, which allows memory to be shared between calls. If
// xPUSparseMatrix is a view, this code will cause a runtime error stating that the view is neither writable nor resizable.
DISPATCH_MATRIX_ON_FLAG(this, this,
{
if (!m_CPUSparseMatrix) m_CPUSparseMatrix = make_shared<CPUSparseMatrix<ElemType>>(matrixFormatSparseCSC, numRows, numCols, nz);
m_CPUSparseMatrix->SetMatrixFromCSCFormat(h_CSCCol, h_Row, h_Val, nz, numRows, numCols);
m_CPUSparseMatrix->AssignColumnSliceToDense(*m_CPUMatrix, 0, numCols);
},
{
if (!m_GPUSparseMatrix) m_GPUSparseMatrix = make_shared<GPUSparseMatrix<ElemType>>(numRows, numCols, nz, GetDeviceId(), matrixFormatSparseCSC);
m_GPUSparseMatrix->SetMatrixFromCSCFormat(h_CSCCol, h_Row, h_Val, nz, numRows, numCols);
m_GPUSparseMatrix->AssignColumnSliceToDense(*m_GPUMatrix, 0, numCols);
},
{ m_CPUSparseMatrix->SetMatrixFromCSCFormat(h_CSCCol, h_Row, h_Val, nz, numRows, numCols); },
{ m_GPUSparseMatrix->SetMatrixFromCSCFormat(h_CSCCol, h_Row, h_Val, nz, numRows, numCols); });
{
if (!m_CPUSparseMatrix) m_CPUSparseMatrix = make_shared<CPUSparseMatrix<ElemType>>(matrixFormatSparseCSC, numRows, numCols, nz);
m_CPUSparseMatrix->SetMatrixFromCSCFormat(h_CSCCol, h_Row, h_Val, nz, numRows, numCols);
m_CPUSparseMatrix->AssignColumnSliceToDense(*m_CPUMatrix, 0, numCols);
},
{
if (!m_GPUSparseMatrix) m_GPUSparseMatrix = make_shared<GPUSparseMatrix<ElemType>>(numRows, numCols, nz, GetDeviceId(), matrixFormatSparseCSC);
m_GPUSparseMatrix->SetMatrixFromCSCFormat(h_CSCCol, h_Row, h_Val, nz, numRows, numCols);
m_GPUSparseMatrix->AssignColumnSliceToDense(*m_GPUMatrix, 0, numCols);
},
{ m_CPUSparseMatrix->SetMatrixFromCSCFormat(h_CSCCol, h_Row, h_Val, nz, numRows, numCols); },
{ m_GPUSparseMatrix->SetMatrixFromCSCFormat(h_CSCCol, h_Row, h_Val, nz, numRows, numCols); });
}
template <class ElemType>
@ -1397,52 +1439,58 @@ void Matrix<ElemType>::NormalGrad(Matrix<ElemType>& gradients,
if (!useNesterovMomentum)
{
DISPATCH_MATRIX_ON_FLAG(&gradients,
nullptr,
ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
functionValues -= *this,
ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
functionValues -= *this,
if (momentum != 0) gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum);
ScaleAndAdd(-learnRatePerSample, gradients, functionValues),
if (momentum != 0) gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum);
ScaleAndAdd(-learnRatePerSample, gradients, functionValues));
DISPATCH_MATRIX_ON_FLAG(&gradients, nullptr,
{
ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
functionValues -= *this;
},
{
ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
functionValues -= *this;
},
{
if (momentum != 0) gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum);
ScaleAndAdd(-learnRatePerSample, gradients, functionValues);
},
{
if (momentum != 0) gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum);
ScaleAndAdd(-learnRatePerSample, gradients, functionValues);
});
}
else
{
DISPATCH_MATRIX_ON_FLAG(&gradients,
nullptr,
{ /* CPU dense */
ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
ScaleAndAdd(-momentum, *this, functionValues);
ScaleAndAdd(-(1 - momentum) * learnRatePerSample, gradients, functionValues);
// w_t = w_{t-1} - momentum * v_t - (1 - momentum) * learnRatePerSample * gradient
},
{ /* GPU dense */
ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
ScaleAndAdd(-momentum, *this, functionValues);
ScaleAndAdd(-(1 - momentum) * learnRatePerSample, gradients, functionValues);
},
{ /* CPU sparse */
if (momentum != 0)
{
Matrix<ElemType> gradientCache(gradients.GetDeviceId());
gradientCache.SetValue(gradients);
gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum);
ScaleAndAdd(-momentum, *this, functionValues);
ScaleAndAdd(-(1 - momentum) * learnRatePerSample, gradientCache, functionValues);
}
},
{ /* GPU sparse */
if (momentum != 0)
{
Matrix<ElemType> gradientCache(gradients.GetDeviceId());
gradientCache.SetValue(gradients);
gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum);
ScaleAndAdd(-momentum, *this, functionValues);
ScaleAndAdd(-(1 - momentum) * learnRatePerSample, gradientCache, functionValues);
}
});
DISPATCH_MATRIX_ON_FLAG(&gradients, nullptr,
{ /* CPU dense */
ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
ScaleAndAdd(-momentum, *this, functionValues);
ScaleAndAdd(-(1 - momentum) * learnRatePerSample, gradients, functionValues);
// w_t = w_{t-1} - momentum * v_t - (1 - momentum) * learnRatePerSample * gradient
},
{ /* GPU dense */
ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
ScaleAndAdd(-momentum, *this, functionValues);
ScaleAndAdd(-(1 - momentum) * learnRatePerSample, gradients, functionValues);
},
{ /* CPU sparse */
if (momentum != 0)
{
Matrix<ElemType> gradientCache(gradients.GetDeviceId());
gradientCache.AssignValuesOf(gradients);
gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum);
ScaleAndAdd(-momentum, *this, functionValues);
ScaleAndAdd(-(1 - momentum) * learnRatePerSample, gradientCache, functionValues);
}
},
{ /* GPU sparse */
if (momentum != 0)
{
Matrix<ElemType> gradientCache(gradients.GetDeviceId());
gradientCache.AssignValuesOf(gradients);
gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum);
ScaleAndAdd(-momentum, *this, functionValues);
ScaleAndAdd(-(1 - momentum) * learnRatePerSample, gradientCache, functionValues);
}
});
}
}
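For intuition, a minimal scalar sketch of the two dense update paths dispatched above (plain vs. Nesterov momentum; names and values are hypothetical, with v standing in for the smoothed gradient held in *this):

#include <cstdio>

int main()
{
    double w = 1.0;                   // weight (functionValues)
    double v = 0.0;                   // smoothed gradient (*this)
    const double g = 0.5;             // current gradient
    const double lr = 0.1, momentum = 0.9;
    const bool useNesterovMomentum = true;

    // First ScaleAndAdd: v_t = momentum * v_{t-1} + (1 - momentum) * lr * g
    v = momentum * v + (1.0 - momentum) * lr * g;
    if (!useNesterovMomentum)
        w -= v;                                        // plain momentum step
    else
        w -= momentum * v + (1.0 - momentum) * lr * g; // Nesterov lookahead step
    printf("w = %f\n", w);
    return 0;
}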
@ -1543,7 +1591,7 @@ Matrix<ElemType> Matrix<ElemType>::RepMat(const Matrix<ElemType>& frmMat, const
Matrix<ElemType> c(nRows, newCols, frmMat.GetDeviceId());
for (size_t i = 0; i < colRatio; i++)
{
c.ColumnSlice(i * nCols, nCols).SetValue(frmMat);
c.ColumnSlice(i * nCols, nCols).AssignValuesOf(frmMat);
}
return c;
@ -5007,9 +5055,9 @@ Matrix<ElemType>& Matrix<ElemType>::Shift(const Matrix<ElemType>& a, int shift)
long n = (long) GetNumCols();
if (shift >= 0 && shift < n)
us.ColumnSlice(shift, n - shift).SetValue(a.ColumnSlice(0, n - shift));
us.ColumnSlice(shift, n - shift).AssignValuesOf(a.ColumnSlice(0, n - shift));
if (shift < 0 && shift > -n)
us.ColumnSlice(0, n + shift).SetValue(a.ColumnSlice(-shift, n + shift));
us.ColumnSlice(0, n + shift).AssignValuesOf(a.ColumnSlice(-shift, n + shift));
return *this;
}
@ -5328,7 +5376,9 @@ template size_t Matrix<char>::GetNumRows() const;
template size_t Matrix<char>::GetNumCols() const;
template void Matrix<char>::SetValue(const char);
template void Matrix<char>::SetValue(size_t numRows, const size_t numCols, int deviceId, char* pArray, size_t matrixFlags);
template void Matrix<char>::SetValue(const Matrix<char>&, MatrixFormat);
//template void Matrix<char>::SetValue(const Matrix<char>&, MatrixFormat);
template void Matrix<char>::SetValue(const Matrix<char>&);
template void Matrix<char>::AssignValuesOf (const Matrix<char>&);
template bool Matrix<char>::IsEmpty() const;
template void Matrix<char>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, bool growOnly);

View file

@ -236,7 +236,11 @@ public:
void SetValue(const ElemType v);
void SetValue(const DeviceBoundNumber<ElemType>& db_number);
void SetValue(const Matrix<ElemType>& deepCopyFrom, const MatrixFormat format = matrixFormatSparseCSR); // BUGBUG: default for 'format' is unexpected
//void SetValue (const Matrix<ElemType>& deepCopyFrom, const MatrixFormat format = matrixFormatSparseCSR); // BUGBUG: default for 'format' is unexpected
// SetValue respects the source matrix's information. It moves the target's location (if necessary), and then copies the source's values.
void SetValue (const Matrix<ElemType>& deepCopyFrom);
// AssignValuesOf respects the target matrix's information. It copies the values from the source into the memory of the target.
void AssignValuesOf(const Matrix<ElemType>& deepCopyFrom);
void SetValue(const size_t numRows, const size_t numCols, int deviceId, ElemType* pArray, const size_t matrixFlags = matrixFlagNormal);
void SetValue(const size_t rIdx, const size_t cIdx, ElemType val); // set matrix sparsely
void SetValue(const size_t numRows, const size_t numCols, std::initializer_list<ElemType> l) // SetValue(2,3, {1,2,3, 4,5,6});
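A minimal usage sketch of the distinction drawn in the two comments above (hypothetical helper; assumes the Matrix<ElemType> API declared in this header):

#include "Matrix.h"
using namespace Microsoft::MSR::CNTK;

// Suppose 'dst' is a dense CPU matrix and 'src' a sparse GPU matrix.
void Illustrate(Matrix<float>& dst, const Matrix<float>& src)
{
    // SetValue follows the source: 'dst' is moved to src's device and
    // switched to src's matrix type and format before the values are copied.
    dst.SetValue(src);

    // AssignValuesOf keeps the target's device and type: src's values are
    // converted and copied into dst's existing storage.
    dst.AssignValuesOf(src);
}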

View file

@ -335,6 +335,8 @@ void SparseBinaryInput<ElemType>::Init(std::map<std::wstring, std::wstring> rena
m_inFile.seekg(0, ios::end);
m_fileSize = (size_t) m_inFile.tellg();
m_maxMBSize = 0;
}
template <class ElemType>
@ -436,19 +438,28 @@ void SparseBinaryInput<ElemType>::StartDistributedMinibatchLoop(size_t mbSize, s
ReadOffsets(startMB, m_windowSize);
m_maxMBSize = 0;
size_t maxMBSize = 0;
for (size_t c = 0; c < m_windowSize; c++)
{
m_maxMBSize = max(m_maxMBSize, (size_t)(m_offsets[c + 1] - m_offsets[c]));
maxMBSize = max(maxMBSize, (size_t)(m_offsets[c + 1] - m_offsets[c]));
// fprintf(stderr, "m_offsets[%lu] = %lu\n", c, m_offsets[c]);
}
// fprintf(stderr, "max mb size: %ld\n", m_maxMBSize);
size_t maxMem = 1024 * 1024 * 1024; // 1GB
size_t maxPointers = maxMem / m_maxMBSize;
for (size_t c = 0; c < maxPointers; c++)
if (maxMBSize > m_maxMBSize)
{
void* dataBuffer = malloc(m_maxMBSize);
m_dataToProduce.push(dataBuffer);
m_maxMBSize = maxMBSize;
while (m_dataToProduce.size() > 0)
{
free(m_dataToProduce.pop());
}
// fprintf(stderr, "max mb size: %ld\n", m_maxMBSize);
size_t maxMem = 1024 * 1024 * 1024; // 1GB
size_t maxPointers = maxMem / m_maxMBSize;
for (size_t c = 0; c < maxPointers; c++)
{
void* dataBuffer = malloc(m_maxMBSize);
m_dataToProduce.push(dataBuffer);
}
}
std::thread readData([this]
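The rewritten loop above makes the producer-side buffer pool grow-only: the pool is flushed and reallocated only when a larger minibatch appears, rather than on every call to StartDistributedMinibatchLoop. A standalone sketch of that pattern (hypothetical class; same 1GB cap as the code):

#include <cstdlib>
#include <queue>

// Grow-only buffer pool: reallocate only when a larger block size is needed.
class BufferPool
{
    std::queue<void*> m_free;
    size_t m_blockSize = 0;

public:
    void EnsureBlockSize(size_t blockSize, size_t maxMem = 1024 * 1024 * 1024)
    {
        if (blockSize <= m_blockSize)
            return;                  // existing buffers are already big enough
        m_blockSize = blockSize;
        while (!m_free.empty())      // drop the now-too-small buffers
        {
            free(m_free.front());
            m_free.pop();
        }
        for (size_t c = 0; c < maxMem / m_blockSize; c++)
            m_free.push(malloc(m_blockSize));
    }
    ~BufferPool()
    {
        while (!m_free.empty())
        {
            free(m_free.front());
            m_free.pop();
        }
    }
};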

View file

@ -34,6 +34,15 @@ private:
std::deque<T> d_queue;
public:
void release()
{
while (!d_queue.empty())
{
    free(d_queue.front()); // std::deque::pop_front() returns void, so fetch the element before popping
    d_queue.pop_front();
}
}
size_t size()
{
return d_queue.size();
}
void push(T const& value)
{
{

View file

@ -120,7 +120,7 @@ void oldRNNForwardPropSRP(const size_t timeIdxInSeq, const int delay, const bool
inp = pastActivity.ColumnSlice(d + indexInBatch, 1);
else
inp = inputFunctionValues.ColumnSlice(d + indexInBatch, 1);
out.SetValue(inp);
out.AssignValuesOf(inp);
}
}

View file

@ -884,7 +884,7 @@ BOOST_FIXTURE_TEST_CASE(MatrixColumnSlice, RandomSeedFixture)
cg.SetUniformRandomValue(-1, 1, IncrementCounter());
Matrix<float> dg(k, m, c_deviceIdZero);
dg.SetValue(cg);
dg.AssignValuesOf(cg);
Matrix<float>::MultiplyAndAdd(ag, false, bg, false, dg);