some clean-up of sparse matrices; disabled meaningless elementwise sparse ops such as log(); sparse matrix slices now keep track of their original matrix (not used yet); bug fix: ~SyncGuard() should not swallow errors, since CUDA errors may get discovered here; bug fix: HTKMLFReader test 16 must use consistent precision
This commit is contained in:
Parent: d0a5eb527d
Commit: 7793b7be93
@@ -59,16 +59,13 @@
 #define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
 #endif

 #define SWAP(a, b)  \
     {               \
         (a) ^= (b); \
         (b) ^= (a); \
         (a) ^= (b); \
     }
 // TODO: Move to CommonMatrix.h
 #define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing

 namespace Microsoft { namespace MSR { namespace CNTK {

 #pragma region Helpful Enum Definitions

 enum class MatrixOrder
 {
     RowMajor = 101, // row-major arrays
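A side note on the two macros above: SWAP uses the XOR trick, which only works for integer types and silently zeroes both arguments if they alias the same object, and IDX2C maps a (row, column) pair to a linear offset in column-major storage. A minimal standalone illustration (not part of the diff):

#include <cassert>

#define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0-based, column-major: offset = j * ld + i

int main()
{
    const int rows = 3, cols = 2;     // ld (leading dimension) == rows for column-major
    float a[rows * cols] = {0};
    a[IDX2C(2, 1, rows)] = 42.0f;     // element (row 2, column 1)
    assert(a[1 * rows + 2] == 42.0f); // same offset written out by hand
    return 0;
}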
@@ -77,56 +74,55 @@ enum class MatrixOrder

 enum class MatrixTranspose : char
 {
     NoTrans = 'N',  // trans='N'
     Trans = 'T',    // trans='T'
     ConjTrans = 'C' // trans='C'
 };

 enum class SymMatrixType : char
 {
     Up = 'U',          // symmetric matrix is stored in the upper part
     Low = 'L',         // symmetric matrix is stored in the lower part
     Full = 'F',        // fully populated
     NotSymmetric = 'N' // not a symmetric matrix
 };

 enum class MatrixOpSide : char
 {
     Left = 'L',  // left multiply
     Right = 'R', // right multiply
 };

 #pragma endregion Helpful Enum Definitions

 #pragma region Constructors and Destructor

-//should only be used by constructors.
-template <class ElemType>
-void CPUSparseMatrix<ElemType>::ZeroInit()
-{
-    m_numRows = 0;
-    m_numCols = 0;
-    m_elemSizeAllocated = 0;
-    m_compIndexSize = 0;
-    m_externalBuffer = false;
-    m_computeDevice = CPUDEVICE;
-    m_nz = 0;
-    m_matrixName = NULL;
+//-------------------------------------------------------------------------
+// construction and conversion
+//-------------------------------------------------------------------------
+
+// should only be used by constructors.
+template <class ElemType>
+/*private*/ void CPUSparseMatrix<ElemType>::ZeroInit()
+{
+    Base::ZeroInit();
+    m_computeDevice = CPUDEVICE;
+
+    m_sliceOf = nullptr;
+    m_compIndexSize = 0;
     // if (m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
     {
         m_colIdx = -1;
-        m_pArray = NULL;
-        m_unCompIndex = NULL;
-        m_compIndex = NULL;
+        m_unCompIndex = nullptr;
+        m_compIndex = nullptr;
     }
     // else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
     {
         m_blockSize = 0;
-        m_pArray = NULL;
-        m_blockIds = NULL;
+        m_blockIdShift = 0;
+        m_blockIds = nullptr;
     }
-    m_nzValues = NULL;
+    m_nzValues = nullptr;
 }

 //should only be used by constructors.
@@ -144,19 +140,17 @@ void CPUSparseMatrix<ElemType>::CheckInit(const MatrixFormat format)
 template <class ElemType>
 CPUSparseMatrix<ElemType>::CPUSparseMatrix(const MatrixFormat format)
 {
-
     CheckInit(format);
 }

 template <class ElemType>
 CPUSparseMatrix<ElemType>::CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size)
 {
-
     CheckInit(format);
     Resize(numRows, numCols, size, true, false);
 }

-//copy constructor, deep copy
+// copy constructor, deep copy
 template <class ElemType>
 CPUSparseMatrix<ElemType>::CPUSparseMatrix(const CPUSparseMatrix<ElemType>& deepCopyFrom)
 {

@@ -166,7 +160,7 @@ CPUSparseMatrix<ElemType>::CPUSparseMatrix(const CPUSparseMatrix<ElemType>& deepCopyFrom)
     SetMatrixName(deepCopyFrom.m_matrixName);
 }

-//assignment operator, deep copy
+// assignment operator, deep copy
 template <class ElemType>
 CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::operator=(const CPUSparseMatrix<ElemType>& deepCopyFrom)
 {
@@ -177,29 +171,23 @@ CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::operator=(const CPUSparseMatrix<ElemType>& deepCopyFrom)
     return *this;
 }

-//move constructor, shallow copy
+// move constructor, shallow copy
 template <class ElemType>
 CPUSparseMatrix<ElemType>::CPUSparseMatrix(CPUSparseMatrix<ElemType>&& moveFrom)
 {
-    m_format = moveFrom.m_format;
-    m_numRows = moveFrom.m_numRows;
-    m_numCols = moveFrom.m_numCols;
-    m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
+    Base::ShallowCopyFrom(moveFrom);
+    // BUGBUG: This did not use to copy m_sliceViewOffset, I presume it should be copied? It is now.
+
     m_compIndexSize = moveFrom.m_compIndexSize;
-    m_externalBuffer = moveFrom.m_externalBuffer;
-    m_computeDevice = moveFrom.m_computeDevice;
-    m_nz = moveFrom.m_nz;
-    m_matrixName = moveFrom.m_matrixName;

     m_colIdx = moveFrom.m_colIdx;
-    m_pArray = moveFrom.m_pArray;
     m_nzValues = moveFrom.m_nzValues;
     m_unCompIndex = moveFrom.m_unCompIndex;
     m_compIndex = moveFrom.m_compIndex;

     m_blockSize = moveFrom.m_blockSize;
     m_blockIdShift = moveFrom.m_blockIdShift;
     m_blockIds = moveFrom.m_blockIds;

     // release the pointer from the source object so that the destructor won't release it twice
     moveFrom.ZeroInit();
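The move constructor and move assignment above share one pattern: shallow-copy every member, then reset the source so its destructor does not free the stolen buffer a second time. A minimal sketch of that pattern with a hypothetical class (not CNTK code):

#include <cstddef>

struct Buffer
{
    float* data = nullptr;
    std::size_t size = 0;

    Buffer() = default;
    Buffer(Buffer&& moveFrom) noexcept
        : data(moveFrom.data), size(moveFrom.size) // the Base::ShallowCopyFrom(moveFrom) step
    {
        moveFrom.data = nullptr;                   // the moveFrom.ZeroInit() step
        moveFrom.size = 0;
    }
    ~Buffer() { delete[] data; }                   // safe: the moved-from object now owns nothing
};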
@@ -213,26 +201,19 @@ CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::operator=(CPUSparseMatrix<ElemType>&& moveFrom)
 {
     if (OwnBuffer())
         ReleaseMemory(); // always delete the data pointer since we will use the pointer from moveFrom
-    m_format = moveFrom.m_format;
-    m_numRows = moveFrom.m_numRows;
-    m_numCols = moveFrom.m_numCols;
-    m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
+    Base::ShallowCopyFrom(moveFrom);
+    // BUGBUG: This did not use to copy m_sliceViewOffset, I presume it should be copied? It is now.
+
     m_compIndexSize = moveFrom.m_compIndexSize;
-    m_externalBuffer = moveFrom.m_externalBuffer;
-    m_computeDevice = moveFrom.m_computeDevice;
-    m_nz = moveFrom.m_nz;
-    m_matrixName = moveFrom.m_matrixName;

     m_colIdx = moveFrom.m_colIdx;
-    m_pArray = moveFrom.m_pArray;
     m_nzValues = moveFrom.m_nzValues;
     m_unCompIndex = moveFrom.m_unCompIndex;
     m_compIndex = moveFrom.m_compIndex;

     m_blockSize = moveFrom.m_blockSize;
     m_blockIdShift = moveFrom.m_blockIdShift;
     m_blockIds = moveFrom.m_blockIds;

     // release the pointer from the source object so that the destructor won't release it twice
     moveFrom.ZeroInit();

@@ -249,9 +230,8 @@ CPUSparseMatrix<ElemType>::~CPUSparseMatrix()
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::ReleaseMemory()
 {
-    // If m_externalBuffer is true then this matrix
-    // is simply a view over another matrix. In that
-    // case we shouldn't free anything.
+    // If m_externalBuffer is true then this matrix is simply a view over another matrix.
+    // In that case we shouldn't free anything.
     if (!m_externalBuffer)
     {
         delete[] m_matrixName;
@@ -335,24 +315,26 @@ void CPUSparseMatrix<ElemType>::SetValue(const size_t row, const size_t col, const ElemType v)
         m_nz++;
 }

-//make sure call order in colume wise for CSC and row wise for CSR
+// make sure the call order is column-wise for CSC and row-wise for CSR
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& v)
 {
-    if (!OwnBuffer())
+    if (!OwnBuffer()) // TODO: GPU version allows to overwrite a view with a fresh non-view
         LogicError("Cannot modify since the buffer is managed externally.");

-    this->Reset();
-    m_format = v.GetFormat();
+    Reset();
+    m_format = v.GetFormat();
+    m_externalBuffer = false;
+    m_sliceOf = nullptr;

-    this->Resize(v.GetNumRows(), v.GetNumCols(), v.NzSize());
+    Resize(v.GetNumRows(), v.GetNumCols(), v.NzSize());
     m_nz = v.NzCount();

     if (m_nz > 0)
     {
-        memcpy(this->NzValues(), v.NzValues(), v.NzSize());
-        memcpy(this->RowLocation(), v.RowLocation(), v.RowSize());
-        memcpy(this->ColLocation(), v.ColLocation(), v.ColSize());
+        memcpy(NzValues(), v.NzValues(), v.NzSize());
+        memcpy(RowLocation(), v.RowLocation(), v.RowSize());
+        memcpy(ColLocation(), v.ColLocation(), v.ColSize());
     }
 }
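SetValue() copies the three arrays of the compressed sparse column (CSC) representation. For orientation, a small worked example (assumed member/accessor correspondence, simplified):

// The 3x3 matrix  [1 0 2]
//                 [0 0 3]
//                 [4 0 0]
// is stored in CSC form as:
//   nzValues = {1, 4, 2, 3}  // non-zeros, column by column (NzValues())
//   rowIndex = {0, 2, 0, 1}  // row of each non-zero (RowLocation(), i.e. m_unCompIndex)
//   colStart = {0, 2, 2, 4}  // offset of each column's first non-zero (ColLocation(), i.e. m_compIndex)
// NzSize(), RowSize() and ColSize() are the byte sizes of these three arrays.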
@@ -402,17 +384,22 @@ CPUSparseMatrix<ElemType> CPUSparseMatrix<ElemType>::ColumnSlice(size_t startColumn, size_t numCols) const
     CPUSparseMatrix<ElemType> slice(m_format);
     slice.m_numRows = m_numRows;
     slice.m_numCols = numCols;
+    // BUGBUG: m_matrixName?
+    // BUGBUG: m_sliceViewOffset?
+    slice.m_externalBuffer = true;
+    slice.m_sliceOf = const_cast<CPUSparseMatrix<ElemType>*>(this); // BUGBUG: ColumnSlice() returns a reference to a mutable matrix, even if itself is 'const'; should not be.

     if (m_format == MatrixFormat::matrixFormatSparseCSC)
     {
         slice.m_pArray = m_pArray;
+
         slice.m_nzValues = m_pArray + m_compIndex[startColumn]; // note: m_compIndex is always against m_pArray
         slice.m_unCompIndex = m_unCompIndex;
         slice.m_compIndex = m_compIndex + startColumn; // Just shift the compressed index location to the new startColumn - that's it!
-        slice.m_externalBuffer = true;
+        slice.m_compIndexSize = numCols + 1;
+
         slice.m_nz = m_compIndex[startColumn + numCols] - m_compIndex[startColumn];
         slice.m_elemSizeAllocated = slice.m_nz;
-        slice.m_compIndexSize = numCols + 1;
     }
     else if (m_format == MatrixFormat::matrixFormatSparseBlockCol)
     {
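Why shifting m_compIndex yields a valid slice can be seen with the CSC arrays from the example above (illustration only):

// With colStart = {0, 2, 2, 4} for 3 columns, a slice of numCols = 2 starting at
// column 1 reuses the parent's arrays unchanged and only repositions the index:
//   slice.m_compIndex --> &colStart[1], i.e. {2, 2, 4}        // numCols + 1 entries
//   slice.m_nz        =   colStart[1 + 2] - colStart[1] = 2
// The entries remain offsets into the parent's m_pArray, which is why
// slice.m_nzValues is set to m_pArray + m_compIndex[startColumn].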
@@ -446,13 +433,15 @@ CPUSparseMatrix<ElemType> CPUSparseMatrix<ElemType>::ColumnSlice(size_t startColumn, size_t numCols) const
             endColBlock = (long long) m_blockSize;
         }

+        // BUGBUG: m_elemSizeAllocated?
         slice.m_pArray = m_pArray + startColBlock * m_numRows;
+
         slice.m_nzValues = slice.m_pArray;
         slice.m_blockIds = m_blockIds + startColBlock; // the value stored in the block id is based on the original column numbers
         slice.m_blockSize = (size_t) max((long long) 0, endColBlock - startColBlock);
         slice.m_blockIdShift = m_blockIdShift + startColumn;
-        slice.m_externalBuffer = true;
+
         slice.m_nz = slice.m_blockSize * m_numRows;
     }

     return slice;

@@ -461,9 +450,6 @@ CPUSparseMatrix<ElemType> CPUSparseMatrix<ElemType>::ColumnSlice(size_t startColumn, size_t numCols) const
 template <class ElemType>
 CPUMatrix<ElemType> CPUSparseMatrix<ElemType>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const
 {
-    // if (numCols == 0)
-    //     LogicError("The slice cannot have 0 columns.");
-
     if (startColumn + numCols > m_numCols)
         InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int) startColumn, (int) numCols, (int) m_numCols);
@@ -614,17 +600,21 @@ void CPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly, bool keepExistingValues)
     }
 }

-//Reset matrix so it can be reused
+// Reset matrix to 0.
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::Reset()
 {
+    if (!OwnBuffer())
+        LogicError("Cannot Reset since the buffer is managed externally.");
+
     m_nz = 0;
     m_colIdx = -1;
     m_blockSize = 0;
     m_blockIdShift = 0;
 }

-//c = alpha*op(lhs) * op(rhs) + beta*c
+// c = alpha*op(lhs) * op(rhs) + beta*c
+// dense x sparse = dense
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix<ElemType>& lhs, const bool transposeA,
                                                        const CPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, CPUMatrix<ElemType>& c)
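For readers new to the format, a dense x sparse(CSC) product accumulates one scaled dense column per stored non-zero. A simplified sketch (no transpose handling, k implied by the CSC arrays, assumed 0-based indices as in the example earlier; not the diff's actual implementation):

#include <cstddef>

// c = alpha * lhs * rhs + beta * c, with lhs (m x k) and c (m x n) dense
// column-major and rhs (k x n) in CSC form.
void DenseTimesCsc(float alpha, const float* lhs, std::size_t m,
                   const float* nzValues, const int* rowIndex, const int* colStart,
                   std::size_t n, float beta, float* c)
{
    for (std::size_t j = 0; j < n; j++)
    {
        for (std::size_t r = 0; r < m; r++)
            c[j * m + r] *= beta;                            // scale c(:, j)
        for (int p = colStart[j]; p < colStart[j + 1]; p++)
        {
            const std::size_t i = (std::size_t) rowIndex[p]; // rhs(i, j) is a stored non-zero
            const float v = alpha * nzValues[p];
            for (std::size_t r = 0; r < m; r++)
                c[j * m + r] += v * lhs[i * m + r];          // c(:, j) += v * lhs(:, i)
        }
    }
}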
@@ -711,7 +701,8 @@ void CPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix<ElemType>& lhs, const bool transposeA,
     }
 }

-//c = alpha * op(lhs) * op(rhs)
+// dense x sparse = sparse
+// c = alpha * op(lhs) * op(rhs)
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::MultiplyAndAdd(ElemType alpha, const CPUMatrix<ElemType>& lhs, const bool transposeA,
                                                const CPUSparseMatrix<ElemType>& rhs, const bool transposeB, CPUSparseMatrix<ElemType>& c)

@@ -807,6 +798,7 @@ void CPUSparseMatrix<ElemType>::MultiplyAndAdd(ElemType alpha, const CPUMatrix<ElemType>& lhs, const bool transposeA,
     }
 }

+// dense += sparse
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix<ElemType>& lhs, CPUMatrix<ElemType>& rhs)
 {

@@ -861,7 +853,7 @@ void CPUSparseMatrix<ElemType>::ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix<ElemType>& lhs, CPUMatrix<ElemType>& rhs)
 }

 template <class ElemType>
-bool CPUSparseMatrix<ElemType>::AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold)
+/*static*/ bool CPUSparseMatrix<ElemType>::AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold)
 {
     if (a.IsEmpty() || b.IsEmpty())
         LogicError("AreEqual: one of the input matrices is empty.");

@@ -894,6 +886,7 @@ void CPUSparseMatrix<ElemType>::NormalGrad(CPUMatrix<ElemType>& c, const ElemType momentum)
         c.Resize(GetNumRows(), GetNumCols());
         c.SetValue(0.0);
     }
+    // BUGBUG: dimension/ownbuffer check?

     if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
     {

@@ -927,6 +920,7 @@ ElemType CPUSparseMatrix<ElemType>::Adagrad(CPUMatrix<ElemType>& c, const bool needAveMultiplier)
         c.Resize(GetNumRows(), GetNumCols());
         c.SetValue(0.0);
     }
+    // BUGBUG: dimension/ownbuffer check?

     ElemType aveMultiplier = 0;

@@ -1156,12 +1150,12 @@ CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::InplaceSoftThreshold(const ElemType threshold)
 template <class ElemType>
 ElemType CPUSparseMatrix<ElemType>::FrobeniusNorm() const
 {
-    if (this->IsEmpty())
-        LogicError("FrobeniusNorm: Matrix is empty.");
+    if (IsEmpty())
+        return 0;

-    ElemType v = 0;
+    ElemType v = 0; // TODO: do this in 'double'?

-    long m = (long) this->NzCount();
+    long m = (long) NzCount();
     const ElemType* nzValues = NzValues();

     //four-way unrolling

@@ -1183,8 +1177,8 @@ ElemType CPUSparseMatrix<ElemType>::FrobeniusNorm() const
 template <class ElemType>
 ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
 {
-    if (this->IsEmpty())
-        LogicError("SumOfAbsElements: Matrix is empty.");
+    if (IsEmpty())
+        return 0;

     if (sizeof(ElemType) == sizeof(double))
     {

@@ -1209,12 +1203,12 @@ ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
 template <class ElemType>
 ElemType CPUSparseMatrix<ElemType>::SumOfElements() const
 {
-    if (this->IsEmpty())
-        LogicError("SumOfElements: Matrix is empty.");
+    if (IsEmpty())
+        return 0;

-    ElemType sum = 0;
+    ElemType sum = 0; // TODO: Do this in 'double'?

-    long m = (long) this->NzCount();
+    long m = (long) NzCount();
     const ElemType* nzValues = NzValues();

     //four-way unrolling

@@ -1235,6 +1229,9 @@ ElemType CPUSparseMatrix<ElemType>::SumOfElements() const
 template <typename ElemType>
 MATH_API File& operator>>(File& stream, CPUSparseMatrix<ElemType>& us)
 {
+    if (!us.OwnBuffer())
+        LogicError("Cannot read into a managed external matrix");
+
     stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
     size_t elsize;
     stream >> elsize;

@@ -1351,4 +1348,5 @@ template CPUSparseMatrix<char>::~CPUSparseMatrix();
 template CPUSparseMatrix<char> CPUSparseMatrix<char>::ColumnSlice(size_t startColumn, size_t numCols) const;
 template CPUMatrix<char> CPUSparseMatrix<char>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;
+template CPUSparseMatrix<char>& CPUSparseMatrix<char>::operator=(const CPUSparseMatrix<char>& deepCopyFrom);

-} } }
+}}}
@@ -22,21 +22,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 template <class ElemType>
 class MATH_API CPUSparseMatrix : public BaseMatrix<ElemType>
 {
-    typedef BaseMatrix<ElemType> B;
-    using B::m_elemSizeAllocated;
-    using B::m_computeDevice;
-    using B::m_externalBuffer;
-    using B::m_format;
-    using B::m_matrixName;
-    using B::m_numCols;
-    using B::m_numRows;
-    using B::m_nz;
-    using B::m_pArray; // without this, base members would require to use thi-> in GCC
-    using B::OwnBuffer;
-    using B::Clear;
+    typedef BaseMatrix<ElemType> Base;
+    using Base::m_elemSizeAllocated;
+    using Base::m_computeDevice;
+    using Base::m_externalBuffer;
+    using Base::m_format;
+    using Base::m_matrixName;
+    using Base::m_numCols;
+    using Base::m_numRows;
+    using Base::m_nz;
+    using Base::m_pArray; // without this, base members would require to use this-> in GCC
+    using Base::Clear;
+    using Base::NzCount;

 public:
-    using B::SetMatrixName;
+    using Base::OwnBuffer;
+    using Base::IsEmpty;
+    using Base::SetMatrixName;

 private:
     void ZeroInit();
@@ -53,8 +55,8 @@ public:
     ~CPUSparseMatrix();

 public:
-    using B::GetNumCols;
-    using B::GetNumRows;
+    using Base::GetNumCols;
+    using Base::GetNumRows;

     void SetValue(const size_t row, const size_t col, ElemType val);
     void SetValue(const CPUSparseMatrix<ElemType>& /*val*/);

@@ -249,6 +251,8 @@ private:
     size_t m_blockSize;    // block size
     size_t* m_blockIds;    // block ids
     size_t m_blockIdShift; // used to get efficient slice, actual col = blockIds[j] - m_blockIdShift
+
+    CPUSparseMatrix* m_sliceOf; // if this is a slice, then this points to the owning matrix object that we sliced from
 };

 typedef CPUSparseMatrix<float> CPUSingleSparseMatrix;
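The new m_sliceOf member records the parent of a slice but, per the commit message, is "not used yet". A hypothetical future use, sketched only to show the intent (m_sliceOf is private, so this would need friend access or an accessor):

// e.g. a safety check before an operation that would invalidate live views:
// if (m_sliceOf != nullptr)
//     LogicError("cannot modify: this matrix is a view into another matrix");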
@@ -158,7 +158,7 @@ enum ElementWiseOperator
     Macro(ElementwiseProductWithLinearRectifierDerivativeFromOutput); \
     Macro(ElementwiseProductWithLogDerivativeFromOutput);             \
     Macro(ElementwiseProductWithCosDerivative);                       \
     //Macro(Index);

 #define ForAllTernaryOps(Macro) \
     Macro(Cond);                \
@@ -170,6 +170,7 @@ enum ElementWiseOperator

 enum MatrixFlagBitPosition
 {
+    // TODO: remove all formats that are actually not supported
     bitPosRowMajor = 0,   // row major matrix
     bitPosSparse = 1,     // sparse matrix (COO if uncompressed)
     bitPosCompressed = 2, // a compressed sparse format (CSC/CSR)

@@ -179,6 +180,7 @@ enum MatrixFlagBitPosition

 enum MatrixFormat
 {
+    // TODO: remove all formats that are actually not supported
     matrixFormatDense = 0,                      // default is dense
     matrixFormatColMajor = 0,                   // default is column major
     matrixFormatRowMajor = 1 << bitPosRowMajor, // row major matrix
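The concrete formats are composed from these bit positions; an illustration consistent with the positions shown above (the numeric values are assumptions, not quoted from the diff):

// matrixFormatSparse     == 1 << bitPosSparse     == 2
// matrixFormatCompressed == 1 << bitPosCompressed == 4
// A CSC matrix is sparse + compressed + column-major, i.e. presumably
// matrixFormatSparseCSC  == matrixFormatSparse | matrixFormatCompressed | matrixFormatColMajor
//                        == 6, since matrixFormatColMajor == 0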
@@ -314,17 +316,46 @@ protected:
         m_matrixName = nullptr;
     }

+    void ZeroInit()
+    {
+        m_numRows = 0;
+        m_numCols = 0;
+        m_elemSizeAllocated = 0;
+        m_sliceViewOffset = 0;
+        m_externalBuffer = false;
+        m_pArray = nullptr;
+        m_nz = 0;
+        m_matrixName = nullptr;
+    }
+
+    // copy all metadata (but not the content that pArray points to)
+    void ShallowCopyFrom(const BaseMatrix& other)
+    {
+        m_format = other.m_format;
+        m_computeDevice = other.m_computeDevice;
+
+        m_numRows = other.m_numRows;
+        m_numCols = other.m_numCols;
+        m_elemSizeAllocated = other.m_elemSizeAllocated;
+        m_sliceViewOffset = other.m_sliceViewOffset;
+        m_externalBuffer = other.m_externalBuffer;
+        m_pArray = other.m_pArray;
+        m_nz = other.m_nz;
+        m_matrixName = other.m_matrixName;
+    }
+
 protected:
+    MatrixFormat m_format;
+    mutable DEVICEID_TYPE m_computeDevice; // current GPU device Id or CPUDEVICE
+
     size_t m_numRows;
     size_t m_numCols;
     size_t m_elemSizeAllocated;
-    size_t m_sliceViewOffset; // this is used to get a column slice view of a matrix in the Sparse CSC format
-    MatrixFormat m_format;
+    size_t m_sliceViewOffset; // this is used to get a column slice view of a matrix in the Sparse CSC format --TODO: move to sparse matrix implementations? Or common sparse base class?
     bool m_externalBuffer; // is the buffer used by this matrix,
     ElemType* m_pArray;
-    mutable DEVICEID_TYPE m_computeDevice; // current GPU device Id or CPUDEVICE
     size_t m_nz;           // Number of non-zero elements for sparse matrices (unused in other formats)
     wchar_t* m_matrixName;  // TODO: Use std::wstring?
 };

-} } }
+}}}
@@ -579,7 +579,7 @@ static void CudaCall(ERRTYPE retCode, const char* exprString, const char* libName, ...)

 class SyncGuard
 {
-    bool DoSync()
+    static bool DoSync()
     {
 #ifdef NO_SYNC // this strange way of writing it allows modifying this variable at runtime in the debugger
         static bool do_sync = false;
@@ -588,27 +588,30 @@ class SyncGuard
 #endif
         return do_sync;
     }
-    cudaEvent_t done;
+    cudaEvent_t m_done;

 public:
     SyncGuard()
     {
-        done = nullptr;
+        m_done = nullptr;
         if (DoSync())
-            CUDA_CALL(cudaEventCreate(&done));
+            CUDA_CALL(cudaEventCreate(&m_done));
     }
     ~SyncGuard()
     {
         if (DoSync())
         {
-            try
-            {
-                CUDA_CALL(cudaEventRecord(done));
-                CUDA_CALL(cudaEventSynchronize(done));
-                CUDA_CALL(cudaEventDestroy(done));
-            }
-            catch (const std::exception& e) // can't throw in destructors!
-            {
-                std::cerr << "SyncGuard: Destructor swallowing CUDA failure: " << e.what() << std::endl;
-            }
+            // The regular use of this destructor is to synchronize the GPU, but also
+            // to check for errors. So this destructor is where CUDA errors would be thrown.
+            // If this destructor runs during stack unwinding, then a different error has
+            // already happened that should be reported; so we only clean up the resource.
+            if (std::uncaught_exception())
+                cudaEventDestroy(m_done);
+            else
+            {
+                // failures in a prior launch might be reported here
+                CUDA_CALL(cudaEventRecord(m_done));
+                CUDA_CALL(cudaEventSynchronize(m_done));
+                CUDA_CALL(cudaEventDestroy(m_done));
+            }
         }
     }
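This hunk is the "~SyncGuard() should not swallow errors" fix from the commit message: instead of catching and logging, the destructor now lets CUDA_CALL throw, except while the stack is already unwinding from another exception. A standalone sketch of the pattern (no CUDA; std::uncaught_exception() is the pre-C++17 spelling used in the diff):

#include <exception>
#include <iostream>
#include <stdexcept>

struct Guard
{
    ~Guard() noexcept(false) // destructors are noexcept by default; opt out explicitly
    {
        if (std::uncaught_exception())
            std::cerr << "unwinding from another error: clean up only, don't throw\n";
        else
            throw std::runtime_error("deferred failure surfaces here"); // like CUDA_CALL above
    }
};

int main()
{
    try { Guard g; } // destructor throws; the caller sees the deferred error
    catch (const std::exception& e) { std::cerr << e.what() << '\n'; }
    return 0;
}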
@@ -52,6 +52,10 @@ GPUSPARSE_INDEX_TYPE GPUSparseMatrix<ElemType>::SecondaryIndexValueAt(size_t idx) const
     return value;
 }

+//-------------------------------------------------------------------------
+// construction and conversion
+//-------------------------------------------------------------------------
+
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::ZeroInit(const MatrixFormat matrixFormat, const DEVICEID_TYPE computeDevice)
 {

@@ -59,25 +63,18 @@ void GPUSparseMatrix<ElemType>::ZeroInit(const MatrixFormat matrixFormat, const DEVICEID_TYPE computeDevice)
         matrixFormat != MatrixFormat::matrixFormatSparseBlockCol && matrixFormat != MatrixFormat::matrixFormatSparseBlockRow)
     {
         LogicError("GPUSparseMatrix: unsupported sparse matrix format");
+        // BUGBUG: Then why even define others?
     }
+    Base::ZeroInit();
+    m_computeDevice = computeDevice; // current GPU device Id
+    m_format = matrixFormat;

-    m_computeDevice = computeDevice; // current GPU device Id
-    m_numRows = 0;
-    m_numCols = 0;
-    m_elemSizeAllocated = m_nz = 0; // Number of non-zero elements
+    m_sliceOf = nullptr;
     m_totalBufferSizeAllocated = 0;
-    m_sliceViewOffset = 0;
-    m_format = matrixFormat;
-    m_externalBuffer = false;
-    m_pArray = nullptr;
-    m_matrixName = nullptr;
-
-    m_blockSize = 0;
-
-    m_rowToId = nullptr;
-
-    m_tempHostBuffer = nullptr;
-    m_tempHostBufferSize = 0;
+    m_blockSize = 0;
+    m_rowToId = nullptr;
+    m_tempHostBuffer = nullptr;
+    m_tempHostBufferSize = 0;
 }

 template <class ElemType>
@@ -123,13 +120,15 @@ DEVICEID_TYPE GPUSparseMatrix<ElemType>::PrepareDevice(DEVICEID_TYPE deviceId /*= -1*/) const
 }

 template <class ElemType>
-void GPUSparseMatrix<ElemType>::DeepCopy(const GPUSparseMatrix<ElemType>& deepCopy)
+/*private*/ void GPUSparseMatrix<ElemType>::DeepCopy(const GPUSparseMatrix<ElemType>& deepCopy)
 {
     ChangeDeviceTo(deepCopy.m_computeDevice);
     deepCopy.PrepareDevice();

     Resize(deepCopy.m_numRows, deepCopy.m_numCols, deepCopy.GetNumNZElements(), deepCopy.m_format, true, false);
-    m_nz = deepCopy.m_nz;
+    m_externalBuffer = false;
+    m_sliceOf = nullptr;
+
+    m_nz = deepCopy.m_nz;
     m_sliceViewOffset = 0; // reset to zero as we only start copying the indices starting from the offset in the source matrix

     CUDA_CALL(cudaMemcpy(BufferPointer(), deepCopy.NzValues(), NzSize(), cudaMemcpyDeviceToDevice));

@@ -146,7 +145,6 @@ void GPUSparseMatrix<ElemType>::DeepCopy(const GPUSparseMatrix<ElemType>& deepCopy)
         GetNumNZElements());
     }

-    m_externalBuffer = false;
     SetMatrixName(deepCopy.m_matrixName);

     // TODO: to copy other variables used only for class based LM

@@ -156,7 +154,7 @@ template <class ElemType>
 void GPUSparseMatrix<ElemType>::SetValue(const GPUSparseMatrix<ElemType>& deepCopy)
 {
     if (!OwnBuffer())
-        LogicError("Cannot SetValue on Managed external matrix");
+        LogicError("Cannot SetValue on managed external matrix");

     DeepCopy(deepCopy);
 }

@@ -190,6 +188,9 @@ void GPUSparseMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& deepCopy)
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::CopyToCPUSparseMatrix(CPUSparseMatrix<ElemType>& cpuSparseMatrix) const
 {
+    if (!cpuSparseMatrix.OwnBuffer())
+        LogicError("Cannot CopyToCPUSparseMatrix on managed external matrix");
+
     cpuSparseMatrix.SetFormat(GetFormat());
     if (IsEmpty())
     {

@@ -307,6 +308,9 @@ void GPUSparseMatrix<ElemType>::CopyToDenseMatrix(GPUMatrix<ElemType>& denseMatrix) const
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::ConvertToSparseFormat(MatrixFormat newFormat, GPUSparseMatrix<ElemType>& outMatrix) const
 {
+    if (!outMatrix.OwnBuffer())
+        LogicError("Cannot ConvertToSparseFormat to managed external matrix");
+
     if (IsEmpty())
     {
         outMatrix.ZeroInit(newFormat, GetComputeDeviceId());
@@ -528,26 +532,15 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator=(const GPUSparseMatrix<ElemType>& deepCopyFrom)
 template <class ElemType>
 GPUSparseMatrix<ElemType>::GPUSparseMatrix(GPUSparseMatrix<ElemType>&& moveFrom)
 {
-    m_computeDevice = moveFrom.m_computeDevice;
-    m_numRows = moveFrom.m_numRows;
-    m_numCols = moveFrom.m_numCols;
-    m_nz = moveFrom.m_nz;
-    m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
+    Base::ShallowCopyFrom(moveFrom);
+    // TODO: implement this using operator= or a shared function
     m_totalBufferSizeAllocated = moveFrom.m_totalBufferSizeAllocated;
-    m_pArray = moveFrom.m_pArray;
-    m_sliceViewOffset = moveFrom.m_sliceViewOffset;
-    m_format = moveFrom.m_format;
-    m_externalBuffer = moveFrom.m_externalBuffer;
-    m_matrixName = moveFrom.m_matrixName;
-
-    m_blockSize = moveFrom.m_blockSize;
-
-    m_rowToId = moveFrom.m_rowToId;
-
-    m_tempHostBuffer = moveFrom.m_tempHostBuffer;
-    m_tempHostBufferSize = moveFrom.m_tempHostBufferSize;
-
-    moveFrom.ZeroInit(moveFrom.m_format, moveFrom.m_computeDevice); // so that memory in moveFrom is not freeed
+    m_sliceOf = moveFrom.m_sliceOf;
+    m_blockSize = moveFrom.m_blockSize;
+    m_rowToId = moveFrom.m_rowToId;
+    m_tempHostBuffer = moveFrom.m_tempHostBuffer;
+    m_tempHostBufferSize = moveFrom.m_tempHostBufferSize;
+    moveFrom.ZeroInit(moveFrom.m_format, moveFrom.m_computeDevice); // so that memory in moveFrom is not freed
 }

 template <class ElemType>

@@ -557,26 +550,13 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator=(GPUSparseMatrix<ElemType>&& moveFrom)
 {
     if (OwnBuffer())
         ReleaseMemory(); // always delete the data pointer since we will use the pointer from moveFrom
-    m_computeDevice = moveFrom.m_computeDevice;
-    m_numRows = moveFrom.m_numRows;
-    m_numCols = moveFrom.m_numCols;
-    m_nz = moveFrom.m_nz;
-    m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
+    Base::ShallowCopyFrom(moveFrom);
     m_totalBufferSizeAllocated = moveFrom.m_totalBufferSizeAllocated;
-    m_pArray = moveFrom.m_pArray;
-    m_sliceViewOffset = moveFrom.m_sliceViewOffset;
-    m_format = moveFrom.m_format;
-    m_externalBuffer = moveFrom.m_externalBuffer;
-
-    m_matrixName = moveFrom.m_matrixName;
-
-    m_blockSize = moveFrom.m_blockSize;
-
-    m_rowToId = moveFrom.m_rowToId;
-
-    m_tempHostBuffer = moveFrom.m_tempHostBuffer;
-    m_tempHostBufferSize = moveFrom.m_tempHostBufferSize;
-
+    m_sliceOf = moveFrom.m_sliceOf;
+    m_blockSize = moveFrom.m_blockSize;
+    m_rowToId = moveFrom.m_rowToId;
+    m_tempHostBuffer = moveFrom.m_tempHostBuffer;
+    m_tempHostBufferSize = moveFrom.m_tempHostBufferSize;
     moveFrom.ZeroInit(moveFrom.m_format, moveFrom.m_computeDevice);
 }
@@ -590,11 +570,10 @@ GPUSparseMatrix<ElemType>::~GPUSparseMatrix()
 }

 template <class ElemType>
-void GPUSparseMatrix<ElemType>::ReleaseMemory()
+/*private*/ void GPUSparseMatrix<ElemType>::ReleaseMemory()
 {
-    // If OwnBuffer() is false then this matrix
-    // is simply a view over another matrix. In that
-    // case we shouldn't free anything.
+    // If OwnBuffer() is false then this matrix is a view over another matrix.
+    // In that case we shouldn't free anything.
     if (OwnBuffer())
     {
         delete[] m_matrixName;

@@ -609,7 +588,6 @@ void GPUSparseMatrix<ElemType>::ReleaseMemory()
         TracingGPUMemoryAllocator::Free<GPUSPARSE_INDEX_TYPE>(m_computeDevice, m_rowToId);
         m_rowToId = nullptr;
     }
-
     ZeroInit(m_format, m_computeDevice);
 }
@@ -627,17 +605,18 @@ void GPUSparseMatrix<ElemType>::ResizeAsAndCopyIndexFrom(const GPUSparseMatrix<ElemType>& a)
 }

 //-------------------------------------------------------------------------
-// Start of new GPU Sparse Matrix code
+// main operations
 //-------------------------------------------------------------------------

 template <class ElemType>
 void GPUSparseMatrix<ElemType>::Reshape(const size_t numRows, const size_t numCols)
 {
-    if (!OwnBuffer())
-        LogicError("GPUSparseMatrix::Reshape: Cannot Reshape since the buffer is managed externally.");
-
     if (m_numRows == numRows && m_numCols == numCols)
         return;

+    if (!OwnBuffer())
+        LogicError("GPUSparseMatrix::Reshape: Cannot Reshape since the buffer is managed externally.");
+
     if (m_format != MatrixFormat::matrixFormatSparseCSC)
         NOT_IMPLEMENTED;
@@ -688,8 +667,8 @@ void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly, bool keepExistingValues)
     Resize(numRows, numCols, numNZElemToReserve, GetFormat(), growOnly, keepExistingValues);
 }

-//WARNING: When memory is reallocated existing information will be lost, workaround is to allocte enough memory from start.
-//TODO: add keepExistingValues (default to true) argument so that the existing values are kept even after reallocation
+// WARNING: When memory is reallocated, existing information will be lost.
+// TODO: add keepExistingValues (default to true) argument so that the existing values are kept even after reallocation
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly /*= true*/, bool keepExistingValues /*=true*/)
 {

@@ -749,13 +728,17 @@ void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly, bool keepExistingValues)
     m_format = matrixFormat;
 }

-//Reset matrix so it can be reused
+// Reset matrix to 0.
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::Reset()
 {
+    if (!OwnBuffer())
+        LogicError("Cannot Reset since the buffer is managed externally.");
+
     m_nz = 0;
     m_blockSize = 0;
 }

 // copy features to GPU
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::SetMatrixFromCSRFormat(const GPUSPARSE_INDEX_TYPE* h_CSRRow, const GPUSPARSE_INDEX_TYPE* h_Col, const ElemType* h_Val,
@@ -966,9 +949,11 @@ void GPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA,
     }
 }

+// dense X sparse = dense
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA,
-                                                       const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
+                                                       const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta,
+                                                       GPUMatrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
 {
     if (lhs.GetComputeDeviceId() != rhs.GetComputeDeviceId() || (lhs.GetComputeDeviceId() != c.GetComputeDeviceId()))
         RuntimeError("GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd: All matrices must be on the same GPU");

@@ -1222,7 +1207,7 @@ void GPUSparseMatrix<ElemType>::MultiplyAndAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA,
     }
 }

-//find the rows of rhs with values
+// find the rows of rhs with values
 template <class ElemType>
 size_t GPUSparseMatrix<ElemType>::IdentifyRowsWithValues() const
 {

@@ -1404,11 +1389,7 @@ ElemType GPUSparseMatrix<ElemType>::Adagrad(GPUMatrix<ElemType>& c, const bool needAveMultiplier)
     }
 }

-//-------------------------------------------------------------------------
-// End of new GPU Sparse Matrix code
-//-------------------------------------------------------------------------
-
-//sparse X dense = dense
+// sparse X dense = dense
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPUSparseMatrix<ElemType>& a, const bool transposeA,
                                                        const GPUMatrix<ElemType>& b, const bool transposeD, ElemType beta, GPUMatrix<ElemType>& c)

@@ -1758,6 +1739,7 @@ void GPUSparseMatrix<ElemType>::ElementWisePower(ElemType alpha, const GPUSparseMatrix<ElemType>& a, GPUSparseMatrix<ElemType>& c)
     }
 }

+// sparse x dense = scalar
 template <class ElemType>
 ElemType GPUSparseMatrix<ElemType>::InnerProductOfMatrices(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
 {
@@ -1878,8 +1860,8 @@ bool GPUSparseMatrix<ElemType>::IsValid() const
 }

 template <class ElemType>
-bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
-                                         const ElemType threshold)
+/*static*/ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
+                                                    const ElemType threshold)
 {
     if (a.GetNumNZElements() != b.GetNumNZElements() || a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
         return false;

@@ -1908,8 +1890,8 @@ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b, const ElemType threshold)
 }

 template <class ElemType>
-bool GPUSparseMatrix<ElemType>::AreEqual(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
-                                         const ElemType threshold)
+/*static*/ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
+                                                    const ElemType threshold)
 {
     if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
         return false;

@@ -1919,8 +1901,8 @@ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b, const ElemType threshold)
 }

 template <class ElemType>
-bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b,
-                                         const ElemType threshold)
+/*static*/ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b,
+                                                    const ElemType threshold)
 {
     if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
         return false;

@@ -1940,6 +1922,7 @@ bool GPUSparseMatrix<ElemType>::IsEqualTo(const GPUMatrix<ElemType>& a, const ElemType threshold) const
 {
     return AreEqual(*this, a, threshold);
 }
+
 #pragma endregion Static BLAS Functions

 #pragma region Member BLAS Functions
@@ -1958,6 +1941,7 @@ DEVICEID_TYPE GPUSparseMatrix<ElemType>::GetComputeDeviceId() const
     return m_computeDevice;
 }

+// sparse x dense = dense
 template <class ElemType>
 GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
 {

@@ -1980,6 +1964,7 @@ GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
     return c;
 }

+// sparse x dense = dense
 template <class ElemType>
 GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b)
 {

@@ -2002,6 +1987,7 @@ GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::operator-(const GPUSparseMatrix<ElemType>& a) const
     return res;
 }

+// TODO: This is an unusual use of this operator. Remove this.
 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator^=(ElemType alpha)
 {

@@ -2010,6 +1996,7 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator^=(ElemType alpha)
     return us;
 }

+// TODO: This is an unusual use of this operator. Remove this.
 template <class ElemType>
 GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::operator^(ElemType alpha) const
 {

@@ -2154,6 +2141,7 @@ GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::ColumnSlice(size_t startColumn, size_t numCols) const
     slice.m_pArray = m_pArray;
     slice.m_format = m_format;
     slice.m_externalBuffer = true;
+    slice.m_sliceOf = const_cast<GPUSparseMatrix<ElemType>*>(this); // BUGBUG: ColumnSlice() returns a reference to a mutable matrix, even if itself is 'const'; should not be.
     slice.m_matrixName = m_matrixName;
     slice.m_blockSize = m_blockSize;
     slice.m_rowToId = m_rowToId;

@@ -2231,7 +2219,7 @@ template <class ElemType>
 ElemType GPUSparseMatrix<ElemType>::SumOfAbsElements() const
 {
     if (IsEmpty())
-        LogicError("SumOfAbsElements: Matrix is empty");
+        return 0;

     cublasHandle_t cuHandle = GPUMatrix<ElemType>::GetCublasHandle(GetComputeDeviceId());
     if (sizeof(ElemType) == sizeof(float))

@@ -2264,11 +2252,12 @@ ElemType GPUSparseMatrix<ElemType>::SumOfElements() const
     return h_sum;
 }

+// sqrt(sum all elements^2)
 template <class ElemType>
 ElemType GPUSparseMatrix<ElemType>::FrobeniusNorm() const
 {
     if (IsEmpty())
-        LogicError("FrobeniusNorm: Matrix is empty.");
+        return 0;

     ElemType* d_sum = TracingGPUMemoryAllocator::Allocate<ElemType>(m_computeDevice, 1);
     ElemType h_sum = 0;

@@ -2287,7 +2276,7 @@ template <class ElemType>
 ElemType GPUSparseMatrix<ElemType>::MatrixNormInf() const
 {
     if (IsEmpty())
-        LogicError("MatrixNorm1: Matrix is empty.");
+        return 0;

     ElemType* d_maxAbs = TracingGPUMemoryAllocator::Allocate<ElemType>(m_computeDevice, 1);
     ElemType h_maxAbs = 0;

@@ -2305,8 +2294,6 @@ ElemType GPUSparseMatrix<ElemType>::MatrixNormInf() const
 template <class ElemType>
 ElemType GPUSparseMatrix<ElemType>::MatrixNorm1() const
 {
-    if (IsEmpty())
-        LogicError("MatrixNorm1: Matrix is empty.");
     return SumOfAbsElements();
 }
|
@ -2317,6 +2304,10 @@ ElemType GPUSparseMatrix<ElemType>::MatrixNorm1() const
|
|||
template <class ElemType>
|
||||
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::ElementInverse()
|
||||
{
|
||||
#if 1
|
||||
// Note: This makes no sense because sparse matrices are defined by having lots of zeroes.
|
||||
NOT_IMPLEMENTED;
|
||||
#else
|
||||
if (!OwnBuffer())
|
||||
LogicError("Cannot modify since the buffer is managed externally.");
|
||||
|
||||
|
@ -2328,29 +2319,45 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::ElementInverse()
|
|||
SyncGuard syncGuard;
|
||||
_elemInverse<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(NzValues(), N);
|
||||
return *this;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignElementInverseOf(const GPUSparseMatrix<ElemType>& a)
|
||||
{
|
||||
#if 1
|
||||
// Note: This makes no sense because sparse matrices are defined by having lots of zeroes.
|
||||
UNUSED(a); NOT_IMPLEMENTED;
|
||||
#else
|
||||
SetValue(a);
|
||||
return ElementInverse();
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceSigmoid()
|
||||
{
|
||||
#if 1
|
||||
// Note: This makes no sense because sigmoid(0) != 0.
|
||||
NOT_IMPLEMENTED;
|
||||
#else
|
||||
performElementWiseFunction(ElementWiseOperator::opSigmoid, *this);
|
||||
return *this;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignSigmoidOf(const GPUSparseMatrix<ElemType>& a)
|
||||
{
|
||||
#if 1
|
||||
// Note: This makes no sense because sigmoid(0) != 0.
|
||||
UNUSED(a); NOT_IMPLEMENTED;
|
||||
#else
|
||||
if (this != &a)
|
||||
Resize(a.GetNumRows(), a.GetNumCols());
|
||||
performElementWiseFunction(ElementWiseOperator::opSigmoid, a);
|
||||
return *this;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
|
@@ -2404,33 +2411,53 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignSqrtOf(const GPUSparseMatrix<ElemType>& a)
 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceExp()
 {
+#if 1
+    // Note: This makes no sense because exp(0) != 0.
+    NOT_IMPLEMENTED;
+#else
     performElementWiseFunction(ElementWiseOperator::opExp, *this);
     return *this;
+#endif
 }

 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignExpOf(const GPUSparseMatrix<ElemType>& a)
 {
+#if 1
+    // Note: This makes no sense because exp(0) != 0.
+    UNUSED(a); NOT_IMPLEMENTED;
+#else
     if (this != &a)
         Resize(a.GetNumRows(), a.GetNumCols());
     performElementWiseFunction(ElementWiseOperator::opExp, a);
     return *this;
+#endif
 }

 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceLog()
 {
+#if 1
+    // Note: This makes no sense because log(0) != 0.
+    NOT_IMPLEMENTED;
+#else
     performElementWiseFunction(ElementWiseOperator::opLog, *this);
     return *this;
+#endif
 }

 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignLogOf(const GPUSparseMatrix<ElemType>& a)
 {
+#if 1
+    // Note: This makes no sense because log(0) != 0.
+    UNUSED(a); NOT_IMPLEMENTED;
+#else
     if (this != &a)
         Resize(a.GetNumRows(), a.GetNumCols());
     performElementWiseFunction(ElementWiseOperator::opLog, a);
     return *this;
+#endif
 }

 template <class ElemType>
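These #if 1 blocks implement the "disabled meaningless elementwise sparse ops" part of the commit message: a sparse container stores only the non-zeros, but exp, log and sigmoid map the implicit zeros to non-zero values, so the true result would be dense. A tiny standalone illustration:

#include <cmath>
#include <cstdio>

int main()
{
    std::printf("exp(0)     = %f\n", std::exp(0.0));                // 1.0, not 0
    std::printf("sigmoid(0) = %f\n", 1.0 / (1.0 + std::exp(-0.0))); // 0.5, not 0
    std::printf("log(0)     = %f\n", std::log(0.0));                // -inf, not 0
    return 0;
}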
@@ -2449,6 +2476,7 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignAbsOf(const GPUSparseMatrix<ElemType>& a)
     return *this;
 }

+// TODO: Check whether these functions always map 0 to 0.
 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceTruncateBottom(const ElemType threshold)
 {

@@ -2543,7 +2571,7 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::SetToZeroIfAbsLessThan(const ElemType threshold)
 //outBuffer should be allocated to be >= size by the caller
 template <class ElemType>
 template <class OutType, class InType>
-void GPUSparseMatrix<ElemType>::CopyBuffer(OutType* outBuffer, const InType* inBuffer, const size_t size)
+/*private*/ void GPUSparseMatrix<ElemType>::CopyBuffer(OutType* outBuffer, const InType* inBuffer, const size_t size)
 {
 #pragma omp parallel for
     for (size_t i = 0; i < (size & ~3); i += 4)

@@ -2629,6 +2657,9 @@ template GPUSparseMatrix<char>& GPUSparseMatrix<char>::operator=(GPUSparseMatrix<char>&& moveFrom);
 template <class ElemType>
 MATH_API File& operator>>(File& stream, GPUSparseMatrix<ElemType>& us)
 {
+    if (!us.OwnBuffer())
+        LogicError("Cannot read into a managed external matrix");
+
     stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
     size_t elsize;
     stream >> elsize;
@@ -23,29 +23,30 @@ template <class ElemType>
 class MATH_API GPUSparseMatrix : public BaseMatrix<ElemType>
 {
 public:
-    typedef BaseMatrix<ElemType> B;
-    using B::m_numRows;
-    using B::m_numCols;
-    using B::m_pArray;
-    using B::m_elemSizeAllocated;
-    using B::m_sliceViewOffset;
-    using B::m_nz;
-    using B::m_format;
-    using B::m_computeDevice;
-    using B::m_externalBuffer;
-    using B::m_matrixName;
-    using B::OwnBuffer;
-    using B::GetFormat;
-    using B::SetFormat;
-    using B::GetNumRows;
-    using B::GetNumCols;
-    using B::IsEmpty;
-    using B::SetComputeDeviceId;
-    using B::SetMatrixName;
-    using B::SetNzCount;
-    using B::Clear;
+    typedef BaseMatrix<ElemType> Base;
+    using Base::m_numRows;
+    using Base::m_numCols;
+    using Base::m_pArray;
+    using Base::m_elemSizeAllocated;
+    using Base::m_sliceViewOffset;
+    using Base::m_nz;
+    using Base::m_format;
+    using Base::m_computeDevice;
+    using Base::m_externalBuffer;
+    using Base::m_matrixName;
+    using Base::OwnBuffer;
+    using Base::GetFormat;
+    using Base::SetFormat;
+    using Base::GetNumRows;
+    using Base::GetNumCols;
+    using Base::SetComputeDeviceId;
+    using Base::SetMatrixName;
+    using Base::SetNzCount;
+    using Base::Clear;
+    // without this, base members would require to use this-> in GCC
+
+public:
+    using Base::IsEmpty;

     GPUSparseMatrix(const size_t numRows, const size_t numCols, const size_t numNZ, DEVICEID_TYPE computeDevice, const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR);

     explicit GPUSparseMatrix(DEVICEID_TYPE computeDevice, const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR);
@@ -390,6 +391,8 @@ private:

     mutable void* m_tempHostBuffer; // used to copy values.
     mutable size_t m_tempHostBufferSize;
+
+    GPUSparseMatrix* m_sliceOf; // if this is a slice, then this points to the owning matrix object that we sliced from
 };

 }}}
@@ -169,7 +169,7 @@ public:
     // REVIEW alexeyk: GPU version copies from device to host only, implement all versions (device <-> host).
     void CopySection(size_t numRows, size_t numCols, ElemType* dst, size_t colStride) const;

-    Matrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const;
+    Matrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const; // note: 'const' is misleading here, as the returned matrix is a mutable reference

     // difference between AssignColumnSlice and SetColumnSlice
     // AssignColumnSlice : this(:, startColumn:startColumn+numCols-1) = fromMatrix(:, startColumn: startColumn+numCols-1)
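The new comment flags a real sharp edge: the returned slice aliases the parent's buffer, so a 'const' matrix can be mutated through it. An illustration (hypothetical usage; the exact constructor and setter signatures are assumed):

// Matrix<float> m = ...;
// const Matrix<float>& cm = m;
// Matrix<float> s = cm.ColumnSlice(0, 1); // compiles: ColumnSlice() is declared const
// s.SetValue(42);                         // ...but this writes into m's storage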
@@ -4,7 +4,7 @@ DataDir = $RootDir$
 # deviceId = -1 for CPU, >= 0 for GPU devices
 deviceId = -1

-precision = "float"
+precision = "double"

 Simple_Test = [
     reader = [