From 7793b7be939b79fc8bae13f32d951f8fd62c9bb8 Mon Sep 17 00:00:00 2001
From: Frank Seide <fseide@microsoft.com>
Date: Tue, 1 Mar 2016 10:43:50 -0800
Subject: [PATCH] some clean-up of sparse matrices; disabled meaningless
 elementwise sparse ops such as log(); sparse matrice slices now keep track of
 their original matrix (not used yet); bug fix: ~SyncGuard() should not
 swallow errors since CUDA errors may get discovered here; bug fix:
 HTKMLFReader test 16 must use consistent precision

---
 Source/Math/CPUSparseMatrix.cpp               | 216 +++++++++---------
 Source/Math/CPUSparseMatrix.h                 |  34 +--
 Source/Math/CommonMatrix.h                    |  41 +++-
 Source/Math/GPUMatrix.h                       |  27 ++-
 Source/Math/GPUSparseMatrix.cu                | 209 +++++++++--------
 Source/Math/GPUSparseMatrix.h                 |  45 ++--
 Source/Math/Matrix.h                          |   2 +-
 .../HTKMLFReaderSimpleDataLoop16_Config.cntk  |   2 +-
 8 files changed, 323 insertions(+), 253 deletions(-)
diff --git a/Source/Math/CPUSparseMatrix.cpp b/Source/Math/CPUSparseMatrix.cpp
index f8d00c25a..627efb0f1 100644
--- a/Source/Math/CPUSparseMatrix.cpp
+++ b/Source/Math/CPUSparseMatrix.cpp
@@ -59,16 +59,13 @@
 #define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
 #endif
 
-#define SWAP(a, b)  \
-    {               \
-        (a) ^= (b); \
-        (b) ^= (a); \
-        (a) ^= (b); \
-    }
+// TODO: Move to CommonMatrix.h
 #define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing
+
 namespace Microsoft { namespace MSR { namespace CNTK {
 
 #pragma region Helpful Enum Definitions
+
 enum class MatrixOrder
 {
     RowMajor = 101, // row-major arrays
@@ -77,56 +74,55 @@ enum class MatrixOrder
 
 enum class MatrixTranspose : char
 {
-    NoTrans = 'N',  // trans='N'
-    Trans = 'T',    // trans='T'
-    ConjTrans = 'C' // trans='C'
+    NoTrans   = 'N', // trans='N'
+    Trans     = 'T', // trans='T'
+    ConjTrans = 'C'  // trans='C'
 };
 
 enum class SymMatrixType : char
 {
-    Up = 'U',          // symmetric matrix is stored in the upper part
-    Low = 'L',         // symmetric matrix is stored in thelower part
-    Full = 'F',        // full populated
-    NotSymmetric = 'N' // not a symmetric matrix
+    Up           = 'U', // symmetric matrix is stored in the upper part
+    Low          = 'L', // symmetric matrix is stored in thelower part
+    Full         = 'F', // full populated
+    NotSymmetric = 'N'  // not a symmetric matrix
 };
 
 enum class MatrixOpSide : char
 {
-    Left = 'L',  // left multiply
+    Left  = 'L', // left multiply
     Right = 'R', // right multiply
 };
+
 #pragma endregion Helpful Enum Definitions
 
 #pragma region Constructors and Destructor
 
-//should only be used by constructors.
-template <class ElemType>
-void CPUSparseMatrix<ElemType>::ZeroInit()
-{
-    m_numRows = 0;
-    m_numCols = 0;
-    m_elemSizeAllocated = 0;
-    m_compIndexSize = 0;
-    m_externalBuffer = false;
-    m_computeDevice = CPUDEVICE;
-    m_nz = 0;
-    m_matrixName = NULL;
+//-------------------------------------------------------------------------
+// construction and conversion
+//-------------------------------------------------------------------------
 
+// should only be used by constructors.
+template <class ElemType>
+/*private*/ void CPUSparseMatrix<ElemType>::ZeroInit()
+{
+    Base::ZeroInit();
+    m_computeDevice = CPUDEVICE;
+
+    m_sliceOf       = nullptr;
+    m_compIndexSize = 0;
     // if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
     {
-        m_colIdx = -1;
-        m_pArray = NULL;
-        m_unCompIndex = NULL;
-        m_compIndex = NULL;
+        m_colIdx      = -1;
+        m_unCompIndex = nullptr;
+        m_compIndex   = nullptr;
     }
     // else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
     {
-        m_blockSize = 0;
+        m_blockSize     = 0;
         m_blockIdShift = 0;
-        m_pArray = NULL;
-        m_blockIds = NULL;
+        m_blockIds     = nullptr;
     }
-    m_nzValues = NULL;
+    m_nzValues = nullptr;
 }
 
 //should only be used by constructors.
@@ -144,19 +140,17 @@ void CPUSparseMatrix<ElemType>::CheckInit(const MatrixFormat format)
 template <class ElemType>
 CPUSparseMatrix<ElemType>::CPUSparseMatrix(const MatrixFormat format)
 {
-
     CheckInit(format);
 }
 
 template <class ElemType>
 CPUSparseMatrix<ElemType>::CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size)
 {
-
     CheckInit(format);
     Resize(numRows, numCols, size, true, false);
 }
 
-//copy constructor, deep copy
+// copy constructor, deep copy
 template <class ElemType>
 CPUSparseMatrix<ElemType>::CPUSparseMatrix(const CPUSparseMatrix<ElemType>& deepCopyFrom)
 {
@@ -166,7 +160,7 @@ CPUSparseMatrix<ElemType>::CPUSparseMatrix(const CPUSparseMatrix<ElemType>& deep
     SetMatrixName(deepCopyFrom.m_matrixName);
 }
 
-//assignment operator, deep copy
+// assignment operator, deep copy
 template <class ElemType>
 CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::operator=(const CPUSparseMatrix<ElemType>& deepCopyFrom)
 {
@@ -177,29 +171,23 @@ CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::operator=(const CPUSparseM
     return *this;
 }
 
-//move constructor, shallow copy
+// move constructor, shallow copy
 template <class ElemType>
 CPUSparseMatrix<ElemType>::CPUSparseMatrix(CPUSparseMatrix<ElemType>&& moveFrom)
 {
-    m_format = moveFrom.m_format;
-    m_numRows = moveFrom.m_numRows;
-    m_numCols = moveFrom.m_numCols;
-    m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
+    Base::ShallowCopyFrom(moveFrom);
+    // BUGBUG: This did not use to copy m_sliceViewOffset, I presume it should be copied? It is now.
+
     m_compIndexSize = moveFrom.m_compIndexSize;
-    m_externalBuffer = moveFrom.m_externalBuffer;
-    m_computeDevice = moveFrom.m_computeDevice;
-    m_nz = moveFrom.m_nz;
-    m_matrixName = moveFrom.m_matrixName;
 
-    m_colIdx = moveFrom.m_colIdx;
-    m_pArray = moveFrom.m_pArray;
-    m_nzValues = moveFrom.m_nzValues;
+    m_colIdx      = moveFrom.m_colIdx;
+    m_nzValues    = moveFrom.m_nzValues;
     m_unCompIndex = moveFrom.m_unCompIndex;
-    m_compIndex = moveFrom.m_compIndex;
+    m_compIndex   = moveFrom.m_compIndex;
 
-    m_blockSize = moveFrom.m_blockSize;
+    m_blockSize    = moveFrom.m_blockSize;
     m_blockIdShift = moveFrom.m_blockIdShift;
-    m_blockIds = moveFrom.m_blockIds;
+    m_blockIds     = moveFrom.m_blockIds;
 
     // release the pointer from the source object so that the destructor won't release it twice
     moveFrom.ZeroInit();
@@ -213,26 +201,19 @@ CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::operator=(CPUSparseMatrix<
     {
         if (OwnBuffer())
             ReleaseMemory(); // always delete the data pointer since we will use the pointer from moveFrom
+        Base::ShallowCopyFrom(moveFrom);
+        // BUGBUG: This did not use to copy m_sliceViewOffset, I presume it should be copied? It is now.
 
-        m_format = moveFrom.m_format;
-        m_numRows = moveFrom.m_numRows;
-        m_numCols = moveFrom.m_numCols;
-        m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
         m_compIndexSize = moveFrom.m_compIndexSize;
-        m_externalBuffer = moveFrom.m_externalBuffer;
-        m_computeDevice = moveFrom.m_computeDevice;
-        m_nz = moveFrom.m_nz;
-        m_matrixName = moveFrom.m_matrixName;
 
-        m_colIdx = moveFrom.m_colIdx;
-        m_pArray = moveFrom.m_pArray;
-        m_nzValues = moveFrom.m_nzValues;
+        m_colIdx      = moveFrom.m_colIdx;
+        m_nzValues    = moveFrom.m_nzValues;
         m_unCompIndex = moveFrom.m_unCompIndex;
-        m_compIndex = moveFrom.m_compIndex;
+        m_compIndex   = moveFrom.m_compIndex;
 
-        m_blockSize = moveFrom.m_blockSize;
+        m_blockSize    = moveFrom.m_blockSize;
         m_blockIdShift = moveFrom.m_blockIdShift;
-        m_blockIds = moveFrom.m_blockIds;
+        m_blockIds     = moveFrom.m_blockIds;
 
         // release the pointer from the source object so that the destructor won't release it twice
         moveFrom.ZeroInit();
@@ -249,9 +230,8 @@ CPUSparseMatrix<ElemType>::~CPUSparseMatrix()
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::ReleaseMemory()
 {
-    // If m_externalBuffer is true then this matrix
-    // is simply a view over another matrix. In that
-    // case we shouldn't free anything.
+    // If m_externalBuffer is true then this matrix is simply a view over another matrix.
+    // In that case we shouldn't free anything.
     if (!m_externalBuffer)
     {
         delete[] m_matrixName;
@@ -335,24 +315,26 @@ void CPUSparseMatrix<ElemType>::SetValue(const size_t row, const size_t col, con
     m_nz++;
 }
 
-//make sure call order in colume wise for CSC and row wise for CSR
+// make sure call order in colume wise for CSC and row wise for CSR
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& v)
 {
-    if (!OwnBuffer())
+    if (!OwnBuffer()) // TODO: GPU version allows to overwrite a view with a fresh non-view
         LogicError("Cannot modify since the buffer is managed externally.");
 
-    this->Reset();
-    m_format = v.GetFormat();
+    Reset();
+    m_format         = v.GetFormat();
+    m_externalBuffer = false;
+    m_sliceOf        = nullptr;
 
-    this->Resize(v.GetNumRows(), v.GetNumCols(), v.NzSize());
+    Resize(v.GetNumRows(), v.GetNumCols(), v.NzSize());
     m_nz = v.NzCount();
 
     if (m_nz > 0)
     {
-        memcpy(this->NzValues(), v.NzValues(), v.NzSize());
-        memcpy(this->RowLocation(), v.RowLocation(), v.RowSize());
-        memcpy(this->ColLocation(), v.ColLocation(), v.ColSize());
+        memcpy(NzValues(),    v.NzValues(),    v.NzSize());
+        memcpy(RowLocation(), v.RowLocation(), v.RowSize());
+        memcpy(ColLocation(), v.ColLocation(), v.ColSize());
     }
 }
 
@@ -402,17 +384,22 @@ CPUSparseMatrix<ElemType> CPUSparseMatrix<ElemType>::ColumnSlice(size_t startCol
     CPUSparseMatrix<ElemType> slice(m_format);
     slice.m_numRows = m_numRows;
     slice.m_numCols = numCols;
+    // BUGBUG: m_matrixName?
+    // BUGBUG: m_sliceViewOffset?
+    slice.m_externalBuffer    = true;
+    slice.m_sliceOf           = const_cast<CPUSparseMatrix<ElemType>*>(this); // BUGBUG: ColumnSlice() returns a reference to a mutable matrix, even if itself is 'const'; should not be.
 
     if (m_format == MatrixFormat::matrixFormatSparseCSC)
     {
-        slice.m_pArray = m_pArray;
-        slice.m_nzValues = m_pArray + m_compIndex[startColumn]; // note: m_compIndex is always against  m_pArray
-        slice.m_unCompIndex = m_unCompIndex;
-        slice.m_compIndex = m_compIndex + startColumn; // Just shift the compressed index location to the new startColumn - that's it!
-        slice.m_externalBuffer = true;
-        slice.m_nz = m_compIndex[startColumn + numCols] - m_compIndex[startColumn];
+        slice.m_pArray            = m_pArray;
+
+        slice.m_nzValues          = m_pArray + m_compIndex[startColumn]; // note: m_compIndex is always against  m_pArray
+        slice.m_unCompIndex       = m_unCompIndex;
+        slice.m_compIndex         = m_compIndex + startColumn; // Just shift the compressed index location to the new startColumn - that's it!
+        slice.m_compIndexSize     = numCols + 1;
+
+        slice.m_nz                = m_compIndex[startColumn + numCols] - m_compIndex[startColumn];
         slice.m_elemSizeAllocated = slice.m_nz;
-        slice.m_compIndexSize = numCols + 1;
     }
     else if (m_format == MatrixFormat::matrixFormatSparseBlockCol)
     {
@@ -446,13 +433,15 @@ CPUSparseMatrix<ElemType> CPUSparseMatrix<ElemType>::ColumnSlice(size_t startCol
             endColBlock = (long long) m_blockSize;
         }
 
-        slice.m_pArray = m_pArray + startColBlock * m_numRows;
-        slice.m_nzValues = slice.m_pArray;
-        slice.m_blockIds = m_blockIds + startColBlock; // the value stored in the block id is based on the original column numbers
-        slice.m_blockSize = (size_t) max((long long) 0, endColBlock - startColBlock);
+        // BUGBUG: m_elemSizeAllocated?
+        slice.m_pArray       = m_pArray + startColBlock * m_numRows;
+
+        slice.m_nzValues     = slice.m_pArray;
+        slice.m_blockIds     = m_blockIds + startColBlock; // the value stored in the block id is based on the original column numbers
+        slice.m_blockSize    = (size_t) max((long long) 0, endColBlock - startColBlock);
         slice.m_blockIdShift = m_blockIdShift + startColumn;
-        slice.m_externalBuffer = true;
-        slice.m_nz = slice.m_blockSize * m_numRows;
+
+        slice.m_nz           = slice.m_blockSize * m_numRows;
     }
 
     return slice;
@@ -461,9 +450,6 @@ CPUSparseMatrix<ElemType> CPUSparseMatrix<ElemType>::ColumnSlice(size_t startCol
 template <class ElemType>
 CPUMatrix<ElemType> CPUSparseMatrix<ElemType>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const
 {
-    // if (numCols == 0)
-    //    LogicError("The slice cannot have 0 columns.");
-
     if (startColumn + numCols > m_numCols)
         InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int) startColumn, (int) numCols, (int) m_numCols);
 
@@ -614,17 +600,21 @@ void CPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCol
     }
 }
 
-//Reset matrix so it can be reused
+// Reset matrix to 0.
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::Reset()
 {
+    if (!OwnBuffer())
+        LogicError("Cannot Reset since the buffer is managed externally.");
+
     m_nz = 0;
     m_colIdx = -1;
     m_blockSize = 0;
     m_blockIdShift = 0;
 }
 
-//c = alpha*op(lhs) * op(rhs) + beta*c
+// c = alpha*op(lhs) * op(rhs) + beta*c
+// dense x sparse = dense
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix<ElemType>& lhs, const bool transposeA,
                                                        const CPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, CPUMatrix<ElemType>& c)
@@ -711,7 +701,8 @@ void CPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPU
     }
 }
 
-//c = alpha * op(lhs) * op(rhs)
+// dense x sparse = sparse
+// c = alpha * op(lhs) * op(rhs)
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::MultiplyAndAdd(ElemType alpha, const CPUMatrix<ElemType>& lhs, const bool transposeA,
                                                const CPUSparseMatrix<ElemType>& rhs, const bool transposeB, CPUSparseMatrix<ElemType>& c)
@@ -807,6 +798,7 @@ void CPUSparseMatrix<ElemType>::MultiplyAndAdd(ElemType alpha, const CPUMatrix<E
     }
 }
 
+// dense += sparse
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix<ElemType>& lhs, CPUMatrix<ElemType>& rhs)
 {
@@ -861,7 +853,7 @@ void CPUSparseMatrix<ElemType>::ScaleAndAdd(const ElemType alpha, const CPUSpars
 }
 
 template <class ElemType>
-bool CPUSparseMatrix<ElemType>::AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold)
+/*static*/ bool CPUSparseMatrix<ElemType>::AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold)
 {
     if (a.IsEmpty() || b.IsEmpty())
         LogicError("AreEqual: one of the input matrices is empty.");
@@ -894,6 +886,7 @@ void CPUSparseMatrix<ElemType>::NormalGrad(CPUMatrix<ElemType>& c, const ElemTyp
         c.Resize(GetNumRows(), GetNumCols());
         c.SetValue(0.0);
     }
+    // BUGBUG: dimension/ownbuffer check?
 
     if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
     {
@@ -927,6 +920,7 @@ ElemType CPUSparseMatrix<ElemType>::Adagrad(CPUMatrix<ElemType>& c, const bool n
         c.Resize(GetNumRows(), GetNumCols());
         c.SetValue(0.0);
     }
+    // BUGBUG: dimension/ownbuffer check?
 
     ElemType aveMultiplier = 0;
 
@@ -1156,12 +1150,12 @@ CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::InplaceSoftThreshold(const
 template <class ElemType>
 ElemType CPUSparseMatrix<ElemType>::FrobeniusNorm() const
 {
-    if (this->IsEmpty())
-        LogicError("FrobeniusNorm: Matrix is empty.");
+    if (IsEmpty())
+        return 0;
 
-    ElemType v = 0;
+    ElemType v = 0; // TODO: do this in 'double'?
 
-    long m = (long) this->NzCount();
+    long m = (long) NzCount();
     const ElemType* nzValues = NzValues();
 
 //four-way unrolling
@@ -1183,8 +1177,8 @@ ElemType CPUSparseMatrix<ElemType>::FrobeniusNorm() const
 template <class ElemType>
 ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
 {
-    if (this->IsEmpty())
-        LogicError("SumOfAbsElements: Matrix is empty.");
+    if (IsEmpty())
+        return 0;
 
     if (sizeof(ElemType) == sizeof(double))
     {
@@ -1209,12 +1203,12 @@ ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
 template <class ElemType>
 ElemType CPUSparseMatrix<ElemType>::SumOfElements() const
 {
-    if (this->IsEmpty())
-        LogicError("SumOfElements: Matrix is empty.");
+    if (IsEmpty())
+        return 0;
 
-    ElemType sum = 0;
+    ElemType sum = 0; // TODO: Do this in 'double'?
 
-    long m = (long) this->NzCount();
+    long m = (long) NzCount();
     const ElemType* nzValues = NzValues();
 
 //four-way unrolling
@@ -1235,6 +1229,9 @@ ElemType CPUSparseMatrix<ElemType>::SumOfElements() const
 template <typename ElemType>
 MATH_API File& operator>>(File& stream, CPUSparseMatrix<ElemType>& us)
 {
+    if (!us.OwnBuffer())
+        LogicError("Cannot read into a managed external matrix");
+
     stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
     size_t elsize;
     stream >> elsize;
@@ -1351,4 +1348,5 @@ template CPUSparseMatrix<char>::~CPUSparseMatrix();
 template CPUSparseMatrix<char> CPUSparseMatrix<char>::ColumnSlice(size_t startColumn, size_t numCols) const;
 template CPUMatrix<char> CPUSparseMatrix<char>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;
 template CPUSparseMatrix<char>& CPUSparseMatrix<char>::operator=(const CPUSparseMatrix<char>& deepCopyFrom);
-} } }
+
+}}}
diff --git a/Source/Math/CPUSparseMatrix.h b/Source/Math/CPUSparseMatrix.h
index 25c0818e5..f160372a3 100644
--- a/Source/Math/CPUSparseMatrix.h
+++ b/Source/Math/CPUSparseMatrix.h
@@ -22,21 +22,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 template <class ElemType>
 class MATH_API CPUSparseMatrix : public BaseMatrix<ElemType>
 {
-    typedef BaseMatrix<ElemType> B;
-    using B::m_elemSizeAllocated;
-    using B::m_computeDevice;
-    using B::m_externalBuffer;
-    using B::m_format;
-    using B::m_matrixName;
-    using B::m_numCols;
-    using B::m_numRows;
-    using B::m_nz;
-    using B::m_pArray; // without this, base members would require to use thi-> in GCC
-    using B::OwnBuffer;
-    using B::Clear;
+    typedef BaseMatrix<ElemType> Base;
+    using Base::m_elemSizeAllocated;
+    using Base::m_computeDevice;
+    using Base::m_externalBuffer;
+    using Base::m_format;
+    using Base::m_matrixName;
+    using Base::m_numCols;
+    using Base::m_numRows;
+    using Base::m_nz;
+    using Base::m_pArray; // without this, base members would require to use thi-> in GCC
+    using Base::Clear;
+    using Base::NzCount;
 
 public:
-    using B::SetMatrixName;
+    using Base::OwnBuffer;
+    using Base::IsEmpty;
+    using Base::SetMatrixName;
 
 private:
     void ZeroInit();
@@ -53,8 +55,8 @@ public:
     ~CPUSparseMatrix();
 
 public:
-    using B::GetNumCols;
-    using B::GetNumRows;
+    using Base::GetNumCols;
+    using Base::GetNumRows;
 
     void SetValue(const size_t row, const size_t col, ElemType val);
     void SetValue(const CPUSparseMatrix<ElemType>& /*val*/);
@@ -249,6 +251,8 @@ private:
     size_t m_blockSize;    // block size
     size_t* m_blockIds;    // block ids
     size_t m_blockIdShift; // used to get efficient slice, actual col = blockIds[j] - m_blockIdShift
+
+    CPUSparseMatrix* m_sliceOf; // if this is a slice, then this points to the owning matrix object that we sliced from
 };
 
 typedef CPUSparseMatrix<float> CPUSingleSparseMatrix;
diff --git a/Source/Math/CommonMatrix.h b/Source/Math/CommonMatrix.h
index 0aa90983b..233bb4358 100644
--- a/Source/Math/CommonMatrix.h
+++ b/Source/Math/CommonMatrix.h
@@ -158,7 +158,7 @@ enum ElementWiseOperator
     Macro(ElementwiseProductWithLinearRectifierDerivativeFromOutput); \
     Macro(ElementwiseProductWithLogDerivativeFromOutput);             \
     Macro(ElementwiseProductWithCosDerivative); \
-//Macro(Index);
+    //Macro(Index);
 
 #define ForAllTernaryOps(Macro) \
     Macro(Cond);                \
@@ -170,6 +170,7 @@ enum ElementWiseOperator
 
 enum MatrixFlagBitPosition
 {
+    // TODO: remove all formats that are actually not supported
     bitPosRowMajor = 0,         // row major matrix
     bitPosSparse = 1,           // sparse matrix (COO if uncompressed)
     bitPosCompressed = 2,       // a compressed sparse format (CSC/CSR)
@@ -179,6 +180,7 @@ enum MatrixFlagBitPosition
 
 enum MatrixFormat
 {
+    // TODO: remove all formats that are actually not supported
     matrixFormatDense = 0,                          // default is dense
     matrixFormatColMajor = 0,                       // default is column major
     matrixFormatRowMajor = 1 << bitPosRowMajor,     // row major matrix
@@ -314,17 +316,46 @@ protected:
         m_matrixName = nullptr;
     }
 
+    void ZeroInit()
+    {
+        m_numRows           = 0;
+        m_numCols           = 0;
+        m_elemSizeAllocated = 0;
+        m_sliceViewOffset   = 0;
+        m_externalBuffer    = false;
+        m_pArray            = nullptr;
+        m_nz                = 0;
+        m_matrixName        = nullptr;
+    }
+
+    // copy all metadata (but not content taht pArray points to)
+    void ShallowCopyFrom(const BaseMatrix& other)
+    {
+        m_format            = other.m_format;
+        m_computeDevice     = other.m_computeDevice;
+
+        m_numRows           = other.m_numRows;
+        m_numCols           = other.m_numCols;
+        m_elemSizeAllocated = other.m_elemSizeAllocated;
+        m_sliceViewOffset   = other.m_sliceViewOffset;
+        m_externalBuffer    = other.m_externalBuffer;
+        m_pArray            = other.m_pArray;
+        m_nz                = other.m_nz;
+        m_matrixName        = other.m_matrixName;
+    }
+
 protected:
+    MatrixFormat m_format;
+    mutable DEVICEID_TYPE m_computeDevice; // current GPU device Id or CPUDEVICE
+
     size_t m_numRows;
     size_t m_numCols;
     size_t m_elemSizeAllocated;
-    size_t m_sliceViewOffset; // this is used to get a column slice view of a matrix in the Sparse CSC format
-    MatrixFormat m_format;
+    size_t m_sliceViewOffset; // this is used to get a column slice view of a matrix in the Sparse CSC format  --TODO: move to sparse matrix implementations? Or common sparse base class?
     bool m_externalBuffer; // is the buffer used by this matrix,
     ElemType* m_pArray;
-    mutable DEVICEID_TYPE m_computeDevice; // current GPU device Id or CPUDEVICE
     size_t m_nz;                           // Number of non-zero elements for sparse matrices (unused in other formats)
     wchar_t* m_matrixName;                 // TODO: Use std::wstring?
 };
 
-} } }
+}}}
diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h
index 4925da113..22d6601f0 100644
--- a/Source/Math/GPUMatrix.h
+++ b/Source/Math/GPUMatrix.h
@@ -579,7 +579,7 @@ static void CudaCall(ERRTYPE retCode, const char* exprString, const char* libNam
 
 class SyncGuard
 {
-    bool DoSync()
+    static bool DoSync()
     {
 #ifdef NO_SYNC // this strange way of writing it allows modifying this variable at runtime in the debugger
         static bool do_sync = false;
@@ -588,27 +588,30 @@ class SyncGuard
 #endif
         return do_sync;
     }
-    cudaEvent_t done;
+    cudaEvent_t m_done;
 public:
     SyncGuard()
     {
-        done = nullptr;
+        m_done = nullptr;
         if (DoSync())
-            CUDA_CALL(cudaEventCreate(&done));
+            CUDA_CALL(cudaEventCreate(&m_done));
     }
     ~SyncGuard()
     {
         if (DoSync())
         {
-            try
+            // The regular use of this destructor is to synchronize the GPU, but also
+            // to check for errors. So this destructor is where CUDA errors would be thrown.
+            // If this destructor runs during stack unwinding, then a different error has
+            // already happened that should be reported; so we only clean up the resource.
+            if (std::uncaught_exception())
+                cudaEventDestroy(m_done);
+            else
             {
-                CUDA_CALL(cudaEventRecord(done));
-                CUDA_CALL(cudaEventSynchronize(done));
-                CUDA_CALL(cudaEventDestroy(done));
-            }
-            catch (const std::exception& e) // can't throw in destructors!
-            {
-                std::cerr << "SyncGuard: Destructor swallowing CUDA failure: " << e.what() << std::endl;
+                // failures in a prior launch might be reported here
+                CUDA_CALL(cudaEventRecord(m_done));
+                CUDA_CALL(cudaEventSynchronize(m_done));
+                CUDA_CALL(cudaEventDestroy(m_done));
             }
         }
     }
diff --git a/Source/Math/GPUSparseMatrix.cu b/Source/Math/GPUSparseMatrix.cu
index 1389de7e8..7f70864ae 100644
--- a/Source/Math/GPUSparseMatrix.cu
+++ b/Source/Math/GPUSparseMatrix.cu
@@ -52,6 +52,10 @@ GPUSPARSE_INDEX_TYPE GPUSparseMatrix<ElemType>::SecondaryIndexValueAt(size_t idx
     return value;
 }
 
+//-------------------------------------------------------------------------
+// construction and conversion
+//-------------------------------------------------------------------------
+
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::ZeroInit(const MatrixFormat matrixFormat, const DEVICEID_TYPE computeDevice)
 {
@@ -59,25 +63,18 @@ void GPUSparseMatrix<ElemType>::ZeroInit(const MatrixFormat matrixFormat, const
         matrixFormat != MatrixFormat::matrixFormatSparseBlockCol && matrixFormat != MatrixFormat::matrixFormatSparseBlockRow)
     {
         LogicError("GPUSparseMatrix:  unsupported sparse matrix format");
+        // BUGBUG: Then why even define others?
     }
+    Base::ZeroInit();
+    m_computeDevice            = computeDevice; // current GPU device Id
+    m_format                   = matrixFormat;
 
-    m_computeDevice = computeDevice; // current GPU device Id
-    m_numRows = 0;
-    m_numCols = 0;
-    m_elemSizeAllocated = m_nz = 0; // Number of non-zero elements
+    m_sliceOf                  = nullptr;
     m_totalBufferSizeAllocated = 0;
-    m_sliceViewOffset = 0;
-    m_format = matrixFormat;
-    m_externalBuffer = false;
-    m_pArray = nullptr;
-    m_matrixName = nullptr;
-
-    m_blockSize = 0;
-
-    m_rowToId = nullptr;
-
-    m_tempHostBuffer = nullptr;
-    m_tempHostBufferSize = 0;
+    m_blockSize                = 0;
+    m_rowToId                  = nullptr;
+    m_tempHostBuffer           = nullptr;
+    m_tempHostBufferSize       = 0;
 }
 
 template <class ElemType>
@@ -123,13 +120,15 @@ DEVICEID_TYPE GPUSparseMatrix<ElemType>::PrepareDevice(DEVICEID_TYPE deviceId /*
 }
 
 template <class ElemType>
-void GPUSparseMatrix<ElemType>::DeepCopy(const GPUSparseMatrix<ElemType>& deepCopy)
+/*private*/ void GPUSparseMatrix<ElemType>::DeepCopy(const GPUSparseMatrix<ElemType>& deepCopy)
 {
     ChangeDeviceTo(deepCopy.m_computeDevice);
     deepCopy.PrepareDevice();
 
     Resize(deepCopy.m_numRows, deepCopy.m_numCols, deepCopy.GetNumNZElements(), deepCopy.m_format, true, false);
-    m_nz = deepCopy.m_nz;
+    m_externalBuffer  = false;
+    m_sliceOf         = nullptr;
+    m_nz              = deepCopy.m_nz;
     m_sliceViewOffset = 0; // reset to zero as we only start copying the indices starting from the offset in the source matrix
 
     CUDA_CALL(cudaMemcpy(BufferPointer(), deepCopy.NzValues(), NzSize(), cudaMemcpyDeviceToDevice));
@@ -146,7 +145,6 @@ void GPUSparseMatrix<ElemType>::DeepCopy(const GPUSparseMatrix<ElemType>& deepCo
             GetNumNZElements());
     }
 
-    m_externalBuffer = false;
     SetMatrixName(deepCopy.m_matrixName);
 
     // TODO: to copy other varibles used only for class based LM
@@ -156,7 +154,7 @@ template <class ElemType>
 void GPUSparseMatrix<ElemType>::SetValue(const GPUSparseMatrix<ElemType>& deepCopy)
 {
     if (!OwnBuffer())
-        LogicError("Cannot SetValue on Managed external matrix");
+        LogicError("Cannot SetValue on managed external matrix");
 
     DeepCopy(deepCopy);
 }
@@ -190,6 +188,9 @@ void GPUSparseMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& deepCo
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::CopyToCPUSparseMatrix(CPUSparseMatrix<ElemType>& cpuSparseMatrix) const
 {
+    if (!cpuSparseMatrix.OwnBuffer())
+        LogicError("Cannot CopyToCPUSparseMatrix on managed external matrix");
+
     cpuSparseMatrix.SetFormat(GetFormat());
     if (IsEmpty())
     {
@@ -307,6 +308,9 @@ void GPUSparseMatrix<ElemType>::CopyToDenseMatrix(GPUMatrix<ElemType>& denseMatr
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::ConvertToSparseFormat(MatrixFormat newFormat, GPUSparseMatrix<ElemType>& outMatrix) const
 {
+    if (!outMatrix.OwnBuffer())
+        LogicError("Cannot ConvertToSparseFormat to managed external matrix");
+
     if (IsEmpty())
     {
         outMatrix.ZeroInit(newFormat, GetComputeDeviceId());
@@ -528,26 +532,15 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator=(const GPUSparseM
 template <class ElemType>
 GPUSparseMatrix<ElemType>::GPUSparseMatrix(GPUSparseMatrix<ElemType>&& moveFrom)
 {
-    m_computeDevice = moveFrom.m_computeDevice;
-    m_numRows = moveFrom.m_numRows;
-    m_numCols = moveFrom.m_numCols;
-    m_nz = moveFrom.m_nz;
-    m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
+    Base::ShallowCopyFrom(moveFrom);
+    // TODO: implement this using operator= or a shared function
     m_totalBufferSizeAllocated = moveFrom.m_totalBufferSizeAllocated;
-    m_pArray = moveFrom.m_pArray;
-    m_sliceViewOffset = moveFrom.m_sliceViewOffset;
-    m_format = moveFrom.m_format;
-    m_externalBuffer = moveFrom.m_externalBuffer;
-    m_matrixName = moveFrom.m_matrixName;
-
-    m_blockSize = moveFrom.m_blockSize;
-
-    m_rowToId = moveFrom.m_rowToId;
-
-    m_tempHostBuffer = moveFrom.m_tempHostBuffer;
-    m_tempHostBufferSize = moveFrom.m_tempHostBufferSize;
-
-    moveFrom.ZeroInit(moveFrom.m_format, moveFrom.m_computeDevice); // so that memory in moveFrom is not freeed
+    m_sliceOf                  = moveFrom.m_sliceOf;
+    m_blockSize                = moveFrom.m_blockSize;
+    m_rowToId                  = moveFrom.m_rowToId;
+    m_tempHostBuffer           = moveFrom.m_tempHostBuffer;
+    m_tempHostBufferSize       = moveFrom.m_tempHostBufferSize;
+    moveFrom.ZeroInit(moveFrom.m_format, moveFrom.m_computeDevice); // so that memory in moveFrom is not freed
 }
 
 template <class ElemType>
@@ -557,26 +550,13 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator=(GPUSparseMatrix<
     {
         if (OwnBuffer())
             ReleaseMemory(); // always delete the data pointer since we will use the pointer from moveFrom
-        m_computeDevice = moveFrom.m_computeDevice;
-        m_numRows = moveFrom.m_numRows;
-        m_numCols = moveFrom.m_numCols;
-        m_nz = moveFrom.m_nz;
-        m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
+        Base::ShallowCopyFrom(moveFrom);
         m_totalBufferSizeAllocated = moveFrom.m_totalBufferSizeAllocated;
-        m_pArray = moveFrom.m_pArray;
-        m_sliceViewOffset = moveFrom.m_sliceViewOffset;
-        m_format = moveFrom.m_format;
-        m_externalBuffer = moveFrom.m_externalBuffer;
-
-        m_matrixName = moveFrom.m_matrixName;
-
-        m_blockSize = moveFrom.m_blockSize;
-
-        m_rowToId = moveFrom.m_rowToId;
-
-        m_tempHostBuffer = moveFrom.m_tempHostBuffer;
-        m_tempHostBufferSize = moveFrom.m_tempHostBufferSize;
-
+        m_sliceOf                  = moveFrom.m_sliceOf;
+        m_blockSize                = moveFrom.m_blockSize;
+        m_rowToId                  = moveFrom.m_rowToId;
+        m_tempHostBuffer           = moveFrom.m_tempHostBuffer;
+        m_tempHostBufferSize       = moveFrom.m_tempHostBufferSize;
         moveFrom.ZeroInit(moveFrom.m_format, moveFrom.m_computeDevice);
     }
 
@@ -590,11 +570,10 @@ GPUSparseMatrix<ElemType>::~GPUSparseMatrix()
 }
 
 template <class ElemType>
-void GPUSparseMatrix<ElemType>::ReleaseMemory()
+/*private*/ void GPUSparseMatrix<ElemType>::ReleaseMemory()
 {
-    // If OwnBuffer() is false then this matrix
-    // is simply a view over another matrix. In that
-    // case we shouldn't free anything.
+    // If OwnBuffer() is false then this matrix is a view over another matrix.
+    // In that case we shouldn't free anything.
     if (OwnBuffer())
     {
         delete[] m_matrixName;
@@ -609,7 +588,6 @@ void GPUSparseMatrix<ElemType>::ReleaseMemory()
         TracingGPUMemoryAllocator::Free<GPUSPARSE_INDEX_TYPE>(m_computeDevice, m_rowToId);
         m_rowToId = nullptr;
     }
-
     ZeroInit(m_format, m_computeDevice);
 }
 
@@ -627,17 +605,18 @@ void GPUSparseMatrix<ElemType>::ResizeAsAndCopyIndexFrom(const GPUSparseMatrix<E
 }
 
 //-------------------------------------------------------------------------
-// Start of new GPU Sparse Matrix code
+// main operations
 //-------------------------------------------------------------------------
+
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::Reshape(const size_t numRows, const size_t numCols)
 {
-    if (!OwnBuffer())
-        LogicError("GPUSparseMatrix::Reshape: Cannot Reshape since the buffer is managed externally.");
-
     if (m_numRows == numRows && m_numCols == numCols)
         return;
 
+    if (!OwnBuffer())
+        LogicError("GPUSparseMatrix::Reshape: Cannot Reshape since the buffer is managed externally.");
+
     if (m_format != MatrixFormat::matrixFormatSparseCSC)
         NOT_IMPLEMENTED;
 
@@ -688,8 +667,8 @@ void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCol
     Resize(numRows, numCols, numNZElemToReserve, GetFormat(), growOnly, keepExistingValues);
 }
 
-//WARNING: When memory is reallocated existing information will be lost, workaround is to allocte enough memory from start.
-//TODO: add keepExistingValues (default to true) argument so that the existing values are kept even after reallocation
+// WARNING: When memory is reallocated, existing information will be lost.
+// TODO: add keepExistingValues (default to true) argument so that the existing values are kept even after reallocation
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly /*= true*/, bool keepExistingValues /*=true*/)
 {
@@ -749,13 +728,17 @@ void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCol
     m_format = matrixFormat;
 }
 
-//Reset matrix so it can be reused
+// Reset matrix to 0.
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::Reset()
 {
+    if (!OwnBuffer())
+        LogicError("Cannot Reset since the buffer is managed externally.");
+
     m_nz = 0;
     m_blockSize = 0;
 }
+
 // copy features to GPU
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::SetMatrixFromCSRFormat(const GPUSPARSE_INDEX_TYPE* h_CSRRow, const GPUSPARSE_INDEX_TYPE* h_Col, const ElemType* h_Val,
@@ -966,9 +949,11 @@ void GPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPU
     }
 }
 
+// dense X sparse = dense
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA,
-                                                       const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
+                                                       const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta,
+                                                       GPUMatrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
 {
     if (lhs.GetComputeDeviceId() != rhs.GetComputeDeviceId() || (lhs.GetComputeDeviceId() != c.GetComputeDeviceId()))
         RuntimeError("GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd: All matrices must be on the same GPU");
@@ -1222,7 +1207,7 @@ void GPUSparseMatrix<ElemType>::MultiplyAndAdd(ElemType alpha, const GPUMatrix<E
     }
 }
 
-//find the rows of rhs with values
+// find the rows of rhs with values
 template <class ElemType>
 size_t GPUSparseMatrix<ElemType>::IdentifyRowsWithValues() const
 {
@@ -1404,11 +1389,7 @@ ElemType GPUSparseMatrix<ElemType>::Adagrad(GPUMatrix<ElemType>& c, const bool n
     }
 }
 
-//-------------------------------------------------------------------------
-// End of new GPU Sparse Matrix code
-//-------------------------------------------------------------------------
-
-//sparse X dense = dense
+// sparse X dense = dense
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPUSparseMatrix<ElemType>& a, const bool transposeA,
                                                        const GPUMatrix<ElemType>& b, const bool transposeD, ElemType beta, GPUMatrix<ElemType>& c)
@@ -1758,6 +1739,7 @@ void GPUSparseMatrix<ElemType>::ElementWisePower(ElemType alpha, const GPUSparse
     }
 }
 
+// sparse x dense = scalar
 template <class ElemType>
 ElemType GPUSparseMatrix<ElemType>::InnerProductOfMatrices(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
 {
@@ -1878,8 +1860,8 @@ bool GPUSparseMatrix<ElemType>::IsValid() const
 }
 
 template <class ElemType>
-bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
-                                         const ElemType threshold)
+/*static*/ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
+                                                    const ElemType threshold)
 {
     if (a.GetNumNZElements() != b.GetNumNZElements() || a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
         return false;
@@ -1908,8 +1890,8 @@ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, con
 }
 
 template <class ElemType>
-bool GPUSparseMatrix<ElemType>::AreEqual(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
-                                         const ElemType threshold)
+/*static*/ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
+                                                    const ElemType threshold)
 {
     if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
         return false;
@@ -1919,8 +1901,8 @@ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUMatrix<ElemType>& a, const GPU
 }
 
 template <class ElemType>
-bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b,
-                                         const ElemType threshold)
+/*static*/ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b,
+                                                    const ElemType threshold)
 {
     if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
         return false;
@@ -1940,6 +1922,7 @@ bool GPUSparseMatrix<ElemType>::IsEqualTo(const GPUMatrix<ElemType>& a, const El
 {
     return AreEqual(*this, a, threshold);
 }
+
 #pragma endregion Static BLAS Functions
 
 #pragma region Member BLAS Functions
@@ -1958,6 +1941,7 @@ DEVICEID_TYPE GPUSparseMatrix<ElemType>::GetComputeDeviceId() const
         return m_computeDevice;
 }
 
+// sparse x dense = dense
 template <class ElemType>
 GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
 {
@@ -1980,6 +1964,7 @@ GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUSparseM
     return c;
 }
 
+// sparse x dense = dense
 template <class ElemType>
 GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b)
 {
@@ -2002,6 +1987,7 @@ GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::operator-(const GPUSparseMa
     return res;
 }
 
+// TODO: This is an unusual use of this operator. Remove this.
 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator^=(ElemType alpha)
 {
@@ -2010,6 +1996,7 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator^=(ElemType alpha)
     return us;
 }
 
+// TODO: This is an unusual use of this operator. Remove this.
 template <class ElemType>
 GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::operator^(ElemType alpha) const
 {
@@ -2154,6 +2141,7 @@ GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::ColumnSlice(size_t startCol
     slice.m_pArray                   = m_pArray;
     slice.m_format                   = m_format;
     slice.m_externalBuffer           = true;
+    slice.m_sliceOf                  = const_cast<GPUSparseMatrix<ElemType>*>(this); // BUGBUG: ColumnSlice() returns a reference to a mutable matrix, even if itself is 'const'; should not be.
     slice.m_matrixName               = m_matrixName;
     slice.m_blockSize                = m_blockSize;
     slice.m_rowToId                  = m_rowToId;
@@ -2231,7 +2219,7 @@ template <class ElemType>
 ElemType GPUSparseMatrix<ElemType>::SumOfAbsElements() const
 {
     if (IsEmpty())
-        LogicError("SumOfAbsElements: Matrix is empty");
+        return 0;
 
     cublasHandle_t cuHandle = GPUMatrix<ElemType>::GetCublasHandle(GetComputeDeviceId());
     if (sizeof(ElemType) == sizeof(float))
@@ -2264,11 +2252,12 @@ ElemType GPUSparseMatrix<ElemType>::SumOfElements() const
     return h_sum;
 }
 
+// sqrt(sum all elements^2)
 template <class ElemType>
 ElemType GPUSparseMatrix<ElemType>::FrobeniusNorm() const
 {
     if (IsEmpty())
-        LogicError("FrobeniusNorm: Matrix is empty.");
+        return 0;
 
     ElemType* d_sum = TracingGPUMemoryAllocator::Allocate<ElemType>(m_computeDevice, 1);
     ElemType h_sum = 0;
@@ -2287,7 +2276,7 @@ template <class ElemType>
 ElemType GPUSparseMatrix<ElemType>::MatrixNormInf() const
 {
     if (IsEmpty())
-        LogicError("MatrixNorm1: Matrix is empty.");
+        return 0;
 
     ElemType* d_maxAbs = TracingGPUMemoryAllocator::Allocate<ElemType>(m_computeDevice, 1);
     ElemType h_maxAbs = 0;
@@ -2305,8 +2294,6 @@ ElemType GPUSparseMatrix<ElemType>::MatrixNormInf() const
 template <class ElemType>
 ElemType GPUSparseMatrix<ElemType>::MatrixNorm1() const
 {
-    if (IsEmpty())
-        LogicError("MatrixNorm1: Matrix is empty.");
     return SumOfAbsElements();
 }
 
@@ -2317,6 +2304,10 @@ ElemType GPUSparseMatrix<ElemType>::MatrixNorm1() const
 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::ElementInverse()
 {
+#if 1
+    // Note: This makes no sense because sparse matrices are defined by having lots of zeroes.
+    NOT_IMPLEMENTED;
+#else
     if (!OwnBuffer())
         LogicError("Cannot modify since the buffer is managed externally.");
 
@@ -2328,29 +2319,45 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::ElementInverse()
     SyncGuard syncGuard;
     _elemInverse<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(NzValues(), N);
     return *this;
+#endif
 }
 
 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignElementInverseOf(const GPUSparseMatrix<ElemType>& a)
 {
+#if 1
+    // Note: This makes no sense because sparse matrices are defined by having lots of zeroes.
+    UNUSED(a); NOT_IMPLEMENTED;
+#else
     SetValue(a);
     return ElementInverse();
+#endif
 }
 
 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceSigmoid()
 {
+#if 1
+    // Note: This makes no sense because sigmoid(0) != 0.
+    NOT_IMPLEMENTED;
+#else
     performElementWiseFunction(ElementWiseOperator::opSigmoid, *this);
     return *this;
+#endif
 }
 
 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignSigmoidOf(const GPUSparseMatrix<ElemType>& a)
 {
+#if 1
+    // Note: This makes no sense because sigmoid(0) != 0.
+    UNUSED(a); NOT_IMPLEMENTED;
+#else
     if (this != &a)
         Resize(a.GetNumRows(), a.GetNumCols());
     performElementWiseFunction(ElementWiseOperator::opSigmoid, a);
     return *this;
+#endif
 }
 
 template <class ElemType>
@@ -2404,33 +2411,53 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignSqrtOf(const GPUSpar
 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceExp()
 {
+#if 1
+    // Note: This makes no sense because exp(0) != 0.
+    NOT_IMPLEMENTED;
+#else
     performElementWiseFunction(ElementWiseOperator::opExp, *this);
     return *this;
+#endif
 }
 
 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignExpOf(const GPUSparseMatrix<ElemType>& a)
 {
+#if 1
+    // Note: This makes no sense because exp(0) != 0.
+    UNUSED(a); NOT_IMPLEMENTED;
+#else
     if (this != &a)
         Resize(a.GetNumRows(), a.GetNumCols());
     performElementWiseFunction(ElementWiseOperator::opExp, a);
     return *this;
+#endif
 }
 
 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceLog()
 {
+#if 1
+    // Note: This makes no sense because log(0) != 0.
+    NOT_IMPLEMENTED;
+#else
     performElementWiseFunction(ElementWiseOperator::opLog, *this);
     return *this;
+#endif
 }
 
 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignLogOf(const GPUSparseMatrix<ElemType>& a)
 {
+#if 1
+    // Note: This makes no sense because log(0) != 0.
+    UNUSED(a); NOT_IMPLEMENTED;
+#else
     if (this != &a)
         Resize(a.GetNumRows(), a.GetNumCols());
     performElementWiseFunction(ElementWiseOperator::opLog, a);
     return *this;
+#endif
 }
 
 template <class ElemType>
@@ -2449,6 +2476,7 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignAbsOf(const GPUSpars
     return *this;
 }
 
+// TODO: Check whether these functions always map 0 to 0.
 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceTruncateBottom(const ElemType threshold)
 {
@@ -2543,7 +2571,7 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::SetToZeroIfAbsLessThan(con
 //outBuffer should be allocated to be >= size by the caller
 template <class ElemType>
 template <class OutType, class InType>
-void GPUSparseMatrix<ElemType>::CopyBuffer(OutType* outBuffer, const InType* inBuffer, const size_t size)
+/*private*/ void GPUSparseMatrix<ElemType>::CopyBuffer(OutType* outBuffer, const InType* inBuffer, const size_t size)
 {
 #pragma omp parallel for
     for (size_t i = 0; i < (size & ~3); i += 4)
@@ -2629,6 +2657,9 @@ template GPUSparseMatrix<char>& GPUSparseMatrix<char>::operator=(GPUSparseMatrix
 template <class ElemType>
 MATH_API File& operator>>(File& stream, GPUSparseMatrix<ElemType>& us)
 {
+    if (!us.OwnBuffer())
+        LogicError("Cannot read into a managed external matrix");
+
     stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
     size_t elsize;
     stream >> elsize;
diff --git a/Source/Math/GPUSparseMatrix.h b/Source/Math/GPUSparseMatrix.h
index 74fc2d62c..0b823e0fa 100644
--- a/Source/Math/GPUSparseMatrix.h
+++ b/Source/Math/GPUSparseMatrix.h
@@ -23,29 +23,30 @@ template <class ElemType>
 class MATH_API GPUSparseMatrix : public BaseMatrix<ElemType>
 {
 public:
-    typedef BaseMatrix<ElemType> B;
-    using B::m_numRows;
-    using B::m_numCols;
-    using B::m_pArray;
-    using B::m_elemSizeAllocated;
-    using B::m_sliceViewOffset;
-    using B::m_nz;
-    using B::m_format;
-    using B::m_computeDevice;
-    using B::m_externalBuffer;
-    using B::m_matrixName;
-    using B::OwnBuffer;
-    using B::GetFormat;
-    using B::SetFormat;
-    using B::GetNumRows;
-    using B::GetNumCols;
-    using B::IsEmpty;
-    using B::SetComputeDeviceId;
-    using B::SetMatrixName;
-    using B::SetNzCount;
-    using B::Clear;
+    typedef BaseMatrix<ElemType> Base;
+    using Base::m_numRows;
+    using Base::m_numCols;
+    using Base::m_pArray;
+    using Base::m_elemSizeAllocated;
+    using Base::m_sliceViewOffset;
+    using Base::m_nz;
+    using Base::m_format;
+    using Base::m_computeDevice;
+    using Base::m_externalBuffer;
+    using Base::m_matrixName;
+    using Base::OwnBuffer;
+    using Base::GetFormat;
+    using Base::SetFormat;
+    using Base::GetNumRows;
+    using Base::GetNumCols;
+    using Base::SetComputeDeviceId;
+    using Base::SetMatrixName;
+    using Base::SetNzCount;
+    using Base::Clear;
     // without this, base members would require to use thi-> in GCC
 public:
+    using Base::IsEmpty;
+
     GPUSparseMatrix(const size_t numRows, const size_t numCols, const size_t numNZ, DEVICEID_TYPE computeDevice, const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR);
 
     explicit GPUSparseMatrix(DEVICEID_TYPE computeDevice, const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR);
@@ -390,6 +391,8 @@ private:
 
     mutable void* m_tempHostBuffer; // used to copy values.
     mutable size_t m_tempHostBufferSize;
+
+    GPUSparseMatrix* m_sliceOf; // if this is a slice, then this points to the owning matrix object that we sliced from
 };
 
 }}}
diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h
index 3ee8e5069..68de0c538 100644
--- a/Source/Math/Matrix.h
+++ b/Source/Math/Matrix.h
@@ -169,7 +169,7 @@ public:
     // REVIEW alexeyk: GPU version copies from device to host only, implement all versions (device <-> host).
     void CopySection(size_t numRows, size_t numCols, ElemType* dst, size_t colStride) const;
 
-    Matrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const;
+    Matrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const; // note: 'const' is misleading here, as the returned matrix is a mutable reference
 
     // difference between AssignColumnSlice and SetColumnSlice
     // AssignColumnSlice :      this(:, startColumn:startColumn+numCols-1) = fromMatrix(:, startColumn: startColumn+numCols-1)
diff --git a/Tests/UnitTests/ReaderTests/Config/HTKMLFReaderSimpleDataLoop16_Config.cntk b/Tests/UnitTests/ReaderTests/Config/HTKMLFReaderSimpleDataLoop16_Config.cntk
index af377b79d..0ac8be307 100644
--- a/Tests/UnitTests/ReaderTests/Config/HTKMLFReaderSimpleDataLoop16_Config.cntk
+++ b/Tests/UnitTests/ReaderTests/Config/HTKMLFReaderSimpleDataLoop16_Config.cntk
@@ -4,7 +4,7 @@ DataDir = $RootDir$
 # deviceId = -1 for CPU, >= 0 for GPU devices
 deviceId = -1
 
-precision = "float"
+precision = "double"
 
 Simple_Test = [
     reader = [