Merge branch 'master' into qiwye/multiverso
Commit c7bfebe740
--- a/Makefile
+++ b/Makefile
@@ -423,7 +423,7 @@ ALL += $(IMAGEREADER)
 SRC+=$(IMAGEREADER_SRC)
 
 INCLUDEPATH += $(OPENCV_PATH)/include
-LIBPATH += $(OPENCV_PATH)/release/lib
+LIBPATH += $(OPENCV_PATH)/lib
 
 $(IMAGEREADER): $(IMAGEREADER_OBJ) | $(CNTKMATH_LIB)
 	@echo $(SEPARATOR)
@@ -501,6 +501,22 @@ public:
         return (size_t) location;
     }
 
+    // get begin and end location (first offset after last element), for validation purposes
+    pair<ptrdiff_t, ptrdiff_t> GetLocationRange() const
+    {
+        auto result = make_pair(m_offset, m_offset);
+        for (size_t k = 0; k < size(); k++)
+        {
+            ptrdiff_t step = (ptrdiff_t)(m_dims[k] - 1) * m_strides[k];
+            if (m_strides[k] > 0) // strides may be negative
+                result.second += step;
+            else
+                result.first += step;
+        }
+        result.second++; // max --> end
+        return result;
+    }
+
     // helpers for tensor operations
     bool CanFlatten(size_t k) const // can dims k and k-1 be flattened into a single vector? (do they form a matrix without stride)
     {
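Note: a minimal standalone sketch of what the new GetLocationRange() computes, using made-up dims/strides (the values below are illustrative, not taken from this commit):

    #include <cstddef>
    #include <cstdio>
    #include <utility>

    int main()
    {
        // Illustrative tensor: dims = {3, 2}, strides = {1, -4}, offset = 4.
        std::ptrdiff_t offset = 4;
        std::size_t dims[2] = {3, 2};
        std::ptrdiff_t strides[2] = {1, -4};

        // Same logic as GetLocationRange() above:
        auto result = std::make_pair(offset, offset);
        for (std::size_t k = 0; k < 2; k++)
        {
            std::ptrdiff_t step = (std::ptrdiff_t)(dims[k] - 1) * strides[k];
            if (strides[k] > 0)  // a positive stride extends the max offset
                result.second += step;
            else                 // a negative stride pulls the min offset below the start offset
                result.first += step;
        }
        result.second++; // max --> end (one past the last addressable element)

        std::printf("[%td, %td)\n", result.first, result.second); // prints [0, 7)
        return 0;
    }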
@@ -413,10 +413,33 @@ void fprintfOrDie(FILE* f, const char* fmt, ...)
 void fflushOrDie(FILE* f)
 {
     int rc = fflush(f);
 
     if (rc != 0)
     {
        RuntimeError("error flushing to file: %s", strerror(errno));
     }
+
+    int fd = fileno(f);
+
+    if (fd == -1)
+    {
+        RuntimeError("unable to convert file handle to file descriptor: %s", strerror(errno));
+    }
+
+    // Ensure that all data is synced before returning from this function
+#ifdef _WIN32
+    if (!FlushFileBuffers((HANDLE)_get_osfhandle(fd)))
+    {
+        RuntimeError("error syncing to file: %d", (int) ::GetLastError());
+    }
+#else
+    rc = fsync(fd);
+
+    if (rc != 0)
+    {
+        RuntimeError("error syncing to file: %s", strerror(errno));
+    }
+#endif
 }
 
 // ----------------------------------------------------------------------------
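Note: a small usage sketch of the hardened fflushOrDie() (file name and payload below are hypothetical). fflush() alone only drains the C runtime's user-space buffer; the added fsync()/FlushFileBuffers() step is what actually pushes the bytes through the OS cache toward stable storage:

    #include <cstdio>

    void fflushOrDie(FILE* f); // from fileutil.cpp, as changed above

    void SaveCheckpoint()
    {
        FILE* f = std::fopen("model.ckpt", "wb"); // hypothetical file name
        if (!f)
            return;
        std::fputs("checkpoint payload", f);
        fflushOrDie(f); // flush stdio buffers AND sync the OS cache before proceeding
        std::fclose(f);
    }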
@@ -185,6 +185,7 @@ void ComputationNode<ElemType>::ValidateInferInputDimsFrom(const TensorShape& ot
 
 // determine the sample tensor dimension to use for operations based on output and all inputs
 // 'Sample tensor' means we only consider single samples. If we have an MBLayout, that is the sample layout of a single matrix column.
+// TODO: Turn rank into a member variable, and call this method once in validation (currently called for every single ForwardProp/BackpropTo()).
 size_t ComputationNodeBase::DetermineElementwiseTensorRank() const
 {
     // determine largest tensor dimension amongst the sample shapes of output and the selected inputs
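Note: a toy sketch of the rule the comments above describe (largest rank among the sample shapes wins; names are illustrative, not CNTK's):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // For an elementwise op, the operating rank is the largest rank among all
    // operand sample shapes; lower-rank operands are broadcast up to that rank.
    std::size_t DetermineRank(const std::vector<std::vector<std::size_t>>& sampleShapes)
    {
        std::size_t rank = 0;
        for (const auto& shape : sampleShapes)
            rank = std::max(rank, shape.size());
        return rank;
    }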
@@ -81,7 +81,7 @@ struct /*interface*/ IComputationNode
    virtual void BackpropTo(const size_t inputIndex, const FrameRange&) = 0; // backprop gradient into one of the inputs
    virtual void EndBackprop() = 0;                                          // called after last iteration step of ComputeGradient()
 
-    // --- these are meant to be overridden by ControlFlowNodes
+    // --- this is meant to be overridden by ControlFlowNodes
 
    virtual void Backprop(const FrameRange& fr, bool childrenInThisLoop, bool childrenInOuterLoop) = 0;
@@ -491,10 +491,11 @@ public:
 protected:
 
    size_t DetermineElementwiseTensorRank() const;                          // determine tensor rank when considering all inputs with padding
-    TensorShape GetTensorSliceFor(size_t rank, const FrameRange& fr) const; // form tensor shape of the slice referenced by FrameRange
 
 public:
 
+    TensorShape GetTensorSliceFor(size_t rank, const FrameRange& fr) const; // form tensor shape of the slice referenced by FrameRange. Public since nodes may call it for their inputs.
+
    // -----------------------------------------------------------------------
    // inputs
    // -----------------------------------------------------------------------
@@ -209,21 +209,6 @@ public:
     {
     }
 
-    virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange& fr) override
-    {
-        Input(0)->GradientFor(fr.WithLayout(Input(0)->GetMBLayout())) += GradientFor(fr);
-        // TODO: Once we do in-place, the above must include a copy-to-self check (pay special attention to adding vs. copying).
-    }
-
-    virtual bool OutputUsedInComputingInputNodesGradients() const override
-    {
-        return false;
-    }
-    virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override
-    {
-        return false;
-    }
-
     virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
     {
         // enforce compatibility of 'dataInput' with 'layoutInput'
@@ -239,6 +224,15 @@ public:
         // TODO: Once we do in-place, the above must include a copy-to-self check (either here or inside the matrix lib).
     }
 
+    virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange& fr) override
+    {
+        Input(0)->GradientFor(fr.WithLayout(Input(0)->GetMBLayout())) += GradientFor(fr);
+        // TODO: Once we do in-place, the above must include a copy-to-self check (pay special attention to adding vs. copying).
+    }
+
+    virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
+    virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
+
     virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
     {
         Base::Validate(isFinalValidationPass);
@@ -256,8 +250,7 @@ template class ReconcileMBLayoutNode<double>;
 
 // -----------------------------------------------------------------------
 // RowSliceNode (input)
-// this node extracts part of the input by rows as the output
-// it has to be continuous segments of rows since each column is treated as one sample
+// This node extracts a slice of the first tensor dimension (row).
 // -----------------------------------------------------------------------
 
 template <class ElemType>
@@ -277,6 +270,7 @@ public:
           m_sliceHeight(numRows)
     {
     }
 
     RowSliceNode(const ScriptableObjects::IConfigRecordPtr configp)
         : RowSliceNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"startIndex"), configp->Get(L"numRows"))
     {
@@ -292,58 +286,62 @@ public:
         node->m_sliceHeight = m_sliceHeight;
     }
 
-    virtual void Save(File& fstream) const override
-    {
-        Base::Save(fstream);
-        fstream << m_startIndex << m_sliceHeight;
-    }
-
     virtual void Load(File& fstream, size_t modelVersion) override
     {
         Base::Load(fstream, modelVersion);
         fstream >> m_startIndex >> m_sliceHeight;
     }
 
-    virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange& fr) override
+    virtual void Save(File& fstream) const override
     {
-        Input(0)->GradientFor(fr).AddToRowSliceValuesOf(GradientFor(fr), m_startIndex, m_sliceHeight);
+        Base::Save(fstream);
+        fstream << m_startIndex << m_sliceHeight;
     }
 
-    virtual bool OutputUsedInComputingInputNodesGradients() const override
+private:
+
+    // determine the tensor shape that represents slice of the input that we are taking
+    TensorShape GetInputSlice(size_t rank, const FrameRange & fr) const
     {
-        // The RowSliceNode does not require its output value for computing
-        // the gradients of its input nodes
-        return false;
+        auto inputSlice = Input(0)->GetTensorSliceFor(rank, fr); // input must be narrowed down
+        inputSlice.NarrowTo(0, m_startIndex, m_startIndex + m_sliceHeight);
+        return inputSlice;
     }
 
-    virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
-    {
-        // The RowSliceNode does not require any of it's input's values for computing
-        // the gradients of its input nodes
-        UNREFERENCED_PARAMETER(childIndex);
-        return false;
-    }
+public:
 
     virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
     {
-        ValueFor(fr).AssignRowSliceValuesOf(Input(0)->ValueFor(fr), m_startIndex, m_sliceHeight);
+        size_t rank = DetermineElementwiseTensorRank();
+        auto output = ValueTensorFor(rank, fr);
+        let input = TensorView<ElemType>(Input(0)->Value(), GetInputSlice(rank, fr.AllowBroadcast()));
+        output.AssignCopyOf(input);
     }
 
+    virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange& fr) override
+    {
+        size_t rank = DetermineElementwiseTensorRank();
+        let outputGrad = GradientTensorFor(rank, fr);
+        auto inputGrad = TensorView<ElemType>(Input(0)->Gradient(), GetInputSlice(rank, fr));
+        inputGrad.AddCopyOf(outputGrad);
+    }
+
+    virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
+    virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
+
     virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
     {
         Base::Validate(isFinalValidationPass);
         InferMBLayoutFromInputsForStandardCase();
 
-        if (isFinalValidationPass && Input(0)->GetSampleMatrixNumRows() < m_startIndex + m_sliceHeight)
-            RuntimeError("%ls %ls operation: m_startIndex + m_sliceHeight exceeds number of rows in the input.", NodeName().c_str(), OperationName().c_str());
-
-        // RowSlice cannot slice tensors.
-        // TODO: Create a TensorSlice operation, or just Slice.
-        if (isFinalValidationPass && !Input(0)->GetSampleLayout().IsColumnVector()
-            && !Input(0)->GetSampleLayout().IsVectorStoredAsImage() // legacy
-            )
-            RuntimeError("%ls %ls operation: Input must be a vector, tensor shape [%s] not allowed.", NodeName().c_str(), OperationName().c_str(), string(Input(0)->GetSampleLayout()).c_str());
-        SetDims(TensorShape(m_sliceHeight), HasMBLayout());
+        auto sampleLayout = Input(0)->GetSampleLayout();
+        if (isFinalValidationPass && sampleLayout[0] < m_startIndex + m_sliceHeight)
+            RuntimeError("%ls %ls operation: m_startIndex + m_sliceHeight (%d) exceeds number of rows in the input ([%s]).", NodeName().c_str(), OperationName().c_str(), (int)(m_startIndex + m_sliceHeight), string(sampleLayout).c_str());
+        if (sampleLayout[0] >= m_startIndex + m_sliceHeight) // (this guards against failing an out-of-bounds error if not isFinalValidationPass)
+            sampleLayout.NarrowTo(0, m_startIndex, m_startIndex + m_sliceHeight);
+
+        SetDims(TensorShape(sampleLayout.GetDims()), HasMBLayout());
     }
 
 private:
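Note: the tensorized RowSliceNode expresses both passes as a narrowed TensorView over dimension 0: ForwardProp overwrites the output (AssignCopyOf), while BackpropTo accumulates into the input gradient (AddCopyOf), since an input may receive gradient contributions from several consumers. A minimal sketch of the index arithmetic behind narrowing dimension 0 (illustrative, not CNTK's implementation):

    #include <cstddef>

    // Narrowing dimension 0 of a column-major tensor to [startIndex, startIndex + sliceHeight)
    // only moves the start offset; all strides stay unchanged.
    struct Slice { std::size_t offset, rows; };

    Slice NarrowDim0(std::size_t offset, std::size_t stride0, std::size_t startIndex, std::size_t sliceHeight)
    {
        return Slice{offset + startIndex * stride0, sliceHeight};
    }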
@@ -604,24 +602,6 @@ public:
     {
     }
 
-    virtual void Validate(bool isFinalValidationPass) override
-    {
-        Base::Validate(isFinalValidationPass);
-        m_pMBLayout = nullptr;
-
-        if (isFinalValidationPass && Input(0)->HasMBLayout())
-            InvalidArgument("%ls %ls operation cannot operate on minibatch data (which have a layout)", NodeName().c_str(), OperationName().c_str());
-
-        size_t dim = Input(0)->GetAsMatrixNumCols();
-        if (isFinalValidationPass && dim != Input(0)->GetAsMatrixNumRows())
-            InvalidArgument("%ls %ls operation requires a square matrix as its input.", NodeName().c_str(), OperationName().c_str());
-
-        if (Input(0)->HasSampleLayout())
-            fprintf(stderr, "WARNING: Diagonal operation cannot inherit image size information from its child. Image size info is lost.\n");
-
-        SetDims(TensorShape(1, dim), false);
-    }
-
     virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
     {
         Input(0)->ValueAsMatrix().AssignDiagonalValuesTo(ValueAsMatrix()); // TODO: use tensor lib; this is a stride operation
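Note: the TODO above ("this is a stride operation") refers to the fact that the main diagonal of a dense column-major n x n matrix is just every (n+1)-th element, so it could be expressed as a strided tensor view. A hedged standalone sketch:

    #include <cstddef>

    // Copy the main diagonal of a column-major n x n matrix into diag[0..n).
    // Element (i, i) lives at offset i * (n + 1), i.e. the diagonal is a strided view.
    template <typename ElemType>
    void CopyDiagonal(const ElemType* mat, std::size_t n, ElemType* diag)
    {
        for (std::size_t i = 0; i < n; i++)
            diag[i] = mat[i * (n + 1)];
    }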
@@ -646,19 +626,25 @@ public:
         inputGradientValues.SetDiagonalValue(diag);
     }
 
-    virtual bool OutputUsedInComputingInputNodesGradients() const override
-    {
-        // The DiagonalNode does not require its output value for computing
-        // the gradients of its input nodes
-        return false;
-    }
+    virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
+    virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
 
-    virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
+    virtual void Validate(bool isFinalValidationPass) override
     {
-        // The DiagonalNode does not require any of it's input's values for computing
-        // the gradients of its input nodes
-        UNREFERENCED_PARAMETER(childIndex);
-        return false;
+        Base::Validate(isFinalValidationPass);
+        m_pMBLayout = nullptr;
+
+        if (isFinalValidationPass && Input(0)->HasMBLayout())
+            InvalidArgument("%ls %ls operation cannot operate on minibatch data (which have a layout)", NodeName().c_str(), OperationName().c_str());
+
+        size_t dim = Input(0)->GetAsMatrixNumCols();
+        if (isFinalValidationPass && dim != Input(0)->GetAsMatrixNumRows())
+            InvalidArgument("%ls %ls operation requires a square matrix as its input.", NodeName().c_str(), OperationName().c_str());
+
+        if (Input(0)->HasSampleLayout())
+            fprintf(stderr, "WARNING: Diagonal operation cannot inherit image size information from its child. Image size info is lost.\n");
+
+        SetDims(TensorShape(1, dim), false);
     }
 };
 
@@ -839,13 +825,6 @@ public:
         }
     }
 
-    virtual void Save(File& fstream) const override
-    {
-        Base::Save(fstream);
-        fstream << m_numTargetRows;
-        m_targetImageLayout.Save(fstream);
-    }
-
     virtual void Load(File& fstream, size_t modelVersion) override
     {
         Base::Load(fstream, modelVersion);
@@ -853,6 +832,13 @@ public:
         m_targetImageLayout.Load(fstream, /*acceptLegacyFormat=*/true);
     }
 
+    virtual void Save(File& fstream) const override
+    {
+        Base::Save(fstream);
+        fstream << m_numTargetRows;
+        m_targetImageLayout.Save(fstream);
+    }
+
     virtual void /*IComputationNode::*/ PrintSelfBeforeValidation() const override
     {
         fprintf(stderr, "\nValidating --> %ls = %ls", NodeName().c_str(), OperationName().c_str());
@@ -871,56 +857,6 @@ public:
         // BUGBUG: This interpretaion as image dims is only correct for the 'legacy format, not for cudnn.
     }
 
-    virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
-    {
-        Base::Validate(isFinalValidationPass);
-        if (factor() == 1) // canonical case: keeps the MBLayout(e.g. only changing the TensorShape)
-            m_pMBLayout = Input(0)->GetMBLayout();
-        else if (Input(0)->HasMBLayout())
-        {
-            if (!m_pMBLayout)
-                m_pMBLayout = make_shared<MBLayout>(); // mini-batch data: this generates a new layout
-        }
-        else
-            assert(!m_pMBLayout); // reshaping non-mini-batch data
-
-        size_t newCols = 1; // dummy
-        if (!m_pMBLayout)
-        {
-            size_t rows = Input(0)->GetAsMatrixNumRows(), cols = Input(0)->GetAsMatrixNumCols();
-            newCols = cols * rows / m_numTargetRows;
-            if (isFinalValidationPass)
-            {
-                if ((m_numTargetRows > rows && m_numTargetRows % rows != 0) || // grouping columns
-                    (m_numTargetRows < rows && rows % m_numTargetRows != 0)) // splitting columns
-                    InvalidArgument("%ls %ls operation: output row dimension %d is not an integer multiple or divisor of input dimension %d", NodeName().c_str(), OperationName().c_str(), (int) m_numTargetRows, (int) rows);
-                if (rows * cols != m_numTargetRows * newCols)
-                    LogicError("%ls %ls operation: unexpected dimension mismatch", NodeName().c_str(), OperationName().c_str());
-            }
-        }
-
-        // patch up m_targetImageLayout, which was originally a construction parameter
-        InferTargetSampleLayout();
-
-        // setting any dimension to 0 means lose the tensor, flatten to vector
-        if (m_targetImageLayout.GetNumElements() == 0)
-        {
-            if (Input(0)->HasSampleLayout())
-                fprintf(stderr, "WARNING: Reshape operation cannot inherit image size information from its child. Image size info is lost.\n");
-            // TODO: We need to decide what reshaping means in presence of a tensor.
-            if (HasMBLayout())
-                SetDims(TensorShape(m_numTargetRows), true);
-            else
-                SetDims(TensorShape(m_numTargetRows, newCols), false);
-        }
-        else
-        {
-            if (m_numTargetRows != m_targetImageLayout.GetNumElements())
-                LogicError("LegacyReshapeNode: InferTargetSampleLayout() computed a sample layout [%s] that mismatches m_numTargetRows %d.", string(m_targetImageLayout).c_str(), (int) m_numTargetRows);
-            SetDims(m_targetImageLayout, HasMBLayout());
-        }
-    }
-
     // TODO: Clarify/resolve the semantic overlap between BeginForwardProp() and UpdateFunctionMBSize().
     virtual void /*IComputationNode::*/ BeginForwardProp() override
     {
@@ -1002,19 +938,57 @@ public:
         }
     }
 
-    virtual bool OutputUsedInComputingInputNodesGradients() const override
-    {
-        // The LegacyReshapeNode does not require its output value for computing
-        // the gradients of its input nodes
-        return false;
-    }
+    virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
+    virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
 
-    virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
+    virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
     {
-        // The LegacyReshapeNode does not require any of it's input's values for computing
-        // the gradients of its input nodes
-        UNREFERENCED_PARAMETER(childIndex);
-        return false;
+        Base::Validate(isFinalValidationPass);
+        if (factor() == 1) // canonical case: keeps the MBLayout(e.g. only changing the TensorShape)
+            m_pMBLayout = Input(0)->GetMBLayout();
+        else if (Input(0)->HasMBLayout())
+        {
+            if (!m_pMBLayout)
+                m_pMBLayout = make_shared<MBLayout>(); // mini-batch data: this generates a new layout
+        }
+        else
+            assert(!m_pMBLayout); // reshaping non-mini-batch data
+
+        size_t newCols = 1; // dummy
+        if (!m_pMBLayout)
+        {
+            size_t rows = Input(0)->GetAsMatrixNumRows(), cols = Input(0)->GetAsMatrixNumCols();
+            newCols = cols * rows / m_numTargetRows;
+            if (isFinalValidationPass)
+            {
+                if ((m_numTargetRows > rows && m_numTargetRows % rows != 0) || // grouping columns
+                    (m_numTargetRows < rows && rows % m_numTargetRows != 0)) // splitting columns
+                    InvalidArgument("%ls %ls operation: output row dimension %d is not an integer multiple or divisor of input dimension %d", NodeName().c_str(), OperationName().c_str(), (int) m_numTargetRows, (int) rows);
+                if (rows * cols != m_numTargetRows * newCols)
+                    LogicError("%ls %ls operation: unexpected dimension mismatch", NodeName().c_str(), OperationName().c_str());
+            }
+        }
+
+        // patch up m_targetImageLayout, which was originally a construction parameter
+        InferTargetSampleLayout();
+
+        // setting any dimension to 0 means lose the tensor, flatten to vector
+        if (m_targetImageLayout.GetNumElements() == 0)
+        {
+            if (Input(0)->HasSampleLayout())
+                fprintf(stderr, "WARNING: Reshape operation cannot inherit image size information from its child. Image size info is lost.\n");
+            // TODO: We need to decide what reshaping means in presence of a tensor.
+            if (HasMBLayout())
+                SetDims(TensorShape(m_numTargetRows), true);
+            else
+                SetDims(TensorShape(m_numTargetRows, newCols), false);
+        }
+        else
+        {
+            if (m_numTargetRows != m_targetImageLayout.GetNumElements())
+                LogicError("LegacyReshapeNode: InferTargetSampleLayout() computed a sample layout [%s] that mismatches m_numTargetRows %d.", string(m_targetImageLayout).c_str(), (int) m_numTargetRows);
+            SetDims(m_targetImageLayout, HasMBLayout());
+        }
+    }
 
 private:
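Note: a worked instance of the validation arithmetic above (numbers are made up): for a 100 x 20 input and m_numTargetRows = 50, newCols = cols * rows / m_numTargetRows = 20 * 100 / 50 = 40, and the element-count check holds since rows * cols = 2000 = m_numTargetRows * newCols. A value such as m_numTargetRows = 30 would be rejected, because 30 neither divides 100 nor is a multiple of it.

    #include <cassert>
    #include <cstddef>

    int main()
    {
        std::size_t rows = 100, cols = 20, numTargetRows = 50; // illustrative values
        std::size_t newCols = cols * rows / numTargetRows;     // = 40
        assert(rows * cols == numTargetRows * newCols);        // 2000 == 2000
        assert(rows % numTargetRows == 0);                     // splitting: 50 divides 100 evenly
        return 0;
    }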
@@ -313,11 +313,8 @@ public:
 protected:
     void Clear()
     {
-        if (m_matrixName != nullptr)
-        {
-            delete[] m_matrixName;
-            m_matrixName = nullptr;
-        }
+        delete[] m_matrixName;
+        m_matrixName = nullptr;
     }
 
 protected:
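Note: the simplification is safe because delete[] (like delete) is defined to be a no-op on a null pointer, so the removed guard added nothing. A minimal demonstration:

    int main()
    {
        wchar_t* matrixName = nullptr;
        delete[] matrixName; // well-defined: deleting a null pointer does nothing
        return 0;
    }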
@@ -330,6 +327,7 @@ protected:
    ElemType* m_pArray;
    mutable DEVICEID_TYPE m_computeDevice; // current GPU device Id or CPUDEVICE
    size_t m_nz;                           // Number of non-zero elements for sparse matrices (unused in other formats)
-    wchar_t* m_matrixName;
+    wchar_t* m_matrixName; // TODO: Use std::wstring?
 };
 
 } } }
@@ -94,10 +94,23 @@ const char* CudaErrString<cudaError_t>(cudaError_t x)
     return cudaGetErrorString(x);
 }
 template <>
-const char* CudaErrString<cublasStatus_t>(cublasStatus_t)
+const char* CudaErrString<cublasStatus_t>(cublasStatus_t e)
 {
     cudaDeviceSynchronize();
-    return "(see cublas_api.h & look for cublasStatus_t or CUBLAS_STATUS_xxx)";
+    switch (e)
+    {
+    case CUBLAS_STATUS_SUCCESS:          return "CUBLAS_STATUS_SUCCESS";
+    case CUBLAS_STATUS_NOT_INITIALIZED:  return "CUBLAS_STATUS_NOT_INITIALIZED";
+    case CUBLAS_STATUS_ALLOC_FAILED:     return "CUBLAS_STATUS_ALLOC_FAILED";
+    case CUBLAS_STATUS_INVALID_VALUE:    return "CUBLAS_STATUS_INVALID_VALUE";
+    case CUBLAS_STATUS_ARCH_MISMATCH:    return "CUBLAS_STATUS_ARCH_MISMATCH";
+    case CUBLAS_STATUS_MAPPING_ERROR:    return "CUBLAS_STATUS_MAPPING_ERROR";
+    case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
+    case CUBLAS_STATUS_INTERNAL_ERROR:   return "CUBLAS_STATUS_INTERNAL_ERROR";
+    case CUBLAS_STATUS_NOT_SUPPORTED:    return "CUBLAS_STATUS_NOT_SUPPORTED";
+    case CUBLAS_STATUS_LICENSE_ERROR:    return "CUBLAS_STATUS_LICENSE_ERROR";
+    default:                             return "(look for CUBLAS_STATUS_xxx in cublas_api.h)";
+    }
 }
 template <>
 const char* CudaErrString<curandStatus>(curandStatus)
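Note: the new overload turns an opaque cublasStatus_t into its enumerator name instead of pointing the user at a header. A hedged sketch of how a caller might use it (CudaCall's real body is not shown in this diff; ReportCublasFailure is hypothetical):

    #include <cstdio>
    #include <cublas_v2.h>

    template <typename ERRTYPE>
    const char* CudaErrString(ERRTYPE x); // specializations defined in the .cu file, as above

    // Hypothetical failure report, roughly what an error path could print:
    void ReportCublasFailure(cublasStatus_t rc, const char* exprString)
    {
        std::fprintf(stderr, "%s failed: %s\n", exprString, CudaErrString(rc));
    }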
@@ -524,9 +524,8 @@ public:
 };
 
 typedef GPUMatrix<float> GPUSingleMatrix;
-}
-}
-}
+}}}
 
 // Error handling
 template <typename ERRTYPE>
|
|||
}
|
||||
}
|
||||
|
||||
#define CUDA_CALL(expr) (CudaCall((expr), #expr, "CUDA", cudaSuccess))
|
||||
#define CUBLAS_CALL(expr) (CudaCall((expr), #expr, "CUBLAS", CUBLAS_STATUS_SUCCESS))
|
||||
#define CUDA_CALL(expr) (CudaCall((expr), #expr, "CUDA", cudaSuccess))
|
||||
#define CUBLAS_CALL(expr) (CudaCall((expr), #expr, "CUBLAS", CUBLAS_STATUS_SUCCESS))
|
||||
#define CUSPARSE_CALL(expr) (CudaCall((expr), #expr, "CUSPARSE", CUSPARSE_STATUS_SUCCESS))
|
||||
#define CURAND_CALL(expr) (CudaCall((expr), #expr, "CURAND", CURAND_STATUS_SUCCESS))
|
||||
#define CUDNN_CALL(expr) (CudaCall((expr), #expr, "cuDNN", CUDNN_STATUS_SUCCESS))
|
||||
#define CURAND_CALL(expr) (CudaCall((expr), #expr, "CURAND", CURAND_STATUS_SUCCESS))
|
||||
#define CUDNN_CALL(expr) (CudaCall((expr), #expr, "cuDNN", CUDNN_STATUS_SUCCESS))
|
||||
|
|
|
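Note: the old and new macro lines above differ only in alignment whitespace, which the rendered page collapsed. The *_CALL macros wrap a raw API call, stringizing the expression (#expr) for the error message and comparing the return code against the library's success constant. A hedged usage sketch (function name is illustrative):

    #include <cstddef>
    #include <cuda_runtime.h>

    // Assumes the header defining CUDA_CALL / CudaCall (as above) is included.
    void AllocateDeviceBuffer(float** devPtr, std::size_t count)
    {
        CUDA_CALL(cudaMalloc((void**)devPtr, count * sizeof(float))); // reports through CudaCall on failure
    }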
@@ -36,17 +36,19 @@ using namespace std;
 // construction
 // -------------------------------------------------------------------
 
-// cast a matrix as a TensorView
+// main constructor (all constructors except the default one route through this)
 template <class ElemType>
-TensorView<ElemType>::TensorView(const Matrix<ElemType>& sob)
-    : m_sob(sob.AsReference()), m_shape(TensorShape(array<size_t, 2>{sob.GetNumRows(), sob.GetNumCols()}))
-{
-}
-// reshape a TensorView
-template <class ElemType>
-TensorView<ElemType>::TensorView(const TensorView<ElemType>& other, const TensorShape& shape)
-    : m_sob(other.m_sob.AsReference()), m_shape(shape)
+TensorView<ElemType>::TensorView(const Matrix<ElemType>& sob, const TensorShape& shape)
+    : m_sob(sob.AsReference()), m_shape(shape)
 {
+#ifdef _DEBUG
+    // check bounds of TensorShape against underlying storage object
+    // This is useful to detect errors like passing a matrix from the wrong input.
+    const auto r = shape.GetLocationRange();
+    const auto n = m_sob.GetNumElements();
+    if (r.first < 0 || (size_t)r.second > n)
+        LogicError("TensorView: Shape bounds [%d,%d) exceed bounds of underlying storage object [0,%d).", (int) r.first, (int) r.second, (int) n);
+#endif
 }
 
 // -------------------------------------------------------------------
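Note: the new _DEBUG bounds check composes GetLocationRange() (added earlier in this commit) with the storage object's element count: a Matrix with, say, 6 elements viewed through a TensorShape whose location range is [0, 7) would trip the LogicError, because the view could address one element past the end of the storage. The essence of the guard, with illustrative types:

    #include <cstddef>
    #include <utility>

    bool ShapeFitsStorage(std::pair<std::ptrdiff_t, std::ptrdiff_t> locationRange, std::size_t numElements)
    {
        return locationRange.first >= 0 && (std::size_t)locationRange.second <= numElements;
    }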
@@ -25,13 +25,16 @@ public:
     // construction
     // -------------------------------------------------------------------
 
-    // cast a matrix storage object (SOB) as a TensorView (without shape change)
-    TensorView(const Matrix<ElemType>& sob);
-    // reinterpret a matrix storage object (SOB) as a TensorView with a given TensorShape --this is the main constructor
-    TensorView(const Matrix<ElemType>& sob, const TensorShape& shape);
+    // cast a Matrix as a 2D TensorView (without shape change)
+    TensorView(const Matrix<ElemType>& sob)
+        : m_sob(sob.AsReference()), m_shape(TensorShape(array<size_t, 2>{sob.GetNumRows(), sob.GetNumCols()}))
+    {
+    }
     // reshape a TensorView
-    TensorView(const TensorView<ElemType>& other, const TensorShape& shape);
+    TensorView(const TensorView<ElemType>& other, const TensorShape& shape)
+        : m_sob(other.m_sob.AsReference()), m_shape(shape)
+    {
+    }
+    // reinterpret a SOB as a TensorView with a given TensorShape
+    TensorView(const Matrix<ElemType>& sob, const TensorShape& shape)
+        : TensorView(TensorView(sob) /*cast as a TensorView*/, shape /*with a shape*/)
     {
     }
     // empty constructor
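Note: the comment markers in the delegating constructor spell out the design: a (storage, shape) pair is first cast to a 2D view and then reshaped, so the shape-carrying constructors funnel through one code path. A hedged usage sketch (Matrix construction details elided; shape values illustrative):

    // Given some Matrix<float> m (e.g. 6 x 1), view it as a 2 x 3 tensor:
    //     TensorView<float> view(m, TensorShape(2, 3));
    // which, via the delegating constructor, is equivalent to:
    //     TensorView<float> view(TensorView<float>(m), TensorShape(2, 3));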
@@ -37,6 +37,10 @@ Using full BrainScript configuration
 
 COMMAND: --cd $(SolutionDir)Tests\EndToEndTests\Speech\Data -f $(SolutionDir)Tests\EndToEndTests\Speech\LSTM\lstm.bs -D stderr='$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance\models\cntkSpeech.dnn.log' -D RunDir='$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance' -D NdlDir='$(SolutionDir)Tests\EndToEndTests\Speech\LSTM' -D DataDir='.' -D DeviceId='Auto' -D Truncated=false -D speechTrain=[reader=[nbruttsineachrecurrentiter=1];SGD=[epochSize=2560;maxEpochs=2;numMBsToShowResult=1]] -D makeMode=false
 
+--- Speech\AN4:
+
+COMMAND: configFile=$(SolutionDir)Examples\Speech\AN4\Config\LSTM-NDL.config currentDirectory=$(SolutionDir)Examples\Speech\AN4\Data RunDir=$(SolutionDir)Examples\RunDir\Speech\AN4 DataDir=$(SolutionDir)Examples\Speech\AN4\Data ConfigDir=$(SolutionDir)Examples\Speech\AN4\Config OutputDir=$(SolutionDir)Examples\RunDir\Speech\AN4 stderr=$(SolutionDir)Examples\RunDir\Speech\AN4\cntkSpeech.dnn.log DeviceId=auto speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=64]] parallelTrain=false makeMode=false
+
 --- Speech\DiscriminativePreTraining: --currently fails with MEL error 'Parameter name could not be resolved 'HL2.y'
 
 COMMAND: currentDirectory=$(SolutionDir)Tests\EndToEndTests\Speech\Data configFile=..\DNN\DiscriminativePreTraining\cntk_dpt.config stderr=$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\DNN\DiscriminativePreTraining\models\cntkSpeech.dnn.log ConfigDir=$(SolutionDir)Tests\EndToEndTests\Speech\DNN\DiscriminativePreTraining RunDir=..\RunDir\DNN\DiscriminativePreTraining DataDir=. DeviceId=auto makeMode=false