Merge branch 'master' into qiwye/multiverso

Qiwei Ye 2015-12-19 12:43:46 +08:00
Parents: b99f3e2f15 ef80d86ded
Commit: 9664daccb0
28 changed files with 4764 additions and 8099 deletions

View file

@ -9154,7 +9154,7 @@ L
\begin_layout Standard
\begin_inset Formula
\begin{eqnarray}
\alpha_{t}\left(i\right) & \leftarrow & h_{it}+logadd_{k}\left(\delta_{t-1}(k)+\eta a_{ki}\right)\\
\alpha_{t}\left(i\right) & \leftarrow & h_{it}+LogAdd_{k}\left(\delta_{t-1}(k)+\eta a_{ki}\right)\\
\mathbf{\frac{\partial R}{\partial\delta_{t-1}(i)}} & \leftarrow & \sum_{j}\frac{\partial C_{logadd}}{\partial\delta_{t}(j)}\frac{\exp(\delta_{t-1}(i)+a_{i,j})}{\sum_{k}\exp(\delta_{t-1}(k)+a_{k,j})}\\
\mathbf{\frac{\partial R}{\partial\delta_{T}(i)}} & \leftarrow & \frac{\exp(\delta_{T}(i))}{\sum_{k}\exp(\delta_{T}(k))}\\
\frac{\partial R}{\partial h_{t}(i)} & \leftarrow & l_{t}(i)-\frac{\partial R}{\partial\delta_{t}(i)}\\
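The LogAdd referenced in the recursion above is the usual numerically stable two-argument log-sum-exp, log(exp(x) + exp(y)). A minimal sketch of the idea (illustrative only; the logadd being removed further down in this diff additionally clamps against LZERO/MINLOGEXP):

#include <algorithm>
#include <cmath>

// log(exp(x) + exp(y)) = x + log(1 + exp(y - x)) for x >= y, so the exp never overflows
static double LogAddSketch(double x, double y)
{
    if (x < y)
        std::swap(x, y);
    return x + std::log1p(std::exp(y - x));
}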

View file

@ -315,6 +315,7 @@ LMSEQUENCEREADER_SRC =\
$(SOURCEDIR)/Readers/LMSequenceReader/Exports.cpp \
$(SOURCEDIR)/Readers/LMSequenceReader/SequenceParser.cpp \
$(SOURCEDIR)/Readers/LMSequenceReader/SequenceReader.cpp \
$(SOURCEDIR)/Readers/LMSequenceReader/SequenceWriter.cpp \
LMSEQUENCEREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(LMSEQUENCEREADER_SRC))

View file

@ -11,25 +11,8 @@
#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _
#include "stdafx.h"
#include "Actions.h"
#include <string>
#include <chrono>
#include <algorithm>
#if defined(_WIN32)
#include "io.h"
#endif
#include "buildinfo.h"
#include "hostname.h"
#ifdef LEAKDETECT
#include "vld.h" // for memory leak detection
#endif
#include <vector>
#include <iostream>
#include <queue>
#include <set>
#include <memory>
#include "Basics.h"
#include "Actions.h"
#include "ComputationNetwork.h"
#include "ComputationNode.h"
#include "DataReader.h"
@ -54,6 +37,23 @@
#include "BrainScriptEvaluator.h"
#include "BrainScriptParser.h"
#include <string>
#include <chrono>
#include <algorithm>
#if defined(_WIN32)
#include "io.h"
#endif
#include "buildinfo.h"
#include "hostname.h"
#ifdef LEAKDETECT
#include "vld.h" // for memory leak detection
#endif
#include <vector>
#include <iostream>
#include <queue>
#include <set>
#include <memory>
#ifndef let
#define let const auto
#endif

View file

@ -107,24 +107,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void Invalidate() { m_dims.assign(3, SIZE_MAX); } // TODO: clean up the valid/invalid situation (this is currently done inconsistently). Also this object is immutable.
void Save(File& fstream) const
// verify that this refers to a dense matrix (no strides)
void VerifyIsDense() const
{
if (m_offset != 0)
LogicError("TensorShape::Save(): Cannot serialize TensorShape for slices.");
LogicError("TensorShape: A dense TensorShape expected. Offset %d not allowed.", (int)m_offset);
for (size_t k = 0; k < m_dims.size(); k++) // (TODO: we can save one multiplication here)
{
ptrdiff_t stride = k > 0 ? m_strides[k - 1] * (ptrdiff_t)m_dims[k - 1] : 1;
if (m_strides[k] != stride)
LogicError("TensorShape: A dense TensorShape expected. Dimension %d is not.", (int)k);
}
}
void Save(File& fstream) const
{
VerifyIsDense();
// saving as 32-bit ints. This allows to continue to support the old format (size_t W, H, C)
fstream << (uint32_t)m_dims.size();
ptrdiff_t mul = 1;
for (size_t k = 0; k < m_dims.size(); k++)
for (auto dim : m_dims)
{
auto dim = m_dims[k];
if (dim > UINT32_MAX)
LogicError("TensorShape::Save(): Tensor dimensions %s out of bounds (> 4G).", string(*this).c_str());
fstream << (uint32_t)dim;
if (m_steps[k] != mul)
LogicError("TensorShape::Save(): Cannot serialize TensorShape for slices.");
mul *= (ptrdiff_t)dim;
}
}
void Load(File& fstream)
{
// format: uint32_t n, dim[0], dim[1], ..., dim[n-1]
@ -154,8 +162,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// accessors
size_t GetDim(size_t k) const { return m_dims[k]; }
size_t GetNumDims() const { return m_dims.size(); }
size_t GetNumElements() const { size_t res = 1; for (auto & dim : m_dims) res *= dim; return res; }
ptrdiff_t GetStep(size_t k) const { return m_steps[k]; }
size_t GetNumElements() const { size_t res = 1; for (auto & dim : m_dims) res *= dim; return res; } // in slice
size_t GetOffset() const { return m_offset; }
// vector-like accessors
@ -163,12 +170,31 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t size() const { return GetNumDims(); }
const std::vector<size_t> & GetDims() const { return m_dims; } // get all, e.g. for logging or for constructing derived tensors with edited dimensions
const std::vector<ptrdiff_t> & GetStrides() const { return m_strides; }
// interpretation as an image tensor
size_t GetNumChannels() const { return m_dims[0]; }
size_t GetWidth() const { return m_dims[1]; }
size_t GetHeight() const { return m_dims[2]; }
// indexing
// Determines the offset into the underlying element array for a given multi-dimensional index.
// This function is for reference. Probably not often used.
size_t Locate(const std::vector<size_t> & index) const
{
ptrdiff_t location = m_offset;
for (size_t k = 0; k < index.size(); k++)
{
size_t dim = k < size() ? m_dims[k] : 1; // dimensions are bottomless
if (index[k] >= dim)
LogicError("Locate: Tensor index[%d]=%d exceeds bound %d.", (int)k, (int)index[k], (int)dim);
location += (ptrdiff_t)index[k] * m_strides[k]; // strides may be negative
}
if (location < 0 || (size_t)location >= m_allocation)
LogicError("Locate: Tensor index out of bounds.");
return (size_t)location;
}
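A quick usage sketch of Locate (illustrative, using the two-argument dense constructor that appears elsewhere in this diff): a dense 3 x 4 shape has m_strides = (1, 3), so element (2, 1) lands at offset 0 + 2*1 + 1*3 = 5, i.e. column-major addressing.

TensorShape shape(3, 4);                                  // dense: m_dims = (3, 4), m_strides = (1, 3)
size_t off = shape.Locate(std::vector<size_t>{ 2, 1 });   // == 5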
// helpers for tensor operations
bool CanFlatten(size_t k) const // can dims k and k-1 be flattened into a single vector? (do they form a matrix without stride)
{
@ -179,66 +205,145 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_dims[k] == 1 || m_dims[k - 1] == 1) // both are broadcasting or scalar--we don't care about stride in this case
return true;
else
return m_steps[k] == m_steps[k - 1] * (ptrdiff_t)m_dims[k - 1];
return m_strides[k] == m_strides[k - 1] * (ptrdiff_t)m_dims[k - 1];
}
// editing functions
// These all create new TensorShape objects.
TensorShape Flatten(size_t k) const // flatten [k] with [k-1]
{
TensorShape result = *this;
if (!CanFlatten(k))
LogicError("Flatten() cannot flatten dimensions with gaps");
// We reshape local (I x J) sub-matrices to (1 x I*J) sub-matrices.
// We merge to right so that we can merge multiple by looping left-to-right.
// m_dims = I J K L
// m_strides = 1 I I*J I*J*K
// flattening J and K
// m_dims = I 1 J*K L
// m_strides = 1 I I I*J*K
// TODO: rethink whether this is correct for example of negative strides
result.m_dims[k] *= result.m_dims[k - 1];
result.m_dims[k - 1] = 1;
result.m_strides[k] = /*result.m_dims[k - 1] *, it's 1 */ result.m_strides[k - 1];
return result;
}
TensorShape DropDims(const std::vector<bool> & toDrop) const // remove dimension
{
// this deletes a dimension while retaining strides
// This implies a slice to [0] for this dimension.
TensorShape result = *this;
size_t j = 0;
for (size_t k = 0; k < size(); k++)
{
if (toDrop[k])
continue;
else
{
// example
// m_dims = I 1 J K
// m_strides = 1 I I I*J
// dropping the second dimension
// m_dims = I % J K
// m_strides = 1 % I I*J
result.m_dims[j] = result.m_dims[k];
result.m_strides[j] = result.m_strides[k];
j++;
}
}
result.m_dims.resize(j);
result.m_strides.resize(j);
return result;
}
TensorShape WithBroadcastStrides() const // set the stride of every broadcasting (dim-1) dimension to 0
{
TensorShape result = *this;
for (size_t k = 0; k < size(); k++)
if (result.m_dims[k] == 1)
result.m_strides[k] = 0;
return result;
}
TensorShape Pad(size_t numDims) const // append singleton dimensions
{
VerifyIsDense();
if (numDims < GetNumDims())
LogicError("Pad() cannot drop a shorten the dimensions.");
else if (numDims == GetNumDims())
return *this;
auto dims = GetDims();
dims.resize(numDims, 1);
return TensorShape(dims);
}
TensorShape Concat(const TensorShape & other) const // concatenate
{
auto dims = GetDims();
auto otherDims = other.GetDims();
dims.insert(dims.end(), otherDims.begin(), otherDims.end());
return TensorShape(dims);
}
// pretty-printing. Returns tensor dims in the form "I x J x K".
operator std::string() const
{
std::string s;
for (const auto & dim : m_dims)
for (size_t k = 0; k < size(); k++)
{
if (!s.empty())
s.append(" x ");
s.append(std::to_string(dim));
s.append(std::to_string(m_dims[k]));
}
#ifdef _DEBUG // also emit the strides, easier for debugging
s.append(" {");
for (size_t k = 0; k < size(); k++)
{
if (k > 0)
s.append(",");
s.append(std::to_string(m_strides[k]));
}
s.append("}");
#endif
return s;
}
private:
// reset m_steps and m_offset to represent a canonical no-strides tensor
// reset m_strides and m_offset to represent a canonical no-strides tensor
void InitAsNoSlice()
{
m_offset = 0;
m_steps.resize(m_dims.size());
ptrdiff_t mul = 1;
m_strides.resize(m_dims.size());
for (size_t k = 0; k < m_dims.size(); k++)
{
m_steps[k] = (ptrdiff_t)mul;
mul *= m_dims[k];
}
m_strides[k] = k > 0 ? m_strides[k - 1] * (ptrdiff_t)m_dims[k - 1] : 1;
m_allocation = m_dims.empty() ? 0 : m_dims.back() * (size_t)m_strides.back();
}
private:
std::vector<size_t> m_dims; // dimensions of tensor or tensor slice. The size of the box.
std::vector<ptrdiff_t> m_steps; // dimension gets multiplied by this for computing the index offset. How to hop to the next element in dimension[k]. Stride magic happening here!
std::vector<ptrdiff_t> m_strides; // dimension gets multiplied by this for computing the index offset. How to hop to the next element in dimension[k]. Stride magic happening here!
size_t m_offset; // offset to element(0,0,...,0). May be non-0 in case of slicing.
// For a regular tensor, there are no strides, m_steps[k] = m_steps[k-1] * m_dims[k-1]. This is how TensorShapes are created from dimensions.
size_t m_allocation; // allocation size of original dense tensor
// For a regular tensor, there are no strides, m_strides[k] = m_strides[k-1] * m_dims[k-1]. This is how TensorShapes are created from dimensions.
// For views into existing tensors, we do stride shenanigans to implement broadcasting (plus magic tricks). Examples:
// To traverse a 5 x 10 matrix with column order reversed:
// - op.dims = (5 x 10)
// - m_offset points to element (0,9)
// - m_steps[0] = 1 // regular forward iteration within each column
// - m_steps[1] = -5 // backward iteration over columns
// - m_strides = (1, -5) // backward iteration over columns
// To compute matrix C(13 x 42) = vector A(13 x 1) + matrix B(13 x 42):
// - op = sum
// - op.dims = (13 x 42)
// - *.m_steps[0] = 1 // forward iteration through each column
// - C.m_steps[1] = 13 // forward iteration over columns of B--defines the for loop
// - B.m_steps[1] = 13 // forward iteration over columns of B--iterates in sync with C
// - A.m_steps[1] = 0 // A, however, is stuck in column 0 forever
// - C.m_strides = (1, 13) // forward iteration over columns of B--defines the for loop
// - B.m_strides = (1, 13) // forward iteration over columns of B--iterates in sync with C
// - A.m_strides = (1, 0) // A, however, is stuck in column 0 forever
// Matrix product: C(I x K) = A(I x J) * B(J x K) --Note: Likely not RAM-bandwidth efficient!
// - op = mul
// - op.dims = (I x J x K) // iteration dimensions
// - C.m_steps = (1, 0, I) // inverse broadcasting for inner dimension
// - A.m_steps = (1, I, 0)
// - B.m_steps = (0, 1, J)
// - C.m_strides = (1, 0, I) // inverse broadcasting for inner dimension
// - A.m_strides = (1, I, 0)
// - B.m_strides = (0, 1, J)
// Convolution of time signals (without padding): Y(T-N+1) = X(T) * H(N): --Note: Likely not RAM-bandwidth efficient!
// - op = mul
// - op.dims = (T-N+1 x N) // iteration dimensions
// - Y.m_steps = (1, 0) // inverse broadcasting: this sums up the individual products
// - X.m_steps = (1, 1) // shift window by 1 for each output sample
// - H.m_steps = (0, -1) // reuse for each output sample; iterate in reverse order for convolution
// - Y.m_strides = (1, 0) // inverse broadcasting: this sums up the individual products
// - X.m_strides = (1, 1) // shift window by 1 for each output sample
// - H.m_strides = (0, -1) // reuse for each output sample; iterate in reverse order for convolution
// - H.m_offset = N - 1 // begin with last element (reverse order for convolution)
// TODO: double-check all these
// TODO: Does the same trick work for 2D images?
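To make the broadcasting example above concrete: a kernel consuming such shapes only combines each operand's own strides with the shared iteration dims, and a stride of 0 simply pins that operand. A rough sketch (illustrative, not the library's TensorView code) of the C(13 x 42) = A(13 x 1) + B(13 x 42) case:

static void AddBroadcastColumnSketch(const float* pA, const float* pB, float* pC)
{
    const size_t I = 13, J = 42;          // op.dims from the example above
    for (size_t j = 0; j < J; j++)        // column loop: stride 13 for B and C, stride 0 for A
        for (size_t i = 0; i < I; i++)    // within-column loop: stride 1 for all operands
            pC[i * 1 + j * 13] = pA[i * 1 + j * 0] + pB[i * 1 + j * 13];
}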

View file

@ -108,12 +108,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_numParallelSequences = numParallelSequences;
m_numTimeSteps = numTimeSteps;
// allocate lookup tables (note: except at the start, these don't really allocate new memory most of the time)
// PTRDIFF_MAX indicates not initialized (also in the matrix, which is stored as float).
m_distanceToStart.Resize(m_numParallelSequences, m_numTimeSteps); m_distanceToStart.SetValue((float)PTRDIFF_MAX);
m_distanceToEnd.Resize(m_numParallelSequences, m_numTimeSteps); m_distanceToEnd.SetValue((float)PTRDIFF_MAX);
#if 1
if ((m_distanceToStart.GetNumRows() != m_numParallelSequences || m_distanceToStart.GetNumCols() != m_numTimeSteps) && m_numTimeSteps > 0) // sanity check for debugging a regression
fprintf(stderr, "MBLayout::Init: Resizing m_distanceToStart from %d x %d to %d x %d\n",
(int)m_distanceToStart.GetNumRows(), (int)m_distanceToStart.GetNumCols(), (int)m_numParallelSequences, (int)m_numTimeSteps); // (I really want to know about actual allocations, but this is a necessary condition for them)
#endif
m_distanceToStart.Resize(m_numParallelSequences, m_numTimeSteps);
m_distanceToEnd.Resize(m_numParallelSequences, m_numTimeSteps);
m_distanceToNearestStart.assign(m_numTimeSteps, PTRDIFF_MAX);
m_distanceToNearestEnd.assign(m_numTimeSteps, PTRDIFF_MAX);
m_distanceToNearestEnd.assign(m_numTimeSteps, PTRDIFF_MAX);
m_timeStepHasGap.assign(m_numTimeSteps, false);
m_columnsValidityMask.Resize(0, 0); // invalidate
// reset state
m_numFramesDeclared = 0;
m_numGapFrames = 0;
@ -121,20 +126,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_writable = true;
}
// short-hand to initialize an MBLayout for the common case of frame mode
// In frame mode, there is one parallel "sequence" per sample, which is 1 frame long.
void InitAsFrameMode(size_t numSamples)
{
Init(numSamples, 1);
SequenceInfo seqInfo { 0, 0, 0, 1 };
for (size_t s = 0; s < numSamples; s++)
{
seqInfo.seqId = seqInfo.s = s;
AddSequence(seqInfo);
}
Lock();
}
// -------------------------------------------------------------------
// accessors
// -------------------------------------------------------------------
@ -199,7 +190,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
LogicError("AddSequence: Sequence added to an MBLayout must overlap with minibatch.");
// remember it
#ifdef _DEBUG
auto cap = m_sequences.capacity(); // Some sanity check for debugging a speed regression. This should only show up during the first minibatches, and growing only.
m_sequences.push_back(seqDesc);
if (cap != m_sequences.capacity())
fprintf(stderr, "AddSequence: m_sequences was reallocated from capacity %d to %d\n", (int)cap, (int)m_sequences.capacity());
#else
m_sequences.push_back(seqDesc);
#endif
// create all the cached fast-lookup information
const auto seqId = seqDesc.seqId;
@ -212,7 +210,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_numGapFrames += (e - b);
for (size_t t = b; t < e; t++)
{
//Set(s, t, MinibatchPackingFlags::NoInput);
m_timeStepHasGap[t] = true;
m_distanceToStart(s, t) = -1; // start flags also encode gaps
}
@ -220,22 +217,49 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else for (size_t t = b; t < e; t++)
{
// update the nearest sentence boundaries, minimum over all parallel sequences
// -1 in distanceToStart(,) stands for a gap
assert(m_distanceToStart(s, t) != -1); // gaps not allowed to overlap
// If 0, then we are on a boundary. If not 0, we can still test in presence of FrameRange.m_timeOffset.
ptrdiff_t distanceToStart = t - beginTime;
if (m_distanceToStart(s, t) > (float)distanceToStart)
m_distanceToStart(s, t) = (float)distanceToStart;
ptrdiff_t distanceToStart = (ptrdiff_t)t - beginTime;
ptrdiff_t distanceToEnd = (ptrdiff_t)(endTime - 1 - t);
m_distanceToStart(s, t) = (float)distanceToStart;
m_distanceToEnd(s, t) = (float)distanceToEnd;
// and the aggregate
if (m_distanceToNearestStart[t] > distanceToStart)
m_distanceToNearestStart[t] = distanceToStart;
ptrdiff_t distanceToEnd = endTime - 1 - t;
if (m_distanceToEnd(s, t) > (float) distanceToEnd)
m_distanceToEnd(s, t) = (float) distanceToEnd;
if (m_distanceToNearestEnd[t] > distanceToEnd)
m_distanceToNearestEnd[t] = distanceToEnd;
}
}
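Worked example (illustrative): adding a sequence with beginTime = 1 and endTime = 4 to parallel slot s touches t = 1..3; the loop sets m_distanceToStart(s, 1..3) to 0, 1, 2 and m_distanceToEnd(s, 1..3) to 2, 1, 0, and the aggregate m_distanceToNearestStart/End vectors keep the minimum of these values over all parallel sequences.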
// short-hand to initialize an MBLayout for the common case of frame mode
// In frame mode, there is one parallel "sequence" per sample, which is 1 frame long.
// This function provides an efficient short-cut implementation of AddSequence(t, t, 0, 1) for every sample t.
void InitAsFrameMode(size_t numSamples)
{
Init(numSamples, 1);
// create sequences array
SequenceInfo virginSeqInfo = { 0, 0, 0, 1 };
m_sequences.resize(numSamples, virginSeqInfo); // pass it here since otherwise STL will initialize everything to 0 unnecessarily
// update sequence indices
for (size_t s = 0; s < numSamples; s++)
{
// remember it
auto & seqDesc = m_sequences[s];
seqDesc.seqId = s;
seqDesc.s = s;
}
m_numFramesDeclared = numSamples;
// create all the cached fast-lookup information
m_distanceToStart.SetValue(0);
m_distanceToEnd.SetValue(0);
m_distanceToNearestStart[0] = 0;
m_distanceToNearestEnd[0] = 0;
Lock();
}
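A quick usage sketch (hypothetical call site, assuming the usual shared_ptr idiom for MBLayout): a reader delivering 256 independent samples in frame mode would set up its layout as

auto pMBLayout = std::make_shared<MBLayout>();
pMBLayout->InitAsFrameMode(256);   // 256 parallel length-1 sequences, no gaps, layout locked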
// mark a range of frames in a parallel sequence as invalid
// I'd love to start with all-gaps, but that would require setting flags upfront and then clearing them.
void AddGap(size_t s, ptrdiff_t beginTime, size_t endTime) { if ((ptrdiff_t)endTime > beginTime) AddSequence(GAP_SEQUENCE_ID, s, beginTime, endTime); }
@ -330,10 +354,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// 2 1 0 . . ] // (last two time steps undefined)
// m_distanceToNearestStart = [ 0 1 2 3 4 ]
// m_distanceToNearestEnd = [ 2 1 0 1 0 ]
Matrix<float> m_distanceToStart, m_distanceToEnd; // (s,t); value<0 stands for gap, PTRDIFF_MAX for 'not initialized'
vector<ptrdiff_t> m_distanceToNearestStart, m_distanceToNearestEnd; // [t] (value<0 does NOT stand for gap; consult m_timeStepHasGap[] vector instead)
Matrix<float> m_distanceToStart, m_distanceToEnd; // (s,t); value<0 stands for gap
vector<ptrdiff_t> m_distanceToNearestStart, m_distanceToNearestEnd; // [t] (does not store info about gaps; consult m_timeStepHasGap[] vector instead)
vector<bool> m_timeStepHasGap; // [t]
vector<bool> m_timeStepHasGap; // [t] true if at least one gap in time step t
// Cached mask indicating the validity of each column in the MBLayout
// TODO: We actually just need a boolean matrix for this.
@ -527,6 +551,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (s == SIZE_MAX) // aggregate requested
{
// determine flags from aggregate vectors
// Note: We allow that all parallel sequences contain gaps (m_distanceToNearestStart[t] == PTRDIFF_MAX)
// because that makes implementation of the reader easier for truncated BPTT (it knows too late that there are not that many frames left).
auto distanceToStart = (ptrdiff_t)m_distanceToNearestStart[t];
if (distanceToStart < -fr.m_timeOffset)
return true;
@ -557,7 +583,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: Remove this version (with sanity checks) after this has been tested. Then the function can be inlined above.
inline size_t MBLayout::GetActualNumSamples() const
{
#if 1 // sanity check --TODO: delete this after a while
#if 0 // sanity check --TODO: delete this after a while
size_t n = GetNumCols();
if (HasGaps())
{

View file

@ -13,6 +13,8 @@
namespace Microsoft { namespace MSR { namespace CNTK {
using namespace std;
// -----------------------------------------------------------------------
// subroutines for Validate() implementations
// -----------------------------------------------------------------------
@ -41,13 +43,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// all are consistent: install it
LinkToMBLayout(pMBLayout);
}
// single input that maps its input element-wise (e.g. Sigmoid)
void ComputationNodeBase::ValidateUnaryMap(bool isFinalValidationPass)
{
assert(m_inputs.size() == 1);
ComputationNodeBase::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();
SetDims(m_inputs[0]->GetNumRows(), DetermineNumCols(m_inputs[0]));
SetDims(m_inputs[0]);
InferImageDimsFromInputs();
}
// binary zip operation, e.g. Plus
@ -138,6 +141,61 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
// -----------------------------------------------------------------------
// tensor helpers
// -----------------------------------------------------------------------
template<class ElemType>
static TensorShape GetSampleShape(const ComputationNode<ElemType> * node)
{
// TODO: use actual ImageLayout. While those are not yet inferred properly, maybe use it if its dims match numRows?
if (node->HasMBLayout()) // if we have a layout, that dimension is not part of the sample shape
return TensorShape(node->GetNumRows());
else
return TensorShape(node->GetNumRows(), node->GetNumCols());
}
template<class ElemType>
std::vector<TensorView<ElemType>> ComputationNode<ElemType>::GetTensorsForwardBinary(const FrameRange & fr)
{
const size_t N = 3; // 2 inputs and 1 output
// BUGBUG: Currently does not interpret actual ImageLayouts or convolutional models.
// TODO: move this into a helper function
// get tensor shapes
vector<ComputationNode<ElemType>*> nodes;
for (size_t i = 0; i < N; i++)
nodes.push_back(i < N-1 ? Input(i).get() : this);
vector<Matrix<ElemType>> values;
vector<TensorShape> shapes;
for (size_t i = 0; i < N; i++)
{
values.push_back(nodes[i]->ValueFor(i < N-1 ? fr.AllowBroadcast() : fr)); // no broadcasting for now allowed for output
shapes.push_back(GetSampleShape(nodes[i]));
}
// pad
size_t dims = 0;
for (size_t i = 0; i < N; i++)
if (dims < shapes[i].GetNumDims())
dims = shapes[i].GetNumDims();
for (size_t i = 0; i < N; i++)
shapes[i] = shapes[i].Pad(dims);
// concatenate MBLayout dims
// TODO: Is it possible that the output has no layout, but inputs have? Then we lost dimensions. Tensor constructor will catch that, though.
if (HasMBLayout())
{
for (size_t i = 0; i < N; i++)
{
auto sm = nodes[i]->HasMBLayout() ? TensorShape(GetNumParallelSequences(), GetNumTimeSteps()) : TensorShape(1, 1);
shapes[i] = shapes[i].Concat(sm);
}
}
// perform operation
std::vector<TensorView<ElemType>> tensors;
for (size_t i = 0; i < N; i++)
tensors.push_back(TensorView<ElemType>(values[i], shapes[i]));
return tensors;
}
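To illustrate the Pad/Concat steps above with made-up numbers: suppose Input(0) is a 13 x 1 parameter without an MBLayout, while Input(1) and the output carry a layout with 2 parallel sequences and 5 time steps. GetSampleShape yields 13 x 1 for Input(0) and 13 for the other two; after padding to a common rank and concatenating the layout dimensions, the shapes handed to TensorView are roughly

// Input(0): 13 x 1 -> pad -> 13 x 1 -> concat (1, 1) -> 13 x 1 x 1 x 1   (broadcasts over sequences and time)
// Input(1): 13     -> pad -> 13 x 1 -> concat (2, 5) -> 13 x 1 x 2 x 5
// output:   13     -> pad -> 13 x 1 -> concat (2, 5) -> 13 x 1 x 2 x 5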
// -----------------------------------------------------------------------
// others
// -----------------------------------------------------------------------
@ -172,6 +230,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<> std::map<size_t, std::map<size_t, FloatMatrix*>> ComputationNode<float>::s_constOnes{};
template<> std::map<size_t, std::map<size_t, DoubleMatrix*>> ComputationNode<double>::s_constOnes{};
template class ComputationNode<float>;
template class ComputationNode<double>;
template class LearnableParameter<float>;
template class LearnableParameter<double>;
}}}

View file

@ -340,18 +340,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
// helper functions for common cases
private:
// determine number of columns from a child and/or layout
size_t DetermineNumCols(const ComputationNodeBasePtr & child) const
{
size_t childCols = child->GetNumCols(); // this is what the child says
if (!m_pMBLayout) // no layout: copy from child
return childCols;
size_t cols = m_pMBLayout->GetNumCols(); // layout: get it from there, but validate against child
if (childCols != cols)
RuntimeError("%ls %ls operation: Mismatch in number of columns", OperationName().c_str(), NodeName().c_str());
return cols;
}
protected:
void ValidateUnaryMap(bool isFinalValidationPass);
void ValidateUnaryReduce(bool isFinalValidationPass);
@ -779,7 +767,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
protected:
//std containers such as list and map does not support class reference so we need to use pointer
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
ComputationNode() { }
public:
using ComputationNodeBase::AttachInputs; // import the convenience functions that take 1..6 parameters
using ComputationNodeBase::SetDims;
@ -1085,6 +1072,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const Matrix<ElemType>& Gradient() const { return *m_gradient; }
Matrix<ElemType>& Gradient() { return *m_gradient; }
std::vector<TensorView<ElemType>> GetTensorsForwardBinary(const FrameRange & fr);
// Function to return the number of columns for whole batch or single frame
size_t GetNumColsFor(const FrameRange & fr/*select frame or entire batch*/)
{
@ -1519,7 +1508,7 @@ protected: \
using Base::CreateUniqId; \
using Base::GetNumInputs; using Base::ZeroGradientsOfInputs; using Base::VerifyDims; \
using Base::ConstOnes; \
using Base::GetImageLayout; using Base::InferImageDimsFromInput; using Base::InferImageDimsFromInputs; using Base::InferMBLayoutFromInputsForStandardCase; \
using Base::GetImageLayout; using Base::GetTensorsForwardBinary; using Base::InferImageDimsFromInput; using Base::InferImageDimsFromInputs; using Base::InferMBLayoutFromInputsForStandardCase; \
using Base::CopyTo; using Base::CreateUniqNodeName; using Base::DetachInputs; using Base::GetInputsFromConfig; \
using Base::DumpNodeInfo; using Base::EnumerateNodes; \
using Base::HasMBLayout; using Base::GetMBLayout; using Base::LinkToMBLayout; \

View file

@ -20,6 +20,7 @@
#include "Basics.h"
#include "Matrix.h"
#include "TensorView.h"
#include "ComputationNode.h"
#include "ConvolutionalNodes.h"
@ -129,6 +130,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
#if 0 // TODO: use #if 0 until this is working
auto args = GetTensorsForwardBinary(fr);
args[2].DoSumOf(0.0f, args[0], args[1], 1.0f);
#else
Matrix<ElemType> functionValues = ValueForToDense(fr, false); // Switch to dense as a work-around because ColumnSlice doesn't support all the sparse formats
Matrix<ElemType> inputFunctionValues0 = Input(0)->ValueFor(fr.AllowBroadcast());
Matrix<ElemType> inputFunctionValues1 = Input(1)->ValueFor(fr.AllowBroadcast());
@ -185,6 +190,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else
LogicError("%ls %ls operation's Validate() function let invalid dimensions slip by.", NodeName().c_str(), OperationName().c_str());
#endif
#if DUMPOUTPUT
functionValues.Print("PlusNode");
#endif

View file

@ -9,12 +9,13 @@
#include "stdafx.h"
#include "Basics.h"
#include "File.h"
#include "CPUMatrix.h"
#include "TensorOps.h"
#include <assert.h>
#include <stdexcept>
#include <omp.h>
#include <math.h>
#include "CPUMatrix.h"
#include <random>
#include <chrono>
#include <exception>
@ -351,7 +352,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
#pragma omp parallel for
#pragma omp parallel for
for (long j = 0; j<n; j++)
{
//four-way unrolling
@ -384,7 +385,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
long n = (long)a.GetNumCols(); // note: OpenMP requires loop indices to be long, not size_t
long k = (long)a.GetNumRows();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//memory copy might be faster?
@ -428,7 +429,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -469,7 +470,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
#pragma omp parallel for
#pragma omp parallel for
for (long j = 0; j<n; j++)
{
//four-way unrolling
@ -500,7 +501,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
#pragma omp parallel for
#pragma omp parallel for
for (long i = 0; i < m_numRows; i++)
{
diag(0, (size_t)i) = us(i, i);
@ -538,7 +539,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
#pragma omp parallel for
#pragma omp parallel for
for (long j = 0; j<sliceNumCols; j++)
{
for (int i = 0; i < inputMatrices.size(); i++)
@ -575,7 +576,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
long n = (long)a.GetNumCols(), m = (long)a.GetNumRows();
auto& us = *this;
#pragma omp parallel for
#pragma omp parallel for
for (long q = 0; q < numColRepeats; q++)
{
for (long p = 0; p < numRowRepeats; p++)
@ -619,7 +620,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
#pragma omp parallel for
#pragma omp parallel for
for (long j = 0; j<n; j++)
{
//four-way unrolling
@ -685,7 +686,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -719,7 +720,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else
{
long m=(long)GetNumElements();
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(m & ~3); i+=4)
{
@ -777,7 +778,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
long m=(long)GetNumRows();
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(m & ~3); i+=4)
{
@ -802,7 +803,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
long m=(long)GetNumRows();
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(m & ~3); i+=4)
{
@ -827,7 +828,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
long m=(long)GetNumRows();
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(m & ~3); i+=4)
{
@ -935,7 +936,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
long m=(long)GetNumRows();
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(m & ~3); i+=4)
{
@ -974,7 +975,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
long m=(long)GetNumRows();
if (vector.GetNumRows() == 1) //row vector
{
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(m & ~3); i+=4)
{
@ -991,7 +992,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else
{
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(m & ~3); i+=4)
{
@ -1164,7 +1165,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ElemType a0, a1, a2, a3;
//disable omp here because aveMultiplier needs to be added atomically. However, the result seems incorrect even if omp atomic and omp critical are used.
//#pragma omp parallel for
//#pragma omp parallel for
for (long i = 0; i<(n & ~3); i += 4) //four-way unrolling
{
a[i] += d_v[i] * d_v[i];
@ -1495,7 +1496,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -1596,7 +1597,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -1625,7 +1626,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -1816,7 +1817,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -1853,7 +1854,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us=*this;
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -1921,7 +1922,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us=*this;
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -1956,7 +1957,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us=*this;
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
ElemType v = a(0,j);
@ -1991,7 +1992,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us = *this;
long m = (long)GetNumRows(), n = (long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j = 0; j<n; j++)
{
ElemType v = a(0, j);
@ -2032,7 +2033,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
long m=(long)GetNumRows(), n=(long)GetNumCols();
ElemType smallValue = EPS_IN_INVERSE;
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
for (long i=0; i<m; i++)
@ -2133,7 +2134,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -2172,7 +2173,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -2220,7 +2221,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -2387,7 +2388,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -2427,7 +2428,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -2467,7 +2468,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -2620,7 +2621,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto& us=*this;
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -2660,7 +2661,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ElemType locTHresholdNeg = -locThresholdPos;
long m=(long)GetNumRows(), n=(long)GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -2708,7 +2709,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
long m = (long)GetNumElements();
#pragma omp parallel for
#pragma omp parallel for
for (long i = 0; i<(m & ~3); i += 4) //four-way unrolling
{
if (m_pArray[i] > threshold)
@ -4304,7 +4305,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (sample_id == 0)
sample_prob = -sample_prob;
double score_noise = log_num_noise_samples + sample_prob;
double z = logadd(score, score_noise);
double z = LogAdd(score, score_noise);
double logprob = score - z;
double logprob_noise = score_noise - z;
tmp(sample_id, instance_id) = (ElemType)-std::exp(logprob);
@ -4387,7 +4388,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
ElemType v = alpha*a(0,0);
long m=(long)c.GetNumRows(), n=(long)c.GetNumCols();
#pragma omp parallel for
#pragma omp parallel for
for (long j=0; j<n; j++)
{
//four-way unrolling
@ -4497,7 +4498,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
LogicError("AddScaledDifference: Input matrix a is empty.");
long m=(long)c.GetNumElements();
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(m & ~3); i+=4)
{
@ -4536,7 +4537,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
c.Resize(a.GetNumRows(), a.GetNumCols());
long m=(long)c.GetNumElements();
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(m & ~3); i+=4)
{
@ -4634,7 +4635,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
c.Resize(m,n);
long size=(long)c.GetNumElements();
#pragma omp parallel for
#pragma omp parallel for
//four-way unrolling
for (long i=0; i<(size & ~3); i+=4)
{
@ -4944,7 +4945,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool bHas = false;
bool isvFinite = std::isfinite(v);
#pragma omp parallel for
#pragma omp parallel for
for (long j = 0; j < mat.GetNumElements(); j++)
{
#pragma omp flush(bHas)
@ -4992,7 +4993,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
long m = (long)GetNumRows(), n = (long)GetNumCols(); // a and b are of size (1,n)
//#pragma omp parallel for
//#pragma omp parallel for
for (long j = 0; j < n; j++)
{
@ -5247,7 +5248,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//long m = (long)GetNumRows(), n = (long)GetNumCols(); // a and b are of size (1,n)
long n = (long)GetNumCols(); // a and b are of size (1,n)
#pragma omp parallel for
#pragma omp parallel for
for (long j = 0; j<n; j++)
{
us(0, j) = a(0, j) * b(0, (j + shift) % n);
@ -5256,34 +5257,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return *this;
}
#pragma endregion Static BLAS Functions
double logadd(double x, double y)
{
double temp, diff, z;
if (x < y) {
temp = x; x = y; y = temp;
}
diff = y - x;
if (diff < MINLOGEXP)
{
return (x < LSMALL)?LZERO:x;
}
else
{
z = exp(diff);
return x + log(1.0 + z);
}
}
// 'double' version of LogAdd
double LogAddD(double x, double y) { return LogAdd(x, y); }
template<class ElemType>
ElemType CPUMatrix<ElemType>::LogAddSumOfElements() const
{
ElemType fAlpha = (ElemType)LZERO;
for (int k = 0; k < GetNumElements(); k++)
fAlpha = (ElemType) logadd(fAlpha, m_pArray[k]);
fAlpha = (ElemType) LogAddD(fAlpha, m_pArray[k]);
return fAlpha;
}
@ -5330,7 +5314,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
fSum = (ElemType)LZERO;
for (int j = 0; j < iNumLab; j++)
{
fSum = (ElemType)logadd((double)fSum, alpha(j, t));
fSum = (ElemType)LogAddD(fSum, alpha(j, t));
}
fTmp = alpha(k, t) - fSum;
@ -5343,10 +5327,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
fSum = (ElemType)LZERO;
for (int m = 0; m < iNumLab; m++)
{
fSum = (ElemType)logadd((double)fSum, alpha(m, t) + pair_scores(j, m));
fSum = (ElemType)LogAddD(fSum, alpha(m, t) + pair_scores(j, m));
}
fTmp = (ElemType)logadd(fTmp, beta(j, t + 1) + alpha(k, t) + pair_scores(j, k) - fSum);
fTmp = (ElemType)LogAddD(fTmp, beta(j, t + 1) + alpha(k, t) + pair_scores(j, k) - fSum);
}
beta(k, t) = fTmp;
}
@ -5455,7 +5439,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else{
fTmp2 = a(k, 0);
}
fSum = (ElemType)logadd(fSum, fTmp2 + pair_scores(j, k));
fSum = (ElemType)LogAddD(fSum, fTmp2 + pair_scores(j, k));
}
fTmp -= fSum;
@ -5533,7 +5517,259 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return numThreads;
}
// The explicit instantiation part
// -----------------------------------------------------------------------
// TensorView support
// -----------------------------------------------------------------------
// To save time, this makes extensive use of templates and macros.
// perform loop over reduction index m
// This function is declared inside a wrapper struct to allow partial specialization (m = -1).
template<class ElemType, typename OPFN, size_t N, int m>
struct TensorOpReduction
{
// reduction case (non-reduction case is specialized)
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN & opfn,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, N> & reducingStrides)
{
array<ptrdiff_t, N - 1> strides; // N-1 because last one is the result pointer, which is unused in reduction
for (size_t i = 0; i < N - 1; i++) // N = a small constant, this will be unrolled
strides[i] = reducingStrides[i][(size_t)m];
ElemType aggregate = 0;
for (size_t dim = reducingOpDims[(size_t)m]; dim-- > 0;)
{
// need to descend into one loop deeper
aggregate += TensorOpReduction<ElemType, OPFN, N, m - 1>::Loop(pointers, opfn, reducingOpDims, reducingStrides);
// advance the pointers
for (size_t i = 0; i < N - 1; i++)
pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here
}
return aggregate;
}
};
// perform loop over reduction index m
// This is the specialized version for m = -1, which terminates the recursion.
template<class ElemType, typename OPFN, size_t N>
struct TensorOpReduction<ElemType, OPFN, N, -1>
{
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN & opfn,
const std::vector<size_t> &, const std::array<std::vector<ptrdiff_t>, N> &)
{
return opfn(pointers); // finally we are doing some work!!!
}
};
// perform loop over regular index k and reducing index m for N operands (counting the output)
template<class ElemType, typename OPFN, size_t N, bool vectorizable, int m, int k>
struct TensorOpIteration
{
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN & opfn,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, N> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, N> & reducingStrides)
{
// non-scalar case: still nested result loops left
array<ptrdiff_t, N> strides;
for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
strides[i] = regularStrides[i][(size_t)k];
for (size_t dim = regularOpDims[(size_t)k]; dim--> 0;)
{
// need to descend into one loop deeper
TensorOpIteration<ElemType, OPFN, N, vectorizable, m, k - 1>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// advance the pointers
for (size_t i = 0; i < N; i++)
pointers[i] += strides[i];
}
}
};
// Special version for innermost loop with strides all being 1 and no further reduction. Compiler can use SSE.
// This is a very common case, e.g. adding vectors or computing the Sigmoid.
template<class ElemType, typename OPFN>
struct TensorOpIteration<ElemType, OPFN, 3, true/*vectorizable*/, -1/*no reduction*/, 0/*innermost loop*/>
{
static inline void Loop(ElemType beta, array<ElemType*, 3> pointers, ElemType alpha, const OPFN & opfn,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 3> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 3> & reducingStrides)
{
ElemType* pa = pointers[0];
ElemType* pb = pointers[1];
ElemType* pc = pointers[2];
size_t K = regularOpDims[0];
// special-case beta and alpha to allow the compiler to short-circuit it
if (beta != 0)
#pragma omp parallel for
for (int k = 0; k < (int)K; k++)
TensorOpIteration<ElemType, OPFN, 3, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(beta, array<ElemType*, 3> { pa + k, pb + k, pc + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else if (alpha != 1)
#pragma omp parallel for
for (int k = 0; k < (int)K; k++)
TensorOpIteration<ElemType, OPFN, 3, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(0, array<ElemType*, 3> { pa + k, pb + k, pc + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else
#pragma omp parallel for
for (int k = 0; k < (int)K; k++)
TensorOpIteration<ElemType, OPFN, 3, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(0, array<ElemType*, 3> { pa + k, pb + k, pc + k }, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// TODO: somehow this does not use 4-way parallelism with SSE (VS 2013), and the signedness of k (required for omp) causes an extra sign-extend
// TODO: OMP adds LOTS of overhead. Do we need a guard, a min size when to use it?
}
};
// and unary
template<class ElemType, typename OPFN>
struct TensorOpIteration<ElemType, OPFN, 2, true/*vectorizable*/, -1/*no reduction*/, 0/*innermost loop*/>
{
static inline void Loop(ElemType beta, array<ElemType*, 2> pointers, ElemType alpha, const OPFN & opfn,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 2> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 2> & reducingStrides)
{
ElemType* pa = pointers[0];
ElemType* pb = pointers[1];
size_t K = regularOpDims[0];
// special-case beta and alpha to allow the compiler to short-circuit it
if (beta != 0)
#pragma omp parallel for
for (int k = 0; k < (int)K; k++)
TensorOpIteration<ElemType, OPFN, 2, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(beta, array<ElemType*, 2> { pa + k, pb + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else if (alpha != 1)
#pragma omp parallel for
for (int k = 0; k < (int)K; k++)
TensorOpIteration<ElemType, OPFN, 2, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(0, array<ElemType*, 2> { pa + k, pb + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else
#pragma omp parallel for
for (int k = 0; k < (int)K; k++)
TensorOpIteration<ElemType, OPFN, 2, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(0, array<ElemType*, 2> { pa + k, pb + k }, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
};
template<class ElemType, typename OPFN, size_t N, bool vectorizable, int m>
struct TensorOpIteration<ElemType, OPFN, N, vectorizable, m, -1>
{
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN & opfn,
const std::vector<size_t> &, const std::array<std::vector<ptrdiff_t>, N> &,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, N> & reducingStrides)
{
// we are at element level for the result: perform the op (there may still be reduction)
ElemType val = alpha * TensorOpReduction<ElemType, OPFN, N, m>::Loop(pointers, opfn, reducingOpDims, reducingStrides);
// combine with previous value in target matrix, then write it out
auto * pout = pointers.back();
if (beta != 0)
val += beta * *pout;
*pout = val;
return;
}
};
// tensor operation with k+1 dimensions (-1 means scalar)
template<class ElemType, typename OPFN, size_t N, int k>
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N> & pointers, ElemType alpha, const OPFN & opfn,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, N> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, N> & reducingStrides)
{
size_t dims = reducingOpDims.size();
switch (dims)
{
case 2: return TensorOpIteration<ElemType, OPFN, N, false/*vectorizable*/, 1, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1: return TensorOpIteration<ElemType, OPFN, N, false/*vectorizable*/, 0, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0:
{
// if all leading dimensions are 1, we can let the compiler do some unrolling
bool leadingAllOne = true;
for (size_t i = 0; i < N; i++)
leadingAllOne &= k >= 0 && regularStrides[i][0] == 1;
if (leadingAllOne) // special version that uses a hard-coded increment of 1 for all leading dimensions
return TensorOpIteration<ElemType, OPFN, N, true/*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else
return TensorOpIteration<ElemType, OPFN, N, false/*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
default: LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (int)dims);
}
}
// tensor operation, generalized in number of arguments, operation already provided as a lambda
// This function now expands into different k.
template<class ElemType, typename OPFN, size_t N>
static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN & opfn,
const std::array<size_t, N> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, N> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, N> & reducingStrides)
{
for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
pointers[i] += offsets[i];
size_t dims = regularOpDims.size();
switch (dims)
{
case 4: return TensorOpWithRegularLoop<ElemType, OPFN, N, 3>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 3: return TensorOpWithRegularLoop<ElemType, OPFN, N, 2>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 2: return TensorOpWithRegularLoop<ElemType, OPFN, N, 1>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1: return TensorOpWithRegularLoop<ElemType, OPFN, N, 0>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0: return TensorOpWithRegularLoop<ElemType, OPFN, N, -1>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
default: LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)dims);
}
}
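For example (illustrative numbers): adding a 13 x 1 bias to a 13 x 42 matrix, with the bias's second stride set to 0 for broadcasting and nothing to reduce, arrives here with regularOpDims = { 13, 42 } and an empty reducingOpDims; the switch above then dispatches to TensorOpWithRegularLoop with k = 1, whose own case 0 takes the no-reduction branch.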
// perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template<class ElemType>
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
const std::array<size_t, 2> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 2> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 2> & reducingStrides)
{
#define CaseUnaryTensorOp(oper) \
case ElementWiseOperator::op ## oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 2> & pp) { return Op ## oper((*(pp[0]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
array<ElemType*, 2> pointers = { a.m_pArray, m_pArray };
switch (op)
{
ForAllUnaryOps(CaseUnaryTensorOp);
default: LogicError("TensorUnaryOp: Unknown op code %d.", (int)op);
}
}
// perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template<class ElemType>
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
const std::array<size_t, 3> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 3> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 3> & reducingStrides)
{
#define CaseBinaryTensorOp(oper) \
case ElementWiseOperator::op ## oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 3> & pp) { return Op ## oper((*(pp[0])), (*(pp[1]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
array<ElemType*, 3> pointers = { a.m_pArray, b.m_pArray, m_pArray };
switch (op)
{
ForAllBinaryOps(CaseBinaryTensorOp);
default: LogicError("TensorBinaryOp: Unknown op code %d.", (int)op);
}
}
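For reference, ForAllBinaryOps(CaseBinaryTensorOp) expands into one case label per binary operation; after macro substitution, the Sum case is roughly

case ElementWiseOperator::opSum:
    return TensorOpWithFn(beta, pointers, alpha,
                          [](const array<ElemType*, 3> & pp) { return OpSum((*(pp[0])), (*(pp[1]))); },
                          offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);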
// perform ternary operation 'op' on a, b, and c giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template<class ElemType>
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
const std::array<size_t, 4> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 4> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 4> & reducingStrides)
{
#define CaseTernaryTensorOp(oper) \
case ElementWiseOperator::op ## oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 4> & pp) { return Op ## oper((*(pp[0])), (*(pp[1])), (*(pp[2]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
array<ElemType*, 4> pointers = { a.m_pArray, b.m_pArray, c.m_pArray, m_pArray };
switch (op)
{
ForAllTernaryOps(CaseTernaryTensorOp);
default: LogicError("TensorTernaryOp: Unknown op code %d.", (int)op);
}
}
// -----------------------------------------------------------------------
// explicit instantiations
// -----------------------------------------------------------------------
template class MATH_API CPUMatrix<float>;
template class MATH_API CPUMatrix<double>;
@ -5551,5 +5787,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template void CPUMatrix<char>::SetValue(const char);
template void CPUMatrix<char>::SetValue(const size_t numRows, const size_t numCols, char *pArray, size_t matrixFlags);
template void CPUMatrix<char>::SetValue(CPUMatrix<char> const&);
template void CPUMatrix<char>::Resize(const size_t numRows, const size_t numCols, bool growOnly);
}}}

View file

@ -334,6 +334,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static bool AreEqual(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const ElemType threshold = 1e-8);
static void TensorShuffleScaleAndAdd(ElemType keepWeight, const CPUMatrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c);
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
const std::array<size_t, 2> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 2> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 2> & reducingStrides);
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
const std::array<size_t, 3> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 3> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 3> & reducingStrides);
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
const std::array<size_t, 4> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 4> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 4> & reducingStrides);
static CPUMatrix<ElemType> Ones(const size_t rows, const size_t cols);
static CPUMatrix<ElemType> Zeros(const size_t rows, const size_t cols);

View file

@ -41,6 +41,51 @@ MATH_API DEVICEID_TYPE EnforceOneGPUOnly(DEVICEID_TYPE requestedDeviceId);
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// ElementWiseOperator -- This enum represents which function to apply.
// This is shared between all matrix types and tensors.
// -----------------------------------------------------------------------
enum ElementWiseOperator
{
// unary (or binary with constant parameter)
opCopy,
opNegate, opNot,
opAbs,
opSigmoid, opSigmoidDerivative, opTanh, opSqrt, opExp, opLog, opLinearRectifierDerivative, opCosine, opNegativeSine,
// these are not implemented yet:
opSaturateBetaAlpha, opSumAlpha, opSubDifferenceToAlpha, opSubDifferenceFromAlpha,
// binary
opSum, opDifference, opElementWiseProduct, opElementWiseQuotient,
opLogSum, opMax, opMin,
opEQ, opNE, opGT, opLT, opGE, opLE,
// ternary
opCond
// Note: not all of the above are actually implemented at present; and not all that's implemented has an opcode.
};
// helper to apply a C macro for all operations of each kind
#define ForAllUnaryOps(Macro) \
Macro(Copy); \
Macro(Negate); Macro(Not); \
Macro(Abs); \
Macro(Sigmoid); Macro(SigmoidDerivative); Macro(Tanh); Macro(Sqrt); Macro(Exp); Macro(Log); Macro(LinearRectifierDerivative); Macro(Cosine); Macro(NegativeSine);
#define ForAllParameterizedUnaryOps(Macro) \
Macro(SaturateBetaAlpha); Macro(SumAlpha); Macro(SubDifferenceToAlpha); Macro(SubDifferenceFromAlpha);
#define ForAllBinaryOps(Macro) \
Macro(Sum); Macro(Difference); Macro(ElementWiseProduct); Macro(ElementWiseQuotient); \
Macro(LogSum); Macro(Max); Macro(Min); \
Macro(EQ); Macro(NE); Macro(GT); Macro(LT); Macro(GE); Macro(LE);
#define ForAllTernaryOps(Macro) \
Macro(Cond);
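// Illustrative sketch only (hypothetical names CountOneOp/CountBinaryOps, not part of the CNTK API):
// a per-op macro is defined and then expanded once for every operation in the list. The same pattern
// is used further below by TensorView.h, e.g. ForAllUnaryOps(DeclareUnaryTensorOp).
#define CountOneOp(oper) numOps++;
static inline size_t CountBinaryOps() { size_t numOps = 0; ForAllBinaryOps(CountOneOp); return numOps; } // returns 13
#undef CountOneOp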
// -----------------------------------------------------------------------
// various enums to describe matrix flags and formats
// -----------------------------------------------------------------------
enum MatrixFlagBitPosition
{
bitPosRowMajor = 0, // row major matrix
@ -76,6 +121,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
matrixFlagSetValueOnDevice = 1<<bitPosSetValueOnDevice, // SetValue() call has a buffer that is already on the device
};
// -----------------------------------------------------------------------
// BaseMatrix -- base class for all matrix types (CPU, GPU) x (dense, sparse)
// -----------------------------------------------------------------------
template<class ElemType>
class BaseMatrix

View file

@ -71,16 +71,6 @@ namespace Microsoft {
};
// -----------------------------------------------------------------------
// ElementWiseOperator -- This enum represents which function to apply. It needs to be outside of GPUMatrix, because it is also used in GPUSparseMatrix
// -----------------------------------------------------------------------
enum ElementWiseOperator
{
opSigmoid = 0, opTanh, opSqrt, opExp, opLog, opAbs, opLinearRectifierDerivative, opCosine, opNegativeSine, opSigmoidDerivative
};
// -----------------------------------------------------------------------
// GPUMatrix
// -----------------------------------------------------------------------

View file

@ -162,6 +162,7 @@
<ClInclude Include="CommonMatrix.h" />
<ClInclude Include="ConvolutionEngine.h" />
<ClInclude Include="CPUMatrix.h" />
<ClInclude Include="TensorOps.h" />
<ClInclude Include="TensorView.h" />
<None Include="ClassDiagram.cd" />
<None Include="GPUWatcher.cu" />

View file

@ -70,6 +70,9 @@
<ClInclude Include="TensorView.h">
<Filter>Tensors</Filter>
</ClInclude>
<ClInclude Include="TensorOps.h">
<Filter>Tensors</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="GPUMatrix.h">

Diff not shown because the file is too large. Load diff

View file

@ -6,9 +6,8 @@
// TODO:
// - remove empty-matrix checks: if an op is well-defined with empty matrices, then do it
// - Resize() must be cheap if it does nothing (I already did that for CPU, still to be done for GPU)
// - an overload for Resize() to match another matrix
// - need a way to grow a minibatch matrix without destroying its content, something like PushColumns()
// - Resize() must be cheap if it does nothing (I already did that for CPU; already done for GPU?)
#pragma once
#include "Basics.h"
@ -16,11 +15,12 @@
#include "CommonMatrix.h"
#include <limits.h>
#include <memory> // for shared_ptr
#include <array>
#include <initializer_list>
// This class is exported from the Math.dll
namespace Microsoft { namespace MSR { namespace CNTK {
enum CurrentDataLocation
{
NONE, CPU, GPU, BOTH
@ -73,6 +73,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void _transferToDevice(int id_to, bool ismoved=true, bool emptyTransfer=false) const;
static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c);
static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, const Matrix<ElemType>& d);
static void CopyElementsFromDenseToSparse(CPUMatrix<ElemType>& from, CPUSparseMatrix<ElemType>& dest);
public:
@ -168,6 +169,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ElemType RmsProp(Matrix<ElemType>& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier);
void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve = 10000, bool growOnly = true); //by default we only reallocate if need to grow
void Resize(const Matrix<ElemType>& other) { Resize(other.GetNumRows(), other.GetNumCols()); }
void VerifySize(size_t rows, size_t cols)
{
m_baseMatrix->VerifySize(rows, cols);
@ -200,6 +202,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void SetValue(const Matrix<ElemType>& deepCopyFrom, const MatrixFormat format=matrixFormatSparseCSR);
void SetValue(const size_t numRows, const size_t numCols, int deviceId, ElemType *pArray, const size_t matrixFlags = matrixFlagNormal);
void SetValue(const size_t rIdx, const size_t cIdx, ElemType val); // set matrix sparsely
void SetValue(const size_t numRows, const size_t numCols, std::initializer_list<ElemType> l) { std::vector<ElemType> vals(l); assert(vals.size() == numRows * numCols); SetValue(numRows, numCols, GetDeviceId(), vals.data(), matrixFormatRowMajor); } // SetValue(2,3, {1,2,3, 4,5,6});
static ElemType MakeNan(size_t payload);
void Invalidate() { SetValue(MakeNan(__LINE__)); }
void SetMatrixFromCSCFormat(const CPUSPARSE_INDEX_TYPE *h_CSCCol, const CPUSPARSE_INDEX_TYPE *h_Row, const ElemType *h_Val,
@ -376,7 +379,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise, int topK) const;
void VectorMin(Matrix<ElemType>& minIndexes, Matrix<ElemType>& minValues, const bool isColWise) const;
Matrix<ElemType>& AssignNumOfDiff(const Matrix<ElemType>& a, const Matrix<ElemType>& b, bool searchInCol = false);
Matrix<ElemType>& AssignInnerProductOfMatrices(const Matrix<ElemType>& a, const Matrix<ElemType>& b); //this method will resize(1,1) first
@ -458,6 +461,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static bool HasElement(const Matrix<ElemType>& a, const ElemType value = 0.0);
static void TensorShuffleScaleAndAdd(ElemType keepWeight, const Matrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const Matrix<ElemType>& b, Matrix<ElemType>& c);
void TensorOp(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
const std::array<size_t, 2> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 2> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 2> & reducingStrides);
void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
const std::array<size_t, 3> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 3> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 3> & reducingStrides);
void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
const std::array<size_t, 4> & offsets,
const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 4> & regularStrides,
const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 4> & reducingStrides);
public:
void Read(File& stream);
void Write(File& stream) const;

Source/Math/TensorOps.h (new file, 132 lines)
View file

@ -0,0 +1,132 @@
//
// <copyright file="TensorOps.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// This implements the elementwise tensor operations, including helper macros and some actual functions.
#pragma once
#include "Basics.h"
#include "CommonMatrix.h"
#pragma push_macro("TENSOR_OPS_DECL")
#ifndef TENSOR_OPS_DECL // to make these accessible to CUDA kernels, say '#define TENSOR_OPS_DECL __device__ __host__'
#define TENSOR_OPS_DECL
#endif
#pragma push_macro("DECL")
#define DECL static inline TENSOR_OPS_DECL
// This class is exported from the Math.dll.
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// unified overloads for float/double math functions
//
// Declare float and double versions of the functions f we need as f_(),
// e.g. exp_ -> exp(double), expf(float).
// -----------------------------------------------------------------------
#pragma push_macro("OverloadUnaryMathFns")
#define OverloadUnaryMathFns(func) \
DECL float func ## _(float arg) { return func ## f(arg); } \
DECL double func ## _(double arg) { return func(arg); }
OverloadUnaryMathFns(fabs); OverloadUnaryMathFns(sqrt);
OverloadUnaryMathFns(exp); OverloadUnaryMathFns(log);
OverloadUnaryMathFns(tanh); OverloadUnaryMathFns(cos); OverloadUnaryMathFns(sin);
#pragma pop_macro("OverloadUnaryMathFns")
// -----------------------------------------------------------------------
// additional functions that are standard in our context
// -----------------------------------------------------------------------
template<class ElemType>
DECL ElemType Sigmoid(ElemType z)
{
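// pick the algebraically equivalent form whose exp_() argument is never positive, so the
// exponential cannot overflow: 1/(1+e^-z) for z >= 0, and e^z/(1+e^z) for z < 0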
if (z >= 0)
return 1 / (1 + exp_(-z));
else
{
ElemType v = exp_(z);
return v / (1 + v);
}
}
template<class ElemType>
DECL ElemType SigmoidDerivative(ElemType z)
{
ElemType v = Sigmoid(z);
return v * (1 - v);
}
template<class ElemType>
DECL ElemType LinearRectifierDerivative(ElemType z)
{
return z > 0 ? (ElemType)1 : 0;
}
template<class ElemType>
DECL ElemType Sqrt(ElemType z)
{
// BUGBUG: Why clip to 0? An invalid sqrt() should show up as a NaN in the result, instead of hiding it.
return sqrt_(z > 0 ? z : 0);
}
// TODO: call this LogAdd() for consistency
template<typename ElemType>
DECL ElemType LogAdd(ElemType x, ElemType y)
{
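// computes log(exp(x) + exp(y)) without overflow: after the swap below the result is
// max(x,y) + log(1 + exp(-|x - y|)), so the exp_() argument is never positive.
// E.g. LogAdd(log 2, log 3) = log 5; if y is negligibly small relative to x, the result is
// just x (or LZERO when even x falls below LSMALL).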
if (x < y)
{
ElemType temp = x; x = y; y = temp;
}
ElemType diff = y - x;
if (diff < (ElemType)MINLOGEXP)
{
return (x < (ElemType)LSMALL) ? (ElemType)LZERO : x;
}
else
{
ElemType z = exp_(diff);
return x + log_((ElemType)1.0 + z);
}
}
// -----------------------------------------------------------------------
// ElementWiseOperator implementations
//
// Define a static function for every ElementWiseOperator (CommonMatrix.h).
// -----------------------------------------------------------------------
#pragma push_macro("DefUnaryOp")
#define DefUnaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(ElemType a) { return expr; }
DefUnaryOp(Copy, a);
DefUnaryOp(Negate, -a); DefUnaryOp(Not, !a);
DefUnaryOp(Abs, fabs_(a));
DefUnaryOp(Sigmoid, Sigmoid(a)); DefUnaryOp(SigmoidDerivative, SigmoidDerivative(a)); DefUnaryOp(Tanh, tanh_(a)); DefUnaryOp(Sqrt, Sqrt(a)); DefUnaryOp(Exp, exp_(a)); DefUnaryOp(Log, log_(a)); DefUnaryOp(LinearRectifierDerivative, LinearRectifierDerivative(a)); DefUnaryOp(Cosine, cos_(a)); DefUnaryOp(NegativeSine, -sin_(a));
#pragma pop_macro("DefUnaryOp")
// parameterized unary ops
//DefUnaryOp(SaturateBetaAlpha); DefUnaryOp(SumAlpha); DefUnaryOp(SubDifferenceToAlpha); DefUnaryOp(SubDifferenceFromAlpha);
#pragma push_macro("DefBinaryOp")
#define DefBinaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(ElemType a, ElemType b) { return expr; }
DefBinaryOp(Sum, a + b); DefBinaryOp(Difference, a - b); DefBinaryOp(ElementWiseProduct, a*b); DefBinaryOp(ElementWiseQuotient, a / b);
DefBinaryOp(LogSum, LogAdd(a, b)); DefBinaryOp(Max, a > b ? a : b); DefBinaryOp(Min, a < b ? a : b);
DefBinaryOp(EQ, a == b); DefBinaryOp(NE, a != b); DefBinaryOp(GT, a > b); DefBinaryOp(LT, a < b); DefBinaryOp(GE, a >= b); DefBinaryOp(LE, a <= b);
#pragma pop_macro("DefBinaryOp")
#pragma push_macro("DefTernaryOp")
#define DefTernaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(ElemType a, ElemType b, ElemType c) { return expr; }
DefTernaryOp(Cond, a ? b : c);
#pragma pop_macro("DefTernaryOp")
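// A quick illustrative sanity check of a few generated functions (hypothetical helper with
// example values only; not part of the library):
template<class ElemType>
DECL bool CheckSomeOps()
{
    return OpSum((ElemType)2, (ElemType)3) == (ElemType)5               // opSum:  a + b
        && OpCond((ElemType)1, (ElemType)2, (ElemType)3) == (ElemType)2 // opCond: a ? b : c
        && OpLinearRectifierDerivative((ElemType)-1) == (ElemType)0;    // 0 for negative inputs
}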
}}}
#pragma pop_macro("DECL")
#pragma pop_macro("TENSOR_OPS_DECL")

View file

@ -26,11 +26,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// construction
// -------------------------------------------------------------------
// cast a matrix as a tensor
// cast a matrix as a TensorView
template<class ElemType>
TensorView<ElemType>::TensorView(Matrix<ElemType> & sob) :
m_sob(sob), m_shape(TensorShape(array<size_t, 2> { sob.GetNumRows(), sob.GetNumCols() }))
m_sob(&sob), m_shape(TensorShape(array<size_t, 2> { sob.GetNumRows(), sob.GetNumCols() }))
{ }
// reshape a TensorView
template<class ElemType>
TensorView<ElemType>::TensorView(const TensorView<ElemType> & other, const TensorShape & shape) :
m_sob(other.m_sob), m_shape(shape)
@ -40,14 +41,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: Use the multipliers instead?
size_t i;
size_t rowDim = 1;
for (i = 0; i < m_shape.size() && rowDim < m_sob.GetNumRows(); i++)
for (i = 0; i < m_shape.size() && rowDim < m_sob->GetNumRows(); i++)
rowDim *= m_shape[i];
// first i dimensions match matrix row dimension
size_t colDim = 1;
for (; i < m_shape.size(); i++)
colDim *= m_shape[i];
if (rowDim != m_sob.GetNumRows() || colDim != m_sob.GetNumCols())
LogicError("TensorView: Tensor dimensions %s do not match storage-object dims %d x %d", string(m_shape).c_str(), (int)m_sob.GetNumRows(), (int)m_sob.GetNumCols());
if (rowDim != m_sob->GetNumRows() || colDim != m_sob->GetNumCols())
LogicError("TensorView: Tensor dimensions %s do not match storage-object dims %d x %d", string(m_shape).c_str(), (int)m_sob->GetNumRows(), (int)m_sob->GetNumCols());
}
// -------------------------------------------------------------------
@ -56,96 +57,168 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static bool Matches(size_t d1, size_t d2) { return d1 == 1 || d2 == 1 || d1 == d2; } // do two dimensions match?
template<class ElemType>
void TensorView<ElemType>::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, int op/*will become an enum later*/)
template<class ElemType, size_t N>
static void PrepareTensorOperands(array<TensorShape, N> shapes, array<size_t, N> & offsets,
vector<size_t> & regularOpDims,
array<vector<ptrdiff_t>, N> & regularStrides,
vector<size_t> & reducingOpDims,
array<vector<ptrdiff_t>, N> & reducingStrides)
{
TensorView & c = *this;
// TODO: Turn the inner meat here into a function template using a std::array<., N-nariness>. Nullary ops are generators, e.g. constants.
// massage TensorShapes
// Note that TensorShapes here may be shapes as stored or shapes with stride magic applied.
auto as = a.GetShape().GetDims();
auto bs = b.GetShape().GetDims();
auto cs = c.GetShape().GetDims();
// expand ones to make tensors compatible
// Trailing dimensions broadcast.
// E.g. A(J) vs. B(J x T) will broadcast A(:) to all T columns.
// To broadcast an A(T) to all J rows of B, use TensorShape editing to insert a dimension to get A(1,T).
auto dims = max(max(as.size(), bs.size()), cs.size());
as.resize(dims, 1);
bs.resize(dims, 1);
cs.resize(dims, 1);
size_t dims = 0;
for (size_t i = 0; i < N; i++)
if (dims < shapes[i].GetNumDims())
dims = shapes[i].GetNumDims();
for (size_t i = 0; i < N; i++)
shapes[i] = shapes[i].Pad(dims);
// determine operation shape (max over all dimensions)
decltype(as) os(dims);
vector<size_t> opDims(dims, 0);
for (size_t k = 0; k < dims; k++)
os[k] = max(max(as[k], bs[k]), cs[k]);
for (size_t i = 0; i < N; i++)
opDims[k] = max(opDims[k], shapes[i][k]);
// dimension compatibility check
// Each participant can broadcast. Non-broadcasting dimensions must match the operation dimension.
for (size_t k = 0; k < dims; k++)
{
if (!Matches(as[k], os[k]) || !Matches(bs[k], os[k]) || !Matches(cs[k], os[k]))
InvalidArgument("Binary tensor operation: Dimension %d is incompatible between the two inputs and output (%d vs. %d vs. %d)", (int)dims, (int)as[k], (int)bs[k], (int)cs[k]);
}
for (size_t i = 0; i < N; i++)
if (!Matches(shapes[i][k], opDims[k]))
InvalidArgument("Binary tensor operation: Dimension %d is incompatible between input %d and output (%s vs. %s)", (int)k, (int)shapes[i][k], string(shapes[i]).c_str(), string(TensorShape(opDims)).c_str());
// flatten consecutive dimensions
// Dimensions must be consecutive in memory, and either non-broadcasting or all-broadcasting, across all dimensions.
// After this, as, bs, and cs no longer match the TensorShape objects.
//fprintf(stderr, "Pre-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str());
for (size_t k = 1; k < dims; k++)
{
// check if stored without gaps to skip
if (!a.GetShape().CanFlatten(k) || !b.GetShape().CanFlatten(k) || !c.GetShape().CanFlatten(k))
continue;
// check if they are either all broadcasting or all not broadcasting
if ((as[k] != os[k] || as[k - 1] != os[k - 1]) && (as[k] != 1 || as[k - 1] != 1))
continue;
if ((bs[k] != os[k] || bs[k - 1] != os[k - 1]) && (bs[k] != 1 || bs[k - 1] != 1))
continue;
if ((cs[k] != os[k] || cs[k - 1] != os[k - 1]) && (cs[k] != 1 || cs[k - 1] != 1))
continue;
// merge the dimensions
as[k] *= as[k - 1]; as[k - 1] = 1;
bs[k] *= bs[k - 1]; bs[k - 1] = 1;
cs[k] *= cs[k - 1]; cs[k - 1] = 1;
// BUGBUG: Must update multipliers as well
for (size_t i = 0; i < N; i++)
{
// check if stored without gaps to skip
if (!shapes[i].CanFlatten(k))
goto nope;
// check if they are either all broadcasting or all not broadcasting
if ((shapes[i][k] != opDims[k] || shapes[i][k - 1] != opDims[k - 1]) && (shapes[i][k] != 1 || shapes[i][k - 1] != 1))
goto nope;
}
// these dimensions can be merged
for (size_t i = 0; i < N; i++)
shapes[i] = shapes[i].Flatten(k); // TODO: overdoing the immutable thingy much?
opDims = TensorShape(opDims).Flatten(k).GetDims(); // (ugh)
nope:;
}
//fprintf(stderr, "Post-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str());
// remove singleton dimensions
size_t j = 0;
vector<bool> toDrop(dims, false);
for (size_t k = 0; k < dims; k++)
{
if (as[k] == 1 && bs[k] == 1 && cs[k] == 1) // skip all-singleton dimensions
continue;
as[j] = as[k];
bs[j] = bs[k];
cs[j] = cs[k];
os[j] = os[k];
j++;
for (size_t i = 0; i < N; i++)
if (shapes[i][k] != 1)
goto neither;
toDrop[k] = true; // found an all-singleton dimension
neither:;
}
// note: if op is a scalar, then we end up with 0 dimensions here
dims = j;
as.resize(dims);
bs.resize(dims);
cs.resize(dims);
os.resize(dims);
let as1 = TensorShape(as); // BUGBUG: We just lost stride info.
let bs1 = TensorShape(bs);
let cs1 = TensorShape(cs);
let os1 = TensorShape(os);
for (size_t i = 0; i < N; i++)
shapes[i] = shapes[i].DropDims(toDrop);
opDims = TensorShape(opDims).DropDims(toDrop).GetDims(); // (ugh)
dims = opDims.size(); // #dims has changed
for (size_t i = 0; i < N; i++)
assert(dims == shapes[i].size());
// note: if op is a scalar, then we end up with 0 dimensions here, which is allowed
//fprintf(stderr, "Post-drop: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str());
// determine broadcasting; that is, set strides to 0 for 1-dimensions
// To be more precise, we should only set actually broadcasting dimensions to 0.
// But since dimensions that are 1 across all args are eliminated, any 1 must be some form of broadcasting.
// TODO: Do we need to allow other strides at this point in time? If not, broadcasting becomes a bit vector.
for (size_t i = 0; i < N; i++)
shapes[i] = shapes[i].WithBroadcastStrides();
//fprintf(stderr, "%s op %s -> %s via %s\n", string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str());
// determine inverse broadcasting dimensions
// TODO: describe the resulting for loop as a set of tensor dims and strides as well.
vector<bool> cBroadcasts(dims);
// Inverse broadcasting dims are actual for loops in the kernel, whereas broadcasting input dims are handled by the thread index.
// For regular input dims:
// - determine number of steps (product over opDims[.])
// - launch that many kernels
// - pass in:
// - total number of steps
// - strides for all inputs (with stride magic), separated by regular and inverse broadcasting dimensions
// - opDim (no stride magic allowed) for regular broadcasting dimensions
// - reverse broadcasting dimensions
// - opcodes for elementwise op and reduction op
// - in each kernel:
// - map thread index to dimensions (regular broadcasting ones)
// - for-loop over inverse broadcasting dimensions
// - map dimensions (including inverse broadcasting) for every input
// - perform op on the input values
// - accumulate
// - map dimensions (regular) for output
// - save result
// separate out the inverse-broadcasting dimensions
// Any singleton dimension in the result tensor is inverse-broadcasting, because there must be at least one non-1 dimension
// in one of the inputs, otherwise the entire dimension would have been optimized away above.
vector<bool> isReducingDim(dims); // true for each inverse-broadcasting dimension
for (size_t k = 0; k < dims; k++)
cBroadcasts[k] = cs1[k] == 1 && (as1[k] != 1 || bs1[k] != 1);
isReducingDim[k] = shapes.back()[k] == 1;
// form the regular (non-inverse-broadcasting) dims
for (size_t i = 0; i < N; i++)
regularStrides[i] = shapes[i].DropDims(isReducingDim).GetStrides();
regularOpDims = TensorShape(opDims).DropDims(isReducingDim).GetDims(); // (ugh)
// form the inverse-broadcasting dims
vector<bool> isRegularDim(dims); // true for each regular (non-reducing) dimension
for (size_t k = 0; k < dims; k++)
isRegularDim[k] = !isReducingDim[k]; // (no way to do this more nicely?)
for (size_t i = 0; i < N; i++)
reducingStrides[i] = shapes[i].DropDims(isRegularDim).GetStrides();
reducingOpDims = TensorShape(opDims).DropDims(isRegularDim).GetDims(); // (ugh)
for (size_t i = 0; i < N; i++)
offsets[i] = shapes[i].GetOffset();
}
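// Illustrative reference only (hypothetical helper, not the CPUMatrix/GPUMatrix kernels): the loop
// structure that the plan above describes, specialized to two inputs, elementwise sum, reduction by
// summation, and beta = 0 / alpha = 1. It walks the regular (output) dimensions with an odometer
// index, and for each output element reduces over the inverse-broadcasting dimensions, applying the
// per-operand strides computed by PrepareTensorOperands(). Assumes non-empty dimensions.
template<class ElemType>
static void ReferenceBinarySumOp(const ElemType* pa, const ElemType* pb, ElemType* pc,
                                 const array<size_t, 3> & offsets,
                                 const vector<size_t> & regularOpDims,  const array<vector<ptrdiff_t>, 3> & regularStrides,
                                 const vector<size_t> & reducingOpDims, const array<vector<ptrdiff_t>, 3> & reducingStrides)
{
    vector<size_t> rIndex(regularOpDims.size(), 0);                 // multi-index over the regular (output) dimensions
    for (;;)
    {
        // map the regular multi-index to an element offset per operand (broadcasting inputs have stride 0)
        ptrdiff_t ia = (ptrdiff_t)offsets[0], ib = (ptrdiff_t)offsets[1], ic = (ptrdiff_t)offsets[2];
        for (size_t k = 0; k < rIndex.size(); k++)
        {
            ia += (ptrdiff_t)rIndex[k] * regularStrides[0][k];
            ib += (ptrdiff_t)rIndex[k] * regularStrides[1][k];
            ic += (ptrdiff_t)rIndex[k] * regularStrides[2][k];
        }
        // reduce over the inverse-broadcasting dimensions; the output offset ic stays fixed here
        ElemType sum = 0;
        vector<size_t> dIndex(reducingOpDims.size(), 0);
        for (;;)
        {
            ptrdiff_t ja = ia, jb = ib;
            for (size_t k = 0; k < dIndex.size(); k++)
            {
                ja += (ptrdiff_t)dIndex[k] * reducingStrides[0][k];
                jb += (ptrdiff_t)dIndex[k] * reducingStrides[1][k];
            }
            sum += pa[ja] + pb[jb];                                 // the elementwise op (opSum), accumulated
            size_t k = 0;                                           // odometer increment of the reduction index
            while (k < dIndex.size() && ++dIndex[k] == reducingOpDims[k]) dIndex[k++] = 0;
            if (k == dIndex.size())
                break;                                              // all reduction positions visited
        }
        pc[ic] = sum;                                               // beta = 0: overwrite; alpha = 1: no scaling
        size_t k = 0;                                               // odometer increment of the output index
        while (k < rIndex.size() && ++rIndex[k] == regularOpDims[k]) rIndex[k++] = 0;
        if (k == rIndex.size())
            break;                                                  // all output elements written
    }
}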
template<class ElemType>
void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op)
{
// prepare all tensor descriptor information as needed for execution
array<size_t, 2> offsets;
array<vector<ptrdiff_t>, 2> regularStrides, reducingStrides;
vector<size_t> regularOpDims, reducingOpDims;
PrepareTensorOperands<ElemType,2>(array<TensorShape, 2> { a.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// now perform the operation
fprintf(stderr, "Op %d: %s op %s -> %s via %s\n", (int)op, string(as1).c_str(), string(bs1).c_str(), string(cs1).c_str(), string(os1).c_str());
// :)
beta; alpha;
GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
template<class ElemType>
void TensorView<ElemType>::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op)
{
array<size_t, 3> offsets;
array<vector<ptrdiff_t>, 3> regularStrides, reducingStrides;
vector<size_t> regularOpDims, reducingOpDims;
PrepareTensorOperands<ElemType, 3>(array<TensorShape, 3> { a.GetShape(), b.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
template<class ElemType>
void TensorView<ElemType>::DoTernaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha, ElementWiseOperator op)
{
array<size_t, 4> offsets;
array<vector<ptrdiff_t>, 4> regularStrides, reducingStrides;
vector<size_t> regularOpDims, reducingOpDims;
PrepareTensorOperands<ElemType, 4>(array<TensorShape, 4> { a.GetShape(), b.GetShape(), c.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), c.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
// simple test function for testing stuff
@ -153,16 +226,67 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
/*static*/ void TensorView<ElemType>::Test()
{
Matrix<ElemType> m1(0); m1.Resize(1, 42);
Matrix<ElemType> m2(0); m2.Resize(13, 1);
Matrix<ElemType> m3(0); m3.Resize(13, 21);
TensorShape s1(1, 2, 21);
TensorShape s2(13, 1);
TensorShape s3(13, 1, 21);
let t1 = TensorView<ElemType>(m1, s1); t1;
let t2 = TensorView<ElemType>(m2, s2); t2;
auto t3 = TensorView<ElemType>(m3, s3); t3;
t3.DoSumOf(0, t1, t2, 1);
Matrix<ElemType> m1(-1);
Matrix<ElemType> m2(-1);
Matrix<ElemType> m3(-1);
{
m1.SetValue(5, 3, { 1, 2, 3,
14, 15, 6,
4, 5, 16,
41, 5, 1,
1.8, 4.5, 7 });
m2.SetValue(5, 1, { 42,
13,
1968,
3.1415f,
7 });
m3.Resize(m1);
// regular zip (just add m1 to itself)
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m1), 1);
m3.Print();
// unary op
TensorView(m3).DoSqrtOf(0, TensorView(m1), 1);
m3.Print();
// broadcasting of an input
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
TensorView(m3).DoMaxOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
TensorView(m3).DoGTOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
// reduction over columns
m3.Resize(5, 1);
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
// reduction over rows
m3.Resize(1, 3);
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
TensorView(m3).DoLogSumOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
}
{
m1.Resize(1, 42);
m2.Resize(13, 1);
m3.Resize(13, 21);
TensorShape s1(1, 2, 21);
TensorShape s2(13, 1);
TensorShape s3(13, 1, 21);
let t1 = TensorView<ElemType>(m1, s1); t1;
let t2 = TensorView<ElemType>(m2, s2); t2;
auto t3 = TensorView<ElemType>(m3, s3); t3;
t3.DoSumOf(0, t1, t2, 1);
m3.Print();
}
}
template class TensorView<float>;

View file

@ -4,7 +4,7 @@
// </copyright>
//
// This implements the TensorView class, which is a layer around Matrix that reinterprets its content as a generic tensor.
// This implements the TensorView class, which is a layer around Matrix that reinterprets its content as a generic tensor. [fseide]
#pragma once
@ -36,17 +36,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{ }
// copy constructor
TensorView(const TensorView<ElemType> & other) :
TensorView(other.m_sob, other.m_shape)
TensorView(*other.m_sob, other.m_shape)
{ }
// assignment is forbidden since we contain a reference
// If you ever need this, change the reference to a pointer.
void operator=(const TensorView & other) = delete; // since we have a reference
// -------------------------------------------------------------------
// accessors
// -------------------------------------------------------------------
const Matrix<ElemType> & GetSOB() const { return m_sob; }
Matrix<ElemType> & GetSOB() const { return *m_sob; }
const TensorShape & GetShape() const { return m_shape; }
// -------------------------------------------------------------------
@ -59,20 +56,50 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// If beta == 0, c is not read out, i.e. it can be uninitialized or contain NaNs.
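// For example (sketch of the convention, with a and b as TensorViews over c's storage-compatible shapes):
// DoSumOf(0, a, b, 1) overwrites c with a + b, while DoSumOf(1, a, b, 1) accumulates a + b into the existing c.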
// -------------------------------------------------------------------
void DoSumOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha) { DoBinaryOpOf(beta, a, b, alpha, 0); }
#pragma push_macro("DeclareUnaryTensorOp")
#define DeclareUnaryTensorOp(oper) \
void Do ## oper ## Of(ElemType beta, const TensorView & a, ElemType alpha) { DoUnaryOpOf(beta, a, alpha, ElementWiseOperator::op ## oper); }
ForAllUnaryOps(DeclareUnaryTensorOp);
ForAllParameterizedUnaryOps(DeclareUnaryTensorOp);
//DeclareUnaryTensorOp(Copy);
//DeclareUnaryTensorOp(Negate); DeclareUnaryTensorOp(Not);
//DeclareUnaryTensorOp(Abs);
//DeclareUnaryTensorOp(Sigmoid); DeclareUnaryTensorOp(SigmoidDerivative); DeclareUnaryTensorOp(Tanh); DeclareUnaryTensorOp(Sqrt); DeclareUnaryTensorOp(Exp); DeclareUnaryTensorOp(Log); DeclareUnaryTensorOp(LinearRectifierDerivative); DeclareUnaryTensorOp(Cosine); DeclareUnaryTensorOp(NegativeSine);
//DeclareUnaryTensorOp(SaturateBetaAlpha); DeclareUnaryTensorOp(SumAlpha); DeclareUnaryTensorOp(SubDifferenceToAlpha); DeclareUnaryTensorOp(SubDifferenceFromAlpha);
#pragma pop_macro("DeclareUnaryTensorOp")
#pragma push_macro("DeclareBinaryTensorOp")
#define DeclareBinaryTensorOp(oper) \
void Do ## oper ## Of(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha) { DoBinaryOpOf(beta, a, b, alpha, ElementWiseOperator::op ## oper); }
ForAllBinaryOps(DeclareBinaryTensorOp);
//DeclareBinaryTensorOp(Sum); DeclareBinaryTensorOp(Difference); DeclareBinaryTensorOp(ElementWiseProduct); DeclareBinaryTensorOp(ElementWiseQuotient);
//DeclareBinaryTensorOp(LogSum); DeclareBinaryTensorOp(Max); DeclareBinaryTensorOp(Min);
//DeclareBinaryTensorOp(EQ); DeclareBinaryTensorOp(NE); DeclareBinaryTensorOp(GT); DeclareBinaryTensorOp(LT); DeclareBinaryTensorOp(GE); DeclareBinaryTensorOp(LE);
#pragma pop_macro("DeclareBinaryTensorOp")
#pragma push_macro("DeclareTernaryTensorOp")
#define DeclareTernaryTensorOp(oper) \
void Do ## oper ## Of(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha) { DoTernaryOpOf(beta, a, b, c, alpha, ElementWiseOperator::op ## oper); }
ForAllTernaryOps(DeclareTernaryTensorOp);
#pragma pop_macro("DeclareTernaryTensorOp")
static void Test();
private:
void DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, int op/*will become an enum later*/);
void DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op);
void DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op);
void DoTernaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha, ElementWiseOperator op);
// -------------------------------------------------------------------
// sob members
// -------------------------------------------------------------------
Matrix<ElemType> & m_sob; // Storage OBject that holds the data that is being viewed with this TensorView
TensorShape m_shape; // the meta-data that describes the data's shape and/or access pattern
Matrix<ElemType> * m_sob; // Storage OBject that holds the data that is being viewed with this TensorView. Pointer instead of ref so this object is copyable.
TensorShape m_shape; // the meta-data that describes the data's shape and/or access pattern
// TODO: use a reference here or not? With a reference, we can hide more info in here such as cuDNN handles
};

View file

@ -16,6 +16,7 @@
#include <sstream> // TODO: this should go away once we update the parameter parsing
#include <unordered_map>
#include <opencv2/opencv.hpp>
#include <omp.h>
namespace Microsoft { namespace MSR { namespace CNTK {
@ -400,6 +401,10 @@ void ImageReader<ElemType>::InitFromConfig(const ConfigRecordType& config)
m_prefetch = config(L"prefetch", true);
int cthread = config(L"numCPUThreads", 0);
if (cthread > 0)
omp_set_num_threads(cthread);
m_epochStart = 0;
m_mbStart = 0;
}
@ -412,11 +417,16 @@ void ImageReader<ElemType>::Destroy()
}
template<class ElemType>
void ImageReader<ElemType>::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples)
void ImageReader<ElemType>::StartDistributedMinibatchLoop(size_t mbSize, size_t epoch, size_t subsetNum, size_t numSubsets, size_t requestedEpochSamples)
{
assert(mbSize > 0);
assert(numSubsets > 0);
assert(subsetNum < numSubsets);
assert(requestedEpochSamples > 0);
m_subsetNum = subsetNum;
m_numSubsets = numSubsets;
if (m_imgListRand)
std::shuffle(m_files.begin(), m_files.end(), m_rng);
@ -457,7 +467,6 @@ bool ImageReader<ElemType>::GetMinibatch(std::map<std::wstring, Matrix<ElemType>
m_pMBLayout->InitAsFrameMode(mbSize);
m_mbStart += mbSize;
// It is safe to run prefetching with just one buffer as SetValue is synchronous so there will be no race.
m_mbPrefetchFut = std::async(GetLaunchPolicy(m_prefetch), [this]() { return ReadImages(); });
@ -505,10 +514,15 @@ size_t ImageReader<ElemType>::ReadImages()
std::fill(m_labBuf.begin(), m_labBuf.end(), static_cast<ElemType>(0));
size_t actualMBSize = mbLim - m_mbStart;
size_t iStart = actualMBSize * m_subsetNum / m_numSubsets;
size_t iLim = actualMBSize * (m_subsetNum + 1) / m_numSubsets;
size_t subsetSize = iLim - iStart;
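// Each subset (worker) reads a contiguous, non-overlapping slice of the minibatch; the integer
// rounding above makes consecutive slices meet exactly, so together they cover [0, actualMBSize).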
#pragma omp parallel for ordered schedule(dynamic)
for (long long i = 0; i < static_cast<long long>(mbLim - m_mbStart); i++)
for (long long i = 0; i < static_cast<long long>(subsetSize); i++)
{
const auto& p = m_files[i + m_mbStart];
const auto& p = m_files[m_mbStart + iStart + i];
cv::Mat img{ cv::imread(p.first, cv::IMREAD_COLOR) };
if (!img.data)
RuntimeError("Cannot read image file %s", p.first.c_str());
@ -522,7 +536,8 @@ size_t ImageReader<ElemType>::ReadImages()
m_labBuf[m_labDim * i + p.second] = 1;
}
return mbLim - m_mbStart;
m_mbStart += actualMBSize;
return subsetSize;
}
template class ImageReader<double>;

View file

@ -39,7 +39,12 @@ public:
virtual void Init(const ScriptableObjects::IConfigRecord & config) override { InitFromConfig(config); }
#endif
void Destroy() override;
void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize) override;
bool SupportsDistributedMBRead() const { return true; }
void StartDistributedMinibatchLoop(size_t mbSize, size_t epoch, size_t subsetNum, size_t numSubsets, size_t requestedEpochSamples = requestDataSize) override;
void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize) override
{
return StartDistributedMinibatchLoop(mbSize, epoch, 0, 1, requestedEpochSamples);
}
bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices) override;
bool DataEnd(EndDataType endDataType) override;
@ -73,6 +78,9 @@ private:
size_t m_epochStart;
size_t m_mbStart;
size_t m_subsetNum;
size_t m_numSubsets;
bool m_prefetch;
std::future<size_t> m_mbPrefetchFut;
std::vector<ElemType> m_featBuf;

View file

@ -32,22 +32,26 @@ namespace Microsoft { namespace MSR { namespace CNTK {
assert(pMBLayout->GetNumParallelSequences() == m_numUttsPerMinibatch);
uttInfoInMinibatch->clear();
uttInfoInMinibatch->resize(uttInfo.size());
for (size_t i = 0; i < uttInfo.size(); ++i)
{
size_t startFrameIndexInMinibatch = 0;
size_t numFrames = 0;
for (size_t j = 0; j < pMBLayout->GetNumTimeSteps(); ++j)
{
if (pMBLayout->Is(i, j, MinibatchPackingFlags::NoLabel))
/* if (pMBLayout->Is(i, j, MinibatchPackingFlags::NoLabel))
{
continue;
}
if (pMBLayout->Is(i, j, MinibatchPackingFlags::NoFeature))
}*/
FrameRange fr(pMBLayout,j);
if (pMBLayout->IsGap(fr.Sequence(i)))
{
continue;
}
numFrames += 1;
if (pMBLayout->Is(i, j, MinibatchPackingFlags::SequenceEnd)
if (pMBLayout->IsBeyondStartOrEnd(fr.WithTimeOffset((ptrdiff_t) 1).Sequence(i))
|| j == pMBLayout->GetNumTimeSteps() - 1)
{
size_t uttIndex = (*uttInfoInMinibatch)[i].size();

View file

@ -4,10 +4,10 @@
// </copyright>
//
//
#include "stdafx.h"
#ifdef _WIN32
#include <objbase.h>
#endif
#include "Basics.h"
#include <fstream>
#include <algorithm>

View file

@ -12,21 +12,6 @@
namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
void DATAWRITER_API GetWriter(IDataWriter<ElemType>** pwriter)
{
*pwriter = new LMSequenceWriter<ElemType>();
}
extern "C" DATAWRITER_API void GetWriterF(IDataWriter<float>** pwriter)
{
GetWriter(pwriter);
}
extern "C" DATAWRITER_API void GetWriterD(IDataWriter<double>** pwriter)
{
GetWriter(pwriter);
}
template<class ElemType>
class LMSequenceWriter : public IDataWriter<ElemType>
{
@ -65,8 +50,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
public:
using LabelType = typename IDataWriter<ElemType>::LabelType;
using LabelIdType = typename IDataWriter<ElemType>::LabelIdType;
void GetSections(std::map<std::wstring, SectionType, nocase_compare>& /*sections*/){}
void SaveMapping(std::wstring saveId, const std::map<typename LabelIdType, typename LabelType>& /*labelMapping*/){}
void SaveMapping(std::wstring saveId, const std::map<LabelIdType, LabelType>& /*labelMapping*/){}
public:
template<class ConfigRecordType>
@ -77,4 +64,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual bool SaveData(size_t recordStart, const std::map<std::wstring, void*, nocase_compare>& matrices, size_t numRecords, size_t datasetSize, size_t byteVariableSized);
};
template<class ElemType>
void DATAWRITER_API GetWriter(IDataWriter<ElemType>** pwriter)
{
assert(pwriter != nullptr);
*pwriter = new LMSequenceWriter<ElemType>();
assert(*pwriter != nullptr);
}
extern "C" DATAWRITER_API void GetWriterF(IDataWriter<float>** pwriter)
{
GetWriter(pwriter);
}
extern "C" DATAWRITER_API void GetWriterD(IDataWriter<double>** pwriter)
{
GetWriter(pwriter);
}
}}}

Diff not shown because the file is too large. Load diff

Diff not shown because the file is too large. Load diff

Diff not shown because the file is too large. Load diff

Diff not shown because the file is too large. Load diff