full tensor support for Mean and InvStdDev operations
Parent: e0be5a1c58
Commit: 359d90ab09
@@ -47,7 +47,7 @@ public:
     virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
     {
         size_t rank = DetermineElementwiseTensorRank();
-        auto result = ValueTensorFor(rank, fr);
+        auto result =           ValueTensorFor(rank, fr);
         auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
         auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
         result.AssignSumOf(input0, input1);
@@ -56,7 +56,7 @@ public:
     virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
     {
         size_t rank = DetermineElementwiseTensorRank();
-        auto gradient = GradientTensorFor(rank, fr);
+        auto gradient      =                    GradientTensorFor(rank, fr);
         auto inputGradient = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast());

         // if reduction then mask the respective input(s) (zero out the gaps)
@@ -77,12 +77,8 @@ template class PlusNode<double>;
 template <class ElemType>
 class MinusNode : public BinaryElementWiseNode<ElemType>
 {
-    typedef BinaryElementWiseNode<ElemType> Base;
-    UsingBinaryElementwiseNodeBaseMembers;
-    static const std::wstring TypeName()
-    {
-        return L"Minus";
-    }
+    typedef BinaryElementWiseNode<ElemType> Base; UsingBinaryElementwiseNodeBaseMembers;
+    static const std::wstring TypeName() { return L"Minus"; }

 public:
     DeclareConstructorFromConfigWithNumInputs(MinusNode);
@@ -95,7 +91,7 @@ public:
     {
         ElemType sign = inputIndex == 0 ? 1.0f : -1.0f;
         size_t rank = DetermineElementwiseTensorRank();
-        auto gradient = GradientTensorFor(rank, fr);
+        auto gradient      =                    GradientTensorFor(rank, fr);
         auto inputGradient = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast());

         // if reduction then mask the respective input(s) (zero out the gaps)
@@ -108,7 +104,7 @@ public:
     virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
     {
         size_t rank = DetermineElementwiseTensorRank();
-        auto result = ValueTensorFor(rank, fr);
+        auto result =           ValueTensorFor(rank, fr);
         auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
         auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
         result.AssignDifferenceOf(input0, input1);
@@ -126,12 +122,8 @@ template class MinusNode<double>;
 template <class ElemType>
 class NegateNode : public ComputationNode<ElemType>, public NumInputs<1>
 {
-    typedef ComputationNode<ElemType> Base;
-    UsingComputationNodeMembersBoilerplate;
-    static const std::wstring TypeName()
-    {
-        return L"Negate";
-    }
+    typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
+    static const std::wstring TypeName() { return L"Negate"; }

 public:
     DeclareConstructorFromConfigWithNumInputs(NegateNode);
@@ -138,8 +138,7 @@ public:
 template <class ElemType>
 class MeanInvStdDevNodeBase : public PreComputedNodeBase<ElemType>, public NumInputs<1>
 {
-    typedef PreComputedNodeBase<ElemType> Base;
-    UsingPreComputedNodeMembers;
+    typedef PreComputedNodeBase<ElemType> Base; UsingPreComputedNodeMembers;
     // static const std::wstring TypeName() { return L"MeanInvStdDev (base)"; }
 public:
     // DeclareConstructorFromConfigWithNumInputs(MeanInvStdDevNodeBase);
@@ -219,12 +218,8 @@ protected:
 template <class ElemType>
 class MeanNode : public MeanInvStdDevNodeBase<ElemType>
 {
-    typedef MeanInvStdDevNodeBase<ElemType> Base;
-    UsingMeanInvStdDevNodeBaseNodeMembers;
-    static const std::wstring TypeName()
-    {
-        return L"Mean";
-    }
+    typedef MeanInvStdDevNodeBase<ElemType> Base; UsingMeanInvStdDevNodeBaseNodeMembers;
+    static const std::wstring TypeName() { return L"Mean"; }

 public:
     DeclareConstructorFromConfigWithNumInputs(MeanNode);
@@ -232,11 +227,11 @@ public:
         : Base(deviceId, name)
     {
     }

     MeanNode(DEVICEID_TYPE deviceId, const wstring& name, size_t)
         : Base(deviceId, name)
     {
     }

     virtual void /*PreComputedNodeBase::*/ MarkComputed(const bool hasComputed)
     {
         Base::MarkComputed(hasComputed);
@@ -260,19 +255,27 @@ public:
         // set gaps to zero, since we are reducing in time
         Input(0)->MaskMissingValueColumnsToZero(fr);

-        auto& samples = Input(0)->Value();
-        auto& avg = Value();
-
-#if NANCHECK
-        samples.HasNan("Mean-Samples");
-#endif
         size_t numNewSamples = Input(0)->GetMBLayout()->GetActualNumSamples();
         size_t totalNumSamples = m_numSamples + numNewSamples;
         if (totalNumSamples == 0)
             totalNumSamples = 1; // 0/0=1 in this context
-        Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / totalNumSamples, samples, false,
+        ElemType alpha = 1.0f / totalNumSamples;
+        ElemType beta = (ElemType)m_numSamples / totalNumSamples;
+#if 1
+        size_t rank = DetermineElementwiseTensorRank();
+        auto mean  = ValueTensorFor(rank, FrameRange()); // mean is formed directly in our m_value
+        auto input = Input(0)->ValueTensorFor(rank, fr);
+
+        mean.DoCopyOf(beta, input, alpha);
+        // Note: We leverage that TensorView allows "broadcasting" the output,
+        // which really means a reduction.
+#else
+        auto& samples = Input(0)->Value();
+        auto& avg = Value();
+        Matrix<ElemType>::MultiplyAndWeightedAdd(alpha, samples, false,
                                                  ConstOnes(Input(0)->Value().GetNumCols(), 1, samples.GetDeviceId()),
-                                                 false, (ElemType) m_numSamples / totalNumSamples, avg);
+                                                 false, beta, avg);
+#endif
 #if NANCHECK
         avg.HasNan("Mean-avg");
 #endif
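Note: the tensor path above implements the standard incremental-mean recurrence. With m samples already accumulated and k new ones, beta = m/(m+k) and alpha = 1/(m+k), and mean' = beta*mean + alpha*sum(batch); the sum over the minibatch falls out of the "broadcasting the output" reduction the code comments mention. A minimal scalar sketch of the same rule (illustrative names, not CNTK API):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // mean' = beta * mean + alpha * sum(batch), alpha = 1/total, beta = numOld/total
    double UpdateRunningMean(double mean, std::size_t numOld, const std::vector<double>& batch)
    {
        std::size_t total = numOld + batch.size();
        if (total == 0)
            return mean; // matches the 0/0=1 convention above: nothing to fold in
        double alpha = 1.0 / total;
        double beta  = static_cast<double>(numOld) / total;
        double sum = 0;
        for (double x : batch)
            sum += x;
        return beta * mean + alpha * sum;
    }

    int main()
    {
        double mean = UpdateRunningMean(0.0, 0, {1, 2, 3}); // 2.0
        mean = UpdateRunningMean(mean, 3, {4, 5, 6});
        assert(mean == 3.5); // same as (1+2+...+6)/6
        return 0;
    }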
@@ -292,12 +295,8 @@ template class MeanNode<double>;
 template <class ElemType>
 class InvStdDevNode : public MeanInvStdDevNodeBase<ElemType>
 {
-    typedef MeanInvStdDevNodeBase<ElemType> Base;
-    UsingMeanInvStdDevNodeBaseNodeMembers;
-    static const std::wstring TypeName()
-    {
-        return L"InvStdDev";
-    }
+    typedef MeanInvStdDevNodeBase<ElemType> Base; UsingMeanInvStdDevNodeBaseNodeMembers;
+    static const std::wstring TypeName() { return L"InvStdDev"; }

 public:
     DeclareConstructorFromConfigWithNumInputs(InvStdDevNode);
@@ -316,31 +315,21 @@ public:
         if (!m_hasComputed) // initialize
         {
-            // reset accumulators
-            size_t inputDim = Input(0)->GetSampleMatrixNumRows();
-            m_mean.Resize(inputDim, 1);
-            m_var.Resize(inputDim, 1);
-            m_mean.SetValue(0);
-            m_var.SetValue(0);
             UpdateFunctionValuesSize();
-            Value().SetValue(0); // also set this because not doing it may flag during debugging; avoids special-casing this
+            Value().SetValue(0);    // Note: We must do this here already because dimensions are verified at places.
+            m_mean.Resize(Value()); // mean accumulator normalized by #samples in it
+            m_var .Resize(Value()); // likewise the variance
+            m_temp.Resize(Value()); // and a temp
+            m_mean.SetValue(0);     // reset the mean and var accumulators
+            m_var .SetValue(0);
         }
         else // finalize
         {
             // m_value <- 1/stddev
             ElemType sqrtFloor = 1e-10f;
             m_var.InplaceTruncateBottom(sqrtFloor); // prevent too small variance (and negative square roots due to numeric inaccuracy)
-#if NANCHECK
-            m_var.HasNan("MarkComputed-InplaceTruncateBottom");
-#endif
             m_var.InplaceSqrt();

-#if NANCHECK
-            m_var.HasNan("MarkComputed-InplaceSqrt");
-#endif
             m_var.ElementInverse();

-#if NANCHECK
-            m_var.HasNan("MarkComputed-ElementInverse()");
-#endif
             Value().SetValue(m_var);
         }
     }
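Note: the finalize branch computes Value = 1 / sqrt(max(var, 1e-10)); the floor guards against zero or slightly negative variances produced by numeric error before the square root and reciprocal are taken. A scalar equivalent of the three in-place ops, for reference (illustrative, not CNTK API):

    #include <algorithm>
    #include <cmath>

    // InplaceTruncateBottom -> InplaceSqrt -> ElementInverse, per element
    double Finalize(double var, double sqrtFloor = 1e-10)
    {
        return 1.0 / std::sqrt(std::max(var, sqrtFloor));
    }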
@@ -357,29 +346,55 @@ public:
         // set gaps to zero, since we are reducing in time
         Input(0)->MaskMissingValueColumnsToZero(fr);

-        auto& samples = Input(0)->Value();
-#if NANCHECK
-        samples.HasNan("InvStdDev-Samples");
-#endif
-        m_temp.SetValue(m_mean);
+        //m_temp.SetValue(m_mean); // old mean
         size_t numNewSamples = Input(0)->GetMBLayout()->GetActualNumSamples();
         size_t totalNumSamples = m_numSamples + numNewSamples;
         if (totalNumSamples == 0)
             totalNumSamples = 1; // 0/0=1 in this context
-        Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / totalNumSamples, samples, false,
-                                                 ConstOnes(Input(0)->Value().GetNumCols(), 1, samples.GetDeviceId()),
-                                                 false, (ElemType) m_numSamples / totalNumSamples, m_mean);
+        ElemType alpha = 1.0f / totalNumSamples;
+        ElemType beta = (ElemType)m_numSamples / totalNumSamples;
+#if 1
+        size_t rank = DetermineElementwiseTensorRank();
+        auto input = Input(0)->ValueTensorFor(rank, fr);
+        auto mean  = DataTensorFor(m_mean, rank, FrameRange());
+        auto temp  = DataTensorFor(m_temp, rank, FrameRange());
+        auto var   = DataTensorFor(m_var,  rank, FrameRange());
+
+        // preserve the old mean value for the next step
+        temp.AssignCopyOf(mean);
+
+        // accumulate the mean
+        mean.DoCopyOf(beta, input, alpha); // Note: This reduces over samples.
+#else
+        auto& samples = Input(0)->Value();
+        Matrix<ElemType>::MultiplyAndWeightedAdd(alpha, samples, false,
+                                                 ConstOnes(Input(0)->Value().GetNumCols(), 1, samples.GetDeviceId()),
+                                                 false, beta, m_mean);
+#endif

         // compute the correction term
+#if 1
+        // var += (oldMean - newMean)^2
+        temp.DoCopyOf(1.0f, mean, -1.0f); // subtract new 'mean' from the old one
+        var .DoSqrOf (1.0f, temp, 1.0f);  // add the square
+
+        // var += (input - mean)^2
+        auto& temp2 = temp; // another temp variable, for which we can reuse the first one
+        temp2.AssignDifferenceOf(input, mean); // Note: This also reduces over samples.
+        var.DoSqrOf(beta, temp2, alpha);
+#else
         // var += (oldMean - newMean)^2
         m_temp -= m_mean;
         m_temp.AssignElementPowerOf(m_temp, 2);
         m_var += m_temp;

-        m_temp.AssignDifferenceOf(samples, m_mean);
+        m_temp.AssignDifferenceOf(Input(0)->Value(), m_mean);
         m_temp.AssignElementPowerOf(m_temp, 2);

-        Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / totalNumSamples, m_temp, false,
+        Matrix<ElemType>::MultiplyAndWeightedAdd(alpha, m_temp, false,
                                                  ConstOnes(Input(0)->Value().GetNumCols(), 1, samples.GetDeviceId()),
-                                                 false, (ElemType) m_numSamples / totalNumSamples, m_var);
+                                                 false, beta, m_var);
+#endif

 #if NANCHECK
         m_var.HasNan("InvStdDev-m_var");
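Note: the two DoSqrOf calls implement the exact update for a running (population) variance when the mean shifts. With m old samples, k new ones, n = m + k, beta = m/n, alpha = 1/n:

    var' = beta * (var + (oldMean - newMean)^2) + alpha * sum((x - newMean)^2)

The first call adds the mean-shift correction; the second folds in the new batch and rescales by beta/alpha. A scalar sketch that checks the recurrence against a direct computation (illustrative names, not part of the commit):

    #include <cassert>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // One accumulation step for running mean/variance, as in the tensor path above.
    void Accumulate(double& mean, double& var, std::size_t numOld, const std::vector<double>& batch)
    {
        std::size_t total = numOld + batch.size();
        if (total == 0)
            return;
        double alpha = 1.0 / total, beta = static_cast<double>(numOld) / total;

        double oldMean = mean, sum = 0;
        for (double x : batch)
            sum += x;
        mean = beta * oldMean + alpha * sum; // accumulate the mean

        double corr = (oldMean - mean) * (oldMean - mean); // (oldMean - newMean)^2
        double sqSum = 0;
        for (double x : batch)
            sqSum += (x - mean) * (x - mean);
        var = beta * (var + corr) + alpha * sqSum;
    }

    int main()
    {
        double mean = 0, var = 0;
        Accumulate(mean, var, 0, {1, 2, 3});
        Accumulate(mean, var, 3, {4, 5, 6});
        assert(std::fabs(var - 35.0 / 12.0) < 1e-12); // population variance of 1..6
        return 0;
    }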
@@ -415,6 +430,11 @@ template class InvStdDevNode<double>;

 // -----------------------------------------------------------------------
 // PerDimMeanVarNormalizationNode (feature, mean, invStdDev)
+// Computes
+//   output = (feature - mean) .* invStdDev
+// where mean and invStdDev are meant to be single elements while features
+// is minibatch data.
+// TODO: Why do we need this? Why not use Plus and ElementTimes?
 // -----------------------------------------------------------------------

 template <class ElemType>
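Concretely: for a feature column [2, 4] with mean [1, 1] and invStdDev [0.5, 0.5], the node produces [(2-1)*0.5, (4-1)*0.5] = [0.5, 1.5].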
@@ -441,34 +461,24 @@ public:

     virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
     {
+#if 1
+        size_t rank = DetermineElementwiseTensorRank();
+        auto output    =           ValueTensorFor(rank, fr);
+        auto input     = Input(0)->ValueTensorFor(rank, fr);
+        auto mean      = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
+        auto invStdDev = Input(2)->ValueTensorFor(rank, fr.AllowBroadcast());
+
+        output.AssignDifferenceOf(input, mean);               // output = input - mean
+        output.AssignElementwiseProductOf(output, invStdDev); // output *= invStdDev
+#else
         // only feature (input0) and output needs to be sliced
-        auto sliceInput0Value = Input(0)->ValueFor(fr);
-        auto sliceOutputValue = ValueFor(fr);
-
-        ForwardPropS(sliceOutputValue, sliceInput0Value, Input(1)->Value(), Input(2)->Value());
-    }
-
-    /*TODO: merge with call site*/ void ForwardPropS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0,
-                                                     const Matrix<ElemType>& input1, const Matrix<ElemType>& input2)
-    {
+        auto functionValues = Input(0)->ValueFor(fr);
+        auto input0 = ValueFor(fr);
+        const auto& input1 = Input(1)->Value(); // mean
+        const auto& input2 = Input(2)->Value(); // inv stddev
+
 #if DUMPOUTPUT
         //input0.Print("PerDimMeanVarNormalization-input0");
         //input1.Print("PerDimMeanVarNormalization-input1");
         //input2.Print("PerDimMeanVarNormalization-input2");
 #endif

 #if NANCHECK
         input0.HasNan("PerDimMeanVarNormalization-input0");
         input1.HasNan("PerDimMeanVarNormalization-input1");
         input2.HasNan("PerDimMeanVarNormalization-input2");
 #endif
         functionValues.AssignDifferenceOf(input0, input1);
         functionValues.ColumnElementMultiplyWith(input2);
 #if NANCHECK
         functionValues.HasNan("PerDimMeanVarNormalization");
 #endif
 #if DUMPOUTPUT
         functionValues.Print("PerDimMeanVarNormalizationNode");
 #endif
+#endif
     }
@@ -477,31 +487,11 @@ public:
         Base::Validate(isFinalValidationPass);
         InferMBLayoutFromInputsForStandardCase();

-        if (Input(0)->RequiresPreCompute())
-        {
-            LogicError(
-                "PerDimMeanVarNormalizationNode criterion forbids first input from being a pre-compute node. "
-                "The first input should be the node whose output should be normalized, and the second and third inputs "
-                "should be LearnableParameter type or (Mean, InvStdDev) so that the values will be saved.");
-        }
-
-        if (!(Input(1)->OperationName() == OperationNameOf(LearnableParameter) &&
-              Input(2)->OperationName() == OperationNameOf(LearnableParameter)) &&
-            !(Input(1)->OperationName() == OperationNameOf(MeanNode) &&
-              Input(2)->OperationName() == OperationNameOf(InvStdDevNode)))
-        {
-            LogicError(
-                "PerDimMeanVarNormalizationNode criterion requires the last two inputs to be LearnableParameter "
-                "type or (Mean, InvStdDev) so that the values will be saved.");
-        }
-
         Input(1)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
         Input(2)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());

         if (isFinalValidationPass)
         {
+            if (!Input(0)->HasMBLayout() || Input(1)->HasMBLayout() || Input(2)->HasMBLayout())
+                InvalidArgument("PerDimMeanVarNormalizationNode: Inputs must be data, while mean and InvStdDev must be column vectors.");
+            if (!Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) || !Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(2)->GetSampleLayout()))
+                InvalidArgument("PerDimMeanVarNormalizationNode: All inputs should have same sample layout.");
         }
@@ -515,17 +505,17 @@ template class PerDimMeanVarNormalizationNode<double>;

 // -----------------------------------------------------------------------
 // PerDimMeanVarDeNormalizationNode (feature, mean, invStdDev)
+// Computes
+//   output = feature ./ invStdDev + mean
+// with parameters the same as PerDimMeanVarNormalizationNode.
+// TODO: Why do we need this? Why not use Plus and ElementDividedBy?
 // -----------------------------------------------------------------------

 template <class ElemType>
 class PerDimMeanVarDeNormalizationNode : public ComputationNode<ElemType>, public NumInputs<3>
 {
-    typedef ComputationNode<ElemType> Base;
-    UsingComputationNodeMembersBoilerplate;
-    static const std::wstring TypeName()
-    {
-        return L"PerDimMeanVarDeNormalization";
-    }
+    typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
+    static const std::wstring TypeName() { return L"PerDimMeanVarDeNormalization"; }

 public:
     DeclareConstructorFromConfigWithNumInputs(PerDimMeanVarDeNormalizationNode);
@@ -539,44 +529,28 @@ public:
             InvalidArgument("PerDimMeanVarDeNormalizationNode should only be called in the evaluation stage. Is any of its descendents a learnable parameter that requires gradient?");
         }

-    // (feature-mean).*InvStdDev
+    // feature ./ invStdDev + mean
     virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
     {
+#if 1
+        size_t rank = DetermineElementwiseTensorRank();
+        auto output    =           ValueTensorFor(rank, fr);
+        auto input     = Input(0)->ValueTensorFor(rank, fr);
+        auto mean      = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
+        auto invStdDev = Input(2)->ValueTensorFor(rank, fr.AllowBroadcast());
+
+        output.AssignElementwiseQuotientOf(input, invStdDev); // output = input / invStdDev
+        output.AssignDifferenceOf(output, mean);              // output += mean
+#else
         // only feature (input0) and output needs to be sliced
-        auto sliceInput0Value = Input(0)->ValueFor(fr);
-        auto sliceOutputValue = ValueFor(fr);
-
-        ForwardPropS(sliceOutputValue, sliceInput0Value, Input(1)->Value(), Input(2)->Value());
-    }
-
-    /*TODO: merge with call site*/ void ForwardPropS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0,
-                                                     const Matrix<ElemType>& input1, const Matrix<ElemType>& input2)
-    {
+        auto functionValues = Input(0)->ValueFor(fr);
+        auto input0 = ValueFor(fr);
+        const auto& input1 = Input(1)->Value(); // mean
+        const auto& input2 = Input(2)->Value(); // inv stddev
+
 #if DUMPOUTPUT
         //input0.Print("PerDimMeanVarDeNormalization-input0");
         //input1.Print("PerDimMeanVarDeNormalization-input1");
         //input2.Print("PerDimMeanVarDeNormalization-input2");
 #endif

 #if NANCHECK
         input0.HasNan("PerDimMeanVarDeNormalization-input0");
         input1.HasNan("PerDimMeanVarDeNormalization-input1");
         input2.HasNan("PerDimMeanVarDeNormalization-input2");
 #endif
         // functionValues.AssignDifferenceOf(input0, input1);
         // functionValues.ColumnElementMultiplyWith(input2);
         // functionValues.AssignDifferenceOf(input0, input0);
         // functionValues += input2;
         // functionValues.ElementInverse();
         // functionValues.ElementMultiplyWith(input0);
         functionValues.SetValue(input0);
         functionValues.ColumnElementDivideBy(input2);
         functionValues += input1;
 #if NANCHECK
         functionValues.HasNan("PerDimMeanVarDeNormalization");
 #endif
 #if DUMPOUTPUT
         functionValues.Print("PerDimMeanVarDeNormalizationNode");
 #endif
+#endif
     }
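Note: per the header comments, this node is the exact inverse of PerDimMeanVarNormalizationNode: ((x - mean) .* invStdDev) ./ invStdDev + mean = x for nonzero invStdDev, so chaining the two on the same (mean, invStdDev) pair should reproduce the input. One caveat in the tensor path above: AssignDifferenceOf(output, mean) subtracts, while its trailing comment says "output += mean"; AssignSumOf would match the documented formula.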
@@ -585,31 +559,11 @@ public:
         Base::Validate(isFinalValidationPass);
         InferMBLayoutFromInputsForStandardCase();

-        if (Input(0)->RequiresPreCompute())
-        {
-            LogicError(
-                "PerDimMeanVarDeNormalizationNode criterion forbids first input from being a pre-compute node. "
-                "The first input should be the node whose output should be de-normalized, and the second and third inputs "
-                "should be LearnableParameter type or (Mean, InvStdDev) so that the values will be saved.");
-        }
-
-        if (!(Input(1)->OperationName() == OperationNameOf(LearnableParameter) &&
-              Input(2)->OperationName() == OperationNameOf(LearnableParameter)) &&
-            !(Input(1)->OperationName() == OperationNameOf(MeanNode) &&
-              Input(2)->OperationName() == OperationNameOf(InvStdDevNode)))
-        {
-            LogicError(
-                "PerDimMeanVarDeNormalizationNode criterion requires the last two inputs to be "
-                "LearnableParameter type or (Mean, InvStdDev) so that the values will be saved.");
-        }
-
         Input(1)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
         Input(2)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());

         if (isFinalValidationPass)
         {
+            if (!Input(0)->HasMBLayout() || Input(1)->HasMBLayout() || Input(2)->HasMBLayout())
+                InvalidArgument("PerDimMeanVarDeNormalizationNode: Inputs must be data, while mean and InvStdDev must be column vectors.");
+            if (!Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) || !Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(2)->GetSampleLayout()))
+                InvalidArgument("PerDimMeanVarDeNormalizationNode: All inputs should have same sample layout.");
         }
@@ -78,6 +78,7 @@ enum ElementWiseOperator
     opAbs,
     opSigmoid,
     opTanh,
+    opSqr,
     opSqrt,
     opExp,
     opLog,
@@ -129,6 +130,7 @@ enum ElementWiseOperator
     Macro(Abs); \
     Macro(Sigmoid); \
     Macro(Tanh); \
+    Macro(Sqr); \
     Macro(Sqrt); \
     Macro(Exp); \
     Macro(Log); \
@@ -357,6 +357,8 @@ public:
     Matrix<ElemType>& InplaceAbs();
     Matrix<ElemType>& AssignAbsOf(const Matrix<ElemType>& a);

+    // TODO: rename these to InPlaceFloor() and -Ceil() (I never know what it means to truncate a bottom)
+    // And also document and implement that sparse matrices can only truncate towards 0.
     Matrix<ElemType>& InplaceTruncateBottom(const ElemType threshold);
     Matrix<ElemType>& AssignTruncateBottomOf(const Matrix<ElemType>& a, const ElemType threshold);
     Matrix<ElemType>& InplaceTruncateTop(const ElemType threshold);
@@ -96,6 +96,12 @@ DECL ElemType LinearRectifierDerivative(ElemType z)
     return z > 0 ? (ElemType) 1 : 0;
 }

+template <class ElemType>
+DECL ElemType Sqr(ElemType z)
+{
+    return z * z;
+}
+
 template <class ElemType>
 DECL ElemType Sqrt(ElemType z)
 {
@@ -143,12 +149,6 @@ DECL ElemType LogAdd(ElemType x, ElemType y)
     }
 }

-template <class ElemType>
-DECL ElemType Sqr(ElemType z)
-{
-    return z * z;
-}
-
 // IndexElement reindexes a tensor along one dimension.
 // For the indexed dimension, the tensor op is prepared by setting 'a' to be broadcasting along the indexed dimension.
 // I.e. pa = &a points to the first element (as if index == 0).
@@ -188,6 +188,7 @@ DefUnaryOp(Not, !a);
 DefUnaryOp(Abs, fabs_(a));
 DefUnaryOp(Sigmoid, Sigmoid(a));
 DefUnaryOp(Tanh, tanh_(a));
+DefUnaryOp(Sqr, Sqr(a));
 DefUnaryOp(Sqrt, Sqrt(a));
 DefUnaryOp(Exp, exp_(a));
 DefUnaryOp(Log, ClippedLog(a));
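Note: the opSqr change above illustrates the full wiring for a new unary elementwise op in this code base: an ElementWiseOperator enum entry, a matching entry in the op-list macro (which keeps dispatch tables in sync), a scalar kernel in TensorOps.h, and a DefUnaryOp binding. A hypothetical opCube following the same four touch points (a sketch only, not part of this commit):

    // 1. enum ElementWiseOperator gains a value, in the same position as the macro entry:
    //        opCube,
    // 2. the unary-op macro list gains:
    //        Macro(Cube); \
    // 3. TensorOps.h gains the scalar kernel:
    template <class ElemType>
    DECL ElemType Cube(ElemType z)
    {
        return z * z * z;
    }
    // 4. the op table binds the enum value to the kernel:
    //        DefUnaryOp(Cube, Cube(a));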
@@ -56,7 +56,7 @@ public:
     // c.AssignDiffOf(c,a) means c -= a,
     // and c.AddElementwiseProductOf(a, b, 1) means c += a .* b.
     // All operators support elementwise in-place operations, i.e. a, b, and c
-    // may all reference the same underlying SOB, with onee exception:
+    // may all reference the same underlying SOB, with one exception:
     // The output cannot be in-place and inverse-broadcasting at the same time.
     // E.g. with c=[10] and a=[10 x 20], c.AssignDiffOf(c,a) will fail.
     // In that case, you can use c.AddCopyOf(a,-1).
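Note: the restriction that the corrected comment documents is real, not stylistic: an op whose output is both in-place and inverse-broadcasting (i.e. reduced) would have to read each output element while also accumulating over it. In the example, c=[10] must absorb a reduction over the 20 columns of a=[10 x 20], so c.AssignDiffOf(c, a) is rejected; c.AddCopyOf(a, -1) expresses the same c -= (reduction of a) as a pure accumulation into c instead.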
@@ -1747,8 +1747,9 @@ size_t BatchSequenceReader<ElemType>::DetermineSequencesToProcess()
         // and count tokens
         numTokens += m_parser.mSentenceIndex2SentenceInfo[seq].sLen;
     }
-    // if all are already done, we will return sln=0
-    fprintf(stderr, "DetermineSequencesToProcess: %d sequences of len %d, %d tokens\n", (int) mToProcess.size(), (int) sln, (int) numTokens);
+    // if all were already done, we will get here with sln=0 and return that
+
+    //fprintf(stderr, "DetermineSequencesToProcess: %d sequences of len %d, %d tokens\n", (int) mToProcess.size(), (int) sln, (int) numTokens);

     return sln;
 }
@@ -54,7 +54,7 @@ COMMAND: currentDirectory=\\storage.ccp.philly.selfhost.corp.microsoft.com\pu

 COMMAND: configFile=$(SolutionDir)Examples/Image/MNIST/Config/01_OneHidden.cntk currentDirectory=$(SolutionDir)Tests/EndToEndTests/Image/Data RunDir=$(SolutionDir)Tests/EndToEndTests/RunDir/Image/MNIST_01_OneHidden DataDir=$(SolutionDir)Tests/EndToEndTests/Image/Data ConfigDir=$(SolutionDir)Examples/Image/MNIST/Config OutputDir=$(SolutionDir)Tests/EndToEndTests/RunDir/Image/MNIST_01_OneHidden DeviceId=0 MNISTtrain=[reader=[file=$(SolutionDir)Tests/EndToEndTests/Image/Data/Train.txt]] MNISTtest=[reader=[file=$(SolutionDir)Tests/EndToEndTests/Image/Data/Test.txt]] MNISTtrain=[SGD=[maxEpochs=1]] MNISTtrain=[SGD=[epochSize=100]] MNISTtrain=[reader=[randomize=none]] imageLayout="cudnn" makeMode=false

-COMMAND: configFile=$(SolutionDir)Examples/Image/MNIST/Config/02_Convolution.cntk currentDirectory=$(SolutionDir)Tests/EndToEndTests/Image/Data RunDir=$(SolutionDir)Tests/EndToEndTests/RunDir/Image/MNIST_02_Convolution DataDir=$(SolutionDir)Tests/EndToEndTests/Image/Data ConfigDir=$(SolutionDir)Examples/Image/MNIST/Config OutputDir=$(SolutionDir)Tests/EndToEndTests/RunDir/Image/MNIST_02_Convolution DeviceId=0 MNISTtrain=[reader=[file=$(SolutionDir)Tests/EndToEndTests/Image/Data/Train.txt]] MNISTtest=[reader=[file=$(SolutionDir)Tests/EndToEndTests/Image/Data/Test.txt]] MNISTtrain=[SGD=[maxEpochs=1]] MNISTtrain=[SGD=[epochSize=100]] MNISTtrain=[reader=[randomize=none]] imageLayout="cudnn" makeMode=false
+COMMAND: configFile=$(SolutionDir)Examples/Image/MNIST/Config/02_Convolution.cntk currentDirectory=$(SolutionDir)Tests/EndToEndTests/Image/Data RunDir=$(SolutionDir)Tests/EndToEndTests/RunDir/Image/MNIST_02_Convolution DataDir=$(SolutionDir)Tests/EndToEndTests/Image/Data ConfigDir=$(SolutionDir)Examples/Image/MNIST/Config OutputDir=$(SolutionDir)Tests/EndToEndTests/RunDir/Image/MNIST_02_Convolution DeviceId=0 train=[reader=[file=$(SolutionDir)Tests/EndToEndTests/Image/Data/Train.txt]] MNISTtest=[reader=[file=$(SolutionDir)Tests/EndToEndTests/Image/Data/Test.txt]] train=[SGD=[maxEpochs=1]] train=[SGD=[epochSize=100]] train=[reader=[randomize=none]] imageLayout="cudnn" makeMode=false

 TODO out-of-date:
 COMMAND: currentDirectory=$(SolutionDir)ExampleSetups\Image\MNIST configFile=02_Conv.cntk configName=02_Conv