New condition for the tensor lib: the output cannot be in-place and inverse-broadcasting (reducing) at the same time. This makes reduction easier.
This commit is contained in:
Parent: 01a33f7bea
Commit: cdcc4bb3d2
@@ -53,7 +53,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         if (Input(inputIndex)->GetNumCols() < GetNumCols())
             MaskMissingGradientColumnsToZero(fr);
-        inputGradient.DoSumOf(0.0f, inputGradient, gradient, 1.0f);
+        inputGradient.AddCopyOf(gradient);
 #if 0
         if (Input(inputIndex)->GetNumCols() < GetNumCols())
             Input(inputIndex)->Gradient().Print("PlusNode BackProp with reduction");
@@ -227,10 +227,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         if (Input(inputIndex)->GetNumCols() < GetNumCols())
             MaskMissingGradientColumnsToZero(fr);

-        if (sign > 0)
-            inputGradient.DoSumOf(0.0f, inputGradient, gradient, 1.0f);
-        else
-            inputGradient.DoDifferenceOf(0.0f, inputGradient, gradient, 1.0f);
+        inputGradient.AddCopyOf(gradient, sign);
 #else
         Matrix<ElemType> gradientValues = GradientFor(fr);

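The rewrite in these two hunks works because AddCopyOf() accumulates into the output through the beta parameter instead of passing the output as an input operand, which the new condition forbids whenever the incoming gradient has to be reduced (inverse-broadcast) into a smaller input gradient. A minimal stand-alone sketch of the same semantics, using plain arrays instead of TensorView (reduceColumns is a hypothetical helper, not CNTK code):

    #include <cstdio>
    #include <vector>

    // Reduce the columns of a [rows x cols] input into a [rows] output.
    // As in DoCopyOf(beta, a, alpha), 'beta' scales the existing output,
    // so the output is only written to, never read as an input operand.
    static void reduceColumns(float beta, const std::vector<float>& in,
                              size_t rows, size_t cols,
                              float alpha, std::vector<float>& out)
    {
        for (size_t r = 0; r < rows; r++)
        {
            float sum = 0;
            for (size_t c = 0; c < cols; c++)
                sum += in[c * rows + r]; // column-major, as in CNTK matrices
            out[r] = beta * out[r] + alpha * sum;
        }
    }

    int main()
    {
        std::vector<float> gradient(2 * 3, 1.0f);  // [2 x 3] incoming gradient
        std::vector<float> inputGradient(2, 5.0f); // [2] accumulated input gradient
        // AddCopyOf(gradient, sign)-style call: beta = 1 accumulates in place
        reduceColumns(/*beta=*/1.0f, gradient, 2, 3, /*alpha=*/1.0f, inputGradient);
        printf("%g %g\n", inputGradient[0], inputGradient[1]); // prints: 8 8
    }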
@@ -427,9 +427,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     // Cases:
     // - input elements >> GPU procs --> do reduction in inner loop
     // - reduction dimension fits into a single kernel --> launch it that way
-    // - reduction dimension requires multiple kernels --> use atomic add, to avoid temp mem alloc --is this any good?
+    // - reduction dimension requires multiple kernels --> use atomic add, to avoid temp mem alloc
     //    - PlusNode: reducing to a bias for small matrices
     //    - ScaleNode: big elementwise product reduced to a scalar (dot product)
+    //    - E.g. 3072 GPU procs:
+    //      If >= 3072 reduced output values must be computed, just loop inside.
+    //      If less, and reduction per value does not fit into a single proc,
+    //      then we break it into procs, say, 24.
+    //      This way we will need 24 atomicAdd()s of 3072/24 = 128 values.
+    //      If reduction is along stride=1, then we'd have 24 atomicAdd()s of 32 coalesced writes.
+    //      Does not sound scary at all.
+    // Precondition: matrix cannot at the same time participate in reduction and operation.
 #if 1
     C_size_t reductionDim = 1; // number of elements to reduce over
     for (C_size_t k = 0; k < reducingOpDimVector.size(); k++)
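The launch arithmetic in the new comment can be checked in a few lines. The helper below is hypothetical (the real decision would live in the CUDA launch code), but it reproduces the 3072-proc example from the comment:

    #include <cstdio>

    // Sketch of the partitioning arithmetic described above: if there are
    // at least as many reduced output values as GPU procs, each proc loops
    // over its whole reduction; otherwise each reduction is split across
    // idle procs, and each chunk ends in one atomicAdd().
    static unsigned chunksPerReduction(unsigned outputValues, unsigned gpuProcs)
    {
        if (outputValues >= gpuProcs)
            return 1;                   // enough outputs: loop inside the kernel
        return gpuProcs / outputValues; // e.g. 3072 / 128 = 24 chunks
    }

    int main()
    {
        // the comment's example: 24 atomicAdd()s of 3072/24 = 128 values each
        printf("%u\n", chunksPerReduction(128, 3072)); // prints: 24
    }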
@@ -223,6 +223,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         offsets[i] = shapes[i].GetOffset();
     }

+    // enforce that in case of broadcasting, the output must not be an input
+    template<class ElemType>
+    static bool CheckDifferentObject(const TensorView<ElemType> & a, const TensorView<ElemType> & b)
+    {
+        if (&a == &b)
+            LogicError("Do{U,Bi,Ter}naryOpOf: When inverse broadcasting, output must not be an input.");
+        return true;
+    }
+
     template<class ElemType>
     void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op)
     {
@@ -235,6 +244,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     SmallVector<size_t> regularOpDims, reducingOpDims;
     PrepareTensorOperands<ElemType,2>(array<TensorShape, 2> { a.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);

+    // output cannot be input when reducing
+    if (reducingOpDims.size() > 0)
+        CheckDifferentObject(a, *this);
+
     // now perform the operation
     GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
 }
@@ -250,6 +263,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     SmallVector<size_t> regularOpDims, reducingOpDims;
     PrepareTensorOperands<ElemType, 3>(array<TensorShape, 3> { a.GetShape(), b.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);

+    // output cannot be input when reducing
+    if (reducingOpDims.size() > 0)
+        CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this);
+
     GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
 }

@@ -264,6 +281,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     SmallVector<size_t> regularOpDims, reducingOpDims;
     PrepareTensorOperands<ElemType, 4>(array<TensorShape, 4> { a.GetShape(), b.GetShape(), c.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);

+    // output cannot be input when reducing
+    if (reducingOpDims.size() > 0)
+        CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this) && CheckDifferentObject(c, *this);
+
     GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), c.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
 }

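Note the idiom in the binary and ternary variants: because CheckDifferentObject() either throws or returns true, chaining the calls with && simply runs every check in one expression statement; the short-circuit can never actually skip one. A stand-alone illustration (not CNTK code):

    #include <cstdio>

    // A check that throws on failure and otherwise returns true can be
    // chained with &&; since false is never returned, all checks run.
    static bool check(int id)
    {
        printf("check %d ran\n", id); // a real check would throw on failure
        return true;
    }

    int main()
    {
        (void)(check(1) && check(2) && check(3)); // all three run
        // equivalent to: check(1); check(2); check(3);
    }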
@@ -48,7 +48,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 // c.AssignDiffOf(c,a) means c -= a,
 // and c.AddElementwiseProductOf(a, b, 1) means c += a .* b.
 // All operators support elementwise in-place operations, i.e. a, b, and c
-// may all reference the same underlying SOB.
+// may all reference the same underlying SOB, with one exception:
+// The output cannot be in-place and inverse-broadcasting at the same time.
+// E.g. with c=[10] and a=[10 x 20], c.AssignDiffOf(c,a) will fail.
+// In that case, you can use c.AddCopyOf(a,-1).
 // Aliasing is not detected, so don't pass distinct TensorView objects that
 // reference overlapping but not identical slices.
 // If beta == 0, c is not read out, i.e. it can be uninitialized or contain NaNs.
 // -------------------------------------------------------------------
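The documented restriction and its workaround can be modeled without the CNTK sources. The snippet below mimics the new check; View, the reducing flag, and the shapes in the comments are illustrative assumptions:

    #include <cstdio>
    #include <stdexcept>

    struct View { }; // stand-in for TensorView

    // mirrors the new CheckDifferentObject(): throw if output aliases an input
    static bool CheckDifferentObject(const View& a, const View& b)
    {
        if (&a == &b)
            throw std::logic_error("When inverse broadcasting, output must not be an input.");
        return true;
    }

    int main()
    {
        View c, a;            // think c = [10], a = [10 x 20]
        bool reducing = true; // a must be reduced into the smaller c
        try
        {
            if (reducing)
                CheckDifferentObject(c, c); // c.AssignDiffOf(c, a): output == input
        }
        catch (const std::logic_error& e)
        {
            printf("caught: %s\n", e.what()); // prefer c.AddCopyOf(a, -1) instead
        }
        return 0;
    }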