CNTK core: Implemented an optimization to elide the initial zeroing of, and subsequent accumulation into, gradients for nodes with just one parent/ancestor node

Amit Agarwal 2016-12-09 15:58:01 -08:00
Parent 1a197f596f
Commit 4fe22b81c5
15 changed files with 102 additions and 35 deletions
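For orientation before the per-file diffs: a minimal, self-contained C++ sketch (illustrative only, not code from this commit; the buffer type and function names below are invented) of what eliding the initial zeroing and the subsequent accumulation means for a node with a single parent.

#include <algorithm>
#include <vector>

using Grad = std::vector<float>;  // stand-in for a CNTK gradient matrix

// Baseline path: zero the child's gradient, then let every parent accumulate into it.
void AccumulatePath(Grad& childGrad, const std::vector<Grad>& parentContribs)
{
    std::fill(childGrad.begin(), childGrad.end(), 0.0f);      // initial zeroing
    for (const auto& contrib : parentContribs)
        for (size_t i = 0; i < childGrad.size(); ++i)
            childGrad[i] += contrib[i];                       // accumulation
}

// Optimized path introduced by this commit: the sole overwrite-capable parent
// assigns the gradient directly, so both the zeroing and the add are elided.
void OverwritePath(Grad& childGrad, const Grad& soleParentContrib)
{
    childGrad = soleParentContrib;                            // plain overwrite
}

The rest of the diff wires this idea through the graph (marking eligible nodes in ComputationNetwork), the node base class (skipping the zero fill in ResetGradient), and the individual node and convolution-engine implementations (assign versus add).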

View file

@@ -239,7 +239,7 @@ ifeq ("$(BUILDTYPE)","release")
CXXFLAGS += -g -O4
LDFLAGS += -rdynamic
COMMON_FLAGS += -DNDEBUG -DNO_SYNC
CUFLAGS += -O3 -g -use_fast_math -lineinfo $(GENCODE_FLAGS)
CUFLAGS += -O3 -g -use_fast_math $(GENCODE_FLAGS)
endif
ifdef CNTK_CUDA_DEVICE_DEBUGINFO

View file

@@ -565,6 +565,8 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapper
Globals::EnableShareNodeValueMatrices();
if (config(L"hyperCompressMemory", false))
Globals::EnableHyperCompressMemory();
if (config(L"optimizeGradientAccumulation", true))
Globals::EnableGradientAccumulationOptimization();
TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));
@@ -710,6 +712,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
Globals::EnableShareNodeValueMatrices();
if (config(L"hyperCompressMemory", false))
Globals::EnableHyperCompressMemory();
if (config(L"optimizeGradientAccumulation", true))
Globals::EnableGradientAccumulationOptimization();
TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));
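Both entry points read the same top-level boolean with a default of true, so the optimization is opt-out. A hedged configuration example (only the key name comes from the code above; the rest of the config file is assumed):

optimizeGradientAccumulation = true   # default; gates the EnableGradientAccumulationOptimization() call above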

View file

@@ -249,6 +249,9 @@ namespace CNTK
CNTK_API void EnableForwardValuesSharing();
CNTK_API void EnableHyperMemoryCompress();
CNTK_API void EnableGradientAccumulationOptimization();
CNTK_API void DisableGradientAccumulationOptimization();
CNTK_API bool AreEquivalent(const ::CNTK::FunctionPtr& f1, const ::CNTK::FunctionPtr& f2);
CNTK_API bool AreEquivalent(const ::CNTK::Variable& v1, const ::CNTK::Variable& v2, bool allowParameterAndConstantsEquivalence = false);
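A hedged usage sketch of the new toggle pair declared here. The qualification CNTK::Internal is an assumption based on the neighbouring Enable* declarations and is not shown in this hunk:

// Assumed namespace; called once, before constructing and training the network.
CNTK::Internal::DisableGradientAccumulationOptimization();  // force the zero-then-accumulate path everywhere
// ... build functions, trainer, run minibatches ...
CNTK::Internal::EnableGradientAccumulationOptimization();   // restore the default optimized behaviour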

View file

@@ -70,6 +70,16 @@ namespace CNTK
Microsoft::MSR::CNTK::Globals::EnableHyperCompressMemory();
}
void EnableGradientAccumulationOptimization()
{
Microsoft::MSR::CNTK::Globals::EnableGradientAccumulationOptimization();
}
void DisableGradientAccumulationOptimization()
{
Microsoft::MSR::CNTK::Globals::DisableGradientAccumulationOptimization();
}
bool AreEquivalent(const Variable& var1, const Variable& var2, bool allowParameterAndConstantsEquivalence)
{
bool areDynamicAxesCompatible = (var1.DynamicAxes().size() == var2.DynamicAxes().size());

View file

@@ -15,5 +15,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::atomic<bool> Globals::m_enableShareNodeValueMatrices(false);
std::atomic<bool> Globals::m_enableHyperCompressMemory(false);
std::atomic<bool> Globals::m_optimizeGradientAccumulation(true);
}}}
}}}

View file

@@ -19,6 +19,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static void ForceConstantRandomSeed() { m_forceConstantRandomSeed = true; }
static bool ShouldForceConstantRandomSeed() { return m_forceConstantRandomSeed; }
static void EnableGradientAccumulationOptimization() { m_optimizeGradientAccumulation = true; }
static void DisableGradientAccumulationOptimization() { m_optimizeGradientAccumulation = false; }
static bool ShouldOptimizeGradientAccumulation() { return m_optimizeGradientAccumulation; }
// TODO: Currently the flag is set to false. Should be switched to true after more rigorous testing.
static bool UseV2Aggregator() { return false; }
@@ -49,5 +53,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// The global flag to enable hyper memory compression
static std::atomic<bool> m_enableHyperCompressMemory;
static std::atomic<bool> m_forceConstantRandomSeed;
static std::atomic<bool> m_optimizeGradientAccumulation;
};
}}}

View file

@@ -1037,6 +1037,16 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
for (auto& keyValue : parentsMap)
{
parentCount[keyValue.first] = keyValue.second.size();
// Indicate on the node that its parent overwrites its gradient if the node is not part of a loop
// and has exactly one parent that implements the gradient overwrite optimization
if (Globals::ShouldOptimizeGradientAccumulation() &&
!keyValue.first->IsPartOfLoop() &&
(keyValue.second.size() == 1) &&
(*keyValue.second.begin())->ImplementsGradientOverwriteOptimization())
{
keyValue.first->MarkParentOverwritesGradient();
}
}
// Construct the composite forward prop eval order by enumerating the
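Restated as a standalone predicate for clarity (illustrative only; the set type used for parentsMap's values is an assumption), a node is marked only when all four conditions hold:

// Illustrative restatement of the marking condition above, not the actual CNTK code.
bool ShouldMarkParentOverwritesGradient(const ComputationNodeBasePtr& node,
                                        const std::set<ComputationNodeBasePtr>& parents)
{
    return Globals::ShouldOptimizeGradientAccumulation()                      // global opt-in
        && !node->IsPartOfLoop()                                              // recurrent loops keep accumulation
        && (parents.size() == 1)                                              // exactly one parent
        && (*parents.begin())->ImplementsGradientOverwriteOptimization();     // parent can overwrite
}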

View file

@@ -152,7 +152,7 @@ struct ComputationNetworkOwnedNodeState
friend class ComputationNetwork;
ComputationNetworkOwnedNodeState()
: m_needsGradient(false), m_valueSharable(true)
: m_needsGradient(false), m_valueSharable(true), m_parentOverwritesGradient(false)
{
PurgeStateForFormingRecurrentLoops();
m_isPartOfLoop = false;
@@ -168,10 +168,14 @@ struct ComputationNetworkOwnedNodeState
other.m_traceNodeValueSparse = m_traceNodeValueSparse;
other.m_traceNodeValueUpToDim = m_traceNodeValueUpToDim;
other.m_traceNodeValueUpToT = m_traceNodeValueUpToT;
other.m_parentOverwritesGradient = m_parentOverwritesGradient;
}
bool IsPartOfLoop() const { return m_isPartOfLoop; }
void MarkParentOverwritesGradient() { m_parentOverwritesGradient = true; }
bool ParentOverwritesGradient() const { return m_parentOverwritesGradient; }
virtual void MarkValueNonSharable() { m_valueSharable = false; }
virtual void MarkValueSharable() { m_valueSharable = true; }
bool IsValueSharable() const { return m_valueSharable; }
@@ -186,12 +190,17 @@ struct ComputationNetworkOwnedNodeState
size_t m_traceNodeValueUpToT = 8; // 8 time steps fit comfortably into a normal-sized console
void EnableNodeTracing(bool asReal, bool asCategoryLabel, bool asSparse) { m_traceNodeValueReal = asReal; m_traceNodeValueAsCategoryLabel = asCategoryLabel; m_traceNodeValueSparse = asSparse; }
virtual bool ImplementsGradientOverwriteOptimization() const { return false; }
protected: // TODO: should be fully encapsulated here
bool m_needsGradient; // true if this node or any children need a gradient to be computed (for own consumption or propagation to somewhere in the child tree)
bool m_valueSharable; // a flag is needed for memory share.
// If it is false (e.g., LearnableParameters/InputValue and those nodes are solely induced by LearnableParameters),
// it will never be released to memory pool
bool m_parentOverwritesGradient; // flag indicating whether the parent of this node overwrites the gradient of this node instead of accumulating to it
private:
bool m_isPartOfLoop; // true if this node is part of a recurrent loop
@@ -1717,7 +1726,10 @@ public:
void ResetGradient(ElemType val)
{
UpdateDataSize(Gradient());
Gradient().SetValue(val);
// No need to zero-initialize the gradient if the node's parent is going to overwrite it anyway
if ((val != 0) || !ParentOverwritesGradient())
Gradient().SetValue(val);
m_gradientInitialized = true;
}

View file

@@ -283,6 +283,8 @@ public:
AttachInputsFromConfig(configp, GetExpectedNumInputs());
}
virtual bool ImplementsGradientOverwriteOptimization() const override { return m_convEng->ImplementsGradientOverwriteOptimization(); }
public:
void Save(File& fstream) const override
{
@@ -348,7 +350,7 @@ public:
// BackwardData adds results to the output, so we need to zero them out first.
// REVIEW alexeyk: should be rolled into BackwardData itself.
sliceOutputValue.SetValue(0);
m_convEng->BackwardData(sliceInput1Value, input0, sliceOutputValue, *m_tempMatrix);
m_convEng->BackwardData(sliceInput1Value, input0, sliceOutputValue, /*accumulateGradient =*/ true, *m_tempMatrix);
}
}
@@ -360,16 +362,16 @@ public:
auto& grad = InputRef(0).GradientAsMatrix();
auto sliceInput1Value = InputRef(1).ValueFor(fr);
if (!m_transpose)
m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, fr.IsAllFrames(), *m_tempMatrix);
m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrix);
else
m_convEng->BackwardKernel(sliceInput1Value, sliceOutputGrad, grad, fr.IsAllFrames(), *m_tempMatrix);
m_convEng->BackwardKernel(sliceInput1Value, sliceOutputGrad, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrix);
}
else if (inputIndex == 1) // derivative with respect to the input feature
{
auto& input0 = InputRef(0).ValueAsMatrix();
auto sliceInput1Grad = InputRef(1).GradientFor(fr);
if (!m_transpose)
m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, *m_tempMatrix);
m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, !Input(inputIndex)->ParentOverwritesGradient(), *m_tempMatrix);
else
{
// REVIEW alexeyk: Forward overwrites values in sliceInput1Grad. Should handle correctly instead.

View file

@@ -60,8 +60,13 @@ public:
if (Input(inputIndex)->ReducesInTimeWrt(shared_from_this()))
MaskMissingGradientColumnsToZero(fr);
inputGradient.AddCopyOf(gradient);
if (Input(inputIndex)->ParentOverwritesGradient())
inputGradient.AssignCopyOf(gradient);
else
inputGradient.AddCopyOf(gradient);
}
virtual bool ImplementsGradientOverwriteOptimization() const override { return true; }
};
template class PlusNode<float>;
@@ -415,7 +420,10 @@ public:
auto input0Gradient = OneSampleTensorFor(0, /*gradient=*/true, fr.AllowBroadcast());
auto input1 = OneSampleTensorFor(1, /*gradient=*/false, fr.AllowBroadcast());
auto outputGradient = OneSampleTensorFor(-1, /*gradient=*/true, fr);
input0Gradient.AddMatrixProductOf(m_transpose/*transC*/, outputGradient, false/*transA*/, input1, true/*transB*/);
if (Input(inputIndex)->ParentOverwritesGradient())
input0Gradient.AssignMatrixProductOf(m_transpose/*transC*/, outputGradient, false/*transA*/, input1, true/*transB*/);
else
input0Gradient.AddMatrixProductOf(m_transpose/*transC*/, outputGradient, false/*transA*/, input1, true/*transB*/);
}
else if (inputIndex == 1) // right derivative
{
@@ -423,13 +431,18 @@ public:
auto input0 = OneSampleTensorFor(0, /*gradient=*/false, fr.AllowBroadcast());
auto input1Gradient = OneSampleTensorFor(1, /*gradient=*/true, fr.AllowBroadcast());
auto outputGradient = OneSampleTensorFor(-1, /*gradient=*/true, fr);
input1Gradient.AddMatrixProductOf(false/*transC*/, input0, !m_transpose/*transA*/, outputGradient, false/*transB*/);
if (Input(inputIndex)->ParentOverwritesGradient())
input1Gradient.AssignMatrixProductOf(false/*transC*/, input0, !m_transpose/*transA*/, outputGradient, false/*transB*/);
else
input1Gradient.AddMatrixProductOf(false/*transC*/, input0, !m_transpose/*transA*/, outputGradient, false/*transB*/);
}
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
// but both *inputs* are used, so we don't overload the InputUsed-() function which defaults to 'true'
virtual bool ImplementsGradientOverwriteOptimization() const override { return true; }
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
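The pattern in PlusNode and the Times gradients above generalizes: a node opts in by overriding ImplementsGradientOverwriteOptimization() to return true and by branching on the child's ParentOverwritesGradient() flag in BackpropTo, assigning in the overwrite case and adding otherwise. A trimmed sketch (the tensor accessors and rank variable are placeholders, not the exact calls of any one node):

// Illustrative opt-in pattern for a node's backprop, modeled on the changes above.
virtual bool ImplementsGradientOverwriteOptimization() const override { return true; }

virtual void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
    auto gradient      = GradientTensorFor(rank, fr);                       // this node's gradient (placeholder accessor)
    auto inputGradient = InputRef(inputIndex).GradientTensorFor(rank, fr);  // child's gradient (placeholder accessor)
    if (Input(inputIndex)->ParentOverwritesGradient())
        inputGradient.AssignCopyOf(gradient);   // child buffer was never zero-initialized
    else
        inputGradient.AddCopyOf(gradient);      // child buffer holds zeros or earlier partial sums
}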

View file

@@ -72,7 +72,7 @@ public:
}
else if (opTypeHolder == unaryGradient)
{
sliceInputGrad.DoUnaryOpOf(1, sliceOutputGrad, 1, opBackward, opSum);
sliceInputGrad.DoUnaryOpOf(Input(inputIndex)->ParentOverwritesGradient() ? 0.0f : 1.0f, sliceOutputGrad, 1, opBackward, opSum);
}
else
{
@@ -80,7 +80,7 @@ public:
// Not possible for Cos().
auto sliceValue = (opType == binaryWithOutputGradient) ? ValueTensorFor(rank, fr) : // using input or output value
InputRef(0).ValueTensorFor(rank, fr);
sliceInputGrad.DoBinaryOpOf(1, sliceOutputGrad, sliceValue, 1, opBackward, opSum);
sliceInputGrad.DoBinaryOpOf(Input(inputIndex)->ParentOverwritesGradient() ? 0.0f : 1.0f, sliceOutputGrad, sliceValue, 1, opBackward, opSum);
}
}
@@ -93,10 +93,13 @@ public:
{
return opType == binaryWithOutputGradient;
}
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override
{
return opType == binaryWithInputGradient;
}
virtual bool ImplementsGradientOverwriteOptimization() const override { return (opType != noGradient); }
};
#define UnaryElementWiseWithOpCodeNodeBaseMembers UsingComputationNodeMembersBoilerplate;
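The leading scalar passed to DoUnaryOpOf/DoBinaryOpOf scales the existing destination before the operation's result is added (a BLAS-style beta, inferred from this usage), which is why a single call site covers both behaviours:

// Illustrative beta semantics: dest = beta * dest + opResult
float dest     = 7.0f;                         // stale gradient contents
float opResult = 2.5f;                         // backprop contribution
bool  parentOverwrites = true;                 // Input(inputIndex)->ParentOverwritesGradient()
float beta = parentOverwrites ? 0.0f : 1.0f;
dest = beta * dest + opResult;                 // beta == 0 overwrites, beta == 1 accumulates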

View file

@@ -30,7 +30,7 @@ void ConvolutionEngine<ElemType>::Forward(const Mat& in, const Mat& kernel, Mat&
}
template <class ElemType>
void ConvolutionEngine<ElemType>::BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace)
void ConvolutionEngine<ElemType>::BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool accumulateGradient, Mat& workspace)
{
const auto& g = *m_geometry;
assert(g.InputShape().GetNumElements() == grad.GetNumRows());
@@ -45,11 +45,11 @@ void ConvolutionEngine<ElemType>::BackwardData(const Mat& srcGrad, const Mat& ke
EnsureCompatible();
EnsureConvolutionInitialized();
BackwardDataCore(srcGrad, kernel, grad, workspace);
BackwardDataCore(srcGrad, kernel, grad, accumulateGradient, workspace);
}
template <class ElemType>
void ConvolutionEngine<ElemType>::BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernel, bool allowReuse, Mat& workspace)
void ConvolutionEngine<ElemType>::BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernel, bool accumulateGradient, bool allowReuse, Mat& workspace)
{
const auto& g = *m_geometry;
assert(g.InputShape().GetNumElements() == in.GetNumRows());
@@ -64,7 +64,7 @@ void ConvolutionEngine<ElemType>::BackwardKernel(const Mat& srcGrad, const Mat&
EnsureCompatible();
EnsureConvolutionInitialized();
BackwardKernelCore(srcGrad, in, kernel, allowReuse, workspace);
BackwardKernelCore(srcGrad, in, kernel, accumulateGradient, allowReuse, workspace);
}
template <class ElemType>
@@ -179,12 +179,12 @@ protected:
in.ConvolutionForward(kernel, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, out);
}
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& /*workspace*/) override
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool /*accumulateGradient*/, Mat& /*workspace*/) override
{
srcGrad.ConvolutionBackwardData(kernel, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, grad);
}
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& /*workspace*/) override
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*accumulateGradient*/, bool /*allowReuse*/, Mat& /*workspace*/) override
{
srcGrad.ConvolutionBackwardKernel(in, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, kernelGrad);
}
@@ -372,7 +372,7 @@ protected:
assert(batchSize == out.GetNumCols());
}
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool /*accumulateGradient*/, Mat& workspace) override
{
size_t batchSize = srcGrad.GetNumCols();
size_t packedInputRows = m_kernelT.w() * m_kernelT.h() * m_kernelT.c();
@@ -412,7 +412,7 @@ protected:
assert(batchSize == srcGrad.GetNumCols());
}
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace) override
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*accumulateGradient*/, bool allowReuse, Mat& workspace) override
{
size_t batchSize = in.GetNumCols();
size_t packedInputRows = m_kernelT.w() * m_kernelT.h() * m_kernelT.c();
@@ -678,7 +678,7 @@ protected:
// [KXY x NWH]^T * [KXY x C] -> [NWH x C]
// 4. Reshape and transpose outputs (grad): [NWH x C] -> [N x WHC]^T -> [WHC x N]
// In case minibatch size == 1 this step is not required and step 3 writes results directly to output (grad).
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool /*accumulateGradient*/, Mat& workspace) override
{
size_t batchSize = srcGrad.GetNumCols();
size_t subBatchSize = m_maxTempMemSizeInSamples == 0 ? batchSize : min(batchSize, m_maxTempMemSizeInSamples);
@@ -771,7 +771,7 @@ protected:
// 2. Unrolling convolution input (in) into a matrix of [NW'H' x WHC] layout.
// 3. Performing matrix multiplication of unrolled input with transposed output:
// [NW'H' x WHC]^T * [NW'H' x K] -> [WHC x K] - kernel gradients.
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& workspace) override
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*accumulateGradient*/, bool /*allowReuse*/, Mat& workspace) override
{
size_t batchSize = srcGrad.GetNumCols();
size_t subBatchSize = m_maxTempMemSizeInSamples == 0 ? batchSize : min(batchSize, m_maxTempMemSizeInSamples);

View file

@@ -47,9 +47,9 @@ public:
void Forward(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace);
void BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace);
void BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool accumulateGradient, Mat& workspace);
void BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace);
void BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool accumulateGradient, bool allowReuse, Mat& workspace);
void ForwardPooling(const Mat& in, Mat& out);
@@ -72,6 +72,8 @@ public:
m_maxTempMemSizeInSamples = maxTempMemSizeInSamples;
}
virtual bool ImplementsGradientOverwriteOptimization() const { return false; }
protected:
ConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind)
: m_geometry(geometry), m_deviceId(deviceId), m_imageLayout(imageLayout), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_poolKind(poolKind)
@@ -85,9 +87,9 @@ protected:
virtual void ForwardCore(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace) = 0;
virtual void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) = 0;
virtual void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool accumulateGradient, Mat& workspace) = 0;
virtual void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace) = 0;
virtual void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool accumulateGradient, bool allowReuse, Mat& workspace) = 0;
virtual void EnsurePoolingInitialized() = 0;

View file

@@ -183,6 +183,8 @@ public:
{
}
virtual bool ImplementsGradientOverwriteOptimization() const override { return true; }
protected:
using Base::m_geometry;
using Base::m_deviceId;
@@ -255,7 +257,7 @@ protected:
CUDNN_CALL(err);
}
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool accumulateGradient, Mat& workspace) override
{
size_t batchSize = srcGrad.GetNumCols();
// Find best algo and allocate temp buffer, if needed.
@@ -282,11 +284,11 @@ protected:
workspace.Resize((m_backDataAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
// Compute gradients with respect to the output tensor (data).
CUDNN_CALL(cudnnConvolutionBackwardData(*m_cudnn, &C::One, *m_kernelT, ptr(kernel), m_outT, ptr(srcGrad), *m_conv, m_backDataAlgo.Algo.algo,
ptr(workspace), m_backDataAlgo.Algo.memory, &C::One, m_inT, ptr(grad)));
ptr(workspace), m_backDataAlgo.Algo.memory, accumulateGradient ? &C::One : &C::Zero, m_inT, ptr(grad)));
workspace.Resize(0, 0);
}
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& workspace) override
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool accumulateGradient, bool /*allowReuse*/, Mat& workspace) override
{
size_t batchSize = in.GetNumCols();
// Find best algo and allocate temp buffer, if needed.
@@ -313,7 +315,7 @@ protected:
workspace.Resize((m_backFiltAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
// Compute gradients with respect to the output tensor (data).
CUDNN_CALL(cudnnConvolutionBackwardFilter(*m_cudnn, &C::One, m_inT, ptr(in), m_outT, ptr(srcGrad), *m_conv, m_backFiltAlgo.Algo.algo,
ptr(workspace), m_backFiltAlgo.Algo.memory, &C::One, *m_kernelT, ptr(kernelGrad)));
ptr(workspace), m_backFiltAlgo.Algo.memory, accumulateGradient ? &C::One : &C::Zero, *m_kernelT, ptr(kernelGrad)));
workspace.Resize(0, 0);
}
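cudnnConvolutionBackwardData and cudnnConvolutionBackwardFilter blend their result into the destination as dest = alpha * backward(...) + beta * dest, so passing One or Zero for beta is exactly the accumulate-versus-overwrite switch. A compact illustration with plain floats standing in for the typed constants C::One and C::Zero:

// Illustrative cuDNN blending for the gradient buffer:
//   grad = alpha * backwardResult + beta * grad
float alpha = 1.0f;
float beta  = /*accumulateGradient*/ true ? 1.0f : 0.0f;   // One accumulates, Zero overwrites
float backwardResult = 3.0f, staleGrad = 5.0f;
float grad = alpha * backwardResult + beta * staleGrad;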

View file

@@ -280,8 +280,8 @@ BOOST_AUTO_TEST_CASE(ConvolutionBackwardData)
SingleMatrix workspace(deviceId);
SingleMatrix workspaceB(baseDeviceId);
testEng->BackwardData(srcGrad, kernel, grad, workspace);
baseEng->BackwardData(srcGradB, kernelB, gradB, workspaceB);
testEng->BackwardData(srcGrad, kernel, grad, true, workspace);
baseEng->BackwardData(srcGradB, kernelB, gradB, true, workspaceB);
std::stringstream tmsg;
tmsg << "Geometry: " << (std::string)(*g) << ", Batch: " << n << ", Device: " << deviceId;
@@ -349,8 +349,8 @@ BOOST_AUTO_TEST_CASE(ConvolutionBackwardKernel)
SingleMatrix workspace(deviceId);
SingleMatrix workspaceB(baseDeviceId);
testEng->BackwardKernel(grad, in, kernel, false, workspace);
baseEng->BackwardKernel(gradB, inB, kernelB, false, workspaceB);
testEng->BackwardKernel(grad, in, kernel, true, false, workspace);
baseEng->BackwardKernel(gradB, inB, kernelB, true, false, workspaceB);
std::stringstream tmsg;
tmsg << "Geometry: " << (std::string)(*g) << ", Batch: " << n << ", Device: " << deviceId;