CNTK core: Implemented an optimization to elide the initial zeroing of, and subsequent accumulation into, the gradient of nodes with just one parent/ancestor node
This commit is contained in:
Parent: 1a197f596f
Commit: 4fe22b81c5
Makefile (2 changed lines)
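Overview (explanatory note, not part of the commit): during backprop CNTK normally zero-initializes each node's gradient buffer and has every parent accumulate its contribution into it. When a node sits outside any recurrent loop and has exactly one parent, and that parent knows how to write its input gradient in place, both the zero fill and the extra read-modify-write can be skipped. A minimal C++ sketch of the two modes, using made-up names:

#include <vector>

// Sketch only: 'gradient' is the child's gradient buffer, 'update' is the single
// parent's contribution computed during backprop.
void ApplyParentContribution(std::vector<float>& gradient,
                             const std::vector<float>& update,
                             bool parentOverwritesGradient)
{
    if (parentOverwritesGradient)
    {
        // Optimized path: the contribution simply becomes the gradient;
        // no prior zeroing of the buffer is needed.
        gradient = update;
    }
    else
    {
        // Default path: the buffer was zero-initialized once, and every
        // parent adds its contribution into it.
        for (size_t i = 0; i < gradient.size(); ++i)
            gradient[i] += update[i];
    }
}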
@@ -239,7 +239,7 @@ ifeq ("$(BUILDTYPE)","release")
   CXXFLAGS += -g -O4
   LDFLAGS += -rdynamic
   COMMON_FLAGS += -DNDEBUG -DNO_SYNC
-  CUFLAGS += -O3 -g -use_fast_math -lineinfo $(GENCODE_FLAGS)
+  CUFLAGS += -O3 -g -use_fast_math $(GENCODE_FLAGS)
 endif

 ifdef CNTK_CUDA_DEVICE_DEBUGINFO
@@ -565,6 +565,8 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
     Globals::EnableShareNodeValueMatrices();
     if (config(L"hyperCompressMemory", false))
         Globals::EnableHyperCompressMemory();
+    if (config(L"optimizeGradientAccumulation", true))
+        Globals::EnableGradientAccumulationOptimization();

     TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));

@@ -710,6 +712,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
     Globals::EnableShareNodeValueMatrices();
     if (config(L"hyperCompressMemory", false))
         Globals::EnableHyperCompressMemory();
+    if (config(L"optimizeGradientAccumulation", true))
+        Globals::EnableGradientAccumulationOptimization();

     TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));

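Usage note (inferred from the config reads above): the optimization is on by default in both the BrainScript and legacy config entry points, since the flag is read as config(L"optimizeGradientAccumulation", true). Setting optimizeGradientAccumulation = false at the top level of a CNTK config turns it off, in the same way hyperCompressMemory is toggled on.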
@@ -249,6 +249,9 @@ namespace CNTK
    CNTK_API void EnableForwardValuesSharing();
    CNTK_API void EnableHyperMemoryCompress();

+   CNTK_API void EnableGradientAccumulationOptimization();
+   CNTK_API void DisableGradientAccumulationOptimization();
+
    CNTK_API bool AreEquivalent(const ::CNTK::FunctionPtr& f1, const ::CNTK::FunctionPtr& f2);
    CNTK_API bool AreEquivalent(const ::CNTK::Variable& v1, const ::CNTK::Variable& v2, bool allowParameterAndConstantsEquivalence = false);

@@ -70,6 +70,16 @@ namespace CNTK
        Microsoft::MSR::CNTK::Globals::EnableHyperCompressMemory();
    }

+   void EnableGradientAccumulationOptimization()
+   {
+       Microsoft::MSR::CNTK::Globals::EnableGradientAccumulationOptimization();
+   }
+
+   void DisableGradientAccumulationOptimization()
+   {
+       Microsoft::MSR::CNTK::Globals::DisableGradientAccumulationOptimization();
+   }
+
    bool AreEquivalent(const Variable& var1, const Variable& var2, bool allowParameterAndConstantsEquivalence)
    {
        bool areDynamicAxesCompatible = (var1.DynamicAxes().size() == var2.DynamicAxes().size());
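A hedged usage sketch for the new V2-library entry points above (this assumes they are exposed in the CNTK::Internal namespace alongside EnableForwardValuesSharing and that CNTKLibrary.h pulls in the internal declarations; adjust the qualification if the project places them elsewhere):

#include "CNTKLibrary.h"

// Sketch: run a backward pass with and without the optimization, e.g. in a test,
// to confirm the gradients are unchanged. The elided parts are placeholders.
void CompareGradientAccumulationModes()
{
    CNTK::Internal::DisableGradientAccumulationOptimization(); // reference: zero + accumulate
    // ... build the function, call Backward(), stash the gradients ...

    CNTK::Internal::EnableGradientAccumulationOptimization();  // optimized: overwrite where safe
    // ... call Backward() again and compare against the stashed gradients ...
}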
@@ -15,5 +15,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 std::atomic<bool> Globals::m_enableShareNodeValueMatrices(false);
 std::atomic<bool> Globals::m_enableHyperCompressMemory(false);
+std::atomic<bool> Globals::m_optimizeGradientAccumulation(true);

 }}}
@@ -19,6 +19,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    static void ForceConstantRandomSeed() { m_forceConstantRandomSeed = true; }
    static bool ShouldForceConstantRandomSeed() { return m_forceConstantRandomSeed; }

+   static void EnableGradientAccumulationOptimization() { m_optimizeGradientAccumulation = true; }
+   static void DisableGradientAccumulationOptimization() { m_optimizeGradientAccumulation = false; }
+   static bool ShouldOptimizeGradientAccumulation() { return m_optimizeGradientAccumulation; }
+
    // TODO: Currently the flag is set to false. Should be switched to true after more rigorous testing.
    static bool UseV2Aggregator() { return false; }

@@ -49,5 +53,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // The global flag to enable hyper memory compression
    static std::atomic<bool> m_enableHyperCompressMemory;
    static std::atomic<bool> m_forceConstantRandomSeed;
+   static std::atomic<bool> m_optimizeGradientAccumulation;
};
}}}
@@ -1037,6 +1037,16 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
    for (auto& keyValue : parentsMap)
    {
        parentCount[keyValue.first] = keyValue.second.size();
+
+       // Indicate on the node that its parent overwrites its gradient if the node is not part of a loop
+       // and has exactly one parent that implements the gradient overwrite optimization.
+       if (Globals::ShouldOptimizeGradientAccumulation() &&
+           !keyValue.first->IsPartOfLoop() &&
+           (keyValue.second.size() == 1) &&
+           (*keyValue.second.begin())->ImplementsGradientOverwriteOptimization())
+       {
+           keyValue.first->MarkParentOverwritesGradient();
+       }
    }

    // Construct the composite forward prop eval order by enumerating the
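For readers skimming the hunk above, the marking rule can be restated with simplified stand-in types (the real code works on ComputationNodeBasePtr and the parentsMap built earlier in AllocateAllMatrices):

#include <set>

struct Node // stand-in for ComputationNodeBase, sketch only
{
    bool partOfLoop = false;
    bool parentOverwritesGradient = false;
    virtual bool ImplementsGradientOverwriteOptimization() const { return false; }
    virtual ~Node() = default;
};

// A node may have its gradient overwritten (instead of zeroed and accumulated into)
// only if it is outside every recurrent loop, has exactly one parent, and that
// parent is able to write the input gradient in place.
void MaybeMarkOverwrite(Node* node, const std::set<Node*>& parents, bool optimizationEnabled)
{
    if (optimizationEnabled &&
        !node->partOfLoop &&
        (parents.size() == 1) &&
        (*parents.begin())->ImplementsGradientOverwriteOptimization())
    {
        node->parentOverwritesGradient = true;
    }
}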
@@ -152,7 +152,7 @@ struct ComputationNetworkOwnedNodeState
    friend class ComputationNetwork;

    ComputationNetworkOwnedNodeState()
-       : m_needsGradient(false), m_valueSharable(true)
+       : m_needsGradient(false), m_valueSharable(true), m_parentOverwritesGradient(false)
    {
        PurgeStateForFormingRecurrentLoops();
        m_isPartOfLoop = false;
@@ -168,10 +168,14 @@ struct ComputationNetworkOwnedNodeState
        other.m_traceNodeValueSparse = m_traceNodeValueSparse;
        other.m_traceNodeValueUpToDim = m_traceNodeValueUpToDim;
        other.m_traceNodeValueUpToT = m_traceNodeValueUpToT;
+       other.m_parentOverwritesGradient = m_parentOverwritesGradient;
    }

    bool IsPartOfLoop() const { return m_isPartOfLoop; }

+   void MarkParentOverwritesGradient() { m_parentOverwritesGradient = true; }
+   bool ParentOverwritesGradient() const { return m_parentOverwritesGradient; }
+
    virtual void MarkValueNonSharable() { m_valueSharable = false; }
    virtual void MarkValueSharable() { m_valueSharable = true; }
    bool IsValueSharable() const { return m_valueSharable; }
@@ -186,12 +190,17 @@ struct ComputationNetworkOwnedNodeState
    size_t m_traceNodeValueUpToT = 8; // 8 time steps fit comfortably into a normal-sized console
    void EnableNodeTracing(bool asReal, bool asCategoryLabel, bool asSparse) { m_traceNodeValueReal = asReal; m_traceNodeValueAsCategoryLabel = asCategoryLabel; m_traceNodeValueSparse = asSparse; }
+
+   virtual bool ImplementsGradientOverwriteOptimization() const { return false; }

protected: // TODO: should be fully encapsulated here
    bool m_needsGradient; // true if this node or any children need a gradient to be computed (for own consumption or propagation to somewhere in the child tree)

    bool m_valueSharable; // a flag is needed for memory share.
                          // If it is false (e.g., LearnableParameters/InputValue and those nodes are solely induced by LearnableParameters),
                          // it will never be released to memory pool
+
+   bool m_parentOverwritesGradient; // flag indicating whether the parent of this node overwrites the gradient of this node instead of accumulating to it
+
private:
    bool m_isPartOfLoop; // true if this loop is part of a recurrent loop
@@ -1717,7 +1726,10 @@ public:
    void ResetGradient(ElemType val)
    {
        UpdateDataSize(Gradient());
-       Gradient().SetValue(val);
+
+       // No need to zero initialize the gradient if the node's parent is going to overwrite it anyways
+       if ((val != 0) || !ParentOverwritesGradient())
+           Gradient().SetValue(val);

        m_gradientInitialized = true;
    }
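Note on the ResetGradient change above (explanatory, not from the commit text): the guard keeps non-zero resets working; only the common ResetGradient(0) call is elided, and only when the node has been marked as having its gradient overwritten by its single parent, whose BackpropTo then assigns rather than accumulates, so the buffer never needs the initial zero fill.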
@@ -283,6 +283,8 @@ public:
        AttachInputsFromConfig(configp, GetExpectedNumInputs());
    }

+   virtual bool ImplementsGradientOverwriteOptimization() const override { return m_convEng->ImplementsGradientOverwriteOptimization(); }
+
public:
    void Save(File& fstream) const override
    {
@@ -348,7 +350,7 @@ public:
            // BackwardData adds results to the output so need to zero them out first.
            // REVIEW alexeyk: should be rolled into BackwardData itself.
            sliceOutputValue.SetValue(0);
-           m_convEng->BackwardData(sliceInput1Value, input0, sliceOutputValue, *m_tempMatrix);
+           m_convEng->BackwardData(sliceInput1Value, input0, sliceOutputValue, /*accumulateGradient =*/ true, *m_tempMatrix);
        }
    }

@@ -360,16 +362,16 @@ public:
        auto& grad = InputRef(0).GradientAsMatrix();
        auto sliceInput1Value = InputRef(1).ValueFor(fr);
        if (!m_transpose)
-           m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, fr.IsAllFrames(), *m_tempMatrix);
+           m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrix);
        else
-           m_convEng->BackwardKernel(sliceInput1Value, sliceOutputGrad, grad, fr.IsAllFrames(), *m_tempMatrix);
+           m_convEng->BackwardKernel(sliceInput1Value, sliceOutputGrad, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrix);
    }
    else if (inputIndex == 1) // derivative with respect to the input feature
    {
        auto& input0 = InputRef(0).ValueAsMatrix();
        auto sliceInput1Grad = InputRef(1).GradientFor(fr);
        if (!m_transpose)
-           m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, *m_tempMatrix);
+           m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, !Input(inputIndex)->ParentOverwritesGradient(), *m_tempMatrix);
        else
        {
            // REVIEW alexeyk: Forward overwrites values in sliceInput1Grad. Should handle correctly instead.
@@ -60,8 +60,13 @@ public:
        if (Input(inputIndex)->ReducesInTimeWrt(shared_from_this()))
            MaskMissingGradientColumnsToZero(fr);

-       if (Input(inputIndex)->ParentOverwritesGradient())
-           inputGradient.AddCopyOf(gradient);
+       if (Input(inputIndex)->ParentOverwritesGradient())
+           inputGradient.AssignCopyOf(gradient);
+       else
+           inputGradient.AddCopyOf(gradient);
    }
+
+   virtual bool ImplementsGradientOverwriteOptimization() const override { return true; }
};

template class PlusNode<float>;
@@ -415,7 +420,10 @@ public:
        auto input0Gradient = OneSampleTensorFor(0, /*gradient=*/true, fr.AllowBroadcast());
        auto input1 = OneSampleTensorFor(1, /*gradient=*/false, fr.AllowBroadcast());
        auto outputGradient = OneSampleTensorFor(-1, /*gradient=*/true, fr);
-       input0Gradient.AddMatrixProductOf(m_transpose/*transC*/, outputGradient, false/*transA*/, input1, true/*transB*/);
+       if (Input(inputIndex)->ParentOverwritesGradient())
+           input0Gradient.AssignMatrixProductOf(m_transpose/*transC*/, outputGradient, false/*transA*/, input1, true/*transB*/);
+       else
+           input0Gradient.AddMatrixProductOf(m_transpose/*transC*/, outputGradient, false/*transA*/, input1, true/*transB*/);
    }
    else if (inputIndex == 1) // right derivative
    {
@@ -423,13 +431,18 @@ public:
        auto input0 = OneSampleTensorFor(0, /*gradient=*/false, fr.AllowBroadcast());
        auto input1Gradient = OneSampleTensorFor(1, /*gradient=*/true, fr.AllowBroadcast());
        auto outputGradient = OneSampleTensorFor(-1, /*gradient=*/true, fr);
-       input1Gradient.AddMatrixProductOf(false/*transC*/, input0, !m_transpose/*transA*/, outputGradient, false/*transB*/);
+       if (Input(inputIndex)->ParentOverwritesGradient())
+           input1Gradient.AssignMatrixProductOf(false/*transC*/, input0, !m_transpose/*transA*/, outputGradient, false/*transB*/);
+       else
+           input1Gradient.AddMatrixProductOf(false/*transC*/, input0, !m_transpose/*transA*/, outputGradient, false/*transB*/);
    }
}

virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
// but both *inputs* are used, so we don't overload the InputUsed-() function which defaults to 'true'

+virtual bool ImplementsGradientOverwriteOptimization() const override { return true; }
+
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
{
    Base::Validate(isFinalValidationPass);
@@ -72,7 +72,7 @@ public:
        }
        else if (opTypeHolder == unaryGradient)
        {
-           sliceInputGrad.DoUnaryOpOf(1, sliceOutputGrad, 1, opBackward, opSum);
+           sliceInputGrad.DoUnaryOpOf(Input(inputIndex)->ParentOverwritesGradient() ? 0.0f : 1.0f, sliceOutputGrad, 1, opBackward, opSum);
        }
        else
        {
@@ -80,7 +80,7 @@ public:
            // Not possible for Cos().
            auto sliceValue = (opType == binaryWithOutputGradient) ? ValueTensorFor(rank, fr) : // using input or output value
                              InputRef(0).ValueTensorFor(rank, fr);
-           sliceInputGrad.DoBinaryOpOf(1, sliceOutputGrad, sliceValue, 1, opBackward, opSum);
+           sliceInputGrad.DoBinaryOpOf(Input(inputIndex)->ParentOverwritesGradient() ? 0.0f : 1.0f, sliceOutputGrad, sliceValue, 1, opBackward, opSum);
        }
    }

@@ -93,10 +93,13 @@ public:
    {
        return opType == binaryWithOutputGradient;
    }

    virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override
    {
        return opType == binaryWithInputGradient;
    }
+
+   virtual bool ImplementsGradientOverwriteOptimization() const override { return (opType != noGradient); }
};

#define UnaryElementWiseWithOpCodeNodeBaseMembers UsingComputationNodeMembersBoilerplate;
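In the element-wise nodes above the same switch is expressed through the leading scaling ("beta") argument of DoUnaryOpOf/DoBinaryOpOf: passing 0 makes the TensorView operation overwrite the input gradient, whereas the previous constant 1 accumulated into it.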
@@ -30,7 +30,7 @@ void ConvolutionEngine<ElemType>::Forward(const Mat& in, const Mat& kernel, Mat&
}

template <class ElemType>
-void ConvolutionEngine<ElemType>::BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace)
+void ConvolutionEngine<ElemType>::BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool accumulateGradient, Mat& workspace)
{
    const auto& g = *m_geometry;
    assert(g.InputShape().GetNumElements() == grad.GetNumRows());
@@ -45,11 +45,11 @@ void ConvolutionEngine<ElemType>::BackwardData(const Mat& srcGrad, const Mat& ke

    EnsureCompatible();
    EnsureConvolutionInitialized();
-   BackwardDataCore(srcGrad, kernel, grad, workspace);
+   BackwardDataCore(srcGrad, kernel, grad, accumulateGradient, workspace);
}

template <class ElemType>
-void ConvolutionEngine<ElemType>::BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernel, bool allowReuse, Mat& workspace)
+void ConvolutionEngine<ElemType>::BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernel, bool accumulateGradient, bool allowReuse, Mat& workspace)
{
    const auto& g = *m_geometry;
    assert(g.InputShape().GetNumElements() == in.GetNumRows());
@@ -64,7 +64,7 @@ void ConvolutionEngine<ElemType>::BackwardKernel(const Mat& srcGrad, const Mat&

    EnsureCompatible();
    EnsureConvolutionInitialized();
-   BackwardKernelCore(srcGrad, in, kernel, allowReuse, workspace);
+   BackwardKernelCore(srcGrad, in, kernel, accumulateGradient, allowReuse, workspace);
}

template <class ElemType>
@@ -179,12 +179,12 @@ protected:
        in.ConvolutionForward(kernel, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, out);
    }

-   void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& /*workspace*/) override
+   void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool /*accumulateGradient*/, Mat& /*workspace*/) override
    {
        srcGrad.ConvolutionBackwardData(kernel, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, grad);
    }

-   void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& /*workspace*/) override
+   void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*accumulateGradient*/, bool /*allowReuse*/, Mat& /*workspace*/) override
    {
        srcGrad.ConvolutionBackwardKernel(in, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, kernelGrad);
    }

@@ -372,7 +372,7 @@ protected:
        assert(batchSize == out.GetNumCols());
    }

-   void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
+   void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool /*accumulateGradient*/, Mat& workspace) override
    {
        size_t batchSize = srcGrad.GetNumCols();
        size_t packedInputRows = m_kernelT.w() * m_kernelT.h() * m_kernelT.c();
@@ -412,7 +412,7 @@ protected:
        assert(batchSize == srcGrad.GetNumCols());
    }

-   void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace) override
+   void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*accumulateGradient*/, bool allowReuse, Mat& workspace) override
    {
        size_t batchSize = in.GetNumCols();
        size_t packedInputRows = m_kernelT.w() * m_kernelT.h() * m_kernelT.c();
@@ -678,7 +678,7 @@ protected:
    // [KXY x NWH]^T * [KXY x C] -> [NWH x C]
    // 4. Reshape and transpose outputs (grad): [NWH x C] -> [N x WHC]^T -> [WHC x N]
    // In case minibatch size == 1 this step is not required and step 3 writes results directly to output (grad).
-   void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
+   void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool /*accumulateGradient*/, Mat& workspace) override
    {
        size_t batchSize = srcGrad.GetNumCols();
        size_t subBatchSize = m_maxTempMemSizeInSamples == 0 ? batchSize : min(batchSize, m_maxTempMemSizeInSamples);
@@ -771,7 +771,7 @@ protected:
    // 2. Unrolling convolution input (in) into a matrix of [NW'H' x WHC] layout.
    // 3. Performing matrix multiplication of unrolled input with transposed output:
    // [NW'H' x WHC]^T * [NW'H' x K] -> [WHC x K] - kernel gradients.
-   void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& workspace) override
+   void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*accumulateGradient*/, bool /*allowReuse*/, Mat& workspace) override
    {
        size_t batchSize = srcGrad.GetNumCols();
        size_t subBatchSize = m_maxTempMemSizeInSamples == 0 ? batchSize : min(batchSize, m_maxTempMemSizeInSamples);
@@ -47,9 +47,9 @@ public:

    void Forward(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace);

-   void BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace);
+   void BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool accumulateGradient, Mat& workspace);

-   void BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace);
+   void BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool accumulateGradient, bool allowReuse, Mat& workspace);

    void ForwardPooling(const Mat& in, Mat& out);

@@ -72,6 +72,8 @@ public:
        m_maxTempMemSizeInSamples = maxTempMemSizeInSamples;
    }

+   virtual bool ImplementsGradientOverwriteOptimization() const { return false; }
+
protected:
    ConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind)
        : m_geometry(geometry), m_deviceId(deviceId), m_imageLayout(imageLayout), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_poolKind(poolKind)
@@ -85,9 +87,9 @@ protected:

    virtual void ForwardCore(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace) = 0;

-   virtual void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) = 0;
+   virtual void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool accumulateGradient, Mat& workspace) = 0;

-   virtual void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace) = 0;
+   virtual void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool accumulateGradient, bool allowReuse, Mat& workspace) = 0;

    virtual void EnsurePoolingInitialized() = 0;

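Design note on the engine interface above: the base ConvolutionEngine defaults ImplementsGradientOverwriteOptimization() to false, and the reference and legacy GEMM engines deliberately ignore the new accumulateGradient parameter, so nodes fed by those engines are never marked and keep the original zero-then-accumulate behaviour; only an engine that both honours the flag and returns true (the cuDNN engine below) opts in.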
@@ -183,6 +183,8 @@ public:
    {
    }

+   virtual bool ImplementsGradientOverwriteOptimization() const override { return true; }
+
protected:
    using Base::m_geometry;
    using Base::m_deviceId;
@@ -255,7 +257,7 @@ protected:
        CUDNN_CALL(err);
    }

-   void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
+   void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool accumulateGradient, Mat& workspace) override
    {
        size_t batchSize = srcGrad.GetNumCols();
        // Find best algo and allocate temp buffer, if needed.
@@ -282,11 +284,11 @@ protected:
        workspace.Resize((m_backDataAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
        // Compute gradients with respect to the output tensor (data).
        CUDNN_CALL(cudnnConvolutionBackwardData(*m_cudnn, &C::One, *m_kernelT, ptr(kernel), m_outT, ptr(srcGrad), *m_conv, m_backDataAlgo.Algo.algo,
-                                               ptr(workspace), m_backDataAlgo.Algo.memory, &C::One, m_inT, ptr(grad)));
+                                               ptr(workspace), m_backDataAlgo.Algo.memory, accumulateGradient ? &C::One : &C::Zero, m_inT, ptr(grad)));
        workspace.Resize(0, 0);
    }

-   void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& workspace) override
+   void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool accumulateGradient, bool /*allowReuse*/, Mat& workspace) override
    {
        size_t batchSize = in.GetNumCols();
        // Find best algo and allocate temp buffer, if needed.
@@ -313,7 +315,7 @@ protected:
        workspace.Resize((m_backFiltAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
        // Compute gradients with respect to the output tensor (data).
        CUDNN_CALL(cudnnConvolutionBackwardFilter(*m_cudnn, &C::One, m_inT, ptr(in), m_outT, ptr(srcGrad), *m_conv, m_backFiltAlgo.Algo.algo,
-                                                 ptr(workspace), m_backFiltAlgo.Algo.memory, &C::One, *m_kernelT, ptr(kernelGrad)));
+                                                 ptr(workspace), m_backFiltAlgo.Algo.memory, accumulateGradient ? &C::One : &C::Zero, *m_kernelT, ptr(kernelGrad)));
        workspace.Resize(0, 0);
    }

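The cuDNN hunks above map the flag onto the library's alpha/beta blending convention, where the written gradient is alpha * convBackward(...) + beta * grad. A tiny sketch of that selection (the helper name is illustrative; C::One and C::Zero are the engine's existing scaling constants):

inline float CudnnBlendBeta(bool accumulateGradient)
{
    // beta = 1 accumulates the new contribution into the destination gradient;
    // beta = 0 overwrites it, which is what lets the caller skip the prior zero fill.
    return accumulateGradient ? 1.0f : 0.0f;
}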
@@ -280,8 +280,8 @@ BOOST_AUTO_TEST_CASE(ConvolutionBackwardData)
    SingleMatrix workspace(deviceId);
    SingleMatrix workspaceB(baseDeviceId);

-   testEng->BackwardData(srcGrad, kernel, grad, workspace);
-   baseEng->BackwardData(srcGradB, kernelB, gradB, workspaceB);
+   testEng->BackwardData(srcGrad, kernel, grad, true, workspace);
+   baseEng->BackwardData(srcGradB, kernelB, gradB, true, workspaceB);

    std::stringstream tmsg;
    tmsg << "Geometry: " << (std::string)(*g) << ", Batch: " << n << ", Device: " << deviceId;
@@ -349,8 +349,8 @@ BOOST_AUTO_TEST_CASE(ConvolutionBackwardKernel)
    SingleMatrix workspace(deviceId);
    SingleMatrix workspaceB(baseDeviceId);

-   testEng->BackwardKernel(grad, in, kernel, false, workspace);
-   baseEng->BackwardKernel(gradB, inB, kernelB, false, workspaceB);
+   testEng->BackwardKernel(grad, in, kernel, true, false, workspace);
+   baseEng->BackwardKernel(gradB, inB, kernelB, true, false, workspaceB);

    std::stringstream tmsg;
    tmsg << "Geometry: " << (std::string)(*g) << ", Batch: " << n << ", Device: " << deviceId;