CNTK core: Implemented an optimization to elide the initial zeroing of, and the subsequent accumulation into, the gradients of nodes with just one parent/ancestor node
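The idea behind the change, as a minimal standalone sketch (illustrative only, not part of the commit; the names Node and backpropInto are hypothetical): a node fed by several parents must start from a zero-initialized gradient and have each parent add into it, while a node with exactly one overwriting parent can skip both the zero fill and the accumulation.

// Standalone C++ sketch of the optimization, assuming hypothetical names.
#include <cassert>
#include <cstddef>
#include <vector>

struct Node
{
    std::vector<float> gradient;
    bool parentOverwritesGradient = false; // true when the single parent writes the gradient directly
};

void backpropInto(Node& child, const std::vector<float>& outgoingGradient)
{
    if (child.parentOverwritesGradient)
    {
        child.gradient = outgoingGradient; // overwrite: no prior zero fill, no accumulation
        return;
    }
    if (child.gradient.empty())
        child.gradient.assign(outgoingGradient.size(), 0.0f); // zero-initialize once
    for (size_t i = 0; i < outgoingGradient.size(); ++i)
        child.gradient[i] += outgoingGradient[i]; // accumulate contributions from each parent
}

int main()
{
    Node singleParent;
    singleParent.parentOverwritesGradient = true;
    backpropInto(singleParent, {1.0f, 2.0f});
    assert(singleParent.gradient[1] == 2.0f);

    Node shared; // fed by two parents, so contributions must be summed
    backpropInto(shared, {1.0f, 1.0f});
    backpropInto(shared, {2.0f, 2.0f});
    assert(shared.gradient[0] == 3.0f);
    return 0;
}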
This commit is contained in:
Parent: 1a197f596f
Commit: 4fe22b81c5
Makefile | 2 +-
@@ -239,7 +239,7 @@ ifeq ("$(BUILDTYPE)","release")
   CXXFLAGS += -g -O4
   LDFLAGS += -rdynamic
   COMMON_FLAGS += -DNDEBUG -DNO_SYNC
-  CUFLAGS += -O3 -g -use_fast_math -lineinfo $(GENCODE_FLAGS)
+  CUFLAGS += -O3 -g -use_fast_math $(GENCODE_FLAGS)
 endif
 
 ifdef CNTK_CUDA_DEVICE_DEBUGINFO
@@ -565,6 +565,8 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
     Globals::EnableShareNodeValueMatrices();
     if (config(L"hyperCompressMemory", false))
         Globals::EnableHyperCompressMemory();
+    if (config(L"optimizeGradientAccumulation", true))
+        Globals::EnableGradientAccumulationOptimization();
 
     TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));
 
@@ -710,6 +712,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
     Globals::EnableShareNodeValueMatrices();
     if (config(L"hyperCompressMemory", false))
        Globals::EnableHyperCompressMemory();
+    if (config(L"optimizeGradientAccumulation", true))
+        Globals::EnableGradientAccumulationOptimization();
 
     TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));
 
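Both entry points read the same switch, so the optimization is on by default and can be disabled by setting optimizeGradientAccumulation to false in the top-level configuration.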
@@ -249,6 +249,9 @@ namespace CNTK
         CNTK_API void EnableForwardValuesSharing();
         CNTK_API void EnableHyperMemoryCompress();
 
+        CNTK_API void EnableGradientAccumulationOptimization();
+        CNTK_API void DisableGradientAccumulationOptimization();
+
         CNTK_API bool AreEquivalent(const ::CNTK::FunctionPtr& f1, const ::CNTK::FunctionPtr& f2);
         CNTK_API bool AreEquivalent(const ::CNTK::Variable& v1, const ::CNTK::Variable& v2, bool allowParameterAndConstantsEquivalence = false);
 
@@ -70,6 +70,16 @@ namespace CNTK
            Microsoft::MSR::CNTK::Globals::EnableHyperCompressMemory();
        }
 
+       void EnableGradientAccumulationOptimization()
+       {
+           Microsoft::MSR::CNTK::Globals::EnableGradientAccumulationOptimization();
+       }
+
+       void DisableGradientAccumulationOptimization()
+       {
+           Microsoft::MSR::CNTK::Globals::DisableGradientAccumulationOptimization();
+       }
+
        bool AreEquivalent(const Variable& var1, const Variable& var2, bool allowParameterAndConstantsEquivalence)
        {
            bool areDynamicAxesCompatible = (var1.DynamicAxes().size() == var2.DynamicAxes().size());
@@ -15,5 +15,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
 std::atomic<bool> Globals::m_enableShareNodeValueMatrices(false);
 std::atomic<bool> Globals::m_enableHyperCompressMemory(false);
+std::atomic<bool> Globals::m_optimizeGradientAccumulation(true);
 
 }}}
@@ -19,6 +19,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     static void ForceConstantRandomSeed() { m_forceConstantRandomSeed = true; }
     static bool ShouldForceConstantRandomSeed() { return m_forceConstantRandomSeed; }
 
+    static void EnableGradientAccumulationOptimization() { m_optimizeGradientAccumulation = true; }
+    static void DisableGradientAccumulationOptimization() { m_optimizeGradientAccumulation = false; }
+    static bool ShouldOptimizeGradientAccumulation() { return m_optimizeGradientAccumulation; }
+
     // TODO: Currently the flag is set to false. Should be switched to true after more rigorous testing.
     static bool UseV2Aggregator() { return false; }
 
@@ -49,5 +53,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     // The global flag to enable hyper memory compression
     static std::atomic<bool> m_enableHyperCompressMemory;
     static std::atomic<bool> m_forceConstantRandomSeed;
+    static std::atomic<bool> m_optimizeGradientAccumulation;
 };
 }}}
@@ -1037,6 +1037,16 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
     for (auto& keyValue : parentsMap)
     {
         parentCount[keyValue.first] = keyValue.second.size();
+
+        // Indicate on the node that its parent overwrites its gradient if the node is not part of a loop
+        // and has exactly one parent who implements the gradient overwrite optimization
+        if (Globals::ShouldOptimizeGradientAccumulation() &&
+            !keyValue.first->IsPartOfLoop() &&
+            (keyValue.second.size() == 1) &&
+            (*keyValue.second.begin())->ImplementsGradientOverwriteOptimization())
+        {
+            keyValue.first->MarkParentOverwritesGradient();
+        }
     }
 
     // Construct the composite forward prop eval order by enumerating the
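A node is marked only when the optimization is globally enabled, the node is outside any recurrent loop, it has exactly one parent, and that parent reports ImplementsGradientOverwriteOptimization(); nodes inside loops or with multiple parents keep the existing zero-then-accumulate behavior.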
@@ -152,7 +152,7 @@ struct ComputationNetworkOwnedNodeState
     friend class ComputationNetwork;
 
     ComputationNetworkOwnedNodeState()
-        : m_needsGradient(false), m_valueSharable(true)
+        : m_needsGradient(false), m_valueSharable(true), m_parentOverwritesGradient(false)
     {
         PurgeStateForFormingRecurrentLoops();
         m_isPartOfLoop = false;
@@ -168,10 +168,14 @@ struct ComputationNetworkOwnedNodeState
         other.m_traceNodeValueSparse = m_traceNodeValueSparse;
         other.m_traceNodeValueUpToDim = m_traceNodeValueUpToDim;
         other.m_traceNodeValueUpToT = m_traceNodeValueUpToT;
+        other.m_parentOverwritesGradient = m_parentOverwritesGradient;
     }
 
     bool IsPartOfLoop() const { return m_isPartOfLoop; }
 
+    void MarkParentOverwritesGradient() { m_parentOverwritesGradient = true; }
+    bool ParentOverwritesGradient() const { return m_parentOverwritesGradient; }
+
     virtual void MarkValueNonSharable() { m_valueSharable = false; }
     virtual void MarkValueSharable() { m_valueSharable = true; }
     bool IsValueSharable() const { return m_valueSharable; }
@@ -186,12 +190,17 @@ struct ComputationNetworkOwnedNodeState
     size_t m_traceNodeValueUpToT = 8; // 8 time steps fit comfortably into a normal-sized console
     void EnableNodeTracing(bool asReal, bool asCategoryLabel, bool asSparse) { m_traceNodeValueReal = asReal; m_traceNodeValueAsCategoryLabel = asCategoryLabel; m_traceNodeValueSparse = asSparse; }
 
+    virtual bool ImplementsGradientOverwriteOptimization() const { return false; }
+
 protected: // TODO: should be fully encapsulated here
     bool m_needsGradient; // true if this node or any children need a gradient to be computed (for own consumption or propagation to somewhere in the child tree)
 
     bool m_valueSharable; // a flag is needed for memory share.
                           // If it is false (e.g., LearnableParameters/InputValue and those nodes are solely induced by LearnableParameters),
                           // it will never be released to memory pool
+
+    bool m_parentOverwritesGradient; // flag indicating whether the parent of this node overwrites the gradient of this node instead of accumulating to it
+
 private:
     bool m_isPartOfLoop; // true if this loop is part of a recurrent loop
 
@@ -1717,7 +1726,10 @@ public:
     void ResetGradient(ElemType val)
     {
         UpdateDataSize(Gradient());
-        Gradient().SetValue(val);
+
+        // No need to zero initialize the gradient if the node's parent is going to overwrite it anyways
+        if ((val != 0) || !ParentOverwritesGradient())
+            Gradient().SetValue(val);
 
         m_gradientInitialized = true;
     }
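With the flag set, ResetGradient(0) only resizes the gradient matrix and skips the SetValue(0) fill; that is the "elide the initial zeroing" half of the optimization, while the accumulation half is handled in the node and engine changes below, where the parent assigns instead of adds.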
@@ -283,6 +283,8 @@ public:
         AttachInputsFromConfig(configp, GetExpectedNumInputs());
     }
 
+    virtual bool ImplementsGradientOverwriteOptimization() const override { return m_convEng->ImplementsGradientOverwriteOptimization(); }
+
 public:
     void Save(File& fstream) const override
     {
@@ -348,7 +350,7 @@ public:
             // BackwardData adds results to the output so need to zero them out first.
             // REVIEW alexeyk: should be rolled into BackwardData itself.
             sliceOutputValue.SetValue(0);
-            m_convEng->BackwardData(sliceInput1Value, input0, sliceOutputValue, *m_tempMatrix);
+            m_convEng->BackwardData(sliceInput1Value, input0, sliceOutputValue, /*accumulateGradient =*/ true, *m_tempMatrix);
         }
     }
 
@@ -360,16 +362,16 @@ public:
             auto& grad = InputRef(0).GradientAsMatrix();
             auto sliceInput1Value = InputRef(1).ValueFor(fr);
             if (!m_transpose)
-                m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, fr.IsAllFrames(), *m_tempMatrix);
+                m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrix);
             else
-                m_convEng->BackwardKernel(sliceInput1Value, sliceOutputGrad, grad, fr.IsAllFrames(), *m_tempMatrix);
+                m_convEng->BackwardKernel(sliceInput1Value, sliceOutputGrad, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrix);
         }
         else if (inputIndex == 1) // derivative with respect to the input feature
         {
             auto& input0 = InputRef(0).ValueAsMatrix();
             auto sliceInput1Grad = InputRef(1).GradientFor(fr);
             if (!m_transpose)
-                m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, *m_tempMatrix);
+                m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, !Input(inputIndex)->ParentOverwritesGradient(), *m_tempMatrix);
             else
             {
                 // REVIEW alexeyk: Forward overwrites values in sliceInput1Grad. Should handle correctly instead.
@@ -60,8 +60,13 @@ public:
         if (Input(inputIndex)->ReducesInTimeWrt(shared_from_this()))
             MaskMissingGradientColumnsToZero(fr);
 
-        inputGradient.AddCopyOf(gradient);
+        if (Input(inputIndex)->ParentOverwritesGradient())
+            inputGradient.AssignCopyOf(gradient);
+        else
+            inputGradient.AddCopyOf(gradient);
     }
 
+    virtual bool ImplementsGradientOverwriteOptimization() const override { return true; }
+
 };
 
 template class PlusNode<float>;
@@ -415,7 +420,10 @@ public:
             auto input0Gradient = OneSampleTensorFor(0, /*gradient=*/true, fr.AllowBroadcast());
             auto input1 = OneSampleTensorFor(1, /*gradient=*/false, fr.AllowBroadcast());
             auto outputGradient = OneSampleTensorFor(-1, /*gradient=*/true, fr);
-            input0Gradient.AddMatrixProductOf(m_transpose/*transC*/, outputGradient, false/*transA*/, input1, true/*transB*/);
+            if (Input(inputIndex)->ParentOverwritesGradient())
+                input0Gradient.AssignMatrixProductOf(m_transpose/*transC*/, outputGradient, false/*transA*/, input1, true/*transB*/);
+            else
+                input0Gradient.AddMatrixProductOf(m_transpose/*transC*/, outputGradient, false/*transA*/, input1, true/*transB*/);
         }
         else if (inputIndex == 1) // right derivative
         {
@@ -423,13 +431,18 @@ public:
             auto input0 = OneSampleTensorFor(0, /*gradient=*/false, fr.AllowBroadcast());
             auto input1Gradient = OneSampleTensorFor(1, /*gradient=*/true, fr.AllowBroadcast());
             auto outputGradient = OneSampleTensorFor(-1, /*gradient=*/true, fr);
-            input1Gradient.AddMatrixProductOf(false/*transC*/, input0, !m_transpose/*transA*/, outputGradient, false/*transB*/);
+            if (Input(inputIndex)->ParentOverwritesGradient())
+                input1Gradient.AssignMatrixProductOf(false/*transC*/, input0, !m_transpose/*transA*/, outputGradient, false/*transB*/);
+            else
+                input1Gradient.AddMatrixProductOf(false/*transC*/, input0, !m_transpose/*transA*/, outputGradient, false/*transB*/);
         }
     }
 
     virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
     // but both *inputs* are used, so we don't overload the InputUsed-() function which defaults to 'true'
 
+    virtual bool ImplementsGradientOverwriteOptimization() const override { return true; }
+
     virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
     {
         Base::Validate(isFinalValidationPass);
@@ -72,7 +72,7 @@ public:
         }
         else if (opTypeHolder == unaryGradient)
        {
-            sliceInputGrad.DoUnaryOpOf(1, sliceOutputGrad, 1, opBackward, opSum);
+            sliceInputGrad.DoUnaryOpOf(Input(inputIndex)->ParentOverwritesGradient() ? 0.0f : 1.0f, sliceOutputGrad, 1, opBackward, opSum);
         }
         else
         {
@@ -80,7 +80,7 @@ public:
             // Not possible for Cos().
             auto sliceValue = (opType == binaryWithOutputGradient) ? ValueTensorFor(rank, fr) : // using input or output value
                 InputRef(0).ValueTensorFor(rank, fr);
-            sliceInputGrad.DoBinaryOpOf(1, sliceOutputGrad, sliceValue, 1, opBackward, opSum);
+            sliceInputGrad.DoBinaryOpOf(Input(inputIndex)->ParentOverwritesGradient() ? 0.0f : 1.0f, sliceOutputGrad, sliceValue, 1, opBackward, opSum);
         }
     }
 
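In both calls the leading scalar is the scale applied to the existing contents of sliceInputGrad, so passing 0 when the parent overwrites the gradient writes the backprop result directly, while the previous constant 1 accumulated into whatever was already there.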
@@ -93,10 +93,13 @@ public:
     {
         return opType == binaryWithOutputGradient;
     }
 
     virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override
     {
         return opType == binaryWithInputGradient;
     }
 
+    virtual bool ImplementsGradientOverwriteOptimization() const override { return (opType != noGradient); }
+
 };
 
 #define UnaryElementWiseWithOpCodeNodeBaseMembers UsingComputationNodeMembersBoilerplate;
@@ -30,7 +30,7 @@ void ConvolutionEngine<ElemType>::Forward(const Mat& in, const Mat& kernel, Mat&
 }
 
 template <class ElemType>
-void ConvolutionEngine<ElemType>::BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace)
+void ConvolutionEngine<ElemType>::BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool accumulateGradient, Mat& workspace)
 {
     const auto& g = *m_geometry;
     assert(g.InputShape().GetNumElements() == grad.GetNumRows());
@@ -45,11 +45,11 @@ void ConvolutionEngine<ElemType>::BackwardData(const Mat& srcGrad, const Mat& ke
 
     EnsureCompatible();
     EnsureConvolutionInitialized();
-    BackwardDataCore(srcGrad, kernel, grad, workspace);
+    BackwardDataCore(srcGrad, kernel, grad, accumulateGradient, workspace);
 }
 
 template <class ElemType>
-void ConvolutionEngine<ElemType>::BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernel, bool allowReuse, Mat& workspace)
+void ConvolutionEngine<ElemType>::BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernel, bool accumulateGradient, bool allowReuse, Mat& workspace)
 {
     const auto& g = *m_geometry;
     assert(g.InputShape().GetNumElements() == in.GetNumRows());
@@ -64,7 +64,7 @@ void ConvolutionEngine<ElemType>::BackwardKernel(const Mat& srcGrad, const Mat&
 
     EnsureCompatible();
     EnsureConvolutionInitialized();
-    BackwardKernelCore(srcGrad, in, kernel, allowReuse, workspace);
+    BackwardKernelCore(srcGrad, in, kernel, accumulateGradient, allowReuse, workspace);
 }
 
 template <class ElemType>
@@ -179,12 +179,12 @@ protected:
         in.ConvolutionForward(kernel, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, out);
     }
 
-    void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& /*workspace*/) override
+    void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool /*accumulateGradient*/, Mat& /*workspace*/) override
     {
         srcGrad.ConvolutionBackwardData(kernel, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, grad);
     }
 
-    void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& /*workspace*/) override
+    void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*accumulateGradient*/, bool /*allowReuse*/, Mat& /*workspace*/) override
     {
         srcGrad.ConvolutionBackwardKernel(in, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, kernelGrad);
     }
@@ -372,7 +372,7 @@ protected:
         assert(batchSize == out.GetNumCols());
     }
 
-    void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
+    void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool /*accumulateGradient*/, Mat& workspace) override
     {
         size_t batchSize = srcGrad.GetNumCols();
         size_t packedInputRows = m_kernelT.w() * m_kernelT.h() * m_kernelT.c();
@@ -412,7 +412,7 @@ protected:
         assert(batchSize == srcGrad.GetNumCols());
     }
 
-    void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace) override
+    void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*accumulateGradient*/, bool allowReuse, Mat& workspace) override
     {
         size_t batchSize = in.GetNumCols();
         size_t packedInputRows = m_kernelT.w() * m_kernelT.h() * m_kernelT.c();
@@ -678,7 +678,7 @@ protected:
     // [KXY x NWH]^T * [KXY x C] -> [NWH x C]
     // 4. Reshape and transpose outputs (grad): [NWH x C] -> [N x WHC]^T -> [WHC x N]
     // In case minibatch size == 1 this step is not required and step 3 writes results directly to output (grad).
-    void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
+    void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool /*accumulateGradient*/, Mat& workspace) override
     {
         size_t batchSize = srcGrad.GetNumCols();
         size_t subBatchSize = m_maxTempMemSizeInSamples == 0 ? batchSize : min(batchSize, m_maxTempMemSizeInSamples);
@@ -771,7 +771,7 @@ protected:
     // 2. Unrolling convolution input (in) into a matrix of [NW'H' x WHC] layout.
     // 3. Performing matrix multiplication of unrolled input with transposed output:
     // [NW'H' x WHC]^T * [NW'H' x K] -> [WHC x K] - kernel gradients.
-    void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& workspace) override
+    void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*accumulateGradient*/, bool /*allowReuse*/, Mat& workspace) override
     {
         size_t batchSize = srcGrad.GetNumCols();
         size_t subBatchSize = m_maxTempMemSizeInSamples == 0 ? batchSize : min(batchSize, m_maxTempMemSizeInSamples);
@@ -47,9 +47,9 @@ public:
 
     void Forward(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace);
 
-    void BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace);
+    void BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool accumulateGradient, Mat& workspace);
 
-    void BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace);
+    void BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool accumulateGradient, bool allowReuse, Mat& workspace);
 
     void ForwardPooling(const Mat& in, Mat& out);
 
@@ -72,6 +72,8 @@ public:
         m_maxTempMemSizeInSamples = maxTempMemSizeInSamples;
     }
 
+    virtual bool ImplementsGradientOverwriteOptimization() const { return false; }
+
 protected:
     ConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind)
         : m_geometry(geometry), m_deviceId(deviceId), m_imageLayout(imageLayout), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_poolKind(poolKind)
@@ -85,9 +87,9 @@ protected:
 
     virtual void ForwardCore(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace) = 0;
 
-    virtual void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) = 0;
+    virtual void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool accumulateGradient, Mat& workspace) = 0;
 
-    virtual void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace) = 0;
+    virtual void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool accumulateGradient, bool allowReuse, Mat& workspace) = 0;
 
     virtual void EnsurePoolingInitialized() = 0;
 
@@ -183,6 +183,8 @@ public:
     {
     }
 
+    virtual bool ImplementsGradientOverwriteOptimization() const override { return true; }
+
 protected:
     using Base::m_geometry;
     using Base::m_deviceId;
@@ -255,7 +257,7 @@ protected:
         CUDNN_CALL(err);
     }
 
-    void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
+    void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool accumulateGradient, Mat& workspace) override
     {
         size_t batchSize = srcGrad.GetNumCols();
         // Find best algo and allocate temp buffer, if needed.
@@ -282,11 +284,11 @@ protected:
         workspace.Resize((m_backDataAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
         // Compute gradients with respect to the output tensor (data).
         CUDNN_CALL(cudnnConvolutionBackwardData(*m_cudnn, &C::One, *m_kernelT, ptr(kernel), m_outT, ptr(srcGrad), *m_conv, m_backDataAlgo.Algo.algo,
-                                                ptr(workspace), m_backDataAlgo.Algo.memory, &C::One, m_inT, ptr(grad)));
+                                                ptr(workspace), m_backDataAlgo.Algo.memory, accumulateGradient ? &C::One : &C::Zero, m_inT, ptr(grad)));
         workspace.Resize(0, 0);
     }
 
-    void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& workspace) override
+    void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool accumulateGradient, bool /*allowReuse*/, Mat& workspace) override
     {
         size_t batchSize = in.GetNumCols();
         // Find best algo and allocate temp buffer, if needed.
@@ -313,7 +315,7 @@ protected:
         workspace.Resize((m_backFiltAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
         // Compute gradients with respect to the output tensor (data).
         CUDNN_CALL(cudnnConvolutionBackwardFilter(*m_cudnn, &C::One, m_inT, ptr(in), m_outT, ptr(srcGrad), *m_conv, m_backFiltAlgo.Algo.algo,
-                                                  ptr(workspace), m_backFiltAlgo.Algo.memory, &C::One, *m_kernelT, ptr(kernelGrad)));
+                                                  ptr(workspace), m_backFiltAlgo.Algo.memory, accumulateGradient ? &C::One : &C::Zero, *m_kernelT, ptr(kernelGrad)));
         workspace.Resize(0, 0);
     }
 
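The cuDNN backward-data and backward-filter calls already take a beta scaling factor for the destination tensor (result = alpha * computed + beta * destination), so mapping accumulateGradient onto &C::One versus &C::Zero selects between accumulating and overwriting without any extra kernel work.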
@@ -280,8 +280,8 @@ BOOST_AUTO_TEST_CASE(ConvolutionBackwardData)
                 SingleMatrix workspace(deviceId);
                 SingleMatrix workspaceB(baseDeviceId);
 
-                testEng->BackwardData(srcGrad, kernel, grad, workspace);
-                baseEng->BackwardData(srcGradB, kernelB, gradB, workspaceB);
+                testEng->BackwardData(srcGrad, kernel, grad, true, workspace);
+                baseEng->BackwardData(srcGradB, kernelB, gradB, true, workspaceB);
 
                 std::stringstream tmsg;
                 tmsg << "Geometry: " << (std::string)(*g) << ", Batch: " << n << ", Device: " << deviceId;
@@ -349,8 +349,8 @@ BOOST_AUTO_TEST_CASE(ConvolutionBackwardKernel)
                 SingleMatrix workspace(deviceId);
                 SingleMatrix workspaceB(baseDeviceId);
 
-                testEng->BackwardKernel(grad, in, kernel, false, workspace);
-                baseEng->BackwardKernel(gradB, inB, kernelB, false, workspaceB);
+                testEng->BackwardKernel(grad, in, kernel, true, false, workspace);
+                baseEng->BackwardKernel(gradB, inB, kernelB, true, false, workspaceB);
 
                 std::stringstream tmsg;
                 tmsg << "Geometry: " << (std::string)(*g) << ", Batch: " << n << ", Device: " << deviceId;