CNTK core: Implemented an optimization to elide the initial zeroing of, and subsequent accumulation into, gradients for nodes with just one parent/ancestor node

Amit Agarwal 2016-12-09 15:58:01 -08:00
Parent 1a197f596f
Commit 4fe22b81c5
15 changed files with 102 additions and 35 deletions
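For orientation before the per-file diffs: a minimal, self-contained C++ sketch (illustrative only, not code from this commit; the buffer type and function names below are invented) of what eliding the initial zeroing and the subsequent accumulation means for a node with a single parent.

#include <algorithm>
#include <vector>

using Grad = std::vector<float>;  // stand-in for a CNTK gradient matrix

// Baseline path: zero the child's gradient, then let every parent accumulate into it.
void AccumulatePath(Grad& childGrad, const std::vector<Grad>& parentContribs)
{
    std::fill(childGrad.begin(), childGrad.end(), 0.0f);      // initial zeroing
    for (const auto& contrib : parentContribs)
        for (size_t i = 0; i < childGrad.size(); ++i)
            childGrad[i] += contrib[i];                       // accumulation
}

// Optimized path introduced by this commit: the sole overwrite-capable parent
// assigns the gradient directly, so both the zeroing and the add are elided.
void OverwritePath(Grad& childGrad, const Grad& soleParentContrib)
{
    childGrad = soleParentContrib;                            // plain overwrite
}

The rest of the diff wires this idea through the graph (marking eligible nodes in ComputationNetwork), the node base class (skipping the zero fill in ResetGradient), and the individual node and convolution-engine implementations (assign versus add).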

View file

@@ -239,7 +239,7 @@ ifeq ("$(BUILDTYPE)","release")
CXXFLAGS += -g -O4
LDFLAGS += -rdynamic
COMMON_FLAGS += -DNDEBUG -DNO_SYNC
CUFLAGS += -O3 -g -use_fast_math -lineinfo $(GENCODE_FLAGS)
CUFLAGS += -O3 -g -use_fast_math $(GENCODE_FLAGS)
endif
ifdef CNTK_CUDA_DEVICE_DEBUGINFO

View file

@@ -565,6 +565,8 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapper
Globals::EnableShareNodeValueMatrices();
if (config(L"hyperCompressMemory", false))
Globals::EnableHyperCompressMemory();
if (config(L"optimizeGradientAccumulation", true))
Globals::EnableGradientAccumulationOptimization();
TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));
@@ -710,6 +712,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
Globals::EnableShareNodeValueMatrices();
if (config(L"hyperCompressMemory", false))
Globals::EnableHyperCompressMemory();
if (config(L"optimizeGradientAccumulation", true))
Globals::EnableGradientAccumulationOptimization();
TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));
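Both entry points read the same top-level boolean with a default of true, so the optimization is opt-out. A hedged configuration example (only the key name comes from the code above; the rest of the config file is assumed):

optimizeGradientAccumulation = true   # default; gates the EnableGradientAccumulationOptimization() call above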

View file

@@ -249,6 +249,9 @@ namespace CNTK
CNTK_API void EnableForwardValuesSharing();
CNTK_API void EnableHyperMemoryCompress();
CNTK_API void EnableGradientAccumulationOptimization();
CNTK_API void DisableGradientAccumulationOptimization();
CNTK_API bool AreEquivalent(const ::CNTK::FunctionPtr& f1, const ::CNTK::FunctionPtr& f2);
CNTK_API bool AreEquivalent(const ::CNTK::Variable& v1, const ::CNTK::Variable& v2, bool allowParameterAndConstantsEquivalence = false);
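A hedged usage sketch of the new toggle pair declared here. The qualification CNTK::Internal is an assumption based on the neighbouring Enable* declarations and is not shown in this hunk:

// Assumed namespace; called once, before constructing and training the network.
CNTK::Internal::DisableGradientAccumulationOptimization();  // force the zero-then-accumulate path everywhere
// ... build functions, trainer, run minibatches ...
CNTK::Internal::EnableGradientAccumulationOptimization();   // restore the default optimized behaviour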

View file

@@ -70,6 +70,16 @@ namespace CNTK
Microsoft::MSR::CNTK::Globals::EnableHyperCompressMemory();
}
void EnableGradientAccumulationOptimization()
{
Microsoft::MSR::CNTK::Globals::EnableGradientAccumulationOptimization();
}
void DisableGradientAccumulationOptimization()
{
Microsoft::MSR::CNTK::Globals::DisableGradientAccumulationOptimization();
}
bool AreEquivalent(const Variable& var1, const Variable& var2, bool allowParameterAndConstantsEquivalence)
{
bool areDynamicAxesCompatible = (var1.DynamicAxes().size() == var2.DynamicAxes().size());

View file

@@ -15,5 +15,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::atomic<bool> Globals::m_enableShareNodeValueMatrices(false);
std::atomic<bool> Globals::m_enableHyperCompressMemory(false);
std::atomic<bool> Globals::m_optimizeGradientAccumulation(true);
}}}
}}}

View file

@@ -19,6 +19,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static void ForceConstantRandomSeed() { m_forceConstantRandomSeed = true; }
static bool ShouldForceConstantRandomSeed() { return m_forceConstantRandomSeed; }
static void EnableGradientAccumulationOptimization() { m_optimizeGradientAccumulation = true; }
static void DisableGradientAccumulationOptimization() { m_optimizeGradientAccumulation = false; }
static bool ShouldOptimizeGradientAccumulation() { return m_optimizeGradientAccumulation; }
// TODO: Currently the flag is set to false. Should be switched to true after more rigorous testing.
static bool UseV2Aggregator() { return false; }
@@ -49,5 +53,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// The global flag to enable hyper memory compression
static std::atomic<bool> m_enableHyperCompressMemory;
static std::atomic<bool> m_forceConstantRandomSeed;
static std::atomic<bool> m_optimizeGradientAccumulation;
};
}}}

View file

@@ -1037,6 +1037,16 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
for (auto& keyValue : parentsMap)
{
parentCount[keyValue.first] = keyValue.second.size();
// Indicate on the node that its parent overwrites its gradient if the node is not part of a loop
// and has exactly one parent that implements the gradient overwrite optimization
if (Globals::ShouldOptimizeGradientAccumulation() &&
!keyValue.first->IsPartOfLoop() &&
(keyValue.second.size() == 1) &&
(*keyValue.second.begin())->ImplementsGradientOverwriteOptimization())
{
keyValue.first->MarkParentOverwritesGradient();
}
}
// Construct the composite forward prop eval order by enumerating the
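Restated as a standalone predicate for clarity (illustrative only; the set type used for parentsMap's values is an assumption), a node is marked only when all four conditions hold:

// Illustrative restatement of the marking condition above, not the actual CNTK code.
bool ShouldMarkParentOverwritesGradient(const ComputationNodeBasePtr& node,
                                        const std::set<ComputationNodeBasePtr>& parents)
{
    return Globals::ShouldOptimizeGradientAccumulation()                      // global opt-in
        && !node->IsPartOfLoop()                                              // recurrent loops keep accumulation
        && (parents.size() == 1)                                              // exactly one parent
        && (*parents.begin())->ImplementsGradientOverwriteOptimization();     // parent can overwrite
}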

View file

@@ -152,7 +152,7 @@ struct ComputationNetworkOwnedNodeState
friend class ComputationNetwork;
ComputationNetworkOwnedNodeState()
: m_needsGradient(false), m_valueSharable(true)
: m_needsGradient(false), m_valueSharable(true), m_parentOverwritesGradient(false)
{
PurgeStateForFormingRecurrentLoops();
m_isPartOfLoop = false;
@@ -168,10 +168,14 @@ struct ComputationNetworkOwnedNodeState
other.m_traceNodeValueSparse = m_traceNodeValueSparse;
other.m_traceNodeValueUpToDim = m_traceNodeValueUpToDim;
other.m_traceNodeValueUpToT = m_traceNodeValueUpToT;
other.m_parentOverwritesGradient = m_parentOverwritesGradient;
}
bool IsPartOfLoop() const { return m_isPartOfLoop; }
void MarkParentOverwritesGradient() { m_parentOverwritesGradient = true; }
bool ParentOverwritesGradient() const { return m_parentOverwritesGradient; }
virtual void MarkValueNonSharable() { m_valueSharable = false; }
virtual void MarkValueSharable() { m_valueSharable = true; }
bool IsValueSharable() const { return m_valueSharable; }
@@ -186,12 +190,17 @@ struct ComputationNetworkOwnedNodeState
size_t m_traceNodeValueUpToT = 8; // 8 time steps fit comfortably into a normal-sized console
void EnableNodeTracing(bool asReal, bool asCategoryLabel, bool asSparse) { m_traceNodeValueReal = asReal; m_traceNodeValueAsCategoryLabel = asCategoryLabel; m_traceNodeValueSparse = asSparse; }
virtual bool ImplementsGradientOverwriteOptimization() const { return false; }
protected: // TODO: should be fully encapsulated here
bool m_needsGradient; // true if this node or any children need a gradient to be computed (for own consumption or propagation to somewhere in the child tree)
bool m_valueSharable; // a flag is needed for memory share.
// If it is false (e.g., LearnableParameters/InputValue and those nodes are solely induced by LearnableParameters),
// it will never be released to memory pool
bool m_parentOverwritesGradient; // flag indicating whether the parent of this node overwrites the gradient of this node instead of accumulating to it
private:
bool m_isPartOfLoop; // true if this node is part of a recurrent loop
@@ -1717,7 +1726,10 @@ public:
void ResetGradient(ElemType val)
{
UpdateDataSize(Gradient());
Gradient().SetValue(val);
// No need to zero-initialize the gradient if the node's parent is going to overwrite it anyway
if ((val != 0) || !ParentOverwritesGradient())
Gradient().SetValue(val);
m_gradientInitialized = true;
}

View file

@@ -283,6 +283,8 @@ public:
AttachInputsFromConfig(configp, GetExpectedNumInputs());
}
virtual bool ImplementsGradientOverwriteOptimization() const override { return m_convEng->ImplementsGradientOverwriteOptimization(); }
public:
void Save(File& fstream) const override
{
@@ -348,7 +350,7 @@ public:
// BackwardData adds results to the output, so we need to zero them out first.
// REVIEW alexeyk: should be rolled into BackwardData itself.
sliceOutputValue.SetValue(0);
m_convEng->BackwardData(sliceInput1Value, input0, sliceOutputValue, *m_tempMatrix);
m_convEng->BackwardData(sliceInput1Value, input0, sliceOutputValue, /*accumulateGradient =*/ true, *m_tempMatrix);
}
}
@@ -360,16 +362,16 @@ public:
auto& grad = InputRef(0).GradientAsMatrix();
auto sliceInput1Value = InputRef(1).ValueFor(fr);
if (!m_transpose)
m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, fr.IsAllFrames(), *m_tempMatrix);
m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrix);
else
m_convEng->BackwardKernel(sliceInput1Value, sliceOutputGrad, grad, fr.IsAllFrames(), *m_tempMatrix);
m_convEng->BackwardKernel(sliceInput1Value, sliceOutputGrad, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrix);
}
else if (inputIndex == 1) // derivative with respect to the input feature
{
auto& input0 = InputRef(0).ValueAsMatrix();
auto sliceInput1Grad = InputRef(1).GradientFor(fr);
if (!m_transpose)
m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, *m_tempMatrix);
m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, !Input(inputIndex)->ParentOverwritesGradient(), *m_tempMatrix);
else
{
// REVIEW alexeyk: Forward overwrites values in sliceInput1Grad. Should handle correctly instead.

View file

@@ -60,8 +60,13 @@ public:
if (Input(inputIndex)->ReducesInTimeWrt(shared_from_this()))
MaskMissingGradientColumnsToZero(fr);
inputGradient.AddCopyOf(gradient);
if (Input(inputIndex)->ParentOverwritesGradient())
inputGradient.AssignCopyOf(gradient);
else
inputGradient.AddCopyOf(gradient);
}
virtual bool ImplementsGradientOverwriteOptimization() const override { return true; }
};
template class PlusNode<float>;
@@ -415,7 +420,10 @@ public:
auto input0Gradient = OneSampleTensorFor(0, /*gradient=*/true, fr.AllowBroadcast());
auto input1 = OneSampleTensorFor(1, /*gradient=*/false, fr.AllowBroadcast());
auto outputGradient = OneSampleTensorFor(-1, /*gradient=*/true, fr);
input0Gradient.AddMatrixProductOf(m_transpose/*transC*/, outputGradient, false/*transA*/, input1, true/*transB*/);
if (Input(inputIndex)->ParentOverwritesGradient())
input0Gradient.AssignMatrixProductOf(m_transpose/*transC*/, outputGradient, false/*transA*/, input1, true/*transB*/);
else
input0Gradient.AddMatrixProductOf(m_transpose/*transC*/, outputGradient, false/*transA*/, input1, true/*transB*/);
}
else if (inputIndex == 1) // right derivative
{
@@ -423,13 +431,18 @@ public:
auto input0 = OneSampleTensorFor(0, /*gradient=*/false, fr.AllowBroadcast());
auto input1Gradient = OneSampleTensorFor(1, /*gradient=*/true, fr.AllowBroadcast());
auto outputGradient = OneSampleTensorFor(-1, /*gradient=*/true, fr);
input1Gradient.AddMatrixProductOf(false/*transC*/, input0, !m_transpose/*transA*/, outputGradient, false/*transB*/);
if (Input(inputIndex)->ParentOverwritesGradient())
input1Gradient.AssignMatrixProductOf(false/*transC*/, input0, !m_transpose/*transA*/, outputGradient, false/*transB*/);
else
input1Gradient.AddMatrixProductOf(false/*transC*/, input0, !m_transpose/*transA*/, outputGradient, false/*transB*/);
}
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
// but both *inputs* are used, so we don't overload the InputUsed-() function which defaults to 'true'
virtual bool ImplementsGradientOverwriteOptimization() const override { return true; }
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
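The pattern in PlusNode and the Times gradients above generalizes: a node opts in by overriding ImplementsGradientOverwriteOptimization() to return true and by branching on the child's ParentOverwritesGradient() flag in BackpropTo, assigning in the overwrite case and adding otherwise. A trimmed sketch (the tensor accessors and rank variable are placeholders, not the exact calls of any one node):

// Illustrative opt-in pattern for a node's backprop, modeled on the changes above.
virtual bool ImplementsGradientOverwriteOptimization() const override { return true; }

virtual void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
    auto gradient      = GradientTensorFor(rank, fr);                       // this node's gradient (placeholder accessor)
    auto inputGradient = InputRef(inputIndex).GradientTensorFor(rank, fr);  // child's gradient (placeholder accessor)
    if (Input(inputIndex)->ParentOverwritesGradient())
        inputGradient.AssignCopyOf(gradient);   // child buffer was never zero-initialized
    else
        inputGradient.AddCopyOf(gradient);      // child buffer holds zeros or earlier partial sums
}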

View file

@@ -72,7 +72,7 @@ public:
}
else if (opTypeHolder == unaryGradient)
{
sliceInputGrad.DoUnaryOpOf(1, sliceOutputGrad, 1, opBackward, opSum);
sliceInputGrad.DoUnaryOpOf(Input(inputIndex)->ParentOverwritesGradient() ? 0.0f : 1.0f, sliceOutputGrad, 1, opBackward, opSum);
}
else
{
@@ -80,7 +80,7 @@ public:
// Not possible for Cos().
auto sliceValue = (opType == binaryWithOutputGradient) ? ValueTensorFor(rank, fr) : // using input or output value
InputRef(0).ValueTensorFor(rank, fr);
sliceInputGrad.DoBinaryOpOf(1, sliceOutputGrad, sliceValue, 1, opBackward, opSum);
sliceInputGrad.DoBinaryOpOf(Input(inputIndex)->ParentOverwritesGradient() ? 0.0f : 1.0f, sliceOutputGrad, sliceValue, 1, opBackward, opSum);
}
}
@@ -93,10 +93,13 @@ public:
{
return opType == binaryWithOutputGradient;
}
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override
{
return opType == binaryWithInputGradient;
}
virtual bool ImplementsGradientOverwriteOptimization() const override { return (opType != noGradient); }
};
#define UnaryElementWiseWithOpCodeNodeBaseMembers UsingComputationNodeMembersBoilerplate;
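The leading scalar passed to DoUnaryOpOf/DoBinaryOpOf scales the existing destination before the operation's result is added (a BLAS-style beta, inferred from this usage), which is why a single call site covers both behaviours:

// Illustrative beta semantics: dest = beta * dest + opResult
float dest     = 7.0f;                         // stale gradient contents
float opResult = 2.5f;                         // backprop contribution
bool  parentOverwrites = true;                 // Input(inputIndex)->ParentOverwritesGradient()
float beta = parentOverwrites ? 0.0f : 1.0f;
dest = beta * dest + opResult;                 // beta == 0 overwrites, beta == 1 accumulates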

View file

@@ -30,7 +30,7 @@ void ConvolutionEngine<ElemType>::Forward(const Mat& in, const Mat& kernel, Mat&
}
template <class ElemType>
void ConvolutionEngine<ElemType>::BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace)
void ConvolutionEngine<ElemType>::BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool accumulateGradient, Mat& workspace)
{
const auto& g = *m_geometry;
assert(g.InputShape().GetNumElements() == grad.GetNumRows());
@@ -45,11 +45,11 @@ void ConvolutionEngine<ElemType>::BackwardData(const Mat& srcGrad, const Mat& ke
EnsureCompatible();
EnsureConvolutionInitialized();
BackwardDataCore(srcGrad, kernel, grad, workspace);
BackwardDataCore(srcGrad, kernel, grad, accumulateGradient, workspace);
}
template <class ElemType>
void ConvolutionEngine<ElemType>::BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernel, bool allowReuse, Mat& workspace)
void ConvolutionEngine<ElemType>::BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernel, bool accumulateGradient, bool allowReuse, Mat& workspace)
{
const auto& g = *m_geometry;
assert(g.InputShape().GetNumElements() == in.GetNumRows());
@@ -64,7 +64,7 @@ void ConvolutionEngine<ElemType>::BackwardKernel(const Mat& srcGrad, const Mat&
EnsureCompatible();
EnsureConvolutionInitialized();
BackwardKernelCore(srcGrad, in, kernel, allowReuse, workspace);
BackwardKernelCore(srcGrad, in, kernel, accumulateGradient, allowReuse, workspace);
}
template <class ElemType>
@@ -179,12 +179,12 @@ protected:
in.ConvolutionForward(kernel, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, out);
}
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& /*workspace*/) override
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool /*accumulateGradient*/, Mat& /*workspace*/) override
{
srcGrad.ConvolutionBackwardData(kernel, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, grad);
}
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& /*workspace*/) override
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*accumulateGradient*/, bool /*allowReuse*/, Mat& /*workspace*/) override
{
srcGrad.ConvolutionBackwardKernel(in, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, kernelGrad);
}
@@ -372,7 +372,7 @@ protected:
assert(batchSize == out.GetNumCols());
}
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool /*accumulateGradient*/, Mat& workspace) override
{
size_t batchSize = srcGrad.GetNumCols();
size_t packedInputRows = m_kernelT.w() * m_kernelT.h() * m_kernelT.c();
@@ -412,7 +412,7 @@ protected:
assert(batchSize == srcGrad.GetNumCols());
}
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace) override
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*accumulateGradient*/, bool allowReuse, Mat& workspace) override
{
size_t batchSize = in.GetNumCols();
size_t packedInputRows = m_kernelT.w() * m_kernelT.h() * m_kernelT.c();
@@ -678,7 +678,7 @@ protected:
// [KXY x NWH]^T * [KXY x C] -> [NWH x C]
// 4. Reshape and transpose outputs (grad): [NWH x C] -> [N x WHC]^T -> [WHC x N]
// In case minibatch size == 1 this step is not required and step 3 writes results directly to output (grad).
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool /*accumulateGradient*/, Mat& workspace) override
{
size_t batchSize = srcGrad.GetNumCols();
size_t subBatchSize = m_maxTempMemSizeInSamples == 0 ? batchSize : min(batchSize, m_maxTempMemSizeInSamples);
@@ -771,7 +771,7 @@ protected:
// 2. Unrolling convolution input (in) into a matrix of [NW'H' x WHC] layout.
// 3. Performing matrix multiplication of unrolled input with transposed output:
// [NW'H' x WHC]^T * [NW'H' x K] -> [WHC x K] - kernel gradients.
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& workspace) override
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*accumulateGradient*/, bool /*allowReuse*/, Mat& workspace) override
{
size_t batchSize = srcGrad.GetNumCols();
size_t subBatchSize = m_maxTempMemSizeInSamples == 0 ? batchSize : min(batchSize, m_maxTempMemSizeInSamples);

View file

@@ -47,9 +47,9 @@ public:
void Forward(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace);
void BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace);
void BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool accumulateGradient, Mat& workspace);
void BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace);
void BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool accumulateGradient, bool allowReuse, Mat& workspace);
void ForwardPooling(const Mat& in, Mat& out);
@@ -72,6 +72,8 @@ public:
m_maxTempMemSizeInSamples = maxTempMemSizeInSamples;
}
virtual bool ImplementsGradientOverwriteOptimization() const { return false; }
protected:
ConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind)
: m_geometry(geometry), m_deviceId(deviceId), m_imageLayout(imageLayout), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_poolKind(poolKind)
@@ -85,9 +87,9 @@ protected:
virtual void ForwardCore(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace) = 0;
virtual void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) = 0;
virtual void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool accumulateGradient, Mat& workspace) = 0;
virtual void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace) = 0;
virtual void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool accumulateGradient, bool allowReuse, Mat& workspace) = 0;
virtual void EnsurePoolingInitialized() = 0;

View file

@@ -183,6 +183,8 @@ public:
{
}
virtual bool ImplementsGradientOverwriteOptimization() const override { return true; }
protected:
using Base::m_geometry;
using Base::m_deviceId;
@@ -255,7 +257,7 @@ protected:
CUDNN_CALL(err);
}
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, bool accumulateGradient, Mat& workspace) override
{
size_t batchSize = srcGrad.GetNumCols();
// Find best algo and allocate temp buffer, if needed.
@@ -282,11 +284,11 @@ protected:
workspace.Resize((m_backDataAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
// Compute gradients with respect to the output tensor (data).
CUDNN_CALL(cudnnConvolutionBackwardData(*m_cudnn, &C::One, *m_kernelT, ptr(kernel), m_outT, ptr(srcGrad), *m_conv, m_backDataAlgo.Algo.algo,
ptr(workspace), m_backDataAlgo.Algo.memory, &C::One, m_inT, ptr(grad)));
ptr(workspace), m_backDataAlgo.Algo.memory, accumulateGradient ? &C::One : &C::Zero, m_inT, ptr(grad)));
workspace.Resize(0, 0);
}
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& workspace) override
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool accumulateGradient, bool /*allowReuse*/, Mat& workspace) override
{
size_t batchSize = in.GetNumCols();
// Find best algo and allocate temp buffer, if needed.
@@ -313,7 +315,7 @@ protected:
workspace.Resize((m_backFiltAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
// Compute gradients with respect to the output tensor (data).
CUDNN_CALL(cudnnConvolutionBackwardFilter(*m_cudnn, &C::One, m_inT, ptr(in), m_outT, ptr(srcGrad), *m_conv, m_backFiltAlgo.Algo.algo,
ptr(workspace), m_backFiltAlgo.Algo.memory, &C::One, *m_kernelT, ptr(kernelGrad)));
ptr(workspace), m_backFiltAlgo.Algo.memory, accumulateGradient ? &C::One : &C::Zero, *m_kernelT, ptr(kernelGrad)));
workspace.Resize(0, 0);
}
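cudnnConvolutionBackwardData and cudnnConvolutionBackwardFilter blend their result into the destination as dest = alpha * backward(...) + beta * dest, so passing One or Zero for beta is exactly the accumulate-versus-overwrite switch. A compact illustration with plain floats standing in for the typed constants C::One and C::Zero:

// Illustrative cuDNN blending for the gradient buffer:
//   grad = alpha * backwardResult + beta * grad
float alpha = 1.0f;
float beta  = /*accumulateGradient*/ true ? 1.0f : 0.0f;   // One accumulates, Zero overwrites
float backwardResult = 3.0f, staleGrad = 5.0f;
float grad = alpha * backwardResult + beta * staleGrad;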

View file

@@ -280,8 +280,8 @@ BOOST_AUTO_TEST_CASE(ConvolutionBackwardData)
SingleMatrix workspace(deviceId);
SingleMatrix workspaceB(baseDeviceId);
testEng->BackwardData(srcGrad, kernel, grad, workspace);
baseEng->BackwardData(srcGradB, kernelB, gradB, workspaceB);
testEng->BackwardData(srcGrad, kernel, grad, true, workspace);
baseEng->BackwardData(srcGradB, kernelB, gradB, true, workspaceB);
std::stringstream tmsg;
tmsg << "Geometry: " << (std::string)(*g) << ", Batch: " << n << ", Device: " << deviceId;
@@ -349,8 +349,8 @@ BOOST_AUTO_TEST_CASE(ConvolutionBackwardKernel)
SingleMatrix workspace(deviceId);
SingleMatrix workspaceB(baseDeviceId);
testEng->BackwardKernel(grad, in, kernel, false, workspace);
baseEng->BackwardKernel(gradB, inB, kernelB, false, workspaceB);
testEng->BackwardKernel(grad, in, kernel, true, false, workspace);
baseEng->BackwardKernel(gradB, inB, kernelB, true, false, workspaceB);
std::stringstream tmsg;
tmsg << "Geometry: " << (std::string)(*g) << ", Batch: " << n << ", Device: " << deviceId;