New condition for the tensor lib: the output cannot be in-place and inverse-broadcasting (reducing) at the same time. This makes reduction easier.
This commit is contained in:
Parent: 01a33f7bea
Commit: cdcc4bb3d2
@@ -53,7 +53,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         if (Input(inputIndex)->GetNumCols() < GetNumCols())
             MaskMissingGradientColumnsToZero(fr);
-        inputGradient.DoSumOf(0.0f, inputGradient, gradient, 1.0f);
+        inputGradient.AddCopyOf(gradient);
 #if 0
         if (Input(inputIndex)->GetNumCols() < GetNumCols())
             Input(inputIndex)->Gradient().Print("PlusNode BackProp with reduction");
@@ -227,10 +227,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         if (Input(inputIndex)->GetNumCols() < GetNumCols())
             MaskMissingGradientColumnsToZero(fr);

-        if (sign > 0)
-            inputGradient.DoSumOf(0.0f, inputGradient, gradient, 1.0f);
-        else
-            inputGradient.DoDifferenceOf(0.0f, inputGradient, gradient, 1.0f);
+        inputGradient.AddCopyOf(gradient, sign);
 #else
         Matrix<ElemType> gradientValues = GradientFor(fr);

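The rewrite in these two hunks works because AddCopyOf() accumulates into the output through the beta parameter instead of passing the output as an input operand, which the new condition forbids whenever the incoming gradient has to be reduced (inverse-broadcast) into a smaller input gradient. A minimal stand-alone sketch of the same semantics, using plain arrays instead of TensorView (reduceColumns is a hypothetical helper, not CNTK code):

    #include <cstdio>
    #include <vector>

    // Reduce the columns of a [rows x cols] input into a [rows] output.
    // As in DoCopyOf(beta, a, alpha), 'beta' scales the existing output,
    // so the output is only written to, never read as an input operand.
    static void reduceColumns(float beta, const std::vector<float>& in,
                              size_t rows, size_t cols,
                              float alpha, std::vector<float>& out)
    {
        for (size_t r = 0; r < rows; r++)
        {
            float sum = 0;
            for (size_t c = 0; c < cols; c++)
                sum += in[c * rows + r]; // column-major, as in CNTK matrices
            out[r] = beta * out[r] + alpha * sum;
        }
    }

    int main()
    {
        std::vector<float> gradient(2 * 3, 1.0f);  // [2 x 3] incoming gradient
        std::vector<float> inputGradient(2, 5.0f); // [2] accumulated input gradient
        // AddCopyOf(gradient, sign)-style call: beta = 1 accumulates in place
        reduceColumns(/*beta=*/1.0f, gradient, 2, 3, /*alpha=*/1.0f, inputGradient);
        printf("%g %g\n", inputGradient[0], inputGradient[1]); // prints: 8 8
    }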
@@ -427,9 +427,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     // Cases:
     // - input elements >> GPU procs --> do reduction in inner loop
     // - reduction dimension fits into a single kernel --> launch it that way
-    // - reduction dimension requires multiple kernels --> use atomic add, to avoid temp mem alloc --is this any good?
+    // - reduction dimension requires multiple kernels --> use atomic add, to avoid temp mem alloc
     //    - PlusNode: reducing to a bias for small matrices
     //    - ScaleNode: big elementwise product reduced to a scalar (dot product)
+    //    - E.g. 3072 GPU procs:
+    //      If >= 3072 reduced output values must be computed, just loop inside.
+    //      If less, and reduction per value does not fit into a single proc,
+    //      then we break it into procs, say, 24.
+    //      This way we will need 24 atomicAdd()s of 3072/24 = 128 values.
+    //      If reduction is along stride=1, then we'd have 24 atomicAdd()s of 32 coalesced writes.
+    //      Does not sound scary at all.
+    // Precondition: matrix cannot at the same time participate in reduction and operation.
 #if 1
     C_size_t reductionDim = 1; // number of elements to reduce over
     for (C_size_t k = 0; k < reducingOpDimVector.size(); k++)
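The launch arithmetic in the new comment can be checked in a few lines. The helper below is hypothetical (the real decision would live in the CUDA launch code), but it reproduces the 3072-proc example from the comment:

    #include <cstdio>

    // Sketch of the partitioning arithmetic described above: if there are
    // at least as many reduced output values as GPU procs, each proc loops
    // over its whole reduction; otherwise each reduction is split across
    // idle procs, and each chunk ends in one atomicAdd().
    static unsigned chunksPerReduction(unsigned outputValues, unsigned gpuProcs)
    {
        if (outputValues >= gpuProcs)
            return 1;                   // enough outputs: loop inside the kernel
        return gpuProcs / outputValues; // e.g. 3072 / 128 = 24 chunks
    }

    int main()
    {
        // the comment's example: 24 atomicAdd()s of 3072/24 = 128 values each
        printf("%u\n", chunksPerReduction(128, 3072)); // prints: 24
    }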
@@ -223,6 +223,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         offsets[i] = shapes[i].GetOffset();
     }

+    // enforce that in case of broadcasting, the output must not be an input
+    template<class ElemType>
+    static bool CheckDifferentObject(const TensorView<ElemType> & a, const TensorView<ElemType> & b)
+    {
+        if (&a == &b)
+            LogicError("Do{U,Bi,Ter}naryOpOf: When inverse broadcasting, output must not be an input.");
+        return true;
+    }
+
     template<class ElemType>
     void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op)
     {
@@ -235,6 +244,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     SmallVector<size_t> regularOpDims, reducingOpDims;
     PrepareTensorOperands<ElemType,2>(array<TensorShape, 2> { a.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);

+    // output cannot be input when reducing
+    if (reducingOpDims.size() > 0)
+        CheckDifferentObject(a, *this);
+
     // now perform the operation
     GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
 }
@@ -250,6 +263,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     SmallVector<size_t> regularOpDims, reducingOpDims;
     PrepareTensorOperands<ElemType, 3>(array<TensorShape, 3> { a.GetShape(), b.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);

+    // output cannot be input when reducing
+    if (reducingOpDims.size() > 0)
+        CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this);
+
     GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
 }

@@ -264,6 +281,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     SmallVector<size_t> regularOpDims, reducingOpDims;
     PrepareTensorOperands<ElemType, 4>(array<TensorShape, 4> { a.GetShape(), b.GetShape(), c.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);

+    // output cannot be input when reducing
+    if (reducingOpDims.size() > 0)
+        CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this) && CheckDifferentObject(c, *this);
+
     GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), c.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
 }

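Note the idiom in the binary and ternary variants: because CheckDifferentObject() either throws or returns true, chaining the calls with && simply runs every check in one expression statement; the short-circuit can never actually skip one. A stand-alone illustration (not CNTK code):

    #include <cstdio>

    // A check that throws on failure and otherwise returns true can be
    // chained with &&; since false is never returned, all checks run.
    static bool check(int id)
    {
        printf("check %d ran\n", id); // a real check would throw on failure
        return true;
    }

    int main()
    {
        (void)(check(1) && check(2) && check(3)); // all three run
        // equivalent to: check(1); check(2); check(3);
    }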
@@ -48,7 +48,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 // c.AssignDiffOf(c,a) means c -= a,
 // and c.AddElementwiseProductOf(a, b, 1) means c += a .* b.
 // All operators support elementwise in-place operations, i.e. a, b, and c
-// may all reference the same underlying SOB.
+// may all reference the same underlying SOB, with one exception:
+// The output cannot be in-place and inverse-broadcasting at the same time.
+// E.g. with c=[10] and a=[10 x 20], c.AssignDiffOf(c,a) will fail.
+// In that case, you can use c.AddCopyOf(a,-1).
 // Aliasing is not detected, so don't pass distinct TensorView objects that
 // reference overlapping but not identical slices.
 // If beta == 0, c is not read out, i.e. it can be uninitialized or contain NaNs.
 // -------------------------------------------------------------------
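The documented restriction and its workaround can be modeled without the CNTK sources. The snippet below mimics the new check; View, the reducing flag, and the shapes in the comments are illustrative assumptions:

    #include <cstdio>
    #include <stdexcept>

    struct View { }; // stand-in for TensorView

    // mirrors the new CheckDifferentObject(): throw if output aliases an input
    static bool CheckDifferentObject(const View& a, const View& b)
    {
        if (&a == &b)
            throw std::logic_error("When inverse broadcasting, output must not be an input.");
        return true;
    }

    int main()
    {
        View c, a;            // think c = [10], a = [10 x 20]
        bool reducing = true; // a must be reduced into the smaller c
        try
        {
            if (reducing)
                CheckDifferentObject(c, c); // c.AssignDiffOf(c, a): output == input
        }
        catch (const std::logic_error& e)
        {
            printf("caught: %s\n", e.what()); // prefer c.AddCopyOf(a, -1) instead
        }
        return 0;
    }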