new condition for tensor lib: output cannot be in-place and inverse-broadcasting (reducing) at the same time. This makes reduction easier

Frank Seide 2015-12-28 23:48:40 -08:00
Parent 01a33f7bea
Commit cdcc4bb3d2
4 changed files with 38 additions and 7 deletions
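The constraint is easiest to see in a standalone sketch. The following plain C++ is hypothetical (serial loops standing in for the library's GPU kernels, shapes taken from the header comment below), but it shows why a reducing op must read the original output values, which an in-place reduction cannot guarantee once the work is split across threads:

    #include <cstdio>
    #include <vector>

    int main()
    {
        // c = [10], a = [10 x 20]: every c[i] is reduced over 20 values of a.
        std::vector<float> c(10, 1.0f);
        std::vector<std::vector<float>> a(10, std::vector<float>(20, 1.0f));

        // My reading of c.DoSumOf(0.0f, c, a, 1.0f) with reduction:
        // c[i] = sum_j (c_original[i] + a[i][j]) = 20*1 + 20*1 = 40.
        for (size_t i = 0; i < c.size(); i++)
        {
            float acc = 0;
            for (size_t j = 0; j < a[i].size(); j++)
                acc += c[i] + a[i][j]; // c[i] must still hold its original value
            c[i] = acc;                // single final write; only safe serially
        }
        printf("c[0] = %g\n", c[0]);   // 40

        // A GPU reduction that accumulates into c[i] directly (e.g. via
        // atomicAdd) has no single final write: partial sums land in c[i]
        // while other threads still read it as an input. Hence the new check.
    }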

View file

@@ -53,7 +53,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (Input(inputIndex)->GetNumCols() < GetNumCols())
     MaskMissingGradientColumnsToZero(fr);
-inputGradient.DoSumOf(0.0f, inputGradient, gradient, 1.0f);
+inputGradient.AddCopyOf(gradient);
 #if 0
 if (Input(inputIndex)->GetNumCols() < GetNumCols())
     Input(inputIndex)->Gradient().Print("PlusNode BackProp with reduction");
@@ -227,10 +227,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (Input(inputIndex)->GetNumCols() < GetNumCols())
     MaskMissingGradientColumnsToZero(fr);
-if (sign > 0)
-    inputGradient.DoSumOf(0.0f, inputGradient, gradient, 1.0f);
-else
-    inputGradient.DoDifferenceOf(0.0f, inputGradient, gradient, 1.0f);
+inputGradient.AddCopyOf(gradient, sign);
 #else
 Matrix<ElemType> gradientValues = GradientFor(fr);
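Why AddCopyOf sidesteps the new check (in both the PlusNode and MinusNode hunks above): in the TensorView convention used here, an op computes c = beta*c + alpha*op(inputs), and the Add- prefix implies beta = 1, so the old code passed inputGradient in as an input operand of a reducing op while the new code lets it participate only through beta. A plain-C++ sketch of the replacement's semantics, with hypothetical shapes and the column dimension as the reduced one:

    #include <cstdio>
    #include <vector>

    // inputGradient.AddCopyOf(gradient, sign) with reduction over columns:
    // c[i] = 1*c[i] + sign * sum_j g[i][j] -- c is output and accumulator,
    // but never an input operand of the elementwise op itself.
    static void addCopyOf(std::vector<float>& c,
                          const std::vector<std::vector<float>>& g, float sign)
    {
        for (size_t i = 0; i < c.size(); i++)
            for (size_t j = 0; j < g[i].size(); j++)
                c[i] += sign * g[i][j];
    }

    int main()
    {
        std::vector<float> grad(3, 0.0f);                                  // bias gradient [3]
        std::vector<std::vector<float>> g(3, std::vector<float>(5, 2.0f)); // [3 x 5]
        addCopyOf(grad, g, +1.0f); // PlusNode case: grad[i] = 10
        addCopyOf(grad, g, -1.0f); // MinusNode case: net zero again
        printf("grad[0] = %g\n", grad[0]); // 0
    }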

View file

@@ -427,9 +427,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 // Cases:
 //  - input elements >> GPU procs --> do reduction in inner loop
 //  - reduction dimension fits into a single kernel --> launch it that way
-//  - reduction dimension requires multiple kernels --> use atomic add, to avoid temp mem alloc --is this any good?
+//  - reduction dimension requires multiple kernels --> use atomic add, to avoid temp mem alloc
 //  - PlusNode: reducing to a bias for small matrices
 //  - ScaleNode: big elementwise product reduced to a scalar (dot product)
+//    E.g. 3072 GPU procs:
+//    If >= 3072 reduced output values must be computed, just loop inside.
+//    If fewer, and the reduction per value does not fit into a single proc,
+//    then we break it across multiple procs, say, 24.
+//    This way we will need 24 atomicAdd()s of 3072/24 = 128 values each.
+//    If the reduction is along stride=1, we'd have 24 atomicAdd()s of 32 coalesced writes.
+//    Does not sound scary at all.
+// Precondition: matrix cannot at the same time participate in reduction and operation.
 #if 1
 C_size_t reductionDim = 1; // number of elements to reduce over
 for (C_size_t k = 0; k < reducingOpDimVector.size(); k++)
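A host-side simulation of the arithmetic in the comment above, with std::thread workers and std::atomic standing in for GPU procs and atomicAdd() (the 3072/24 split is taken from the comment; everything else is hypothetical):

    #include <atomic>
    #include <cstdio>
    #include <thread>
    #include <vector>

    int main()
    {
        const int n = 3072, chunks = 24, chunkSize = n / chunks; // 128 values each
        std::vector<int> x(n, 1);
        std::atomic<long long> out{0}; // the one reduced output value

        // Each "proc" reduces its 128-value chunk locally, then contributes
        // one atomic add -- 24 atomicAdd()s in total, as the comment estimates.
        std::vector<std::thread> pool;
        for (int c = 0; c < chunks; c++)
            pool.emplace_back([&x, &out, c, chunkSize] {
                long long partial = 0;
                for (int i = c * chunkSize; i < (c + 1) * chunkSize; i++)
                    partial += x[i];
                out.fetch_add(partial);
            });
        for (auto& t : pool)
            t.join();
        printf("sum = %lld\n", out.load()); // 3072
    }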

View file

@@ -223,6 +223,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     offsets[i] = shapes[i].GetOffset();
 }
+// enforce that in case of inverse broadcasting (reduction), the output must not be an input
+template<class ElemType>
+static bool CheckDifferentObject(const TensorView<ElemType> & a, const TensorView<ElemType> & b)
+{
+    if (&a == &b)
+        LogicError("Do{U,Bi,Ter}naryOpOf: When inverse broadcasting, output must not be an input.");
+    return true;
+}
 template<class ElemType>
 void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op)
 {
@@ -235,6 +244,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     SmallVector<size_t> regularOpDims, reducingOpDims;
     PrepareTensorOperands<ElemType,2>(array<TensorShape, 2> { a.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+    // output cannot be input when reducing
+    if (reducingOpDims.size() > 0)
+        CheckDifferentObject(a, *this);
     // now perform the operation
     GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
 }
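The reducingOpDims.size() > 0 test works because, on my reading of PrepareTensorOperands(), any dimension in which an input is larger than the output ends up in reducingOpDims, so an empty vector means a purely elementwise op. A hypothetical standalone version of that classification over plain dimension vectors:

    #include <cstdio>
    #include <vector>

    // Collect dims where an input broadcasts *into* the output (a reduction):
    // input extent > 1 but output extent is 1 (or the dim is absent in the output).
    static std::vector<size_t> reducingDims(const std::vector<size_t>& in,
                                            const std::vector<size_t>& out)
    {
        std::vector<size_t> r;
        for (size_t k = 0; k < in.size(); k++)
        {
            size_t o = k < out.size() ? out[k] : 1;
            if (o == 1 && in[k] > 1)
                r.push_back(in[k]);
        }
        return r;
    }

    int main()
    {
        auto r1 = reducingDims({ 10, 20 }, { 10 });     // c=[10], a=[10 x 20]
        auto r2 = reducingDims({ 10, 20 }, { 10, 20 }); // elementwise: no reduction
        printf("%zu %zu\n", r1.size(), r2.size());      // 1 0
    }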
@@ -250,6 +263,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     SmallVector<size_t> regularOpDims, reducingOpDims;
     PrepareTensorOperands<ElemType, 3>(array<TensorShape, 3> { a.GetShape(), b.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+    // output cannot be input when reducing
+    if (reducingOpDims.size() > 0)
+        CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this);
     GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
 }
@@ -264,6 +281,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     SmallVector<size_t> regularOpDims, reducingOpDims;
     PrepareTensorOperands<ElemType, 4>(array<TensorShape, 4> { a.GetShape(), b.GetShape(), c.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+    // output cannot be input when reducing
+    if (reducingOpDims.size() > 0)
+        CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this) && CheckDifferentObject(c, *this);
     GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), c.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
 }
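A side note on the CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this) form: since the helper either throws or returns true, the && chain is just a compact way to run every check in order, and the boolean result itself is discarded. A standalone sketch of the idiom (names hypothetical):

    #include <cstdio>
    #include <stdexcept>

    static bool checkDifferentObject(const void* x, const void* output)
    {
        if (x == output)
            throw std::logic_error("output must not be an input when reducing");
        return true; // returning true lets checks be chained with &&
    }

    int main()
    {
        int a = 0, b = 0, out = 0;
        // All distinct: both checks run, nothing is thrown.
        (void)(checkDifferentObject(&a, &out) && checkDifferentObject(&b, &out));
        // Aliased output: the first check throws before the second runs.
        try
        {
            (void)(checkDifferentObject(&out, &out) && checkDifferentObject(&b, &out));
        }
        catch (const std::logic_error& e)
        {
            printf("caught: %s\n", e.what());
        }
    }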

View file

@@ -48,7 +48,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 // c.AssignDiffOf(c,a) means c -= a,
 // and c.AddElementwiseProductOf(a, b, 1) means c += a .* b.
 // All operators support elementwise in-place operations, i.e. a, b, and c
-// may all reference the same underlying SOB.
+// may all reference the same underlying SOB, with one exception:
+// The output cannot be in-place and inverse-broadcasting at the same time.
+// E.g. with c=[10] and a=[10 x 20], c.AssignDiffOf(c,a) will fail.
+// In that case, you can use c.AddCopyOf(a,-1).
+// Aliasing is not detected, so don't pass distinct TensorView objects that
+// reference overlapping but not identical slices.
 // If beta == 0, c is not read out, i.e. it can be uninitialized or contain NaNs.
 // -------------------------------------------------------------------
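One consequence of the beta == 0 rule above is worth spelling out: the op must skip reading c entirely rather than multiply it by zero, because 0 * NaN is still NaN. A tiny plain-C++ illustration:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        float c = NAN, a = 2.0f, alpha = 1.0f, beta = 0.0f;
        float naive = beta * c + alpha * a;                              // NaN: c was read
        float right = (beta == 0.0f) ? alpha * a : beta * c + alpha * a; // 2: c skipped
        printf("naive = %g, right = %g\n", naive, right);
    }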