Improved formatting and comments
This commit is contained in:
Parent
6dce931c19
Commit
fec05bffe8
@@ -91,6 +91,19 @@ template <class ElemType>
     auto input = Input(inputIndex)->ValueTensorFor(rank, fr);
     auto output = ValueTensorFor(rank, fr.AllowBroadcast());

+    // POTENTIAL PROBLEM:
+    // For ReduceMin/Max there are combinations of input values for which the gradient is not defined, because the function has an edge at these points.
+    // E.g. for ReduceMin this is the case when the minimum input value is attained by several inputs at the same time.
+    // In these cases there is no correct gradient. The question is whether this could lead to any problems.
+    // Let's look at two scenarios where this might happen:
+    //
+    // * Scenario 1: The input comes from a layer of nodes, e.g. ReLU, and some of them might operate in the regime where they clip to a constant value.
+    //   In this case it's not a problem if the input gradient is somewhat off, as the derivative of the inputs in question will be zero.
+    //
+    // * Scenario 2: The input data comes directly from the training data. Here bad gradients don't matter, as we wouldn't want to propagate gradients to the training data.
+    //
+    // So, as we don't have a better solution yet and it probably has no impact, let's stay with the current solution.
+    // Also note that for Clip, Min, Max, and ReLU we have the same kind of problem.
     sliceInputGrad.AddCopyIfEqualOf(input, output, sliceOutputGrad);
     break;
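To make the tie case concrete, here is a minimal CPU sketch (an illustration only, not CNTK's actual implementation) of the copy-if-equal gradient rule that AddCopyIfEqualOf applies: every input element that equals the reduced output receives the output gradient, so at a tie the gradient is delivered to several positions at once.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main()
    {
        // A tie at the minimum: inputs 1 and 2 both attain the reduced value.
        std::vector<float> input = { 3.0f, 1.0f, 1.0f, 5.0f };
        float output = *std::min_element(input.begin(), input.end());

        float outputGrad = 1.0f; // incoming gradient of the reduced output
        std::vector<float> inputGrad(input.size(), 0.0f);
        for (size_t i = 0; i < input.size(); ++i)
            if (input[i] == output)          // the "copy if equal" condition
                inputGrad[i] += outputGrad;  // both tied inputs receive the full gradient

        for (float g : inputGrad)
            printf("%g ", g); // prints: 0 1 1 0
        printf("\n");
    }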
@@ -141,7 +154,7 @@ void ReduceElementsNode<ElemType>::ValidateOp()
     else if (m_operation == L"Min") m_reductionOp = ElementWiseOperator::opMin;

     // more here
-    else InvalidArgument("%ls was given an invalid operation code '%ls'. Allowed are: 'Sum'. And a few more soon.", NodeDescription().c_str(), m_operation.c_str());
+    else InvalidArgument("%ls was given an invalid operation code '%ls'. Allowed are: 'Sum', 'Max', 'Min'.", NodeDescription().c_str(), m_operation.c_str());
 }

 template <class ElemType>
@@ -267,8 +267,9 @@ struct TensorOps
 template <class ElemType> class BinaryOpConstants
 {
 public:
-    __device__ static ElemType NeutralValue(ElementWiseOperator op) {
-        return 0; //error, only the explicit instantiations below should be used.
+    __device__ static ElemType NeutralValue(ElementWiseOperator op)
+    {
+        return 0; // error, only the explicit instantiations below should be used.
     }
 };
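The pattern here — a primary template whose NeutralValue returns a dummy value, with per-type specializations supplying the real identity elements — can be sketched on the host as follows. This is an illustrative reconstruction, not CNTK's code; note that it deliberately uses -FLT_MAX as the max-identity (see the remark after the double specialization below).

    #include <cfloat>
    #include <cstdio>

    enum class Op { Sum, Min, Max };

    // Primary template: returning 0 flags a missing specialization,
    // mirroring the "error" comment in BinaryOpConstants above.
    template <class T> struct NeutralValue
    {
        static T get(Op) { return 0; }
    };

    // Explicit specialization for float: the identity element of each reduction.
    template <> struct NeutralValue<float>
    {
        static float get(Op op)
        {
            switch (op)
            {
            case Op::Max: return -FLT_MAX; // nothing compares below this
            case Op::Min: return FLT_MAX;  // nothing compares above this
            case Op::Sum: return 0;
            }
            return 0; // unreachable
        }
    };

    int main()
    {
        printf("%g\n", NeutralValue<float>::get(Op::Max)); // -3.40282e+38
    }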
@@ -278,14 +279,10 @@ public:
     __device__ static float NeutralValue(ElementWiseOperator op) {
         switch (op)
         {
-        case ElementWiseOperator::opMax:
-            return FLT_MIN;
-        case ElementWiseOperator::opMin:
-            return FLT_MAX;
-        case ElementWiseOperator::opSum:
-            return 0;
-        default:
-            return 0; // error
+        case ElementWiseOperator::opMax: return FLT_MIN;
+        case ElementWiseOperator::opMin: return FLT_MAX;
+        case ElementWiseOperator::opSum: return 0;
+        default: return 0; // error
         }
     }
 };
@@ -296,14 +293,10 @@ public:
     __device__ static double NeutralValue(ElementWiseOperator op) {
         switch (op)
         {
-        case ElementWiseOperator::opMax:
-            return DBL_MIN;
-        case ElementWiseOperator::opMin:
-            return DBL_MAX;
-        case ElementWiseOperator::opSum:
-            return 0;
-        default:
-            return 0; // error
+        case ElementWiseOperator::opMax: return DBL_MIN;
+        case ElementWiseOperator::opMin: return DBL_MAX;
+        case ElementWiseOperator::opSum: return 0;
+        default: return 0; // error
         }
     }
 };
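One observation on these two specializations (an editorial note, not part of the commit): FLT_MIN and DBL_MIN are the smallest positive normalized values, not the most negative ones, so any input below them would lose against the opMax neutral value. -FLT_MAX / -DBL_MAX (or std::numeric_limits<T>::lowest()) are the safe max-identities. A quick check:

    #include <cfloat>
    #include <cstdio>
    #include <limits>

    int main()
    {
        printf("FLT_MIN  = %g\n", FLT_MIN);   // 1.17549e-38 -- positive!
        printf("-FLT_MAX = %g\n", -FLT_MAX);  // -3.40282e+38
        printf("lowest   = %g\n", std::numeric_limits<float>::lowest()); // same as -FLT_MAX
    }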
@@ -534,9 +527,8 @@ struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/true, /*k=*/-1>
     for (CUDA_LONG i = 256; i; i >>= 1)
     {
         if (tid < i && tid + i < tids)
         {
             AggregationOp<volatile ReduceElemType, volatile ReduceElemType>::Update(accumulators[tid], accumulators[tid + i], reductionOp);
         }

         if (0 + i < tids)
             __syncthreads(); // sync if condition true for at least one thread
         // TODO: use volatile* and then we can skip the __syncthreads() for the last 32 values. See Amit's allreduce() function implementation in MatrixQuantizer_kernel.cu.
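The TODO refers to the classic warp-synchronous tail of a tree reduction: once at most 32 values remain, all active threads sit in a single warp, so the per-round __syncthreads() can be replaced by cheaper warp-level synchronization. A hedged sketch of that tail follows (a hypothetical helper, not the code from MatrixQuantizer_kernel.cu; on Volta and later GPUs the explicit __syncwarp() calls are required because warps no longer execute in lockstep).

    // Call with the 32 lowest threads of the block (tid < 32); `acc` holds one
    // partial sum per thread and must have at least 64 valid entries.
    __device__ void warpReduceTail(volatile float* acc, int tid)
    {
        acc[tid] = acc[tid] + acc[tid + 32]; __syncwarp();
        acc[tid] = acc[tid] + acc[tid + 16]; __syncwarp();
        acc[tid] = acc[tid] + acc[tid + 8];  __syncwarp();
        acc[tid] = acc[tid] + acc[tid + 4];  __syncwarp();
        acc[tid] = acc[tid] + acc[tid + 2];  __syncwarp();
        acc[tid] = acc[tid] + acc[tid + 1];  __syncwarp();
        // acc[0] now holds the reduced value for the warp's range.
    }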
@@ -765,7 +757,7 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
     _launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(
         beta, pointers, alpha, op, reductionOp,
         regularOpStrides, regularStrides, NN,
-        reducingOpDims, reducingStrides,/*reductionBegin*/ 0, reductionChunkSize);
+        reducingOpDims, reducingStrides, /*reductionBegin*/ 0, reductionChunkSize);
 }
 // --- case (b)
 // Reduction across blocks. This is the difficult one.
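The launch above uses the four-argument <<<grid, block, sharedMemBytes, stream>>> form, sizing dynamic shared memory as one ReduceElemType slot per thread for the in-block tree reduction. A self-contained sketch of the same shape (hypothetical kernel and names, not CNTK's):

    #include <cstdio>

    // One float of dynamic shared memory per thread, sized at launch time,
    // mirroring the `numThreadsX * sizeof(ReduceElemType)` argument above.
    __global__ void sumKernel(const float* in, float* out, int n)
    {
        extern __shared__ float acc[];          // dynamic shared memory
        int tid = threadIdx.x;
        acc[tid] = (tid < n) ? in[tid] : 0.0f;  // one partial per thread
        __syncthreads();
        for (int i = blockDim.x / 2; i; i >>= 1) // halving tree reduction
        {
            if (tid < i)
                acc[tid] += acc[tid + i];
            __syncthreads();
        }
        if (tid == 0)
            *out = acc[0];
    }

    int main()
    {
        const int n = 256;
        float host[n];
        for (int i = 0; i < n; ++i) host[i] = 1.0f;
        float *in, *out;
        cudaMalloc(&in, n * sizeof(float));
        cudaMalloc(&out, sizeof(float));
        cudaMemcpy(in, host, n * sizeof(float), cudaMemcpyHostToDevice);
        // <<<blocks, threads, sharedMemBytes>>> -- same shape as the launch above
        sumKernel<<<1, n, n * sizeof(float)>>>(in, out, n);
        float result;
        cudaMemcpy(&result, out, sizeof(float), cudaMemcpyDeviceToHost);
        printf("%g\n", result); // 256
        cudaFree(in);
        cudaFree(out);
    }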