Improved formatting and comments

Thilo Will 2016-07-22 15:51:15 +02:00
Parent 6dce931c19
Commit fec05bffe8
2 changed files with 27 additions and 22 deletions

View file

@@ -91,6 +91,19 @@ template <class ElemType>
auto input = Input(inputIndex)->ValueTensorFor(rank, fr);
auto output = ValueTensorFor(rank, fr.AllowBroadcast());
// POTENTIAL PROBLEM:
// For ReduceMin/Max there are combinations of input values where the gradient is not defined, because the function has an edge at these points.
// E.g. for ReduceMin this is the case when the minimum input value is attained by several inputs at the same time.
// In these cases there is no correct gradient. The question is whether this could lead to any problems.
// Let's look at two scenarios where this might happen:
//
// * Scenario 1: The input comes from a layer of nodes, e.g. ReLU, some of which might operate in the regime where they clip to a constant value.
// In this case it is not a problem if the input gradient is somewhat off, as the derivative of the affected input nodes will be zero anyway.
//
// * Scenario 2: The input data comes directly from the training data. Here bad gradients don't matter, as we wouldn't want to propagate gradients to the training data anyway.
//
// So, as we don't have a better solution yet and it probably has no impact, let's stay with the current solution.
// Also note that for Clip, Min, Max and ReLU we have the same kind of problem.
sliceInputGrad.AddCopyIfEqualOf(input, output, sliceOutputGrad);
break;
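The edge-case rule described in the comment above is easiest to see element-wise. Below is a minimal sketch (plain C++ on scalars, not CNTK's TensorView API; the function name ReduceMinBackward is hypothetical) of what sliceInputGrad.AddCopyIfEqualOf(input, output, sliceOutputGrad) amounts to for ReduceMin: the output gradient is added at every input position that attains the minimum, so at a tie all tied inputs receive the full gradient.

#include <cstddef>
#include <vector>

// Element-wise sketch of the ReduceMin gradient: "copy (add) if equal".
void ReduceMinBackward(const std::vector<float>& input,  // values that were reduced
                       float output,                     // the minimum over `input`
                       float outputGrad,                 // incoming gradient dL/dOutput
                       std::vector<float>& inputGrad)    // dL/dInput, accumulated into
{
    for (std::size_t i = 0; i < input.size(); ++i)
        if (input[i] == output)         // only inputs attaining the minimum get gradient
            inputGrad[i] += outputGrad; // at a tie, each tied input gets the full gradient
}

With a unique minimum this is the exact derivative; the tie case is precisely the undefined-gradient situation discussed in the comment.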
@@ -141,7 +154,7 @@ void ReduceElementsNode<ElemType>::ValidateOp()
else if (m_operation == L"Min") m_reductionOp = ElementWiseOperator::opMin;
// more here
else InvalidArgument("%ls was given an invalid operation code '%ls'. Allowed are: 'Sum'. And a few more soon.", NodeDescription().c_str(), m_operation.c_str());
else InvalidArgument("%ls was given an invalid operation code '%ls'. Allowed are: 'Sum', 'Max', 'Min'.", NodeDescription().c_str(), m_operation.c_str());
}
template <class ElemType>

View file

@@ -267,8 +267,9 @@ struct TensorOps
template <class ElemType> class BinaryOpConstants
{
public:
__device__ static ElemType NeutralValue(ElementWiseOperator op) {
return 0; //error, only the explicit instantiations below should be used.
__device__ static ElemType NeutralValue(ElementWiseOperator op)
{
return 0; // error, only the explicit instantiations below should be used.
}
};
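As an aside, the return 0; // error in the primary template only signals misuse at run time, and only if someone notices the wrong result. A hypothetical compile-time alternative (not what this commit does) is to make the primary template non-instantiable:

template <class ElemType> class BinaryOpConstants
{
public:
    __device__ static ElemType NeutralValue(ElementWiseOperator /*op*/)
    {
        // Dependent-false trick: fires only when this primary template is instantiated,
        // i.e. for any ElemType other than the float/double specializations below.
        static_assert(sizeof(ElemType) == 0, "NeutralValue: only float and double are supported");
        return 0;
    }
};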
@@ -278,14 +279,10 @@ public:
__device__ static float NeutralValue(ElementWiseOperator op) {
switch (op)
{
case ElementWiseOperator::opMax:
return FLT_MIN;
case ElementWiseOperator::opMin:
return FLT_MAX;
case ElementWiseOperator::opSum:
return 0;
default:
return 0; // error
case ElementWiseOperator::opMax: return FLT_MIN;
case ElementWiseOperator::opMin: return FLT_MAX;
case ElementWiseOperator::opSum: return 0;
default: return 0; // error
}
}
};
@@ -296,14 +293,10 @@ public:
__device__ static double NeutralValue(ElementWiseOperator op) {
switch (op)
{
case ElementWiseOperator::opMax:
return DBL_MIN;
case ElementWiseOperator::opMin:
return DBL_MAX;
case ElementWiseOperator::opSum:
return 0;
default:
return 0; // error
case ElementWiseOperator::opMax: return DBL_MIN;
case ElementWiseOperator::opMin: return DBL_MAX;
case ElementWiseOperator::opSum: return 0;
default: return 0; // error
}
}
};
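The neutral value is the identity element of the reduction: seeding an accumulator with it guarantees that lanes which contribute no data leave the result unchanged. A minimal usage sketch (the function name ReduceWithNeutral is hypothetical; it assumes the BinaryOpConstants specializations above are in scope):

__device__ float ReduceWithNeutral(const float* data, int n, ElementWiseOperator op)
{
    float acc = BinaryOpConstants<float>::NeutralValue(op); // identity of the reduction
    for (int i = 0; i < n; ++i)
    {
        if      (op == ElementWiseOperator::opMax) acc = data[i] > acc ? data[i] : acc;
        else if (op == ElementWiseOperator::opMin) acc = data[i] < acc ? data[i] : acc;
        else if (op == ElementWiseOperator::opSum) acc += data[i];
    }
    return acc;
}

One caveat on the constants above: FLT_MIN and DBL_MIN are the smallest positive normalized values, not the most negative ones, so as an identity for max they are only correct when all inputs are non-negative; for general inputs the identity would be -FLT_MAX/-DBL_MAX (or -infinity).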
@@ -534,9 +527,8 @@ struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/true, /*k=*/-1>
for (CUDA_LONG i = 256; i; i >>= 1)
{
if (tid < i && tid + i < tids)
{
AggregationOp<volatile ReduceElemType, volatile ReduceElemType>::Update(accumulators[tid], accumulators[tid + i], reductionOp);
}
if (0 + i < tids)
__syncthreads(); // sync if condition true for at least one thread
// TODO: use volatile* and then we can skip the __syncthreads() for the last 32 values. See Amit's allreduce() function implementation in MatrixQuantizer_kernel.cu.
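Note why the guard in front of __syncthreads() is written this way: the condition 0 + i < tids does not depend on tid, so all threads in the block take the same branch, and the barrier is either reached by every thread or by none (calling __syncthreads() under a divergent condition would be undefined behaviour). A self-contained sketch of the same tree-reduction pattern, using a plain sum in place of the templated AggregationOp (the kernel name BlockSum is hypothetical):

__global__ void BlockSum(const float* in, float* out, int tids) // tids = active threads
{
    extern __shared__ float accumulators[]; // one partial value per thread
    int tid = threadIdx.x;
    accumulators[tid] = tid < tids ? in[tid] : 0.0f; // 0 is the neutral value for sum
    __syncthreads();
    for (int i = 256; i; i >>= 1) // assumes blockDim.x <= 512, as in the kernel above
    {
        if (tid < i && tid + i < tids)    // fold the upper half onto the lower half
            accumulators[tid] += accumulators[tid + i];
        if (0 + i < tids)                 // uniform across the block: safe barrier
            __syncthreads();
    }
    if (tid == 0)
        *out = accumulators[0];
}

Launched e.g. as BlockSum<<<1, 512, 512 * sizeof(float)>>>(d_in, d_out, n); for n <= 512.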
@@ -765,7 +757,7 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(
beta, pointers, alpha, op, reductionOp,
regularOpStrides, regularStrides, NN,
reducingOpDims, reducingStrides,/*reductionBegin*/ 0, reductionChunkSize);
reducingOpDims, reducingStrides, /*reductionBegin*/ 0, reductionChunkSize);
}
// --- case (b)
// Reduction across blocks. This is the difficult one.
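For orientation, case (b) arises when the reduction no longer fits into one thread block. A common scheme, sketched below for a plain sum (a generic illustration, not CNTK's actual kernel; PartialSums is a hypothetical name): every block reduces one chunk of the input to a single partial result, and a second launch reduces the array of partials the same way.

__global__ void PartialSums(const float* in, float* partials, int n)
{
    extern __shared__ float smem[];
    int tid = threadIdx.x;
    float acc = 0.0f; // neutral value for sum
    // Grid-stride loop: each thread folds several input elements into a register.
    for (int i = blockIdx.x * blockDim.x + tid; i < n; i += gridDim.x * blockDim.x)
        acc += in[i];
    smem[tid] = acc;
    __syncthreads();
    for (int i = blockDim.x / 2; i; i >>= 1) // in-block tree reduction (power-of-two blockDim)
    {
        if (tid < i)
            smem[tid] += smem[tid + i];
        __syncthreads(); // unconditional here: uniform by construction
    }
    if (tid == 0)
        partials[blockIdx.x] = smem[0]; // one partial per block
}

A second launch over partials (with a single block) then produces the final value; the reductionBegin/reductionChunkSize parameters visible above play the role of the chunking.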