(deleted some obsolete debug code)
This commit is contained in:
Parent: 9372b6afdd
Commit: d11532cf69
@@ -4470,6 +4470,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         return LaunchUnaryTensorOp<ElemType>(beta, a.m_pArray + offsets[0], m_pArray + offsets[1], alpha, op, regularOpDims[0]);

     // special case: reducing a matrix onto a column vector; can be done with SGEMM
+    // Note: A minor risk is that with this, our own reduction function will rarely be used.
+    // That function was tested to give the same results with 'double', and nearly the same with 'float' (different summation order matters).
     else if (op == ElementWiseOperator::opCopy &&                                                    // we are just adding to target without any further operation
              regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 &&  // we are processing a column
              reducingOpDims.size() == 1 && reducingStrides[0][0] >= (ptrdiff_t)regularOpDims[0])     // reducing across columns and no overlap
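For context on the special case above: summing an m-by-n column-major matrix across its columns into a column vector is the same as multiplying it by a length-n vector of ones, which is why this pattern can be handed to SGEMM instead of the launcher's own reduction kernel. Below is a minimal, self-contained cuBLAS sketch of that equivalence; it is only an illustration, not the wrapper CNTK actually calls, and error checking is omitted for brevity.

    // Sketch: y = alpha * (sum of A's columns) + beta * y, expressed as SGEMM with a ones vector.
    // Assumes column-major storage (cuBLAS convention); sizes are made up for the example.
    #include <cublas_v2.h>
    #include <cuda_runtime.h>
    #include <cstdio>
    #include <vector>

    int main()
    {
        const int m = 3, n = 4;                                 // A is m x n, y is m x 1
        std::vector<float> hA(m * n), hOnes(n, 1.0f), hY(m, 0.0f);
        for (int i = 0; i < m * n; i++) hA[i] = (float)i;

        float *dA, *dOnes, *dY;
        cudaMalloc(&dA, m * n * sizeof(float));
        cudaMalloc(&dOnes, n * sizeof(float));
        cudaMalloc(&dY, m * sizeof(float));
        cudaMemcpy(dA, hA.data(), m * n * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(dOnes, hOnes.data(), n * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(dY, hY.data(), m * sizeof(float), cudaMemcpyHostToDevice);

        cublasHandle_t handle;
        cublasCreate(&handle);
        const float alpha = 1.0f, beta = 0.0f;
        // y(m x 1) = alpha * A(m x n) * ones(n x 1) + beta * y, i.e. reduce A across columns
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha, dA, m, dOnes, n, &beta, dY, m);

        cudaMemcpy(hY.data(), dY, m * sizeof(float), cudaMemcpyDeviceToHost);
        for (int i = 0; i < m; i++) printf("y[%d] = %g\n", i, hY[i]);

        cublasDestroy(handle);
        cudaFree(dA); cudaFree(dOnes); cudaFree(dY);
        return 0;
    }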
@@ -468,7 +468,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                                     const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrideVectors,
                                     const SmallVector<size_t> & reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N> & reducingStrideVectors)
     {
-        //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         // copy all parameters to CUDA-compatible data structures
         FixedArray<ElemType*, N> pointers(pointerVector);
         SmallVector<C_size_t> regularOpStrideVector;    // kernel needs the strides for converting thread index back to multi-dimensional tensor index
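The regularOpStrideVector built here is what lets the kernel turn a flat thread id back into a multi-dimensional tensor index, from which each operand's memory offset follows through that operand's own stride table. A small host-side sketch of that index arithmetic, with made-up dims and strides standing in for the FixedArray/FixedMatrix parameters:

    // Sketch: decompose a linear element id into per-dimension indices using the
    // cumulative "op strides" (products of the operation's dims), then map those
    // indices to each operand's memory offset via that operand's strides.
    #include <array>
    #include <cstddef>
    #include <cstdio>

    int main()
    {
        // hypothetical 3-D op of shape 2 x 3 x 4 (24 elements)
        const std::array<std::size_t, 3> opDims = { 2, 3, 4 };
        std::array<std::size_t, 3> opStrides;           // opStrides[k] = product of dims before k
        std::size_t p = 1;
        for (std::size_t k = 0; k < opDims.size(); k++) { opStrides[k] = p; p *= opDims[k]; }

        // one operand stored densely, another with larger strides (e.g. a sub-view)
        const std::array<std::ptrdiff_t, 3> strideA = { 1, 2, 6 };
        const std::array<std::ptrdiff_t, 3> strideB = { 1, 16, 48 };

        for (std::size_t id = 0; id < p; id++)          // 'id' plays the role of the thread index
        {
            std::ptrdiff_t offA = 0, offB = 0;
            std::size_t rest = id;
            for (std::size_t k = opDims.size(); k-- > 0; )  // peel off the highest dimension first
            {
                std::size_t index = rest / opStrides[k];    // index along dimension k
                rest              = rest % opStrides[k];
                offA += (std::ptrdiff_t)index * strideA[k];
                offB += (std::ptrdiff_t)index * strideB[k];
            }
            if (id < 3)
                printf("id=%zu -> offsets A=%td B=%td\n", id, offA, offB);
        }
        return 0;
    }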
@@ -483,13 +482,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         FixedArray<C_unsigned_int, M> reducingOpDims(reducingOpDimVector);
         FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);

-        //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         // launch the kernel
         CUDA_LONG NN = (CUDA_LONG)numElements;      // linear space identifying each individual input element
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));

-        //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         // do some optimization for reductions
         // Cases:
         //  - #output elements >= GPU procs  -->  use one proc per element, do reduction in inner loop
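The do_sync branches above bracket the kernel launch with a CUDA event so the host can block until the kernel has finished (handy for catching launch failures right at the call site). A stripped-down sketch of that pattern, with a dummy kernel standing in for the tensor op and a plain check() replacing CUDA_CALL:

    // Sketch: event-based synchronization around a kernel launch, mirroring the
    // do_sync branches in the launcher above.
    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    __global__ void dummyKernel(float* p) { p[threadIdx.x] += 1.0f; }

    static void check(cudaError_t rc)
    {
        if (rc != cudaSuccess) { printf("%s\n", cudaGetErrorString(rc)); std::exit(1); }
    }

    int main()
    {
        const bool do_sync = true;                  // stands in for the launcher's do_sync flag
        float* d = nullptr;
        check(cudaMalloc(&d, 32 * sizeof(float)));
        check(cudaMemset(d, 0, 32 * sizeof(float)));

        cudaEvent_t done = nullptr;
        if (do_sync) check(cudaEventCreate(&done));

        dummyKernel<<<1, 32>>>(d);                  // stand-in for _launchTensorOpWithReduction

        if (do_sync) check(cudaEventRecord(done));
        if (do_sync) check(cudaEventSynchronize(done)); // host waits until the kernel has completed
        if (do_sync) check(cudaEventDestroy(done));

        check(cudaFree(d));
        return 0;
    }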
@@ -509,12 +506,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         for (C_size_t k = 0; k < reducingOpDimVector.size(); k++)
             reductionDim *= (C_size_t)reducingOpDimVector[k];
         let & props = GridDim::GetDeviceProps();
-        //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         GridDim grid(NN);
-        //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         if (reductionDim > 1 && grid.m_blocksPerGrid < props.multiProcessorCount /* && NN == 10 && reductionDim <= GridDim::maxThreadsPerBlock*/)
         {
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             // we are reducing and are underutilizing the multiprocs we have: get more parallelism by doing reduction in parallel
             // Change of strategy: All NN elements get their own block. Reduction gets split over blocks as well.

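The branch above is the heuristic that picks a strategy: if there is a real reduction (reductionDim > 1) but so few output elements that the default grid would leave most multiprocessors idle, the reduction itself gets parallelized across blocks. A toy calculation of that check, using made-up device numbers (24 SMs, 512-thread blocks):

    // Sketch: the underutilization check that decides between "one thread per output
    // element, reduce in a loop" and "split the reduction itself across blocks".
    #include <cstdio>

    int main()
    {
        const unsigned multiProcessorCount = 24;        // props.multiProcessorCount (made up)
        const unsigned maxThreadsPerBlock  = 512;       // GridDim-style default block size (made up)
        const unsigned NN                  = 10;        // # output elements (tiny)
        const unsigned reductionDim        = 4096;      // # inputs reduced into each output

        const unsigned blocksPerGrid = (NN + maxThreadsPerBlock - 1) / maxThreadsPerBlock;  // = 1

        if (reductionDim > 1 && blocksPerGrid < multiProcessorCount)
            printf("only %u block(s) for %u SMs -> parallelize the reduction as well\n",
                   blocksPerGrid, multiProcessorCount);
        else
            printf("enough output elements -> one thread per element, reduce inside the thread\n");
        return 0;
    }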
@@ -522,55 +516,39 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             // We increase #blocks by that factor by breaking reduction into that many chunks.
             let numReductionChunks = CeilDiv(props.multiProcessorCount, NN);

-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)props.maxGridSize[0]);
             // NN may be too large for a single dimension
             let blockXOverBy = CeilDiv(NN, props.maxGridSize[0]);
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)blockXOverBy);
             let numBlocksX = CeilDiv(NN, blockXOverBy);
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)numBlocksX);
             let numBlocksY = CeilDiv(NN, numBlocksX);
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             let numBlocksZ = numReductionChunks;
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             // Block dim is now:
             //  - X, Y: such that X*Y covers NN
             //  - Z: reduction chunks

             // reduction goes into thread dim X
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             let reductionChunkSize = CeilDiv(reductionDim, numReductionChunks);
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             let numThreadsX = min(reductionChunkSize, GridDim::maxThreadsPerBlock);    // any that's over will be done by looping inside the kernel
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);

-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             if (beta == 1 || numBlocksZ == 1)
             {
-                //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
                 _launchTensorOpWithReduction<ElemType, N, M, K> <<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
-                //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             }
             else
             {
-                //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
                 // We need more than one chunk, we will use atomicAdd().
                 // First reset/pre-multiply input; then do the remaining chunks using atomicAdd().
                 _launchTensorOpWithReduction<ElemType, N, M, K> <<<dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
                 _launchTensorOpWithReduction<ElemType, N, M, K> <<<dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize);
-                //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             }
         }
         else
         {
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             // we got enough elements to generate: do one element per thread, and reduction inside
             _launchTensorOp<ElemType, N, M, K> <<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         }
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
-        //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
     }

     // -----------------------------------------------------------------------
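To make the grid-shaping arithmetic in this last hunk concrete, here is a hedged host-side walk-through using the same made-up device numbers as before (24 SMs, 512-thread blocks, 65535 as the grid's x-dimension limit); CeilDiv mirrors the helper used by the launcher:

    // Sketch: the grid-shaping arithmetic used when the reduction is split across blocks.
    #include <algorithm>
    #include <cstdio>

    static unsigned CeilDiv(unsigned a, unsigned b) { return (a + b - 1) / b; }

    int main()
    {
        const unsigned multiProcessorCount = 24;        // made-up device properties
        const unsigned maxThreadsPerBlock  = 512;
        const unsigned maxGridSizeX        = 65535;     // props.maxGridSize[0] on older GPUs
        const unsigned NN                  = 10;        // # output elements
        const unsigned reductionDim        = 4096;      // # inputs per output

        // break the reduction into enough chunks to occupy all SMs
        const unsigned numReductionChunks = CeilDiv(multiProcessorCount, NN);           // 3
        // NN may exceed the x-dimension limit, so spread it over x and y
        const unsigned blockXOverBy = CeilDiv(NN, maxGridSizeX);                        // 1
        const unsigned numBlocksX   = CeilDiv(NN, blockXOverBy);                        // 10
        const unsigned numBlocksY   = CeilDiv(NN, numBlocksX);                          // 1
        const unsigned numBlocksZ   = numReductionChunks;                               // 3
        // each block reduces one chunk of the reduction dimension in thread dim x
        const unsigned reductionChunkSize = CeilDiv(reductionDim, numReductionChunks);  // 1366
        const unsigned numThreadsX = std::min(reductionChunkSize, maxThreadsPerBlock);  // 512

        printf("grid = (%u, %u, %u), block = (%u, 1, 1), chunk = %u\n",
               numBlocksX, numBlocksY, numBlocksZ, numThreadsX, reductionChunkSize);
        // With numBlocksZ > 1 and beta != 1, the launcher above issues two kernels:
        // z-chunk 0 applies beta to the output once, and the remaining z-chunks
        // accumulate their partial sums into it with atomicAdd().
        return 0;
    }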