(deleted some obsolete debug code)

Frank Seide 2015-12-30 19:55:06 -08:00
Parent 9372b6afdd
Commit d11532cf69
2 changed files with 2 additions and 22 deletions

View file

@@ -4470,6 +4470,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return LaunchUnaryTensorOp<ElemType>(beta, a.m_pArray + offsets[0], m_pArray + offsets[1], alpha, op, regularOpDims[0]);
// special case: reducing a matrix onto a column vector; can be done with SGEMM
// Note: A minor risk is that with this, our own reduction function will rarely be used.
// That function was tested to give the same results with 'double', and nearly the same with 'float' (different summation order matters).
else if (op == ElementWiseOperator::opCopy && // we are just adding to target without any further operation
regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && // we are processing a column
reducingOpDims.size() == 1 && reducingStrides[0][0] >= (ptrdiff_t)regularOpDims[0]) // reducing across columns and no overlap
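
For context on the SGEMM fast path referenced in the comment above: summing the columns of a matrix into a column vector is the same as multiplying the matrix by a vector of ones, so the whole reduction can be handed to a single BLAS call. A minimal sketch of that idea with cuBLAS, assuming a column-major float matrix and omitting handle setup and error checking; the function name and signature are illustrative, not CNTK's:

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <vector>

// Illustrative only: reduce the n columns of the m x n column-major matrix A
// into the length-m vector y, computing y = alpha * A * ones(n) + beta * y.
void ReduceColumnsViaGemv(cublasHandle_t handle, const float* A, int m, int n,
                          float* y, float alpha, float beta)
{
    std::vector<float> hostOnes(n, 1.0f);   // the ones vector that realizes the column sum
    float* ones = nullptr;
    cudaMalloc(&ones, n * sizeof(float));
    cudaMemcpy(ones, hostOnes.data(), n * sizeof(float), cudaMemcpyHostToDevice);
    // one GEMV call performs the entire reduction across columns
    cublasSgemv(handle, CUBLAS_OP_N, m, n, &alpha, A, /*lda=*/m, ones, 1, &beta, y, 1);
    cudaFree(ones);
}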

View file

@@ -468,7 +468,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrideVectors,
const SmallVector<size_t> & reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N> & reducingStrideVectors)
{
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// copy all parameters to CUDA-compatible data structures
FixedArray<ElemType*, N> pointers(pointerVector);
SmallVector<C_size_t> regularOpStrideVector; // kernel needs the strides for converting thread index back to multi-dimensional tensor index
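
A note on the copies above: CUDA kernel arguments are passed by value and must be trivially copyable, so dynamically sized host-side vectors are first packed into fixed-capacity wrappers such as FixedArray and FixedMatrix before the launch. A simplified stand-in for such a wrapper is sketched below; the name and layout are illustrative, not CNTK's actual FixedArray:

#include <cassert>
#include <cstddef>
#include <vector>

// Fixed-capacity array that a kernel can receive by value.
// The plain C array member keeps the type trivially copyable.
template <typename T, size_t Capacity>
struct FixedArraySketch
{
    T items[Capacity];
    size_t count;
    explicit FixedArraySketch(const std::vector<T>& v) : count(v.size())
    {
        assert(v.size() <= Capacity);               // caller must respect the compile-time capacity
        for (size_t i = 0; i < v.size(); i++)
            items[i] = v[i];
    }
    __host__ __device__ const T& operator[](size_t i) const { return items[i]; }
};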
@@ -483,13 +482,11 @@
FixedArray<C_unsigned_int, M> reducingOpDims(reducingOpDimVector);
FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// launch the kernel
CUDA_LONG NN = (CUDA_LONG)numElements; // linear space identifying each individual input element
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// do some optimization for reductions
// Cases:
// - #output elements >= GPU procs --> use one proc per element, do reduction in inner loop
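
The do_sync path above wraps the launches that follow in a CUDA event so the host can optionally block until the kernel work has completed. The same pattern in isolation, with the launches elided and error checking omitted for brevity (LaunchMaybeSync is a placeholder name):

#include <cuda_runtime.h>

// Optional host synchronization: create an event before launching,
// then record it and wait on it once the work has been enqueued.
void LaunchMaybeSync(bool do_sync)
{
    cudaEvent_t done = nullptr;
    if (do_sync)
        cudaEventCreate(&done);

    // ... kernel launch(es) would go here ...

    if (do_sync)
    {
        cudaEventRecord(done);        // enqueue the event behind the launched work
        cudaEventSynchronize(done);   // block the host until that work is done
        cudaEventDestroy(done);
    }
}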
@@ -509,12 +506,9 @@
for (C_size_t k = 0; k < reducingOpDimVector.size(); k++)
reductionDim *= (C_size_t)reducingOpDimVector[k];
let & props = GridDim::GetDeviceProps();
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
GridDim grid(NN);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
if (reductionDim > 1 && grid.m_blocksPerGrid < props.multiProcessorCount /* && NN == 10 && reductionDim <= GridDim::maxThreadsPerBlock*/)
{
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// we are reducing and are underutilizing the multiprocs we have: get more parallelism by doing reduction in parallel
// Change of strategy: All NN elements get their own block. Reduction gets split over blocks as well.
@@ -522,55 +516,39 @@
// We increase #blocks by that factor by breaking reduction into that many chunks.
let numReductionChunks = CeilDiv(props.multiProcessorCount, NN);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)props.maxGridSize[0]);
// NN may be too large for a single dimension
let blockXOverBy = CeilDiv(NN, props.maxGridSize[0]);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)blockXOverBy);
let numBlocksX = CeilDiv(NN, blockXOverBy);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)numBlocksX);
let numBlocksY = CeilDiv(NN, numBlocksX);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
let numBlocksZ = numReductionChunks;
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// Block dim is now:
// - X, Y: such that X*Y covers NN
// - Z: reduction chunks
// reduction goes into thread dim X
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
let reductionChunkSize = CeilDiv(reductionDim, numReductionChunks);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
let numThreadsX = min(reductionChunkSize, GridDim::maxThreadsPerBlock); // any that's over will be done by looping inside the kernel
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
if (beta == 1 || numBlocksZ == 1)
{
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
_launchTensorOpWithReduction<ElemType, N, M, K> << <dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream >> >(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
}
else
{
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// We need more than one chunk, we will use atomicAdd().
// First reset/pre-multiply input; then do the remaining chunks using atomicAdd().
_launchTensorOpWithReduction<ElemType, N, M, K> << <dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream >> >(beta, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
_launchTensorOpWithReduction<ElemType, N, M, K> << <dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream >> >(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
}
}
else
{
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// we got enough elements to generate: do one element per thread, and reduction inside
_launchTensorOp<ElemType, N, M, K> << <grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >> >(beta, pointers, alpha, op, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
}
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
}
// -----------------------------------------------------------------------
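
For context on the beta handling in the chunked branch above: when the reduction is split over more than one Z-chunk, beta can only be applied once, so the first launch honors beta and the remaining chunks accumulate into the same outputs with atomicAdd(). A simplified, self-contained sketch of that scheme follows; it applies beta with a separate tiny kernel rather than with the first chunk, uses hypothetical names (scaleByBeta, reduceChunkAtomic), and assumes a power-of-two block size. It illustrates the idea only, not the CNTK kernel:

#include <cuda_runtime.h>

__global__ void scaleByBeta(float* out, float beta) { *out *= beta; }   // apply beta exactly once

// Each block (indexed by blockIdx.z) reduces one chunk of the input and
// adds its partial sum to the shared output with atomicAdd().
__global__ void reduceChunkAtomic(const float* in, float* out, int n, int chunkSize)
{
    extern __shared__ float partial[];
    int chunkBegin = blockIdx.z * chunkSize;
    int chunkEnd = min(chunkBegin + chunkSize, n);
    float sum = 0;
    for (int i = chunkBegin + threadIdx.x; i < chunkEnd; i += blockDim.x)
        sum += in[i];
    partial[threadIdx.x] = sum;
    __syncthreads();
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2)   // tree reduction; blockDim.x must be a power of two
    {
        if (threadIdx.x < stride)
            partial[threadIdx.x] += partial[threadIdx.x + stride];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        atomicAdd(out, partial[0]);
}

// Host side: scale the output by beta first, then give each chunk its own block in the Z dimension.
void reduceWithChunks(const float* d_in, float* d_out, int n, float beta)
{
    const int numChunks = 4, threads = 256;                  // example values
    const int chunkSize = (n + numChunks - 1) / numChunks;   // same CeilDiv idea as in the code above
    scaleByBeta<<<1, 1>>>(d_out, beta);
    reduceChunkAtomic<<<dim3(1, 1, numChunks), threads, threads * sizeof(float)>>>(d_in, d_out, n, chunkSize);
}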