bug fix: CeilDiv() overflowed for b == INT_MAX

Frank Seide 2015-12-30 10:57:08 -08:00
Parent da2b298ca3
Commit d0b5c8d3c4
5 changed files with 57 additions and 27 deletions

View file

@@ -447,15 +447,33 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: Does the same trick work for 2D images?
};
// image layouts used in CNTK
// Nodes that do semantic interpretation of width, height, channel information must know which index they are in.
// Eventually this can go away once we switch completely to cudnn layout.
enum ImageLayoutKind
{
CHW, // cudnn
HWC // legacy
};
static inline ImageLayoutKind ImageLayoutKindFrom(const wstring & s)
{
if (s == L"CHW") return ImageLayoutKind::CHW;
else if (s == L"HWC") return ImageLayoutKind::HWC;
else InvalidArgument("ImageLayoutKindFrom: Unknown ImageLayoutKind '%ls', must be 'CHW' (cudnn) or 'HWC' (CNTK legacy)", s.c_str());
}
static inline TensorShape ImageLayout(size_t width, size_t height, size_t channels, ImageLayoutKind imageLayoutKind)
{
if (imageLayoutKind == ImageLayoutKind::CHW) return TensorShape(width, height, channels);
else if (imageLayoutKind == ImageLayoutKind::HWC) return TensorShape(channels, width, height);
else LogicError("ImageLayout: Invalid ImageLayoutKind");
}
// When constructing an image tensor with the usual W, H, C format, use the following function instead.
// This will sort the three parameters into the correct order.
// BUGBUG: at several places, a comment says "after multiplication the structure is lost" and the vector dimension
// is set as the image height. However, the image height is actually the wrong dimension since images are assumed transposed.
// This will get fixed once we get more complete arbitrary tensor support throughout, including better-defined inference rules.
// BUGBUG: This only works for ImageLayoutKind::HWC. Also the naming is bad.
static inline TensorShape ImageLayoutWHC(size_t width, size_t height, size_t channels)
{
return TensorShape(channels, width, height);
}
// TODO: we need a constructor from config; that will allow us to generalize
}}}
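
As a minimal standalone sketch of what the two layouts imply for the resulting shape (illustrative only; it uses a std::vector<size_t> as a stand-in for TensorShape, which, as the code above implies, lists dimensions with the fastest-varying one first):

#include <cassert>
#include <cstddef>
#include <vector>

// Stand-in for TensorShape: dimensions ordered fastest-varying first.
static std::vector<size_t> ImageLayoutSketch(size_t width, size_t height, size_t channels, bool cudnnCHW)
{
    if (cudnnCHW)
        return { width, height, channels };  // CHW: width varies fastest, channels slowest
    else
        return { channels, width, height };  // HWC (legacy): channels vary fastest
}

int main()
{
    auto chw = ImageLayoutSketch(224, 224, 3, /*cudnnCHW=*/true);   // -> [224, 224, 3]
    auto hwc = ImageLayoutSketch(224, 224, 3, /*cudnnCHW=*/false);  // -> [3, 224, 224]
    assert(chw[2] == 3 && hwc[0] == 3);  // the channel dimension ends up at opposite ends
    return 0;
}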

View file

@@ -4456,17 +4456,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Linear gap-free unary ops happen so regularly that we will eliminate the case statement from the CUDA kernel, and instead expand all.
if (regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && reducingOpDims.size() == 0)
{
if(op==1)fprintf(stderr, "LaunchUnaryTensorOp %d\n", (int)regularOpDims[0]);
////if (op == 1)fprintf(stderr, "LaunchUnaryTensorOp %d\n", (int)regularOpDims[0]);
return LaunchUnaryTensorOp<ElemType>(beta, a.m_pArray + offsets[0], m_pArray + offsets[1], alpha, op, regularOpDims[0]);
if(op==1)fprintf(stderr, "Done LaunchUnaryTensorOp %d\n", (int)regularOpDims[0]);
////if (op == 1)fprintf(stderr, "Done LaunchUnaryTensorOp %d\n", (int)regularOpDims[0]);
}
// regular case
else
{
if(op==1)fprintf(stderr, "TensorOpN<2> %d\n", (int)regularOpDims[0]);
////if (op == 1)fprintf(stderr, "TensorOpN<2> %d\n", (int)regularOpDims[0]);
return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2> { a.m_pArray, m_pArray }, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
if(op==1)fprintf(stderr, "Done TensorOpN<2> %d\n", (int)regularOpDims[0]);
////if (op == 1)fprintf(stderr, "Done TensorOpN<2> %d\n", (int)regularOpDims[0]);
}
}
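
The fast path above fires only when the whole op is a single dense span: one regular dimension, unit stride on both operands, and no reduction. A simplified sketch of that predicate, with the CNTK SmallVector/array argument types replaced by standard containers:

#include <array>
#include <cstddef>
#include <vector>

// Is this a linear, gap-free unary op? (one dimension, unit strides, nothing to reduce over)
static bool IsContiguousUnaryOp(const std::vector<size_t>& regularOpDims,
                                const std::array<std::vector<ptrdiff_t>, 2>& regularStrides,
                                const std::vector<size_t>& reducingOpDims)
{
    return regularOpDims.size() == 1
        && regularStrides[0][0] == 1   // input a is contiguous
        && regularStrides[1][0] == 1   // output is contiguous
        && reducingOpDims.empty();
}

int main()
{
    std::vector<size_t> dims = { 4096 };
    std::array<std::vector<ptrdiff_t>, 2> strides = { std::vector<ptrdiff_t>{ 1 }, std::vector<ptrdiff_t>{ 1 } };
    // true -> take LaunchUnaryTensorOp; false -> fall through to TensorOpN<2>
    return IsContiguousUnaryOp(dims, strides, {}) ? 0 : 1;
}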

View file

@@ -85,10 +85,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// ---------------------------------------------------------------------------
template<class INT, class INT2>
static INT CeilDiv(INT a, INT2 b)
static INT CeilDiv(INT a, INT2 b) // ceil(a/b)
{
if (b == 0) LogicError("CeilDiv a=%d b=%d", (int)a, (int)b); // TODO: delete this once tracked down
return (a + b - 1) / b;
return (INT)(((size_t)a + (size_t)b - 1) / (size_t)b); // these size_t casts are necessary since b may be INT_MAX (for maxGridSize[])
}
struct GridDim
@@ -138,6 +137,10 @@ struct GridDim
std::vector<cudaDeviceProp> props(numDevices);
for (int i = 0; i < numDevices; i++)
CUDA_CALL(cudaGetDeviceProperties(&props[i], i));
#if 1 // on Linux, maxGridSize[0] gets reported as 0
for (int i = 0; i < numDevices; i++)
fprintf(stderr, "%d procs %d warps %d %d %d max grid on %s\n", (int)props[i].multiProcessorCount, (int)props[i].warpSize, (int)props[i].maxGridSize[0], (int)props[i].maxGridSize[1], (int)props[i].maxGridSize[2], props[i].name);
#endif
return props;
}
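
The widened arithmetic matters because CeilDiv is called with props.maxGridSize[0] as the divisor, which can be INT_MAX (and, per the debug block above, is sometimes reported as 0 on Linux). A standalone sketch of the failure mode with illustrative values; since the old expression's signed overflow is undefined behavior, the sketch reproduces the typical two's-complement wrap with unsigned arithmetic instead of triggering it:

#include <climits>
#include <cstddef>
#include <cstdio>

// Fixed version, as in the diff above: widen to size_t before adding so a + b - 1 cannot wrap.
static int CeilDivNew(int a, int b) { return (int)(((size_t)a + (size_t)b - 1) / (size_t)b); }

// What the old (a + b - 1) / b typically computed on two's-complement targets:
// the sum wraps negative, and the division then yields 0 instead of 1.
static int CeilDivOldWrapped(int a, int b)
{
    int wrappedSum = (int)((unsigned)a + (unsigned)b - 1u);  // e.g. 1000 + INT_MAX - 1 wraps to -2147482650
    return wrappedSum / b;
}

int main()
{
    int NN = 1000, maxGridSizeX = INT_MAX;
    printf("new: %d\n", CeilDivNew(NN, maxGridSizeX));         // 1, i.e. ceil(1000 / INT_MAX)
    printf("old: %d\n", CeilDivOldWrapped(NN, maxGridSizeX));  // 0 -- presumably how the removed b == 0 guard got hit downstream
    return 0;
}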

View file

@@ -465,7 +465,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrideVectors,
const SmallVector<size_t> & reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N> & reducingStrideVectors)
{
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// copy all parameters to CUDA-compatible data structures
FixedArray<ElemType*, N> pointers(pointerVector);
SmallVector<C_size_t> regularOpStrideVector; // kernel needs the strides for converting thread index back to multi-dimensional tensor index
@@ -480,13 +480,13 @@ if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
FixedArray<C_unsigned_int, M> reducingOpDims(reducingOpDimVector);
FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// launch the kernel
CUDA_LONG NN = (CUDA_LONG)numElements; // linear space identifying each individual input element
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// do some optimization for reductions
// Cases:
// - #output elements >= GPU procs --> use one proc per element, do reduction in inner loop
@@ -506,12 +506,12 @@ if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
for (C_size_t k = 0; k < reducingOpDimVector.size(); k++)
reductionDim *= (C_size_t)reducingOpDimVector[k];
let & props = GridDim::GetDeviceProps();
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
GridDim grid(NN);
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
if (reductionDim > 1 && grid.m_blocksPerGrid < props.multiProcessorCount /* && NN == 10 && reductionDim <= GridDim::maxThreadsPerBlock*/)
{
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// we are reducing and are underutilizing the multiprocs we have: get more parallelism by doing reduction in parallel
// Change of strategy: All NN elements get their own block. Reduction gets split over blocks as well.
@@ -519,48 +519,55 @@ if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// We increase #blocks by that factor by breaking reduction into that many chunks.
let numReductionChunks = CeilDiv(props.multiProcessorCount, NN);
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)props.maxGridSize[0]);
// NN may be too large for a single dimension
let blockXOverBy = CeilDiv(NN, props.maxGridSize[0]);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)blockXOverBy);
let numBlocksX = CeilDiv(NN, blockXOverBy);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)numBlocksX);
let numBlocksY = CeilDiv(NN, numBlocksX);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
let numBlocksZ = numReductionChunks;
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// Block dim is now:
// - X, Y: such that X*Y covers NN
// - Z: reduction chunks
// reduction goes into thread dim X
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
let reductionChunkSize = CeilDiv(reductionDim, numReductionChunks);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
let numThreadsX = min(reductionChunkSize, GridDim::maxThreadsPerBlock); // any that's over will be done by looping inside the kernel
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
if (beta == 1 || numBlocksZ == 1)
{
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
_launchTensorOpWithReduction<ElemType, N, M, K> << <dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(double), t_stream >> >(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
}
else
{
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// We need more than one chunk, we will use atomicAdd().
// First reset/pre-multiply input; then do the remaining chunks using atomicAdd().
_launchTensorOpWithReduction<ElemType, N, M, K> << <dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(double), t_stream >> >(beta, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
_launchTensorOpWithReduction<ElemType, N, M, K> << <dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(double), t_stream >> >(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize);
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
}
}
else
{
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// we got enough elements to generate: do one element per thread, and reduction inside
_launchTensorOp<ElemType, N, M, K> << <grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >> >(beta, pointers, alpha, op, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
}
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
}
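
To make the grid decomposition above concrete, here is a standalone sketch that just reruns the arithmetic with illustrative numbers (10 output elements, a reduction over 4096 elements, 24 multiprocessors, maxGridSize[0] == INT_MAX, and an assumed 512 threads per block; the real values come from cudaDeviceProp and GridDim):

#include <algorithm>
#include <climits>
#include <cstddef>
#include <cstdio>

static int CeilDiv(int a, int b) { return (int)(((size_t)a + (size_t)b - 1) / (size_t)b); }  // ceil(a/b), widened as in the fix

int main()
{
    int NN = 10;                  // output elements: few, so the multiprocs are underutilized
    int reductionDim = 4096;      // elements reduced into each output
    int multiProcessorCount = 24;
    int maxGridSizeX = INT_MAX;   // the divisor that used to overflow the old CeilDiv
    int maxThreadsPerBlock = 512; // assumed value for this sketch

    int numReductionChunks = CeilDiv(multiProcessorCount, NN);          // 3: split the reduction to occupy all procs
    int blockXOverBy       = CeilDiv(NN, maxGridSizeX);                 // 1: NN fits into grid dimension X
    int numBlocksX         = CeilDiv(NN, blockXOverBy);                 // 10
    int numBlocksY         = CeilDiv(NN, numBlocksX);                   // 1: X * Y together cover the NN outputs
    int numBlocksZ         = numReductionChunks;                        // 3: one Z slice per reduction chunk
    int reductionChunkSize = CeilDiv(reductionDim, numReductionChunks); // 1366 reduction elements per chunk
    int numThreadsX = std::min(reductionChunkSize, maxThreadsPerBlock); // 512; anything over is looped inside the kernel

    printf("grid = (%d, %d, %d), threads/block = %d, chunk = %d\n",
           numBlocksX, numBlocksY, numBlocksZ, numThreadsX, reductionChunkSize);
    return 0;
}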
// -----------------------------------------------------------------------
@@ -603,7 +610,7 @@ if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
template<class ElemType>
void LaunchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim)
{
if (op == 1)fprintf(stderr, "LaunchUnaryTensorOp: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchUnaryTensorOp: %d", (int)__LINE__);
CUDA_LONG NN = (CUDA_LONG)regularOpDim;
#define CaseLaunchUnaryTensorOp(oper) case ElementWiseOperator::op ## oper: \

View file

@@ -27,6 +27,8 @@ Using parallel sequences (difference to above: nbruttsineachrecurrentiter=4). No
COMMAND: currentDirectory=$(SolutionDir)Tests\EndToEndTests\Speech\Data configFile=$(SolutionDir)Tests\EndToEndTests\Speech\LSTM\cntk.config stderr=$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance\models\cntkSpeech.dnn.log RunDir=$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance NdlDir=$(SolutionDir)Tests\EndToEndTests\Speech\LSTM DataDir=. DeviceId=auto Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=4]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[learningRatesPerMB=0.125]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] makeMode=false
Linux: bin/cntk currentDirectory=Tests/EndToEndTests/Speech/Data configFile=../LSTM/cntk.config stderr=../RunDir/LSTM/Truncated/models/cntkSpeech.dnn.log RunDir=../RunDir/LSTM/Truncated NdlDir=../LSTM DataDir=. DeviceId=auto Truncated=false 'speechTrain=[reader=[nbruttsineachrecurrentiter=4]]' 'speechTrain=[SGD=[epochSize=2560]]' 'speechTrain=[SGD=[learningRatesPerMB=0.125]]' 'speechTrain=[SGD=[maxEpochs=2]]' 'speechTrain=[SGD=[numMBsToShowResult=1]]' makeMode=false
Using full BrainScript configuration
COMMAND: --cd $(SolutionDir)Tests\EndToEndTests\Speech\Data -f $(SolutionDir)Tests\EndToEndTests\Speech\LSTM\lstm.bs -D stderr='$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance\models\cntkSpeech.dnn.log' -D RunDir='$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance' -D NdlDir='$(SolutionDir)Tests\EndToEndTests\Speech\LSTM' -D DataDir='.' -D DeviceId='Auto' -D Truncated=false -D speechTrain=[reader=[nbruttsineachrecurrentiter=1];SGD=[epochSize=2560;maxEpochs=2;numMBsToShowResult=1]] -D makeMode=false