bug fix: CeilDiv() overflowed for b == INT_MAX
This commit is contained in:
Parent: da2b298ca3
Commit: d0b5c8d3c4
@@ -447,15 +447,33 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // TODO: Does the same trick work for 2D images?
    };

    // image layouts used in CNTK
    // Nodes that do semantic interpretation of width, height, channel information must know which index they are in.
    // Eventually this can go away once we switch completely to cudnn layout.
    enum ImageLayoutKind
    {
        CHW,    // cudnn
        HWC     // legacy
    };
    static inline ImageLayoutKind ImageLayoutKindFrom(const wstring & s)
    {
        if (s == L"CHW") return ImageLayoutKind::CHW;
        else if (s == L"HWC") return ImageLayoutKind::HWC;
        else InvalidArgument("ImageLayoutKindFrom: Unknown ImageLayoutKind '%ls', must be 'CHW' (cudnn) or 'HWC' (CNTK legacy)", s.c_str());
    }
    static inline TensorShape ImageLayout(size_t width, size_t height, size_t channels, ImageLayoutKind imageLayoutKind)
    {
        if (imageLayoutKind == ImageLayoutKind::CHW) return TensorShape(width, height, channels);
        else if (imageLayoutKind == ImageLayoutKind::HWC) return TensorShape(channels, width, height);
        else LogicError("ImageLayout: Invalid ImageLayoutKind");
    }

    // When constructing an image tensor with the usual W, H, C format, use the following function instead.
    // This will sort the three parameters into the correct order.
    // BUGBUG: at several places, a comment says "after multiplication the structure is lost" and the vector dimension
    // is set as the image height. However, the image height is actually the wrong dimension since images are assumed transposed.
    // This will get fixed once we get more complete arbitrary tensor support throughout, including better-defined inference rules.
    // BUGBUG: This only works for ImageLayoutKind::HWC. Also the naming is bad.
    static inline TensorShape ImageLayoutWHC(size_t width, size_t height, size_t channels)
    {
        return TensorShape(channels, width, height);
    }
    // TODO: we need a constructor from config; that will allow us to generalize

}}}
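For orientation, here is a minimal standalone sketch (not CNTK code; ShapeVec and MakeImageShape are stand-ins for TensorShape and ImageLayout) of how the two layout kinds order the (width, height, channels) triple:

#include <cstddef>
#include <cstdio>
#include <vector>

// Standalone illustration of the ordering above; ShapeVec stands in for CNTK's TensorShape.
enum class LayoutKind { CHW /*cudnn*/, HWC /*legacy*/ };
using ShapeVec = std::vector<size_t>;

static ShapeVec MakeImageShape(size_t width, size_t height, size_t channels, LayoutKind kind)
{
    // CHW keeps the caller's (W, H, C) order; HWC rotates channels to the front, as ImageLayout() does.
    return kind == LayoutKind::CHW ? ShapeVec{ width, height, channels }
                                   : ShapeVec{ channels, width, height };
}

int main()
{
    auto chw = MakeImageShape(28, 28, 3, LayoutKind::CHW); // {28, 28, 3}
    auto hwc = MakeImageShape(28, 28, 3, LayoutKind::HWC); // {3, 28, 28}
    printf("CHW: %zu %zu %zu   HWC: %zu %zu %zu\n",
           chw[0], chw[1], chw[2], hwc[0], hwc[1], hwc[2]);
    return 0;
}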
@@ -4456,17 +4456,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // Linear gap-free unary ops happen so regularly that we will eliminate the case statement from the CUDA kernel, and instead expand all.
    if (regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && reducingOpDims.size() == 0)
    {
        if(op==1)fprintf(stderr, "LaunchUnaryTensorOp %d\n", (int)regularOpDims[0]);
        ////if (op == 1)fprintf(stderr, "LaunchUnaryTensorOp %d\n", (int)regularOpDims[0]);
        return LaunchUnaryTensorOp<ElemType>(beta, a.m_pArray + offsets[0], m_pArray + offsets[1], alpha, op, regularOpDims[0]);
        if(op==1)fprintf(stderr, "Done LaunchUnaryTensorOp %d\n", (int)regularOpDims[0]);
        ////if (op == 1)fprintf(stderr, "Done LaunchUnaryTensorOp %d\n", (int)regularOpDims[0]);
    }

    // regular case
    else
    {
        if(op==1)fprintf(stderr, "TensorOpN<2> %d\n", (int)regularOpDims[0]);
        ////if (op == 1)fprintf(stderr, "TensorOpN<2> %d\n", (int)regularOpDims[0]);
        return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2> { a.m_pArray, m_pArray }, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
        if(op==1)fprintf(stderr, "Done TensorOpN<2> %d\n", (int)regularOpDims[0]);
        ////if (op == 1)fprintf(stderr, "Done TensorOpN<2> %d\n", (int)regularOpDims[0]);
    }
}

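As an aside, a minimal standalone sketch (the name IsGapFreeUnaryOp is illustrative, not CNTK's) of what the fast-path test above checks: exactly one regular dimension, stride 1 on both operands, and no reducing dimensions, so the kernel can walk both operands as flat contiguous buffers without a per-op case statement:

#include <cstddef>
#include <cstdio>

// Illustrative only: mirrors the dispatch condition in the hunk above.
static bool IsGapFreeUnaryOp(size_t numRegularDims,
                             ptrdiff_t strideA0, ptrdiff_t strideB0,
                             size_t numReducingDims)
{
    return numRegularDims == 1 && strideA0 == 1 && strideB0 == 1 && numReducingDims == 0;
}

int main()
{
    printf("contiguous: %d\n", IsGapFreeUnaryOp(1, 1, 1, 0));  // 1: takes the fast path
    printf("strided:    %d\n", IsGapFreeUnaryOp(1, 2, 1, 0));  // 0: falls back to TensorOpN
    return 0;
}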
@@ -85,10 +85,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // ---------------------------------------------------------------------------

    template<class INT, class INT2>
    static INT CeilDiv(INT a, INT2 b)
    static INT CeilDiv(INT a, INT2 b)   // ceil(a/b)
    {
        if (b == 0) LogicError("CeilDiv a=%d b=%d", (int)a, (int)b);    // TODO: delete this once tracked down
        return (a + b - 1) / b;
        return (INT)(((size_t)a + (size_t)b - 1) / (size_t)b);  // these size_t casts are necessary since b may be INT_MAX (for maxGridSize[])
    }

    struct GridDim
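To see the bug this commit fixes, here is a minimal standalone sketch (hard-coding int instead of the templated INT/INT2; the function names are hypothetical). With 32-bit signed arithmetic, a + b overflows when b == INT_MAX, the value that maxGridSize[] can supply, so the old expression typically returned 0 instead of 1 (and the overflow is formally undefined behavior). Widening to size_t before the addition, as the new line does, avoids this:

#include <climits>
#include <cstdio>

// Standalone illustration of the overflow; CeilDivOld/CeilDivNew are hypothetical names.
static int CeilDivOld(int a, int b) { return (a + b - 1) / b; }                                  // a + b overflows for b == INT_MAX (UB; typically wraps)
static int CeilDivNew(int a, int b) { return (int)(((size_t)a + (size_t)b - 1) / (size_t)b); }   // widen first, then divide

int main()
{
    int a = 10;          // e.g. NN, the number of output elements
    int b = INT_MAX;     // e.g. props.maxGridSize[0] on devices that report 2^31-1
    printf("old: %d   new: %d\n", CeilDivOld(a, b), CeilDivNew(a, b)); // typically prints "old: 0   new: 1"
    return 0;
}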
@@ -138,6 +137,10 @@ struct GridDim
        std::vector<cudaDeviceProp> props(numDevices);
        for (int i = 0; i < numDevices; i++)
            CUDA_CALL(cudaGetDeviceProperties(&props[i], i));
#if 1   // on Linux, maxGridSize[0] gets reported as 0
        for (int i = 0; i < numDevices; i++)
            fprintf(stderr, "%d procs %d warps %d %d %d max grid on %s\n", (int)props[i].multiProcessorCount, (int)props[i].warpSize, (int)props[i].maxGridSize[0], (int)props[i].maxGridSize[1], (int)props[i].maxGridSize[2], props[i].name);
#endif
        return props;
    }
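For context, a minimal standalone CUDA host-code sketch of the same device-property query; the fallback for a zero maxGridSize[0] is an assumption added here for illustration, not part of the commit:

#include <cstdio>
#include <cuda_runtime.h>

// Standalone sketch: print the launch limits that GridDim relies on.
int main()
{
    int numDevices = 0;
    if (cudaGetDeviceCount(&numDevices) != cudaSuccess) return 1;
    for (int i = 0; i < numDevices; i++)
    {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) != cudaSuccess) continue;
        // Hypothetical fallback: if the driver reports 0 (as observed on Linux above),
        // fall back to 65535 blocks in x, a limit every CUDA device supports.
        int maxGridX = prop.maxGridSize[0] > 0 ? prop.maxGridSize[0] : 65535;
        printf("%s: %d SMs, warp %d, maxGrid (%d, %d, %d)\n",
               prop.name, prop.multiProcessorCount, prop.warpSize,
               maxGridX, prop.maxGridSize[1], prop.maxGridSize[2]);
    }
    return 0;
}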
@@ -465,7 +465,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                                         const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrideVectors,
                                         const SmallVector<size_t> & reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N> & reducingStrideVectors)
    {
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
        // copy all parameters to CUDA-compatible data structures
        FixedArray<ElemType*, N> pointers(pointerVector);
        SmallVector<C_size_t> regularOpStrideVector;    // kernel needs the strides for converting thread index back to multi-dimensional tensor index
@@ -480,13 +480,13 @@ if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
        FixedArray<C_unsigned_int, M> reducingOpDims(reducingOpDimVector);
        FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);

if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
        // launch the kernel
        CUDA_LONG NN = (CUDA_LONG)numElements; // linear space identifying each individual input element
        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));

if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
        // do some optimization for reductions
        // Cases:
        //  - #output elements >= GPU procs --> use one proc per element, do reduction in inner loop
@@ -506,12 +506,12 @@ if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
        for (C_size_t k = 0; k < reducingOpDimVector.size(); k++)
            reductionDim *= (C_size_t)reducingOpDimVector[k];
        let & props = GridDim::GetDeviceProps();
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
        GridDim grid(NN);
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
        if (reductionDim > 1 && grid.m_blocksPerGrid < props.multiProcessorCount /* && NN == 10 && reductionDim <= GridDim::maxThreadsPerBlock*/)
        {
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
            // we are reducing and are underutilizing the multiprocs we have: get more parallelism by doing reduction in parallel
            // Change of strategy: All NN elements get their own block. Reduction gets split over blocks as well.
@@ -519,48 +519,55 @@ if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
            // We increase #blocks by that factor by breaking reduction into that many chunks.
            let numReductionChunks = CeilDiv(props.multiProcessorCount, NN);

if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)props.maxGridSize[0]);
            // NN may be too large for a single dimension
            let blockXOverBy = CeilDiv(NN, props.maxGridSize[0]);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)blockXOverBy);
            let numBlocksX = CeilDiv(NN, blockXOverBy);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)numBlocksX);
            let numBlocksY = CeilDiv(NN, numBlocksX);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
            let numBlocksZ = numReductionChunks;
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
            // Block dim is now:
            //  - X, Y: such that X*Y covers NN
            //  - Z: reduction chunks

            // reduction goes into thread dim X
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
            let reductionChunkSize = CeilDiv(reductionDim, numReductionChunks);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
            let numThreadsX = min(reductionChunkSize, GridDim::maxThreadsPerBlock);     // any that's over will be done by looping inside the kernel
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);

if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
            if (beta == 1 || numBlocksZ == 1)
            {
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
                _launchTensorOpWithReduction<ElemType, N, M, K> << <dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(double), t_stream >> >(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
            }
            else
            {
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
                // We need more than one chunk, we will use atomicAdd().
                // First reset/pre-multiply input; then do the remaining chunks using atomicAdd().
                _launchTensorOpWithReduction<ElemType, N, M, K> << <dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(double), t_stream >> >(beta, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
                _launchTensorOpWithReduction<ElemType, N, M, K> << <dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(double), t_stream >> >(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize);
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
            }
        }
        else
        {
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
            // we got enough elements to generate: do one element per thread, and reduction inside
            _launchTensorOp<ElemType, N, M, K> << <grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >> >(beta, pointers, alpha, op, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
        }
        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
    }

    // -----------------------------------------------------------------------
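To make the grid arithmetic above concrete, a small standalone sketch with assumed sample values (NN = 10 output elements, 24 multiprocessors, reductionDim = 4096, maxGridSize[0] = INT_MAX, 1024 threads per block); none of these numbers come from the commit:

#include <algorithm>
#include <climits>
#include <cstdio>

// Same ceil-division as the fixed CeilDiv() above (widen before adding).
static int CeilDiv(int a, int b) { return (int)(((size_t)a + (size_t)b - 1) / (size_t)b); }

int main()
{
    // Assumed sample values, for illustration only.
    int NN                  = 10;        // number of output elements
    int multiProcessorCount = 24;        // GPU SM count
    int reductionDim        = 4096;      // elements reduced per output
    int maxGridSizeX        = INT_MAX;   // grid limit in x (the value that used to overflow CeilDiv)
    int maxThreadsPerBlock  = 1024;

    int numReductionChunks = CeilDiv(multiProcessorCount, NN);          // 3: split the reduction to fill the SMs
    int blockXOverBy       = CeilDiv(NN, maxGridSizeX);                 // 1: NN fits into grid dimension x
    int numBlocksX         = CeilDiv(NN, blockXOverBy);                 // 10
    int numBlocksY         = CeilDiv(NN, numBlocksX);                   // 1: X*Y covers NN
    int numBlocksZ         = numReductionChunks;                        // 3: one z-slice per reduction chunk
    int reductionChunkSize = CeilDiv(reductionDim, numReductionChunks); // 1366
    int numThreadsX        = std::min(reductionChunkSize, maxThreadsPerBlock); // 1024; the rest loops inside the kernel

    printf("grid (%d, %d, %d), %d threads/block, chunk %d\n",
           numBlocksX, numBlocksY, numBlocksZ, numThreadsX, reductionChunkSize);
    return 0;
}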
@@ -603,7 +610,7 @@ if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
    template<class ElemType>
    void LaunchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim)
    {
if (op == 1)fprintf(stderr, "LaunchUnaryTensorOp: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchUnaryTensorOp: %d", (int)__LINE__);
        CUDA_LONG NN = (CUDA_LONG)regularOpDim;

#define CaseLaunchUnaryTensorOp(oper) case ElementWiseOperator::op ## oper: \
@@ -27,6 +27,8 @@ Using parallel sequences (difference to above: nbruttsineachrecurrentiter=4). No

COMMAND: currentDirectory=$(SolutionDir)Tests\EndToEndTests\Speech\Data configFile=$(SolutionDir)Tests\EndToEndTests\Speech\LSTM\cntk.config stderr=$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance\models\cntkSpeech.dnn.log RunDir=$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance NdlDir=$(SolutionDir)Tests\EndToEndTests\Speech\LSTM DataDir=. DeviceId=auto Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=4]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[learningRatesPerMB=0.125]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] makeMode=false

Linux: bin/cntk currentDirectory=Tests/EndToEndTests/Speech/Data configFile=../LSTM/cntk.config stderr=../RunDir/LSTM/Truncated/models/cntkSpeech.dnn.log RunDir=../RunDir/LSTM/Truncated NdlDir=../LSTM DataDir=. DeviceId=auto Truncated=false 'speechTrain=[reader=[nbruttsineachrecurrentiter=4]]' 'speechTrain=[SGD=[epochSize=2560]]' 'speechTrain=[SGD=[learningRatesPerMB=0.125]]' 'speechTrain=[SGD=[maxEpochs=2]]' 'speechTrain=[SGD=[numMBsToShowResult=1]]' makeMode=false

Using full BrainScript configuration

COMMAND: --cd $(SolutionDir)Tests\EndToEndTests\Speech\Data -f $(SolutionDir)Tests\EndToEndTests\Speech\LSTM\lstm.bs -D stderr='$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance\models\cntkSpeech.dnn.log' -D RunDir='$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance' -D NdlDir='$(SolutionDir)Tests\EndToEndTests\Speech\LSTM' -D DataDir='.' -D DeviceId='Auto' -D Truncated=false -D speechTrain=[reader=[nbruttsineachrecurrentiter=1];SGD=[epochSize=2560;maxEpochs=2;numMBsToShowResult=1]] -D makeMode=false