From d0b5c8d3c4531b7be20be8e9b909cc005cc5949c Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Wed, 30 Dec 2015 10:57:08 -0800
Subject: [PATCH] bug fix: CeilDiv() overflowed for b == INT_MAX

---
 Source/Common/Include/DataTensor.h           | 26 +++++++++++--
 Source/Math/GPUMatrix.cu                     |  8 ++--
 Source/Math/GPUMatrixCUDAKernels.cuh         |  9 +++--
 Source/Math/GPUTensor.cu                     | 39 +++++++++++--------
 .../Speech/README_Windows_Debug_commands.txt |  2 +
 5 files changed, 57 insertions(+), 27 deletions(-)

diff --git a/Source/Common/Include/DataTensor.h b/Source/Common/Include/DataTensor.h
index eb8610e94..384bebefb 100644
--- a/Source/Common/Include/DataTensor.h
+++ b/Source/Common/Include/DataTensor.h
@@ -447,15 +447,33 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // TODO: Does the same trick work for 2D images?
     };
 
+    // image layouts used in CNTK
+    // Nodes that do semantic interpretation of width, height, channel information must know which index they are in.
+    // Eventually this can go away once we switch completely to cudnn layout.
+    enum ImageLayoutKind
+    {
+        CHW, // cudnn
+        HWC  // legacy
+    };
+    static inline ImageLayoutKind ImageLayoutKindFrom(const wstring & s)
+    {
+        if (s == L"CHW")      return ImageLayoutKind::CHW;
+        else if (s == L"HWC") return ImageLayoutKind::HWC;
+        else InvalidArgument("ImageLayoutKindFrom: Unknown ImageLayoutKind '%ls', must be 'CHW' (cudnn) or 'HWC' (CNTK legacy)", s.c_str());
+    }
+    static inline TensorShape ImageLayout(size_t width, size_t height, size_t channels, ImageLayoutKind imageLayoutKind)
+    {
+        if (imageLayoutKind == ImageLayoutKind::CHW)      return TensorShape(width, height, channels);
+        else if (imageLayoutKind == ImageLayoutKind::HWC) return TensorShape(channels, width, height);
+        else LogicError("ImageLayout: Invalid ImageLayoutKind");
+    }
+
     // When constructing an image tensor with the usual W, H, C format, use the following function instead.
     // This will sort the three parameters into the correct order.
-    // BUGBUG: at several places, a comment says "after multiplication the structure is lost" and the vector dimension
-    // is set as the image height. However, the image height is actually the wrong dimension since images are assumed transposed.
-    // This will get fixed once we get more complete arbitrary tensor support throughout, including better-defined inference rules.
+    // BUGBUG: This only works for ImageLayoutKind::HWC. Also the naming is bad.
     static inline TensorShape ImageLayoutWHC(size_t width, size_t height, size_t channels)
     {
         return TensorShape(channels, width, height);
     }
-    // TODO: we need a constructor from config; that will allow us to generalize
 }}}
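[Editor's note, not part of the patch: ImageLayout() above lists dimensions from fastest- to slowest-varying (TensorShape is column-major), so "CHW" (the cudnn layout, read slowest-to-fastest) puts width first while the legacy "HWC" layout puts channels first. A minimal standalone sketch of that mapping, with a hypothetical PrintImageLayout helper standing in for TensorShape:

    #include <cstdio>
    #include <cstddef>

    enum class ImageLayoutKind { CHW, HWC };

    // Prints the dimension order ImageLayout() would store for a W x H x C image.
    static void PrintImageLayout(size_t w, size_t h, size_t c, ImageLayoutKind kind)
    {
        if (kind == ImageLayoutKind::CHW)
            printf("TensorShape(%zu, %zu, %zu)\n", w, h, c); // width varies fastest (cudnn)
        else
            printf("TensorShape(%zu, %zu, %zu)\n", c, w, h); // channels vary fastest (legacy)
    }

    int main()
    {
        PrintImageLayout(640, 480, 3, ImageLayoutKind::CHW); // TensorShape(640, 480, 3)
        PrintImageLayout(640, 480, 3, ImageLayoutKind::HWC); // TensorShape(3, 640, 480)
    }

This also shows why the BUGBUG above flags ImageLayoutWHC(): it hard-codes the HWC ordering.]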
diff --git a/Source/Math/GPUMatrix.cu b/Source/Math/GPUMatrix.cu
index 1881daadd..101847448 100644
--- a/Source/Math/GPUMatrix.cu
+++ b/Source/Math/GPUMatrix.cu
@@ -4456,17 +4456,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // Linear gap-free unary ops happen so regularly that we will eliminate the case statement from the CUDA kernel, and instead expand all.
         if (regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && reducingOpDims.size() == 0)
         {
-if(op==1)fprintf(stderr, "LaunchUnaryTensorOp %d\n", (int)regularOpDims[0]);
+////if (op == 1)fprintf(stderr, "LaunchUnaryTensorOp %d\n", (int)regularOpDims[0]);
             return LaunchUnaryTensorOp(beta, a.m_pArray + offsets[0], m_pArray + offsets[1], alpha, op, regularOpDims[0]);
-if(op==1)fprintf(stderr, "Done LaunchUnaryTensorOp %d\n", (int)regularOpDims[0]);
+////if (op == 1)fprintf(stderr, "Done LaunchUnaryTensorOp %d\n", (int)regularOpDims[0]);
         }
         // regular case
         else
         {
-if(op==1)fprintf(stderr, "TensorOpN<2> %d\n", (int)regularOpDims[0]);
+////if (op == 1)fprintf(stderr, "TensorOpN<2> %d\n", (int)regularOpDims[0]);
             return TensorOpN<2>(beta, array<ElemType*, 2>{ a.m_pArray, m_pArray }, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
-if(op==1)fprintf(stderr, "Done TensorOpN<2> %d\n", (int)regularOpDims[0]);
+////if (op == 1)fprintf(stderr, "Done TensorOpN<2> %d\n", (int)regularOpDims[0]);
         }
     }

diff --git a/Source/Math/GPUMatrixCUDAKernels.cuh b/Source/Math/GPUMatrixCUDAKernels.cuh
index 05163fed6..75d0293ca 100644
--- a/Source/Math/GPUMatrixCUDAKernels.cuh
+++ b/Source/Math/GPUMatrixCUDAKernels.cuh
@@ -85,10 +85,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 // ---------------------------------------------------------------------------
 
 template<class INT, class INT2>
-static INT CeilDiv(INT a, INT2 b)
+static INT CeilDiv(INT a, INT2 b) // ceil(a/b)
 {
-if (b == 0) LogicError("CeilDiv a=%d b=%d", (int)a, (int)b); // TODO: delete this once tracked down
-    return (a + b - 1) / b;
+    return (INT)(((size_t)a + (size_t)b - 1) / (size_t)b); // these size_t casts are necessary since b may be INT_MAX (for maxGridSize[])
 }
 
 struct GridDim
@@ -138,6 +137,10 @@ struct GridDim
         std::vector<cudaDeviceProp> props(numDevices);
         for (int i = 0; i < numDevices; i++)
             CUDA_CALL(cudaGetDeviceProperties(&props[i], i));
+#if 1 // on Linux, maxGridSize[0] gets reported as 0
+        for (int i = 0; i < numDevices; i++)
+            fprintf(stderr, "%d procs %d warps %d %d %d max grid on %s\n", (int)props[i].multiProcessorCount, (int)props[i].warpSize, (int)props[i].maxGridSize[0], (int)props[i].maxGridSize[1], (int)props[i].maxGridSize[2], props[i].name);
+#endif
         return props;
     }
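[Editor's note, not part of the patch: this hunk is the actual bug fix. For b == INT_MAX, the old expression a + b - 1 exceeds INT_MAX and overflows signed int (undefined behavior; in practice it wraps negative), so the quotient comes out as 0 instead of 1. That 0 is then passed as the divisor to the next CeilDiv call in GPUTensor.cu below, which is exactly what the temporary b == 0 check deleted above was planted to catch. Computing in size_t makes the sum representable (even a 32-bit size_t suffices, since a + b - 1 < 2^32 for int inputs). A standalone repro sketch, with a hypothetical main and illustrative values:

    #include <climits>
    #include <cstdio>
    #include <cstddef>

    static int CeilDivOld(int a, int b) { return (a + b - 1) / b; }
    static int CeilDivNew(int a, int b) { return (int)(((size_t)a + (size_t)b - 1) / (size_t)b); }

    int main()
    {
        int NN = 10000;   // e.g. #elements to cover
        int b  = INT_MAX; // e.g. props.maxGridSize[0] on devices reporting 2^31-1
        printf("old: %d\n", CeilDivOld(NN, b)); // 0 on typical two's-complement hardware (UB!)
        printf("new: %d\n", CeilDivNew(NN, b)); // 1, as intended
        return 0;
    }
]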
diff --git a/Source/Math/GPUTensor.cu b/Source/Math/GPUTensor.cu
index a1ac47701..c732b2b33 100644
--- a/Source/Math/GPUTensor.cu
+++ b/Source/Math/GPUTensor.cu
@@ -465,7 +465,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrideVectors,
         const SmallVector<size_t> & reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N> & reducingStrideVectors)
     {
-if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         // copy all parameters to CUDA-compatible data structures
         FixedArray<ElemType*, N> pointers(pointerVector);
         SmallVector<C_size_t> regularOpStrideVector; // kernel needs the strides for converting thread index back to multi-dimensional tensor index
@@ -480,13 +480,13 @@ if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         FixedArray<C_unsigned_int, M> reducingOpDims(reducingOpDimVector);
         FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);
-if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         // launch the kernel
         CUDA_LONG NN = (CUDA_LONG)numElements; // linear space identifying each individual input element
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         // do some optimization for reductions
         // Cases:
         // - #output elements >= GPU procs --> use one proc per element, do reduction in inner loop
         // - #output elements < GPU procs -->
@@ -506,12 +506,12 @@
         for (C_size_t k = 0; k < reducingOpDimVector.size(); k++)
             reductionDim *= (C_size_t)reducingOpDimVector[k];
         let & props = GridDim::GetDeviceProps();
-if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         GridDim grid(NN);
-if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         if (reductionDim > 1 && grid.m_blocksPerGrid < props.multiProcessorCount /* && NN == 10 && reductionDim <= GridDim::maxThreadsPerBlock*/)
         {
-if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             // we are reducing and are underutilizing the multiprocs we have: get more parallelism by doing reduction in parallel
             // Change of strategy: All NN elements get their own block. Reduction gets split over blocks as well.
@@ -519,48 +519,55 @@
             // We increase #blocks by that factor by breaking reduction into that many chunks.
             let numReductionChunks = CeilDiv(props.multiProcessorCount, NN);
-if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)props.maxGridSize[0]);
             // NN may be too large for a single dimension
             let blockXOverBy = CeilDiv(NN, props.maxGridSize[0]);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)blockXOverBy);
             let numBlocksX = CeilDiv(NN, blockXOverBy);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)numBlocksX);
             let numBlocksY = CeilDiv(NN, numBlocksX);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             let numBlocksZ = numReductionChunks;
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             // Block dim is now:
             //  - X, Y: such that X*Y covers NN
             //  - Z: reduction chunks
             // reduction goes into thread dim X
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             let reductionChunkSize = CeilDiv(reductionDim, numReductionChunks);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             let numThreadsX = min(reductionChunkSize, GridDim::maxThreadsPerBlock); // any that's over will be done by looping inside the kernel
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
 
-if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             if (beta == 1 || numBlocksZ == 1)
             {
-if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
                 _launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ElemType)>>>(/*beta=*/1,
                     pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
-if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             }
             else
             {
-if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
                 // We need more than one chunk, we will use atomicAdd().
                 // First reset/pre-multiply input; then do the remaining chunks using atomicAdd().
                 _launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ElemType)>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
                 _launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(ElemType)>>>(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize);
-if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             }
         }
         else
         {
-if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             // we got enough elements to generate: do one element per thread, and reduction inside
             _launchTensorOp<ElemType, N, M, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
-if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         }
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
-if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
+//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
     }
 
 // -----------------------------------------------------------------------
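[Editor's note, not part of the patch: a host-side sketch of the grid decomposition above, using the fixed CeilDiv and illustrative device limits (hypothetical main). With maxGridSize[0] == INT_MAX, blockXOverBy must come out as 1; the old CeilDiv returned 0 here, and the very next CeilDiv then divided by zero. The launches above use this shape: grid (numBlocksX, numBlocksY, numBlocksZ) with the reduction in thread dimension X, and when more than one Z-chunk is needed while beta != 1, chunk 0 is launched first to apply beta, after which the remaining chunks accumulate with beta = 1 via atomicAdd().

    #include <algorithm>
    #include <climits>
    #include <cstdio>
    #include <cstddef>

    static int CeilDiv(int a, int b) { return (int)(((size_t)a + (size_t)b - 1) / (size_t)b); }

    int main()
    {
        int NN                  = 10;      // #output elements (few: GPU underutilized)
        int multiProcessorCount = 24;      // sample device properties
        int maxGridSizeX        = INT_MAX;
        int maxThreadsPerBlock  = 512;
        int reductionDim        = 4096;    // #input elements reduced into each output

        int numReductionChunks = CeilDiv(multiProcessorCount, NN);          // 3
        int blockXOverBy       = CeilDiv(NN, maxGridSizeX);                 // 1 (old CeilDiv: 0)
        int numBlocksX         = CeilDiv(NN, blockXOverBy);                 // 10
        int numBlocksY         = CeilDiv(NN, numBlocksX);                   // 1
        int numBlocksZ         = numReductionChunks;                        // 3
        int reductionChunkSize = CeilDiv(reductionDim, numReductionChunks); // 1366
        int numThreadsX        = std::min(reductionChunkSize, maxThreadsPerBlock); // 512
        printf("grid=(%d,%d,%d) threads.x=%d chunk=%d\n",
               numBlocksX, numBlocksY, numBlocksZ, numThreadsX, reductionChunkSize);
        return 0;
    }
]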
@@ -603,7 +610,7 @@ if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
 template<class ElemType>
 void LaunchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim)
 {
-if (op == 1)fprintf(stderr, "LaunchUnaryTensorOp: %d", (int)__LINE__);
+//////if (op == 1)fprintf(stderr, "LaunchUnaryTensorOp: %d", (int)__LINE__);
     CUDA_LONG NN = (CUDA_LONG)regularOpDim;
 
 #define CaseLaunchUnaryTensorOp(oper) case ElementWiseOperator::op ## oper: \
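[Editor's note, not part of the patch: CaseLaunchUnaryTensorOp above is a token-pasting macro that expands to one switch case per element-wise operator, each forwarding to the launch for that operator's functor. A minimal CPU-side sketch of the same dispatch pattern, with a hypothetical op set and Launch helper:

    #include <cstdio>

    enum ElementWiseOperator { opCopy, opNegate, opAbs };

    static float OpCopy(float x)   { return x; }
    static float OpNegate(float x) { return -x; }
    static float OpAbs(float x)    { return x < 0 ? -x : x; }

    template <class FN>
    static void Launch(FN fn, float x) { printf("%g\n", fn(x)); }

    // op ## oper pastes the enum tag; Op ## oper picks the matching functor.
    #define CaseLaunchUnaryOp(oper) \
        case op##oper:              \
            return Launch([](float x) { return Op##oper(x); }, x)

    static void Dispatch(ElementWiseOperator op, float x)
    {
        switch (op)
        {
        CaseLaunchUnaryOp(Copy);
        CaseLaunchUnaryOp(Negate);
        CaseLaunchUnaryOp(Abs);
        }
    }

    int main() { Dispatch(opNegate, 3.5f); return 0; } // prints -3.5
]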
diff --git a/Tests/EndToEndTests/Speech/README_Windows_Debug_commands.txt b/Tests/EndToEndTests/Speech/README_Windows_Debug_commands.txt
index e4e460a88..5add2936e 100644
--- a/Tests/EndToEndTests/Speech/README_Windows_Debug_commands.txt
+++ b/Tests/EndToEndTests/Speech/README_Windows_Debug_commands.txt
@@ -27,6 +27,8 @@ Using parallel sequences (difference to above: nbruttsineachrecurrentiter=4). No
 COMMAND: currentDirectory=$(SolutionDir)Tests\EndToEndTests\Speech\Data configFile=$(SolutionDir)Tests\EndToEndTests\Speech\LSTM\cntk.config stderr=$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance\models\cntkSpeech.dnn.log RunDir=$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance NdlDir=$(SolutionDir)Tests\EndToEndTests\Speech\LSTM DataDir=. DeviceId=auto Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=4]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[learningRatesPerMB=0.125]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] makeMode=false
 
+Linux: bin/cntk currentDirectory=Tests/EndToEndTests/Speech/Data configFile=../LSTM/cntk.config stderr=../RunDir/LSTM/Truncated/models/cntkSpeech.dnn.log RunDir=../RunDir/LSTM/Truncated NdlDir=../LSTM DataDir=. DeviceId=auto Truncated=false 'speechTrain=[reader=[nbruttsineachrecurrentiter=4]]' 'speechTrain=[SGD=[epochSize=2560]]' 'speechTrain=[SGD=[learningRatesPerMB=0.125]]' 'speechTrain=[SGD=[maxEpochs=2]]' 'speechTrain=[SGD=[numMBsToShowResult=1]]' makeMode=false
+
 Using full BrainScript configuration
 COMMAND: --cd $(SolutionDir)Tests\EndToEndTests\Speech\Data -f $(SolutionDir)Tests\EndToEndTests\Speech\LSTM\lstm.bs -D stderr='$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance\models\cntkSpeech.dnn.log' -D RunDir='$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance' -D NdlDir='$(SolutionDir)Tests\EndToEndTests\Speech\LSTM' -D DataDir='.' -D DeviceId='Auto' -D Truncated=false -D speechTrain=[reader=[nbruttsineachrecurrentiter=1];SGD=[epochSize=2560;maxEpochs=2;numMBsToShowResult=1]] -D makeMode=false