bug fix: CeilDiv() overflowed for b == INT_MAX

Frank Seide 2015-12-30 10:57:08 -08:00
Parent da2b298ca3
Commit d0b5c8d3c4
5 changed files with 57 additions and 27 deletions

View file

@@ -447,15 +447,33 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: Does the same trick work for 2D images?
};
// image layouts used in CNTK
// Nodes that do semantic interpretation of width, height, channel information must know which index they are in.
// Eventually this can go away once we switch completely to cudnn layout.
enum ImageLayoutKind
{
CHW, // cudnn
HWC // legacy
};
static inline ImageLayoutKind ImageLayoutKindFrom(const wstring & s)
{
if (s == L"CHW") return ImageLayoutKind::CHW;
else if (s == L"HWC") return ImageLayoutKind::HWC;
else InvalidArgument("ImageLayoutKindFrom: Unknown ImageLayoutKind '%ls', must be 'CHW' (cudnn) or 'HWC' (CNTK legacy)", s.c_str());
}
static inline TensorShape ImageLayout(size_t width, size_t height, size_t channels, ImageLayoutKind imageLayoutKind)
{
if (imageLayoutKind == ImageLayoutKind::CHW) return TensorShape(width, height, channels);
else if (imageLayoutKind == ImageLayoutKind::HWC) return TensorShape(channels, width, height);
else LogicError("ImageLayout: Invalid ImageLayoutKind");
}
// When constructing an image tensor with the usual W, H, C format, use the following function instead.
// This will sort the three parameters into the correct order.
// BUGBUG: at several places, a comment says "after multiplication the structure is lost" and the vector dimension
// is set as the image height. However, the image height is actually the wrong dimension since images are assumed transposed.
// This will get fixed once we get more complete arbitrary tensor support throughout, including better-defined inference rules.
// BUGBUG: This only works for ImageLayoutKind::HWC. Also the naming is bad.
static inline TensorShape ImageLayoutWHC(size_t width, size_t height, size_t channels)
{
return TensorShape(channels, width, height);
}
// TODO: we need a constructor from config; that will allow us to generalize
}}}
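
As a minimal standalone sketch of what the two layouts imply for the resulting shape (illustrative only; it uses a std::vector<size_t> as a stand-in for TensorShape, which, as the code above implies, lists dimensions with the fastest-varying one first):

#include <cassert>
#include <cstddef>
#include <vector>

// Stand-in for TensorShape: dimensions ordered fastest-varying first.
static std::vector<size_t> ImageLayoutSketch(size_t width, size_t height, size_t channels, bool cudnnCHW)
{
    if (cudnnCHW)
        return { width, height, channels };  // CHW: width varies fastest, channels slowest
    else
        return { channels, width, height };  // HWC (legacy): channels vary fastest
}

int main()
{
    auto chw = ImageLayoutSketch(224, 224, 3, /*cudnnCHW=*/true);   // -> [224, 224, 3]
    auto hwc = ImageLayoutSketch(224, 224, 3, /*cudnnCHW=*/false);  // -> [3, 224, 224]
    assert(chw[2] == 3 && hwc[0] == 3);  // the channel dimension ends up at opposite ends
    return 0;
}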

View file

@@ -4456,17 +4456,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Linear gap-free unary ops happen so regularly that we will eliminate the case statement from the CUDA kernel, and instead expand all.
if (regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && reducingOpDims.size() == 0)
{
if(op==1)fprintf(stderr, "LaunchUnaryTensorOp %d\n", (int)regularOpDims[0]);
////if (op == 1)fprintf(stderr, "LaunchUnaryTensorOp %d\n", (int)regularOpDims[0]);
return LaunchUnaryTensorOp<ElemType>(beta, a.m_pArray + offsets[0], m_pArray + offsets[1], alpha, op, regularOpDims[0]);
if(op==1)fprintf(stderr, "Done LaunchUnaryTensorOp %d\n", (int)regularOpDims[0]);
////if (op == 1)fprintf(stderr, "Done LaunchUnaryTensorOp %d\n", (int)regularOpDims[0]);
}
// regular case
else
{
if(op==1)fprintf(stderr, "TensorOpN<2> %d\n", (int)regularOpDims[0]);
////if (op == 1)fprintf(stderr, "TensorOpN<2> %d\n", (int)regularOpDims[0]);
return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2> { a.m_pArray, m_pArray }, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
if(op==1)fprintf(stderr, "Done TensorOpN<2> %d\n", (int)regularOpDims[0]);
////if (op == 1)fprintf(stderr, "Done TensorOpN<2> %d\n", (int)regularOpDims[0]);
}
}
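
The fast path above fires only when the whole op is a single dense span: one regular dimension, unit stride on both operands, and no reduction. A simplified sketch of that predicate, with the CNTK SmallVector/array argument types replaced by standard containers:

#include <array>
#include <cstddef>
#include <vector>

// Is this a linear, gap-free unary op? (one dimension, unit strides, nothing to reduce over)
static bool IsContiguousUnaryOp(const std::vector<size_t>& regularOpDims,
                                const std::array<std::vector<ptrdiff_t>, 2>& regularStrides,
                                const std::vector<size_t>& reducingOpDims)
{
    return regularOpDims.size() == 1
        && regularStrides[0][0] == 1   // input a is contiguous
        && regularStrides[1][0] == 1   // output is contiguous
        && reducingOpDims.empty();
}

int main()
{
    std::vector<size_t> dims = { 4096 };
    std::array<std::vector<ptrdiff_t>, 2> strides = { std::vector<ptrdiff_t>{ 1 }, std::vector<ptrdiff_t>{ 1 } };
    // true -> take LaunchUnaryTensorOp; false -> fall through to TensorOpN<2>
    return IsContiguousUnaryOp(dims, strides, {}) ? 0 : 1;
}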

View file

@@ -85,10 +85,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// ---------------------------------------------------------------------------
template<class INT, class INT2>
static INT CeilDiv(INT a, INT2 b)
static INT CeilDiv(INT a, INT2 b) // ceil(a/b)
{
if (b == 0) LogicError("CeilDiv a=%d b=%d", (int)a, (int)b); // TODO: delete this once tracked down
return (a + b - 1) / b;
return (INT)(((size_t)a + (size_t)b - 1) / (size_t)b); // these size_t casts are necessary since b may be INT_MAX (for maxGridSize[])
}
struct GridDim
@@ -138,6 +137,10 @@ struct GridDim
std::vector<cudaDeviceProp> props(numDevices);
for (int i = 0; i < numDevices; i++)
CUDA_CALL(cudaGetDeviceProperties(&props[i], i));
#if 1 // on Linux, maxGridSize[0] gets reported as 0
for (int i = 0; i < numDevices; i++)
fprintf(stderr, "%d procs %d warps %d %d %d max grid on %s\n", (int)props[i].multiProcessorCount, (int)props[i].warpSize, (int)props[i].maxGridSize[0], (int)props[i].maxGridSize[1], (int)props[i].maxGridSize[2], props[i].name);
#endif
return props;
}
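
The widened arithmetic matters because CeilDiv is called with props.maxGridSize[0] as the divisor, which can be INT_MAX (and, per the debug block above, is sometimes reported as 0 on Linux). A standalone sketch of the failure mode with illustrative values; since the old expression's signed overflow is undefined behavior, the sketch reproduces the typical two's-complement wrap with unsigned arithmetic instead of triggering it:

#include <climits>
#include <cstddef>
#include <cstdio>

// Fixed version, as in the diff above: widen to size_t before adding so a + b - 1 cannot wrap.
static int CeilDivNew(int a, int b) { return (int)(((size_t)a + (size_t)b - 1) / (size_t)b); }

// What the old (a + b - 1) / b typically computed on two's-complement targets:
// the sum wraps negative, and the division then yields 0 instead of 1.
static int CeilDivOldWrapped(int a, int b)
{
    int wrappedSum = (int)((unsigned)a + (unsigned)b - 1u);  // e.g. 1000 + INT_MAX - 1 wraps to -2147482650
    return wrappedSum / b;
}

int main()
{
    int NN = 1000, maxGridSizeX = INT_MAX;
    printf("new: %d\n", CeilDivNew(NN, maxGridSizeX));         // 1, i.e. ceil(1000 / INT_MAX)
    printf("old: %d\n", CeilDivOldWrapped(NN, maxGridSizeX));  // 0 -- presumably how the removed b == 0 guard got hit downstream
    return 0;
}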

View file

@@ -465,7 +465,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrideVectors,
const SmallVector<size_t> & reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N> & reducingStrideVectors)
{
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// copy all parameters to CUDA-compatible data structures
FixedArray<ElemType*, N> pointers(pointerVector);
SmallVector<C_size_t> regularOpStrideVector; // kernel needs the strides for converting thread index back to multi-dimensional tensor index
@@ -480,13 +480,13 @@ if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
FixedArray<C_unsigned_int, M> reducingOpDims(reducingOpDimVector);
FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// launch the kernel
CUDA_LONG NN = (CUDA_LONG)numElements; // linear space identifying each individual input element
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// do some optimization for reductions
// Cases:
// - #output elements >= GPU procs --> use one proc per element, do reduction in inner loop
@@ -506,12 +506,12 @@ if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
for (C_size_t k = 0; k < reducingOpDimVector.size(); k++)
reductionDim *= (C_size_t)reducingOpDimVector[k];
let & props = GridDim::GetDeviceProps();
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
GridDim grid(NN);
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
if (reductionDim > 1 && grid.m_blocksPerGrid < props.multiProcessorCount /* && NN == 10 && reductionDim <= GridDim::maxThreadsPerBlock*/)
{
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// we are reducing and are underutilizing the multiprocs we have: get more parallelism by doing reduction in parallel
// Change of strategy: All NN elements get their own block. Reduction gets split over blocks as well.
@@ -519,48 +519,55 @@ if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// We increase #blocks by that factor by breaking reduction into that many chunks.
let numReductionChunks = CeilDiv(props.multiProcessorCount, NN);
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)props.maxGridSize[0]);
// NN may be too large for a single dimension
let blockXOverBy = CeilDiv(NN, props.maxGridSize[0]);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)blockXOverBy);
let numBlocksX = CeilDiv(NN, blockXOverBy);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)numBlocksX);
let numBlocksY = CeilDiv(NN, numBlocksX);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
let numBlocksZ = numReductionChunks;
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// Block dim is now:
// - X, Y: such that X*Y covers NN
// - Z: reduction chunks
// reduction goes into thread dim X
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
let reductionChunkSize = CeilDiv(reductionDim, numReductionChunks);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
let numThreadsX = min(reductionChunkSize, GridDim::maxThreadsPerBlock); // any that's over will be done by looping inside the kernel
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
if (beta == 1 || numBlocksZ == 1)
{
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
_launchTensorOpWithReduction<ElemType, N, M, K> << <dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(double), t_stream >> >(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
}
else
{
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// We need more than one chunk, we will use atomicAdd().
// First reset/pre-multiply input; then do the remaining chunks using atomicAdd().
_launchTensorOpWithReduction<ElemType, N, M, K> << <dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(double), t_stream >> >(beta, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
_launchTensorOpWithReduction<ElemType, N, M, K> << <dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(double), t_stream >> >(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize);
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
}
}
else
{
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
// we got enough elements to generate: do one element per thread, and reduction inside
_launchTensorOp<ElemType, N, M, K> << <grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >> >(beta, pointers, alpha, op, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
}
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
}
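
To make the grid decomposition above concrete, here is a standalone sketch that just reruns the arithmetic with illustrative numbers (10 output elements, a reduction over 4096 elements, 24 multiprocessors, maxGridSize[0] == INT_MAX, and an assumed 512 threads per block; the real values come from cudaDeviceProp and GridDim):

#include <algorithm>
#include <climits>
#include <cstddef>
#include <cstdio>

static int CeilDiv(int a, int b) { return (int)(((size_t)a + (size_t)b - 1) / (size_t)b); }  // ceil(a/b), widened as in the fix

int main()
{
    int NN = 10;                  // output elements: few, so the multiprocs are underutilized
    int reductionDim = 4096;      // elements reduced into each output
    int multiProcessorCount = 24;
    int maxGridSizeX = INT_MAX;   // the divisor that used to overflow the old CeilDiv
    int maxThreadsPerBlock = 512; // assumed value for this sketch

    int numReductionChunks = CeilDiv(multiProcessorCount, NN);          // 3: split the reduction to occupy all procs
    int blockXOverBy       = CeilDiv(NN, maxGridSizeX);                 // 1: NN fits into grid dimension X
    int numBlocksX         = CeilDiv(NN, blockXOverBy);                 // 10
    int numBlocksY         = CeilDiv(NN, numBlocksX);                   // 1: X * Y together cover the NN outputs
    int numBlocksZ         = numReductionChunks;                        // 3: one Z slice per reduction chunk
    int reductionChunkSize = CeilDiv(reductionDim, numReductionChunks); // 1366 reduction elements per chunk
    int numThreadsX = std::min(reductionChunkSize, maxThreadsPerBlock); // 512; anything over is looped inside the kernel

    printf("grid = (%d, %d, %d), threads/block = %d, chunk = %d\n",
           numBlocksX, numBlocksY, numBlocksZ, numThreadsX, reductionChunkSize);
    return 0;
}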
// -----------------------------------------------------------------------
@@ -603,7 +610,7 @@ if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
template<class ElemType>
void LaunchUnaryTensorOp(ElemType beta, const ElemType * pa, ElemType * pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim)
{
if (op == 1)fprintf(stderr, "LaunchUnaryTensorOp: %d", (int)__LINE__);
//////if (op == 1)fprintf(stderr, "LaunchUnaryTensorOp: %d", (int)__LINE__);
CUDA_LONG NN = (CUDA_LONG)regularOpDim;
#define CaseLaunchUnaryTensorOp(oper) case ElementWiseOperator::op ## oper: \

View file

@@ -27,6 +27,8 @@ Using parallel sequences (difference to above: nbruttsineachrecurrentiter=4). No
COMMAND: currentDirectory=$(SolutionDir)Tests\EndToEndTests\Speech\Data configFile=$(SolutionDir)Tests\EndToEndTests\Speech\LSTM\cntk.config stderr=$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance\models\cntkSpeech.dnn.log RunDir=$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance NdlDir=$(SolutionDir)Tests\EndToEndTests\Speech\LSTM DataDir=. DeviceId=auto Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=4]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[learningRatesPerMB=0.125]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] makeMode=false
Linux: bin/cntk currentDirectory=Tests/EndToEndTests/Speech/Data configFile=../LSTM/cntk.config stderr=../RunDir/LSTM/Truncated/models/cntkSpeech.dnn.log RunDir=../RunDir/LSTM/Truncated NdlDir=../LSTM DataDir=. DeviceId=auto Truncated=false 'speechTrain=[reader=[nbruttsineachrecurrentiter=4]]' 'speechTrain=[SGD=[epochSize=2560]]' 'speechTrain=[SGD=[learningRatesPerMB=0.125]]' 'speechTrain=[SGD=[maxEpochs=2]]' 'speechTrain=[SGD=[numMBsToShowResult=1]]' makeMode=false
Using full BrainScript configuration
COMMAND: --cd $(SolutionDir)Tests\EndToEndTests\Speech\Data -f $(SolutionDir)Tests\EndToEndTests\Speech\LSTM\lstm.bs -D stderr='$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance\models\cntkSpeech.dnn.log' -D RunDir='$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance' -D NdlDir='$(SolutionDir)Tests\EndToEndTests\Speech\LSTM' -D DataDir='.' -D DeviceId='Auto' -D Truncated=false -D speechTrain=[reader=[nbruttsineachrecurrentiter=1];SGD=[epochSize=2560;maxEpochs=2;numMBsToShowResult=1]] -D makeMode=false