(deleted some obsolete debug code)
This commit is contained in:
Parent: 9372b6afdd
Commit: d11532cf69
@@ -4470,6 +4470,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         return LaunchUnaryTensorOp<ElemType>(beta, a.m_pArray + offsets[0], m_pArray + offsets[1], alpha, op, regularOpDims[0]);

     // special case: reducing a matrix onto a column vector; can be done with SGEMM
+    // Note: A minor risk is that with this, our own reduction function will rarely be used.
+    // That function was tested to give the same results with 'double', and nearly the same with 'float' (different summation order matters).
     else if (op == ElementWiseOperator::opCopy &&                                                    // we are just adding to target without any further operation
              regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 &&  // we are processing a column
              reducingOpDims.size() == 1 && reducingStrides[0][0] >= (ptrdiff_t)regularOpDims[0])     // reducing across columns and no overlap
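For context on the special case above: summing an m-by-n column-major matrix across its columns into a column vector is the same as multiplying it by a length-n vector of ones, which is why this pattern can be handed to SGEMM instead of the launcher's own reduction kernel. Below is a minimal, self-contained cuBLAS sketch of that equivalence; it is only an illustration, not the wrapper CNTK actually calls, and error checking is omitted for brevity.

    // Sketch: y = alpha * (sum of A's columns) + beta * y, expressed as SGEMM with a ones vector.
    // Assumes column-major storage (cuBLAS convention); sizes are made up for the example.
    #include <cublas_v2.h>
    #include <cuda_runtime.h>
    #include <cstdio>
    #include <vector>

    int main()
    {
        const int m = 3, n = 4;                                 // A is m x n, y is m x 1
        std::vector<float> hA(m * n), hOnes(n, 1.0f), hY(m, 0.0f);
        for (int i = 0; i < m * n; i++) hA[i] = (float)i;

        float *dA, *dOnes, *dY;
        cudaMalloc(&dA, m * n * sizeof(float));
        cudaMalloc(&dOnes, n * sizeof(float));
        cudaMalloc(&dY, m * sizeof(float));
        cudaMemcpy(dA, hA.data(), m * n * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(dOnes, hOnes.data(), n * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(dY, hY.data(), m * sizeof(float), cudaMemcpyHostToDevice);

        cublasHandle_t handle;
        cublasCreate(&handle);
        const float alpha = 1.0f, beta = 0.0f;
        // y(m x 1) = alpha * A(m x n) * ones(n x 1) + beta * y, i.e. reduce A across columns
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha, dA, m, dOnes, n, &beta, dY, m);

        cudaMemcpy(hY.data(), dY, m * sizeof(float), cudaMemcpyDeviceToHost);
        for (int i = 0; i < m; i++) printf("y[%d] = %g\n", i, hY[i]);

        cublasDestroy(handle);
        cudaFree(dA); cudaFree(dOnes); cudaFree(dY);
        return 0;
    }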
@@ -468,7 +468,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                                     const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, N> & regularStrideVectors,
                                     const SmallVector<size_t> & reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N> & reducingStrideVectors)
     {
-        //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         // copy all parameters to CUDA-compatible data structures
         FixedArray<ElemType*, N> pointers(pointerVector);
         SmallVector<C_size_t> regularOpStrideVector;    // kernel needs the strides for converting thread index back to multi-dimensional tensor index
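The regularOpStrideVector built here is what lets the kernel turn a flat thread id back into a multi-dimensional tensor index, from which each operand's memory offset follows through that operand's own stride table. A small host-side sketch of that index arithmetic, with made-up dims and strides standing in for the FixedArray/FixedMatrix parameters:

    // Sketch: decompose a linear element id into per-dimension indices using the
    // cumulative "op strides" (products of the operation's dims), then map those
    // indices to each operand's memory offset via that operand's strides.
    #include <array>
    #include <cstddef>
    #include <cstdio>

    int main()
    {
        // hypothetical 3-D op of shape 2 x 3 x 4 (24 elements)
        const std::array<std::size_t, 3> opDims = { 2, 3, 4 };
        std::array<std::size_t, 3> opStrides;           // opStrides[k] = product of dims before k
        std::size_t p = 1;
        for (std::size_t k = 0; k < opDims.size(); k++) { opStrides[k] = p; p *= opDims[k]; }

        // one operand stored densely, another with larger strides (e.g. a sub-view)
        const std::array<std::ptrdiff_t, 3> strideA = { 1, 2, 6 };
        const std::array<std::ptrdiff_t, 3> strideB = { 1, 16, 48 };

        for (std::size_t id = 0; id < p; id++)          // 'id' plays the role of the thread index
        {
            std::ptrdiff_t offA = 0, offB = 0;
            std::size_t rest = id;
            for (std::size_t k = opDims.size(); k-- > 0; )  // peel off the highest dimension first
            {
                std::size_t index = rest / opStrides[k];    // index along dimension k
                rest              = rest % opStrides[k];
                offA += (std::ptrdiff_t)index * strideA[k];
                offB += (std::ptrdiff_t)index * strideB[k];
            }
            if (id < 3)
                printf("id=%zu -> offsets A=%td B=%td\n", id, offA, offB);
        }
        return 0;
    }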
@@ -483,13 +482,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         FixedArray<C_unsigned_int, M> reducingOpDims(reducingOpDimVector);
         FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);

-        //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         // launch the kernel
         CUDA_LONG NN = (CUDA_LONG)numElements;      // linear space identifying each individual input element
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));

-        //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         // do some optimization for reductions
         // Cases:
         //  - #output elements >= GPU procs  -->  use one proc per element, do reduction in inner loop
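The do_sync branches above bracket the kernel launch with a CUDA event so the host can block until the kernel has finished (handy for catching launch failures right at the call site). A stripped-down sketch of that pattern, with a dummy kernel standing in for the tensor op and a plain check() replacing CUDA_CALL:

    // Sketch: event-based synchronization around a kernel launch, mirroring the
    // do_sync branches in the launcher above.
    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    __global__ void dummyKernel(float* p) { p[threadIdx.x] += 1.0f; }

    static void check(cudaError_t rc)
    {
        if (rc != cudaSuccess) { printf("%s\n", cudaGetErrorString(rc)); std::exit(1); }
    }

    int main()
    {
        const bool do_sync = true;                  // stands in for the launcher's do_sync flag
        float* d = nullptr;
        check(cudaMalloc(&d, 32 * sizeof(float)));
        check(cudaMemset(d, 0, 32 * sizeof(float)));

        cudaEvent_t done = nullptr;
        if (do_sync) check(cudaEventCreate(&done));

        dummyKernel<<<1, 32>>>(d);                  // stand-in for _launchTensorOpWithReduction

        if (do_sync) check(cudaEventRecord(done));
        if (do_sync) check(cudaEventSynchronize(done)); // host waits until the kernel has completed
        if (do_sync) check(cudaEventDestroy(done));

        check(cudaFree(d));
        return 0;
    }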
@@ -509,12 +506,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         for (C_size_t k = 0; k < reducingOpDimVector.size(); k++)
             reductionDim *= (C_size_t)reducingOpDimVector[k];
         let & props = GridDim::GetDeviceProps();
-        //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         GridDim grid(NN);
-        //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         if (reductionDim > 1 && grid.m_blocksPerGrid < props.multiProcessorCount /* && NN == 10 && reductionDim <= GridDim::maxThreadsPerBlock*/)
         {
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             // we are reducing and are underutilizing the multiprocs we have: get more parallelism by doing reduction in parallel
             // Change of strategy: All NN elements get their own block. Reduction gets split over blocks as well.

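The branch above is the heuristic that picks a strategy: if there is a real reduction (reductionDim > 1) but so few output elements that the default grid would leave most multiprocessors idle, the reduction itself gets parallelized across blocks. A toy calculation of that check, using made-up device numbers (24 SMs, 512-thread blocks):

    // Sketch: the underutilization check that decides between "one thread per output
    // element, reduce in a loop" and "split the reduction itself across blocks".
    #include <cstdio>

    int main()
    {
        const unsigned multiProcessorCount = 24;        // props.multiProcessorCount (made up)
        const unsigned maxThreadsPerBlock  = 512;       // GridDim-style default block size (made up)
        const unsigned NN                  = 10;        // # output elements (tiny)
        const unsigned reductionDim        = 4096;      // # inputs reduced into each output

        const unsigned blocksPerGrid = (NN + maxThreadsPerBlock - 1) / maxThreadsPerBlock;  // = 1

        if (reductionDim > 1 && blocksPerGrid < multiProcessorCount)
            printf("only %u block(s) for %u SMs -> parallelize the reduction as well\n",
                   blocksPerGrid, multiProcessorCount);
        else
            printf("enough output elements -> one thread per element, reduce inside the thread\n");
        return 0;
    }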
@@ -522,55 +516,39 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             // We increase #blocks by that factor by breaking reduction into that many chunks.
             let numReductionChunks = CeilDiv(props.multiProcessorCount, NN);

-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)props.maxGridSize[0]);
             // NN may be too large for a single dimension
             let blockXOverBy = CeilDiv(NN, props.maxGridSize[0]);
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)blockXOverBy);
             let numBlocksX = CeilDiv(NN, blockXOverBy);
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d %d", (int)__LINE__, (int)numBlocksX);
             let numBlocksY = CeilDiv(NN, numBlocksX);
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             let numBlocksZ = numReductionChunks;
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             // Block dim is now:
             //  - X, Y: such that X*Y covers NN
             //  - Z: reduction chunks

             // reduction goes into thread dim X
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             let reductionChunkSize = CeilDiv(reductionDim, numReductionChunks);
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             let numThreadsX = min(reductionChunkSize, GridDim::maxThreadsPerBlock);    // any that's over will be done by looping inside the kernel
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);

-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             if (beta == 1 || numBlocksZ == 1)
             {
-                //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
                 _launchTensorOpWithReduction<ElemType, N, M, K> <<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
-                //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             }
             else
             {
-                //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
                 // We need more than one chunk, we will use atomicAdd().
                 // First reset/pre-multiply input; then do the remaining chunks using atomicAdd().
                 _launchTensorOpWithReduction<ElemType, N, M, K> <<<dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
                 _launchTensorOpWithReduction<ElemType, N, M, K> <<<dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize);
-                //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             }
         }
         else
         {
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
             // we got enough elements to generate: do one element per thread, and reduction inside
             _launchTensorOp<ElemType, N, M, K> <<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
-            //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
         }
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
-        //////if (op == 1)fprintf(stderr, "LaunchTensorOpWithReduction: %d", (int)__LINE__);
     }

     // -----------------------------------------------------------------------
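To make the grid-shaping arithmetic in this last hunk concrete, here is a hedged host-side walk-through using the same made-up device numbers as before (24 SMs, 512-thread blocks, 65535 as the grid's x-dimension limit); CeilDiv mirrors the helper used by the launcher:

    // Sketch: the grid-shaping arithmetic used when the reduction is split across blocks.
    #include <algorithm>
    #include <cstdio>

    static unsigned CeilDiv(unsigned a, unsigned b) { return (a + b - 1) / b; }

    int main()
    {
        const unsigned multiProcessorCount = 24;        // made-up device properties
        const unsigned maxThreadsPerBlock  = 512;
        const unsigned maxGridSizeX        = 65535;     // props.maxGridSize[0] on older GPUs
        const unsigned NN                  = 10;        // # output elements
        const unsigned reductionDim        = 4096;      // # inputs per output

        // break the reduction into enough chunks to occupy all SMs
        const unsigned numReductionChunks = CeilDiv(multiProcessorCount, NN);           // 3
        // NN may exceed the x-dimension limit, so spread it over x and y
        const unsigned blockXOverBy = CeilDiv(NN, maxGridSizeX);                        // 1
        const unsigned numBlocksX   = CeilDiv(NN, blockXOverBy);                        // 10
        const unsigned numBlocksY   = CeilDiv(NN, numBlocksX);                          // 1
        const unsigned numBlocksZ   = numReductionChunks;                               // 3
        // each block reduces one chunk of the reduction dimension in thread dim x
        const unsigned reductionChunkSize = CeilDiv(reductionDim, numReductionChunks);  // 1366
        const unsigned numThreadsX = std::min(reductionChunkSize, maxThreadsPerBlock);  // 512

        printf("grid = (%u, %u, %u), block = (%u, 1, 1), chunk = %u\n",
               numBlocksX, numBlocksY, numBlocksZ, numThreadsX, reductionChunkSize);
        // With numBlocksZ > 1 and beta != 1, the launcher above issues two kernels:
        // z-chunk 0 applies beta to the output once, and the remaining z-chunks
        // accumulate their partial sums into it with atomicAdd().
        return 0;
    }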