Minor changes in ImageReader and TopK eval.
Parent: 7b751bf813
Commit: 3129a35ed0
@@ -393,7 +393,7 @@ bool ImageReader<ElemType>::GetMinibatch(std::map<std::wstring, Matrix<ElemType>
    std::fill(m_labBuf.begin(), m_labBuf.end(), static_cast<ElemType>(0));

//#pragma omp parallel for ordered schedule(dynamic)
#pragma omp parallel for ordered schedule(dynamic)
    for (long long i = 0; i < static_cast<long long>(mbLim - m_mbStart); i++)
    {
        const auto& p = files[i + m_mbStart];
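The only functional change in this hunk is that the OpenMP pragma is now active, so the per-file decode loop in GetMinibatch runs multi-threaded. Below is a minimal standalone sketch of the same pattern, with hypothetical DecodeImage/AppendToMinibatch helpers standing in for the reader's real per-file work (not CNTK's actual code): dynamic scheduling load-balances images whose decode cost varies, while an ordered section keeps any order-sensitive step in file order.

#include <omp.h>
#include <vector>
#include <cstdio>

// Hypothetical stand-ins for the reader's per-file work; only the pragma
// pattern matches the change above.
static int DecodeImage(int fileId)        { return fileId * 2; }           // independent, parallel-safe
static void AppendToMinibatch(int sample) { std::printf("%d ", sample); }  // must keep file order

int main()
{
    const long long start = 0, lim = 8;
    std::vector<int> files = {0, 1, 2, 3, 4, 5, 6, 7};

    // Same shape as the enabled pragma: dynamic scheduling balances uneven
    // decode times, 'ordered' lets the sequential part keep the file order.
#pragma omp parallel for ordered schedule(dynamic)
    for (long long i = 0; i < lim - start; i++)
    {
        int sample = DecodeImage(files[i + start]); // heavy, independent work
#pragma omp ordered
        AppendToMinibatch(sample);                  // order-preserving section
    }
    std::printf("\n");
    return 0;
}
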
@@ -22,39 +22,6 @@
#include "GPUSparseMatrix.h"
#include <iostream> // for cout

// REVIEW alexeyk: disable warnings properly for GCC/clang
//#ifdef _MSC_VER
//#pragma warning (push)
//#pragma warning (disable: 4100)
//#pragma warning (disable: 4127)
//#pragma warning (disable: 4201)
//#pragma warning (disable: 4244)
//#pragma warning (disable: 4267)
//#pragma warning (disable: 4324)
//#pragma warning (disable: 4510)
//#pragma warning (disable: 4512)
//#pragma warning (disable: 4515)
//#pragma warning (disable: 4610)
//#endif
//#include <thrust/device_ptr.h>
//#include <thrust/sort.h>
//#ifdef _MSC_VER
//#pragma warning (pop)
//#endif

// REVIEW alexeyk: disable warnings properly for GCC/clang
#ifdef _MSC_VER
#pragma warning (push)
#pragma warning (disable: 4100)
#pragma warning (disable: 4127)
#pragma warning (disable: 4201)
#pragma warning (disable: 4515)
#endif
#include <cub/cub.cuh>
#ifdef _MSC_VER
#pragma warning (pop)
#endif

#pragma comment (lib, "cudart.lib") // instruct linker to reference these libs
#pragma comment (lib, "cublas.lib")
#pragma comment (lib, "cusparse.lib")
@@ -2971,7 +2938,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    }

    __global__ void _initIndicesForSort(uint64_t* indexes, CUDA_LONG crow, CUDA_LONG ccol)
    {
        CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
@@ -2982,19 +2949,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        indexes[id] = (static_cast<uint64_t>(irow) << 32) | icol;
    }

    template<class ElemType>
    __global__ void _copyTopKResults(uint64_t* indexes, ElemType* values, ElemType* maxIndexes, ElemType* maxValues,
        CUDA_LONG crow, CUDA_LONG ccol, int topK)
    {
        CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
        if (id >= topK * ccol)
            return;
        CUDA_LONG irow = id % topK;
        CUDA_LONG icol = id / topK;
        maxIndexes[id] = static_cast<CUDA_LONG>(indexes[icol * crow + irow] >> 32);
        maxValues[id] = values[icol * crow + irow];
    }

    template<class ElemType>
    void GPUMatrix<ElemType>::VectorMax(GPUMatrix<ElemType>& maxIndexes, GPUMatrix<ElemType>& maxValues, const bool isColWise, int topK, GPUMatrix<ElemType>& workspace) const
    {
@@ -3123,35 +3077,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    }

    template<int BlockSize, class ElemType>
    __global__ void _assignNumOfDiffCol(const ElemType *a, const ElemType *b, ElemType *c, CUDA_LONG crowB, CUDA_LONG ccol)
    {
        assert(gridDim.x == 1 && gridDim.y == 1 && gridDim.z == 1);

        using BlockReduceT = cub::BlockReduce<int, BlockSize>;
        __shared__ typename BlockReduceT::TempStorage tmp;

        int cur = 0;
        CUDA_LONG icol = threadIdx.x;
        for (; icol < ccol; icol += blockDim.x)
        {
            ElemType key = a[icol];
            CUDA_LONG idxB = icol * crowB;
            CUDA_LONG irow = 0;
            for (; irow < crowB; irow++, idxB++)
            {
                if (b[idxB] == key)
                    break;
            }

            cur += (irow == crowB);
        }

        int res = BlockReduceT(tmp).Sum(cur);
        if (threadIdx.x == 0)
            *c = res;
    }

    template<class ElemType>
    GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignNumOfDiff(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, bool searchInCol)
    {
@@ -13,6 +13,19 @@
#include "CommonMatrix.h"
#include "device_functions.h"

// REVIEW alexeyk: disable warnings properly for GCC/clang
#ifdef _MSC_VER
#pragma warning (push)
#pragma warning (disable: 4100)
#pragma warning (disable: 4127)
#pragma warning (disable: 4201)
#pragma warning (disable: 4515)
#endif
#include <cub/cub.cuh>
#ifdef _MSC_VER
#pragma warning (pop)
#endif

// We would like to use 64-bit integer to support large matrices. However, CUDA seems to support only 32-bit integer
// For now, use int32_t to ensure that both Linux and Windows see this as 32 bit integer type.
@@ -4502,4 +4515,47 @@ __global__ void _AssignSequenceError(const ElemType hsmoothingWeight, ElemType *
    //error[id] -= alpha * (label[id] - dnnoutput[id] );

}

template<class ElemType>
__global__ void _copyTopKResults(const uint64_t* indexes, const ElemType* values, ElemType* maxIndexes, ElemType* maxValues,
    CUDA_LONG crow, CUDA_LONG ccol, int topK)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= topK * ccol)
        return;
    CUDA_LONG irow = id % topK;
    CUDA_LONG icol = id / topK;
    maxIndexes[id] = static_cast<CUDA_LONG>(indexes[icol * crow + irow] >> 32);
    maxValues[id] = values[icol * crow + irow];
}
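A note on the kernel just added: the high 32 bits of each packed index carry the original row (class) number, written by _initIndicesForSort, so once the (value, index) pairs of a column are sorted in descending order the first topK entries of each column are the top-K predictions; the kernel simply unpacks them into maxIndexes/maxValues. The following is a hedged host-side sketch of how it might be launched; the wrapper name, buffer names, and 128-thread block size are illustrative assumptions, not the exact VectorMax code.

#include <cuda_runtime.h>

// Hedged sketch: one thread per output element (topK rows for each of the
// ccol columns). CUDA_LONG is CNTK's 32-bit index type from CommonMatrix.h.
template <class ElemType>
void CopyTopK(const uint64_t* sortedIndexes, const ElemType* sortedValues,
              ElemType* maxIndexes, ElemType* maxValues,
              CUDA_LONG crow, CUDA_LONG ccol, int topK, cudaStream_t stream)
{
    const CUDA_LONG cElems = topK * ccol;
    const int threads = 128;                                          // assumed block size
    const int blocks = static_cast<int>((cElems + threads - 1) / threads);
    _copyTopKResults<ElemType><<<blocks, threads, 0, stream>>>(
        sortedIndexes, sortedValues, maxIndexes, maxValues, crow, ccol, topK);
}
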

template<int BlockSize, class ElemType>
__global__ void _assignNumOfDiffCol(const ElemType *a, const ElemType *b, ElemType *c, CUDA_LONG crowB, CUDA_LONG ccol)
{
    assert(gridDim.x == 1 && gridDim.y == 1 && gridDim.z == 1);

    int cur = 0;
    CUDA_LONG icol = threadIdx.x;
    for (; icol < ccol; icol += blockDim.x)
    {
        ElemType key = a[icol];
        CUDA_LONG idxB = icol * crowB;
        CUDA_LONG irow = 0;
        for (; irow < crowB; irow++, idxB++)
        {
            if (b[idxB] == key)
                break;
        }

        cur += (irow == crowB);
    }

    using BlockReduceT = cub::BlockReduce<int, BlockSize>;
    __shared__ typename BlockReduceT::TempStorage tmp;

    int res = BlockReduceT(tmp).Sum(cur);
    if (threadIdx.x == 0)
        *c = res;
}
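This kernel carries the actual TopK error count: each column of b holds crowB candidate values (here, the topK predicted class indexes for one sample), a[icol] is the value to look for (the ground-truth index), and a column contributes an error only when that value is not found in the column; the per-thread tallies are then combined with a single cub::BlockReduce, so the kernel is meant to run as exactly one block (hence the gridDim assert). A hedged launch sketch follows; the wrapper name and the 1024-thread block size are assumptions rather than CNTK's exact call site.

#include <cuda_runtime.h>

// Hedged sketch of the expected launch: exactly one block, with the block
// size baked in as the BlockSize template parameter of the kernel above.
template <class ElemType>
void CountTopKErrors(const ElemType* labels,      // 1 x ccol ground-truth indexes
                     const ElemType* topKPreds,   // topK x ccol predicted indexes
                     ElemType* errorCount,        // single-element device output
                     CUDA_LONG topK, CUDA_LONG ccol, cudaStream_t stream)
{
    constexpr int BlockSize = 1024;               // assumed; must match blockDim.x
    _assignNumOfDiffCol<BlockSize, ElemType><<<1, BlockSize, 0, stream>>>(
        labels, topKPreds, errorCount, topK, ccol);
}
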

#endif // !CPUONLY