Minor changes in ImageReader and TopK eval.

Alexey Kamenev 2015-09-16 12:35:21 -07:00
Parent 7b751bf813
Commit 3129a35ed0
3 changed files with 58 additions and 77 deletions

View file

@@ -393,7 +393,7 @@ bool ImageReader<ElemType>::GetMinibatch(std::map<std::wstring, Matrix<ElemType>
std::fill(m_labBuf.begin(), m_labBuf.end(), static_cast<ElemType>(0));
//#pragma omp parallel for ordered schedule(dynamic)
#pragma omp parallel for ordered schedule(dynamic)
for (long long i = 0; i < static_cast<long long>(mbLim - m_mbStart); i++)
{
    const auto& p = files[i + m_mbStart];
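The hunk above simply uncomments the OpenMP pragma, so the decode loop over minibatch files now runs in parallel. Below is a minimal, self-contained sketch of the same pattern with illustrative names (not ImageReader's actual members); the diff's "ordered" clause is omitted here since the sketch's iterations are fully independent.

#include <omp.h>
#include <cstdio>
#include <string>
#include <vector>

// Sketch of the pattern the hunk enables: a dynamically scheduled parallel
// loop over a range of files. The signed long long counter mirrors the loop
// in the diff above, as OpenMP for-loops require a signed integral index.
int main()
{
    std::vector<std::string> files = { "a.jpg", "b.jpg", "c.jpg", "d.jpg" };
    long long mbStart = 0, mbLim = static_cast<long long>(files.size());

#pragma omp parallel for schedule(dynamic)
    for (long long i = 0; i < mbLim - mbStart; i++)
    {
        const auto& p = files[i + mbStart];
        // Decode/transform the image here; each iteration is independent.
        std::printf("thread %d processing %s\n", omp_get_thread_num(), p.c_str());
    }
    return 0;
}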

View file

@@ -22,39 +22,6 @@
#include "GPUSparseMatrix.h"
#include <iostream> // for cout
// REVIEW alexeyk: disable warnings properly for GCC/clang
//#ifdef _MSC_VER
//#pragma warning (push)
//#pragma warning (disable: 4100)
//#pragma warning (disable: 4127)
//#pragma warning (disable: 4201)
//#pragma warning (disable: 4244)
//#pragma warning (disable: 4267)
//#pragma warning (disable: 4324)
//#pragma warning (disable: 4510)
//#pragma warning (disable: 4512)
//#pragma warning (disable: 4515)
//#pragma warning (disable: 4610)
//#endif
//#include <thrust/device_ptr.h>
//#include <thrust/sort.h>
//#ifdef _MSC_VER
//#pragma warning (pop)
//#endif
// REVIEW alexeyk: disable warnings properly for GCC/clang
#ifdef _MSC_VER
#pragma warning (push)
#pragma warning (disable: 4100)
#pragma warning (disable: 4127)
#pragma warning (disable: 4201)
#pragma warning (disable: 4515)
#endif
#include <cub/cub.cuh>
#ifdef _MSC_VER
#pragma warning (pop)
#endif
#pragma comment (lib, "cudart.lib") // instruct linker to reference these libs
#pragma comment (lib, "cublas.lib")
#pragma comment (lib, "cusparse.lib")
@@ -2971,7 +2938,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
__global__ void _initIndicesForSort(uint64_t* indexes, CUDA_LONG crow, CUDA_LONG ccol)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
@@ -2982,19 +2949,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    indexes[id] = (static_cast<uint64_t>(irow) << 32) | icol;
}
template<class ElemType>
__global__ void _copyTopKResults(uint64_t* indexes, ElemType* values, ElemType* maxIndexes, ElemType* maxValues,
                                 CUDA_LONG crow, CUDA_LONG ccol, int topK)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= topK * ccol)
        return;
    CUDA_LONG irow = id % topK;
    CUDA_LONG icol = id / topK;
    maxIndexes[id] = static_cast<CUDA_LONG>(indexes[icol * crow + irow] >> 32);
    maxValues[id] = values[icol * crow + irow];
}
template<class ElemType>
void GPUMatrix<ElemType>::VectorMax(GPUMatrix<ElemType>& maxIndexes, GPUMatrix<ElemType>& maxValues, const bool isColWise, int topK, GPUMatrix<ElemType>& workspace) const
{
@@ -3123,35 +3077,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
template<int BlockSize, class ElemType>
__global__ void _assignNumOfDiffCol(const ElemType *a, const ElemType *b, ElemType *c, CUDA_LONG crowB, CUDA_LONG ccol)
{
    assert(gridDim.x == 1 && gridDim.y == 1 && gridDim.z == 1);
    using BlockReduceT = cub::BlockReduce<int, BlockSize>;
    __shared__ typename BlockReduceT::TempStorage tmp;
    int cur = 0;
    CUDA_LONG icol = threadIdx.x;
    for (; icol < ccol; icol += blockDim.x)
    {
        ElemType key = a[icol];
        CUDA_LONG idxB = icol * crowB;
        CUDA_LONG irow = 0;
        for (; irow < crowB; irow++, idxB++)
        {
            if (b[idxB] == key)
                break;
        }
        cur += (irow == crowB);
    }
    int res = BlockReduceT(tmp).Sum(cur);
    if (threadIdx.x == 0)
        *c = res;
}
template<class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignNumOfDiff(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, bool searchInCol)
{

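The two kernels deleted above, _copyTopKResults and _assignNumOfDiffCol, are not dropped: they reappear in the kernel header shown next (with const-qualified inputs on _copyTopKResults). For reference, here is a minimal host-side sketch of the 64-bit packing scheme that _initIndicesForSort and _copyTopKResults share, assuming only what the kernels above show: row in the high 32 bits, column in the low 32, so the pair survives a value sort as a single 64-bit payload.

#include <cassert>
#include <cstdint>

// Pack a (row, col) pair into one uint64_t, as _initIndicesForSort does.
inline uint64_t packRowCol(uint32_t irow, uint32_t icol)
{
    return (static_cast<uint64_t>(irow) << 32) | icol;
}

// Recover the parts; _copyTopKResults uses the `key >> 32` form for the row.
inline uint32_t unpackRow(uint64_t key) { return static_cast<uint32_t>(key >> 32); }
inline uint32_t unpackCol(uint64_t key) { return static_cast<uint32_t>(key & 0xFFFFFFFFu); }

int main()
{
    uint64_t key = packRowCol(7, 3);
    assert(unpackRow(key) == 7);
    assert(unpackCol(key) == 3);
    return 0;
}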
View file

@@ -13,6 +13,19 @@
#include "CommonMatrix.h"
#include "device_functions.h"
// REVIEW alexeyk: disable warnings properly for GCC/clang
#ifdef _MSC_VER
#pragma warning (push)
#pragma warning (disable: 4100)
#pragma warning (disable: 4127)
#pragma warning (disable: 4201)
#pragma warning (disable: 4515)
#endif
#include <cub/cub.cuh>
#ifdef _MSC_VER
#pragma warning (pop)
#endif
// We would like to use 64-bit integer to support large matrices. However, CUDA seems to support only 32-bit integer
// For now, use int32_t to ensure that both Linux and Windows see this as 32 bit integer type.
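The comment above motivates pinning the index type to 32 bits. Purely as an illustration of that choice (CNTK's actual CUDA_LONG definition lives in its own headers, not here), such a definition could look like:

#include <cstdint>

// Illustrative only: a 32-bit index type that reads the same on Linux
// (where long is 64-bit) and Windows (where long is 32-bit).
typedef int32_t CUDA_LONG;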
@@ -4502,4 +4515,47 @@ __global__ void _AssignSequenceError(const ElemType hsmoothingWeight, ElemType *
//error[id] -= alpha * (label[id] - dnnoutput[id] );
}
template<class ElemType>
__global__ void _copyTopKResults(const uint64_t* indexes, const ElemType* values, ElemType* maxIndexes, ElemType* maxValues,
                                 CUDA_LONG crow, CUDA_LONG ccol, int topK)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= topK * ccol)
        return;
    CUDA_LONG irow = id % topK;
    CUDA_LONG icol = id / topK;
    maxIndexes[id] = static_cast<CUDA_LONG>(indexes[icol * crow + irow] >> 32);
    maxValues[id] = values[icol * crow + irow];
}
template<int BlockSize, class ElemType>
__global__ void _assignNumOfDiffCol(const ElemType *a, const ElemType *b, ElemType *c, CUDA_LONG crowB, CUDA_LONG ccol)
{
    assert(gridDim.x == 1 && gridDim.y == 1 && gridDim.z == 1);
    int cur = 0;
    CUDA_LONG icol = threadIdx.x;
    for (; icol < ccol; icol += blockDim.x)
    {
        ElemType key = a[icol];
        CUDA_LONG idxB = icol * crowB;
        CUDA_LONG irow = 0;
        for (; irow < crowB; irow++, idxB++)
        {
            if (b[idxB] == key)
                break;
        }
        cur += (irow == crowB);
    }
    using BlockReduceT = cub::BlockReduce<int, BlockSize>;
    __shared__ typename BlockReduceT::TempStorage tmp;
    int res = BlockReduceT(tmp).Sum(cur);
    if (threadIdx.x == 0)
        *c = res;
}
#endif // !CPUONLY
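_assignNumOfDiffCol above leans on cub::BlockReduce to sum the per-thread mismatch counts inside a single block. The following standalone sketch shows that same reduction pattern, runnable on its own with only the CUB header included above; all names here are illustrative, not part of CNTK.

#include <cub/cub.cuh>
#include <cstdio>
#include <vector>

// Each thread accumulates a private partial sum over a strided range, then
// the block reduces the partials; only thread 0 receives the valid total.
template <int BlockSize>
__global__ void blockSumKernel(const int* in, int n, int* out)
{
    using BlockReduceT = cub::BlockReduce<int, BlockSize>;
    __shared__ typename BlockReduceT::TempStorage tmp;

    int partial = 0;
    for (int i = threadIdx.x; i < n; i += BlockSize)
        partial += in[i];

    int total = BlockReduceT(tmp).Sum(partial); // valid in thread 0 only
    if (threadIdx.x == 0)
        *out = total;
}

int main()
{
    const int n = 1000;
    std::vector<int> host(n, 1); // all ones, so the expected sum is n
    int *dIn, *dOut;
    cudaMalloc(&dIn, n * sizeof(int));
    cudaMalloc(&dOut, sizeof(int));
    cudaMemcpy(dIn, host.data(), n * sizeof(int), cudaMemcpyHostToDevice);

    // A single block does the whole reduction, like _assignNumOfDiffCol.
    blockSumKernel<256><<<1, 256>>>(dIn, n, dOut);

    int result = 0;
    cudaMemcpy(&result, dOut, sizeof(int), cudaMemcpyDeviceToHost);
    std::printf("sum = %d (expected %d)\n", result, n);
    cudaFree(dIn);
    cudaFree(dOut);
    return 0;
}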