CNTK/Source/Math/MatrixQuantizer_kernel.cu

#ifndef __MATRIX_QUANTIZER_KERNEL_CUH__
#define __MATRIX_QUANTIZER_KERNEL_CUH__
#include <float.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>
#include "ValueQuantizer.h"
#include "ColumnQuantizer.h"
#include "QuantizedMatrix.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// =======================================================================
// thread layout helpers
// =======================================================================
// --- distribute array elements naively over threads
__host__ static void ParallelizeOverRangeDim(size_t size, dim3& griddim, dim3& blockdim, const size_t warpsize = 64)
{
    // <<< griddim, blockdim, sharedmemsize, stream >>>
    griddim = (unsigned int) ((size + warpsize - 1) / warpsize); // number of blocks, rounded up (-> blockIdx.x)
    blockdim = (unsigned int) warpsize;                          // 'warpsize' threads on each block (-> threadIdx.x)
}

// get the array index for the current thread
__device__ __inline__ static size_t ParallelizeOverRangeIndex()
{
    return threadIdx.x + (blockIdx.x * blockDim.x);
}
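
// Illustration (editor's sketch, not part of the original CNTK source): how the two
// helpers above pair up. The kernel and its arguments are hypothetical. Because the
// grid is rounded up to a full multiple of 'warpsize', the kernel must guard the tail.
template <class ElemType>
__global__ void _exampleScaleKernel(ElemType* buf, ElemType scale, size_t size)
{
    const size_t i = ParallelizeOverRangeIndex(); // linear index over all threads of the launch
    if (i < size)                                 // threads beyond 'size' exist due to rounding; skip them
        buf[i] *= scale;
}
// host side (sketch):
//     dim3 griddim, blockdim;
//     ParallelizeOverRangeDim(size, griddim, blockdim, 256);
//     _exampleScaleKernel<float><<<griddim, blockdim, 0, stream>>>(buf, scale, size);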
// =======================================================================
// quantization
// =======================================================================
// helper to sum-reduce a value of type T across all threads of a block
template <typename T, int BLOCKSIZE>
__device__ void allreduce(T& var)
{
    __shared__ T buf[BLOCKSIZE];
    volatile T* vBuf = buf;
    buf[threadIdx.x] = var;
    __syncthreads();
    // We assume BLOCKSIZE is a power of 2
    if (BLOCKSIZE >= 1024)
    {
        if (threadIdx.x < 512)
        {
            var = var + buf[threadIdx.x + 512];
            buf[threadIdx.x] = var;
        }
        __syncthreads();
    }
    if (BLOCKSIZE >= 512)
    {
        if (threadIdx.x < 256)
        {
            var = var + buf[threadIdx.x + 256];
            buf[threadIdx.x] = var;
        }
        __syncthreads();
    }
    if (BLOCKSIZE >= 256)
    {
        if (threadIdx.x < 128)
        {
            var = var + buf[threadIdx.x + 128];
            buf[threadIdx.x] = var;
        }
        __syncthreads();
    }
    if (BLOCKSIZE >= 128)
    {
        if (threadIdx.x < 64)
        {
            var = var + buf[threadIdx.x + 64];
            buf[threadIdx.x] = var;
        }
        __syncthreads();
    }
    // Intra-warp reduce: within a single warp we rely on lockstep execution and the
    // volatile shared-memory pointer instead of __syncthreads() (warp-synchronous programming)
    if ((BLOCKSIZE >= 64) && (threadIdx.x < 32))
    {
        var = var + vBuf[threadIdx.x + 32];
        vBuf[threadIdx.x] = var;
    }
    if ((BLOCKSIZE >= 32) && (threadIdx.x < 16))
    {
        var = var + vBuf[threadIdx.x + 16];
        vBuf[threadIdx.x] = var;
    }
    if ((BLOCKSIZE >= 16) && (threadIdx.x < 8))
    {
        var = var + vBuf[threadIdx.x + 8];
        vBuf[threadIdx.x] = var;
    }
    if ((BLOCKSIZE >= 8) && (threadIdx.x < 4))
    {
        var = var + vBuf[threadIdx.x + 4];
        vBuf[threadIdx.x] = var;
    }
    if ((BLOCKSIZE >= 4) && (threadIdx.x < 2))
    {
        var = var + vBuf[threadIdx.x + 2];
        vBuf[threadIdx.x] = var;
    }
    if ((BLOCKSIZE >= 2) && (threadIdx.x == 0))
    {
        var = var + vBuf[1];
        vBuf[0] = var;
    }
    __syncthreads();
    // broadcast the final sum back to all threads
    var = buf[0];
}
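
// Illustration (editor's sketch, not in the original source): using allreduce() inside a
// kernel that sums each matrix column; after the call every thread of the block holds the
// block-wide total. Kernel name and output buffer are hypothetical; the column-major
// layout matches what this file assumes elsewhere.
template <class ElemType, int BLOCKSIZE>
__global__ void _exampleColumnSumKernel(const ElemType* us, long M, ElemType* colSums)
{
    const long j = blockIdx.x; // one block per column, as in _ComputeQuantiStatParj below
    ElemType sum = 0;
    for (long i = threadIdx.x; i < M; i += BLOCKSIZE) // strided partial sum over the column
        sum += us[(size_t) j * M + i];
    allreduce<ElemType, BLOCKSIZE>(sum); // block-wide sum; result visible to all threads
    if (threadIdx.x == 0)
        colSums[j] = sum;
}
// launch with one block per column and exactly BLOCKSIZE threads, e.g.:
//     _exampleColumnSumKernel<float, 128><<<N, 128, 0, stream>>>(us, M, colSums);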
#define REDUCTION_BLOCK_SIZE 128 // 256 is much worse; 64 is somewhat worse
// version optimized for coalesced memory access
template <class ElemType, bool ZeroThresholdFor1Bit>
__global__ void _ComputeQuantiStatParj(const ElemType* us, const ElemType* inResidual, long M, long N, size_t ldNbits, char* qpackage)
{
    size_t subset = threadIdx.x; // threads stride the rows: first thread processes 0, 128, 256, ...; second thread 1, 129, 257, ... (stride = REDUCTION_BLOCK_SIZE)
    size_t j = blockIdx.x;       // we process one column per *block*, j = column index; note: j is never out of range
    size_t rows = M;             // we compute from 0..rows-1
    size_t bits = 1 << ldNbits;
    const size_t colSizeByte = Microsoft::MSR::CNTK::QuantizedColumn<ElemType>::QuantizedColumnSize(bits, rows);
    auto& qcol = *(Microsoft::MSR::CNTK::QuantizedColumn<ElemType>*) &qpackage[colSizeByte * j];
    Microsoft::MSR::CNTK::ColumnQuantizer<ElemType>::template ComputeRangeStatColjSubset<ZeroThresholdFor1Bit>(
        us, inResidual, M, j, bits, qcol.lower, qcol.upper,
        subset, REDUCTION_BLOCK_SIZE, allreduce<ElemType, REDUCTION_BLOCK_SIZE>, allreduce<unsigned int, REDUCTION_BLOCK_SIZE>);
}
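
// Launch geometry for the kernel above, as set up by _QuantizeMatrix below (repeated here
// for reference; see that function for the surrounding context):
//     mvgriddim  = (unsigned int) nCol;   // one block per column
//     mvblockdim = REDUCTION_BLOCK_SIZE;  // threads of a block cooperate via allreduce()
//     _ComputeQuantiStatParj<ElemType, false><<<mvgriddim, mvblockdim, 0, stream>>>(us, curResidual, M, N, ldNbits, qPackage);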
// caller: griddim and blockdim should both be 1-D
// total number of threads: totalQWords = numCols() * numQWordsPerCol
// called to quantize a GPU matrix
template <class ElemType, bool ZeroThresholdFor1Bit>
__global__ void _QuantizeStripjOneQWord(
    const ElemType* us,
    ElemType* curResidual,
    long M, long N,
    char* qMat,
    size_t qColSize,
    size_t numQWordsPerCol,
    size_t ldNbits,
    ElemType* newResidual)
{
    // map our thread index into a linear index
    const size_t linindex = ParallelizeOverRangeIndex();
    // map to (QWord index, column index)
    const size_t j = linindex / numQWordsPerCol;
    if (j >= N) // out of column range
        return;
    const size_t iQWord = linindex % numQWordsPerCol;
    // get data pointer to the quantized column
    auto& qCol = *(Microsoft::MSR::CNTK::QuantizedColumn<ElemType>*) &qMat[qColSize * j];
    // and the quantizer
    const Microsoft::MSR::CNTK::ColumnQuantizer<ElemType> q(ldNbits, qCol.lower, qCol.upper);
    // quantize one QWord to qCol.bits[iQWord]
    qCol.bits[iQWord] = q.template QuantizeOneQWord<ZeroThresholdFor1Bit>(us, curResidual, M, iQWord, M, numQWordsPerCol, j, newResidual);
}
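
// Worked example of the (iQWord, j) mapping above (editor's note, using the figures from
// the comment block in _QuantizeMatrix below): with M = 1100 rows at 1 bit and 32-bit
// words, a column packs into about 1100/32 = 35 QWords, so linear thread indices 0..34
// cover column 0, 35..69 cover column 1, and so on; iQWord = linindex % numQWordsPerCol
// selects the word within the column.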
template <class ElemType>
__global__ void UnquantizeStripejOneQWord(ElemType* us, const long M, const long N, const char* qpackage, size_t colsize, size_t numQWordsPerCol, size_t ldNbits, bool add)
{
    // this follows the same thread layout as _QuantizeStripjOneQWord()
    // map our thread index into a linear index
    const size_t linindex = ParallelizeOverRangeIndex();
    // map to (QWord index, column index)
    const size_t j = linindex / numQWordsPerCol;
    if (j >= N) // out of column range
        return;
    const size_t iQWord = linindex % numQWordsPerCol;
    // get data pointers and quantizer
    const auto& qcol = *(const Microsoft::MSR::CNTK::QuantizedColumn<ElemType>*) &qpackage[colsize * j];
    const ElemType lower = qcol.lower;
    const ElemType upper = qcol.upper;
    Microsoft::MSR::CNTK::ColumnQuantizer<ElemType> q(ldNbits, lower, upper);
    // unquantize from this one QWord
    q.UnquantizeOneQWord(us, M, iQWord, M, numQWordsPerCol, j, qcol.bits[iQWord], add);
}
// maybe this should move out into another class?
template <class ElemType>
void _QuantizeMatrix(
    const ElemType* us,
    ElemType* curResidual,
    long M, long N,
    char* qPackage,
    size_t Nbits,
    cudaStream_t stream,
    ElemType* newResidual,
    bool zeroThresholdFor1Bit)
{
    /* verify buffer allocation size
    if (msra::math::matrixquantizer::buffersize(bits, rows(), cols()) != gpubuffer.size())
        LogicError("quantizestripe: dimension of patch to be quantized does not match allocated buffer size for quantized data");
    if (rows() != curresidual.rows() || cols() != curresidual.cols()
        || rows() != newresidual.rows() || cols() != newresidual.cols())
        LogicError("quantizestripe: dimension of patch to be quantized does not match residual buffer");
    if (gpubuffer.size() == 0) // empty buffer: empty matrix, we are done (explicit test needed since launch will fail with 0 threads)
        return;*/

    // determine mean and variance -> value range (stored in quant package) -- for 1 bit, refine it in a second pass
    const size_t ldNbits = ValueQuantizer<ElemType>::ld(Nbits);
    size_t nRow = M;
    size_t nCol = N;
    dim3 mvgriddim, mvblockdim;
    // using specialized CUDA code (not shared with CPU) for coalesced memory access
    // one block per column, REDUCTION_BLOCK_SIZE threads each
    mvgriddim = (unsigned int) nCol; // column number
    mvblockdim = REDUCTION_BLOCK_SIZE;
    if (zeroThresholdFor1Bit)
    {
        _ComputeQuantiStatParj<ElemType, true><<<mvgriddim, mvblockdim, 0, stream>>>(us, curResidual, M, N, ldNbits, qPackage);
    }
    else
    {
        _ComputeQuantiStatParj<ElemType, false><<<mvgriddim, mvblockdim, 0, stream>>>(us, curResidual, M, N, ldNbits, qPackage);
    }

    // quantize data (also computing the residual at once)
    // optimizing for coalesced memory access:
    // - each 32-bit word represents an interleaved (not consecutive) set of floats -> parallel threads can do coalesced accesses
    // example:
    //  - total number of 32-bit words (1-bit quant): 1100 * 2048 / 32 = 70k
    //  - thread x dimension: index into 32-bit word (e.g. 1100/32 = 35 threads)
    //  - thread y dimension and thread position: column (e.g. 2048)
    //  - using 128 threads on one proc -> 70k/128 = 550 blocks
    //  - threads are indexed by a global index into quantized 32-bit words in increasing order; each thread must
    //    - re-linearize block index and thread index
    //    - map to (i,j) coordinate (start of the set of floats)
    const size_t numQWordsPerCol = Microsoft::MSR::CNTK::ColumnQuantizer<ElemType>::QWordsPerCol(nRow, Nbits);
    const size_t totalQWords = nCol * numQWordsPerCol;
    const size_t colsizebyte = Microsoft::MSR::CNTK::QuantizedColumn<ElemType>::QuantizedColumnSize(Nbits, nRow);
    dim3 griddim, blockdim;
    ParallelizeOverRangeDim(totalQWords, griddim, blockdim, 256);
    if (zeroThresholdFor1Bit)
    {
        _QuantizeStripjOneQWord<ElemType, true><<<griddim, blockdim, 0, stream>>>(us, curResidual, M, N, qPackage, colsizebyte, numQWordsPerCol, ldNbits, newResidual);
    }
    else
    {
        _QuantizeStripjOneQWord<ElemType, false><<<griddim, blockdim, 0, stream>>>(us, curResidual, M, N, qPackage, colsizebyte, numQWordsPerCol, ldNbits, newResidual);
    }
}
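
// Illustration (editor's sketch, not part of the original source): how a caller might
// drive _QuantizeMatrix. Buffer names are hypothetical; the quantized buffer size follows
// the same formula that _UnquantizeMatrix below validates against.
//     const size_t qSize = QuantizedColumn<float>::QuantizedColumnSize(Nbits, M) * N;
//     char* qPackage; // device buffer of qSize bytes (e.g. obtained via cudaMalloc)
//     // us, curResidual, newResidual: device pointers to M x N column-major matrices
//     _QuantizeMatrix<float>(us, curResidual, M, N, qPackage, Nbits, stream, newResidual,
//                            /*zeroThresholdFor1Bit=*/false);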
// unquantize
// Process the quantization package to recover (unquantize) the matrix patch.
template <class ElemType>
void _UnquantizeMatrix(const char* gpuBuffer, size_t gpuBufferSize,
                       ElemType* us, long M, long N,
                       size_t nBits, bool add, cudaStream_t stream)
{
    // verify buffer allocation size
    /*if (msra::math::matrixquantizer::buffersize(bits, rows(), cols()) != gpubuffer.size())
        LogicError("unquantizestripe: dimension of patch to be unquantized does not match size of quantized data");
    if (gpubuffer.size() == 0) // empty buffer: empty matrix, we are done (explicit test needed since launch will fail with 0 threads)
        return;
    */
    size_t qSize = QuantizedColumn<ElemType>::QuantizedColumnSize(nBits, M) * N;
    if (qSize != gpuBufferSize)
        LogicError("unquantizestripe: dimension of patch to be unquantized does not match size of quantized data");
    if (gpuBufferSize == 0) // empty buffer: empty matrix, we are done (explicit test needed since launch will fail with 0 threads)
        return;
    // #bits must be a power of two; we operate on shift values
    const size_t ldNbits = ValueQuantizer<ElemType>::ld(nBits);
    // unquantize in the same thread layout as quantize(), see there
    const size_t numQWordsPerCol = ColumnQuantizer<ElemType>::QWordsPerCol(M, nBits);
    const size_t totalQWords = N * numQWordsPerCol;
    const size_t colsize = QuantizedColumn<ElemType>::QuantizedColumnSize(nBits, M);
    dim3 griddim, blockdim;
    ParallelizeOverRangeDim(totalQWords, griddim, blockdim, 256);
    UnquantizeStripejOneQWord<<<griddim, blockdim, 0, stream>>>(us, M, N, gpuBuffer, colsize, numQWordsPerCol, ldNbits, add);
}
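
// Illustration (editor's sketch, not part of the original source): the matching unquantize
// call for the sketch after _QuantizeMatrix above; add = true accumulates into 'us'
// instead of overwriting it.
//     _UnquantizeMatrix<float>(qPackage, qSize, us, M, N, Nbits, /*add=*/true, stream);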
}
}
}
#endif