CNTK/Source/Math/ColumnQuantizer.h

#ifndef __COLUMN_QUANTIZER_H__
#define __COLUMN_QUANTIZER_H__
#include "ValueQuantizer.h"
#include <math.h>
#pragma warning (disable: 4127) // conditional expression is constant
namespace Microsoft { namespace MSR { namespace CNTK {
#define ColMIDX(i,j,numRow) (((j)*(numRow))+(i)) // 0 based indexing for column major
// ---------------------------------------------------------------------------
// Class to perform columnwise quantization/unquantization
//
// The quantization of a column is performed in 2 steps
// a) Compute the values used for unquantizing/reconstructing the quantized values. This is done by computing a pair of
// values that specify the range of reconstructed unquantized values, such that the aggregate quantization error is minimized.
// b) Perform the actual quantization by quantizing each value in the column to an integer of the size of
// the specified number of bits and then packing these integer bits into the quantized matrix storage
// ---------------------------------------------------------------------------
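// A minimal usage sketch of the two steps (illustrative only; 'mat', 'residual',
// 'outMat', 'qColBits', 'M', 'j', and 'logNbits' are hypothetical caller-provided
// buffers/values, with Nbits == 1 << logNbits):
//
//     // step (a): compute the reconstruction range of column j
//     float lower, upper;
//     ColumnQuantizer<float>::ComputeRangeStatColj<false>(mat, residual, M, j, 1 << logNbits, lower, upper);
//     // step (b): quantize column j into packed QWords, updating the residual in place
//     ColumnQuantizer<float> q(logNbits, lower, upper);
//     q.Quantize<false>(mat, residual, M, j, qColBits, residual);
//     // receiver side: reconstruct (optionally accumulating into outMat)
//     q.Unquantize(outMat, M, j, qColBits, /*add=*/false);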
template <class ElemType>
class ColumnQuantizer
{
    typedef typename ValueQuantizer<ElemType>::QWord QWord;
    typedef typename ValueQuantizer<ElemType>::QWordVal QWordVal;
    static const size_t QWordNumBits = ValueQuantizer<ElemType>::QWordNumBits;

public:
    cudacode ColumnQuantizer(size_t logNbits, ElemType lower, ElemType upper)
        : valQ(logNbits, lower, upper)
    {
    }
    // compute #QWords per column of a given height
    static size_t QWordsPerCol(size_t rows, size_t Nbits)
    {
        const size_t valsPerQWord = QWordNumBits / Nbits;
        return (rows + valsPerQWord - 1) / valsPerQWord;
    }

    size_t QWordsPerCol(size_t rows) const
    {
        return QWordsPerCol(rows, valQ.NBits());
    }
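    // Worked example of the ceiling division above (assuming a 32-bit QWord, to
    // match the 35-QWords-per-1100-row-column example in Quantize below):
    //   Nbits = 1:  valsPerQWord = 32, so 1100 rows need (1100 + 31) / 32 = 35 QWords;
    //   Nbits = 16: valsPerQWord = 2,  so 1100 rows need (1100 + 1)  / 2  = 550 QWords.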
    // quantize a matrix column into qcoldata
    // The current value of 'inResidual' is added to the matrix, and 'outResidual' gets updated with the new residual;
    // inResidual = outResidual is allowed (intended)
    template <bool ZeroThresholdFor1Bit>
    cudacode void Quantize(const ElemType* inMat, const ElemType* inResidual, long M, size_t j, QWord* qColBits, ElemType* outResidual) const
    {
        // We loop over QWord values.
        // E.g. there are 35 QWords for a 1100-dim column (at 1-bit quantization, assuming a 32-bit QWord).
        // For better CUDA memory coalescing, we interleave memory such that computing consecutive QWords triggers consecutive memory accesses
        // (although for the CPU side this breaks caching; we could do an in-place op there).
        // E.g., QWord 0 accesses elements 0, 35, 70, etc.,
        // while QWord 1 accesses elements 1, 36, 71, etc.,
        // up to QWord 34, which accesses elements 34, 69, 104, etc.
        const size_t numQWordsPerCol = QWordsPerCol(M);
        for (size_t iQWord = 0; iQWord < numQWordsPerCol; iQWord++)
        {
            qColBits[iQWord] = QuantizeOneQWord<ZeroThresholdFor1Bit>(inMat, inResidual, M, iQWord, M, numQWordsPerCol, j, outResidual);
        }
    }
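    // Numeric illustration of the residual (error-feedback) mechanism, with
    // made-up numbers: if quantization is coarse and a value of 0.3 reconstructs
    // to 0.5, then outResidual receives 0.3 - 0.5 = -0.2; on the next call that
    // residual is added back in, so the effective value becomes val - 0.2 and the
    // quantization error is compensated over subsequent rounds rather than lost.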
    // unquantize a matrix column from qcoldata
    // If 'add', then add to the existing content of the matrix (this is a common thing to do; saves a buffer).
    cudacode void Unquantize(ElemType* outMat, long M, size_t j, const QWord* qColBits, bool add) const
    {
        // loop over QWord values
        const size_t numQWordsPerCol = QWordsPerCol(M);
        for (size_t iQWord = 0; iQWord < numQWordsPerCol; iQWord++)
        {
            UnquantizeOneQWord(outMat, M, iQWord, M, numQWordsPerCol, j, qColBits[iQWord], add);
        }
    }
    // workaround for not being able to declare a default argument for lambda parameters
    template <bool ZeroThresholdFor1Bit>
    static cudacode void ComputeRangeStatColj(const ElemType* inMat, const ElemType* inResidual, long M, size_t j, size_t bits, ElemType& lower, ElemType& upper)
    {
        // the dummy reducers do nothing in the linear CPU version
        ComputeRangeStatColjSubset<ZeroThresholdFor1Bit>(inMat, inResidual, M, j, bits, lower, upper, 0, 1, [](ElemType&) {}, [](unsigned int&) {});
    }
public:
    // quantize the values inMat[rowStart, colIdx], inMat[rowStart + rowStride, colIdx], inMat[rowStart + 2 * rowStride, colIdx], ... and pack them into a QWord
    // Note: although this strided access pattern may look unintuitive, it is the efficient (coalesced) one on the GPU.
    template <bool ZeroThresholdFor1Bit>
    cudacode QWord QuantizeOneQWord(const ElemType* inMat, const ElemType* inResidual,
                                    long M,
                                    size_t rowStart, size_t rowEnd, size_t rowStride,
                                    size_t colIdx,
                                    ElemType* outResidual) const
    {
        QWord bitBuf = 0;
        if ((valQ.NBits() == 1) && (inResidual == outResidual) /*in-place*/)
        {
            ElemType val0 = valQ.Unquantize(0);
            ElemType val1 = valQ.Unquantize(1);
            size_t ij = ColMIDX(rowStart, colIdx, M);
            const ElemType* usibj = inMat + ij;
            const ElemType* usibjend = usibj + (rowEnd - rowStart);
            ElemType* resibj = outResidual + ij;
            // we know that the range covers at most the number of bits in a 'QWord'
            for (QWord bitmask = 1; usibj < usibjend; bitmask <<= 1, usibj += rowStride, resibj += rowStride)
            {
                // quantize --we access element (i,j) through the increasing pointers
                ElemType val = *usibj + *resibj;
                // Explicit use of the 'template' keyword is needed to compile with GCC
                bool qval = valQ.template Quantize1<ZeroThresholdFor1Bit>(val);
                if (qval)
                {
                    bitBuf |= bitmask;
                }
                // compute the residual
                ElemType uval = valQ.Unquantize1(qval, val0, val1);
                *resibj = val - uval;
            }
        }
        else
        {
            // generic path: pack the values into the QWord, NBits at a time
            size_t i = rowStart;
            for (size_t k = 0; (k < QWordNumBits) && (i < rowEnd); k += valQ.NBits(), i += rowStride)
            {
                // quantize
                size_t ij = ColMIDX(i, colIdx, M);
                ElemType val = inMat[ij] + inResidual[ij];
                QWordVal qval = valQ.template Quantize<ZeroThresholdFor1Bit>(val);
                // compute the residual
                ElemType uval = valQ.Unquantize(qval);
                ElemType r = val - uval;
                outResidual[ij] = r;
                bitBuf = bitBuf | (qval << k);
            }
        }
        return bitBuf;
    }
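    // Illustration of the packing in the generic path above: with Nbits = 2 and
    // a 32-bit QWord (an assumption for this example), 16 values fit per QWord,
    // and the n-th value of the stretch occupies bit positions [2n, 2n+1]:
    //
    //   bits:  31..30 | ... | 5..4 | 3..2 | 1..0
    //   value:  v15   | ... |  v2  |  v1  |  v0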
    // unquantize one QWord of a quantized matrix column
    cudacode void UnquantizeOneQWord(ElemType* us, long M,
                                     size_t rowStart, size_t rowEnd, size_t rowStride,
                                     size_t colIdx, QWord bitBuf, bool add) const
    {
        // special case for 1 bit
        if (valQ.NBits() == 1)
        {
            ElemType val0 = valQ.Unquantize(0);
            ElemType val1 = valQ.Unquantize(1);
            size_t ij = ColMIDX(rowStart, colIdx, M);
            ElemType* usibj = us + ij;
            const ElemType* usibjend = usibj + (rowEnd - rowStart);
            for (; usibj < usibjend; usibj += rowStride)
            {
                // get the bit for this value
                bool qval = (bitBuf & 1) != 0;
                // and shift bitBuf to the next position
                bitBuf >>= 1;
                // unquantize
                ElemType val = ValueQuantizer<ElemType>::Unquantize1(qval, val0, val1);
                if (add)
                {
                    val += *usibj;
                }
                *usibj = val;
            }
        }
        else
        {
            // (the range end MUST be a power of two; this is ensured by constructing from logNbits)
            const QWordVal bitmask = valQ.QuanRangeEnd() - 1;
            size_t i = rowStart;
            for (size_t k = 0; (k < QWordNumBits) && (i < rowEnd); k += valQ.NBits(), i += rowStride)
            {
                // extract the quantized value
                const QWordVal qval = (bitBuf >> k) & bitmask; // % 2^Nbits
                // unquantize
                ElemType val = valQ.Unquantize(qval);
                size_t ij = ColMIDX(i, colIdx, M);
                if (add)
                {
                    val += us[ij];
                }
                us[ij] = val;
            }
        }
    }
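    // Worked example of the extraction above: with Nbits = 4, the mask is
    // QuanRangeEnd() - 1 = 2^4 - 1 = 0xF, so for, say, bitBuf = 0xB3:
    //   k = 0: (0xB3 >> 0) & 0xF = 0x3  (first packed value)
    //   k = 4: (0xB3 >> 4) & 0xF = 0xB  (second packed value)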
    // determine the quantization range of one column
    // This code is written so that it can run in parallel threads on CUDA for coalesced memory access;
    // set 'subsets' to >1 and pass cross-thread reducer functions for 'ElemType' and 'unsigned int' (which would reduce through using CUDA __shared__ memory).
    // TODO: further opportunity for speed-up: use 'mean' from the last round for the 1-bit and stddev calculations
    template <bool ZeroThresholdFor1Bit, class F1, class F2>
    static cudacode void ComputeRangeStatColjSubset(const ElemType* inMat,
                                                    const ElemType* inResidual, long M,
                                                    size_t j,
                                                    size_t bits,
                                                    ElemType& lower, ElemType& upper,
                                                    size_t subset, size_t subsets,
                                                    F1 allReduceElem, F2 allReduceUint)
    {
        // quantization range, cut off after how many standard deviations (make this a parameter if we care)
        size_t rows = M;
        // compute the mean
        // Computing the mean is expensive; we assume there is no reason for asymmetry and thus a zero mean.
        // An initial experiment showed that this is significantly worse (36.0 vs. 37.7% frame accuracy) at the start, but seems to recover nearly (minor gap).
        // Thought:
        //  - we could set the threshold at 0
        //  - but keep the quantization values for 0 and 1 separate
        // I.e.
        //  - do not symmetrize/pool the quantization values for 0 and 1
        //  - but hard-code the quantization threshold to be 0 instead of the mean of the two bounds
        // This should give us the best of both--fast operation, yet the ability to be asymmetric within a column.
        ElemType mean = 0.0f;
        if (!ZeroThresholdFor1Bit || (bits != 1))
        {
            ElemType meanacc = 0.0f;
            // (subset: compute the subset sum)
            for (size_t i = subset; i < rows; i += subsets)
            {
                size_t ij = ColMIDX(i, j, M);
                meanacc += inMat[ij] + inResidual[ij];
            }
            // multi-subset (CUDA): reduce to one thread
            allReduceElem(meanacc);
            mean = meanacc / rows;
        }
        if (bits == 1)
        {
            // 1-bit case:
            // We want to minimize the (squared) reconstruction error within the two levels.
            // I.e. we should reconstruct to the respective means of each level.
            // To be able to express the range by two floats, we approximate the level threshold as the average of the two level means.
            // compute the two level means
            ElemType meanacc0 = 0.0f, meanacc1 = 0.0f;
            unsigned int num0 = 0, num1 = 0;
            // (subset: compute the subset sum)
            for (size_t i = subset; i < rows; i += subsets)
            {
                size_t ij = ColMIDX(i, j, M);
                ElemType val = inMat[ij] + inResidual[ij];
                if (val < mean)
                {
                    meanacc0 += val;
                    num0++;
                }
                else
                {
                    meanacc1 += val;
                    num1++;
                }
            }
            // multi-subset (CUDA): reduce to one thread
            allReduceElem(meanacc0);
            allReduceElem(meanacc1);
            allReduceUint(num0);
            allReduceUint(num1);
            ElemType radius;
            ElemType newmean;
            if (!ZeroThresholdFor1Bit)
            {
                // We minimize the error jointly across positive and negative numbers to make things
                // symmetrical around the mean (which may be non-zero), tying the two sides together.
                ElemType devacc0 = (num0 * mean) - meanacc0;
                ElemType devacc1 = meanacc1 - (num1 * mean);
                // both deviations tied, to ensure a consistent mean
                ElemType dev = (devacc0 + devacc1) / rows;
                radius = 2.0f * dev;
                newmean = mean;
            }
            else
            {
                // We keep two separate reconstruction values to allow for asymmetries--but we
                // instead hard-code that the threshold is 0.
                // Guard against empty levels; this happens for all-zero columns, which do exist (mean0 is 0 in that case).
                if (num0 == 0) num0 = 1;
                if (num1 == 0) num1 = 1;
                ElemType mean0 = meanacc0 / num0;
                ElemType mean1 = meanacc1 / num1;
                // approximate by using their average as the threshold between 0 and 1
                // with these values, bits (0,1), which reconstruct at (0.5,1.5) quantization-step offsets above 'lower', will map back to mean0/mean1
                newmean = 0.5f * (mean0 + mean1);
                radius = 2.0f * (mean1 - newmean);
            }
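            // Why this choice of radius reconstructs to the level means (assuming
            // ValueQuantizer places the two 1-bit reconstruction points at the quarter
            // points of [lower, upper], i.e. at 0.5 and 1.5 quantization steps above
            // 'lower', consistent with the (0.5,1.5) remark above): the points sit at
            // newmean -/+ radius/2. With radius = 2 * dev they land at mean -/+ dev,
            // the tied level means; with radius/2 = mean1 - newmean they land exactly
            // at mean0 and mean1.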
            if (subset == 0)
            {
                lower = newmean - radius;
                upper = newmean + radius;
            }
        }
        else
        {
            ElemType stddevs = 5.0f;
            // >1 bit:
            // We linearly quantize between 'stddevs' standard deviations.
            ElemType varacc = 0.0f;
            // (subset: compute the subset sum)
            for (size_t i = subset; i < rows; i += subsets)
            {
                size_t ij = ColMIDX(i, j, M);
                ElemType val = inMat[ij] + inResidual[ij];
                varacc += (val - mean) * (val - mean);
            }
            // multi-subset (CUDA): reduce to one thread
            allReduceElem(varacc);
            ElemType stddev = sqrt(varacc / rows);
            if (subset == 0)
            {
                // stddevs = how many stddevs from the mean until outside of the quantization range
                lower = mean - (stddevs * stddev);
                upper = mean + (stddevs * stddev);
            }
        }
    }
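    // Sketch of the multi-subset (CUDA) use mentioned above (illustrative; the
    // reducer bodies are hypothetical and would typically sum partials across the
    // thread block via __shared__ memory and __syncthreads()):
    //
    //     ComputeRangeStatColjSubset<false>(inMat, inResidual, M, j, bits,
    //                                       lower, upper,
    //                                       threadIdx.x, blockDim.x,
    //                                       [](float& v) { /* block-wide sum of v */ },
    //                                       [](unsigned int& v) { /* block-wide sum of v */ });
    //
    // Each thread accumulates a strided partial sum, the reducers combine them, and
    // only subset 0 writes 'lower'/'upper' (note the 'if (subset == 0)' guards above).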
private:
    ValueQuantizer<ElemType> valQ;

    template <typename T>
    friend class QuantizedMatrix;
};
}}}
#endif