#ifndef __COLUMN_QUANTIZER_H__
#define __COLUMN_QUANTIZER_H__

#include "ValueQuantizer.h"
#include <math.h>

#pragma warning (disable: 4127) // conditional expression is constant

namespace Microsoft { namespace MSR { namespace CNTK {

#define ColMIDX(i,j,numRow) (((j)*(numRow))+(i)) // 0 based indexing for column major
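// e.g. in a matrix with numRow = 100, element (i = 2, j = 3) lives at flat offset 3 * 100 + 2 = 302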

// ---------------------------------------------------------------------------
// Class to perform columnwise quantization/unquantization
//
// The quantization of a column is performed in two steps:
// a) Compute the values used for unquantizing/reconstructing the quantized values. This is done by computing a pair of
//    values that specify the range of reconstructed unquantized values, such that the aggregate quantization error is minimized.
// b) Perform the actual quantization by quantizing each value in the column to an integer of the specified
//    number of bits, and then packing these integers into the quantized matrix storage.
// ---------------------------------------------------------------------------

template<class ElemType>
class ColumnQuantizer
{
    typedef typename ValueQuantizer<ElemType>::QWord QWord;
    typedef typename ValueQuantizer<ElemType>::QWordVal QWordVal;
    static const size_t QWordNumBits = ValueQuantizer<ElemType>::QWordNumBits;

public:
    cudacode ColumnQuantizer(size_t logNbits, ElemType lower, ElemType upper)
        : valQ(logNbits, lower, upper)
    {
    }

    // compute #QWords per column of a given height
    static size_t QWordsPerCol(size_t rows, size_t Nbits)
    {
        const size_t valsPerQWord = QWordNumBits / Nbits;
        return (rows + valsPerQWord - 1) / valsPerQWord;
    }
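
    // For example, assuming 32-bit QWords (matching the "35 ints for a 1100-dim column"
    // example below): at 1 bit per value, 32 values pack into one QWord, so 1100 rows need
    // (1100 + 31) / 32 = 35 QWords; at 2 bits, 16 values fit, giving 69 QWords.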

    size_t QWordsPerCol(size_t rows) const
    {
        return QWordsPerCol(rows, valQ.NBits());
    }

    // quantize a matrix column into qColBits
    // The current value of 'inResidual' is added to the matrix, and 'outResidual' gets updated with the new residual;
    // inResidual = outResidual is allowed (intended)
    template<bool ZeroThresholdFor1Bit>
    cudacode void Quantize(const ElemType* inMat, const ElemType* inResidual, long M, size_t j, QWord* qColBits, ElemType* outResidual) const
    {
        // We loop over QWord values.
        // E.g. there are 35 ints for a 1100-dim column (at 1-bit quantization).
        // For better CUDA memory coalescing, we interleave memory such that computing consecutive ints triggers consecutive memory accesses
        // (although this breaks caching on the CPU side; there we could use an in-place op instead).
        // E.g., int 0 accesses elements 0, 35, 70, etc.,
        // while int 1 accesses elements 1, 36, 71, etc.,
        // up to int 34, which accesses elements 34, 69, 104, etc.
        const size_t numQWordsPerCol = QWordsPerCol(M);
        for (size_t iQWord = 0; iQWord < numQWordsPerCol; iQWord++)
        {
            qColBits[iQWord] = QuantizeOneQWord<ZeroThresholdFor1Bit>(inMat, inResidual, M, iQWord, M, numQWordsPerCol, j, outResidual);
        }
    }

    // unquantize a matrix column from qColBits
    // If 'add' is set, add to the existing content of the matrix (a common operation; saves a buffer).
    cudacode void Unquantize(ElemType* outMat, long M, size_t j, const QWord* qColBits, bool add) const
    {
        // loop over QWord values
        const size_t numQWordsPerCol = QWordsPerCol(M);
        for (size_t iQWord = 0; iQWord < numQWordsPerCol; iQWord++)
        {
            UnquantizeOneQWord(outMat, M, iQWord, M, numQWordsPerCol, j, qColBits[iQWord], add);
        }
    }
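
    // A minimal end-to-end usage sketch (illustrative only; 'inMat', 'outMat', and 'residual'
    // are hypothetical caller-owned buffers, with 'residual' zero-initialized on first use):
    //
    //   float lower, upper;
    //   ColumnQuantizer<float>::ComputeRangeStatColj<false>(inMat, residual, M, j, 2 /*bits*/, lower, upper);
    //   ColumnQuantizer<float> q(1 /*logNbits; 2^1 = 2 bits*/, lower, upper);
    //   std::vector<ValueQuantizer<float>::QWord> qcol(q.QWordsPerCol(M));
    //   q.Quantize<false>(inMat, residual, M, j, qcol.data(), residual); // updates residual in place
    //   q.Unquantize(outMat, M, j, qcol.data(), false /*add*/);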

    // workaround for not being able to declare a default argument for the lambda parameters
    template<bool ZeroThresholdFor1Bit>
    static cudacode void ComputeRangeStatColj(const ElemType* inMat, const ElemType* inResidual, long M, size_t j, size_t bits, ElemType& lower, ElemType& upper)
    {
        // (the dummy reducers do nothing in the linear CPU version)
        ComputeRangeStatColjSubset<ZeroThresholdFor1Bit>(inMat, inResidual, M, j, bits, lower, upper, 0, 1, [](ElemType&){}, [](unsigned int&){});
    }

public:
    // Quantize the values inMat[rowStart, colIdx], inMat[rowStart + rowStride, colIdx], inMat[rowStart + 2 * rowStride, colIdx], ...
    // and pack them into a single QWord.
    // Note: this strided access pattern may look counter-intuitive, but it is the efficient one on the GPU,
    // where consecutive iQWord values are computed by consecutive threads (see the interleaving comment in Quantize()).
    template<bool ZeroThresholdFor1Bit>
    cudacode QWord QuantizeOneQWord(
        const ElemType* inMat, const ElemType* inResidual,
        long M,
        size_t rowStart, size_t rowEnd, size_t rowStride,
        size_t colIdx,
        ElemType* outResidual) const
    {
        QWord bitBuf = 0;

        if ((valQ.NBits() == 1) && (inResidual == outResidual) /*in-place*/)
        {
            ElemType val0 = valQ.Unquantize(0);
            ElemType val1 = valQ.Unquantize(1);
            size_t ij = ColMIDX(rowStart, colIdx, M);
            const ElemType* usibj = inMat + ij;
            const ElemType* usibjend = usibj + (rowEnd - rowStart);
            ElemType* resibj = outResidual + ij;
            // we know that the range covers at most the number of bits in a 'QWord'
            for (QWord bitmask = 1; usibj < usibjend; bitmask <<= 1, usibj += rowStride, resibj += rowStride)
            {
                // quantize --we access element (i,j) through the two increasing pointers
                ElemType val = *usibj + *resibj;

                // explicit use of the 'template' keyword is needed to compile with GCC
                bool qval = valQ.template Quantize1<ZeroThresholdFor1Bit>(val);
                if (qval)
                {
                    bitBuf |= bitmask;
                }

                // compute the new residual
                ElemType uval = valQ.Unquantize1(qval, val0, val1);
                *resibj = val - uval;
            }
        }
        else
        {
            // general case: pack as many quantized values into the QWord as fit
            size_t i = rowStart;
            for (size_t k = 0; (k < QWordNumBits) && (i < rowEnd); k += valQ.NBits(), i += rowStride)
            {
                // quantize
                size_t ij = ColMIDX(i, colIdx, M);
                ElemType val = inMat[ij] + inResidual[ij];
                // as above, the explicit 'template' keyword is needed for GCC
                QWordVal qval = valQ.template Quantize<ZeroThresholdFor1Bit>(val);

                // compute the residual
                ElemType uval = valQ.Unquantize(qval);
                ElemType r = val - uval;
                outResidual[ij] = r;
                bitBuf = bitBuf | (qval << k);
            }
        }
        return bitBuf;
    }
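
    // Packing layout, for illustration (assuming 2-bit values and a 32-bit QWord): the value for
    // the first covered row occupies bits 0..1, the next one bits 2..3, and so on, up to bits 30..31.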

    // unquantize one QWord of a quantized matrix column
    cudacode void UnquantizeOneQWord(
        ElemType* us, long M,
        size_t rowStart, size_t rowEnd, size_t rowStride,
        size_t colIdx, QWord bitBuf, bool add) const
    {
        // special case for 1 bit
        if (valQ.NBits() == 1)
        {
            ElemType val0 = valQ.Unquantize(0);
            ElemType val1 = valQ.Unquantize(1);
            size_t ij = ColMIDX(rowStart, colIdx, M);
            ElemType* usibj = us + ij;
            const ElemType* usibjend = usibj + (rowEnd - rowStart);
            for (; usibj < usibjend; usibj += rowStride)
            {
                // get the current bit ('bitBuf' is shifted in place)
                bool qval = (bitBuf & 1) != 0;

                // and shift bitBuf to the next position
                bitBuf >>= 1;

                // unquantize
                ElemType val = ValueQuantizer<ElemType>::Unquantize1(qval, val0, val1);
                if (add)
                {
                    val += *usibj;
                }
                *usibj = val;
            }
        }
        else
        {
            // (QuanRangeEnd() MUST be a power of two; this is ensured by constructing from logNbits)
            const QWordVal bitmask = valQ.QuanRangeEnd() - 1;
            size_t i = rowStart;
            for (size_t k = 0; (k < QWordNumBits) && (i < rowEnd); k += valQ.NBits(), i += rowStride)
            {
                // extract the quantized value
                const QWordVal qval = (bitBuf >> k) & bitmask; // i.e. % 2^Nbits

                // unquantize
                ElemType val = valQ.Unquantize(qval);
                size_t ij = ColMIDX(i, colIdx, M);
                if (add)
                {
                    val += us[ij];
                }
                us[ij] = val;
            }
        }
    }

    // determine the quantization range of one column
    // This code is written so that it can run in parallel threads on CUDA with coalesced memory access;
    // for that, set 'subsets' to >1 and pass cross-thread reducer functions for 'ElemType' and 'unsigned int'
    // (which would reduce through CUDA __shared__ memory).
    // TODO: further opportunity for speed-up: reuse 'mean' from the last round for the 1-bit and stddev calculations
    template<bool ZeroThresholdFor1Bit, class F1, class F2>
    static cudacode void ComputeRangeStatColjSubset(
        const ElemType* inMat,
        const ElemType* inResidual, long M,
        size_t j,
        size_t bits,
        ElemType& lower, ElemType& upper,
        size_t subset, size_t subsets,
        F1 allReduceElem, F2 allReduceUint)
    {
        size_t rows = M;

        // Compute the mean.
        // Computing the mean is expensive; we assume there is no reason for asymmetry and thus a zero mean.
        // An initial experiment showed that this is significantly worse (36.0 vs. 37.7% frame acc) at the start, but it seems to recover nearly (minor gap).
        // Thought:
        //  - we could set the threshold at 0
        //  - but keep the quantization values for 0 and 1 separate
        // i.e.
        //  - do not symmetrize/pool the quantization values for 0 and 1
        //  - but hard-code the quantization threshold to be 0 instead of the mean of the two bounds
        // This should give us the best of both--fast operation yet the ability to be asymmetric within a column.
        // (This is what the ZeroThresholdFor1Bit path below implements.)
        ElemType mean = 0.0f;
        if (!ZeroThresholdFor1Bit || (bits != 1))
        {
            ElemType meanacc = 0.0f;
            // (subset: compute the subset sum)
            for (size_t i = subset; i < rows; i += subsets)
            {
                size_t ij = ColMIDX(i, j, M);
                meanacc += inMat[ij] + inResidual[ij];
            }
            // multi-subset (CUDA): reduce to one thread
            allReduceElem(meanacc);
            mean = meanacc / rows;
        }

        if (bits == 1)
        {
            // 1-bit case:
            // We want to minimize the (squared) reconstruction error within the two levels.
            // I.e. we should reconstruct to the respective mean of each level.
            // To be able to express the range by two floats, we approximate the level threshold as the average of the two level means.
            // compute the two level means
            ElemType meanacc0 = 0.0f, meanacc1 = 0.0f;
            unsigned int num0 = 0, num1 = 0;
            // (subset: compute the subset sums)
            for (size_t i = subset; i < rows; i += subsets)
            {
                size_t ij = ColMIDX(i, j, M);
                ElemType val = inMat[ij] + inResidual[ij];
                if (val < mean)
                {
                    meanacc0 += val;
                    num0++;
                }
                else
                {
                    meanacc1 += val;
                    num1++;
                }
            }

            // multi-subset (CUDA): reduce to one thread
            allReduceElem(meanacc0);
            allReduceElem(meanacc1);
            allReduceUint(num0);
            allReduceUint(num1);

            ElemType radius;
            ElemType newmean;
            if (!ZeroThresholdFor1Bit)
            {
                // we minimize the error jointly across positive and negative numbers to make things
                // symmetrical around the mean (which may be non-zero), tying the two sides together
                ElemType devacc0 = (num0 * mean) - meanacc0;
                ElemType devacc1 = meanacc1 - (num1 * mean);

                // both deviations are tied, to ensure a consistent mean
                ElemType dev = (devacc0 + devacc1) / rows;
                radius = 2.0f * dev;
                newmean = mean;
            }
            else
            {
                // We keep two separate reconstruction values to allow for asymmetries--but we
                // instead hard-code that the threshold is 0.

                // guard against all-zero columns, which do exist (mean0 is 0 in that case)
                if (num0 == 0) num0 = 1;
                if (num1 == 0) num1 = 1;
                ElemType mean0 = meanacc0 / num0;
                ElemType mean1 = meanacc1 / num1;

                // Approximate by using the average of the two level means as the threshold between 0 and 1.
                // With these values, the quantized bits (0,1)--which map to the range positions (0.5,1.5)--reconstruct to mean0 and mean1, respectively.
                newmean = 0.5f * (mean0 + mean1);
                radius = 2.0f * (mean1 - newmean);
            }
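
            // Check of the arithmetic above: the 1-bit quantizer reconstructs bit 0 at newmean - radius/2
            // and bit 1 at newmean + radius/2; with newmean = (mean0 + mean1)/2 and radius = 2*(mean1 - newmean),
            // these are exactly mean0 and mean1.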

            if (subset == 0)
            {
                lower = newmean - radius;
                upper = newmean + radius;
            }
        }
        else
        {
            // >1 bit:
            // We linearly quantize between 'stddevs' standard deviations from the mean.
            ElemType stddevs = 5.0f; // quantization-range cut-off (make this a parameter if we care)
            ElemType varacc = 0.0f;
            // (subset: compute the subset sum)
            for (size_t i = subset; i < rows; i += subsets)
            {
                size_t ij = ColMIDX(i, j, M);
                ElemType val = inMat[ij] + inResidual[ij];
                varacc += (val - mean) * (val - mean);
            }
            // multi-subset (CUDA): reduce to one thread
            allReduceElem(varacc);
            ElemType stddev = sqrt(varacc / rows);
            if (subset == 0)
            {
                // stddevs = how many standard deviations from the mean until we are outside of the quantization range
                lower = mean - (stddevs * stddev);
                upper = mean + (stddevs * stddev);
            }
        }
    }
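
    // On CUDA, T cooperating threads per column would each call, e.g. (hypothetical reducer names),
    //   ComputeRangeStatColjSubset<ZeroThresholdFor1Bit>(inMat, inResidual, M, j, bits, lower, upper,
    //                                                    threadIdx.x, T, reduceElem, reduceUint);
    // where reduceElem/reduceUint sum the per-thread partial accumulators via __shared__ memory,
    // while the CPU path (ComputeRangeStatColj above) passes subsets = 1 and no-op lambdas.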

private:
    ValueQuantizer<ElemType> valQ;

    template<typename T>
    friend class QuantizedMatrix;
};

}}}

#endif // __COLUMN_QUANTIZER_H__