Merge branch 'master' of https://git01.codeplex.com/cntk into amitaga/mergeHTKMLFReaders

Commit 8e85f07de3
Makefile

@@ -169,6 +169,7 @@ MATH_SRC =\
     Math/Math/MatrixQuantizerCPU.cpp \
     Math/Math/QuantizedMatrix.cpp \
     Math/Math/Matrix.cpp \
+    Math/Math/CUDAPageLockedMemAllocator.cpp \
 
 ifdef CUDA_PATH
 MATH_SRC +=\
@@ -176,7 +177,6 @@ MATH_SRC +=\
     Math/Math/GPUMatrixCUDAKernels.cu \
     Math/Math/GPUSparseMatrix.cu \
     Math/Math/GPUWatcher.cu \
-    Math/Math/CUDAPageLockedMemAllocator.cpp \
     Math/Math/MatrixQuantizerGPU.cu \
 
 else
@@ -152,10 +152,6 @@
       <ExcludedFromBuild>true</ExcludedFromBuild>
       <FileType>CppCode</FileType>
     </CudaCompile>
-    <CudaCompile Include="ValueQuantizer.cu">
-      <ExcludedFromBuild>true</ExcludedFromBuild>
-      <FileType>CppCode</FileType>
-    </CudaCompile>
   </ItemGroup>
   <ItemGroup>
     <CudaCompile Include="GPUMatrix.cu">
@@ -16,9 +16,6 @@
     <CudaCompile Include="MatrixQuantizerGPU.cu">
      <Filter>GPU\1bitSGD</Filter>
    </CudaCompile>
-    <CudaCompile Include="ValueQuantizer.cu">
-      <Filter>GPU\1bitSGD</Filter>
-    </CudaCompile>
    <CudaCompile Include="MatrixQuantizer_kernel.cu">
      <Filter>GPU\1bitSGD</Filter>
    </CudaCompile>
@@ -1,9 +1,13 @@
 #include "stdafx.h"
 #include "CUDAPageLockedMemAllocator.h"
+#ifndef CPUONLY
 #include <cuda_runtime_api.h>
+#endif // !CPUONLY
+#include "BestGpu.h"
 
 namespace Microsoft { namespace MSR { namespace CNTK {
 
+#ifndef CPUONLY
     CUDAPageLockedMemAllocator::CUDAPageLockedMemAllocator(int deviceID)
         : m_deviceID(deviceID)
     {
@@ -25,4 +29,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         cudaSetDevice(m_deviceID);
         cudaFreeHost(p) || "Free in CUDAPageLockedMemAllocator failed";
     }
+
+    int CUDAPageLockedMemAllocator::GetDeviceID() const
+    {
+        return m_deviceID;
+    }
+#else
+    // Dummy definitions when compiling for CPUONLY
+    CUDAPageLockedMemAllocator::CUDAPageLockedMemAllocator(int)
+    {
+    }
+
+    int CUDAPageLockedMemAllocator::GetDeviceID() const
+    {
+        return -1;
+    }
+
+    char* CUDAPageLockedMemAllocator::Malloc(size_t)
+    {
+        return NULL;
+    }
+
+    void CUDAPageLockedMemAllocator::Free(char*)
+    {
+    }
+#endif
 }}}
@@ -19,11 +19,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     public:
         CUDAPageLockedMemAllocator(int deviceID);
 
-        int GetDeviceID() const
-        {
-            return m_deviceID;
-        }
-
+        int GetDeviceID() const;
         char* Malloc(size_t size) override;
         void Free(char* p) override;
 
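The two hunks above appear to belong to CUDAPageLockedMemAllocator.cpp and its header: GetDeviceID becomes an out-of-line definition, and a set of dummy definitions under #else lets the same translation unit build when CPUONLY is defined. A minimal sketch of that split follows; the class name PageLockedAllocator and the cudaHostAlloc-based Malloc body are illustrative assumptions, since the diff does not show the real allocation path.

// Sketch only: same #ifndef CPUONLY split as in the diff, with an assumed
// cudaHostAlloc-based allocation path (the diff does not show Malloc's body).
#ifndef CPUONLY
#include <cuda_runtime_api.h>
#endif
#include <cstddef>

class PageLockedAllocator   // hypothetical stand-in for CUDAPageLockedMemAllocator
{
public:
    explicit PageLockedAllocator(int deviceID) : m_deviceID(deviceID) {}
    int GetDeviceID() const { return m_deviceID; }

    char* Malloc(std::size_t size)
    {
#ifndef CPUONLY
        void* p = nullptr;
        cudaSetDevice(m_deviceID);
        cudaHostAlloc(&p, size, cudaHostAllocDefault);   // page-locked host memory
        return static_cast<char*>(p);
#else
        (void)size;                                      // dummy when built CPU-only
        return nullptr;
#endif
    }

    void Free(char* p)
    {
#ifndef CPUONLY
        cudaSetDevice(m_deviceID);
        cudaFreeHost(p);                                 // matches the cudaFreeHost call in the diff
#else
        (void)p;
#endif
    }

private:
    int m_deviceID;
};

Keeping the CPU-only stubs in the same translation unit avoids sprinkling #ifdef CPUONLY through every caller.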
@@ -4749,4 +4749,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     template size_t Matrix<char>::GetNumElements() const;
     template Matrix<char> Matrix<char>::ColumnSlice(size_t startColumn, size_t numCols) const;
     template void Matrix<char>::_transferToDevice(int id_to, bool ismoved, bool emptyTransfer) const;
+    template size_t Matrix<char>::GetNumRows() const;
+    template size_t Matrix<char>::GetNumCols() const;
 }}}
@@ -4,8 +4,10 @@
 #include "ColumnQuantizer.h"
 #include "QuantizedMatrix.h"
 #include "GPUMatrix.h"
+#ifndef CPUONLY
 #include <cuda_runtime_api.h>
 #include <cuda.h>
+#endif // !CPUONLY
 #include <vector>
 #include <memory>
 
@@ -34,6 +36,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // Helper function to get a temporary intermediate matrix on the GPU to store quantization results
         QuantizedMatrix<ElemType>& GetTempGPUQuantizedMatrix(size_t nBits, bool& newlyAllocated);
 
+#ifndef CPUONLY
         // Record a event to flag the completion of quantization/unquantization kernel on the compute stream
         void RecordQuantizeCompleteEvent(cudaStream_t computestream) const;
 
@@ -68,7 +71,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         mutable cudaEvent_t m_quantizeCompleteEvent;
         mutable cudaEvent_t m_fetchCompleteEvent;
         mutable cudaEvent_t m_assignCompleteEvent;
+#endif // !CPUONLY
 
+    private:
         bool m_forceSync;
         bool m_quantizeOpIncludedFetch;
 
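These hunks appear to come from the MatrixQuantizerGPU header: the CUDA includes and the cudaEvent_t members are wrapped in #ifndef CPUONLY so the header can still be parsed in builds without the CUDA SDK. A small sketch of the same guarding pattern, using a hypothetical QuantizerState class that is not part of the diff:

// Sketch only: guarding CUDA-only members so a header also compiles without CUDA.
// QuantizerState and its members are illustrative, not taken from the diff.
#ifndef CPUONLY
#include <cuda_runtime_api.h>   // provides cudaEvent_t
#endif

class QuantizerState
{
#ifndef CPUONLY
    mutable cudaEvent_t m_quantizeCompleteEvent = nullptr;  // only exists in GPU builds
#endif
    bool m_forceSync = false;                               // shared by both build flavors
};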
@@ -7,7 +7,6 @@
 #include <device_launch_parameters.h>
 
 #include "ValueQuantizer.h"
-#include "ValueQuantizer.cu"
 #include "ColumnQuantizer.h"
 #include "QuantizedMatrix.h"
 
@@ -11,6 +11,7 @@
 
 #include "GPUMatrix.h"
 #include "GPUSparseMatrix.h"
+#include "MatrixQuantizerGPU.h"
 
 #pragma warning (disable: 4100) // unreferenced formal parameter, which is OK since all functions in here are dummies; disabling this allows to copy-paste prototypes here when we add new functions
 #pragma warning (disable: 4702) // unreachable code, which we get from the NOT_IMPLEMENTED macro which is OK
@@ -355,6 +356,7 @@ namespace Microsoft {
 
 #pragma endregion Helper Functions
 
+    template class GPUSparseMatrix<char>;
     template class GPUSparseMatrix<float>;
     template class GPUSparseMatrix<double>;
 
@@ -477,6 +479,7 @@ namespace Microsoft {
 
     template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignColumnSlice(const GPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols) { return *this; }
 
+    template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::SetColumnSlice(const GPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols) { return *this; }
 
     //for each column of a, we assign numRows starting from startIndex to this
     template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRowSliceValuesOf(const GPUMatrix<ElemType>& /*a*/, const size_t startIndex, const size_t numRows) { return *this; }
@@ -1082,10 +1085,50 @@ namespace Microsoft {
     }
 #pragma endregion Static BLAS Functions
 
+#pragma region MatrixQuantizerGPU functions
+    template<class ElemType>
+    MatrixQuantizerGPU<ElemType>::MatrixQuantizerGPU(const Matrix<ElemType>& inMatrix, bool forceSync)
+        : MatrixQuantizer<ElemType>(inMatrix)
+    {
+    }
+
+    template<class ElemType>
+    MatrixQuantizerGPU<ElemType>::~MatrixQuantizerGPU()
+    {
+    }
+
+    template<class ElemType>
+    void MatrixQuantizerGPU<ElemType>::QuantizeAsync(QuantizedMatrix<ElemType>& outQMatrix, bool zeroThresholdFor1Bit)
+    {
+    }
+
+    template<class ElemType>
+    void MatrixQuantizerGPU<ElemType>::WaitQuantizeAsyncDone()
+    {
+    }
+
+    template<class ElemType>
+    void MatrixQuantizerGPU<ElemType>::UnquantizeAsync(QuantizedMatrix<ElemType>& inQMatrix, Matrix<ElemType>& outMatrix, bool add /*= false*/)
+    {
+    }
+
+    template<class ElemType>
+    void MatrixQuantizerGPU<ElemType>::WaitUnquantizeAsyncDone()
+    {
+    }
+#pragma endregion MatrixQuantizerGPU functions
+
+    template class GPUMatrix<char>;
     template class GPUMatrix<float>;
     template class GPUMatrix<double>;
     template class DeviceBoundNumber<float>;
     template class DeviceBoundNumber<double>;
+    template MatrixQuantizerGPU<float>::MatrixQuantizerGPU(const Matrix<float>&, bool forceSync);
+    template MatrixQuantizerGPU<double>::MatrixQuantizerGPU(const Matrix<double>&, bool forceSync);
+    template MatrixQuantizerGPU<float>::~MatrixQuantizerGPU();
+    template MatrixQuantizerGPU<double>::~MatrixQuantizerGPU();
+    template void MatrixQuantizerGPU<float>::QuantizeAsync(QuantizedMatrix<float>&, bool);
+    template void MatrixQuantizerGPU<double>::QuantizeAsync(QuantizedMatrix<double>&, bool);
 
     template<class ElemType> cublasHandle_t GPUMatrix<ElemType>::s_cuHandle[GPUMatrix<ElemType>::MaxGpus] = { 0 };
 
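The hunks above extend what looks like the CPU-only stub file for the GPU classes: empty MatrixQuantizerGPU method bodies, a SetColumnSlice stub, and extra explicit instantiations (GPUMatrix<char>, GPUSparseMatrix<char>) so that CPUONLY builds still link. A minimal sketch of the stub-plus-explicit-instantiation idea, using a hypothetical DummyQuantizer rather than the real class:

// Sketch only: no-op stubs compiled in CPU-only builds, then explicitly
// instantiated so callers linking against this translation unit find the symbols.
template <class ElemType>
class DummyQuantizer                 // hypothetical stand-in for MatrixQuantizerGPU
{
public:
    void QuantizeAsync() {}          // intentionally empty: never reached on the CPU-only path
    void WaitQuantizeAsyncDone() {}
};

// Explicit instantiation for the element types the library exposes.
template class DummyQuantizer<float>;
template class DummyQuantizer<double>;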
@@ -1,153 +0,0 @@
-#ifndef __VALLUE_QUANTIZER_CUH__
-#define __VALLUE_QUANTIZER_CUH__
-
-#include "stdafx.h"
-#include "ValueQuantizer.h"
-
-namespace Microsoft { namespace MSR { namespace CNTK {
-
-    template<class ElemType>
-    cudasharedcode
-    ValueQuantizer<ElemType>::ValueQuantizer(size_t ldNbits, ElemType lower, ElemType upper)
-        : ldNbits(ldNbits), Nbits(1 << ldNbits), quantimin(lower), quantimax(upper)
-    {
-        rangeend = ((QWordVal)1) << Nbits;
-
-        // post-fix for incorrect shift for no-quant hack (Nbits=32): << arg is taken mod 32!
-        // in this case, it's only used as (rangeend-1) which is now correct (before it was 0!)
-        if (Nbits >= (8 * sizeof(rangeend)))
-        {
-            rangeend = 0;
-        }
-
-        // must protect against NaN: interval is 0 -> quantization is futile, just emit 0
-        if (((quantimax - quantimin) < 1e-36f) || (rangeend == 0))
-        {
-            qfactor = ufactor = (ElemType)0.0;
-        }
-        else
-        {
-            // precompute this for quantize() (see comment there)
-            qfactor = rangeend / (quantimax - quantimin);
-            // and for unquantize()
-            ufactor = (quantimax - quantimin) / rangeend;
-        }
-
-        // set the quantization threshold for the special case of 1-bit
-        quantimid = 0.5f * (quantimax + quantimin);
-    }
-
-    // quantize for full ElemType size bits case (special case that allows to bypass quantization, for testing/debugging purposes)
-    template<class ElemType>
-    cudasharedcode ValueQuantizer<ElemType>::QWordVal
-    ValueQuantizer<ElemType>::QuantizeToFullQWord(ElemType u) const
-    {
-        assert(Nbits == QWordNumBits);
-
-        // we return the bit pattern that encodes the float value
-        return *(QWordVal*)&u;
-    }
-
-    // quantize one value --special version for 1 bit
-    template<class ElemType>
-    template<bool ZeroThresholdFor1Bit>
-    cudasharedcode bool
-    ValueQuantizer<ElemType>::Quantize1(ElemType u) const
-    {
-        assert (Nbits == 1);
-        if (!ZeroThresholdFor1Bit)
-        {
-            return u >= quantimid;
-        }
-        else
-        {
-            return u >= (ElemType)0.0;
-        }
-    }
-
-    // quantize one value
-    // TODO: we can optimize for 1 bit here - very simply use a template arg 'isonebit'
-    template<class ElemType>
-    template<bool ZeroThresholdFor1Bit>
-    cudasharedcode ValueQuantizer<ElemType>::QWordVal
-    ValueQuantizer<ElemType>::Quantize(ElemType u) const
-    {
-        if (Nbits == QWordNumBits)
-        {
-            return QuantizeToFullQWord(u);
-        }
-        // TODO: we may need to optimize this by a template arg
-        else if (ldNbits == 0)
-        {
-            return Quantize1<ZeroThresholdFor1Bit>(u) ? 1 : 0;
-        }
-        else
-        {
-            if (u <= quantimin)
-            {
-                return 0;
-            }
-            else if (u >= quantimax)
-            {
-                return (rangeend - 1);
-            }
-            else
-            {
-                return (QWordVal)((QWordValSigned)((u - quantimin) * qfactor));
-            }
-        }
-    }
-
-    // unquantize one value
-    template<class ElemType>
-    cudasharedcode
-    ElemType ValueQuantizer<ElemType>::Unquantize(QWordVal u) const
-    {
-        if (Nbits == QWordNumBits)
-        {
-            return *(ElemType*)&u;
-        }
-
-        // Note: in 1-bit case, we want 0.5 -> mean0, 1.5 -> mean1
-        return ((u + (ElemType)0.5) * ufactor) + quantimin;
-    }
-
-    // unquantize one value --special case for 1 bit
-    template<class ElemType>
-    cudasharedcode
-    ElemType ValueQuantizer<ElemType>::Unquantize1(bool u, ElemType val0, ElemType val1)
-    {
-        return u ? val1 : val0;
-    }
-
-    // helper: compute the binary log of a power of two (utility function to convert 'Nbits' into 'ldNbits'
-    template<class ElemType>
-    size_t ValueQuantizer<ElemType>::ld(size_t v)
-    {
-        if (v == 1)
-        {
-            return 0;
-        }
-        else if (v & 1) // not a power of two
-        {
-            throw std::runtime_error("ld: 'bits' must be a power of two");
-        }
-        else
-        {
-            return 1 + ld (v >> 1);
-        }
-    }
-
-    // Explicit instantiation
-    template class ValueQuantizer<float>;
-    template class ValueQuantizer<double>;
-    template ValueQuantizer<float>::QWordVal ValueQuantizer<float>::Quantize<true>(float u) const;
-    template ValueQuantizer<float>::QWordVal ValueQuantizer<float>::Quantize<false>(float u) const;
-    template ValueQuantizer<double>::QWordVal ValueQuantizer<double>::Quantize<true>(double u) const;
-    template ValueQuantizer<double>::QWordVal ValueQuantizer<double>::Quantize<false>(double u) const;
-    template bool ValueQuantizer<float>::Quantize1<true>(float u) const;
-    template bool ValueQuantizer<float>::Quantize1<false>(float u) const;
-    template bool ValueQuantizer<double>::Quantize1<true>(double u) const;
-    template bool ValueQuantizer<double>::Quantize1<false>(double u) const;
-}}}
-
-#endif
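The whole ValueQuantizer.cu translation unit is deleted here; its definitions reappear inline in ValueQuantizer.h in the next hunks, which is also why the .cu file was dropped from the project files earlier in this diff. The cudasharedcode qualifier on those functions is presumably a macro that marks them callable from both host and device; the definition below is an assumption for illustration, not taken from this diff.

// Assumed definition of the cudasharedcode qualifier (not shown in this diff):
// expands to __device__ __host__ under nvcc, and to nothing in a plain C++ build.
#ifdef __CUDACC__
#define cudasharedcode __device__ __host__
#else
#define cudasharedcode
#endif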
@@ -2,10 +2,17 @@
 #ifndef __VALLUE_QUANTIZER_H__
 #define __VALLUE_QUANTIZER_H__
 
+#ifndef CPUONLY
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_runtime_api.h>
 #include <device_launch_parameters.h>
+#endif // !CPUONLY
 
+#include <cassert>
+#include <stdexcept>
+
+#pragma warning (disable: 4127) // conditional expression is constant
+
 namespace Microsoft { namespace MSR { namespace CNTK {
 
@@ -55,17 +62,98 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         static const size_t QWordNumBits = 8 * sizeof(QWord);
 
     public:
-        cudasharedcode ValueQuantizer(size_t ldNbits, ElemType lower, ElemType upper);
+        cudasharedcode ValueQuantizer(size_t ldNbits, ElemType lower, ElemType upper)
+            : ldNbits(ldNbits), Nbits(1 << ldNbits), quantimin(lower), quantimax(upper)
+        {
+            rangeend = ((QWordVal)1) << Nbits;
+
+            // post-fix for incorrect shift for no-quant hack (Nbits=32): << arg is taken mod 32!
+            // in this case, it's only used as (rangeend-1) which is now correct (before it was 0!)
+            if (Nbits >= (8 * sizeof(rangeend)))
+            {
+                rangeend = 0;
+            }
+
+            // must protect against NaN: interval is 0 -> quantization is futile, just emit 0
+            if (((quantimax - quantimin) < 1e-36f) || (rangeend == 0))
+            {
+                qfactor = ufactor = (ElemType)0.0;
+            }
+            else
+            {
+                // precompute this for quantize() (see comment there)
+                qfactor = rangeend / (quantimax - quantimin);
+                // and for unquantize()
+                ufactor = (quantimax - quantimin) / rangeend;
+            }
+
+            // set the quantization threshold for the special case of 1-bit
+            quantimid = 0.5f * (quantimax + quantimin);
+        }
 
+        // quantize one value
+        // TODO: we can optimize for 1 bit here - very simply use a template arg 'isonebit'
         template<bool ZeroThresholdFor1Bit>
-        cudasharedcode QWordVal Quantize(ElemType u) const;
+        cudasharedcode QWordVal Quantize(ElemType u) const
+        {
+            if (Nbits == QWordNumBits)
+            {
+                return QuantizeToFullQWord(u);
+            }
+            // TODO: we may need to optimize this by a template arg
+            else if (ldNbits == 0)
+            {
+                return Quantize1<ZeroThresholdFor1Bit>(u) ? 1 : 0;
+            }
+            else
+            {
+                if (u <= quantimin)
+                {
+                    return 0;
+                }
+                else if (u >= quantimax)
+                {
+                    return (rangeend - 1);
+                }
+                else
+                {
+                    return (QWordVal)((QWordValSigned)((u - quantimin) * qfactor));
+                }
+            }
+        }
 
-        cudasharedcode ElemType Unquantize(QWordVal u) const;
+        // unquantize one value
+        cudasharedcode ElemType Unquantize(QWordVal u) const
+        {
+            if (Nbits == QWordNumBits)
+            {
+                return *(ElemType*)&u;
+            }
+
+            // Note: in 1-bit case, we want 0.5 -> mean0, 1.5 -> mean1
+            return ((u + (ElemType)0.5) * ufactor) + quantimin;
+        }
 
+        // quantize one value --special version for 1 bit
         template<bool ZeroThresholdFor1Bit>
-        cudasharedcode bool Quantize1(ElemType u) const;
+        cudasharedcode bool Quantize1(ElemType u) const
+        {
+            assert(Nbits == 1);
+            if (!ZeroThresholdFor1Bit)
+            {
+                return u >= quantimid;
+            }
+            else
+            {
+                return u >= (ElemType)0.0;
+            }
+        }
 
-        static cudasharedcode ElemType Unquantize1(bool u, ElemType val0, ElemType val1);
+        // unquantize one value --special case for 1 bit
+        static cudasharedcode ElemType Unquantize1(bool u, ElemType val0, ElemType val1)
+        {
+            return u ? val1 : val0;
+        }
 
         //how many bits we are quanatizing to
         cudasharedcode size_t NBits() const
@@ -79,10 +167,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             return rangeend;
         }
 
-        static size_t ld(size_t v);
+        // helper: compute the binary log of a power of two (utility function to convert 'Nbits' into 'ldNbits'
+        static size_t ld(size_t v)
+        {
+            if (v == 1)
+            {
+                return 0;
+            }
+            else if (v & 1) // not a power of two
+            {
+                throw std::runtime_error("ld: 'bits' must be a power of two");
+            }
+            else
+            {
+                return 1 + ld(v >> 1);
+            }
+        }
 
     protected:
-        cudasharedcode QWordVal QuantizeToFullQWord(ElemType u) const;
+        // quantize for full ElemType size bits case (special case that allows to bypass quantization, for testing/debugging purposes)
+        cudasharedcode QWordVal QuantizeToFullQWord(ElemType u) const
+        {
+            assert(Nbits == QWordNumBits);
+
+            // we return the bit pattern that encodes the float value
+            return *(QWordVal*)&u;
+        }
 
     protected:
         // NBits must be power of two
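With the bodies now inline in ValueQuantizer.h, both the CUDA kernels and the CPU-only code share the same quantization arithmetic: qfactor maps a value into one of rangeend levels and ufactor maps a level back to the center of its bin. A small standalone round-trip with assumed 2-bit settings (plain floats, not the ValueQuantizer class):

// Round-trip illustration of the qfactor/ufactor math from the header above.
// The range [-1, 1] and Nbits = 2 are assumed example values.
#include <cassert>
#include <cstdint>
#include <cstdio>

int main()
{
    const float    lower = -1.0f, upper = 1.0f;
    const unsigned Nbits = 2;
    const uint32_t rangeend = 1u << Nbits;                  // 4 quantization levels
    const float    qfactor  = rangeend / (upper - lower);   // value -> level
    const float    ufactor  = (upper - lower) / rangeend;   // level -> value

    const float u = 0.3f;
    const uint32_t q = (u <= lower) ? 0
                     : (u >= upper) ? rangeend - 1
                     : (uint32_t)((int32_t)((u - lower) * qfactor));
    const float back = ((q + 0.5f) * ufactor) + lower;      // bin center, as in Unquantize

    assert(q == 2);                                         // 0.3 falls in the third of four bins
    std::printf("quantized level = %u, unquantized = %.2f\n", q, back);
    return 0;
}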
@@ -1,27 +0,0 @@
-dataDir: ../Data
-
-testCases:
-  CNTK Run must be completed:
-    patterns:
-      - ^COMPLETED
-
-  Must train epochs in exactly same order and parameters:
-    patterns:
-      - ^Starting Epoch {{integer}}
-      - learning rate per sample = {{float}}
-      - momentum = {{float}}
-
-  Epochs must be finished with expected results:
-    patterns:
-      - ^Finished Epoch[{{integer}}]
-      - TrainLossPerSample = {{float,tolerance=1%}}
-      - EvalErrPerSample = {{float,tolerance=1%}}
-      - Ave LearnRatePerSample = {{float,tolerance=1%}}
-
-  Per-minibatch training results must match:
-    patterns:
-      - ^ Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}} of {{integer}}]
-      - SamplesSeen = {{integer}}
-      - TrainLossPerSample = {{float,tolerance=1%}}
-      - EvalErr[0]PerSample = {{float,tolerance=1%}}