Merge branch 'master' of https://git01.codeplex.com/cntk into amitaga/mergeHTKMLFReaders

This commit is contained in:
Amit Agarwal 2015-09-01 13:27:19 -07:00
Родитель 1df4e8e5e1 7915fef15d
Коммит 8e85f07de3
13 изменённых файлов: 5823 добавлений и 5826 удалений

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -169,6 +169,7 @@ MATH_SRC =\
Math/Math/MatrixQuantizerCPU.cpp \
Math/Math/QuantizedMatrix.cpp \
Math/Math/Matrix.cpp \
Math/Math/CUDAPageLockedMemAllocator.cpp \
ifdef CUDA_PATH
MATH_SRC +=\
@ -176,7 +177,6 @@ MATH_SRC +=\
Math/Math/GPUMatrixCUDAKernels.cu \
Math/Math/GPUSparseMatrix.cu \
Math/Math/GPUWatcher.cu \
Math/Math/CUDAPageLockedMemAllocator.cpp \
Math/Math/MatrixQuantizerGPU.cu \
else

Просмотреть файл

@ -152,10 +152,6 @@
<ExcludedFromBuild>true</ExcludedFromBuild>
<FileType>CppCode</FileType>
</CudaCompile>
<CudaCompile Include="ValueQuantizer.cu">
<ExcludedFromBuild>true</ExcludedFromBuild>
<FileType>CppCode</FileType>
</CudaCompile>
</ItemGroup>
<ItemGroup>
<CudaCompile Include="GPUMatrix.cu">

Просмотреть файл

@ -16,9 +16,6 @@
<CudaCompile Include="MatrixQuantizerGPU.cu">
<Filter>GPU\1bitSGD</Filter>
</CudaCompile>
<CudaCompile Include="ValueQuantizer.cu">
<Filter>GPU\1bitSGD</Filter>
</CudaCompile>
<CudaCompile Include="MatrixQuantizer_kernel.cu">
<Filter>GPU\1bitSGD</Filter>
</CudaCompile>

Просмотреть файл

@ -1,9 +1,13 @@
#include "stdafx.h"
#include "CUDAPageLockedMemAllocator.h"
#ifndef CPUONLY
#include <cuda_runtime_api.h>
#endif // !CPUONLY
#include "BestGpu.h"
namespace Microsoft { namespace MSR { namespace CNTK {
#ifndef CPUONLY
CUDAPageLockedMemAllocator::CUDAPageLockedMemAllocator(int deviceID)
: m_deviceID(deviceID)
{
@ -25,4 +29,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
cudaSetDevice(m_deviceID);
cudaFreeHost(p) || "Free in CUDAPageLockedMemAllocator failed";
}
// Returns the device ID stored in m_deviceID (initialized from the constructor's deviceID argument).
int CUDAPageLockedMemAllocator::GetDeviceID() const
{
return m_deviceID;
}
#else
// Dummy definitions when compiling for CPUONLY
// CPUONLY stub constructor: the device ID argument is unnamed and ignored.
CUDAPageLockedMemAllocator::CUDAPageLockedMemAllocator(int)
{
}
// CPUONLY stub: always returns -1.
int CUDAPageLockedMemAllocator::GetDeviceID() const
{
return -1;
}
// CPUONLY stub: performs no allocation; the requested size is ignored and NULL is always returned.
char* CUDAPageLockedMemAllocator::Malloc(size_t)
{
return NULL;
}
// CPUONLY stub: the pointer argument is ignored and nothing is released.
void CUDAPageLockedMemAllocator::Free(char*)
{
}
#endif
}}}

Просмотреть файл

@ -19,11 +19,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
public:
CUDAPageLockedMemAllocator(int deviceID);
// Returns the device ID held in m_deviceID.
int GetDeviceID() const
{
return m_deviceID;
}
int GetDeviceID() const;
char* Malloc(size_t size) override;
void Free(char* p) override;

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -4,8 +4,10 @@
#include "ColumnQuantizer.h"
#include "QuantizedMatrix.h"
#include "GPUMatrix.h"
#ifndef CPUONLY
#include <cuda_runtime_api.h>
#include <cuda.h>
#endif // !CPUONLY
#include <vector>
#include <memory>
@ -34,6 +36,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Helper function to get a temporary intermediate matrix on the GPU to store quantization results
QuantizedMatrix<ElemType>& GetTempGPUQuantizedMatrix(size_t nBits, bool& newlyAllocated);
#ifndef CPUONLY
// Record an event to flag the completion of the quantization/unquantization kernel on the compute stream
void RecordQuantizeCompleteEvent(cudaStream_t computestream) const;
@ -68,7 +71,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
mutable cudaEvent_t m_quantizeCompleteEvent;
mutable cudaEvent_t m_fetchCompleteEvent;
mutable cudaEvent_t m_assignCompleteEvent;
#endif // !CPUONLY
private:
bool m_forceSync;
bool m_quantizeOpIncludedFetch;

Просмотреть файл

@ -7,7 +7,6 @@
#include <device_launch_parameters.h>
#include "ValueQuantizer.h"
#include "ValueQuantizer.cu"
#include "ColumnQuantizer.h"
#include "QuantizedMatrix.h"

Просмотреть файл

@ -11,6 +11,7 @@
#include "GPUMatrix.h"
#include "GPUSparseMatrix.h"
#include "MatrixQuantizerGPU.h"
#pragma warning (disable: 4100) // unreferenced formal parameter, which is OK since all functions in here are dummies; disabling this allows to copy-paste prototypes here when we add new functions
#pragma warning (disable: 4702) // unreachable code, which we get from the NOT_IMPLEMENTED macro which is OK
@ -355,6 +356,7 @@ namespace Microsoft {
#pragma endregion Helper Functions
template class GPUSparseMatrix<char>;
template class GPUSparseMatrix<float>;
template class GPUSparseMatrix<double>;
@ -477,6 +479,7 @@ namespace Microsoft {
// Dummy implementation: ignores fromMatrix, startColumn and numCols and returns *this unmodified.
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignColumnSlice(const GPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols) { return *this; }
// Dummy implementation: ignores fromMatrix, startColumn and numCols and returns *this unmodified.
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::SetColumnSlice(const GPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols) { return *this; }
//for each column of a, we assign numRows starting from startIndex to this
// Dummy implementation: performs no assignment; the arguments are ignored and *this is returned unmodified.
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRowSliceValuesOf(const GPUMatrix<ElemType>& /*a*/, const size_t startIndex, const size_t numRows) { return *this; }
@ -1082,10 +1085,50 @@ namespace Microsoft {
}
#pragma endregion Static BLAS Functions
#pragma region MatrixQuantizerGPU functions
// Dummy constructor: forwards inMatrix to the MatrixQuantizer<ElemType> base class; forceSync is ignored.
template<class ElemType>
MatrixQuantizerGPU<ElemType>::MatrixQuantizerGPU(const Matrix<ElemType>& inMatrix, bool forceSync)
: MatrixQuantizer<ElemType>(inMatrix)
{
}
// Dummy destructor: the body is empty, so nothing is released here.
template<class ElemType>
MatrixQuantizerGPU<ElemType>::~MatrixQuantizerGPU()
{
}
// Dummy implementation: does nothing; outQMatrix is left untouched and zeroThresholdFor1Bit is ignored.
template<class ElemType>
void MatrixQuantizerGPU<ElemType>::QuantizeAsync(QuantizedMatrix<ElemType>& outQMatrix, bool zeroThresholdFor1Bit)
{
}
// Dummy implementation: returns immediately without waiting on anything.
template<class ElemType>
void MatrixQuantizerGPU<ElemType>::WaitQuantizeAsyncDone()
{
}
// Dummy implementation: does nothing; inQMatrix and outMatrix are left untouched and 'add' is ignored.
template<class ElemType>
void MatrixQuantizerGPU<ElemType>::UnquantizeAsync(QuantizedMatrix<ElemType>& inQMatrix, Matrix<ElemType>& outMatrix, bool add /*= false*/)
{
}
// Dummy implementation: returns immediately without waiting on anything.
template<class ElemType>
void MatrixQuantizerGPU<ElemType>::WaitUnquantizeAsyncDone()
{
}
#pragma endregion MatrixQuantizerGPU functions
// Explicit instantiations of the dummy GPUMatrix / DeviceBoundNumber classes and of selected
// MatrixQuantizerGPU members (constructor, destructor, QuantizeAsync) for float and double.
// NOTE(review): WaitQuantizeAsyncDone, UnquantizeAsync and WaitUnquantizeAsyncDone are defined in this
// file but are not explicitly instantiated here -- confirm they are instantiated implicitly
// (e.g. because they are virtual) or add matching instantiations.
template class GPUMatrix<char>;
template class GPUMatrix<float>;
template class GPUMatrix<double>;
template class DeviceBoundNumber<float>;
template class DeviceBoundNumber<double>;
template MatrixQuantizerGPU<float>::MatrixQuantizerGPU(const Matrix<float>&, bool forceSync);
template MatrixQuantizerGPU<double>::MatrixQuantizerGPU(const Matrix<double>&, bool forceSync);
template MatrixQuantizerGPU<float>::~MatrixQuantizerGPU();
template MatrixQuantizerGPU<double>::~MatrixQuantizerGPU();
template void MatrixQuantizerGPU<float>::QuantizeAsync(QuantizedMatrix<float>&, bool);
template void MatrixQuantizerGPU<double>::QuantizeAsync(QuantizedMatrix<double>&, bool);
// Out-of-class definition of the static s_cuHandle array (MaxGpus entries); the "{ 0 }" initializer zero-initializes every entry.
template<class ElemType> cublasHandle_t GPUMatrix<ElemType>::s_cuHandle[GPUMatrix<ElemType>::MaxGpus] = { 0 };

Просмотреть файл

@ -1,153 +0,0 @@
#ifndef __VALLUE_QUANTIZER_CUH__
#define __VALLUE_QUANTIZER_CUH__
#include "stdafx.h"
#include "ValueQuantizer.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// Constructs a quantizer for the value range [lower, upper].
//   ldNbits - binary logarithm of the number of bits per quantized value (Nbits = 1 << ldNbits)
//   lower   - lower end of the quantization range, stored as quantimin
//   upper   - upper end of the quantization range, stored as quantimax
// Precomputes rangeend (number of quantization levels, or 0 when Nbits covers the whole word),
// the scale factors qfactor / ufactor, and the 1-bit threshold quantimid.
template<class ElemType>
cudasharedcode
ValueQuantizer<ElemType>::ValueQuantizer(size_t ldNbits, ElemType lower, ElemType upper)
    : ldNbits(ldNbits), Nbits(1 << ldNbits), quantimin(lower), quantimax(upper)
{
    // Shifting by the full width of the type (or more) is undefined behavior in C++, so the
    // width is tested BEFORE shifting. For the full-width "no quantization" case rangeend is 0;
    // per the original note it is then only consumed as (rangeend - 1).
    if (Nbits >= (8 * sizeof(rangeend)))
    {
        rangeend = 0;
    }
    else
    {
        rangeend = ((QWordVal)1) << Nbits;
    }

    // Degenerate (near-empty) interval or disabled quantization: dividing by the interval would
    // yield inf/NaN factors, so quantization is futile and both factors are zeroed.
    if (((quantimax - quantimin) < 1e-36f) || (rangeend == 0))
    {
        qfactor = ufactor = (ElemType)0.0;
    }
    else
    {
        // scale used by Quantize(): value offset -> level index
        qfactor = rangeend / (quantimax - quantimin);
        // scale used by Unquantize(): level index -> value offset
        ufactor = (quantimax - quantimin) / rangeend;
    }

    // decision threshold for the special 1-bit case: the midpoint of the range
    quantimid = 0.5f * (quantimax + quantimin);
}
// quantize for full ElemType size bits case (special case that allows to bypass quantization, for testing/debugging purposes)
// Returns the raw bit pattern of u reinterpreted as a QWordVal; only valid when Nbits == QWordNumBits.
// 'typename' is required on the dependent return type by standard-conforming (pre-C++20) compilers.
template<class ElemType>
cudasharedcode typename ValueQuantizer<ElemType>::QWordVal
ValueQuantizer<ElemType>::QuantizeToFullQWord(ElemType u) const
{
    assert(Nbits == QWordNumBits);
    // we return the bit pattern that encodes the float value
    return *(QWordVal*)&u;
}
// quantize one value --special version for 1 bit
// Returns true when u lies at or above the 1-bit decision threshold.
// The threshold is zero when ZeroThresholdFor1Bit is set, otherwise quantimid.
template<class ElemType>
template<bool ZeroThresholdFor1Bit>
cudasharedcode bool
ValueQuantizer<ElemType>::Quantize1(ElemType u) const
{
    assert (Nbits == 1);
    return ZeroThresholdFor1Bit ? (u >= (ElemType)0.0) : (u >= quantimid);
}
// quantize one value
// TODO: we can optimize for 1 bit here - very simply use a template arg 'isonebit'
// Maps u to its quantization level index.
//   - full-width mode (Nbits == QWordNumBits): the bit pattern of u is passed through unchanged
//   - 1-bit mode (ldNbits == 0): 1 or 0 according to Quantize1<ZeroThresholdFor1Bit>
//   - otherwise: values at or below quantimin give 0, values at or above quantimax give
//     rangeend - 1, and values in between are scaled by qfactor and truncated
// 'typename' is required on the dependent return type by standard-conforming (pre-C++20) compilers.
template<class ElemType>
template<bool ZeroThresholdFor1Bit>
cudasharedcode typename ValueQuantizer<ElemType>::QWordVal
ValueQuantizer<ElemType>::Quantize(ElemType u) const
{
    if (Nbits == QWordNumBits)
    {
        return QuantizeToFullQWord(u);
    }

    // TODO: we may need to optimize this by a template arg
    if (ldNbits == 0)
    {
        return Quantize1<ZeroThresholdFor1Bit>(u) ? 1 : 0;
    }

    // clamp to the representable range
    if (u <= quantimin)
    {
        return 0;
    }
    if (u >= quantimax)
    {
        return (rangeend - 1);
    }

    // strictly inside the range: scale the offset from quantimin and truncate
    return (QWordVal)((QWordValSigned)((u - quantimin) * qfactor));
}
// unquantize one value
// Maps a quantized word back to an ElemType value.
// In full-width mode u already holds the bit pattern of the value and is reinterpreted as-is;
// otherwise the level index is shifted by 0.5 (so that in the 1-bit case 0.5 -> mean0 and
// 1.5 -> mean1), scaled by ufactor and offset by quantimin.
template<class ElemType>
cudasharedcode
ElemType ValueQuantizer<ElemType>::Unquantize(QWordVal u) const
{
    return (Nbits == QWordNumBits)
        ? *(ElemType*)&u
        : ((u + (ElemType)0.5) * ufactor) + quantimin;
}
// unquantize one value --special case for 1 bit
// Selects the reconstruction value for a single quantized bit: val1 when u is set, val0 otherwise.
template<class ElemType>
cudasharedcode
ElemType ValueQuantizer<ElemType>::Unquantize1(bool u, ElemType val0, ElemType val1)
{
    if (u)
    {
        return val1;
    }
    return val0;
}
// helper: compute the binary log of a power of two (utility function to convert 'Nbits' into 'ldNbits')
// Returns log2(v) for v that is a power of two (1 -> 0, 2 -> 1, 4 -> 2, ...).
// Throws std::runtime_error when v is zero or not a power of two.
template<class ElemType>
size_t ValueQuantizer<ElemType>::ld(size_t v)
{
    // (v & (v - 1)) clears the lowest set bit, so it is non-zero exactly when more than one bit
    // is set. Zero must be rejected explicitly: halving it never reaches 1, so a plain
    // halve-until-one scheme would not terminate for it.
    if ((v == 0) || ((v & (v - 1)) != 0))
    {
        throw std::runtime_error("ld: 'bits' must be a power of two");
    }

    size_t log2v = 0;
    while (v > 1)
    {
        v >>= 1;
        ++log2v;
    }
    return log2v;
}
// Explicit instantiation
// Instantiate the class for float and double, then both ZeroThresholdFor1Bit variants of the member
// templates Quantize and Quantize1 (member templates are not covered by a class-level explicit instantiation).
template class ValueQuantizer<float>;
template class ValueQuantizer<double>;
template ValueQuantizer<float>::QWordVal ValueQuantizer<float>::Quantize<true>(float u) const;
template ValueQuantizer<float>::QWordVal ValueQuantizer<float>::Quantize<false>(float u) const;
template ValueQuantizer<double>::QWordVal ValueQuantizer<double>::Quantize<true>(double u) const;
template ValueQuantizer<double>::QWordVal ValueQuantizer<double>::Quantize<false>(double u) const;
template bool ValueQuantizer<float>::Quantize1<true>(float u) const;
template bool ValueQuantizer<float>::Quantize1<false>(float u) const;
template bool ValueQuantizer<double>::Quantize1<true>(double u) const;
template bool ValueQuantizer<double>::Quantize1<false>(double u) const;
}}}
#endif

Просмотреть файл

@ -2,10 +2,17 @@
#ifndef __VALLUE_QUANTIZER_H__
#define __VALLUE_QUANTIZER_H__
#ifndef CPUONLY
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>
#endif // !CPUONLY
#include <cassert>
#include <stdexcept>
#pragma warning (disable: 4127) // conditional expression is constant
namespace Microsoft { namespace MSR { namespace CNTK {
@ -55,17 +62,98 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static const size_t QWordNumBits = 8 * sizeof(QWord);
public:
cudasharedcode ValueQuantizer(size_t ldNbits, ElemType lower, ElemType upper);
// Constructs a quantizer for the value range [lower, upper].
//   ldNbits - binary logarithm of the number of bits per quantized value (Nbits = 1 << ldNbits)
//   lower   - lower end of the quantization range, stored as quantimin
//   upper   - upper end of the quantization range, stored as quantimax
// Precomputes rangeend (number of quantization levels, or 0 when Nbits covers the whole word),
// the scale factors qfactor / ufactor, and the 1-bit threshold quantimid.
cudasharedcode ValueQuantizer(size_t ldNbits, ElemType lower, ElemType upper)
    : ldNbits(ldNbits), Nbits(1 << ldNbits), quantimin(lower), quantimax(upper)
{
    // Shifting by the full width of the type (or more) is undefined behavior in C++, so the
    // width is tested BEFORE shifting. For the full-width "no quantization" case rangeend is 0;
    // per the original note it is then only consumed as (rangeend - 1).
    if (Nbits >= (8 * sizeof(rangeend)))
    {
        rangeend = 0;
    }
    else
    {
        rangeend = ((QWordVal)1) << Nbits;
    }

    // Degenerate (near-empty) interval or disabled quantization: dividing by the interval would
    // yield inf/NaN factors, so quantization is futile and both factors are zeroed.
    if (((quantimax - quantimin) < 1e-36f) || (rangeend == 0))
    {
        qfactor = ufactor = (ElemType)0.0;
    }
    else
    {
        // scale used by Quantize(): value offset -> level index
        qfactor = rangeend / (quantimax - quantimin);
        // scale used by Unquantize(): level index -> value offset
        ufactor = (quantimax - quantimin) / rangeend;
    }

    // decision threshold for the special 1-bit case: the midpoint of the range
    quantimid = 0.5f * (quantimax + quantimin);
}
// quantize one value
// TODO: we can optimize for 1 bit here - very simply use a template arg 'isonebit'
template<bool ZeroThresholdFor1Bit>
cudasharedcode QWordVal Quantize(ElemType u) const;
cudasharedcode QWordVal Quantize(ElemType u) const
{
    // Full-width mode: pass the bit pattern of u through unchanged.
    if (Nbits == QWordNumBits)
    {
        return QuantizeToFullQWord(u);
    }

    // 1-bit mode: delegate to the dedicated threshold test.
    // TODO: we may need to optimize this by a template arg
    if (ldNbits == 0)
    {
        return Quantize1<ZeroThresholdFor1Bit>(u) ? 1 : 0;
    }

    // General case: clamp to the lowest / highest level at the range boundaries ...
    if (u <= quantimin)
    {
        return 0;
    }
    if (u >= quantimax)
    {
        return (rangeend - 1);
    }

    // ... and otherwise scale the offset from quantimin by qfactor and truncate.
    return (QWordVal)((QWordValSigned)((u - quantimin) * qfactor));
}
cudasharedcode ElemType Unquantize(QWordVal u) const;
// unquantize one value
cudasharedcode ElemType Unquantize(QWordVal u) const
{
    // Full-width mode: u already holds the bit pattern of the value, so reinterpret it as-is.
    // Otherwise shift the level index by 0.5 (in the 1-bit case 0.5 -> mean0, 1.5 -> mean1),
    // scale by ufactor and offset by quantimin.
    return (Nbits == QWordNumBits)
        ? *(ElemType*)&u
        : ((u + (ElemType)0.5) * ufactor) + quantimin;
}
// quantize one value --special version for 1 bit
template<bool ZeroThresholdFor1Bit>
cudasharedcode bool Quantize1(ElemType u) const;
cudasharedcode bool Quantize1(ElemType u) const
{
    assert(Nbits == 1);
    // The compile-time flag picks the decision threshold: zero when set, quantimid otherwise.
    return ZeroThresholdFor1Bit ? (u >= (ElemType)0.0) : (u >= quantimid);
}
static cudasharedcode ElemType Unquantize1(bool u, ElemType val0, ElemType val1);
// unquantize one value --special case for 1 bit
static cudasharedcode ElemType Unquantize1(bool u, ElemType val0, ElemType val1)
{
    // A set bit selects val1, a cleared bit selects val0.
    if (u)
    {
        return val1;
    }
    return val0;
}
// how many bits we are quantizing to
cudasharedcode size_t NBits() const
@ -79,10 +167,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return rangeend;
}
static size_t ld(size_t v);
protected:
cudasharedcode QWordVal QuantizeToFullQWord(ElemType u) const;
// helper: compute the binary log of a power of two (utility function to convert 'Nbits' into 'ldNbits')
// Returns log2(v) for v that is a power of two (1 -> 0, 2 -> 1, 4 -> 2, ...).
// Throws std::runtime_error when v is zero or not a power of two.
static size_t ld(size_t v)
{
    // (v & (v - 1)) clears the lowest set bit, so it is non-zero exactly when more than one bit
    // is set. Zero must be rejected explicitly: halving it never reaches 1, so a plain
    // halve-until-one scheme would not terminate for it.
    if ((v == 0) || ((v & (v - 1)) != 0))
    {
        throw std::runtime_error("ld: 'bits' must be a power of two");
    }

    size_t log2v = 0;
    while (v > 1)
    {
        v >>= 1;
        ++log2v;
    }
    return log2v;
}
protected:
// quantize for full ElemType size bits case (special case that allows to bypass quantization, for testing/debugging purposes)
cudasharedcode QWordVal QuantizeToFullQWord(ElemType u) const
{
    assert(Nbits == QWordNumBits);
    // View the storage of u as a quantized word and hand back that bit pattern unchanged.
    QWordVal* bits = (QWordVal*)&u;
    return *bits;
}
protected:
// NBits must be power of two
@ -105,4 +215,4 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ElemType ufactor;
};
}}}
#endif
#endif

Просмотреть файл

@ -1,27 +0,0 @@
dataDir: ../Data
testCases:
CNTK Run must be completed:
patterns:
- ^COMPLETED
Must train epochs in exactly same order and parameters:
patterns:
- ^Starting Epoch {{integer}}
- learning rate per sample = {{float}}
- momentum = {{float}}
Epochs must be finished with expected results:
patterns:
- ^Finished Epoch[{{integer}}]
- TrainLossPerSample = {{float,tolerance=1%}}
- EvalErrPerSample = {{float,tolerance=1%}}
- Ave LearnRatePerSample = {{float,tolerance=1%}}
Per-minibatch training results must match:
patterns:
- ^ Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}} of {{integer}}]
- SamplesSeen = {{integer}}
- TrainLossPerSample = {{float,tolerance=1%}}
- EvalErr[0]PerSample = {{float,tolerance=1%}}