Merge remote-tracking branch 'origin/master' into linux-gcc

Conflicts:
	Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Adv_Chapter.lyx
	MachineLearning/CNTK/ComputationNetwork.h
	MachineLearning/CNTK/LinearAlgebraNodes.h
	MachineLearning/CNTK/NetworkDescriptionLanguage.cpp
	Makefile_kaldi2.cpu
	Makefile_kaldi2.gpu
This commit is contained in:
Yu Zhang 2015-07-05 22:28:21 -07:00
Родитель 2582c8f5a8 b21e61904f
Коммит f7f902107f
20 изменённых файлов: 14364 добавлений и 19063 удалений

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -2290,6 +2290,50 @@ m1, m2 - input matrices.
The m1.cols must equal m2.rows.
\end_layout
\begin_layout Subsubsection
TransposeTimes
\begin_inset Index idx
status open
\begin_layout Plain Layout
TransposeTimes
\end_layout
\end_inset
\end_layout
\begin_layout Standard
Calculate the product
\begin_inset Formula $m1^{T}m2$
\end_inset
.
The resulting matrix has a size of m1.cols by m2.cols.
The syntax is
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
TransposeTimes(m1, m2)
\end_layout
\end_inset
\end_layout
\begin_layout Itemize
m1, m2 - input matrices.
The m1.rows must equal m2.rows.
\end_layout
\begin_layout Subsubsection
DiagTimes
\begin_inset Index idx

Просмотреть файл

@ -235,7 +235,7 @@ public:
std::vector<ComputationNodePtr> DelayNodes;
for (auto n : allnodes)
{
if (n->OperationName() == L"Delay")
if (n->OperationName() == DelayNode<ElemType>::TypeName())
{
DelayNodes.push_back(n);
}
@ -245,7 +245,7 @@ public:
std::vector<ComputationNodePtr> learnableParameters;
for (auto n : allnodes)
{
if (n->OperationName() == L"LearnableParameter")
if (n->OperationName() == LearnableParameter<ElemType>::TypeName())
{
learnableParameters.push_back(n);
}
@ -348,7 +348,7 @@ public:
std::wstring srcname = src->GetName();
std::wstring desname = des->GetName();
if (des->OperationName() == L"Delay")
if (des->OperationName() == DelayNode<ElemType>::TypeName())
{
// special treament for arc with Delay node as the children
// create a dummy node
@ -1181,6 +1181,10 @@ public:
{
newNode = new TimesNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName);
}
else if (nodeType == TransposeTimesNode<ElemType>::TypeName())
{
newNode = new TransposeTimesNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName);
}
else if (nodeType == ElementTimesNode<ElemType>::TypeName())
{
newNode = new ElementTimesNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName);
@ -1481,6 +1485,10 @@ public:
{
newNode = new TimesNode<ElemType>(m_deviceId, nodeName);
}
else if (nodeType == TransposeTimesNode<ElemType>::TypeName())
{
newNode = new TransposeTimesNode<ElemType>(m_deviceId, nodeName);
}
else if (nodeType == ElementTimesNode<ElemType>::TypeName())
{
newNode = new ElementTimesNode<ElemType>(m_deviceId, nodeName);
@ -1953,6 +1961,16 @@ public:
return newNode;
}
// Builds a TransposeTimes node over operands a and b, registers it with this
// network and hands it back to the caller. An empty nodeName is forwarded
// unchanged to the node constructor.
ComputationNodePtr TransposeTimes(const ComputationNodePtr a,
                                  const ComputationNodePtr b,
                                  const std::wstring nodeName = L"")
{
    // Create the node on the network's device.
    ComputationNodePtr transposeTimesNode = new TransposeTimesNode<ElemType>(m_deviceId, nodeName);

    // Wire the operands first, then make the node part of the network.
    transposeTimesNode->AttachInputs(a, b);
    AddNodeToNet(transposeTimesNode);

    return transposeTimesNode;
}
ComputationNodePtr ElementTimes(const ComputationNodePtr a,
const ComputationNodePtr b,
const std::wstring nodeName = L"")
@ -2217,7 +2235,7 @@ public:
{
for (auto ptr = recurrentNodes.begin(); ptr != recurrentNodes.end(); ptr++)
{
if ((*ptr)->IsFuncValueOlderThanInputs() && (*ptr)->OperationName() != L"Delay") {
if ((*ptr)->IsFuncValueOlderThanInputs() && (*ptr)->OperationName() != DelayNode<ElemType>::TypeName()) {
return true;
}
}
@ -3359,7 +3377,7 @@ protected:
visited.insert(cur);
recStack.insert(cur);
if (cur->OperationName() != L"Delay")
if (cur->OperationName() != DelayNode<ElemType>::TypeName())
{
for (size_t i = 0; i < cur->ChildrenSize(); i++)
{
@ -3442,7 +3460,7 @@ protected:
ComputationNodePtr nodeRecIter = (*iter).m_recurrentNodes[j];
for (size_t i = 0; i < nodeRecIter->ChildrenSize(); i++)
{
if ((nodeRecIter->Inputs(i)->LoopId() == nodeRecIter->LoopId()) && (nodeRecIter->OperationName() != L"Delay"))
if ((nodeRecIter->Inputs(i)->LoopId() == nodeRecIter->LoopId()) && (nodeRecIter->OperationName() != DelayNode<ElemType>::TypeName()))
{
nodeRecIter->Inputs(i)->SetIndexInLoop(nodeRecIter->Inputs(i)->GetIndexInLoop() + 1);
}

Просмотреть файл

@ -51,7 +51,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
};
#pragma region base computation class
template<class ElemType>
class ComputationNode //Abstract Class that cannot be instantiated
{

Просмотреть файл

@ -887,6 +887,193 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template class TimesNode<float>;
template class TimesNode<double>;
// TransposeTimesNode computes Inputs(0)^T * Inputs(1).
// With Inputs(0) of size rows0 x cols0 and Inputs(1) of size rows1 x cols1, the two
// operands must agree on their row count (rows0 == rows1) and the result is cols0 x cols1.
template<class ElemType>
class TransposeTimesNode : public ComputationNode<ElemType>
{
    UsingComputationNodeMembers;
public:
    // Creates an empty node on deviceId. An empty name requests a generated unique name.
    TransposeTimesNode(const DEVICEID_TYPE deviceId = AUTOPLACEMATRIX, const std::wstring name = L"") : ComputationNode<ElemType>(deviceId)
    {
        m_nodeName = (name == L"" ? CreateUniqNodeName() : name);
        m_deviceId = deviceId;
        MoveMatricesToDevice(deviceId);
        InitRecurrentNode();
    }

    // Creates a node whose state is read from fstream (model format modelVersion).
    TransposeTimesNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId = AUTOPLACEMATRIX, const std::wstring name = L"") : ComputationNode<ElemType>(deviceId)
    {
        m_nodeName = (name == L"" ? CreateUniqNodeName() : name);
        LoadFromFile(fstream, modelVersion, deviceId);
    }

    // copy constructor
    TransposeTimesNode(const TransposeTimesNode<ElemType>* node, const std::wstring& newName, const CopyNodeFlags flags) : ComputationNode<ElemType>(node->m_deviceId)
    {
        node->CopyTo(this, newName, flags);
    }

    // Returns a newly allocated copy of this node. An empty newName keeps the current node name.
    virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const
    {
        const std::wstring& name = (newName == L"") ? NodeName() : newName;

        ComputationNodePtr node = new TransposeTimesNode<ElemType>(this, name, flags);
        return node;
    }

    virtual const std::wstring OperationName() const { return TypeName(); }
    static const std::wstring TypeName() { return L"TransposeTimes"; }

    // Accumulates the gradient for the operand selected by inputIndex (0 = left, 1 = right).
    // Throws std::invalid_argument for any other index.
    virtual void ComputeInputPartial(const size_t inputIndex)
    {
        if (inputIndex > 1)
            throw std::invalid_argument("TransposeTimesNode operation only takes two inputs.");

        if (inputIndex == 0)  //left derivative
        {
            ComputeInputPartialLeft(Inputs(1)->FunctionValues(), Inputs(0)->GradientValues(), GradientValues());
        }
        else  //right derivative
        {
            ComputeInputPartialRight(Inputs(0)->FunctionValues(), Inputs(1)->GradientValues(), GradientValues());
        }
    }

    // Per-time-step variant: only the output gradient and Inputs(1) are sliced to the
    // m_samplesInRecurrentStep columns of step timeIdxInSeq; Inputs(0) is used whole.
    virtual void ComputeInputPartial(const size_t inputIndex, const size_t timeIdxInSeq)
    {
        if (inputIndex > 1)
            throw std::invalid_argument("TransposeTimesNode operation only takes two inputs.");

        if (inputIndex == 0)  //left derivative
        {
            Matrix<ElemType> sliceOutputGrad = GradientValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
            Matrix<ElemType> sliceInput1Value = Inputs(1)->FunctionValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);

            ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad);
        }
        else  //right derivative
        {
            Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
            Matrix<ElemType> sliceOutputGrad = GradientValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);

            ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad);
        }
    }

    // Gradient w.r.t. the left operand A of C = A^T * B:
    // inputGradientValues receives inputFunctionValues (B) times gradientValues^T via MultiplyAndAdd.
    static void WINAPI ComputeInputPartialLeft(Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues)
    {
#if DUMPOUTPUT
        gradientValues.Print("Gradient-in");
        inputGradientValues.Print("child Gradient-in/out");
        inputFunctionValues.Print("child Function values");
#endif
        //currently we only support one combination when the input is sparse.
        if (inputFunctionValues.GetMatrixType() == SPARSE && inputGradientValues.GetMatrixType() == DENSE && gradientValues.GetMatrixType() == DENSE)
            inputGradientValues.SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false);

        Matrix<ElemType>::MultiplyAndAdd(inputFunctionValues, false, gradientValues, true, inputGradientValues);
#if DUMPOUTPUT
        inputGradientValues.Print("child Gradient-out");
#endif
    }

    // Gradient w.r.t. the right operand B of C = A^T * B:
    // inputGradientValues receives inputFunctionValues (A) times gradientValues via MultiplyAndAdd.
    static void WINAPI ComputeInputPartialRight(Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues)
    {
#if DUMPOUTPUT
        gradientValues.Print("Gradient-in");
        inputGradientValues.Print("child Gradient-in/out");
        inputFunctionValues.Print("child Function values");
#endif
        Matrix<ElemType>::MultiplyAndAdd(inputFunctionValues, false, gradientValues, false, inputGradientValues);
#if DUMPOUTPUT
        inputGradientValues.Print("child Gradient-out");
#endif
    }

    // Forward pass over the whole minibatch.
    virtual void EvaluateThisNode()
    {
        EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues());
    }

    // Forward pass for a single time step: Inputs(1) and the output are sliced, Inputs(0) is used whole.
    virtual void EvaluateThisNode(const size_t timeIdxInSeq)
    {
        Matrix<ElemType> sliceInput1Value = Inputs(1)->FunctionValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
        Matrix<ElemType> sliceOutputValue = m_functionValues.ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);

        EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value);
    }

    // functionValues = input0^T * input1
    static void WINAPI EvaluateThisNodeS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0, const Matrix<ElemType>& input1)
    {
#if DUMPOUTPUT
        input0.Print("TransposeTimesNode - Input0");
#endif
        functionValues.AssignProductOf(input0, true, input1, false);
#if NANCHECK
        functionValues.HasNan("TransposeTimes");
#endif
#if DUMPOUTPUT
        functionValues.Print("TransposeTimes");
#endif
    }

    // Checks the operand count and dimensions, infers the shared row dimension of a
    // LearnableParameter operand where possible, and sizes the output to cols0 x cols1.
    // Throws std::logic_error on a wrong operand count or on dimensions that are
    // missing or do not match (dimension checks apply only when the node is not in a loop).
    virtual void Validate()
    {
        PrintSelfBeforeValidation();

        if (m_children.size() != 2)
            throw std::logic_error("TransposeTimes operation requires two inputs.");

        size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols0 = Inputs(0)->FunctionValues().GetNumCols();
        size_t rows1 = Inputs(1)->FunctionValues().GetNumRows(), cols1 = Inputs(1)->FunctionValues().GetNumCols();

        // The result is cols0 x cols1. Neither of these dimensions appears in the other
        // operand, so a zero here cannot be filled in automatically.
        if ((cols0 == 0 || cols1 == 0) && this->LoopId() < 0)
            throw std::logic_error("TransposeTimes operation: Inputs(0)->FunctionValues().GetNumCols() and Inputs(1)->FunctionValues().GetNumCols() should not be 0 since it cannot be automatically inferred");

        //support automatic dimension inference for learnable parameters:
        //the row count is shared by both operands, so a missing one is copied from the other side
        if ((Inputs(0)->OperationName() == LearnableParameter<ElemType>::TypeName() && rows0 == 0 && rows1 != 0) && this->LoopId() < 0)
            Inputs(0)->FunctionValues().Resize(rows1, cols0);

        if (Inputs(1)->OperationName() == LearnableParameter<ElemType>::TypeName() && rows0 != 0 && rows1 == 0)
            Inputs(1)->FunctionValues().Resize(rows0, cols1);

        if ((Inputs(0)->FunctionValues().GetNumElements() == 0 || Inputs(1)->FunctionValues().GetNumElements() == 0) && this->LoopId() < 0)
            throw std::logic_error("TransposeTimes operation: One of the operants has 0 elements.");

        //rows0 and rows1 may have been changed so don't use them in the following check
        if ((Inputs(1)->FunctionValues().GetNumRows() != Inputs(0)->FunctionValues().GetNumRows()) && this->LoopId() < 0)
        {
            throw std::logic_error("The Matrix dimension in the TransposeTimes operation does not match.");
        }

        FunctionValues().Resize(cols0, cols1);
        CopyImageSizeFromInputs();
    }

    // Derives the output image layout: a single column of height equal to the output row count.
    virtual void CopyImageSizeFromInputs()
    {
        CopyImageSizeFromInput(1, false); //the second one is the input since it's column wise

        //after multiplication the structure is lost
        m_outputWidth = 1;
        // The output of Inputs(0)^T * Inputs(1) has as many rows as Inputs(0) has columns.
        m_outputHeight = Inputs(0)->FunctionValues().GetNumCols();
        m_outputChannels = 1;
    }

    // Sets the two operands: leftNode becomes Inputs(0) (the transposed one), rightNode becomes Inputs(1).
    virtual void AttachInputs(const ComputationNodePtr leftNode, const ComputationNodePtr rightNode)
    {
        m_children.resize(2);
        m_children[0] = leftNode;
        m_children[1] = rightNode;
    }
};
template class TransposeTimesNode<float>;
template class TransposeTimesNode<double>;
template<class ElemType>
class ElementTimesNode : public ComputationNode<ElemType>
{

Просмотреть файл

@ -158,6 +158,8 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
ret = true;
else if (EqualInsensitive(nodeType, TimesNode<ElemType>::TypeName()))
ret = true;
else if (EqualInsensitive(nodeType, TransposeTimesNode<ElemType>::TypeName()))
ret = true;
else if (EqualInsensitive(nodeType, ElementTimesNode<ElemType>::TypeName()))
ret = true;
else if (EqualInsensitive(nodeType, DiagTimesNode<ElemType>::TypeName()))

Просмотреть файл

@ -982,13 +982,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
// evaluation uses softmax
m_logSoftmax.AssignProductOf(Inputs(1)->FunctionValues(), true, Inputs(2)->FunctionValues(), false);
/*
#pragma omp parallel for
for (int i = 0; i < Inputs(0)->FunctionValues().GetNumCols(); i++)
for (int j = 0; j < Inputs(3)->FunctionValues().GetNumRows(); j++)
m_logSoftmax(i, j) += Inputs(3)->FunctionValues()(j, 0);
*/
m_logSoftmax += Inputs(3)->FunctionValues().Transpose();
m_logSoftmax += Inputs(3)->FunctionValues();
m_logSoftmax.InplaceLogSoftmax(false);
FunctionValues().Resize(1, 1);
FunctionValues().SetValue(0);

Просмотреть файл

@ -37,6 +37,7 @@ CUDA_PATH = /usr/local/cuda-7.0
# This is a suggested/default location for NVML
NVML_INCLUDE = /usr/include/nvidia/gdk
NVML_LIB = /usr/src/gdk/nvml/lib
#######
BUILDFOR = $(ARCH).$(DEVICE).$(BUILDTYPE).$(MATHLIB)
@ -50,7 +51,7 @@ ifeq ($(BUILDTYPE),debug)
GPU_BUILDTYPE_OPT = -O0 -G -lineinfo
else
BUILDTYPE_OPT = -O3 -flto
GPU_BUILDTYPE_OPT = -O3 -use_fast_math
GPU_BUILDTYPE_OPT = -O3 -use_fast_math -lineinfo
endif
# Set up math library defines and libraries
@ -66,7 +67,7 @@ endif
# Set up CUDA includes and libraries
CUDA_INCLUDE = $(CUDA_PATH)/include
CUDA_LIB = -L$(CUDA_PATH)/lib64 -lcublas -lcudart -lcuda -lcurand -lcusparse -lnvidia-ml
CUDA_LIB = -L$(CUDA_PATH)/lib64 -L$(NVML_LIB) -lcublas -lcudart -lcurand -lcusparse -lnvidia-ml
# Set up final list of libs to use
ifeq ($(DEVICE),gpu)

Просмотреть файл

@ -78,7 +78,8 @@ COMMON_SRC = Common/fileutil.cpp Common/DataWriter.cpp Common/ConfigFile.cpp Com
MATH_SRC = Math/Math/Matrix.cpp Math/Math/CPUMatrix.cpp Math/Math/CPUSparseMatrix.cpp Math/Math/NoGPU.cpp
CN_SRC = MachineLearning/CNTK/NetworkDescriptionLanguage.cpp MachineLearning/CNTK/CNTK.cpp MachineLearning/CNTK/ComputationNode.cpp \
MachineLearning/CNTK/ModelEditLanguage.cpp \
MachineLearning/CNTK/SimpleNetworkBuilder.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
MachineLearning/CNTK/SimpleNetworkBuilder.cpp \
MachineLearning/CNTK/Profiler.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
BINARYREADER_SRC = DataReader/BinaryReader/BinaryWriter.cpp DataReader/BinaryReader/BinaryReader.cpp DataReader/BinaryReader/BinaryFile.cpp
HTKMLFREADER_SRC = DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp DataReader/HTKMLFReader_linux/DataWriter.cpp DataReader/HTKMLFReader_linux/DataReader.cpp DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
KALDIREADER_SRC = DataReader/KaldiReader/HTKMLFWriter.cpp DataReader/KaldiReader/DataWriter.cpp DataReader/KaldiReader/DataReader.cpp DataReader/KaldiReader/HTKMLFReader.cpp

Просмотреть файл

@ -88,7 +88,8 @@ MATH_SRC = Math/Math/Matrix.cpp Math/Math/GPUMatrix.cu Math/Math/GPUMatrixCUDAKe
Math/Math/CPUMatrix.cpp Math/Math/CPUSparseMatrix.cpp #Math/Math/InstantiateTemplates.cu
CN_SRC = MachineLearning/CNTK/NetworkDescriptionLanguage.cpp MachineLearning/CNTK/CNTK.cpp MachineLearning/CNTK/ComputationNode.cpp \
MachineLearning/CNTK/ModelEditLanguage.cpp \
MachineLearning/CNTK/SimpleNetworkBuilder.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTK/Profiler.cpp MachineLearning/CNTKEval/CNTKEval.cpp
MachineLearning/CNTK/SimpleNetworkBuilder.cpp \
MachineLearning/CNTK/Profiler.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTK/Profiler.cpp MachineLearning/CNTKEval/CNTKEval.cpp
BINARYREADER_SRC = #DataReader/BinaryReader/BinaryWriter.cpp DataReader/BinaryReader/BinaryReader.cpp DataReader/BinaryReader/BinaryFile.cpp
HTKMLFREADER_SRC = DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp DataReader/HTKMLFReader_linux/DataWriter.cpp DataReader/HTKMLFReader_linux/DataReader.cpp DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
KALDIREADER_SRC = DataReader/KaldiReader/HTKMLFWriter.cpp DataReader/KaldiReader/DataWriter.cpp DataReader/KaldiReader/DataReader.cpp DataReader/KaldiReader/HTKMLFReader.cpp

Просмотреть файл

@ -88,7 +88,8 @@ COMMON_SRC = Common/fileutil.cpp Common/DataWriter.cpp Common/ConfigFile.cpp Com
MATH_SRC = Math/Math/Matrix.cpp Math/Math/CPUMatrix.cpp Math/Math/CPUSparseMatrix.cpp Math/Math/NoGPU.cpp
CN_SRC = MachineLearning/CNTK/NetworkDescriptionLanguage.cpp MachineLearning/CNTK/CNTK.cpp MachineLearning/CNTK/ComputationNode.cpp \
MachineLearning/CNTK/ModelEditLanguage.cpp MachineLearning/CNTK/Profiler.cpp \
MachineLearning/CNTK/SimpleNetworkBuilder.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
MachineLearning/CNTK/SimpleNetworkBuilder.cpp \
MachineLearning/CNTK/Profiler.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
BINARYREADER_SRC = DataReader/BinaryReader/BinaryWriter.cpp DataReader/BinaryReader/BinaryReader.cpp DataReader/BinaryReader/BinaryFile.cpp
HTKMLFREADER_SRC = DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp DataReader/HTKMLFReader_linux/DataWriter.cpp DataReader/HTKMLFReader_linux/DataReader.cpp DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
KALDIREADER_SRC = DataReader/KaldiReader/HTKMLFWriter.cpp DataReader/KaldiReader/DataWriter.cpp DataReader/KaldiReader/DataReader.cpp DataReader/KaldiReader/HTKMLFReader.cpp

Просмотреть файл

@ -48,8 +48,8 @@ DEVICE = gpu
#BUILDTYPE = debug
BUILDTYPE = release
# comment following and uncomment the next one to enable MKL library
#MATHLIB = acml
MATHLIB = mkl
MATHLIB = acml
#MATHLIB = mkl
# modify relevant path below for your system
MKL_PATH = /usr/users/yzhang87/tools/composer_xe_2015.2.164
ACML_PATH = /usr/users/yzhang87/code/acml/gfortran64
@ -99,8 +99,9 @@ COMMON_SRC = Common/fileutil.cpp Common/DataWriter.cpp Common/ConfigFile.cpp Com
MATH_SRC = Math/Math/Matrix.cpp Math/Math/GPUMatrix.cu Math/Math/GPUMatrixCUDAKernels.cu Math/Math/GPUSparseMatrix.cu Math/Math/GPUWatcher.cu \
Math/Math/CPUMatrix.cpp Math/Math/CPUSparseMatrix.cpp #Math/Math/InstantiateTemplates.cu
CN_SRC = MachineLearning/CNTK/NetworkDescriptionLanguage.cpp MachineLearning/CNTK/CNTK.cpp MachineLearning/CNTK/ComputationNode.cpp \
MachineLearning/CNTK/ModelEditLanguage.cpp MachineLearning/CNTK/Profiler.cpp \
MachineLearning/CNTK/SimpleNetworkBuilder.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
MachineLearning/CNTK/ModelEditLanguage.cpp \
MachineLearning/CNTK/SimpleNetworkBuilder.cpp \
MachineLearning/CNTK/Profiler.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
BINARYREADER_SRC = #DataReader/BinaryReader/BinaryWriter.cpp DataReader/BinaryReader/BinaryReader.cpp DataReader/BinaryReader/BinaryFile.cpp
HTKMLFREADER_SRC = DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp DataReader/HTKMLFReader_linux/DataWriter.cpp DataReader/HTKMLFReader_linux/DataReader.cpp DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
KALDIREADER_SRC = DataReader/KaldiReader/HTKMLFWriter.cpp DataReader/KaldiReader/DataWriter.cpp DataReader/KaldiReader/DataReader.cpp DataReader/KaldiReader/HTKMLFReader.cpp

Просмотреть файл

@ -38,7 +38,7 @@ namespace CNTKMathTest
}
DenseMatrix DM1 = DM0.ColumnSlice(start, numCols);
DenseMatrix DM2 = SM0.ColumnSlice(start, numCols);
DenseMatrix DM2 = SM0.ColumnSliceToDense(start, numCols);
Assert::IsTrue(DM1.IsEqualTo(DM2, 0.0001));
}

Просмотреть файл

@ -410,11 +410,11 @@ namespace CNTKMathTest
SM0.SetValue(M0);
GPUMatrix<float> M1 = M0.ColumnSlice(0, 2);
GPUMatrix<float> SM1 = SM0.ColumnSlice(0, 2);
GPUMatrix<float> SM1 = SM0.ColumnSliceToDense(0, 2);
Assert::IsTrue(M1.IsEqualTo(SM1, 0.0001f));
GPUMatrix<float> M2 = M0.ColumnSlice(1, 2);
GPUMatrix<float> SM2 = SM0.ColumnSlice(1, 2);
GPUMatrix<float> SM2 = SM0.ColumnSliceToDense(1, 2);
Assert::IsTrue(M2.IsEqualTo(SM2, 0.0001f));
Assert::IsFalse(M1.IsEqualTo(SM2, 0.0001f));

Просмотреть файл

@ -3918,7 +3918,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (int instance_id = 0; instance_id < batch_size; instance_id++)
for (int sample_id = 0; sample_id < sample_size; sample_id++)
{
int sample =(int) (*this)(2 * sample_id, instance_id);
int sample = (int)(*this)(2 * sample_id, instance_id);
for (int dim = 0; dim < b.GetNumRows(); dim++)
c(dim, instance_id) -= b(dim, sample)* tmp(sample_id, instance_id);
}
@ -3961,16 +3961,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// a: hidden
// b: embedding
// tmp: softmax
// c: loglikelihood
// c: loglikelihood
{
/*z
for (int i = 0; i < (*this).GetNumRows(); i++)
{
for (int j = 0; j < (*this).GetNumCols(); j++)
std::cerr << (*this)(i, j) << " ";
std::cerr << endl;
}
*/
double log_likelihood = 0.0;
size_t sample_size = this->GetNumRows() / 2;
size_t batch_size = this->GetNumCols();
@ -3980,7 +3972,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (int instance_id = 0; instance_id < batch_size; instance_id++)
for (int sample_id = 0; sample_id < sample_size; sample_id++)
{
int sample =(int) (*this)(2 * sample_id, instance_id);
int sample = (int)(*this)(2 * sample_id, instance_id);
double score = bias(0, sample);
for (int dim = 0; dim < b.GetNumRows(); dim++)
score += a(dim, instance_id)* b(dim, sample);
@ -3994,9 +3986,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
tmp(sample_id, instance_id) = (ElemType)-std::exp(logprob);
if (sample_id == 0)
tmp(sample_id, instance_id) += 1;
log_likelihood += sample_id == 0 ? logprob : logprob_noise;
log_likelihood += sample_id == 0 ? logprob : logprob_noise;
}
c(0, 0) = (ElemType)-log_likelihood;
}

Просмотреть файл

@ -1895,15 +1895,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
UNCONST(ElemType, a, my_a);
UNCONST(ElemType, b, my_b);
UNCONST(ElemType, bias, my_bias);
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
//a: dim * minibatch
//b: dim * |vocab|
int p = 512;
int width = a.GetNumRows(); //dimension of hidden vector
//int width = a.GetNumCols(); original setup, considering column-major
//
while (p / 2 > width) p = p / 2;
_computeNceOutput<ElemType> << <this->GetNumElements() / 2, p >> >(
@ -1915,28 +1913,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
my_b.GetArray(),//b
my_bias.GetArray(),
tmp.GetArray());//tmp
p = 512;
while (p / 2 > this->GetNumElements() / 2) p = p / 2;
// summing up objective must be done in one block
_assignNoiseContrastiveEstimation<ElemType> << <1, p >> >(
this->GetArray(),
sampleCount,
m_numRows / 2,
my_a.GetArray(),
my_a.GetArray(),
a.GetNumCols(),
my_b.GetArray(),
tmp.GetArray(),
c.GetArray());
_computeNceError<ElemType> << <1, p >> >(
this->GetArray(),
m_numRows / 2,
tmp.GetNumCols(),
tmp.GetArray());
cerr << "log-likelihood:" << Get00Element() << endl;
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -1953,8 +1943,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int p = 512;
int width = a.GetNumRows();
while (p / 2 > width) p = p / 2;
_assignNceDerivative<ElemType> << <m_nz, p >> >(
_assignNceDerivative<ElemType> << <this->GetNumElements() / 2, p >> >(
GetArray(),
tmp.GetNumCols(),
m_numRows / 2,
@ -1964,7 +1953,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
tmp.GetArray(),
c.GetArray(),
inputIndex);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));

Просмотреть файл

@ -2928,80 +2928,10 @@ __global__ void _computeNceOutput(
}
if (threadIdx.x == 0)
res[i] = partials[0];
res[i] = partials[0] + bias[wid];
}
}
// Computes, for every sampled (row, word) item i, the dot product between row
// rowIndex of a and column col[i] of b, and stores it in res[i].
//
// val and col are in CSR format
// val is an array containing log_Pn(w). To differentiate positive and negative samples,
// log_Pn(w) is stored as it is for positive samples, and as -log_Pn(w) for negative samples
// (val is not read by this kernel)
// col is an array containing the index of the word samples
// a is a matrix in column major format containing the output from the hidden layer
// b is the weight matrix for the output layer
// res is the buffer to store the computed output (sparse)
//
// The shared buffer holds 512 entries, so the kernel must be launched with at most
// 512 threads per block; the halving reduction below only covers every entry when
// blockDim.x is a power of two.
template<class ElemType>
__global__ void _computeNceOutput(
    const ElemType* val,
    const int* col,
    int numRows,
    int sampleCount,
    const ElemType* a,
    int numCols_a,
    const ElemType* b,
    ElemType* res)
{
    __shared__ ElemType partials[512];

    // determine the elements to be handled by this block
    int total = numRows * sampleCount;
    int loadPerBlock = (total + gridDim.x - 1) / gridDim.x;
    int start = loadPerBlock * blockIdx.x;
    int end = min(total, loadPerBlock * (blockIdx.x + 1));

    for (int i = start; i < end; i++)
    {
        int colIndex = col[i];
        int rowIndex = i / sampleCount;

        // slice of the dot product handled by this thread
        int loadPerThread = (numCols_a + blockDim.x - 1) / blockDim.x;
        int tstart = loadPerThread * threadIdx.x;
        int tend = min(numCols_a, loadPerThread * (threadIdx.x + 1));

        // Accumulate all products of the slice. Writing the sum unconditionally also
        // resets the slot of threads that have no work for this item, so nothing
        // left over from the previous item's reduction leaks into this one.
        ElemType sum = 0;
        for (int j = tstart; j < tend; j++)
            sum += a[IDX2C(rowIndex, j, numRows)] * b[IDX2C(j, colIndex, numCols_a)];
        partials[threadIdx.x] = sum;

        __syncthreads();

        // sum up
        int nTotalThreads = blockDim.x;

        while (nTotalThreads > 1)
        {
            int halfPoint = (nTotalThreads >> 1);

            if (threadIdx.x < halfPoint)
                partials[threadIdx.x] += partials[threadIdx.x + halfPoint];

            __syncthreads();

            nTotalThreads = (nTotalThreads >> 1);
        }

        if (threadIdx.x == 0)
            res[i] = partials[0];
    }
}
template<class ElemType>
__global__ void _assignNoiseContrastiveEstimation(
const ElemType* val,
@ -3010,7 +2940,7 @@ __global__ void _assignNoiseContrastiveEstimation(
const ElemType* a,
int width, // number of columns in a
const ElemType* b,
const ElemType* tmp,
ElemType* tmp,
ElemType* c) // run on 512 threads per block
{
// val and col are in CSR format
@ -3026,32 +2956,30 @@ __global__ void _assignNoiseContrastiveEstimation(
partials[threadIdx.x] = 0;
int total = numRows * sampleCount;
int loadPerThread = (total + 511) / 512;
int loadPerThread = (total + blockDim.x - 1) / blockDim.x;
// find out the items this thread is responsible for
int start = loadPerThread * threadIdx.x;
int end = min(total, loadPerThread * (threadIdx.x + 1));
ElemType log_num_noise_samples = log((ElemType)(sampleCount - 1));
for (int i = start; i < end; i++)
{
//int colIndex = col[i];
//int rowIndex = i / sampleCount;
// add to objective
ElemType log_pnw = val[2 * i + 1];
if (log_pnw < 0) // positive sample: log(pw / (pw + k * pnw))
{
ElemType den = tmp[i];
logadd(den, log((ElemType)(sampleCount - 1)) + log_pnw);
partials[threadIdx.x] += (tmp[i] - den);
}
else // negative sample: log(k * pnw / (pw + k * pnw))
{
ElemType nom = log((ElemType)(sampleCount - 1)) - log_pnw;
ElemType den = nom;
logadd(den, tmp[i]);
partials[threadIdx.x] += (nom - den);
}
ElemType prob = -val[2 * i + 1];
bool positive = (prob > 0);
if (positive)
prob = -prob;
ElemType score_noise = log_num_noise_samples + prob;
ElemType z = logadd(tmp[i], score_noise);
ElemType logprob = tmp[i] - z;
ElemType logprob_noise = score_noise - z;
tmp[i] = -exp(logprob);
if (positive)
tmp[i] += 1;
if (positive)
partials[threadIdx.x] += logprob;
else
partials[threadIdx.x] += logprob_noise;
}
__syncthreads();
@ -3075,211 +3003,6 @@ __global__ void _assignNoiseContrastiveEstimation(
c[0] = -partials[0];
}
// Sums a per-sample log term over all numRows * sampleCount items and writes the
// negated sum to c[0].
// Each thread adds the terms for its contiguous range of items into its slot of a
// shared buffer; the slots are then combined with a halving reduction.
// col, a, width and b are accepted but not read by this kernel.
// NOTE(review): the code relies on logadd() updating its first argument in place
// (den is read right after the call) — confirm against logadd's definition.
template<class ElemType>
__global__ void _assignNoiseContrastiveEstimation(
const ElemType* val,
const int* col,
int numRows,
int sampleCount,
const ElemType* a,
int width, // number of columns in a
const ElemType* b,
const ElemType* tmp,
ElemType* c) // run on 512 threads per block
{
// val and col are in CSR format
// val is an array containing log_Pn(w). To differentiate positive and negative samples,
// we store log_Pn(w) as it is for positive samples, and -log_Pn(w) for negative samples
// col is an array containing the index of the word samples
// a is a matrix in column major format containing the output from the hidden layer
// b is the weight matrix for the output layer
// tmp is the buffer that stores the NCE output calculated by _computeNceOutput
// c is the matrix to store the objective
__shared__ ElemType partials[512];
partials[threadIdx.x] = 0;
int total = numRows * sampleCount;
// the work split is hard-wired to 512 threads, matching the size of partials
int loadPerThread = (total + 511) / 512;
// find out the items this thread is responsible for
int start = loadPerThread * threadIdx.x;
int end = min(total, loadPerThread * (threadIdx.x + 1));
for (int i = start; i < end; i++)
{
//int colIndex = col[i];
//int rowIndex = i / sampleCount;
// add to objective
ElemType log_pnw = val[i];
if (log_pnw < 0) // positive sample: log(pw / (pw + k * pnw))
{
ElemType den = tmp[i];
logadd(den, log((ElemType)(sampleCount - 1)) + log_pnw);
partials[threadIdx.x] += (tmp[i] - den);
}
else // negative sample: log(k * pnw / (pw + k * pnw))
{
ElemType nom = log((ElemType)(sampleCount - 1)) - log_pnw;
ElemType den = nom;
logadd(den, tmp[i]);
partials[threadIdx.x] += (nom - den);
}
}
// every thread's partial sum must be in place before the reduction starts
__syncthreads();
// now sum up the objective function
// (the halving loop only covers every slot when blockDim.x is a power of two)
int nTotalThreads = blockDim.x;
while (nTotalThreads >1)
{
int halfPoint = (nTotalThreads >> 1);
if (threadIdx.x < halfPoint)
partials[threadIdx.x] += partials[threadIdx.x + halfPoint];
__syncthreads();
nTotalThreads = (nTotalThreads >> 1);
}
// the objective is the negated total
if (threadIdx.x == 0)
c[0] = -partials[0];
}
// Replaces, in place, every precomputed NCE output tmp[i] with its error term.
// val stores two entries per item; the second one (val[2 * i + 1]) is the signed
// log noise probability: negative for a positive sample, non-negative otherwise.
// The items are split into contiguous ranges, one per thread of the block.
// NOTE(review): logadd() is expected to update its first argument in place — confirm.
template<class ElemType>
__global__ void _computeNceError(
    const ElemType* val,
    int numRows,
    int sampleCount,
    ElemType* tmp) // run on one block
{
    int itemCount = numRows * sampleCount;
    int itemsPerThread = (itemCount + blockDim.x - 1) / blockDim.x;

    // range of items owned by this thread
    int first = itemsPerThread * threadIdx.x;
    int last = min(itemCount, itemsPerThread * (threadIdx.x + 1));

    for (int item = first; item < last; item++)
    {
        ElemType logSum = tmp[item]; // precalculated NCE output
        ElemType logNoiseProb = val[2 * item + 1];

        if (logNoiseProb < 0)
        {
            // positive sample: k * pnw / (pw + k * pnw)
            ElemType logNoiseScore = log((ElemType)(sampleCount - 1)) + logNoiseProb;
            logadd(logSum, logNoiseScore);
            ElemType error = -1 * exp(logNoiseScore - logSum);
            tmp[item] = error;
        }
        else
        {
            // negative sample: pw / (pw + k * pnw);
            logadd(logSum, log((ElemType)(sampleCount - 1)) - logNoiseProb);
            ElemType error = exp(tmp[item] - logSum);
            tmp[item] = error;
        }
    }
}
// Replaces, in place, every precomputed NCE output tmp[i] with its error term.
// This overload reads the signed log noise probability from val[i]
// (negative for a positive sample, non-negative otherwise); col is not read.
// The items are split into contiguous ranges, one per thread of the block.
// NOTE(review): logadd() is expected to update its first argument in place — confirm.
template<class ElemType>
__global__ void _computeNceError(
    const ElemType* val,
    const int* col,
    int numRows,
    int sampleCount,
    ElemType* tmp) // run on one block
{
    int itemCount = numRows * sampleCount;
    int itemsPerThread = (itemCount + blockDim.x - 1) / blockDim.x;

    // range of items owned by this thread
    int first = itemsPerThread * threadIdx.x;
    int last = min(itemCount, itemsPerThread * (threadIdx.x + 1));

    for (int item = first; item < last; item++)
    {
        ElemType logSum = tmp[item]; // precalculated NCE output
        ElemType logNoiseProb = val[item];

        if (logNoiseProb < 0)
        {
            // positive sample: k * pnw / (pw + k * pnw)
            ElemType logNoiseScore = log((ElemType)(sampleCount - 1)) + logNoiseProb;
            logadd(logSum, logNoiseScore);
            ElemType error = -1 * exp(logNoiseScore - logSum);
            tmp[item] = error;
        }
        else
        {
            // negative sample: pw / (pw + k * pnw);
            logadd(logSum, log((ElemType)(sampleCount - 1)) - logNoiseProb);
            ElemType error = exp(tmp[item] - logSum);
            tmp[item] = error;
        }
    }
}
// Kernel whose whole body is currently commented out: launching it performs no work
// and none of its parameters are read or written.
// The disabled code below is kept for reference only.
template<class ElemType>
__global__ void _assignNceDerivativeInJbor(
const ElemType* val,
const int* col,
const int* colndx,
int numRows,
int sampleCount,
const ElemType* a,
int width, // number of columns in a
const ElemType* b,
const ElemType* tmp,
unsigned char* c,
size_t jborBlockSize)
{
// val and col are the CSR format sparse matrix for the label
// val is an array containing log_Pn(w). To differentiate positive and negative samples
// we store log_Pn(w) as it is for positive samples, and -log_Pn(w) for negative samples
// col is an array containing the index of the word samples
// a is a matrix in column major format containing the output from the hidden layer
// b is the weight matrix for the output layer
// tmp is a matrix of precalculated error
// c is the output array to store intermediate results
/*
int total = numRows * sampleCount;
int loadPerBlock = (total + gridDim.x - 1) / gridDim.x;
// find out the items this block is responsible for
int start = loadPerBlock * blockIdx.x;
int end = min(total, loadPerBlock * (blockIdx.x + 1));
for (int i = start; i < end; i++)
{
int colIndex = col[i];
int rowIndex = i / sampleCount;
ElemType er = tmp[i]; // precalculated error for this output node
ElemType log_pnw = val[i];
// calculate gradients
int loadPerThread = (width + blockDim.x - 1) / blockDim.x;
int tstart = loadPerThread * threadIdx.x;
int tend = min(width, loadPerThread*(threadIdx.x + 1));
for (int j = tstart; j < tend; j++)
{
ElemType val = er * a[IDX2C(rowIndex, j, numRows)];
unsigned char* pj = c + (j*jborBlockSize + colndx[i]) * sizeof(JborRecord<ElemType>);
atomicExch((int*)pj, colIndex);
atomicAdd((ElemType*)(pj + sizeof(int)), val);
}
}*/
}
template<class ElemType>
__global__ void _assignNceDerivative(
const ElemType* val,
@ -3310,12 +3033,11 @@ __global__ void _assignNceDerivative(
for (int i = start; i < end; i++)
{
int colIndex = (int)val[2 * i];
int rowIndex = i / sampleCount;
int wid = (int)val[2 * i];
int batchId = i / sampleCount;
ElemType er = tmp[i]; // precalculated error for this output node
//ElemType log_pnw = val[2 * i + 1];
// calculate gradients
int loadPerThread = (width + blockDim.x - 1) / blockDim.x;
int tstart = loadPerThread * threadIdx.x;
@ -3325,23 +3047,24 @@ __global__ void _assignNceDerivative(
{
for (int j = tstart; j < tend; j++)
{
ElemType val = er * b[IDX2C(j, colIndex, width)];
atomicAdd(c + IDX2C(j, rowIndex, width), val);
//c[IDX2C(rowIndex, j, numRows)] += val;
ElemType val = -er * b[IDX2C(j, wid, width)];
atomicAdd(&c[IDX2C(j, batchId, width)], val);
//c[IDX2C(j, batchId, width)] += val;
//c[IDX2C(batchId, j, numRows)] += val;
}
}
else if (inputIndex == 2) // weight
{
for (int j = tstart; j < tend; j++)
{
ElemType val = er * a[IDX2C(j, rowIndex, width)];
atomicAdd(c + IDX2C(j, colIndex, width), val);
//c[IDX2C(j, colIndex, width)] += val;
ElemType val = -er * a[IDX2C(j, batchId, width)];
atomicAdd(&c[IDX2C(j, wid, width)], val);
//c[IDX2C(j, wid, width)] += val;
}
}
else //bias vector
{
c[colIndex] += er;
c[wid] -= er;
}
}
}

Просмотреть файл

@ -11,6 +11,7 @@
#include <assert.h>
#include <math.h>
#include "GPUWatcher.h" // bring in this class as well so that it gets exported from this DLL
#include <iostream>
#ifndef CPUONLY
#pragma comment (lib, "CNTKMathCUDA.lib") // built by CNTKMathCUDA project
@ -3687,8 +3688,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
throw std::logic_error("AssignNoiseContrastiveEstimation: one of the input matrices is empty.");
if (a.GetDeviceId() != b.GetDeviceId() || b.GetDeviceId() != c.GetDeviceId() || c.GetDeviceId() != this->GetDeviceId())
{
std::cerr << a.GetDeviceId() << " " << b.GetDeviceId() << " " << c.GetDeviceId() << " " << this->GetDeviceId() << std::endl;
NOT_IMPLEMENTED;
}
this->Resize(1, 1);
if (this->GetDeviceId() < 0)

Просмотреть файл

@ -232,6 +232,12 @@ namespace Microsoft {
// Stub with an empty body: calling it leaves the matrix untouched.
// NOTE(review): presumably the placeholder used when building without GPU support — confirm.
template<class ElemType> void GPUSparseMatrix<ElemType>::InplaceTranspose() { }
// Stub: ignores startColumn and numCols and does not read any matrix data.
// NOTE(review): the declared return type is GPUMatrix<ElemType> but the body returns the
// scalar ElemType(0), so this only compiles through an implicit conversion to
// GPUMatrix<ElemType> — confirm that conversion is intended and yields an empty matrix.
template<class ElemType>
GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ColumnSliceToDense(size_t startColumn, size_t numCols) const
{
return ElemType(0);
}
// Stub: always reports a sum of zero without inspecting the matrix contents.
template<class ElemType>
ElemType GPUSparseMatrix<ElemType>::SumOfAbsElements() const
{
    return static_cast<ElemType>(0);
}
@ -1053,7 +1059,7 @@ namespace Microsoft {
template<class ElemType>
void GPUMatrix<ElemType>::AssignNoiseContrastiveEstimation(const GPUMatrix<ElemType>& a,
const GPUMatrix<ElemType>& b, size_t sampleCount, GPUMatrix<ElemType>& tmp, GPUMatrix<ElemType>& c)
const GPUMatrix<ElemType>& b, const GPUMatrix<ElemType>& bias, size_t sampleCount, GPUMatrix<ElemType>& tmp, GPUMatrix<ElemType>& c)
{
}