Merge remote-tracking branch 'origin/master' into linux-gcc
Conflicts:
    Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Adv_Chapter.lyx
    MachineLearning/CNTK/ComputationNetwork.h
    MachineLearning/CNTK/LinearAlgebraNodes.h
    MachineLearning/CNTK/NetworkDescriptionLanguage.cpp
    Makefile_kaldi2.cpu
    Makefile_kaldi2.gpu
Commit f7f902107f
Two file diffs are not shown because they are too large.
@ -2290,6 +2290,50 @@ m1, m2 - input matrices.
The m1.cols must equal m2.rows.
\end_layout

\begin_layout Subsubsection
TransposeTimes
\begin_inset Index idx
status open

\begin_layout Plain Layout
TransposeTimes
\end_layout

\end_inset


\end_layout

\begin_layout Standard
Calculate the product
\begin_inset Formula $m1^{T}m2$
\end_inset

.
The resulting matrix has a size of m1.cols by m2.cols.
The syntax is
\end_layout

\begin_layout Standard
\begin_inset listings
inline false
status open

\begin_layout Plain Layout

TransposeTimes(m1, m2)
\end_layout

\end_inset


\end_layout

\begin_layout Itemize
m1, m2 - input matrices.
The m1.rows must equal m2.rows.
\end_layout

\begin_layout Subsubsection
DiagTimes
\begin_inset Index idx
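As a quick check of the shape rule documented in the hunk above (the concrete sizes here are illustrative, not from the source), in LaTeX notation:

    m_1 \in \mathbb{R}^{512 \times 128}, \quad m_2 \in \mathbb{R}^{512 \times 64}
    \;\Longrightarrow\; m_1^{T} m_2 \in \mathbb{R}^{128 \times 64}

that is, the two inputs must agree in their number of rows, and the result is m1.cols by m2.cols.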
|
@ -235,7 +235,7 @@ public:
|
|||
std::vector<ComputationNodePtr> DelayNodes;
|
||||
for (auto n : allnodes)
|
||||
{
|
||||
if (n->OperationName() == L"Delay")
|
||||
if (n->OperationName() == DelayNode<ElemType>::TypeName())
|
||||
{
|
||||
DelayNodes.push_back(n);
|
||||
}
|
||||
|
@ -245,7 +245,7 @@ public:
|
|||
std::vector<ComputationNodePtr> learnableParameters;
|
||||
for (auto n : allnodes)
|
||||
{
|
||||
if (n->OperationName() == L"LearnableParameter")
|
||||
if (n->OperationName() == LearnableParameter<ElemType>::TypeName())
|
||||
{
|
||||
learnableParameters.push_back(n);
|
||||
}
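The hunks in this header replace hard-coded wide-string literals such as L"LearnableParameter" and L"Delay" with the node classes' static TypeName() accessors. A brief sketch of the pattern being relied on (the TypeName body shown here is inferred from the old literal; it is not copied from LearnableParameter's actual declaration):

    // Each node class declares its canonical operation name exactly once:
    static const std::wstring TypeName() { return L"LearnableParameter"; }

    // Call sites then compare against that single definition instead of repeating the literal:
    if (n->OperationName() == LearnableParameter<ElemType>::TypeName())
        learnableParameters.push_back(n);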
|
||||
|
@ -348,7 +348,7 @@ public:
|
|||
std::wstring srcname = src->GetName();
|
||||
std::wstring desname = des->GetName();
|
||||
|
||||
if (des->OperationName() == L"Delay")
|
||||
if (des->OperationName() == DelayNode<ElemType>::TypeName())
|
||||
{
|
||||
// special treatment for arcs with a Delay node as the child
|
||||
// create a dummy node
|
||||
|
@ -1181,6 +1181,10 @@ public:
|
|||
{
|
||||
newNode = new TimesNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName);
|
||||
}
|
||||
else if (nodeType == TransposeTimesNode<ElemType>::TypeName())
|
||||
{
|
||||
newNode = new TransposeTimesNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName);
|
||||
}
|
||||
else if (nodeType == ElementTimesNode<ElemType>::TypeName())
|
||||
{
|
||||
newNode = new ElementTimesNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName);
|
||||
|
@ -1481,6 +1485,10 @@ public:
|
|||
{
|
||||
newNode = new TimesNode<ElemType>(m_deviceId, nodeName);
|
||||
}
|
||||
else if (nodeType == TransposeTimesNode<ElemType>::TypeName())
|
||||
{
|
||||
newNode = new TransposeTimesNode<ElemType>(m_deviceId, nodeName);
|
||||
}
|
||||
else if (nodeType == ElementTimesNode<ElemType>::TypeName())
|
||||
{
|
||||
newNode = new ElementTimesNode<ElemType>(m_deviceId, nodeName);
|
||||
|
@ -1953,6 +1961,16 @@ public:
|
|||
return newNode;
|
||||
}
|
||||
|
||||
ComputationNodePtr TransposeTimes(const ComputationNodePtr a,
|
||||
const ComputationNodePtr b,
|
||||
const std::wstring nodeName = L"")
|
||||
{
|
||||
ComputationNodePtr newNode(new TransposeTimesNode<ElemType>(m_deviceId, nodeName));
|
||||
newNode->AttachInputs(a, b);
|
||||
AddNodeToNet(newNode);
|
||||
return newNode;
|
||||
}
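A hedged usage sketch for the new TransposeTimes builder added above (net, a, and b are assumed to exist already; only the method itself comes from this diff):

    // a: d x n, b: d x m, both already attached to net
    // y evaluates a^T * b, so it has n rows and m columns
    ComputationNodePtr y = net.TransposeTimes(a, b, L"aTransposeTimesB");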
|
||||
|
||||
ComputationNodePtr ElementTimes(const ComputationNodePtr a,
|
||||
const ComputationNodePtr b,
|
||||
const std::wstring nodeName = L"")
|
||||
|
@ -2217,7 +2235,7 @@ public:
|
|||
{
|
||||
for (auto ptr = recurrentNodes.begin(); ptr != recurrentNodes.end(); ptr++)
|
||||
{
|
||||
if ((*ptr)->IsFuncValueOlderThanInputs() && (*ptr)->OperationName() != L"Delay") {
|
||||
if ((*ptr)->IsFuncValueOlderThanInputs() && (*ptr)->OperationName() != DelayNode<ElemType>::TypeName()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -3359,7 +3377,7 @@ protected:
|
|||
visited.insert(cur);
|
||||
recStack.insert(cur);
|
||||
|
||||
if (cur->OperationName() != L"Delay")
|
||||
if (cur->OperationName() != DelayNode<ElemType>::TypeName())
|
||||
{
|
||||
for (size_t i = 0; i < cur->ChildrenSize(); i++)
|
||||
{
|
||||
|
@ -3442,7 +3460,7 @@ protected:
|
|||
ComputationNodePtr nodeRecIter = (*iter).m_recurrentNodes[j];
|
||||
for (size_t i = 0; i < nodeRecIter->ChildrenSize(); i++)
|
||||
{
|
||||
if ((nodeRecIter->Inputs(i)->LoopId() == nodeRecIter->LoopId()) && (nodeRecIter->OperationName() != L"Delay"))
|
||||
if ((nodeRecIter->Inputs(i)->LoopId() == nodeRecIter->LoopId()) && (nodeRecIter->OperationName() != DelayNode<ElemType>::TypeName()))
|
||||
{
|
||||
nodeRecIter->Inputs(i)->SetIndexInLoop(nodeRecIter->Inputs(i)->GetIndexInLoop() + 1);
|
||||
}
|
||||
|
|
|
@ -51,7 +51,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
};
|
||||
|
||||
#pragma region base computation class
|
||||
|
||||
template<class ElemType>
|
||||
class ComputationNode //Abstract Class that cannot be instantiated
|
||||
{
|
||||
|
|
|
@ -887,6 +887,193 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
template class TimesNode<float>;
|
||||
template class TimesNode<double>;
|
||||
|
||||
template<class ElemType>
|
||||
class TransposeTimesNode : public ComputationNode<ElemType>
|
||||
{
|
||||
UsingComputationNodeMembers;
|
||||
public:
|
||||
TransposeTimesNode(const DEVICEID_TYPE deviceId = AUTOPLACEMATRIX, const std::wstring name = L"") : ComputationNode<ElemType>(deviceId)
|
||||
{
|
||||
m_nodeName = (name == L"" ? CreateUniqNodeName() : name);
|
||||
m_deviceId = deviceId;
|
||||
MoveMatricesToDevice(deviceId);
|
||||
InitRecurrentNode();
|
||||
}
|
||||
|
||||
TransposeTimesNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId = AUTOPLACEMATRIX, const std::wstring name = L"") : ComputationNode<ElemType>(deviceId)
|
||||
{
|
||||
m_nodeName = (name == L"" ? CreateUniqNodeName() : name);
|
||||
LoadFromFile(fstream, modelVersion, deviceId);
|
||||
}
|
||||
|
||||
// copy constructor
|
||||
TransposeTimesNode(const TransposeTimesNode<ElemType>* node, const std::wstring& newName, const CopyNodeFlags flags) : ComputationNode<ElemType>(node->m_deviceId)
|
||||
{
|
||||
node->CopyTo(this, newName, flags);
|
||||
}
|
||||
|
||||
virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const
|
||||
{
|
||||
const std::wstring& name = (newName == L"") ? NodeName() : newName;
|
||||
|
||||
ComputationNodePtr node = new TransposeTimesNode<ElemType>(this, name, flags);
|
||||
return node;
|
||||
}
|
||||
|
||||
virtual const std::wstring OperationName() const { return TypeName(); }
|
||||
static const std::wstring TypeName() { return L"TransposeTimes"; }
|
||||
|
||||
virtual void ComputeInputPartial(const size_t inputIndex)
|
||||
{
|
||||
if (inputIndex > 1)
|
||||
throw std::invalid_argument("TransposeTimesNode operation only takes two inputs.");
|
||||
|
||||
if (inputIndex == 0) //left derivative
|
||||
{
|
||||
ComputeInputPartialLeft(Inputs(1)->FunctionValues(), Inputs(0)->GradientValues(), GradientValues());
|
||||
}
|
||||
else //right derivative
|
||||
{
|
||||
ComputeInputPartialRight(Inputs(0)->FunctionValues(), Inputs(1)->GradientValues(), GradientValues());
|
||||
}
|
||||
}
|
||||
|
||||
virtual void ComputeInputPartial(const size_t inputIndex, const size_t timeIdxInSeq)
|
||||
{
|
||||
if (inputIndex > 1)
|
||||
throw std::invalid_argument("TransposeTimesNode operation only takes two inputs.");
|
||||
|
||||
if (inputIndex == 0) //left derivative
|
||||
{
|
||||
Matrix<ElemType> sliceOutputGrad = GradientValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
|
||||
Matrix<ElemType> sliceInput1Value = Inputs(1)->FunctionValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
|
||||
|
||||
ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad);
|
||||
}
|
||||
else //right derivative
|
||||
{
|
||||
Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
|
||||
Matrix<ElemType> sliceOutputGrad = GradientValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
|
||||
|
||||
ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad);
|
||||
}
|
||||
}
|
||||
|
||||
static void WINAPI ComputeInputPartialLeft(Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues)
|
||||
{
|
||||
#if DUMPOUTPUT
|
||||
gradientValues.Print("Gradient-in");
|
||||
inputGradientValues.Print("child Gradient-in/out");
|
||||
inputFunctionValues.Print("child Function values");
|
||||
#endif
|
||||
//currently we only support one combination when the input is sparse.
|
||||
if (inputFunctionValues.GetMatrixType() == SPARSE && inputGradientValues.GetMatrixType() == DENSE && gradientValues.GetMatrixType() == DENSE)
|
||||
inputGradientValues.SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false);
|
||||
|
||||
Matrix<ElemType>::MultiplyAndAdd(inputFunctionValues, false, gradientValues, true, inputGradientValues);
|
||||
|
||||
|
||||
#if DUMPOUTPUT
|
||||
inputGradientValues.Print("child Gradient-out");
|
||||
#endif
|
||||
}
|
||||
|
||||
static void WINAPI ComputeInputPartialRight(Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues)
|
||||
{
|
||||
#if DUMPOUTPUT
|
||||
gradientValues.Print("Gradient-in");
|
||||
inputGradientValues.Print("child Gradient-in/out");
|
||||
inputFunctionValues.Print("child Function values");
|
||||
#endif
|
||||
Matrix<ElemType>::MultiplyAndAdd(inputFunctionValues, false, gradientValues, false, inputGradientValues);
|
||||
|
||||
#if DUMPOUTPUT
|
||||
inputGradientValues.Print("child Gradient-out");
|
||||
#endif
|
||||
}
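For reference, the two MultiplyAndAdd calls above implement the standard matrix-calculus result for Y = m1^T m2 (stated here in LaTeX; this is a textbook derivation, not text from the source):

    \frac{\partial L}{\partial m_1} \mathrel{+}= m_2 \left(\frac{\partial L}{\partial Y}\right)^{T},
    \qquad
    \frac{\partial L}{\partial m_2} \mathrel{+}= m_1 \, \frac{\partial L}{\partial Y}

ComputeInputPartialLeft is the first identity (second operand transposed), and ComputeInputPartialRight is the second (no transpose).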
|
||||
|
||||
|
||||
virtual void EvaluateThisNode()
|
||||
{
|
||||
EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues());
|
||||
}
|
||||
|
||||
virtual void EvaluateThisNode(const size_t timeIdxInSeq)
|
||||
{
|
||||
Matrix<ElemType> sliceInput1Value = Inputs(1)->FunctionValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
|
||||
Matrix<ElemType> sliceOutputValue = m_functionValues.ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
|
||||
|
||||
EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value);
|
||||
}
|
||||
|
||||
static void WINAPI EvaluateThisNodeS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0, const Matrix<ElemType>& input1)
|
||||
{
|
||||
#if DUMPOUTPUT
|
||||
input0.Print("TransposeTimesNode - Input0");
|
||||
#endif
|
||||
functionValues.AssignProductOf(input0, true, input1, false);
|
||||
#if NANCHECK
|
||||
functionValues.HasNan("TransposeTimes");
|
||||
#endif
|
||||
#if DUMPOUTPUT
|
||||
functionValues.Print("TransposeTimes");
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual void Validate()
|
||||
{
|
||||
PrintSelfBeforeValidation();
|
||||
|
||||
if (m_children.size() != 2)
|
||||
throw std::logic_error("TransposeTimes operation requires two inputs.");
|
||||
|
||||
//support automatic dimension inference for learnable parameters
|
||||
size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols0 = Inputs(0)->FunctionValues().GetNumCols();
|
||||
size_t rows1 = Inputs(1)->FunctionValues().GetNumRows(), cols1 = Inputs(1)->FunctionValues().GetNumCols();
|
||||
|
||||
if ((rows0 == 0 || cols1 == 0) && this->LoopId() < 0)
|
||||
throw logic_error("TransposeTimes operation: Inputs(0)->FunctionValues().GetNumRows() and Inputs(1)->FunctionValues().GetNumCols() should not be 0 since it cannot be automatically inferred");
|
||||
|
||||
if ((Inputs(0)->OperationName() == LearnableParameter<ElemType>::TypeName() && cols0 == 0 && rows1 != 0) && this->LoopId() < 0)
|
||||
Inputs(0)->FunctionValues().Resize(rows0, rows1);
|
||||
|
||||
if (Inputs(1)->OperationName() == LearnableParameter<ElemType>::TypeName() && cols0 != 0 && rows1 == 0)
|
||||
Inputs(1)->FunctionValues().Resize(cols0, cols1);
|
||||
|
||||
if ((Inputs(0)->FunctionValues().GetNumElements() == 0 || Inputs(1)->FunctionValues().GetNumElements() == 0) && this->LoopId() < 0)
|
||||
throw std::logic_error("TransposeTimes operation: One of the operants has 0 elements.");
|
||||
|
||||
//cols0 and rows1 may have been changed so don't use them in the following check
|
||||
if ((Inputs(1)->FunctionValues().GetNumRows() != Inputs(0)->FunctionValues().GetNumRows()) && this->LoopId() < 0)
|
||||
{
|
||||
throw std::logic_error("The Matrix dimension in the TransposeTimes operation does not match.");
|
||||
}
|
||||
FunctionValues().Resize(cols0, cols1);
|
||||
CopyImageSizeFromInputs();
|
||||
}
|
||||
|
||||
virtual void CopyImageSizeFromInputs()
|
||||
{
|
||||
CopyImageSizeFromInput(1, false); //the second one is the input since it's column-wise
|
||||
|
||||
//after multiplication the structure is lost
|
||||
m_outputWidth = 1;
|
||||
m_outputHeight = Inputs(0)->FunctionValues().GetNumRows();
|
||||
m_outputChannels = 1;
|
||||
}
|
||||
|
||||
|
||||
virtual void AttachInputs(const ComputationNodePtr leftNode, const ComputationNodePtr rightNode)
|
||||
{
|
||||
m_children.resize(2);
|
||||
m_children[0] = leftNode;
|
||||
m_children[1] = rightNode;
|
||||
}
|
||||
};
|
||||
|
||||
template class TransposeTimesNode<float>;
|
||||
template class TransposeTimesNode<double>;
|
||||
|
||||
template<class ElemType>
|
||||
class ElementTimesNode : public ComputationNode<ElemType>
|
||||
{
|
||||
|
|
|
@ -158,6 +158,8 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
        ret = true;
    else if (EqualInsensitive(nodeType, TimesNode<ElemType>::TypeName()))
        ret = true;
    else if (EqualInsensitive(nodeType, TransposeTimesNode<ElemType>::TypeName()))
        ret = true;
    else if (EqualInsensitive(nodeType, ElementTimesNode<ElemType>::TypeName()))
        ret = true;
    else if (EqualInsensitive(nodeType, DiagTimesNode<ElemType>::TypeName()))

@ -982,13 +982,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    {
        // evaluation uses softmax
        m_logSoftmax.AssignProductOf(Inputs(1)->FunctionValues(), true, Inputs(2)->FunctionValues(), false);
        /*
        #pragma omp parallel for
        for (int i = 0; i < Inputs(0)->FunctionValues().GetNumCols(); i++)
            for (int j = 0; j < Inputs(3)->FunctionValues().GetNumRows(); j++)
                m_logSoftmax(i, j) += Inputs(3)->FunctionValues()(j, 0);
        */
        m_logSoftmax += Inputs(3)->FunctionValues().Transpose();
        m_logSoftmax += Inputs(3)->FunctionValues();
        m_logSoftmax.InplaceLogSoftmax(false);
        FunctionValues().Resize(1, 1);
        FunctionValues().SetValue(0);

Makefile
|
@ -37,6 +37,7 @@ CUDA_PATH = /usr/local/cuda-7.0
|
|||
|
||||
# This is a suggested/default location for NVML
|
||||
NVML_INCLUDE = /usr/include/nvidia/gdk
|
||||
NVML_LIB = /usr/src/gdk/nvml/lib
|
||||
#######
|
||||
|
||||
BUILDFOR = $(ARCH).$(DEVICE).$(BUILDTYPE).$(MATHLIB)
|
||||
|
@ -50,7 +51,7 @@ ifeq ($(BUILDTYPE),debug)
|
|||
GPU_BUILDTYPE_OPT = -O0 -G -lineinfo
|
||||
else
|
||||
BUILDTYPE_OPT = -O3 -flto
|
||||
GPU_BUILDTYPE_OPT = -O3 -use_fast_math
|
||||
GPU_BUILDTYPE_OPT = -O3 -use_fast_math -lineinfo
|
||||
endif
|
||||
|
||||
# Set up math library defines and libraries
|
||||
|
@ -66,7 +67,7 @@ endif
|
|||
|
||||
# Set up CUDA includes and libraries
|
||||
CUDA_INCLUDE = $(CUDA_PATH)/include
|
||||
CUDA_LIB = -L$(CUDA_PATH)/lib64 -lcublas -lcudart -lcuda -lcurand -lcusparse -lnvidia-ml
|
||||
CUDA_LIB = -L$(CUDA_PATH)/lib64 -L$(NVML_LIB) -lcublas -lcudart -lcurand -lcusparse -lnvidia-ml
|
||||
|
||||
# Set up final list of libs to use
|
||||
ifeq ($(DEVICE),gpu)
|
||||
|
|
|
@ -78,7 +78,8 @@ COMMON_SRC = Common/fileutil.cpp Common/DataWriter.cpp Common/ConfigFile.cpp Com
|
|||
MATH_SRC = Math/Math/Matrix.cpp Math/Math/CPUMatrix.cpp Math/Math/CPUSparseMatrix.cpp Math/Math/NoGPU.cpp
|
||||
CN_SRC = MachineLearning/CNTK/NetworkDescriptionLanguage.cpp MachineLearning/CNTK/CNTK.cpp MachineLearning/CNTK/ComputationNode.cpp \
|
||||
MachineLearning/CNTK/ModelEditLanguage.cpp \
|
||||
MachineLearning/CNTK/SimpleNetworkBuilder.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
|
||||
MachineLearning/CNTK/SimpleNetworkBuilder.cpp \
|
||||
MachineLearning/CNTK/Profiler.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
|
||||
BINARYREADER_SRC = DataReader/BinaryReader/BinaryWriter.cpp DataReader/BinaryReader/BinaryReader.cpp DataReader/BinaryReader/BinaryFile.cpp
|
||||
HTKMLFREADER_SRC = DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp DataReader/HTKMLFReader_linux/DataWriter.cpp DataReader/HTKMLFReader_linux/DataReader.cpp DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
|
||||
KALDIREADER_SRC = DataReader/KaldiReader/HTKMLFWriter.cpp DataReader/KaldiReader/DataWriter.cpp DataReader/KaldiReader/DataReader.cpp DataReader/KaldiReader/HTKMLFReader.cpp
|
||||
|
|
|
@ -88,7 +88,8 @@ MATH_SRC = Math/Math/Matrix.cpp Math/Math/GPUMatrix.cu Math/Math/GPUMatrixCUDAKe
|
|||
Math/Math/CPUMatrix.cpp Math/Math/CPUSparseMatrix.cpp #Math/Math/InstantiateTemplates.cu
|
||||
CN_SRC = MachineLearning/CNTK/NetworkDescriptionLanguage.cpp MachineLearning/CNTK/CNTK.cpp MachineLearning/CNTK/ComputationNode.cpp \
|
||||
MachineLearning/CNTK/ModelEditLanguage.cpp \
|
||||
MachineLearning/CNTK/SimpleNetworkBuilder.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTK/Profiler.cpp MachineLearning/CNTKEval/CNTKEval.cpp
|
||||
MachineLearning/CNTK/SimpleNetworkBuilder.cpp \
|
||||
MachineLearning/CNTK/Profiler.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTK/Profiler.cpp MachineLearning/CNTKEval/CNTKEval.cpp
|
||||
BINARYREADER_SRC = #DataReader/BinaryReader/BinaryWriter.cpp DataReader/BinaryReader/BinaryReader.cpp DataReader/BinaryReader/BinaryFile.cpp
|
||||
HTKMLFREADER_SRC = DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp DataReader/HTKMLFReader_linux/DataWriter.cpp DataReader/HTKMLFReader_linux/DataReader.cpp DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
|
||||
KALDIREADER_SRC = DataReader/KaldiReader/HTKMLFWriter.cpp DataReader/KaldiReader/DataWriter.cpp DataReader/KaldiReader/DataReader.cpp DataReader/KaldiReader/HTKMLFReader.cpp
|
||||
|
|
|
@ -88,7 +88,8 @@ COMMON_SRC = Common/fileutil.cpp Common/DataWriter.cpp Common/ConfigFile.cpp Com
|
|||
MATH_SRC = Math/Math/Matrix.cpp Math/Math/CPUMatrix.cpp Math/Math/CPUSparseMatrix.cpp Math/Math/NoGPU.cpp
|
||||
CN_SRC = MachineLearning/CNTK/NetworkDescriptionLanguage.cpp MachineLearning/CNTK/CNTK.cpp MachineLearning/CNTK/ComputationNode.cpp \
|
||||
MachineLearning/CNTK/ModelEditLanguage.cpp MachineLearning/CNTK/Profiler.cpp \
|
||||
MachineLearning/CNTK/SimpleNetworkBuilder.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
|
||||
MachineLearning/CNTK/SimpleNetworkBuilder.cpp \
|
||||
MachineLearning/CNTK/Profiler.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
|
||||
BINARYREADER_SRC = DataReader/BinaryReader/BinaryWriter.cpp DataReader/BinaryReader/BinaryReader.cpp DataReader/BinaryReader/BinaryFile.cpp
|
||||
HTKMLFREADER_SRC = DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp DataReader/HTKMLFReader_linux/DataWriter.cpp DataReader/HTKMLFReader_linux/DataReader.cpp DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
|
||||
KALDIREADER_SRC = DataReader/KaldiReader/HTKMLFWriter.cpp DataReader/KaldiReader/DataWriter.cpp DataReader/KaldiReader/DataReader.cpp DataReader/KaldiReader/HTKMLFReader.cpp
|
||||
|
|
|
@ -48,8 +48,8 @@ DEVICE = gpu
|
|||
#BUILDTYPE = debug
|
||||
BUILDTYPE = release
|
||||
# comment the following line and uncomment the next one to enable the MKL library
|
||||
#MATHLIB = acml
|
||||
MATHLIB = mkl
|
||||
MATHLIB = acml
|
||||
#MATHLIB = mkl
|
||||
# modify relevant path below for your system
|
||||
MKL_PATH = /usr/users/yzhang87/tools/composer_xe_2015.2.164
|
||||
ACML_PATH = /usr/users/yzhang87/code/acml/gfortran64
|
||||
|
@ -99,8 +99,9 @@ COMMON_SRC = Common/fileutil.cpp Common/DataWriter.cpp Common/ConfigFile.cpp Com
|
|||
MATH_SRC = Math/Math/Matrix.cpp Math/Math/GPUMatrix.cu Math/Math/GPUMatrixCUDAKernels.cu Math/Math/GPUSparseMatrix.cu Math/Math/GPUWatcher.cu \
|
||||
Math/Math/CPUMatrix.cpp Math/Math/CPUSparseMatrix.cpp #Math/Math/InstantiateTemplates.cu
|
||||
CN_SRC = MachineLearning/CNTK/NetworkDescriptionLanguage.cpp MachineLearning/CNTK/CNTK.cpp MachineLearning/CNTK/ComputationNode.cpp \
|
||||
MachineLearning/CNTK/ModelEditLanguage.cpp MachineLearning/CNTK/Profiler.cpp \
|
||||
MachineLearning/CNTK/SimpleNetworkBuilder.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
|
||||
MachineLearning/CNTK/ModelEditLanguage.cpp \
|
||||
MachineLearning/CNTK/SimpleNetworkBuilder.cpp \
|
||||
MachineLearning/CNTK/Profiler.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
|
||||
BINARYREADER_SRC = #DataReader/BinaryReader/BinaryWriter.cpp DataReader/BinaryReader/BinaryReader.cpp DataReader/BinaryReader/BinaryFile.cpp
|
||||
HTKMLFREADER_SRC = DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp DataReader/HTKMLFReader_linux/DataWriter.cpp DataReader/HTKMLFReader_linux/DataReader.cpp DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
|
||||
KALDIREADER_SRC = DataReader/KaldiReader/HTKMLFWriter.cpp DataReader/KaldiReader/DataWriter.cpp DataReader/KaldiReader/DataReader.cpp DataReader/KaldiReader/HTKMLFReader.cpp
|
||||
|
|
|
@ -38,7 +38,7 @@ namespace CNTKMathTest
    }

    DenseMatrix DM1 = DM0.ColumnSlice(start, numCols);
    DenseMatrix DM2 = SM0.ColumnSlice(start, numCols);
    DenseMatrix DM2 = SM0.ColumnSliceToDense(start, numCols);

    Assert::IsTrue(DM1.IsEqualTo(DM2, 0.0001));
    }

@ -410,11 +410,11 @@ namespace CNTKMathTest
    SM0.SetValue(M0);

    GPUMatrix<float> M1 = M0.ColumnSlice(0, 2);
    GPUMatrix<float> SM1 = SM0.ColumnSlice(0, 2);
    GPUMatrix<float> SM1 = SM0.ColumnSliceToDense(0, 2);
    Assert::IsTrue(M1.IsEqualTo(SM1, 0.0001f));

    GPUMatrix<float> M2 = M0.ColumnSlice(1, 2);
    GPUMatrix<float> SM2 = SM0.ColumnSlice(1, 2);
    GPUMatrix<float> SM2 = SM0.ColumnSliceToDense(1, 2);
    Assert::IsTrue(M2.IsEqualTo(SM2, 0.0001f));

    Assert::IsFalse(M1.IsEqualTo(SM2, 0.0001f));
|
|
@ -3918,7 +3918,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
for (int instance_id = 0; instance_id < batch_size; instance_id++)
|
||||
for (int sample_id = 0; sample_id < sample_size; sample_id++)
|
||||
{
|
||||
int sample =(int) (*this)(2 * sample_id, instance_id);
|
||||
int sample = (int)(*this)(2 * sample_id, instance_id);
|
||||
for (int dim = 0; dim < b.GetNumRows(); dim++)
|
||||
c(dim, instance_id) -= b(dim, sample)* tmp(sample_id, instance_id);
|
||||
}
|
||||
|
@ -3961,16 +3961,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
// a: hidden
|
||||
// b: embedding
|
||||
// tmp: softmax
|
||||
// c: loglikelihood
|
||||
// c: loglikelihood
|
||||
{
|
||||
/*z
|
||||
for (int i = 0; i < (*this).GetNumRows(); i++)
|
||||
{
|
||||
for (int j = 0; j < (*this).GetNumCols(); j++)
|
||||
std::cerr << (*this)(i, j) << " ";
|
||||
std::cerr << endl;
|
||||
}
|
||||
*/
|
||||
double log_likelihood = 0.0;
|
||||
size_t sample_size = this->GetNumRows() / 2;
|
||||
size_t batch_size = this->GetNumCols();
|
||||
|
@ -3980,7 +3972,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
for (int instance_id = 0; instance_id < batch_size; instance_id++)
|
||||
for (int sample_id = 0; sample_id < sample_size; sample_id++)
|
||||
{
|
||||
int sample =(int) (*this)(2 * sample_id, instance_id);
|
||||
int sample = (int)(*this)(2 * sample_id, instance_id);
|
||||
double score = bias(0, sample);
|
||||
for (int dim = 0; dim < b.GetNumRows(); dim++)
|
||||
score += a(dim, instance_id)* b(dim, sample);
|
||||
|
@ -3994,9 +3986,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
tmp(sample_id, instance_id) = (ElemType)-std::exp(logprob);
|
||||
if (sample_id == 0)
|
||||
tmp(sample_id, instance_id) += 1;
|
||||
log_likelihood += sample_id == 0 ? logprob : logprob_noise;
|
||||
log_likelihood += sample_id == 0 ? logprob : logprob_noise;
|
||||
}
|
||||
|
||||
c(0, 0) = (ElemType)-log_likelihood;
|
||||
}
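The value accumulated into log_likelihood above is the usual noise-contrastive estimation objective. In LaTeX, with k = sampleCount - 1 noise samples per position, s_w the score assembled from a, b and bias, and P_n the noise distribution whose log is carried in the label matrix (this is the standard formulation; the correspondence to the code is my reading of it):

    \mathcal{L} \;=\; \sum_{\text{true } w} \log \frac{e^{s_w}}{e^{s_w} + k\,P_n(w)}
    \;+\; \sum_{\text{noise } w'} \log \frac{k\,P_n(w')}{e^{s_{w'}} + k\,P_n(w')}

and c(0, 0) receives its negation.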
|
||||
|
||||
|
|
|
@ -1895,15 +1895,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
UNCONST(ElemType, a, my_a);
|
||||
UNCONST(ElemType, b, my_b);
|
||||
UNCONST(ElemType, bias, my_bias);
|
||||
|
||||
cudaEvent_t done = nullptr;
|
||||
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
|
||||
//a: dim * minibatch
|
||||
//b: dim * |vocab|
|
||||
int p = 512;
|
||||
int width = a.GetNumRows(); //dimension of hidden vector
|
||||
//int width = a.GetNumCols(); original setup, considering column-major
|
||||
//
|
||||
|
||||
while (p / 2 > width) p = p / 2;
|
||||
|
||||
_computeNceOutput<ElemType> << <this->GetNumElements() / 2, p >> >(
|
||||
|
@ -1915,28 +1913,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
my_b.GetArray(),//b
|
||||
my_bias.GetArray(),
|
||||
tmp.GetArray());//tmp
|
||||
|
||||
|
||||
p = 512;
|
||||
while (p / 2 > this->GetNumElements() / 2) p = p / 2;
|
||||
|
||||
// summing up objective must be done in one block
|
||||
_assignNoiseContrastiveEstimation<ElemType> << <1, p >> >(
|
||||
this->GetArray(),
|
||||
sampleCount,
|
||||
m_numRows / 2,
|
||||
my_a.GetArray(),
|
||||
my_a.GetArray(),
|
||||
a.GetNumCols(),
|
||||
my_b.GetArray(),
|
||||
tmp.GetArray(),
|
||||
c.GetArray());
|
||||
|
||||
_computeNceError<ElemType> << <1, p >> >(
|
||||
this->GetArray(),
|
||||
m_numRows / 2,
|
||||
tmp.GetNumCols(),
|
||||
tmp.GetArray());
|
||||
|
||||
cerr << "log-likelihood:" << Get00Element() << endl;
|
||||
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
@ -1953,8 +1943,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
int p = 512;
|
||||
int width = a.GetNumRows();
|
||||
while (p / 2 > width) p = p / 2;
|
||||
|
||||
_assignNceDerivative<ElemType> << <m_nz, p >> >(
|
||||
_assignNceDerivative<ElemType> << <this->GetNumElements() / 2, p >> >(
|
||||
GetArray(),
|
||||
tmp.GetNumCols(),
|
||||
m_numRows / 2,
|
||||
|
@ -1964,7 +1953,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
tmp.GetArray(),
|
||||
c.GetArray(),
|
||||
inputIndex);
|
||||
|
||||
if (do_sync) CUDA_CALL(cudaEventRecord(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
|
||||
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
|
||||
|
|
|
@ -2928,80 +2928,10 @@ __global__ void _computeNceOutput(
|
|||
}
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
res[i] = partials[0];
|
||||
res[i] = partials[0] + bias[wid];
|
||||
}
|
||||
}
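The partials[] loop that feeds res[i] above is a conventional shared-memory tree reduction. A self-contained CUDA sketch of the same pattern, detached from this kernel's indexing (the kernel name and launch shape are illustrative; it assumes blockDim.x is a power of two, at most 512):

    template<class ElemType>
    __global__ void blockSum(const ElemType* in, int n, ElemType* out)
    {
        __shared__ ElemType partials[512];          // one slot per thread, as in the NCE kernels
        ElemType s = 0;
        for (int i = threadIdx.x; i < n; i += blockDim.x)
            s += in[i];                             // each thread sums a strided share of the input
        partials[threadIdx.x] = s;
        __syncthreads();

        for (int half = blockDim.x >> 1; half > 0; half >>= 1)
        {
            if (threadIdx.x < half)
                partials[threadIdx.x] += partials[threadIdx.x + half];
            __syncthreads();                        // halve the active range until one partial remains
        }

        if (threadIdx.x == 0)
            out[blockIdx.x] = partials[0];          // one result per block
    }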
|
||||
|
||||
template<class ElemType>
|
||||
__global__ void _computeNceOutput(
|
||||
const ElemType* val,
|
||||
const int* col,
|
||||
int numRows,
|
||||
int sampleCount,
|
||||
const ElemType* a,
|
||||
int numCols_a,
|
||||
const ElemType* b,
|
||||
ElemType* res)
|
||||
{
|
||||
// val and col are in CSR format
|
||||
// val is an array contains log_Pn(w). To differentiate positive and negative samples,
|
||||
// we store log_Pn(w) as it is for positive samples, and -log_Pn(w) for negative samples
|
||||
// col is an array contains index of the word samples
|
||||
// a is a matrix in column major format contains output from hidden layer
|
||||
// b is the weight matrix for output layer
|
||||
// res is the buffer to store computed output (sparse)
|
||||
|
||||
// follow the convention, this kernel must be run on 512 threads per block
|
||||
__shared__ ElemType partials[512];
|
||||
partials[threadIdx.x] = 0;
|
||||
|
||||
//threadIdx.x range from[0 ~ 512)
|
||||
//blockIdx.x range from[0 ~ nnz)
|
||||
//blockDim.x equal to 512
|
||||
//gridDim.x equal to nnz
|
||||
|
||||
// determine the elements to be handled by this block
|
||||
int total = numRows * sampleCount;
|
||||
int loadPerBlock = (total + gridDim.x - 1) / gridDim.x;
|
||||
|
||||
int start = loadPerBlock * blockIdx.x;
|
||||
int end = min(total, loadPerBlock * (blockIdx.x + 1));
|
||||
|
||||
for (int i = start; i < end; i++)
|
||||
{
|
||||
int colIndex = col[i];
|
||||
int rowIndex = i / sampleCount;
|
||||
|
||||
int loadPerThread = (numCols_a + blockDim.x - 1) / blockDim.x;
|
||||
int tstart = loadPerThread * threadIdx.x;
|
||||
int tend = min(numCols_a, loadPerThread * (threadIdx.x + 1));
|
||||
|
||||
for (int j = tstart; j < tend; j++)
|
||||
partials[threadIdx.x] = a[IDX2C(rowIndex, j, numRows)] * b[IDX2C(j, colIndex, numCols_a)];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// sum up
|
||||
int nTotalThreads = blockDim.x;
|
||||
|
||||
while (nTotalThreads >1)
|
||||
{
|
||||
int halfPoint = (nTotalThreads >> 1);
|
||||
|
||||
if (threadIdx.x < halfPoint)
|
||||
partials[threadIdx.x] += partials[threadIdx.x + halfPoint];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
nTotalThreads = (nTotalThreads >> 1);
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
res[i] = partials[0];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class ElemType>
|
||||
__global__ void _assignNoiseContrastiveEstimation(
|
||||
const ElemType* val,
|
||||
|
@ -3010,7 +2940,7 @@ __global__ void _assignNoiseContrastiveEstimation(
|
|||
const ElemType* a,
|
||||
int width, // number of columns in a
|
||||
const ElemType* b,
|
||||
const ElemType* tmp,
|
||||
ElemType* tmp,
|
||||
ElemType* c) // run on 512 threads per block
|
||||
{
|
||||
// val and col are in CSR format
|
||||
|
@ -3026,32 +2956,30 @@ __global__ void _assignNoiseContrastiveEstimation(
|
|||
partials[threadIdx.x] = 0;
|
||||
|
||||
int total = numRows * sampleCount;
|
||||
int loadPerThread = (total + 511) / 512;
|
||||
int loadPerThread = (total + blockDim.x - 1) / blockDim.x;
|
||||
|
||||
// find out the items this thread is responsible for
|
||||
int start = loadPerThread * threadIdx.x;
|
||||
int end = min(total, loadPerThread * (threadIdx.x + 1));
|
||||
|
||||
ElemType log_num_noise_samples = log((ElemType)(sampleCount - 1));
|
||||
for (int i = start; i < end; i++)
|
||||
{
|
||||
//int colIndex = col[i];
|
||||
//int rowIndex = i / sampleCount;
|
||||
|
||||
// add to objective
|
||||
ElemType log_pnw = val[2 * i + 1];
|
||||
if (log_pnw < 0) // positive sample: log(pw / (pw + k * pnw))
|
||||
{
|
||||
ElemType den = tmp[i];
|
||||
logadd(den, log((ElemType)(sampleCount - 1)) + log_pnw);
|
||||
partials[threadIdx.x] += (tmp[i] - den);
|
||||
}
|
||||
else // negative sample: log(k * pnw / (pw + k * pnw))
|
||||
{
|
||||
ElemType nom = log((ElemType)(sampleCount - 1)) - log_pnw;
|
||||
ElemType den = nom;
|
||||
logadd(den, tmp[i]);
|
||||
partials[threadIdx.x] += (nom - den);
|
||||
}
|
||||
ElemType prob = -val[2 * i + 1];
|
||||
bool positive = (prob > 0);
|
||||
if (positive)
|
||||
prob = -prob;
|
||||
ElemType score_noise = log_num_noise_samples + prob;
|
||||
ElemType z = logadd(tmp[i], score_noise);
|
||||
ElemType logprob = tmp[i] - z;
|
||||
ElemType logprob_noise = score_noise - z;
|
||||
tmp[i] = -exp(logprob);
|
||||
if (positive)
|
||||
tmp[i] += 1;
|
||||
if (positive)
|
||||
partials[threadIdx.x] += logprob;
|
||||
else
|
||||
partials[threadIdx.x] += logprob_noise;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
@ -3075,211 +3003,6 @@ __global__ void _assignNoiseContrastiveEstimation(
|
|||
c[0] = -partials[0];
|
||||
}
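logadd above acts as a log-sum-exp primitive, z = log(e^x + e^y); its definition is not part of this diff. For comparison only, a numerically stable host-side reference (a generic sketch, not the project's implementation):

    #include <cmath>

    template<class ElemType>
    ElemType logadd_ref(ElemType x, ElemType y)
    {
        if (x < y) { ElemType t = x; x = y; y = t; }    // ensure x >= y
        return x + std::log1p(std::exp(y - x));         // exp(y - x) <= 1, so no overflow
    }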
|
||||
|
||||
template<class ElemType>
|
||||
__global__ void _assignNoiseContrastiveEstimation(
|
||||
const ElemType* val,
|
||||
const int* col,
|
||||
int numRows,
|
||||
int sampleCount,
|
||||
const ElemType* a,
|
||||
int width, // number of columns in a
|
||||
const ElemType* b,
|
||||
const ElemType* tmp,
|
||||
ElemType* c) // run on 512 threads per block
|
||||
{
|
||||
// val and col are in CSR format
|
||||
// val is an array contains log_Pn(w). To differentiate positive and negative samples,
|
||||
// we store log_Pn(w) as it is for positive samples, and -log_Pn(w) for negative samples
|
||||
// col is an array contains index of the word samples
|
||||
// a is a matrix in column major format contains output from hidden layer
|
||||
// b is the weight matrix for output layer
|
||||
// tmp is the buffer that stores NCE output calculated from _computeNceOutput
|
||||
// c is the matrix to store objective
|
||||
|
||||
__shared__ ElemType partials[512];
|
||||
partials[threadIdx.x] = 0;
|
||||
|
||||
int total = numRows * sampleCount;
|
||||
int loadPerThread = (total + 511) / 512;
|
||||
|
||||
// find out the items this thread is responsible for
|
||||
int start = loadPerThread * threadIdx.x;
|
||||
int end = min(total, loadPerThread * (threadIdx.x + 1));
|
||||
|
||||
for (int i = start; i < end; i++)
|
||||
{
|
||||
//int colIndex = col[i];
|
||||
//int rowIndex = i / sampleCount;
|
||||
|
||||
// add to objective
|
||||
ElemType log_pnw = val[i];
|
||||
if (log_pnw < 0) // positive sample: log(pw / (pw + k * pnw))
|
||||
{
|
||||
ElemType den = tmp[i];
|
||||
logadd(den, log((ElemType)(sampleCount - 1)) + log_pnw);
|
||||
partials[threadIdx.x] += (tmp[i] - den);
|
||||
}
|
||||
else // negative sample: log(k * pnw / (pw + k * pnw))
|
||||
{
|
||||
ElemType nom = log((ElemType)(sampleCount - 1)) - log_pnw;
|
||||
ElemType den = nom;
|
||||
logadd(den, tmp[i]);
|
||||
partials[threadIdx.x] += (nom - den);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// now sum up the objective function
|
||||
int nTotalThreads = blockDim.x;
|
||||
|
||||
while (nTotalThreads >1)
|
||||
{
|
||||
int halfPoint = (nTotalThreads >> 1);
|
||||
|
||||
if (threadIdx.x < halfPoint)
|
||||
partials[threadIdx.x] += partials[threadIdx.x + halfPoint];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
nTotalThreads = (nTotalThreads >> 1);
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
c[0] = -partials[0];
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
__global__ void _computeNceError(
|
||||
const ElemType* val,
|
||||
|
||||
int numRows,
|
||||
int sampleCount,
|
||||
ElemType* tmp) // run on one block
|
||||
{
|
||||
int total = numRows * sampleCount;
|
||||
int loadPerThread = (total + blockDim.x - 1) / blockDim.x;
|
||||
|
||||
// find out the items this thread is responsible for
|
||||
int start = loadPerThread * threadIdx.x;
|
||||
int end = min(total, loadPerThread * (threadIdx.x + 1));
|
||||
|
||||
for (int i = start; i < end; i++)
|
||||
{
|
||||
ElemType ac = tmp[i]; // precalculated NCE output
|
||||
ElemType log_pnw = val[2 * i + 1];
|
||||
|
||||
ElemType er = 0;
|
||||
if (log_pnw < 0) // positive sample: k * pnw / (pw + k * pnw)
|
||||
{
|
||||
ElemType nom = log((ElemType)(sampleCount - 1)) + log_pnw;
|
||||
logadd(ac, nom);
|
||||
er = -1 * exp(nom - ac);
|
||||
|
||||
}
|
||||
else // negative sample: pw / (pw + k * pnw);
|
||||
{
|
||||
logadd(ac, log((ElemType)(sampleCount - 1)) - log_pnw);
|
||||
er = exp(tmp[i] - ac);
|
||||
}
|
||||
|
||||
tmp[i] = er;
|
||||
}
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
__global__ void _computeNceError(
|
||||
const ElemType* val,
|
||||
const int* col,
|
||||
int numRows,
|
||||
int sampleCount,
|
||||
ElemType* tmp) // run on one block
|
||||
{
|
||||
int total = numRows * sampleCount;
|
||||
int loadPerThread = (total + blockDim.x - 1) / blockDim.x;
|
||||
|
||||
// find out the items this thread is responsible for
|
||||
int start = loadPerThread * threadIdx.x;
|
||||
int end = min(total, loadPerThread * (threadIdx.x + 1));
|
||||
|
||||
for (int i = start; i < end; i++)
|
||||
{
|
||||
ElemType ac = tmp[i]; // precalculated NCE output
|
||||
ElemType log_pnw = val[i];
|
||||
|
||||
ElemType er = 0;
|
||||
if (log_pnw < 0) // positive sample: k * pnw / (pw + k * pnw)
|
||||
{
|
||||
ElemType nom = log((ElemType)(sampleCount - 1)) + log_pnw;
|
||||
logadd(ac, nom);
|
||||
er = -1 * exp(nom - ac);
|
||||
|
||||
}
|
||||
else // negative sample: pw / (pw + k * pnw);
|
||||
{
|
||||
logadd(ac, log((ElemType)(sampleCount - 1)) - log_pnw);
|
||||
er = exp(tmp[i] - ac);
|
||||
}
|
||||
|
||||
tmp[i] = er;
|
||||
}
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
__global__ void _assignNceDerivativeInJbor(
|
||||
const ElemType* val,
|
||||
const int* col,
|
||||
const int* colndx,
|
||||
int numRows,
|
||||
int sampleCount,
|
||||
const ElemType* a,
|
||||
int width, // number of columns in a
|
||||
const ElemType* b,
|
||||
const ElemType* tmp,
|
||||
unsigned char* c,
|
||||
size_t jborBlockSize)
|
||||
{
|
||||
// val and col are CSR format sparse matrix for label
|
||||
// val is an array contains log_Pn(w). To differentiate positive and negative samples
|
||||
// we store log_Pn(w) as it is for positive samples, and -log_Pn(w) for negative samples
|
||||
// col is an array contains index of the word samples
|
||||
// a is a matrix in column major format contains output from hidden layer
|
||||
// b is the weight matrix for output layer
|
||||
// tmp is a matrix of precalculated error
|
||||
// c is the output array to store intermediate results
|
||||
|
||||
|
||||
/*
|
||||
int total = numRows * sampleCount;
|
||||
int loadPerBlock = (total + gridDim.x - 1) / gridDim.x;
|
||||
|
||||
// find out the items this block is responsible for
|
||||
int start = loadPerBlock * blockIdx.x;
|
||||
int end = min(total, loadPerBlock * (blockIdx.x + 1));
|
||||
|
||||
for (int i = start; i < end; i++)
|
||||
{
|
||||
int colIndex = col[i];
|
||||
int rowIndex = i / sampleCount;
|
||||
|
||||
ElemType er = tmp[i]; // precalculated error for this output node
|
||||
ElemType log_pnw = val[i];
|
||||
|
||||
// calculate gradients
|
||||
int loadPerThread = (width + blockDim.x - 1) / blockDim.x;
|
||||
int tstart = loadPerThread * threadIdx.x;
|
||||
int tend = min(width, loadPerThread*(threadIdx.x + 1));
|
||||
for (int j = tstart; j < tend; j++)
|
||||
{
|
||||
ElemType val = er * a[IDX2C(rowIndex, j, numRows)];
|
||||
|
||||
unsigned char* pj = c + (j*jborBlockSize + colndx[i]) * sizeof(JborRecord<ElemType>);
|
||||
atomicExch((int*)pj, colIndex);
|
||||
atomicAdd((ElemType*)(pj + sizeof(int)), val);
|
||||
}
|
||||
}*/
|
||||
}
|
||||
|
||||
template<class ElemType>
|
||||
__global__ void _assignNceDerivative(
|
||||
const ElemType* val,
|
||||
|
@ -3310,12 +3033,11 @@ __global__ void _assignNceDerivative(
|
|||
|
||||
for (int i = start; i < end; i++)
|
||||
{
|
||||
int colIndex = (int)val[2 * i];
|
||||
int rowIndex = i / sampleCount;
|
||||
int wid = (int)val[2 * i];
|
||||
int batchId = i / sampleCount;
|
||||
|
||||
ElemType er = tmp[i]; // precalculated error for this output node
|
||||
//ElemType log_pnw = val[2 * i + 1];
|
||||
|
||||
|
||||
// calculate gradients
|
||||
int loadPerThread = (width + blockDim.x - 1) / blockDim.x;
|
||||
int tstart = loadPerThread * threadIdx.x;
|
||||
|
@ -3325,23 +3047,24 @@ __global__ void _assignNceDerivative(
|
|||
{
|
||||
for (int j = tstart; j < tend; j++)
|
||||
{
|
||||
ElemType val = er * b[IDX2C(j, colIndex, width)];
|
||||
atomicAdd(c + IDX2C(j, rowIndex, width), val);
|
||||
//c[IDX2C(rowIndex, j, numRows)] += val;
|
||||
ElemType val = -er * b[IDX2C(j, wid, width)];
|
||||
atomicAdd(&c[IDX2C(j, batchId, width)], val);
|
||||
//c[IDX2C(j, batchId, width)] += val;
|
||||
//c[IDX2C(batchId, j, numRows)] += val;
|
||||
}
|
||||
}
|
||||
else if (inputIndex == 2) // weight
|
||||
{
|
||||
for (int j = tstart; j < tend; j++)
|
||||
{
|
||||
ElemType val = er * a[IDX2C(j, rowIndex, width)];
|
||||
atomicAdd(c + IDX2C(j, colIndex, width), val);
|
||||
//c[IDX2C(j, colIndex, width)] += val;
|
||||
ElemType val = -er * a[IDX2C(j, batchId, width)];
|
||||
atomicAdd(&c[IDX2C(j, wid, width)], val);
|
||||
//c[IDX2C(j, wid, width)] += val;
|
||||
}
|
||||
}
|
||||
else //bias vector
|
||||
{
|
||||
c[colIndex] += er;
|
||||
c[wid] -= er;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,6 +11,7 @@
#include <assert.h>
#include <math.h>
#include "GPUWatcher.h" // bring in this class as well so that it gets exported from this DLL
#include <iostream>

#ifndef CPUONLY
#pragma comment (lib, "CNTKMathCUDA.lib") // built by CNTKMathCUDA project

@ -3687,8 +3688,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        throw std::logic_error("AssignNoiseContrastiveEstimation: one of the input matrices is empty.");

    if (a.GetDeviceId() != b.GetDeviceId() || b.GetDeviceId() != c.GetDeviceId() || c.GetDeviceId() != this->GetDeviceId())
    {
        std::cerr << a.GetDeviceId() << " " << b.GetDeviceId() << " " << c.GetDeviceId() << " " << this->GetDeviceId() << std::endl;
        NOT_IMPLEMENTED;
    }

    this->Resize(1, 1);

    if (this->GetDeviceId() < 0)

@ -232,6 +232,12 @@ namespace Microsoft {

    template<class ElemType> void GPUSparseMatrix<ElemType>::InplaceTranspose() { }

    template<class ElemType>
    GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ColumnSliceToDense(size_t startColumn, size_t numCols) const
    {
        return ElemType(0);
    }

    template<class ElemType> ElemType GPUSparseMatrix<ElemType>::SumOfAbsElements() const
    {
        return ElemType(0);

@ -1053,7 +1059,7 @@ namespace Microsoft {

    template<class ElemType>
    void GPUMatrix<ElemType>::AssignNoiseContrastiveEstimation(const GPUMatrix<ElemType>& a,
        const GPUMatrix<ElemType>& b, size_t sampleCount, GPUMatrix<ElemType>& tmp, GPUMatrix<ElemType>& c)
        const GPUMatrix<ElemType>& b, const GPUMatrix<ElemType>& bias, size_t sampleCount, GPUMatrix<ElemType>& tmp, GPUMatrix<ElemType>& c)
    {
    }