//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once

#include "Basics.h"
#include "ComputationNode.h"
#include "ConvolutionEngine.h"

#include <map>
#include <string>
#include <vector>
#include <stdexcept>
#include <list>
#include <memory>

namespace Microsoft { namespace MSR { namespace CNTK {
|
|
|
|
// -----------------------------------------------------------------------
// SquareErrorNode (left, right)
// -----------------------------------------------------------------------

// Note: to save computation, the gradient may be scaled by a constant.
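// Editor's sketch (not part of the original source): the criterion and gradients implemented below are
//     E           = 1/2 * || left - right ||_F^2                      (ForwardPropNonLooping)
//     dE/d(left)  = +(left - right),   dE/d(right) = -(left - right)  (BackpropToNonLooping)
// which is why m_leftMinusRight is cached in the forward pass and then scaled by +/-1 times the
// incoming 1x1 gradient in the backward pass.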
|
|
|
|
template <class ElemType>
|
|
class SquareErrorNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>, public NumInputs<2>
|
|
{
|
|
typedef ComputationNodeNonLooping<ElemType> Base;
|
|
UsingComputationNodeMembersBoilerplate;
|
|
static const std::wstring TypeName()
|
|
{
|
|
return L"SquareError";
|
|
}
|
|
|
|
public:
|
|
DeclareConstructorFromConfigWithNumInputs(SquareErrorNode);
|
|
SquareErrorNode(DEVICEID_TYPE deviceId, const wstring& name)
|
|
: Base(deviceId, name)
|
|
{
|
|
}
|
|
|
|
virtual void BackpropToNonLooping(size_t inputIndex) override
|
|
{
|
|
FrameRange fr(Input(0)->GetMBLayout());
|
|
auto gradient = Input(inputIndex)->GradientFor(fr);
|
|
Matrix<ElemType>::Multiply1x1AndWeightedAdd(inputIndex == 0 ? 1.0f : -1.0f, Gradient() /*1x1*/, *m_leftMinusRight, 1.0f, gradient);
|
|
}
|
|
|
|
virtual bool OutputUsedInComputingInputNodesGradients() const override
|
|
{
|
|
return false;
|
|
}
|
|
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override
|
|
{
|
|
return false;
|
|
}
|
|
|
|
virtual void UpdateFunctionMBSize() override
|
|
{
|
|
m_leftMinusRight->Resize(Input(0)->Value());
|
|
}
|
|
|
|
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
|
|
{
|
|
FrameRange fr(Input(0)->GetMBLayout());
|
|
m_leftMinusRight->AssignDifferenceOf(Input(0)->ValueFor(fr), Input(1)->ValueFor(fr));
|
|
MaskMissingColumnsToZero(*m_leftMinusRight, Input(0)->GetMBLayout(), fr); // we are fine since it will only be called with full minibatch.
|
|
ElemType v = m_leftMinusRight->FrobeniusNorm();
|
|
Value().VerifySize(1, 1);
|
|
Value().SetValue(v * v / 2);
|
|
#if NANCHECK
|
|
Value().HasNan("SquareError");
|
|
#endif
|
|
}
|
|
|
|
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
|
|
{
|
|
ValidateBinaryReduce(isFinalValidationPass);
|
|
}
|
|
|
|
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
|
|
{
|
|
Base::CopyTo(nodeP, newName, flags);
|
|
if (flags & CopyNodeFlags::copyNodeValue)
|
|
{
|
|
auto node = dynamic_pointer_cast<SquareErrorNode<ElemType>>(nodeP);
|
|
*node->m_leftMinusRight = *m_leftMinusRight;
|
|
}
|
|
}
|
|
|
|
// request matrices needed to do node function value evaluation
|
|
virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool)
|
|
{
|
|
Base::RequestMatricesBeforeForwardProp(matrixPool);
|
|
RequestMatrixFromPool(m_leftMinusRight, matrixPool);
|
|
}
|
|
|
|
// release gradient and temp matrices that no longer needed after all the children's gradients are computed.
|
|
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
|
|
{
|
|
Base::ReleaseMatricesAfterBackprop(matrixPool);
|
|
ReleaseMatrixToPool(m_leftMinusRight, matrixPool);
|
|
}
|
|
|
|
private:
|
|
shared_ptr<Matrix<ElemType>> m_leftMinusRight;
|
|
};
|
|
|
|
template class SquareErrorNode<float>;
|
|
template class SquareErrorNode<double>;
|
|
|
|
// -----------------------------------------------------------------------
|
|
// CrossEntropyWithSoftmaxNode (labels, prediction)
|
|
// calculates: -sum(left_i * log(softmax_i(right)))
|
|
// -----------------------------------------------------------------------
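// Editor's note (not part of the original source): writing z = right (the pre-softmax prediction)
// and l = left (the label distribution), the quantities computed below are
//     E        = -sum_i l_i * log softmax_i(z)
//     dE/dl_i  = -log softmax_i(z)        (BackpropToNonLooping, inputIndex == 0)
//     dE/dz_i  = softmax_i(z) - l_i       (BackpropToNonLooping, inputIndex == 1)
// which is why both the log-softmax and its exponential are kept from the forward pass.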
|
|
|
|
template <class ElemType>
|
|
class CrossEntropyWithSoftmaxNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>, public NumInputs<2>
|
|
{
|
|
typedef ComputationNodeNonLooping<ElemType> Base;
|
|
UsingComputationNodeMembersBoilerplate;
|
|
static const std::wstring TypeName()
|
|
{
|
|
return L"CrossEntropyWithSoftmax";
|
|
}
|
|
|
|
public:
|
|
DeclareConstructorFromConfigWithNumInputs(CrossEntropyWithSoftmaxNode);
|
|
CrossEntropyWithSoftmaxNode(DEVICEID_TYPE deviceId, const wstring& name)
|
|
: Base(deviceId, name)
|
|
{
|
|
}
|
|
|
|
virtual void BackpropToNonLooping(size_t inputIndex) override
|
|
{
|
|
FrameRange fr(Input(0)->GetMBLayout());
|
|
// left input is scalar
|
|
if (inputIndex == 0) // left derivative
|
|
{
|
|
#if DUMPOUTPUT
|
|
*m_logSoftmaxOfRight.Print("CrossEntropyWithSoftmax Partial-logSoftmaxOfRight");
|
|
Gradient().Print("CrossEntropyWithSoftmax Partial-gradientValues");
|
|
Input(0)->GradientFor(fr).Print("CrossEntropyWithSoftmaxNode Partial-Left-in");
|
|
#endif
|
|
|
|
auto gradient = Input(0)->GradientFor(fr);
|
|
Matrix<ElemType>::Multiply1x1AndWeightedAdd(-1.0f, Gradient() /*1x1*/, *m_logSoftmaxOfRight, 1.0f, gradient);
|
|
#if DUMPOUTPUT
|
|
Input(0)->GradientFor(fr).Print("CrossEntropyWithSoftmaxNode Partial-Left-out");
|
|
#endif
|
|
}
|
|
|
|
else if (inputIndex == 1) // right derivative
|
|
{
|
|
#if DUMPOUTPUT
|
|
*m_softmaxOfRight.Print("CrossEntropyWithSoftmax Partial-softmaxOfRight");
|
|
Input(0)->ValueFor(fr).Print("CrossEntropyWithSoftmax Partial-inputFunctionValues");
|
|
Gradient().Print("CrossEntropyWithSoftmax Partial-gradientValues");
|
|
Input(1)->GradientFor(fr).Print("CrossEntropyWithSoftmaxNode Partial-Right-in");
|
|
#endif
|
|
|
|
auto gradient = Input(1)->GradientFor(fr);
|
|
Matrix<ElemType>::AddScaledDifference(Gradient(), *m_softmaxOfRight, Input(0)->ValueFor(fr), gradient);
|
|
#if DUMPOUTPUT
|
|
Input(1)->GradientFor(fr).Print("CrossEntropyWithSoftmaxNode Partial-Right");
|
|
#endif
|
|
#ifdef _DEBUG
|
|
Input(1)->InvalidateMissingGradientColumns(fr); // TODO: This should not be necessary.
|
|
#endif
|
|
}
|
|
}
|
|
|
|
virtual bool OutputUsedInComputingInputNodesGradients() const override
|
|
{
|
|
return false;
|
|
}
|
|
|
|
virtual void UpdateFunctionMBSize() override
|
|
{
|
|
m_logSoftmaxOfRight->Resize(Input(1)->Value());
|
|
m_softmaxOfRight->Resize(*m_logSoftmaxOfRight);
|
|
}
|
|
|
|
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override // -sum(left_i * log(softmax_i(right)))
|
|
{
|
|
FrameRange fr(Input(0)->GetMBLayout());
|
|
// first compute the softmax (column-wise)
|
|
// Note that we need both log and non-log for gradient computation.
|
|
m_logSoftmaxOfRight->AssignLogSoftmaxOf(Input(1)->ValueFor(fr), true);
|
|
m_softmaxOfRight->SetValue(*m_logSoftmaxOfRight);
|
|
m_softmaxOfRight->InplaceExp();
|
|
// flatten all gaps to zero, such that gaps will contribute zero to the sum
|
|
MaskMissingColumnsToZero(*m_logSoftmaxOfRight, Input(1)->GetMBLayout(), fr);
|
|
// reduce over all frames
|
|
Value().AssignInnerProductOfMatrices(Input(0)->MaskedValueFor(fr), *m_logSoftmaxOfRight);
|
|
Value() *= -1;
|
|
#if NANCHECK
|
|
Value().HasNan("CrossEntropyWithSoftmax");
|
|
#endif
|
|
#if DUMPOUTPUT
|
|
Value().Print("CrossEntropyWithSoftmaxNode");
|
|
#endif
|
|
}
|
|
|
|
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
|
|
{
|
|
ValidateBinaryReduce(isFinalValidationPass);
|
|
}
|
|
|
|
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
|
|
{
|
|
Base::CopyTo(nodeP, newName, flags);
|
|
if (flags & CopyNodeFlags::copyNodeValue)
|
|
{
|
|
auto node = dynamic_pointer_cast<CrossEntropyWithSoftmaxNode<ElemType>>(nodeP);
|
|
*node->m_logSoftmaxOfRight = *m_logSoftmaxOfRight;
|
|
*node->m_softmaxOfRight = *m_softmaxOfRight;
|
|
}
|
|
}
|
|
|
|
// request matrices needed to do node function value evaluation
|
|
virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool)
|
|
{
|
|
Base::RequestMatricesBeforeForwardProp(matrixPool);
|
|
RequestMatrixFromPool(m_logSoftmaxOfRight, matrixPool);
|
|
RequestMatrixFromPool(m_softmaxOfRight, matrixPool);
|
|
}
|
|
|
|
protected:
|
|
shared_ptr<Matrix<ElemType>> m_logSoftmaxOfRight;
|
|
shared_ptr<Matrix<ElemType>> m_softmaxOfRight;
|
|
};
|
|
|
|
template class CrossEntropyWithSoftmaxNode<float>;
|
|
template class CrossEntropyWithSoftmaxNode<double>;
|
|
|
|
// -----------------------------------------------------------------------
// CrossEntropyNode (labels, prediction)
// calculates: -sum(left_i * log(right_i))
// assumes that the prediction input is already normalized (e.g. the output of a Softmax node)
// You probably want to use CrossEntropyWithSoftmaxNode instead; it is more efficient in most cases.
// -----------------------------------------------------------------------
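// Editor's note (not part of the original source): for p = right (already a probability
// distribution) and l = left, the gradients computed below are
//     dE/dl_i = -log p_i          (BackpropToLeft, via m_logOfRight)
//     dE/dp_i = -l_i / p_i        (BackpropToRight, via m_leftDivRight)
// each scaled by the incoming 1x1 gradient.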
|
|
template <class ElemType>
|
|
class CrossEntropyNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>, public NumInputs<2>
|
|
{
|
|
typedef ComputationNodeNonLooping<ElemType> Base;
|
|
UsingComputationNodeMembersBoilerplate;
|
|
static const std::wstring TypeName()
|
|
{
|
|
return L"CrossEntropy";
|
|
}
|
|
|
|
public:
|
|
DeclareConstructorFromConfigWithNumInputs(CrossEntropyNode);
|
|
CrossEntropyNode(DEVICEID_TYPE deviceId, const wstring& name)
|
|
: Base(deviceId, name)
|
|
{
|
|
}
|
|
|
|
virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t inputIndex) override
|
|
{
|
|
FrameRange fr(Input(0)->GetMBLayout());
|
|
// left Node must be a scalar
|
|
if (inputIndex == 0) // left derivative
|
|
{
|
|
BackpropToLeft(*m_logOfRight, Input(0)->GradientFor(fr), Gradient());
|
|
}
|
|
else
|
|
{
|
|
BackpropToRight(*m_leftDivRight, Input(0)->ValueFor(fr), Input(1)->ValueFor(fr), Input(1)->GradientFor(fr), Gradient());
|
|
}
|
|
}
|
|
|
|
virtual bool OutputUsedInComputingInputNodesGradients() const override
|
|
{
|
|
return false;
|
|
}
|
|
|
|
/*TODO: merge with call site*/ void BackpropToLeft(const Matrix<ElemType>& logOfRight, Matrix<ElemType> inputGradientValues,
|
|
const Matrix<ElemType>& gradientValues)
|
|
{
|
|
Matrix<ElemType>::Multiply1x1AndWeightedAdd(-1.0f, gradientValues /*1x1*/, logOfRight, 1.0f, inputGradientValues);
|
|
}
|
|
|
|
/*TODO: merge with call site*/ void BackpropToRight(Matrix<ElemType>& leftDivRight,
|
|
const Matrix<ElemType> inputFunctionValues0, const Matrix<ElemType> inputFunctionValues1,
|
|
Matrix<ElemType> inputGradientValues, const Matrix<ElemType>& gradientValues)
|
|
{
|
|
FrameRange fr(Input(0)->GetMBLayout());
|
|
leftDivRight.AssignElementDivisionOf(inputFunctionValues0, inputFunctionValues1);
|
|
MaskMissingColumnsToZero(leftDivRight, Input(0)->GetMBLayout(), fr);
|
|
Matrix<ElemType>::Multiply1x1AndWeightedAdd(-1.0f, gradientValues /*1x1*/, leftDivRight, 1.0f, inputGradientValues);
|
|
}
|
|
|
|
virtual void UpdateFunctionMBSize() override
|
|
{
|
|
m_logOfRight->Resize(Input(1)->Value());
|
|
m_leftDivRight->Resize(Input(1)->Value());
|
|
}
|
|
|
|
// -sum(left_i * log(right_i))
|
|
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
|
|
{
|
|
FrameRange fr(Input(0)->GetMBLayout());
|
|
m_logOfRight->SetValue(Input(1)->ValueFor(fr));
|
|
m_logOfRight->InplaceLog();
|
|
MaskMissingColumnsToZero(*m_logOfRight, Input(1)->GetMBLayout(), fr);
|
|
Value().AssignInnerProductOfMatrices(Input(0)->MaskedValueFor(fr), *m_logOfRight);
|
|
Value() *= -1;
|
|
#if NANCHECK
|
|
Value().HasNan("CrossEntropy");
|
|
#endif
|
|
}
|
|
|
|
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
|
|
{
|
|
ValidateBinaryReduce(isFinalValidationPass);
|
|
}
|
|
|
|
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
|
|
{
|
|
Base::CopyTo(nodeP, newName, flags);
|
|
if (flags & CopyNodeFlags::copyNodeValue)
|
|
{
|
|
auto node = dynamic_pointer_cast<CrossEntropyNode<ElemType>>(nodeP);
|
|
*node->m_logOfRight = *m_logOfRight;
|
|
*node->m_leftDivRight = *m_leftDivRight;
|
|
}
|
|
}
|
|
|
|
// request matrices needed to do node function value evaluation
|
|
virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool)
|
|
{
|
|
Base::RequestMatricesBeforeForwardProp(matrixPool);
|
|
RequestMatrixFromPool(m_logOfRight, matrixPool);
|
|
}
|
|
|
|
// request matrices that are needed for gradient computation
|
|
virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
|
|
{
|
|
Base::RequestMatricesBeforeBackprop(matrixPool);
|
|
RequestMatrixFromPool(m_leftDivRight, matrixPool);
|
|
}
|
|
|
|
// release gradient and temp matrices that no longer needed after all the children's gradients are computed.
|
|
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
|
|
{
|
|
Base::ReleaseMatricesAfterBackprop(matrixPool);
|
|
ReleaseMatrixToPool(m_logOfRight, matrixPool);
|
|
ReleaseMatrixToPool(m_leftDivRight, matrixPool);
|
|
}
|
|
|
|
private:
|
|
// matrix value passed from evaluate to computePartial
|
|
shared_ptr<Matrix<ElemType>> m_logOfRight;
|
|
// temporary
|
|
shared_ptr<Matrix<ElemType>> m_leftDivRight;
|
|
};
|
|
|
|
template class CrossEntropyNode<float>;
|
|
template class CrossEntropyNode<double>;
|
|
|
|
// -----------------------------------------------------------------------
|
|
// MatrixL1RegNode (input)
|
|
// TODO: share most code with MatrixL2RegNode
|
|
// -----------------------------------------------------------------------
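// Editor's note (not part of the original source): this node evaluates E = ||X||_1 = sum_i |x_i|
// over the (masked) minibatch; its gradient dE/dx_i = sign(x_i) is exactly what AssignSignOf
// produces in BackpropToS below, scaled by the incoming 1x1 gradient.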
|
|
|
|
template <class ElemType>
|
|
class MatrixL1RegNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>, public NumInputs<1>
|
|
{
|
|
typedef ComputationNodeNonLooping<ElemType> Base;
|
|
UsingComputationNodeMembersBoilerplate;
|
|
static const std::wstring TypeName()
|
|
{
|
|
return L"MatrixL1Reg";
|
|
}
|
|
|
|
public:
|
|
DeclareConstructorFromConfigWithNumInputs(MatrixL1RegNode);
|
|
MatrixL1RegNode(DEVICEID_TYPE deviceId, const wstring& name)
|
|
: Base(deviceId, name)
|
|
{
|
|
}
|
|
|
|
virtual void BackpropToNonLooping(size_t inputIndex) override // scale by number of cols (or samples)
|
|
{
|
|
FrameRange fr(Input(0)->GetMBLayout());
|
|
assert(inputIndex == 0);
|
|
inputIndex;
|
|
BackpropToS(*m_gradientOfL1Norm, Input(0)->GradientFor(fr), Gradient(), Input(0)->ValueFor(fr));
|
|
}
|
|
|
|
virtual bool OutputUsedInComputingInputNodesGradients() const override
|
|
{
|
|
return false;
|
|
}
|
|
|
|
/*TODO: merge with call site*/ void BackpropToS(Matrix<ElemType>& gradientOfL1Norm,
|
|
Matrix<ElemType> inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& inputFunctionValues)
|
|
{
|
|
gradientOfL1Norm.AssignSignOf(inputFunctionValues);
|
|
Matrix<ElemType>::Multiply1x1AndWeightedAdd(+1.0f, gradientValues /*1x1*/, gradientOfL1Norm, 1.0f, inputGradientValues);
|
|
}
|
|
|
|
virtual void UpdateFunctionMBSize() override
|
|
{
|
|
m_gradientOfL1Norm->Resize(Input(0)->Value());
|
|
}
|
|
|
|
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
|
|
{
|
|
FrameRange fr(Input(0)->GetMBLayout());
|
|
Value().VerifySize(1, 1);
|
|
Value().SetValue(Input(0)->MaskedValueFor(fr).MatrixNorm1());
|
|
#if NANCHECK
|
|
Value().HasNan("MatrixL1Reg");
|
|
#endif
|
|
}
|
|
|
|
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
|
|
{
|
|
ValidateUnaryReduce(isFinalValidationPass);
|
|
}
|
|
|
|
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
|
|
{
|
|
Base::CopyTo(nodeP, newName, flags);
|
|
if (flags & CopyNodeFlags::copyNodeValue)
|
|
{
|
|
auto node = dynamic_pointer_cast<MatrixL1RegNode<ElemType>>(nodeP);
|
|
*node->m_gradientOfL1Norm = *m_gradientOfL1Norm;
|
|
}
|
|
}
|
|
|
|
// request matrices that are needed for gradient computation
|
|
virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
|
|
{
|
|
Base::RequestMatricesBeforeBackprop(matrixPool);
|
|
RequestMatrixFromPool(m_gradientOfL1Norm, matrixPool);
|
|
}
|
|
|
|
// release gradient and temp matrices that no longer needed after all the children's gradients are computed.
|
|
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
|
|
{
|
|
Base::ReleaseMatricesAfterBackprop(matrixPool);
|
|
ReleaseMatrixToPool(m_gradientOfL1Norm, matrixPool);
|
|
}
|
|
|
|
private:
|
|
shared_ptr<Matrix<ElemType>> m_gradientOfL1Norm; // temporary
|
|
};
|
|
|
|
template class MatrixL1RegNode<float>;
|
|
template class MatrixL1RegNode<double>;
|
|
|
|
// -----------------------------------------------------------------------
|
|
// MatrixL2RegNode (input)
|
|
// TODO: share most code with MatrixL1RegNode
|
|
// -----------------------------------------------------------------------
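// Editor's note (not part of the original source): this node evaluates E = ||X||_F = sqrt(sum_i x_i^2),
// so dE/dx_i = x_i / ||X||_F; BackpropToS below therefore divides the incoming 1x1 gradient by the
// forward value (plus EPS_IN_INVERSE to avoid division by zero) and adds the scaled input.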
|
|
|
|
template <class ElemType>
|
|
class MatrixL2RegNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>, public NumInputs<1>
|
|
{
|
|
typedef ComputationNodeNonLooping<ElemType> Base;
|
|
UsingComputationNodeMembersBoilerplate;
|
|
static const std::wstring TypeName()
|
|
{
|
|
return L"MatrixL2Reg";
|
|
}
|
|
|
|
public:
|
|
DeclareConstructorFromConfigWithNumInputs(MatrixL2RegNode);
|
|
MatrixL2RegNode(DEVICEID_TYPE deviceId, const wstring& name)
|
|
: Base(deviceId, name)
|
|
{
|
|
}
|
|
|
|
virtual void BackpropToNonLooping(size_t inputIndex) override // scale by number of cols (or samples)
|
|
{
|
|
FrameRange fr(Input(0)->GetMBLayout());
|
|
assert(inputIndex == 0);
|
|
inputIndex;
|
|
BackpropToS(Input(0)->GradientFor(fr), Gradient(), Input(0)->ValueFor(fr), Value());
|
|
}
|
|
|
|
/*TODO: merge with call site*/ void BackpropToS(Matrix<ElemType> inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& inputFunctionValues, const Matrix<ElemType>& functionValues)
|
|
{
|
|
ElemType v = gradientValues.Get00Element() / (functionValues.Get00Element() + EPS_IN_INVERSE); // TODO: GPU inefficiency
|
|
inputGradientValues.AddWithScaleOf(v, inputFunctionValues);
|
|
}
|
|
|
|
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
|
|
{
|
|
FrameRange fr(Input(0)->GetMBLayout());
|
|
Value().VerifySize(1, 1);
|
|
Value().SetValue(Input(0)->MaskedValueFor(fr).FrobeniusNorm());
|
|
#if NANCHECK
|
|
Value().HasNan("MatrixL2Reg");
|
|
#endif
|
|
}
|
|
|
|
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
|
|
{
|
|
ValidateUnaryReduce(isFinalValidationPass);
|
|
}
|
|
};
|
|
|
|
template class MatrixL2RegNode<float>;
|
|
template class MatrixL2RegNode<double>;
|
|
|
|
// -----------------------------------------------------------------------
|
|
// NoiseContrastiveEstimationNode (labels, input, inputWeights, biasWeights)
// - labels: label in dense matrix in [4 x T]
//   the first row is the word index, the second row is the class index, the third row is the first word index of the class,
//   and the last row is the first word index of the next class
// - input: hidden-layer activity to the node in [hdsize x T]. For a simple RNN, this is the hidden-layer activity.
// - inputWeights: weight matrix in [hdsize x vocab_size]; for speed-up, the per-word matrix can simply be obtained as a column slice
// - biasWeights: clsprob in dense matrix in [nbr_cls x T]. This is the output from a LogSoftmax node for the log-posterior probability of class given observations.
// BUGBUG: This node has not been converted to memshare conventions.
|
|
// -----------------------------------------------------------------------
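// Editor's note (not part of the original source): depending on m_evalMode, ForwardPropNonLooping
// evaluates one of three criteria:
//   NCEEvalMode::Softmax      -- a full softmax over the vocabulary (typically used for evaluation)
//   NCEEvalMode::Unnormalized -- the raw, unnormalized scores
//   NCEEvalMode::None         -- the noise-contrastive-estimation training criterion, which avoids
//                                normalizing over the full vocabulary by discriminating the target
//                                word against sampled noise words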
|
|
|
|
enum NCEEvalMode
|
|
{
|
|
Softmax = 0,
|
|
Unnormalized = 1,
|
|
None = 2
|
|
};
|
|
template <class ElemType>
|
|
class NoiseContrastiveEstimationNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>, public NumInputs<4>
|
|
{
|
|
typedef ComputationNodeNonLooping<ElemType> Base;
|
|
UsingComputationNodeMembersBoilerplate;
|
|
static const std::wstring TypeName()
|
|
{
|
|
return L"NCEBasedCrossEntropyWithSoftmax";
|
|
}
|
|
|
|
public:
|
|
DeclareConstructorFromConfigWithNumInputs(NoiseContrastiveEstimationNode);
|
|
NoiseContrastiveEstimationNode(DEVICEID_TYPE deviceId, const wstring& name)
|
|
: Base(deviceId, name),
|
|
m_logSoftmax(deviceId),
|
|
m_softMax(deviceId),
|
|
m_grdToSoftMaxInput(deviceId),
|
|
m_ncePrediction(deviceId),
|
|
m_evalMode(NCEEvalMode::None)
|
|
{
|
|
}
|
|
NoiseContrastiveEstimationNode(DEVICEID_TYPE deviceId, const wstring& name, NCEEvalMode xm_evalMode)
|
|
: Base(deviceId, name),
|
|
m_logSoftmax(deviceId),
|
|
m_softMax(deviceId),
|
|
m_grdToSoftMaxInput(deviceId),
|
|
m_ncePrediction(deviceId),
|
|
m_evalMode(xm_evalMode)
|
|
{
|
|
}
|
|
// ^^ TODO: we can merge these two
|
|
|
|
virtual void Save(File& fstream) const override
|
|
{
|
|
Base::Save(fstream);
|
|
fstream << m_evalMode;
|
|
}
|
|
|
|
virtual void Load(File& fstream, size_t modelVersion) override
|
|
{
|
|
Base::Load(fstream, modelVersion);
|
|
fstream >> m_evalMode;
|
|
if (m_evalMode > NCEEvalMode::None)
|
|
{
|
|
m_evalMode = NCEEvalMode::None;
|
|
fstream.SetPosition(fstream.GetPosition() - sizeof(m_evalMode));
|
|
}
|
|
}
|
|
|
|
void SetEvalMode(NCEEvalMode& xevMode)
|
|
{
|
|
m_evalMode = xevMode;
|
|
}
|
|
NCEEvalMode& EvalMode()
|
|
{
|
|
return m_evalMode;
|
|
} // TODO: really? Return a reference to a local? TODO: change to const? and call it GetEvalMode()
|
|
|
|
/**
|
|
compute gradients to input observations, the weights to the observations, and the class log posterior probabilities
|
|
*/
|
|
virtual void BackpropToNonLooping(size_t inputIndex) override
|
|
{
|
|
FrameRange fr(Input(0)->GetMBLayout());
|
|
m_needRecomputeGradientToSoftmaxInput = false;
|
|
// gradient computation@yinggongzhao
|
|
// inputIndex should be 2 this time
|
|
if (m_evalMode != NCEEvalMode::None)
|
|
LogicError("BackpropTo should only be called in training mode");
|
|
if (inputIndex == 0)
|
|
InvalidArgument("ComputeInput partial should not be called for label");
|
|
// samples+probs hidden embedding
|
|
// Input(inputIndex)->GradientFor(fr).AssignNCEDerivative(m_ncePrediction, Input(0)->ValueFor(fr), Input(1)->ValueFor(fr), Input(2)->Value(), inputIndex);
|
|
if (inputIndex >= 2)
|
|
Input(inputIndex)->Gradient().AssignNCEDerivative(m_ncePrediction, Input(0)->ValueFor(fr), Input(1)->ValueFor(fr), Input(2)->ValueAsMatrix(), inputIndex);
|
|
else
|
|
Input(inputIndex)->GradientFor(fr).AssignNCEDerivative(m_ncePrediction, Input(0)->ValueFor(fr), Input(1)->ValueFor(fr), Input(2)->ValueAsMatrix(), inputIndex);
|
|
}
|
|
|
|
virtual bool OutputUsedInComputingInputNodesGradients() const override
|
|
{
|
|
return false;
|
|
}
|
|
|
|
virtual void UpdateFunctionMBSize() override
|
|
{
|
|
// TODO (this does not really break it since for full matrices, class Matrix will resize by itself)
|
|
}
|
|
|
|
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override // -sum(left_i * log(softmax_i(right)))
|
|
{
|
|
FrameRange fr(Input(0)->GetMBLayout());
|
|
if (Input(0)->HasMBLayout() && Input(0)->GetMBLayout()->HasGaps())
|
|
LogicError("%ls %ls operation does not handle multiple parallel sequences with gaps correctly. Contact fseide@microsoft.com if you have a need and a test case.", NodeName().c_str(), OperationName().c_str());
|
|
|
|
int positive = 0, negative = 0;
|
|
if (Input(0)->GetSampleLayout().GetNumElements() == 1)
|
|
{
|
|
for (int i = 0; i < Input(0)->Value().GetNumCols(); i++) // BUGBUG: Loops must be over frames, not columns. Columns may contain gaps.
|
|
{
|
|
if (Input(0)->Value()(0, i) > 0)
|
|
positive++;
|
|
else if (Input(0)->Value()(0, i) < 0)
|
|
negative++;
|
|
}
|
|
assert(positive * negative == 0);
|
|
}
|
|
if (m_evalMode == NCEEvalMode::Softmax || (Input(0)->GetSampleLayout().GetNumElements() == 1 && positive > 0))
|
|
{
|
|
// evaluation uses softmax
|
|
m_logSoftmax.AssignProductOf(Input(1)->Value(), true, Input(2)->ValueAsMatrix(), false);
|
|
m_logSoftmax += Input(3)->Value();
|
|
m_logSoftmax.InplaceLogSoftmax(false);
|
|
MaskMissingColumnsToZero(m_logSoftmax, Input(1)->GetMBLayout(), fr); // TODO: is this the right way to neutralize gaps?
|
|
Value().AssignSoftmaxSum(Input(0)->Value(), m_logSoftmax);
|
|
}
|
|
else if (m_evalMode == NCEEvalMode::Unnormalized || (Input(0)->GetSampleLayout().GetNumElements() == 1 && negative > 0))
|
|
{
|
|
// TODO: are we treating gaps correctly here?
|
|
Value().AssignNceUnnormalizedEval(Input(0)->Value(), Input(1)->Value(), Input(2)->ValueAsMatrix(), Input(3)->Value());
|
|
}
|
|
else
|
|
{
|
|
// TODO: are we treating gaps correctly here?
|
|
// training criterion uses NCE
|
|
// likelihood samples+probs hidden embedding bias
|
|
Value().AssignNoiseContrastiveEstimation(Input(0)->Value(), Input(1)->Value(), Input(2)->ValueAsMatrix(), Input(3)->Value(), m_ncePrediction);
|
|
}
|
|
m_needRecomputeGradientToSoftmaxInput = true;
|
|
}
|
|
|
|
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
|
|
{
|
|
Base::Validate(isFinalValidationPass);
|
|
m_pMBLayout = nullptr; // this node does not hold mini-batch data
|
|
|
|
if (isFinalValidationPass)
|
|
{
|
|
if (Input(1)->GetSampleMatrixNumRows() != Input(2)->GetAsMatrixNumRows())
|
|
LogicError("The Matrix dimension for observation and weight in the NoiseContrastiveEstimationNode operation does not match.");
|
|
if (!Input(0)->HasMBLayout() || !Input(1)->HasMBLayout() || Input(2)->HasMBLayout() || !Input(3)->HasMBLayout())
|
|
LogicError("%ls %ls operation requires inputs 0, 1, and 3 to be a minibatch, and input 2 to be a matrix.", NodeName().c_str(), OperationName().c_str());
|
|
}
|
|
|
|
SetDims(TensorShape(1), false);
|
|
}
|
|
|
|
protected:
|
|
Matrix<ElemType> m_logSoftmax;
|
|
Matrix<ElemType> m_softMax;
|
|
Matrix<ElemType> m_ncePrediction;
|
|
|
|
// gradient of cross entropy with respect to the input of softmax
|
|
// a 1 row by \sum_t m_nbrWordsInEachTime[t] vector
|
|
// one slice of size m_nbrWordsInEachTime[t] saves the input to softmax for word y_t
|
|
Matrix<ElemType> m_grdToSoftMaxInput;
|
|
bool m_needRecomputeGradientToSoftmaxInput;
|
|
|
|
size_t m_nbrNoise;
|
|
size_t m_totalNbrWords;
|
|
|
|
private:
|
|
NCEEvalMode m_evalMode;
|
|
};
|
|
template class NoiseContrastiveEstimationNode<float>;
|
|
template class NoiseContrastiveEstimationNode<double>;
|
|
|
|
// -----------------------------------------------------------------------
|
|
// ClassBasedCrossEntropyWithSoftmaxNode (labeldata(.,t), inputdata(.,t), embeddingMatrix, clsProbBeforeSoftmaxData(.,t))
// - Input(0) [4 x T] label in dense matrix in
//   (0,t) the first row is the word index
//   (1,t) the second row is the class index
//   (2,t) the third row is the first word index of the class
//   (3,t) the last row is the first word index of the next class
// - Input(1) [hdsize x T] hidden-layer activation to the node. For a simple RNN, this is the hidden-layer activity.
// - Input(2) [hdsize x vocab_size] weight matrix; for speed-up, the per-word matrix can simply be obtained as a column slice
// - Input(3) [nbr_cls x T] clsprob in dense matrix. This input, with softmax applied, is the posterior probability of class given observations.
// -----------------------------------------------------------------------

// calculates: -sum(left_i * log(softmax_i(right))) for class given history and for word given history
// need to provide the class probability from an external node
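// Editor's note (not part of the original source): the class-based factorization evaluated below is
//     log P(w | h) = log P(c(w) | h) + log P(w | c(w), h)
// where the class posterior comes from Input(3) (via m_clsLogSoftmax) and the word term is a softmax
// over only the nbr_wrd members of the word's class; the two AddElementToElement calls in
// ForwardPropNonLooping accumulate exactly these two log-probabilities, negated at the end.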
|
|
template <class ElemType>
|
|
class ClassBasedCrossEntropyWithSoftmaxNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>, public NumInputs<4>
|
|
{
|
|
typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
|
|
static const std::wstring TypeName() { return L"ClassBasedCrossEntropyWithSoftmax"; }
|
|
|
|
// our inputs
|
|
static const size_t LABELDATA = 0;
|
|
static const size_t INPUTDATA = 1;
|
|
static const size_t EMBEDDINGMATRIX = 2;
|
|
static const size_t CLASSPROBINDATA = 3;
|
|
|
|
public:
|
|
DeclareConstructorFromConfigWithNumInputs(ClassBasedCrossEntropyWithSoftmaxNode);
|
|
ClassBasedCrossEntropyWithSoftmaxNode(DEVICEID_TYPE deviceId, const wstring& name)
|
|
: Base(deviceId, name),
|
|
m_logSoftmax(deviceId),
|
|
m_softMax(deviceId),
|
|
m_grdToSoftMaxInput(deviceId),
|
|
m_clsLogSoftmax(deviceId),
|
|
m_clsSoftmax(deviceId)
|
|
{
|
|
}
|
|
|
|
private:
|
|
// iterate over a large workspace that contains all class-conditioned probs concatenated
|
|
// 'sz' is the offset into that vector. We will iterate over these vectors at a few places. Always use this same boilerplate code.
|
|
template<class F>
|
|
size_t ForColumnsWithClass(const F& op)
|
|
{
|
|
const size_t nT = Input(LABELDATA)->GetNumTimeSteps();
|
|
const size_t nS = Input(LABELDATA)->GetNumParallelSequences();
|
|
size_t sz = 0; // iterate over the packed concatenated class-conditioned prob vectors
|
|
for (size_t s = 0; s < nS; s++)
|
|
for (size_t t = 0; t < nT; t++)
|
|
{
|
|
FrameRange fr = FrameRange(Input(LABELDATA)->GetMBLayout(), t).Sequence(s);
|
|
if (Input(LABELDATA)->GetMBLayout()->IsGap(fr)) // skip gaps
|
|
continue;
|
|
|
|
const Matrix<ElemType>& lbl_t = Input(LABELDATA)->ValueFor(fr);
|
|
size_t y_t = (size_t)lbl_t(0, 0); // current word token index
|
|
size_t c_t = (size_t)lbl_t(1, 0); // current word token's class index
|
|
size_t lft_bnd = (size_t)lbl_t(2, 0); // index of first word belonging to current word token's class
|
|
size_t rgt_bnd = (size_t)lbl_t(3, 0); // and end of that range
|
|
size_t nbr_wrd = (rgt_bnd - lft_bnd); // number of words in the class
|
|
|
|
// perform the operation
|
|
op(s, t, fr, y_t, c_t, sz, lft_bnd, nbr_wrd);
|
|
|
|
sz += nbr_wrd;
|
|
}
|
|
return sz;
|
|
}
|
|
|
|
// compute gradients to input observations, the weights to the observations, and the class log posterior probabilities
|
|
virtual void BackpropToNonLooping(size_t inputIndex) override
|
|
{
|
|
// this should never be called for input[0], which is controlled through learningRateMultiplier == 0
|
|
if (inputIndex != 1 && inputIndex != 2 && inputIndex != 3)
|
|
InvalidArgument("ClassCrossEntropyWithSoftmaxNode criterion only takes with respect to input, weight to the input and class log posterior probability.");
|
|
|
|
ComputeSoftMaxPartial(); // Note: Flag m_needRecomputeGradientToSoftmaxInput guards so that this computes only once.
|
|
|
|
ForColumnsWithClass([&](size_t /*s*/, size_t /*t*/, const FrameRange& fr, size_t /*y_t*/, size_t c_t, size_t sz, size_t lft_bnd, size_t nbr_wrd)
|
|
{
|
|
// compute prb - 1 and prb
|
|
Matrix<ElemType> weightForClass = Input(EMBEDDINGMATRIX)->ValueAsMatrix().ColumnSlice(lft_bnd, nbr_wrd);
|
|
Matrix<ElemType> obs = Input(INPUTDATA)->ValueFor(fr); // hidden activation vector for current word token
|
|
Matrix<ElemType> grd_to_soft_max_input = m_grdToSoftMaxInput.ColumnSlice(sz, nbr_wrd);
|
|
|
|
switch (inputIndex)
|
|
{
|
|
case 1:
|
|
{
|
|
// gradient to input
|
|
Matrix<ElemType> grd_t = Input(INPUTDATA)->GradientFor(fr);
|
|
Matrix<ElemType>::MultiplyAndAdd(weightForClass, false, grd_to_soft_max_input, true, grd_t);
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
// gradient to input weight
|
|
Matrix<ElemType> grd_to_wgt_t = Input(EMBEDDINGMATRIX)->GradientAsMatrix().ColumnSlice(lft_bnd, nbr_wrd);
|
|
Matrix<ElemType>::MultiplyAndAdd(obs, false, grd_to_soft_max_input, false, grd_to_wgt_t);
|
|
break;
|
|
}
|
|
case 3:
|
|
{
|
|
Matrix<ElemType> grd_t = Input(CLASSPROBINDATA)->GradientFor(fr);
|
|
grd_t.SetValue(Input(CLASSPROBINDATA)->DataFor(m_clsSoftmax, fr));
|
|
ComputeCEPartialToSoftmaxInputs(grd_t, Gradient(), c_t);
|
|
break;
|
|
}
|
|
}
|
|
});
|
|
}
|
|
|
|
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
|
|
|
|
private:
|
|
void ComputeCEPartialToSoftmaxInputs(Matrix<ElemType>& inputGradientValues, Matrix<ElemType>& gradientValues, size_t y_t)
|
|
{
|
|
Matrix<ElemType>::MinusOneAt(inputGradientValues, y_t);
|
|
Matrix<ElemType>::Scale(gradientValues, inputGradientValues);
|
|
}
|
|
|
|
// gradient of cross entropy w.r.t. to input to softmax
|
|
void ComputeSoftMaxPartial()
|
|
{
|
|
if (m_needRecomputeGradientToSoftmaxInput)
|
|
{
|
|
m_grdToSoftMaxInput.Resize(1, m_totalNbrWords); // buffer that contains a concatenation of class-conditional values
|
|
|
|
ForColumnsWithClass([&](size_t /*s*/, size_t /*t*/, const FrameRange& /*fr*/, size_t y_t, size_t /*c_t*/, size_t sz, size_t lft_bnd, size_t nbr_wrd)
|
|
{
|
|
Matrix<ElemType> softMax = m_softMax.ColumnSlice(sz, nbr_wrd);
|
|
|
|
size_t idx_in_class = y_t - lft_bnd;
|
|
ComputeCEPartialToSoftmaxInputs(softMax, Gradient(), idx_in_class);
|
|
|
|
m_grdToSoftMaxInput.ColumnSlice(sz, nbr_wrd).SetValue(softMax);
|
|
});
|
|
|
|
m_needRecomputeGradientToSoftmaxInput = false;
|
|
}
|
|
}
|
|
|
|
public:
|
|
virtual void UpdateFunctionMBSize() override
|
|
{
|
|
// TODO: Resize temp matrices here (not doing so does not really fail since for full matrices, class Matrix will resize by itself)
|
|
}
|
|
|
|
// -sum(left_i * log(softmax_i(right)))
|
|
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
|
|
{
|
|
// get the label matrix to CPU, ideally in location=BOTH state
|
|
Input(LABELDATA)->Value().TransferToDeviceIfNotThere(CPUDEVICE, /*ismoved =*/ false/*means: BOTH state OK*/, /*emptyTransfer =*/ false, /*updatePreferredDevice =*/ false);
|
|
|
|
auto& functionValues = Value();
|
|
|
|
const size_t hdSize = Input(INPUTDATA)->GetSampleMatrixNumRows();
|
|
assert(m_nbrCls == Input(CLASSPROBINDATA)->GetSampleMatrixNumRows());
|
|
|
|
// compute the class posteriors
|
|
m_clsLogSoftmax = Input(CLASSPROBINDATA)->Value();
|
|
m_clsLogSoftmax.InplaceLogSoftmax(true); // log
|
|
m_clsSoftmax.AssignExpOf(m_clsLogSoftmax); // non-log
|
|
|
|
// create a large workspace to contain all class-conditioned probs concatenated
|
|
m_totalNbrWords = ForColumnsWithClass([](size_t /*s*/, size_t /*t*/, const FrameRange& /*fr*/, size_t y_t, size_t /*c_t*/, size_t /*sz*/, size_t lft_bnd, size_t nbr_wrd)
|
|
{
|
|
if (nbr_wrd == 0)
|
|
LogicError("ClassBasedCrossEntropyWithSoftmax: Encountered a class of size 0.");
|
|
if (y_t < lft_bnd || y_t >= lft_bnd + nbr_wrd)
|
|
LogicError("ClassBasedCrossEntropyWithSoftmax: Word index out of bounds of class-member index range (word not a class member).");
|
|
});
|
|
// now m_totalNbrWords = total size of concatenated vector
|
|
|
|
// buffer to hold the concatenated class-conditioned prob vectors
|
|
m_softMax.Resize(1, m_totalNbrWords);
|
|
m_logSoftmax.Resize(1, m_totalNbrWords);
|
|
|
|
// accumulate objective
|
|
functionValues.SetValue(0);
|
|
ForColumnsWithClass([&](size_t s, size_t t, const FrameRange& fr, size_t y_t, size_t c_t, size_t sz, size_t lft_bnd, size_t nbr_wrd)
|
|
{
|
|
// now get views of various arrays that correspond to the index range of words belonging to this class
|
|
|
|
// get hidden vectors for the words in this class
|
|
Matrix<ElemType> weightForClass = Input(EMBEDDINGMATRIX)->ValueAsMatrix().ColumnSlice(lft_bnd, nbr_wrd); // [hdSize x nbr_wrd]
|
|
|
|
// buffer to hold the class-conditional distribution
|
|
Matrix<ElemType> softMax_t = m_softMax.ColumnSlice(sz, nbr_wrd); // TODO: declare these outside of the loop to avoid the malloc
|
|
Matrix<ElemType> logSoftMax_t = m_logSoftmax.ColumnSlice(sz, nbr_wrd);
|
|
|
|
Matrix<ElemType> obs = Input(INPUTDATA)->ValueFor(fr); // hidden activation vector for current word token
|
|
|
|
// multiply hidden activation with weight matrix (the slice of the weight matrix for the range of class members)
|
|
// TODO: can we use 'true' here instead? Above transposition hack won't work with row slices. 'obs' not used elsewhere
|
|
obs.Reshape(1, hdSize); // transpose it (make it a column vector)
|
|
logSoftMax_t.AssignProductOf(obs /*(1 x hdSize)*/, false, weightForClass /*hdSize x nbr_wrd*/, false); // -> 1 x nbr_word
|
|
|
|
// log softmax(W x_t)
|
|
logSoftMax_t.InplaceLogSoftmax(false);
|
|
|
|
// and non-log version
|
|
softMax_t.SetValue(logSoftMax_t);
|
|
softMax_t.InplaceExp();
|
|
// we now have a column vector of class-conditional probabilities over the class members
|
|
|
|
// add the word's class-conditional log posterior
|
|
size_t idx_in_class = y_t - lft_bnd;
|
|
Matrix<ElemType>::AddElementToElement(logSoftMax_t, 0, idx_in_class, functionValues, 0, 0); // (1x1)
|
|
|
|
// add the class log posterior probability (for backprop)
|
|
auto clsLogSoftmax_t = Input(CLASSPROBINDATA)->DataFor(m_clsLogSoftmax, fr);
|
|
Matrix<ElemType>::AddElementToElement(clsLogSoftmax_t, c_t, 0, functionValues, 0, 0); // (1x1)
|
|
});
|
|
|
|
functionValues *= (-1);
|
|
|
|
#if NANCHECK
|
|
functionValues.HasNan("ClassBasedCrossEntropyWithSoftmax");
|
|
#endif
|
|
m_needRecomputeGradientToSoftmaxInput = true;
|
|
}
|
|
|
|
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
|
|
{
|
|
Base::Validate(isFinalValidationPass);
|
|
m_pMBLayout = nullptr; // this node does not hold mini-batch data
|
|
|
|
if (isFinalValidationPass)
|
|
{
|
|
if (Input(LABELDATA)->GetSampleMatrixNumRows() != 4) // label data needs to have 4 rows
|
|
LogicError("The label data in the ClassBasedCrossEntropyWithSoftmax operation must have 4 rows.");
|
|
if (Input(INPUTDATA)->GetSampleMatrixNumRows() != Input(EMBEDDINGMATRIX)->GetAsMatrixNumRows()) // input and weight matrix dimensions must be compatible for multiplication
|
|
LogicError("The matrix dimension for observation and weight in the ClassBasedCrossEntropyWithSoftmax operation does not match.");
|
|
if (Input(LABELDATA)->GetMBLayout() != Input(INPUTDATA)->GetMBLayout() || Input(LABELDATA)->GetMBLayout() != Input(CLASSPROBINDATA)->GetMBLayout())
|
|
InvalidArgument("%ls %ls operation requires that the layouts of inputs 0 (label), 1 (hidden activation), and 3 (log softmax) match.", NodeName().c_str(), OperationName().c_str());
|
|
}
|
|
|
|
SetDims(TensorShape(1), false);
|
|
|
|
m_nbrCls = Input(CLASSPROBINDATA)->GetSampleMatrixNumRows();
|
|
}
|
|
|
|
protected:
|
|
Matrix<ElemType> m_logSoftmax;
|
|
Matrix<ElemType> m_softMax;
|
|
|
|
Matrix<ElemType> m_clsLogSoftmax;
|
|
Matrix<ElemType> m_clsSoftmax;
|
|
|
|
// gradient of cross entropy with respect to the input of softmax
|
|
// a 1 row by \sum_t m_nbrWordsInEachTime[t] vector
|
|
// one slice of size m_nbrWordsInEachTime[t] saves the input to softmax for word y_t
|
|
Matrix<ElemType> m_grdToSoftMaxInput;
|
|
bool m_needRecomputeGradientToSoftmaxInput;
|
|
|
|
size_t m_nbrCls;
|
|
size_t m_totalNbrWords;
|
|
};
|
|
|
|
template class ClassBasedCrossEntropyWithSoftmaxNode<float>;
|
|
template class ClassBasedCrossEntropyWithSoftmaxNode<double>;
|
|
|
|
#ifdef COMING_SOON
|
|
|
|
// -----------------------------------------------------------------------
|
|
// CRFNode (labels, position_dependent_scores, transition_scores)
|
|
// - labels: output label vector of [0:T-1]
|
|
// - position_dependent_scores [0:T-1]: score from position dependent node,
|
|
// in the R-CRF case, it is the RNN output score before softmax
|
|
// - transition scores: square transition matrix, --TODO: log?
|
|
// in the R-CRF case, it is the transition probability between labels
|
|
// BUGBUG: This node cannot operate with truncated BPTT, but does not detect it. It also does not handle gaps or test boundary flags.
|
|
// -----------------------------------------------------------------------
|
|
|
|
/**
|
|
CRF training criterion
|
|
It uses forward-backward algorithm within a minibatch to compute statistics for sequence level optimization
|
|
This node can serve a base class for other sequence level optimization
|
|
|
|
Developed by Kaisheng Yao
|
|
This node is for replicating results of the following work
|
|
K. Yao, B. Peng, G. Zweig, D. Yu, X. Li and F. Gao, "Recurrent Conditional Random Fields", NIPS Deep Learning Workshop 2014
|
|
K. Yao, B. Peng, G. Zweig, D. Yu, X. Li and F. Gao, "Recurrent Conditional Random Fields for Language Understanding", ICASSP 2014
|
|
http://research.microsoft.com/pubs/210167/rcrf_v9.pdf
|
|
|
|
The forward-backward algorithm follows the derivation in
|
|
http://jmlr.org/papers/volume12/collobert11a/collobert11a.pdf
|
|
|
|
*/
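// Editor's note (not part of the original source): per sequence, ForwardPropS below computes the
// negative log-likelihood of the reference label path,
//     -log P(y | x) = -( score(reference path) - logSumExp over all paths )
// where the path score is the sum of the position-dependent scores plus the transition scores along
// the reference labels, and the log-partition term is obtained from the last column of the forward
// (alpha) recursion.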
|
|
template <class ElemType>
|
|
class CRFNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>, public NumInputs<3>
|
|
{
|
|
typedef ComputationNodeNonLooping<ElemType> Base;
|
|
UsingComputationNodeMembersBoilerplate;
|
|
static const std::wstring TypeName()
|
|
{
|
|
return L"CRF";
|
|
}
|
|
|
|
public:
|
|
DeclareConstructorFromConfigWithNumInputs(CRFNode);
|
|
CRFNode(DEVICEID_TYPE deviceId, const wstring& name)
|
|
: Base(deviceId, name),
|
|
mAlpha(deviceId),
|
|
mBeta(deviceId),
|
|
mPostProb(deviceId)
|
|
{
|
|
}
|
|
|
|
// compute posterior probability of label y at position t
|
|
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
|
|
{
|
|
FrameRange fr(Input(0)->GetMBLayout());
|
|
size_t nrow = Input(0)->Value().GetNumRows();
|
|
size_t ncol = Input(0)->Value().GetNumCols();
|
|
|
|
mAlpha.Resize(nrow, ncol);
|
|
mBeta.Resize(nrow, ncol);
|
|
mPostProb.Resize(nrow, ncol);
|
|
|
|
Value().SetValue(0.0);
|
|
Matrix<ElemType> funcVal = Value(); // TODO: This just creates a 1x1 matrix set to 0.
|
|
|
|
size_t nS = Input(0)->GetNumParallelSequences();
|
|
if (nS != 1)
|
|
LogicError("CRFNode: >1 parallel sequences are curently not implemented correctly.");
|
|
for (size_t i = 0; i < nS; i++) // process parallel sequences one by one --BUGBUG: We should loop over individual sequences.
|
|
{
|
|
FrameRange sequenceRange = fr.Sequence(i); // FrameRange to select one sequence
|
|
// BUGBUG: This ^^ is neither supported nor correct, since this code does not handle gaps or start/end flags.
|
|
ForwardPropS(
|
|
DataWithMBLayoutFor(mPostProb, sequenceRange, Input(0)->GetMBLayout()),
|
|
DataWithMBLayoutFor(mAlpha, sequenceRange, Input(0)->GetMBLayout()),
|
|
DataWithMBLayoutFor(mBeta, sequenceRange, Input(0)->GetMBLayout()),
|
|
funcVal,
|
|
Input(0)->ValueFor(sequenceRange),
|
|
Input(1)->ValueFor(sequenceRange),
|
|
Input(2)->ValueAsMatrix(), mStartLbl,
|
|
mEndLbl);
|
|
|
|
Value() += funcVal; // aggregate over sequences
|
|
}
|
|
}
|
|
|
|
virtual void BackpropToNonLooping(size_t inputIndex) override // scaled by 2*number of columns (samples) in the Matrix<ElemType>
|
|
{
|
|
FrameRange fr(Input(0)->GetMBLayout());
|
|
// this should never be called for input[0], which is controlled through learningRateMultiplier == 0
|
|
if (inputIndex != 1 && inputIndex != 2)
|
|
InvalidArgument("CRFNode only takes with respect to input and weight.");
|
|
|
|
if (inputIndex == 1)
|
|
{
|
|
auto gradient = Input(1)->GradientFor(fr);
|
|
Matrix<ElemType>::AddScaledDifference(Gradient(), mPostProb, Input(0)->ValueFor(fr), gradient);
|
|
}
|
|
else if (inputIndex == 2)
|
|
{
|
|
assert(Input(inputIndex)->GradientFor(fr).GetNumElements() > 0);
|
|
size_t nS = Input(0)->GetNumParallelSequences();
|
|
for (size_t i = 0; i < nS; i++) // process all sequences one by one
|
|
{
|
|
FrameRange sequenceRange = fr.Sequence(i); // FrameRange to select one sequence
|
|
auto& gradient = Input(2)->GradientAsMatrix();
|
|
TransGrdCompute(Input(0)->ValueFor(sequenceRange),
|
|
DataWithMBLayoutFor(mAlpha, sequenceRange, Input(0)->GetMBLayout()),
|
|
DataWithMBLayoutFor(mBeta, sequenceRange, Input(0)->GetMBLayout()),
|
|
Input(2)->ValueAsMatrix(),
|
|
gradient,
|
|
mStartLbl, 1);
|
|
}
|
|
}
|
|
else
|
|
return;
|
|
}
|
|
|
|
virtual bool OutputUsedInComputingInputNodesGradients() const override
|
|
{
|
|
return false;
|
|
}
|
|
|
|
// compute forward backward algorithm
|
|
/*TODO: merge with call site*/ void ForwardPropS(Matrix<ElemType> postprob, Matrix<ElemType> alpha, Matrix<ElemType> beta, Matrix<ElemType>& functionValues, const Matrix<ElemType>& lbls, const Matrix<ElemType>& pos_scores, const Matrix<ElemType>& pair_scores, int& firstLbl, int& lastLbl, const int iStep = 1)
|
|
{
|
|
// to-do, each slice is for one sentence
|
|
// to-do, the number of slices corresponds to the number of frames
|
|
// this implementation only supports one sentence per minibatch
|
|
|
|
int nObs = lbls.GetNumCols();
|
|
|
|
// change to other values so it can support multiple sentences in each minibatch
|
|
assert(iStep == 1);
|
|
ForwardCompute(alpha, lbls, pos_scores, pair_scores);
|
|
BackwardCompute(alpha, beta, functionValues, lbls, pos_scores, pair_scores, iStep);
|
|
PostProbCompute(postprob, alpha, beta);
|
|
|
|
firstLbl = -1;
|
|
for (int ik = 0; ik < lbls.GetNumRows(); ik++)
|
|
if (lbls(ik, 0) != 0)
|
|
{
|
|
firstLbl = ik;
|
|
break;
|
|
}
|
|
|
|
lastLbl = -1;
|
|
for (int ik = 0; ik < lbls.GetNumRows(); ik++)
|
|
if (lbls(ik, nObs - 1) != 0)
|
|
{
|
|
lastLbl = ik;
|
|
break;
|
|
}
|
|
|
|
functionValues.AssignInnerProductOfMatrices(lbls, pos_scores);
|
|
|
|
Matrix<ElemType> a = alpha.ColumnSlice(nObs - 1, 1);
|
|
ElemType fAlpha;
|
|
fAlpha = a.LogAddSumOfElements();
|
|
|
|
// transition score
|
|
ElemType tscore = 0;
|
|
for (int t = 0; t < nObs - 1; t++)
|
|
{
|
|
int i = -1;
|
|
for (int ik = 0; ik < lbls.GetNumRows(); ik++)
|
|
if (lbls(ik, t) != 0)
|
|
{
|
|
i = ik;
|
|
break;
|
|
}
|
|
int j = -1;
|
|
for (int ik = 0; ik < lbls.GetNumRows(); ik++)
|
|
if (lbls(ik, t + 1) != 0)
|
|
{
|
|
j = ik;
|
|
break;
|
|
}
|
|
tscore += pair_scores(j, i);
|
|
}
|
|
tscore += functionValues.Get00Element(); // correct path score
|
|
tscore -= fAlpha; // reduced by the scores from all paths
|
|
functionValues.SetValue(tscore);
|
|
|
|
functionValues *= (-1);
|
|
}
|
|
|
|
// compute forward backward algorithm
|
|
static void ForwardCompute(Matrix<ElemType>& alpha,
|
|
const Matrix<ElemType>& lbls,
|
|
const Matrix<ElemType>& pos_scores, const Matrix<ElemType>& pair_scores)
|
|
{
|
|
// to-do, shift more than 1 to support multiple sentences per minibatch
|
|
int iNumPos = lbls.GetNumCols();
|
|
int iNumLab = lbls.GetNumRows();
|
|
|
|
int firstLbl = -1;
|
|
for (int ik = 0; ik < lbls.GetNumRows(); ik++)
|
|
if (lbls(ik, 0) != 0)
|
|
{
|
|
firstLbl = ik;
|
|
break;
|
|
}
|
|
|
|
// need to have
|
|
alpha.Resize(iNumLab, iNumPos);
|
|
|
|
for (int t = 0; t < iNumPos; t++)
|
|
{
|
|
for (int k = 0; k < iNumLab; k++)
|
|
{
|
|
ElemType fTmp = (ElemType) LZERO;
|
|
for (int j = 0; j < iNumLab; j++)
|
|
{
|
|
ElemType fAlpha = (j == firstLbl) ? (ElemType) 0.0 : (ElemType) LZERO;
|
|
if (t > 0)
|
|
fAlpha = alpha(j, t - 1);
|
|
fTmp = alpha.LogAdd(fTmp, fAlpha + pair_scores(k, j));
|
|
}
|
|
fTmp += pos_scores(k, t); // include position dependent score
|
|
alpha(k, t) = fTmp;
|
|
}
|
|
}
|
|
}
|
|
|
|
// compute backward algorithm
|
|
static void BackwardCompute(const Matrix<ElemType>& alpha, Matrix<ElemType>& beta,
|
|
Matrix<ElemType>& functionValues, const Matrix<ElemType>& lbls,
|
|
const Matrix<ElemType>& pos_scores, const Matrix<ElemType>& pair_scores, const int shift = 1)
|
|
{
|
|
assert(shift == 1);
|
|
|
|
alpha.RCRFBackwardCompute(alpha, beta, functionValues, lbls, pos_scores, pair_scores, shift);
|
|
}
|
|
|
|
static void TransGrdCompute(const Matrix<ElemType>& lbls,
|
|
const Matrix<ElemType>& alpha,
|
|
const Matrix<ElemType>& beta,
|
|
const Matrix<ElemType>& pair_scores,
|
|
Matrix<ElemType>& grd,
|
|
const int startLbl,
|
|
const int shift = 1)
|
|
{
|
|
assert(shift == 1);
|
|
|
|
alpha.RCRFTransGrdCompute(lbls,
|
|
alpha,
|
|
beta,
|
|
pair_scores,
|
|
grd,
|
|
startLbl, shift);
|
|
}
|
|
|
|
// compute forward backward algorithm
|
|
static void PostProbCompute(Matrix<ElemType>& postprob, const Matrix<ElemType>& alpha, const Matrix<ElemType>& beta)
|
|
{
|
|
int iNumPos = alpha.GetNumCols();
|
|
int iNumLab = alpha.GetNumRows();
|
|
|
|
postprob.Resize(iNumLab, iNumPos);
|
|
postprob.SetValue(beta);
|
|
postprob.InplaceExp();
|
|
}
|
|
|
|
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
|
|
{
|
|
Base::Validate(isFinalValidationPass);
|
|
m_pMBLayout = nullptr; // this node does not hold mini-batch data
|
|
|
|
if (isFinalValidationPass)
|
|
if (!(Input(1)->GetSampleMatrixNumRows() == Input(2)->GetAsMatrixNumRows() && // position dependent and pair scores have same number of labels
|
|
Input(0)->GetSampleMatrixNumRows() == Input(1)->GetSampleMatrixNumRows() &&
|
|
Input(0)->HasMBLayout() && Input(0)->GetMBLayout() == Input(1)->GetMBLayout() &&
|
|
// Input(0)->GetNumCols() == Input(1)->GetNumCols() && // position dependent and pair scores have the same observation numbers
|
|
Input(2)->GetAsMatrixNumCols() == Input(2)->GetAsMatrixNumRows()))
|
|
{
|
|
LogicError("The Matrix dimension in the CRFNode operation does not match.");
|
|
}
|
|
|
|
SetDims(TensorShape(1), false);
|
|
}
|
|
|
|
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
|
|
{
|
|
Base::CopyTo(nodeP, newName, flags);
|
|
if (flags & CopyNodeFlags::copyNodeValue)
|
|
{
|
|
auto node = dynamic_pointer_cast<CRFNode<ElemType>>(nodeP);
|
|
node->mAlpha = mAlpha;
|
|
node->mBeta = mBeta;
|
|
node->mPostProb = mPostProb;
|
|
|
|
node->mStartLbl = mStartLbl;
|
|
node->mEndLbl = mEndLbl;
|
|
}
|
|
}
|
|
|
|
private:
|
|
Matrix<ElemType> mAlpha; // TODO: m_Alpha etc.
|
|
Matrix<ElemType> mBeta;
|
|
Matrix<ElemType> mPostProb;
|
|
int mStartLbl;
|
|
int mEndLbl;
|
|
};
|
|
|
|
#endif
|
|
|
|
// -----------------------------------------------------------------------
|
|
// LogisticNode (labels, prediction, weight)
|
|
// calculates: -sum(left * log(right) + (1-left)*log(1-right)) (optionally * weight)
|
|
// -----------------------------------------------------------------------
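// Editor's note (not part of the original source): with labels y in {0,1}, predictions p, and optional
// per-sample weights w, the code below evaluates
//     E = -sum_t w_t * ( y_t * log(p_t) + (1 - y_t) * log(1 - p_t) )
// using the identity y*p + (1-y)*(1-p) = 2*y*p + (1-y) - p. In BackpropToNonLooping, m_temp ends up
// holding w_t * (2*y_t - 1) / (y_t*p_t + (1-y_t)*(1-p_t)), and the final Multiply1x1AndWeightedAdd
// applies the factor -1 times the incoming 1x1 gradient, yielding dE/dp_t.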
|
|
|
|
template <class ElemType>
|
|
class LogisticNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>
|
|
{
|
|
typedef ComputationNodeNonLooping<ElemType> Base;
|
|
UsingComputationNodeMembersBoilerplate;
|
|
static const std::wstring TypeName()
|
|
{
|
|
return L"Logistic";
|
|
}
|
|
|
|
public:
|
|
DeclareConstructorFromConfig(LogisticNode);
|
|
LogisticNode(DEVICEID_TYPE deviceId, const wstring& name)
|
|
: Base(deviceId, name)
|
|
{
|
|
}
|
|
|
|
virtual void BackpropToNonLooping(size_t inputIndex) override
|
|
{
|
|
FrameRange fr(Input(0)->GetMBLayout());
|
|
if (inputIndex != 1)
|
|
InvalidArgument("%ls %ls operation cannot compute the gradient for its first inpute.", NodeName().c_str(), OperationName().c_str());
|
|
|
|
// BackpropToRight(m_temp, Input(0)->Value(), Input(2)->Value(), Input(inputIndex)->Gradient(), Gradient(), m_classZeroLabels, m_result);
|
|
// Create vector with 1 for class 1, and -1 for class 0
|
|
m_temp->AssignDifferenceOf(Input(0)->ValueFor(fr), *m_classZeroLabels); // TODO: need a slice for m_classZeroLabels?
|
|
|
|
// Multiply the vector by the Input(2)->Value()
|
|
if (m_inputs.size() == 3) // with weight
|
|
m_temp->AssignElementProductOf(*m_temp, Input(2)->ValueFor(fr)); // TODO: is Input(2) minibatch data? Confirm
|
|
|
|
// divide class by p (class 1) or (1-p) (class 0)
|
|
m_temp->AssignElementDivisionOf(*m_temp, *m_result); // TODO: this is in-place--does this function allow that?
|
|
|
|
auto gradient = Input(inputIndex)->GradientFor(fr);
|
|
Matrix<ElemType>::Multiply1x1AndWeightedAdd(-1.0f, Gradient() /*1x1*/, *m_temp, 1.0f, gradient);
|
|
}
|
|
|
|
virtual bool OutputUsedInComputingInputNodesGradients() const override
|
|
{
|
|
return false;
|
|
}
|
|
|
|
virtual void UpdateFunctionMBSize() override
|
|
{
|
|
m_classZeroLabels->Resize(Input(0)->Value());
|
|
m_result->Resize(Input(0)->Value());
|
|
m_temp->Resize(Input(0)->Value());
|
|
}
|
|
|
|
// -sum(left * log(right) + (1-left)*log(1-right)) (optionally * weight)
|
|
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
|
|
{
|
|
FrameRange fr(Input(0)->GetMBLayout());
|
|
|
|
const Matrix<ElemType>& classOneLabels = Input(0)->ValueFor(fr);
|
|
const Matrix<ElemType>& classOneProbabilities = Input(1)->ValueFor(fr);
|
|
Matrix<ElemType>& classZeroLabels = *m_classZeroLabels;
|
|
|
|
Matrix<ElemType> ones = ConstOnes(classOneLabels.GetNumRows(), classOneLabels.GetNumCols(), classOneLabels.GetDeviceId());
|
|
|
|
// compute the indices for the class 0 indices
|
|
classZeroLabels.AssignDifferenceOf(ones, classOneLabels);
|
|
|
|
/* We're computing result = y*p + (1-y)*(1-p), which simplifies to 2*y*p + (1-y) - p; the weight, if given, is applied at the end. */
|
|
|
|
/* First compute result = y*p */
|
|
m_result->AssignElementProductOf(classOneLabels, classOneProbabilities);
|
|
|
|
// TODO: verify that all these operations on m_result really can do in-place (or use different methods instead)
|
|
/* Now compute result = 2*y*p */
|
|
m_result->AssignProductOf((ElemType) 2.0, *m_result);
|
|
|
|
/* Now compute result = 2*y*p + (1-y) */
|
|
m_result->AssignSumOf(*m_result, classZeroLabels);
|
|
|
|
/* Finally compute result = 2*y*p + (1-y) - p */
|
|
m_result->AssignDifferenceOf(*m_result, classOneProbabilities);
|
|
|
|
// compute the log, resulting in y*log(p) + (1-y)*log(1-p)
|
|
m_temp->AssignLogOf(*m_result);
|
|
|
|
// The error is the negative of the sum of the result
|
|
if (m_inputs.size() == 2)
|
|
Value().AssignSumOfElements(*m_temp);
|
|
else
|
|
Value().AssignInnerProductOf(Input(2)->ValueFor(fr), *m_temp, false);
|
|
Value() *= (-1);
|
|
}
|
|
|
|
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
|
|
{
|
|
if (m_inputs.size() != 2 && m_inputs.size() != 3)
|
|
InvalidArgument("%ls %ls operation requires two or three inputs.", NodeName().c_str(), OperationName().c_str());
|
|
|
|
ValidateBinaryReduce(isFinalValidationPass);
|
|
|
|
/* Note that this is the same as ValidateInferBinaryInputDims, but done for the 3rd child if it exists */
|
|
if (m_inputs.size() == 3)
|
|
{
|
|
auto weights = Input(2);
|
|
auto other = Input(1);
|
|
// borrow any unset dimension on one input from the other input
|
|
weights->ValidateInferInputDimsFrom(other->GetSampleLayout());
|
|
|
|
if (isFinalValidationPass &&
|
|
!(Input(0)->GetSampleMatrixNumRows() == Input(2)->GetSampleMatrixNumRows() &&
|
|
(Input(0)->GetMBLayout() == Input(2)->GetMBLayout() || !Input(0)->HasMBLayout() || !Input(2)->HasMBLayout())))
|
|
{
|
|
LogicError("The Matrix dimensions of the second argument weights the %ls %ls operation do not match.", NodeName().c_str(), OperationName().c_str());
|
|
}
|
|
}
|
|
}
|
|
|
|
// request matrices needed to do node function value evaluation
|
|
virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool)
|
|
{
|
|
Base::RequestMatricesBeforeForwardProp(matrixPool);
|
|
RequestMatrixFromPool(m_classZeroLabels, matrixPool);
|
|
RequestMatrixFromPool(m_result, matrixPool);
|
|
RequestMatrixFromPool(m_temp, matrixPool);
|
|
}
|
|
|
|
// release gradient and temp matrices that no longer needed after all the children's gradients are computed.
|
|
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
|
|
{
|
|
Base::ReleaseMatricesAfterBackprop(matrixPool);
|
|
ReleaseMatrixToPool(m_classZeroLabels, matrixPool);
|
|
ReleaseMatrixToPool(m_result, matrixPool);
|
|
ReleaseMatrixToPool(m_temp, matrixPool);
|
|
}
|
|
|
|
virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const
|
|
{
|
|
Base::CopyTo(nodeP, newName, flags);
|
|
if (flags & CopyNodeFlags::copyNodeValue)
|
|
{
|
|
auto node = dynamic_pointer_cast<LogisticNode<ElemType>>(nodeP);
|
|
*node->m_classZeroLabels = *m_classZeroLabels;
|
|
*node->m_result = *m_result;
|
|
*node->m_temp = *m_temp;
|
|
}
|
|
}
|
|
|
|
private:
|
|
shared_ptr<Matrix<ElemType>> m_classZeroLabels;
|
|
shared_ptr<Matrix<ElemType>> m_result;
|
|
shared_ptr<Matrix<ElemType>> m_temp;
|
|
};
|
|
|
|
template class LogisticNode<float>;
|
|
template class LogisticNode<double>;
|
|
|
|
// -----------------------------------------------------------------------
|
|
// DropoutNode (input) -- perform drop-out
|
|
// Output is scaled such that no post-scaling is necessary.
|
|
// -----------------------------------------------------------------------
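// Editor's note (not part of the original source): this is "inverted" dropout: each element is zeroed
// with probability m_dropoutRate and the surviving elements are pre-scaled by 1 / (1 - m_dropoutRate)
// (see the SetUniformRandomMask call), so the expected activation is unchanged and no extra scaling is
// needed at evaluation time; the same mask is reused in BackpropTo. A minimal scalar sketch, with a
// hypothetical helper name used only for exposition:
//
//   float ApplyInvertedDropout(float x, bool keep, double dropoutRate)
//   {
//       return keep ? (float)(x / (1.0 - dropoutRate)) : 0.0f; // E[output] == x
//   }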
|
|
|
|
template <class ElemType>
|
|
class DropoutNode : public ComputationNode<ElemType>, public NumInputs<1>
|
|
{
|
|
typedef ComputationNode<ElemType> Base;
|
|
UsingComputationNodeMembersBoilerplate;
|
|
static const std::wstring TypeName()
|
|
{
|
|
return L"Dropout";
|
|
}
|
|
|
|
public:
|
|
DeclareConstructorFromConfigWithNumInputs(DropoutNode);
|
|
DropoutNode(DEVICEID_TYPE deviceId, const wstring& name)
|
|
: Base(deviceId, name),
|
|
m_dropoutRate(0)
|
|
{
|
|
m_randomSeed = (unsigned long) CreateUniqId();
|
|
}
|
|
|
|
    virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
    {
        Matrix<ElemType> sliceInput0Grad = Input(0)->GradientFor(fr);
        Matrix<ElemType> sliceOutputGrad = GradientFor(fr);

        if (m_dropoutRate > 0)
            sliceInput0Grad.AddElementProductOf(sliceOutputGrad, DataFor(*m_maskOfDropout, fr));
        else
            sliceInput0Grad += sliceOutputGrad;
    }

    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The DropoutNode does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }

    virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
    {
        // The DropoutNode does not require any of its inputs' values for computing
        // the gradients of its input nodes
        UNREFERENCED_PARAMETER(childIndex);
        return false;
    }

    virtual void UpdateFunctionMBSize() override
    {
        Base::UpdateFunctionMBSize();
        // resize temporaries to their proper size
        if (m_dropoutRate > 0)
            m_maskOfDropout->Resize(Input(0)->Value());
    }

    virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
    {
        Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
        Matrix<ElemType> sliceOutputValue = ValueFor(fr);

        if (m_dropoutRate > 0)
        {
            // determine drop-out mask for this minibatch
            auto sliceMask = DataFor(*m_maskOfDropout, fr);
            sliceMask.SetUniformRandomMask((ElemType) m_dropoutRate, (ElemType) (1.0 / (1.0 - m_dropoutRate)) /*pre-scaled*/, m_randomSeed);
            m_randomSeed += 1073807359; // 1073807359 is a very large prime number to avoid collision with other dropout nodes
            // apply dropout mask
            sliceOutputValue.AssignElementProductOf(sliceMask, sliceInput0Value);
        }
        else
        {
            sliceOutputValue.SetValue(sliceInput0Value);
        }
    }

    virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
    {
        ValidateUnaryMap(isFinalValidationPass);
    }

    // special methods for this node type which ComputationNetwork knows about and calls to pass parameters
    void SetDropoutRate(const double val)
    {
        if (val < 0 || val >= 1)
            LogicError("DropoutRate must be >= 0 and < 1.");
        m_dropoutRate = val;
    }

    void SetRandomSeed(const unsigned long val)
    {
        m_randomSeed = (unsigned long) val;
    }
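    // Hypothetical caller sketch (illustrative only, not part of this file): per the comment above,
    // ComputationNetwork/the training loop is expected to drive these setters before each epoch,
    // along the lines of
    //     dropoutNode->SetDropoutRate(currentDropoutRate);
    //     dropoutNode->SetRandomSeed(dropoutSeedBase + nodeIndex);
    // where currentDropoutRate, dropoutSeedBase and nodeIndex are illustrative names.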

    virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
    {
        Base::CopyTo(nodeP, newName, flags);
        if (flags & CopyNodeFlags::copyNodeValue)
        {
            auto node = dynamic_pointer_cast<DropoutNode<ElemType>>(nodeP);
            node->m_dropoutRate = m_dropoutRate;
            node->m_randomSeed = m_randomSeed;
            node->m_maskOfDropout = m_maskOfDropout;
        }
    }

    // request matrices needed to do node function value evaluation
    virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool)
    {
        Base::RequestMatricesBeforeForwardProp(matrixPool);
        RequestMatrixFromPool(m_maskOfDropout, matrixPool);
    }

    // release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
    virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
    {
        Base::ReleaseMatricesAfterBackprop(matrixPool);
        ReleaseMatrixToPool(m_maskOfDropout, matrixPool);
    }

private:
    double m_dropoutRate;
    unsigned long m_randomSeed;

    shared_ptr<Matrix<ElemType>> m_maskOfDropout;
};

template class DropoutNode<float>;
template class DropoutNode<double>;

// -----------------------------------------------------------------------
// BatchNormalizationNode (...) -- TODO: document inputs
// -----------------------------------------------------------------------

// Implements the batch normalization technique described in:
// Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift [S. Ioffe, C. Szegedy]
// http://arxiv.org/abs/1502.03167
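// Sketch of the computation (standard batch normalization, stated here for reference; the actual
// kernels live in the ConvolutionEngine implementations):
//     y = scale .* (x - mean) / sqrt(variance + epsilon) + bias
// During training, mean and variance are computed from the current minibatch and folded into the
// running estimates runMean/runInvStdDev; in inference mode (m_eval) the running estimates are
// used instead. In spatial mode the statistics, scale and bias are shared per channel; otherwise
// they are kept per activation.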

template <class ElemType>
class BatchNormalizationNode : public ComputationNode<ElemType>, public NumInputs<5>
{
    typedef ComputationNode<ElemType> Base;
    UsingComputationNodeMembersBoilerplate;
    static const std::wstring TypeName()
    {
        return L"BatchNormalization";
    }

public:
    BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name)
        : Base(deviceId, name), m_eval(false), m_spatial(false), m_normTimeConst(0), m_epsilon(0), m_useCntkEngine(true),
          m_mbCount(0), m_imageLayoutKind(ImageLayoutKind::CHW)
    {
    }
    BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name, bool eval, bool spatial, double normalizationTimeConstant, double epsilon,
                           bool useCntkEngine, ImageLayoutKind imageLayoutKind)
        : Base(deviceId, name), m_eval(eval), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_epsilon(epsilon),
          m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_mbCount(0)
    {
    }
    BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp)
        : BatchNormalizationNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"eval"), configp->Get(L"spatial"),
                                 configp->Get(L"normalizationTimeConstant"), configp->Get(L"epsilon"), configp->Get(L"useCntkEngine"),
                                 ImageLayoutKindFrom(configp->Get(L"imageLayout")))
    {
        AttachInputs(configp, this->GetExpectedNumInputs());
    }

    void Save(File& fstream) const override
    {
        Base::Save(fstream);
        fstream << m_version.VerWrittenCur() << m_version.VerReadableCur();

        fstream << m_eval;
        fstream << m_spatial;
        fstream << m_normTimeConst;
        fstream << (int32_t) m_imageLayoutKind;
        fstream << m_mbCount;
        fstream << m_epsilon;
        fstream << m_useCntkEngine;
    }

    void Load(File& fstream, size_t modelVersion) override
    {
        Base::Load(fstream, modelVersion);

        // Read and check version.
        // REVIEW alexeyk: extract version checking so it can be re-used in other places.
        // BUGBUG: We must serialize m_inputLayout.
        int32_t verWritten;
        int32_t verReadable;
        fstream >> verWritten >> verReadable;

        if (verReadable > verWritten)
            RuntimeError("Corrupt model file.");
        if (verWritten < m_version.VerWeCanReadBack())
            RuntimeError("Model is too old.");
        if (verReadable > m_version.VerWrittenCur())
            RuntimeError("Model is too new.");

        fstream >> m_eval;
        fstream >> m_spatial;
        if (verWritten >= 0x00010004)
            fstream >> m_normTimeConst;
        else
        {
            double expAvgFactor;
            fstream >> expAvgFactor;
            UNUSED(expAvgFactor); // Used in previous versions, replaced by m_normTimeConst.
        }
        if (verWritten >= 0x00010002)
        {
            fstream >> m_imageLayoutKind;
            fstream >> m_mbCount;
        }
        if (verWritten >= 0x00010003)
        {
            fstream >> m_epsilon;
            fstream >> m_useCntkEngine;
        }
    }

    void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
    {
        Base::CopyTo(nodeP, newName, flags);
        if (flags & CopyNodeFlags::copyNodeValue)
        {
            auto node = dynamic_pointer_cast<BatchNormalizationNode<ElemType>>(nodeP);
            assert(node != nullptr);

            node->m_eval = m_eval;
            node->m_spatial = m_spatial;
            node->m_normTimeConst = m_normTimeConst;
            node->m_imageLayoutKind = m_imageLayoutKind;
            node->m_mbCount = m_mbCount;
            node->m_epsilon = m_epsilon;
            node->m_useCntkEngine = m_useCntkEngine;
        }
    }

    void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
    {
        if (m_eval)
            LogicError("BatchNormalization does not compute derivatives in inference mode.");

        if (inputIndex == 0) // derivative with respect to the input.
        {
            auto sliceOutputGrad = GradientFor(fr);
            auto sliceInputValue = Input(0)->ValueFor(fr);
            const Matrix<ElemType>& scale = Input(1)->Value();
            const Matrix<ElemType>& bias = Input(2)->Value();

            size_t batchSize = sliceInputValue.GetNumCols();
            m_inT->setN(batchSize);
            assert(m_convEng != nullptr);

            auto sliceInputGrad = Input(0)->GradientFor(fr);
            m_dScale->Resize(scale);
            m_dBias->Resize(bias);
            // Compute all derivatives in one step. Save derivatives with respect to scale and bias in temp matrices.
            m_convEng->BackwardNormalizeBatch(*m_inT, sliceInputValue, sliceOutputGrad, sliceInputGrad, *m_scaleBiasT, scale, m_spatial,
                                              *m_saveMean, *m_saveInvStdDev, *m_dScale, *m_dBias);
        }
        else if (inputIndex == 1) // derivative with respect to the scale
        {
            // Derivative with respect to the scale was precomputed during input derivative computation.
            Matrix<ElemType>& grad = Input(1)->Gradient();
            grad.SetValue(grad.GetNumRows(), grad.GetNumCols(), grad.GetDeviceId(), m_dScale->BufferPointer());
        }
        else if (inputIndex == 2) // derivative with respect to the bias
        {
            // Derivative with respect to the bias was precomputed during input derivative computation.
            Matrix<ElemType>& grad = Input(2)->Gradient();
            grad.SetValue(grad.GetNumRows(), grad.GetNumCols(), grad.GetDeviceId(), m_dBias->BufferPointer());
        }
        // No derivatives with respect to running mean and InvStdDev.
    }

    virtual bool OutputUsedInComputingInputNodesGradients() const override
    {
        // The BatchNormalizationNode does not require its output value for computing
        // the gradients of its input nodes
        return false;
    }

    void ForwardProp(const FrameRange& fr) override
    {
        Matrix<ElemType> sliceInputValue = Input(0)->ValueFor(fr);

        const Matrix<ElemType>& scale = Input(1)->Value();
        const Matrix<ElemType>& bias = Input(2)->Value();
        Matrix<ElemType>& runMean = Input(3)->Value();
        Matrix<ElemType>& runInvStdDev = Input(4)->Value();
        assert(scale.GetNumRows() == bias.GetNumRows());
        assert(scale.GetNumCols() == bias.GetNumCols());
        assert(runMean.GetNumRows() == scale.GetNumRows());
        assert(runMean.GetNumCols() == scale.GetNumCols());
        assert(runMean.GetNumRows() == runInvStdDev.GetNumRows());
        assert(runMean.GetNumCols() == runInvStdDev.GetNumCols());

        Matrix<ElemType> sliceOutputValue = ValueFor(fr);

        size_t batchSize = sliceInputValue.GetNumCols();
        m_inT->setN(batchSize);
        assert(m_convEng != nullptr);
#if NANCHECK
        sliceInputValue.HasNan("BatchNormalization-input");
#endif
        if (m_eval)
            m_convEng->NormalizeBatchInference(*m_inT, sliceInputValue, *m_scaleBiasT, scale, bias, m_spatial, runMean, runInvStdDev, sliceOutputValue);
        else
        {
            double expAvgFactor;
            if (m_normTimeConst > 0)
            {
                // Convert to per-minibatch factor.
                expAvgFactor = 1.0 - exp(-(double) GetMBLayout()->GetActualNumSamples() / m_normTimeConst);
            }
            else
            {
                // REVIEW alexeyk: hack, m_normTimeConst < 0 is used to compute CMA.
                expAvgFactor = (m_normTimeConst < 0) ? (1.0 / (1.0 + m_mbCount)) : 1;
            }
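            // Worked example (illustrative numbers only): with normalizationTimeConstant = 5000 and a
            // minibatch of 256 samples, expAvgFactor = 1 - exp(-256 / 5000) ~= 0.05, i.e. the running
            // estimates move about 5% of the way toward the current minibatch statistics.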

            if (m_saveMean->GetNumElements() != runMean.GetNumElements())
                m_saveMean->Resize(runMean.GetNumRows(), runMean.GetNumCols());
            if (m_saveInvStdDev->GetNumElements() != runMean.GetNumElements())
                m_saveInvStdDev->Resize(runMean.GetNumRows(), runMean.GetNumCols());

            m_convEng->NormalizeBatch(*m_inT, sliceInputValue, *m_scaleBiasT, scale, bias, m_spatial, expAvgFactor, runMean, runInvStdDev,
                                      sliceOutputValue, m_epsilon, *m_saveMean, *m_saveInvStdDev);

            m_mbCount++;
        }
#if NANCHECK
        sliceOutputValue.HasNan("BatchNormalization-output");
        runMean.HasNan("BatchNormalization-runMean");
        runInvStdDev.HasNan("BatchNormalization-runInvStdDev");
        m_saveMean->HasNan("BatchNormalization-saveMean");
        m_saveInvStdDev->HasNan("BatchNormalization-saveInvStdDev");
#endif
    }

    void Validate(bool isFinalValidationPass) override
    {
        Base::Validate(isFinalValidationPass);
        InferMBLayoutFromInputsForStandardCase();

        SetDims(Input(0));

        if (isFinalValidationPass)
        {
            if (m_spatial && m_imageLayoutKind != CHW)
            {
                InvalidArgument(
                    "Batch normalization currently supports only the cuDNN (CHW) data layout. "
                    "Please specify imageLayout=\"cudnn\" in the BatchNormalization node in your NDL/BrainScript "
                    "and make sure your input data layout is CHW.");
            }
            double cudnnMinEps = 1e-5; // CUDNN_BN_MIN_EPSILON
            if (!m_useCntkEngine && m_epsilon < cudnnMinEps)
                fprintf(stderr, "\nWARNING: cuDNN batch normalization requires epsilon >= %e. Epsilon will be reset to that value.\n", cudnnMinEps);

            auto shape = GetSampleLayout();

            if (m_factory == nullptr)
                m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
            if (m_convEng == nullptr)
                m_convEng = m_factory->CreateConvEngine(m_deviceId, m_imageLayoutKind, 0, m_useCntkEngine ? BatchNormImpl::Cntk : BatchNormImpl::CuDnn);
            if (m_spatial)
            {
                auto dims = ImageDimensions(shape, m_imageLayoutKind);
                if (m_inT == nullptr)
                    m_inT = m_factory->CreateTensor(dims.m_width, dims.m_height, dims.m_numChannels, 1);
                if (m_scaleBiasT == nullptr)
                    m_scaleBiasT = m_factory->CreateTensor(1, 1, dims.m_numChannels, 1);
            }
            else
            {
                if (m_inT == nullptr)
                    m_inT = m_factory->CreateTensor(shape.GetNumElements(), 1, 1, 1);
                if (m_scaleBiasT == nullptr)
                    m_scaleBiasT = m_factory->CreateTensor(shape.GetNumElements(), 1, 1, 1);
            }
        }
    }

    void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
    {
        Base::RequestMatricesBeforeForwardProp(matrixPool);
        if (!m_eval)
        {
            RequestMatrixFromPool(m_saveMean, matrixPool);
            RequestMatrixFromPool(m_saveInvStdDev, matrixPool);
        }
    }

    void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
    {
        Base::RequestMatricesBeforeBackprop(matrixPool);
        if (!m_eval)
        {
            RequestMatrixFromPool(m_dScale, matrixPool);
            RequestMatrixFromPool(m_dBias, matrixPool);
        }
    }

    void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
    {
        Base::ReleaseMatricesAfterBackprop(matrixPool);
        if (!m_eval)
        {
            ReleaseMatrixToPool(m_saveMean, matrixPool);
            ReleaseMatrixToPool(m_saveInvStdDev, matrixPool);
            ReleaseMatrixToPool(m_dScale, matrixPool);
            ReleaseMatrixToPool(m_dBias, matrixPool);
        }
    }

    void SetEvalMode(bool bnEvalMode)
    {
        m_eval = bnEvalMode;
    }

private:
    struct VersionInfo
    {
        //int32_t VerWrittenCur() const { return 0x00010001; } // Initial
        //int32_t VerWrittenCur() const { return 0x00010002; } // Added m_imageLayoutKind and m_mbCount
        //int32_t VerWrittenCur() const { return 0x00010003; } // Added m_epsilon and m_useCntkEngine
        int32_t VerWrittenCur() const { return 0x00010004; }   // Added m_normTimeConst
        int32_t VerReadableCur() const { return 0x00010004; }
        int32_t VerWeCanReadBack() const { return 0x00010001; }
    };
    VersionInfo m_version;

private:
    // Determines whether to use training or inference (evaluation) mode.
    bool m_eval;
    // Determines whether to use per-activation (used after non-convolutional layers like fully connected)
    // or spatial (used after convolutional layers).
    bool m_spatial;
    // Time constant for running mean and variance.
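    // Interpretation (as used in ForwardProp): > 0 selects an exponential running average with
    // per-minibatch factor 1 - exp(-#samples / m_normTimeConst); 0 makes the running estimates
    // track only the most recent minibatch (factor 1); < 0 switches to a cumulative moving
    // average over all minibatches seen so far.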
    double m_normTimeConst;
    // Epsilon used to compute the inverse standard deviation.
    double m_epsilon;
    // Whether to use the CNTK or the cuDNN BN implementation.
    bool m_useCntkEngine;
    // Layout (e.g. CHW).
    ImageLayoutKind m_imageLayoutKind;
    // Minibatch count, used to compute the cumulative moving average.
    size_t m_mbCount;

    // Stores mean values pre-computed on the forward pass that are used in gradient computation.
    shared_ptr<Matrix<ElemType>> m_saveMean;
    // Stores InvStdDev values pre-computed on the forward pass that are used in gradient computation.
    shared_ptr<Matrix<ElemType>> m_saveInvStdDev;
    // Stores scale derivatives.
    shared_ptr<Matrix<ElemType>> m_dScale;
    // Stores bias derivatives.
    shared_ptr<Matrix<ElemType>> m_dBias;

    std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
    std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
    std::unique_ptr<ConvolutionTensor4D> m_inT;
    std::unique_ptr<ConvolutionTensor4D> m_scaleBiasT;
};

template class BatchNormalizationNode<float>;
template class BatchNormalizationNode<double>;

} } }