fixed SGD logging to not output the same criterion multiple times

Frank Seide 2015-11-30 19:07:41 -08:00
Parent efafd7c795
Commit ea87db0cad
5 changed files with 38 additions and 69 deletions

View file

@@ -422,6 +422,7 @@ void BatchLUSequenceReader<ElemType>::InitFromConfig(const ConfigRecordType & re
const LabelInfo& labelIn = m_labelInfo[labelInfoIn];
const LabelInfo& labelOut = m_labelInfo[labelInfoOut];
fprintf(stderr, "BatchLUSequenceReader: Input file is %ls\n", m_file.c_str());
m_parser.ParseInit(m_file.c_str(), labelIn.dim, labelOut.dim, labelIn.beginSequence, labelIn.endSequence, labelOut.beginSequence, labelOut.endSequence, mUnkStr);
mRequestedNumParallelSequences = readerConfig(L"nbruttsineachrecurrentiter", (size_t)1);

View file

@@ -414,29 +414,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base(deviceId, name)
{ }
//void ComputeInputPartialMap(const size_t inputIndex)
//{
// if (inputIndex > 1)
// InvalidArgument("LookupTable operation only takes two inputs.");
//
// //DEVICEID_TYPE input1DeviceId = Inputs(1)->FunctionValues().GetDeviceId();
// //DEVICEID_TYPE input0DeviceId = Inputs(0)->FunctionValues().GetDeviceId();
// //Inputs(1)->FunctionValues().TransferFromDeviceToDevice(input1DeviceId, input0DeviceId);
//
// if (inputIndex == 0) //left derivative
// {
// ComputeInputPartialLeft(Inputs(1)->FunctionValues(), Inputs(0)->GradientValues(), GradientValues());
// }
// else //right derivative
// {
// ComputeInputPartialRight(Inputs(0)->FunctionValues(), Inputs(1)->GradientValues(), GradientValues());
// }
// //Inputs(1)->FunctionValues().TransferFromDeviceToDevice(input0DeviceId, input1DeviceId);
//}
virtual void /*ComputationNode::*/ComputeInputPartial(const size_t inputIndex, const FrameRange & t) override
{
//if (t.IsAllFrames()) { ComputeInputPartialMap(inputIndex); return; } // TODO: remove these one by one
if (inputIndex == 0) // left derivative (embedding matrix)
{
// This is a reduction operation, hence we need to mask out gaps.
@@ -501,18 +480,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto input1Reshaped = input1.Reshaped(rows1 / wordsInEachSample, cols1 * wordsInEachSample);
//DEVICEID_TYPE input1DeviceId = input1.GetDeviceId();
//DEVICEID_TYPE input0DeviceId = input0.GetDeviceId();
//input1.TransferFromDeviceToDevice(input1DeviceId, input0DeviceId);
auto functionValuesReshaped = functionValues.Reshaped(input0.GetNumRows(), input1Reshaped.GetNumCols());
functionValuesReshaped.AssignProductOf(input0, false, input1Reshaped, false);
//size_t rows = functionValues.GetNumRows();
//functionValues.Reshape(rows * wordsInEachSample, cols1);
//input1.TransferFromDeviceToDevice(input0DeviceId, input1DeviceId);
//input1.Reshape(rows1, cols1);
}
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override

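Aside on the LookupTableNode hunk above: because the Reshaped calls operate on column-major storage, turning a column of wordsInEachSample stacked one-hot vectors into one column per word costs nothing, and the whole lookup collapses into a single matrix product E * X. A minimal self-contained sketch of that equivalence (illustrative names and values, not CNTK code):

#include <cstdio>
#include <vector>

int main()
{
    const size_t vocab = 4, embedDim = 2, wordsPerSample = 2, samples = 1;

    // embedding matrix E: embedDim x vocab, column-major (column k = embedding of word k)
    std::vector<double> E = { 0.1, 0.2,   0.3, 0.4,   0.5, 0.6,   0.7, 0.8 };

    // one input column stacking two one-hot word vectors (words 2 and 0);
    // reinterpreting it as (vocab x wordsPerSample*samples) is a no-op on the
    // column-major buffer -- that is what the Reshaped calls above exploit
    std::vector<double> X = { 0, 0, 1, 0,   1, 0, 0, 0 };

    // Y = E * X, giving one embedding column per word
    const size_t cols = wordsPerSample * samples;
    std::vector<double> Y(embedDim * cols, 0.0);
    for (size_t c = 0; c < cols; c++)
        for (size_t k = 0; k < vocab; k++)
            for (size_t r = 0; r < embedDim; r++)
                Y[c * embedDim + r] += E[k * embedDim + r] * X[c * vocab + k];

    for (size_t c = 0; c < cols; c++)
        printf("word %zu -> (%g, %g)\n", c, Y[c * embedDim], Y[c * embedDim + 1]);
    return 0; // prints the embedding columns of words 2 and 0
}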
View file

@@ -42,7 +42,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_timeStep = 1;
CreateMatrixIfNull(m_functionValues);
SetDims(row_size, col_size);
//m_delayedActivation.Resize(row_size, col_size); // TODO: relevance of col_size? Why not timeStep?
m_isHistoryCarryOverManagedExternally = false; // used for PairNetworkNode/PastValueNode combination
}
protected:
@@ -61,10 +60,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_timeStep = (int)timeStep;
m_functionValues->SetValue(m_initialActivationValue);
//m_delayedActivation.SetValue(m_initialActivationValue);
//m_gradientValues->Resize(row_size, col_size);
//m_gradientValues->SetValue(0.0f);
}
DelayedValueNodeBase(const ScriptableObjects::IConfigRecordPtr configp) :
DelayedValueNodeBase(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"defaultHiddenActivation"), configp->Get(L"rows"), configp->Get(L"cols"), configp->Get(L"timeStep"))
@@ -303,9 +298,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
out.SetValue(inp);
}
//MaskMissingValuesColumnsToZero(t); // fix gaps if any --TODO: make this take a FrameRange
// TODO: why is masking needed here? We should never carry over data from those into valid regions, right?
}
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
@@ -314,7 +306,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
// this function is only used for PairNetworkNode (on PastValueNode)
// BUGBUG: Need to transfer the layout as well. PairNetworkNod will go away.
// BUGBUG: Need to transfer the layout as well. PairNetworkNode will go away.
bool GetHistory(Matrix<ElemType>& hist, bool)
{
DEVICEID_TYPE device = hist.GetDeviceId();
@@ -375,7 +367,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
using Base::m_initialActivationValue; using Base::m_delayedActivation; using Base::m_timeStep; \
using Base::m_pShiftedMBLayout; using Base::m_isHistoryCarryOverManagedExternally;
// =======================================================================
// -----------------------------------------------------------------------
// PastValueNode (input) -- delay node
// TODO: Can this just be a typedef?

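For orientation on the DelayedValueNodeBase hunks above: the node implements a time delay, and the masking TODO exists because in a minibatch of parallel sequences the gap columns must not leak into valid frames. A single-sequence sketch of the core semantics (hypothetical helper, not the CNTK implementation):

#include <cstdio>
#include <vector>

// output[t] = input[t - timeStep]; frames that would reach before the
// start of the sequence are filled with the initial activation value
std::vector<double> PastValue(const std::vector<double>& input,
                              int timeStep, double initialActivation)
{
    std::vector<double> output(input.size());
    for (int t = 0; t < (int)input.size(); t++)
        output[t] = (t >= timeStep) ? input[t - timeStep] : initialActivation;
    return output;
}

int main()
{
    auto out = PastValue({ 1, 2, 3, 4 }, /*timeStep=*/1, /*initialActivation=*/0.1);
    for (double v : out)
        printf("%g ", v); // prints: 0.1 1 2 3
    printf("\n");
}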
View file

@@ -827,7 +827,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
timer.Stop();
double epochTime = timer.ElapsedSeconds();
if (m_useEvalCriterionControlLR)
if (m_useEvalCriterionControlLR && epochEvalErrors.size() > 0)
{
lrControlCriterion = epochEvalErrors[0];
}
@@ -840,12 +840,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
"Finished Epoch[%2d of %d]: [Training Set] TrainLossPerSample = %.8g; ",
i + 1, (int) m_maxEpochs, epochCriterion);
if (epochEvalErrors.size() == 1)
if (epochEvalErrors.size() == 0) // no eval criterion, only train criterion itself
{
fprintf(stderr,
"EvalErrPerSample = %.8g; Ave LearnRatePerSample = %.10g; EpochTime=%.8g\n",
"Ave LearnRatePerSample = %.6g; Epoch Time=%.6g\n",
learnRatePerSample, epochTime);
m_lastFinishedEpochEvalErr = epochCriterion;
}
else if (epochEvalErrors.size() == 1)
{
fprintf(stderr,
"EvalErrPerSample = %.8g; Ave LearnRatePerSample = %.6g; Epoch Time=%.6g\n",
epochEvalErrors[0], learnRatePerSample, epochTime);
m_lastFinishedEpochEvalErr = epochEvalErrors[0];
m_lastFinishedEpochEvalErr = epochEvalErrors.back();
}
else
{
@@ -853,13 +860,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (size_t j = 0; j < epochEvalErrors.size(); j++)
{
fprintf(stderr, "[%lu]=%.8g; ", j, epochEvalErrors[j]);
m_lastFinishedEpochEvalErr = epochEvalErrors[j];
}
m_lastFinishedEpochEvalErr = epochEvalErrors.back();
fprintf(stderr, "Ave LearnRatePerSample = %.10g; Epoch Time=%.8g\n",
fprintf(stderr, "Ave LearnRatePerSample = %.6g; Epoch Time=%.6g\n",
learnRatePerSample, epochTime);
// TODO: why these extra log messages here and not for 1 eval criterion?
fprintf(stderr, "Finished Epoch[%2d of %d]: Criterion Node [%ls] Per Sample = %.8g\n",
i + 1, (int) m_maxEpochs, criterionNodes[0]->NodeName().c_str(), epochCriterion);
@@ -876,22 +883,26 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
SimpleEvaluator<ElemType> evalforvalidation(net);
vector<wstring> cvSetTrainAndEvalNodes;
cvSetTrainAndEvalNodes.push_back(criterionNodes[0]->NodeName());
cvSetTrainAndEvalNodes.push_back(evaluationNodes[0]->NodeName());
if (criterionNodes.size() > 0)
cvSetTrainAndEvalNodes.push_back(criterionNodes[0]->NodeName());
if (evaluationNodes.size() > 0)
cvSetTrainAndEvalNodes.push_back(evaluationNodes[0]->NodeName());
vector<double> vScore = evalforvalidation.Evaluate(validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]);
fprintf(stderr, "Finished Epoch[%2d of %d]: [Validation Set] TrainLossPerSample = %.8g; EvalErrPerSample = %.8g\n",
i + 1, (int) m_maxEpochs, vScore[0], vScore[1]);
fprintf(stderr, "Finished Epoch[%2d of %d]: [Validation Set] TrainLossPerSample = %.8g", i + 1, (int) m_maxEpochs, vScore[0]);
if (vScore.size() > 1)
fprintf(stderr, "; EvalErrPerSample = %.8g", vScore[1]);
fprintf(stderr, "\n");
if (m_useCVSetControlLRIfCVExists)
{
if (m_useEvalCriterionControlLR)
if (m_useEvalCriterionControlLR && vScore.size() > 1)
lrControlCriterion = vScore[1];
else
lrControlCriterion = vScore[0]; //the first one is the training criterion.
}
lrControlCriterion = vScore[0]; // the first one is the training criterion
}
}
}
// broadcast epochCriterion to make sure each processor will have the same learning rate schedule
if ((m_parallelizationMethod == ParallelizationMethod::ModelAveragingSGD) && (g_mpi->NumNodesInUse() > 1))
@@ -906,8 +917,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else
{
avgCriterion = ((epochsSinceLastLearnRateAdjust - 1 - epochsNotCountedInAvgCriterion) *
avgCriterion + lrControlCriterion) /
(epochsSinceLastLearnRateAdjust - epochsNotCountedInAvgCriterion);
avgCriterion + lrControlCriterion) /
(epochsSinceLastLearnRateAdjust - epochsNotCountedInAvgCriterion);
}
if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch &&

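The substance of this SGD.cpp change, condensed into a free function for readability (a sketch, not the actual member code, which lives inline in the epoch loop): the epoch summary now branches on the number of eval criteria, prints each criterion exactly once, and takes m_lastFinishedEpochEvalErr from the last eval error only.

#include <cstdio>
#include <vector>

void LogEpochSummary(int epoch, int maxEpochs, double trainLoss,
                     const std::vector<double>& evalErrors,
                     double learnRate, double epochTime,
                     double& lastFinishedEpochEvalErr)
{
    fprintf(stderr, "Finished Epoch[%2d of %d]: [Training Set] TrainLossPerSample = %.8g; ",
            epoch, maxEpochs, trainLoss);
    if (evalErrors.empty()) // no eval criterion: fall back to the train criterion
        lastFinishedEpochEvalErr = trainLoss;
    else if (evalErrors.size() == 1)
    {
        fprintf(stderr, "EvalErrPerSample = %.8g; ", evalErrors[0]);
        lastFinishedEpochEvalErr = evalErrors.back();
    }
    else // several eval criteria: print each one once, indexed
    {
        for (size_t j = 0; j < evalErrors.size(); j++)
            fprintf(stderr, "[%zu]=%.8g; ", j, evalErrors[j]);
        lastFinishedEpochEvalErr = evalErrors.back();
    }
    fprintf(stderr, "Ave LearnRatePerSample = %.6g; Epoch Time=%.6g\n",
            learnRate, epochTime);
}

int main()
{
    double last = 0;
    LogEpochSummary(1, 10, 0.9, { 0.25, 0.31 }, 0.008, 12.5, last); // last == 0.31
}

Separately, the re-indented avgCriterion update at the end of this file is a running mean of the LR-control criterion: with n = epochsSinceLastLearnRateAdjust and k = epochsNotCountedInAvgCriterion, it computes

    avgCriterion_n = ((n - 1 - k) * avgCriterion_(n-1) + c_n) / (n - k)

which unrolls to the plain average of the criteria from epoch k+1 through n.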
View file

@@ -24,10 +24,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
class SimpleEvaluator
{
protected:
public:
SimpleEvaluator(ComputationNetworkPtr net, const size_t numMBsToShowResult = 100, const int traceLevel = 0)
: m_net(net), m_numMBsToShowResult(numMBsToShowResult), m_traceLevel(traceLevel)
{
@@ -68,12 +65,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
//initialize eval results
// initialize eval results
std::vector<double> evalResults;
for (int i = 0; i < evalNodes.size(); i++)
evalResults.push_back((double)0);
//prepare features and labels
// prepare features and labels
auto & featureNodes = m_net->FeatureNodes();
auto & labelNodes = m_net->LabelNodes();
@@ -83,7 +80,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (size_t i = 0; i < labelNodes.size(); i++)
inputMatrices[labelNodes[i]->NodeName()] = &dynamic_pointer_cast<ComputationNode<ElemType>>(labelNodes[i])->FunctionValues();
//evaluate through minibatches
// evaluate through minibatches
size_t totalEpochSamples = 0;
size_t numMBsRun = 0;
size_t actualMBSize = 0;
@@ -102,14 +99,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNetwork::UpdateEvalTimeStamps(featureNodes);
ComputationNetwork::UpdateEvalTimeStamps(labelNodes);
//for now since we share the same label masking flag we call this on one node only
//Later, when we apply different labels on different nodes
//we need to add code to call this function multiple times, one for each criteria node
// for now, since we share the same label masking flag, we call this on one node only;
// later, when we apply different labels on different nodes,
// we need to add code to call this function multiple times, once for each criterion node
size_t numSamplesWithLabel = m_net->GetNumSamplesWithLabel(actualMBSize);
for (int i = 0; i < evalNodes.size(); i++)
{
m_net->Evaluate(evalNodes[i]);
evalResults[i] += (double)evalNodes[i]->Get00Element(); //criterionNode should be a scalar
evalResults[i] += (double)evalNodes[i]->Get00Element(); // criterionNode should be a scalar
}
totalEpochSamples += numSamplesWithLabel;
@@ -132,8 +129,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
/// call DataEnd to check if end of sentence is reached
/// datareader will do its necessary/specific process for sentence ending
// call DataEnd to check whether the end of a sentence has been reached;
// the data reader will do its necessary/reader-specific processing for the sentence ending
dataReader->DataEnd(endDataSentence);
}
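The evaluation loop in this file reduces to the following accumulation pattern (numbers made up for illustration; the real code pulls minibatches from the reader and calls m_net->Evaluate on each node): every eval node yields a scalar criterion summed over the minibatch, and dividing the running sums by the number of labeled samples gives the per-sample errors reported at the end.

#include <cstdio>
#include <vector>

int main()
{
    // hypothetical per-minibatch scalar criteria for two eval nodes
    std::vector<std::vector<double>> mbCriteria = { { 12.0, 3.0 },
                                                    { 10.0, 2.5 } };
    std::vector<size_t> mbSamples = { 64, 64 }; // labeled samples per minibatch

    std::vector<double> evalResults(2, 0.0);
    size_t totalEpochSamples = 0;
    for (size_t mb = 0; mb < mbCriteria.size(); mb++)
    {
        for (size_t i = 0; i < evalResults.size(); i++)
            evalResults[i] += mbCriteria[mb][i]; // Get00Element(): criterion is a scalar
        totalEpochSamples += mbSamples[mb];
    }
    for (size_t i = 0; i < evalResults.size(); i++)
        printf("EvalErrPerSample[%zu] = %g\n", i, evalResults[i] / totalEpochSamples);
}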