Refactored criterion reporting in SGD and SimpleEvaluator to carry a per-criterion-node sample count, in preparation for supporting criterion nodes with different sequence lengths for sequence-to-sequence (seq2seq) models.

Author: Frank Seide, 2016-04-12 23:09:08 -07:00
Parent: ac6afa4812
Commit: 1224dfc9bc
10 changed files with 265 additions and 210 deletions
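The point of carrying a (sum, count) pair per criterion node: in a sequence-to-sequence setup, a criterion computed over target tokens and one computed over source tokens see different numbers of samples in the same minibatch, so they can no longer share a single sample count when averaging. A minimal conceptual sketch (hypothetical standalone type and illustrative numbers; the real types are EpochCriterion and CriterionAccumulator introduced below):

    #include <cstdio>
    #include <cstddef>

    // Hypothetical stand-in for the (numerator, denominator) pair carried per criterion node.
    struct Crit
    {
        double sum = 0;    // accumulated criterion value
        size_t count = 0;  // number of samples that contributed to 'sum'
        void   Add(double value, size_t numSamples) { sum += value; count += numSamples; }
        double Average() const { return count ? sum / count : 0.0; }
    };

    int main()
    {
        Crit targetCE, sourceAux;                   // two criterion nodes with different sequence lengths
        targetCE.Add(231.0, /*target tokens*/ 100);
        sourceAux.Add(57.5, /*source tokens*/ 120);
        // Each criterion is normalized by its own sample count, not by one shared count.
        printf("CE/sample = %.4f, aux/sample = %.4f\n", targetCE.Average(), sourceAux.Average());
    }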

View File

@ -17,6 +17,7 @@
#include "Config.h"
#include "SimpleEvaluator.h"
#include "SimpleOutputWriter.h"
#include "Criterion.h"
#include "BestGpu.h"
#include "ScriptableObjects.h"
#include "BrainScriptEvaluator.h"
@ -121,8 +122,8 @@ void DoCrossValidate(const ConfigParameters& config)
int traceLevel = config(L"traceLevel", "0");
size_t numMBsToShowResult = config(L"numMBsToShowResult", "100");
size_t maxSamplesInRAM = config(L"maxSamplesInRAM", (size_t)SIZE_MAX);
size_t numSubminiBatches = config(L"numSubminibatches", (size_t)1);
size_t maxSamplesInRAM = config(L"maxSamplesInRAM", (size_t)SIZE_MAX);
size_t numSubminiBatches = config(L"numSubminibatches", (size_t)1);
ConfigArray evalNodeNames = config(L"evalNodeNames", "");
vector<wstring> evalNodeNamesVector;
@ -131,7 +132,7 @@ void DoCrossValidate(const ConfigParameters& config)
evalNodeNamesVector.push_back(evalNodeNames[i]);
}
std::vector<std::vector<double>> cvErrorResults;
std::vector<std::vector<EpochCriterion>> cvErrorResults;
std::vector<std::wstring> cvModels;
DataReader cvDataReader(readerConfig);
@ -143,7 +144,7 @@ void DoCrossValidate(const ConfigParameters& config)
if (!fexists(cvModelPath))
{
fprintf(stderr, "model %ls does not exist.\n", cvModelPath.c_str());
fprintf(stderr, "Model %ls does not exist.\n", cvModelPath.c_str());
if (finalModelEvaluated || !fexists(modelPath))
continue; // file missing
else
@ -158,7 +159,7 @@ void DoCrossValidate(const ConfigParameters& config)
SimpleEvaluator<ElemType> eval(net, MPIWrapper::GetInstance(), numMBsToShowResult, traceLevel, maxSamplesInRAM, numSubminiBatches);
fprintf(stderr, "model %ls --> \n", cvModelPath.c_str());
fprintf(stderr, "Model %ls --> \n", cvModelPath.c_str());
auto evalErrors = eval.Evaluate(&cvDataReader, evalNodeNamesVector, mbSize[0], epochSize);
cvErrorResults.push_back(evalErrors);
@ -167,16 +168,14 @@ void DoCrossValidate(const ConfigParameters& config)
// find best model
if (cvErrorResults.size() == 0)
{
LogicError("No model is evaluated.");
}
std::vector<double> minErrors;
std::vector<int> minErrIds;
std::vector<double> evalErrors = cvErrorResults[0];
vector<double> minErrors;
vector<int> minErrIds;
vector<EpochCriterion> evalErrors = cvErrorResults[0];
for (int i = 0; i < evalErrors.size(); ++i)
{
minErrors.push_back(evalErrors[i]);
minErrors.push_back(evalErrors[i].Average());
minErrIds.push_back(0);
}
@ -185,9 +184,9 @@ void DoCrossValidate(const ConfigParameters& config)
evalErrors = cvErrorResults[i];
for (int j = 0; j < evalErrors.size(); j++)
{
if (evalErrors[j] < minErrors[j])
if (evalErrors[j].Average() < minErrors[j])
{
minErrors[j] = evalErrors[j];
minErrors[j] = evalErrors[j].Average();
minErrIds[j] = i;
}
}
@ -196,9 +195,7 @@ void DoCrossValidate(const ConfigParameters& config)
fprintf(stderr, "Best models:\n");
fprintf(stderr, "------------\n");
for (int i = 0; i < minErrors.size(); ++i)
{
fprintf(stderr, "Based on Err[%d]: Best model = %ls with min err %.8g\n", i, cvModels[minErrIds[i]].c_str(), minErrors[i]);
}
}
template void DoCrossValidate<float>(const ConfigParameters& config);
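The model-selection step above now compares models by the per-sample average of each accumulated criterion. A compact, self-contained sketch of that logic (a plain std::pair stands in for EpochCriterion; function and variable names are hypothetical):

    #include <vector>
    #include <utility>

    using Crit = std::pair<double, size_t>; // (summed criterion, sample count)
    static double Average(const Crit& c) { return c.second ? c.first / c.second : 0.0; }

    // For each criterion index j, pick the model index i with the smallest per-sample error.
    static std::vector<int> FindBestModels(const std::vector<std::vector<Crit>>& cvErrorResults)
    {
        std::vector<double> minErrors;
        std::vector<int>    minErrIds;
        for (const auto& e : cvErrorResults[0]) { minErrors.push_back(Average(e)); minErrIds.push_back(0); }
        for (int i = 1; i < (int)cvErrorResults.size(); i++)
            for (int j = 0; j < (int)cvErrorResults[i].size(); j++)
                if (Average(cvErrorResults[i][j]) < minErrors[j])
                {
                    minErrors[j] = Average(cvErrorResults[i][j]);
                    minErrIds[j] = i;
                }
        return minErrIds;
    }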

View File

@ -384,9 +384,9 @@ RNNs =
# It returns a dictionary with two members: h and c. prevState must be in the same format.
// TODO: Standardize on one parameter order. Is first dimension the output (like in math, strcpy, or functional style) or the input (listing inputs first)?
// If we change this, we'd need to fix the LSTM end-to-end test.
LSTMP (inputDim1, outputDim, cellDim, x, prevState, enableSelfStabilization=false) =
LSTMP (inputDim, outputDim, cellDim, x, prevState, enableSelfStabilization=false) =
[
inputDim = x.dim # get dimension from 'x' (if this works, we can remove the inputDim1 parameter)
#inputDim = x.dim # get dimension from 'x' (if this works, we can remove the inputDim1 parameter)
_privateInnards = [ // encapsulate the privateInnards workings
dh = prevState.h // previous values
dc = prevState.c

View File

@ -708,8 +708,9 @@ template <>
shared_ptr<Object> MakeRuntimeObject<ComputationNodeBase>(const IConfigRecordPtr configp)
{
let node = NewComputationNodeFromConfig(configp);
if (!node->Is<IRecurrentNode>())
node->Validate(/*isFinalValidationPass*/false); // do an initial validation, so that we have access to dimensions
// temporarily disabling this, as it caused a test to fail:
//if (!node->Is<IRecurrentNode>())
// node->Validate(/*isFinalValidationPass*/false); // do an initial validation, so that we have access to dimensions
return node;
}

Source/SGDLib/Criterion.h (new file, 81 additions)
View File

@ -0,0 +1,81 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// Criterion.h -- helper classes for accumulating criteria
#pragma once
#include "Basics.h"
#include "Matrix.h"
#include <memory>  // for dynamic_pointer_cast
#include <utility> // for std::pair
#include <cmath>   // for std::isnan()
#include <limits>  // for std::numeric_limits
namespace Microsoft { namespace MSR { namespace CNTK {
// helper class for passing accumulated epoch-level criteria around, with their counts
struct EpochCriterion : public std::pair<double, size_t>
{
explicit EpochCriterion(double numer = 0.0, size_t denom = 0) : std::pair<double, size_t>(numer, denom) { }
EpochCriterion(const std::pair<double, size_t>& other) : std::pair<double, size_t>(other) { }
static EpochCriterion Infinity() { return EpochCriterion(std::numeric_limits<double>::infinity()); }
bool IsInfinity() const { return first == std::numeric_limits<double>::infinity(); }
// a few operations that are needed
double Average() const { return second > 0 ? first / second : 0.0; } // compute the epoch-average
// Note: for now using a longer complex name that is find-replaceable
bool IsNan() const { return std::isnan(first); }
EpochCriterion operator-(const EpochCriterion& other) const { return EpochCriterion(first - other.first, second - other.second); }
void operator+=(const EpochCriterion& other) { first += other.first; second += other.second; }
};
// We accumulate criteria in this struct.
// Criteria are accumulated together with their counts (counts depend on sequence lengths, and different criteria may have different sequence lengths).
template <class ElemType>
struct CriterionAccumulator
{
// constructor
CriterionAccumulator(size_t num, DEVICEID_TYPE deviceId) :
m_numerators(1, num, deviceId)
{
m_numerators.SetValue(0);
m_denominators.assign(num, 0);
}
// 'i' is the index of the element we add into (multiple eval criteria share the same matrix object)
void Accumulate(const std::vector<ComputationNodeBasePtr>& nodes, size_t i, size_t legacyNumSamples)
{
const auto& node = nodes[i]; // multiple nodes are managed by this struct
// Note: A future change will be that criterion nodes emit criteria per frame, but aggregated.
// In that case, the denominator will be accumulated from their MBLayout.
// Also, the numerator will have masking and an implicit reduction.
Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value(),
0, 0, m_numerators, 0, i);
m_denominators[i] += GetNumSamples(nodes[i], legacyNumSamples);
}
// retrieve an accumulated result as a pair (numerator, denominator)
EpochCriterion GetCriterion(size_t i) const
{
return EpochCriterion(m_numerators(0, i), m_denominators[i]);
}
// retrieve a result from a node
static EpochCriterion GetCriterion(const ComputationNodeBasePtr& node, size_t legacyNumSamples)
{
auto numSamples = GetNumSamples(node, legacyNumSamples);
return numSamples > 0 ? EpochCriterion(node->Get00Element(), numSamples) : EpochCriterion(0); // (avoid GPU access if 0 samples)
}
private:
// get the number of samples
static size_t GetNumSamples(const ComputationNodeBasePtr& node, size_t legacyNumSamples)
{
if (node->HasMBLayout())
return node->GetMBLayout()->GetActualNumSamples();
else
return legacyNumSamples;
}
private:
Matrix<ElemType> m_numerators; // [1 x N]
vector<size_t> m_denominators; // [N]
};
}}}
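For orientation, a short usage sketch of the header above: accumulate one criterion over minibatches of different sizes and read off the epoch average at the end (illustrative numbers; assumes Criterion.h and its dependencies are on the include path):

    #include <cstdio>
    #include "Criterion.h"
    using namespace Microsoft::MSR::CNTK;

    static void Example()
    {
        EpochCriterion ce(0);              // running (numerator, denominator)
        ce += EpochCriterion(123.4, 100);  // minibatch 1: summed loss over 100 samples
        ce += EpochCriterion( 98.7,  80);  // minibatch 2: summed loss over  80 samples
        printf("epoch loss/sample = %.8g over %d samples\n", ce.Average(), (int)ce.second);
    }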

View File

@ -6,12 +6,12 @@ struct DistGradHeader
{
public:
size_t numSamples;
size_t numSamplesWithLabel;
size_t numSamplesWithLabel; // this is the denominator for 'criterion'
double criterion;
// variable-size array
int numEvalNode;
double evalErrors[1];
pair<double,size_t> evalErrors[1];
static DistGradHeader* Create(int numEvalNode)
{
@ -41,7 +41,8 @@ public:
criterion += other->criterion;
for (int i = 0; i < numEvalNode; i++)
{
evalErrors[i] += other->evalErrors[i];
evalErrors[i].first += other->evalErrors[i].first; // numer
evalErrors[i].second += other->evalErrors[i].second; // denom
}
}
}
@ -58,7 +59,8 @@ public:
criterion = 0;
for (int i = 0; i < numEvalNode; i++)
{
evalErrors[i] = 0;
evalErrors[i].first = 0;
evalErrors[i].second = 0;
}
}
@ -77,17 +79,19 @@ public:
}
private:
static size_t DistGradHeaderSize(size_t nEvalNode)
static size_t DistGradHeaderSize(size_t nEvalNodes)
{
return sizeof(DistGradHeader) + (sizeof(double) * (nEvalNode - 1));
// BUGBUG: Should be sizeof(evalErrors[0]), but the compiler won't let me. This is only correct because evalErrors has 1 element.
return sizeof(DistGradHeader) + (sizeof(decltype(evalErrors)) * (nEvalNodes - 1));
}
// Disallow construction and destruction since this type contains a variable sized array member
// and hence must be constructed through the create and destroy functions
DistGradHeader() = delete;
DistGradHeader() = delete;
~DistGradHeader() = delete;
// Disallow copy and move construction/assignment
DISABLE_COPY_AND_MOVE(DistGradHeader);
};
} } }
}}}
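Since DistGradHeader ends in a one-element array that really holds numEvalNode entries, the allocation size has to add room for the remaining entries; the BUGBUG above is only about how to spell the element size. A self-contained sketch of the same variable-length-header pattern (hypothetical struct, not the CNTK type; like the original it treats the pair as plain data):

    #include <cstdlib>
    #include <cstring>
    #include <utility>

    struct Header
    {
        int numEvalNode;
        std::pair<double, size_t> evalErrors[1]; // numEvalNode entries actually live in the allocation

        static size_t SizeFor(int n)
        {
            // sizeof(Header) already covers one evalErrors element; add space for the remaining n-1
            return sizeof(Header) + sizeof(std::pair<double, size_t>) * (n - 1);
        }
        static Header* Create(int n)
        {
            Header* h = static_cast<Header*>(malloc(SizeFor(n)));
            memset(h, 0, SizeFor(n)); // zero numerators and denominators, as Clear() does above
            h->numEvalNode = n;
            return h;
        }
        static void Destroy(Header* h) { free(h); }
    };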

View File

@ -255,17 +255,18 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
net->GetDeviceId()));
}
double epochCriterion, avgCriterion, prevCriterion, lrControlCriterion;
lrControlCriterion = epochCriterion = avgCriterion = prevCriterion = std::numeric_limits<double>::infinity();
EpochCriterion prevCriterion (EpochCriterion::Infinity());
EpochCriterion epochCriterion(EpochCriterion::Infinity());
double avgCriterion, lrControlCriterion;
lrControlCriterion = avgCriterion = numeric_limits<double>::infinity();
size_t epochsNotCountedInAvgCriterion = startEpoch % m_learnRateAdjustInterval;
std::vector<double> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<double>::infinity());
std::vector<EpochCriterion> epochEvalErrors(evaluationNodes.size(), EpochCriterion::Infinity());
std::vector<wstring> evalNodeNames;
for (size_t i = 0; i < evaluationNodes.size(); i++)
evalNodeNames.push_back(evaluationNodes[i]->NodeName());
size_t totalSamplesSeen = 0;
double learnRatePerSample = 0.5f / m_mbSize[startEpoch];
double learningRateAdjustmentFactor = 1.0f;
@ -307,10 +308,10 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
if (startEpoch > 0)
{
learnRateInitialized = LoadCheckPointInfo(startEpoch - 1,
/*out*/ totalSamplesSeen,
/*out*/ prevCriterion.second,
/*out*/ learnRatePerSample,
smoothedGradients,
/*out*/ prevCriterion,
/*out*/ prevCriterion.first,
/*out*/ m_prevChosenMinibatchSize);
if (learnRateInitialized)
prevLearnRates[startEpoch % m_numPrevLearnRates] = learnRatePerSample;
@ -464,24 +465,20 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
evaluationNodes,
inputMatrices,
learnableNodes, smoothedGradients,
epochCriterion, epochEvalErrors, totalSamplesSeen);
epochCriterion, epochEvalErrors);
timer.Stop();
double epochTime = timer.ElapsedSeconds();
if (m_useEvalCriterionControlLR && epochEvalErrors.size() > 0)
{
lrControlCriterion = epochEvalErrors[0];
}
lrControlCriterion = epochEvalErrors[0].Average();
else
{
lrControlCriterion = epochCriterion;
}
lrControlCriterion = epochCriterion.Average();
LOGPRINTF(stderr,
"Finished Epoch[%2d of %d]: [Training Set] TrainLossPerSample = %.8g; TotalSamplesSeen = %d; ",
i + 1, (int)m_maxEpochs, epochCriterion, (int)totalSamplesSeen);
m_lastFinishedEpochTrainLoss = epochCriterion;
i + 1, (int)m_maxEpochs, epochCriterion.Average(), (int)epochCriterion.second);
m_lastFinishedEpochTrainLoss = epochCriterion.Average();
if (epochEvalErrors.size() == 0) // no eval criterion, only train criterion itself
{
fprintf(stderr,
@ -530,37 +527,32 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
}
// BUGBUG: We should not use the training MB size. The training MB size is constrained by both convergence and memory. Eval is only constrained by memory.
vector<double> vScore = evalforvalidation.Evaluate(validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]);
LOGPRINTF(stderr, "Finished Epoch[%2d of %d]: [Validation Set] TrainLossPerSample = %.8g", i + 1, (int) m_maxEpochs, vScore[0]);
let vScore = evalforvalidation.Evaluate(validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]);
LOGPRINTF(stderr, "Finished Epoch[%2d of %d]: [Validation Set] TrainLossPerSample = %.8g", i + 1, (int)m_maxEpochs, vScore[0].Average());
if (vScore.size() > 1)
{
fprintf(stderr, "; EvalErrPerSample = %.8g", vScore[1]);
}
fprintf(stderr, "; EvalErrPerSample = %.8g", vScore[1].Average());
fprintf(stderr, "\n");
if (m_useCVSetControlLRIfCVExists)
{
if (m_useEvalCriterionControlLR && vScore.size() > 1)
{
lrControlCriterion = vScore[1];
}
lrControlCriterion = vScore[1].Average();
else
{
lrControlCriterion = vScore[0]; // the first one is the training criterion
}
lrControlCriterion = vScore[0].Average(); // the first one is the training criterion
}
}
// broadcast epochCriterion to make sure each processor will have the same learning rate schedule
if ((GetParallelizationMethod() == ParallelizationMethod::ModelAveragingSGD) && (m_mpi->NumNodesInUse() > 1))
{
m_mpi->Bcast(&epochCriterion, 1, m_mpi->MainNodeRank());
m_mpi->Bcast(&lrControlCriterion, 1, m_mpi->MainNodeRank());
m_mpi->Bcast(&epochCriterion.first, 1, m_mpi->MainNodeRank());
m_mpi->Bcast(&epochCriterion.second, 1, m_mpi->MainNodeRank());
m_mpi->Bcast(&lrControlCriterion, 1, m_mpi->MainNodeRank());
}
bool loadedPrevModel = false;
size_t epochsSinceLastLearnRateAdjust = i % m_learnRateAdjustInterval + 1;
if (avgCriterion == std::numeric_limits<double>::infinity())
if (avgCriterion == numeric_limits<double>::infinity())
{
avgCriterion = lrControlCriterion;
}
@ -575,7 +567,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch &&
m_learningRatesParam.size() <= i && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval)
{
if (std::isnan(avgCriterion) || (prevCriterion - avgCriterion < 0 && prevCriterion != std::numeric_limits<double>::infinity()))
if (std::isnan(avgCriterion) || (prevCriterion.Average() - avgCriterion < 0 && prevCriterion.Average() != std::numeric_limits<double>::infinity()))
{
if (m_loadBestModel)
{
@ -583,10 +575,10 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
LOGPRINTF(stderr, "Loading previous model with best training-criterion value: %ls.\n", bestModelPath.c_str());
net->RereadPersistableParameters<ElemType>(bestModelPath);
LoadCheckPointInfo(i - m_learnRateAdjustInterval,
/*out*/ totalSamplesSeen,
/*out*/ prevCriterion.second,
/*out*/ learnRatePerSample,
smoothedGradients,
/*out*/ prevCriterion,
/*out*/ prevCriterion.first,
/*out*/ m_prevChosenMinibatchSize);
loadedPrevModel = true;
}
@ -595,8 +587,8 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
if (m_continueReduce)
{
if (std::isnan(avgCriterion) ||
(prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion &&
prevCriterion != std::numeric_limits<double>::infinity()))
(prevCriterion.Average() - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion.Average() &&
prevCriterion.Average() != numeric_limits<double>::infinity()))
{
if (learnRateReduced == false)
{
@ -623,15 +615,15 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
else
{
if (std::isnan(avgCriterion) ||
(prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion &&
prevCriterion != std::numeric_limits<double>::infinity()))
(prevCriterion.Average() - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion.Average() &&
prevCriterion.Average() != numeric_limits<double>::infinity()))
{
learnRatePerSample *= m_learnRateDecreaseFactor;
LOGPRINTF(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
}
else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan * prevCriterion &&
prevCriterion != std::numeric_limits<double>::infinity())
else if (prevCriterion.Average() - avgCriterion > m_increaseLearnRateIfImproveMoreThan * prevCriterion.Average() &&
prevCriterion.Average() != numeric_limits<double>::infinity())
{
learnRatePerSample *= m_learnRateIncreaseFactor;
LOGPRINTF(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample);
@ -647,7 +639,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
// not loading previous values then set them
if (!loadedPrevModel && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval)
{
prevCriterion = avgCriterion;
prevCriterion.first = prevCriterion.second * avgCriterion; // BUGBUG: What to do here???
epochsNotCountedInAvgCriterion = 0;
}
@ -662,7 +654,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
// persist model and check-point info
if ((m_mpi == nullptr) || m_mpi->IsMainNode())
{
SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion, chosenMinibatchSize);
SaveCheckPointInfo(i, prevCriterion.second, learnRatePerSample, smoothedGradients, prevCriterion.first, chosenMinibatchSize);
auto modelName = GetModelNameForEpoch(i);
LOGPRINTF(stderr, "SGD: Saving checkpoint model '%ls'\n", modelName.c_str());
net->Save(modelName);
@ -741,18 +733,16 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
StreamMinibatchInputs* inputMatrices, // TODO: why is this a pointer?
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients,
/*out*/ double& epochCriterion,
/*out*/ std::vector<double>& epochEvalErrors,
/*in/out*/ size_t& totalSamplesSeen,
std::string prefixMsg)
/*out*/ EpochCriterion& epochCriterion,
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
const std::string& prefixMsg)
{
ScopedNetworkOperationMode modeGuard(net, NetworkOperationMode::training);
double totalTimeInMBs = 0; // use double since timer has sub-microsecond time resolution
double epochCriterionLastMBs = 0;
int numSamplesLastMBs = 0;
std::vector<double> epochEvalErrorsLastMBs(epochEvalErrors.size(), 0);
EpochCriterion epochCriterionLastMBs(0);
//int numSamplesLastMBs = 0;
vector<EpochCriterion> epochEvalErrorsLastMBs(epochEvalErrors.size(), EpochCriterion(0));
// initialize statistics
size_t totalEpochSamples = 0;
@ -762,11 +752,8 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
// NOTE: the following two local matrices are not used in distGradAgg path
// assume only one training criterion node for each epoch.
// The criterion values are accumulated here over the minibatches (without having to pull them off the GPU).
Matrix<ElemType> localEpochCriterion(1, 1, net->GetDeviceId());
Matrix<ElemType> localEpochEvalErrors(1, epochEvalErrors.size(), net->GetDeviceId());
localEpochCriterion.SetValue(0);
localEpochEvalErrors.SetValue(0);
CriterionAccumulator<ElemType> localEpochCriterion (1, net->GetDeviceId());
CriterionAccumulator<ElemType> localEpochEvalErrors(epochEvalErrors.size(), net->GetDeviceId());
bool useGradientAggregation = ((GetParallelizationMethod() == ParallelizationMethod::DataParallelSGD) &&
(epochNumber >= m_parallelizationStartEpochNum));
@ -784,8 +771,9 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
std::vector<Matrix<ElemType>*> learnParamsGradients;
if (useGradientAggregation)
{
epochCriterion = double(0.0);
epochEvalErrors.assign(epochEvalErrors.size(), double(0.0));
// TODO: This seems inconsistent: Why only reset if gradient aggregation?
epochCriterion = EpochCriterion(0);
epochEvalErrors.assign(epochEvalErrors.size(), EpochCriterion(0));
}
Profiler profiler(m_numMBsToCUDAProfile);
@ -970,13 +958,9 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
{
assert(wasDataRead);
// criteria are in Value()(0,0), we accumulate into another 1x1 Matrix (to avoid having to pull the values off the GPU)
Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(criterionNodes[0])->Value(),
0, 0, localEpochCriterion, 0, 0);
localEpochCriterion.Accumulate(criterionNodes, 0, numSamplesWithLabel);
for (size_t i = 0; i < evaluationNodes.size(); i++)
{
Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(evaluationNodes[i])->Value(),
0, 0, localEpochEvalErrors, 0, i);
}
localEpochEvalErrors.Accumulate(evaluationNodes, i, numSamplesWithLabel);
}
}
else
@ -1006,19 +990,23 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
}
// prepare the header
// BUGBUG: This needs to be redone for criterion nodes that have their own MBLayout.
m_gradHeader->numEvalNode = evaluationNodes.size();
m_gradHeader->numSamples = actualMBSize;
m_gradHeader->numSamplesWithLabel = numSamplesWithLabel;
m_gradHeader->criterion = actualMBSize > 0 ? criterionNodes[0]->Get00Element() : 0.0;
//m_gradHeader->numSamplesWithLabel = numSamplesWithLabel;
//m_gradHeader->criterion = actualMBSize > 0 ? criterionNodes[0]->Get00Element() : 0.0;
let thisEpochCriterion = CriterionAccumulator<ElemType>::GetCriterion(criterionNodes[0], numSamplesWithLabel);
m_gradHeader->numSamplesWithLabel = thisEpochCriterion.second;
m_gradHeader->criterion = thisEpochCriterion.first;
for (size_t i = 0; i < evaluationNodes.size(); i++)
m_gradHeader->evalErrors[i] = actualMBSize > 0 ? evaluationNodes[i]->Get00Element() : 0.0;
m_gradHeader->evalErrors[i] = CriterionAccumulator<ElemType>::GetCriterion(evaluationNodes[i], numSamplesWithLabel);
bool samplesProcessed = m_distGradAgg->AggregateGradients(learnParamsGradients, m_gradHeader, epochNumber);
noMoreSamplesToProcess = !samplesProcessed;
aggregateNumSamples = m_gradHeader->numSamples;
aggregateNumSamples = m_gradHeader->numSamples;
aggregateNumSamplesWithLabel = m_gradHeader->numSamplesWithLabel;
epochCriterion += m_gradHeader->criterion;
epochCriterion += EpochCriterion(m_gradHeader->criterion, m_gradHeader->numSamplesWithLabel);
for (size_t i = 0; i < epochEvalErrors.size(); i++)
epochEvalErrors[i] += m_gradHeader->evalErrors[i];
}
@ -1072,7 +1060,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
numMBsRun++;
totalTimeInMBs += timer.ElapsedSeconds();
numSamplesLastMBs += (int)aggregateNumSamplesWithLabel;
//numSamplesLastMBs += (int)aggregateNumSamplesWithLabel; // now inside epochCriterionLastMBs
if (numMBsRun <= m_firstMBsToShowResult || (m_numMBsToShowResult && (numMBsRun % m_numMBsToShowResult == 0)))
{
@ -1080,18 +1068,19 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
if (!useGradientAggregation)
{
timer.Restart();
epochCriterion = localEpochCriterion.Get00Element();
epochCriterion = localEpochCriterion.GetCriterion(0);
for (size_t i = 0; i < epochEvalErrors.size(); i++)
{
epochEvalErrors[i] = localEpochEvalErrors(0, i);
}
epochEvalErrors[i] = localEpochEvalErrors.GetCriterion(i);
timer.Stop();
// Add the last trailing compute
totalTimeInMBs += timer.ElapsedSeconds();
}
double trainLossPerSample = (numSamplesLastMBs != 0) ? ((epochCriterion - epochCriterionLastMBs) / numSamplesLastMBs) : 0.0;
//double trainLossPerSample = (numSamplesLastMBs != 0) ? ((epochCriterion - epochCriterionLastMBs) / numSamplesLastMBs) : 0.0;
EpochCriterion thisEpochCriterion = epochCriterion - epochCriterionLastMBs;
double trainLossPerSample = thisEpochCriterion.Average(); // TODO: Check whether numSamplesLastMBs matches this ^^ difference
int numSamplesLastMBs = (int) thisEpochCriterion.second;
bool wasProgressPrinted = false;
if (epochNumber > 0 || (int) epochSize > 0)
@ -1130,7 +1119,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
double evalError = 0.0;
for (size_t i = 0; i < epochEvalErrors.size(); i++)
{
evalError = (epochEvalErrors[i] - epochEvalErrorsLastMBs[i]) / numSamplesLastMBs;
evalError = (epochEvalErrors[i] - epochEvalErrorsLastMBs[i]).Average(); // / numSamplesLastMBs;
string formatString = "EvalErr[%lu]PerSample = " + GeneratePaddedFloatOrExpFormat(0, 8, evalError) + "; ";
SGDTrace(stderr, false, formatString.c_str(), i, evalError);
}
@ -1155,20 +1144,16 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
epochCriterionLastMBs = epochCriterion;
for (size_t i = 0; i < epochEvalErrorsLastMBs.size(); i++)
{
epochEvalErrorsLastMBs[i] = epochEvalErrors[i];
}
if (std::isnan(epochCriterion))
{
if (epochCriterion.IsNan())
RuntimeError("The training criterion is not a number (NAN).");
}
}
timer.Restart();
totalEpochSamples += aggregateNumSamplesWithLabel;
if (!useModelAveraging)
totalSamplesSeen += aggregateNumSamplesWithLabel;
//if (!useModelAveraging)
// totalSamplesSeen += aggregateNumSamplesWithLabel;
// call DataEnd function
// This signals something from SGD to the reader.
@ -1194,26 +1179,12 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
}
// compute final criterion values
if (useGradientAggregation)
if (!useGradientAggregation)
{
// with parallelization, we have them in regular variables
epochCriterion /= float(totalEpochSamples);
// unless we do parallelization, we have them in Matrix objects that possibly live on the GPU--get them over now
epochCriterion = localEpochCriterion.GetCriterion(0);
for (size_t i = 0; i < epochEvalErrors.size(); i++)
{
epochEvalErrors[i] /= totalEpochSamples;
}
}
else
{
// without, we have them in Matrix objects that possibly live on the GPU--get them over now
localEpochCriterion /= float(totalEpochSamples);
localEpochEvalErrors /= float(totalEpochSamples);
epochCriterion = localEpochCriterion.Get00Element();
for (size_t i = 0; i < epochEvalErrors.size(); i++)
{
epochEvalErrors[i] = localEpochEvalErrors(0, i);
}
epochEvalErrors[i] = localEpochEvalErrors.GetCriterion(i);
}
// in case of model averaging, do one more final aggregation of criteria
@ -1222,20 +1193,29 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
// 1. total epoch samples processed by all workers
size_t totalEpochSamplesOfAllWorkers = totalEpochSamples;
m_mpi->AllReduce(&totalEpochSamplesOfAllWorkers, 1);
totalSamplesSeen += totalEpochSamplesOfAllWorkers;
//totalSamplesSeen += totalEpochSamplesOfAllWorkers;
// 2. criterion and EvalErrors
localEpochCriterion *= (float)totalEpochSamples / totalEpochSamplesOfAllWorkers;
localEpochEvalErrors *= (float)totalEpochSamples / totalEpochSamplesOfAllWorkers;
// get criteria for this worker
epochCriterion = localEpochCriterion.GetCriterion(0);
for (size_t i = 0; i < epochEvalErrors.size(); i++)
epochEvalErrors[i] = localEpochEvalErrors.GetCriterion(i);
epochCriterion = localEpochCriterion.Get00Element();
// all-reduce epochCriterion and epochEvalErrors over nodes
m_mpi->AllReduce(&epochCriterion.first, 1);
m_mpi->AllReduce(&epochCriterion.second, 1);
// to transfer the eval vectors, we must pull them apart into STL objects and exchange them separately
// TODO: merge with training criteria
vector<double> numer(epochEvalErrors.size());
vector<size_t> denom(epochEvalErrors.size());
for (size_t i = 0; i < epochEvalErrors.size(); i++)
{
epochEvalErrors[i] = localEpochEvalErrors(0, i);
numer[i] = epochEvalErrors[i].first;
denom[i] = epochEvalErrors[i].second;
}
// merge epochCriterion and epochEvalErrors over nodes
m_mpi->AllReduce(&epochCriterion, 1);
m_mpi->AllReduce(epochEvalErrors);
m_mpi->AllReduce(numer);
m_mpi->AllReduce(denom);
for (size_t i = 0; i < epochEvalErrors.size(); i++)
epochEvalErrors[i] = EpochCriterion(numer[i], denom[i]);
// 3. modify return value
totalEpochSamples = totalEpochSamplesOfAllWorkers;
@ -1364,11 +1344,10 @@ double SGD<ElemType>::SearchForBestLearnRate(ComputationNetworkPtr net,
const bool learnRateInitialized,
const double largestPrevLearnRatePerSample)
{
double epochCriterion = std::numeric_limits<double>::infinity();
double prevCriterion = std::numeric_limits<double>::infinity();
vector<double> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<double>::infinity());
EpochCriterion epochCriterion(EpochCriterion::Infinity());
EpochCriterion prevCriterion (EpochCriterion::Infinity());
vector<EpochCriterion> epochEvalErrors(evaluationNodes.size(), EpochCriterion::Infinity());
size_t totalSamplesSeen = 0;
double bestLearnRatePerSample = curLearnRate;
size_t numFramesToUseInSearch = m_numMiniBatch4LRSearch[epochNumber] * m_mbSize[epochNumber];
@ -1378,10 +1357,10 @@ double SGD<ElemType>::SearchForBestLearnRate(ComputationNetworkPtr net,
numFramesToUseInSearch = min(numFramesToUseInSearch, m_epochSize);
}
double baseCriterion;
EpochCriterion baseCriterion;
double minLearnRate = m_minLearnRate * 0.3f;
double learnRatePerSample = 1.0f / 8.0f / 0.618f / sqrt((double) m_mbSize[epochNumber]);
double learnRatePerSample = 1.0f / 8.0f / 0.618f / sqrt((double) m_mbSize[epochNumber]); // TODO: comment on these magic constants
if (learnRateInitialized && largestPrevLearnRatePerSample > 0)
{
@ -1395,10 +1374,10 @@ double SGD<ElemType>::SearchForBestLearnRate(ComputationNetworkPtr net,
double learnRate = learnRatePerSample;
size_t dummyMinibatchSize = 0;
LoadCheckPointInfo(baseModelEpoch,
/*out*/ totalSamplesSeen,
/*out*/ prevCriterion.second,
/*out*/ learnRate,
smoothedGradients,
/*out*/ prevCriterion,
/*out*/ prevCriterion.first,
/*out*/ dummyMinibatchSize);
// if model is not changed this is what we will get
@ -1407,13 +1386,13 @@ double SGD<ElemType>::SearchForBestLearnRate(ComputationNetworkPtr net,
featureNodes, labelNodes,
criterionNodes, evaluationNodes,
inputMatrices, learnableNodes,
smoothedGradients, /*out*/ baseCriterion,
/*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
smoothedGradients,
/*out*/ baseCriterion, /*out*/ epochEvalErrors,
"BaseAdaptiveLearnRateSearch:");
if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
{
if (prevCriterion == std::numeric_limits<double>::infinity())
if (prevCriterion.IsInfinity())
prevCriterion = baseCriterion;
double ratio = 0.3;
@ -1421,7 +1400,8 @@ double SGD<ElemType>::SearchForBestLearnRate(ComputationNetworkPtr net,
if (m_epochSize != requestDataSize)
ratio = pow(((double) numFramesToUseInSearch) / m_epochSize, 1.0f / 2);
baseCriterion = max(ratio * prevCriterion + (1 - ratio) * baseCriterion, baseCriterion);
// TODO: Rethink if this Average() approach is correct. This matters if the mini-epochs do not have identical sizes, e.g. due to MB-size changes.
baseCriterion.first = baseCriterion.second * max(ratio * prevCriterion.Average() + (1 - ratio) * baseCriterion.Average(), baseCriterion.Average());
}
do
@ -1434,9 +1414,8 @@ double SGD<ElemType>::SearchForBestLearnRate(ComputationNetworkPtr net,
evaluationNodes, inputMatrices,
learnableNodes, smoothedGradients,
/*out*/ epochCriterion, /*out*/ epochEvalErrors,
/*out*/ totalSamplesSeen, "AdaptiveLearnRateSearch:");
} while (std::isnan(epochCriterion) || (epochCriterion > baseCriterion && learnRatePerSample > minLearnRate));
"AdaptiveLearnRateSearch:");
} while (epochCriterion.IsNan() || (epochCriterion.Average() > baseCriterion.Average() && learnRatePerSample > minLearnRate));
bestLearnRatePerSample = learnRatePerSample;
@ -1445,7 +1424,8 @@ double SGD<ElemType>::SearchForBestLearnRate(ComputationNetworkPtr net,
{
double leftLearnRatePerSample = 0.01 / m_mbSize[epochNumber];
double rightLearnRatePerSample = learnRatePerSample;
double leftCriterion, rightCriterion = epochCriterion;
EpochCriterion rightCriterion = epochCriterion;
EpochCriterion leftCriterion; // we compute this from the mini epoch
TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
numFramesToUseInSearch, trainSetDataReader,
@ -1453,13 +1433,13 @@ double SGD<ElemType>::SearchForBestLearnRate(ComputationNetworkPtr net,
featureNodes, labelNodes,
criterionNodes, evaluationNodes,
inputMatrices, learnableNodes,
smoothedGradients, /*out*/ leftCriterion,
/*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
smoothedGradients,
/*out*/ leftCriterion, /*out*/ epochEvalErrors,
"DetailBaseAdaptiveLearnRateSearch:");
while (rightLearnRatePerSample > leftLearnRatePerSample * 1.2)
{
if (rightCriterion > leftCriterion)
if (rightCriterion.Average() > leftCriterion.Average())
{
rightLearnRatePerSample *= 0.618;
@ -1475,7 +1455,6 @@ double SGD<ElemType>::SearchForBestLearnRate(ComputationNetworkPtr net,
smoothedGradients,
/*out*/ rightCriterion,
/*out*/ epochEvalErrors,
/*out*/ totalSamplesSeen,
"DetailRightAdaptiveLearnRateSearch:");
}
else
@ -1494,12 +1473,11 @@ double SGD<ElemType>::SearchForBestLearnRate(ComputationNetworkPtr net,
smoothedGradients,
/*out*/ leftCriterion,
/*out*/ epochEvalErrors,
/*out*/ totalSamplesSeen,
"DetailLeftAdaptiveLearnRateSearch:");
}
}
bestLearnRatePerSample = (leftCriterion < rightCriterion) ? leftLearnRatePerSample : rightLearnRatePerSample;
bestLearnRatePerSample = (leftCriterion.Average() < rightCriterion.Average()) ? leftLearnRatePerSample : rightLearnRatePerSample;
}
LOGPRINTF(stderr, "Best Learn Rate Per Sample for Epoch[%d] = %.10g baseCriterion=%.10g\n",
@ -1642,13 +1620,13 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
size_t trialMinibatchSize = 0;
bool isFirstIteration = true;
double baseCriterion = 0;
EpochCriterion baseCriterion(0);
// increase the minibatch size by a factor of sqrt(2) in each step.
const float minibatchSizeTuningFactor = sqrtf(2.0f);
size_t lastTriedTrialMinibatchSize = 0;
double lastTriedTrialEpochCriterion = 0;
EpochCriterion lastTriedTrialEpochCriterion(0);
for (float trialMinibatchSizeFloat = (float) minMinibatchSize;
trialMinibatchSizeFloat <= maxMinibatchSize;
trialMinibatchSizeFloat *= minibatchSizeTuningFactor)
@ -1660,9 +1638,8 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
LOGPRINTF(stderr, "AdaptiveMinibatchSearch: Evaluating trial minibatchSize=%zd out of range %zd..%zd ...\n\n",
trialMinibatchSize, RoundToMultipleOf64(minMinibatchSize), RoundToMultipleOf64(maxMinibatchSize));
size_t totalSamplesSeen;
std::vector<double> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<double>::infinity());
double epochCriterion = std::numeric_limits<double>::infinity();
std::vector<EpochCriterion> epochEvalErrors(evaluationNodes.size(), EpochCriterion::Infinity());
EpochCriterion epochCriterion(EpochCriterion::Infinity());
// Train on a few minibatches and so we can observe the epochCriterion as we try increasing
// minibatches with iteration of this loop.
@ -1673,7 +1650,6 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
evaluationNodes, inputMatrices,
learnableNodes, smoothedGradients,
/*out*/ epochCriterion, /*out*/ epochEvalErrors,
/*out*/ totalSamplesSeen,
isFirstIteration ? "BaseAdaptiveMinibatchSearch:" : "AdaptiveMinibatchSearch:");
if (isFirstIteration)
@ -1687,8 +1663,8 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
LOGPRINTF(stderr, "AdaptiveMinibatchSearch: Computed BaseCriterion %.10g\n", baseCriterion);
}
else if (!std::isnan(epochCriterion) &&
(epochCriterion > (baseCriterion * (1.0 + (m_minibatchSearchCriterionErrorMargin / 100.0)))))
else if (!epochCriterion.IsNan() &&
epochCriterion.Average() > (baseCriterion.Average() * (1.0 + (m_minibatchSearchCriterionErrorMargin / 100.0))))
{
// As soon as we see the Criterion (a measure of error) start to get larger than the
// Criterion we started with, we stop.
@ -1728,16 +1704,15 @@ void SGD<ElemType>::TrainOneMiniEpochAndReloadModel(ComputationNetworkPtr net,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients,
/*out*/ double& epochCriterion,
/*out*/ std::vector<double>& epochEvalErrors,
/*out*/ size_t& totalSamplesSeen,
/*out*/ EpochCriterion& epochCriterion,
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
std::string prefixMsg)
{
TrainOneEpoch(net, refNet, refNode, epochNumber, epochSize,
trainSetDataReader, learnRatePerSample, minibatchSize, featureNodes,
labelNodes, criterionNodes, evaluationNodes,
inputMatrices, learnableNodes, smoothedGradients,
/*out*/ epochCriterion, /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
/*out*/ epochCriterion, /*out*/ epochEvalErrors,
prefixMsg);
LOGPRINTF(stderr, "Finished Mini-Epoch For LearnRate Selection: TrainLossPerSample = %.8g;", epochCriterion);
@ -1758,13 +1733,13 @@ void SGD<ElemType>::TrainOneMiniEpochAndReloadModel(ComputationNetworkPtr net,
net->RereadPersistableParameters<ElemType>(GetModelNameForEpoch(baseModelEpoch));
double dummyLearnRate;
double dummtPrevCriterion;
EpochCriterion dummyPrevCriterion;
size_t dummyMinibatchSize = 0;
LoadCheckPointInfo(baseModelEpoch,
/*out*/ totalSamplesSeen,
/*out*/ dummyPrevCriterion.second,
/*out*/ dummyLearnRate,
smoothedGradients,
/*out*/ dummtPrevCriterion,
/*out*/ dummyPrevCriterion.first,
/*out*/ dummyMinibatchSize);
}
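In the model-averaging branch above, pair-valued eval errors are split into separate numerator and denominator vectors before the all-reduce and reassembled afterwards, because the MPI wrapper reduces flat arrays of a single element type. A sketch of that pattern with raw MPI calls (CNTK goes through its MPIWrapper; names here are hypothetical):

    #include <mpi.h>
    #include <utility>
    #include <vector>

    // All-reduce a vector of (numerator, denominator) criteria across ranks by
    // summing the two components as separate flat arrays.
    static void AllReduceCriteria(std::vector<std::pair<double, size_t>>& criteria)
    {
        std::vector<double>             numer(criteria.size());
        std::vector<unsigned long long> denom(criteria.size());
        for (size_t i = 0; i < criteria.size(); i++)
        {
            numer[i] = criteria[i].first;
            denom[i] = criteria[i].second;
        }
        MPI_Allreduce(MPI_IN_PLACE, numer.data(), (int)numer.size(), MPI_DOUBLE,             MPI_SUM, MPI_COMM_WORLD);
        MPI_Allreduce(MPI_IN_PLACE, denom.data(), (int)denom.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);
        for (size_t i = 0; i < criteria.size(); i++)
            criteria[i] = std::make_pair(numer[i], (size_t)denom[i]);
    }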

View File

@ -9,6 +9,7 @@
#include "SimpleEvaluator.h"
#include "DataReader.h"
#include "ScriptableObjects.h"
#include "Criterion.h"
#include <vector>
#include <string>
#include <stdexcept>
@ -384,9 +385,8 @@ protected:
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients,
/*out*/ double& epochCriterion,
/*out*/ std::vector<double>& epochEvalErrors,
/*out*/ size_t& totalSamplesSeen,
/*out*/ EpochCriterion& epochCriterion,
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
std::string prefixMsg = "");
size_t AdaptiveMinibatchSizing(ComputationNetworkPtr net,
@ -449,10 +449,9 @@ protected:
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients,
/*out*/ double& epochCriterion,
/*out*/ std::vector<double>& epochEvalErrors,
/*out*/ size_t& totalSamplesSeen,
std::string prefixMsg = "");
/*out*/ EpochCriterion& epochCriterion,
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
const std::string& prefixMsg = "");
void InitDistGradAgg(int numEvalNodes, int traceLevel);
void InitModelAggregationHandler(int traceLevel);
@ -482,7 +481,7 @@ protected:
void ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const;
void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen,
void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen, // TODO: combine totalSamplesSeen and prevCriterion into a EpochCriterion type
const double learnRatePerSample,
const std::list<Matrix<ElemType>>& smoothedGradients,
const double prevCriterion,
@ -519,17 +518,17 @@ public:
int npos);
protected:
wstring m_modelPath;
std::wstring m_modelPath;
bool m_keepCheckPointFiles;
// bool m_validateAfterModelReloading; // TODO: remove this. Why would one not validate a model?
wstring m_trainCriterionNodeName;
wstring m_evalCriterionNodeName;
std::wstring m_trainCriterionNodeName;
std::wstring m_evalCriterionNodeName;
// enable tracing. Nodes listed here get their m_traceNodeValueXXX flags set
vector<wstring> m_traceNodeNamesReal;
vector<wstring> m_traceNodeNamesCategory;
vector<wstring> m_traceNodeNamesSparse;
std::vector<std::wstring> m_traceNodeNamesReal;
std::vector<std::wstring> m_traceNodeNamesCategory;
std::vector<std::wstring> m_traceNodeNamesSparse;
size_t m_prevChosenMinibatchSize;
double m_lastFinishedEpochTrainLoss;

View File

@ -150,6 +150,7 @@
<ClInclude Include="..\ComputationNetworkLib\ComputationNetwork.h" />
<ClInclude Include="..\ComputationNetworkLib\ComputationNode.h" />
<ClInclude Include="..\ComputationNetworkLib\ConvolutionalNodes.h" />
<ClInclude Include="Criterion.h" />
<ClInclude Include="DataReaderHelpers.h" />
<ClInclude Include="DistGradHeader.h" />
<ClInclude Include="IDistGradAggregator.h" />
@ -184,4 +185,4 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
</Project>

View File

@ -141,6 +141,9 @@
<ClInclude Include="MASGD.h">
<Filter>Parallelization</Filter>
</ClInclude>
<ClInclude Include="Criterion.h">
<Filter>SGD</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Common">

View File

@ -14,6 +14,7 @@
#include "DistGradHeader.h"
#include "IDistGradAggregator.h"
#include "SimpleDistGradAggregator.h"
#include "Criterion.h"
#include <vector>
#include <string>
@ -46,7 +47,7 @@ public:
}
// returns evaluation node values per sample determined by evalNodeNames (which can include both training and eval criterion nodes)
vector<double> Evaluate(IDataReader* dataReader, const vector<wstring>& evalNodeNames, const size_t mbSize, const size_t testSize = requestDataSize)
vector<EpochCriterion> Evaluate(IDataReader* dataReader, const vector<wstring>& evalNodeNames, const size_t mbSize, const size_t testSize = requestDataSize)
{
ScopedNetworkOperationMode modeGuard(m_net, NetworkOperationMode::inferring);
@ -82,7 +83,7 @@ public:
}
// initialize eval results
std::vector<double> evalResults(evalNodes.size(), 0);
std::vector<EpochCriterion> evalResults(evalNodes.size(), EpochCriterion(0));
// allocate memory for forward computation
m_net->AllocateAllMatrices(evalNodes, {}, nullptr);
@ -104,9 +105,7 @@ public:
size_t numSamplesLastMBs = 0;
size_t lastMBsRun = 0; // MBs run before this display
std::vector<double> evalResultsLastMBs;
for (int i = 0; i < evalResults.size(); i++)
evalResultsLastMBs.push_back((ElemType) 0);
std::vector<EpochCriterion> evalResultsLastMBs(evalResults.size(), EpochCriterion(0));
//TODO: we should add support for distributed reading
dataReader->StartMinibatchLoop(mbSize, 0, testSize);
@ -161,9 +160,10 @@ public:
m_gradHeader->numEvalNode = evalNodes.size();
m_gradHeader->numSamples = actualMBSize;
m_gradHeader->numSamplesWithLabel = numSamplesWithLabel;
m_gradHeader->criterion = 0.0;
m_gradHeader->criterion = 0.0; // (not used here)
for (size_t i = 0; i < evalNodes.size(); i++)
m_gradHeader->evalErrors[i] = evalNodes[i]->Get00Element();
//m_gradHeader->evalErrors[i] = evalNodes[i]->Get00Element();
m_gradHeader->evalErrors[i] = CriterionAccumulator<ElemType>::GetCriterion(evalNodes[i], numSamplesWithLabel);
// TODO: We are reusing the aggregation logic inside SimpleDistGradAggregator, which has a heavy dependency
// on the gradient matrix. At some point we should refactor the aggregator class to be able to only calculate
@ -184,9 +184,8 @@ public:
else
{
for (int i = 0; i < evalNodes.size(); i++)
{
evalResults[i] += (double)evalNodes[i]->Get00Element(); // criterionNode should be a scalar
}
evalResults[i] += CriterionAccumulator<ElemType>::GetCriterion(evalNodes[i], numSamplesWithLabel);
//evalResults[i] += (double)evalNodes[i]->Get00Element(); // criterionNode should be a scalar
}
totalEpochSamples += aggregateNumSamplesWithLabel;
@ -225,15 +224,15 @@ public:
// final statistics
for (int i = 0; i < evalResultsLastMBs.size(); i++)
evalResultsLastMBs[i] = 0; // clear this since statistics display will subtract the previous value
evalResultsLastMBs[i] = EpochCriterion(0); // clear this since statistics display will subtract the previous value
fprintf(stderr, "Final Results: ");
DisplayEvalStatistics(1, numMBsRun, totalEpochSamples, evalNodes, evalResults, evalResultsLastMBs, true);
for (int i = 0; i < evalResults.size(); i++)
{
evalResults[i] /= totalEpochSamples;
}
//for (int i = 0; i < evalResults.size(); i++)
//{
// evalResults[i] /= totalEpochSamples;
//}
return evalResults;
}
@ -241,24 +240,19 @@ public:
protected:
void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastMBs,
const vector<ComputationNodeBasePtr>& evalNodes,
const double evalResults, const double evalResultsLastMBs, bool displayConvertedValue = false)
const EpochCriterion evalResults, const EpochCriterion evalResultsLastMBs, bool displayConvertedValue = false)
{
vector<double> evaR;
evaR.push_back(evalResults);
vector<double> evaLast;
evaLast.push_back(evalResultsLastMBs);
DisplayEvalStatistics(startMBNum, endMBNum, numSamplesLastMBs, evalNodes, evaR, evaLast, displayConvertedValue);
DisplayEvalStatistics(startMBNum, endMBNum, numSamplesLastMBs, evalNodes, { evalResults }, { evalResultsLastMBs }, displayConvertedValue);
}
void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastMBs, const vector<ComputationNodeBasePtr>& evalNodes,
const vector<double>& evalResults, const vector<double>& evalResultsLastMBs, bool displayConvertedValue = false)
const vector<EpochCriterion>& evalResults, const vector<EpochCriterion>& evalResultsLastMBs, bool displayConvertedValue = false)
{
fprintf(stderr, "Minibatch[%lu-%lu]: SamplesSeen = %lu ", startMBNum, endMBNum, numSamplesLastMBs);
for (size_t i = 0; i < evalResults.size(); i++)
{
double eresult = (evalResults[i] - evalResultsLastMBs[i]) / numSamplesLastMBs;
double eresult = (evalResults[i] - evalResultsLastMBs[i]).Average(); // / numSamplesLastMBs;
fprintf(stderr, "%ls: %ls/Sample = %.8g ", evalNodes[i]->NodeName().c_str(), evalNodes[i]->OperationName().c_str(), eresult);
if (displayConvertedValue)