Reverting SimpleEvaluator for compatibility; initializing SimpleEvaluator without m_mpi when using ASGD

Qiwei Ye 2016-10-10 20:50:54 +08:00
Parent: 0e443fa823
Commit: 018a812f34
2 changed files with 7 additions and 5 deletions
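In short: when asynchronous data-parallel training (ASGD) is active, validation should run on a single node, so the SimpleEvaluator is constructed with a null MPI wrapper instead of m_mpi, and everything downstream takes the non-distributed path. A minimal sketch of the selection pattern, assuming a shared_ptr-style MPIWrapperPtr and a UsingAsyncGradientAggregation(epoch) predicate as in the diff below (the stand-in types here are illustrative, not CNTK's real declarations):

    #include <memory>

    // Illustrative stand-in for CNTK's MPI wrapper; only the pointer semantics matter here.
    struct MPIWrapper { };
    using MPIWrapperPtr = std::shared_ptr<MPIWrapper>;

    // Choose the MPI wrapper handed to the validation evaluator:
    // none under ASGD (each worker validates independently), the trainer's otherwise.
    MPIWrapperPtr MpiForValidation(bool usingAsyncGradientAggregation, const MPIWrapperPtr& trainerMpi)
    {
        return usingAsyncGradientAggregation ? nullptr : trainerMpi;
    }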

Source/SGDLib/SGD.cpp

@@ -589,7 +589,9 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
     if (validationSetDataReader != trainSetDataReader && validationSetDataReader != nullptr)
     {
-        SimpleEvaluator<ElemType> evalforvalidation(net, m_mpi, m_enableDistributedMBReading);
+        // TODO(dataASGD): make the evaluator non-distributed when using asynchronous (ASGD) data parallelism;
+        // the MPI wrapper is chosen inline so that evalforvalidation stays in scope for the Evaluate call below.
+        SimpleEvaluator<ElemType> evalforvalidation(net, UsingAsyncGradientAggregation(i + 1) ? nullptr : m_mpi, m_enableDistributedMBReading);
         vector<wstring> cvSetTrainAndEvalNodes;
         if (criterionNodes.size() > 0)
         {
@@ -601,8 +603,8 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
         }
         // BUGBUG: We should not use the training MB size. The training MB size is constrained by both convergence and memory. Eval is only constrained by memory.
-        // TODO(dataASGD): cross-validation should use a non-distributed reader when using asynchronous data parallelism.
-        let vScore = evalforvalidation.Evaluate(validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i], UsingAsyncGradientAggregation(i));
+        // TODO(dataASGD): add an option so that cross-validation uses a non-distributed reader when using asynchronous data parallelism.
+        let vScore = evalforvalidation.Evaluate(validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]);
         LOGPRINTF(stderr, "Finished Epoch[%2d of %d]: [Validate] ", i + 1, (int)m_maxEpochs);
         for (size_t k = 0; k < vScore.size() /*&& k < 2*/; k++)
             vScore[k].LogCriterion(cvSetTrainAndEvalNodes[k], /*addSemicolon=*/k + 1 < vScore.size());
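Passing nullptr is sufficient to make evaluation single-node because SimpleEvaluator derives all of its parallel behavior from whether its MPI pointer is set, as the second file below shows. A self-contained sketch of that guard pattern, with hypothetical stand-ins for the wrapper and reader types:

    #include <cstdio>
    #include <memory>

    // Stand-ins for illustration; CNTK's real types live in the MPI wrapper and reader headers.
    struct MPIWrapper
    {
        size_t CurrentNodeRank() const { return 0; }
        size_t NumNodesInUse() const { return 1; }
    };
    using MPIWrapperPtr = std::shared_ptr<MPIWrapper>;

    void StartEvalMinibatchLoop(const MPIWrapperPtr& mpi, bool enableDistributedMBReading)
    {
        // Mirrors the useParallelTrain / useDistributedMBReading logic in SimpleEvaluator.h:
        // every distributed branch is gated on mpi being non-null before it is dereferenced.
        bool useParallelTrain = (mpi != nullptr);
        bool useDistributedMBReading = useParallelTrain && enableDistributedMBReading;
        if (useDistributedMBReading)
            std::printf("distributed read: rank %zu of %zu\n", mpi->CurrentNodeRank(), mpi->NumNodesInUse());
        else
            std::printf("single-node read\n"); // taken whenever mpi == nullptr, e.g. under ASGD
    }

    int main()
    {
        StartEvalMinibatchLoop(nullptr, true);                        // ASGD: single-node path
        StartEvalMinibatchLoop(std::make_shared<MPIWrapper>(), true); // sync data-parallel: distributed path
    }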

Source/SGDLib/SimpleEvaluator.h

@@ -48,7 +48,7 @@ public:
     }
     // returns evaluation node values per sample determined by evalNodeNames (which can include both training and eval criterion nodes)
-    vector<EpochCriterion> Evaluate(IDataReader* dataReader, const vector<wstring>& evalNodeNames, const size_t mbSize, const size_t testSize = requestDataSize, const bool useDataParallelASGD = false)
+    vector<EpochCriterion> Evaluate(IDataReader* dataReader, const vector<wstring>& evalNodeNames, const size_t mbSize, const size_t testSize = requestDataSize)
     {
         ScopedNetworkOperationMode modeGuard(m_net, NetworkOperationMode::inferring);
@@ -107,7 +107,7 @@ public:
         std::vector<EpochCriterion> evalResultsLastLogged(evalResults.size(), EpochCriterion(0));
-        bool useParallelTrain = (m_mpi != nullptr) && !useDataParallelASGD;
+        bool useParallelTrain = (m_mpi != nullptr);
         bool useDistributedMBReading = useParallelTrain && m_enableDistributedMBReading && dataReader->SupportsDistributedMBRead();
         if (useDistributedMBReading)
             dataReader->StartDistributedMinibatchLoop(mbSize, 0, m_mpi->CurrentNodeRank(), m_mpi->NumNodesInUse(), inputMatrices.GetStreamDescriptions(), testSize);
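The revert also moves the ASGD decision back from a per-call flag on Evaluate to construction time: instead of threading useDataParallelASGD through every call site, the caller constructs the evaluator without an MPI wrapper and the rest of the code follows. A sketch of the construction-time-injection shape this keeps (Evaluator and IsDistributed are hypothetical names, not CNTK's API):

    #include <memory>
    #include <utility>

    struct MPIWrapper { }; // stand-in for CNTK's MPI wrapper
    using MPIWrapperPtr = std::shared_ptr<MPIWrapper>;

    class Evaluator
    {
    public:
        explicit Evaluator(MPIWrapperPtr mpi) : m_mpi(std::move(mpi)) {}
        // One decision point: a null wrapper means single-node evaluation (e.g. under ASGD),
        // so no per-call useDataParallelASGD flag is needed.
        bool IsDistributed() const { return m_mpi != nullptr; }
    private:
        MPIWrapperPtr m_mpi;
    };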