cherry-picked: numGradientBits is now a vector; simplified logging of MB scaling

Frank Seide 2016-09-16 19:50:15 -07:00, committed by U-FAREAST\fseide
Parent dfcade2d8c
Commit 0bbfdbef99
4 changed files: 99 additions and 57 deletions
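The gist of the change: the dataParallelSGD option gradientBits, previously a single int, is now parsed as an epoch-indexed array (intargvector), so the quantization bit-width can follow a schedule such as 2:1 (2-bit gradients in the first epoch, 1-bit thereafter). Below is a minimal sketch of the schedule semantics assumed here, modeled on CNTK's intargvector-style lookup where the last entry repeats for all remaining epochs; the names are illustrative, not CNTK's.

    // Sketch only: models the lookup assumed by m_numGradientBits[epoch].
    #include <cstdio>
    #include <vector>

    static int ScheduleValueForEpoch(const std::vector<int>& schedule, size_t epoch)
    {
        // entries map to epochs; the last entry is assumed to repeat for later epochs
        return schedule[epoch < schedule.size() ? epoch : schedule.size() - 1];
    }

    int main()
    {
        std::vector<int> numGradientBits{2, 1}; // as if parsed from "gradientBits = 2:1"
        for (size_t epoch = 0; epoch < 4; epoch++)
            std::printf("epoch %zu: %d-bit gradient aggregation\n",
                        epoch, ScheduleValueForEpoch(numGradientBits, epoch));
        return 0;
    }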

@@ -1 +1 @@
- Subproject commit 87767425a4ec3b93aa574295f5332460155d0d74
+ Subproject commit f7afb8c6a08a6652d84de1b62377175788be5284

View file

@@ -141,15 +141,15 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
else
{
LOGPRINTF(stderr, "Training criteria:\n");
- for (const auto& node : criterionNodes)
- {
- LOGPRINTF(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
- }
- if (criterionNodes.empty())
- {
- LOGPRINTF(stderr, "\t(none)\n");
- InvalidArgument("TrainOrAdaptModel: No criterion node was specified.");
- }
+ for (const auto& node : criterionNodes)
+ {
+ LOGPRINTF(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
+ }
+ if (criterionNodes.empty())
+ {
+ LOGPRINTF(stderr, "\t(none)\n");
+ InvalidArgument("TrainOrAdaptModel: No criterion node was specified.");
+ }
}
// determine evaluationNodes from GetEvalCriterionNodes(), ensuring each criterion is only logged once
@@ -277,10 +277,10 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
{
fprintf(stderr, "out of %d parameter tensors and %d nodes with gradient:\n\n",
(int)learnableNodes.size(), (int)numNeedsGradient);
- for (let nodeDescription : nodesToUpdateDescriptions)
- {
- LOGPRINTF(stderr, "\t%ls\n", nodeDescription.c_str());
- }
+ for (let nodeDescription : nodesToUpdateDescriptions)
+ {
+ LOGPRINTF(stderr, "\t%ls\n", nodeDescription.c_str());
+ }
}
// one blank line before training progress log
@@ -302,16 +302,20 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
for (int i = 0; i < m_numPrevLearnRates; i++)
prevLearnRates[i] = -1.0;
m_prevChosenMinibatchSize = m_mbSize[startEpoch];
+ int currentNumGradientBits = 0; // this remembers the last #gradient bits we set for dataParallelSGD (init val 0 has no meaning, just keep compiler happy)
if (GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD)
{
- InitDistGradAgg(evaluationNodes.size(), m_traceLevel);
+ currentNumGradientBits = m_numGradientBits[startEpoch]; // remember so that we can detect a change
+ InitDistGradAgg(evaluationNodes.size(), currentNumGradientBits, m_traceLevel);
}
else if (GetParallelizationMethod() == ParallelizationMethod::modelAveragingSGD ||
GetParallelizationMethod() == ParallelizationMethod::blockMomentumSGD)
{
InitModelAggregationHandler(m_syncStatsTrace, net->GetDeviceId());
}
// precompute mean and invStdDev nodes and save initial model
// When no precompute, only save if we did not load the model from a
// checkpoint but instead built it from a network description
@@ -378,6 +382,14 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
// --- MAIN EPOCH LOOP
for (int i = startEpoch; i < (int) m_maxEpochs; i++) // TODO: why is this an int, and not a size_t?
{
+ // (re-)initialize 1-bit SGD
+ if (GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD &&
+ currentNumGradientBits != m_numGradientBits[i])
+ {
+ currentNumGradientBits = m_numGradientBits[i];
+ InitDistGradAgg(evaluationNodes.size(), currentNumGradientBits, m_traceLevel);
+ }
// Synchronize all ranks before proceeding to ensure that
// rank 0 has finished writing the previous model file
if (m_mpi != nullptr)
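The pattern introduced above: remember which bit-width the aggregator was built with, and rebuild it at an epoch boundary only when the schedule changes. A distilled, self-contained sketch (illustrative names, not the actual CNTK classes):

    #include <cstdio>
    #include <vector>

    // Hypothetical stand-in for InitDistGradAgg: (re)builds the gradient
    // aggregator for the given quantization bit-width.
    static void InitAggregator(int bits)
    {
        std::printf("(re)initializing aggregator for %d-bit quantization\n", bits);
    }

    int main()
    {
        std::vector<int> bitsSchedule{2, 1, 1, 1}; // e.g. expanded from "gradientBits = 2:1"
        const int startEpoch = 0, maxEpochs = 4;
        int currentBits = bitsSchedule[startEpoch]; // remember what the aggregator was built with
        InitAggregator(currentBits);
        for (int epoch = startEpoch; epoch < maxEpochs; epoch++)
        {
            if (bitsSchedule[epoch] != currentBits) // rebuild only when the schedule changes
            {
                currentBits = bitsSchedule[epoch];
                InitAggregator(currentBits);
            }
            // ... train one epoch with the current aggregator ...
        }
        return 0;
    }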
@@ -464,6 +476,8 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
criterionNodes, evaluationNodes,
inputMatrices, learnableNodes,
smoothedGradients, smoothedCounts, learningRateAdjustmentFactor);
+ if (m_traceLevel < 1 && chosenMinibatchSize != m_prevChosenMinibatchSize)
+ LOGPRINTF(stderr, "Minibatch size adapted to %d.\n", (int)chosenMinibatchSize);
m_prevChosenMinibatchSize = chosenMinibatchSize;
}
else
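This hunk is the "simplified logging of MB scaling" from the commit title: with m_traceLevel < 1, the adapted minibatch size is reported in a single line, and only when it differs from the previously chosen size. At trace level 1 and above this line is suppressed, since the adaptive search already reports its progress in detail through the traceLevel-gated LOGPRINTFs in SearchForBestMinibatchSize further down.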
@@ -476,9 +490,11 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
double momentumPerSample = GetMomentumPerSample(i /*BUGBUG workaround:*/, trainSetDataReader->GetNumParallelSequencesForFixingBPTTMode());
// time constant = number of samples after which a contribution has been reduced to e^-1
- double momentumAsTimeConstant = momentumPerSample == 0.0 ? 0.0
- : momentumPerSample >= 1.0 ? 0.0
- : -1.0 / log(momentumPerSample);
+ double momentumAsTimeConstant = momentumPerSample == 0.0
+ ? 0.0
+ : momentumPerSample >= 1.0
+ ? 0.0
+ : -1.0 / log(momentumPerSample);
if (m_traceLevel > 0)
{
fprintf(stderr, "\n");
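For reference, the math behind the (reformatted) expression above: with per-sample momentum m, a gradient contribution is scaled by m^T = e^(T ln m) after T further samples, so it decays to e^-1 at T = -1 / ln(m), which is exactly what the comment calls the time constant. The two guard branches map m = 0 (no momentum) and m >= 1 (no decay, hence no finite time constant) to 0.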
@@ -863,8 +879,8 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
LOGPRINTF(stderr, "Starting minibatch loop");
if (useGradientAggregation)
{
fprintf(stderr, ", DataParallelSGD training (MyRank = %d, NumNodes = %d, NumGradientBits = %d)",
(int) m_mpi->CurrentNodeRank(), (int) m_mpi->NumNodesInUse(), (int) m_numGradientBits);
fprintf(stderr, ", DataParallelSGD training (myRank = %d, numNodes = %d, numGradientBits = %d)",
(int) m_mpi->CurrentNodeRank(), (int) m_mpi->NumNodesInUse(), (int) m_numGradientBits[epochNumber]);
if (m_bufferedAsyncGradientAggregation)
fprintf(stderr, ", BufferedAsyncGradientAggregation is ENABLED");
@@ -1752,8 +1768,9 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Evaluating minibatchSizes %d..%d\n",
(int)epochNumber + 1, (int)RoundToMultipleOf64(minMinibatchSize), (int)RoundToMultipleOf64(maxMinibatchSize));
- size_t lastTriedTrialMinibatchSize = 0;
- EpochCriterion lastTriedTrialEpochCriterion(0);
+ size_t lastGoodMinibatchSize = 0;
+ size_t lastTriedMinibatchSize = 0;
+ EpochCriterion lastGoodEpochCriterion(0);
for (float trialMinibatchSizeFloat = (float) minMinibatchSize;
trialMinibatchSizeFloat <= maxMinibatchSize;
trialMinibatchSizeFloat *= minibatchSizeTuningFactor)
@@ -1770,6 +1787,7 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
// Train on a few minibatches and so we can observe the epochCriterion as we try increasing
// minibatches with iteration of this loop.
+ lastTriedMinibatchSize = trialMinibatchSize;
TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
numFramesToUseInSearch, trainSetDataReader,
learnRatePerSample, trialMinibatchSize, featureNodes,
@@ -1778,21 +1796,22 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
learnableNodes, smoothedGradients, smoothedCounts,
/*out*/ epochCriterion, /*out*/ epochEvalErrors,
isFirstIteration ? "BaseAdaptiveMinibatchSearch:" : "AdaptiveMinibatchSearch:");
+ lastTriedMinibatchSize = trialMinibatchSize;
if (isFirstIteration)
{
// for the first iteration of the loop only, set baseCriterion
// to the result we got from TrainOneMiniEpochAndReloadModel().
baseCriterion = epochCriterion;
- lastTriedTrialMinibatchSize = trialMinibatchSize;
- lastTriedTrialEpochCriterion = baseCriterion;
+ lastGoodMinibatchSize = trialMinibatchSize;
+ lastGoodEpochCriterion = baseCriterion;
isFirstIteration = false;
if (m_traceLevel > 0)
{
LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Computed baseCriterion %.8f for minibatchSize=%d\n",
(int)epochNumber + 1, baseCriterion.Average(), (int)trialMinibatchSize);
}
}
}
else if (!epochCriterion.IsNan() &&
epochCriterion.Average() > (baseCriterion.Average() * (1.0 + (m_minibatchSearchCriterionErrorMargin / 100.0))))
@@ -1805,8 +1824,8 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
}
else
{
- lastTriedTrialMinibatchSize = trialMinibatchSize;
- lastTriedTrialEpochCriterion = epochCriterion;
+ lastGoodMinibatchSize = trialMinibatchSize;
+ lastGoodEpochCriterion = epochCriterion;
if (m_traceLevel > 0 && trialMinibatchSizeFloat * minibatchSizeTuningFactor <= maxMinibatchSize)
{
LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Keep searching... epochCriterion = %.8f vs. baseCriterion = %.8f\n",
@@ -1814,10 +1833,27 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
}
}
}
LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Search successful. New minibatchSize is %d. epochCriterion = %.8f vs baseCriterion = %.8f\n",
(int)epochNumber+1, (int) lastTriedTrialMinibatchSize, lastTriedTrialEpochCriterion.Average(), baseCriterion.Average());
return lastTriedTrialMinibatchSize;
if (m_traceLevel > 0)
{
LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Search successful. New minibatchSize is %d. epochCriterion = %.8f vs baseCriterion = %.8f\n",
(int)epochNumber + 1, (int)lastGoodMinibatchSize, lastGoodEpochCriterion.Average(), baseCriterion.Average());
}
#if 1 // BUGBUG: Somehow state leaks across trials. Workaround: redo the last known good one to reset that. Helps somewhat until we fix this.
if (lastTriedMinibatchSize != lastGoodMinibatchSize)
{
std::vector<EpochCriterion> epochEvalErrors(evaluationNodes.size(), EpochCriterion::Infinity());
EpochCriterion epochCriterion(EpochCriterion::Infinity());
TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
numFramesToUseInSearch, trainSetDataReader,
learnRatePerSample, trialMinibatchSize, featureNodes,
labelNodes, criterionNodes,
evaluationNodes, inputMatrices,
learnableNodes, smoothedGradients, smoothedCounts,
/*out*/ epochCriterion, /*out*/ epochEvalErrors,
"FixMinibatchSearch:");
}
#endif
return lastGoodMinibatchSize;
}
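Summarizing the control flow of SearchForBestMinibatchSize after this change: trial sizes grow geometrically; the first trial sets the baseline criterion; the search stops once a trial is measurably worse than the baseline (beyond the configured margin); and, as a workaround for state leaking across trials, the last known-good size is re-run before being returned. A condensed, self-contained sketch, where TryMinibatchSize is a hypothetical stand-in for TrainOneMiniEpochAndReloadModel:

    #include <cmath>
    #include <cstdio>

    // Hypothetical stand-in: runs a short trial at the given minibatch size,
    // reloads the model afterwards, and returns the epoch criterion.
    static double TryMinibatchSize(int mbSize)
    {
        std::printf("trial with minibatchSize=%d\n", mbSize);
        return 1.0; // placeholder criterion
    }

    static int SearchForBestMinibatchSize(int minSize, int maxSize, double factor, double marginPercent)
    {
        double baseCriterion = 0.0;
        int lastGoodSize = 0, lastTriedSize = 0;
        bool first = true;
        for (double trial = minSize; trial <= maxSize; trial *= factor)
        {
            int mbSize = (int)trial;
            lastTriedSize = mbSize;
            double criterion = TryMinibatchSize(mbSize);
            if (first) // the first trial defines the baseline
            {
                baseCriterion = criterion;
                lastGoodSize = mbSize;
                first = false;
            }
            else if (!std::isnan(criterion) && criterion > baseCriterion * (1.0 + marginPercent / 100.0))
                break; // measurably worse than the baseline: stop searching
            else
                lastGoodSize = mbSize; // still acceptable: keep growing
        }
        if (lastTriedSize != lastGoodSize)
            TryMinibatchSize(lastGoodSize); // workaround: redo the last good trial to reset leaked state
        return lastGoodSize;
    }

    int main()
    {
        std::printf("chosen: %d\n", SearchForBestMinibatchSize(64, 1024, 2.0, 5.0));
        return 0;
    }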
// run training over a small subset of an epoch, used by automatic LR and MB-size tuning
@@ -1903,31 +1939,24 @@ void SGD<ElemType>::AttemptUtteranceDerivativeFeatures(ComputationNetworkPtr net,
}
template <class ElemType>
- void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int traceLevel)
+ void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int numGradientBits, int traceLevel)
{
- if (GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD)
- {
- if (m_distGradAgg == nullptr)
- {
+ assert(GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD);
+ if (traceLevel > 0)
+ fprintf(stderr, "Initializing dataParallelSGD for %d-bit quantization.\n", numGradientBits);
#ifdef CNTK_PARALLEL_TRAINING_SUPPORT
- m_distGradAgg = std::make_shared<AllReduceDistGradAggregator<ElemType>>(m_mpi, m_numGradientBits, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
+ m_distGradAgg = std::make_shared<AllReduceDistGradAggregator<ElemType>>(m_mpi, numGradientBits, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
#else
- if (m_numGradientBits != (8 * sizeof(ElemType)))
- {
- RuntimeError("Gradient quantization is unsupported in CNTK binaries built without quantized gradient aggregation support!");
- }
- m_distGradAgg = std::make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, m_syncStatsTrace);
- #endif // !CNTK_PARALLEL_TRAINING_SUPPORT
- }
- if (m_gradHeader == nullptr)
- {
- m_gradHeader.reset(DistGradHeader::Create(numEvalNodes), [](DistGradHeader* ptr) {
- DistGradHeader::Destroy(ptr);
- });
- }
+ if (numGradientBits != (8 * sizeof(ElemType)))
+ {
+ RuntimeError("Gradient quantization is unsupported in CNTK binaries built without quantized gradient aggregation support!");
+ }
+ m_distGradAgg = std::make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, m_syncStatsTrace);
+ #endif // !CNTK_PARALLEL_TRAINING_SUPPORT
+ m_gradHeader.reset(DistGradHeader::Create(numEvalNodes), [](DistGradHeader* ptr) { DistGradHeader::Destroy(ptr); });
}
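Worth noting about the restructured InitDistGradAgg: the m_distGradAgg == nullptr and m_gradHeader == nullptr guards are gone, so each call now unconditionally replaces the aggregator and gradient header. That is what makes it safe to call again from the epoch loop whenever the scheduled bit-width changes. The parallelization-method check becomes an assert because both call sites shown earlier already verify dataParallelSGD before calling, and the function takes the bit-width as a parameter instead of reading the now vector-valued member.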
template <class ElemType>
@@ -2649,7 +2678,7 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
// parallel training
m_parallelizationMethod = ParallelizationMethod::none;
- m_numGradientBits = 32;
+ m_numGradientBits = vector<int>{8 * (int)sizeofElemType}; // means no quantization
m_zeroThresholdFor1Bit = true;
m_bufferedAsyncGradientAggregation = false;
m_enableDistributedMBReading = false;
@@ -2685,7 +2714,20 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
- m_bufferedAsyncGradientAggregation = configDataParallelSGD(L"useBufferedAsyncGradientAggregation", false);
- if ( m_numGradientBits < 1 || m_numGradientBits > (8 * sizeofElemType) )
- {
- InvalidArgument("gradientBits must be in the range [1, 32] when using precision=float and in range [1, 64] when using precision=double!");
- }
+ const ConfigRecordType& configDataParallelSGD(configParallelTrain(L"DataParallelSGD", ConfigRecordType::Record()));
+ let defaultGradientBits = 8 * (int)sizeofElemType;
+ m_numGradientBits = configDataParallelSGD(L"gradientBits", ConfigRecordType::Array(intargvector(vector<int>{defaultGradientBits})));
+ m_zeroThresholdFor1Bit = configDataParallelSGD(L"useZeroThresholdFor1BitQuantization", true);
+ m_bufferedAsyncGradientAggregation = configDataParallelSGD(L"useBufferedAsyncGradientAggregation", false);
+ for (size_t i = 0; i < m_numGradientBits.size(); i++)
+ {
+ if (m_numGradientBits[i] < 1 || m_numGradientBits[i] > defaultGradientBits)
+ InvalidArgument("gradientBits values must be in the range [1, 32] when using precision=float and in range [1, 64] when using precision=double.");
+ }
}
if (configParallelTrain.Exists(L"ModelAveragingSGD"))

View file

@@ -264,7 +264,7 @@ protected:
int m_syncStatsTrace;
// Data parallel SGD training parameters
- int m_numGradientBits;
+ intargvector m_numGradientBits;
bool m_bufferedAsyncGradientAggregation;
bool m_zeroThresholdFor1Bit;
@@ -470,7 +470,7 @@ protected:
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
const std::string& prefixMsg = "");
- void InitDistGradAgg(int numEvalNodes, int traceLevel);
+ void InitDistGradAgg(int numEvalNodes, int numGradientBits, int traceLevel);
void InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE devID);
public:
// UpdateWeights() - actual weight update, implementing various update rules

View file

@@ -98,7 +98,7 @@ TrainConvNet = {
parallelizationMethod = "DataParallelSGD"
parallelizationStartEpoch = 1
distributedMBReading = true
- dataParallelSGD = { gradientBits = 2 }
+ dataParallelSGD = { gradientBits = 2:1 }
}
AutoAdjust = {
autoAdjustMinibatch = true # enable automatic growing of minibatch size
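The updated test config exercises the new schedule syntax: in gradientBits = 2:1, the colon-separated values are read per epoch, so epoch 1 aggregates with 2-bit quantization and later epochs with 1-bit (assuming the usual CNTK convention that the last value of such an array parameter repeats for all remaining epochs). The old scalar form, e.g. gradientBits = 2, still parses as a one-element schedule that applies to every epoch.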