cherry-picked: numGradientBits is now a vector; simplified logging of MB scaling
This commit is contained in:
Родитель
dfcade2d8c
Коммит
0bbfdbef99
|
@ -1 +1 @@
|
|||
Subproject commit 87767425a4ec3b93aa574295f5332460155d0d74
|
||||
Subproject commit f7afb8c6a08a6652d84de1b62377175788be5284
|
|
@ -141,15 +141,15 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
else
|
||||
{
|
||||
LOGPRINTF(stderr, "Training criteria:\n");
|
||||
for (const auto& node : criterionNodes)
|
||||
{
|
||||
LOGPRINTF(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
|
||||
}
|
||||
if (criterionNodes.empty())
|
||||
{
|
||||
LOGPRINTF(stderr, "\t(none)\n");
|
||||
InvalidArgument("TrainOrAdaptModel: No criterion node was specified.");
|
||||
}
|
||||
for (const auto& node : criterionNodes)
|
||||
{
|
||||
LOGPRINTF(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
|
||||
}
|
||||
if (criterionNodes.empty())
|
||||
{
|
||||
LOGPRINTF(stderr, "\t(none)\n");
|
||||
InvalidArgument("TrainOrAdaptModel: No criterion node was specified.");
|
||||
}
|
||||
}
|
||||
|
||||
// determine evaluationNodes from GetEvalCriterionNodes(), ensuring each criterion is only logged once
|
||||
|
@ -277,10 +277,10 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
{
|
||||
fprintf(stderr, "out of %d parameter tensors and %d nodes with gradient:\n\n",
|
||||
(int)learnableNodes.size(), (int)numNeedsGradient);
|
||||
for (let nodeDescription : nodesToUpdateDescriptions)
|
||||
{
|
||||
LOGPRINTF(stderr, "\t%ls\n", nodeDescription.c_str());
|
||||
}
|
||||
for (let nodeDescription : nodesToUpdateDescriptions)
|
||||
{
|
||||
LOGPRINTF(stderr, "\t%ls\n", nodeDescription.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
// one blank line before training progress log
|
||||
|
@ -302,16 +302,20 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
for (int i = 0; i < m_numPrevLearnRates; i++)
|
||||
prevLearnRates[i] = -1.0;
|
||||
|
||||
m_prevChosenMinibatchSize = m_mbSize[startEpoch];
|
||||
|
||||
int currentNumGradientBits = 0; // this remembers the last #gradient bits we set for dataParallelSGD (init val 0 has no meaning, just keep compiler happy)
|
||||
if (GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD)
|
||||
{
|
||||
InitDistGradAgg(evaluationNodes.size(), m_traceLevel);
|
||||
currentNumGradientBits = m_numGradientBits[startEpoch]; // remember so that we can detect a change
|
||||
InitDistGradAgg(evaluationNodes.size(), currentNumGradientBits, m_traceLevel);
|
||||
}
|
||||
else if (GetParallelizationMethod() == ParallelizationMethod::modelAveragingSGD ||
|
||||
GetParallelizationMethod() == ParallelizationMethod::blockMomentumSGD)
|
||||
{
|
||||
InitModelAggregationHandler(m_syncStatsTrace, net->GetDeviceId());
|
||||
}
|
||||
|
||||
|
||||
// precompute mean and invStdDev nodes and save initial model
|
||||
// When no precompute, only save if we did not load the model from a
|
||||
// checkpoint but instead built it from a network description
|
||||
|
@ -378,6 +382,14 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
// --- MAIN EPOCH LOOP
|
||||
for (int i = startEpoch; i < (int) m_maxEpochs; i++) // TODO: why is this an int, and not a size_t?
|
||||
{
|
||||
// (re-)initialize 1-bit SGD
|
||||
if (GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD &&
|
||||
currentNumGradientBits != m_numGradientBits[i])
|
||||
{
|
||||
currentNumGradientBits = m_numGradientBits[i];
|
||||
InitDistGradAgg(evaluationNodes.size(), currentNumGradientBits, m_traceLevel);
|
||||
}
|
||||
|
||||
// Synchronize all ranks before proceeding to ensure that
|
||||
// rank 0 has finished writing the previous model file
|
||||
if (m_mpi != nullptr)
|
||||
|
@ -464,6 +476,8 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
criterionNodes, evaluationNodes,
|
||||
inputMatrices, learnableNodes,
|
||||
smoothedGradients, smoothedCounts, learningRateAdjustmentFactor);
|
||||
if (m_traceLevel < 1 && chosenMinibatchSize != m_prevChosenMinibatchSize)
|
||||
LOGPRINTF(stderr, "Minibatch size adapted to %d.\n", (int)chosenMinibatchSize);
|
||||
m_prevChosenMinibatchSize = chosenMinibatchSize;
|
||||
}
|
||||
else
|
||||
|
@ -476,9 +490,11 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
|
||||
double momentumPerSample = GetMomentumPerSample(i /*BUGBUG workaround:*/, trainSetDataReader->GetNumParallelSequencesForFixingBPTTMode());
|
||||
// time constant = number of samples after which a contribution has been reduced to e^-1
|
||||
double momentumAsTimeConstant = momentumPerSample == 0.0 ? 0.0
|
||||
: momentumPerSample >= 1.0 ? 0.0
|
||||
: -1.0 / log(momentumPerSample);
|
||||
double momentumAsTimeConstant = momentumPerSample == 0.0
|
||||
? 0.0
|
||||
: momentumPerSample >= 1.0
|
||||
? 0.0
|
||||
: -1.0 / log(momentumPerSample);
|
||||
if (m_traceLevel > 0)
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
|
@ -863,8 +879,8 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
|
|||
LOGPRINTF(stderr, "Starting minibatch loop");
|
||||
if (useGradientAggregation)
|
||||
{
|
||||
fprintf(stderr, ", DataParallelSGD training (MyRank = %d, NumNodes = %d, NumGradientBits = %d)",
|
||||
(int) m_mpi->CurrentNodeRank(), (int) m_mpi->NumNodesInUse(), (int) m_numGradientBits);
|
||||
fprintf(stderr, ", DataParallelSGD training (myRank = %d, numNodes = %d, numGradientBits = %d)",
|
||||
(int) m_mpi->CurrentNodeRank(), (int) m_mpi->NumNodesInUse(), (int) m_numGradientBits[epochNumber]);
|
||||
|
||||
if (m_bufferedAsyncGradientAggregation)
|
||||
fprintf(stderr, ", BufferedAsyncGradientAggregation is ENABLED");
|
||||
|
@ -1752,8 +1768,9 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
|
|||
LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Evaluating minibatchSizes %d..%d\n",
|
||||
(int)epochNumber + 1, (int)RoundToMultipleOf64(minMinibatchSize), (int)RoundToMultipleOf64(maxMinibatchSize));
|
||||
|
||||
size_t lastTriedTrialMinibatchSize = 0;
|
||||
EpochCriterion lastTriedTrialEpochCriterion(0);
|
||||
size_t lastGoodMinibatchSize = 0;
|
||||
size_t lastTriedMinibatchSize = 0;
|
||||
EpochCriterion lastGoodEpochCriterion(0);
|
||||
for (float trialMinibatchSizeFloat = (float) minMinibatchSize;
|
||||
trialMinibatchSizeFloat <= maxMinibatchSize;
|
||||
trialMinibatchSizeFloat *= minibatchSizeTuningFactor)
|
||||
|
@ -1770,6 +1787,7 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
|
|||
|
||||
// Train on a few minibatches so that we can observe the epochCriterion as we try increasing
|
||||
// minibatch sizes with each iteration of this loop.
|
||||
lastTriedMinibatchSize = trialMinibatchSize;
|
||||
TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
|
||||
numFramesToUseInSearch, trainSetDataReader,
|
||||
learnRatePerSample, trialMinibatchSize, featureNodes,
|
||||
|
@ -1778,21 +1796,22 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
|
|||
learnableNodes, smoothedGradients, smoothedCounts,
|
||||
/*out*/ epochCriterion, /*out*/ epochEvalErrors,
|
||||
isFirstIteration ? "BaseAdaptiveMinibatchSearch:" : "AdaptiveMinibatchSearch:");
|
||||
lastTriedMinibatchSize = trialMinibatchSize;
|
||||
|
||||
if (isFirstIteration)
|
||||
{
|
||||
// for the first iteration of the loop only, set baseCriterion
|
||||
// to the result we got from TrainOneMiniEpochAndReloadModel().
|
||||
baseCriterion = epochCriterion;
|
||||
lastTriedTrialMinibatchSize = trialMinibatchSize;
|
||||
lastTriedTrialEpochCriterion = baseCriterion;
|
||||
lastGoodMinibatchSize = trialMinibatchSize;
|
||||
lastGoodEpochCriterion = baseCriterion;
|
||||
isFirstIteration = false;
|
||||
|
||||
if (m_traceLevel > 0)
|
||||
{
|
||||
LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Computed baseCriterion %.8f for minibatchSize=%d\n",
|
||||
(int)epochNumber + 1, baseCriterion.Average(), (int)trialMinibatchSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (!epochCriterion.IsNan() &&
|
||||
epochCriterion.Average() > (baseCriterion.Average() * (1.0 + (m_minibatchSearchCriterionErrorMargin / 100.0))))
|
||||
|
@ -1805,8 +1824,8 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
|
|||
}
|
||||
else
|
||||
{
|
||||
lastTriedTrialMinibatchSize = trialMinibatchSize;
|
||||
lastTriedTrialEpochCriterion = epochCriterion;
|
||||
lastGoodMinibatchSize = trialMinibatchSize;
|
||||
lastGoodEpochCriterion = epochCriterion;
|
||||
if (m_traceLevel > 0 && trialMinibatchSizeFloat * minibatchSizeTuningFactor <= maxMinibatchSize)
|
||||
{
|
||||
LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Keep searching... epochCriterion = %.8f vs. baseCriterion = %.8f\n",
|
||||
|
@ -1814,10 +1833,27 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
|
|||
}
|
||||
}
|
||||
}
|
||||
LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Search successful. New minibatchSize is %d. epochCriterion = %.8f vs baseCriterion = %.8f\n",
|
||||
(int)epochNumber+1, (int) lastTriedTrialMinibatchSize, lastTriedTrialEpochCriterion.Average(), baseCriterion.Average());
|
||||
|
||||
return lastTriedTrialMinibatchSize;
|
||||
if (m_traceLevel > 0)
|
||||
{
|
||||
LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Search successful. New minibatchSize is %d. epochCriterion = %.8f vs baseCriterion = %.8f\n",
|
||||
(int)epochNumber + 1, (int)lastGoodMinibatchSize, lastGoodEpochCriterion.Average(), baseCriterion.Average());
|
||||
}
|
||||
#if 1 // BUGBUG: Somehow state leaks across trials. Workaround: redo the last known good one to reset that. Helps somewhat until we fix this.
|
||||
if (lastTriedMinibatchSize != lastGoodMinibatchSize)
|
||||
{
|
||||
std::vector<EpochCriterion> epochEvalErrors(evaluationNodes.size(), EpochCriterion::Infinity());
|
||||
EpochCriterion epochCriterion(EpochCriterion::Infinity());
|
||||
TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
|
||||
numFramesToUseInSearch, trainSetDataReader,
|
||||
learnRatePerSample, trialMinibatchSize, featureNodes,
|
||||
labelNodes, criterionNodes,
|
||||
evaluationNodes, inputMatrices,
|
||||
learnableNodes, smoothedGradients, smoothedCounts,
|
||||
/*out*/ epochCriterion, /*out*/ epochEvalErrors,
|
||||
"FixMinibatchSearch:");
|
||||
}
|
||||
#endif
|
||||
return lastGoodMinibatchSize;
|
||||
}
|
||||
|
||||
// run training over a small subset of an epoch, used by automatic LR and MB-size tuning
|
||||
|
@ -1903,31 +1939,24 @@ void SGD<ElemType>::AttemptUtteranceDerivativeFeatures(ComputationNetworkPtr net
|
|||
}
|
||||
|
||||
template <class ElemType>
|
||||
void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int traceLevel)
|
||||
void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int numGradientBits, int traceLevel)
|
||||
{
|
||||
if (GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD)
|
||||
{
|
||||
if (m_distGradAgg == nullptr)
|
||||
{
|
||||
assert(GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD);
|
||||
if (traceLevel > 0)
|
||||
fprintf(stderr, "Initializing dataParallelSGD for %d-bit quantization.\n", numGradientBits);
|
||||
|
||||
#ifdef CNTK_PARALLEL_TRAINING_SUPPORT
|
||||
m_distGradAgg = std::make_shared<AllReduceDistGradAggregator<ElemType>>(m_mpi, m_numGradientBits, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
|
||||
m_distGradAgg = std::make_shared<AllReduceDistGradAggregator<ElemType>>(m_mpi, numGradientBits, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
|
||||
#else
|
||||
if (m_numGradientBits != (8 * sizeof(ElemType)))
|
||||
{
|
||||
RuntimeError("Gradient quantization is unsupported in CNTK binaries built without quantized gradient aggregation support!");
|
||||
}
|
||||
|
||||
m_distGradAgg = std::make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, m_syncStatsTrace);
|
||||
#endif // !CNTK_PARALLEL_TRAINING_SUPPORT
|
||||
}
|
||||
|
||||
if (m_gradHeader == nullptr)
|
||||
{
|
||||
m_gradHeader.reset(DistGradHeader::Create(numEvalNodes), [](DistGradHeader* ptr) {
|
||||
DistGradHeader::Destroy(ptr);
|
||||
});
|
||||
}
|
||||
if (numGradientBits != (8 * sizeof(ElemType)))
|
||||
{
|
||||
RuntimeError("Gradient quantization is unsupported in CNTK binaries built without quantized gradient aggregation support!");
|
||||
}
|
||||
|
||||
m_distGradAgg = std::make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, m_syncStatsTrace);
|
||||
#endif // !CNTK_PARALLEL_TRAINING_SUPPORT
|
||||
|
||||
m_gradHeader.reset(DistGradHeader::Create(numEvalNodes), [](DistGradHeader* ptr) { DistGradHeader::Destroy(ptr); });
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
|
@ -2649,7 +2678,7 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
|
|||
|
||||
// parallel training
|
||||
m_parallelizationMethod = ParallelizationMethod::none;
|
||||
m_numGradientBits = 32;
|
||||
m_numGradientBits = vector<int>{8 * (int)sizeofElemType}; // means no quantization
|
||||
m_zeroThresholdFor1Bit = true;
|
||||
m_bufferedAsyncGradientAggregation = false;
|
||||
m_enableDistributedMBReading = false;
|
||||
|
@ -2685,7 +2714,20 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
|
|||
m_bufferedAsyncGradientAggregation = configDataParallelSGD(L"useBufferedAsyncGradientAggregation", false);
|
||||
if ( m_numGradientBits < 1 || m_numGradientBits > (8 * sizeofElemType) )
|
||||
{
|
||||
<<<<<<< HEAD
|
||||
InvalidArgument("gradientBits must be in the range [1, 32] when using precision=float and in range [1, 64] when using precision=double!");
|
||||
=======
|
||||
const ConfigRecordType& configDataParallelSGD(configParallelTrain(L"DataParallelSGD", ConfigRecordType::Record()));
|
||||
let defaultGradientBits = 8 * (int)sizeofElemType;
|
||||
m_numGradientBits = configDataParallelSGD(L"gradientBits", ConfigRecordType::Array(intargvector(vector<int>{defaultGradientBits})));
|
||||
m_zeroThresholdFor1Bit = configDataParallelSGD(L"useZeroThresholdFor1BitQuantization", true);
|
||||
m_bufferedAsyncGradientAggregation = configDataParallelSGD(L"useBufferedAsyncGradientAggregation", false);
|
||||
for (size_t i = 0; i < m_numGradientBits.size(); i++)
|
||||
{
|
||||
if (m_numGradientBits[i] < 1 || m_numGradientBits[i] > defaultGradientBits)
|
||||
InvalidArgument("gradientBits values must be in the range [1, 32] when using precision=float and in range [1, 64] when using precision=double.");
|
||||
}
|
||||
>>>>>>> 5316380... numGradientBits is now a vector;
|
||||
}
|
||||
}
|
||||
if (configParallelTrain.Exists(L"ModelAveragingSGD"))
|
||||
|
|
|
@ -264,7 +264,7 @@ protected:
|
|||
int m_syncStatsTrace;
|
||||
|
||||
// Data parallel SGD training parameters
|
||||
int m_numGradientBits;
|
||||
intargvector m_numGradientBits;
|
||||
bool m_bufferedAsyncGradientAggregation;
|
||||
bool m_zeroThresholdFor1Bit;
|
||||
|
||||
|
@ -470,7 +470,7 @@ protected:
|
|||
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
|
||||
const std::string& prefixMsg = "");
|
||||
|
||||
void InitDistGradAgg(int numEvalNodes, int traceLevel);
|
||||
void InitDistGradAgg(int numEvalNodes, int numGradientBits, int traceLevel);
|
||||
void InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE devID);
|
||||
public:
|
||||
// UpdateWeights() - actual weight update, implementing various update rules
|
||||
|
|
|
@ -98,7 +98,7 @@ TrainConvNet = {
|
|||
parallelizationMethod = "DataParallelSGD"
|
||||
parallelizationStartEpoch = 1
|
||||
distributedMBReading = true
|
||||
dataParallelSGD = { gradientBits = 2 }
|
||||
dataParallelSGD = { gradientBits = 2:1 }
|
||||
}
|
||||
AutoAdjust = {
|
||||
autoAdjustMinibatch = true # enable automatic growing of minibatch size
|
||||
|
|
Загрузка…
Ссылка в новой задаче