From 0bbfdbef99e2e231acf8517c9e8ed7bc54c4bd09 Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Fri, 16 Sep 2016 19:50:15 -0700
Subject: [PATCH] cherry-picked: numGradientBits is now a vector; simplified logging of MB scaling

---
 Source/1BitSGD                            |   2 +-
 Source/SGDLib/SGD.cpp                     | 148 +++++++++++-------
 Source/SGDLib/SGD.h                       |   4 +-
 .../ImageHandsOn/ImageHandsOn_Task6.cntk  |   2 +-
 4 files changed, 99 insertions(+), 57 deletions(-)

diff --git a/Source/1BitSGD b/Source/1BitSGD
index 87767425a..f7afb8c6a 160000
--- a/Source/1BitSGD
+++ b/Source/1BitSGD
@@ -1 +1 @@
-Subproject commit 87767425a4ec3b93aa574295f5332460155d0d74
+Subproject commit f7afb8c6a08a6652d84de1b62377175788be5284
diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp
index a2aea4c38..793672b5b 100644
--- a/Source/SGDLib/SGD.cpp
+++ b/Source/SGDLib/SGD.cpp
@@ -141,15 +141,15 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
     else
     {
         LOGPRINTF(stderr, "Training criteria:\n");
-        for (const auto& node : criterionNodes)
-        {
-            LOGPRINTF(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
-        }
-        if (criterionNodes.empty())
-        {
-            LOGPRINTF(stderr, "\t(none)\n");
-            InvalidArgument("TrainOrAdaptModel: No criterion node was specified.");
-        }
+        for (const auto& node : criterionNodes)
+        {
+            LOGPRINTF(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
+        }
+        if (criterionNodes.empty())
+        {
+            LOGPRINTF(stderr, "\t(none)\n");
+            InvalidArgument("TrainOrAdaptModel: No criterion node was specified.");
+        }
     }
 
     // determine evaluationNodes from GetEvalCriterionNodes(), ensuring each criterion is only logged once
@@ -277,10 +277,10 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
     {
         fprintf(stderr, "out of %d parameter tensors and %d nodes with gradient:\n\n",
                 (int)learnableNodes.size(), (int)numNeedsGradient);
-        for (let nodeDescription : nodesToUpdateDescriptions)
-        {
-            LOGPRINTF(stderr, "\t%ls\n", nodeDescription.c_str());
-        }
+        for (let nodeDescription : nodesToUpdateDescriptions)
+        {
+            LOGPRINTF(stderr, "\t%ls\n", nodeDescription.c_str());
+        }
     }
 
     // one blank line before training progress log
@@ -302,16 +302,20 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
     for (int i = 0; i < m_numPrevLearnRates; i++)
         prevLearnRates[i] = -1.0;
 
+    m_prevChosenMinibatchSize = m_mbSize[startEpoch];
+
+    int currentNumGradientBits = 0; // this remembers the last #gradient bits we set for dataParallelSGD (init val 0 has no meaning, just keep compiler happy)
     if (GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD)
     {
-        InitDistGradAgg(evaluationNodes.size(), m_traceLevel);
+        currentNumGradientBits = m_numGradientBits[startEpoch]; // remember so that we can detect a change
+        InitDistGradAgg(evaluationNodes.size(), currentNumGradientBits, m_traceLevel);
     }
     else if (GetParallelizationMethod() == ParallelizationMethod::modelAveragingSGD ||
              GetParallelizationMethod() == ParallelizationMethod::blockMomentumSGD)
     {
         InitModelAggregationHandler(m_syncStatsTrace, net->GetDeviceId());
     }
-
+
     // precompute mean and invStdDev nodes and save initial model
     // When no precompute, only save if we did not load the model from a
     // checkpoint but instead built it from a network description
@@ -378,6 +382,14 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
     // --- MAIN EPOCH LOOP
     for (int i = startEpoch; i < (int) m_maxEpochs; i++) // TODO: why is this an int, and not a size_t?
     {
+        // (re-)initialize 1-bit SGD
+        if (GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD &&
+            currentNumGradientBits != m_numGradientBits[i])
+        {
+            currentNumGradientBits = m_numGradientBits[i];
+            InitDistGradAgg(evaluationNodes.size(), currentNumGradientBits, m_traceLevel);
+        }
+
         // Synchronize all ranks before proceeding to ensure that
         // rank 0 has finished writing the previous model file
         if (m_mpi != nullptr)
@@ -464,6 +476,8 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
                                                       criterionNodes, evaluationNodes, inputMatrices, learnableNodes,
                                                       smoothedGradients, smoothedCounts, learningRateAdjustmentFactor);
 
+            if (m_traceLevel < 1 && chosenMinibatchSize != m_prevChosenMinibatchSize)
+                LOGPRINTF(stderr, "Minibatch size adapted to %d.\n", (int)chosenMinibatchSize);
             m_prevChosenMinibatchSize = chosenMinibatchSize;
         }
         else
@@ -476,9 +490,11 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
         double momentumPerSample = GetMomentumPerSample(i /*BUGBUG workaround:*/, trainSetDataReader->GetNumParallelSequencesForFixingBPTTMode());
         // time constant = number of samples after which a contribution has been reduced to e^-1
-        double momentumAsTimeConstant = momentumPerSample == 0.0 ? 0.0
-                                        : momentumPerSample >= 1.0 ? 0.0
-                                        : -1.0 / log(momentumPerSample);
+        double momentumAsTimeConstant = momentumPerSample == 0.0
+                                        ? 0.0
+                                        : momentumPerSample >= 1.0
+                                        ? 0.0
+                                        : -1.0 / log(momentumPerSample);
         if (m_traceLevel > 0)
         {
             fprintf(stderr, "\n");
@@ -863,8 +879,8 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net,
         LOGPRINTF(stderr, "Starting minibatch loop");
         if (useGradientAggregation)
         {
-            fprintf(stderr, ", DataParallelSGD training (MyRank = %d, NumNodes = %d, NumGradientBits = %d)",
-                    (int) m_mpi->CurrentNodeRank(), (int) m_mpi->NumNodesInUse(), (int) m_numGradientBits);
+            fprintf(stderr, ", DataParallelSGD training (myRank = %d, numNodes = %d, numGradientBits = %d)",
+                    (int) m_mpi->CurrentNodeRank(), (int) m_mpi->NumNodesInUse(), (int) m_numGradientBits[epochNumber]);
 
             if (m_bufferedAsyncGradientAggregation)
                 fprintf(stderr, ", BufferedAsyncGradientAggregation is ENABLED");
@@ -1752,8 +1768,9 @@ size_t SGD::SearchForBestMinibatchSize(ComputationNetworkPtr net,
         LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Evaluating minibatchSizes %d..%d\n",
                   (int)epochNumber + 1, (int)RoundToMultipleOf64(minMinibatchSize), (int)RoundToMultipleOf64(maxMinibatchSize));
 
-    size_t lastTriedTrialMinibatchSize = 0;
-    EpochCriterion lastTriedTrialEpochCriterion(0);
+    size_t lastGoodMinibatchSize = 0;
+    size_t lastTriedMinibatchSize = 0;
+    EpochCriterion lastGoodEpochCriterion(0);
     for (float trialMinibatchSizeFloat = (float) minMinibatchSize;
          trialMinibatchSizeFloat <= maxMinibatchSize;
          trialMinibatchSizeFloat *= minibatchSizeTuningFactor)
@@ -1770,6 +1787,7 @@ size_t SGD::SearchForBestMinibatchSize(ComputationNetworkPtr net,
 
         // Train on a few minibatches and so we can observe the epochCriterion as we try increasing
         // minibatches with iteration of this loop.
+        lastTriedMinibatchSize = trialMinibatchSize;
         TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
                                         numFramesToUseInSearch, trainSetDataReader,
                                         learnRatePerSample, trialMinibatchSize, featureNodes,
@@ -1778,21 +1796,22 @@ size_t SGD::SearchForBestMinibatchSize(ComputationNetworkPtr net,
                                         labelNodes, criterionNodes,
                                         evaluationNodes, inputMatrices,
                                         learnableNodes, smoothedGradients, smoothedCounts,
                                         /*out*/ epochCriterion, /*out*/ epochEvalErrors,
                                         isFirstIteration ?
"BaseAdaptiveMinibatchSearch:" : "AdaptiveMinibatchSearch:"); + lastTriedMinibatchSize = trialMinibatchSize; if (isFirstIteration) { // for the first iteration of the loop only, set baseCriterion // to the result we got from TrainOneMiniEpochAndReloadModel(). baseCriterion = epochCriterion; - lastTriedTrialMinibatchSize = trialMinibatchSize; - lastTriedTrialEpochCriterion = baseCriterion; + lastGoodMinibatchSize = trialMinibatchSize; + lastGoodEpochCriterion = baseCriterion; isFirstIteration = false; if (m_traceLevel > 0) { LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Computed baseCriterion %.8f for minibatchSize=%d\n", (int)epochNumber + 1, baseCriterion.Average(), (int)trialMinibatchSize); - } + } } else if (!epochCriterion.IsNan() && epochCriterion.Average() > (baseCriterion.Average() * (1.0 + (m_minibatchSearchCriterionErrorMargin / 100.0)))) @@ -1805,8 +1824,8 @@ size_t SGD::SearchForBestMinibatchSize(ComputationNetworkPtr net, } else { - lastTriedTrialMinibatchSize = trialMinibatchSize; - lastTriedTrialEpochCriterion = epochCriterion; + lastGoodMinibatchSize = trialMinibatchSize; + lastGoodEpochCriterion = epochCriterion; if (m_traceLevel > 0 && trialMinibatchSizeFloat * minibatchSizeTuningFactor <= maxMinibatchSize) { LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Keep searching... epochCriterion = %.8f vs. baseCriterion = %.8f\n", @@ -1814,10 +1833,27 @@ size_t SGD::SearchForBestMinibatchSize(ComputationNetworkPtr net, } } } - LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Search successful. New minibatchSize is %d. epochCriterion = %.8f vs baseCriterion = %.8f\n", - (int)epochNumber+1, (int) lastTriedTrialMinibatchSize, lastTriedTrialEpochCriterion.Average(), baseCriterion.Average()); - - return lastTriedTrialMinibatchSize; + if (m_traceLevel > 0) + { + LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Search successful. New minibatchSize is %d. epochCriterion = %.8f vs baseCriterion = %.8f\n", + (int)epochNumber + 1, (int)lastGoodMinibatchSize, lastGoodEpochCriterion.Average(), baseCriterion.Average()); + } +#if 1 // BUGBUG: Somehow state leaks across trials. Workaround: redo the last known good one to reset that. Helps somewhat until we fix this. 
+    if (lastTriedMinibatchSize != lastGoodMinibatchSize)
+    {
+        std::vector<EpochCriterion> epochEvalErrors(evaluationNodes.size(), EpochCriterion::Infinity());
+        EpochCriterion epochCriterion(EpochCriterion::Infinity());
+        TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
+                                        numFramesToUseInSearch, trainSetDataReader,
+                                        learnRatePerSample, trialMinibatchSize, featureNodes,
+                                        labelNodes, criterionNodes,
+                                        evaluationNodes, inputMatrices,
+                                        learnableNodes, smoothedGradients, smoothedCounts,
+                                        /*out*/ epochCriterion, /*out*/ epochEvalErrors,
+                                        "FixMinibatchSearch:");
+    }
+#endif
+    return lastGoodMinibatchSize;
 }
 
 // run training over a small subset of an epoch, used by automatic LR and MB-size tuning
@@ -1903,31 +1939,24 @@ void SGD::AttemptUtteranceDerivativeFeatures(ComputationNetworkPtr net
 }
 
 template <class ElemType>
-void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int traceLevel)
+void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int numGradientBits, int traceLevel)
 {
-    if (GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD)
-    {
-        if (m_distGradAgg == nullptr)
-        {
+    assert(GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD);
+    if (traceLevel > 0)
+        fprintf(stderr, "Initializing dataParallelSGD for %d-bit quantization.\n", numGradientBits);
+
 #ifdef CNTK_PARALLEL_TRAINING_SUPPORT
-            m_distGradAgg = std::make_shared>(m_mpi, m_numGradientBits, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
+    m_distGradAgg = std::make_shared>(m_mpi, numGradientBits, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
 #else
-            if (m_numGradientBits != (8 * sizeof(ElemType)))
-            {
-                RuntimeError("Gradient quantization is unsupported in CNTK binaries built without quantized gradient aggregation support!");
-            }
-
-            m_distGradAgg = std::make_shared>(m_mpi, m_bufferedAsyncGradientAggregation, m_syncStatsTrace);
-#endif // !CNTK_PARALLEL_TRAINING_SUPPORT
-        }
-
-        if (m_gradHeader == nullptr)
-        {
-            m_gradHeader.reset(DistGradHeader::Create(numEvalNodes), [](DistGradHeader* ptr) {
-                DistGradHeader::Destroy(ptr);
-            });
-        }
+    if (numGradientBits != (8 * sizeof(ElemType)))
+    {
+        RuntimeError("Gradient quantization is unsupported in CNTK binaries built without quantized gradient aggregation support!");
     }
+
+    m_distGradAgg = std::make_shared>(m_mpi, m_bufferedAsyncGradientAggregation, m_syncStatsTrace);
+#endif // !CNTK_PARALLEL_TRAINING_SUPPORT
+
+    m_gradHeader.reset(DistGradHeader::Create(numEvalNodes), [](DistGradHeader* ptr) { DistGradHeader::Destroy(ptr); });
 }
 
 template <class ElemType>
@@ -2649,7 +2678,7 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
     // parallel training
     m_parallelizationMethod = ParallelizationMethod::none;
-    m_numGradientBits = 32;
+    m_numGradientBits = vector<int>{8 * (int)sizeofElemType}; // means no quantization
     m_zeroThresholdFor1Bit = true;
     m_bufferedAsyncGradientAggregation = false;
     m_enableDistributedMBReading = false;
@@ -2685,7 +2714,20 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
             const ConfigRecordType& configDataParallelSGD(configParallelTrain(L"DataParallelSGD", ConfigRecordType::Record()));
+            let defaultGradientBits = 8 * (int)sizeofElemType;
+            m_numGradientBits = configDataParallelSGD(L"gradientBits", ConfigRecordType::Array(intargvector(vector<int>{defaultGradientBits})));
             m_zeroThresholdFor1Bit = configDataParallelSGD(L"useZeroThresholdFor1BitQuantization", true);
             m_bufferedAsyncGradientAggregation = configDataParallelSGD(L"useBufferedAsyncGradientAggregation", false);
-            if ( m_numGradientBits < 1 || m_numGradientBits > (8 * sizeofElemType) )
-            {
-                InvalidArgument("gradientBits must be in the range [1, 32] when using precision=float and in range [1, 64] when using precision=double!");
-            }
+            for (size_t i = 0; i < m_numGradientBits.size(); i++)
+            {
+                if (m_numGradientBits[i] < 1 || m_numGradientBits[i] > defaultGradientBits)
+                    InvalidArgument("gradientBits values must be in the range [1, 32] when using precision=float and in range [1, 64] when using precision=double.");
+            }
         }
     }
     if (configParallelTrain.Exists(L"ModelAveragingSGD"))
diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h
index ee4de6796..97d298877 100644
--- a/Source/SGDLib/SGD.h
+++ b/Source/SGDLib/SGD.h
@@ -264,7 +264,7 @@ protected:
     int m_syncStatsTrace;
 
     // Data parallel SGD training parameters
-    int m_numGradientBits;
+    intargvector m_numGradientBits;
     bool m_bufferedAsyncGradientAggregation;
     bool m_zeroThresholdFor1Bit;
 
@@ -470,7 +470,7 @@ protected:
                                          /*out*/ std::vector<EpochCriterion>& epochEvalErrors,
                                          const std::string& prefixMsg = "");
 
-    void InitDistGradAgg(int numEvalNodes, int traceLevel);
+    void InitDistGradAgg(int numEvalNodes, int numGradientBits, int traceLevel);
     void InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE devID);
 
 public:
     // UpdateWeights() - actual weight update, implementing various update rules
diff --git a/Tutorials/ImageHandsOn/ImageHandsOn_Task6.cntk b/Tutorials/ImageHandsOn/ImageHandsOn_Task6.cntk
index aec775deb..abfe7f1fc 100644
--- a/Tutorials/ImageHandsOn/ImageHandsOn_Task6.cntk
+++ b/Tutorials/ImageHandsOn/ImageHandsOn_Task6.cntk
@@ -98,7 +98,7 @@ TrainConvNet = {
         parallelizationMethod = "DataParallelSGD"
         parallelizationStartEpoch = 1
        distributedMBReading = true
-        dataParallelSGD = { gradientBits = 2 }
+        dataParallelSGD = { gradientBits = 2:1 }
     }
     AutoAdjust = {
        autoAdjustMinibatch = true       # enable automatic growing of minibatch size
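
Usage note on the new schedule syntax: gradientBits is now read into an intargvector, so it accepts a colon-separated per-epoch list such as the 2:1 value in the tutorial hunk above. The sketch below assumes the list is indexed by epoch and repeats its last entry for all remaining epochs, the convention used by other epoch-indexed SGD parameters such as minibatchSize; the enclosing SGD and ParallelTrain block names are assumed from a typical CNTK BrainScript configuration and are not part of the hunk itself.

    SGD = {
        ...
        ParallelTrain = {
            parallelizationMethod = "DataParallelSGD"
            parallelizationStartEpoch = 1
            distributedMBReading = true
            # 2-bit quantized gradients in epoch 1, 1-bit from epoch 2 onwards
            dataParallelSGD = { gradientBits = 2:1 }
        }
    }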