From 0bbfdbef99e2e231acf8517c9e8ed7bc54c4bd09 Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Fri, 16 Sep 2016 19:50:15 -0700
Subject: [PATCH] cherry-picked: numGradientBits is now a vector; simplified logging of MB scaling

---
 Source/1BitSGD                            |   2 +-
 Source/SGDLib/SGD.cpp                     | 148 +++++++++++-------
 Source/SGDLib/SGD.h                       |   4 +-
 .../ImageHandsOn/ImageHandsOn_Task6.cntk  |   2 +-
 4 files changed, 99 insertions(+), 57 deletions(-)

diff --git a/Source/1BitSGD b/Source/1BitSGD
index 87767425a..f7afb8c6a 160000
--- a/Source/1BitSGD
+++ b/Source/1BitSGD
@@ -1 +1 @@
-Subproject commit 87767425a4ec3b93aa574295f5332460155d0d74
+Subproject commit f7afb8c6a08a6652d84de1b62377175788be5284
diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp
index a2aea4c38..793672b5b 100644
--- a/Source/SGDLib/SGD.cpp
+++ b/Source/SGDLib/SGD.cpp
@@ -141,15 +141,15 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
     else
     {
         LOGPRINTF(stderr, "Training criteria:\n");
-        for (const auto& node : criterionNodes)
-        {
-            LOGPRINTF(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
-        }
-        if (criterionNodes.empty())
-        {
-            LOGPRINTF(stderr, "\t(none)\n");
-            InvalidArgument("TrainOrAdaptModel: No criterion node was specified.");
-        }
+        for (const auto& node : criterionNodes)
+        {
+            LOGPRINTF(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
+        }
+        if (criterionNodes.empty())
+        {
+            LOGPRINTF(stderr, "\t(none)\n");
+            InvalidArgument("TrainOrAdaptModel: No criterion node was specified.");
+        }
     }
 
     // determine evaluationNodes from GetEvalCriterionNodes(), ensuring each criterion is only logged once
@@ -277,10 +277,10 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
     {
         fprintf(stderr, "out of %d parameter tensors and %d nodes with gradient:\n\n",
                 (int)learnableNodes.size(), (int)numNeedsGradient);
-        for (let nodeDescription : nodesToUpdateDescriptions)
-        {
-            LOGPRINTF(stderr, "\t%ls\n", nodeDescription.c_str());
-        }
+        for (let nodeDescription : nodesToUpdateDescriptions)
+        {
+            LOGPRINTF(stderr, "\t%ls\n", nodeDescription.c_str());
+        }
     }
 
     // one blank line before training progress log
@@ -302,16 +302,20 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
     for (int i = 0; i < m_numPrevLearnRates; i++)
         prevLearnRates[i] = -1.0;
 
+    m_prevChosenMinibatchSize = m_mbSize[startEpoch];
+
+    int currentNumGradientBits = 0; // this remembers the last #gradient bits we set for dataParallelSGD (init val 0 has no meaning, just keep compiler happy)
     if (GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD)
     {
-        InitDistGradAgg(evaluationNodes.size(), m_traceLevel);
+        currentNumGradientBits = m_numGradientBits[startEpoch]; // remember so that we can detect a change
+        InitDistGradAgg(evaluationNodes.size(), currentNumGradientBits, m_traceLevel);
     }
     else if (GetParallelizationMethod() == ParallelizationMethod::modelAveragingSGD ||
              GetParallelizationMethod() == ParallelizationMethod::blockMomentumSGD)
     {
         InitModelAggregationHandler(m_syncStatsTrace, net->GetDeviceId());
     }
-
+
     // precompute mean and invStdDev nodes and save initial model
     // When no precompute, only save if we did not load the model from a
     // checkpoint but instead built it from a network description
@@ -378,6 +382,14 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
     // --- MAIN EPOCH LOOP
     for (int i = startEpoch; i < (int) m_maxEpochs; i++) // TODO: why is this an int, and not a size_t?
     {
+        // (re-)initialize 1-bit SGD
+        if (GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD &&
+            currentNumGradientBits != m_numGradientBits[i])
+        {
+            currentNumGradientBits = m_numGradientBits[i];
+            InitDistGradAgg(evaluationNodes.size(), currentNumGradientBits, m_traceLevel);
+        }
+
         // Synchronize all ranks before proceeding to ensure that
         // rank 0 has finished writing the previous model file
         if (m_mpi != nullptr)
@@ -464,6 +476,8 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
                                                       criterionNodes, evaluationNodes, inputMatrices, learnableNodes,
                                                       smoothedGradients, smoothedCounts, learningRateAdjustmentFactor);
 
+            if (m_traceLevel < 1 && chosenMinibatchSize != m_prevChosenMinibatchSize)
+                LOGPRINTF(stderr, "Minibatch size adapted to %d.\n", (int)chosenMinibatchSize);
             m_prevChosenMinibatchSize = chosenMinibatchSize;
         }
         else
@@ -476,9 +490,11 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
         double momentumPerSample = GetMomentumPerSample(i /*BUGBUG workaround:*/, trainSetDataReader->GetNumParallelSequencesForFixingBPTTMode());
         // time constant = number of samples after which a contribution has been reduced to e^-1
-        double momentumAsTimeConstant = momentumPerSample == 0.0 ? 0.0
-                                        : momentumPerSample >= 1.0 ? 0.0
-                                        : -1.0 / log(momentumPerSample);
+        double momentumAsTimeConstant = momentumPerSample == 0.0
+                                        ? 0.0
+                                        : momentumPerSample >= 1.0
+                                        ? 0.0
+                                        : -1.0 / log(momentumPerSample);
         if (m_traceLevel > 0)
         {
             fprintf(stderr, "\n");
@@ -863,8 +879,8 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net,
         LOGPRINTF(stderr, "Starting minibatch loop");
         if (useGradientAggregation)
         {
-            fprintf(stderr, ", DataParallelSGD training (MyRank = %d, NumNodes = %d, NumGradientBits = %d)",
-                    (int) m_mpi->CurrentNodeRank(), (int) m_mpi->NumNodesInUse(), (int) m_numGradientBits);
+            fprintf(stderr, ", DataParallelSGD training (myRank = %d, numNodes = %d, numGradientBits = %d)",
+                    (int) m_mpi->CurrentNodeRank(), (int) m_mpi->NumNodesInUse(), (int) m_numGradientBits[epochNumber]);
 
             if (m_bufferedAsyncGradientAggregation)
                 fprintf(stderr, ", BufferedAsyncGradientAggregation is ENABLED");
@@ -1752,8 +1768,9 @@ size_t SGD::SearchForBestMinibatchSize(ComputationNetworkPtr net,
         LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Evaluating minibatchSizes %d..%d\n",
                   (int)epochNumber + 1, (int)RoundToMultipleOf64(minMinibatchSize), (int)RoundToMultipleOf64(maxMinibatchSize));
 
-    size_t lastTriedTrialMinibatchSize = 0;
-    EpochCriterion lastTriedTrialEpochCriterion(0);
+    size_t lastGoodMinibatchSize = 0;
+    size_t lastTriedMinibatchSize = 0;
+    EpochCriterion lastGoodEpochCriterion(0);
     for (float trialMinibatchSizeFloat = (float) minMinibatchSize;
          trialMinibatchSizeFloat <= maxMinibatchSize;
          trialMinibatchSizeFloat *= minibatchSizeTuningFactor)
@@ -1770,6 +1787,7 @@ size_t SGD::SearchForBestMinibatchSize(ComputationNetworkPtr net,
 
         // Train on a few minibatches and so we can observe the epochCriterion as we try increasing
         // minibatches with iteration of this loop.
+        lastTriedMinibatchSize = trialMinibatchSize;
         TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
                                         numFramesToUseInSearch, trainSetDataReader,
                                         learnRatePerSample, trialMinibatchSize, featureNodes,
@@ -1778,21 +1796,22 @@ size_t SGD::SearchForBestMinibatchSize(ComputationNetworkPtr net,
                                         labelNodes, criterionNodes,
                                         evaluationNodes, inputMatrices,
                                         learnableNodes, smoothedGradients, smoothedCounts,
                                         /*out*/ epochCriterion, /*out*/ epochEvalErrors,
                                         isFirstIteration ?
"BaseAdaptiveMinibatchSearch:" : "AdaptiveMinibatchSearch:"); + lastTriedMinibatchSize = trialMinibatchSize; if (isFirstIteration) { // for the first iteration of the loop only, set baseCriterion // to the result we got from TrainOneMiniEpochAndReloadModel(). baseCriterion = epochCriterion; - lastTriedTrialMinibatchSize = trialMinibatchSize; - lastTriedTrialEpochCriterion = baseCriterion; + lastGoodMinibatchSize = trialMinibatchSize; + lastGoodEpochCriterion = baseCriterion; isFirstIteration = false; if (m_traceLevel > 0) { LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Computed baseCriterion %.8f for minibatchSize=%d\n", (int)epochNumber + 1, baseCriterion.Average(), (int)trialMinibatchSize); - } + } } else if (!epochCriterion.IsNan() && epochCriterion.Average() > (baseCriterion.Average() * (1.0 + (m_minibatchSearchCriterionErrorMargin / 100.0)))) @@ -1805,8 +1824,8 @@ size_t SGD::SearchForBestMinibatchSize(ComputationNetworkPtr net, } else { - lastTriedTrialMinibatchSize = trialMinibatchSize; - lastTriedTrialEpochCriterion = epochCriterion; + lastGoodMinibatchSize = trialMinibatchSize; + lastGoodEpochCriterion = epochCriterion; if (m_traceLevel > 0 && trialMinibatchSizeFloat * minibatchSizeTuningFactor <= maxMinibatchSize) { LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Keep searching... epochCriterion = %.8f vs. baseCriterion = %.8f\n", @@ -1814,10 +1833,27 @@ size_t SGD::SearchForBestMinibatchSize(ComputationNetworkPtr net, } } } - LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Search successful. New minibatchSize is %d. epochCriterion = %.8f vs baseCriterion = %.8f\n", - (int)epochNumber+1, (int) lastTriedTrialMinibatchSize, lastTriedTrialEpochCriterion.Average(), baseCriterion.Average()); - - return lastTriedTrialMinibatchSize; + if (m_traceLevel > 0) + { + LOGPRINTF(stderr, " AdaptiveMinibatchSearch Epoch[%d]: Search successful. New minibatchSize is %d. epochCriterion = %.8f vs baseCriterion = %.8f\n", + (int)epochNumber + 1, (int)lastGoodMinibatchSize, lastGoodEpochCriterion.Average(), baseCriterion.Average()); + } +#if 1 // BUGBUG: Somehow state leaks across trials. Workaround: redo the last known good one to reset that. Helps somewhat until we fix this. 
+    if (lastTriedMinibatchSize != lastGoodMinibatchSize)
+    {
+        std::vector<EpochCriterion> epochEvalErrors(evaluationNodes.size(), EpochCriterion::Infinity());
+        EpochCriterion epochCriterion(EpochCriterion::Infinity());
+        TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
+                                        numFramesToUseInSearch, trainSetDataReader,
+                                        learnRatePerSample, trialMinibatchSize, featureNodes,
+                                        labelNodes, criterionNodes,
+                                        evaluationNodes, inputMatrices,
+                                        learnableNodes, smoothedGradients, smoothedCounts,
+                                        /*out*/ epochCriterion, /*out*/ epochEvalErrors,
+                                        "FixMinibatchSearch:");
+    }
+#endif
+    return lastGoodMinibatchSize;
 }
 
 // run training over a small subset of an epoch, used by automatic LR and MB-size tuning
@@ -1903,31 +1939,24 @@ void SGD::AttemptUtteranceDerivativeFeatures(ComputationNetworkPtr net
 }
 
 template <class ElemType>
-void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int traceLevel)
+void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int numGradientBits, int traceLevel)
 {
-    if (GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD)
-    {
-        if (m_distGradAgg == nullptr)
-        {
+    assert(GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD);
+    if (traceLevel > 0)
+        fprintf(stderr, "Initializing dataParallelSGD for %d-bit quantization.\n", numGradientBits);
+
 #ifdef CNTK_PARALLEL_TRAINING_SUPPORT
-            m_distGradAgg = std::make_shared>(m_mpi, m_numGradientBits, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
+    m_distGradAgg = std::make_shared>(m_mpi, numGradientBits, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
 #else
-            if (m_numGradientBits != (8 * sizeof(ElemType)))
-            {
-                RuntimeError("Gradient quantization is unsupported in CNTK binaries built without quantized gradient aggregation support!");
-            }
-
-            m_distGradAgg = std::make_shared>(m_mpi, m_bufferedAsyncGradientAggregation, m_syncStatsTrace);
-#endif // !CNTK_PARALLEL_TRAINING_SUPPORT
-        }
-
-        if (m_gradHeader == nullptr)
-        {
-            m_gradHeader.reset(DistGradHeader::Create(numEvalNodes), [](DistGradHeader* ptr) {
-                DistGradHeader::Destroy(ptr);
-            });
-        }
+    if (numGradientBits != (8 * sizeof(ElemType)))
+    {
+        RuntimeError("Gradient quantization is unsupported in CNTK binaries built without quantized gradient aggregation support!");
     }
+
+    m_distGradAgg = std::make_shared>(m_mpi, m_bufferedAsyncGradientAggregation, m_syncStatsTrace);
+#endif // !CNTK_PARALLEL_TRAINING_SUPPORT
+
+    m_gradHeader.reset(DistGradHeader::Create(numEvalNodes), [](DistGradHeader* ptr) { DistGradHeader::Destroy(ptr); });
 }
 
 template <class ElemType>
@@ -2649,7 +2678,7 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
     // parallel training
     m_parallelizationMethod = ParallelizationMethod::none;
-    m_numGradientBits = 32;
+    m_numGradientBits = vector<int>{8 * (int)sizeofElemType}; // means no quantization
     m_zeroThresholdFor1Bit = true;
     m_bufferedAsyncGradientAggregation = false;
     m_enableDistributedMBReading = false;
@@ -2685,7 +2714,20 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
             const ConfigRecordType& configDataParallelSGD(configParallelTrain(L"DataParallelSGD", ConfigRecordType::Record()));
+            let defaultGradientBits = 8 * (int)sizeofElemType;
+            m_numGradientBits = configDataParallelSGD(L"gradientBits", ConfigRecordType::Array(intargvector(vector<int>{defaultGradientBits})));
             m_zeroThresholdFor1Bit = configDataParallelSGD(L"useZeroThresholdFor1BitQuantization", true);
             m_bufferedAsyncGradientAggregation = configDataParallelSGD(L"useBufferedAsyncGradientAggregation", false);
-            if ( m_numGradientBits < 1 || m_numGradientBits > (8 * sizeofElemType) )
-            {
-                InvalidArgument("gradientBits must be in the range [1, 32] when using precision=float and in range [1, 64] when using precision=double!");
-            }
+            for (size_t i = 0; i < m_numGradientBits.size(); i++)
+            {
+                if (m_numGradientBits[i] < 1 || m_numGradientBits[i] > defaultGradientBits)
+                    InvalidArgument("gradientBits values must be in the range [1, 32] when using precision=float and in range [1, 64] when using precision=double.");
+            }
         }
     }
     if (configParallelTrain.Exists(L"ModelAveragingSGD"))
diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h
index ee4de6796..97d298877 100644
--- a/Source/SGDLib/SGD.h
+++ b/Source/SGDLib/SGD.h
@@ -264,7 +264,7 @@ protected:
     int m_syncStatsTrace;
 
     // Data parallel SGD training parameters
-    int m_numGradientBits;
+    intargvector m_numGradientBits;
     bool m_bufferedAsyncGradientAggregation;
     bool m_zeroThresholdFor1Bit;
 
@@ -470,7 +470,7 @@ protected:
                                          /*out*/ std::vector<EpochCriterion>& epochEvalErrors,
                                          const std::string& prefixMsg = "");
 
-    void InitDistGradAgg(int numEvalNodes, int traceLevel);
+    void InitDistGradAgg(int numEvalNodes, int numGradientBits, int traceLevel);
     void InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE devID);
 
 public:
     // UpdateWeights() - actual weight update, implementing various update rules
diff --git a/Tutorials/ImageHandsOn/ImageHandsOn_Task6.cntk b/Tutorials/ImageHandsOn/ImageHandsOn_Task6.cntk
index aec775deb..abfe7f1fc 100644
--- a/Tutorials/ImageHandsOn/ImageHandsOn_Task6.cntk
+++ b/Tutorials/ImageHandsOn/ImageHandsOn_Task6.cntk
@@ -98,7 +98,7 @@ TrainConvNet = {
         parallelizationMethod = "DataParallelSGD"
         parallelizationStartEpoch = 1
        distributedMBReading = true
-        dataParallelSGD = { gradientBits = 2 }
+        dataParallelSGD = { gradientBits = 2:1 }
     }
     AutoAdjust = {
        autoAdjustMinibatch = true       # enable automatic growing of minibatch size
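
Usage note on the new schedule syntax: gradientBits is now read into an intargvector, so it accepts a colon-separated per-epoch list such as the 2:1 value in the tutorial hunk above. The sketch below assumes the list is indexed by epoch and repeats its last entry for all remaining epochs, the convention used by other epoch-indexed SGD parameters such as minibatchSize; the enclosing SGD and ParallelTrain block names are assumed from a typical CNTK BrainScript configuration and are not part of the hunk itself.

    SGD = {
        ...
        ParallelTrain = {
            parallelizationMethod = "DataParallelSGD"
            parallelizationStartEpoch = 1
            distributedMBReading = true
            # 2-bit quantized gradients in epoch 1, 1-bit from epoch 2 onwards
            dataParallelSGD = { gradientBits = 2:1 }
        }
    }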