Squashed commit of the following:

Add a Python-side work-around to allow combining mean gradient with a per-sample learning rate specification under the new implementation: in Python, the legacy behavior of a per-sample learning rate together with mean gradient is achieved by overriding the schedule's per-sample reference minibatch size with ignored_minibatch_size. Also clarify the docstrings.
Yuqing Tang 2017-09-14 13:01:55 -07:00
Parent d9b0da2c4b
Commit a01f8cdb37
7 changed files with 60 additions and 45 deletions
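In essence (a minimal Python sketch of the intended behavior, assuming `import cntk as C`; the parameter `w` is a throwaway created only for illustration, and the first assertion mirrors the unit test further below):

import cntk as C
from cntk.learners import learning_rate_schedule, sgd, UnitType

w = C.parameter(shape=(1,), init=0.0)  # toy parameter, for illustration only

# Legacy combination: per-sample learning rate together with use_mean_gradient=True.
# With this change, the Python layer overrides the schedule's per-sample reference
# minibatch size with the special value ignored_minibatch_size (C.learners.IGNORE),
# so an already-mean gradient is not scaled a second time.
legacy = sgd([w], lr=learning_rate_schedule(0.1, UnitType.sample), use_mean_gradient=True)
assert legacy._learning_rate_schedule.minibatch_size == C.learners.IGNORE

# Preferred new-style equivalent: state the intent directly on the learner.
new_style = sgd([w], lr=0.1, minibatch_size=C.learners.IGNORE)
# expected: new_style.minibatch_size == C.learners.IGNORE, no hyper-parameter scaling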

View file

@@ -4460,13 +4460,13 @@ namespace CNTK
///
/// A special value that can be used for the minibatchSize to indicate that the reference minibatch size is not specified.
///
static const size_t UnspecifiedMinibatchSize = 0;
static const size_t IgnoredMinibatchSize = 0;
///
/// Create a schedule with a constant parameter value.
/// @param value a single value to populate the schedule
/// @param minibatchSize the minibatch size that the @e value is specified for.
///
CNTK_API TrainingParameterSchedule(T value, size_t minibatchSize = UnspecifiedMinibatchSize);
CNTK_API TrainingParameterSchedule(T value, size_t minibatchSize = IgnoredMinibatchSize);
#ifndef SWIG
///
@@ -4475,7 +4475,7 @@ namespace CNTK
/// and so on. The last value is then used repeatedly until the end of training.
/// @e minibatchSize is the minibatch size that each schedule[i] is specified for.
///
CNTK_API TrainingParameterSchedule(const std::vector<T>& schedule, size_t epochSize = FullDataSweep, size_t minibatchSize = UnspecifiedMinibatchSize);
CNTK_API TrainingParameterSchedule(const std::vector<T>& schedule, size_t epochSize = FullDataSweep, size_t minibatchSize = IgnoredMinibatchSize);
#endif
///
@@ -4488,7 +4488,7 @@ namespace CNTK
/// after which the value is switched to '0.005'.
/// @e minibatchSize is the minibatch size that each schedule[i] is specified for.
///
CNTK_API TrainingParameterSchedule(const std::vector<std::pair<size_t, T>>& schedule, size_t epochSize = FullDataSweep, size_t minibatchSize = UnspecifiedMinibatchSize);
CNTK_API TrainingParameterSchedule(const std::vector<std::pair<size_t, T>>& schedule, size_t epochSize = FullDataSweep, size_t minibatchSize = IgnoredMinibatchSize);
///
@@ -4635,7 +4635,7 @@ namespace CNTK
///
/// A special value that can be used for the minibatchSize to indicate that the reference minibatch size is not specified.
///
CNTK_API static const size_t UnspecifiedMinibatchSize;
CNTK_API static const size_t IgnoredMinibatchSize;
public:
//
@@ -4717,7 +4717,7 @@ namespace CNTK
///setting and be specialized to its own reference minibatch size. However, this is only suggested for advanced
///users.
CNTK_API void SetMinibatchSize(std::size_t minibatchSize) { GetOptions().Add(MinibatchSizeKey, minibatchSize); }
CNTK_API std::size_t GetMinibatchSize() const { return GetOptions().GetOrElse(MinibatchSizeKey, UnspecifiedMinibatchSize); }
CNTK_API std::size_t GetMinibatchSize() const { return GetOptions().GetOrElse(MinibatchSizeKey, IgnoredMinibatchSize); }
CNTK_API void SetLearningRateSchedule(const LearningRateSchedule& learningRateSchedule) { m_learningRateSchedule = learningRateSchedule; }
CNTK_API const LearningRateSchedule& GetLearningRateSchedule() const { return m_learningRateSchedule; }
@@ -4726,7 +4726,7 @@ namespace CNTK
template<typename T>
static bool IsCompatibleMode(const TrainingParameterSchedule<T>& schedule)
{
return schedule.GetMinibatchSize() == UnspecifiedMinibatchSize;
return schedule.GetMinibatchSize() == IgnoredMinibatchSize;
}
///
@@ -4737,7 +4737,7 @@ namespace CNTK
{
if (GetOptions().Contains(MinibatchSizeKey))
{
return GetMinibatchSize() == UnspecifiedMinibatchSize;
return GetMinibatchSize() == IgnoredMinibatchSize;
}
else
//if the learner minibatch size is not set, by default it is not in compatible mode.
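For reference, a hedged Python-side sketch of what compatible mode means for a learner (assuming `import cntk as C`; `w` is a toy parameter used only for illustration):

import cntk as C
w = C.parameter(shape=(1,), init=0.0)  # toy parameter
# A learner whose reference minibatch size is the special IgnoredMinibatchSize value
# (exposed in Python as C.learners.IGNORE) applies its hyper-parameters as-is,
# i.e. it runs in compatible mode with no rescaling.
learner = C.learners.sgd([w], lr=0.1, minibatch_size=C.learners.IGNORE)
assert learner.is_compatible_mode()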

View file

@@ -36,7 +36,7 @@ namespace CNTK
///
/// A special value that can be used for the minibatchSize to indicate that the reference minibatch size is not specified.
///
CNTK_API const size_t Learner::UnspecifiedMinibatchSize = TrainingParameterSchedule<double>::UnspecifiedMinibatchSize;
CNTK_API const size_t Learner::IgnoredMinibatchSize = TrainingParameterSchedule<double>::IgnoredMinibatchSize;
// This method completely replaces the current schedule with the new schedule. However, since

View file

@@ -162,13 +162,13 @@ void TestTrainingParametersSchedule()
assert(schedule3[100] == 0.3);
LearningRateSchedule schedule4(vector<double>{ 0.5 }, 10 ); // without vector<> gcc complains that conversion here is ambiguous
assert(schedule4.GetMinibatchSize() == LearningRateSchedule::UnspecifiedMinibatchSize);
assert(schedule4.GetMinibatchSize() == LearningRateSchedule::IgnoredMinibatchSize);
assert(schedule4[0] == 0.5);
assert(schedule4[10] == 0.5);
assert(schedule4[100] == 0.5);
LearningRateSchedule schedule5{ std::vector<double>{ 0.5, 0.3, 0.2 }, 10 };
assert(schedule5.GetMinibatchSize() == LearningRateSchedule::UnspecifiedMinibatchSize); //unspecified reference minibatch size is 0
assert(schedule5.GetMinibatchSize() == LearningRateSchedule::IgnoredMinibatchSize); //unspecified reference minibatch size is 0
assert(schedule5[0] == 0.5);
assert(schedule5[9] == 0.5);
assert(schedule5[10] == 0.3);
@@ -177,20 +177,20 @@ void TestTrainingParametersSchedule()
assert(schedule5[100] == 0.2);
MomentumSchedule schedule6{ { make_pair(1, 0.5) } }; // without make_pair this is interpreted as a vector of doubles
assert(schedule6.GetMinibatchSize() == MomentumSchedule::UnspecifiedMinibatchSize);
assert(schedule6.GetMinibatchSize() == MomentumSchedule::IgnoredMinibatchSize);
assert(schedule6[0] == 0.5);
assert(schedule6[10] == 0.5);
assert(schedule6[100] == 0.5);
LearningRateSchedule schedule7{ std::vector<std::pair<size_t, double>>{ { 1, 0.5 }, { 1, 0.3 }, { 1, 0.2 } } };
assert(schedule7.GetMinibatchSize() == LearningRateSchedule::UnspecifiedMinibatchSize);
assert(schedule7.GetMinibatchSize() == LearningRateSchedule::IgnoredMinibatchSize);
assert(schedule7[0] == 0.5);
assert(schedule7[1] == 0.3);
assert(schedule7[2] == 0.2);
assert(schedule7[100] == 0.2);
MomentumSchedule schedule8{ std::vector<std::pair<size_t, double>>{ { 1, 0.5 }, { 1, 0.3 }, { 1, 0.2 } }, 10 };
assert(schedule8.GetMinibatchSize() == MomentumSchedule::UnspecifiedMinibatchSize);
assert(schedule8.GetMinibatchSize() == MomentumSchedule::IgnoredMinibatchSize);
assert(schedule8[0] == 0.5);
assert(schedule8[9] == 0.5);
assert(schedule8[10] == 0.3);
@@ -208,7 +208,7 @@ void TestTrainingParametersSchedule()
assert(schedule9[100] == 0.2);
MomentumSchedule schedule10 = { std::vector<std::pair<size_t, double>>{ { 3, 0.5 }, { 2, 0.3 }, { 1, 0.2 } }, 10 };
assert(schedule10.GetMinibatchSize() == MomentumSchedule::UnspecifiedMinibatchSize);
assert(schedule10.GetMinibatchSize() == MomentumSchedule::IgnoredMinibatchSize);
assert(schedule10[0] == 0.5);
assert(schedule10[29] == 0.5);
assert(schedule10[30] == 0.3);

View file

@@ -117,10 +117,10 @@
%rename(l1_regularization_weight) CNTK::AdditionalLearningOptions::l1RegularizationWeight;
%rename(l2_regularization_weight) CNTK::AdditionalLearningOptions::l2RegularizationWeight;
%rename(unspecified_minibatch_size) CNTK::TrainingParameterSchedule<double>::UnspecifiedMinibatchSize;
%rename(unspecified_minibatch_size) CNTK::TrainingParameterSchedule<std::size_t>::UnspecifiedMinibatchSize;
%rename(ignored_minibatch_size) CNTK::TrainingParameterSchedule<double>::IgnoredMinibatchSize;
%rename(ignored_minibatch_size) CNTK::TrainingParameterSchedule<std::size_t>::IgnoredMinibatchSize;
%rename(_MINIBATCH_SIZE) CNTK::Learner::MinibatchSizeKey; // L"MinibatchSize"
%rename(unspecified_minibatch_size) CNTK::Learner::UnspecifiedMinibatchSize;
%rename(ignored_minibatch_size) CNTK::Learner::IgnoredMinibatchSize;
%rename(_options) CNTK::Learner::GetOptions;
%rename(ndcg_at_1) CNTK::NDCGAt1;

View file

@@ -107,7 +107,7 @@ class QLearning(AgentBaseClass):
self._q.parameters,
C.learners.learning_rate_schedule(
self._parameters.initial_eta, C.learners.UnitType.sample),
minibatch_size=minibatch_size,
use_mean_gradient=True,
momentum=C.learners.momentum_schedule(self._parameters.momentum),
variance_momentum=C.learners.momentum_schedule(0.999),
gradient_clipping_threshold_per_sample=

View file

@@ -154,7 +154,7 @@ class Learner(cntk_py.Learner):
_verify_learning_rate_type(learning_rate)
if not learning_rate.is_minibatch_size_explicitly_specified:
#If the schedule minibatch size is not explicitly specified, the learner's specification will take over
if self.minibatch_size is not None and self.minibatch_size != self.unspecified_minibatch_size:
if self.minibatch_size is not None and self.minibatch_size != self.ignored_minibatch_size:
learning_rate.minibatch_size = self.minibatch_size
return super(Learner, self).reset_learning_rate(learning_rate)
@@ -164,7 +164,7 @@ class Learner(cntk_py.Learner):
'''
return super(Learner, self).learning_rate()
IGNORE = Learner.unspecified_minibatch_size
IGNORE = Learner.ignored_minibatch_size
'''
Indicates that the minibatch size is ignored in the learner's hyper-parameter schedules.
'''
@@ -288,7 +288,7 @@ def training_parameter_schedule(schedule, unit=UnitType.minibatch, epoch_size=No
if unit == UnitType.sample:
ref_minibatch_size = 1
else: # unit == UnitType.minibatch
ref_minibatch_size = cntk_py.training_double_parameter_schedule.unspecified_minibatch_size
ref_minibatch_size = cntk_py.training_double_parameter_schedule.ignored_minibatch_size
if isinstance(schedule, cntk_py.training_double_parameter_schedule):
schedule.is_minibatch_size_explicitly_specified = True #legacy learning parameters always carry this specification
@@ -325,9 +325,9 @@ def learning_parameter_schedule(schedule, minibatch_size=None, epoch_size=None):
pair, i.e. [(num_epoch_1, p_1), (num_epoch_2, p_2), ..., (num_epoch_n, p_n)], the i-th parameter p_i is used as the
value from the (``epoch_size`` * (num_epoch_1 + ... + num_epoch_(i-1)) + 1)-th sample to the
(``epoch_size`` * (num_epoch_1 + ... + num_epoch_i))-th sample (taking the empty sum as 0 for i = 1).
minibatch_size (int): an integer to specify the reference minibatch size that schedule are designed for;
minibatch_size (int): an integer specifying the minibatch size that the schedule is designed for.
CNTK will scale the schedule internally so as to simulate the behavior of the schedule as much as possible
to match the designed effect. If it is not specified, CNTK will set to the special value cntk.learners.unspecified_minibatch_size.
to match the designed effect. If it is not specified, CNTK will set it to the special value :attr:`IGNORE`.
epoch_size (optional, int): number of samples as a scheduling unit.
Parameters in the schedule change their values every ``epoch_size``
samples. If no ``epoch_size`` is provided, this parameter is substituted
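A hedged usage sketch for the schedule API documented above (arbitrary values; assumes `from cntk.learners import learning_parameter_schedule`):

from cntk.learners import learning_parameter_schedule
# 0.05 for the first 100*512 samples, then 0.01, both calibrated for a reference
# minibatch size of 512; CNTK rescales the values when the actual minibatch size
# used during training differs from 512.
lr = learning_parameter_schedule([0.05, 0.01], minibatch_size=512, epoch_size=100 * 512)
# Omitting minibatch_size leaves the schedule at the special IGNORE value,
# i.e. the values are applied as-is without rescaling.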
@@ -514,17 +514,17 @@ def _infer_ref_minibatch_size_from_legacy_use_mean_gradient(ref_minibatch_size,
#if neither ref_minibatch_size nor the legacy use_mean_gradient is specified
return None
if ref_minibatch_size is not None:
if use_mean_gradient == True and ref_minibatch_size != cntk_py.Learner.unspecified_minibatch_size:
if use_mean_gradient == True and ref_minibatch_size != cntk_py.Learner.ignored_minibatch_size:
Warning(
'Learner reference minibatch size is specified while use_mean_gradient (deprecated option) is set to True. The learner reference minibatch size will override the mean gradient behavior')
#if the ref_minibatch_size is specified, it overrides the legacy use_mean_gradient specification
return ref_minibatch_size
elif use_mean_gradient is not None:
#if the ref_minibatch_size is NOT specified, the legacy use_mean_gradient specification takes effect
return cntk_py.Learner.unspecified_minibatch_size if use_mean_gradient is True else None
return cntk_py.Learner.ignored_minibatch_size if use_mean_gradient is True else None
return None
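A hedged sketch of the precedence rule above, seen from the public learner API (toy parameter and arbitrary values, for illustration only; assumes `import cntk as C`):

import cntk as C
w = C.parameter(shape=(1,), init=0.0)  # toy parameter
# When both are given, the explicit reference minibatch size wins over the
# deprecated use_mean_gradient flag (the helper above warns about this combination).
learner = C.learners.sgd([w], lr=0.1, minibatch_size=32, use_mean_gradient=True)
# expected: learner.minibatch_size == 32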
def _infer_learning_parameter_schedule(number_or_schedule, ref_minibatch_size, epoch_size):
def _infer_learning_parameter_schedule(number_or_schedule, ref_minibatch_size, epoch_size, use_mean_gradient=None):
#the input is a number, create a new training parameter
if isinstance(number_or_schedule, (int, float)) or \
(isinstance(number_or_schedule, list) and all(isinstance(r, (int, float, tuple)) for r in number_or_schedule)):
@@ -538,6 +538,13 @@ def _infer_learning_parameter_schedule(number_or_schedule, ref_minibatch_size, e
if not number_or_schedule.is_minibatch_size_explicitly_specified and ref_minibatch_size is not None:
#If the schedule minibatch size is not explicitly specified, the learner's specification will take over
number_or_schedule.minibatch_size = ref_minibatch_size
#for backward compatibility: use_mean_gradient = True and lr.unit = UnitType.sample
#this combination was there to avoid the double-scaling of gradients when the gradients are already mean gradients
if use_mean_gradient and number_or_schedule.minibatch_size == 1:
#override the learning rate's minibatch_size to IGNORE
number_or_schedule.minibatch_size = IGNORE
Warning('use_mean_gradient=True and learning_rate_schedule.unit=UnitType.sample is a deprecated combination. '
'Please use the new learner APIs: see https://www.cntk.ai/pythondocs/cntk.learners.html for details.')
return number_or_schedule
else:
raise ValueError('training parameter schedule type (%s) not supported. '
@@ -549,7 +556,7 @@ def _infer_learning_rate_schedule_and_ref_minibatch_size(use_mean_gradient, ref_
#a non-None ref_minibatch_size takes precedence; otherwise use_mean_gradient applies if it is True
ref_minibatch_size = _infer_ref_minibatch_size_from_legacy_use_mean_gradient(ref_minibatch_size, use_mean_gradient)
#if minibatch_size is not None, any schedule with an unspecified reference minibatch size will be overridden.
schedule = _infer_learning_parameter_schedule(schedule, ref_minibatch_size, epoch_size)
schedule = _infer_learning_parameter_schedule(schedule, ref_minibatch_size, epoch_size, use_mean_gradient)
_verify_learning_rate_type(schedule)
return schedule, ref_minibatch_size
@@ -588,8 +595,8 @@ def sgd(parameters, lr,
size is usually set to the same as the minibatch data source's size. CNTK will perform automatic scaling of the parameters
to enable efficient model parameter update implementation while approximate the behavior of pre-designed and pre-tuned parameters.
In case that minibatch_size is not specified, CNTK will inherit the minibatch size from the learning rate schedule;
if the learning rate schedule does not specify the minibatch_size, CNTK will set it to 1. Setting minibatch_size to 0
will have the parameters apply as it is preventing CNTK performing any parameter scaling. See also: :func:`learning_parameter_schedule`
if the learning rate schedule does not specify the minibatch_size, CNTK will set it to :attr:`IGNORE`. Setting minibatch_size to :attr:`IGNORE`
will have the learner apply the hyper-parameters as-is, preventing CNTK from performing any hyper-parameter scaling. See also: :func:`learning_parameter_schedule`
epoch_size (optional, int): number of samples as a scheduling unit for learning rate. See also: :func:`learning_parameter_schedule`
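A hedged sketch of the inheritance behavior described in this docstring (toy parameter and arbitrary values; assumes `import cntk as C`):

import cntk as C
w = C.parameter(shape=(1,), init=0.0)  # toy parameter
# When sgd() is not given a minibatch_size, it inherits the reference minibatch
# size from the learning-rate schedule, here 256.
lr = C.learners.learning_parameter_schedule(0.2, minibatch_size=256)
learner = C.learners.sgd([w], lr)
# expected: 0.2 is treated as calibrated for minibatches of 256 samples and is
# rescaled for the actual minibatch size seen during training.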
@@ -658,8 +665,8 @@ def momentum_sgd(parameters, lr, momentum, unit_gain=default_unit_gain_value(),
size is usually set to the same as the minibatch data source's size. CNTK will perform automatic scaling of the parameters
to enable efficient model parameter update implementation while approximate the behavior of pre-designed and pre-tuned parameters.
In case that minibatch_size is not specified, CNTK will inherit the minibatch size from the learning rate schedule;
if the learning rate schedule does not specify the minibatch_size, CNTK will set it to 1. Setting minibatch_size to 0
will have the parameters apply as it is preventing CNTK performing any parameter scaling.
if the learning rate schedule does not specify the minibatch_size, CNTK will set it to :attr:`IGNORE`. Setting minibatch_size to :attr:`IGNORE`
will have the learner apply the hyper-parameters as-is, preventing CNTK from performing any hyper-parameter scaling. See also: :func:`learning_parameter_schedule`
epoch_size (optional, int): number of samples as a scheduling unit for learning rate and momentum. See also: :func:`learning_parameter_schedule`
Returns:
@@ -727,8 +734,8 @@ def nesterov(parameters, lr, momentum, unit_gain=default_unit_gain_value(),
size is usually set to the same as the minibatch data source's size. CNTK will perform automatic scaling of the parameters
to enable efficient model parameter update implementation while approximate the behavior of pre-designed and pre-tuned parameters.
In case that minibatch_size is not specified, CNTK will inherit the minibatch size from the learning rate schedule;
if the learning rate schedule does not specify the minibatch_size, CNTK will set it to 1. Setting minibatch_size to 0
will have the parameters apply as it is preventing CNTK performing any parameter scaling.
if the learning rate schedule does not specify the minibatch_size, CNTK will set it to :attr:`IGNORE`. Setting minibatch_size to :attr:`IGNORE`
will have the learner apply the hyper-parameters as-is, preventing CNTK from performing any hyper-parameter scaling. See also: :func:`learning_parameter_schedule`
epoch_size (optional, int): number of samples as a scheduling unit for learning rate and momentum. See also: :func:`learning_parameter_schedule`
Returns:
@@ -801,9 +808,8 @@ def adadelta(parameters, lr=learning_rate_schedule(1, UnitType.sample), rho=0.95
size is usually set to the same as the minibatch data source's size. CNTK will perform automatic scaling of the parameters
to enable efficient model parameter update implementation while approximate the behavior of pre-designed and pre-tuned parameters.
In case that minibatch_size is not specified, CNTK will inherit the minibatch size from the learning rate schedule;
if the learning rate schedule does not specify the minibatch_size, CNTK will set it to 1. Setting minibatch_size to 0
will have the parameters apply as it is preventing CNTK performing any parameter scaling. If the learner's learning rate
schedule ``lr`` has its own specification of reference minibatch size, the learning rate schedule's specification takes precedence.
if the learning rate schedule does not specify the minibatch_size, CNTK will set it to :attr:`IGNORE`. Setting minibatch_size to :attr:`IGNORE`
will have the learner apply the hyper-parameters as-is, preventing CNTK from performing any hyper-parameter scaling. See also: :func:`learning_parameter_schedule`
epoch_size (optional, int): number of samples as a scheduling unit for learning rate. See also: :func:`learning_parameter_schedule`
Returns:
@@ -870,8 +876,8 @@ def adagrad(parameters, lr, need_ave_multiplier=True,
size is usually set to the same as the minibatch data source's size. CNTK will perform automatic scaling of the parameters
to enable efficient model parameter update implementation while approximate the behavior of pre-designed and pre-tuned parameters.
In case that minibatch_size is not specified, CNTK will inherit the minibatch size from the learning rate schedule;
if the learning rate schedule does not specify the minibatch_size, CNTK will set it to 1. Setting minibatch_size to 0
will have the parameters apply as it is preventing CNTK performing any parameter scaling.
if the learning rate schedule does not specify the minibatch_size, CNTK will set it to :attr:`IGNORE`. Setting minibatch_size to :attr:`IGNORE`
will have the learner apply the hyper-parameters as-is, preventing CNTK from performing any hyper-parameter scaling. See also: :func:`learning_parameter_schedule`
epoch_size (optional, int): number of samples as a scheduling unit for learning rate. See also: :func:`learning_parameter_schedule`
Returns:
@@ -945,8 +951,8 @@ def fsadagrad(parameters, lr, momentum, unit_gain=default_unit_gain_value(),
size is usually set to the same as the minibatch data source's size. CNTK will perform automatic scaling of the parameters
to enable efficient model parameter update implementation while approximate the behavior of pre-designed and pre-tuned parameters.
In case that minibatch_size is not specified, CNTK will inherit the minibatch size from the learning rate schedule;
if the learning rate schedule does not specify the minibatch_size, CNTK will set it to 1. Setting minibatch_size to 0
will have the parameters apply as it is preventing CNTK performing any parameter scaling.
if the learning rate schedule does not specify the minibatch_size, CNTK will set it to :attr:`IGNORE`. Setting minibatch_size to :attr:`IGNORE`
will have the learner apply the hyper-parameters as-is, preventing CNTK from performing any hyper-parameter scaling. See also: :func:`learning_parameter_schedule`
epoch_size (optional, int): number of samples as a scheduling unit for learning rate, momentum and variance_momentum. See also: :func:`learning_parameter_schedule`
Returns:
@@ -1025,8 +1031,8 @@ def adam(parameters, lr, momentum, unit_gain=default_unit_gain_value(),
size is usually set to the same as the minibatch data source's size. CNTK will perform automatic scaling of the parameters
to enable efficient model parameter update implementation while approximate the behavior of pre-designed and pre-tuned parameters.
In case that minibatch_size is not specified, CNTK will inherit the minibatch size from the learning rate schedule;
if the learning rate schedule does not specify the minibatch_size, CNTK will set it to 1. Setting minibatch_size to 0
will have the parameters apply as it is preventing CNTK performing any parameter scaling.
if the learning rate schedule does not specify the minibatch_size, CNTK will set it to :attr:`IGNORE`. Setting minibatch_size to :attr:`IGNORE`
will have the learner apply the hyper-parameters as-is, preventing CNTK from performing any hyper-parameter scaling. See also: :func:`learning_parameter_schedule`
epoch_size (optional, int): number of samples as a scheduling unit for learning rate, momentum and variance_momentum. See also: :func:`learning_parameter_schedule`
Returns:
@@ -1104,8 +1110,8 @@ def rmsprop(parameters, lr,
size is usually set to the same as the minibatch data source's size. CNTK will perform automatic scaling of the parameters
to enable efficient model parameter update implementation while approximate the behavior of pre-designed and pre-tuned parameters.
In case that minibatch_size is not specified, CNTK will inherit the minibatch size from the learning rate schedule;
if the learning rate schedule does not specify the minibatch_size, CNTK will set it to 1. Setting minibatch_size to 0
will have the parameters apply as it is preventing CNTK performing any parameter scaling.
if the learning rate schedule does not specify the minibatch_size, CNTK will set it to :attr:`IGNORE`. Setting minibatch_size to :attr:`IGNORE`
will have the learner apply the hyper-parameters as-is, preventing CNTK from performing any hyper-parameter scaling. See also: :func:`learning_parameter_schedule`
epoch_size (optional, int): number of samples as a scheduling unit for learning rate. See also: :func:`learning_parameter_schedule`
Returns:

View file

@@ -138,6 +138,15 @@ def test_learner_init_legacy():
assert learner.learning_rate() == 0.1
assert learner.minibatch_size == C.learners.IGNORE # the learner's reference minibatch size is still 0
# this will be deprecated in a future version: this is a logically invalid combination, but it was the only way to use mean gradient and set a per-sample learning rate in the past.
learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.sample), use_mean_gradient=True)
assert learner.is_compatible_mode() == True
assert learner.learning_rate() == 0.1
#test the override in the new version
assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
assert learner.minibatch_size == C.learners.IGNORE # the learner's reference minibatch size is still 0
# for backward-compatibility test
# this will be deprecated in a future version
# The UnitType will provide per-minibatch instructions for the learner