Merge branch 'dongyu/needsGradientChange' of https://github.com/Microsoft/CNTK into fseide/s2s

Conflicts:
	Source/CNTK/tests.cpp
	Source/ComputationNetworkLib/ComputationNetwork.h
	Source/ComputationNetworkLib/ComputationNetworkEditing.cpp
	Source/ComputationNetworkLib/InputAndParamNodes.h
	Source/ComputationNetworkLib/PreComputeNodes.h
Dong Yu 2016-02-24 13:55:41 -08:00
Parents 14e005a982 aa74337d38
Commit 32004edfa2
18 changed files with 64 additions and 88 deletions

View file

@@ -1652,7 +1652,7 @@ LearnableParameter(row, cols,
\begin_layout Plain Layout
Parameter(row, cols, {needGradient=true|false,
Parameter(row, cols, {needsGradient=true|false,
\end_layout
\begin_layout Plain Layout
@@ -1680,7 +1680,7 @@ cols - number of columns in the parameter, defaults to 1.
\end_layout
\begin_layout Itemize
needGradient- [named optional] determines whether the parameter should be
needsGradient- [named optional] determines whether the parameter should be
updated by the training algorithm.
Defaults to true.
\end_layout
@@ -4221,7 +4221,7 @@ features=InputValue [784,32]
\begin_layout Plain Layout
L1.BFF.B=LearnableParameter [256,1] NeedGradient=true
L1.BFF.B=LearnableParameter [256,1] NeedsGradient=true
\end_layout
\begin_layout Plain Layout
@@ -4266,7 +4266,7 @@ L1.BFF.FF.T=Times ( L1.BFF.W , normInput )
\begin_layout Plain Layout
L1.BFF.W=LearnableParameter [256,784] NeedGradient=true
L1.BFF.W=LearnableParameter [256,784] NeedsGradient=true
\end_layout
\begin_layout Plain Layout

View file

@@ -4023,7 +4023,7 @@ In many cases, not all the gradients need to be computed.
updated and thus it is unnecessary to compute the gradients with regard
to these parameters.
We can reduce the gradient computation by keeping a
\begin_inset Formula $needGradient$
\begin_inset Formula $needsGradient$
\end_inset
flag for each node.
@@ -4032,7 +4032,7 @@ In many cases, not all the gradients need to be computed.
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:CN-NeedGradient"
reference "alg:CN-needsGradient"
\end_inset
@@ -4047,7 +4047,7 @@ reference "alg:CN-ForwardOrder-DAG"
and
\begin_inset CommandInset ref
LatexCommand ref
reference "alg:CN-NeedGradient"
reference "alg:CN-needsGradient"
\end_inset
@@ -4101,7 +4101,7 @@ status collapsed
\end_inset
UpdateNeedGradientFlag
UpdateNeedsGradientFlag
\begin_inset ERT
status collapsed
@@ -4427,7 +4427,7 @@ State
call
\noun on
UpdateNeedGradientFlag
UpdateNeedsGradientFlag
\noun default
(
\begin_inset Formula $c$
@@ -4511,7 +4511,7 @@ status open
\end_inset
\begin_inset Formula $node.AnyChildNeedGradient()$
\begin_inset Formula $node.AnyChildNeedsGradient()$
\end_inset
@@ -4566,7 +4566,7 @@ State
\noun default
\color inherit
\begin_inset Formula $node.needGradient\leftarrow true$
\begin_inset Formula $node.needsGradient\leftarrow true$
\end_inset
@@ -4626,7 +4626,7 @@ State
\noun default
\color inherit
\begin_inset Formula $node.needGradient\leftarrow false$
\begin_inset Formula $node.needsGradient\leftarrow false$
\end_inset
@@ -4730,14 +4730,14 @@ end{algorithmic}
\begin_layout Plain Layout
Update the
\begin_inset Formula $needGradient$
\begin_inset Formula $needsGradient$
\end_inset
flag recursively.
\begin_inset CommandInset label
LatexCommand label
name "alg:CN-NeedGradient"
name "alg:CN-needsGradient"
\end_inset
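The algorithm this label refers to propagates the flag bottom-up: a node needs a gradient if it is a learnable parameter that is actually being updated, or if any of its children needs one. A minimal C++ sketch of that pass, assuming a simplified Node type rather than the real ComputationNode interface (the names follow the pseudocode above, with conventional casing):

#include <vector>

// Simplified stand-in for ComputationNode; m_isUpdatedParameter corresponds to a
// LearnableParameter whose learningRateMultiplier is greater than 0.
struct Node
{
    std::vector<Node*> m_children;
    bool m_needsGradient = false;
    bool m_isUpdatedParameter = false;

    bool AnyChildNeedsGradient() const
    {
        for (const Node* child : m_children)
            if (child->m_needsGradient)
                return true;
        return false;
    }
};

// Visit nodes in evaluation (forward) order so every child is finalized before its
// parents; a node then needs a gradient iff it is itself updated or any child is.
void UpdateNeedsGradientFlag(const std::vector<Node*>& nodesInForwardOrder)
{
    for (Node* node : nodesInForwardOrder)
        node->m_needsGradient = node->m_isUpdatedParameter || node->AnyChildNeedsGradient();
}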

View file

@@ -17,8 +17,8 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, expAvg) = [
W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, expAvgFactor = expAvg)
y = RectifiedLinear(bn)
@@ -35,8 +35,8 @@ ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) =
ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, expAvg) = [
b = LearnableParameter(outMap, 1, init=fixedValue, value=bValue)
sc = LearnableParameter(outMap, 1, init=fixedValue, value=scValue)
m = LearnableParameter(outMap, 1, init=fixedValue, value=0, needGradient=false)
isd = LearnableParameter(outMap, 1, init=fixedValue, value=0, needGradient=false)
m = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)
isd = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true, imageLayout=$imageLayout$)
y = BatchNormalization(c, sc, b, m, isd, eval=false, spatial=true, expAvgFactor=expAvg, imageLayout=$imageLayout$)

View file

@@ -37,14 +37,14 @@ DNN=[
rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg)
cMap2 = 32
#rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
#rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", learningRateMultiplier = 0)
#rn2_1 = ResNetNode2Inc(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, expAvg, rn2_1_Wproj)
rn2_1 = ResNetNode2Inc2(rn1_3, cMap1, cMap2, 144, 288, kW, kH, convWScale, 3.5, convBValue, scValue, expAvg)
rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg)
rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg)
cMap3 = 64
#rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
#rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", learningRateMultiplier = 0)
#rn3_1 = ResNetNode2Inc(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, expAvg, rn3_1_Wproj)
rn3_1 = ResNetNode2Inc2(rn2_3, cMap2, cMap3, 288, 576, kW, kH, convWScale, 3.5, convBValue, scValue, expAvg)
rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg)

View file

@@ -52,7 +52,7 @@ DNN=[
rn1_18= ResNetNode2(rn1_17, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg)
cMap2 = 32
#rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
#rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", learningRateMultiplier = 0)
#rn2_1 = ResNetNode2Inc(rn1_18, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, expAvg, rn2_1_Wproj)
rn2_1 = ResNetNode2Inc2(rn1_18, cMap1, cMap2, 144, 288, kW, kH, convWScale, 3.5, convBValue, scValue, expAvg)
rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg)
@@ -74,7 +74,7 @@ DNN=[
rn2_18= ResNetNode2(rn2_17, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg)
cMap3 = 64
#rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
#rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", learningRateMultiplier = 0)
#rn3_1 = ResNetNode2Inc(rn2_18, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, expAvg, rn3_1_Wproj)
rn3_1 = ResNetNode2Inc2(rn2_18, cMap2, cMap3, 288, 576, kW, kH, convWScale, 3.5, convBValue, scValue, expAvg)
rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg)

View file

@@ -11,8 +11,8 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, expAvg)
{
b = Parameter(outMap, 1, init = fixedValue, value = bValue)
sc = Parameter(outMap, 1, init = fixedValue, value = scValue)
m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
y = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn")
@@ -83,8 +83,8 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, expAvg)
W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
b = Parameter(outDim, 1, init = fixedValue, value = bValue)
sc = Parameter(outDim, 1, init = fixedValue, value = scValue)
m = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
isd = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, expAvgFactor = expAvg)
y = RectifiedLinear(bn)

View file

@@ -2,8 +2,8 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, expAvg)
{
b = Parameter(outMap, 1, init = fixedValue, value = bValue)
sc = Parameter(outMap, 1, init = fixedValue, value = scValue)
m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
y = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, expAvgFactor = expAvg, epsilon = 0.000000001, imageLayout = "cudnn")

View file

@@ -45,14 +45,14 @@ DNN=[
rn1_3 = ResNetNode2A(rn1_2, cMap1, 576, kW, kH, convWScale, convBValue, scValue, expAvg)
cMap2 = 128
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj64to128Filename$", needGradient = false)
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj64to128Filename$", learningRateMultiplier = 0)
rn2_1 = ResNetNode2AInc(rn1_3, cMap2, 576, 1152, kW, kH, convWScale, convBValue, scValue, expAvg, rn2_1_Wproj)
rn2_2 = ResNetNode2A(rn2_1, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, expAvg)
rn2_3 = ResNetNode2A(rn2_2, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, expAvg)
rn2_4 = ResNetNode2A(rn2_3, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, expAvg)
cMap3 = 256
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj128to256Filename$", needGradient = false)
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj128to256Filename$", learningRateMultiplier = 0)
rn3_1 = ResNetNode2AInc(rn2_4, cMap3, 1152, 2304, kW, kH, convWScale, convBValue, scValue, expAvg, rn3_1_Wproj)
rn3_2 = ResNetNode2A(rn3_1, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, expAvg)
rn3_3 = ResNetNode2A(rn3_2, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, expAvg)
@@ -61,7 +61,7 @@ DNN=[
rn3_6 = ResNetNode2A(rn3_5, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, expAvg)
cMap4 = 512
rn4_1_Wproj = Parameter(cMap4, cMap3, init = fromFile, initFromFilePath = "$Proj256to512Filename$", needGradient = false)
rn4_1_Wproj = Parameter(cMap4, cMap3, init = fromFile, initFromFilePath = "$Proj256to512Filename$", learningRateMultiplier = 0)
rn4_1 = ResNetNode2AInc(rn3_6, cMap4, 2304, 4608, kW, kH, convWScale, convBValue, scValue, expAvg, rn4_1_Wproj)
rn4_2 = ResNetNode2A(rn4_1, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, expAvg)
rn4_3 = ResNetNode2A(rn4_2, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, expAvg)

View file

@@ -14,8 +14,8 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue)
W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
b = Parameter(outDim, 1, init = fixedValue, value = bValue)
sc = Parameter(outDim, 1, init = Gaussian, initValueScale = 0.01)
m = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
isd = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false)
y = RectifiedLinear(bn)
@@ -46,8 +46,8 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
b = Parameter(outMap, 1, init = fixedValue, value = bValue)
sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue)
m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, imageLayout = "cudnn")

View file

@@ -259,7 +259,7 @@ void TestConfiguration(const ConfigParameters& configBase)
size_t cols = 0;
if (!IsParameter(paramsMap, configNode[2]))
cols = configNode[2];
bool needGradient = false;
bool needsGradient = false;
bool init = false;
ConfigArray initData;
@@ -268,8 +268,8 @@ void TestConfiguration(const ConfigParameters& configBase)
{
ConfigParameters configParam = configNode[i];
// TODO: update to learningRateMultiplier
if (configParam.Exists("needGradient")) // TODO: should this be a test for 'true' rather than Exists()?
needGradient = true;
if (configParam.Exists("learningRateMultiplier")) // TODO: should this be a test for 'true' rather than Exists()?
needsGradient = (float)configParam("learningRateMultiplier") > 0? true : false;
else if (configParam.Exists("init"))
{
init = true;
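The hunk above replaces the legacy boolean needGradient option with learningRateMultiplier while still deriving a needsGradient value from it. A minimal sketch of that mapping, using only the ConfigParameters calls that appear in the surrounding code (the helper name NeedsGradientFromConfig is hypothetical):

// Hypothetical helper: a parameter needs a gradient iff its learningRateMultiplier
// (defaulting to 1, i.e. learnable) is positive. Mirrors the updated hunk above.
static bool NeedsGradientFromConfig(const ConfigParameters& configParam)
{
    float learningRateMultiplier = 1.0f;
    if (configParam.Exists("learningRateMultiplier"))
        learningRateMultiplier = (float)configParam("learningRateMultiplier");
    return learningRateMultiplier > 0;
}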

View file

@@ -328,7 +328,6 @@ public:
void AddFeatureNode(ComputationNodeBasePtr featureNode);
void RemoveFeatureNode(ComputationNodeBasePtr featureNode);
void SetLearnableNodesBelowLearningRateMultiplier(const float learningRateMultiplier, const ComputationNodeBasePtr& rootNode = nullptr);
void SetLearnableNodesBelowNeedGradient(const bool needGradient, const ComputationNodeBasePtr& rootNode = nullptr); // for backward compatibility
void SetBatchNormlizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode = nullptr);
// -----------------------------------------------------------------------

View file

@@ -323,32 +323,6 @@ void ComputationNetwork::SetLearnableNodesBelowLearningRateMultiplier(const floa
}
}
// sets m_learningRateMultiplier in all LearnableParameters feeding into the passed rootNode
// Called from MEL
// TODO: This function should be implemented using the above. No code dup please!
void ComputationNetwork::SetLearnableNodesBelowNeedGradient(const bool needGradient, const ComputationNodeBasePtr& rootNode)
{
// find nodes from all available nodes
if (rootNode == nullptr)
{
for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); nodeIter++)
{
ComputationNodeBasePtr node = nodeIter->second;
if (node->OperationName() == OperationNameOf(LearnableParameter))
node->SetLearningRateMultiplier((float)needGradient);
}
}
else
{
// for calculating a specific node
for (const auto& node : GetEvalOrder(rootNode))
{
if (node->OperationName() == OperationNameOf(LearnableParameter))
node->SetLearningRateMultiplier((float)needGradient);
}
}
}
void ComputationNetwork::SetBatchNormlizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode /* = nullptr */)
{
vector<ComputationNodeBasePtr> nodes;
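With SetLearnableNodesBelowNeedGradient removed in this hunk, callers (for example MEL scripts driving model editing) use the retained SetLearnableNodesBelowLearningRateMultiplier instead; the old boolean maps onto a multiplier of 1 (learn) or 0 (freeze). A minimal, hypothetical usage sketch assuming a ComputationNetwork instance net and a root node rootNode:

// Before (removed API):  net.SetLearnableNodesBelowNeedGradient(false, rootNode);
// Equivalent call today: a multiplier of 0 freezes every LearnableParameter that
// feeds into rootNode; any positive value keeps it trainable.
net.SetLearnableNodesBelowLearningRateMultiplier(0.0f, rootNode);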

View file

@@ -288,7 +288,7 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
{
for (auto nodeIter = m_nestedNodes.rbegin(); nodeIter != m_nestedNodes.rend(); ++nodeIter)
{
if ((*nodeIter)->NeedGradient())
if ((*nodeIter)->NeedsGradient())
(*nodeIter)->ReleaseMatricesAfterBackprop(matrixPool);
}
}
@@ -828,7 +828,7 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
// PAR mode: we can allocate and immediately deallocate one by one
n->AllocateGradientMatricesForInputs(m_matrixPool);
// Root node's information will be used and should not be shared with others, also it's small (1x1)
if ((n != trainRootNode) && n->NeedGradient())
if ((n != trainRootNode) && n->NeedsGradient())
n->ReleaseMatricesAfterBackprop(m_matrixPool);
}
}

View file

@@ -585,7 +585,7 @@ public:
fprintf(stderr, "Node --> %ls = %ls\n", NodeName().c_str(), OperationName().c_str()), fflush(stderr);
}
bool NeedGradient() const { return m_needsGradient; }
bool NeedsGradient() const { return m_needsGradient; }
void SetLearningRateMultiplier(float f)
{
@@ -1382,7 +1382,7 @@ public:
{
for (int i = 0; i < m_inputs.size(); i++)
{
if (m_inputs[i]->NeedGradient())
if (m_inputs[i]->NeedsGradient())
m_inputs[i]->RequestMatricesBeforeBackprop(matrixPool);
}
}

View file

@@ -65,7 +65,11 @@ public:
// TODO: Change dimensions to take a generic tensor instead. That will be a (minor) breaking change that will require fix-ups when converting from NDL to BrainScript.
AttachInputs(configp, this->GetExpectedNumInputs());
// parameters[rows, [cols=1]] plus other optional parameters (learningRateMultiplier=[1|0|float], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float])
SetLearningRateMultiplier(configp->Get(L"learningRateMultiplier"));
if (configp->Exists(L"learningRateMultiplier"))
SetLearningRateMultiplier(configp->Get(L"learningRateMultiplier"));
else if (configp->Exists(L"needsGradient") || configp->Exists(L"needGradient") || configp->Exists(L"computeGradient"))
InvalidArgument("needsGradient|needGradient|computeGradient are not supported in BrainScript. Use learningRateMultiplier instead.");
wstring initString = configp->Get(L"init");
if (initString == L"fixedValue")
Value().SetValue((ElemType) configp->Get(L"value"));
@@ -263,7 +267,7 @@ public:
char str[4096];
sprintf(str, "[%lu,%lu] ", GetAsMatrixNumRows(), GetAsMatrixNumCols());
fstream << string(str);
sprintf(str, "learningRateMultiplier=%f NeedGradient=%s", m_learningRateMultiplier, m_learningRateMultiplier>0 ? "true" : "false"); // TODO: update NDL to accept a better matching name as well
sprintf(str, "learningRateMultiplier=%f NeedsGradient=%s", m_learningRateMultiplier, m_learningRateMultiplier>0 ? "true" : "false"); // TODO: update NDL to accept a better matching name as well
fstream << string(str);
}

View file

@@ -366,7 +366,7 @@ public:
virtual void AllocateGradientMatricesForInputs(MatrixPool& matrixPool) override
{
// this is a special handling case. We need to allocate sparse matrix directly instead of from pool.
if (Input(0)->NeedGradient() && Input(1)->Value().GetMatrixType() == SPARSE)
if (Input(0)->NeedsGradient() && Input(1)->Value().GetMatrixType() == SPARSE)
{
Input(0)->CreateGradientMatrixIfNull();
Input(0)->Gradient().SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false);

View file

@@ -436,7 +436,7 @@ public:
virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) override
{
InvalidArgument("PerDimMeanVarNormalizationNode should only be called in the evaluation stage.");
InvalidArgument("PerDimMeanVarNormalizationNode should only be called in the evaluation stage. Is any of its descendents a learnable parameter that requires gradient?");
}
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
@@ -506,10 +506,6 @@ public:
InvalidArgument("PerDimMeanVarNormalizationNode: All inputs should have same sample layout.");
}
// TODO: Is this correct? Why not just skip propagating a gradient into these? We should not poke around in our children.
Input(1)->SetLearningRateMultiplier(0); // prevent learning
Input(2)->SetLearningRateMultiplier(0);
SetDims(Input(0));
}
};
@@ -540,7 +536,7 @@ public:
virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) override
{
InvalidArgument("PerDimMeanVarDeNormalizationNode should only be called in the evaluation stage.");
InvalidArgument("PerDimMeanVarDeNormalizationNode should only be called in the evaluation stage. Is any of its descendents a learnable parameter that requires gradient?");
}
// (feature-mean).*InvStdDev
@@ -618,10 +614,6 @@ public:
InvalidArgument("PerDimMeanVarDeNormalizationNode: All inputs should have same sample layout.");
}
// TODO: Is this correct? Why not just skip propagating a gradient into these? We should not poke around in our children.
Input(1)->SetLearningRateMultiplier(0); // prevent learning
Input(2)->SetLearningRateMultiplier(0);
SetDims(Input(0));
}
};

View file

@@ -519,10 +519,7 @@ public:
private:
void FindBestForwardAlgo(const CuDnnTensor4D& inT, const CuDnnFilter& filtT, const CuDnnConvolutionDescriptor& convDesc, const CuDnnTensor4D& outT)
{
// Need to re-run auto-tuner in case batch size has been changed.
// We assume no other dimensions of tensors can change so we don't check it.
// REVIEW alexeyk: is this a safe assumption? Can convolution configuration change in runtime?
if (m_fwdAlgo.Algo.status == CUDNN_STATUS_SUCCESS && inT.n() == m_fwdAlgo.CurMBSize && outT.n() == m_fwdAlgo.CurMBSize)
if (!m_fwdAlgo.NeedAutotuning(inT, outT))
return;
const int MaxAlgoCount = 10;
int calgo = 0;
@@ -543,7 +540,7 @@ private:
void FindBestBackwardDataAlgo(const CuDnnFilter& filtT, const CuDnnTensor4D& srcGradT, const CuDnnConvolutionDescriptor& convDesc, const CuDnnTensor4D& gradT)
{
if (m_backDataAlgo.Algo.status == CUDNN_STATUS_SUCCESS && srcGradT.n() == m_backDataAlgo.CurMBSize && gradT.n() == m_backDataAlgo.CurMBSize)
if (!m_backDataAlgo.NeedAutotuning(srcGradT, gradT))
return;
const int MaxAlgoCount = 10;
int calgo = 0;
@@ -564,7 +561,7 @@ private:
void FindBestBackwardFilterAlgo(const CuDnnTensor4D& inT, const CuDnnTensor4D& srcGradT, const CuDnnConvolutionDescriptor& convDesc, const CuDnnFilter& filtT)
{
if (m_backFiltAlgo.Algo.status == CUDNN_STATUS_SUCCESS && inT.n() == m_backFiltAlgo.CurMBSize && srcGradT.n() == m_backFiltAlgo.CurMBSize)
if (!m_backFiltAlgo.NeedAutotuning(inT, srcGradT))
return;
const int MaxAlgoCount = 10;
int calgo = 0;
@@ -595,6 +592,16 @@ private:
// Current mini-batch size, needed for re-computing statistics in auto-tuner.
size_t CurMBSize;
T Algo;
bool NeedAutotuning(const CuDnnTensor4D& t1, const CuDnnTensor4D& t2)
{
// Need to re-run auto-tuner in case minibatch size is increased.
// If minibatch size is decreased we assume that previously selected algorithm requires less or the same amount of workspace.
// This is done to avoid re-running auto-tuner every time in case minibatch size changes frequently (e.g. when distributed reading is enabled).
// REVIEW alexeyk: potentially, this might cause some perf issues if better (faster) algo can be selected for a smaller minibatch.
// We assume no other dimensions of tensors can change so we don't check it.
return (Algo.status != CUDNN_STATUS_SUCCESS || t1.n() > CurMBSize || t2.n() > CurMBSize);
}
};
using C = Consts<ElemType>;
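The new NeedAutotuning helper encodes a grow-only policy: the auto-tuner reruns only when the minibatch size exceeds the size the current algorithm was tuned for, since a smaller minibatch is assumed to fit in the workspace already selected. A simplified, self-contained analogue of that check (CachedAlgo is a stand-in, not the actual CuDnn wrapper types):

#include <cstddef>

struct CachedAlgo
{
    bool   valid = false;   // plays the role of Algo.status == CUDNN_STATUS_SUCCESS
    size_t tunedMBSize = 0; // plays the role of CurMBSize

    bool NeedAutotuning(size_t newMBSize) const
    {
        // Re-tune when nothing valid is cached or the minibatch has grown.
        return !valid || newMBSize > tunedMBSize;
    }
};

Tuned once at a minibatch of 64, a later minibatch of 32 reuses the cached algorithm, while a minibatch of 128 triggers a new search (after which the cached size would presumably be updated by the surrounding tuning code, which is not shown in this hunk).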