Updated ResNet samples.

2016-01-14 10:49:33 -08:00 · 2016-01-14 10:49:33 -08:00 · 949c30473b
--- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config
+++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config
@ -32,7 +32,9 @@ Train=[
    SGD=[
        epochSize=0
        minibatchSize=128
-        learningRatesPerMB=0.1*80:0.01*40:0.001
+        # Note that learning rates are 10x higher than in the paper: CNTK's momentum update rule scales the
+        # gradient by (1 - momentum), i.e. 0.1 here: v{t + 1} = lr*(1 - momentum)*g{t + 1} + momentum*v{t}
+        learningRatesPerMB=1.0*80:0.1*40:0.01
        momentumPerMB=0.9
        maxEpochs=160
        L2RegWeight=0.0001
@ -43,7 +45,7 @@ Train=[
            distributedMBReading=true
            parallelizationStartEpoch=1
            DataParallelSGD=[
-                gradientBits=1
+                gradientBits=32
            ]
        ]
    ]
--- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl
+++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl
@ -12,7 +12,8 @@ LocalMacros = [
    
    convWScale = 7.07
    convBValue = 0
-    fc1WScale = 12
+    
+    fc1WScale = 0.4
    fc1BValue = 0
    
    scValue = 1
@ -24,8 +25,6 @@ LocalMacros = [
    
    hStride1 = 1
    vStride1 = 1
-    hStride2 = 2
-    vStride2 = 2
 ]

 DNN=[
@ -38,14 +37,16 @@ DNN=[
    rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg)

    cMap2 = 32
-    rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
-    rn2_1 = ResNetNode2Inc(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, expAvg, rn2_1_Wproj)
+    #rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
+    #rn2_1 = ResNetNode2Inc(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, expAvg, rn2_1_Wproj)
+    rn2_1 = ResNetNode2Inc2(rn1_3, cMap1, cMap2, 144, 288, kW, kH, convWScale, 3.5, convBValue, scValue, expAvg)
    rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg)
    rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg)

    cMap3 = 64
-    rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
-    rn3_1 = ResNetNode2Inc(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, expAvg, rn3_1_Wproj)
+    #rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
+    #rn3_1 = ResNetNode2Inc(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, expAvg, rn3_1_Wproj)
+    rn3_1 = ResNetNode2Inc2(rn2_3, cMap2, cMap3, 288, 576, kW, kH, convWScale, 3.5, convBValue, scValue, expAvg)
    rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg)
    rn3_3 = ResNetNode2(rn3_2, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg)
                
--- a/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.config
+++ b/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.config
@ -32,9 +32,11 @@ Train=[
    SGD=[
        epochSize=0
        minibatchSize=128
-        learningRatesPerMB=0.1*80:0.01*40:0.001
+        # Note that learning rates are 10x higher than in the paper: CNTK's momentum update rule scales the
+        # gradient by (1 - momentum), i.e. 0.1 here: v{t + 1} = lr*(1 - momentum)*g{t + 1} + momentum*v{t}
+        learningRatesPerMB=0.1*1:1.0*80:0.1*40:0.01
        momentumPerMB=0.9
-        maxEpochs=1
+        maxEpochs=160
        L2RegWeight=0.0001
        dropoutRate=0
        
@ -43,7 +45,7 @@ Train=[
            distributedMBReading=true
            parallelizationStartEpoch=1
            DataParallelSGD=[
-                gradientBits=1
+                gradientBits=32
            ]
        ]
    ]
--- a/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.ndl
+++ b/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.ndl
@ -12,7 +12,8 @@ LocalMacros = [
    
    convWScale = 7.07
    convBValue = 0
-    fc1WScale = 12
+    
+    fc1WScale = 0.4
    fc1BValue = 0
    
    scValue = 1
@ -24,8 +25,6 @@ LocalMacros = [
    
    hStride1 = 1
    vStride1 = 1
-    hStride2 = 2
-    vStride2 = 2
 ]

 DNN=[
@ -53,8 +52,9 @@ DNN=[
    rn1_18= ResNetNode2(rn1_17, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg)

    cMap2 = 32
-    rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
-    rn2_1 = ResNetNode2Inc(rn1_18, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, expAvg, rn2_1_Wproj)
+    #rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
+    #rn2_1 = ResNetNode2Inc(rn1_18, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, expAvg, rn2_1_Wproj)
+    rn2_1 = ResNetNode2Inc2(rn1_18, cMap1, cMap2, 144, 288, kW, kH, convWScale, 3.5, convBValue, scValue, expAvg)
    rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg)
    rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg)
    rn2_4 = ResNetNode2(rn2_3, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg)
@ -74,8 +74,9 @@ DNN=[
    rn2_18= ResNetNode2(rn2_17, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg)

    cMap3 = 64
-    rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
-    rn3_1 = ResNetNode2Inc(rn2_18, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, expAvg, rn3_1_Wproj)
+    #rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
+    #rn3_1 = ResNetNode2Inc(rn2_18, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, expAvg, rn3_1_Wproj)
+    rn3_1 = ResNetNode2Inc2(rn2_18, cMap2, cMap3, 288, 576, kW, kH, convWScale, 3.5, convBValue, scValue, expAvg)
    rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg)
    rn3_3 = ResNetNode2(rn3_2, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg)
    rn3_4 = ResNetNode2(rn3_3, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg)
--- a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl
+++ b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl
@ -3,82 +3,70 @@ ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
    W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
    b = ImageParameter(1, 1, outMap, init = fixedValue, value = bValue, imageLayout = "cudnn")
    c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
-    p = Plus(c, b);
-    y = RectifiedLinear(p);
+    p = Plus(c, b)
+    y = RectifiedLinear(p)
 }

-ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, expAvg)
+ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, expAvg)
 {
-    W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
    b = Parameter(outMap, 1, init = fixedValue, value = bValue)
    sc = Parameter(outMap, 1, init = fixedValue, value = scValue)
    m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
    isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
    
    c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
-    bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn")
-    y = RectifiedLinear(bn);
+    y = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn")
+}
+
+ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, expAvg)
+{
+    W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
+    c = ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, expAvg)
+}
+
+ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, expAvg)
+{
+    c = ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, expAvg)
+    y = RectifiedLinear(c)
 }

 ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, expAvg)
 {
-    W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
-    b1 = Parameter(outMap, 1, init = fixedValue, value = bValue)
-    sc1 = Parameter(outMap, 1, init = fixedValue, value = scValue)
-    m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    
-    c1 = Convolution(W1, inp, kW, kH, outMap, 1, 1, zeroPadding = true, imageLayout = "cudnn")
-    bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn")
-    y1 = RectifiedLinear(bn1);
-    
-    W2 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
-    b2 = Parameter(outMap, 1, init = fixedValue, value = bValue)
-    sc2 = Parameter(outMap, 1, init = fixedValue, value = scValue)
-    m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    
-    c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true, imageLayout = "cudnn")
-    bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn")
-    p = Plus(bn2, inp)
-    y2 = RectifiedLinear(p);
+    # First convolution layer.
+    c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 1, 1, wScale, bValue, scValue, expAvg)
+    # Second convolution layer, no ReLU.
+    c2 = ConvBNLayer(c1, outMap, inWCount, kW, kH, 1, 1, wScale, bValue, scValue, expAvg)
+    p = Plus(c2, inp)
+    y = RectifiedLinear(p)
 }

 ResNetNode2Inc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, expAvg, Wproj)
 {
    # First convolution layer.
-    W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
-    b1 = Parameter(outMap, 1, init = fixedValue, value = bValue)
-    sc1 = Parameter(outMap, 1, init = fixedValue, value = scValue)
-    m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    
-    c1 = Convolution(W1, inp, kW, kH, outMap, 2, 2, zeroPadding = true, imageLayout = "cudnn")
-    bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn")
-    y1 = RectifiedLinear(bn1);
-    
-    # Second convolution layer.
-    W2 = Parameter(outMap, wCount, init = Gaussian, initValueScale = wScale)
-    b2 = Parameter(outMap, 1, init = fixedValue, value = bValue)
-    sc2 = Parameter(outMap, 1, init = fixedValue, value = scValue)
-    m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    
-    c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true, imageLayout = "cudnn")
-    bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn")
+    c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 2, 2, wScale, bValue, scValue, expAvg)
+    # Second convolution layer, no ReLU.
+    c2 = ConvBNLayer(c1, outMap, wCount, kW, kH, 1, 1, wScale, bValue, scValue, expAvg)
    
    # Projection convolution layer.
-    #b_proj = Parameter(outMap, 1, init = fixedValue, value = bValue)
-    #sc_proj = Parameter(outMap, 1, init = fixedValue, value = scValue)
-    #m_proj = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    #isd_proj = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
+    c_proj = ConvBNLayerW(Wproj, inp, outMap, 1, 1, 2, 2, bValue, scValue, expAvg)
+    #c_proj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = "cudnn")
    
-    c_proj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = "cudnn")
-    #bn_proj = BatchNormalization(c_proj, sc_proj, b_proj, m_proj, isd_proj, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn")
+    p = Plus(c2, c_proj)
+    y = RectifiedLinear(p)
+}
+
+ResNetNode2Inc2(inp, inMap, outMap, inWCount, wCount, kW, kH, wScale, w1Scale, bValue, scValue, expAvg)
+{
+    pool = MaxPooling(inp, 1, 1, 2, 2, imageLayout = "cudnn")
+    # First convolution layer.
+    c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 2, 2, wScale, bValue, scValue, expAvg)
+    # Second-stage convolutions, no ReLU: c2 is added to the pooled input, c3 is stacked onto that sum.
+    c2 = ConvBNLayer(c1, inMap, wCount, kW, kH, 1, 1, w1Scale, bValue, scValue, expAvg)
+    c3 = ConvBNLayer(c1, inMap, wCount, kW, kH, 1, 1, w1Scale, bValue, scValue, expAvg)
    
-    #p = Plus(bn2, bn_proj)
-    p = Plus(bn2, c_proj)
-    y2 = RectifiedLinear(p);
+    p = Plus(c2, pool)
+    r = RowStack(p, c3)
+    y = RectifiedLinear(r)
 }

 DnnReLULayer(inDim, outDim, x, wScale, bValue)
--- a/Examples/Image/Miscellaneous/CIFAR-10/Output/03_ResNet_Train_AddBNEval_Test.log.160
+++ b/Examples/Image/Miscellaneous/CIFAR-10/Output/03_ResNet_Train_AddBNEval_Test.log.160