diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ProjWeightsGen.py b/Examples/Image/Miscellaneous/CIFAR-10/03_ProjWeightsGen.py
new file mode 100644
index 000000000..a208adcbc
--- /dev/null
+++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ProjWeightsGen.py
@@ -0,0 +1,12 @@
+import sys
+import numpy as np
+
+def writeConvWeights(fname, cmapIn):
+    cmapOut = 2 * cmapIn
+    w = np.eye(cmapOut, cmapIn)
+    np.savetxt(fname, w, fmt = '%d', delimiter = ' ')
+
+if __name__ == "__main__":
+    cmapIn = int(sys.argv[1])
+    fname = sys.argv[2]
+    writeConvWeights(fname, cmapIn)
\ No newline at end of file
diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config
index 3465ce2db..7b8d259d4 100644
--- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config
+++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config
@@ -17,6 +17,9 @@ command=Train:AddBNEval:Test
 stderr=$OutputDir$/03_ResNet
 traceLevel=1
 
+Proj16to32Filename = $ConfigDir$/16to32.txt
+Proj32to64Filename = $ConfigDir$/32to64.txt
+
 Train=[
     action=train
     modelPath=$ModelDir$/03_ResNet
@@ -39,7 +42,7 @@ Train=[
         distributedMBReading=true
         parallelizationStartEpoch=1
         DataParallelSGD=[
-            gradientBits=32
+            gradientBits=1
         ]
     ]
diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl
index 8ff6af0ca..013b0cfb8 100644
--- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl
+++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl
@@ -32,20 +32,22 @@ DNN=[
     cMap1 = 16
     conv1 = ConvBNReLULayer2(featScaled, cMap1, 27, kW, kH, hStride1, vStride1, convWScale, convBValue, scValue)
 
-    rn1_1 = ResNetNode2(conv1, cMap1, 144, kW, kH, convWScale, convBValue, scValue)
-    rn1_2 = ResNetNode2(rn1_1, cMap1, 144, kW, kH, convWScale, convBValue, scValue)
-    rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue)
+    rn1_1 = ResNetNode2(conv1, cMap1, 144, kW, kH, convWScale, convBValue, scValue)
+    rn1_2 = ResNetNode2(rn1_1, cMap1, 144, kW, kH, convWScale, convBValue, scValue)
+    rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue)
 
     cMap2 = 32
-    rn2_1 = ResNetNode2Reduce(rn1_3, cMap2, 144, 288, 16384, 8192, kW, kH, convWScale, convBValue, scValue)
-    rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue)
-    rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue)
+    rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
+    rn2_1 = ResNetNode2Conv(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, rn2_1_Wproj)
+    rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue)
+    rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue)
 
     cMap3 = 64
-    rn3_1 = ResNetNode2Reduce(rn2_3, cMap3, 288, 576, 8192, 4096, kW, kH, convWScale, convBValue, scValue)
-    rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue)
-    rn3_3 = ResNetNode2(rn3_2, cMap3, 576, kW, kH, convWScale, convBValue, scValue)
-
+    rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
+    rn3_1 = ResNetNode2Conv(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, rn3_1_Wproj)
+    rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue)
+    rn3_3 = ResNetNode2(rn3_2, cMap3, 576, kW, kH, convWScale, convBValue, scValue)
+
     # pool
     poolW = 3
     poolH = 3
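Note: the `rn2_1_Wproj`/`rn3_1_Wproj` parameters above are read with `init = fromFile` from the files that `03_ProjWeightsGen.py` writes. A minimal sketch of how those files are produced and what they contain (function name here is illustrative; the checked-in script does the same thing):

```python
import numpy as np

# Mirrors 03_ProjWeightsGen.py: np.eye(2n, n) stacks an n x n identity on top
# of an n x n zero block, so the first n output maps copy the input maps and
# the extra n maps start at zero -- exactly the 16to32.txt / 32to64.txt files
# checked in below.
def write_conv_weights(fname, cmap_in):
    np.savetxt(fname, np.eye(2 * cmap_in, cmap_in), fmt='%d', delimiter=' ')

write_conv_weights('16to32.txt', 16)   # same as: python 03_ProjWeightsGen.py 16 16to32.txt
w = np.loadtxt('16to32.txt')
assert w.shape == (32, 16)
assert (w[:16] == np.eye(16)).all() and (w[16:] == 0).all()
```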
diff --git a/Examples/Image/Miscellaneous/CIFAR-10/16to32.txt b/Examples/Image/Miscellaneous/CIFAR-10/16to32.txt
new file mode 100644
index 000000000..7742979aa
--- /dev/null
+++ b/Examples/Image/Miscellaneous/CIFAR-10/16to32.txt
@@ -0,0 +1,32 @@
+1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
diff --git a/Examples/Image/Miscellaneous/CIFAR-10/32to64.txt b/Examples/Image/Miscellaneous/CIFAR-10/32to64.txt
new file mode 100644
index 000000000..0c78cca62
--- /dev/null
+++ b/Examples/Image/Miscellaneous/CIFAR-10/32to64.txt
@@ -0,0 +1,64 @@
+1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
diff --git a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl
index c2760cb35..b1a5442eb 100644
--- a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl
+++ b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl
@@ -41,7 +41,7 @@ ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue)
     isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
 
     c1 = Convolution(W1, inp, kW, kH, outMap, 1, 1, zeroPadding = true)
-    bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true)
+    bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = 0.9)
     y1 = RectifiedLinear(bn1);
 
     W2 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
@@ -51,12 +51,12 @@ ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue)
     isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
 
     c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true)
-    bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true)
+    bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 0.9)
     p = Plus(bn2, inp)
     y2 = RectifiedLinear(p);
 }
 
-ResNetNode2Reduce(inp, outMap, inWCount, wCount, inDim, outDim, kW, kH, wScale, bValue, scValue)
+ResNetNode2Conv(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, Wproj)
 {
     W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
     b1 = Parameter(outMap, 1, init = fixedValue, value = bValue)
@@ -65,7 +65,7 @@ ResNetNode2Reduce(inp, outMap, inWCount, wCount, inDim, outDim, kW, kH, wScale,
     isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
 
     c1 = Convolution(W1, inp, kW, kH, outMap, 2, 2, zeroPadding = true)
-    bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true)
+    bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = 0.9)
     y1 = RectifiedLinear(bn1);
 
     W2 = Parameter(outMap, wCount, init = Gaussian, initValueScale = wScale)
@@ -75,10 +75,10 @@ ResNetNode2Reduce(inp, outMap, inWCount, wCount, inDim, outDim, kW, kH, wScale,
     isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
 
     c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true)
-    bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true)
-    WP = Parameter(outDim, inDim)
-    t = Times(WP, inp, init = Gaussian, initValueScale = wScale)
-    p = Plus(bn2, t)
+    bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 0.9)
+
+    cproj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false)
+    p = Plus(bn2, cproj)
     y2 = RectifiedLinear(p);
 }
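For reference, the effect of the new shortcut branch in `ResNetNode2Conv` — a 1x1 convolution with stride 2 using the fixed, non-trainable `Wproj` loaded above — can be sketched in NumPy (illustrative only; `shortcut_projection` is not part of the toolkit):

```python
import numpy as np

# A 1x1 convolution with stride 2 reduces to a per-pixel matrix multiply on
# the subsampled grid; with the zero-padded identity Wproj, the first 16 of
# the 32 output maps pass the input through and the rest are zero.
def shortcut_projection(x, w_proj):
    # x: (cin, H, W); w_proj: (cout, cin), e.g. the contents of 16to32.txt
    x_sub = x[:, ::2, ::2]                        # stride 2 in both directions
    return np.einsum('oi,ihw->ohw', w_proj, x_sub)

x = np.random.randn(16, 32, 32).astype(np.float32)
y = shortcut_projection(x, np.eye(32, 16, dtype=np.float32))
assert y.shape == (32, 16, 16)
assert np.allclose(y[:16], x[:, ::2, ::2]) and np.allclose(y[16:], 0)
```

This matches the bookkeeping of `Plus(bn2, cproj)`: `cproj` has the same halved resolution and doubled map count as `bn2`, so the two branches add elementwise.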
diff --git a/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl b/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl
index 85cee3bbf..287289b1c 100644
--- a/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl
@@ -8,6 +8,20 @@ DnnReLULayer(inDim, outDim, x, wScale, bValue)
     y = RectifiedLinear(z)
 }
 
+# Fully-connected layer with batch normalization and ReLU activation.
+DnnBNReLULayer(inDim, outDim, x, wScale, bValue)
+{
+    W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
+    b = Parameter(inDim, 1, init = fixedValue, value = bValue)
+    sc = Parameter(inDim, 1, init = Gaussian, initValueScale = 0.01)
+    m = Parameter(inDim, 1, init = fixedValue, value = 0, needGradient = false)
+    isd = Parameter(inDim, 1, init = fixedValue, value = 0, needGradient = false)
+
+    bn = BatchNormalization(x, sc, b, m, isd, eval = false, spatial = false)
+    t = Times(W, bn)
+    y = RectifiedLinear(t)
+}
+
 # Fully-connected layer.
 DnnLayer(inDim, outDim, x, wScale, bValue)
 {
@@ -27,3 +41,16 @@ ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
     y = RectifiedLinear(z);
 }
 
+# Convolutional layer with batch normalization and ReLU activation.
+ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue)
+{
+    W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
+    b = Parameter(outMap, 1, init = fixedValue, value = bValue)
+    sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue)
+    m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
+    isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
+
+    c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true)
+    bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true)
+    y = RectifiedLinear(bn);
+}
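A rough NumPy reading of the new `DnnBNReLULayer` macro, as a sketch only: it assumes `BatchNormalization` with `spatial = false` normalizes each of the `inDim` elements across the minibatch, and the `eps` value is an assumption, not taken from the NDL above. Note the ordering — BN is applied to the layer *input*, then the affine map, then ReLU:

```python
import numpy as np

# sc and b play the roles of the NDL scale/bias Parameters (both inDim x 1).
def dnn_bn_relu_layer(x, W, sc, b, eps=1e-5):   # eps is an assumption
    # x: (inDim, batch); W: (outDim, inDim)
    bn = sc * (x - x.mean(1, keepdims=True)) / np.sqrt(x.var(1, keepdims=True) + eps) + b
    return np.maximum(W @ bn, 0)

x = np.random.randn(512, 8)
W = np.random.randn(256, 512) * 0.01
y = dnn_bn_relu_layer(x, W, np.ones((512, 1)), np.zeros((512, 1)))
assert y.shape == (256, 8)
```

`ConvBNReLULayer` instead normalizes the convolution *output* with `spatial = true`, i.e. one mean/variance per output feature map rather than per element.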
diff --git a/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E.config b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E.config
index c2fe8edf9..d2132eb70 100644
--- a/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E.config
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E.config
@@ -67,7 +67,7 @@ Train=[
         # Horizontal random flip, will be enabled by default if cropType=Random
         #hflip=0
         # Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
-        cropRatio=0.9
+        cropRatio=0.875
         # Crop scale ratio jitter type.
         # Possible values: None, UniRatio, UniLength, UniArea. Default: UniRatio
         jitterType=UniRatio
diff --git a/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E.ndl b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E.ndl
index 6ed547881..eb7fe867c 100644
--- a/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E.ndl
@@ -1,7 +1,7 @@
-load=ndlMnistMacros
+load=ndlMacros
 run=DNN
 
-ndlMnistMacros = [
+ndlMacros = [
     ImageW = 224
     ImageH = 224
     ImageC = 3
diff --git a/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E_BN.config b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E_BN.config
new file mode 100644
index 000000000..b4d0fc3ec
--- /dev/null
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E_BN.config
@@ -0,0 +1,118 @@
+RootDir = "."
+
+ConfigDir = "$RootDir$"
+DataDir = "$RootDir$"
+OutputDir = "$RootDir$/Output"
+ModelDir = "$OutputDir$/Models"
+
+ndlMacros=$ConfigDir$/Macros.ndl
+
+precision=float
+deviceId=Auto
+
+command=Train:AddTop5Eval:Test
+
+parallelTrain=false
+
+stderr=$OutputDir$/VGG_E_BN
+traceLevel=1
+
+Train=[
+    action=train
+    modelPath=$ModelDir$/VGG_E_BN
+
+    NDLNetworkBuilder=[
+        networkDescription=$ConfigDir$/VGG_E_BN.ndl
+    ]
+
+    SGD=[
+        epochSize=0
+        minibatchSize=16
+        learningRatesPerMB=0.01*20:0.003*12:0.001*28:0.0003
+        momentumPerMB=0.9
+        maxEpochs=70
+        gradUpdateType=None
+        L2RegWeight=0.0005
+        dropoutRate=0*5:0.5
+
+        ParallelTrain=[
+            parallelizationMethod=DataParallelSGD
+            distributedMBReading=true
+            parallelizationStartEpoch=1
+            DataParallelSGD=[
+                gradientBits=1
+            ]
+        ]
+
+        numMBsToShowResult=10
+    ]
+
+    reader=[
+        readerType=ImageReader
+        # Map file which maps images to labels using the following format:
+        # <full path to image><tab><numerical label (0-based class id)>
+        # Example:
+        # C:\Data\ImageNet\2012\train\n01440764\n01440764_10026.JPEG<tab>0
+        file=$DataDir$/train_map.txt
+        # Randomize images before every epoch. Possible values: None, Auto. Default: Auto.
+        randomize=Auto
+        features=[
+            # Below are the required parameters.
+            width=224
+            height=224
+            channels=3
+            # Below are the optional parameters.
+            # Possible values: Center, Random. Default: Center
+            cropType=Random
+            # Horizontal random flip, will be enabled by default if cropType=Random
+            #hflip=0
+            # Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
+            cropRatio=0.875
+            # Crop scale ratio jitter type.
+            # Possible values: None, UniRatio, UniLength, UniArea. Default: UniRatio
+            jitterType=UniRatio
+            # Interpolation to use when scaling image to width x height size.
+            # Possible values: nearest, linear, cubic, lanczos. Default: linear.
+            interpolations=Linear
+            # Stores mean values for each pixel in OpenCV matrix XML format.
+            meanFile=$ConfigDir$/ImageNet1K_mean.xml
+        ]
+        labels=[
+            labelDim=1000
+        ]
+    ]
+]
+
+AddTop5Eval=[
+    action=edit
+    CurModel=$ModelDir$/VGG_E_BN
+    NewModel=$ModelDir$/VGG_E_BN.Top5
+    editPath=$ConfigDir$/add_top5_layer.mel
+]
+
+Test=[
+    action=test
+    modelPath=$ModelDir$/VGG_E_BN.Top5
+    # Set minibatch size for testing.
+    minibatchSize=128
+
+    NDLNetworkBuilder=[
+        networkDescription=$ConfigDir$/VGG_E_BN.ndl
+    ]
+
+    reader=[
+        readerType=ImageReader
+        file=$DataDir$/val_map.txt
+        randomize=None
+        features=[
+            width=224
+            height=224
+            channels=3
+            cropType=Center
+            meanFile=$ConfigDir$/ImageNet1K_mean.xml
+        ]
+        labels=[
+            labelDim=1000
+        ]
+    ]
+]
diff --git a/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E_BN.ndl b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E_BN.ndl
new file mode 100644
index 000000000..d07b2e407
--- /dev/null
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E_BN.ndl
@@ -0,0 +1,87 @@
+load=ndlMacros
+run=DNN
+
+ndlMacros = [
+    ImageW = 224
+    ImageH = 224
+    ImageC = 3
+    LabelDim = 1000
+
+    features = ImageInput(ImageW, ImageH, ImageC, tag = feature)
+    featOffs = Const(0, rows = 150528)
+    featScaled = Plus(features, featOffs)
+    labels = Input(LabelDim, tag = label)
+
+    # Kernels width and height.
+    kW = 3
+    kH = 3
+    # Kernel stride.
+    hs = 1
+    vs = 1
+
+    # Pooling settings.
+    poolW = 2
+    poolH = 2
+    poolhs = 2
+    poolvs = 2
+
+    # Initial parameter values.
+    convWScale = 7.07
+    convBValue = 0
+    scValue = 0.03
+    fc1WScale = 3.0
+    fc1BValue = 1
+    fc2WScale = 3.0
+    fc2BValue = 1
+    fc3WScale = 1.0
+    fc3BValue = 1
+]
+
+DNN=[
+    cMap1 = 64
+    conv1 = ConvBNReLULayer(featScaled, cMap1, 27, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv2 = ConvBNReLULayer(conv1, cMap1, 576, kW, kH, hs, vs, convWScale, convBValue, scValue)
+
+    pool1 = MaxPooling(conv2, poolW, poolH, poolhs, poolvs)
+
+    cMap3 = 128
+    conv3 = ConvBNReLULayer(pool1, cMap3, 576, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv4 = ConvBNReLULayer(conv3, cMap3, 1152, kW, kH, hs, vs, convWScale, convBValue, scValue)
+
+    pool2 = MaxPooling(conv4, poolW, poolH, poolhs, poolvs)
+
+    cMap5 = 256
+    conv5 = ConvBNReLULayer(pool2, cMap5, 1152, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv6 = ConvBNReLULayer(conv5, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv7 = ConvBNReLULayer(conv6, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv8 = ConvBNReLULayer(conv7, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)
+
+    pool3 = MaxPooling(conv8, poolW, poolH, poolhs, poolvs)
+
+    cMap9 = 512
+    conv9 = ConvBNReLULayer(pool3, cMap9, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv10 = ConvBNReLULayer(conv9, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv11 = ConvBNReLULayer(conv10, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv12 = ConvBNReLULayer(conv11, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
+
+    pool4 = MaxPooling(conv12, poolW, poolH, poolhs, poolvs)
+
+    cMap13 = 512
+    conv13 = ConvBNReLULayer(pool4, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv14 = ConvBNReLULayer(conv13, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv15 = ConvBNReLULayer(conv14, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv16 = ConvBNReLULayer(conv15, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
+
+    pool5 = MaxPooling(conv16, poolW, poolH, poolhs, poolvs)
+
+    hiddenDim = 4096
+    h1 = DnnBNReLULayer(25088, hiddenDim, pool5, fc1WScale, fc1BValue)
+    h1_d = Dropout(h1)
+    h2 = DnnBNReLULayer(hiddenDim, hiddenDim, h1_d, fc2WScale, fc2BValue)
+    h2_d = Dropout(h2)
+    ol = DnnLayer(hiddenDim, labelDim, h2_d, fc3WScale, fc3BValue)
+
+    CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
+    Err = ErrorPrediction(labels, ol, tag = Eval)
+    OutputNodes = ol
+]
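One detail worth spelling out in VGG_E_BN.ndl: each `inWCount` argument is `kW * kH * (input feature maps)`, and `25088` is `pool5` flattened. A quick arithmetic check (plain Python, for illustration):

```python
# inWCount = kW * kH * inputMaps for each ConvBNReLULayer:
kW = kH = 3
print(kW * kH * 3)     # 27    -> conv1 (RGB input)
print(kW * kH * 64)    # 576   -> conv2, conv3
print(kW * kH * 128)   # 1152  -> conv4, conv5
print(kW * kH * 256)   # 2304  -> conv6..conv9
print(kW * kH * 512)   # 4608  -> conv10..conv16
# Five 2x2 stride-2 poolings shrink 224 -> 7, so pool5 flattens to:
print(512 * 7 * 7)     # 25088 -> input of the first DnnBNReLULayer
```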
diff --git a/Source/ComputationNetworkLib/ConvolutionalNodes.h b/Source/ComputationNetworkLib/ConvolutionalNodes.h
index 1b6a3a820..4db7a71a8 100644
--- a/Source/ComputationNetworkLib/ConvolutionalNodes.h
+++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h
@@ -171,9 +171,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         {
             Base::Validate(isFinalValidationPass);
 
-            if (m_horizontalSubsample > m_kernelWidth || m_verticalSubsample > m_kernelHeight)
-                InvalidArgument("In ConvolutionNode horizontalSubsample must <= kernelWidth and verticalSubsample must <= kernelHeight.");
-
             InferMBLayoutFromInputsForStandardCase();
             InferImageDimsFromInputs();
 
@@ -387,9 +384,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         {
             Base::Validate(isFinalValidationPass);
 
-            if (m_horizontalSubsample > m_windowWidth || m_verticalSubsample > m_windowHeight)
-                InvalidArgument("PoolingNodeBase: horizontalSubsample must <= windowWidth and verticalSubsample must <= windowHeight.");
-
             InferMBLayoutFromInputsForStandardCase();
             InferImageDimsFromInputs();
 
diff --git a/Source/Math/GPUMatrix.cu b/Source/Math/GPUMatrix.cu
index f035bd2d5..2023bd5f7 100644
--- a/Source/Math/GPUMatrix.cu
+++ b/Source/Math/GPUMatrix.cu
@@ -1960,7 +1960,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     PrepareDevice();
     cudaEvent_t done = nullptr;
     if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+    // _elementWiseSigmoidOnCuda has an implementation that avoids possible overflow errors, but is slightly slower and may have an accuracy regression.
+    // We have a new implementation that is non-branching (yay!) that Frank will check in.
+#if 0
+    _elementWiseSigmoidOnCuda<<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(a.m_pArray, m_pArray, N);
+#else
    _assignSigmoidOf<<<blocksPerGrid,GridDim::maxThreadsPerBlock>>>(a.m_pArray,m_pArray,N);
+#endif
     if (do_sync) CUDA_CALL(cudaEventRecord(done));
     if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
     if (do_sync) CUDA_CALL(cudaEventDestroy(done));
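The overflow issue mentioned in the comment above is easiest to see outside CUDA. A NumPy sketch of the branching scheme `_elementWiseSigmoidOnCuda` uses (illustrative; the kernel computes the same thing one element per thread):

```python
import numpy as np

# exp() is only ever applied to a non-positive argument, so it cannot
# overflow; the naive form 1 / (1 + exp(-x)) overflows exp() in float32
# once -x exceeds roughly 88.
def stable_sigmoid(x):
    out = np.empty_like(x)
    pos = x >= 0
    out[pos] = 1.0 / (1.0 + np.exp(-x[pos]))
    e = np.exp(x[~pos])
    out[~pos] = e / (1.0 + e)
    return out

x = np.array([-1000.0, -1.0, 0.0, 1.0, 1000.0], dtype=np.float32)
print(stable_sigmoid(x))  # [0. 0.2689 0.5 0.7311 1.], no overflow warning
```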
@@ -2213,19 +2219,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     template<class ElemType>
     GPUMatrix<ElemType>& GPUMatrix<ElemType>::InplaceTruncateBottom (const ElemType threshold)
     {
-        if (IsEmpty())
-            LogicError("InplaceTruncateBottom: Matrix is empty.");
-
-        CUDA_LONG N=(CUDA_LONG)GetNumElements();
-        int blocksPerGrid =(int)ceil(N*1.0/GridDim::maxThreadsPerBlock);
-        PrepareDevice();
-        cudaEvent_t done = nullptr;
-        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _inplaceTruncateBottom<<<blocksPerGrid,GridDim::maxThreadsPerBlock>>>(m_pArray,threshold,N);
-        if (do_sync) CUDA_CALL(cudaEventRecord(done));
-        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
-        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
-        return *this;
+        return AssignTruncateBottomOf(*this, threshold);
     }
 
     template<class ElemType>
@@ -2255,18 +2249,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     template<class ElemType>
     GPUMatrix<ElemType>& GPUMatrix<ElemType>::InplaceTruncateTop (const ElemType threshold)
     {
-        if (IsEmpty())
-            LogicError("InplaceTruncateTop: Matrix is empty.");
-        CUDA_LONG N=(CUDA_LONG)GetNumElements();
-        int blocksPerGrid =(int)ceil(N*1.0/GridDim::maxThreadsPerBlock);
-        PrepareDevice();
-        cudaEvent_t done = nullptr;
-        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _inplaceTruncateTop<<<blocksPerGrid,GridDim::maxThreadsPerBlock>>>(m_pArray,threshold,N);
-        if (do_sync) CUDA_CALL(cudaEventRecord(done));
-        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
-        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
-        return *this;
+        return AssignTruncateTopOf(*this, threshold);
     }
 
     template<class ElemType>
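The two methods above become one-liners that reuse the element-wise assign kernels with source and destination aliased, which is safe because each thread reads and writes only its own element. Semantically both are clamps; in NumPy terms, for reference:

```python
import numpy as np

a = np.array([-2.0, -0.5, 0.0, 0.5, 2.0])
print(np.maximum(a, 0.5))  # InplaceTruncateBottom(0.5): raise values below 0.5
print(np.minimum(a, 0.5))  # InplaceTruncateTop(0.5):    lower values above 0.5
```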
diff --git a/Source/Math/GPUMatrixCUDAKernels.cuh b/Source/Math/GPUMatrixCUDAKernels.cuh
index fd01a972f..dc358a93d 100644
--- a/Source/Math/GPUMatrixCUDAKernels.cuh
+++ b/Source/Math/GPUMatrixCUDAKernels.cuh
@@ -90,6 +90,25 @@ static __inline__ __device__ double atomicAdd(double* address, double val) UNUSE
 // CUDA kernels follow, lots of them
 // ===========================================================================
 
+// _elementWise*() kernels
+//
+// Designed to operate on contiguous blocks of memory, where the output is a simple function of the inputs.
+// The first parameters of every function are inputs, and the last two arguments to each function are always
+// (ElemType *res, CUDA_LONG N), a pointer and length of the output block. Each thread computes a function
+// of the inputs for one value in the output.
+
+// This macro overloads _x() with float and double arguments, and inlines the correct library function. This simplifies templated kernel code.
+// TODO: merge with similar definition in TensorOps.h
+#define DEF_ELEMENT_PRIMITIVE(x) __device__ __forceinline__ float _##x(float f) { return x##f(f); } __device__ __forceinline__ double _##x(double f) { return x(f); }
+
+DEF_ELEMENT_PRIMITIVE(exp)
+DEF_ELEMENT_PRIMITIVE(log)
+DEF_ELEMENT_PRIMITIVE(tanh)
+DEF_ELEMENT_PRIMITIVE(sqrt)
+DEF_ELEMENT_PRIMITIVE(fabs)
+DEF_ELEMENT_PRIMITIVE(cos)
+DEF_ELEMENT_PRIMITIVE(sin)
+
 template<class ElemType>
 __global__ void _elementWisePowerOnCuda(
     const ElemType alpha,
@@ -134,48 +153,19 @@ __global__ void _elementWiseSigmoidOnCuda(
     ElemType *res,
     const CUDA_LONG N)
 {
-    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
-    if (sizeof(ElemType)==sizeof(double))
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    if (a[id] >= 0)
     {
-        if (a[id]>=0)
-        {
-            double e = exp(-1*a[id]);
-            res[id]=1/(1+e);
+        double e = _exp(-a[id]);
+        res[id] = 1 / (1 + e);
     }
     else
     {
-        double e = exp(a[id]);
-        res[id]=e/(1+e);
-    }
-    }
-    else
-    {
-        if (res[id]>=0)
-        {
-            float e = expf(-1*a[id]);
-            res[id]=1/(1+e);
-        }
-        else
-        {
-            float e = exp(a[id]); // BUGBUG: Looks like this should be expf().
-            res[id]=e/(1+e);
-        }
-    }
+        double e = _exp(a[id]);
+        res[id] = e / (1 + e);
+    }
 };
 
-__device__ __forceinline__ float _exp(float f)
-{
-    return expf(f);
-}
-
-__device__ __forceinline__ double _exp(double f)
-{
-    return exp(f);
-}
-
-//#define TENSOR_OPS_DECL __device__ __host__
-//#include "TensorOps.h"
-
 template<class ElemType>
 __global__ void _assignSigmoidOf(
     const ElemType* a,
@@ -224,16 +214,8 @@ __global__ void _elementWiseTanhOnCuda(
     ElemType *res,
     const CUDA_LONG N)
 {
-    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
-    if (sizeof(ElemType)==sizeof(double))
-    {
-        res[id]=tanh(a[id]);
-    }
-    else
-    {
-        res[id]=tanhf(a[id]);
-    }
-
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    res[id] = _tanh(a[id]);
 };
 
 //to prevent negative values caused by floating operations, we force inputs to be >=0
@@ -244,15 +226,8 @@ __global__ void _elementWiseSqrtOnCuda(
     ElemType *res,
     const CUDA_LONG N)
 {
-    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
-    if (sizeof(ElemType)==sizeof(double))
-    {
-        res[id]=sqrt(max((ElemType)0, a[id]));
-    }
-    else
-    {
-        res[id]=sqrtf(max(ElemType(0), a[id]));
-    }
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    res[id] = _sqrt(max((ElemType)0, a[id]));
 };
 
 template<class ElemType>
@@ -261,15 +236,8 @@ __global__ void _elementWiseExpOnCuda(
     ElemType *res,
     const CUDA_LONG N)
 {
-    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
-    if (sizeof(ElemType)==sizeof(double))
-    {
-        res[id]=exp(a[id]);
-    }
-    else
-    {
-        res[id]=expf(a[id]);
-    }
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    res[id] = _exp(a[id]);
 };
 
 template<class ElemType>
@@ -278,22 +246,8 @@ __global__ void _elementWiseLogOnCuda(
     ElemType *res,
     const CUDA_LONG N)
 {
-    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
-    if (a[id]<EPS_IN_LOG)
-    {
-        res[id]=LOG_OF_EPS_IN_LOG;
-    }
-    else
-    {
-        if (sizeof(ElemType)==sizeof(double))
-        {
-            res[id]=log(a[id]);
-        }
-        else
-        {
-            res[id]=logf(a[id]);
-        }
-    }
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    res[id] = a[id] < EPS_IN_LOG ? LOG_OF_EPS_IN_LOG : _log(a[id]);
 };
 
 template<class ElemType>
@@ -302,15 +256,8 @@ __global__ void _elementWiseAbsOnCuda(
     ElemType *res,
     const CUDA_LONG N)
 {
-    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
-    if (sizeof(ElemType)==sizeof(double))
-    {
-        res[id]=fabs(a[id]);
-    }
-    else
-    {
-        res[id]=fabsf(a[id]);
-    }
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    res[id] = _fabs(a[id]);
 };
 
 template<class ElemType>
@@ -319,15 +266,8 @@ __global__ void _elementWiseCosineOnCuda(
     ElemType *res,
     const CUDA_LONG N)
 {
-    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
-    if (sizeof(ElemType)==sizeof(double))
-    {
-        res[id]=cos(a[id]);
-    }
-    else
-    {
-        res[id]=cosf(a[id]);
-    }
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    res[id] = _cos(a[id]);
 };
 
 template<class ElemType>
@@ -336,18 +276,10 @@ __global__ void _elementWiseNegativeSineOnCuda(
     ElemType *res,
     const CUDA_LONG N)
 {
-    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
-    if (sizeof(ElemType)==sizeof(double))
-    {
-        res[id]=-sin(a[id]);
-    }
-    else
-    {
-        res[id]=-sinf(a[id]);
-    }
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    res[id] = -_sin(a[id]);
 };
 
-
 template<class ElemType>
 __global__ void _setValue(
     ElemType* a,
@@ -1141,6 +1073,7 @@ __global__ void _assignColumnwiseHardmaxOf(
     }
 }
 
+#if 0
 template<class ElemType>
 __global__ void _inplaceTruncateBottom(
     ElemType* a,
@@ -1153,6 +1086,7 @@ __global__ void _inplaceTruncateBottom(
     if (a[id]<threshold)
         a[id]=threshold;
 }
+#endif
 
 template<class ElemType>
 __global__ void _assignTruncateBottom(
@@ -1161,15 +1095,11 @@ __global__ void _assignTruncateBottom(
     const ElemType threshold,
     const CUDA_LONG N)
 {
-    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    if (a[id]<threshold)
-        us[id]=threshold;
-    else
-        us[id]=a[id];
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    us[id] = a[id] < threshold ? threshold : a[id];
 }
 
+#if 0
 template<class ElemType>
 __global__ void _inplaceTruncateTop(
     ElemType* a,
@@ -1182,6 +1112,7 @@ __global__ void _inplaceTruncateTop(
     if (a[id]>threshold)
         a[id]=threshold;
 }
+#endif
 
 template<class ElemType>
 __global__ void _assignTruncateTop(
@@ -1190,13 +1121,8 @@ __global__ void _assignTruncateTop(
     const ElemType threshold,
     const CUDA_LONG N)
 {
-    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    if (a[id]>threshold)
-        us[id]=threshold;
-    else
-        us[id]=a[id];
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    us[id] = a[id] > threshold ? threshold : a[id];
 }
 
 template<class ElemType>
@@ -3716,9 +3642,7 @@ __global__ void _inplaceTruncate(
     const ElemType threshold,
     const CUDA_LONG N)
 {
-    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
     ElemType locThresholdPos = abs(threshold);
     ElemType locTHresholdNeg = -locThresholdPos;
     if (a[id] > locThresholdPos)
diff --git a/Source/Math/GPUSparseMatrix.cu b/Source/Math/GPUSparseMatrix.cu
index d77f3f8fc..3831475c1 100644
--- a/Source/Math/GPUSparseMatrix.cu
+++ b/Source/Math/GPUSparseMatrix.cu
@@ -2526,7 +2526,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int blocksPerGrid =(int)ceil(N*1.0/GridDim::maxThreadsPerBlock);
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _inplaceTruncateBottom<<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(NzValues(), threshold, N);
+        _assignTruncateBottom<<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(NzValues(), NzValues(), threshold, N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2570,7 +2570,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int blocksPerGrid =(int)ceil(N*1.0/GridDim::maxThreadsPerBlock);
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _inplaceTruncateTop<<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(NzValues(), threshold, N);
+        _assignTruncateTop<<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(NzValues(), NzValues(), threshold, N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));