merged with master
Commit 7784350f29
@@ -8333,9 +8333,9 @@ SquareError
\begin_layout Standard
\begin_inset Formula
\begin{eqnarray}
v\left(\mathbf{X},\mathbf{\mathbf{Y}}\right) & \leftarrow & \frac{1}{2}\mathrm{Tr}\left(\left(\mathbf{X}-\mathbf{Y}\right)\left(\mathbf{X}-\mathbf{Y}\right)^{T}\right)\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}}\left(\mathbf{X}-\mathbf{Y}\right)\\
\nabla_{\mathbf{\mathbf{Y}}}^{J} & \leftarrow & \nabla_{\mathbf{\mathbf{Y}}}^{J}-\mathbf{\nabla_{n}^{\mathit{J}}}\left(\mathbf{X}-\mathbf{Y}\right).
v\left(\mathbf{X},\mathbf{Y}\right) & \leftarrow & \mathrm{Tr}\left(\left(\mathbf{X}-\mathbf{Y}\right)\left(\mathbf{X}-\mathbf{Y}\right)^{T}\right)\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+2\mathbf{\nabla_{n}^{\mathit{J}}}\left(\mathbf{X}-\mathbf{Y}\right)\\
\nabla_{\mathbf{Y}}^{J} & \leftarrow & \nabla_{\mathbf{Y}}^{J}-2\mathbf{\nabla_{n}^{\mathit{J}}}\left(\mathbf{X}-\mathbf{Y}\right).
\end{eqnarray}

\end_inset
@@ -8367,8 +8367,8 @@ Note that
\color none
\begin_inset Formula
\begin{eqnarray}
\frac{\partial v}{\partial\mathbf{X}} & = & \mathbf{X}-\mathbf{Y}\\
\frac{\partial v}{\partial\mathbf{Y}} & = & \mathbf{-\left(X-\mathbf{Y}\right)}.
\frac{\partial v}{\partial\mathbf{X}} & = & +2\left(\mathbf{X}-\mathbf{Y}\right)\\
\frac{\partial v}{\partial\mathbf{Y}} & = & -2\left(\mathbf{X}-\mathbf{Y}\right).
\end{eqnarray}

\end_inset
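The factor-of-2 rewrite in both hunks above is just element-wise differentiation of the now-unscaled trace; a one-line check:

\begin{eqnarray}
v & = & \mathrm{Tr}\left(\left(\mathbf{X}-\mathbf{Y}\right)\left(\mathbf{X}-\mathbf{Y}\right)^{T}\right)=\sum_{i,j}\left(X_{ij}-Y_{ij}\right)^{2}\\
\frac{\partial v}{\partial X_{ij}} & = & 2\left(X_{ij}-Y_{ij}\right)\quad\Longrightarrow\quad\frac{\partial v}{\partial\mathbf{X}}=2\left(\mathbf{X}-\mathbf{Y}\right),\qquad\frac{\partial v}{\partial\mathbf{Y}}=-\frac{\partial v}{\partial\mathbf{X}}
\end{eqnarray}

so once the old 1/2 scaling is dropped from v, the explicit factor 2 must appear in both gradient updates.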
@@ -1,4 +1,4 @@
import urllib
import urllib.request
import gzip
import os
import struct
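The one-line import change above is the Python 3 fix: `urllib` became a package, and the download helpers live in the `urllib.request` submodule, which `import urllib` alone does not bind. A minimal sketch (the URL and filename are illustrative, not taken from this script):

```python
import urllib.request

# Python 3: with only "import urllib", the attribute urllib.request is unbound,
# so urllib.request.urlretrieve(...) would raise AttributeError.
urllib.request.urlretrieve("http://example.com/mnist.gz", "mnist.gz")
```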
@@ -13,7 +13,6 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"

command = MNISTtrain:MNISTtest
@@ -25,6 +25,7 @@ DNN = [
err = ErrorPrediction(labels, ol)

# Special Nodes
errTop5 = ErrorPrediction(labels, ol, Const(1), tag="eval")
FeatureNodes = (features)
LabelNodes = (labels)
CriterionNodes = (ce)
@@ -13,7 +13,6 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"

command = train:test
@@ -42,7 +41,7 @@ train = [
SGD = [
epochSize = 60000
minibatchSize = 32
learningRatesPerMB = 0.5
learningRatesPerMB = 0.1*5:0.3
momentumPerMB = 0*10:0.7
maxEpochs = 15
]
@@ -23,16 +23,17 @@ DNN=[
hStride1 = 1
vStride1 = 1
# weight[cMap1, kW1 * kH1 * inputChannels]
# ConvReLULayer is defined in Macros.ndl
conv1_act = ConvReLULayer(featScaled, cMap1, 25, kW1, kH1, hStride1, vStride1, 10, 1)

# Conv2DReLULayer is defined in Macros.ndl
conv1 = Conv2DReLULayer(featScaled, cMap1, 25, kW1, kH1, hStride1, vStride1, 10, 1)

# pool1
pool1W = 2
pool1H = 2
pool1hStride = 2
pool1vStride = 2
pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout=$imageLayout$)

# MaxPooling is a standard NDL node.
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout=$imageLayout$)

# conv2
kW2 = 5
kH2 = 5
@@ -40,19 +41,20 @@ DNN=[
hStride2 = 1
vStride2 = 1
# weight[cMap2, kW2 * kH2 * cMap1]
# ConvReLULayer is defined in Macros.ndl
conv2_act = ConvReLULayer(pool1, cMap2, 400, kW2, kH2, hStride2, vStride2, 10, 1)

# ConvNDReLULayer is defined in Macros.ndl
conv2 = ConvNDReLULayer(pool1, kW2, kH2, cMap1, 400, cMap2, hStride2, vStride2, 10, 1)

# pool2
pool2W = 2
pool2H = 2
pool2hStride = 2
pool2vStride = 2
pool2 = MaxPooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout=$imageLayout$)

# MaxNDPooling is defined in Macros.ndl
pool2 = MaxNDPooling(conv2, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout=$imageLayout$)

h1Dim = 128
# DNNImageSigmoidLayer and DNNLayer are defined in Macros.ndl
h1 = DNNImageSigmoidLayer(4, 4, cMap2, h1Dim, pool2, 1)
h1 = DNNImageSigmoidLayer(7, 7, cMap2, h1Dim, pool2, 1)
ol = DNNLayer(h1Dim, labelDim, h1, 1)

ce = CrossEntropyWithSoftmax(labels, ol)
@@ -13,9 +13,8 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"

command = train:CreateEvalModel:test
command = train:test

precision = "float"
modelPath = "$ModelDir$/03_ConvBatchNorm"
@@ -38,9 +37,11 @@ train = [
SGD = [
epochSize = 60000
minibatchSize = 32
learningRatesPerMB = 0.5
momentumPerMB = 0*10:0.7
learningRatesPerMB = 0.5:0.1
momentumPerMB = 0.9
maxEpochs = 2
#batchNormalizationTimeConstant=0 # Set through NDL
batchNormalizationBlendTimeConstant=0:1#INF
]

reader = [
@@ -63,17 +64,6 @@ train = [
]
]

#######################################
# Edit model #
#######################################

CreateEvalModel=[
action=edit
CurModel=$ModelDir$/03_ConvBatchNorm
NewModel=$ModelDir$/03_ConvBatchNorm.Eval
editPath=$ConfigDir$/03_ConvBatchNorm.mel
]

#######################################
# TEST CONFIG #
#######################################
@@ -82,7 +72,7 @@ test = [
action = "test"
minibatchSize = 32

modelPath=$ModelDir$/03_ConvBatchNorm.Eval
modelPath=$ModelDir$/03_ConvBatchNorm

NDLNetworkBuilder = [
networkDescription = "$ConfigDir$/03_ConvBatchNorm.ndl"
@@ -1,6 +0,0 @@
m=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m)

SetPropertyForSubTree(CE, batchNormEvalMode, true)

SaveModel(m, $NewModel$, format=cntk)
@@ -15,7 +15,7 @@ ndlMnistMacros = [
labels = InputValue(labelDim)

scValue = 1
# Batch normalization time constant.
# Batch normalization time constant (normalizationTimeConstant). blendTimeConstant is set through .cntk file.
bnTimeConst = 1024

convWScale = 10
@@ -1,28 +1,28 @@
DNNSigmoidLayer(inDim, outDim, x, parmScale) = [
W = LearnableParameter(outDim, inDim, init="uniform", initValueScale=parmScale)
b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale)
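# initOnCPUOnly=true (added throughout below) runs the random initialization on the CPU,
# so the initial weights are identical whether the job runs on GPU or CPU (assumed intent of this change).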
W = LearnableParameter(outDim, inDim, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
t = Times(W, x)
z = Plus(t, b)
y = Sigmoid(z)
]

DNNImageSigmoidLayer(inW, inH, inC, outDim, x, parmScale) = [
W = ImageParameter(outDim, inW, inH, inC, init="uniform", initValueScale=parmScale, imageLayout=$imageLayout$)
b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale)
W = ImageParameter(outDim, inW, inH, inC, init="uniform", initValueScale=parmScale, initOnCPUOnly=true, imageLayout=$imageLayout$)
b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
t = Times(W, x)
z = Plus(t, b)
y = Sigmoid(z)
]

DNNLayer(inDim, outDim, x, parmScale) = [
W = LearnableParameter(outDim, inDim, init="uniform", initValueScale=parmScale)
b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale)
W = LearnableParameter(outDim, inDim, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
t = Times(W, x)
z = Plus(t, b)
]

DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [
W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale, initOnCPUOnly=true)
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
@@ -32,12 +32,36 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [
y = RectifiedLinear(bn)
]

ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) = [
convW = LearnableParameter(outMap, inWCount, init="uniform", initValueScale=wScale)
convB = ImageParameter(1, 1, outMap, init="fixedValue", value=bValue, imageLayout=$imageLayout$)
conv = Convolution(convW, inp, kW, kH, outMap, hStride, vStride, zeroPadding=false, imageLayout=$imageLayout$)
convPlusB = Plus(conv, convB);
act = RectifiedLinear(convPlusB);
ConvW(outMap, inWCount, wScale) = [
W = LearnableParameter(outMap, inWCount, init="uniform", initValueScale=wScale, initOnCPUOnly=true)
]

ConvB(outMap, bValue) = [
b = ImageParameter(1, 1, outMap, init="fixedValue", value=bValue, imageLayout=$imageLayout$)
]

Conv2D(w, inp, kW, kH, outMap, hStride, vStride) = [
c = Convolution(w, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true, imageLayout=$imageLayout$)
]

ConvND(w, inp, kW, kH, inMap, outMap, hStride, vStride) = [
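# Kernel shape {kW, kH, inMap} with stride inMap in the channel dimension: each filter
# spans the full input depth and slides only spatially, so this reproduces a standard
# 2D convolution through the generic ND Convolution primitive.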
c = Convolution(w, inp, {kW, kH, inMap}, mapCount=outMap, stride={hStride, vStride, inMap}, sharing={true, true, true}, autoPadding={true, true, false}, lowerPad=0, upperPad=0, imageLayout=$imageLayout$)
]

Conv2DReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) = [
w = ConvW(outMap, inWCount, wScale)
b = ConvB(outMap, bValue)
c = Conv2D(w, inp, kW, kH, outMap, hStride, vStride)
cpb = Plus(c, b);
out = RectifiedLinear(cpb);
]

ConvNDReLULayer(inp, kW, kH, inMap, inWCount, outMap, hStride, vStride, wScale, bValue) = [
w = ConvW(outMap, inWCount, wScale)
b = ConvB(outMap, bValue)
c = ConvND(w, inp, kW, kH, inMap, outMap, hStride, vStride)
cpb = Plus(c, b);
out = RectifiedLinear(cpb);
]

ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst) = [
@@ -51,7 +75,7 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
]

ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [
W = LearnableParameter(outMap, inWCount, init=Gaussian, initValueScale=wScale)
W = LearnableParameter(outMap, inWCount, init=Gaussian, initValueScale=wScale, initOnCPUOnly=true)
c = ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
]
@@ -59,3 +83,7 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
c = ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
y = RectifiedLinear(c)
]

MaxNDPooling(inp, kW, kH, hStride, vStride) = [
p = Pooling(inp, "max", {kW, kH, 1}, stride={hStride, vStride, 1}, autoPadding={true, true, false}, lowerPad=0, upperPad=0, imageLayout=$imageLayout$)
]
@@ -70,7 +70,7 @@ To run the sample, navigate to the Data folder and run the following command:

3. 03_ConvBatchNorm.ndl is almost identical to 02_Convolution.ndl
except that it uses batch normalization for the convolutional and fully connected layers.
As a result, it achieves around 0.92% of error after training for just 2 epochs (and less than 30 seconds).
As a result, it achieves around 0.8% of error after training for just 2 epochs (and less than 30 seconds).
To run the sample, navigate to the Data folder and run the following command:
`cntk configFile=../Config/03_ConvBatchNorm.cntk`
@@ -12,7 +12,6 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"

prefetch = "true"
@@ -45,6 +44,7 @@ Train = [
readerType = "UCIFastReader"
file = "$DataDir$/Train.txt"
randomize = "auto"
minibatchMode="full"
features = [
dim = 3072
start = 1
@@ -12,11 +12,10 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"

prefetch = "true"

command = Train:AddBNEval:Test
command = Train:Test

stderr = "$OutputDir$/02_BatchNormConv"
traceLevel = 1
@@ -44,6 +43,7 @@ Train = [
readerType = "UCIFastReader"
file = "$DataDir$/Train.txt"
randomize = "auto"
minibatchMode="full"
features = [
dim = 3072
start = 1
@@ -57,16 +57,9 @@ Train = [
]
]

AddBNEval = [
action = "edit"
CurModel = "$ModelDir$/02_BatchNormConv"
NewModel = "$ModelDir$/02_BatchNormConv.Eval"
editPath = "$ConfigDir$/02_BatchNormConv.mel"
]

Test = [
action = "test"
modelPath = "$ModelDir$/02_BatchNormConv.Eval"
modelPath = "$ModelDir$/02_BatchNormConv"
# Set minibatch size for testing.
minibatchSize = 16
@@ -1,6 +0,0 @@
m=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m)

SetPropertyForSubTree(CE, batchNormEvalMode, true)

SaveModel(m, $NewModel$, format=cntk)
@@ -12,12 +12,11 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"

prefetch = "true"
parallelTrain = "false"

command = Train:AddBNEval:Test
command = Train:Test

stderr = "$OutputDir$/03_ResNet"
traceLevel = 1
@@ -75,16 +74,9 @@ Train = [
]
]

AddBNEval = [
action = "edit"
CurModel = "$ModelDir$/03_ResNet"
NewModel = "$ModelDir$/03_ResNet.Eval"
editPath = "$ConfigDir$/03_ResNet.mel"
]

Test = [
action = "test"
modelPath = "$ModelDir$/03_ResNet.Eval"
modelPath = "$ModelDir$/03_ResNet"
# Set minibatch size for testing.
minibatchSize = 512
@@ -1,6 +0,0 @@
m=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m)

SetPropertyForSubTree(CE, batchNormEvalMode, true)

SaveModel(m, $NewModel$, format=cntk)
@@ -38,14 +38,14 @@ DNN=[
rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

cMap2 = 32
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", learningRateMultiplier = 0)
rn2_1 = ResNetNode2Inc(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn2_1_Wproj)
#rn2_1 = ResNetNode2Inc2(rn1_3, cMap1, cMap2, 144, 288, kW, kH, convWScale, 3.5, convBValue, scValue, bnTimeConst)
rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

cMap3 = 64
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", learningRateMultiplier = 0)
rn3_1 = ResNetNode2Inc(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn3_1_Wproj)
#rn3_1 = ResNetNode2Inc2(rn2_3, cMap2, cMap3, 288, 576, kW, kH, convWScale, 3.5, convBValue, scValue, bnTimeConst)
rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
@@ -13,12 +13,11 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"

prefetch="true"
parallelTrain="false"

command=Train:AddBNEval:Test
command=Train:Test

stderr="$OutputDir$/04_ResNet_56"
traceLevel=1
@@ -76,16 +75,9 @@ Train=[
]
]

AddBNEval=[
action="edit"
CurModel="$ModelDir$/04_ResNet_56"
NewModel="$ModelDir$/04_ResNet_56.Eval"
editPath="$ConfigDir$/03_ResNet.mel"
]

Test=[
action="test"
modelPath="$ModelDir$/04_ResNet_56.Eval"
modelPath="$ModelDir$/04_ResNet_56"
# Set minibatch size for testing.
minibatchSize=512
@@ -53,7 +53,7 @@ DNN=[
rn1_18= ResNetNode2(rn1_17, cMap1, 144, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

cMap2 = 32
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", learningRateMultiplier = 0)
rn2_1 = ResNetNode2Inc(rn1_18, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn2_1_Wproj)
#rn2_1 = ResNetNode2Inc2(rn1_18, cMap1, cMap2, 144, 288, kW, kH, convWScale, 3.5, convBValue, scValue, bnTimeConst)
rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
@@ -75,7 +75,7 @@ DNN=[
rn2_18= ResNetNode2(rn2_17, cMap2, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

cMap3 = 64
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", learningRateMultiplier = 0)
rn3_1 = ResNetNode2Inc(rn2_18, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn3_1_Wproj)
#rn3_1 = ResNetNode2Inc2(rn2_18, cMap2, cMap3, 288, 576, kW, kH, convWScale, 3.5, convBValue, scValue, bnTimeConst)
rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
@@ -0,0 +1,80 @@
RootDir = "."

ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"

ndlMacros = "$ConfigDir$/Macros.ndl"

precision = "float"
deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1

prefetch = "true"

command = Train:Test

modelPath = "$ModelDir$/05_ConvLocal"

stderr = "$OutputDir$/05_ConvLocal"
traceLevel = 1
numMBsToShowResult = 50

Train = [
action = "train"

NDLNetworkBuilder = [
networkDescription = "$ConfigDir$/05_ConvLocal.ndl"
]

SGD = [
epochSize = 49984
minibatchSize = 64
learningRatesPerMB = 0.01*10:0.003*10:0.001
momentumPerMB = 0.9*20:0.99
maxEpochs = 30
L2RegWeight = 0.03
]

reader = [
readerType = "UCIFastReader"
file = "$DataDir$/Train.txt"
randomize = "auto"
minibatchMode="full"
features = [
dim = 3072
start = 1
]
labels = [
dim = 1
start = 0
labelDim = 10
labelMappingFile = "$DataDir$/labelsmap.txt"
]
]
]

Test = [
action = "test"
# Set minibatch size for testing.
minibatchSize = 16

reader = [
readerType = "UCIFastReader"
file = "$DataDir$/Test.txt"
randomize = "none"
features = [
dim = 3072
start = 1
]
labels = [
dim = 1
start = 0
labelDim = 10
labelMappingFile = "$DataDir$/labelsmap.txt"
]
]
]
@@ -0,0 +1,84 @@
load=ndlMnistMacros
run=DNN

ndlMnistMacros = [
ImageW = 32
ImageH = 32
ImageC = 3
LabelDim = 10

features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = $imageLayout$)
featOffs = Const(128)
featScaled = Minus(features, featOffs)
labels = Input(LabelDim, tag = label)

conv1WScale = 0.0043
conv1BValue = 0
conv2WScale = 1.414
conv2BValue = 0
conv3WScale = 1.414
conv3BValue = 0
conv4WScale = 1.414
conv4BValue = 0
fc1WScale = 1.5
fc1BValue = 0
]

DNN=[
# conv1
kW1 = 5
kH1 = 5
cMap1 = 64
hStride1 = 1
vStride1 = 1
# weight[cMap1, kW1 * kH1 * ImageC]
conv1 = ConvReLULayer(featScaled, cMap1, 75, kW1, kH1, hStride1, vStride1, conv1WScale, conv1BValue)

# pool1
pool1W = 3
pool1H = 3
pool1hStride = 2
pool1vStride = 2
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout = $imageLayout$)

# conv2
kW2 = 5
kH2 = 5
cMap2 = 64
hStride2 = 1
vStride2 = 1
# weight[cMap2, kW2 * kH2 * cMap1]
conv2 = ConvReLULayer(pool1, cMap2, 1600, kW2, kH2, hStride2, vStride2, conv2WScale, conv2BValue)

# pool2
pool2W = 3
pool2H = 3
pool2hStride = 2
pool2vStride = 2
pool2 = MaxPooling(conv2, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout = $imageLayout$)

# conv_local3
kW3 = 3
kH3 = 3
cMap3 = 64
hStride3 = 1
vStride3 = 1
# weight[cMap3 * pool2OutW * poolOutH, kW3 * kH3 * cMap2]
conv3 = ConvLocalReLULayer(pool2, cMap3, 3136, cMap2, 576, kW3, kH3, hStride3, vStride3, conv3WScale, conv3BValue)

# conv_local4
kW4 = 3
kH4 = 3
cMap4 = 32
hStride4 = 1
vStride4 = 1
# weight[cMap4 * conv3OutW * conv3OutH, kW4 * kH4 * cMap3]
conv4 = ConvLocalReLULayer(conv3, cMap4, 1568, cMap3, 576, kW4, kH4, hStride4, vStride4, conv4WScale, conv4BValue)

ol = DnnImageLastLayer(7, 7, cMap4, labelDim, conv4, fc1WScale, fc1BValue)

CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
Err = ErrorPrediction(labels, ol, tag = Eval)
OutputNodes = ol
]
@@ -7,6 +7,15 @@ ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
y = RectifiedLinear(p)
]

ConvLocalReLULayer(inp, outMap, outWCount, inMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
[
W = LearnableParameter(outWCount, inWCount, init = Gaussian, initValueScale = wScale)
b = ImageParameter(1, 1, outMap, init = fixedValue, value = bValue, imageLayout = $imageLayout$)
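# sharing = {false, false, false} disables kernel sharing across output positions,
# turning this into a locally-connected (untied-weights) convolution; that is why W
# has outWCount = outMap * outW * outH rows rather than just outMap.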
c = Convolution(W, inp, {kW, kH, inMap}, mapCount = outMap, stride = {hStride, vStride, inMap}, sharing = {false, false, false}, imageLayout = $imageLayout$)
p = Plus(c, b)
y = RectifiedLinear(p)
]

ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
[
b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
@@ -15,7 +24,7 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
isd = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, isd, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
]

ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
@@ -30,6 +39,17 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
y = RectifiedLinear(c)
]

ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst)
[
b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, isd, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
]

ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
[
# First convolution layer.
@@ -48,7 +68,7 @@ ResNetNode2Inc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, b
c2 = ConvBNLayer(c1, outMap, wCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst)

# Projection convolution layer.
c_proj = ConvBNLayerW(Wproj, inp, outMap, 1, 1, 2, 2, bValue, scValue, bnTimeConst)
c_proj = ProjLayer(Wproj, inp, outMap, 2, 2, bValue, scValue, bnTimeConst)
#c_proj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = $imageLayout$)

p = Plus(c2, c_proj)
@@ -95,7 +115,7 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
bn = BatchNormalization(t, sc, b, m, isd, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
]
@@ -107,7 +127,7 @@ DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeCon
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
bn = BatchNormalization(t, sc, b, m, isd, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
]
@@ -118,3 +138,11 @@ DnnLastLayer(hiddenDim, labelDim, x, wScale, bValue)
t = Times(W, x)
z = Plus(t, b)
]

DnnImageLastLayer(inW, inH, inC, labelDim, x, wScale, bValue)
[
W = ImageParameter(labelDim, inW, inH, inC, init = Gaussian, initValueScale = wScale, imageLayout=$imageLayout$)
b = LearnableParameter(labelDim, init = fixedValue, value = bValue)
t = Times(W, x)
z = Plus(t, b)
]
@@ -6411,4 +6411,4 @@ evalNodeNames are not specified, using all the default evalnodes and training cr
Allocating matrices for forward and/or backward propagation.
Minibatch[1-20]: Samples Seen = 10000 Err: ErrorPrediction/Sample = 0.0819 CE: CrossEntropyWithSoftmax/Sample = 0.35141698
Final Results: Minibatch[1-20]: Samples Seen = 10000 Err: ErrorPrediction/Sample = 0.0819 CE: CrossEntropyWithSoftmax/Sample = 0.35141698 Perplexity = 1.4210798
COMPLETED
__COMPLETED__
@@ -9899,4 +9899,4 @@ evalNodeNames are not specified, using all the default evalnodes and training cr
Allocating matrices for forward and/or backward propagation.
Minibatch[1-20]: Samples Seen = 10000 Err: ErrorPrediction/Sample = 0.0644 CE: CrossEntropyWithSoftmax/Sample = 0.3034767
Final Results: Minibatch[1-20]: Samples Seen = 10000 Err: ErrorPrediction/Sample = 0.0644 CE: CrossEntropyWithSoftmax/Sample = 0.3034767 Perplexity = 1.35456
COMPLETED
__COMPLETED__
@@ -25,7 +25,7 @@ Then install numpy package by following instruction from: http://www.scipy.org/i
2. Alternatively install Python Anaconda distribution which contains most of the popular Python packages including numpy:
http://continuum.io/downloads

`-f` parameter is optional and specifies output format of the datasets. `cudnn` option (default) saves dataset in a spatial-major format used by cuDNN, while `legacy` - in CNTK legacy format. Use `cudnn` if CNTK is compiled with cuDNN **and** running on GPU and `legacy` otherwise.
`-f` parameter is optional and specifies output format of the datasets. `cudnn` option (default) saves dataset in a spatial-major format used by cuDNN, while `legacy` - in CNTK legacy format. Use `cudnn` if CNTK is compiled with cuDNN and `legacy` otherwise.

ResNet samples require converting CIFAR-10 dataset to actual images. This can be performed by running the following command:
```
@@ -54,5 +54,7 @@ cntk configFile=02_BatchNormConv.cntk
3. 03_ResNet.ndl and 04_ResNet_56.ndl are very deep convolutional networks that use ResNet architecture and have 20 and 56 layers respectively (http://arxiv.org/abs/1512.03385).
With 03_ResNet.cntk you should get around 8.2% of error after training for about 50 minutes. 04_ResNet_56.cntk should produce around 6.4% of error after training for about 3 hours (see log files in the Output directory).

4. 05_ConvLocal.cntk uses locally-connected convolution layers (see `conv_local3` and `conv_local4` in `05_ConvLocal.cntk`) and resembles a network described here: https://code.google.com/p/cuda-convnet/source/browse/trunk/example-layers/layers-conv-local-11pct.cfg

For more details, refer to .ndl and corresponding .cntk files.
@@ -66,7 +66,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.875
# Crop scale ratio jitter type.
@@ -1,9 +1,6 @@
m1=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m1)

# Switch batch normalization to eval mode.
SetPropertyForSubTree(CE, batchNormEvalMode, true)

# Add top-5 error prediction node.
ErrTop5 = ErrorPrediction(labels, OutputNodes.z, Const(5), tag = Eval)
@@ -1,18 +1,29 @@
Conv(W, inp, outMap, kW, kH, hStride, vStride)
[
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
]

BN(inp, mapCount, bValue, scValue, bnTimeConst)
[
b = Parameter(mapCount, 1, init = fixedValue, value = bValue)
sc = Parameter(mapCount, 1, init = fixedValue, value = scValue)
m = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

y = BatchNormalization(inp, sc, b, m, isd, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn")
]

ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
[
b = Parameter(outMap, 1, init = fixedValue, value = bValue)
sc = Parameter(outMap, 1, init = fixedValue, value = scValue)
m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
y = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn")
c = Conv(W, inp, outMap, kW, kH, hStride, vStride)
y = BN(c, outMap, bValue, scValue, bnTimeConst)
]

ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
[
W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
c = ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
c = Conv(W, inp, outMap, kW, kH, hStride, vStride)
y = BN(c, outMap, bValue, scValue, bnTimeConst)
]

ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
@@ -21,6 +32,19 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
y = RectifiedLinear(c)
]

Conv1x1(inp, outMap, inMap, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
[
W = Parameter(outMap, inMap, init = Gaussian, initValueScale = wScale)
c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = "cudnn")
y = BN(c, outMap, bValue, scValue, bnTimeConst)
]

Conv1x1ReLU(inp, outMap, inMap, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
[
c = Conv1x1(inp, outMap, inMap, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
y = RectifiedLinear(c)
]

# Standard building block for ResNet with identity shortcut (option A).
ResNetNode2A(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue)
[
@@ -48,15 +72,30 @@ ResNetNode2AInc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue,
y2 = RectifiedLinear(p)
]

# Standard building block for ResNet with padding (option B).
ResNetNode2BInc(inp, outMap, inMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
[
# First convolution layer.
c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 2, 2, wScale, bValue, scValue, bnTimeConst)
# Second convolution layer, no ReLU.
c2 = ConvBNLayer(c1, outMap, wCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst)

# Projection convolution layer.
c_proj = Conv1x1(inp, outMap, inMap, 2, 2, wScale, bValue, scValue, bnTimeConst)

p = Plus(c2, c_proj)
y2 = RectifiedLinear(p)
]

# Bottleneck building block for ResNet.
ResNetNode3A(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, bnTimeConst)
[
# 1x1 reducing convolution.
c1 = ConvBNReLULayer(inp, convMap, inMap, 1, 1, 1, 1, wScale, bValue, scValue, bnTimeConst)
c1 = Conv1x1ReLU(inp, convMap, inMap, 1, 1, wScale, bValue, scValue, bnTimeConst)
# 3x3 convolution.
c2 = ConvBNReLULayer(c1, convMap, convWCount, 3, 3, 1, 1, wScale, bValue, scValue, bnTimeConst)
# 1x1 expanding convolution, no ReLU.
c3 = ConvBNLayer(c2, outMap, convMap, 1, 1, 1, 1, wScale, bValue, scValue, bnTimeConst)
c3 = Conv1x1(c2, outMap, convMap, 1, 1, wScale, bValue, scValue, bnTimeConst)

p = Plus(c3, inp)
y = RectifiedLinear(p)
@@ -65,11 +104,11 @@ ResNetNode3A(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, b
ResNetNode3AInc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, bnTimeConst, wProj, projStride)
[
# 1x1 reducing convolution.
c1 = ConvBNReLULayer(inp, convMap, inMap, 1, 1, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
c1 = Conv1x1ReLU(inp, convMap, inMap, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
# 3x3 convolution.
c2 = ConvBNReLULayer(c1, convMap, convWCount, 3, 3, 1, 1, wScale, bValue, scValue, bnTimeConst)
# 1x1 expanding convolution, no ReLU.
c3 = ConvBNLayer(c2, outMap, convMap, 1, 1, 1, 1, wScale, bValue, scValue, bnTimeConst)
c3 = Conv1x1(c2, outMap, convMap, 1, 1, wScale, bValue, scValue, bnTimeConst)
# Input-to-output mapping convolution.
c_proj = ConvBNLayerW(wProj, inp, outMap, 1, 1, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
@@ -80,13 +119,13 @@ ResNetNode3AInc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue
ResNetNode3BInc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, bnTimeConst, projStride)
[
# 1x1 reducing convolution.
c1 = ConvBNReLULayer(inp, convMap, inMap, 1, 1, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
c1 = Conv1x1ReLU(inp, convMap, inMap, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
# 3x3 convolution.
c2 = ConvBNReLULayer(c1, convMap, convWCount, 3, 3, 1, 1, wScale, bValue, scValue, bnTimeConst)
# 1x1 expanding convolution, no ReLU.
c3 = ConvBNLayer(c2, outMap, convMap, 1, 1, 1, 1, wScale, bValue, scValue, bnTimeConst)
c3 = Conv1x1(c2, outMap, convMap, 1, 1, wScale, bValue, scValue, bnTimeConst)
# Input-to-output mapping convolution.
c_proj = ConvBNLayer(inp, outMap, inMap, 1, 1, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
c_proj = Conv1x1(inp, outMap, inMap, projStride, projStride, wScale, bValue, scValue, bnTimeConst)

p = Plus(c3, c_proj)
y = RectifiedLinear(p)
@@ -99,3 +138,8 @@ DnnLayer(hiddenDim, labelDim, x, wScale, bValue)
t = Times(W, x)
z = Plus(t, b)
]

MaxNDPooling(inp, kW, kH, hStride, vStride)
[
p = Pooling(inp, "max", {kW, kH, 1}, stride = {hStride, vStride, 1}, autoPadding = {true, true, false}, imageLayout = "cudnn")
]
@@ -71,7 +71,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.46666:0.875
# Crop scale ratio jitter type.
@@ -41,8 +41,8 @@ DNN=[
conv1WScale = 0.6
conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, conv1WScale, convBValue, scValue, bnTimeConst)
# Max pooling
pool1W = 2
pool1H = 2
pool1W = 3
pool1H = 3
pool1hs = 2
pool1vs = 2
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn")
@@ -0,0 +1,115 @@
RootDir = "."

ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"

ndlMacros="$ConfigDir$/Macros.ndl"

precision="float"
deviceId="Auto"

command=Train:CreateEval:Test

parallelTrain="false"

stderr="$OutputDir$/ResNet_18"
traceLevel=1
numMBsToShowResult=500

Train=[
action="train"
modelPath="$ModelDir$/ResNet_18"

NDLNetworkBuilder=[
networkDescription="$ConfigDir$/ResNet_18.ndl"
]

SGD=[
epochSize=0
minibatchSize=256
# Note that learning rates are 10x more than in the paper due to a different
# momentum update rule in CNTK: v{t + 1} = lr*(1 - momentum)*g{t + 1} + momentum*v{t}
learningRatesPerMB=1.0*35:0.1*35:0.01
momentumPerMB=0.9
maxEpochs=125
gradUpdateType="None"
L2RegWeight=0.0001
dropoutRate=0

ParallelTrain=[
parallelizationMethod="DataParallelSGD"
distributedMBReading="true"
parallelizationStartEpoch=1
DataParallelSGD=[
gradientBits=32
]
]
]

reader=[
readerType="ImageReader"
# Map file which maps images to labels using the following format:
# <full path to image><tab><numerical label (0-based class id)>
# Example:
# C:\Data\ImageNet\2012\train\n01440764\n01440764_10026.JPEG<tab>0
file="$DataDir$/train_map.txt"
# Randomize images before every epoch. Possible values: None, Auto. Default: Auto.
randomize="Auto"
features=[
# Below are the required parameters.
width=224
height=224
channels=3
# Below are the optional parameters.
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.46666:0.875
# Crop scale ratio jitter type.
# Possible values: None, UniRatio, UniLength, UniArea. Default: UniRatio
jitterType="UniRatio"
# Interpolation to use when scaling image to width x height size.
# Possible values: nearest, linear, cubic, lanczos. Default: linear.
interpolations="Linear"
# Stores mean values for each pixel in OpenCV matrix XML format.
meanFile="$ConfigDir$/ImageNet1K_mean.xml"
]
labels=[
labelDim=1000
]
]
]

CreateEval=[
action="edit"
CurModel="$ModelDir$/ResNet_18"
NewModel="$ModelDir$/ResNet_18.Eval"
editPath="$ConfigDir$/CreateEvalModel.mel"
]

Test=[
action="test"
modelPath="$ModelDir$/ResNet_18.Eval"
# Set minibatch size for testing.
minibatchSize=64

reader=[
readerType="ImageReader"
file="$DataDir$/val_map.txt"
randomize="None"
features=[
width=224
height=224
channels=3
cropType="Center"
meanFile="$ConfigDir$/ImageNet1K_mean.xml"
]
labels=[
labelDim=1000
]
]
]
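The learning-rate comment in the SGD section above can be checked numerically: CNTK's update v{t+1} = lr*(1 - momentum)*g{t+1} + momentum*v{t} pre-scales the gradient by (1 - momentum), so matching a paper that uses the classic rule requires a rate 1/(1 - momentum) times larger, i.e. 10x for momentum 0.9. A minimal sketch (plain Python, not CNTK code):

```python
# Classic momentum SGD velocity update.
def classic_step(v, grad, lr, momentum):
    return momentum * v + lr * grad

# CNTK's smoothed variant, per the comment in the config above.
def cntk_step(v, grad, lr, momentum):
    return momentum * v + lr * (1.0 - momentum) * grad

v1 = v2 = 0.0
for _ in range(200):  # run to steady state on a constant gradient
    v1 = classic_step(v1, grad=1.0, lr=0.1, momentum=0.9)
    v2 = cntk_step(v2, grad=1.0, lr=1.0, momentum=0.9)

# Both converge to the same step size: 0.1/(1-0.9) == 1.0*(1-0.9)/(1-0.9) == 1.0
print(round(v1, 6), round(v2, 6))
```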
@@ -0,0 +1,72 @@
load=ndlMacros
run=DNN

ndlMacros = [
ImageW = 224
ImageH = 224
ImageC = 3
LabelDim = 1000

features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn")
labels = Input(LabelDim, tag = label)

# Kernels width and height.
kW = 3
kH = 3
# Kernel stride.
hs = 1
vs = 1

# Initial parameter values.
convWScale = 7.07
convBValue = 0

fcWScale = 1.13
fcBValue = 0

scValue = 1

# Batch normalization time constant.
bnTimeConst = 32768
]

DNN=[
conv1WScale = 0.6
cMap1 = 64
conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, conv1WScale, convBValue, scValue, bnTimeConst)
# Max pooling
pool1W = 3
pool1H = 3
pool1hs = 2
pool1vs = 2
pool1 = MaxNDPooling(conv1, pool1W, pool1H, pool1hs, pool1vs)

rn1_1 = ResNetNode2A(pool1, cMap1, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn1_2 = ResNetNode2A(rn1_1, cMap1, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

cMap2 = 128
rn2_1 = ResNetNode2BInc(rn1_2, cMap2, cMap1, 576, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_2 = ResNetNode2A(rn2_1, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

cMap3 = 256
rn3_1 = ResNetNode2BInc(rn2_2, cMap3, cMap2, 1152, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn3_2 = ResNetNode2A(rn3_1, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

cMap4 = 512
rn4_1 = ResNetNode2BInc(rn3_2, cMap4, cMap3, 2304, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn4_2 = ResNetNode2A(rn4_1, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn4_3 = ResNetNode2A(rn4_2, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

# Global average pooling
pool2W = 7
pool2H = 7
pool2hs = 1
pool2vs = 1
pool5 = AveragePooling(rn4_3, pool2W, pool2H, pool2hs, pool2vs, imageLayout = "cudnn")

ol = DnnLayer(cMap4, labelDim, pool5, fcWScale, fcBValue)

CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
Err = ErrorPrediction(labels, ol, tag = Eval)
OutputNodes = ol
]
@@ -70,7 +70,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.46666:0.875
# Crop scale ratio jitter type.
@@ -35,26 +35,24 @@ DNN=[
cMap1 = 64
conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, conv1WScale, convBValue, scValue, bnTimeConst)
# Max pooling
pool1W = 2
pool1H = 2
pool1W = 3
pool1H = 3
pool1hs = 2
pool1vs = 2
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn")
pool1 = MaxNDPooling(conv1, pool1W, pool1H, pool1hs, pool1vs)

rn1_1 = ResNetNode2A(pool1, cMap1, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn1_2 = ResNetNode2A(rn1_1, cMap1, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn1_3 = ResNetNode2A(rn1_2, cMap1, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

cMap2 = 128
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj64to128Filename$", needGradient = false)
rn2_1 = ResNetNode2AInc(rn1_3, cMap2, 576, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn2_1_Wproj)
rn2_1 = ResNetNode2BInc(rn1_3, cMap2, cMap1, 576, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_2 = ResNetNode2A(rn2_1, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_3 = ResNetNode2A(rn2_2, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_4 = ResNetNode2A(rn2_3, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

cMap3 = 256
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj128to256Filename$", needGradient = false)
rn3_1 = ResNetNode2AInc(rn2_4, cMap3, 1152, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn3_1_Wproj)
rn3_1 = ResNetNode2BInc(rn2_4, cMap3, cMap2, 1152, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn3_2 = ResNetNode2A(rn3_1, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn3_3 = ResNetNode2A(rn3_2, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn3_4 = ResNetNode2A(rn3_3, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
@@ -62,8 +60,7 @@ DNN=[
rn3_6 = ResNetNode2A(rn3_5, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

cMap4 = 512
rn4_1_Wproj = Parameter(cMap4, cMap3, init = fromFile, initFromFilePath = "$Proj256to512Filename$", needGradient = false)
rn4_1 = ResNetNode2AInc(rn3_6, cMap4, 2304, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn4_1_Wproj)
rn4_1 = ResNetNode2BInc(rn3_6, cMap4, cMap3, 2304, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn4_2 = ResNetNode2A(rn4_1, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn4_3 = ResNetNode2A(rn4_2, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
@@ -71,7 +71,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.46666:0.875
# Crop scale ratio jitter type.
@@ -41,11 +41,11 @@ DNN=[
conv1WScale = 0.6
conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, conv1WScale, convBValue, scValue, bnTimeConst)
# Max pooling
pool1W = 2
pool1H = 2
pool1W = 3
pool1H = 3
pool1hs = 2
pool1vs = 2
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn")
pool1 = MaxNDPooling(conv1, pool1W, pool1H, pool1hs, pool1vs)

rn1_1 = ResNetNode3BInc(pool1, cMap1, cMap1, cMap3, 576, convWScale, convBValue, scValue, bnTimeConst, 1)
rn1_2 = ResNetNode3A(rn1_1, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue, bnTimeConst)
@@ -1,9 +1,6 @@
m1=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m1)

# Switch batch normalization to eval mode.
SetPropertyForSubTree(CE, batchNormEvalMode, true)

# Add top-5 error prediction node.
ErrTop5 = ErrorPrediction(labels, OutputNodes.z, Const(5), tag = "eval")
@@ -17,7 +17,7 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue)
m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false)
bn = BatchNormalization(t, sc, b, m, isd, spatial = false)
y = RectifiedLinear(bn)
]
@@ -50,6 +50,6 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
isd = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, imageLayout = "cudnn")
bn = BatchNormalization(c, sc, b, m, isd, spatial = true, imageLayout = "cudnn")
y = RectifiedLinear(bn);
]
@@ -56,7 +56,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.875
# Crop scale ratio jitter type.
@@ -65,7 +65,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.875
# Crop scale ratio jitter type.
@@ -65,7 +65,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.875
# Crop scale ratio jitter type.
Makefile
@@ -31,6 +31,8 @@
# defaults to /usr/local/
# These can be overridden on the command line, e.g. make BUILDTYPE=debug

ARCH=$(shell uname)

ifndef BUILD_TOP
BUILD_TOP=.
endif
@@ -211,9 +213,11 @@ CNTKMATH:=cntkmath
BUILDINFO:= $(SOURCEDIR)/CNTK/buildinfo.h
GENBUILD:=Tools/generate_build_info

$(BUILDINFO): $(GENBUILD)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@$(GENBUILD) $(BUILD_TOP)/Config.make
BUILDINFO_OUTPUT := $(shell $(GENBUILD) $(BUILD_TOP)/Config.make && echo Success)

ifneq ("$(BUILDINFO_OUTPUT)","Success")
$(error Could not generate $(BUILDINFO))
endif


########################################
@@ -228,6 +232,9 @@ READER_SRC =\
$(SOURCEDIR)/Readers/ReaderLib/ReaderShim.cpp \
$(SOURCEDIR)/Readers/ReaderLib/ChunkRandomizer.cpp \
$(SOURCEDIR)/Readers/ReaderLib/SequenceRandomizer.cpp \
$(SOURCEDIR)/Readers/ReaderLib/SequencePacker.cpp \
$(SOURCEDIR)/Readers/ReaderLib/BpttPacker.cpp \
$(SOURCEDIR)/Readers/ReaderLib/PackerBase.cpp \
$(SOURCEDIR)/Readers/ReaderLib/SampleModePacker.cpp \

COMMON_SRC =\
@@ -250,6 +257,7 @@ MATH_SRC =\
$(SOURCEDIR)/Math/TensorView.cpp \
$(SOURCEDIR)/Math/CUDAPageLockedMemAllocator.cpp \
$(SOURCEDIR)/Math/ConvolutionEngine.cpp \
$(SOURCEDIR)/Math/BatchNormalizationEngine.cpp \

ifdef CUDA_PATH
MATH_SRC +=\
@@ -258,7 +266,9 @@ MATH_SRC +=\
$(SOURCEDIR)/Math/GPUSparseMatrix.cu \
$(SOURCEDIR)/Math/GPUWatcher.cu \
$(SOURCEDIR)/Math/MatrixQuantizerGPU.cu \
$(SOURCEDIR)/Math/CuDnnCommon.cu \
$(SOURCEDIR)/Math/CuDnnConvolutionEngine.cu \
$(SOURCEDIR)/Math/CuDnnBatchNormalization.cu \
$(SOURCEDIR)/Math/GPUDataTransferer.cpp \

else
@@ -376,6 +386,7 @@ LUSEQUENCEREADER_SRC =\
$(SOURCEDIR)/Readers/LUSequenceReader/DataWriterLocal.cpp \
$(SOURCEDIR)/Readers/LUSequenceReader/LUSequenceParser.cpp \
$(SOURCEDIR)/Readers/LUSequenceReader/LUSequenceReader.cpp \
$(SOURCEDIR)/Readers/LUSequenceReader/LUSequenceWriter.cpp \

LUSEQUENCEREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(LUSEQUENCEREADER_SRC))
@@ -595,8 +606,9 @@ CNTK_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(C

CNTK:=$(BINDIR)/cntk
ALL+=$(CNTK)
SRC+=$(CNTK_SRC)

$(CNTK): $(BUILDINFO) $(CNTK_OBJ) | $(CNTKMATH_LIB)
$(CNTK): $(CNTK_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building output for $(ARCH) with build type $(BUILDTYPE)
@@ -638,10 +650,7 @@ $(OBJDIR)/%.o : %.cpp Makefile
@mkdir -p $(dir $@)
$(CXX) -c $< -o $@ $(COMMON_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(INCLUDEPATH:%=-I%) -MD -MP -MF ${@:.o=.d}

.PHONY: force clean buildall all

force: $(BUILDINFO)

.PHONY: clean buildall all

clean:
@echo $(SEPARATOR)
@@ -14,6 +14,7 @@
#include "ConvolutionalNodes.h"
#include "NonlinearityNodes.h"
#include "ReshapingNodes.h"
#include "InputAndParamNodes.h"
#include "TensorShape.h"

namespace Microsoft { namespace MSR { namespace CNTK {
@@ -288,36 +289,135 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
nodePtr = builder.FutureValue(NULL, defaultHiddenActivity, rows, timeStep, name);
}
}
else if (cnNodeType == OperationNameOf(ConvolutionNode))
else if (cnNodeType == OperationNameOf(ConvolutionNode) || cnNodeType == OperationNameOf(PoolingNode))
{
if (parameter.size() != 7)
RuntimeError("%ls should have 7 fixed parameters[weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels,horizontalSubsample, verticalSubsample] and two optional parameters [zeroPadding = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"HWC\"|\"cudnn\"].", cnNodeType.c_str());
if (parameter.size() != 3 && parameter.size() != 7)
{
if (cnNodeType == OperationNameOf(ConvolutionNode))
{
RuntimeError("%ls: unexpected parameter count. %ls supports 2 modes: \n"
"1. 2D convolution which takes 7 fixed parameters [weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels,horizontalSubsample, verticalSubsample] \n"
"and two optional parameters [zeroPadding = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"HWC\"|\"cudnn\"]. \n"
"2. ND convolution which takes 3 fixed parameters [weightNodeName, inputValueNodeName, kernelShape] and \n"
"9 optional parameters [mapCount = [1|yourvalue], stride = [1|yourvalue], sharing = [true|yourvalue], autoPadding = [true|yourvalue], lowerPad = [0|yourvalue], upperPad = [0|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"cudnn\"|\"HWC\"]. \n"
"For ND convolution, parameters kernelShape, mapCount, stride, sharing, autoPadding, lowerPad, upperPad can be arrays, e.g. kernelShape={5, 5, 3}",
cnNodeType.c_str(), cnNodeType.c_str());
}
else
{
RuntimeError("%ls: unexpected parameter count. %ls takes 3 fixed parameters [inputValueNodeName, poolKind, kernelShape] and \n"
"5 optional parameters [stride = [1|yourvalue], autoPadding = [true|yourvalue], lowerPad = [0|yourvalue], upperPad = [0|yourvalue], imageLayout = \"cudnn\"|\"HWC\"]. \n"
"Parameters kernelShape, stride, autoPadding, lowerPad, upperPad can be arrays, e.g. kernelShape={5, 5, 3}",
cnNodeType.c_str(), cnNodeType.c_str());
}
}

// setup the parameter position of children so we can hook them up later
nodeParamCount = 2;
nodeParamStart = 0;
nodeParamCount = cnNodeType == OperationNameOf(ConvolutionNode) ? 2 : 1;

if (pass == ndlPassInitial)
{
int id = 2; // skip weightNode and inputValueNode
if (parameter.size() == 3)
{
auto reqParams = node->GetParameters(false);
auto optParams = node->GetParameters(true);
auto paramGetter = [reqParams, node](size_t index) -> TensorShape
|
||||
{
|
||||
assert(index < reqParams.size());
|
||||
auto parm = reqParams[index];
|
||||
if (parm->GetType() != ndlTypeArray)
|
||||
return TensorShape((size_t)parm->GetScalar());
|
||||
auto parms = node->GetParentScript()->ParseVariable(parm->GetValue(), false)->GetParameters();
|
||||
vector<size_t> dims(parms.size());
|
||||
for (size_t i = 0; i < dims.size(); i++)
|
||||
dims[i] = parms[i]->GetValue();
|
||||
return TensorShape(dims);
|
||||
};
|
||||
auto paramResolver = [optParams, node](const char* name, size_t defaultVal) -> TensorShape
|
||||
{
|
||||
auto res = std::find_if(begin(optParams), end(optParams), [name](const NDLNode<ElemType>* n) { return EqualCI(n->GetName(), name); });
|
||||
if (res == end(optParams))
|
||||
return TensorShape(defaultVal);
|
||||
auto parm = node->GetParentScript()->ParseVariable((*res)->GetValue(), false);
|
||||
if (parm->GetType() == ndlTypeConstant)
|
||||
return TensorShape((size_t)parm->GetValue());
|
||||
auto parms = parm->GetParameters();
|
||||
vector<size_t> dims(parms.size());
|
||||
for (size_t i = 0; i < dims.size(); i++)
|
||||
dims[i] = parms[i]->GetValue();
|
||||
return TensorShape(dims);
|
||||
};
|
||||
auto boolParamResolver = [&optParams, node](const char* name, bool defaultVal) -> vector<bool>
|
||||
{
|
||||
auto res = std::find_if(begin(optParams), end(optParams), [name](const NDLNode<ElemType>* n) { return EqualCI(n->GetName(), name); });
|
||||
if (res == end(optParams))
|
||||
return vector<bool>{defaultVal};
|
||||
auto parm = node->GetParentScript()->ParseVariable((*res)->GetValue(), false);
|
||||
if (parm == nullptr)
|
||||
return vector<bool>{(*res)->GetValue()};
|
||||
if (parm->GetType() != ndlTypeArray)
|
||||
return vector<bool>{parm->GetValue()};
|
||||
auto parms = parm->GetParameters();
|
||||
vector<bool> dims(parms.size());
|
||||
for (size_t i = 0; i < dims.size(); i++)
|
||||
dims[i] = parms[i]->GetValue();
|
||||
return dims;
|
||||
};
|
||||
|
||||
// evaluate only scalar parameters
|
||||
vector<void*> params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass);
|
||||
id = 0; // reset counter because the params array starts at zero
|
||||
size_t kernelWidth = ((NDLNode<ElemType>*) params[id++])->GetScalar();
|
||||
size_t kernelHeight = ((NDLNode<ElemType>*) params[id++])->GetScalar();
|
||||
size_t outputChannels = ((NDLNode<ElemType>*) params[id++])->GetScalar();
|
||||
size_t horizontalSubsample = ((NDLNode<ElemType>*) params[id++])->GetScalar();
|
||||
size_t verticalSubsample = ((NDLNode<ElemType>*) params[id++])->GetScalar();
|
||||
assert(id == 5);
|
||||
auto kernelShape = paramGetter(reqParams.size() - 1);
|
||||
auto mapCount = paramResolver("mapCount", 1);
|
||||
auto stride = paramResolver("stride", 1);
|
||||
auto sharing = boolParamResolver("sharing", true);
|
||||
auto autoPad = boolParamResolver("autoPadding", true);
|
||||
auto lowerPad = paramResolver("lowerPad", 0);
|
||||
auto upperPad = paramResolver("upperPad", 0);
|
||||
ImageLayoutKind imageLayout = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "CHW"));
|
||||
size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0");
|
||||
|
||||
// optional
|
||||
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));
|
||||
bool zeroPadding = node->GetOptionalParameter("zeroPadding", "false");
|
||||
size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0");
|
||||
auto pool = PoolKind::None;
|
||||
if (cnNodeType == OperationNameOf(PoolingNode))
|
||||
{
|
||||
auto parm = node->GetParentScript()->ParseVariable(reqParams[1]->GetValue(), false);
|
||||
pool = PoolKindFrom(wstring(parm->GetValue()));
|
||||
}
|
||||
|
||||
nodePtr = builder.Convolution(NULL, NULL, kernelWidth, kernelHeight, outputChannels,
|
||||
horizontalSubsample, verticalSubsample, imageLayoutKind, zeroPadding, maxTempMemSizeInSamples, name);
|
||||
if (pool == PoolKind::None)
|
||||
{
|
||||
nodePtr = builder.Convolution(NULL, NULL, kernelShape, mapCount, stride, sharing,
|
||||
autoPad, lowerPad, upperPad, imageLayout, maxTempMemSizeInSamples, name);
|
||||
}
|
||||
else
|
||||
{
|
||||
nodePtr = builder.Pooling(NULL, pool, kernelShape, stride, autoPad, lowerPad, upperPad, imageLayout, name);
|
||||
}
|
||||
|
||||
}
|
||||
else if (parameter.size() == 7)
|
||||
{
|
||||
int id = 2; // skip weightNode and inputValueNode
|
||||
|
||||
// evaluate only scalar parameters
|
||||
vector<void*> params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass);
|
||||
id = 0; // reset counter because the params array starts at zero
|
||||
size_t kernelWidth = ((NDLNode<ElemType>*) params[id++])->GetScalar();
|
||||
size_t kernelHeight = ((NDLNode<ElemType>*) params[id++])->GetScalar();
|
||||
size_t outputChannels = ((NDLNode<ElemType>*) params[id++])->GetScalar();
|
||||
size_t horizontalSubsample = ((NDLNode<ElemType>*) params[id++])->GetScalar();
|
||||
size_t verticalSubsample = ((NDLNode<ElemType>*) params[id++])->GetScalar();
|
||||
assert(id == 5);
|
||||
|
||||
// optional
|
||||
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));
|
||||
bool zeroPadding = node->GetOptionalParameter("zeroPadding", "false");
|
||||
size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0");
|
||||
|
||||
nodePtr = builder.Convolution(NULL, NULL, kernelWidth, kernelHeight, outputChannels,
|
||||
horizontalSubsample, verticalSubsample, imageLayoutKind, zeroPadding,
|
||||
maxTempMemSizeInSamples, name);
|
||||
}
|
||||
else
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
else if (cnNodeType == OperationNameOf(MaxPoolingNode))
|
||||
|
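The paramGetter/paramResolver/boolParamResolver lambdas above all follow the same rule: an absent optional parameter falls back to a scalar default, a scalar becomes a rank-1 shape that is broadcast across dimensions later, and an NDL array becomes one entry per dimension. A minimal standalone C++ analogue of that rule, with a simplified Param type standing in for NDLNode (the type and the names are illustrative only, not part of this commit):

    #include <cstddef>
    #include <vector>

    // Simplified, hypothetical stand-in for an NDL parameter node.
    struct Param
    {
        bool isArray;
        std::size_t scalar;
        std::vector<std::size_t> dims;
    };

    // Mirrors the resolver logic: nullptr means "parameter not given".
    std::vector<std::size_t> ResolveShape(const Param* p, std::size_t defaultVal)
    {
        if (p == nullptr)
            return {defaultVal}; // absent: use the scalar default
        if (!p->isArray)
            return {p->scalar};  // scalar: rank-1 shape, broadcast to all dims later
        return p->dims;          // array: explicit per-dimension values
    }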
@@ -392,9 +492,9 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
    vector<void*> params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass);

    // Optional parameters
    bool eval = node->GetOptionalParameter("eval", "false");
    bool spatial = node->GetOptionalParameter("spatial", "false");
    double normTimeConst = node->GetOptionalParameter("normalizationTimeConstant", "0");
    double blendTimeConst = node->GetOptionalParameter("blendTimeConstant", "0");
    double epsilon = node->GetOptionalParameter("epsilon", "0.00001");
    std::wstring bnEngineS = node->GetOptionalParameter("engine", "cntk");
    bool useCntkEngine;

@@ -406,7 +506,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
        InvalidArgument("Unsupported batch normalization engine, choose either \"cntk\"(default) or \"cudnn\".");
    ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "CHW"));

    nodePtr = builder.BatchNormalization(nullptr, nullptr, nullptr, nullptr, nullptr, eval, spatial, normTimeConst, epsilon, useCntkEngine, imageLayoutKind, name);
    nodePtr = builder.BatchNormalization(nullptr, nullptr, nullptr, nullptr, nullptr, spatial, normTimeConst, blendTimeConst, epsilon, useCntkEngine, imageLayoutKind, name);
}
}
else
@@ -157,6 +157,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
#endif
    else if (EqualInsensitive(nodeType, OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode), L"CBCEWithSM")) ret = true;
    else if (EqualInsensitive(nodeType, OperationNameOf(ConvolutionNode), L"Convolve")) ret = true;
    else if (EqualInsensitive(nodeType, OperationNameOf(PoolingNode))) ret = true;
    else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceNode), L"CosDist")) ret = true;
    else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceWithNegativeSamplesNode), L"CosWithNegSamples")) ret = true;
    else if (EqualInsensitive(nodeType, OperationNameOf(CosineNode), L"Cos")) ret = true;
@@ -79,14 +79,15 @@ Logistic(label, probability, tag='') = new ComputationNode [ operation = 'Logist
WeightedLogistic(label, probability, instanceWeight, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability : instanceWeight) /*plus the function args*/ ]
ReconcileMBLayout(dataInput, layoutInput, tag='') = new ComputationNode [ operation = 'ReconcileMBLayout' ; inputs = (dataInput : layoutInput) /*plus the function args*/ ]
CastAs (type, data) = ReconcileMBLayout (data, type) # read as CastAs<type>(data) where the cast may consist of rearranging the data w.r.t. MBLayout or broadcasting across sequence items
Convolution(weightNode, inputValueNode, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode) /*plus the function args*/ ]
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
Pooling(input, poolKind/*'max'|'average'*/, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Pooling' ; inputs = (input); pool = poolKind ; kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
MaxPooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxPooling' ; inputs = input /*plus the function args*/ ]
AveragePooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'AveragePooling' ; inputs = input /*plus the function args*/ ]
ColumnwiseCrossProduct = KhatriRaoProduct // deprecated
ClassificationError = ErrorPrediction
Delay = PastValue

BatchNormalization(input, scale, bias, runMean, runInvStdDev, eval, spatial, normalizationTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runInvStdDev) /*plus the function args*/ ]
BatchNormalization(input, scale, bias, runMean, runInvStdDev, spatial, normalizationTimeConstant = 0, blendTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runInvStdDev) /*plus the function args*/ ]
Abs(x, tag='') = new ComputationNode [ operation = 'Abs' ; inputs = x /*plus the function args*/ ]
ClassBasedCrossEntropyWithSoftmax(labelClassDescriptorVectorSequence, mainInputInfo, mainWeight, classLogProbsBeforeSoftmax, tag='') = new ComputationNode [ operation = 'ClassBasedCrossEntropyWithSoftmax' ; inputs = (labelClassDescriptorVectorSequence : mainInputInfo : mainWeight : classLogProbsBeforeSoftmax) /*plus the function args*/ ]
ColumnElementTimes(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'ColumnElementTimes' ; inputs = (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ]
@@ -70,7 +70,7 @@ void TestCn(const ConfigParameters& config);

void RedirectStdErr(wstring logpath)
{
    fprintf(stderr, "Redirecting stderr to file %S\n", logpath.c_str());
    LOGPRINTF(stderr, "Redirecting stderr to file %S\n", logpath.c_str());
    auto f = make_shared<File>(logpath.c_str(), fileOptionsWrite | fileOptionsText);
    if (dup2(fileno(*f), 2) == -1)
    {

@@ -165,7 +165,7 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp

    if (numCPUThreads > 0)
    {
        std::cerr << "Using " << numCPUThreads << " CPU threads." << endl;
        LOGPRINTF(stderr, "Using %d CPU threads.\n", numCPUThreads);
    }

    bool progressTracing = config(L"progressTracing", false);

@@ -187,14 +187,14 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
            if (action[j] == "train" || action[j] == "trainRNN")
            {
                wstring modelPath = commandParams("modelPath");
                std::wcerr << "CNTKModelPath: " << modelPath << endl;
                LOGPRINTF(stderr, "CNTKModelPath: %ls\n", modelPath.c_str());
                size_t maxEpochs = GetMaxEpochs(commandParams);
                std::cerr << "CNTKCommandTrainInfo: " + command[i] << " : " << maxEpochs << endl;
                LOGPRINTF(stderr, "CNTKCommandTrainInfo: %s : %d\n", command[i].c_str(), (int) maxEpochs);
                fullTotalMaxEpochs += maxEpochs;
            }
        }
    }
    std::cerr << "CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : " << fullTotalMaxEpochs << endl;
    LOGPRINTF(stderr, "CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : %d\n", (int) fullTotalMaxEpochs);

    // set up progress tracing for compute cluster management
    if (progressTracing && (!mpi || mpi->IsMainNode()))

@@ -225,19 +225,20 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
        // print a banner to visually separate each action in the log
        const char* delim = "##############################################################################";
        const char* prefix = "Action ";
        fprintf(stderr, "\n%s\n", delim);
        fprintf(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
        fprintf(stderr, "# %s\"%s\"%*s #\n", prefix, thisAction.c_str(), (int)(strlen(delim) - strlen(prefix) - thisAction.size() - 6), "");
        fprintf(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
        fprintf(stderr, "%s\n\n", delim);
        fprintf(stderr, "\n");
        LOGPRINTF(stderr, "%s\n", delim);
        LOGPRINTF(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
        LOGPRINTF(stderr, "# %s\"%s\"%*s #\n", prefix, thisAction.c_str(), (int)(strlen(delim) - strlen(prefix) - thisAction.size() - 6), "");
        LOGPRINTF(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
        LOGPRINTF(stderr, "%s\n\n", delim);

        if ((mpi == nullptr) || (commandstoRunOnAllRanks.find(thisAction) != commandstoRunOnAllRanks.end()) || mpi->IsMainNode())
        {
            if (thisAction == "train" || thisAction == "trainRNN")
            {
                std::cerr << "CNTKCommandTrainBegin: " + command[i] << endl;
                LOGPRINTF(stderr, "CNTKCommandTrainBegin: %s\n", command[i].c_str());
                DoTrain<ConfigParameters, ElemType>(commandParams);
                std::cerr << "CNTKCommandTrainEnd: " + command[i] << endl;
                LOGPRINTF(stderr, "CNTKCommandTrainEnd: %s\n", command[i].c_str());
                fullEpochsOffset += GetMaxEpochs(commandParams);
            }
            else if (thisAction == "adapt")

@@ -298,7 +299,8 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
            }
        }

        fprintf(stderr, "\nAction \"%s\" complete.\n\n", thisAction.c_str());
        fprintf(stderr, "\n");
        LOGPRINTF(stderr, "Action \"%s\" complete.\n\n", thisAction.c_str());

        NDLScript<ElemType> ndlScript;
        ndlScript.ClearGlobal(); // clear global macros between commands
@@ -321,51 +323,51 @@ std::string TimeDateStamp()

void PrintBuiltInfo()
{
    fprintf(stderr, "-------------------------------------------------------------------\n");
    fprintf(stderr, "Build info: \n\n");
    fprintf(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
    fprintf(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
    LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
    LOGPRINTF(stderr, "Build info: \n\n");
    LOGPRINTF(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
    LOGPRINTF(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
#ifdef _BUILDTYPE_
    fprintf(stderr, "\t\tBuild type: %s\n", _BUILDTYPE_);
    LOGPRINTF(stderr, "\t\tBuild type: %s\n", _BUILDTYPE_);
#endif
#ifdef _BUILDTARGET_
    fprintf(stderr, "\t\tBuild target: %s\n", _BUILDTARGET_);
    LOGPRINTF(stderr, "\t\tBuild target: %s\n", _BUILDTARGET_);
#endif
#ifdef _WITH_1BITSGD_
    fprintf(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
    LOGPRINTF(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
#endif
#ifdef _MATHLIB_
    fprintf(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
    LOGPRINTF(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
#endif
#ifdef _CUDA_PATH_
    fprintf(stderr, "\t\tCUDA_PATH: %s\n", _CUDA_PATH_);
    LOGPRINTF(stderr, "\t\tCUDA_PATH: %s\n", _CUDA_PATH_);
#endif
#ifdef _CUB_PATH_
    fprintf(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
    LOGPRINTF(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
#endif
#ifdef _CUDNN_PATH_
    fprintf(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
    LOGPRINTF(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
#endif
#ifdef _GIT_EXIST
    fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
    fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
    LOGPRINTF(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
    LOGPRINTF(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
#endif
#ifdef _BUILDER_
    fprintf(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
    LOGPRINTF(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
#endif
#ifdef _BUILDPATH_
    fprintf(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
    LOGPRINTF(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
#endif
    fprintf(stderr, "-------------------------------------------------------------------\n");
    LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
}

void PrintUsageInfo()
{
    fprintf(stderr, "-------------------------------------------------------------------\n");
    fprintf(stderr, "Usage: cntk configFile=yourConfigFile\n");
    fprintf(stderr, "For detailed information please consult the CNTK book\n");
    fprintf(stderr, "\"An Introduction to Computational Networks and the Computational Network Toolkit\"\n");
    fprintf(stderr, "-------------------------------------------------------------------\n");
    LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
    LOGPRINTF(stderr, "Usage: cntk configFile=yourConfigFile\n");
    LOGPRINTF(stderr, "For detailed information please consult the CNTK book\n");
    LOGPRINTF(stderr, "\"An Introduction to Computational Networks and the Computational Network Toolkit\"\n");
    LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
}

// ---------------------------------------------------------------------------
@@ -414,7 +416,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
    for (const auto& arg : args)
        startupMessage += L" " + arg;

    fprintf(stderr, "%ls\n", startupMessage.c_str());
    LOGPRINTF(stderr, "%ls\n", startupMessage.c_str());

    // parse command-line options
    vector<wstring> sourceFiles;

@@ -443,6 +445,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
    // compile the BrainScript
    wstring bs = L"[\n";
    bs += L"include \'cntk.core.bs'"; // start with including the standard macros

    // Note: Using lowercase ^^ here to match the Linux name of the CNTK exe.
    //bs += standardFunctions + computationNodes + commonMacros + L"\n";
    for (const auto& sourceFile : sourceFiles)

@@ -451,7 +454,8 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
    for (const auto& over : overrides)
        bs += L"with [ " + over + L" ]\n";

    fprintf(stderr, "\n\nBrainScript -->\n\n%ls\n\n", bs.c_str());
    fprintf(stderr, "\n\n");
    LOGPRINTF(stderr, "BrainScript -->\n\n%ls\n\n", bs.c_str());

    let expr = BS::ParseConfigExpression(bs, move(includePaths)); // parse
    let valp = BS::Evaluate(expr); // evaluate parse into a dictionary

@@ -460,8 +464,10 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
    // legacy parameters that have changed spelling
    if (config.Find(L"DoneFile")) // variables follow camel case (start with lower-case letters)
        InvalidArgument("Legacy spelling of 'DoneFile' no longer allowed. Use 'doneFile'.");

    if (config.Find(L"command")) // spelling error, should be plural. Using 'actions' instead to match the data type.
        InvalidArgument("Legacy spelling of 'command' no longer allowed. Use 'actions'.");

    if (config.Find(L"type"))
        InvalidArgument("Legacy name 'type' no longer allowed. Use 'precision'.");

@@ -486,7 +492,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
        logpath += msra::strfun::wstrprintf(L"rank%d", (int) mpi->CurrentNodeRank());

        RedirectStdErr(logpath);
        fprintf(stderr, "%ls\n", startupMessage.c_str());
        LOGPRINTF(stderr, "%ls\n", startupMessage.c_str());
    }

    // echo config info to log

@@ -497,16 +503,18 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
    int numCPUThreads = config(L"numCPUThreads", 0);
    numCPUThreads = CPUMatrix<float /*any will do*/>::SetNumThreads(numCPUThreads);
    if (numCPUThreads > 0)
        fprintf(stderr, "Using %d CPU threads.\n", numCPUThreads);
        LOGPRINTF(stderr, "Using %d CPU threads.\n", numCPUThreads);

    bool progressTracing = config(L"progressTracing", false);
    size_t fullTotalMaxEpochs = 1; // BUGBUG: BS does not allow me to read out the max epochs parameters, as that would instantiate and thus execute the objects

    // set up progress tracing for compute cluster management
    if (progressTracing && ((mpi == nullptr) || mpi->IsMainNode()))
        ProgressTracing::TraceTotalNumberOfSteps(fullTotalMaxEpochs); // enable tracing, using this as the total number of epochs

    // MAIN LOOP that executes the actions
    auto actionsVal = config[L"actions"];

    // Note: weird behavior. If 'actions' is a scalar value (rather than an array) then it will have been resolved already after the above call. That means, it has already completed its action!
    // Not pretty, but a direct consequence of the lazy evaluation. The only good solution would be to have a syntax for arrays including length 0 and 1.
    // Since this in the end behaves indistinguishably from the array loop below, we will keep it for now.

@@ -532,7 +540,9 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
        fprintf(fp, "successfully finished at %s on %s\n", TimeDateStamp().c_str(), GetHostName().c_str());
        fcloseOrDie(fp);
    }
    fprintf(stderr, "COMPLETED\n"), fflush(stderr);
    // TODO: change this back to COMPLETED, double underscores don't look good in output
    LOGPRINTF(stderr, "__COMPLETED__\n");
    fflush(stderr);

    MPIWrapper::DeleteInstance();
    return EXIT_SUCCESS;
@@ -541,11 +551,16 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
// ---------------------------------------------------------------------------
// main() for old CNTK config language
// ---------------------------------------------------------------------------

int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & reports Win32 exceptions
// called from wmain which is a wrapper that catches & reports Win32 exceptions
int wmainOldCNTKConfig(int argc, wchar_t* argv[])
{
    ConfigParameters config;
    std::string rawConfigString = ConfigParameters::ParseCommandLine(argc, argv, config);
    std::string rawConfigString = ConfigParameters::ParseCommandLine(argc, argv, config); // get the command param set they want
    bool timestamping = config(L"timestamping", false);
    if (timestamping)
    {
        ProgressTracing::SetTimestampingFlag();
    }

    // get the command param set they want
    wstring logpath = config(L"stderr", L"");

@@ -586,8 +601,9 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is
    std::string timestamp = TimeDateStamp();

    // dump config info
    fprintf(stderr, "\nRunning on %s at %s\n", GetHostName().c_str(), timestamp.c_str());
    fprintf(stderr, "Command line: \n");
    fprintf(stderr, "\n");
    LOGPRINTF(stderr, "Running on %s at %s\n", GetHostName().c_str(), timestamp.c_str());
    LOGPRINTF(stderr, "Command line: \n");
    for (int i = 0; i < argc; i++)
        fprintf(stderr, "%*s%ls", i > 0 ? 2 : 0, "", argv[i]); // use 2 spaces for better visual separability
    fprintf(stderr, "\n\n");

@@ -595,24 +611,27 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is
#if 1 //def _DEBUG
    // This simply merges all the different config parameters specified (eg, via config files or via command line directly),
    // and prints it.
    fprintf(stderr, "\n\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>\n");
    fprintf(stderr, "%s\n", rawConfigString.c_str());
    fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<\n");
    fprintf(stderr, "\n\n");
    LOGPRINTF(stderr, ">>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>\n");
    LOGPRINTF(stderr, "%s\n", rawConfigString.c_str());
    LOGPRINTF(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<\n");

    // Same as above, but all variables are resolved. If a parameter is set multiple times (eg, set in config, overriden at command line),
    // Same as above, but all variables are resolved. If a parameter is set multiple times (eg, set in config, overridden at command line),
    // All of these assignments will appear, even though only the last assignment matters.
    fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
    fprintf(stderr, "%s\n", config.ResolveVariables(rawConfigString).c_str());
    fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
    fprintf(stderr, "\n");
    LOGPRINTF(stderr, ">>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
    LOGPRINTF(stderr, "%s\n", config.ResolveVariables(rawConfigString).c_str());
    LOGPRINTF(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");

    // This outputs the final value each variable/parameter is assigned to in config (so if a parameter is set multiple times, only the last
    // value it is set to will appear).
    fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
    fprintf(stderr, "\n");
    LOGPRINTF(stderr, ">>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
    config.dumpWithResolvedVariables();
    fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
    LOGPRINTF(stderr, "<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
#endif

    fprintf(stderr, "Commands:");
    LOGPRINTF(stderr, "Commands:");
    for (int i = 0; i < command.size(); i++)
        fprintf(stderr, " %s", command[i].c_str());
    fprintf(stderr, "\n");

@@ -623,7 +642,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is
    if (config.Exists("type"))
        InvalidArgument("CNTK: Use of 'type' parameter is deprecated, it is called 'precision' now.");

    fprintf(stderr, "Precision = \"%s\"\n", type.c_str());
    LOGPRINTF(stderr, "Precision = \"%s\"\n", type.c_str());

    if (type == "float")
        DoCommands<float>(config, mpi);
    else if (type == "double")

@@ -638,7 +658,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is
        fprintf(fp, "successfully finished at %s on %s\n", TimeDateStamp().c_str(), GetHostName().c_str());
        fcloseOrDie(fp);
    }
    fprintf(stderr, "COMPLETED\n"), fflush(stderr);
    // TODO: Change back to COMPLETED (no underscores)
    LOGPRINTF(stderr, "__COMPLETED__\n"), fflush(stderr);

    MPIWrapper::DeleteInstance();
    return EXIT_SUCCESS;
@@ -659,43 +680,52 @@ void AllocationFailureHandler()
int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & reports Win32 exceptions
{
    std::set_new_handler(AllocationFailureHandler);

    try
    {
    {
        PrintBuiltInfo(); // print build info directly in case that user provides zero argument (convenient for checking build type)

        if (argc <= 1)
        {
            fprintf(stderr, "No command-line argument given.\n");
            LOGPRINTF(stderr, "No command-line argument given.\n");
            PrintUsageInfo();
            return EXIT_FAILURE;
        }

        // detect legacy CNTK configuration
        bool isOldCNTKConfig = false;
        for (int i = 0; i < argc && !isOldCNTKConfig; i++)
            isOldCNTKConfig |= !_wcsnicmp(L"configFile=", argv[i], 11);

        if (isOldCNTKConfig)
            return wmainOldCNTKConfig(argc, argv);

        // run from BrainScript
        return wmainWithBS(argc, argv);
    }
    catch (const ScriptableObjects::ScriptingException& err)
    {
        fprintf(stderr, "\nEXCEPTION occurred: %s\n", err.what());
        fprintf(stderr, "\n");
        LOGPRINTF(stderr, "EXCEPTION occurred: %s\n", err.what());
        err.PrintError();
        return EXIT_FAILURE;
    }
    catch (const IExceptionWithCallStackBase& err)
    {
        fprintf(stderr, "\nEXCEPTION occurred: %s\n%s", dynamic_cast<const std::exception&>(err).what(), err.CallStack());
        fprintf(stderr, "\n");
        LOGPRINTF(stderr, "EXCEPTION occurred: %s\n%s", dynamic_cast<const std::exception&>(err).what(), err.CallStack());
        return EXIT_FAILURE;
    }
    catch (const std::exception& err)
    {
        fprintf(stderr, "\nEXCEPTION occurred: %s\n", err.what());
        fprintf(stderr, "\n");
        LOGPRINTF(stderr, "EXCEPTION occurred: %s\n", err.what());
        return EXIT_FAILURE;
    }
    catch (...)
    {
        fprintf(stderr, "\nUnknown ERROR occurred\n");
        fprintf(stderr, "\n");
        LOGPRINTF(stderr, "Unknown ERROR occurred\n");
        return EXIT_FAILURE;
    }
}

@@ -703,7 +733,8 @@ int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper th
#ifdef __WINDOWS__
void TerminateThis()
{
    fprintf(stderr, "terminate_this: aborting\n"), fflush(stderr);
    LOGPRINTF(stderr, "terminate_this: aborting\n");
    fflush(stderr);
    exit(EXIT_FAILURE);
}

@@ -714,7 +745,7 @@ static void LogDelayLoadError(PEXCEPTION_POINTERS pExcPointers)
    if (pExcPointers->ExceptionRecord->ExceptionCode == EXCEPTION_DLL_NOT_FOUND)
    {
        const auto & pDelayLoadInfo = *PDelayLoadInfo(pExcPointers->ExceptionRecord->ExceptionInformation[0]);
        fprintf(stderr, "CNTK: Failed to load DLL '%s'.\n", pDelayLoadInfo.szDll);
        LOGPRINTF(stderr, "CNTK: Failed to load DLL '%s'.\n", pDelayLoadInfo.szDll);
    }
}

@@ -736,7 +767,7 @@ int wmain(int argc, wchar_t* argv[]) // wmain wrapper that reports Win32 excepti
    else if (code == EXCEPTION_INT_DIVIDE_BY_ZERO) msg = ": Integer division by zero";
    else if (code == EXCEPTION_STACK_OVERFLOW) msg = ": Stack overflow";
    else if (code == EXCEPTION_DLL_NOT_FOUND) msg = ": Module not found";
    fprintf(stderr, "CNTK: Caught Win32 exception 0x%08x%s.\n", (unsigned int)code, msg);
    LOGPRINTF(stderr, "CNTK: Caught Win32 exception 0x%08x%s.\n", (unsigned int)code, msg);
    fflush(stderr);
    exit(EXIT_FAILURE);
}
@@ -9,6 +9,7 @@

#include "ModelEditLanguage.h"
#include "ConvolutionalNodes.h"
#include "InputAndParamNodes.h"
#include <map>

namespace Microsoft { namespace MSR { namespace CNTK {

@@ -58,8 +59,7 @@ enum MELProperty
    melPropFinalCriterion,
    melPropEvaluation,
    melPropOutput,
    melPropRecurrent,
    melPropBatchNormMode
    melPropRecurrent
};

// SetGroupTag - Set the group tag on a node

@@ -73,7 +73,7 @@ void MELScript<ElemType>::SetGroupTag(ComputationNodeBasePtr nodeProp, Computati
        cn->AddToNodeGroup(groupTag, nodeProp);
    else
        cn->RemoveFromNodeGroup(groupTag, nodeProp);
}

// ProcessNDLScript - Process the NDL script
// netNdl - netNDL structure

@@ -384,18 +384,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
        inputNodes[i - 1] = nodeFrom[0];
    }

#if 1
    nodeTo[0]->AttachInputs(inputNodes);
#else // TODO: delete this
    if (inputNodes.size() == 1)
        nodeTo[0]->AttachInputs(inputNodes[0]);
    else if (inputNodes.size() == 2)
        nodeTo[0]->AttachInputs(inputNodes[0], inputNodes[1]);
    else if (inputNodes.size() == 3)
        nodeTo[0]->AttachInputs(inputNodes[0], inputNodes[1], inputNodes[2]);
    else
        RuntimeError("SetNodeInputs(): You specified more than 3 input nodes.");
#endif
}
else if (EqualInsensitive(name, "SetProperty"))
{

@@ -416,8 +405,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa

    // map property name to property enum
    // Please keep this table sorted.
    if (EqualInsensitive(propName, "batchNormEvalMode")) prop = melPropBatchNormMode;
    else if (EqualInsensitive(propName, "criterion")) prop = melPropFinalCriterion;
    if (EqualInsensitive(propName, "criterion")) prop = melPropFinalCriterion;
    else if (EqualInsensitive(propName, "evaluation")) prop = melPropEvaluation;
    else if (EqualInsensitive(propName, "feature")) prop = melPropFeature;
    else if (EqualInsensitive(propName, "label")) prop = melPropLabel;

@@ -483,32 +471,6 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
        // what to do here?
        break;
    }
    case melPropBatchNormMode:
    {
        if (node->OperationName() != OperationNameOf(BatchNormalizationNode))
        {
            RuntimeError("Invalid node type: node %ls (type:%ls) is not a %ls node; therefore cannot apply batchNormEvalMode on it.",
                node->NodeName().c_str(),
                node->OperationName().c_str(),
                OperationNameOf(BatchNormalizationNode).c_str());
        }
        bool property = params[2];
        auto pnode = dynamic_pointer_cast<BatchNormalizationNode<float>>(node);
        if (pnode)
            pnode->SetEvalMode(property);
        else
        {
            auto pnode2 = dynamic_pointer_cast<BatchNormalizationNode<double>>(node);
            if (pnode2)
                pnode2->SetEvalMode(property);
            else
            {
                RuntimeError("Invalid node type: node name=%ls. We assume either BatchNormalizationNode<float> or BatchNormalizationNode<double>\n",
                    node->NodeName().c_str());
            }
        }
        break;
    }
    default:
    {
        RuntimeError("Invalid property, %s, is not supported", propName.c_str());

@@ -534,10 +496,6 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
    {
        prop = melPropLearningRateMultiplier;
    }
    else if (EqualInsensitive(propName, "batchNormEvalMode"))
    {
        prop = melPropBatchNormMode;
    }
    else
    {
        RuntimeError("Invalid property, %s, is not supported", propName.c_str());

@@ -566,12 +524,6 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
        netNdl->cn->SetLearnableNodesBelowLearningRateMultiplier(learningRateMultiplier, node);
        break;
    }
    case melPropBatchNormMode:
    {
        bool evalMode = params[2];
        netNdl->cn->SetBatchNormalizationNodesBelowEvalMode(evalMode, node);
        break;
    }
    default:
    {
        RuntimeError("Invalid property, %s, is not supported", propName.c_str());
@@ -4,10 +4,33 @@
//
#pragma once

#include <chrono>
#include <ctime> // std::time_t, std::strftime, std::localtime used by PREPENDTS below
#include "TimerUtility.h"

namespace Microsoft { namespace MSR { namespace CNTK {

// TODO: make these proper C++ functions with variadic templates and a name that reflects their difference from fprintf(stderr), which already implies printing to the log
// If the Tracing flag is set, print out a timestamp with no new line at the end
#define PREPENDTS(stream) \
    do \
    { \
        if (ProgressTracing::GetTimestampingFlag()) \
        { \
            std::time_t tt = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); \
            char mbstr[30]; \
            if (std::strftime(mbstr, sizeof(mbstr), "%m/%d/%Y %H:%M:%S", std::localtime(&tt))) \
                fprintf(stream, "%s: ", mbstr); \
        } \
    } while(0)

// Print out a log message. If the Tracing flag is set, prepend it with a timestamp
#define LOGPRINTF(stream, ...) \
    do \
    { \
        PREPENDTS(stream); \
        fprintf(stream, __VA_ARGS__); \
    } while(0)

// ---------------------------------------------------------------------------
// ProgressTracing -- static helper class for logging a progress indicator
//

@@ -29,12 +52,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
    bool m_enabled;
    bool m_tracingFlag;
    bool m_timestampFlag; // TODO: What does this do? TODO: camelCase
    size_t m_totalNumberOfSteps; // total number of epochs in entire training run
    size_t m_currentStepOffset; // current offset
    Timer m_progressTracingTimer;

    ProgressTracing()
        : m_enabled(false), m_tracingFlag(false), m_totalNumberOfSteps(0), m_currentStepOffset(0)
        : m_enabled(false), m_tracingFlag(false), m_timestampFlag(false), m_totalNumberOfSteps(0), m_currentStepOffset(0)
    {
    }

@@ -50,12 +74,24 @@ public:
        return GetStaticInstance().m_tracingFlag;
    }

    static bool GetTimestampingFlag()
    {
        return GetStaticInstance().m_timestampFlag;
        // TODO: timestampFlag or timestampingFlag? (Or timeStampFlag?)
    }

    static void SetTracingFlag()
    {
        auto& us = GetStaticInstance();
        us.m_tracingFlag = true;
    }

    static void SetTimestampingFlag()
    {
        auto& us = GetStaticInstance();
        us.m_timestampFlag = true;
    }

    // call TraceTotalNumberOfSteps() to set the total number of steps
    // Calling this with totalNumberOfSteps>0 will enable progress tracing.
    static void TraceTotalNumberOfSteps(size_t totalNumberOfSteps)
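A minimal usage sketch of the macros above: once SetTimestampingFlag() has been called, every LOGPRINTF line carries an "MM/DD/YYYY HH:MM:SS: " prefix, while plain fprintf output stays unprefixed. The include path and the thread count are assumed for illustration:

    #include <cstdio>
    #include "ProgressTracing.h" // assumed location of PREPENDTS/LOGPRINTF and ProgressTracing

    int main()
    {
        using namespace Microsoft::MSR::CNTK;
        ProgressTracing::SetTimestampingFlag();          // enable timestamp prefixes
        LOGPRINTF(stderr, "Using %d CPU threads.\n", 4); // e.g. "01/15/2016 12:34:56: Using 4 CPU threads."
        fprintf(stderr, "no timestamp on this line\n");  // raw fprintf is unaffected
        return 0;
    }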
@@ -780,6 +780,11 @@ static inline ImageLayoutKind ImageLayoutKindFrom(const wstring& s)
struct ImageDimensions
{
    size_t m_width, m_height, m_numChannels;
    // convenience accessors. TODO: use only one name. Rename the members themselves?
    size_t w() const { return m_width; }
    size_t h() const { return m_height; }
    size_t c() const { return m_numChannels; }

    // interpret TensorShape as image
    ImageDimensions(const TensorShape& shape, ImageLayoutKind imageLayoutKind)
    {

@@ -787,14 +792,14 @@ struct ImageDimensions
            InvalidArgument("Convolution operation currently only supports 1D or 2D convolution on 3D tensors.");
        if (imageLayoutKind == ImageLayoutKind::CHW)
        {
            m_width = shape[0];
            m_height = shape[1];
            m_width = shape[0];
            m_height = shape[1];
            m_numChannels = shape[2];
        }
        else if (imageLayoutKind == ImageLayoutKind::HWC)
        {
            m_width = shape[1];
            m_height = shape[2];
            m_width = shape[1];
            m_height = shape[2];
            m_numChannels = shape[0];
        }
        else
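The two branches differ only in which axis of the TensorShape carries the channels. A short sketch of the resulting accessor values for a 32x32 RGB image (the shapes are invented for illustration; the index mapping is taken from the constructor above):

    // CHW: shape = [width, height, channels]
    ImageDimensions a(TensorShape(std::vector<size_t>{32, 32, 3}), ImageLayoutKind::CHW);
    // a.w() == 32, a.h() == 32, a.c() == 3

    // HWC (legacy): shape = [channels, width, height]
    ImageDimensions b(TensorShape(std::vector<size_t>{3, 32, 32}), ImageLayoutKind::HWC);
    // b.w() == 32, b.h() == 32, b.c() == 3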
@@ -609,11 +609,6 @@ void renameOrDie(const std::string& from, const std::string& to)
    // WORKAROUND: "rename" should do this but this is a workaround
    // to the HDFS FUSE implementation's bug of failing to do so
    // workaround for FUSE rename when running on Philly
    if (ProgressTracing::GetTracingFlag())
    {
        fprintf(stderr, "rename %s to %s\n", from.c_str(), to.c_str());
    }

    unlinkOrDie(to);
    if (rename(from.c_str(), to.c_str()) != 0)
    {
@@ -514,25 +514,32 @@ template <class ElemType>
}

template <class ElemType>
/*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstant(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant)
/*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstants(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode,
    double normalizationTimeConstant, double& prevNormalizationTimeConstant,
    double blendTimeConstant, double& prevBlendTimeConstant)
{
    if (normalizationTimeConstant != prevNormalizationTimeConstant && normalizationTimeConstant != numeric_limits<double>::infinity())
    if (normalizationTimeConstant != prevNormalizationTimeConstant || blendTimeConstant != prevBlendTimeConstant)
    {
        fprintf(stderr, "Setting batch normalization time constant to %.8g.\n", normalizationTimeConstant);
        if (normalizationTimeConstant != prevNormalizationTimeConstant)
            fprintf(stderr, "Setting batch normalization time constant to %.8g.\n", normalizationTimeConstant);
        if (blendTimeConstant != prevBlendTimeConstant)
            fprintf(stderr, "Setting batch normalization blend time constant to %.8g.\n", blendTimeConstant);
        // TODO: Change this to use an interface that is independent of <ElemType>.
        list<ComputationNodeBasePtr> batchNormalizationNodes = net->GetNodesWithType(OperationNameOf(BatchNormalizationNode), criterionNode);
        if (batchNormalizationNodes.size() == 0 && normalizationTimeConstant != numeric_limits<double>::infinity())
        auto batchNormalizationNodes = net->GetNodesWithType(OperationNameOf(BatchNormalizationNode), criterionNode);
        if (batchNormalizationNodes.size() == 0)
            fprintf(stderr, "WARNING: there is no batch normalization node.\n");
        else
        {
            for (auto& nodeIter : batchNormalizationNodes)
            {
                auto node = dynamic_pointer_cast<BatchNormalizationNode<ElemType>>(nodeIter);
                node->SetNormalizationTimeConstant(normalizationTimeConstant);
                node->SetNormalizationTimeConstants(normalizationTimeConstant, prevNormalizationTimeConstant,
                    blendTimeConstant, prevBlendTimeConstant);
            }
        }

        prevNormalizationTimeConstant = normalizationTimeConstant;
        prevBlendTimeConstant = blendTimeConstant;
    }
}

@@ -1434,7 +1441,7 @@ template void ComputationNetwork::Read<float>(const wstring& fileName);
template void ComputationNetwork::ReadPersistableParameters<float>(File& fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<float>(const map<wstring, float>& SVDConfig, size_t alignedsize);
template /*static*/ void ComputationNetwork::SetDropoutRate<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, unsigned long& dropOutSeed);
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstant<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant);
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstants<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant, double blendTimeConstant, double& prevBlendTimeConstant);
template void ComputationNetwork::SetSeqParam<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign,
    const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
template void ComputationNetwork::SaveToDbnFile<float>(ComputationNetworkPtr net, const std::wstring& fileName) const;

@@ -1444,7 +1451,7 @@ template void ComputationNetwork::Read<double>(const wstring& fileName);
template void ComputationNetwork::ReadPersistableParameters<double>(File& fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<double>(const map<wstring, float>& SVDConfig, size_t alignedsize);
template /*static*/ void ComputationNetwork::SetDropoutRate<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, unsigned long& dropOutSeed);
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstant<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant);
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstants<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant, double blendTimeConstant, double& prevBlendTimeConstant);
template void ComputationNetwork::SetSeqParam<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign,
    const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
template void ComputationNetwork::SaveToDbnFile<double>(ComputationNetworkPtr net, const std::wstring& fileName) const;
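A sketch of how a training loop might drive the renamed setter; everything except the function's signature (the loop, the per-epoch values, 'net' and 'criterionNode') is assumed for illustration. Passing the previous values by reference lets the function skip the node update when nothing changed:

    #include <limits>

    double prevNormTC  = std::numeric_limits<double>::infinity(); // force an update on the first call
    double prevBlendTC = std::numeric_limits<double>::infinity();
    for (size_t epoch = 0; epoch < maxEpochs; epoch++)
    {
        double normTC  = 4096.0; // hypothetical per-epoch normalizationTimeConstant
        double blendTC = 0.0;    // hypothetical per-epoch blendTimeConstant
        ComputationNetwork::SetBatchNormalizationTimeConstants<float>(
            net, criterionNode, normTC, prevNormTC, blendTC, prevBlendTC);
        // ... run one training epoch ...
    }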
@@ -103,8 +103,6 @@ public:
        Read<ElemType>(fileName);
        // perform all further post-processing, caching, etc.
        CompileNetwork();
        // To ensure that all the BN nodes changed to eval mode unless it's in Training mode.
        SetBatchNormalizationNodesBelowEvalMode(true);
    }

    // static helper to instantiate a network from a file

@@ -363,7 +361,6 @@ public:
    void AddFeatureNode(ComputationNodeBasePtr featureNode);
    //ComputationNodeBasePtr RemoveFeatureNode(ComputationNodeBasePtr featureNode);
    void SetLearnableNodesBelowLearningRateMultiplier(const float learningRateMultiplier, const ComputationNodeBasePtr& rootNode = nullptr);
    void SetBatchNormalizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode = nullptr);

    // -----------------------------------------------------------------------
    // node access

@@ -429,7 +426,9 @@ public:
    static void SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, unsigned long& dropOutSeed);

    template <class ElemType>
    static void SetBatchNormalizationTimeConstant(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant);
    static void SetBatchNormalizationTimeConstants(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode,
        double normalizationTimeConstant, double& prevNormalizationTimeConstant,
        double blendTimeConstant, double& prevBlendTimeConstant);

    template <class ElemType>
    static void SetSeqParam(ComputationNetworkPtr net,
@@ -106,13 +106,13 @@ void ComputationNetwork::FormRecurrentLoops(const ComputationNodeBasePtr& rootNo
        assert(node->m_numNonDelayedParentsInLoop == 0); // (in PurgeStateForFormingRecurrentLoops())
    }
    for (let& node : nestedNodes)
    {
        for (auto& input : node->GetInputs())
        {
        for (auto& input : node->GetInputs())
        {
            if (input->m_loopId == node->m_loopId && GetRecurrenceSteppingDirection(node) == 0 /*not a Delay node*/)
                input->m_numNonDelayedParentsInLoop++; // count #parents of 'input' that are not delay nodes
        }
    }
    }

    // re-traverse the graph for all nestedNodes, starting with the first
    // Then update m_nestedNodes with the re-traversed order.

@@ -301,19 +301,19 @@ void ComputationNetwork::DetermineSCCsR(ComputationNodeBasePtr cur,
    for (let& iter : m_allSEQNodes)
    {
        for (let& iter2 : iter->m_nestedNodes)
        {
        {
            if (iter2 == cur)
            {
                bFound = true;
            {
                bFound = true;
                // validate that the loop is really the same, by a set comparison
                unordered_set<ComputationNodeBasePtr> newLoop(nestedNodes.begin(), nestedNodes.end());
                unordered_set<ComputationNodeBasePtr> existingLoop(iter->m_nestedNodes.begin(), iter->m_nestedNodes.end());
                if (newLoop != existingLoop)
                    LogicError("DetermineSCCsR: %ls %ls operation rediscovered in a loop, but that loop is not the same as last time.", cur->NodeName().c_str(), cur->OperationName().c_str());
                break;
            }
            break;
        }
    }
}
if (bFound)
    fprintf(stderr, "\nDetermineSCCsR: %ls %ls operation was discovered multiple times as a loop participant", cur->NodeName().c_str(), cur->OperationName().c_str());
// TODO: Once we forbid FormRecurrentLoops() from non-NULL, can we ever re-hit a loop here? If not, then turn bFound into a LogicError().
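The "set comparison" above works because std::unordered_set equality is order-insensitive: two traversals of the same loop compare equal even if they visit the nodes in different orders. A self-contained illustration:

    #include <cassert>
    #include <unordered_set>

    int main()
    {
        std::unordered_set<int> a{1, 2, 3}; // one traversal order
        std::unordered_set<int> b{3, 1, 2}; // another traversal order
        assert(a == b);                     // equal: same members, insertion order ignored
        return 0;
    }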
@@ -128,6 +128,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateNode(const std::wstring& node
    if (nodeType == OperationNameOf(AveragePoolingNode)) return New<AveragePoolingNode<ElemType>>(forward<_Types>(_Args)...);
    else if (nodeType == OperationNameOf(BatchNormalizationNode)) return New<BatchNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
    else if (nodeType == OperationNameOf(ConvolutionNode)) return New<ConvolutionNode<ElemType>>(forward<_Types>(_Args)...);
    else if (nodeType == OperationNameOf(PoolingNode)) return New<PoolingNode<ElemType>>(forward<_Types>(_Args)...);
    else if (nodeType == OperationNameOf(SparseInputValue)) return New<SparseInputValue<ElemType>>(forward<_Types>(_Args)...);
    else if (nodeType == OperationNameOf(InputValue)) return New<InputValue<ElemType>>(forward<_Types>(_Args)...);
    else if (nodeType == OperationNameOf(LearnableParameter)) return New<LearnableParameter<ElemType>>(forward<_Types>(_Args)...);

@@ -229,6 +230,27 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Creat
        maxTempMemSizeInSamples));
}

template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateConvolutionNode(const std::wstring& nodeName, const TensorShape& kernelShape, const TensorShape& mapCount,
    const TensorShape& strideShape, const std::vector<bool>& sharing,
    const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
    ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
{
    return net.AddNodeToNetWithElemType(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,
        kernelShape, mapCount, strideShape,
        sharing, autoPadding, lowerPad, upperPad,
        imageLayout, maxTempMemSizeInSamples));
}

template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreatePoolingNode(const std::wstring& nodeName, PoolKind poolKind, const TensorShape& kernelShape, const TensorShape& strideShape,
    const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
    ImageLayoutKind imageLayout)
{
    return net.AddNodeToNetWithElemType(New<PoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
        poolKind, kernelShape, strideShape, autoPadding, lowerPad, upperPad, imageLayout));
}

template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateMaxPoolingNode(const std::wstring& nodeName,
    const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind)

@@ -261,7 +283,9 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Creat
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Convolution(const ComputationNodePtr weight,
    const ComputationNodePtr inputValues,
    const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind, const bool zeroPadding, const size_t maxTempMemSizeInSamples,
    const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
    const size_t horizontalSubsample, const size_t verticalSubsample,
    ImageLayoutKind imageLayoutKind, const bool zeroPadding, const size_t maxTempMemSizeInSamples,
    const std::wstring nodeName)
{
    return net.AddNodeToNetAndAttachInputs(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,

@@ -269,6 +293,34 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Convo
        maxTempMemSizeInSamples), { weight, inputValues });
}

template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Convolution(const ComputationNodePtr weight,
    const ComputationNodePtr inputValues,
    const TensorShape& kernelShape, const TensorShape& mapCount,
    const TensorShape& strideShape, const std::vector<bool>& sharing,
    const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
    ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
    const std::wstring nodeName)
{
    return net.AddNodeToNetAndAttachInputs(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,
        kernelShape, mapCount, strideShape,
        sharing, autoPadding, lowerPad, upperPad,
        imageLayout, maxTempMemSizeInSamples),
        weight, inputValues);
}

template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Pooling(const ComputationNodePtr inputValues,
    PoolKind poolKind, const TensorShape& kernelShape, const TensorShape& strideShape,
    const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
    ImageLayoutKind imageLayout,
    const std::wstring nodeName)
{
    return net.AddNodeToNetAndAttachInputs(New<PoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
        poolKind, kernelShape, strideShape, autoPadding, lowerPad, upperPad, imageLayout),
        inputValues);
}

template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::MaxPooling(const ComputationNodePtr inputValues,
    const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
@ -636,10 +688,11 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Looku
|
|||
template <class ElemType>
|
||||
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::BatchNormalization(const ComputationNodePtr input,
|
||||
const ComputationNodePtr scale, const ComputationNodePtr bias, const ComputationNodePtr runMean, const ComputationNodePtr runInvStdDev,
|
||||
bool eval, bool spatial, double normalizationTimeConstant, double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind,
|
||||
bool spatial, double normalizationTimeConstant, double blendTimeConstant, double epsilon, bool useCntkEngine,
|
||||
ImageLayoutKind imageLayoutKind,
|
||||
const std::wstring nodeName)
|
||||
{
|
||||
return net.AddNodeToNetAndAttachInputs(New<BatchNormalizationNode<ElemType>>(net.GetDeviceId(), nodeName, eval, spatial, normalizationTimeConstant, epsilon, useCntkEngine, imageLayoutKind), { input, scale, bias, runMean, runInvStdDev });
|
||||
return net.AddNodeToNetAndAttachInputs(New<BatchNormalizationNode<ElemType>>(net.GetDeviceId(), nodeName, spatial, normalizationTimeConstant, blendTimeConstant, epsilon, useCntkEngine, imageLayoutKind), { input, scale, bias, runMean, runInvStdDev });
|
||||
}
|
||||
|
||||
template class ComputationNetworkBuilder<float>;
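BatchNormalization now takes normalizationTimeConstant and blendTimeConstant instead of a boolean eval flag. As a rough standalone illustration (an assumption about the general exponential-moving-average mechanics, not CNTK's exact semantics), a time constant measured in samples can be mapped to a per-minibatch smoothing factor like so:

// Illustrative only: converting a time constant (in samples) into an EMA factor.
#include <cmath>
#include <cstdio>

// Fraction of the new minibatch statistics blended into the running estimate.
double SmoothingFactor(double timeConstantInSamples, double minibatchSamples)
{
    if (timeConstantInSamples <= 0)
        return 1.0; // no smoothing: use the current minibatch statistics only
    return 1.0 - std::exp(-minibatchSamples / timeConstantInSamples);
}

int main()
{
    double alpha = SmoothingFactor(/*timeConstantInSamples=*/5000, /*minibatchSamples=*/32);
    double runningMean = 0.0, batchMean = 0.5;
    runningMean = (1 - alpha) * runningMean + alpha * batchMean; // EMA update
    std::printf("alpha=%f runningMean=%f\n", alpha, runningMean);
}

A larger time constant yields a smaller alpha, so the running statistics change more slowly; a blend constant would weight running statistics into the per-minibatch normalization in the same spirit.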

@ -7,7 +7,8 @@
#include "Basics.h"
#include "ComputationNode.h"
#include "ComputationNetwork.h"
#include "TrainingNodes.h" // for NCEEvalMode
#include "ConvolutionalNodes.h" // for PoolKind
#include "ScriptableObjects.h"
#include "TensorShape.h"
#include <string>

@ -51,7 +52,15 @@ public:
ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const size_t rows);
ComputationNodePtr CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout);
ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout);
ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind, const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0);
ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples);
ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
const size_t horizontalSubsample, const size_t verticalSubsample,
ImageLayoutKind imageLayoutKind, const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0);
ComputationNodePtr CreatePoolingNode(const std::wstring& nodeName, PoolKind poolKind, const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout);
ComputationNodePtr CreateMaxPoolingNode(const std::wstring& nodeName, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind);
ComputationNodePtr CreateAveragePoolingNode(const std::wstring& nodeName, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind);
// this is the catch-all for all cases not covered as special cases above

@ -60,7 +69,7 @@ public:
// The following functions create nodes and link them to the network and their inputs.
// TODO: Do we need both this set and the one above that does not add inputs? Can they share more code?
ComputationNodePtr BatchNormalization(const ComputationNodePtr input, const ComputationNodePtr scale, const ComputationNodePtr bias,
const ComputationNodePtr runMean, const ComputationNodePtr runInvStdDev, bool eval = false, bool spatial = false, double normalizationTimeConstant = 0, double epsilon = 1e-5, bool useCntkEngine = true,
const ComputationNodePtr runMean, const ComputationNodePtr runInvStdDev, bool spatial = false, double normalizationTimeConstant = 0, double blendTimeConstant = 0, double epsilon = 1e-5, bool useCntkEngine = true,
ImageLayoutKind imageLayoutKind = ImageLayoutKind::CHW, const std::wstring nodeName = L"");
ComputationNodePtr Convolution(const ComputationNodePtr weight,
const ComputationNodePtr inputValues,

@ -68,6 +77,17 @@ public:
const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0,
const std::wstring nodeName = L"");
ComputationNodePtr Convolution(const ComputationNodePtr weight,
const ComputationNodePtr inputValues,
const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
const std::wstring nodeName = L"");
ComputationNodePtr Pooling(const ComputationNodePtr inputValues,
PoolKind poolKind, const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout,
const std::wstring nodeName = L"");
ComputationNodePtr MaxPooling(const ComputationNodePtr inputValues,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const std::wstring nodeName = L"");
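The header above declares both the legacy 2D entry points and the new TensorShape-based ones. For orientation, a hypothetical call to the new ND Convolution overload could look like the following sketch; builder, weight, and features are stand-in names, and the shape values are made up, mirroring the 2D-to-ND mapping used elsewhere in this commit:

// Hypothetical usage of the ND Convolution overload declared above:
auto conv = builder.Convolution(weight, features,
                                TensorShape(5, 5, 3),           // kernelShape: 5x5 over 3 input channels
                                TensorShape(1, 1, 32),          // mapCount: 32 output feature maps
                                TensorShape(1, 1, 3),           // strideShape: step 1 in x/y, full channel depth
                                std::vector<bool>{true},        // sharing
                                std::vector<bool>{true},        // autoPadding
                                TensorShape(0), TensorShape(0), // lowerPad, upperPad
                                ImageLayoutKind::CHW,
                                /*maxTempMemSizeInSamples=*/0);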

@ -332,42 +332,4 @@ void ComputationNetwork::SetLearnableNodesBelowLearningRateMultiplier(const floa
}
}

void ComputationNetwork::SetBatchNormalizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode /* = nullptr */)
{
vector<ComputationNodeBasePtr> nodes;
if (rootNode == nullptr)
{
for (auto pair : m_nameToNodeMap)
{
nodes.push_back(pair.second);
}
}
else
{
auto allnodes = rootNode->EnumerateNodes();
for (auto node : allnodes)
nodes.push_back(node);
}

for (auto& node : nodes)
{
if (node->OperationName() == OperationNameOf(BatchNormalizationNode))
{
auto pNode = dynamic_pointer_cast<BatchNormalizationNode<float>>(node);
if (!pNode)
{
auto pNode2 = dynamic_pointer_cast<BatchNormalizationNode<double>>(node);
if (!pNode2)
{
RuntimeError("Invalid node type: node name=%ls. We assume either BatchNormalizationNode<float> or BatchNormalizationNode<double>\n", node->NodeName().c_str());
}
}
else
{
pNode->SetEvalMode(evalMode);
}
}
}
}

}}}
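The helper deleted above dispatches on the element type by trying dynamic_pointer_cast against both template instantiations; note that it only calls SetEvalMode in the float branch, never in the double one. A standalone sketch of the pattern with stand-in types (not CNTK's classes), handling both cases:

#include <cstdio>
#include <memory>
#include <vector>

struct NodeBase { virtual ~NodeBase() = default; }; // polymorphic base
template <class ElemType>
struct BatchNormNode : NodeBase { void SetEvalMode(bool) { std::puts("eval mode set"); } };

// Try each known instantiation in turn; the first cast that succeeds wins.
void SetEvalModeIfBatchNorm(const std::shared_ptr<NodeBase>& node, bool eval)
{
    if (auto f = std::dynamic_pointer_cast<BatchNormNode<float>>(node))
        f->SetEvalMode(eval);
    else if (auto d = std::dynamic_pointer_cast<BatchNormNode<double>>(node))
        d->SetEvalMode(eval);
    // otherwise: not a batch-norm node, nothing to do
}

int main()
{
    std::vector<std::shared_ptr<NodeBase>> nodes = {
        std::make_shared<BatchNormNode<float>>(),
        std::make_shared<BatchNormNode<double>>()};
    for (auto& n : nodes)
        SetEvalModeIfBatchNorm(n, true);
}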

@ -114,9 +114,11 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
{
// instead of the node itself, include the sentinel SEQTraversalFlowControlNode in our list
m_nestedNodes.push_back(recInfo);

// and verify that we only encountered the loop once (all nodes should have been consecutive)
if (!loopsSeen.insert(recInfo).second)
LogicError("PARTraversalFlowControlNode: members of loop %ls are not consecutive in node list.", recInfo->NodeName().c_str());

// consume all nodes that are part of the same loop (they are all consecutive)
while (nodeIter != allNodes.end() && (*nodeIter)->IsPartOfLoop() && FindInRecurrentLoops(recurrentInfo, *nodeIter) == recInfo)
nodeIter++;

@ -303,8 +305,10 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
// look in all recurrent loops of the network
// TODO: Check for IsPartOfLoop(). Also why not store the loop id in the node for direct lookup?
for (auto& iter : recurrentInfo)
{
if (std::find(iter->m_nestedNodes.begin(), iter->m_nestedNodes.end(), node) != iter->m_nestedNodes.end()) // TODO: should this loop need to be a method of SEQTraversalFlowControlNode?
return iter;
}
return nullptr; // not part of a recurrent loop
}

@ -357,8 +361,10 @@ void ComputationNetwork::PrintComputationTree(const ComputationNodeBasePtr& root
if (nodes.size() == 0)
fprintf(stderr, "\n(empty)\n");
else
{
for (const auto& node : nodes)
node->PrintSelf(printMatrices);
}
}

// -----------------------------------------------------------------------

@ -399,7 +405,7 @@ void ComputationNetwork::CompileNetwork()
// all steps below have to be repeated for all root nodes (=nodes without parents and PreComputeNodes)
DetermineSetOfAllRoots();

fprintf(stderr, "\n%d roots:\n", (int) m_allRoots.size());
fprintf(stderr, "\n%d roots:\n", (int)m_allRoots.size());
for (const auto& root : m_allRoots)
fprintf(stderr, "\t%ls = %ls()\n", root->NodeName().c_str(), root->OperationName().c_str());

@ -469,7 +475,7 @@ void ComputationNetwork::DetermineSetOfAllRoots()
auto input = node->Input(i);
if (!input) // this may be the result of an incorrect MEL operation
{
InvalidArgument("DetermineSetOfAllRoots: Input %d of %ls %ls operation if not connected, network is malformed.",
InvalidArgument("DetermineSetOfAllRoots: Input %d of %ls %ls operation is not connected, network is malformed.",
(int) i, node->NodeName().c_str(), node->OperationName().c_str());
}
referencedNodes.insert(input);

@ -592,7 +598,7 @@ void ComputationNetwork::ValidateNetwork()
}
if (!nonDefaultNodes.empty())
{
fprintf(stderr, "%d out of %d nodes do not share the minibatch layout with the input data.\n", (int) nonDefaultNodes.size(), (int) nodes.size());
fprintf(stderr, "%d out of %d nodes do not share the minibatch layout with the input data.\n", (int)nonDefaultNodes.size(), (int)nodes.size());
// for (auto node : nonDefaultNodes)
// fprintf(stderr, " %ls\n", node->NodeName().c_str());
// fprintf(stderr, "\n\n");

@ -652,6 +658,7 @@ size_t ComputationNetwork::ValidateNodes(list<ComputationNodeBasePtr> nodes, boo
hasVisitedChild |= child->m_visited; // if not a single visited child then no point in validating
allChildrenVisited &= child->m_visited;
}

// if there is not at least one visited child
bool valid = false;
if (hasVisitedChild || isLeaf) // got at least one child: it makes sense to call Validate()

@ -850,7 +857,7 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
else
{
nodeIter->RequestMatricesBeforeForwardProp(m_matrixPool);
// we only release matrices for the children since the root node's informatioin will be used and should not be shared
// we only release matrices for the children since the root node's information will be used and should not be shared
// with others
ReleaseMatricesAfterEvalForChildren(nodeIter, parentCount);
}
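The LogicError added above enforces that all members of a recurrent loop appear consecutively in the evaluation order, detected by whether a loop id is ever re-entered after being left. A standalone sketch of that invariant over plain group ids (illustrative, not CNTK code):

#include <cassert>
#include <set>
#include <vector>

// Returns true iff every group's members occupy one contiguous run.
bool GroupsAreConsecutive(const std::vector<int>& groupIds)
{
    std::set<int> seen;
    for (size_t i = 0; i < groupIds.size(); ++i)
    {
        if (i > 0 && groupIds[i] == groupIds[i - 1])
            continue; // still inside the same run
        if (!seen.insert(groupIds[i]).second)
            return false; // group re-entered after we left it
    }
    return true;
}

int main()
{
    assert(GroupsAreConsecutive({0, 1, 1, 2}));
    assert(!GroupsAreConsecutive({0, 1, 0})); // loop 0 is split in two
}

The set-insert trick is the same one the diff uses: insert().second is false exactly when the loop was already recorded once before.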

@ -13,7 +13,6 @@
#include "RecurrentNodes.h"
#include "NonlinearityNodes.h"
#include "LinearAlgebraNodes.h"
#include "ConvolutionalNodes.h"
#include "ReshapingNodes.h"

#include "ComputationNetwork.h"

@ -402,6 +402,19 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
}
let& sequences = pMBLayout->GetAllSequences();
let width = pMBLayout->GetNumTimeSteps();

TensorShape tensorShape = GetSampleLayout();
stringstream str;
let dims = tensorShape.GetDims();
for (auto dim : dims)
str << dim << ' ';
let shape = str.str(); // BUGBUG: change to string(tensorShape) to make sure we always use the same format

bool sequencePrologueHasShape = sequencePrologue.find("%x") != sequencePrologue.npos;
bool sampleSeparatorHasShape = sampleSeparator.find("%x") != sampleSeparator.npos;
bool sequencePrologueHasSeqId = sequencePrologue.find("%d") != sequencePrologue.npos;
bool sampleSeparatorHasSeqId = sampleSeparator.find("%d") != sampleSeparator.npos;

for (size_t s = 0; s < sequences.size(); s++)
{
const auto& seqInfo = sequences[s];

@ -429,9 +442,30 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
let seqCols = t1 - t0;
let seqStride = pMBLayout->GetNumParallelSequences() * matStride;

auto seqProl = sequencePrologue;
auto sampleSep = sampleSeparator;

if (sequencePrologueHasShape || sampleSeparatorHasShape)
{
auto sh = msra::strfun::_strprintf<char>("%s%ld", shape.c_str(), (unsigned long long)seqInfo.GetNumTimeSteps());
if (sequencePrologueHasShape)
seqProl = msra::strfun::ReplaceAll<std::string>(seqProl, "%x", sh);
if (sampleSeparatorHasShape)
sampleSep = msra::strfun::ReplaceAll<std::string>(sampleSep, "%x", sh);
}

if (sequencePrologueHasSeqId || sampleSeparatorHasSeqId)
{
auto sh = msra::strfun::_strprintf<char>("%ld", (unsigned long long)seqInfo.seqId);
if (sequencePrologueHasSeqId)
seqProl = msra::strfun::ReplaceAll<std::string>(seqProl, "%d", sh);
if (sampleSeparatorHasSeqId)
sampleSep = msra::strfun::ReplaceAll<std::string>(sampleSep, "%d", sh);
}

if (s > 0)
fprintfOrDie(f, "%s", sequenceSeparator.c_str());
fprintfOrDie(f, "%s", sequencePrologue.c_str());
fprintfOrDie(f, "%s", seqProl.c_str());

// output it according to our format specification
auto formatChar = valueFormatString.back();

@ -530,14 +564,14 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
else
{
for (size_t j = 0; j < jend; j++) // loop over output rows --BUGBUG: row index is 'i'!! Rename these!!
{
if (j > 0)
fprintfOrDie(f, "%s", sampleSeparator.c_str());
{
if (j > 0)
fprintfOrDie(f, "%s", sampleSep.c_str());
if (j == jstop && jstop < jend - 1) // if jstop == jend-1 we may as well just print the value instead of '...'
{
{
fprintfOrDie(f, "...+%d", (int)(jend - jstop)); // 'nuff said
break;
}
break;
}
// inject sample tensor index if we are printing row-wise and it's a tensor
if (!transpose && sampleLayout.size() > 1 && !isCategoryLabel) // each row is a different sample dimension
{

@ -547,15 +581,15 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
}
// print a row of values
for (size_t i = 0; i < iend; i++) // loop over elements
{
if (i > 0)
fprintfOrDie(f, "%s", elementSeparator.c_str());
{
if (i > 0)
fprintfOrDie(f, "%s", elementSeparator.c_str());
if (i == istop && istop < iend - 1)
{
{
fprintfOrDie(f, "...+%d", (int)(iend - istop));
break;
}
double dval = seqData[i * istride + j * jstride];
break;
}
double dval = seqData[i * istride + j * jstride];
print(dval);
}
}

@ -566,7 +600,7 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
}

/*static*/ string WriteFormattingOptions::Processed(const wstring& nodeName, string fragment, size_t minibatchId)
{
{
fragment = msra::strfun::ReplaceAll<string>(fragment, "\\n", "\n");
fragment = msra::strfun::ReplaceAll<string>(fragment, "\\r", "\r");
fragment = msra::strfun::ReplaceAll<string>(fragment, "\\t", "\t");

@ -577,7 +611,7 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
fragment = msra::strfun::ReplaceAll<string>(fragment, "%n", msra::strfun::_strprintf<char>("%ld", minibatchId).c_str());
// %d: sequenceId
return fragment;
}
}

template <class ConfigRecordType>
WriteFormattingOptions::WriteFormattingOptions(const ConfigRecordType& config) :

@ -588,14 +622,14 @@ WriteFormattingOptions::WriteFormattingOptions(const ConfigRecordType& config) :
{
const ConfigRecordType& formatConfig(config(L"format", ConfigRecordType::Record()));
if (formatConfig.ExistsCurrent(L"type")) // do not inherit 'type' from outer block
{
{
wstring type = formatConfig(L"type");
if (type == L"real") ; // default
else if (type == L"category") isCategoryLabel = true;
else if (type == L"sparse") isSparse = true;
else InvalidArgument("write: type must be 'real', 'category', or 'sparse'");
labelMappingFile = (wstring)formatConfig(L"labelMappingFile", L"");
}
}
transpose = formatConfig(L"transpose", transpose);
prologue = formatConfig(L"prologue", prologue);
epilogue = formatConfig(L"epilogue", epilogue);

@ -606,8 +640,8 @@ WriteFormattingOptions::WriteFormattingOptions(const ConfigRecordType& config) :
sampleSeparator = msra::strfun::utf8(formatConfig(L"sampleSeparator", (wstring)msra::strfun::utf16(sampleSeparator)));
precisionFormat = msra::strfun::utf8(formatConfig(L"precisionFormat", (wstring)msra::strfun::utf16(precisionFormat)));
// TODO: change those strings into wstrings to avoid this conversion mess
}
}
}
}

void WriteFormattingOptions::Save(File& fstream) const
{

@ -623,7 +657,7 @@ void WriteFormattingOptions::Save(File& fstream) const
fstream << elementSeparator;
fstream << sampleSeparator;
fstream << precisionFormat;
}
}

void WriteFormattingOptions::Load(File& fstream, size_t modelVersion)
{

@ -710,5 +744,6 @@ public:
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<BoxedTensorShape> registerTensorShape(L"TensorShape");
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<BoxedVector<int>> registerIntVector (L"IntVector");
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<BoxedVector<size_t>> registerSizeVector (L"SizeVector");
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<BoxedVector<bool>> registerBoolVector (L"BoolVector");

}}}
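The formatting code above rewrites escape sequences and %-placeholders (%x, %d, %n) via repeated ReplaceAll calls. A standalone sketch of such a replace-all helper (generic C++, not the msra::strfun implementation):

#include <iostream>
#include <string>

// Replace every occurrence of 'from' in 's' with 'to'.
std::string ReplaceAll(std::string s, const std::string& from, const std::string& to)
{
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size())
        s.replace(pos, from.size(), to);
    return s;
}

int main()
{
    std::string prologue = "seq %d of shape %x:\n";
    prologue = ReplaceAll(prologue, "%d", "7");
    prologue = ReplaceAll(prologue, "%x", "28 28 1 x5");
    std::cout << prologue; // "seq 7 of shape 28 28 1 x5:\n"
}

Advancing the search position by the replacement's length avoids rescanning freshly inserted text, which matters when the replacement contains the pattern.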

@ -31,17 +31,15 @@
// version number to control how to read and write
#define CNTK_MODEL_VERSION_1 1
#define CNTK_MODEL_VERSION_2 2
#define CNTK_MODEL_VERSION_3 3 // (Row)Slice: axis; LearnableParameter: tensor shape; Times: outputRank; TransposeDimensions: axes
#define CNTK_MODEL_VERSION_4 4 // PastValue: tensor shape
#define CNTK_MODEL_VERSION_5 5 // ElemType tag in model file
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_5
#define CNTK_MODEL_VERSION_3 3
#define CNTK_MODEL_VERSION_4 4 // PastValue
#define CNTK_MODEL_VERSION_5 5 // ND convolution and pooling
#define CNTK_MODEL_VERSION_6 6 // Batch norm blending
#define CNTK_MODEL_VERSION_7 7 // ElemType tag in model file
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_7

extern bool g_shareNodeValueMatrices;

#ifndef UNREFERENCED_PARAMETER // TODO: unify with UNUSED()
#define UNREFERENCED_PARAMETER(P) (P)
#endif

// helper mode for debugging
// If TRACK_GAP_NANS is defined then initialize layout gaps to NaN and do NaN checks. Also do detailed logging of node computations.
// #define TRACK_GAP_NANS

@ -902,7 +900,7 @@ public:
if (m_value)
{
node->CreateValueMatrixIfNull();
node->m_value->SetValue(*m_value);
}
else
node->m_value = nullptr;

@ -1112,6 +1110,9 @@ public:
const Matrix<ElemType>& Gradient() const { return *m_gradient; }
Matrix<ElemType>& Gradient() { return *m_gradient; }

MatrixBasePtr GradientPtr() const { return m_gradient; }
// TODO: This is only used for testing whether a gradient has been allocated. Maybe reduce to bool HasGradient()?

private:

template<class E>

@ -1268,8 +1269,8 @@ protected:
DetermineDataSize(rows, cols);
try
{
m.VerifySize(rows, cols);
}
m.VerifySize(rows, cols);
}
catch (const std::exception& e)
{
Rethrow(e);

@ -1499,8 +1500,8 @@ public:
"%13.10f"/*valueFormatString*/);
if (m_traceNodeValueSparse)
WriteMinibatchWithFormatting(stderr, FrameRange(), SIZE_MAX, SIZE_MAX, false/*transpose*/, /*isCategoryLabel=*/false, /*isSparse=*/true, std::vector<std::string>(),
""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/,
"%13.10f"/*valueFormatString*/);
""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/,
"%13.10f"/*valueFormatString*/);
}
}
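The version constants above gate how Load() interprets older model files, which is exactly what ConvolutionNodeBase::Load and ConvolutionNode::Load do further down with CNTK_MODEL_VERSION_5. A minimal standalone sketch of that pattern (stand-in Model type and text streams, not CNTK's File class):

#include <cstdint>
#include <iostream>
#include <sstream>

const uint32_t kCurrentVersion = 7;

struct Model
{
    uint32_t width = 0, height = 0;

    void Save(std::ostream& out) const
    {
        out << kCurrentVersion << ' ' << width << ' ' << height;
    }
    void Load(std::istream& in)
    {
        uint32_t version;
        in >> version;
        in >> width;
        if (version >= 5) // field added in format version 5
            in >> height;
        else
            height = 1; // sensible default for older files
    }
};

int main()
{
    Model m{28, 28};
    std::stringstream buf;
    m.Save(buf);
    Model loaded;
    loaded.Load(buf);
    std::cout << loaded.width << "x" << loaded.height << "\n"; // 28x28
}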

@ -7,31 +7,19 @@
#include "Basics.h"
#include "Matrix.h"
#include "ComputationNode.h"
#include "InputAndParamNodes.h"
#include "ConvolutionEngine.h"

#include <unordered_set>
#include <map>
#include <string>
#include <vector>
#include <stdexcept>
#include <list>
#include <memory>
#include <algorithm>
#include <assert.h>
#include <atomic>
#include <sstream>
#include <iostream>

namespace Microsoft { namespace MSR { namespace CNTK {

// -----------------------------------------------------------------------
// ConvolutionNode (convolutionWeights, inputFeature)
// ConvolutionNodeBase
// -----------------------------------------------------------------------

// Convolutions (incl. pooling) support two different storage formats:
// ConvolutionNodeBase is a base class for ND-convolution(ConvolutionNode) and ND-pooling(PoolingNode).
//
// 2D convolutions (incl. pooling) support two different storage formats:
//
// * legacy ("HWC") mode (CPU and GPU without cudnn): Channels are tuples of scalars
// * legacy ("HWC") mode: Channels are tuples of scalars
//
// This follows "high performance convolutional neural networks for document processing" by Kumar Chellapilla, Sidde Puri, and Patrice Simard.
// Each sample is stored as a column-major matrix (height, width) of float[numChannels] (r00, g00, b00, r10, g10, b10, r01, g01, b01, r11, g11, b11).

@ -40,7 +28,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// - output : [C' x W' x H' x T] or ARRAY[1..T] OF ARRAY[1..H'] OF ARRAY[1..W'] OF ARRAY[1..C']
// - filter : [C' x W" x H" x C ] or ARRAY[1..C] OF ARRAY[1..H"] OF ARRAY[1..W"] OF ARRAY[1..C']
//
// * cudnn ("CHW") mode (GPU only): Channels are planes
// * cudnn ("CHW") mode (works both GPU and CPU): Channels are planes
//
// - input : [W x H x C x T] or ARRAY[1..T] OF ARRAY[1..C] OF ARRAY[1..H] OF ARRAY[1..W]
// - output : [W' x H' x C' x T] or ARRAY[1..T] OF ARRAY[1..C'] OF ARRAY[1..H'] OF ARRAY[1..W']

@ -54,71 +42,269 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// - 3 for color images, 1 for B&W images
// - for hidden layer: dimension of activation vector for each pixel
// - C' = output channels = dimension of activation vector for each pixel (also called N by NVidia, inconsistently)
//
// For ND-convolution/pooling only second format ('cudnn') is supported.
//
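To make the two layouts concrete, here is a standalone sketch of the linear offset of pixel (w, h), channel c, within one sample in each format. It assumes the bracketed shape notation above with the first (leftmost) dimension varying fastest in memory; this is my illustration, not CNTK code:

#include <cstddef>
#include <cstdio>

// HWC ("legacy"): sample shape [C x W x H]; the channel index varies fastest.
size_t OffsetHWC(size_t w, size_t h, size_t c, size_t W, size_t H, size_t C)
{
    (void)H; // H only bounds h; it does not enter the stride computation here
    return c + C * (w + W * h);
}

// CHW ("cudnn"): sample shape [W x H x C]; the width index varies fastest.
size_t OffsetCHW(size_t w, size_t h, size_t c, size_t W, size_t H, size_t C)
{
    (void)C; // C only bounds c here
    return w + W * (h + H * c);
}

int main()
{
    const size_t W = 4, H = 3, C = 2;
    // pixel (w=1, h=2), channel 1:
    std::printf("HWC offset: %zu\n", OffsetHWC(1, 2, 1, W, H, C)); // 1 + 2*(1 + 4*2) = 19
    std::printf("CHW offset: %zu\n", OffsetCHW(1, 2, 1, W, H, C)); // 1 + 4*(2 + 3*1) = 21
}

In HWC the channels of one pixel are contiguous; in CHW each channel is a contiguous plane, which is what cuDNN and the ND code path below expect.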

template <class ElemType>
class ConvolutionNode : public ComputationNode<ElemType>, public NumInputs<2>
class ConvolutionNodeBase : public ComputationNode<ElemType>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"Convolution"; }
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembers;

public:
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name),
m_kernelWidth(SIZE_MAX),
m_kernelHeight(SIZE_MAX),
// initialize to dummy values so we catch missing initialization
m_horizontalSubsample(SIZE_MAX),
m_verticalSubsample(SIZE_MAX),
m_zeroPadding(false),
m_maxTempMemSizeInSamples(SIZE_MAX),
m_imageLayoutKind(ImageLayoutKind::HWC)
ConvolutionNodeBase(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name), m_poolKind(PoolKind::None), m_maxTempMemSizeInSamples(0)
{
SetDims(ImageDimensions::AsTensorShape(1, 1, 0, m_imageLayoutKind), 0);
}
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0)
: Base(deviceId, name),
m_outputChannels(outputChannels),
m_kernelWidth(kernelWidth),
m_kernelHeight(kernelHeight),
m_horizontalSubsample(horizontalSubsample),
m_verticalSubsample(verticalSubsample),
m_zeroPadding(zeroPadding),
m_maxTempMemSizeInSamples(maxTempMemSizeInSamples),
m_imageLayoutKind(imageLayoutKind)
ConvolutionNodeBase(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
PoolKind poolKind, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
: Base(deviceId, name), m_kernelShape(kernelShape), m_mapCount(mapCount), m_stride(strideShape), m_sharing(sharing),
m_autoPad(autoPadding), m_lowerPad(lowerPad), m_upperPad(upperPad), m_poolKind(poolKind),
m_imageLayout(imageLayout), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples)
{
SetDims(ImageDimensions::AsTensorShape(1, 1, m_outputChannels, m_imageLayoutKind), 0); // TODO: necessary?
m_factory = ConvolutionEngineFactory<ElemType>::Create(deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
ConvolutionNode(const ScriptableObjects::IConfigRecordPtr configp)
: ConvolutionNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"kernelWidth"), configp->Get(L"kernelHeight"), configp->Get(L"outputChannels"),
configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"), ImageLayoutKindFrom(configp->Get(L"imageLayout")),
configp->Get(L"zeroPadding"), configp->Get(L"maxTempMemSizeInSamples"))
{
// weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
}

public:
void Save(File& fstream) const override
{
Base::Save(fstream);
fstream << m_kernelWidth << m_kernelHeight << m_horizontalSubsample << m_verticalSubsample;
uint32_t imageLayoutKind = (uint32_t) m_imageLayoutKind;
uint32_t outputChannels = (uint32_t) m_outputChannels;
fstream << outputChannels << imageLayoutKind;
fstream << m_zeroPadding << m_maxTempMemSizeInSamples;

m_kernelShape.Save(fstream);
m_mapCount.Save(fstream);
m_stride.Save(fstream);
fstream << m_sharing;
fstream << m_autoPad;
m_lowerPad.Save(fstream);
m_upperPad.Save(fstream);
fstream << (int32_t)m_poolKind;
fstream << (int32_t)m_imageLayout;
fstream << m_maxTempMemSizeInSamples;
}

void Load(File& fstream, size_t modelVersion) override
{
Base::Load(fstream, modelVersion);
fstream >> m_kernelWidth >> m_kernelHeight >> m_horizontalSubsample >> m_verticalSubsample;
uint32_t imageLayoutKind, outputChannels;
fstream >> outputChannels >> imageLayoutKind;
m_imageLayoutKind = (ImageLayoutKind) imageLayoutKind;
m_outputChannels = outputChannels;
SetDims(ImageDimensions::AsTensorShape(1, 1, m_outputChannels, m_imageLayoutKind), HasMBLayout()); // TODO: needed?
fstream >> m_zeroPadding >> m_maxTempMemSizeInSamples;
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);

// Let ConvolutionNode handle older models.
if (modelVersion >= CNTK_MODEL_VERSION_5)
{
m_kernelShape.Load(fstream);
m_mapCount.Load(fstream);
m_stride.Load(fstream);
fstream >> m_sharing;
fstream >> m_autoPad;
m_lowerPad.Load(fstream);
m_upperPad.Load(fstream);
int32_t k;
fstream >> k;
m_poolKind = (PoolKind)k;
int32_t layout;
fstream >> layout;
m_imageLayout = (ImageLayoutKind)layout;
fstream >> m_maxTempMemSizeInSamples;
}
}

void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
{
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ConvolutionNodeBase<ElemType>>(nodeP);
node->m_kernelShape = m_kernelShape;
node->m_mapCount = m_mapCount;
node->m_stride = m_stride;
node->m_sharing = m_sharing;
node->m_autoPad = m_autoPad;
node->m_lowerPad = m_lowerPad;
node->m_upperPad = m_upperPad;
node->m_poolKind = m_poolKind;
node->m_imageLayout = m_imageLayout;
node->m_maxTempMemSizeInSamples = m_maxTempMemSizeInSamples;
}
}

void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
auto sliceOutputGrad = GradientFor(fr);

if (m_poolKind == PoolKind::None)
{
if (inputIndex == 0) // derivative with respect to the weight matrix
{
auto& grad = Input(0)->GradientAsMatrix();
auto sliceInput1Value = Input(1)->ValueFor(fr);
m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, fr.IsAllFrames(), *m_tempMatrix);
}
else if (inputIndex == 1) // derivative with respect to the input feature
{
auto& input0 = Input(0)->ValueAsMatrix();
auto sliceInput1Grad = Input(1)->GradientFor(fr);
m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, *m_tempMatrix);
}
}
else
{
Matrix<ElemType> sliceInput0Grad = Input(0)->GradientFor(fr);

Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);

m_convEng->BackwardPooling(sliceOutputValue, sliceOutputGrad, sliceInput0Value, sliceInput0Grad);
}
}

bool OutputUsedInComputingInputNodesGradients() const override
{
// The ConvolutionNode requires output values only for max pooling.
return m_poolKind == PoolKind::Max;
}

void ForwardProp(const FrameRange& fr) override
{
Matrix<ElemType> sliceOutputValue = ValueFor(fr);

if (m_poolKind == PoolKind::None)
{
const Matrix<ElemType>& input0 = Input(0)->ValueAsMatrix();
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
m_convEng->Forward(sliceInput1Value, input0, sliceOutputValue, *m_tempMatrix);
}
else
{
const Matrix<ElemType>& input0 = Input(0)->ValueFor(fr);
m_convEng->ForwardPooling(input0, sliceOutputValue);
}
}

void DumpNodeInfo(const bool printValues, const bool printMetadata, File& fstream) const override
{
Base::DumpNodeInfo(printValues, printMetadata, fstream);

if (m_convEng != nullptr)
fstream << "Geometry: " << string(*m_convEng->Geometry()) << "\n";
fstream << "PoolKind: " << (int)m_poolKind << "\n";
}

protected:
TensorShape m_kernelShape;
TensorShape m_mapCount;
TensorShape m_stride;
std::vector<bool> m_sharing;
std::vector<bool> m_autoPad;
TensorShape m_lowerPad;
TensorShape m_upperPad;
PoolKind m_poolKind;
ImageLayoutKind m_imageLayout;

size_t m_maxTempMemSizeInSamples;
shared_ptr<Matrix<ElemType>> m_tempMatrix;

std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
};

#define UsingConvolutionNodeBaseMembers \
UsingComputationNodeMembersBoilerplate; \
protected: \
using Base::m_kernelShape; \
using Base::m_mapCount; \
using Base::m_stride; \
using Base::m_sharing; \
using Base::m_autoPad; \
using Base::m_lowerPad; \
using Base::m_upperPad; \
using Base::m_poolKind; \
using Base::m_imageLayout; \
using Base::m_maxTempMemSizeInSamples; \
using Base::m_tempMatrix; \
using Base::m_convEng; \
public:

// -----------------------------------------------------------------------
// ConvolutionNode (convolutionWeights, inputFeature)
// -----------------------------------------------------------------------

template <class ElemType>
class ConvolutionNode : public ConvolutionNodeBase<ElemType>, public NumInputs<2>
{
typedef ConvolutionNodeBase<ElemType> Base;
UsingConvolutionNodeBaseMembers;
static const std::wstring TypeName()
{
return L"Convolution";
}

public:
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
: Base(deviceId, name, kernelShape, mapCount, strideShape, sharing, autoPadding, lowerPad, upperPad, PoolKind::None, imageLayout, maxTempMemSizeInSamples),
m_convolution2D(false)
{
}
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayout,
bool zeroPadding, size_t maxTempMemSizeInSamples)
: ConvolutionNode(deviceId, name, TensorShape(kernelWidth, kernelHeight, 1), TensorShape(1, 1, outputChannels),
TensorShape(horizontalSubsample, verticalSubsample, 1), vector<bool>{true},
vector<bool>{zeroPadding}, TensorShape(0), TensorShape(0),
imageLayout, maxTempMemSizeInSamples)
{
m_convolution2D = true;
}
ConvolutionNode(const ScriptableObjects::IConfigRecordPtr configp)
: ConvolutionNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"kernelShape"), configp->Get(L"mapCount"), configp->Get(L"strideShape"),
configp->Get(L"dimSharing"), configp->Get(L"dimPadding"), configp->Get(L"dimPadLower"), configp->Get(L"dimPadUpper"),
ImageLayoutKindFrom(configp->Get(L"imageLayout")), configp->Get(L"maxTempMemSizeInSamples"))
{
AttachInputs(configp, GetExpectedNumInputs());
}

public:
void Save(File& fstream) const override
{
Base::Save(fstream);
fstream << m_convolution2D;
}

void Load(File& fstream, size_t modelVersion) override
{
Base::Load(fstream, modelVersion);

// Back compat: load pre-ND convolution models.
if (modelVersion < CNTK_MODEL_VERSION_5)
{
size_t kW, kH, sW, sH;
fstream >> kW;
fstream >> kH;
fstream >> sW;
fstream >> sH;
uint32_t imageLayout, mapCount;
fstream >> mapCount;
fstream >> imageLayout;
m_imageLayout = (ImageLayoutKind)imageLayout;
bool pad;
fstream >> pad;
fstream >> m_maxTempMemSizeInSamples;
m_poolKind = PoolKind::None;
m_convolution2D = true;

m_kernelShape = TensorShape(kW, kH, 1);
m_mapCount = TensorShape(mapCount);
m_stride = TensorShape(sW, sH, 1);
m_sharing = vector<bool>{true};
m_autoPad = vector<bool>{pad};
m_lowerPad = TensorShape(0);
m_upperPad = TensorShape(0);
}
else
{
fstream >> m_convolution2D;
}
}
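Validate() below hands these kernel, stride, and padding descriptors to ConvolveGeometry::ComputeOutputShape. The per-dimension arithmetic for the common shared-kernel case is standard convolution geometry; a standalone sketch under the assumption of symmetric "same" auto-padding (my illustration, not ConvolveGeometry itself):

#include <cstddef>
#include <cstdio>

// Output extent along one dimension of a strided convolution.
// With "same" auto-padding the input is padded so output = ceil(in / stride);
// without padding, only full kernel placements count.
size_t OutDim(size_t in, size_t kernel, size_t stride, bool autoPad)
{
    if (autoPad)
        return (in + stride - 1) / stride; // ceil(in / stride)
    return (in - kernel) / stride + 1;     // requires in >= kernel
}

int main()
{
    // 28x28 input, 5x5 kernel, stride 1: 24 per side unpadded, 28 with auto-padding.
    std::printf("%zu %zu\n", OutDim(28, 5, 1, false), OutDim(28, 5, 1, true));
}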

void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override

@ -127,144 +313,92 @@ public:
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ConvolutionNode<ElemType>>(nodeP);
node->m_kernelWidth = m_kernelWidth;
node->m_kernelHeight = m_kernelHeight;

node->m_horizontalSubsample = m_horizontalSubsample;
node->m_verticalSubsample = m_verticalSubsample;

node->m_zeroPadding = m_zeroPadding;

node->m_maxTempMemSizeInSamples = m_maxTempMemSizeInSamples;

node->m_imageLayoutKind = m_imageLayoutKind;

node->m_tempMatrix->SetValue(*m_tempMatrix);
node->m_convolution2D = m_convolution2D;
}
}

void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
auto sliceOutputGrad = GradientFor(fr);
auto sliceInput1Value = Input(1)->ValueFor(fr);

size_t batchSize = sliceInput1Value.GetNumCols();
m_inT->setN(batchSize);
m_outT->setN(batchSize);
assert(m_convEng != nullptr);
if (inputIndex == 0) // derivative with respect to the weight matrix
{
auto& grad = Input(0)->GradientAsMatrix();
m_convEng->BackwardFilter(*m_outT, sliceOutputGrad, *m_inT, sliceInput1Value, *m_convDesc, *m_filterT, grad, fr.IsAllFrames(), *m_tempMatrix);
}
else if (inputIndex == 1) // derivative with respect to the input feature
{
auto& input0 = Input(0)->ValueAsMatrix();
auto sliceInput1Grad = Input(1)->GradientFor(fr);
m_convEng->BackwardData(*m_outT, sliceOutputGrad, *m_filterT, input0, *m_convDesc, *m_inT, sliceInput1Grad, *m_tempMatrix);
}
}

virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ConvolutionNode does not require its output value for computing
// the gradients of its input nodes
return false;
}

void ForwardProp(const FrameRange& fr) override
{
const Matrix<ElemType>& input0 = Input(0)->ValueAsMatrix();
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);

// update the tensor dimension w.r.t. number of samples
size_t batchSize = sliceInput1Value.GetNumCols();
m_inT->setN(batchSize);
m_outT->setN(batchSize);
assert(m_convEng != nullptr);
#if NANCHECK
input0.HasNan("Convolution-input0");
sliceInput1Value.HasNan("Convolution-input1");
#endif
m_convEng->Forward(*m_inT, sliceInput1Value, *m_filterT, input0, *m_convDesc, *m_outT, sliceOutputValue, *m_tempMatrix);
#if NANCHECK
sliceOutputValue.HasNan("Convolution");
#endif
}

void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
void Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase(isFinalValidationPass);

// get input and output tensor shape and interpret as image dimensions
auto inDims = ImageDimensions(GetInputSampleLayout(1), m_imageLayoutKind);
size_t inputIdx = GetExpectedNumInputs() - 1;
TensorShape inputShape;
if (m_convolution2D)
{
// Need to update some tensors with correct input dims.
auto inDims = ImageDimensions(GetInputSampleLayout(inputIdx), m_imageLayout);
// inputShape is used in ConvolveGeometry which supports only CHW layout.
inputShape = inDims.AsTensorShape(ImageLayoutKind::CHW);
size_t kW = m_kernelShape[0];
size_t kH = m_kernelShape[1];
size_t sW = m_stride[0];
size_t sH = m_stride[1];
m_kernelShape = TensorShape(kW, kH, inDims.m_numChannels);
m_stride = TensorShape(sW, sH, inDims.m_numChannels);

if (isFinalValidationPass && (inDims.m_width < m_kernelWidth || inDims.m_height < m_kernelHeight))
InvalidArgument("%ls %ls operation requires that input width be >= kernelWidth and input height >= kernelHeight.", NodeName().c_str(), OperationName().c_str());

// determine output tensor shape
const int kernelWidthCenter = m_zeroPadding ? m_kernelWidth % 2 : m_kernelWidth;
const int kernelHeightCenter = m_zeroPadding ? m_kernelHeight % 2 : m_kernelHeight;
auto outDims = ImageDimensions(
(inDims.m_width - kernelWidthCenter) / m_horizontalSubsample + 1,
(inDims.m_height - kernelHeightCenter) / m_verticalSubsample + 1,
m_outputChannels);

size_t weightCols = m_kernelWidth * m_kernelHeight * inDims.m_numChannels;
size_t mapCount = m_mapCount.GetNumElements();
size_t weightCols = kW * kH * inDims.m_numChannels;

// check/infer input [0] (weights)
// BUGBUG: For now, we treat the weights as a 2D matrix. They should be a tensor proper.
Input(0)->ValidateInferInputDimsFrom(TensorShape(m_outputChannels, weightCols));
Input(0)->ValidateInferInputDimsFrom(TensorShape(mapCount, weightCols));

if (isFinalValidationPass && (Input(0)->GetAsMatrixNumCols() != weightCols || Input(0)->GetAsMatrixNumRows() != m_outputChannels))
LogicError("convolutionWeight matrix %ls should have dimension [%d, %d] which is [outputChannels, kernelWidth * kernelHeight * inputChannels]", Input(0)->NodeName().c_str(), (int) m_outputChannels, (int) weightCols);
if (isFinalValidationPass && (Input(0)->GetAsMatrixNumCols() != weightCols || Input(0)->GetAsMatrixNumRows() != mapCount))
{
LogicError("Convolution weight matrix %ls should have dimension [%d, %d] which is [outputChannels, kernelWidth * kernelHeight * inputChannels]",
Input(0)->NodeName().c_str(), (int)mapCount, (int)weightCols);
}

// that's our dimension
SetDims(outDims.AsTensorShape(m_imageLayoutKind), true);
auto outDims = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
// ConvolveGeometry always uses CHW.
SetDims(ImageDimensions(outDims, ImageLayoutKind::CHW).AsTensorShape(m_imageLayout), HasMBLayout());
}
else
{
if (m_imageLayout != ImageLayoutKind::CHW)
{
InvalidArgument(
"%ls %ls supports only cuDNN (CHW) data layout. "
"Please specify imageLayout=\"cudnn\" in %ls node in your script "
"and make sure input data layout is CHW", NodeName().c_str(), OperationName().c_str(), NodeName().c_str());
}
inputShape = GetInputSampleLayout(inputIdx);
auto outDims = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
SetDims(outDims, HasMBLayout());
}

if (isFinalValidationPass)
{
// set up the various engines and descriptor objects
// REVIEW alexeyk: is there a better place to create engines?
assert(m_factory);
// if (m_factory == nullptr)
// m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
// TODO: This seems to expose too much internal knowlegde of the engine to the ConvolutionNode().
// Why not just pass everything to the engine creator, and get one object that holds everything.
if (m_convEng == nullptr)
m_convEng = m_factory->CreateConvEngine(m_deviceId, m_imageLayoutKind, m_maxTempMemSizeInSamples, BatchNormImpl::Cntk);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(inDims.m_width, inDims.m_height, inDims.m_numChannels, 1);
if (m_filterT == nullptr)
m_filterT = m_factory->CreateFilter(m_kernelWidth, m_kernelHeight, inDims.m_numChannels, m_outputChannels);
if (m_outT == nullptr)
m_outT = m_factory->CreateTensor(outDims.m_width, outDims.m_height, outDims.m_numChannels, 1);
if (m_convDesc == nullptr)
m_convDesc = m_factory->CreateConvDescriptor(*m_inT, *m_filterT, m_horizontalSubsample, m_verticalSubsample, m_zeroPadding);
// REVIEW alexeyk: create per-channel bias (shared across all pixels). Consider adding other types of biases.
if (m_biasT == nullptr)
m_biasT = m_factory->CreateTensor(1, 1, outDims.m_numChannels, 1);
{
auto geometry = std::make_shared<ConvolveGeometry>(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
m_convEng = ConvolutionEngine<ElemType>::Create(geometry, m_deviceId, m_imageLayout,
m_maxTempMemSizeInSamples, m_poolKind);
}

if (Input(0)->GetAsMatrixNumCols() != m_kernelShape.GetNumElements() ||
Input(0)->GetAsMatrixNumRows() != m_convEng->Geometry()->KernelCount())
{
LogicError("Convolution weight matrix %ls should have dimension [%d, %d] which is [kernelCount, kernelWidth * kernelHeight * inputChannels]",
Input(0)->NodeName().c_str(), (int)m_convEng->Geometry()->KernelCount(), (int)m_kernelShape.GetNumElements());
}
}
}

void DumpNodeInfo(const bool printValues, const bool printMetadata, File& fstream) const override
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
Base::DumpNodeInfo(printValues, printMetadata, fstream);
Base::RequestMatricesBeforeForwardProp(matrixPool);
RequestMatrixFromPool(m_tempMatrix, matrixPool);
}

auto inDims = ImageDimensions(GetInputSampleLayout(1), m_imageLayoutKind);
auto outDims = ImageDimensions(m_sampleLayout, m_imageLayoutKind);

char str[4096];
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inDims.m_width, inDims.m_height, inDims.m_numChannels);
fstream << string(str);
sprintf(str, "Kernel[Width:%lu, Height:%lu] SubSample[Horizontal:%lu, Vertical:%lu]\n", m_kernelWidth, m_kernelHeight, m_horizontalSubsample, m_verticalSubsample);
fstream << string(str);
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", outDims.m_width, outDims.m_height, outDims.m_numChannels);
fstream << string(str);
sprintf(str, "zeroPadding=%ls maxTempMemSizeInSamples=%lu\n", m_zeroPadding ? L"true" : L"false", m_maxTempMemSizeInSamples);
fstream << string(str);
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_tempMatrix, matrixPool);
}

void SetmMaxTempMemSizeInSamples(const size_t maxTempMemSizeInSamples)

@ -272,47 +406,78 @@ public:
m_maxTempMemSizeInSamples = maxTempMemSizeInSamples;
}

// request matrices needed to do node function value evaluation
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeForwardProp(matrixPool);
RequestMatrixFromPool(m_tempMatrix, matrixPool);
}

// release gradient and temp matrices that no longer needed after all the children's gradients are computed.
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_tempMatrix, matrixPool);
}

private:
size_t m_outputChannels;
size_t m_kernelWidth, m_kernelHeight;
size_t m_horizontalSubsample, m_verticalSubsample;
bool m_zeroPadding;
bool m_1DConvolutionOnGPUSparse;

shared_ptr<Matrix<ElemType>> m_tempMatrix;
size_t m_maxTempMemSizeInSamples; // can change during runtime

ImageLayoutKind m_imageLayoutKind; // how to interpret the tensor (which dimensions are X/Y and C)

std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;

std::unique_ptr<ConvolutionTensor4D> m_inT;
std::unique_ptr<ConvolutionFilter> m_filterT;
std::unique_ptr<ConvolutionTensor4D> m_outT;
std::unique_ptr<ConvolutionDescriptor> m_convDesc;
std::unique_ptr<ConvolutionTensor4D> m_biasT;
protected:
bool m_convolution2D;
};

template class ConvolutionNode<float>;
template class ConvolutionNode<double>;

// -----------------------------------------------------------------------
// PoolingNode (inputFeature)
// -----------------------------------------------------------------------

template <class ElemType>
class PoolingNode : public ConvolutionNodeBase<ElemType>, public NumInputs<1>
{
typedef ConvolutionNodeBase<ElemType> Base;
UsingConvolutionNodeBaseMembers;
static const std::wstring TypeName()
{
return L"Pooling";
}

public:
PoolingNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
PoolingNode(DEVICEID_TYPE deviceId, const wstring& name, PoolKind pool, const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout)
: Base(deviceId, name, kernelShape, TensorShape(1), strideShape, vector<bool>{true}, autoPadding, lowerPad, upperPad, pool, imageLayout, 0)
{
}
PoolingNode(const ScriptableObjects::IConfigRecordPtr configp)
: PoolingNode(configp->Get(L"deviceId"), L"<placeholder>", PoolKindFrom(configp->Get(L"pool")), configp->Get(L"kernelShape"),
configp->Get(L"strideShape"),
configp->Get(L"dimPadding"), configp->Get(L"dimPadLower"), configp->Get(L"dimPadUpper"),
ImageLayoutKindFrom(configp->Get(L"imageLayout")))
{
AttachInputs(configp, GetExpectedNumInputs());
}

public:
void Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();

if (m_imageLayout != ImageLayoutKind::CHW)
{
InvalidArgument(
"%ls %ls supports only cuDNN (CHW) data layout. "
"Please specify imageLayout=\"cudnn\" in %ls node in your script "
"and make sure input data layout is CHW", NodeName().c_str(), OperationName().c_str(), NodeName().c_str());
}

auto inputShape = GetInputSampleLayout(0);
auto outDims = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
SetDims(outDims, HasMBLayout());

if (isFinalValidationPass)
{
if (m_convEng == nullptr)
{
auto geometry = std::make_shared<ConvolveGeometry>(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
m_convEng = ConvolutionEngine<ElemType>::Create(geometry, m_deviceId, m_imageLayout,
m_maxTempMemSizeInSamples, m_poolKind);
}
}
}
};
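PoolingNode above reuses the convolution machinery with a PoolKind and a dummy map count of TensorShape(1). The operation itself is simple; for reference, a standalone naive 2D max-pool forward pass (illustrative only, not the ConvolutionEngine):

#include <algorithm>
#include <cstdio>
#include <vector>

// Naive max pooling over a single-channel H x W image (row-major), no padding.
std::vector<float> MaxPool2D(const std::vector<float>& in, size_t H, size_t W,
                             size_t kH, size_t kW, size_t sH, size_t sW)
{
    size_t outH = (H - kH) / sH + 1, outW = (W - kW) / sW + 1;
    std::vector<float> out(outH * outW);
    for (size_t oh = 0; oh < outH; oh++)
        for (size_t ow = 0; ow < outW; ow++)
        {
            float m = in[(oh * sH) * W + (ow * sW)];
            for (size_t i = 0; i < kH; i++)
                for (size_t j = 0; j < kW; j++)
                    m = std::max(m, in[(oh * sH + i) * W + (ow * sW + j)]);
            out[oh * outW + ow] = m;
        }
    return out;
}

int main()
{
    std::vector<float> img = {1, 2, 3, 4,
                              5, 6, 7, 8,
                              9, 10, 11, 12,
                              13, 14, 15, 16};
    auto out = MaxPool2D(img, 4, 4, 2, 2, 2, 2); // -> {6, 8, 14, 16}
    for (float v : out)
        std::printf("%g ", v);
}

Because max pooling needs to know which input won the max when backpropagating, the node reports OutputUsedInComputingInputNodesGradients() == true only for PoolKind::Max, as seen in ConvolutionNodeBase above.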

// -----------------------------------------------------------------------
// PoolingNodeBase (input)
// Legacy PoolingNodeBase (input)
// -----------------------------------------------------------------------

template <class ElemType>

@ -339,7 +504,6 @@ public:
m_verticalSubsample(verticalSubsample),
m_imageLayoutKind(imageLayoutKind)
{
m_factory = ConvolutionEngineFactory<ElemType>::Create(deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
PoolingNodeBase(const ScriptableObjects::IConfigRecordPtr configp)
: PoolingNodeBase(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"windowWidth"), configp->Get(L"windowHeight"), configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"), ImageLayoutKindFrom(configp->Get(L"imageLayout")))

@ -362,8 +526,7 @@ public:
uint32_t imageLayoutKind, windowWidth;
fstream >> windowWidth >> imageLayoutKind >> m_windowHeight >> m_horizontalSubsample >> m_verticalSubsample;
m_windowWidth = windowWidth;
m_imageLayoutKind = (ImageLayoutKind) imageLayoutKind;
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
m_imageLayoutKind = (ImageLayoutKind)imageLayoutKind;
}

void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override

@ -394,12 +557,7 @@ public:
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);

size_t batchSize = sliceInput0Value.GetNumCols();
m_inT->setN(batchSize);
m_outT->setN(batchSize);
assert(m_poolEng != nullptr);
assert(m_poolDesc != nullptr);
m_poolEng->Backward(*m_outT, sliceOutputValue, sliceOutputGrad, *m_poolDesc, *m_inT, sliceInput0Value, sliceInput0Grad);
m_convEng->BackwardPooling(sliceOutputValue, sliceOutputGrad, sliceInput0Value, sliceInput0Grad);
}

void ForwardProp(const FrameRange& fr) override

@ -407,12 +565,7 @@ public:
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);

size_t batchSize = sliceInput0Value.GetNumCols();
m_inT->setN(batchSize);
m_outT->setN(batchSize);
assert(m_poolEng != nullptr);
assert(m_poolDesc != nullptr);
m_poolEng->Forward(*m_inT, sliceInput0Value, *m_poolDesc, *m_outT, sliceOutputValue);
m_convEng->ForwardPooling(sliceInput0Value, sliceOutputValue);
}

void Validate(bool isFinalValidationPass) override

@ -439,16 +592,14 @@ public:
if (isFinalValidationPass)
{
// set up various engines and descriptor objects
// REVIEW alexeyk: is there a better place to create engines?
assert(m_factory);
// if (m_factory == nullptr)
// m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
if (m_poolEng == nullptr)
m_poolEng = m_factory->CreatePoolEngine(m_deviceId, m_imageLayoutKind);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(inDims.m_width, inDims.m_height, inDims.m_numChannels, 1);
if (m_outT == nullptr)
m_outT = m_factory->CreateTensor(outDims.m_width, outDims.m_height, outDims.m_numChannels, 1);
m_geometry = std::make_shared<ConvolveGeometry>(inDims.AsTensorShape(m_imageLayoutKind),
|
||||
ImageDimensions(m_windowWidth, m_windowHeight, 1).AsTensorShape(m_imageLayoutKind),
|
||||
TensorShape(1),
|
||||
ImageDimensions(m_horizontalSubsample, m_verticalSubsample, 1).AsTensorShape(m_imageLayoutKind),
|
||||
ConvolveGeometry::BoolVec{true},
|
||||
ConvolveGeometry::BoolVec{false},
|
||||
TensorShape(0),
|
||||
TensorShape(0));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -479,12 +630,8 @@ protected:
|
|||
|
||||
ImageLayoutKind m_imageLayoutKind; // how to interpret the tensor (which dimensions are X/Y and C)
|
||||
|
||||
std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
|
||||
std::unique_ptr<PoolingEngine<ElemType>> m_poolEng;
|
||||
|
||||
std::unique_ptr<ConvolutionTensor4D> m_inT;
|
||||
std::unique_ptr<ConvolutionTensor4D> m_outT;
|
||||
std::unique_ptr<PoolingDescriptor> m_poolDesc;
|
||||
ConvolveGeometryPtr m_geometry;
|
||||
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
|
||||
};
|
||||
|
||||
// add this at the start of each derived class, to get access to the members of ComputationNode
|
||||
|
@ -493,19 +640,20 @@ protected:
|
|||
UsingComputationNodeMembersBoilerplate; \
|
||||
\
|
||||
protected: \
|
||||
using Base::m_factory; \
|
||||
using Base::m_poolDesc; \
|
||||
using Base::m_geometry; \
|
||||
using Base::m_convEng; \
|
||||
using Base::m_windowWidth; \
|
||||
using Base::m_windowHeight; \
|
||||
using Base::m_horizontalSubsample; \
|
||||
using Base::m_verticalSubsample; \
|
||||
using Base::m_inputSizePerSample; \
|
||||
using Base::m_outputSizePerSample; \
|
||||
using Base::m_imageLayoutKind; \
|
||||
\
|
||||
public:
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// MaxPoolingNode
|
||||
// Legacy MaxPoolingNode
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
template <class ElemType>
|
||||
|
@ -535,16 +683,13 @@ public:
|
|||
void Validate(bool isFinalValidationPass) override
|
||||
{
|
||||
Base::Validate(isFinalValidationPass);
|
||||
if (isFinalValidationPass && m_poolDesc == nullptr)
|
||||
m_poolDesc = m_factory->CreatePoolDescriptor(PoolingDescriptor::PoolKind::Max, m_windowWidth, m_windowHeight, m_horizontalSubsample, m_verticalSubsample, 0, 0);
|
||||
if (isFinalValidationPass && m_convEng == nullptr)
|
||||
m_convEng = ConvolutionEngine<ElemType>::Create(m_geometry, m_deviceId, m_imageLayoutKind, 0, PoolKind::Max);
|
||||
}
|
||||
};
|
||||
|
||||
template class MaxPoolingNode<float>;
|
||||
template class MaxPoolingNode<double>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// AveragePoolingNode
|
||||
// Legacy AveragePoolingNode
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
template <class ElemType>
|
||||
|
@ -574,12 +719,9 @@ public:
|
|||
void Validate(bool isFinalValidationPass) override
|
||||
{
|
||||
Base::Validate(isFinalValidationPass);
|
||||
if (isFinalValidationPass && m_poolDesc == nullptr)
|
||||
m_poolDesc = m_factory->CreatePoolDescriptor(PoolingDescriptor::PoolKind::Average, m_windowWidth, m_windowHeight, m_horizontalSubsample, m_verticalSubsample, 0, 0);
|
||||
if (isFinalValidationPass && m_convEng == nullptr)
|
||||
m_convEng = ConvolutionEngine<ElemType>::Create(m_geometry, m_deviceId, m_imageLayoutKind, 0, PoolKind::Average);
|
||||
}
|
||||
};
|
||||
|
||||
template class AveragePoolingNode<float>;
|
||||
template class AveragePoolingNode<double>;
|
||||
|
||||
} } }
|
||||
|
|
|
@ -6,7 +6,6 @@
|
|||
|
||||
#include "Basics.h"
|
||||
#include "ComputationNode.h"
|
||||
#include "ConvolutionalNodes.h"
|
||||
#include "Matrix.h"
|
||||
#include "TensorView.h"
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
|
||||
#include "Basics.h"
|
||||
#include "ComputationNode.h"
|
||||
#include "ConvolutionEngine.h"
|
||||
#include "BatchNormalizationEngine.h"
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
@ -20,8 +20,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
// -----------------------------------------------------------------------
|
||||
// SquareErrorNode (left, right)
|
||||
// = SumElements ((left - right) .* (left - right))
|
||||
// Note: to save computation the gradient may be scaled by a constant.
|
||||
// TODO: ^^ Dig out what that constant is and document it here. "may be scaled"??
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
template <class ElemType>
|
||||
|
@ -47,9 +45,9 @@ public:
|
|||
FrameRange fr(Input(0)->GetMBLayout());
m_leftMinusRight->AssignDifferenceOf(Input(0)->ValueFor(fr), Input(1)->ValueFor(fr));
MaskMissingColumnsToZero(*m_leftMinusRight, Input(0)->GetMBLayout(), fr); // we are fine since it will only be called with full minibatch.
ElemType v = m_leftMinusRight->FrobeniusNorm();
ElemType v = m_leftMinusRight->FrobeniusNorm(); // v = sqrt( sum{ (I0[i] - I1[i])^2 } )
Value().VerifySize(1, 1);
Value().SetValue(v * v / 2);
Value().SetValue(v * v); // Value = sum{ (I0[i] - I1[i])^2 }
|
||||
#if NANCHECK
|
||||
Value().HasNan("SquareError");
|
||||
#endif
|
||||
|
@ -59,7 +57,7 @@ public:
|
|||
{
FrameRange fr(Input(0)->GetMBLayout());
auto gradient = Input(inputIndex)->GradientFor(fr);
Matrix<ElemType>::Multiply1x1AndWeightedAdd(inputIndex == 0 ? 1.0f : -1.0f, Gradient() /*1x1*/, *m_leftMinusRight, 1.0f, gradient);
Matrix<ElemType>::Multiply1x1AndWeightedAdd(inputIndex == 0 ? 2.0f : -2.0f, Gradient() /*1x1*/, *m_leftMinusRight, 1.0f, gradient); // O = (I0-I1)^2; dO/dI0 = 2*(I0-I1); dO/dI1 = -2*(I0-I1)
|
||||
}
|
||||
|
||||
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
|
||||
|
@ -1522,12 +1520,43 @@ template class DropoutNode<float>;
|
|||
template class DropoutNode<double>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// BatchNormalizationNode (...) --TODO: document inputs
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
// BatchNormalizationNode (input, scale, bias, runMean, runInvStdDev, spatial,
// normalizationTimeConstant = 0, blendTimeConstant = 0,
// epsilon = 0.00001,
// useCntkEngine = true, imageLayout = 'cudnn')
//
// Implements the batch normalization technique as described in:
// Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift [S. Ioffe, C. Szegedy]
// http://arxiv.org/abs/1502.03167
// In short, it normalizes layer outputs for every minibatch for each output (feature) independently, and applies an affine transformation to preserve the representation of the layer.
// That is, for layer input:
//
// m = mean(input)
// var = variance(input)
// input_norm = (input - m) / sqrt(var)
// output = gamma * input_norm + beta
//
// where gamma and beta are trainable parameters (represented as LearnableParameter).
//
// * input is the input of the batch normalization node
// * scale is a LearnableParameter that stores the scale vector (gamma term in the equation above).
// * bias is a LearnableParameter that stores the bias vector (beta term). scale and bias must have the same dimensions, which must be equal
// to the input dimensions in case of spatial = false, or to the number of output convolution feature maps in case of spatial = true.
// * runMean is the running mean which is used during the evaluation phase and might be used during training as well.
// It is represented as a LearnableParameter with the same dimensions as scale and bias.
// * runInvStdDev is the running inverse square root of variance (so InvStdDev = 1 / sqrt(var + epsilon)).
// It is represented as a LearnableParameter with the same dimensions as scale and bias.
// * spatial is a flag that specifies whether to compute mean / var for each feature in a minibatch independently or, in case of convolutional layers, per feature map.
// * normalizationTimeConstant is the time constant which is used to compute the running average of mean and variance.
// Value 0 (default) means there will be no exponential smoothing, and the running mean / variance will always have the values computed for the last seen minibatch.
// Value 1#INF (infinity) means running values are "frozen" (i.e. will not be updated).
// * blendTimeConstant is the time constant which specifies how much of the running mean / var should be "blended" into the mean / var of the current minibatch.
// Value 0 (default) means no blending will happen and only the current minibatch statistics will be used.
// Value 1#INF (infinity) means only the running mean / var will be used (this is used, for example, in the evaluation phase).
// * epsilon is a conditioner constant used in computing InvStdDev.
// * useCntkEngine is a boolean flag that specifies which batch normalization implementation to use: CNTK-based or cuDNN-based.
// * imageLayout is the image layout. Only cudnn is supported.
// -----------------------------------------------------------------------
|
||||
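// A minimal standalone sketch of the per-activation recipe above, for intuition
// only (hypothetical code, not this node's implementation; 'gamma', 'beta' and
// 'eps' play the roles of scale, bias and epsilon, with eps folded into the
// variance as in the InvStdDev definition above):
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<double> BatchNormForwardSketch(const std::vector<double>& input, double gamma, double beta, double eps)
{
    double m = 0;
    for (double x : input) // m = mean(input)
        m += x;
    m /= input.size();
    double var = 0;
    for (double x : input) // var = variance(input)
        var += (x - m) * (x - m);
    var /= input.size();
    std::vector<double> output(input.size());
    for (std::size_t i = 0; i < input.size(); i++) // input_norm = (input - m) * invStdDev; output = gamma * input_norm + beta
        output[i] = gamma * (input[i] - m) / std::sqrt(var + eps) + beta;
    return output;
}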
template <class ElemType>
|
||||
class BatchNormalizationNode : public ComputationNode<ElemType>, public NumInputs<5>
|
||||
{
|
||||
|
@ -1540,19 +1569,20 @@ class BatchNormalizationNode : public ComputationNode<ElemType>, public NumInput
|
|||
|
||||
public:
|
||||
BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name)
|
||||
: Base(deviceId, name), m_eval(false), m_spatial(false), m_normTimeConst(0), m_epsilon(0), m_useCntkEngine(true),
|
||||
: Base(deviceId, name), m_spatial(false), m_normTimeConst(0), m_blendTimeConst(0), m_epsilon(0), m_useCntkEngine(true),
|
||||
m_mbCount(0), m_imageLayoutKind(ImageLayoutKind::CHW)
|
||||
{
|
||||
}
|
||||
BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name, bool eval, bool spatial, double normalizationTimeConstant, double epsilon,
|
||||
bool useCntkEngine, ImageLayoutKind imageLayoutKind)
|
||||
: Base(deviceId, name), m_eval(eval), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_epsilon(epsilon),
|
||||
m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_mbCount(0)
|
||||
BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name, bool spatial, double normalizationTimeConstant, double blendTimeConstant,
|
||||
double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind)
|
||||
: Base(deviceId, name), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_blendTimeConst(blendTimeConstant),
|
||||
m_epsilon(epsilon), m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_mbCount(0)
|
||||
{
|
||||
}
|
||||
BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp)
|
||||
: BatchNormalizationNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"eval"), configp->Get(L"spatial"),
|
||||
configp->Get(L"normalizationTimeConstant"), configp->Get(L"epsilon"), configp->Get(L"useCntkEngine"),
|
||||
: BatchNormalizationNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"spatial"),
|
||||
configp->Get(L"normalizationTimeConstant"), configp->Get(L"blendTimeConstant"),
|
||||
configp->Get(L"epsilon"), configp->Get(L"useCntkEngine"),
|
||||
ImageLayoutKindFrom(configp->Get(L"imageLayout")))
|
||||
{
|
||||
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
|
||||
|
@ -1561,11 +1591,10 @@ public:
|
|||
void Save(File& fstream) const override
|
||||
{
|
||||
Base::Save(fstream);
|
||||
fstream << m_version.VerWrittenCur() << m_version.VerReadableCur();
|
||||
|
||||
fstream << m_eval;
|
||||
fstream << m_spatial;
|
||||
fstream << m_normTimeConst;
|
||||
fstream << m_blendTimeConst;
|
||||
fstream << (int32_t)m_imageLayoutKind;
|
||||
fstream << m_mbCount;
|
||||
fstream << m_epsilon;
|
||||
|
@ -1576,40 +1605,56 @@ public:
|
|||
{
|
||||
Base::Load(fstream, modelVersion);
|
||||
|
||||
// Read and check version.
|
||||
// REVIEW alexeyk: extract version checking so it can be re-used in other places.
|
||||
// BUGBUG: We must serialize m_inputLayout.
|
||||
int32_t verWritten;
|
||||
int32_t verReadable;
|
||||
fstream >> verWritten >> verReadable;
|
||||
|
||||
if (verReadable > verWritten)
|
||||
RuntimeError("Corrupt model file.");
|
||||
if (verWritten < m_version.VerWeCanReadBack())
|
||||
RuntimeError("Model is too old.");
|
||||
if (verReadable > m_version.VerWrittenCur())
|
||||
RuntimeError("Model is too new.");
|
||||
|
||||
fstream >> m_eval;
|
||||
fstream >> m_spatial;
|
||||
if (verWritten >= 0x00010004)
|
||||
if (modelVersion >= CNTK_MODEL_VERSION_6)
|
||||
{
|
||||
fstream >> m_spatial;
|
||||
fstream >> m_normTimeConst;
|
||||
else
|
||||
{
|
||||
double expAvgFactor;
|
||||
fstream >> expAvgFactor;
|
||||
UNUSED(expAvgFactor); // Used in previous versions, replaced by m_normTimeConst.
|
||||
}
|
||||
if (verWritten >= 0x00010002)
|
||||
{
|
||||
fstream >> m_blendTimeConst;
|
||||
fstream >> m_imageLayoutKind;
|
||||
fstream >> m_mbCount;
|
||||
}
|
||||
if (verWritten >= 0x00010003)
|
||||
{
|
||||
fstream >> m_epsilon;
|
||||
fstream >> m_useCntkEngine;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Use old versioning scheme for older models.
|
||||
|
||||
// Read and check version.
|
||||
// REVIEW alexeyk: extract version checking so it can be re-used in other places.
|
||||
int32_t verWritten;
|
||||
int32_t verReadable;
|
||||
fstream >> verWritten >> verReadable;
|
||||
|
||||
if (verReadable > verWritten)
|
||||
RuntimeError("Corrupt model file.");
|
||||
if (verWritten < m_version.VerWeCanReadBack())
|
||||
RuntimeError("Model is too old.");
|
||||
if (verReadable > m_version.VerWrittenCur())
|
||||
RuntimeError("Model is too new.");
|
||||
|
||||
bool eval;
|
||||
fstream >> eval;
|
||||
UNUSED(eval);
|
||||
fstream >> m_spatial;
|
||||
if (verWritten >= 0x00010004)
|
||||
fstream >> m_normTimeConst;
|
||||
else
|
||||
{
|
||||
double expAvgFactor;
|
||||
fstream >> expAvgFactor;
|
||||
UNUSED(expAvgFactor); // Used in previous versions, replaced by m_normTimeConst.
|
||||
}
|
||||
if (verWritten >= 0x00010002)
|
||||
{
|
||||
fstream >> m_imageLayoutKind;
|
||||
fstream >> m_mbCount;
|
||||
}
|
||||
if (verWritten >= 0x00010003)
|
||||
{
|
||||
fstream >> m_epsilon;
|
||||
fstream >> m_useCntkEngine;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
|
||||
|
@ -1620,9 +1665,9 @@ public:
|
|||
auto node = dynamic_pointer_cast<BatchNormalizationNode<ElemType>>(nodeP);
|
||||
assert(node != nullptr);
|
||||
|
||||
node->m_eval = m_eval;
|
||||
node->m_spatial = m_spatial;
|
||||
node->m_normTimeConst = m_normTimeConst;
|
||||
node->m_blendTimeConst = m_blendTimeConst;
|
||||
node->m_imageLayoutKind = m_imageLayoutKind;
|
||||
node->m_mbCount = m_mbCount;
|
||||
node->m_epsilon = m_epsilon;
|
||||
|
@ -1630,20 +1675,8 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
void SetNormalizationTimeConstant(const double normalizationTimeConstant)
|
||||
{
|
||||
m_normTimeConst = normalizationTimeConstant;
|
||||
}
|
||||
|
||||
void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
|
||||
{
|
||||
static bool m_evalWarningIssued = false; // make sure we only print the warning once
|
||||
if (m_eval && !m_evalWarningIssued)
|
||||
{
|
||||
fprintf(stderr, "WARNING: You turned BatchNormalization to evaluation mode during training. Please make sure this is intended.\n");
|
||||
m_evalWarningIssued = true;
|
||||
}
|
||||
|
||||
if (inputIndex == 0) // derivative with respect to the input.
|
||||
{
|
||||
auto sliceOutputGrad = GradientFor(fr);
|
||||
|
@ -1651,15 +1684,11 @@ public:
|
|||
const Matrix<ElemType>& scale = Input(1)->Value();
|
||||
const Matrix<ElemType>& bias = Input(2)->Value();
|
||||
|
||||
size_t batchSize = sliceInputValue.GetNumCols();
|
||||
m_inT->setN(batchSize);
|
||||
assert(m_convEng != nullptr);
|
||||
|
||||
auto sliceInputGrad = Input(0)->GradientFor(fr);
|
||||
m_dScale->Resize(scale);
|
||||
m_dBias->Resize(bias);
|
||||
// Compute all derivatives in one step. Save derivatives with respect to scale and bias in temp matrices.
|
||||
m_convEng->BackwardNormalizeBatch(*m_inT, sliceInputValue, sliceOutputGrad, sliceInputGrad, *m_scaleBiasT, scale, m_spatial,
|
||||
m_bnEng->Backward(sliceInputValue, sliceOutputGrad, sliceInputGrad, scale,
|
||||
*m_saveMean, *m_saveInvStdDev, *m_dScale, *m_dBias);
|
||||
}
|
||||
else if (inputIndex == 1) // derivative with respect to the scale
|
||||
|
@ -1701,48 +1730,45 @@ public:
|
|||
|
||||
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
|
||||
|
||||
size_t batchSize = sliceInputValue.GetNumCols();
|
||||
m_inT->setN(batchSize);
|
||||
assert(m_convEng != nullptr);
|
||||
#if NANCHECK
|
||||
sliceInputValue.HasNan("BatchNormalization-input");
|
||||
#endif
|
||||
if (m_eval)
|
||||
m_convEng->NormalizeBatchInference(*m_inT, sliceInputValue, *m_scaleBiasT, scale, bias, m_spatial, runMean, runInvStdDev, sliceOutputValue);
|
||||
double expAvgFactor;
|
||||
double blendFactor;
|
||||
if (!Environment().IsTraining())
|
||||
{
|
||||
expAvgFactor = 0;
|
||||
blendFactor = 1.0;
|
||||
|
||||
m_saveMean->Resize(0, 0);
|
||||
m_saveInvStdDev->Resize(0, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
double expAvgFactor;
|
||||
double numSamples = (double)GetMBLayout()->GetActualNumSamples();
|
||||
if (m_normTimeConst > 0)
|
||||
{
|
||||
// Convert to per-minibatch factor.
|
||||
expAvgFactor = 1.0 - exp(-(double)GetMBLayout()->GetActualNumSamples() / m_normTimeConst);
|
||||
// Convert to per-minibatch factor. Treat positive infinity as if running mean/var parameters are "frozen"
|
||||
// that is, do not require updates.
|
||||
expAvgFactor = !isfinite(m_normTimeConst) ? 0 : (1.0 - exp(-numSamples / m_normTimeConst));
|
||||
}
|
||||
else
|
||||
{
|
||||
// REVIEW alexeyk: hack, m_normTimeConst < 0 is used to compute CMA.
|
||||
expAvgFactor = (m_normTimeConst < 0) ? (1.0 / (1.0 + m_mbCount)) : 1;
|
||||
expAvgFactor = (m_normTimeConst < 0) ? (1.0 / (1.0 + m_mbCount)) : 1.0;
|
||||
}
|
||||
|
||||
if (m_saveMean == nullptr)
|
||||
fprintf(stderr, "WARNING: m_saveMean is null\n");
|
||||
if (m_saveInvStdDev == nullptr)
|
||||
fprintf(stderr, "WARNING: m_saveInvStdDev is null\n");
|
||||
if (!isfinite(m_blendTimeConst))
|
||||
blendFactor = 1.0;
|
||||
else
|
||||
blendFactor = m_blendTimeConst > 0 ? (m_blendTimeConst / (m_blendTimeConst + numSamples)) : 0;
|
||||
|
||||
m_saveMean->Resize(runMean);
|
||||
m_saveInvStdDev->Resize(runMean);
|
||||
}
|
||||
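// A standalone sketch (hypothetical helper, not part of this node) of the
// time-constant-to-per-minibatch-factor conversion implemented above. E.g. with
// m_normTimeConst = 5000 samples and a 256-sample minibatch,
// expAvgFactor = 1 - exp(-256/5000) ~= 0.0499; with m_blendTimeConst = 1000,
// blendFactor = 1000 / (1000 + 256) ~= 0.796.
#include <cmath>
#include <cstddef>

double ExpAvgFactorFromTimeConstant(double normTimeConst, double numSamples, std::size_t mbCount)
{
    if (!std::isfinite(normTimeConst)) // +inf: running estimates are "frozen", no updates
        return 0;
    if (normTimeConst > 0) // convert the time constant to a per-minibatch smoothing factor
        return 1.0 - std::exp(-numSamples / normTimeConst);
    return normTimeConst < 0 ? 1.0 / (1.0 + mbCount) : 1.0; // < 0: cumulative moving average; == 0: use only the current minibatch
}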
|
||||
m_convEng->NormalizeBatch(*m_inT, sliceInputValue, *m_scaleBiasT, scale, bias, m_spatial, expAvgFactor, runMean, runInvStdDev,
|
||||
m_bnEng->Forward(sliceInputValue, scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev,
|
||||
sliceOutputValue, m_epsilon, *m_saveMean, *m_saveInvStdDev);
|
||||
|
||||
m_mbCount++;
|
||||
}
|
||||
#if NANCHECK
|
||||
sliceOutputValue.HasNan("BatchNormalization-output");
|
||||
runMean.HasNan("BatchNormalization-runMean");
|
||||
runInvStdDev.HasNan("BatchNormalization-runInvStdDev");
|
||||
m_saveMean->HasNan("BatchNormalization-saveMean");
|
||||
m_saveInvStdDev->HasNan("BatchNormalization-saveInvStdDev");
|
||||
#endif
|
||||
}
|
||||
|
||||
void Validate(bool isFinalValidationPass) override
|
||||
{
|
||||
|
@ -1756,34 +1782,23 @@ public:
|
|||
if (m_spatial && m_imageLayoutKind != CHW)
|
||||
{
|
||||
InvalidArgument(
|
||||
"Batch normalization currently supports only cuDNN (CHW) data layout. "
|
||||
"%ls %ls currently supports only cuDNN (CHW) data layout. "
|
||||
"Please specify imageLayout=\"cudnn\" in BatchNormalization node in your NDL/BrainScript "
|
||||
"and make sure your input data layout is CHW");
|
||||
"and make sure your input data layout is CHW", NodeName().c_str(), OperationName().c_str());
|
||||
}
|
||||
double cudnnMinEps = 1e-5; // CUDNN_BN_MIN_EPSILON
|
||||
if (!m_useCntkEngine && m_epsilon < cudnnMinEps)
|
||||
fprintf(stderr, "\nWARNING: cuDNN batch normalization requires epsilon >= %e. Epsilon will be reset to that value.\n", cudnnMinEps);
|
||||
|
||||
if (m_blendTimeConst < 0)
|
||||
InvalidArgument("%ls %ls requires blend time constant to be >= 0.", NodeName().c_str(), OperationName().c_str());
|
||||
|
||||
auto shape = GetSampleLayout();
|
||||
|
||||
if (m_factory == nullptr)
|
||||
m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
|
||||
if (m_convEng == nullptr)
|
||||
m_convEng = m_factory->CreateConvEngine(m_deviceId, m_imageLayoutKind, 0, m_useCntkEngine ? BatchNormImpl::Cntk : BatchNormImpl::CuDnn);
|
||||
if (m_spatial)
|
||||
if (m_bnEng == nullptr)
|
||||
{
|
||||
auto dims = ImageDimensions(shape, m_imageLayoutKind);
|
||||
if (m_inT == nullptr)
|
||||
m_inT = m_factory->CreateTensor(dims.m_width, dims.m_height, dims.m_numChannels, 1);
|
||||
if (m_scaleBiasT == nullptr)
|
||||
m_scaleBiasT = m_factory->CreateTensor(1, 1, dims.m_numChannels, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (m_inT == nullptr)
|
||||
m_inT = m_factory->CreateTensor(shape.GetNumElements(), 1, 1, 1);
|
||||
if (m_scaleBiasT == nullptr)
|
||||
m_scaleBiasT = m_factory->CreateTensor(shape.GetNumElements(), 1, 1, 1);
|
||||
m_bnEng = BatchNormEngine<ElemType>::Create(m_deviceId, shape, m_spatial, m_imageLayoutKind,
|
||||
m_useCntkEngine ? BatchNormEngineKind::Cntk : BatchNormEngineKind::CuDnn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1791,41 +1806,39 @@ public:
|
|||
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
|
||||
{
|
||||
Base::RequestMatricesBeforeForwardProp(matrixPool);
|
||||
//if (!m_eval)
|
||||
{
|
||||
RequestMatrixFromPool(m_saveMean, matrixPool);
|
||||
RequestMatrixFromPool(m_saveInvStdDev, matrixPool);
|
||||
}
|
||||
}
|
||||
|
||||
void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
|
||||
{
|
||||
Base::RequestMatricesBeforeBackprop(matrixPool);
|
||||
//if (!m_eval)
|
||||
{
|
||||
RequestMatrixFromPool(m_dScale, matrixPool);
|
||||
RequestMatrixFromPool(m_dBias, matrixPool);
|
||||
}
|
||||
}
|
||||
|
||||
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
|
||||
{
|
||||
Base::ReleaseMatricesAfterBackprop(matrixPool);
|
||||
//if (!m_eval)
|
||||
{
|
||||
ReleaseMatrixToPool(m_saveMean, matrixPool);
|
||||
ReleaseMatrixToPool(m_saveInvStdDev, matrixPool);
|
||||
ReleaseMatrixToPool(m_dScale, matrixPool);
|
||||
ReleaseMatrixToPool(m_dBias, matrixPool);
|
||||
}
|
||||
}
|
||||
|
||||
void SetEvalMode(bool bnEvalMode)
|
||||
void SetNormalizationTimeConstants(double normalizationTimeConstant, double prevNormalizationTimeConstant,
|
||||
double blendTimeConstant, double prevBlendTimeConstant)
|
||||
{
|
||||
m_eval = bnEvalMode;
|
||||
// As this function is called from SGD solver (global), make sure we don't
|
||||
// override settings set in NDL when it's not necessary.
|
||||
if (normalizationTimeConstant != prevNormalizationTimeConstant)
|
||||
m_normTimeConst = normalizationTimeConstant;
|
||||
if (blendTimeConstant != prevBlendTimeConstant)
|
||||
m_blendTimeConst = blendTimeConstant;
|
||||
}
|
||||
|
||||
private:
|
||||
// Old versioning - do not use. Do not remove until we're sure there are no old models around.
|
||||
struct VersionInfo
|
||||
{
|
||||
//int32_t VerWrittenCur() const { return 0x00010001; } // Initial
|
||||
|
@ -1838,13 +1851,20 @@ private:
|
|||
VersionInfo m_version;
|
||||
|
||||
private:
|
||||
// Determines whether to use training or inference (evaluation) mode.
|
||||
bool m_eval;
|
||||
// Determines whether to use per-activation (used after non-convolutional layers like fully connected)
|
||||
// or spatial (used after convolutional layers).
|
||||
bool m_spatial;
|
||||
// Time constant for running mean and variance.
|
||||
double m_normTimeConst;
|
||||
// Time constant for blending running mean/var and current minibatch mean/var.
// The main idea is to represent the current minibatch statistics as a MAP estimate: a linear interpolation
// of the smoothed (running) and minibatch statistics.
// The idea is due to Frank Seide et al.
// It should also work well in the data-parallelism scenario,
// as opposed to a plain vanilla BN implementation, which would require aggregation of statistics
// from all nodes.
// REVIEW alexeyk: if this works, document it properly in Wiki.
|
||||
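// Concretely, a sketch of the interpolation this implies (with blendFactor as
// computed in ForwardProp above; 'batchMean'/'batchVar' stand for the current
// minibatch statistics and 'runMean'/'runVar' for the running ones):
//   blendFactor = m_blendTimeConst / (m_blendTimeConst + numSamples)
//   mean_used   = blendFactor * runMean + (1 - blendFactor) * batchMean
//   var_used    = blendFactor * runVar  + (1 - blendFactor) * batchVar
// m_blendTimeConst = 0 uses pure minibatch statistics; +inf uses pure running statistics.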
double m_blendTimeConst;
|
||||
// Epsilon used to compute inverse std deviation.
|
||||
double m_epsilon;
|
||||
// Whether to use CNTK or cuDNN BN implementation.
|
||||
|
@ -1863,10 +1883,7 @@ private:
|
|||
// Stores bias derivatives.
|
||||
shared_ptr<Matrix<ElemType>> m_dBias;
|
||||
|
||||
std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
|
||||
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
|
||||
std::unique_ptr<ConvolutionTensor4D> m_inT;
|
||||
std::unique_ptr<ConvolutionTensor4D> m_scaleBiasT;
|
||||
std::unique_ptr<BatchNormEngine<ElemType>> m_bnEng;
|
||||
};
|
||||
|
||||
template class BatchNormalizationNode<float>;
|
||||
|
|
|
@ -88,7 +88,7 @@
|
|||
<ClCompile Include="EvalWrapper.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="..\..\..\Common\Include\Eval.h" />
|
||||
<ClInclude Include="..\..\Common\Include\Eval.h" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
</Filter>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="..\..\..\Common\Include\Eval.h">
|
||||
<ClInclude Include="..\..\Common\Include\Eval.h">
|
||||
<Filter>Common\Include</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
|
|
|
@ -0,0 +1,131 @@
|
|||
//
|
||||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
|
||||
//
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "BatchNormalizationEngine.h"
|
||||
#include "CuDnnFactories.h"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
template <class ElemType>
|
||||
void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
|
||||
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev)
|
||||
{
|
||||
assert(in.GetNumRows() == m_inOutT.GetNumElements());
|
||||
assert(out.GetNumRows() == m_inOutT.GetNumElements());
|
||||
assert(in.GetNumCols() == out.GetNumCols());
|
||||
assert(std::isfinite(expAvgFactor) && (0 <= expAvgFactor && expAvgFactor <= 1));
|
||||
assert(std::isfinite(blendFactor) && (0 <= blendFactor && blendFactor <= 1));
|
||||
assert(std::isfinite(epsilon) && epsilon > 0);
|
||||
if (!m_spatial)
|
||||
{
|
||||
assert(m_inOutT.GetNumElements() == scale.GetNumRows());
|
||||
assert(m_inOutT.GetNumElements() == bias.GetNumRows());
|
||||
assert(m_inOutT.GetNumElements() == runMean.GetNumRows());
|
||||
assert(m_inOutT.GetNumElements() == runInvStdDev.GetNumRows());
|
||||
assert(saveMean.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveMean.GetNumRows());
|
||||
assert(saveInvStdDev.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveInvStdDev.GetNumRows());
|
||||
}
|
||||
else
|
||||
{
|
||||
assert((m_inOutT.GetNumElements() % scale.GetNumRows()) == 0);
|
||||
assert((m_inOutT.GetNumElements() % bias.GetNumRows()) == 0);
|
||||
assert((m_inOutT.GetNumElements() % runMean.GetNumRows()) == 0);
|
||||
assert((m_inOutT.GetNumElements() % runInvStdDev.GetNumRows()) == 0);
|
||||
assert(saveMean.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveMean.GetNumRows()) == 0);
|
||||
assert(saveInvStdDev.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveInvStdDev.GetNumRows()) == 0);
|
||||
}
|
||||
assert(scale.GetNumCols() == 1);
|
||||
assert(bias.GetNumCols() == 1);
|
||||
assert(runMean.GetNumCols() == 1);
|
||||
assert(runInvStdDev.GetNumCols() == 1);
|
||||
assert(saveMean.GetNumElements() == 0 || saveMean.GetNumCols() == 1);
|
||||
assert(saveInvStdDev.GetNumElements() == 0 || saveInvStdDev.GetNumCols() == 1);
|
||||
|
||||
EnsureCompatible();
|
||||
ForwardCore(in, scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev);
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
void BatchNormEngine<ElemType>::Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale,
|
||||
const Mat& saveMean, const Mat& saveInvStdDev, Mat& scaleGrad, Mat& biasGrad)
|
||||
{
|
||||
EnsureCompatible();
|
||||
BackwardCore(in, srcGrad, grad, scale, saveMean, saveInvStdDev, scaleGrad, biasGrad);
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
class CntkBatchNormEngine : public BatchNormEngine<ElemType>
|
||||
{
|
||||
public:
|
||||
using Base = BatchNormEngine<ElemType>;
|
||||
using typename Base::Mat;
|
||||
|
||||
public:
|
||||
CntkBatchNormEngine(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
|
||||
bool spatial, ImageLayoutKind imageLayout)
|
||||
: Base(deviceId, inOutT, spatial, imageLayout)
|
||||
{
|
||||
}
|
||||
|
||||
protected:
|
||||
using Base::m_deviceId;
|
||||
using Base::m_imageLayout;
|
||||
using Base::m_inOutT;
|
||||
using Base::m_spatial;
|
||||
|
||||
void EnsureCompatible() override
|
||||
{
|
||||
if (m_spatial && m_imageLayout == ImageLayoutKind::HWC)
|
||||
InvalidArgument("CNTK batch normalization supports only cudnn(CHW) layout.");
|
||||
}
|
||||
|
||||
void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
|
||||
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override
|
||||
{
|
||||
in.BatchNormalizationForward(scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev);
|
||||
}
|
||||
|
||||
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
|
||||
Mat& scaleGrad, Mat& biasGrad) override
|
||||
{
|
||||
srcGrad.BatchNormalizationBackward(in, grad, scale, saveMean, saveInvStdDev, scaleGrad, biasGrad);
|
||||
}
|
||||
};
|
||||
|
||||
template class CntkBatchNormEngine<float>;
|
||||
template class CntkBatchNormEngine<double>;
|
||||
|
||||
template <typename T>
|
||||
bool HasFlag(T src, T testFlag)
|
||||
{
|
||||
return ((int)src & (int)testFlag) != 0;
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
std::unique_ptr<BatchNormEngine<ElemType>> BatchNormEngine<ElemType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
|
||||
bool spatial, ImageLayoutKind imageLayout,
|
||||
BatchNormEngineKind enabledEngines)
|
||||
{
|
||||
// Use CNTK as the default batch norm engine.
|
||||
if (HasFlag(enabledEngines, BatchNormEngineKind::Cntk))
|
||||
{
|
||||
fprintf(stderr, "\nUsing CNTK batch normalization engine.\n");
|
||||
return std::make_unique<CntkBatchNormEngine<ElemType>>(deviceId, inOutT, spatial, imageLayout);
|
||||
}
|
||||
|
||||
if (HasFlag(enabledEngines, BatchNormEngineKind::CuDnn))
|
||||
{
|
||||
fprintf(stderr, "\nUsing cuDNN batch normalization engine.\n");
|
||||
return CuDnnBatchNormEngineFactory<ElemType>::Create(deviceId, inOutT, spatial, imageLayout);
|
||||
}
|
||||
|
||||
RuntimeError("Could not find appropriate batch normalization engine.");
|
||||
}
|
||||
|
||||
template class BatchNormEngine<float>;
|
||||
template class BatchNormEngine<double>;
|
||||
|
||||
} } }
|
|
@ -0,0 +1,73 @@
|
|||
//
|
||||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "Matrix.h"
|
||||
#include "TensorShape.h" // for ImageLayoutKind
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {

//-------------------------------------------------------------
// Batch normalization engine interface.
//-------------------------------------------------------------
enum class BatchNormEngineKind
{
None = 0,
Cntk = 1,
CuDnn = 1 << 1,

All = Cntk | CuDnn
};
|
||||
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4251)
|
||||
|
||||
template <class ElemType>
|
||||
class MATH_API BatchNormEngine
|
||||
{
|
||||
public:
|
||||
using Mat = Matrix<ElemType>;
|
||||
|
||||
public:
|
||||
virtual ~BatchNormEngine() = default;
|
||||
|
||||
void Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
|
||||
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev);
|
||||
|
||||
void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
|
||||
Mat& scaleGrad, Mat& biasGrad);
|
||||
|
||||
static std::unique_ptr<BatchNormEngine<ElemType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
|
||||
bool spatial, ImageLayoutKind imageLayout,
|
||||
BatchNormEngineKind enabledEngines = BatchNormEngineKind::All);
|
||||
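// Typical use, as a sketch (signatures as declared above; the arguments are the caller's own matrices):
//   auto bnEng = BatchNormEngine<float>::Create(deviceId, inOutT, spatial, ImageLayoutKind::CHW);
//   bnEng->Forward(in, scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev,
//                  out, epsilon, saveMean, saveInvStdDev);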
|
||||
DISABLE_COPY_AND_MOVE(BatchNormEngine);
|
||||
|
||||
protected:
|
||||
BatchNormEngine(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
|
||||
bool spatial, ImageLayoutKind imageLayout)
|
||||
: m_deviceId(deviceId), m_inOutT(inOutT), m_spatial(spatial), m_imageLayout(imageLayout)
|
||||
{
|
||||
}
|
||||
|
||||
virtual void EnsureCompatible() = 0;
|
||||
|
||||
virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
|
||||
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0;
|
||||
|
||||
virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
|
||||
Mat& scaleGrad, Mat& biasGrad) = 0;
|
||||
|
||||
protected:
|
||||
DEVICEID_TYPE m_deviceId;
|
||||
TensorShape m_inOutT;
|
||||
bool m_spatial;
|
||||
ImageLayoutKind m_imageLayout;
|
||||
};
|
||||
|
||||
#pragma warning(pop)
|
||||
|
||||
} } }
|
|
@ -4085,6 +4085,257 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AddAveragePoolingGradient(const CPUMat
|
|||
}
|
||||
#pragma endregion Other Helper Functions
|
||||
|
||||
template <class ElemType>
|
||||
void CPUMatrix<ElemType>::ConvolutionForward(const CPUMatrix<ElemType>& kernel, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
|
||||
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for (int64_t sample = 0; sample < (int64_t)output.GetNumCols(); sample++)
|
||||
{
|
||||
for (size_t row = 0; row < output.GetNumRows(); row++)
|
||||
{
|
||||
int colBase = mpRowCol(row, 0);
|
||||
int ivBase = mpRowIwht(row, 0);
|
||||
assert(0 <= colBase && colBase < GetNumRows());
|
||||
|
||||
ElemType sum = 0;
|
||||
int i0 = mpRowRun(row, 0); // start of this row's run descriptor in 'runs'
int skip = runs(i0++, 0); // number of kernel weights to skip for this row
int size = runs(i0++, 0); // number of (offset, mask) entries that follow
int imask = i0 + size; // the 0/1 mask entries start right after the offsets
|
||||
for (int i = 0; i < size; i++)
|
||||
{
|
||||
if (runs(imask + i, 0) == 0)
|
||||
continue;
|
||||
int dcol = runs(i0 + i, 0);
|
||||
assert(0 <= colBase + dcol && colBase + dcol < GetNumRows());
|
||||
sum += kernel.BufferPointer()[ivBase + skip + i] * (*this)(colBase + dcol, sample);
|
||||
}
|
||||
output(row, sample) = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
void CPUMatrix<ElemType>::ConvolutionBackwardData(const CPUMatrix<ElemType>& kernel, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
|
||||
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& grad) const
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for (int64_t sample = 0; sample < (int64_t)GetNumCols(); sample++)
|
||||
{
|
||||
for (size_t row = 0; row < GetNumRows(); row++)
|
||||
{
|
||||
int colBase = mpRowCol(row, 0);
|
||||
int ivBase = mpRowIwht(row, 0);
|
||||
assert(0 <= colBase && colBase < grad.GetNumRows());
|
||||
|
||||
ElemType curGrad = (*this)(row, sample);
|
||||
|
||||
int i0 = mpRowRun(row, 0);
|
||||
int skip = runs(i0++, 0);
|
||||
int size = runs(i0++, 0);
|
||||
int imask = i0 + size;
|
||||
for (int i = 0; i < size; i++)
|
||||
{
|
||||
if (runs(imask + i, 0) == 0)
|
||||
continue;
|
||||
int dcol = runs(i0 + i, 0);
|
||||
assert(0 <= colBase + dcol && colBase + dcol < grad.GetNumRows());
|
||||
grad(colBase + dcol, sample) += curGrad * kernel.BufferPointer()[ivBase + skip + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
void CPUMatrix<ElemType>::ConvolutionBackwardKernel(const CPUMatrix<ElemType>& in, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
|
||||
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& kernelGrad) const
|
||||
{
|
||||
// Do NOT parallelize these loops!
|
||||
for (size_t sample = 0; sample < GetNumCols(); sample++)
|
||||
{
|
||||
for (size_t row = 0; row < GetNumRows(); row++)
|
||||
{
|
||||
int colBase = mpRowCol(row, 0);
|
||||
int ivBase = mpRowIwht(row, 0);
|
||||
assert(0 <= colBase && colBase < in.GetNumRows());
|
||||
|
||||
ElemType curGrad = (*this)(row, sample);
|
||||
|
||||
int i0 = mpRowRun(row, 0);
|
||||
int skip = runs(i0++, 0);
|
||||
int size = runs(i0++, 0);
|
||||
int imask = i0 + size;
|
||||
for (int i = 0; i < size; i++)
|
||||
{
|
||||
if (runs(imask + i, 0) == 0)
|
||||
continue;
|
||||
int dcol = runs(i0 + i, 0);
|
||||
assert(0 <= colBase + dcol && colBase + dcol < in.GetNumRows());
|
||||
kernelGrad.BufferPointer()[ivBase + skip + i] += curGrad * in(colBase + dcol, sample);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
void CPUMatrix<ElemType>::MaxPoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for (int64_t sample = 0; sample < (int64_t)output.GetNumCols(); sample++)
|
||||
{
|
||||
for (size_t row = 0; row < output.GetNumRows(); row++)
|
||||
{
|
||||
int colBase = mpRowCol(row, 0);
|
||||
assert(0 <= colBase && colBase < GetNumRows());
|
||||
|
||||
assert(std::numeric_limits<ElemType>::has_infinity);
|
||||
ElemType res = -std::numeric_limits<ElemType>::infinity();
|
||||
|
||||
int i0 = mpRowIndices(row, 0);
|
||||
int size = indices(i0++, 0);
|
||||
assert(size > 0);
|
||||
for (int i = 0; i < size; i++)
|
||||
{
|
||||
int dcol = indices(i0 + i, 0);
|
||||
assert(0 <= colBase + dcol && colBase + dcol < GetNumRows());
|
||||
res = std::max(res, (*this)(colBase + dcol, sample));
|
||||
}
|
||||
output(row, sample) = res;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
void CPUMatrix<ElemType>::MaxPoolingBackward(const CPUMatrix<ElemType>& out, const CPUMatrix<ElemType>& in,
|
||||
const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
|
||||
CPUMatrix<ElemType>& grad) const
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for (int64_t sample = 0; sample < (int64_t)GetNumCols(); sample++)
|
||||
{
|
||||
for (size_t row = 0; row < GetNumRows(); row++)
|
||||
{
|
||||
int colBase = mpRowCol(row, 0);
|
||||
assert(0 <= colBase && colBase < grad.GetNumRows());
|
||||
|
||||
int i0 = mpRowIndices(row, 0);
|
||||
int size = indices(i0++, 0);
|
||||
assert(size > 0);
|
||||
ElemType g = (*this)(row, sample);
|
||||
ElemType m = out(row, sample);
|
||||
for (int i = 0; i < size; i++)
|
||||
{
|
||||
int dcol = indices(i0 + i, 0);
|
||||
assert(0 <= colBase + dcol && colBase + dcol < grad.GetNumRows());
|
||||
if (in(colBase + dcol, sample) >= m)
grad(colBase + dcol, sample) += g; // on ties, every max-valued input receives the full gradient
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
void CPUMatrix<ElemType>::AveragePoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for (int64_t sample = 0; sample < (int64_t)output.GetNumCols(); sample++)
|
||||
{
|
||||
for (size_t row = 0; row < output.GetNumRows(); row++)
|
||||
{
|
||||
int colBase = mpRowCol(row, 0);
|
||||
assert(0 <= colBase && colBase < GetNumRows());
|
||||
|
||||
ElemType sum = 0;
|
||||
|
||||
int i0 = mpRowIndices(row, 0);
|
||||
int size = indices(i0++, 0);
|
||||
assert(size > 0);
|
||||
for (int i = 0; i < size; i++)
|
||||
{
|
||||
int dcol = indices(i0 + i, 0);
|
||||
assert(0 <= colBase + dcol && colBase + dcol < GetNumRows());
|
||||
sum += (*this)(colBase + dcol, sample);
|
||||
}
|
||||
// Note that we divide by 'size', which is the number of actual elements (it does not include padding).
|
||||
output(row, sample) = sum / size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
void CPUMatrix<ElemType>::AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& grad) const
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for (int64_t sample = 0; sample < (int64_t)GetNumCols(); sample++)
|
||||
{
|
||||
for (size_t row = 0; row < GetNumRows(); row++)
|
||||
{
|
||||
int colBase = mpRowCol(row, 0);
|
||||
assert(0 <= colBase && colBase < grad.GetNumRows());
|
||||
|
||||
int i0 = mpRowIndices(row, 0);
|
||||
int size = indices(i0++, 0);
|
||||
assert(size > 0);
|
||||
ElemType g = (*this)(row, sample) / size;
|
||||
for (int i = 0; i < size; i++)
|
||||
{
|
||||
int dcol = indices(i0 + i, 0);
|
||||
assert(0 <= colBase + dcol && colBase + dcol < grad.GetNumRows());
|
||||
grad(colBase + dcol, sample) += g;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
|
||||
CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runInvStdDev, CPUMatrix<ElemType>& out, double epsilon,
|
||||
CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const
|
||||
{
|
||||
UNUSED(epsilon); UNUSED(saveMean); UNUSED(saveInvStdDev);
|
||||
|
||||
assert((GetNumRows() % scale.GetNumRows()) == 0);
|
||||
|
||||
if (expAvgFactor != 0 || blendFactor != 1)
|
||||
RuntimeError("Batch normalization training on CPU is not yet implemented.");
|
||||
|
||||
bool spatial = GetNumRows() != scale.GetNumRows();
|
||||
if (spatial)
|
||||
{
|
||||
size_t spatialSize = GetNumRows() / scale.GetNumRows();
|
||||
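// Layout illustration (an assumption-laden sketch: column-major flattened CHW samples, as elsewhere in this file):
// with scale.GetNumRows() == C feature maps and spatialSize == H*W elements per map,
// row irow of a sample uses the statistics of map irow / spatialSize.
// E.g. C = 2, H = W = 3 (spatialSize = 9): rows 0..8 -> map 0, rows 9..17 -> map 1.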
#pragma omp parallel for
|
||||
for (long icol = 0; icol < out.GetNumCols(); icol++)
|
||||
{
|
||||
for (long irow = 0; irow < out.GetNumRows(); irow++)
|
||||
{
|
||||
size_t imap = irow / spatialSize;
|
||||
out(irow, icol) = scale(imap, 0) * ((*this)(irow, icol) - runMean(imap, 0)) * runInvStdDev(imap, 0) + bias(imap, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for (long icol = 0; icol < out.GetNumCols(); icol++)
|
||||
{
|
||||
for (long irow = 0; irow < out.GetNumRows(); irow++)
|
||||
{
|
||||
out(irow, icol) = scale(irow, 0) * ((*this)(irow, icol) - runMean(irow, 0)) * runInvStdDev(irow, 0) + bias(irow, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
void CPUMatrix<ElemType>::BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
|
||||
CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const
|
||||
{
|
||||
UNUSED(in); UNUSED(grad); UNUSED(scale); UNUSED(saveMean); UNUSED(saveInvStdDev); UNUSED(scaleGrad); UNUSED(biasGrad);
|
||||
RuntimeError("Batch normalization training on CPU is not yet implemented.");
|
||||
}
|
||||
|
||||
|
||||
#pragma region Static BLAS Functions
|
||||
|
||||
/// <summary>Matrix-matrix multiply with col-major matrices (a and b may be transposed): c = alpha * op(a) * op(b) + beta*c</summary>
|
||||
|
@ -5943,4 +6194,8 @@ template void CPUMatrix<char>::SetValue(const char);
|
|||
template void CPUMatrix<char>::SetValue(const size_t numRows, const size_t numCols, char* pArray, size_t matrixFlags);
|
||||
template void CPUMatrix<char>::SetValue(CPUMatrix<char> const&);
|
||||
template void CPUMatrix<char>::Resize(const size_t numRows, const size_t numCols, bool growOnly);
|
||||
} } }
|
||||
|
||||
template CPUMatrix<int>::CPUMatrix(const size_t, const size_t, int*, const size_t);
|
||||
template CPUMatrix<int>::~CPUMatrix();
|
||||
|
||||
}}}
|
||||
|
|
|
@ -317,6 +317,27 @@ public:
|
|||
const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
|
||||
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
|
||||
|
||||
void ConvolutionForward(const CPUMatrix<ElemType>& kernel, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
|
||||
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const;
|
||||
void ConvolutionBackwardData(const CPUMatrix<ElemType>& kernel, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
|
||||
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& grad) const;
|
||||
void ConvolutionBackwardKernel(const CPUMatrix<ElemType>& in, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
|
||||
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& kernelGrad) const;
|
||||
|
||||
void MaxPoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const;
|
||||
void MaxPoolingBackward(const CPUMatrix<ElemType>& out, const CPUMatrix<ElemType>& in,
|
||||
const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
|
||||
CPUMatrix<ElemType>& grad) const;
|
||||
|
||||
void AveragePoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const;
|
||||
void AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
|
||||
CPUMatrix<ElemType>& grad) const;
|
||||
|
||||
void BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor, CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runInvStdDev,
|
||||
CPUMatrix<ElemType>& out, double epsilon, CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const;
|
||||
void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
|
||||
CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const;
|
||||
|
||||
public:
|
||||
static int SetNumThreads(int numThreads); // note: this does not depend on <ElemType>, i.e. you can call it on any <ElemType>
|
||||
|
||||
|
@ -457,4 +478,5 @@ private:
|
|||
|
||||
typedef CPUMatrix<float> CPUSingleMatrix;
|
||||
typedef CPUMatrix<double> CPUDoubleMatrix;
|
||||
} } }
|
||||
|
||||
}}}
|
||||
|
|
|
@ -1335,4 +1335,7 @@ template CPUSparseMatrix<char> CPUSparseMatrix<char>::ColumnSlice(size_t startCo
|
|||
template CPUMatrix<char> CPUSparseMatrix<char>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;
|
||||
template CPUSparseMatrix<char>& CPUSparseMatrix<char>::operator=(const CPUSparseMatrix<char>& deepCopyFrom);
|
||||
|
||||
template CPUSparseMatrix<int>::CPUSparseMatrix(const MatrixFormat, const size_t, const size_t, const size_t);
|
||||
template CPUSparseMatrix<int>::~CPUSparseMatrix();
|
||||
|
||||
}}}
|
||||
|
|
|
@ -0,0 +1,963 @@
|
|||
//
|
||||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4100)
|
||||
#pragma warning(disable : 4127)
|
||||
#pragma warning(disable : 4201)
|
||||
#pragma warning(disable : 4515)
|
||||
#endif
|
||||
#include <cub/cub.cuh>
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
// Returns the number of blocks of size 'blockSize' needed to cover 'n', i.e. ceil(n / blockSize)
// (note: despite the name, this is the block count, not n rounded up to a multiple of blockSize).
size_t RoundUpToMultiple(size_t n, size_t blockSize)
{
return (n + blockSize - 1) / blockSize;
}
|
||||
|
||||
cudaError_t GetLastCudaError()
|
||||
{
|
||||
cudaError_t prelaunchErr = cudaGetLastError();
|
||||
assert(cudaSuccess == prelaunchErr);
|
||||
if (prelaunchErr != cudaSuccess)
|
||||
return prelaunchErr;
|
||||
|
||||
#ifndef NO_SYNC
|
||||
cudaError_t executionErr = cudaStreamSynchronize(GetStream());
|
||||
assert(cudaSuccess == executionErr);
|
||||
if (executionErr != cudaSuccess)
|
||||
return executionErr;
|
||||
#endif
|
||||
return cudaSuccess;
|
||||
}
|
||||
|
||||
template <int U, typename T>
|
||||
__device__ __forceinline__ void LoadValues(const T* src, T dst[U])
|
||||
{
|
||||
#pragma unroll
|
||||
for (int i = 0; i < U; i++)
|
||||
dst[i] = src[i];
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ __forceinline__ void LoadValues<2, float>(const float* src, float dst[2])
|
||||
{
|
||||
// src must be aligned at an 8-byte boundary.
|
||||
assert(reinterpret_cast<uintptr_t>(src) % (sizeof(dst)) == 0);
|
||||
auto v = *(const float2*)src;
|
||||
dst[0] = v.x;
|
||||
dst[1] = v.y;
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ __forceinline__ void LoadValues<4, float>(const float* src, float dst[4])
|
||||
{
|
||||
// src must be aligned at a 16-byte boundary.
|
||||
assert(reinterpret_cast<uintptr_t>(src) % (sizeof(dst)) == 0);
|
||||
// Can do the following instead (use ld.global.nc.* on CC 3.5+):
|
||||
// asm volatile("ld.global.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(v.x), "=f"(v.y), "=f"(v.z), "=f"(v.w) : "l"(src));
|
||||
// Similar for shared memory (e.g. ld.shared.*)
|
||||
auto v = *(const float4*)src;
|
||||
dst[0] = v.x;
|
||||
dst[1] = v.y;
|
||||
dst[2] = v.z;
|
||||
dst[3] = v.w;
|
||||
}
|
||||
|
||||
template <int U, typename T>
|
||||
__device__ __forceinline__ void StoreValues(const T src[U], T* dst)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int i = 0; i < U; i++)
|
||||
dst[i] = src[i];
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ __forceinline__ void StoreValues<2, float>(const float src[2], float* dst)
|
||||
{
|
||||
// dst must be aligned at an 8-byte boundary.
|
||||
assert(reinterpret_cast<uintptr_t>(dst) % (sizeof(src)) == 0);
|
||||
float2 v;
|
||||
v.x = src[0];
|
||||
v.y = src[1];
|
||||
*(reinterpret_cast<float2*>(dst)) = v;
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ __forceinline__ void StoreValues<4, float>(const float src[4], float* dst)
|
||||
{
|
||||
// dst must be aligned at a 16-byte boundary.
|
||||
assert(reinterpret_cast<uintptr_t>(dst) % (sizeof(src)) == 0);
|
||||
float4 v;
|
||||
v.x = src[0];
|
||||
v.y = src[1];
|
||||
v.z = src[2];
|
||||
v.w = src[3];
|
||||
*(reinterpret_cast<float4*>(dst)) = v;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T Shuffle(T input, int srcLane)
|
||||
{
|
||||
// shfl is supported only on Kepler+. We really don't care about Fermi anymore but our build still has sm_20.
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
return cub::ShuffleIndex(input, srcLane);
|
||||
#else
|
||||
// REVIEW alexeyk: make static_assert once we remove SM 2.0 support from our build.
|
||||
assert(false);
|
||||
return input;
|
||||
#endif
|
||||
}
|
||||
|
||||
namespace Operations
|
||||
{
|
||||
__device__ float RSqrt(float a)
|
||||
{
|
||||
// REVIEW alexeyk: rsqrtf is just one MUFU.RSQ instruction so it's faster than
|
||||
// __frsqrt_rn intrinsic which performs round-to-nearest-even rounding which adds ~10 other instructions.
|
||||
// __frsqrt_rn is unbiased rounding though, need to verify whether it is a better choice for BN implementation.
|
||||
//return __frsqrt_rn(a);
|
||||
return rsqrtf(a);
|
||||
}
|
||||
|
||||
__device__ double RSqrt(double a)
|
||||
{
|
||||
return rsqrt(a);
|
||||
}
|
||||
}
|
||||
|
||||
// This function is used to select the correct unroll factor.
// REVIEW alexeyk: ask our C++ gurus (Marko/Amit) if there is a better way.
|
||||
template <template <int> class Func, typename T, typename ...Targs>
|
||||
void Call(size_t vectorSize, Targs... args)
|
||||
{
|
||||
if ((vectorSize % 4) == 0)
|
||||
Func<4>::template Call<T>(args...);
|
||||
else if ((vectorSize % 2) == 0)
|
||||
Func<2>::template Call<T>(args...);
|
||||
else
|
||||
Func<1>::template Call<T>(args...);
|
||||
}
|
||||
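// Dispatch sketch with a hypothetical functor (the real callers appear further down in this file):
//   template <int U> struct Functor { template <typename T> static void Call(/*...*/); };
//   Call<Functor, float>(vectorSize, /*...*/);
// picks Functor<4> if vectorSize is divisible by 4, Functor<2> if it is even, else Functor<1>.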
|
||||
//--------------------------------------------------------------------
// Mean and variance computation
//--------------------------------------------------------------------

// The kernel implements an online, parallel and numerically stable algorithm
// for computing the batch mean and variance (here: inverse standard deviation) with one pass over the data.
// It uses the algorithms by Knuth/Welford and Chan et al (http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf)
// In short, the algorithm has 2 steps:
// 1. Each thread strides over the input and computes a mean and
//    an m2 value (used to compute the variance at the end) - the Welford algorithm.
// 2. Parallel reduction (Chan algorithm) performed by columns (note that
//    the thread block and grid X dimensions go along the vector and the Y dimension - along the batch).
//    As a result, each block has 2 * blockDim.x (mean and inverse stddev) values to write at the end.
//
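// Reference recurrences behind the kernel (a sketch of the math, not part of the original file):
//   Welford, one stream:  n += 1;  d = x - mean;  mean += d / n;  m2 += d * (x - mean);
//   Chan, merging partial results (na, mean_a, m2_a) and (nb, mean_b, m2_b):
//     n    = na + nb;  d = mean_b - mean_a;
//     mean = mean_a + d * nb / n;
//     m2   = m2_a + m2_b + d * d * na * nb / n;
// The merge step below computes m2 += m2_b + d * na * (d * nb / n), an algebraically
// equivalent form of the last line; the variance is m2 / n at the end.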
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, const ElemType* x, double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev,
                                              double epsilon, ElemType* xMean, ElemType* xInvStdDev)
{
    static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
    static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
    assert((vectorSize % U) == 0);
    assert(blockDim.x == BlockDimX);
    assert(blockDim.y == BlockDimY);
    assert(blockDim.z == 1);
    assert(gridDim.y == 1);
    assert(gridDim.z == 1);
    assert(::isfinite(epsilon) && epsilon > 0);
    assert(::isfinite(expAvgFactor) && expAvgFactor > 0);

    int irowSrcBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
    if (irowSrcBase >= vectorSize)
        return;
    assert(irowSrcBase + U <= vectorSize);

    int n = 0;
    ElemType mean[U];
    ElemType m2[U];
#pragma unroll
    for (int k = 0; k < U; k++)
    {
        mean[k] = 0;
        m2[k] = 0;
    }

    int icolSrc = threadIdx.y;
    const ElemType* psrc = x + static_cast<size_t>(icolSrc) * vectorSize + irowSrcBase;
    // Stride over all vectors in the batch.
    for (; icolSrc < batchSize; icolSrc += BlockDimY)
    {
        n++;
        ElemType curVal[U];
        LoadValues<U>(psrc, curVal);
        // No need for separate unrolling, SASS looks good.
#pragma unroll
        for (int k = 0; k < U; k++)
        {
            ElemType d = curVal[k] - mean[k];
            // REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
            // Using precise math slows down the code by about 40%.
            mean[k] += d / n;
            m2[k] += d * (curVal[k] - mean[k]);
        }
        psrc += vectorSize * BlockDimY;
    }

    const int tid = threadIdx.y * BlockDimX + threadIdx.x;
    const int laneId = tid & 0x1f;
    // First, reduce within warp using shuffle.
    if (n > 0)
    {
#pragma unroll
        for (int i = 1; i < CUB_PTX_WARP_THREADS / BlockDimX; i *= 2)
        {
            int srcLane = laneId + BlockDimX * i;
            int n2 = Shuffle(n, srcLane);
            int nsum = n + n2;
            ElemType d[U];
#pragma unroll
            for (int k = 0; k < U; k++)
            {
                d[k] = Shuffle(mean[k], srcLane) - mean[k];
                ElemType dScaled = d[k] * n2 / nsum;
                mean[k] += dScaled;
                m2[k] += Shuffle(m2[k], srcLane) + d[k] * n * dScaled;
            }
            n = nsum;
        }
    }

    // Storage for each warp in a thread block. First warp ("accumulator") holds
    // final results so it does not need shared memory.
    const int cwarp = BlockDimX * BlockDimY / CUB_PTX_WARP_THREADS;
    __shared__ ElemType meanRes[BlockDimX * U][cwarp - 1];
    __shared__ ElemType m2Res[BlockDimX * U][cwarp - 1];
    __shared__ int nRes[cwarp - 1];

    // Each warp (except warp0) will write accumulated results to shared memory.
    const int iwarp = tid / CUB_PTX_WARP_THREADS;
    if (iwarp > 0 && laneId < BlockDimX)
    {
        if (laneId == 0)
            nRes[iwarp - 1] = n;
#pragma unroll
        for (int k = 0; k < U; k++)
        {
            meanRes[laneId * U + k][iwarp - 1] = mean[k];
            m2Res[laneId * U + k][iwarp - 1] = m2[k];
        }
    }
    __syncthreads();

    // Accumulate and write final results.
    // REVIEW alexeyk: see if atomicAdd can be used instead, do perf comparison.
    if (threadIdx.y == 0)
    {
        // Use simple loop as number of warps is small, 8 at max.
#pragma unroll
        for (int i = 0; i < cwarp - 1; i++)
        {
            int n2 = nRes[i];
            int nsum = n + n2;
            ElemType d[U];
#pragma unroll
            for (int k = 0; k < U; k++)
            {
                d[k] = meanRes[threadIdx.x * U + k][i] - mean[k];
                ElemType dScaled = d[k] * n2 / nsum;
                mean[k] += dScaled;
                m2[k] += m2Res[threadIdx.x * U + k][i] + d[k] * n * dScaled;
            }
            n = nsum;
        }
        size_t idxDstBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
        // Store mean and running mean.
        StoreValues<U>(mean, xMean + idxDstBase);
        if (expAvgFactor == 1)
            StoreValues<U>(mean, runMean + idxDstBase);
        else
        {
            ElemType run[U];
            LoadValues<U>(runMean + idxDstBase, run);
#pragma unroll
            for (int k = 0; k < U; k++)
                run[k] = expAvgFactor * mean[k] + (1.0 - expAvgFactor) * run[k];
            StoreValues<U>(run, runMean + idxDstBase);
        }
        // Store inv std dev and its running version.
#pragma unroll
        for (int k = 0; k < U; k++)
        {
            m2[k] = Operations::RSqrt(static_cast<ElemType>(m2[k] / batchSize + epsilon));
        }
        StoreValues<U>(m2, xInvStdDev + idxDstBase);
        if (expAvgFactor == 1)
            StoreValues<U>(m2, runInvStdDev + idxDstBase);
        else
        {
            ElemType run[U];
            LoadValues<U>(runInvStdDev + idxDstBase, run);
#pragma unroll
            for (int k = 0; k < U; k++)
                run[k] = expAvgFactor * m2[k] + (1.0 - expAvgFactor) * run[k];
            StoreValues<U>(run, runInvStdDev + idxDstBase);
        }
    }
}

// This kernel is very similar to kComputeBatchMeanAndInvStdDev except it reduces not just over N (minibatch)
// but also W and H dimensions.
// REVIEW alexeyk: is it possible to combine this and previous kernel into a single kernel without hurting performance/readability much?
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatialSize, int batchSize, const ElemType* x,
                                                     double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev,
                                                     double epsilon, ElemType* xMean, ElemType* xInvStdDev)
{
    static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
    static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
    assert(blockDim.x == BlockDimX);
    assert(blockDim.y == BlockDimY);
    assert(blockDim.z == 1);
    assert(gridDim.y == 1);
    assert(gridDim.z == 1);
    assert((spatialSize % U) == 0);
    assert((vectorSize % spatialSize) == 0);
    assert(::isfinite(expAvgFactor) && expAvgFactor > 0);
    assert(::isfinite(epsilon) && epsilon > 0);

    int irowSrcBase = blockIdx.x * spatialSize + threadIdx.x * U;
    if (irowSrcBase >= vectorSize)
        return;
    assert(irowSrcBase + U <= vectorSize);
    int irowSrcLim = (blockIdx.x + 1) * spatialSize;

    int n = 0;
    ElemType mean[U];
    ElemType m2[U];
#pragma unroll
    for (int k = 0; k < U; k++)
    {
        mean[k] = 0;
        m2[k] = 0;
    }

    int icolSrc = threadIdx.y;
    const ElemType* psrcBase = x + static_cast<size_t>(icolSrc) * vectorSize + irowSrcBase;
    // Stride over all vectors in the batch.
    for (; icolSrc < batchSize; icolSrc += BlockDimY)
    {
        const ElemType* psrc = psrcBase;
        // Stride over all values in feature map (W and H dimensions).
        for (int irowSrc = irowSrcBase; irowSrc < irowSrcLim; irowSrc += BlockDimX * U, psrc += BlockDimX * U)
        {
            n++;
            ElemType curVal[U];
            LoadValues<U>(psrc, curVal);
            // No need for separate unrolling, SASS looks good.
#pragma unroll
            for (int k = 0; k < U; k++)
            {
                ElemType d = curVal[k] - mean[k];
                // REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
                // Using precise math slows down the code by about 40%.
                mean[k] += d / n;
                m2[k] += d * (curVal[k] - mean[k]);
            }
        }
        psrcBase += vectorSize * BlockDimY;
    }

    const int tid = threadIdx.y * BlockDimX + threadIdx.x;
    const int laneId = tid & 0x1f;
    // First, reduce within warp using shuffle.
    if (n > 0)
    {
#pragma unroll
        for (int i = 1; i < CUB_PTX_WARP_THREADS; i *= 2)
        {
            int srcLane = laneId + i;
            int n2 = Shuffle(n, srcLane);
            int nsum = n + n2;
            ElemType d[U];
#pragma unroll
            for (int k = 0; k < U; k++)
            {
                d[k] = Shuffle(mean[k], srcLane) - mean[k];
                ElemType dScaled = d[k] * n2 / nsum;
                mean[k] += dScaled;
                m2[k] += Shuffle(m2[k], srcLane) + d[k] * n * dScaled;
            }
            n = nsum;
        }
    }

    // Storage for each warp in a thread block. First warp ("accumulator") holds
    // final results so it does not need shared memory.
    const int cwarp = BlockDimX * BlockDimY / CUB_PTX_WARP_THREADS;
    __shared__ ElemType meanRes[U][cwarp - 1];
    __shared__ ElemType m2Res[U][cwarp - 1];
    __shared__ int nRes[cwarp - 1];

    // Each warp (except warp0) will write accumulated results to shared memory.
    const int iwarp = tid / CUB_PTX_WARP_THREADS;
    if (iwarp > 0 && laneId == 0)
    {
        nRes[iwarp - 1] = n;
#pragma unroll
        for (int k = 0; k < U; k++)
        {
            meanRes[k][iwarp - 1] = mean[k];
            m2Res[k][iwarp - 1] = m2[k];
        }
    }
    __syncthreads();

    // One thread will accumulate and write final results.
    if (tid == 0)
    {
        // Use simple loop as number of warps is small, 8 at max.
#pragma unroll
        for (int i = 0; i < cwarp - 1; i++)
        {
            int n2 = nRes[i];
            int nsum = n + n2;
            ElemType d[U];
#pragma unroll
            for (int k = 0; k < U; k++)
            {
                d[k] = meanRes[k][i] - mean[k];
                ElemType dScaled = d[k] * n2 / nsum;
                mean[k] += dScaled;
                m2[k] += m2Res[k][i] + d[k] * n * dScaled;
            }
            n = nsum;
        }
        // Final step - accumulate results in mean[0] and m2[0].
        // REVIEW alexeyk: move outside of the loop, before storing values to smem.
#pragma unroll
        for (int k = 1; k < U; k++)
        {
            ElemType d = mean[k] - mean[0];
            ElemType dScaled = d * n / (n + k * n);
            mean[0] += dScaled;
            m2[0] += m2[k] + d * k * n * dScaled;
        }

        xMean[blockIdx.x] = mean[0];
        runMean[blockIdx.x] = (expAvgFactor == 1) ? mean[0] : (expAvgFactor * mean[0] + (1.0 - expAvgFactor) * runMean[blockIdx.x]);
        m2[0] = Operations::RSqrt(static_cast<ElemType>(m2[0] / (batchSize * spatialSize) + epsilon));
        xInvStdDev[blockIdx.x] = m2[0];
        runInvStdDev[blockIdx.x] = (expAvgFactor == 1) ? m2[0] : (expAvgFactor * m2[0] + (1.0 - expAvgFactor) * runInvStdDev[blockIdx.x]);
    }
}

// The struct is used by the Call function to select the proper template at runtime based on the size of the vector.
// The same pattern is used in other cases of similar structs.
template <int U>
struct ComputeBatchMeanAndInvStdDev
{
    template <typename ElemType>
    static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev,
                     double epsilon, ElemType* xMean, ElemType* xInvStdDev, cudaStream_t stream)
    {
        assert((vectorSize % U) == 0);

        const int BlockDimX = 32 / U;
        const int BlockDimY = 4 * U;
        auto bdim = dim3(BlockDimX, BlockDimY);
        // Create a grid with only one block in the y(batch)-dimension, as the kernel uses striding.
        auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
        kComputeBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
            static_cast<int>(vectorSize), static_cast<int>(batchSize),
            x, expAvgFactor, runMean, runInvStdDev, epsilon, xMean, xInvStdDev);
    }
};

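// Worked example of the launch shape (a sketch, assuming U = 4):
//   BlockDimX = 32 / 4 = 8, BlockDimY = 4 * 4 = 16, i.e. 128 threads = 4 warps per block.
//   Each thread covers U = 4 consecutive rows, so one block covers 8 * 4 = 32 rows of the
//   vector dimension and the x-grid needs ceil(vectorSize / 32) blocks; the whole batch is
//   covered by striding inside the kernel, hence gridDim.y == 1.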
template <int U>
struct ComputeSpatialBatchMeanAndInvStdDev
{
    template <typename ElemType>
    static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x,
                     double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev,
                     double epsilon, ElemType* xMean, ElemType* xInvStdDev, cudaStream_t stream)
    {
        assert((vectorSize % spatialSize) == 0);
        assert((spatialSize % U) == 0);

        const int BlockDimX = 32 / U;
        const int BlockDimY = 4 * U;
        auto bdim = dim3(BlockDimX, BlockDimY);
        // Create a grid with only one block in the y(batch)-dimension, as the kernel uses striding.
        // Each thread block processes a single whole feature map independently (i.e. reduces over W, H and N dimensions).
        auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
        kComputeSpatialBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
            static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize),
            x, expAvgFactor, runMean, runInvStdDev, epsilon, xMean, xInvStdDev);
    }
};

//--------------------------------------------------------------------
// Forward propagation
// All functions accept input/output tensors in column-major format where each column is a vector of a minibatch.
// In the convolutional case (i.e. spatial=true), each vector is in CHW format where the W dimension has stride = 1.
// Tensors for biases and inverse stddevs have dimensions equal to the vector dimension in the non-convolutional
// (i.e. spatial=false) case, or Cx1x1 in the convolutional case.
//--------------------------------------------------------------------

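// Per element, the training-time kernel below applies the standard batch-norm transform (a sketch):
//   y = bnScale * (x - batchMean) * batchInvStdDev + bnBias
// In the spatial case, one scale/bias/mean/invStdDev value is shared by all W x H positions of a channel.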
template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType>
__global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int batchSize, const ElemType* x, ElemType* y,
                                        const ElemType* bnScale, const ElemType* bnBias, const ElemType* batchMean, const ElemType* batchInvStdDev)
{
    static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
    static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
    assert(blockDim.x == BlockDimX);
    assert(blockDim.y == BlockDimY);
    assert(blockDim.z == 1);
    assert(gridDim.y == 1);
    assert(gridDim.z == 1);
    assert((vectorSize % U) == 0);
    assert(!Spatial || (spatialSize % U) == 0);
    assert((vectorSize % spatialSize) == 0);

    int irowBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
    if (irowBase >= vectorSize)
        return;
    assert(irowBase + U <= vectorSize);

    __shared__ ElemType meanS[BlockDimX * U];
    __shared__ ElemType invStdDevS[BlockDimX * U];
    __shared__ ElemType scaleS[BlockDimX * U];
    __shared__ ElemType biasS[BlockDimX * U];
    int offs = threadIdx.x * U;
    // REVIEW alexeyk: optimize smem usage, reduce transaction count (is it worth it?).
    if (threadIdx.y == 0)
    {
        if (Spatial)
        {
#pragma unroll
            for (int k = 0; k < U; k++)
            {
                int imap = (irowBase + k) / spatialSize;
                meanS[offs + k] = batchMean[imap];
                invStdDevS[offs + k] = batchInvStdDev[imap];
                scaleS[offs + k] = bnScale[imap];
                biasS[offs + k] = bnBias[imap];
            }
        }
        else
        {
            LoadValues<U>(batchMean + irowBase, meanS + offs);
            LoadValues<U>(batchInvStdDev + irowBase, invStdDevS + offs);
            LoadValues<U>(bnScale + irowBase, scaleS + offs);
            LoadValues<U>(bnBias + irowBase, biasS + offs);
        }
    }
    __syncthreads();
    ElemType mean[U];
    ElemType invStdDev[U];
    ElemType scale[U];
    ElemType bias[U];
    LoadValues<U>(meanS + offs, mean);
    LoadValues<U>(invStdDevS + offs, invStdDev);
    LoadValues<U>(scaleS + offs, scale);
    LoadValues<U>(biasS + offs, bias);

    int icol = blockIdx.y * BlockDimY + threadIdx.y;
    size_t startOffs = static_cast<size_t>(icol) * vectorSize + irowBase;
    const ElemType* psrc = x + startOffs;
    ElemType* pdst = y + startOffs;
    size_t stride = static_cast<size_t>(gridDim.y * BlockDimY) * vectorSize;
    for (; icol < batchSize; icol += gridDim.y * BlockDimY, psrc += stride, pdst += stride)
    {
        ElemType val[U];
        LoadValues<U>(psrc, val);
#pragma unroll
        for (int k = 0; k < U; k++)
        {
            val[k] = scale[k] * (val[k] - mean[k]) * invStdDev[k] + bias[k];
        }
        StoreValues<U>(val, pdst);
    }
}

template <int U>
struct NormalizeBatchTraining
{
    template <typename ElemType>
    static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, ElemType* y,
                     const ElemType* bnScale, const ElemType* bnBias, const ElemType* batchMean, const ElemType* batchInvStdDev, cudaStream_t stream)
    {
        assert((vectorSize % U) == 0);

        const int BlockDimX = 32 / U;
        const int BlockDimY = 4 * U;
        auto bdim = dim3(BlockDimX, BlockDimY);
        // Create a grid that uses striding in the y-dimension to cover the whole minibatch.
        auto gdim = dim3((unsigned int)RoundUpToMultiple(vectorSize, BlockDimX * U));
        if (spatial)
        {
            kNormalizeBatchTraining<BlockDimX, BlockDimY, true, U><<<gdim, bdim, 0, stream>>>(
                (int)vectorSize, (int)spatialSize, (int)batchSize, x, y, bnScale, bnBias,
                batchMean, batchInvStdDev);
        }
        else
        {
            kNormalizeBatchTraining<BlockDimX, BlockDimY, false, U><<<gdim, bdim, 0, stream>>>(
                (int)vectorSize, (int)spatialSize, (int)batchSize, x, y, bnScale, bnBias,
                batchMean, batchInvStdDev);
        }
    }
};

//--------------------------------------------------------------------
// Backpropagation
// BatchNormalizationBackward back-propagates derivatives of the batch normalization function
// with respect to the inputs and the scale and bias parameters.
// All tensor dimensions and assumptions are the same as in the forward propagation case.
//--------------------------------------------------------------------

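// The parameter gradients reduced by the kernels below are (a sketch of the math):
//   dBias  = sum_i dL/dy_i
//   dScale = sum_i dL/dy_i * xHat_i,  where xHat_i = (x_i - mean) * invStdDev
// with the sum running over the minibatch (and also over W x H in the spatial case).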
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dScale, ElemType* dBias,
                                              const ElemType* saveMean, const ElemType* saveInvStdDev)
{
    static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
    static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
    static_assert(((BlockDimY - 1) & BlockDimY) == 0, "BlockDimY must be a power of 2.");
    assert((vectorSize % U) == 0);
    assert(blockDim.x == BlockDimX);
    assert(blockDim.y == BlockDimY);
    assert(blockDim.z == 1);
    assert(gridDim.y == 1);
    assert(gridDim.z == 1);

    // REVIEW alexeyk: first part looks very similar to kComputeBatchMeanAndInvStdDev, any chance to refactor?
    int irowSrcBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
    if (irowSrcBase >= vectorSize)
        return;
    assert(irowSrcBase + U <= vectorSize);

    ElemType mean[U];
    ElemType invStdDev[U];
    __shared__ ElemType meanS[BlockDimX * U];
    __shared__ ElemType invStdDevS[BlockDimX * U];
    // Read mean and inv std dev.
    if (threadIdx.y == 0)
    {
        LoadValues<U>(saveMean + irowSrcBase, mean);
        LoadValues<U>(saveInvStdDev + irowSrcBase, invStdDev);
        StoreValues<U>(mean, &meanS[threadIdx.x * U]);
        StoreValues<U>(invStdDev, &invStdDevS[threadIdx.x * U]);
    }
    __syncthreads();
    if (threadIdx.y != 0)
    {
        LoadValues<U>(&meanS[threadIdx.x * U], mean);
        LoadValues<U>(&invStdDevS[threadIdx.x * U], invStdDev);
    }

    ElemType ds[U];
    ElemType db[U];
#pragma unroll
    for (int k = 0; k < U; k++)
    {
        ds[k] = 0;
        db[k] = 0;
    }

    int icolSrc = threadIdx.y;
    size_t startOffs = static_cast<size_t>(icolSrc) * vectorSize + irowSrcBase;
    const ElemType* px = x + startOffs;
    const ElemType* pdy = dy + startOffs;
    size_t stride = static_cast<size_t>(vectorSize) * BlockDimY;
    // Stride over all vectors in the batch.
    for (; icolSrc < batchSize; icolSrc += BlockDimY, px += stride, pdy += stride)
    {
        ElemType curX[U];
        ElemType curdY[U];
        LoadValues<U>(px, curX);
        LoadValues<U>(pdy, curdY);
#pragma unroll
        for (int k = 0; k < U; k++)
        {
            // Use the values already loaded into registers (curdY) rather than re-reading pdy[k] from global memory.
            ds[k] += curdY[k] * (curX[k] - mean[k]) * invStdDev[k];
            db[k] += curdY[k];
        }
    }

    // Final reduction.
    __shared__ ElemType dsS[BlockDimY][BlockDimX * U];
    __shared__ ElemType dbS[BlockDimY][BlockDimX * U];
    StoreValues<U>(ds, &dsS[threadIdx.y][threadIdx.x * U]);
    StoreValues<U>(db, &dbS[threadIdx.y][threadIdx.x * U]);
    __syncthreads();
    // Very simple block reduction. As the block y dim is small (e.g. 16), the loop
    // is executed very few times (e.g. 4), so the performance is good.
    // Can potentially be improved by using shuffle instructions (as in kComputeBatchMeanAndInvStdDev).
#pragma unroll
    for (int y = BlockDimY / 2; y > 0; y /= 2)
    {
        if (threadIdx.y < y)
        {
#pragma unroll
            for (int k = 0; k < U; k++)
            {
                dsS[threadIdx.y][threadIdx.x * U + k] += dsS[threadIdx.y + y][threadIdx.x * U + k];
                dbS[threadIdx.y][threadIdx.x * U + k] += dbS[threadIdx.y + y][threadIdx.x * U + k];
            }
        }
        // The barrier must be reached by all threads of the block, so it cannot live inside
        // the divergent branch above.
        __syncthreads();
    }

    // Write results.
    if (threadIdx.y == 0)
    {
#pragma unroll
        for (int k = 0; k < U; k++)
        {
            dScale[irowSrcBase + k] = dsS[0][threadIdx.x * U + k];
            dBias[irowSrcBase + k] = dbS[0][threadIdx.x * U + k];
        }
    }
}

template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy,
                                                     ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev)
{
    static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
    static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
    assert(blockDim.x == BlockDimX);
    assert(blockDim.y == BlockDimY);
    assert(blockDim.z == 1);
    assert(gridDim.y == 1);
    assert(gridDim.z == 1);
    assert((spatialSize % U) == 0);
    assert((vectorSize % spatialSize) == 0);

    int irowBase = blockIdx.x * spatialSize + threadIdx.x * U;
    if (irowBase >= vectorSize)
        return;
    assert(irowBase + U <= vectorSize);
    int irowLim = (blockIdx.x + 1) * spatialSize;

    ElemType mean;
    ElemType invStdDev;
    __shared__ ElemType meanS;
    __shared__ ElemType invStdDevS;
    const int tid = threadIdx.y * BlockDimX + threadIdx.x;
    // Read mean and inv std dev.
    if (tid == 0)
    {
        meanS = saveMean[blockIdx.x];
        invStdDevS = saveInvStdDev[blockIdx.x];
    }
    __syncthreads();
    // Every thread, including tid == 0, takes its local copy from shared memory;
    // otherwise thread 0 would use uninitialized mean/invStdDev below.
    mean = meanS;
    invStdDev = invStdDevS;

    ElemType ds[U];
    ElemType db[U];
#pragma unroll
    for (int k = 0; k < U; k++)
    {
        ds[k] = 0;
        db[k] = 0;
    }

    int icolSrc = threadIdx.y;
    size_t startOffs = static_cast<size_t>(icolSrc) * vectorSize + irowBase;
    const ElemType* pxBase = x + startOffs;
    const ElemType* pdyBase = dy + startOffs;
    size_t stride = static_cast<size_t>(vectorSize) * BlockDimY;
    // Stride over all vectors in the batch.
    for (; icolSrc < batchSize; icolSrc += BlockDimY, pxBase += stride, pdyBase += stride)
    {
        const ElemType* px = pxBase;
        const ElemType* pdy = pdyBase;
        // Stride over all values in feature map (W and H dimensions).
        for (int irow = irowBase; irow < irowLim; irow += BlockDimX * U, px += BlockDimX * U, pdy += BlockDimX * U)
        {
            ElemType curX[U];
            ElemType curdY[U];
            LoadValues<U>(px, curX);
            LoadValues<U>(pdy, curdY);
#pragma unroll
            for (int k = 0; k < U; k++)
            {
                // Use the register copies (curdY) rather than re-reading pdy[k] from global memory.
                ds[k] += curdY[k] * (curX[k] - mean) * invStdDev;
                db[k] += curdY[k];
            }
        }
    }
    __syncthreads();
    using BlockReduce = cub::BlockReduce<ElemType, BlockDimX, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BlockDimY>;
    // Note: must use separate temp storages for each reduction.
    __shared__ typename BlockReduce::TempStorage tmp1;
    ElemType dsRes = BlockReduce(tmp1).Sum(ds);
    __shared__ typename BlockReduce::TempStorage tmp2;
    ElemType dbRes = BlockReduce(tmp2).Sum(db);
    if (tid == 0)
    {
        dScale[blockIdx.x] = dsRes;
        dBias[blockIdx.x] = dbRes;
    }
}

template <int U>
struct ComputeScaleAndBiasGradients
{
    template <typename ElemType>
    static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, const ElemType* dy,
                     ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
    {
        assert((vectorSize % U) == 0);
        const int BlockDimX = 32 / U;
        const int BlockDimY = 4 * U;
        auto bdim = dim3(BlockDimX, BlockDimY);
        // Create a grid that uses striding in the y-dimension to cover the whole minibatch.
        auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
        kComputeScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
            static_cast<int>(vectorSize), static_cast<int>(batchSize), x, dy, dScale, dBias, saveMean, saveInvStdDev);
    }
};

template <int U>
struct ComputeSpatialScaleAndBiasGradients
{
    template <typename ElemType>
    static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x, const ElemType* dy,
                     ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
    {
        assert((spatialSize % U) == 0);
        assert((vectorSize % spatialSize) == 0);

        const int BlockDimX = 32 / U;
        const int BlockDimY = 4 * U;
        auto bdim = dim3(BlockDimX, BlockDimY);
        // Create a grid that uses striding in the y-dimension to cover the whole minibatch.
        auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
        kComputeSpatialScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
            static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dScale, dBias, saveMean, saveInvStdDev);
    }
};

template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType>
__global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dx,
                                                 const ElemType* bnScale, const ElemType* dScale, const ElemType* dBias,
                                                 const ElemType* saveMean, const ElemType* saveInvStdDev)
{
    static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
    static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
    assert(blockDim.x == BlockDimX);
    assert(blockDim.y == BlockDimY);
    assert(blockDim.z == 1);
    assert(gridDim.z == 1);
    assert((vectorSize % U) == 0);
    assert(Spatial || spatialSize == 1);
    assert(!Spatial || (spatialSize % U) == 0);
    assert((vectorSize % spatialSize) == 0);

    int irowBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
    if (irowBase >= vectorSize)
        return;
    assert(irowBase + U <= vectorSize);
    ElemType scale[U];
    ElemType ds[U];
    ElemType db[U];
    ElemType mean[U];
    ElemType invStdDev[U];
    // REVIEW alexeyk: here we're wasting some bandwidth but this might be ok as it's a one-timer.
    if (Spatial)
    {
#pragma unroll
        for (int k = 0; k < U; k++)
        {
            int imap = (irowBase + k) / spatialSize;
            scale[k] = bnScale[imap];
            ds[k] = dScale[imap];
            db[k] = dBias[imap];
            mean[k] = saveMean[imap];
            invStdDev[k] = saveInvStdDev[imap];
        }
    }
    else
    {
        LoadValues<U>(bnScale + irowBase, scale);
        LoadValues<U>(dScale + irowBase, ds);
        LoadValues<U>(dBias + irowBase, db);
        LoadValues<U>(saveMean + irowBase, mean);
        LoadValues<U>(saveInvStdDev + irowBase, invStdDev);
    }

    int icol = blockIdx.y * BlockDimY + threadIdx.y;
    size_t startOffs = static_cast<size_t>(icol) * vectorSize + irowBase;
    const ElemType* px = x + startOffs;
    const ElemType* pdy = dy + startOffs;
    ElemType* pdx = dx + startOffs;
    size_t stride = static_cast<size_t>(gridDim.y * BlockDimY) * vectorSize;
    for (; icol < batchSize; icol += gridDim.y * BlockDimY, px += stride, pdy += stride, pdx += stride)
    {
        ElemType xCur[U];
        ElemType dyCur[U];
        ElemType dxCur[U];
        LoadValues<U>(px, xCur);
        LoadValues<U>(pdy, dyCur);
        LoadValues<U>(pdx, dxCur);
        // From the BN paper, dL/dxi is a sum of three terms: dL/dxi = t1 + t2 + t3.
        // After simplification, they become the following:
        // 1. t1 = scale * dL/dyi * invStdDev
        // 2. t2 = (-scale / m) * invStdDev * xHat * dL/dScale
        // 3. t3 = (-scale / m) * invStdDev * dL/dBias (for this one note that Sum(xHat) == 0)
        // Simplifying this a bit more, we get the formula below.
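        // Combined (a sketch; m is the number of samples the statistics were computed over):
        //   dL/dx = scale * invStdDev * (dL/dy - (xHat * dL/dScale + dL/dBias) / m)
        // and the result is accumulated on top of the existing dx values (dxCur).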
        ElemType val[U];
        int m = Spatial ? batchSize * spatialSize : batchSize;
#pragma unroll
        for (int k = 0; k < U; k++)
        {
            ElemType xNorm = (xCur[k] - mean[k]) * invStdDev[k];
            val[k] = dxCur[k] + (scale[k] * invStdDev[k]) * (dyCur[k] - (xNorm * ds[k] + db[k]) / m);
        }
        StoreValues<U>(val, pdx);
    }
}

template <int U>
struct BackpropagateBatchNormGradients
{
    template <typename ElemType>
    static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, const ElemType* dy, ElemType* dx,
                     const ElemType* bnScale, const ElemType* dScale, const ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
    {
        assert((vectorSize % U) == 0);
        const int BlockDimX = 32 / U;
        const int BlockDimY = 4 * U;
        auto bdim = dim3(BlockDimX, BlockDimY);
        auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)),
                         static_cast<unsigned int>(RoundUpToMultiple(batchSize, BlockDimY)));
        if (spatial)
        {
            kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true, U><<<gdim, bdim, 0, stream>>>(
                static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, dScale, dBias, saveMean, saveInvStdDev);
        }
        else
        {
            kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, false, U><<<gdim, bdim, 0, stream>>>(
                static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, dScale, dBias, saveMean, saveInvStdDev);
        }
    }
};

} } }

@ -0,0 +1,272 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#pragma once

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <math_constants.h>

namespace Microsoft { namespace MSR { namespace CNTK {

template <typename ElemType>
__global__ void kConvolutionForward(int batchSize, const ElemType* __restrict__ kernel,
                                    const int* mpRowCol, const int* mpRowIwht,
                                    const int* mpRowRun, const int* __restrict__ runs,
                                    const ElemType* __restrict__ src, int srcVecSize,
                                    ElemType* dst, int dstVecSize)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= dstVecSize)
        return;

    src += blockIdx.y * srcVecSize;
    dst += blockIdx.y * dstVecSize;

    for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
    {
        int colBase = mpRowCol[row];
        int ivBase = mpRowIwht[row];
        assert(0 <= colBase && colBase < srcVecSize);

        ElemType sum = 0;
        int i0 = mpRowRun[row];
        int skip = runs[i0++];
        int size = runs[i0++];
        int imask = i0 + size;
        for (int i = 0; i < size; i++)
        {
            if (runs[imask + i] == 0)
                continue;
            int dcol = runs[i0 + i];
            assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
            sum += kernel[ivBase + skip + i] * src[colBase + dcol];
        }
        dst[row] = sum;

        // The sample loop strides by gridDim.y, so the pointers must advance by the same amount
        // (the original blockDim.y stride was inconsistent with the loop increment).
        src += gridDim.y * srcVecSize;
        dst += gridDim.y * dstVecSize;
    }
}

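// Layout of the "runs" block consumed above, inferred from the reads (a sketch, not authoritative):
//   runs[i0]                               : skip - starting offset into this row's kernel weights
//   runs[i0 + 1]                           : size - number of taps in the run
//   runs[i0 + 2 .. i0 + 1 + size]          : dcol deltas applied relative to colBase
//   runs[i0 + 2 + size .. i0 + 1 + 2*size] : 0/1 mask; zero entries (e.g. padding) are skipped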
template <typename ElemType>
__global__ void kConvolutionBackwardData(int batchSize, const ElemType* __restrict__ kernel,
                                         const int* mpRowCol, const int* mpRowIwht,
                                         const int* mpRowRun, const int* __restrict__ runs,
                                         const ElemType* __restrict__ srcGrad, int srcVecSize,
                                         ElemType* grad, int dstVecSize)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= srcVecSize)
        return;

    srcGrad += blockIdx.y * srcVecSize;
    grad += blockIdx.y * dstVecSize;

    for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
    {
        int colBase = mpRowCol[row];
        int ivBase = mpRowIwht[row];
        assert(0 <= colBase && colBase < dstVecSize);

        ElemType g = srcGrad[row];
        int i0 = mpRowRun[row];
        int skip = runs[i0++];
        int size = runs[i0++];
        int imask = i0 + size;
        for (int i = 0; i < size; i++)
        {
            if (runs[imask + i] == 0)
                continue;
            int dcol = runs[i0 + i];
            assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
            atomicAdd(&grad[colBase + dcol], g * kernel[ivBase + skip + i]);
        }

        srcGrad += gridDim.y * srcVecSize;
        grad += gridDim.y * dstVecSize;
    }
}

template <typename ElemType>
__global__ void kConvolutionBackwardKernel(int batchSize, int inVecSize, int outVecSize,
                                           const ElemType* __restrict__ in,
                                           const int* mpRowCol, const int* mpRowIwht,
                                           const int* mpRowRun, const int* __restrict__ runs,
                                           const ElemType* __restrict__ srcGrad,
                                           ElemType* kernelGrad)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= outVecSize)
        return;

    in += blockIdx.y * inVecSize;
    srcGrad += blockIdx.y * outVecSize;

    for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
    {
        int colBase = mpRowCol[row];
        int ivBase = mpRowIwht[row];
        assert(0 <= colBase && colBase < inVecSize);

        ElemType g = srcGrad[row];
        int i0 = mpRowRun[row];
        int skip = runs[i0++];
        int size = runs[i0++];
        int imask = i0 + size;
        for (int i = 0; i < size; i++)
        {
            if (runs[imask + i] == 0)
                continue;
            int dcol = runs[i0 + i];
            assert(0 <= colBase + dcol && colBase + dcol < inVecSize);
            atomicAdd(&kernelGrad[ivBase + skip + i], g * in[colBase + dcol]);
        }

        in += gridDim.y * inVecSize;
        srcGrad += gridDim.y * outVecSize;
    }
}

template <typename ElemType>
__global__ void kMaxPoolingForward(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
                                   const ElemType* __restrict__ src, int srcVecSize,
                                   ElemType* dst, int dstVecSize)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= dstVecSize)
        return;

    src += blockIdx.y * srcVecSize;
    dst += blockIdx.y * dstVecSize;

    for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
    {
        int colBase = mpRowCol[row];
        assert(0 <= colBase && colBase < srcVecSize);

        int i0 = mpRowIndices[row];
        int size = indices[i0++];
        ElemType res = src[colBase + indices[i0]];
        for (int i = 1; i < size; i++)
        {
            int dcol = indices[i0 + i];
            assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
            res = max(res, src[colBase + dcol]);
        }
        dst[row] = res;

        src += gridDim.y * srcVecSize;
        dst += gridDim.y * dstVecSize;
    }
}

template <typename ElemType>
__global__ void kMaxPoolingBackward(int batchSize, const ElemType* out, const ElemType* in,
                                    const int* mpRowCol, const int* mpRowIndices, const int* indices,
                                    const ElemType* __restrict__ srcGrad, int srcVecSize,
                                    ElemType* grad, int dstVecSize)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= srcVecSize)
        return;

    in += blockIdx.y * dstVecSize;
    out += blockIdx.y * srcVecSize;
    srcGrad += blockIdx.y * srcVecSize;
    grad += blockIdx.y * dstVecSize;

    for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
    {
        int colBase = mpRowCol[row];
        assert(0 <= colBase && colBase < dstVecSize);

        int i0 = mpRowIndices[row];
        int size = indices[i0++];
        assert(size > 0);
        ElemType g = srcGrad[row];
        ElemType m = out[row];
        for (int i = 0; i < size; i++)
        {
            int dcol = indices[i0 + i];
            assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
            if (in[colBase + dcol] >= m)
                atomicAdd(&grad[colBase + dcol], g);
        }

        in += gridDim.y * dstVecSize;
        out += gridDim.y * srcVecSize;
        srcGrad += gridDim.y * srcVecSize;
        grad += gridDim.y * dstVecSize;
    }
}

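// Note on ties (an observation about the code above, not a behavior change): the comparison
// in[colBase + dcol] >= m routes the gradient to every input position equal to the maximum,
// so tied maxima each receive the full gradient g via atomicAdd.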
template <typename ElemType>
__global__ void kAveragePoolingForward(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
                                       const ElemType* __restrict__ src, int srcVecSize,
                                       ElemType* dst, int dstVecSize)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= dstVecSize)
        return;

    src += blockIdx.y * srcVecSize;
    dst += blockIdx.y * dstVecSize;

    for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
    {
        int colBase = mpRowCol[row];
        assert(0 <= colBase && colBase < srcVecSize);

        int i0 = mpRowIndices[row];
        int size = indices[i0++];
        ElemType sum = 0;
        for (int i = 0; i < size; i++)
        {
            int dcol = indices[i0 + i];
            assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
            sum += src[colBase + dcol];
        }
        dst[row] = sum / size;

        src += gridDim.y * srcVecSize;
        dst += gridDim.y * dstVecSize;
    }
}

template <typename ElemType>
__global__ void kAveragePoolingBackward(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
                                        const ElemType* __restrict__ srcGrad, int srcVecSize,
                                        ElemType* grad, int dstVecSize)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= srcVecSize)
        return;

    srcGrad += blockIdx.y * srcVecSize;
    grad += blockIdx.y * dstVecSize;

    for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
    {
        int colBase = mpRowCol[row];
        assert(0 <= colBase && colBase < dstVecSize);

        int i0 = mpRowIndices[row];
        int size = indices[i0++];
        assert(size > 0);
        ElemType g = srcGrad[row] / size;
        for (int i = 0; i < size; i++)
        {
            int dcol = indices[i0 + i];
            assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
            atomicAdd(&grad[colBase + dcol], g);
        }

        srcGrad += gridDim.y * srcVecSize;
        grad += gridDim.y * dstVecSize;
    }
}

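// The backward pass above divides g by the pool size once, so each of the "size" contributing
// input positions receives an equal share of the output gradient, matching the forward average.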
} } }

@ -5,242 +5,295 @@

#include "stdafx.h"
#include "ConvolutionEngine.h"
#include "CuDnnConvolutionEngine.h"
#include "CuDnnFactories.h"

namespace Microsoft { namespace MSR { namespace CNTK {

template <class ElemType>
void ConvolutionEngine<ElemType>::Forward(const Tensor4D& inT, const Mat& in, const Filter& filterT, const Mat& filter,
                                          const ConvDesc& convDesc, const Tensor4D& outT, Mat& out, Mat& workspace)
void ConvolutionEngine<ElemType>::Forward(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace)
{
    assert(inT.w() * inT.h() * inT.c() == in.GetNumRows());
    assert(inT.n() == in.GetNumCols());
    assert(filterT.k() == filter.GetNumRows());
    assert(filterT.w() * filterT.h() * filterT.c() == filter.GetNumCols());
    assert(inT.c() == filterT.c());
    assert(outT.c() == filterT.k());
    assert(outT.w() * outT.h() * outT.c() == out.GetNumRows());
    assert(outT.n() == out.GetNumCols());

    EnsureCompatible();
    ForwardCore(inT, in, filterT, filter, convDesc, outT, out, workspace);
}

template <class ElemType>
void ConvolutionEngine<ElemType>::BackwardData(const Tensor4D& srcGradT, const Mat& srcGrad, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
                                               const Tensor4D& gradT, Mat& grad, Mat& workspace)
{
    assert(srcGradT.w() * srcGradT.h() * srcGradT.c() == srcGrad.GetNumRows());
    assert(srcGradT.n() == srcGrad.GetNumCols());
    assert(filterT.k() == filter.GetNumRows());
    assert(filterT.w() * filterT.h() * filterT.c() == filter.GetNumCols());
    assert(srcGradT.c() == filterT.k());
    assert(gradT.c() == filterT.c());
    assert(gradT.w() * gradT.h() * gradT.c() == grad.GetNumRows());
    assert(gradT.n() == grad.GetNumCols());

    EnsureCompatible();
    BackwardDataCore(srcGradT, srcGrad, filterT, filter, convDesc, gradT, grad, workspace);
}

template <class ElemType>
void ConvolutionEngine<ElemType>::BackwardFilter(const Tensor4D& srcGradT, const Mat& srcGrad, const Tensor4D& inT, const Mat& in, const ConvDesc& convDesc,
                                                 const Filter& filterT, Mat& filter, bool allowReuse, Mat& workspace)
{
    assert(srcGradT.w() * srcGradT.h() * srcGradT.c() == srcGrad.GetNumRows());
    assert(srcGradT.n() == srcGrad.GetNumCols());
    assert(inT.w() * inT.h() * inT.c() == in.GetNumRows());
    assert(inT.n() == in.GetNumCols());
    assert(srcGradT.c() == filterT.k());
    assert(inT.c() == filterT.c());
    assert(filterT.k() == filter.GetNumRows());
    assert(filterT.w() * filterT.h() * filterT.c() == filter.GetNumCols());

    EnsureCompatible();
    BackwardFilterCore(srcGradT, srcGrad, inT, in, convDesc, filterT, filter, allowReuse, workspace);
}

template <class ElemType>
void ConvolutionEngine<ElemType>::NormalizeBatch(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
                                                 bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out,
                                                 double epsilon, Mat& saveMean, Mat& saveInvStdDev)
{
    const size_t crowIn = inT.w() * inT.h() * inT.c();
    if (spatial)
    {
        assert(scaleBiasT.c() == inT.c());
        assert(scaleBiasT.w() == 1);
        assert(scaleBiasT.h() == 1);
        assert(runMean.GetNumRows() == inT.c());
        assert(runInvStdDev.GetNumRows() == inT.c());
    }
    else
    {
        assert(scaleBiasT.c() == inT.c());
        assert(scaleBiasT.w() == inT.w());
        assert(scaleBiasT.h() == inT.h());
        assert(runMean.GetNumRows() == crowIn);
        assert(runInvStdDev.GetNumRows() == crowIn);
    }
    assert(scaleBiasT.n() == 1);
    assert(crowIn == in.GetNumRows());
    assert(crowIn == out.GetNumRows());
    assert(inT.n() == in.GetNumCols());
    assert(inT.n() == out.GetNumCols());
    assert(bias.GetNumCols() == 1);
    assert(scale.GetNumCols() == 1);
    assert(runMean.GetNumCols() == 1);
    assert(runInvStdDev.GetNumCols() == 1);
    assert(runMean.GetNumCols() == saveMean.GetNumCols());
    assert(runMean.GetNumRows() == saveMean.GetNumRows());
    assert(runInvStdDev.GetNumCols() == saveInvStdDev.GetNumCols());
    assert(runInvStdDev.GetNumRows() == saveInvStdDev.GetNumRows());

#ifndef _DEBUG
    UNUSED(crowIn); // crowIn used only in asserts.
#endif
    const auto& g = *m_geometry;
    assert(g.InputShape().GetNumElements() == in.GetNumRows());
    assert(g.OutputShape().GetNumElements() == out.GetNumRows());
    size_t batchSize = in.GetNumCols();
    assert(batchSize == out.GetNumCols());
    // REVIEW alexeyk: add shape-aware asserts?
    assert(g.KernelShape().GetNumElements() * g.KernelCount() == kernel.GetNumElements());
#ifdef NDEBUG
    UNUSED(g);
    UNUSED(batchSize);
#endif

    EnsureCompatibleBatchNorm(spatial);
    NormalizeBatchCore(inT, in, scaleBiasT, scale, bias, spatial, expAvgFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev);
    EnsureCompatible();
    EnsureConvolutionInitialized();
    ForwardCore(in, kernel, out, workspace);
}

template <class ElemType>
void ConvolutionEngine<ElemType>::NormalizeBatchInference(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
                                                          bool spatial, const Mat& runMean, const Mat& runInvStdDev, Mat& out)
void ConvolutionEngine<ElemType>::BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace)
{
    const size_t crowIn = inT.w() * inT.h() * inT.c();

    if (spatial)
    {
        assert(scaleBiasT.c() == inT.c());
        assert(scaleBiasT.w() == 1);
        assert(scaleBiasT.h() == 1);
        assert(scaleBiasT.c() == runMean.GetNumRows());
        assert(scaleBiasT.c() == runInvStdDev.GetNumRows());
    }
    else
    {
        assert(scaleBiasT.c() == inT.c());
        assert(scaleBiasT.w() == inT.w());
        assert(scaleBiasT.h() == inT.h());
        assert(crowIn == runMean.GetNumRows());
        assert(crowIn == runInvStdDev.GetNumRows());
    }
    assert(scaleBiasT.n() == 1);
    assert(crowIn == in.GetNumRows());
    assert(crowIn == out.GetNumRows());
    assert(inT.n() == in.GetNumCols());
    assert(inT.n() == out.GetNumCols());
    assert(bias.GetNumCols() == 1);
    assert(scale.GetNumCols() == 1);
    assert(runMean.GetNumCols() == 1);
    assert(runInvStdDev.GetNumCols() == 1);
#ifndef _DEBUG
    // used only in asserts.
    UNUSED(crowIn);
#endif
    const auto& g = *m_geometry;
    assert(g.InputShape().GetNumElements() == grad.GetNumRows());
    assert(g.OutputShape().GetNumElements() == srcGrad.GetNumRows());
    size_t batchSize = srcGrad.GetNumCols();
    assert(batchSize == grad.GetNumCols());
    assert(g.KernelShape().GetNumElements() * g.KernelCount() == kernel.GetNumElements());
#ifdef NDEBUG
    UNUSED(g);
    UNUSED(batchSize);
#endif

    EnsureCompatibleBatchNorm(spatial);
    NormalizeBatchInferenceCore(inT, in, scaleBiasT, scale, bias, spatial, runMean, runInvStdDev, out);
    EnsureCompatible();
    EnsureConvolutionInitialized();
    BackwardDataCore(srcGrad, kernel, grad, workspace);
}

template <class ElemType>
void ConvolutionEngine<ElemType>::BackwardNormalizeBatch(const Tensor4D& inT, const Mat& in, const Mat& srcGrad, Mat& grad,
                                                         const Tensor4D& scaleBiasT, const Mat& scale, bool spatial, const Mat& saveMean, const Mat& saveInvStdDev,
                                                         Mat& scaleGrad, Mat& biasGrad)
void ConvolutionEngine<ElemType>::BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernel, bool allowReuse, Mat& workspace)
{
    const size_t crowIn = inT.w() * inT.h() * inT.c();

    if (spatial)
    {
        assert(scaleBiasT.c() == inT.c());
        assert(scaleBiasT.w() == 1);
        assert(scaleBiasT.h() == 1);
    }
    else
    {
        assert(scaleBiasT.c() == inT.c());
        assert(scaleBiasT.w() == inT.w());
        assert(scaleBiasT.h() == inT.h());
    }
    assert(scaleBiasT.n() == 1);
    assert(crowIn == in.GetNumRows());
    assert(crowIn == srcGrad.GetNumRows());
    assert(crowIn == grad.GetNumRows());
    assert(inT.n() == in.GetNumCols());
    assert(inT.n() == srcGrad.GetNumCols());
    assert(inT.n() == grad.GetNumCols());
    assert(scaleGrad.GetNumRows() == scale.GetNumRows());
    assert(scaleGrad.GetNumCols() == scale.GetNumCols());
    assert(biasGrad.GetNumRows() == scale.GetNumRows());
    assert(biasGrad.GetNumCols() == scale.GetNumCols());
#ifndef _DEBUG
    UNUSED(crowIn); // crowIn used only in asserts.
#endif
    const auto& g = *m_geometry;
    assert(g.InputShape().GetNumElements() == in.GetNumRows());
    assert(g.OutputShape().GetNumElements() == srcGrad.GetNumRows());
    size_t batchSize = in.GetNumCols();
    assert(batchSize == srcGrad.GetNumCols());
    assert(g.KernelShape().GetNumElements() * g.KernelCount() == kernel.GetNumElements());
#ifdef NDEBUG
    UNUSED(g);
    UNUSED(batchSize);
#endif

    EnsureCompatibleBatchNorm(spatial);
    BackwardNormalizeBatchCore(inT, in, srcGrad, grad, scaleBiasT, scale, spatial, saveMean, saveInvStdDev, scaleGrad, biasGrad);
    EnsureCompatible();
    EnsureConvolutionInitialized();
    BackwardKernelCore(srcGrad, in, kernel, allowReuse, workspace);
}

template <class ElemType>
void ConvolutionEngine<ElemType>::ForwardPooling(const Mat& in, Mat& out)
{
    const auto& g = *m_geometry;
    assert(g.InputShape().GetNumElements() == in.GetNumRows());
    assert(g.OutputShape().GetNumElements() == out.GetNumRows());
    size_t batchSize = in.GetNumCols();
    assert(batchSize == out.GetNumCols());
#ifdef NDEBUG
    UNUSED(g);
    UNUSED(batchSize);
#endif

    EnsureCompatible();
    EnsurePoolingInitialized();
    ForwardPoolingCore(in, out);
}

template <class ElemType>
void ConvolutionEngine<ElemType>::BackwardPooling(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad)
{
    const auto& g = *m_geometry;
    assert(g.InputShape().GetNumElements() == grad.GetNumRows());
    assert(g.InputShape().GetNumElements() == in.GetNumRows());
    assert(g.OutputShape().GetNumElements() == srcGrad.GetNumRows());
    assert(g.OutputShape().GetNumElements() == out.GetNumRows());
    size_t batchSize = out.GetNumCols();
    assert(batchSize == srcGrad.GetNumCols());
    assert(batchSize == in.GetNumCols());
    assert(batchSize == grad.GetNumCols());
#ifdef NDEBUG
    UNUSED(g);
    UNUSED(batchSize);
#endif

    EnsureCompatible();
    EnsurePoolingInitialized();
    BackwardPoolingCore(out, srcGrad, in, grad);
}

//------------------------------------------------------------------
// Default (legacy) convolution engine implementation.
// Reference convolution engine implementation.
// This engine supports arbitrary convolution geometry but does not provide an efficient implementation.
// Its main purpose is to serve as a baseline for optimized engines (e.g. cuDNN) that
// usually implement only a subset of a general convolution geometry.
//------------------------------------------------------------------
template <class ElemType>
class DefaultConvolutionEngine : public ConvolutionEngine<ElemType>
class ReferenceConvolutionEngine : public ConvolutionEngine<ElemType>
{
public:
    using Base = ConvolutionEngine<ElemType>;
    using typename Base::Mat;
    using typename Base::Tensor4D;
    using typename Base::Filter;
    using typename Base::ConvDesc;

public:
    DefaultConvolutionEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl)
        : Base(deviceId, imageLayout), m_ones(deviceId), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_bnImpl(bnImpl)
    ReferenceConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind)
        : Base(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind),
          m_mpRowCol(geometry->MpRowCol().size(), 1, const_cast<int*>(geometry->MpRowCol().data()), deviceId, IsGpu(deviceId) ? matrixFlagNormal : matrixFlagDontOwnBuffer)
    {
    }

protected:
    using Base::m_geometry;
    using Base::m_deviceId;
    using Base::m_imageLayout;
    using Base::m_maxTempMemSizeInSamples;
    using Base::m_poolKind;

    void EnsureCompatible() override
    {
        if (m_imageLayout != ImageLayoutKind::CHW)
            RuntimeError("Reference convolution engine supports only CHW/cudnn layout.");
    }

    void EnsureConvolutionInitialized() override
    {
        if (m_mpRowIwht == nullptr)
        {
            auto flags = IsGpu(m_deviceId) ? matrixFlagNormal : matrixFlagDontOwnBuffer;
            m_mpRowIwht = std::make_unique<Matrix<int>>(m_geometry->MpRowIwht().size(), 1,
                                                        const_cast<int*>(m_geometry->MpRowIwht().data()), m_deviceId, flags);
            m_mpRowRun = std::make_unique<Matrix<int>>(m_geometry->MpRowRun().size(), 1,
                                                       const_cast<int*>(m_geometry->MpRowRun().data()), m_deviceId, flags);
            m_runs = std::make_unique<Matrix<int>>(m_geometry->Runs().size(), 1,
                                                   const_cast<int*>(m_geometry->Runs().data()), m_deviceId, flags);
        }
    }

    void ForwardCore(const Mat& in, const Mat& kernel, Mat& out, Mat& /*workspace*/) override
    {
        in.ConvolutionForward(kernel, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, out);
    }

    void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& /*workspace*/) override
    {
        srcGrad.ConvolutionBackwardData(kernel, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, grad);
    }

    void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& /*workspace*/) override
    {
        srcGrad.ConvolutionBackwardKernel(in, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, kernelGrad);
    }

    void EnsurePoolingInitialized() override
    {
        if (m_indices == nullptr)
        {
            auto flags = IsGpu(m_deviceId) ? matrixFlagNormal : matrixFlagDontOwnBuffer;
            m_mpRowIndices = std::make_unique<Matrix<int>>(m_geometry->MpRowIndices().size(), 1,
                                                           const_cast<int*>(m_geometry->MpRowIndices().data()), m_deviceId, flags);
            m_indices = std::make_unique<Matrix<int>>(m_geometry->Indices().size(), 1,
                                                      const_cast<int*>(m_geometry->Indices().data()), m_deviceId, flags);
        }
    }

void ForwardPoolingCore(const Mat& in, Mat& out) override
|
||||
{
|
||||
if (m_poolKind == PoolKind::Max)
|
||||
{
|
||||
in.MaxPoolingForward(m_mpRowCol, *m_mpRowIndices, *m_indices, out);
|
||||
}
|
||||
else if (m_poolKind == PoolKind::Average)
|
||||
{
|
||||
in.AveragePoolingForward(m_mpRowCol, *m_mpRowIndices, *m_indices, out);
|
||||
}
|
||||
else
|
||||
InvalidArgument("Pooling type %d is not supported.", (int)m_poolKind);
|
||||
|
||||
}
|
||||
|
||||
void BackwardPoolingCore(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad) override
|
||||
{
|
||||
if (m_poolKind == PoolKind::Max)
|
||||
{
|
||||
srcGrad.MaxPoolingBackward(out, in, m_mpRowCol, *m_mpRowIndices, *m_indices, grad);
|
||||
}
|
||||
else if (m_poolKind == PoolKind::Average)
|
||||
{
|
||||
srcGrad.AveragePoolingBackward(m_mpRowCol, *m_mpRowIndices, *m_indices, grad);
|
||||
}
|
||||
else
|
||||
InvalidArgument("Pooling type %d is not supported.", (int)m_poolKind);
|
||||
}
|
||||
|
||||
private:
|
||||
static bool IsGpu(DEVICEID_TYPE deviceId)
|
||||
{
|
||||
return deviceId >= 0;
|
||||
}
|
||||
|
||||
private:
|
||||
using IntMatPtr = std::unique_ptr<Matrix<int>>;
|
||||
|
||||
Matrix<int> m_mpRowCol;
|
||||
// Convolution-specific maps.
|
||||
IntMatPtr m_mpRowIwht;
|
||||
IntMatPtr m_mpRowRun;
|
||||
IntMatPtr m_runs;
|
||||
// Pooling-specific maps.
|
||||
IntMatPtr m_mpRowIndices;
|
||||
IntMatPtr m_indices;
|
||||
};
|
||||
|
||||
//------------------------------------------------------------------
|
||||
// Legacy convolution engine implementation.
|
||||
//------------------------------------------------------------------
|
||||
template <class ElemType>
|
||||
class LegacyConvolutionEngine : public ConvolutionEngine<ElemType>
|
||||
{
|
||||
public:
|
||||
using Base = ConvolutionEngine<ElemType>;
|
||||
using typename Base::Mat;
|
||||
|
||||
public:
|
||||
LegacyConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind)
|
||||
: Base(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind),
|
||||
m_inT(m_geometry->InputShape(), ImageLayoutKind::CHW), m_outT(m_geometry->OutputShape(), ImageLayoutKind::CHW),
|
||||
m_kernelT(m_geometry->KernelShape(), ImageLayoutKind::CHW), m_strideT(m_geometry->Stride(), ImageLayoutKind::CHW)
|
||||
{
|
||||
m_padding = m_geometry->AutoPad()[0];
|
||||
}
|
||||
|
||||
protected:
|
||||
using Base::m_geometry;
|
||||
using Base::m_deviceId;
|
||||
using Base::m_imageLayout;
|
||||
using Base::m_maxTempMemSizeInSamples;
|
||||
using Base::m_poolKind;
|
||||
|
||||
void EnsureCompatible() override
|
||||
{
|
||||
if (m_imageLayout != ImageLayoutKind::HWC)
|
||||
RuntimeError("Default convolution engine currently supports only HWC/legacy layout.");
|
||||
RuntimeError("Legacy convolution engine supports only HWC/legacy layout.");
|
||||
}
|
||||
|
||||
void ForwardCore(const Tensor4D& inT, const Mat& in, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
|
||||
const Tensor4D& outT, Mat& out, Mat& workspace) override
|
||||
void EnsureConvolutionInitialized() override
|
||||
{
|
||||
size_t packedInputRows = filterT.w() * filterT.h() * filterT.c();
|
||||
size_t packedInputColsPerSample = outT.w() * outT.h();
|
||||
}
|
||||
|
||||
void ForwardCore(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace) override
|
||||
{
|
||||
size_t batchSize = in.GetNumCols();
|
||||
size_t packedInputRows = m_kernelT.w() * m_kernelT.h() * m_kernelT.c();
|
||||
size_t packedInputColsPerSample = m_outT.w() * m_outT.h();
|
||||
size_t outputSizePerChannel = packedInputColsPerSample;
|
||||
// size_t packedInputDim = packedInputRows * packedInputColsPerSample; // size of each packed input sample
|
||||
// size_t inputDim = inT.w() * inT.h() * inT.c(); // size of each input sample
|
||||
|
||||
size_t batchSize = inT.n();
|
||||
size_t maxTempMemSizeInSamples = (m_maxTempMemSizeInSamples == 0 ? batchSize : m_maxTempMemSizeInSamples);
|
||||
|
||||
assert(filter.GetNumCols() == packedInputRows && filter.GetNumRows() == outT.c());
|
||||
assert(kernel.GetNumCols() == packedInputRows && kernel.GetNumRows() == m_outT.c());
|
||||
UNUSED(packedInputRows);
|
||||
|
||||
// GPU and 1-dimensional image
|
||||
m_gpuSparseOpt = (filterT.h() == 1 &&
|
||||
m_gpuSparseOpt = (m_kernelT.h() == 1 &&
|
||||
in.GetCurrentMatrixLocation() == CurrentDataLocation::GPU &&
|
||||
convDesc.wStride() == 1 &&
|
||||
!convDesc.padding() &&
|
||||
m_strideT.w() == 1 &&
|
||||
!m_padding &&
|
||||
in.GetMatrixType() == MatrixType::SPARSE);
|
||||
m_gpuSparse1D = (m_gpuSparseOpt && inT.h() == 1);
|
||||
m_gpuSparse1D = (m_gpuSparseOpt && m_inT.h() == 1);
|
||||
|
||||
out.SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, false);
|
||||
|
||||
// Reshaping is only necessary if we are going to use the unpacking trick
|
||||
if (m_gpuSparseOpt)
|
||||
out.Reshape(outT.c() * outT.w(), outT.h() * batchSize);
|
||||
out.Reshape(m_outT.c() * m_outT.w(), m_outT.h() * batchSize);
|
||||
else
|
||||
out.Reshape(outT.c(), outputSizePerChannel * batchSize);
|
||||
out.Reshape(m_outT.c(), outputSizePerChannel * batchSize);
|
||||
|
||||
size_t subBatchSize = min(batchSize, maxTempMemSizeInSamples);
|
||||
size_t numSubBatches = (batchSize + subBatchSize - 1) / subBatchSize;
|
||||
|
@ -263,53 +316,51 @@ protected:
|
|||
|
||||
if (m_gpuSparseOpt)
|
||||
{
|
||||
if (filterT.w() * inT.c() != filter.GetNumCols())
|
||||
if (m_kernelT.w() * m_inT.c() != kernel.GetNumCols())
|
||||
LogicError("Kernel width and weight matrix dimensions don't match.");
|
||||
|
||||
inputSubBatch.Reshape(inT.c() * inT.w(), inT.h() * smallBatchSize);
|
||||
Mat outputSubBatch = out.ColumnSlice(startSampleId, outT.h() * smallBatchSize);
|
||||
Mat::ConvolveAndWeightedAdd(1, filter, false, inputSubBatch, false, 0, outputSubBatch,
|
||||
static_cast<int>(inT.c()), convDesc.wStride(), convDesc.padding(), true);
|
||||
inputSubBatch.Reshape(m_inT.c() * m_inT.w(), m_inT.h() * smallBatchSize);
|
||||
Mat outputSubBatch = out.ColumnSlice(startSampleId, m_outT.h() * smallBatchSize);
|
||||
Mat::ConvolveAndWeightedAdd(1, kernel, false, inputSubBatch, false, 0, outputSubBatch,
|
||||
static_cast<int>(m_inT.c()), m_strideT.w(), m_padding, true);
|
||||
}
|
||||
else
|
||||
{
|
||||
inputSubBatch.SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, true);
|
||||
workspace.AssignPackedConvolutionInput(inputSubBatch,
|
||||
inT.w(), inT.h(), inT.c(),
|
||||
outT.w(), outT.h(), outT.c(),
|
||||
filterT.w(), filterT.h(), convDesc.wStride(), convDesc.hStride(),
|
||||
convDesc.padding());
|
||||
m_inT.w(), m_inT.h(), m_inT.c(),
|
||||
m_outT.w(), m_outT.h(), m_outT.c(),
|
||||
m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h(),
|
||||
m_padding);
|
||||
|
||||
Mat outputSubBatch = out.ColumnSlice(outputSizePerChannel * startSampleId, outputSizePerChannel * smallBatchSize);
|
||||
|
||||
// workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
|
||||
// BUGBUG: This ^^ destroys the content of the matrix. Also it seems not to change the size. Does it? Should this be a Reshape()?
|
||||
Mat::Multiply(filter, false, workspace, false, outputSubBatch);
|
||||
Mat::Multiply(kernel, false, workspace, false, outputSubBatch);
|
||||
}
|
||||
}
|
||||
|
||||
out.Reshape(outT.c() * outputSizePerChannel, batchSize); // each sample becomes a column
|
||||
out.Reshape(m_outT.c() * outputSizePerChannel, batchSize); // each sample becomes a column
|
||||
|
||||
assert(outT.w() * outT.h() * outT.c() == out.GetNumRows());
|
||||
assert(outT.n() == out.GetNumCols());
|
||||
assert(m_outT.w() * m_outT.h() * m_outT.c() == out.GetNumRows());
|
||||
assert(batchSize == out.GetNumCols());
|
||||
}
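
    // A minimal, self-contained sketch of the im2col + GEMM scheme that ForwardCore above
    // implements via AssignPackedConvolutionInput and Mat::Multiply (hypothetical helper:
    // one sample, one input and one output channel, stride 1, no padding; assumes <vector>
    // is available through the existing includes):
    static void Im2ColGemmSketch(const std::vector<float>& in, size_t inW, size_t inH,
                                 const std::vector<float>& kernel, size_t kW, size_t kH,
                                 std::vector<float>& out)
    {
        size_t outW = inW - kW + 1, outH = inH - kH + 1;
        // Pack: each output position becomes one column of kW * kH input values ("im2col").
        std::vector<float> packed(kW * kH * outW * outH);
        for (size_t oy = 0; oy < outH; oy++)
            for (size_t ox = 0; ox < outW; ox++)
                for (size_t ky = 0; ky < kH; ky++)
                    for (size_t kx = 0; kx < kW; kx++)
                        packed[(ky * kW + kx) * (outW * outH) + oy * outW + ox] = in[(oy + ky) * inW + (ox + kx)];
        // GEMM: out[1 x outW*outH] = kernel[1 x kW*kH] * packed[kW*kH x outW*outH].
        out.assign(outW * outH, 0.0f);
        for (size_t c = 0; c < outW * outH; c++)
            for (size_t r = 0; r < kW * kH; r++)
                out[c] += kernel[r] * packed[r * (outW * outH) + c];
    }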

    void BackwardDataCore(const Tensor4D& srcGradT, const Mat& srcGrad, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
                          const Tensor4D& gradT, Mat& grad, Mat& workspace) override
    void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
    {
        size_t packedInputRows = filterT.w() * filterT.h() * filterT.c();
        size_t packedInputColsPerSample = srcGradT.w() * srcGradT.h();
        size_t batchSize = srcGrad.GetNumCols();
        size_t packedInputRows = m_kernelT.w() * m_kernelT.h() * m_kernelT.c();
        size_t packedInputColsPerSample = m_outT.w() * m_outT.h();
        size_t outputSizePerChannel = packedInputColsPerSample;
        // size_t packedInputDim = packedInputRows * packedInputColsPerSample; // size of each packed input sample
        // size_t inputDim = gradT.w() * gradT.h() * gradT.c(); // size of each input sample

        size_t batchSize = srcGradT.n();
        // size_t inputDim = m_inT.w() * m_inT.h() * m_inT.c(); // size of each input sample

        size_t maxTempMemSizeInSamples = (m_maxTempMemSizeInSamples == 0 ? batchSize : m_maxTempMemSizeInSamples);

        // Create slice which is the same as full matrix so we can reshape it.
        Matrix<ElemType> srcGradTmp = srcGrad.ColumnSlice(0, srcGrad.GetNumCols());
        srcGradTmp.Reshape(srcGradT.c(), outputSizePerChannel * batchSize); // reshape to match the internal operation
        srcGradTmp.Reshape(m_outT.c(), outputSizePerChannel * batchSize); // reshape to match the internal operation

        size_t subBatchSize = min(batchSize, maxTempMemSizeInSamples);
        size_t numSubBatches = (batchSize + subBatchSize - 1) / subBatchSize;

@ -322,31 +373,29 @@ protected:

            workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
            Matrix<ElemType> outputGradientSubBatch = srcGradTmp.ColumnSlice(startSampleId * outputSizePerChannel, smallBatchSize * outputSizePerChannel);
            Matrix<ElemType>::Multiply(filter, true, outputGradientSubBatch, false, workspace);
            Matrix<ElemType>::Multiply(kernel, true, outputGradientSubBatch, false, workspace);

            Matrix<ElemType> inputGradientSubBatch = grad.ColumnSlice(startSampleId, smallBatchSize);
            workspace.UnpackConvolutionInput(inputGradientSubBatch,
                                             gradT.w(), gradT.h(), gradT.c(),
                                             srcGradT.w(), srcGradT.h(), srcGradT.c(),
                                             filterT.w(), filterT.h(), convDesc.wStride(), convDesc.hStride(),
                                             convDesc.padding());
                                             m_inT.w(), m_inT.h(), m_inT.c(),
                                             m_outT.w(), m_outT.h(), m_outT.c(),
                                             m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h(),
                                             m_padding);
        }

        assert(srcGradT.w() * srcGradT.h() * srcGradT.c() == srcGrad.GetNumRows());
        assert(srcGradT.n() == srcGrad.GetNumCols());
        assert(m_outT.w() * m_outT.h() * m_outT.c() == srcGrad.GetNumRows());
        assert(batchSize == srcGrad.GetNumCols());
    }

    void BackwardFilterCore(const Tensor4D& srcGradT, const Mat& srcGrad, const Tensor4D& inT, const Mat& in, const ConvDesc& convDesc,
                            const Filter& filterT, Mat& filter, bool allowReuse, Mat& workspace) override
    void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace) override
    {
        size_t packedInputRows = filterT.w() * filterT.h() * filterT.c();
        size_t packedInputColsPerSample = srcGradT.w() * srcGradT.h();
        size_t batchSize = in.GetNumCols();
        size_t packedInputRows = m_kernelT.w() * m_kernelT.h() * m_kernelT.c();
        size_t packedInputColsPerSample = m_outT.w() * m_outT.h();
        size_t outputSizePerChannel = packedInputColsPerSample;
        // size_t packedInputDim = packedInputRows * packedInputColsPerSample; // size of each packed input sample
        // size_t inputDim = m_inputImageLayout.width * m_inputImageLayout.height * m_inputImageLayout.channels; // size of each input sample

        size_t batchSize = inT.n();

        size_t maxTempMemSizeInSamples = (m_maxTempMemSizeInSamples == 0 ? batchSize : m_maxTempMemSizeInSamples);

        // const Matrix<ElemType> & weightMatrix = input0;

@ -354,14 +403,14 @@ protected:

        // Create slice which is the same as full matrix so we can reshape it.
        Matrix<ElemType> srcGradTmp = srcGrad.ColumnSlice(0, srcGrad.GetNumCols());
        srcGradTmp.Reshape(srcGradT.c(), outputSizePerChannel * batchSize); // reshape to match the internal operation
        srcGradTmp.Reshape(m_outT.c(), outputSizePerChannel * batchSize); // reshape to match the internal operation

        size_t subBatchSize = min(batchSize, maxTempMemSizeInSamples);
        size_t numSubBatches = (batchSize + subBatchSize - 1) / subBatchSize;

        if (numSubBatches == 1 && allowReuse && !m_gpuSparseOpt) // reuse packed input from evaluation step if it's not changed by either subbatch or recurrent steps.
            // REVIEW alexeyk: the following makes an assumption that data in workspace was filled by Forward call and remained unchanged. Find way to enforce/verify that.
            Matrix<ElemType>::MultiplyAndAdd(srcGradTmp, false, workspace, true, filter);
            Matrix<ElemType>::MultiplyAndAdd(srcGradTmp, false, workspace, true, kernelGrad);
        else
        {
            for (size_t i = 0; i < numSubBatches; i++)

@ -379,16 +428,16 @@ protected:

                {
                    Matrix<ElemType> inputSubBatch(in.GetDeviceId());
                    inputSubBatch.SetValue(in.ColumnSlice(startSampleID, smallBatchSize));
                    inputSubBatch.Reshape(inT.c(), smallBatchSize * inT.w() * inT.h());
                    inputSubBatch.Reshape(m_inT.c(), smallBatchSize * m_inT.w() * m_inT.h());
                    Matrix<ElemType> inputSubBatchSparseReordered(inputSubBatch.GetNumCols(), inputSubBatch.GetNumRows(), inputSubBatch.GetDeviceId(), MatrixType::SPARSE, MatrixFormat::matrixFormatSparseCSC);
                    Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, inputSubBatch.Transpose(), 1, inT.w(), 1, smallBatchSize * inT.h(), inT.c(), 1.0f, inputSubBatchSparseReordered, inputSubBatchSparseReordered);
                    Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, inputSubBatch.Transpose(), 1, m_inT.w(), 1, smallBatchSize * m_inT.h(), m_inT.c(), 1.0f, inputSubBatchSparseReordered, inputSubBatchSparseReordered);

                    Matrix<ElemType> outputGradientSubBatchReordered = Matrix<ElemType>::Zeros(smallBatchSize * srcGradT.h() * srcGradT.w(), srcGradT.c(), outputGradientSubBatch.GetDeviceId());
                    Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, outputGradientSubBatch.Transpose(), 1, srcGradT.w(), 1, smallBatchSize * srcGradT.h(), srcGradT.c(), 1.0f, outputGradientSubBatchReordered, outputGradientSubBatchReordered);
                    Matrix<ElemType> outputGradientSubBatchReordered = Matrix<ElemType>::Zeros(smallBatchSize * m_outT.h() * m_outT.w(), m_outT.c(), outputGradientSubBatch.GetDeviceId());
                    Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, outputGradientSubBatch.Transpose(), 1, m_outT.w(), 1, smallBatchSize * m_outT.h(), m_outT.c(), 1.0f, outputGradientSubBatchReordered, outputGradientSubBatchReordered);

                    filter.Reshape(srcGradT.c() * filterT.w(), inT.c());
                    Matrix<ElemType>::ConvolveAndWeightedAdd(1, outputGradientSubBatchReordered, true, inputSubBatchSparseReordered, false, 1, filter, smallBatchSize * inT.h(), convDesc.wStride(), convDesc.padding(), false);
                    filter.Reshape(srcGradT.c(), inT.c() * filterT.w());
                    kernelGrad.Reshape(m_outT.c() * m_kernelT.w(), m_inT.c());
                    Matrix<ElemType>::ConvolveAndWeightedAdd(1, outputGradientSubBatchReordered, true, inputSubBatchSparseReordered, false, 1, kernelGrad, smallBatchSize * m_inT.h(), m_strideT.w(), m_padding, false);
                    kernelGrad.Reshape(m_outT.c(), m_inT.c() * m_kernelT.w());
                }
                else
                {

@ -396,288 +445,107 @@ protected:

                    Matrix<ElemType> inputSubBatch = in.ColumnSlice(startSampleID, smallBatchSize);
                    inputSubBatch.SwitchToMatrixType(MatrixType::DENSE, inputSubBatch.GetFormat(), true);
                    workspace.AssignPackedConvolutionInput(inputSubBatch,
                                                           inT.w(), inT.h(), inT.c(),
                                                           srcGradT.w(), srcGradT.h(), srcGradT.c(),
                                                           filterT.w(), filterT.h(), convDesc.wStride(), convDesc.hStride(),
                                                           convDesc.padding());
                                                           m_inT.w(), m_inT.h(), m_inT.c(),
                                                           m_outT.w(), m_outT.h(), m_outT.c(),
                                                           m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h(),
                                                           m_padding);

                    Matrix<ElemType>::MultiplyAndAdd(outputGradientSubBatch, false, workspace, true, filter);
                    Matrix<ElemType>::MultiplyAndAdd(outputGradientSubBatch, false, workspace, true, kernelGrad);
                }
            }
        }

        assert(srcGradT.w() * srcGradT.h() * srcGradT.c() == srcGrad.GetNumRows());
        assert(srcGradT.n() == srcGrad.GetNumCols());
        assert(m_outT.w() * m_outT.h() * m_outT.c() == srcGrad.GetNumRows());
        assert(batchSize == srcGrad.GetNumCols());
    }
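
    // Sketch of the sub-batching arithmetic shared by ForwardCore, BackwardDataCore and
    // BackwardKernelCore above: m_maxTempMemSizeInSamples caps how many samples' packed
    // representation the workspace holds at once, trading memory for extra GEMM calls.
    // Hypothetical helper; e.g. batchSize = 256 with a cap of 100 yields sub-batches of
    // 100, 100 and 56 samples.
    template <class Body>
    static void ForEachSubBatchSketch(size_t batchSize, size_t maxTempMemSizeInSamples, Body body)
    {
        size_t subBatchSize = min(batchSize, maxTempMemSizeInSamples == 0 ? batchSize : maxTempMemSizeInSamples);
        size_t numSubBatches = (batchSize + subBatchSize - 1) / subBatchSize; // ceiling division
        for (size_t i = 0; i < numSubBatches; i++)
        {
            size_t startSampleId = i * subBatchSize;
            size_t smallBatchSize = min(subBatchSize, batchSize - startSampleId);
            body(startSampleId, smallBatchSize); // pack + multiply over this slice of columns
        }
    }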

    void EnsureCompatibleBatchNorm(bool spatial) override
    void EnsurePoolingInitialized() override
    {
        if (m_deviceId >= 0)
            InvalidArgument("This engine does not support batch normalization on GPUs.");
        if (m_bnImpl != BatchNormImpl::Cntk)
            InvalidArgument("Only CNTK batch normalization implementation is supported by this engine.");
        if (spatial && m_imageLayout != ImageLayoutKind::CHW)
            InvalidArgument("This engine batch normalization currently supports only CHW data layout for convolutional nodes.");
    }

    void NormalizeBatchCore(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
                            bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override
    void ForwardPoolingCore(const Mat& in, Mat& out) override
    {
        UNUSED(inT);
        UNUSED(in);
        UNUSED(scaleBiasT);
        UNUSED(scale);
        UNUSED(bias);
        UNUSED(out);
        UNUSED(spatial);
        UNUSED(expAvgFactor);
        UNUSED(runMean);
        UNUSED(runInvStdDev);
        UNUSED(epsilon);
        UNUSED(saveMean);
        UNUSED(saveInvStdDev);
        RuntimeError("Not yet implemented.");
    }

    void NormalizeBatchInferenceCore(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
                                     bool spatial, const Mat& runMean, const Mat& runInvStdDev, Mat& out) override
    {
        UNUSED(scaleBiasT);
        if (spatial)
        if (m_poolKind == PoolKind::Max)
        {
            size_t spatialSize = inT.w() * inT.h();
#pragma omp parallel for
            for (long icol = 0; icol < out.GetNumCols(); icol++)
            {
                for (long irow = 0; irow < out.GetNumRows(); irow++)
                {
                    size_t imap = irow / spatialSize;
                    out(irow, icol) = scale(imap, 0) * (in(irow, icol) - runMean(imap, 0)) * runInvStdDev(imap, 0) + bias(imap, 0);
                }
            }
            out.AssignMaxPoolingResult(in, m_inT.c(), m_inT.w(), m_inT.h(), m_inT.w() * m_inT.h() * m_inT.c(),
                                       m_outT.w(), m_outT.h(), m_outT.w() * m_outT.h() * m_outT.c(),
                                       m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h());
        }
        else if (m_poolKind == PoolKind::Average)
        {
            out.AssignAveragePoolingResult(in, m_inT.c(), m_inT.w(), m_inT.h(), m_inT.w() * m_inT.h() * m_inT.c(),
                                           m_outT.w(), m_outT.h(), m_outT.w() * m_outT.h() * m_outT.c(),
                                           m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h());
        }
        else
        {
#pragma omp parallel for
            for (long icol = 0; icol < out.GetNumCols(); icol++)
            {
                for (long irow = 0; irow < out.GetNumRows(); irow++)
                {
                    out(irow, icol) = scale(irow, 0) * (in(irow, icol) - runMean(irow, 0)) * runInvStdDev(irow, 0) + bias(irow, 0);
                }
            }
        }
            InvalidArgument("Pooling type %d is not supported.", (int)m_poolKind);
    }
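
    // Scalar restatement of the inference formula applied elementwise by the (removed)
    // NormalizeBatchInferenceCore above; for spatial batch norm the statistics row is
    // imap = irow / (inT.w() * inT.h()), i.e. one (mean, invStdDev, scale, bias) tuple is
    // shared by a whole feature map. Hypothetical helper, shown for clarity only.
    static ElemType BatchNormInferenceSketch(ElemType x, ElemType scale, ElemType bias,
                                             ElemType runMean, ElemType runInvStdDev)
    {
        return scale * (x - runMean) * runInvStdDev + bias;
    }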

    void BackwardNormalizeBatchCore(const Tensor4D& inT, const Mat& in, const Mat& srcGrad, Mat& grad,
                                    const Tensor4D& scaleBiasT, const Mat& scale, bool spatial, const Mat& saveMean, const Mat& saveInvStdDev,
                                    Mat& scaleGrad, Mat& biasGrad) override
    void BackwardPoolingCore(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad) override
    {
        UNUSED(inT);
        UNUSED(in);
        UNUSED(srcGrad);
        UNUSED(grad);
        UNUSED(scaleBiasT);
        UNUSED(scale);
        UNUSED(scaleGrad);
        UNUSED(biasGrad);
        UNUSED(spatial);
        UNUSED(saveMean);
        UNUSED(saveInvStdDev);
        RuntimeError("Not yet implemented.");
        if (m_poolKind == PoolKind::Max)
        {
            grad.AddMaxPoolingGradient(srcGrad, in, out,
                                       m_inT.c(), m_inT.w(), m_inT.h(), m_inT.w() * m_inT.h() * m_inT.c(),
                                       m_outT.w(), m_outT.h(), m_outT.w() * m_outT.h() * m_outT.c(),
                                       m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h());
        }
        else if (m_poolKind == PoolKind::Average)
        {
            grad.AddAveragePoolingGradient(srcGrad, m_inT.c(), m_inT.w(), m_inT.h(), m_inT.w() * m_inT.h() * m_inT.c(),
                                           m_outT.w(), m_outT.h(), m_outT.w() * m_outT.h() * m_outT.c(),
                                           m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h());
        }
        else
            InvalidArgument("Pooling type %d is not supported.", (int)m_poolKind);
    }

private:
    size_t m_maxTempMemSizeInSamples;
    BatchNormImpl m_bnImpl;
    Mat m_ones;
    ImageDimensions m_inT;
    ImageDimensions m_outT;
    ImageDimensions m_kernelT;
    ImageDimensions m_strideT;
    bool m_padding;

    bool m_gpuSparseOpt;
    bool m_gpuSparse1D;
};

template <class ElemType>
std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
                                                                                 ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind,
                                                                                 ConvolutionEngineKind enabledEngines)
{
    auto isEnabled = [=](ConvolutionEngineKind eng) { return ((int)enabledEngines & (int)eng) != 0; };
    // Note: in some cases do not throw exception even if parameters do not match as Create
    // can be called from places like MEL with default parameters and never be used.
    // The check will be done later in engine's EnsureCompatible call if the engine is actually used.
    auto engStr = (std::string)(*geometry);
    // Only legacy engine supports HWC layout.
    if (imageLayout == ImageLayoutKind::HWC)
    {
        if (!isEnabled(ConvolutionEngineKind::Legacy))
            RuntimeError("Trying to use Legacy convolution engine when it's disabled.");
        // REVIEW alexeyk: should honor m_traceLevel here.
        fprintf(stderr, "\nUsing legacy convolution engine for geometry: %s.\n", engStr.c_str());
        return std::make_unique<LegacyConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
    }

    // Check if we can use cuDNN engine. Do not need to validate tensors as ConvolveGeometry has already done that.
    if (isEnabled(ConvolutionEngineKind::CuDnn) &&
        CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId, geometry, poolKind))
    {
        fprintf(stderr, "\nUsing cuDNN convolution engine for geometry: %s.\n", engStr.c_str());
        return CuDnnConvolutionEngineFactory<ElemType>::Create(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
    }

    if (!isEnabled(ConvolutionEngineKind::Reference))
        RuntimeError("Reference convolution engine is disabled and no other enabled engine supports this configuration.");
    fprintf(stderr, "\nUsing reference convolution engine for geometry: %s.\n", engStr.c_str());
    return std::make_unique<ReferenceConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}
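
// Hypothetical call site for Create() above, shown only to illustrate the selection order:
// HWC layout forces the legacy engine; otherwise cuDNN is preferred when supported, with
// the reference engine as the fallback.
template <class ElemType>
static std::unique_ptr<ConvolutionEngine<ElemType>> CreateAnyEngineSketch(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId)
{
    return ConvolutionEngine<ElemType>::Create(geometry, deviceId, ImageLayoutKind::CHW,
                                               /*maxTempMemSizeInSamples=*/0, PoolKind::None,
                                               ConvolutionEngineKind::All);
}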

template class ConvolutionEngine<float>;
template class ConvolutionEngine<double>;

//------------------------------------------------------------------
// Pooling engine.
//------------------------------------------------------------------

template <class ElemType>
void PoolingEngine<ElemType>::Forward(const Tensor4D& inT, const Mat& in, const PoolDesc& poolDesc, const Tensor4D& outT, Mat& out)
{
    assert(inT.w() * inT.h() * inT.c() == in.GetNumRows());
    assert(inT.n() == in.GetNumCols());
    assert(outT.w() * outT.h() * outT.c() == out.GetNumRows());
    assert(outT.n() == out.GetNumCols());

    EnsureCompatible();
    ForwardCore(inT, in, poolDesc, outT, out);
}

template <class ElemType>
void PoolingEngine<ElemType>::Backward(const Tensor4D& outT, const Mat& out, const Mat& srcGrad, const PoolDesc& poolDesc, const Tensor4D& inT, const Mat& in, Mat& grad)
{
    assert(outT.w() * outT.h() * outT.c() == out.GetNumRows());
    assert(outT.n() == out.GetNumCols());
    assert(out.GetNumRows() == srcGrad.GetNumRows());
    assert(out.GetNumCols() == srcGrad.GetNumCols());
    assert(inT.w() * inT.h() * inT.c() == in.GetNumRows());
    assert(inT.n() == in.GetNumCols());
    assert(in.GetNumRows() == grad.GetNumRows());
    assert(in.GetNumCols() == grad.GetNumCols());

    EnsureCompatible();
    BackwardCore(outT, out, srcGrad, poolDesc, inT, in, grad);
}

//------------------------------------------------------------------
// Default (legacy) pooling engine implementation.
//------------------------------------------------------------------
template <class ElemType>
class DefaultPoolingEngine : public PoolingEngine<ElemType>
{
public:
    using Base = PoolingEngine<ElemType>;
    using typename Base::Tensor4D;
    using typename Base::PoolDesc;
    using typename Base::Mat;

public:
    DefaultPoolingEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout)
        : Base(deviceId, imageLayout)
    {
    }

protected:
    using Base::m_deviceId;
    using Base::m_imageLayout;

    void EnsureCompatible() override
    {
        if (m_imageLayout != ImageLayoutKind::HWC)
            RuntimeError("Default pooling engine currently supports only HWC/legacy layout.");
    }

    void ForwardCore(const Tensor4D& inT, const Mat& in, const PoolDesc& poolDesc, const Tensor4D& outT, Mat& out) override
    {
        if (poolDesc.kind() == PoolDesc::PoolKind::Max)
        {
            out.AssignMaxPoolingResult(in, inT.c(), inT.w(), inT.h(), inT.w() * inT.h() * inT.c(),
                                       outT.w(), outT.h(), outT.w() * outT.h() * outT.c(),
                                       poolDesc.w(), poolDesc.h(), poolDesc.wStride(), poolDesc.hStride());
        }
        else if (poolDesc.kind() == PoolDesc::PoolKind::Average)
        {
            out.AssignAveragePoolingResult(in, inT.c(), inT.w(), inT.h(), inT.w() * inT.h() * inT.c(),
                                           outT.w(), outT.h(), outT.w() * outT.h() * outT.c(),
                                           poolDesc.w(), poolDesc.h(), poolDesc.wStride(), poolDesc.hStride());
        }
        else
            InvalidArgument("Pooling type %d is not supported.", (int)poolDesc.kind());
    }

    void BackwardCore(const Tensor4D& outT, const Mat& out, const Mat& srcGrad, const PoolDesc& poolDesc, const Tensor4D& inT, const Mat& in, Mat& grad) override
    {
        if (poolDesc.kind() == PoolDesc::PoolKind::Max)
        {
            grad.AddMaxPoolingGradient(srcGrad, in, out,
                                       inT.c(), inT.w(), inT.h(), inT.w() * inT.h() * inT.c(),
                                       outT.w(), outT.h(), outT.w() * outT.h() * outT.c(),
                                       poolDesc.w(), poolDesc.h(), poolDesc.wStride(), poolDesc.hStride());
        }
        else if (poolDesc.kind() == PoolDesc::PoolKind::Average)
        {
            grad.AddAveragePoolingGradient(srcGrad, inT.c(), inT.w(), inT.h(), inT.w() * inT.h() * inT.c(),
                                           outT.w(), outT.h(), outT.w() * outT.h() * outT.c(),
                                           poolDesc.w(), poolDesc.h(), poolDesc.wStride(), poolDesc.hStride());
        }
        else
            InvalidArgument("Pooling type %d is not supported.", (int)poolDesc.kind());
    }
};

template class PoolingEngine<float>;
template class PoolingEngine<double>;

template <class ElemType>
class DefaultConvolutionEngineFactory : public ConvolutionEngineFactory<ElemType>
{
public:
    using Base = ConvolutionEngineFactory<ElemType>;
    using typename Base::Tensor4D;
    using typename Base::Tensor4DPtr;
    using typename Base::Filter;
    using typename Base::FilterPtr;
    using typename Base::ConvDesc;
    using typename Base::ConvDescPtr;
    using typename Base::PoolDesc;
    using typename Base::PoolDescPtr;

    using typename Base::ConvEnginePtr;
    using typename Base::PoolEnginePtr;

public:
    Tensor4DPtr CreateTensor(size_t w, size_t h, size_t c, size_t n) override
    {
        return std::make_unique<ConvolutionTensor4D>(w, h, c, n);
    }

    FilterPtr CreateFilter(size_t w, size_t h, size_t c, size_t k) override
    {
        return std::make_unique<Filter>(w, h, c, k);
    }

    ConvDescPtr CreateConvDescriptor(const Tensor4D& /*inT*/, const Filter& /*filterT*/,
                                     size_t wStride, size_t hStride, bool padding) override
    {
        return std::make_unique<ConvDesc>(wStride, hStride, padding);
    }

    PoolDescPtr CreatePoolDescriptor(typename PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) override
    {
        return std::make_unique<PoolDesc>(kind, w, h, wStride, hStride, wPad, hPad);
    }

    ConvEnginePtr CreateConvEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl) override
    {
        return std::make_unique<DefaultConvolutionEngine<ElemType>>(deviceId, imageLayout, maxTempMemSizeInSamples, bnImpl);
    }

    PoolEnginePtr CreatePoolEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout) override
    {
        return std::make_unique<DefaultPoolingEngine<ElemType>>(deviceId, imageLayout);
    }
};

template <class ElemType>
std::unique_ptr<ConvolutionEngineFactory<ElemType>> ConvolutionEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId, EngineType engType, ImageLayoutKind imageLayoutKind)
{
    if (engType == EngineType::Auto)
    {
        // REVIEW alexeyk: make cuDNN default when running on GPU and compiled with cuDNN, add config parameter to enable runtime switch between implementations.
        if (deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId) && imageLayoutKind == ImageLayoutKind::CHW)
            return Create(deviceId, EngineType::CuDnn, imageLayoutKind);
        else
            return Create(deviceId, EngineType::Legacy, imageLayoutKind);
    }
    else if (engType == EngineType::CuDnn)
    {
        if (imageLayoutKind != ImageLayoutKind::CHW)
            InvalidArgument("ConvolutionEngineFactory: ImageLayout '%s' is not compatible with the cuDNN engine.", ToString(imageLayoutKind).c_str());
        if (deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId))
            return std::make_unique<CuDnnConvolutionEngineFactory<ElemType>>();
        RuntimeError("cuDNN convolution engine is not supported, check the device id and whether the code was compiled with cuDNN.");
    }
    else if (engType == EngineType::Legacy)
    {
        return std::make_unique<DefaultConvolutionEngineFactory<ElemType>>();
    }

    RuntimeError("Unsupported convolution engine type: %d.", (int)engType);
}

template class ConvolutionEngineFactory<float>;
template class ConvolutionEngineFactory<double>;

}}}

@ -5,370 +5,104 @@

#pragma once

// REVIEW alexeyk: this seems to be repeated all over the CNTKMathDll.
#ifdef _WIN32
#ifdef MATH_EXPORTS
#define MATH_API __declspec(dllexport)
#else
#define MATH_API __declspec(dllimport)
#endif
#else // no DLLs on Linux
#define MATH_API
#endif

#include "Matrix.h"
#include "TensorShape.h" // for ImageLayoutKind
#include "ConvolveGeometry.h"
#include "StringUtil.h"

namespace Microsoft { namespace MSR { namespace CNTK {

// REVIEW alexeyk: this is a temp class until we have generic tensor support in CNTK.
class ConvolutionTensor4D
//-------------------------------------------------------------
// Convolution and pooling engine interface.
//-------------------------------------------------------------
enum class ConvolutionEngineKind
{
public:
    size_t w() const
    {
        return m_w;
    }
    size_t h() const
    {
        return m_h;
    }
    size_t c() const
    {
        return m_c;
    }
    size_t n() const
    {
        return m_n;
    }
    virtual void setN(size_t n)
    {
        m_n = n;
    }
    None = 0,
    Reference = 1,
    CuDnn = 1 << 1,
    Legacy = 1 << 2,

public:
    ConvolutionTensor4D(size_t w = 1, size_t h = 1, size_t c = 1, size_t n = 1)
    {
        m_w = w;
        m_h = h;
        m_c = c;
        m_n = n;
    }

public:
    virtual ~ConvolutionTensor4D() = default;
    // Deleting copy ctor/assignment as derived objects may contain non-copyable state.
    ConvolutionTensor4D(const ConvolutionTensor4D&) = delete;
    ConvolutionTensor4D& operator=(const ConvolutionTensor4D&) = delete;
    // REVIEW alexeyk: Have to implement move ctor explicitly as VS2013 does not support default move ctors.
    // ConvolutionTensor4D(ConvolutionTensor4D&&);
    // ConvolutionTensor4D& operator=(ConvolutionTensor4D&&);

private:
    size_t m_w;
    size_t m_h;
    size_t m_c;
    size_t m_n;
    All = Reference | CuDnn | Legacy
};

class ConvolutionFilter
enum class PoolKind
{
public:
    size_t w() const
    {
        return m_w;
    }
    size_t h() const
    {
        return m_h;
    }
    size_t c() const
    {
        return m_c;
    }
    size_t k() const
    {
        return m_k;
    }

public:
    ConvolutionFilter(size_t w = 1, size_t h = 1, size_t c = 1, size_t k = 1)
    {
        m_w = w;
        m_h = h;
        m_c = c;
        m_k = k;
    }

public:
    virtual ~ConvolutionFilter() = default;

    // Deleting copy ctor/assignment as derived objects may contain non-copyable state.
    ConvolutionFilter(const ConvolutionFilter&) = delete;
    ConvolutionFilter& operator=(const ConvolutionFilter&) = delete;

private:
    size_t m_w;
    size_t m_h;
    size_t m_c;
    size_t m_k;
    None,
    Max,
    Average
};

// ConvolutionDescriptor describes properties specific to convolution application.
class ConvolutionDescriptor
{
public:
    // Horizontal stride (in w-dimension).
    size_t wStride() const
    {
        return m_wStride;
    }
    // Vertical stride (in h-dimension).
    size_t hStride() const
    {
        return m_hStride;
    }
    bool padding() const
    {
        return m_padding;
    }

public:
    ConvolutionDescriptor(size_t wStride = 1, size_t hStride = 1, bool padding = false)
    {
        m_wStride = wStride;
        m_hStride = hStride;
        m_padding = padding;
    }

public:
    virtual ~ConvolutionDescriptor() = default;
    // Deleting copy ctor/assignment as derived objects may contain non-copyable state.
    ConvolutionDescriptor(const ConvolutionDescriptor&) = delete;
    ConvolutionDescriptor& operator=(const ConvolutionDescriptor&) = delete;

private:
    size_t m_wStride;
    size_t m_hStride;
    bool m_padding;
};

// PoolingDescriptor describes properties specific to pooling application.
class PoolingDescriptor
{
public:
    enum class PoolKind
    {
        Max,
        Average
    };

    PoolKind kind() const
    {
        return m_kind;
    }
    // Pooling window size.
    size_t w() const
    {
        return m_w;
    }
    size_t h() const
    {
        return m_h;
    }
    // Horizontal stride (in w-dimension).
    size_t wStride() const
    {
        return m_wStride;
    }
    // Vertical stride (in h-dimension).
    size_t hStride() const
    {
        return m_hStride;
    }
    // Horizontal pad (in w-dimension).
    size_t wPad() const
    {
        return m_wPad;
    }
    // Vertical pad (in h-dimension).
    size_t hPad() const
    {
        return m_hPad;
    }

public:
    PoolingDescriptor(PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad)
    {
        m_kind = kind;
        m_w = w;
        m_h = h;
        m_wStride = wStride;
        m_hStride = hStride;
        m_wPad = wPad;
        m_hPad = hPad;
    }

public:
    virtual ~PoolingDescriptor() = default;
    // Deleting copy ctor/assignment as derived objects may contain non-copyable state.
    PoolingDescriptor(const PoolingDescriptor&) = delete;
    PoolingDescriptor& operator=(const PoolingDescriptor&) = delete;

private:
    PoolKind m_kind;
    size_t m_w;
    size_t m_h;
    size_t m_wStride;
    size_t m_hStride;
    size_t m_wPad;
    size_t m_hPad;
};
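
// Sketch of the conventional output-size rule these fields feed into; the formula is the
// standard one, stated here for orientation rather than taken from this file:
// outW = (inW + 2 * wPad - w) / wStride + 1, and likewise for height.
inline size_t PoolOutputDimSketch(size_t inDim, const PoolingDescriptor& desc)
{
    return (inDim + 2 * desc.wPad() - desc.w()) / desc.wStride() + 1; // e.g. 28 -> 14 for a 2x2 window with stride 2
}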

#pragma warning(push)
#pragma warning(disable : 4251)

template <class ElemType>
class MATH_API ConvolutionEngine
{
public:
    using Tensor4D = ConvolutionTensor4D;
    using Filter = ConvolutionFilter;
    using ConvDesc = ConvolutionDescriptor;
    using Mat = Matrix<ElemType>;

public:
    ConvolutionEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout)
        : m_deviceId(deviceId), m_imageLayout(imageLayout)
    {
    }
    virtual ~ConvolutionEngine() = default;

    void Forward(const Tensor4D& inT, const Mat& in, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
                 const Tensor4D& outT, Mat& out, Mat& workspace);
    void Forward(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace);

    void BackwardData(const Tensor4D& srcGradT, const Mat& srcGrad, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
                      const Tensor4D& gradT, Mat& grad, Mat& workspace);
    void BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace);

    void BackwardFilter(const Tensor4D& srcGradT, const Mat& srcGrad, const Tensor4D& inT, const Mat& in, const ConvDesc& convDesc,
                        const Filter& filterT, Mat& filter, bool allowReuse, Mat& workspace);
    void BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace);

    void NormalizeBatch(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
                        bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out,
                        double epsilon, Mat& saveMean, Mat& saveInvStdDev);
    void ForwardPooling(const Mat& in, Mat& out);

    void NormalizeBatchInference(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
                                 bool spatial, const Mat& runMean, const Mat& runInvStdDev, Mat& out);
    void BackwardPooling(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad);

    void BackwardNormalizeBatch(const Tensor4D& inT, const Mat& in, const Mat& srcGrad, Mat& grad,
                                const Tensor4D& scaleBiasT, const Mat& scale, bool spatial, const Mat& saveMean, const Mat& saveInvStdDev,
                                Mat& scaleGrad, Mat& biasGrad);
    std::shared_ptr<const ConvolveGeometry> Geometry() const { return m_geometry; }

    static std::unique_ptr<ConvolutionEngine<ElemType>> Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout,
                                                               size_t maxTempMemSizeInSamples, PoolKind poolKind = PoolKind::None, ConvolutionEngineKind enabledEngines = ConvolutionEngineKind::All);

    DISABLE_COPY_AND_MOVE(ConvolutionEngine);

protected:
    virtual void EnsureCompatible() = 0;

    virtual void ForwardCore(const Tensor4D& inT, const Mat& in, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
                             const Tensor4D& outT, Mat& out, Mat& workspace) = 0;

    virtual void BackwardDataCore(const Tensor4D& srcGradT, const Mat& srcGrad, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
                                  const Tensor4D& gradT, Mat& grad, Mat& workspace) = 0;

    virtual void BackwardFilterCore(const Tensor4D& srcGradT, const Mat& srcGrad, const Tensor4D& inT, const Mat& in, const ConvDesc& convDesc,
                                    const Filter& filterT, Mat& filter, bool allowReuse, Mat& workspace) = 0;

    virtual void EnsureCompatibleBatchNorm(bool spatial) = 0;

    virtual void NormalizeBatchCore(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
                                    bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out,
                                    double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0;

    // REVIEW alexeyk: roll into NormalizeBatchCore.
    virtual void NormalizeBatchInferenceCore(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
                                             bool spatial, const Mat& runMean, const Mat& runInvStdDev, Mat& out) = 0;

    virtual void BackwardNormalizeBatchCore(const Tensor4D& inT, const Mat& in, const Mat& srcGrad, Mat& grad,
                                            const Tensor4D& scaleBiasT, const Mat& scale, bool spatial, const Mat& saveMean, const Mat& saveInvStdDev,
                                            Mat& scaleGrad, Mat& biasGrad) = 0;

protected:
    DEVICEID_TYPE m_deviceId;
    ImageLayoutKind m_imageLayout;
};

template <class ElemType>
class MATH_API PoolingEngine
{
public:
    using Tensor4D = ConvolutionTensor4D;
    using PoolDesc = PoolingDescriptor;
    using Mat = Matrix<ElemType>;

public:
    PoolingEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout)
        : m_deviceId(deviceId), m_imageLayout(imageLayout)
    ConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind)
        : m_geometry(geometry), m_deviceId(deviceId), m_imageLayout(imageLayout), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_poolKind(poolKind)
    {
        assert(m_geometry != nullptr);
    }
    virtual ~PoolingEngine() = default;

    void Forward(const Tensor4D& inT, const Mat& in, const PoolDesc& poolDesc, const Tensor4D& outT, Mat& out);
    void Backward(const Tensor4D& outT, const Mat& out, const Mat& srcGrad, const PoolDesc& poolDesc, const Tensor4D& inT, const Mat& in, Mat& grad);

    DISABLE_COPY_AND_MOVE(PoolingEngine);

protected:
    virtual void EnsureCompatible() = 0;
    virtual void ForwardCore(const Tensor4D& inT, const Mat& in, const PoolDesc& poolDesc, const Tensor4D& outT, Mat& out) = 0;
    virtual void BackwardCore(const Tensor4D& outT, const Mat& out, const Mat& srcGrad, const PoolDesc& poolDesc, const Tensor4D& inT, const Mat& in, Mat& grad) = 0;

    virtual void EnsureConvolutionInitialized() = 0;

    virtual void ForwardCore(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace) = 0;

    virtual void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) = 0;

    virtual void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace) = 0;

    virtual void EnsurePoolingInitialized() = 0;

    virtual void ForwardPoolingCore(const Mat& in, Mat& out) = 0;

    virtual void BackwardPoolingCore(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad) = 0;

protected:
    ConvolveGeometryPtr m_geometry;
    DEVICEID_TYPE m_deviceId;
    ImageLayoutKind m_imageLayout;
    size_t m_maxTempMemSizeInSamples;
    PoolKind m_poolKind;
};
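
// Sketch of how a new engine would plug into the interface above: derive, implement the
// *Core methods, and the public Forward/BackwardData/BackwardKernel/ForwardPooling/
// BackwardPooling wrappers supply the shared validation and dispatch.
// MyConvolutionEngineSketch is hypothetical, shown only to illustrate the template-method
// pattern used here.
template <class ElemType>
class MyConvolutionEngineSketch : public ConvolutionEngine<ElemType>
{
public:
    using Base = ConvolutionEngine<ElemType>;
    using typename Base::Mat;

    MyConvolutionEngineSketch(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout,
                              size_t maxTempMemSizeInSamples, PoolKind poolKind)
        : Base(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind)
    {
    }

protected:
    void EnsureCompatible() override { }
    void EnsureConvolutionInitialized() override { }
    void ForwardCore(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace) override { /* compute out from in and kernel */ }
    void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override { /* accumulate input gradient */ }
    void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace) override { /* accumulate kernel gradient */ }
    void EnsurePoolingInitialized() override { }
    void ForwardPoolingCore(const Mat& in, Mat& out) override { /* pool in into out */ }
    void BackwardPoolingCore(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad) override { /* accumulate pooling gradient */ }
};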

// REVIEW alexeyk: this is a temporary hack until we find a better place for the BatchNorm engine(s).
enum class BatchNormImpl
#pragma warning(pop)

static inline PoolKind PoolKindFrom(const wstring& s)
{
    CuDnn,
    Cntk
};
    if (s.empty() || AreEqualIgnoreCase(s, L"none"))
        return PoolKind::None;
    if (AreEqualIgnoreCase(s, L"max"))
        return PoolKind::Max;
    if (AreEqualIgnoreCase(s, L"average"))
        return PoolKind::Average;
    InvalidArgument("Unknown pooling kind: '%ls'. Supported values: 'none', 'max', 'average'.", s.c_str());
}
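
// Minimal usage sketch for PoolKindFrom above: pooling strings coming from config map
// onto the enum case-insensitively, with the empty string meaning "no pooling".
inline PoolKind ParsePoolKindSketch()
{
    return PoolKindFrom(L"Max"); // -> PoolKind::Max; L"" or L"none" -> PoolKind::None
}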

template <class ElemType>
class MATH_API ConvolutionEngineFactory
{
public:
    using Tensor4D = ConvolutionTensor4D;
    using Tensor4DPtr = std::unique_ptr<Tensor4D>;
    using Filter = ConvolutionFilter;
    using FilterPtr = std::unique_ptr<ConvolutionFilter>;
    using ConvDesc = ConvolutionDescriptor;
    using ConvDescPtr = std::unique_ptr<ConvolutionDescriptor>;
    using PoolDesc = PoolingDescriptor;
    using PoolDescPtr = std::unique_ptr<PoolingDescriptor>;

    using ConvEnginePtr = std::unique_ptr<ConvolutionEngine<ElemType>>;
    using PoolEnginePtr = std::unique_ptr<PoolingEngine<ElemType>>;

public:
    ConvolutionEngineFactory() = default;
    virtual ~ConvolutionEngineFactory() = default;

    virtual Tensor4DPtr CreateTensor(size_t w, size_t h, size_t c, size_t n) = 0;
    virtual FilterPtr CreateFilter(size_t w, size_t h, size_t c, size_t k) = 0;
    virtual ConvDescPtr CreateConvDescriptor(const Tensor4D& inT, const Filter& filterT,
                                             size_t wStride, size_t hStride, bool padding) = 0;
    virtual PoolDescPtr CreatePoolDescriptor(PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) = 0;
    // virtual Tensor4DPtr CreateLrnDescriptor() = 0;

    virtual ConvEnginePtr CreateConvEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl) = 0;
    virtual PoolEnginePtr CreatePoolEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout) = 0;

    enum class EngineType
    {
        Auto,
        CuDnn,
        Legacy
    };
    static std::unique_ptr<ConvolutionEngineFactory<ElemType>> Create(DEVICEID_TYPE deviceId, EngineType engType, ImageLayoutKind imageLayoutKind);

    DISABLE_COPY_AND_MOVE(ConvolutionEngineFactory);
};
} } }

@ -0,0 +1,552 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once

#include "Basics.h"
#include "TensorShape.h"
#include <iterator>

namespace Microsoft { namespace MSR { namespace CNTK {

// Notes:
// * ConvolveGeometry represents the application of one or more rectangular "kernels" (all of the same size)
//   to a rectangular input to produce a rectangular output.
// * A "cell" in the rectangular input is identified by a single coordinate called a "col" (for column).
// * A "cell" in the rectangular output is identified by a single coordinate called a "row".
// * The kernels may involve weights, in which case MpRowIwht indicates the starting index of the weights
//   used for a given output cell.
// The overall idea of ConvolveGeometry is to precompute maps that can be used to apply convolutions of
// arbitrary configurations and dimensions. In such case the generic implementation becomes very simple and invariant
// wrt convolution configuration and dimensionality. For specific cases like 2D/3D convolutions and full sharing,
// highly optimized implementations (e.g. cuDNN) are used.
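
// A hand-worked instance of the mapping described above, for a 1-D input of width 5 and a
// kernel of width 3 with stride 1 and no padding (output width 3). The maps are written out
// by hand here and the weight/offset pairing follows kernel order; ConvolveGeometry derives
// the same information for any rank and configuration. Hypothetical helper, for orientation only.
inline float ConvolveGeometryGatherSketch(const float (&in)[5], const float (&kernel)[3], int row)
{
    static const int mpRowCol[3] = {1, 2, 3};     // base ("center") input col per output row
    static const int indices[4]  = {3, -1, 0, 1}; // item count, then relative source offsets
    float sum = 0;
    for (int i = 0; i < indices[0]; i++)
        sum += kernel[i] * in[mpRowCol[row] + indices[1 + i]];
    return sum; // e.g. in = {1,2,3,4,5}, kernel = {0.25,0.5,0.25} -> outputs {2, 3, 4}
}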
class ConvolveGeometry final
{
public:
    using IntVec = std::vector<int>;
    using BoolVec = std::vector<bool>;

    const TensorShape& InputShape() const { return m_inputShape; }
    const TensorShape& OutputShape() const { return m_outputShape; }
    const TensorShape& KernelShape() const { return m_kernelShape; }
    const TensorShape& MapCount() const { return m_mapCount; }
    const TensorShape& Stride() const { return m_stride; }
    const BoolVec& Sharing() const { return m_sharing; }
    const BoolVec& AutoPad() const { return m_autoPad; }
    const TensorShape& LowerPad() const { return m_lowerPad; }
    const TensorShape& UpperPad() const { return m_upperPad; }

    // Maps from a "row" (index of output cell) to its base "col" (index of input cell). For a given row,
    // the cols that contribute to it are { MpRowCol[row] + Indices[i0 + 1 + i] | 0 <= i < Indices[i0] },
    // where i0 = MpRowIndices[row].
    const IntVec& MpRowCol() const { return m_mpRowCol; }

    // Maps from a "row" (index of output cell) to where to start in the weights array. Each run of weights
    // consists of KernelSize weights.
    const IntVec& MpRowIwht() const { return m_mpRowIwht; }

    // Maps from a "row" (index of output cell) to its starting index in Runs. A run consists of:
    // * skip count (to skip that many weights)
    // * item count
    // * relative indices into source (item count of these)
    // * masks (all 1's or all 0's) (item count of these)
    // For items that are masked out (0 mask), the index stored is the next valid index.
    // This ensures that accessing the corresponding neuron value doesn't fault and that
    // backprop operations write the correct value last (any previous writes won't change
    // the value).
    // NOTE: The first (zeroth) run is always the "full" kernel run. Also, MpRowRun can be empty,
    // indicating that all values are zero (all outputs use the "full" kernel run).
    const IntVec& MpRowRun() const { return m_mpRowRun; }
    const IntVec& Runs() const { return m_runs; }

    // Maps from a "row" (index of output cell) to its starting index in Indices. Note that "Runs" is intended
    // for kernels that have weights, while "Indices" is intended for kernels that don't need to access weights.
    // As a result, the encoding in Indices is simpler and more direct.
    // A run in Indices consists of:
    // * item count
    // * relative indices into source (item count of these)
    // NOTE: The first run of indices is always the "full" kernel run. Also, MpRowIndices can be empty,
    // indicating that all values are zero (all outputs use the "full" kernel run).
    // In addition, all items in Indices are valid source indices so no masking is required in subsequent computation.
    const IntVec& MpRowIndices() const { return m_mpRowIndices; }
    const IntVec& Indices() const { return m_indices; }

    // Number of kernels (equal to MapCount if sharing is all true values).
    size_t KernelCount() const { return m_kernelCount; }

    ConvolveGeometry(const TensorShape& inputShape, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& stride,
                     const BoolVec& sharing, const BoolVec& autoPad, const TensorShape& lowerPad, const TensorShape& upperPad)
        : m_inputShape(inputShape), m_kernelShape(kernelShape), m_mapCount(mapCount), m_stride(stride), m_sharing(sharing),
          m_autoPad(autoPad), m_lowerPad(lowerPad), m_upperPad(upperPad)
    {
        // Note: this ctor is a bit long so sit back and relax.

        assert(m_inputShape.GetRank() == m_kernelShape.GetRank());
        assert(m_mapCount.GetRank() == 1 || m_mapCount.GetRank() == m_inputShape.GetRank());
        assert(m_stride.GetRank() == 1 || m_stride.GetRank() == m_inputShape.GetRank());
        assert(m_sharing.size() == 1 || m_sharing.size() == m_inputShape.GetRank());
        assert(m_autoPad.size() == 1 || m_autoPad.size() == m_inputShape.GetRank());
        assert(m_lowerPad.GetRank() == 1 || m_lowerPad.GetRank() == m_inputShape.GetRank());
        assert(m_upperPad.GetRank() == 1 || m_upperPad.GetRank() == m_inputShape.GetRank());

        m_outputShape = ComputeOutputShape(m_inputShape, m_kernelShape, m_mapCount, m_stride,
                                           m_sharing, m_autoPad, m_lowerPad, m_upperPad);
        assert(m_inputShape.GetRank() == m_outputShape.GetRank());

        size_t dimCount = inputShape.GetRank();
        size_t kernelSize = kernelShape.GetNumElements();

        // Compute the total number of kernels.
        m_kernelCount = 1;
        for (size_t i = 0; i < dimCount; i++)
            m_kernelCount *= !GetSharing(i) ? m_outputShape[i] : GetMapCount(i);

        // Compute the "Start" indices.
        m_start.resize(dimCount);
        m_startIndex = 0;
        m_originIndex = 0;
        for (int i = (int)dimCount - 1; i >= 0; i--)
        {
            assert((m_outputShape[i] % GetMapCount(i)) == 0);
            int outPerMap = (int)(m_outputShape[i] / GetMapCount(i));
            // Number of cells between first and last "centers", inclusive.
            int cells = (int)((outPerMap - 1) * GetStride(i) + 1);
            assert(m_inputShape[i] >= cells);

            // Extra cells, to the left and right of "cells".
            int extra = (int)m_inputShape[i] - cells;
            assert(extra >= 0);

            // When LowerPad and/or UpperPad are specified, the Start[i] value is determined by those values.
            int lo = GetAutoPad(i) ? 0 : (int)m_lowerPad[m_lowerPad.size() == 1 ? 0 : i];
            int hi = GetAutoPad(i) ? 0 : (int)m_upperPad[m_upperPad.size() == 1 ? 0 : i];
            if (lo != 0 || hi != 0)
            {
                assert(extra + lo + hi + 1 == m_kernelShape[i]);
                // Compute the number of cells on the left and right parts of the kernel,
                // not counting the "kernel-center" cell. If m_kernelShape[i] is even, the extra cell is
                // placed on the right (the center is shifted to the left).
                int right = (int)m_kernelShape[i] - 1;
                int left = right / 2;
                right -= left;
                assert(left <= right);
                assert(right <= left + 1);

                assert(lo <= left);
                assert(hi <= right);
                m_start[i] = left - lo;
                assert(m_start[i] + cells + right == m_inputShape[i] + hi);
            }
            else
            {
                m_start[i] = extra / 2;
#ifdef _DEBUG
                // If we're padding then extra should be covered.
                bool padded = GetAutoPad(i);
                assert(!padded || extra + 1 <= m_kernelShape[i]);
                // If we're not padding, then we should stay within the input dimension.
                assert(padded || extra + 1 >= m_kernelShape[i]);

                // Compute the number of cells on the left and right parts of the kernel,
                // not counting the "kernel-center" cell. If m_kernelShape[i] is even, the extra cell is
                // placed on the right (the center is shifted to the left).
                int right = (int)m_kernelShape[i] - 1;
                int left = right / 2;
                right -= left;
                assert(0 <= left);
                assert(left <= right);
                assert(right <= left + 1);

                int min = m_start[i] - left;
                int max = m_start[i] + (int)cells + right;
                assert(!padded || min <= 0 && max >= m_inputShape[i]);
                assert(padded || min >= 0 && max <= m_inputShape[i]);

                int diff = min - ((int)m_inputShape[i] - max);
                assert(std::abs(diff) <= 1);

                UNUSED(padded);
                UNUSED(diff);
#endif
            }

            m_startIndex = m_startIndex * (int)m_inputShape[i] + m_start[i];
            m_originIndex = m_originIndex * (int)m_inputShape[i] + ((int)m_kernelShape[i] - 1) / 2;
        }
|
||||
|
||||
// Compute support, mapping from the index into the kernel to offset into source.
|
||||
// Support consists of the column deltas of the kernels, as offsets from MpRowCol[row].
|
||||
IntVec support(kernelSize);
|
||||
std::vector<IntVec> kernelCoords(kernelSize);
|
||||
for (int idx = 0; idx < kernelSize; idx++)
|
||||
{
|
||||
kernelCoords[idx].resize(dimCount);
|
||||
int ivSrc = 0;
|
||||
int factor = 1;
|
||||
int cur = idx;
|
||||
for (size_t i = 0; i < dimCount; i++)
|
||||
{
|
||||
assert(cur >= 0);
|
||||
int d = (int)m_kernelShape[i];
|
||||
assert(d > 0);
|
||||
int coord = cur % d;
|
||||
cur /= d;
|
||||
kernelCoords[idx][i] = coord;
|
||||
ivSrc += factor * coord;
|
||||
factor *= (int)m_inputShape[i];
|
||||
}
|
||||
assert(cur == 0);
|
||||
assert(ivSrc < m_inputShape.GetNumElements());
|
||||
support[idx] = ivSrc - m_originIndex;
|
||||
}
|
||||
|
||||
size_t outputSize = m_outputShape.GetNumElements();
|
||||
// Compute the mappings (where row = output node index, col = source node index):
|
||||
// * from row to the index of the first weight to use for that row.
|
||||
// * from row to the first input col. The rest are col + _support[i].
|
||||
m_mpRowIwht.resize(outputSize);
|
||||
m_mpRowCol.resize(outputSize);
|
||||
m_mpRowRun.resize(outputSize);
|
||||
m_mpRowIndices.resize(outputSize);
|
||||
|
||||
// A "key" is an equivalence class of run/masks.
|
||||
// Calculate the key for an interior cell (for using all of support - when all masks are 1's).
|
||||
int keyInterior = 0;
|
||||
for (size_t i = 0; i < dimCount; i++)
|
||||
{
|
||||
int width = (int)m_kernelShape[i];
|
||||
keyInterior = keyInterior * width + (width - 1) / 2;
|
||||
}
|
||||
|
||||
m_runs.resize(2 * kernelSize + 2, -1);
|
||||
m_indices.resize(kernelSize + 1);
|
||||
m_runs[0] = 0; // Skip count
|
||||
m_runs[1] = (int)kernelSize; // Count of entries
|
||||
m_indices[0] = (int)kernelSize;
|
||||
for (size_t i = 0; i < kernelSize; i++)
|
||||
{
|
||||
m_runs[2 + i] = support[i];
|
||||
m_indices[1 + i] = support[i];
|
||||
}
|
||||
|
||||
// Working buffer for masks.
|
||||
IntVec masks(kernelSize);
|
||||
|
||||
// Map from key to pair of starting locations in Runs and Indices.
|
||||
std::map<int, std::pair<int, int>> mpkeystarts;
|
||||
mpkeystarts[keyInterior] = std::make_pair(0, 0);
|
||||
|
||||
IntVec dkey(dimCount);
|
||||
for (size_t row = 0; row < outputSize; row++)
|
||||
{
|
||||
// Compute the kernel number, column, and key.
|
||||
// REVIEW alexeyk: Seems like there should be a simpler and faster way, without starting
|
||||
// from scratch for each output (row)....
|
||||
int kern = 0;
|
||||
int col = 0;
|
||||
int factorKern = 1;
|
||||
int factorCol = 1;
|
||||
int key = 0;
|
||||
int cur = (int)row;
|
||||
for (size_t i = 0; i < dimCount; i++)
|
||||
{
|
||||
int dim = (int)(m_outputShape[i] / GetMapCount(i));
|
||||
int coord = cur % dim;
|
||||
cur /= dim;
|
||||
|
||||
// Kernel
|
||||
if (!GetSharing(i))
|
||||
{
|
||||
kern += factorKern * coord;
|
||||
factorKern *= dim;
|
||||
}
|
||||
|
||||
int maps = (int)GetMapCount(i);
|
||||
if (maps > 1)
|
||||
{
|
||||
kern += factorKern * (cur % maps);
|
||||
cur /= maps;
|
||||
factorKern *= maps;
|
||||
}
|
||||
|
||||
// Transform coord to input index space.
|
||||
coord *= (int)GetStride(i);
|
||||
coord += m_start[i];
|
||||
|
||||
col += factorCol * coord;
|
||||
factorCol *= (int)m_inputShape[i];
|
||||
|
||||
int width = (int)m_kernelShape[i];
|
||||
int half = (width - 1) / 2;
|
||||
int min = coord - half;
|
||||
int lim = min + width;
|
||||
if (min < 0)
|
||||
dkey[i] = min;
|
||||
else if (lim > m_inputShape[i])
|
||||
dkey[i] = lim - (int)m_inputShape[i];
|
||||
else
|
||||
dkey[i] = 0;
|
||||
int dk = dkey[i] + half;
|
||||
assert(0 <= dk);
|
||||
assert(dk < width);
|
||||
key = key * width + dk;
|
||||
}
|
||||
assert(cur == 0);
|
||||
assert(0 <= kern);
|
||||
assert(kern < m_kernelCount);
|
||||
assert(0 <= col);
|
||||
assert(col < m_inputShape.GetNumElements());
|
||||
|
||||
auto startsIter = mpkeystarts.find(key);
|
||||
if (startsIter == mpkeystarts.end())
|
||||
{
|
||||
auto starts = std::make_pair((int)m_runs.size(), (int)m_indices.size());
|
||||
mpkeystarts[key] = starts;
|
||||
|
||||
int indexCount = 0;
|
||||
for (int idx = 0; idx < kernelSize; idx++)
|
||||
{
|
||||
const auto& coords = kernelCoords[idx];
|
||||
int mask = 0;
|
||||
for (int i = (int)dimCount; ; )
|
||||
{
|
||||
if (--i < 0)
|
||||
{
|
||||
// All OK.
|
||||
mask = -1;
|
||||
break;
|
||||
}
|
||||
int k = dkey[i] + coords[i];
|
||||
if (k < 0)
|
||||
break;
|
||||
if (k >= m_kernelShape[i])
|
||||
break;
|
||||
}
|
||||
assert(mask == 0 || mask == -1);
|
||||
indexCount -= mask;
|
||||
masks[idx] = mask;
|
||||
}
|
||||
|
||||
int skip = 0;
|
||||
while (masks[skip] == 0)
|
||||
skip++;
|
||||
int count = (int)kernelSize;
|
||||
while (masks[count - 1] == 0)
|
||||
count--;
|
||||
|
||||
count -= skip;
|
||||
m_runs.push_back(skip); // Skip count
|
||||
m_runs.push_back(count); // Count of entries
|
||||
m_indices.push_back(indexCount);
|
||||
for (int i = 0, iMin = 0; i < count; i++)
|
||||
{
|
||||
int index = support[skip + i];
|
||||
int mask = masks[skip + i];
|
||||
if (mask != 0)
|
||||
{
|
||||
// Add "index" to runs for this slot and any immediately preceeding
|
||||
// slots that have mask == 0.
|
||||
assert(iMin <= i);
|
||||
assert(m_runs.size() == starts.first + 2 + iMin);
|
||||
for (; iMin <= i; iMin++)
|
||||
m_runs.push_back(index);
|
||||
assert(iMin == i + 1);
|
||||
assert(m_runs.size() == starts.first + 2 + iMin);
|
||||
|
||||
m_indices.push_back(index);
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < count; i++)
|
||||
m_runs.push_back(masks[skip + i]);
|
||||
assert(m_runs.size() == std::get<0>(starts) + 2 + 2 * count);
|
||||
assert(m_indices.size() == std::get<1>(starts) + 1 + indexCount);
|
||||
|
||||
m_mpRowRun[row] = starts.first;
|
||||
m_mpRowIndices[row] = starts.second;
|
||||
}
|
||||
else
|
||||
{
|
||||
m_mpRowRun[row] = (*startsIter).second.first;
|
||||
m_mpRowIndices[row] = (*startsIter).second.second;
|
||||
}
|
||||
assert(0 <= kern);
|
||||
assert(kern < m_kernelCount);
|
||||
m_mpRowCol[row] = col;
|
||||
m_mpRowIwht[row] = kern * (int)kernelSize;
|
||||
}
|
||||
}
|
||||
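
    // Illustrative example (not part of the original source, hypothetical 1-D sizes): for an
    // input of width 5 with a kernel of width 3, stride 1, auto-padding and a single shared map,
    // the ctor above computes:
    //   cells = (5 - 1) * 1 + 1 = 5, extra = 5 - 5 = 0, so m_start[0] = 0;
    //   m_originIndex = (3 - 1) / 2 = 1 (the "kernel-center" cell);
    //   kernelCoords = {0, 1, 2}, so support = {0 - 1, 1 - 1, 2 - 1} = {-1, 0, +1},
    // i.e. each output cell reads the input cell to its left, itself, and the one to its right.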

    size_t GetStride(size_t dim) const
    {
        assert(m_stride.size() == 1 || dim < m_stride.size());
        return m_stride[m_stride.size() == 1 ? 0 : dim];
    }

    size_t GetMapCount(size_t dim) const
    {
        assert(m_mapCount.size() == 1 || dim < m_mapCount.size());
        // If the whole map count tensor was specified explicitly - return requested component.
        if (m_mapCount.size() > 1)
            return m_mapCount[dim];
        // If map count tensor rank == 1 then assume it represents the number of feature maps for the rightmost dimension.
        if (dim == m_inputShape.size() - 1)
            return m_mapCount[0];
        return 1;
    }

    bool GetSharing(size_t dim) const
    {
        assert(m_sharing.size() == 1 || dim < m_sharing.size());
        return m_sharing[m_sharing.size() == 1 ? 0 : dim];
    }

    bool GetAutoPad(size_t dim) const
    {
        assert(m_autoPad.size() == 1 || dim < m_autoPad.size());
        return m_autoPad[m_autoPad.size() == 1 ? 0 : dim];
    }

    int GetLowerPad(size_t dim) const
    {
        if (!GetAutoPad(dim))
            return (int)m_lowerPad[m_lowerPad.size() == 1 ? 0 : dim];

        int kernSize = (int)m_kernelShape[dim];
        int inpSize = (int)m_inputShape[dim];
        int outSize = (int)m_outputShape[dim];
        int stride = (int)GetStride(dim);

        // Taken from computation in ConvolveGeometry ctor.
        // Number of cells between first and last "centers", inclusive.
        int cells = (outSize - 1) * stride + 1;
        // Extra cells, to the left and right of "cells".
        int extra = inpSize - cells;
        int center = extra / 2;
        return -(center - (kernSize - 1) / 2);
    }

    static TensorShape ComputeOutputShape(const TensorShape& inputShape, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& stride,
                                          const BoolVec& sharing, const BoolVec& autoPad, const TensorShape& lowerPad, const TensorShape& upperPad)
    {
        if (inputShape.GetRank() != kernelShape.GetRank())
            InvalidArgument("Convolution input and kernel tensors must have the same rank.");
        if (mapCount.GetRank() != 1 && inputShape.GetRank() != mapCount.GetRank())
            InvalidArgument("Convolution map tensor must have rank 1 or the same as the input tensor.");
        if (stride.GetRank() != 1 && inputShape.GetRank() != stride.GetRank())
            InvalidArgument("Convolution stride tensor must have rank 1 or the same as the input tensor.");
        if (sharing.size() != 1 && inputShape.GetRank() != sharing.size())
            InvalidArgument("Convolution sharing tensor must have rank 1 or the same as the input tensor.");
        if (autoPad.size() != 1 && inputShape.GetRank() != autoPad.size())
            InvalidArgument("Convolution padding tensor must have rank 1 or the same as the input tensor.");
        if (lowerPad.GetRank() != 1 && inputShape.GetRank() != lowerPad.GetRank())
            InvalidArgument("Convolution lower pad tensor must have rank 1 or the same as the input tensor.");
        if (upperPad.GetRank() != 1 && inputShape.GetRank() != upperPad.GetRank())
            InvalidArgument("Convolution upper pad tensor must have rank 1 or the same as the input tensor.");

        SmallVector<size_t> dimsOutput(inputShape.GetRank());
        for (size_t i = 0; i < inputShape.GetRank(); i++)
        {
            assert(inputShape[i] >= 1);
            if (kernelShape[i] > inputShape[i])
                InvalidArgument("Convolution operation requires that kernel dim %d <= input dim %d.", (int)kernelShape[i], (int)inputShape[i]);

            size_t delta = stride[stride.GetRank() == 1 ? 0 : i];
            size_t dim = inputShape[i];
            bool autoPadCur = autoPad[autoPad.size() == 1 ? 0 : i];
            size_t lo = lowerPad[lowerPad.size() == 1 ? 0 : i];
            size_t hi = upperPad[upperPad.size() == 1 ? 0 : i];
            if (autoPadCur)
            {
                dim += kernelShape[i] - 1;
            }
            else
            {
                dim += lo + hi;
            }
            size_t dimOut = (dim - kernelShape[i]) / delta + 1;
            // When LowerPad and/or UpperPad are specified (i.e. > 0), we insist that the kernel applications
            // fill the entire space.
            if (!autoPadCur && (lo > 0 || hi > 0))
            {
                size_t size = (dimOut - 1) * delta + kernelShape[i];
                if (size != dim)
                    InvalidArgument("Convolution requires that kernel fills the entire space if auto-padding is disabled.");
            }
            if (mapCount.size() > 1)
                dimOut *= mapCount[i];
            else if (i == inputShape.GetRank() - 1)
                dimOut *= mapCount[0];

            dimsOutput[i] = dimOut;
        }

        auto dimsOut = TensorShape(dimsOutput);
        // Check the output dimensions.
        size_t mapCountTotal = mapCount.GetNumElements();
        size_t sizeOut = dimsOut.GetNumElements();
        assert((sizeOut % mapCountTotal) == 0);
        UNUSED(mapCountTotal);
        UNUSED(sizeOut);

        return dimsOut;
    }
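
    // Worked example (hypothetical values, for illustration only): for one dimension with
    // inputShape[i] = 28, kernelShape[i] = 5, stride = 2 and auto-padding enabled:
    //   dim    = 28 + (5 - 1) = 32
    //   dimOut = (32 - 5) / 2 + 1 = 14
    // With auto-padding disabled and lowerPad = upperPad = 0 instead:
    //   dimOut = (28 - 5) / 2 + 1 = 12 (the "fill the entire space" check is skipped since both pads are 0).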

    // Used in unit tests and during debugging.
    operator std::string() const
    {
        std::ostringstream res;
        res << "Input: " << (string)InputShape();
        res << ", Output: " << (string)OutputShape();
        res << ", Kernel: " << (string)KernelShape();
        res << ", Map: " << (string)MapCount();
        res << ", Stride: " << (string)Stride();
        res << ", Sharing: (";
        std::copy(begin(Sharing()), end(Sharing()) - 1, std::ostream_iterator<bool>(res, ", "));
        res << Sharing().back() << ")";
        res << ", AutoPad: (";
        std::copy(begin(AutoPad()), end(AutoPad()) - 1, std::ostream_iterator<bool>(res, ", "));
        res << AutoPad().back() << ")";
        res << ", LowerPad: " << (string)LowerPad();
        res << ", UpperPad: " << (string)UpperPad();
        return res.str();
    }

    DISABLE_COPY_AND_MOVE(ConvolveGeometry);

private:
    TensorShape m_inputShape;
    TensorShape m_outputShape;
    TensorShape m_kernelShape;
    TensorShape m_mapCount;
    TensorShape m_stride;
    BoolVec m_sharing;
    BoolVec m_autoPad;
    TensorShape m_lowerPad;
    TensorShape m_upperPad;

    // There are several reasons why int type is used here rather than size_t:
    // 1. Many of these vectors contain offsets which can be negative.
    // 2. Most of these vectors will be copied into device memory (GPU) so the smaller the size - the better.
    //    Also, 64-bit operations are slower on GPU.
    // 3. If you are still not convinced, we don't expect convolutions to be more than 2B in size anyway.
    // See description of corresponding getter functions to understand what these are.
    IntVec m_mpRowCol;
    IntVec m_mpRowIwht;
    IntVec m_mpRowRun;
    IntVec m_runs;
    IntVec m_mpRowIndices;
    IntVec m_indices;
    // The indices of the first ("top-left-most") "kernel-center" cell in the source.
    IntVec m_start;
    int m_startIndex;
    // When the first kernel cell is aligned with the first source cell, this is the index of the input cell that
    // is aligned with the "kernel-center" cell. Indices in "Runs" and "Indices" are relative to OriginIndex.
    int m_originIndex;

    size_t m_kernelCount;
};

using ConvolveGeometryPtr = std::shared_ptr<ConvolveGeometry>;

} } }
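For orientation, a minimal sketch of how a caller might construct the geometry above. This is not part of the commit; the exact TensorShape and BoolVec constructor calls are assumptions for illustration only.

// Hypothetical usage sketch: 2-D convolution over a 28x28x1 input with a 5x5 kernel,
// 32 output maps, stride 1 and auto-padding in every dimension.
auto geometry = std::make_shared<ConvolveGeometry>(
    TensorShape(28, 28, 1),          // input (W, H, C), column-major
    TensorShape(5, 5, 1),            // kernel
    TensorShape(32),                 // map count (rank 1 -> applies to the rightmost dim)
    TensorShape(1),                  // stride (rank 1 -> broadcast to all dims)
    std::vector<bool>{true},         // sharing
    std::vector<bool>{true},         // autoPad
    TensorShape(0), TensorShape(0)); // lowerPad, upperPad
// geometry->OutputShape() would then be 28 x 28 x 32, per ComputeOutputShape above.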
@@ -0,0 +1,173 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#include "stdafx.h"
#include "CuDnnFactories.h"
#include "BatchNormalizationEngine.h"
#include "CuDnnCommon.h"
#include "GPUMatrix.h"

namespace Microsoft { namespace MSR { namespace CNTK {

template <class ElemType>
class CuDnnBatchNormEngine : public BatchNormEngine<ElemType>
{
public:
    using Base = BatchNormEngine<ElemType>;
    using typename Base::Mat;

public:
    CuDnnBatchNormEngine(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
                         bool spatial, ImageLayoutKind imageLayout)
        : Base(deviceId, inOutT, spatial, imageLayout),
          m_cudnn(CuDnn::Instance()),
          m_inOutCuDnnT(GetInOutTensor(inOutT), CuDnnTensor::GetDataType<ElemType>()),
          m_scaleBiasCuDnnT(GetScaleBiasTensor(inOutT, spatial), CuDnnTensor::GetDataType<ElemType>())
    {
    }

protected:
    using Base::m_deviceId;
    using Base::m_imageLayout;
    using Base::m_inOutT;
    using Base::m_spatial;

    void EnsureCompatible() override
    {
        if (m_spatial && m_imageLayout == ImageLayoutKind::HWC)
            InvalidArgument("cuDNN batch normalization supports only cudnn(CHW) layout.");
        if (m_inOutT.GetRank() > 4)
            InvalidArgument("cuDNN batch normalization supports tensors of max 4 dimensions.");
    }

    void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
                     Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override
    {
        // REVIEW alexeyk: there might be a way to do this in cuDNN.
        if (blendFactor != 0 && (blendFactor != 1 || expAvgFactor > 0))
            InvalidArgument("cuDNN batch normalization engine currently supports blendTimeConstant of 0 or 1 only.");

        m_inOutCuDnnT.UpdateBatchSize(in.GetNumCols());
        cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
        // cuDNN will fail with BAD_PARAM if epsilon < CUDNN_BN_MIN_EPSILON.
        epsilon = max(epsilon, CUDNN_BN_MIN_EPSILON);
        // expAvgFactor == 0 && blendFactor == 1 means we are in eval mode.
        if (expAvgFactor == 0 && blendFactor == 1)
        {
            CUDNN_CALL(cudnnBatchNormalizationForwardInference(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(out),
                                                               m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runInvStdDev), epsilon));
        }
        else
        {
            CUDNN_CALL(cudnnBatchNormalizationForwardTraining(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in),
                                                              m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev),
                                                              epsilon, ptr(saveMean), ptr(saveInvStdDev)));
        }
    }

    void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
                      Mat& scaleGrad, Mat& biasGrad) override
    {
        m_inOutCuDnnT.UpdateBatchSize(srcGrad.GetNumCols());
        cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
        // REVIEW alexeyk: remove once Philly is upgraded to prod version. Also change betaParamDiff to 1 and update CNTK BN engine.
#if CUDNN_PATCHLEVEL >= 7
        CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
                                                   m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
#else
        CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
                                                   m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
#endif
    }

private:
    static ElemType* ptr(Mat& src)
    {
        return src.BufferPointer();
    }
    static const ElemType* ptr(const Mat& src)
    {
        return src.BufferPointer();
    }

    static TensorShape GetInOutTensor(const TensorShape& inOutT)
    {
        // cuDNN supports only 3D and 4D tensors (in cuDNN docs it's 4D and 5D due to the N dimension)
        // even for non-spatial inputs, so expand the tensor if needed.
        if (inOutT.GetRank() > 2)
            return inOutT;
        SmallVector<size_t> v(std::max(inOutT.GetRank(), (size_t)3), 1);
        for (size_t i = 0; i < inOutT.GetRank(); i++)
            v[i] = inOutT[i];
        return TensorShape(v);
    }

    static TensorShape GetScaleBiasTensor(const TensorShape& inOutT, bool spatial)
    {
        if (!spatial)
            return GetInOutTensor(inOutT);

        const auto& t = GetInOutTensor(inOutT);
        SmallVector<size_t> v(t.GetRank(), 1);
        v[v.size() - 1] = t[t.GetRank() - 1];
        return TensorShape(v);
    }

private:
    using C = Consts<ElemType>;

    CuDnn::ptr_t m_cudnn;
    CuDnnTensor m_inOutCuDnnT;
    CuDnnTensor m_scaleBiasCuDnnT;
};

template class CuDnnBatchNormEngine<float>;
template class CuDnnBatchNormEngine<double>;

template <typename ElemType>
std::unique_ptr<BatchNormEngine<ElemType>> CuDnnBatchNormEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
                                                                                         bool spatial, ImageLayoutKind imageLayout)
{
    return std::make_unique<CuDnnBatchNormEngine<ElemType>>(deviceId, inOutT, spatial, imageLayout);
}

template class CuDnnBatchNormEngineFactory<float>;
template class CuDnnBatchNormEngineFactory<double>;

CudaTimer::~CudaTimer()
{
    // TODO: Should not throw if std::uncaught_exception()
    if (m_start != nullptr)
        CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_start)));
    if (m_stop != nullptr)
        CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_stop)));
}
void CudaTimer::Start()
{
    cudaEvent_t start;
    cudaEvent_t stop;
    if (m_start != nullptr)
        CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_start)));
    if (m_stop != nullptr)
        CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_stop)));
    CUDA_CALL(cudaEventCreate(&start));
    CUDA_CALL(cudaEventCreate(&stop));
    m_start = start;
    m_stop = stop;
    CUDA_CALL(cudaEventRecord(start, GetStream()));
}
void CudaTimer::Stop()
{
    CUDA_CALL(cudaEventRecord(reinterpret_cast<cudaEvent_t>(m_stop), GetStream()));
    CUDA_CALL(cudaEventSynchronize(reinterpret_cast<cudaEvent_t>(m_stop)));
}
float CudaTimer::Elapsed()
{
    float ms;
    CUDA_CALL(cudaEventElapsedTime(&ms, reinterpret_cast<cudaEvent_t>(m_start), reinterpret_cast<cudaEvent_t>(m_stop)));
    return ms;
}

} } }
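A quick recap of how ForwardCore above dispatches on its two blending parameters; this restates the code for clarity and adds nothing new.

// Parameter combinations accepted by CuDnnBatchNormEngine::ForwardCore:
//   expAvgFactor == 0 && blendFactor == 1 -> inference path (uses running mean/invstddev only);
//   blendFactor == 0 (any expAvgFactor)   -> training path (minibatch statistics, running
//                                            estimates updated with weight expAvgFactor);
//   any other blendFactor                 -> rejected with InvalidArgument.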
@@ -0,0 +1,108 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#include "stdafx.h"
#include "GPUMatrix.h"
#include "CuDnnCommon.h"

namespace Microsoft { namespace MSR { namespace CNTK {

template <>
const float Consts<float>::One = 1;
template <>
const double Consts<double>::One = 1;
template <>
const float Consts<float>::Zero = 0;
template <>
const double Consts<double>::Zero = 0;

CuDnnTensor::CuDnnTensor(const TensorShape& src, cudnnDataType_t dataType)
    : m_tensor(nullptr)
{
    CUDNN_CALL(cudnnCreateTensorDescriptor(&m_tensor));
    // Set cuDNN tensor dimensions. cuDNN uses row-major format while TensorShape is column-major,
    // so conversion is required. N dimension will be set to 1.
    const auto& stridesSrc = src.GetStrides();
    SmallVector<int> dims(src.GetRank() + 1);
    SmallVector<int> strides(stridesSrc.size() + 1);
    assert(dims.size() == strides.size());
    for (int i = 0; i < src.GetRank(); i++)
    {
        dims[dims.size() - 1 - i] = (int)src[i];
        strides[dims.size() - 1 - i] = (int)stridesSrc[i];
    }
    // Set "minibatch" (aka N) dimension.
    dims[0] = 1;
    strides[0] = strides[1] * dims[1];
    CUDNN_CALL(cudnnSetTensorNdDescriptor(m_tensor, dataType, (int)dims.size(), dims.data(), strides.data()));
}
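
// Example (hypothetical shape, for illustration only): a column-major TensorShape
// (W, H, C) = (28, 28, 32) with strides (1, 28, 784) becomes the row-major cuDNN descriptor
//   dims    = {N, C, H, W} = {1, 32, 28, 28}
//   strides = {25088, 784, 28, 1}
// where strides[0] = strides[1] * dims[1] = 784 * 32 = 25088 for the N dimension set above.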

CuDnnTensor::~CuDnnTensor()
{
    if (m_tensor != nullptr)
    {
        cudnnDestroyTensorDescriptor(m_tensor);
        m_tensor = nullptr;
    }
}

void CuDnnTensor::UpdateBatchSize(size_t batchSize)
{
    // Currently cuDNN supports only 2D and 3D convolutions anyway (so max 5D tensors).
    const int MaxDims = 5;
    int dims[MaxDims];
    int strides[MaxDims];
    int nbDims = 0;
    cudnnDataType_t dataType;
    // According to NVIDIA, Get/Set functions are very fast so it's safe to call them in a loop.
    CUDNN_CALL(cudnnGetTensorNdDescriptor(m_tensor, MaxDims, &dataType, &nbDims, dims, strides));
    assert(nbDims <= MaxDims);
    dims[0] = (int)batchSize;
    CUDNN_CALL(cudnnSetTensorNdDescriptor(m_tensor, dataType, nbDims, dims, strides));
}

template <typename ElemType>
cudnnDataType_t CuDnnTensor::GetDataType()
{
    if (typeid(ElemType) == typeid(float))
        return CUDNN_DATA_FLOAT;
    else if (typeid(ElemType) == typeid(double))
        return CUDNN_DATA_DOUBLE;
    else
        InvalidArgument("cuDNN engine currently supports only single and double precision data types.");
}

template cudnnDataType_t CuDnnTensor::GetDataType<float>();
template cudnnDataType_t CuDnnTensor::GetDataType<double>();

CuDnn::ptr_t CuDnn::Instance()
{
    auto createNew = []()
    {
        int deviceId;
        CUDA_CALL(cudaGetDevice(&deviceId));
        cudaDeviceProp props = {0};
        if (cudaGetDeviceProperties(&props, deviceId) != cudaSuccess || props.major < 3)
            RuntimeError("cuDNN requires device with compute capability 3.0 or higher.");
        cudnnHandle_t* cudnn = new cudnnHandle_t;
        CUDNN_CALL(cudnnCreate(cudnn));
        CUDNN_CALL(cudnnSetStream(*cudnn, GetStream()));
        return cudnn;
    };

    static std::shared_ptr<cudnnHandle_t> m_instance = std::shared_ptr<cudnnHandle_t>(createNew(), [](cudnnHandle_t* src)
    {
        assert(*src != nullptr);
        auto err = cudnnDestroy(*src);
        assert(err == CUDNN_STATUS_SUCCESS);
#ifdef NDEBUG
        UNUSED(err);
#endif
        delete src;
    });
    return m_instance;
}

} } }
@@ -0,0 +1,49 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#pragma once

#include "Basics.h"
#include "TensorShape.h"
#include <cudnn.h>
#include <memory>

namespace Microsoft { namespace MSR { namespace CNTK {

class CuDnnTensor final
{
public:
    CuDnnTensor(const TensorShape& src, cudnnDataType_t dataType);
    ~CuDnnTensor();

    void UpdateBatchSize(size_t batchSize);

    operator cudnnTensorDescriptor_t() const { return m_tensor; }

    template <typename ElemType>
    static cudnnDataType_t GetDataType();

    DISABLE_COPY_AND_MOVE(CuDnnTensor);

private:
    cudnnTensorDescriptor_t m_tensor;
};

struct CuDnn final
{
    using ptr_t = std::shared_ptr<cudnnHandle_t>;
    static ptr_t Instance();

    DISABLE_COPY_AND_MOVE(CuDnn);
};

template <typename ElemType>
struct Consts
{
    static const ElemType Zero;
    static const ElemType One;
};

} } }
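A minimal sketch of how the two types declared above are used together, matching the call sites in the engines earlier in this commit. The shape and minibatch-size values are hypothetical, and cudnnSomeOp stands in for any real cuDNN call.

// Hedged usage sketch (illustration only):
auto cudnn = CuDnn::Instance();                  // shared, lazily created cudnnHandle_t
CuDnnTensor inT(inputShape, CuDnnTensor::GetDataType<float>());
inT.UpdateBatchSize(minibatchSize);              // patch the N dimension before each call
// CUDNN_CALL(cudnnSomeOp(*cudnn, ..., inT /* implicit cudnnTensorDescriptor_t */, ...));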
@@ -4,11 +4,11 @@
//

#include "stdafx.h"
#include "CuDnnConvolutionEngine.h"
#include "CuDnnFactories.h"
#include "GPUMatrix.h"
#ifdef USE_CUDNN
#include <cudnn.h>
#include "CuDnnConvolutionEngine.cuh"
#include <typeinfo>
#include <typeindex>
#include "CuDnnCommon.h"

template <>
const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t x)
@@ -16,287 +16,177 @@ const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t x)
    return cudnnGetErrorString(x);
}

// A note on the formats: CNTK originally used NHWC for input/output tensors and CHWN for filters.
// A note on the formats: CNTK originally used NHWC for input/output tensors and CHWN for kernels.
// Such formats have very limited support in cuDNN and are not used in other frameworks.
// CNTK with cuDNN by default uses NCHW formats for both inputs/outputs and filters.
// CNTK with cuDNN by default uses NCHW formats for both inputs/outputs and kernels.
#define TENSOR_FORMAT CUDNN_TENSOR_NCHW
#define FILTER_FORMAT CUDNN_TENSOR_NCHW
#endif

namespace Microsoft { namespace MSR { namespace CNTK {

template <class ElemType>
bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE deviceId)
{
    // REVIEW alexeyk: compile-time for now, make runtime, config-driven.
#ifdef USE_CUDNN
    cudaDeviceProp props = {0};
    return cudaGetDeviceProperties(&props, deviceId) == cudaSuccess && props.major >= 3;
#else
    UNUSED(deviceId);
    return false;
#endif
}

CudaTimer::~CudaTimer()
{
    // TODO: Should not throw if std::uncaught_exception()
    if (m_start != nullptr)
        CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_start)));
    if (m_stop != nullptr)
        CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_stop)));
}
void CudaTimer::Start()
{
    cudaEvent_t start;
    cudaEvent_t stop;
    if (m_start != nullptr)
        CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_start)));
    if (m_stop != nullptr)
        CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_stop)));
    CUDA_CALL(cudaEventCreate(&start));
    CUDA_CALL(cudaEventCreate(&stop));
    m_start = start;
    m_stop = stop;
    CUDA_CALL(cudaEventRecord(start, GetStream()));
}
void CudaTimer::Stop()
{
    CUDA_CALL(cudaEventRecord(reinterpret_cast<cudaEvent_t>(m_stop), GetStream()));
    CUDA_CALL(cudaEventSynchronize(reinterpret_cast<cudaEvent_t>(m_stop)));
}
float CudaTimer::Elapsed()
{
    float ms;
    CUDA_CALL(cudaEventElapsedTime(&ms, reinterpret_cast<cudaEvent_t>(m_start), reinterpret_cast<cudaEvent_t>(m_stop)));
    return ms;
}

#ifdef USE_CUDNN

static bool IsGpu(DEVICEID_TYPE deviceId)
{
    return deviceId >= 0;
}

class CuDnnTensor4D : public ConvolutionTensor4D
class CuDnnKernel
{
public:
    CuDnnTensor4D(size_t w, size_t h, size_t c, size_t n, cudnnDataType_t dataType)
        : ConvolutionTensor4D(w, h, c, n), m_dataType(dataType), m_tensor(nullptr)
    CuDnnKernel(const ConvolveGeometry& geometry, cudnnDataType_t dataType)
        : m_kernel(nullptr)
    {
        CUDNN_CALL(cudnnCreateTensorDescriptor(&m_tensor));
        CUDNN_CALL(cudnnSetTensor4dDescriptor(m_tensor, TENSOR_FORMAT, dataType,
                                              static_cast<int>(n), static_cast<int>(c), static_cast<int>(h), static_cast<int>(w)));
        CUDNN_CALL(cudnnCreateFilterDescriptor(&m_kernel));
        // Set cuDNN kernel dimensions. cuDNN uses row-major format while TensorShape is column-major,
        // so conversion is required.
        const auto& filt = geometry.KernelShape();
        size_t mapCount = geometry.GetMapCount(geometry.InputShape().GetRank() - 1);
        if (mapCount != geometry.MapCount().GetNumElements())
            InvalidArgument("cuDNN does not support map tensor of this configuration.");
        SmallVector<int> dims(filt.GetRank() + 1);
        for (int i = 0; i < filt.GetRank(); i++)
            dims[dims.size() - 1 - i] = (int)filt[i];
        // Set map count (aka K) dimension.
        dims[0] = (int)mapCount;
        CUDNN_CALL(cudnnSetFilterNdDescriptor_v4(m_kernel, dataType, FILTER_FORMAT, (int)dims.size(), dims.data()));
    }

public:
    operator cudnnTensorDescriptor_t() const
    ~CuDnnKernel()
    {
        return m_tensor;
    }

    ~CuDnnTensor4D() noexcept
    {
        if (m_tensor != nullptr)
        if (m_kernel != nullptr)
        {
            // TODO: Check for error code and throw if !std::uncaught_exception()
            cudnnDestroyTensorDescriptor(m_tensor);
            m_tensor = nullptr;
            cudnnDestroyFilterDescriptor(m_kernel);
            m_kernel = nullptr;
        }
    }

    void setN(size_t newN) override
    {
        ConvolutionTensor4D::setN(newN);
        CUDNN_CALL(cudnnSetTensor4dDescriptor(m_tensor, TENSOR_FORMAT, m_dataType,
                                              static_cast<int>(n()), static_cast<int>(c()), static_cast<int>(h()), static_cast<int>(w())));
    }

private:
    cudnnDataType_t m_dataType;
    cudnnTensorDescriptor_t m_tensor;
};

class CuDnnFilter : public ConvolutionFilter
{
public:
    CuDnnFilter(size_t w, size_t h, size_t c, size_t k, cudnnDataType_t dataType)
        : ConvolutionFilter(w, h, c, k), m_filter(nullptr)
    {
        CUDNN_CALL(cudnnCreateFilterDescriptor(&m_filter));
        CUDNN_CALL(cudnnSetFilter4dDescriptor_v4(m_filter, dataType, FILTER_FORMAT,
                                                 static_cast<int>(k), static_cast<int>(c), static_cast<int>(h), static_cast<int>(w)));
    }

public:
    operator cudnnFilterDescriptor_t() const
    {
        return m_filter;
        return m_kernel;
    }

    ~CuDnnFilter() noexcept
    {
        if (m_filter != nullptr)
        {
            // TODO: Check for error code and throw if !std::uncaught_exception()
            cudnnDestroyFilterDescriptor(m_filter);
            m_filter = nullptr;
        }
    }
    DISABLE_COPY_AND_MOVE(CuDnnKernel);

private:
    cudnnFilterDescriptor_t m_filter;
    cudnnFilterDescriptor_t m_kernel;
};

class CuDnnConvolutionDescriptor : public ConvolutionDescriptor
class CuDnnConv
{
public:
    CuDnnConvolutionDescriptor(size_t wStride, size_t hStride, size_t wPad, size_t hPad)
        : ConvolutionDescriptor(wStride, hStride, wPad > 0 || hPad > 0), m_conv(nullptr)
    CuDnnConv(const ConvolveGeometry& geometry, cudnnDataType_t dataType)
        : m_conv(nullptr)
    {
        CUDNN_CALL(cudnnCreateConvolutionDescriptor(&m_conv));
        CUDNN_CALL(cudnnSetConvolution2dDescriptor(m_conv,
                                                   static_cast<int>(hPad), static_cast<int>(wPad),
                                                   static_cast<int>(hStride), static_cast<int>(wStride),
                                                   1, 1, CUDNN_CROSS_CORRELATION));
        // Set cuDNN convolution parameters. cuDNN uses row-major format while TensorShape is column-major,
        // so conversion is required. Also, for 2D convolutions (which have 3D tensor shapes)
        // cuDNN uses 2D descriptors while for 3D convolutions - 3D, so we need to ignore the
        // rightmost dimension in ConvolveGeometry tensors.
        SmallVector<int> stride(geometry.InputShape().GetRank() - 1);
        SmallVector<int> pad(stride.size());
        for (int i = 0; i < stride.size(); i++)
        {
            stride[stride.size() - 1 - i] = (int)geometry.GetStride(i);
            pad[stride.size() - 1 - i] = geometry.GetLowerPad(i);
        }
        SmallVector<int> upscale(stride.size(), 1);
        CUDNN_CALL(cudnnSetConvolutionNdDescriptor(m_conv, (int)stride.size(), pad.data(),
                                                   stride.data(), upscale.data(),
                                                   CUDNN_CROSS_CORRELATION, dataType));
    }

public:
    operator cudnnConvolutionDescriptor_t() const
    {
        return m_conv;
    }

    ~CuDnnConvolutionDescriptor() noexcept
    ~CuDnnConv()
    {
        if (m_conv != nullptr)
        {
            // TODO: Check for error code and throw if !std::uncaught_exception()
            cudnnDestroyConvolutionDescriptor(m_conv);
            m_conv = nullptr;
        }
    }

    operator cudnnConvolutionDescriptor_t() const
    {
        return m_conv;
    }

    DISABLE_COPY_AND_MOVE(CuDnnConv);

private:
    cudnnConvolutionDescriptor_t m_conv;
};

class CuDnnPoolingDescriptor : public PoolingDescriptor
class CuDnnPool
{
public:
    CuDnnPoolingDescriptor(PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad)
        : PoolingDescriptor(kind, w, h, wStride, hStride, wPad, hPad), m_pool(nullptr)
    CuDnnPool(const ConvolveGeometry& geometry, PoolKind kind)
        : m_pool(nullptr)
    {
        assert(kind == PoolKind::Max || kind == PoolKind::Average);

        CUDNN_CALL(cudnnCreatePoolingDescriptor(&m_pool));
        CUDNN_CALL(cudnnSetPooling2dDescriptor(m_pool,
        // Set cuDNN pooling parameters. cuDNN uses row-major format while TensorShape is column-major,
        // so conversion is required. Same as in the convolution descriptor, cuDNN uses 2D descriptors
        // for 3D inputs.
        SmallVector<int> dims(geometry.InputShape().GetRank() - 1);
        SmallVector<int> stride(dims.size());
        SmallVector<int> pad(stride.size());
        int j = (int)dims.size() - 1;
        for (int i = 0; i < stride.size(); i++, j--)
        {
            dims[j] = (int)geometry.KernelShape()[i];
            stride[j] = (int)geometry.GetStride(i);
            pad[j] = geometry.GetLowerPad(i);
        }

        // Must use CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING to get the same results as in reference engine.
        CUDNN_CALL(cudnnSetPoolingNdDescriptor(m_pool,
                                               kind == PoolKind::Max ? CUDNN_POOLING_MAX : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING,
                                               static_cast<int>(h), static_cast<int>(w),
                                               static_cast<int>(hPad), static_cast<int>(wPad),
                                               static_cast<int>(hStride), static_cast<int>(wStride)));
                                               (int)dims.size(), dims.data(), pad.data(), stride.data()));
    }

public:
    operator cudnnPoolingDescriptor_t() const
    {
        return m_pool;
    }

    ~CuDnnPoolingDescriptor() noexcept
    ~CuDnnPool()
    {
        if (m_pool != nullptr)
        {
            // TODO: Check for error code and throw if !std::uncaught_exception()
            cudnnDestroyPoolingDescriptor(m_pool);
            m_pool = nullptr;
        }
    }

    operator cudnnPoolingDescriptor_t() const
    {
        return m_pool;
    }

    DISABLE_COPY_AND_MOVE(CuDnnPool);

private:
    cudnnPoolingDescriptor_t m_pool;
};

template <typename CuDnnT, typename In>
static CuDnnT& As(In& src)
{
    // Do dynamic_cast only in debug builds and static_cast in release builds.
    assert(dynamic_cast<CuDnnT*>(&src) != nullptr);
    return static_cast<CuDnnT&>(src);
}
static const CuDnnTensor4D& t(const ConvolutionTensor4D& src)
{
    return As<const CuDnnTensor4D>(src);
}
static const CuDnnFilter& f(const ConvolutionFilter& src)
{
    return As<const CuDnnFilter>(src);
}
static const CuDnnConvolutionDescriptor& cd(const ConvolutionDescriptor& src)
{
    return As<const CuDnnConvolutionDescriptor>(src);
}
static const CuDnnPoolingDescriptor& p(const PoolingDescriptor& src)
{
    return As<const CuDnnPoolingDescriptor>(src);
}
template <typename ElemType>
static ElemType* ptr(Matrix<ElemType>& src)
{
    return src.BufferPointer();
}
template <typename ElemType>
static const ElemType* ptr(const Matrix<ElemType>& src)
{
    return src.BufferPointer();
}

template <typename ElemType>
struct Consts
{
    static const ElemType Zero;
    static const ElemType One;
};
template <>
const float Consts<float>::One = 1;
template <>
const double Consts<double>::One = 1;
template <>
const float Consts<float>::Zero = 0;
template <>
const double Consts<double>::Zero = 0;

template <typename ElemType>
template <class ElemType>
class CuDnnConvolutionEngine : public ConvolutionEngine<ElemType>
{
public:
    using Base = ConvolutionEngine<ElemType>;
    using typename Base::Mat;
    using typename Base::Tensor4D;
    using typename Base::Filter;
    using typename Base::ConvDesc;

    CuDnnConvolutionEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl)
        : Base(deviceId, imageLayout), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_bnImpl(bnImpl), m_stream(GetStream()), m_cudnn(nullptr)
public:
    CuDnnConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout,
                           size_t maxTempMemSizeInSamples, PoolKind poolKind)
        : Base(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind),
          m_cudnn(CuDnn::Instance()),
          m_dataType(CuDnnTensor::GetDataType<ElemType>()),
          m_inT(geometry->InputShape(), m_dataType),
          m_outT(geometry->OutputShape(), m_dataType)
    {
        CUDNN_CALL(cudnnCreate(&m_cudnn));
        CUDNN_CALL(cudnnSetStream(m_cudnn, m_stream));
    }

    ~CuDnnConvolutionEngine()
    {
        if (m_cudnn != nullptr)
        {
            // TODO: Check for error code and throw if !std::uncaught_exception()
            cudnnDestroy(m_cudnn);
            m_cudnn = nullptr;
        }
    }

protected:
    using Base::m_geometry;
    using Base::m_deviceId;
    using Base::m_imageLayout;
    using Base::m_maxTempMemSizeInSamples;
    using Base::m_poolKind;

    void EnsureCompatible() override
    {
@@ -306,26 +196,39 @@ protected:
            RuntimeError("cuDNN convolution engine supports GPU devices only.");
    }

    void ForwardCore(const Tensor4D& inT, const Mat& in, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
                     const Tensor4D& outT, Mat& out, Mat& workspace) override
    void EnsureConvolutionInitialized() override
    {
        // Find best algo and allocate temp buffer, if needed.
        auto finder = [&](int& calgo, cudnnConvolutionFwdAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
        if (m_kernelT == nullptr)
        {
            return cudnnFindConvolutionForwardAlgorithm(m_cudnn, t(inT), f(filterT), cd(convDesc), t(outT), MaxAlgoCount, &calgo, algoPerf);
            m_kernelT = std::make_unique<CuDnnKernel>(*m_geometry, m_dataType),
            m_conv = std::make_unique<CuDnnConv>(*m_geometry, m_dataType);
        }
    }

    void ForwardCore(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace) override
    {
        size_t batchSize = in.GetNumCols();
        // Find best algo and allocate temp buffer, if needed.
        auto finder = [this](int& calgo, cudnnConvolutionFwdAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
        {
            return cudnnFindConvolutionForwardAlgorithm(*m_cudnn, m_inT, *m_kernelT, *m_conv, m_outT, MaxAlgoCount, &calgo, algoPerf);
        };
        FindBestAlgo(t(inT), m_fwdAlgo, finder);
        auto staticFinder = [this](cudnnConvolutionFwdAlgo_t& algo) -> cudnnStatus_t
        {
            return cudnnGetConvolutionForwardAlgorithm(*m_cudnn, m_inT, *m_kernelT, *m_conv, m_outT, CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, 0, &algo);
        };
        FindBestAlgo(batchSize, m_fwdAlgo, finder, staticFinder);
        if (m_fwdAlgo.Algo.memory > 0)
            workspace.Resize((m_fwdAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
        // Perform forward convolution operation.
        auto err = cudnnConvolutionForward(m_cudnn, &C::One, t(inT), ptr(in), f(filterT), ptr(filter), cd(convDesc),
                                           m_fwdAlgo.Algo.algo, ptr(workspace), m_fwdAlgo.Algo.memory, &C::Zero, t(outT), ptr(out));
        auto err = cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv,
                                           m_fwdAlgo.Algo.algo, ptr(workspace), m_fwdAlgo.Algo.memory, &C::Zero, m_outT, ptr(out));
        // There might be a case where cuDNN fails due to workspace being too small, try using no-workspace algo instead.
        // REVIEW alexeyk: NVIDIA is currently reviewing this issue.
        if (CUDNN_STATUS_INVALID_VALUE == err && m_fwdAlgo.Algo.memory > 0)
        {
            auto err2 = cudnnConvolutionForward(m_cudnn, &C::One, t(inT), ptr(in), f(filterT), ptr(filter), cd(convDesc),
                                                m_fwdAlgo.NoWorkspaceAlgo, nullptr, 0, &C::Zero, t(outT), ptr(out));
            auto err2 = cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv,
                                                m_fwdAlgo.NoWorkspaceAlgo, nullptr, 0, &C::Zero, m_outT, ptr(out));
            // Update original error in case of success.
            if (CUDNN_STATUS_SUCCESS == err2)
                err = CUDNN_STATUS_SUCCESS;
@@ -333,128 +236,104 @@ protected:
        CUDNN_CALL(err);
    }

    void BackwardDataCore(const Tensor4D& srcGradT, const Mat& srcGrad, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
                          const Tensor4D& gradT, Mat& grad, Mat& workspace) override
    void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
    {
        size_t batchSize = srcGrad.GetNumCols();
        // Find best algo and allocate temp buffer, if needed.
        auto finder = [&](int& calgo, cudnnConvolutionBwdDataAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
        auto finder = [this](int& calgo, cudnnConvolutionBwdDataAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
        {
            return cudnnFindConvolutionBackwardDataAlgorithm(m_cudnn, f(filterT), t(srcGradT), cd(convDesc), t(gradT), MaxAlgoCount, &calgo, algoPerf);
            return cudnnFindConvolutionBackwardDataAlgorithm(*m_cudnn, *m_kernelT, m_outT, *m_conv, m_inT, MaxAlgoCount, &calgo, algoPerf);
        };
        FindBestAlgo(t(srcGradT), m_backDataAlgo, finder);
        auto staticFinder = [this](cudnnConvolutionBwdDataAlgo_t& algo) -> cudnnStatus_t
        {
            return cudnnGetConvolutionBackwardDataAlgorithm(*m_cudnn, *m_kernelT, m_outT, *m_conv, m_inT, CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE, 0, &algo);
        };
        FindBestAlgo(batchSize, m_backDataAlgo, finder, staticFinder);
        if (m_backDataAlgo.Algo.memory > 0)
            workspace.Resize((m_backDataAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
        // Compute gradients with respect to the output tensor (data).
        CUDNN_CALL(cudnnConvolutionBackwardData(m_cudnn, &C::One, f(filterT), ptr(filter), t(srcGradT), ptr(srcGrad), cd(convDesc), m_backDataAlgo.Algo.algo,
                                                ptr(workspace), m_backDataAlgo.Algo.memory, &C::One, t(gradT), ptr(grad)));
        CUDNN_CALL(cudnnConvolutionBackwardData(*m_cudnn, &C::One, *m_kernelT, ptr(kernel), m_outT, ptr(srcGrad), *m_conv, m_backDataAlgo.Algo.algo,
                                                ptr(workspace), m_backDataAlgo.Algo.memory, &C::One, m_inT, ptr(grad)));
    }

    void BackwardFilterCore(const Tensor4D& srcGradT, const Mat& srcGrad, const Tensor4D& inT, const Mat& in, const ConvDesc& convDesc,
                            const Filter& filterT, Mat& filter, bool /*allowReuse*/, Mat& workspace) override
    void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& workspace) override
    {
        size_t batchSize = in.GetNumCols();
        // Find best algo and allocate temp buffer, if needed.
        auto finder = [&](int& calgo, cudnnConvolutionBwdFilterAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
        auto finder = [this](int& calgo, cudnnConvolutionBwdFilterAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
        {
            return cudnnFindConvolutionBackwardFilterAlgorithm(m_cudnn, t(inT), t(srcGradT), cd(convDesc), f(filterT), MaxAlgoCount, &calgo, algoPerf);
            return cudnnFindConvolutionBackwardFilterAlgorithm(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, MaxAlgoCount, &calgo, algoPerf);
        };
        FindBestAlgo(t(inT), m_backFiltAlgo, finder);
        auto staticFinder = [this](cudnnConvolutionBwdFilterAlgo_t& algo) -> cudnnStatus_t
        {
            return cudnnGetConvolutionBackwardFilterAlgorithm(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE, 0, &algo);
        };
        FindBestAlgo(batchSize, m_backFiltAlgo, finder, staticFinder);
        if (m_backFiltAlgo.Algo.memory > 0)
            workspace.Resize((m_backFiltAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
        // Compute gradients with respect to the output tensor (data).
        CUDNN_CALL(cudnnConvolutionBackwardFilter(m_cudnn, &C::One, t(inT), ptr(in), t(srcGradT), ptr(srcGrad), cd(convDesc), m_backFiltAlgo.Algo.algo,
                                                  ptr(workspace), m_backFiltAlgo.Algo.memory, &C::One, f(filterT), ptr(filter)));
        CUDNN_CALL(cudnnConvolutionBackwardFilter(*m_cudnn, &C::One, m_inT, ptr(in), m_outT, ptr(srcGrad), *m_conv, m_backFiltAlgo.Algo.algo,
                                                  ptr(workspace), m_backFiltAlgo.Algo.memory, &C::One, *m_kernelT, ptr(kernelGrad)));
    }

    void EnsureCompatibleBatchNorm(bool spatial) override
    void EnsurePoolingInitialized() override
    {
        if (!IsGpu(m_deviceId))
            InvalidArgument("cuDNN engine does not support batch normalization on CPUs.");
        if (spatial && m_imageLayout != ImageLayoutKind::CHW)
            InvalidArgument("cuDNN engine batch normalization currently supports only CHW data layout for convolutional nodes.");
        if (m_pool == nullptr)
            m_pool = std::make_unique<CuDnnPool>(*m_geometry, m_poolKind);
    }

    void NormalizeBatchCore(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
                            bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out,
                            double epsilon, Mat& saveMean, Mat& saveInvStdDev) override
    void ForwardPoolingCore(const Mat& in, Mat& out) override
    {
        if (m_bnImpl == BatchNormImpl::CuDnn)
        {
            cudnnBatchNormMode_t mode = spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
            // cuDNN will fail with BAD_PARAM if epsilon < CUDNN_BN_MIN_EPSILON.
            epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
            CUDNN_CALL(cudnnBatchNormalizationForwardTraining(m_cudnn, mode, &C::One, &C::Zero, t(inT), ptr(in), t(inT), ptr(out),
                                                              t(scaleBiasT), ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev),
                                                              epsilon, ptr(saveMean), ptr(saveInvStdDev)));
        }
        else if (m_bnImpl == BatchNormImpl::Cntk)
        {
            epsilon = std::max(epsilon, 1e-9);
            CUDA_CALL(BatchNormalizationForwardTraining(inT, spatial, ptr(in), ptr(out), ptr(scale), ptr(bias),
                                                        expAvgFactor, ptr(runMean), ptr(runInvStdDev),
                                                        epsilon, ptr(saveMean), ptr(saveInvStdDev), m_stream));
        }
        else
            RuntimeError("Provided batch norm implementation (%d) is not supported.", m_bnImpl);
        size_t batchSize = in.GetNumCols();
        m_inT.UpdateBatchSize(batchSize);
        m_outT.UpdateBatchSize(batchSize);
        CUDNN_CALL(cudnnPoolingForward(*m_cudnn, *(m_pool), &C::One, m_inT, ptr(in), &C::Zero, m_outT, ptr(out)));
    }

    void NormalizeBatchInferenceCore(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
                                     bool spatial, const Mat& runMean, const Mat& runInvStdDev, Mat& out) override
    void BackwardPoolingCore(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad) override
    {
        if (m_bnImpl == BatchNormImpl::CuDnn)
        {
            cudnnBatchNormMode_t mode = spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
            CUDNN_CALL(cudnnBatchNormalizationForwardInference(m_cudnn, mode, &C::One, &C::Zero, t(inT), ptr(in), t(inT), ptr(out),
                                                               t(scaleBiasT), ptr(scale), ptr(bias), ptr(runMean), ptr(runInvStdDev), CUDNN_BN_MIN_EPSILON));
        }
        else if (m_bnImpl == BatchNormImpl::Cntk)
        {
            CUDA_CALL(BatchNormalizationForwardInference(inT, spatial, ptr(in), ptr(out), ptr(scale), ptr(bias),
                                                         ptr(runMean), ptr(runInvStdDev), m_stream));
        }
        else
            RuntimeError("Provided batch norm implementation (%d) is not supported.", m_bnImpl);
    }

    void BackwardNormalizeBatchCore(const Tensor4D& inT, const Mat& in, const Mat& srcGrad, Mat& grad,
                                    const Tensor4D& scaleBiasT, const Mat& scale, bool spatial, const Mat& saveMean, const Mat& saveInvStdDev,
                                    Mat& scaleGrad, Mat& biasGrad) override
    {
        if (m_bnImpl == BatchNormImpl::CuDnn)
        {
            cudnnBatchNormMode_t mode = spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
            // REVIEW alexeyk: remove once Philly is upgraded to prod version.
#if CUDNN_PATCHLEVEL >= 7
            CUDNN_CALL(cudnnBatchNormalizationBackward(m_cudnn, mode, &C::One, &C::One, &C::One, &C::One, t(inT), ptr(in), t(inT), ptr(srcGrad), t(inT), ptr(grad),
                                                       t(scaleBiasT), ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
#else
            CUDNN_CALL(cudnnBatchNormalizationBackward(m_cudnn, mode, &C::One, &C::One, t(inT), ptr(in), t(inT), ptr(srcGrad), t(inT), ptr(grad),
                                                       t(scaleBiasT), ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
#endif

        }
        else if (m_bnImpl == BatchNormImpl::Cntk)
        {
            CUDA_CALL(BatchNormalizationBackward(inT, spatial, ptr(in), ptr(srcGrad), ptr(grad), ptr(scale), ptr(scaleGrad), ptr(biasGrad),
                                                 ptr(saveMean), ptr(saveInvStdDev), m_stream));
        }
        else
            RuntimeError("Provided batch norm implementation (%d) is not supported.", m_bnImpl);
        size_t batchSize = in.GetNumCols();
        m_inT.UpdateBatchSize(batchSize);
        m_outT.UpdateBatchSize(batchSize);
        CUDNN_CALL(cudnnPoolingBackward(*m_cudnn, *(m_pool), &C::One, m_outT, ptr(out), m_outT, ptr(srcGrad),
                                        m_inT, ptr(in), &C::One, m_inT, ptr(grad)));
    }

private:
    using C = Consts<ElemType>;

    static const int MaxAlgoCount = 10;

    template <typename TAlgo, typename TFinder>
    void FindBestAlgo(const CuDnnTensor4D& t, TAlgo& algo, TFinder finder)
    template <typename TAlgo, typename TFinder, typename TStaticFinder>
    void FindBestAlgo(size_t batchSize, TAlgo& algo, TFinder finder, TStaticFinder staticFinder)
    {
        if (!algo.NeedAutotuning(t))
        if (!algo.NeedAutotuning(batchSize))
            return;
        m_inT.UpdateBatchSize(batchSize);
        m_outT.UpdateBatchSize(batchSize);
        using CuDnnAlgoT = decltype(TAlgo::Algo);
        CuDnnAlgoT algoPerf[MaxAlgoCount];
        int calgo = 0;
        CUDNN_CALL(finder(calgo, algoPerf));
        cudnnStatus_t err = finder(calgo, algoPerf);
        // Alloc failed - usually means the cuDNN runtime auto-tuner could not allocate workspace.
        // In such a case, use the static auto-tuner with no workspace.
        if (err == CUDNN_STATUS_ALLOC_FAILED)
        {
            decltype(CuDnnAlgoT::algo) noMemAlgo;
            CUDNN_CALL(staticFinder(noMemAlgo));
            algo.CurMBSize = batchSize;
            algo.Algo = algoPerf[0];
            algo.Algo.algo = noMemAlgo;
            algo.Algo.memory = 0;
            algo.Algo.status = CUDNN_STATUS_SUCCESS;
            algo.NoWorkspaceAlgo = noMemAlgo;
            return;
        }
        CUDNN_CALL(err);
        assert(calgo > 0);
        size_t maxMem = m_maxTempMemSizeInSamples == 0 ? (std::numeric_limits<size_t>::max)() : t.w() * t.h() * t.c() * m_maxTempMemSizeInSamples * sizeof(ElemType);
        size_t inputSampleSize = m_geometry->InputShape().GetNumElements();
        size_t maxMem = m_maxTempMemSizeInSamples == 0 ? (std::numeric_limits<size_t>::max)() : inputSampleSize * m_maxTempMemSizeInSamples * sizeof(ElemType);
        // Find best (fastest) algorithm which satisfies workspace requirements.
        auto res = std::find_if(algoPerf, algoPerf + calgo,
                                [=](const CuDnnAlgoT& cur)
                                {
@@ -462,8 +341,9 @@ private:
                                });
        if (res == algoPerf + calgo)
            RuntimeError("cuDNN could not find suitable algorithm for the current convolution configuration.");
        algo.CurMBSize = t.n();
        algo.CurMBSize = batchSize;
        algo.Algo = *res;
        // Find fastest algorithm that does NOT require workspace. It is used as a fallback algo in Forward function.
        res = std::find_if(algoPerf, algoPerf + calgo,
                           [](const CuDnnAlgoT& cur)
                           {
@@ -478,6 +358,15 @@ private:
        algo.NoWorkspaceAlgo = (*res).algo;
    }

    static ElemType* ptr(Mat& src)
    {
        return src.BufferPointer();
    }
    static const ElemType* ptr(const Mat& src)
    {
        return src.BufferPointer();
    }

private:
    template <typename T>
    struct ConvAlgoInfo
@@ -495,7 +384,7 @@ private:
        T Algo;
        CuDnnAlgoT NoWorkspaceAlgo;

        bool NeedAutotuning(const CuDnnTensor4D& t)
        bool NeedAutotuning(size_t batchSize)
        {
            // Need to re-run the auto-tuner in case the minibatch size is increased.
            // If the minibatch size is decreased, we assume that the previously selected algorithm requires less or the same amount of workspace.
|
@ -504,186 +393,57 @@ private:
|
|||
// We also need to reset auto-tuning status at the beginning of each epoch but ComputationNode currently does not provide such notification.
|
||||
// We assume no other dimensions of tensors can change so we don't check it.
|
||||
// REVIEW alexeyk: review once we get response from NVIDIA.
|
||||
return (Algo.status != CUDNN_STATUS_SUCCESS || t.n() > CurMBSize);
|
||||
return (Algo.status != CUDNN_STATUS_SUCCESS || batchSize > CurMBSize);
|
||||
}
|
||||
};
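
The predicate is monotone in the minibatch size: a cached result is reused for any minibatch no larger than the one the tuner last saw, and growth forces a re-tune because the chosen algorithm's workspace may no longer suffice. A self-contained model of just that predicate (AutotuneState is an illustrative stand-in, not the CNTK type):

    #include <cstddef>

    struct AutotuneState
    {
        bool tuned = false;   // models Algo.status == CUDNN_STATUS_SUCCESS
        size_t curMBSize = 0; // models CurMBSize
        bool NeedAutotuning(size_t batchSize) const { return !tuned || batchSize > curMBSize; }
    };

    int main()
    {
        AutotuneState s;
        bool first = s.NeedAutotuning(64);   // true: never tuned
        s.tuned = true;
        s.curMBSize = 64;
        bool smaller = s.NeedAutotuning(32); // false: cached algorithm is reused
        bool larger = s.NeedAutotuning(128); // true: larger minibatch may need more workspace
        return (first && !smaller && larger) ? 0 : 1;
    }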
    using C = Consts<ElemType>;
    CuDnn::ptr_t m_cudnn;
    cudnnDataType_t m_dataType;
    CuDnnTensor m_inT;
    CuDnnTensor m_outT;
    // Convolution specific.
    std::unique_ptr<CuDnnKernel> m_kernelT;
    std::unique_ptr<CuDnnConv> m_conv;
    // Pooling specific.
    std::unique_ptr<CuDnnPool> m_pool;

    // REVIEW alexeyk: currently limit is set once in ctor though in CNTK it can be, theoretically, changed in runtime.
    size_t m_maxTempMemSizeInSamples;
    BatchNormImpl m_bnImpl;
    cudnnHandle_t m_cudnn;
    cudaStream_t m_stream;
    ConvAlgoInfo<cudnnConvolutionFwdAlgoPerf_t> m_fwdAlgo;
    ConvAlgoInfo<cudnnConvolutionBwdDataAlgoPerf_t> m_backDataAlgo;
    ConvAlgoInfo<cudnnConvolutionBwdFilterAlgoPerf_t> m_backFiltAlgo;
};

template <class ElemType>
class CuDnnPoolingEngine : public PoolingEngine<ElemType>
std::unique_ptr<ConvolutionEngine<ElemType>> CuDnnConvolutionEngineFactory<ElemType>::Create(ConvolveGeometryPtr geometry,
                                                                                             DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout,
                                                                                             size_t maxTempMemSizeInSamples, PoolKind poolKind)
{
public:
    using Base = PoolingEngine<ElemType>;
    using typename Base::Tensor4D;
    using typename Base::PoolDesc;
    using typename Base::Mat;

public:
    CuDnnPoolingEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout)
        : Base(deviceId, imageLayout), m_cudnn(nullptr)
    {
        CUDNN_CALL(cudnnCreate(&m_cudnn));
        CUDNN_CALL(cudnnSetStream(m_cudnn, GetStream()));
    }

    ~CuDnnPoolingEngine()
    {
        if (m_cudnn != nullptr)
        {
            // TODO: Check for error code and throw if !std::uncaught_exception()
            cudnnDestroy(m_cudnn);
            m_cudnn = nullptr;
        }
    }

protected:
    using Base::m_deviceId;
    using Base::m_imageLayout;

    void EnsureCompatible() override
    {
        if (m_imageLayout != ImageLayoutKind::CHW)
            RuntimeError("cuDNN pooling engine supports only CHW/cudnn layout.");
        if (!IsGpu(m_deviceId))
            RuntimeError("cuDNN pooling engine supports GPU devices only.");
    }

    void ForwardCore(const Tensor4D& inT, const Mat& in, const PoolDesc& poolDesc, const Tensor4D& outT, Mat& out) override
    {
        CUDNN_CALL(cudnnPoolingForward(m_cudnn, p(poolDesc), &C::One, t(inT), ptr(in), &C::Zero, t(outT), ptr(out)));
    }

    void BackwardCore(const Tensor4D& outT, const Mat& out, const Mat& srcGrad, const PoolDesc& poolDesc, const Tensor4D& inT, const Mat& in, Mat& grad) override
    {
        CUDNN_CALL(cudnnPoolingBackward(m_cudnn, p(poolDesc), &C::One, t(outT), ptr(out), t(outT), ptr(srcGrad),
                                        t(inT), ptr(in), &C::One, t(inT), ptr(grad)));
    }

private:
    using C = Consts<ElemType>;

    cudnnHandle_t m_cudnn;
};

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::Tensor4DPtr CuDnnConvolutionEngineFactory<ElemType>::CreateTensor(size_t w, size_t h, size_t c, size_t n)
{
    // REVIEW alexeyk: assert fires in GCC but not in VC++.
    // static_assert(false, "cuDNN engine currently supports only single and double precision tensors.");
    RuntimeError("Not implemented.");
}
template <>
typename CuDnnConvolutionEngineFactory<float>::Tensor4DPtr CuDnnConvolutionEngineFactory<float>::CreateTensor(size_t w, size_t h, size_t c, size_t n)
{
    return std::make_unique<CuDnnTensor4D>(w, h, c, n, CUDNN_DATA_FLOAT);
}
template <>
typename CuDnnConvolutionEngineFactory<double>::Tensor4DPtr CuDnnConvolutionEngineFactory<double>::CreateTensor(size_t w, size_t h, size_t c, size_t n)
{
    return std::make_unique<CuDnnTensor4D>(w, h, c, n, CUDNN_DATA_DOUBLE);
    return std::make_unique<CuDnnConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::FilterPtr CuDnnConvolutionEngineFactory<ElemType>::CreateFilter(size_t w, size_t h, size_t c, size_t k)
bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE deviceId, ConvolveGeometryPtr geometry, PoolKind poolKind)
{
    // REVIEW alexeyk: assert fires in GCC but not in VC++.
    // static_assert(false, "cuDNN engine currently supports only single and double precision filters.");
    RuntimeError("Not implemented.");
}
template <>
typename CuDnnConvolutionEngineFactory<float>::FilterPtr CuDnnConvolutionEngineFactory<float>::CreateFilter(size_t w, size_t h, size_t c, size_t k)
{
    return std::make_unique<CuDnnFilter>(w, h, c, k, CUDNN_DATA_FLOAT);
}
template <>
typename CuDnnConvolutionEngineFactory<double>::FilterPtr CuDnnConvolutionEngineFactory<double>::CreateFilter(size_t w, size_t h, size_t c, size_t k)
{
    return std::make_unique<CuDnnFilter>(w, h, c, k, CUDNN_DATA_DOUBLE);
}
// REVIEW alexeyk: IsSupported check should be performed by cuDNN itself. Is there a good way to do that?

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvDescriptor(
    const Tensor4D& /*inT*/, const Filter& filterT, size_t wStride, size_t hStride, bool padding)
{
    size_t wPad = padding ? filterT.w() / 2 : 0;
    size_t hPad = padding ? filterT.h() / 2 : 0;
    return std::make_unique<CuDnnConvolutionDescriptor>(wStride, hStride, wPad, hPad);
    cudaDeviceProp props = {0};
    if (cudaGetDeviceProperties(&props, deviceId) != cudaSuccess || props.major < 3)
        return false;

    const auto& input = geometry->InputShape();
    const auto& kernel = geometry->KernelShape();
    const auto& sharing = geometry->Sharing();
    const auto& mapCount = geometry->MapCount();
    // cuDNN supports 2D and 3D convolutions at the moment with full sharing.
    // In case map count size > 1, then it should have all ones except last dimension.
    // If pooling is requested, then cuDNN supports only 2D/3D inputs and 2D pooling kernels.
    return (input.GetRank() <= 4 &&
            std::find(begin(sharing), end(sharing), false) == sharing.end() &&
            mapCount.GetNumElements() == mapCount[mapCount.GetRank() - 1] &&
            (poolKind == PoolKind::None ||
             input.GetRank() <= 3 && (kernel.GetRank() < 3 || kernel[2] == 1)));
}
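
Restated, the engine is offered only on GPUs of compute capability 3.0 or higher, for inputs of rank at most 4, with full weight sharing, a map count equal to 1 in every dimension but the last, and, when pooling is fused in, only 2D pooling over 2D/3D inputs. A sketch of the same predicate over plain vectors (TensorShape is reduced to std::vector<size_t> here; this is not the CNTK API):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Assumes non-empty shape vectors, e.g. input = {W, H, C}, kernel = {kW, kH, C}.
    bool IsCuDnnFriendly(const std::vector<size_t>& input, const std::vector<size_t>& kernel,
                         const std::vector<bool>& sharing, const std::vector<size_t>& mapCount,
                         bool pooling)
    {
        size_t mapTotal = 1;
        for (size_t m : mapCount)
            mapTotal *= m;
        return input.size() <= 4 &&
               std::find(sharing.begin(), sharing.end(), false) == sharing.end() &&
               mapTotal == mapCount.back() && // all map dimensions are 1 except the last
               (!pooling || (input.size() <= 3 && (kernel.size() < 3 || kernel[2] == 1)));
    }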
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolDescriptor(
    typename PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad)
{
    return std::make_unique<CuDnnPoolingDescriptor>(kind, w, h, wStride, hStride, wPad, hPad);
}

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvEngine(
    DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl)
{
    return std::make_unique<CuDnnConvolutionEngine<ElemType>>(deviceId, imageLayout, maxTempMemSizeInSamples, bnImpl);
}

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolEngine(
    DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout)
{
    return std::make_unique<CuDnnPoolingEngine<ElemType>>(deviceId, imageLayout);
}

#else

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::Tensor4DPtr CuDnnConvolutionEngineFactory<ElemType>::CreateTensor(size_t, size_t, size_t, size_t)
{
    RuntimeError("The code is compiled without USE_CUDNN macro.");
}

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::FilterPtr CuDnnConvolutionEngineFactory<ElemType>::CreateFilter(size_t, size_t, size_t, size_t)
{
    RuntimeError("The code is compiled without USE_CUDNN macro.");
}

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvDescriptor(
    const Tensor4D&, const Filter&, size_t, size_t, bool)
{
    RuntimeError("The code is compiled without USE_CUDNN macro.");
}

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolDescriptor(
    typename PoolDesc::PoolKind, size_t, size_t, size_t, size_t, size_t, size_t)
{
    RuntimeError("The code is compiled without USE_CUDNN macro.");
}

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvEngine(DEVICEID_TYPE, ImageLayoutKind, size_t, BatchNormImpl)
{
    RuntimeError("The code is compiled without USE_CUDNN macro.");
}

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolEngine(DEVICEID_TYPE, ImageLayoutKind)
{
    RuntimeError("The code is compiled without USE_CUDNN macro.");
}

#endif

template class CuDnnConvolutionEngineFactory<float>;
template class CuDnnConvolutionEngineFactory<double>;

} } }
(The diff for this file is not shown because it is too large.)
@@ -1,61 +0,0 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#pragma once

#include "ConvolutionEngine.h"

namespace Microsoft { namespace MSR { namespace CNTK {

template <class ElemType>
class CuDnnConvolutionEngineFactory : public ConvolutionEngineFactory<ElemType>
{
public:
    using Base = ConvolutionEngineFactory<ElemType>;
    using typename Base::Tensor4D;
    using typename Base::Tensor4DPtr;
    using typename Base::Filter;
    using typename Base::FilterPtr;
    using typename Base::ConvDesc;
    using typename Base::ConvDescPtr;
    using typename Base::PoolDesc;
    using typename Base::PoolDescPtr;

    using typename Base::ConvEnginePtr;
    using typename Base::PoolEnginePtr;

public:
    Tensor4DPtr CreateTensor(size_t w, size_t h, size_t c, size_t n) override;
    FilterPtr CreateFilter(size_t w, size_t h, size_t c, size_t k) override;
    ConvDescPtr CreateConvDescriptor(const Tensor4D& inT, const Filter& filterT,
                                     size_t wStride, size_t hStride, bool padding) override;
    PoolDescPtr CreatePoolDescriptor(typename PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) override;

    ConvEnginePtr CreateConvEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl) override;
    PoolEnginePtr CreatePoolEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout) override;

    static bool IsSupported(DEVICEID_TYPE deviceId);
};

// REVIEW alexeyk: wrong place. It is currently used only in unit tests but I can't add it there because of the build issues.
// Timer that can be used to measure CUDA calls.
// Uses CUDA event and will synchronize(!) the stream when Stop is called.
class MATH_API CudaTimer
{
public:
    CudaTimer(): m_start(nullptr), m_stop(nullptr)
    {
    }
    ~CudaTimer();
    void Start();
    void Stop();
    float Elapsed();

    DISABLE_COPY_AND_MOVE(CudaTimer);
private:
    void* m_start;
    void* m_stop;
};
} } }

@@ -0,0 +1,51 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#pragma once

#include "ConvolutionEngine.h"
#include "BatchNormalizationEngine.h"

namespace Microsoft { namespace MSR { namespace CNTK {

template <class ElemType>
class CuDnnConvolutionEngineFactory
{
public:
    static std::unique_ptr<ConvolutionEngine<ElemType>> Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
                                                               ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
                                                               PoolKind poolKind);
    static bool IsSupported(DEVICEID_TYPE deviceId, ConvolveGeometryPtr geometry, PoolKind poolKind);
};

template <class ElemType>
class CuDnnBatchNormEngineFactory
{
public:
    static std::unique_ptr<BatchNormEngine<ElemType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
                                                             bool spatial, ImageLayoutKind imageLayout);
};

// REVIEW alexeyk: wrong place? It is currently used only in unit tests but I can't add it there because of the build issues.
// Timer that can be used to measure CUDA calls.
// Uses CUDA event and will synchronize(!) the stream when Stop is called.
class MATH_API CudaTimer
{
public:
    CudaTimer(): m_start(nullptr), m_stop(nullptr)
    {
    }
    ~CudaTimer();
    void Start();
    void Stop();
    float Elapsed();

    DISABLE_COPY_AND_MOVE(CudaTimer);
private:
    void* m_start;
    void* m_stop;
};

} } }
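
Typical use of the CudaTimer declared above would look like the following (launchWork is a hypothetical stand-in for the GPU calls being measured; Elapsed() presumably reports milliseconds, since CUDA events do):

    CudaTimer timer;
    timer.Start();
    launchWork();               // enqueue the GPU work to be measured
    timer.Stop();               // note: synchronizes(!) the stream
    float ms = timer.Elapsed(); // elapsed GPU time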
@@ -24,6 +24,8 @@
#include "cublas_v2.h"
#include <assert.h>
#include <memory>
#include "CntkBatchNormalization.cuh"
#include "Convolution.cuh"

#pragma comment(lib, "cudart.lib") // instruct linker to reference these libs
#pragma comment(lib, "cublas.lib")

@@ -145,7 +147,7 @@ AllocatedElemType* TracingGPUMemoryAllocator::Allocate(int deviceId, size_t numE
}

AllocatedElemType* deviceBufferPtr = AllocateNoTrace<AllocatedElemType>(deviceId, numElements);

if (IsTraceEnabled())
{
    fprintf(stderr, "Allocated DeviceBufferPointer = %p\n", (void*)deviceBufferPtr);

@@ -3001,6 +3003,178 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddAveragePoolingGradient(const GPUMat

#pragma endregion Other helper functions

template <class ElemType>
void GPUMatrix<ElemType>::ConvolutionForward(const GPUMatrix<ElemType>& kernel, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
                                             const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& output) const
{
    const int BlockSize = 128;
    auto gdim = dim3((output.GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
    PrepareDevice();
    SyncGuard syncGuard;
    kConvolutionForward<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), kernel.m_pArray, mpRowCol.m_pArray, mpRowIwht.m_pArray, mpRowRun.m_pArray,
                                                          runs.m_pArray, m_pArray, (int)GetNumRows(), output.m_pArray, (int)output.GetNumRows());
}
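
The launch configuration is the standard ceil-division idiom: grid.x covers the output rows in blocks of 128 threads, and grid.y assigns one block row per minibatch column, clamped to CUDA's 65535 limit on that dimension (beyond which the kernel presumably strides over the remaining columns). A quick check of the arithmetic with hypothetical sizes:

    #include <algorithm>

    int main()
    {
        const int BlockSize = 128;
        int rows = 1000;                             // output rows
        int cols = 70000;                            // minibatch columns
        int gx = (rows + BlockSize - 1) / BlockSize; // = 8 blocks -> 1024 threads cover 1000 rows
        int gy = std::min(cols, 65535);              // grid.y is clamped to the hardware limit
        return (gx == 8 && gy == 65535) ? 0 : 1;
    }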
template <class ElemType>
void GPUMatrix<ElemType>::ConvolutionBackwardData(const GPUMatrix<ElemType>& kernel, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
                                                  const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& grad) const
{
    const int BlockSize = 128;
    auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
    PrepareDevice();
    SyncGuard syncGuard;
    kConvolutionBackwardData<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), kernel.m_pArray, mpRowCol.m_pArray, mpRowIwht.m_pArray, mpRowRun.m_pArray,
                                                               runs.m_pArray, m_pArray, (int)GetNumRows(), grad.m_pArray, (int)grad.GetNumRows());
}

template <class ElemType>
void GPUMatrix<ElemType>::ConvolutionBackwardKernel(const GPUMatrix<ElemType>& in, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
                                                    const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& kernelGrad) const
{
    const int BlockSize = 128;
    auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
    PrepareDevice();
    SyncGuard syncGuard;
    kConvolutionBackwardKernel<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), (int)in.GetNumRows(), (int)GetNumRows(),
                                                                 in.m_pArray, mpRowCol.m_pArray, mpRowIwht.m_pArray, mpRowRun.m_pArray,
                                                                 runs.m_pArray, m_pArray, kernelGrad.m_pArray);
}

template <class ElemType>
void GPUMatrix<ElemType>::MaxPoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const
{
    const int BlockSize = 128;
    auto gdim = dim3((output.GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
    PrepareDevice();
    SyncGuard syncGuard;
    kMaxPoolingForward<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), mpRowCol.m_pArray, mpRowIndices.m_pArray, indices.m_pArray,
                                                         m_pArray, (int)GetNumRows(), output.m_pArray, (int)output.GetNumRows());
}

template <class ElemType>
void GPUMatrix<ElemType>::MaxPoolingBackward(const GPUMatrix<ElemType>& out, const GPUMatrix<ElemType>& in,
                                             const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices,
                                             GPUMatrix<ElemType>& grad) const
{
    const int BlockSize = 128;
    auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
    PrepareDevice();
    SyncGuard syncGuard;
    kMaxPoolingBackward<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), out.m_pArray, in.m_pArray,
                                                          mpRowCol.m_pArray, mpRowIndices.m_pArray, indices.m_pArray,
                                                          m_pArray, (int)GetNumRows(), grad.m_pArray, (int)grad.GetNumRows());
}

template <class ElemType>
void GPUMatrix<ElemType>::AveragePoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const
{
    const int BlockSize = 128;
    auto gdim = dim3((output.GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
    PrepareDevice();
    SyncGuard syncGuard;
    kAveragePoolingForward<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), mpRowCol.m_pArray, mpRowIndices.m_pArray, indices.m_pArray,
                                                             m_pArray, (int)GetNumRows(), output.m_pArray, (int)output.GetNumRows());
}

template <class ElemType>
void GPUMatrix<ElemType>::AveragePoolingBackward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& grad) const
{
    const int BlockSize = 128;
    auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
    PrepareDevice();
    SyncGuard syncGuard;
    kAveragePoolingBackward<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), mpRowCol.m_pArray, mpRowIndices.m_pArray, indices.m_pArray,
                                                              m_pArray, (int)GetNumRows(), grad.m_pArray, (int)grad.GetNumRows());
}

template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
                                                    GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runInvStdDev, GPUMatrix<ElemType>& out, double epsilon,
                                                    GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const
{
    assert((GetNumRows() % scale.GetNumRows()) == 0);

    bool spatial = GetNumRows() != scale.GetNumRows();
    size_t vectorSize = GetNumRows();
    size_t spatialSize = spatial ? (GetNumRows() / scale.GetNumRows()) : 1;
    size_t batchSize = GetNumCols();

    assert(0 < vectorSize && vectorSize <= std::numeric_limits<int>::max());
    assert(0 < batchSize && batchSize <= std::numeric_limits<int>::max());

    SyncGuard syncGuard;
    // If expAvgFactor == 0 && blendFactor == 1 then we don't need to compute current minibatch statistics.
    if (expAvgFactor > 0 || blendFactor < 1)
    {
        if (spatial)
        {
            Call<ComputeSpatialBatchMeanAndInvStdDev, ElemType>(spatialSize, vectorSize, spatialSize, batchSize, m_pArray,
                                                                expAvgFactor, runMean.m_pArray, runInvStdDev.m_pArray, epsilon,
                                                                saveMean.m_pArray, saveInvStdDev.m_pArray, GetStream());
        }
        else
        {
            Call<ComputeBatchMeanAndInvStdDev, ElemType>(vectorSize, vectorSize, batchSize, m_pArray,
                                                         expAvgFactor, runMean.m_pArray, runInvStdDev.m_pArray, epsilon,
                                                         saveMean.m_pArray, saveInvStdDev.m_pArray, GetStream());
        }
    }
    // When:
    // blendFactor == 1 - use running mean/var instead of the current minibatch mean/var.
    // 0 < blendFactor < 1 - blend running mean/var with mean/var of the current minibatch: saveMean = (1 - blendFactor) * saveMean + blendFactor * runMean
    // blendFactor == 0 - use mean/var of the current minibatch.
    if (blendFactor < 1)
    {
        if (blendFactor > 0)
        {
            // REVIEW alexeyk: can be rolled into NormalizeBatchTraining to save bandwidth.
            Scale((ElemType)(1 - blendFactor), saveMean);
            ScaleAndAdd((ElemType)blendFactor, runMean, saveMean);
            Scale((ElemType)(1 - blendFactor), saveInvStdDev);
            ScaleAndAdd((ElemType)blendFactor, runInvStdDev, saveInvStdDev);
        }
        Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize,
                                               spatial, m_pArray, out.m_pArray, scale.m_pArray, bias.m_pArray,
                                               saveMean.m_pArray, saveInvStdDev.m_pArray, GetStream());
    }
    else
    {
        Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize,
                                               spatial, m_pArray, out.m_pArray, scale.m_pArray, bias.m_pArray,
                                               runMean.m_pArray, runInvStdDev.m_pArray, GetStream());
    }
}
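
The three blendFactor regimes described in the comments reduce to a convex combination of minibatch and running statistics. The blend step in isolation, in scalar form (values hypothetical):

    #include <cstdio>

    int main()
    {
        double blendFactor = 0.25;              // 0 < blendFactor < 1: blend the two estimates
        double saveMean = 0.10, runMean = 0.40; // minibatch mean vs. running mean
        saveMean = (1 - blendFactor) * saveMean + blendFactor * runMean;
        std::printf("%g\n", saveMean);          // 0.175; blendFactor == 0 keeps 0.10, == 1 uses 0.40
        return 0;
    }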
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale,
                                                     const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
                                                     GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const
{
    assert((GetNumRows() % scale.GetNumRows()) == 0);

    bool spatial = GetNumRows() != scale.GetNumRows();
    size_t vectorSize = GetNumRows();
    size_t spatialSize = spatial ? (GetNumRows() / scale.GetNumRows()) : 1;
    size_t batchSize = GetNumCols();

    assert(0 < vectorSize && vectorSize <= std::numeric_limits<int>::max());
    assert(0 < batchSize && batchSize <= std::numeric_limits<int>::max());

    SyncGuard syncGuard;
    if (spatial)
    {
        Call<ComputeSpatialScaleAndBiasGradients, ElemType>(spatialSize, vectorSize, spatialSize, batchSize, in.m_pArray, m_pArray, scaleGrad.m_pArray, biasGrad.m_pArray,
                                                            saveMean.m_pArray, saveInvStdDev.m_pArray, GetStream());
    }
    else
    {
        Call<ComputeScaleAndBiasGradients, ElemType>(vectorSize, vectorSize, batchSize, in.m_pArray, m_pArray, scaleGrad.m_pArray, biasGrad.m_pArray,
                                                     saveMean.m_pArray, saveInvStdDev.m_pArray, GetStream());
    }
    Call<BackpropagateBatchNormGradients, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial,
                                                    in.m_pArray, m_pArray, grad.m_pArray, scale.m_pArray, scaleGrad.m_pArray, biasGrad.m_pArray, saveMean.m_pArray, saveInvStdDev.m_pArray, GetStream());
}

#pragma region Static BLAS Functions
// float/double overloads of cublasSgemm()/cublasDgemm()
static cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* A, int lda, const float* B, int ldb, const float* beta, float* C, int ldc)

@@ -4216,6 +4390,9 @@ template void GPUMatrix<char>::SetValue(const char);
template void GPUMatrix<char>::SetValue(const size_t numRows, const size_t numCols, int deviceId, char* pArray, size_t matrixFlags);
template void GPUMatrix<char>::SetValue(GPUMatrix<char> const&);

template GPUMatrix<int>::GPUMatrix(const size_t, const size_t, int, int*, const size_t);
template GPUMatrix<int>::~GPUMatrix();

template int* TracingGPUMemoryAllocator::Allocate<int>(int, size_t);
template size_t* TracingGPUMemoryAllocator::Allocate<size_t>(int, size_t);
template long* TracingGPUMemoryAllocator::Allocate<long>(int, size_t);

@@ -45,6 +45,11 @@ typedef struct CUstream_st* cudaStream_t;
#define USE_TIME_BASED_SEED ULONG_MAX
#endif

// Max number of GPUs on a _single_ node.
#ifndef MAX_GPUS
#define MAX_GPUS 16
#endif
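
Because of the #ifndef guard, the cap can be raised from the build system (e.g. by passing -DMAX_GPUS=32 to the compiler) without editing the header; a guard of this shape behaves like (illustrative):

    #ifndef MAX_GPUS
    #define MAX_GPUS 16 // default, applies only when the build did not define MAX_GPUS
    #endif
    static_assert(MAX_GPUS >= 1, "MAX_GPUS must allow at least one GPU"); // illustrative sanity check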
// Stream management functions
void MATH_API SetStream(cudaStream_t stream);
cudaStream_t MATH_API GetStream();

@@ -100,7 +105,7 @@ class MATH_API GPUMatrix : public BaseMatrix<ElemType>
    friend class GPUMatrix;

public:
    static const int MaxGpus = 8; // support up to 8 GPUs
    static const int MaxGpus = MAX_GPUS;
    using BaseMatrix<ElemType>::m_computeDevice;
    using BaseMatrix<ElemType>::m_elemSizeAllocated;
    using BaseMatrix<ElemType>::m_format;

@@ -402,6 +407,27 @@ public:
                       const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                       const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);

    void ConvolutionForward(const GPUMatrix<ElemType>& kernel, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
                            const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& output) const;
    void ConvolutionBackwardData(const GPUMatrix<ElemType>& kernel, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
                                 const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& grad) const;
    void ConvolutionBackwardKernel(const GPUMatrix<ElemType>& in, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
                                   const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& kernelGrad) const;

    void MaxPoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const;
    void MaxPoolingBackward(const GPUMatrix<ElemType>& out, const GPUMatrix<ElemType>& in,
                            const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices,
                            GPUMatrix<ElemType>& grad) const;

    void AveragePoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const;
    void AveragePoolingBackward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& grad) const;

    void BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
                                   GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runInvStdDev, GPUMatrix<ElemType>& out, double epsilon,
                                   GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const;
    void BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
                                    GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const;

public:
    // static BLAS functions
    static void MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& a, const bool transposeA, const GPUMatrix<ElemType>& b, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c);

@@ -2644,6 +2644,10 @@ template GPUSparseMatrix<char> GPUSparseMatrix<char>::ColumnSlice(size_t startCo
template GPUMatrix<char> GPUSparseMatrix<char>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;
template GPUSparseMatrix<char>& GPUSparseMatrix<char>::operator=(GPUSparseMatrix<char>&& deepCopy);

template GPUSparseMatrix<int>::GPUSparseMatrix(DEVICEID_TYPE, const MatrixFormat);
template GPUSparseMatrix<int>::~GPUSparseMatrix();
template void GPUSparseMatrix<int>::Resize(const size_t, const size_t, const size_t, const bool, bool);

template <class ElemType>
MATH_API File& operator>>(File& stream, GPUSparseMatrix<ElemType>& us)
{

@@ -156,8 +156,10 @@
<ClInclude Include="..\Common\Include\TensorShape.h" />
<ClInclude Include="..\Common\Include\File.h" />
<ClInclude Include="..\Common\Include\fileutil.h" />
<ClInclude Include="BatchNormalizationEngine.h" />
<ClInclude Include="CommonMatrix.h" />
<ClInclude Include="ConvolutionEngine.h" />
<ClInclude Include="ConvolveGeometry.h" />
<ClInclude Include="CPUMatrix.h" />
<ClInclude Include="MatrixQuantizerImpl.h" />
<ClInclude Include="TensorOps.h" />

@@ -188,6 +190,7 @@
<ClCompile Include="..\Common\fileutil.cpp">
  <PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="BatchNormalizationEngine.cpp" />
<ClCompile Include="ConvolutionEngine.cpp" />
<ClCompile Include="CPUSparseMatrix.cpp" />
<ClCompile Include="CUDAPageLockedMemAllocator.cpp" />

@@ -212,4 +215,4 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
</Project>

@@ -44,6 +44,9 @@
<ClCompile Include="..\Common\ExceptionWithCallStack.cpp">
  <Filter>Common</Filter>
</ClCompile>
<ClCompile Include="BatchNormalizationEngine.cpp">
  <Filter>BatchNormalization</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="CommonMatrix.h" />

@@ -97,6 +100,12 @@
<ClInclude Include="MatrixQuantizerImpl.h">
  <Filter>1bitSGD</Filter>
</ClInclude>
<ClInclude Include="ConvolveGeometry.h">
  <Filter>Convolution</Filter>
</ClInclude>
<ClInclude Include="BatchNormalizationEngine.h">
  <Filter>BatchNormalization</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="GPUMatrix.h">

@@ -143,5 +152,8 @@
<Filter Include="1bitSGD">
  <UniqueIdentifier>{546cacbd-253e-485b-8c8c-8b9ee0e2f631}</UniqueIdentifier>
</Filter>
<Filter Include="BatchNormalization">
  <UniqueIdentifier>{8f982dac-298d-4e48-b060-8e6cba5ff554}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>

@@ -143,6 +143,7 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
<ItemGroup>
<ClInclude Include="..\Common\Include\File.h" />
<ClInclude Include="..\Common\Include\fileutil.h" />
<ClInclude Include="CntkBatchNormalization.cuh" />
<ClInclude Include="ColumnQuantizer.h" />
<ClInclude Include="CommonMatrix.h" />
<ClInclude Include="cudabasetypes.h" />

@@ -151,11 +152,12 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
<ClInclude Include="cudalatticeops.cu.h" />
<ClInclude Include="cudalatticeops.h" />
<ClInclude Include="cudalib.h" />
<ClInclude Include="CuDnnConvolutionEngine.cuh" />
<ClInclude Include="CuDnnConvolutionEngine.h" />
<ClInclude Include="CuDnnCommon.h" />
<ClInclude Include="CuDnnFactories.h" />
<ClInclude Include="GPUDataTransferer.h" />
<ClInclude Include="GPUTensor.h" />
<ClInclude Include="latticefunctionskernels.h" />
<ClInclude Include="Convolution.cuh" />
<ClInclude Include="TensorOps.h" />
<ClInclude Include="ValueQuantizer.h" />
<None Include="GPUWatcher.h">

@@ -170,6 +172,9 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<CudaCompile Include="CuDnnBatchNormalization.cu">
  <FileType>CppCode</FileType>
</CudaCompile>
<CudaCompile Include="GPUTensor.cu">
  <InterleaveSourceInPTX Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</InterleaveSourceInPTX>
  <Keep Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</Keep>

@@ -190,6 +195,7 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
<CudaCompile Include="CuDnnConvolutionEngine.cu">
  <FileType>CppCode</FileType>
</CudaCompile>
<ClCompile Include="CuDnnCommon.cpp" />
<ClCompile Include="GPUDataTransferer.cpp" />
<ClCompile Include="stdafx.cpp">
  <PrecompiledHeader>Create</PrecompiledHeader>

@@ -28,6 +28,9 @@
<CudaCompile Include="CuDnnConvolutionEngine.cu">
  <Filter>GPU\Convolution</Filter>
</CudaCompile>
<CudaCompile Include="CuDnnBatchNormalization.cu">
  <Filter>GPU\BatchNormalization</Filter>
</CudaCompile>
</ItemGroup>
<ItemGroup>
<ClCompile Include="cudalattice.cpp">

@@ -45,6 +48,9 @@
<ClCompile Include="..\Common\ExceptionWithCallStack.cpp">
  <Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="CuDnnCommon.cpp">
  <Filter>GPU\CuDnn</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\Common\Include\File.h">

@@ -98,8 +104,8 @@
<ClInclude Include="CommonMatrix.h">
  <Filter>from Math</Filter>
</ClInclude>
<ClInclude Include="CuDnnConvolutionEngine.h">
  <Filter>GPU\Convolution</Filter>
<ClInclude Include="CuDnnFactories.h">
  <Filter>GPU\CuDnn</Filter>
</ClInclude>
<ClInclude Include="TensorOps.h">
  <Filter>from Math</Filter>

@@ -107,7 +113,13 @@
<ClInclude Include="GPUDataTransferer.h">
  <Filter>GPU</Filter>
</ClInclude>
<ClInclude Include="CuDnnConvolutionEngine.cuh">
<ClInclude Include="CntkBatchNormalization.cuh">
  <Filter>GPU\BatchNormalization</Filter>
</ClInclude>
<ClInclude Include="CuDnnCommon.h">
  <Filter>GPU\CuDnn</Filter>
</ClInclude>
<ClInclude Include="Convolution.cuh">
  <Filter>GPU\Convolution</Filter>
</ClInclude>
</ItemGroup>

@@ -150,5 +162,11 @@
<Filter Include="GPU\Convolution">
  <UniqueIdentifier>{3155488f-128f-494e-858d-459b4cc9fab7}</UniqueIdentifier>
</Filter>
<Filter Include="GPU\BatchNormalization">
  <UniqueIdentifier>{639ff4b6-39b5-4a5b-8856-ee918eeea91e}</UniqueIdentifier>
</Filter>
<Filter Include="GPU\CuDnn">
  <UniqueIdentifier>{05351afa-de95-40c8-830a-d70eede55dc0}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>

@@ -3987,6 +3987,189 @@ Matrix<ElemType>& Matrix<ElemType>::AddAveragePoolingGradient(const Matrix<ElemT

#pragma endregion Other Helper Functions

template <class ElemType>
void Matrix<ElemType>::ConvolutionForward(const Matrix<ElemType>& kernel, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
                                          const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const
{
    assert(mpRowCol.GetNumCols() == 1);
    assert(mpRowIwht.GetNumCols() == 1);
    assert(mpRowRun.GetNumCols() == 1);
    assert(runs.GetNumCols() == 1);

    DecideAndMoveToRightDevice(*this, output);

    // REVIEW alexeyk: add sparse version.
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->ConvolutionForward(*(kernel.m_CPUMatrix), *(mpRowCol.m_CPUMatrix), *(mpRowIwht.m_CPUMatrix),
                                                            *(mpRowRun.m_CPUMatrix), *(runs.m_CPUMatrix), *(output.m_CPUMatrix)),
                            m_GPUMatrix->ConvolutionForward(*(kernel.m_GPUMatrix), *(mpRowCol.m_GPUMatrix), *(mpRowIwht.m_GPUMatrix),
                                                            *(mpRowRun.m_GPUMatrix), *(runs.m_GPUMatrix), *(output.m_GPUMatrix)),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
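
All the wrappers in this hunk follow the same shape: assert the index columns, move the operands to a common device, then let DISPATCH_MATRIX_ON_FLAG pick the dense-CPU, dense-GPU, sparse-CPU, or sparse-GPU branch, with the sparse branches left as NOT_IMPLEMENTED. A reduced model of that dispatch (a sketch, not the actual macro expansion):

    #include <stdexcept>

    enum class Location { DenseCPU, DenseGPU, SparseCPU, SparseGPU };

    template <typename CpuFn, typename GpuFn>
    void Dispatch(Location loc, CpuFn cpu, GpuFn gpu)
    {
        switch (loc)
        {
        case Location::DenseCPU: cpu(); break; // the m_CPUMatrix->... branch
        case Location::DenseGPU: gpu(); break; // the m_GPUMatrix->... branch
        default: throw std::logic_error("not implemented"); // the NOT_IMPLEMENTED branches
        }
    }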
template <class ElemType>
void Matrix<ElemType>::ConvolutionBackwardData(const Matrix<ElemType>& kernel, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
                                               const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& grad) const
{
    assert(mpRowCol.GetNumCols() == 1);
    assert(mpRowIwht.GetNumCols() == 1);
    assert(mpRowRun.GetNumCols() == 1);
    assert(runs.GetNumCols() == 1);

    DecideAndMoveToRightDevice(*this, grad);

    // REVIEW alexeyk: add sparse version.
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->ConvolutionBackwardData(*(kernel.m_CPUMatrix), *(mpRowCol.m_CPUMatrix), *(mpRowIwht.m_CPUMatrix),
                                                                 *(mpRowRun.m_CPUMatrix), *(runs.m_CPUMatrix), *(grad.m_CPUMatrix)),
                            m_GPUMatrix->ConvolutionBackwardData(*(kernel.m_GPUMatrix), *(mpRowCol.m_GPUMatrix), *(mpRowIwht.m_GPUMatrix),
                                                                 *(mpRowRun.m_GPUMatrix), *(runs.m_GPUMatrix), *(grad.m_GPUMatrix)),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}

template <class ElemType>
void Matrix<ElemType>::ConvolutionBackwardKernel(const Matrix<ElemType>& in, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
                                                 const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& kernelGrad) const
{
    assert(mpRowCol.GetNumCols() == 1);
    assert(mpRowIwht.GetNumCols() == 1);
    assert(mpRowRun.GetNumCols() == 1);
    assert(runs.GetNumCols() == 1);

    DecideAndMoveToRightDevice(*this, kernelGrad);

    // REVIEW alexeyk: add sparse version.
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->ConvolutionBackwardKernel(*(in.m_CPUMatrix), *(mpRowCol.m_CPUMatrix), *(mpRowIwht.m_CPUMatrix),
                                                                   *(mpRowRun.m_CPUMatrix), *(runs.m_CPUMatrix), *(kernelGrad.m_CPUMatrix)),
                            m_GPUMatrix->ConvolutionBackwardKernel(*(in.m_GPUMatrix), *(mpRowCol.m_GPUMatrix), *(mpRowIwht.m_GPUMatrix),
                                                                   *(mpRowRun.m_GPUMatrix), *(runs.m_GPUMatrix), *(kernelGrad.m_GPUMatrix)),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}

template <class ElemType>
void Matrix<ElemType>::MaxPoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const
{
    assert(mpRowCol.GetNumCols() == 1);
    assert(mpRowIndices.GetNumCols() == 1);
    assert(indices.GetNumCols() == 1);

    DecideAndMoveToRightDevice(*this, output);

    // REVIEW alexeyk: add sparse version.
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->MaxPoolingForward(*(mpRowCol.m_CPUMatrix), *(mpRowIndices.m_CPUMatrix), *(indices.m_CPUMatrix), *(output.m_CPUMatrix)),
                            m_GPUMatrix->MaxPoolingForward(*(mpRowCol.m_GPUMatrix), *(mpRowIndices.m_GPUMatrix), *(indices.m_GPUMatrix), *(output.m_GPUMatrix)),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}

template <class ElemType>
void Matrix<ElemType>::MaxPoolingBackward(const Matrix<ElemType>& out, const Matrix<ElemType>& in,
                                          const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices,
                                          Matrix<ElemType>& grad) const
{
    assert(mpRowCol.GetNumCols() == 1);
    assert(mpRowIndices.GetNumCols() == 1);
    assert(indices.GetNumCols() == 1);

    DecideAndMoveToRightDevice(*this, grad);

    // REVIEW alexeyk: add sparse version.
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->MaxPoolingBackward(*(out.m_CPUMatrix), *(in.m_CPUMatrix),
                                                            *(mpRowCol.m_CPUMatrix), *(mpRowIndices.m_CPUMatrix), *(indices.m_CPUMatrix),
                                                            *(grad.m_CPUMatrix)),
                            m_GPUMatrix->MaxPoolingBackward(*(out.m_GPUMatrix), *(in.m_GPUMatrix),
                                                            *(mpRowCol.m_GPUMatrix), *(mpRowIndices.m_GPUMatrix), *(indices.m_GPUMatrix),
                                                            *(grad.m_GPUMatrix)),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}

template <class ElemType>
void Matrix<ElemType>::AveragePoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const
{
    assert(mpRowCol.GetNumCols() == 1);
    assert(mpRowIndices.GetNumCols() == 1);
    assert(indices.GetNumCols() == 1);

    DecideAndMoveToRightDevice(*this, output);

    // REVIEW alexeyk: add sparse version.
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->AveragePoolingForward(*(mpRowCol.m_CPUMatrix), *(mpRowIndices.m_CPUMatrix), *(indices.m_CPUMatrix), *(output.m_CPUMatrix)),
                            m_GPUMatrix->AveragePoolingForward(*(mpRowCol.m_GPUMatrix), *(mpRowIndices.m_GPUMatrix), *(indices.m_GPUMatrix), *(output.m_GPUMatrix)),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}

template <class ElemType>
void Matrix<ElemType>::AveragePoolingBackward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& grad) const
{
    assert(mpRowCol.GetNumCols() == 1);
    assert(mpRowIndices.GetNumCols() == 1);
    assert(indices.GetNumCols() == 1);

    DecideAndMoveToRightDevice(*this, grad);

    // REVIEW alexeyk: add sparse version.
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->AveragePoolingBackward(*(mpRowCol.m_CPUMatrix), *(mpRowIndices.m_CPUMatrix), *(indices.m_CPUMatrix), *(grad.m_CPUMatrix)),
                            m_GPUMatrix->AveragePoolingBackward(*(mpRowCol.m_GPUMatrix), *(mpRowIndices.m_GPUMatrix), *(indices.m_GPUMatrix), *(grad.m_GPUMatrix)),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}

template <class ElemType>
void Matrix<ElemType>::BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, double expAvgFactor, double blendFactor,
                                                 Matrix<ElemType>& runMean, Matrix<ElemType>& runInvStdDev, Matrix<ElemType>& out, double epsilon,
                                                 Matrix<ElemType>& saveMean, Matrix<ElemType>& saveInvStdDev) const
{
    DecideAndMoveToRightDevice(*this, out);

    // REVIEW alexeyk: add sparse version.
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->BatchNormalizationForward(*(scale.m_CPUMatrix), *(bias.m_CPUMatrix), expAvgFactor, blendFactor,
                                                                   *(runMean.m_CPUMatrix), *(runInvStdDev.m_CPUMatrix),
                                                                   *(out.m_CPUMatrix), epsilon, *(saveMean.m_CPUMatrix), *(saveInvStdDev.m_CPUMatrix)),
                            m_GPUMatrix->BatchNormalizationForward(*(scale.m_GPUMatrix), *(bias.m_GPUMatrix), expAvgFactor, blendFactor,
                                                                   *(runMean.m_GPUMatrix), *(runInvStdDev.m_GPUMatrix),
                                                                   *(out.m_GPUMatrix), epsilon, *(saveMean.m_GPUMatrix), *(saveInvStdDev.m_GPUMatrix)),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}

template <class ElemType>
void Matrix<ElemType>::BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
                                                  Matrix<ElemType>& scaleGrad, Matrix<ElemType>& biasGrad) const
{
    DecideAndMoveToRightDevice(*this, grad);

    // REVIEW alexeyk: add sparse version.
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->BatchNormalizationBackward(*(in.m_CPUMatrix), *(grad.m_CPUMatrix), *(scale.m_CPUMatrix),
                                                                    *(saveMean.m_CPUMatrix), *(saveInvStdDev.m_CPUMatrix),
                                                                    *(scaleGrad.m_CPUMatrix), *(biasGrad.m_CPUMatrix)),
                            m_GPUMatrix->BatchNormalizationBackward(*(in.m_GPUMatrix), *(grad.m_GPUMatrix), *(scale.m_GPUMatrix),
                                                                    *(saveMean.m_GPUMatrix), *(saveInvStdDev.m_GPUMatrix),
                                                                    *(scaleGrad.m_GPUMatrix), *(biasGrad.m_GPUMatrix)),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}

#pragma region Static BLAS Functions

template <class ElemType>

@@ -5108,4 +5291,6 @@ template void Matrix<char>::SetValue(const Matrix<char>&, MatrixFormat);
template bool Matrix<char>::IsEmpty() const;
template void Matrix<char>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, bool growOnly);

template Matrix<int>::Matrix(const size_t, const size_t, int*, DEVICEID_TYPE, const size_t, const size_t);

}}}

@@ -453,6 +453,27 @@ public:
                       const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                       const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);

    void ConvolutionForward(const Matrix<ElemType>& kernel, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
                            const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const;
    void ConvolutionBackwardData(const Matrix<ElemType>& kernel, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
                                 const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& grad) const;
    void ConvolutionBackwardKernel(const Matrix<ElemType>& in, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
                                   const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& kernelGrad) const;

    void MaxPoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const;
    void MaxPoolingBackward(const Matrix<ElemType>& out, const Matrix<ElemType>& in,
                            const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices,
                            Matrix<ElemType>& grad) const;

    void AveragePoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const;
    void AveragePoolingBackward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& grad) const;

    void BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, double expAvgFactor, double blendFactor,
                                   Matrix<ElemType>& runMean, Matrix<ElemType>& runInvStdDev, Matrix<ElemType>& out, double epsilon,
                                   Matrix<ElemType>& saveMean, Matrix<ElemType>& saveInvStdDev) const;
    void BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
                                    Matrix<ElemType>& scaleGrad, Matrix<ElemType>& biasGrad) const;

public:
    // TODO: why are these not static? And why are they here?
    ElemType Exp10(ElemType num);

@@ -12,7 +12,7 @@
#include "GPUMatrix.h"
#include "GPUSparseMatrix.h"
#include "MatrixQuantizerGPU.h"
#include "CuDnnConvolutionEngine.h"
#include "CuDnnFactories.h"
#include "TensorShape.h"
#include "GPUDataTransferer.h"

@@ -676,6 +676,7 @@ void GPUSparseMatrix<ElemType>::CopyBuffer(OutType* outBuffer, const InType* inB
template class MATH_API GPUSparseMatrix<char>;
template class MATH_API GPUSparseMatrix<float>;
template class MATH_API GPUSparseMatrix<double>;
template class MATH_API GPUSparseMatrix<int>;

template <typename ElemType>
MATH_API File& operator>>(File& stream, GPUSparseMatrix<ElemType>& us)

@@ -1728,6 +1729,60 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddAveragePoolingGradient(const GPUMat
    return *this;
}

template <class ElemType>
void GPUMatrix<ElemType>::ConvolutionForward(const GPUMatrix<ElemType>& kernel, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
                                             const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& output) const
{
}

template <class ElemType>
void GPUMatrix<ElemType>::ConvolutionBackwardData(const GPUMatrix<ElemType>& kernel, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
                                                  const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& grad) const
{
}

template <class ElemType>
void GPUMatrix<ElemType>::ConvolutionBackwardKernel(const GPUMatrix<ElemType>& in, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
                                                    const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& kernelGrad) const
{
}

template <class ElemType>
void GPUMatrix<ElemType>::MaxPoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const
{
}

template <class ElemType>
void GPUMatrix<ElemType>::MaxPoolingBackward(const GPUMatrix<ElemType>& out, const GPUMatrix<ElemType>& in,
                                             const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices,
                                             GPUMatrix<ElemType>& grad) const
{
}

template <class ElemType>
void GPUMatrix<ElemType>::AveragePoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const
{
}

template <class ElemType>
void GPUMatrix<ElemType>::AveragePoolingBackward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& grad) const
{
}

template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
                                                    GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runInvStdDev, GPUMatrix<ElemType>& out, double epsilon,
                                                    GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const
{
}

template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale,
                                                     const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
                                                     GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const
{
}

#pragma endregion Other helper functions

#pragma region Static BLAS Functions

@@ -2096,6 +2151,7 @@ void GPUDataTransferer<ElemType>::WaitForCopyCPUToGPUAsync()
template class GPUMatrix<char>;
template class GPUMatrix<float>;
template class GPUMatrix<double>;
template class GPUMatrix<int>;
template class DeviceBoundNumber<float>;
template class DeviceBoundNumber<double>;
template MatrixQuantizerGPU<float>::~MatrixQuantizerGPU();

@@ -2113,45 +2169,14 @@ template <class ElemType>
void* GPUMatrix<ElemType>::s_curandGenerator = NULL;

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::Tensor4DPtr CuDnnConvolutionEngineFactory<ElemType>::CreateTensor(size_t, size_t, size_t, size_t)
std::unique_ptr<ConvolutionEngine<ElemType>> CuDnnConvolutionEngineFactory<ElemType>::Create(ConvolveGeometryPtr, DEVICEID_TYPE,
                                                                                             ImageLayoutKind, size_t, PoolKind)
{
    RuntimeError("The code is compiled with CPUONLY macro.");
}

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::FilterPtr CuDnnConvolutionEngineFactory<ElemType>::CreateFilter(size_t, size_t, size_t, size_t)
{
    RuntimeError("The code is compiled with CPUONLY macro.");
}

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvDescriptor(
    const Tensor4D&, const Filter&, size_t, size_t, bool)
{
    RuntimeError("The code is compiled with CPUONLY macro.");
}

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolDescriptor(
    typename PoolDesc::PoolKind, size_t, size_t, size_t, size_t, size_t, size_t)
{
    RuntimeError("The code is compiled with CPUONLY macro.");
}

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvEngine(DEVICEID_TYPE, ImageLayoutKind, size_t, BatchNormImpl)
{
    RuntimeError("The code is compiled with CPUONLY macro.");
}

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolEngine(DEVICEID_TYPE, ImageLayoutKind)
{
    RuntimeError("The code is compiled with CPUONLY macro.");
}

template <class ElemType>
bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE)
bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE, ConvolveGeometryPtr, PoolKind)
{
    return false;
}

@@ -2159,6 +2184,16 @@ bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE)
template class CuDnnConvolutionEngineFactory<float>;
template class CuDnnConvolutionEngineFactory<double>;

template <class ElemType>
std::unique_ptr<BatchNormEngine<ElemType>> CuDnnBatchNormEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
                                                                                         bool spatial, ImageLayoutKind imageLayout)
{
    RuntimeError("The code is compiled with CPUONLY macro.");
}

template class CuDnnBatchNormEngineFactory<float>;
template class CuDnnBatchNormEngineFactory<double>;

CudaTimer::~CudaTimer()
{
}

@@ -18,29 +18,39 @@ CNTKTextFormatReader::CNTKTextFormatReader(MemoryProviderPtr provider,
    m_provider(provider)
{
    TextConfigHelper configHelper(config);

    if (configHelper.GetElementType() == ElementType::tfloat)
    {
        m_deserializer = shared_ptr<IDataDeserializer>(new TextParser<float>(configHelper));
    }
    else
    {
        m_deserializer = shared_ptr<IDataDeserializer>(new TextParser<double>(configHelper));
    }

    TransformerPtr randomizer;
    if (configHelper.ShouldRandomize())
    try
    {
        randomizer = make_shared<BlockRandomizer>(0, SIZE_MAX, m_deserializer);
        if (configHelper.GetElementType() == ElementType::tfloat)
        {
            m_deserializer = shared_ptr<IDataDeserializer>(new TextParser<float>(configHelper));
        }
        else
        {
            m_deserializer = shared_ptr<IDataDeserializer>(new TextParser<double>(configHelper));
        }

        size_t window = configHelper.GetRandomizationWindow();
        TransformerPtr randomizer;
        if (window > 0)
        {
            // Verbosity is a general config parameter, not specific to the text format reader.
            int verbosity = config(L"verbosity", 2);
            randomizer = make_shared<BlockRandomizer>(verbosity, window, m_deserializer);
        }
        else
        {
            randomizer = std::make_shared<NoRandomizer>(m_deserializer);
        }

        randomizer->Initialize(nullptr, config);

        m_transformer = randomizer;
    }
    else
    catch (const std::runtime_error& e)
    {
        randomizer = std::make_shared<NoRandomizer>(m_deserializer);
        RuntimeError("CNTKTextFormatReader: While reading '%ls': %s", configHelper.GetFilePath().c_str(), e.what());
    }

    randomizer->Initialize(nullptr, config);

    m_transformer = randomizer;
}
|
||||
|
||||
std::vector<StreamDescriptionPtr> CNTKTextFormatReader::GetStreamDescriptions()
|
||||
|
|
|
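The reworked constructor above picks the transformer from the randomization window rather than a boolean: a positive window selects BlockRandomizer, zero selects NoRandomizer, and the whole setup is wrapped in try/catch so parser failures surface together with the file name. Here is a compact sketch of just the selection rule, using simplified stand-in types rather than CNTK's transformer interfaces:

#include <cstddef>
#include <memory>

// Simplified stand-ins for the reader's transformer types (assumed names).
struct Transformer { virtual ~Transformer() = default; };
struct BlockRandomizer : Transformer
{
    BlockRandomizer(int verbosity, std::size_t window) : m_verbosity(verbosity), m_window(window) {}
    int m_verbosity;
    std::size_t m_window;
};
struct NoRandomizer : Transformer {};

// window > 0 -> randomize within a window of samples; window == 0 -> keep input order.
std::shared_ptr<Transformer> MakeRandomizer(std::size_t window, int verbosity)
{
    if (window > 0)
        return std::make_shared<BlockRandomizer>(verbosity, window);
    return std::make_shared<NoRandomizer>();
}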
@@ -90,7 +90,6 @@
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>
    <ClInclude Include="..\..\Common\Include\basetypes.h" />
    <ClInclude Include="..\..\Common\Include\DataReader.h" />
    <ClInclude Include="..\..\Common\Include\File.h" />
    <ClInclude Include="..\..\Common\Include\fileutil.h" />
@@ -27,9 +27,6 @@
  <ItemGroup>
    <ClInclude Include="stdafx.h" />
    <ClInclude Include="targetver.h" />
    <ClInclude Include="..\..\Common\Include\basetypes.h">
      <Filter>Common\Include</Filter>
    </ClInclude>
    <ClInclude Include="..\..\Common\Include\DataReader.h">
      <Filter>Common\Include</Filter>
    </ClInclude>
@@ -5,6 +5,7 @@

#include "stdafx.h"
#include "TextConfigHelper.h"
#include "DataReader.h"
#include "StringUtil.h"

using std::string;
@@ -105,19 +106,25 @@ TextConfigHelper::TextConfigHelper(const ConfigParameters& config)

    m_filepath = msra::strfun::utf16(config(L"file"));

    string rand = config(L"randomize", "auto");

    if (AreEqualIgnoreCase(rand, "auto"))
    if (config.Exists(L"randomize"))
    {
        m_randomize = true;
    }
    else if (AreEqualIgnoreCase(rand, "none"))
    {
        m_randomize = false;
    }
        wstring randomizeString = config.CanBeString(L"randomize") ? config(L"randomize") : wstring();
        if (!_wcsicmp(randomizeString.c_str(), L"none"))
        {
            m_randomizationWindow = randomizeNone;
        }
        else if (!_wcsicmp(randomizeString.c_str(), L"auto"))
        {
            m_randomizationWindow = randomizeAuto;
        }
        else
        {
            m_randomizationWindow = config(L"randomize");
        }
    }
    else
    {
        RuntimeError("'randomize' parameter must be set to 'auto' or 'none'");
        m_randomizationWindow = randomizeAuto;
    }

    m_skipSequenceIds = config(L"skipSequenceIds", false);
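The new parsing logic above folds the old boolean ShouldRandomize into a single m_randomizationWindow with three cases: an absent setting or "auto" means an automatically sized window, "none" disables randomization, and anything else is read as an explicit window size. A hedged sketch of that decision table follows; the sentinel values and the helper are assumptions for illustration, not CNTK's actual constants:

#include <cstddef>
#include <cwctype>
#include <string>

// Assumed sentinels: 0 = no randomization, SIZE_MAX = auto-sized window.
const std::size_t randomizeNone = 0;
const std::size_t randomizeAuto = static_cast<std::size_t>(-1);

// Portable case-insensitive comparison (the diff uses the MSVC _wcsicmp).
bool EqualsIgnoreCase(const std::wstring& a, const wchar_t* b)
{
    std::size_t i = 0;
    for (; i < a.size() && b[i]; ++i)
        if (std::towlower(a[i]) != std::towlower(static_cast<wint_t>(b[i])))
            return false;
    return i == a.size() && b[i] == L'\0';
}

// exists/value/numericValue mimic the three ways 'randomize' can be given.
std::size_t ParseRandomize(bool exists, const std::wstring& value, std::size_t numericValue)
{
    if (!exists)
        return randomizeAuto;            // default behavior
    if (EqualsIgnoreCase(value, L"none"))
        return randomizeNone;
    if (EqualsIgnoreCase(value, L"auto"))
        return randomizeAuto;
    return numericValue;                 // explicit window in samples
}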
@@ -25,7 +25,7 @@ public:
    // Get full path to the input file.
    const wstring& GetFilePath() const { return m_filepath; }

    bool ShouldRandomize() const { return m_randomize; }
    size_t GetRandomizationWindow() const { return m_randomizationWindow; }

    bool ShouldSkipSequenceIds() const { return m_skipSequenceIds; }
@@ -44,7 +44,7 @@ public:
private:
    std::wstring m_filepath;
    std::vector<StreamDescriptor> m_streams;
    bool m_randomize;
    size_t m_randomizationWindow;
    ElementType m_elementType;
    bool m_skipSequenceIds;
    unsigned int m_maxErrors;
@@ -32,13 +32,10 @@ HTKDataDeserializer::HTKDataDeserializer(
    m_corpus(corpus),
    m_totalNumberOfFrames(0)
{
    // Currently we only support frame mode.
    // TODO: Support of full sequences.
    bool frameMode = feature.Find("frameMode", "true");
    if (!frameMode)
    {
        LogicError("Currently the reader only supports frame mode. Please check your configuration.");
    }
    // The frame mode is currently specified once per configuration,
    // not in the configuration of a particular deserializer, but on a higher level in the configuration.
    // Because of that we are using the Find method below.
    m_frameMode = feature.Find("frameMode", "true");

    ConfigHelper config(feature);
    config.CheckFeatureType();
@@ -49,11 +46,18 @@ HTKDataDeserializer::HTKDataDeserializer(
    m_dimension = config.GetFeatureDimension();
    m_dimension = m_dimension * (1 + context.first + context.second);

    m_augmentationWindow = config.GetContextWindow();

    InitializeChunkDescriptions(config);
    InitializeStreams(featureName);
    InitializeFeatureInformation();

    m_augmentationWindow = config.GetContextWindow();

    // If not given explicitly, we need to identify the required augmentation range from the expected dimension
    // and the number of dimensions in the file.
    if (m_augmentationWindow.first == 0 && m_augmentationWindow.second == 0)
    {
        m_augmentationWindow.first = m_augmentationWindow.second = msra::dbn::augmentationextent(m_ioFeatureDimension, m_dimension);
    }
}

// Initializes chunks based on the configuration and utterance descriptions.
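The fallback above derives the context extent from the two dimensions: the model's expected dimension is assumed to be the on-disk feature dimension times an odd number of frames (the center frame plus an equal number of neighbors on each side). This is what the diff describes for msra::dbn::augmentationextent; the arithmetic in the sketch below is an assumption, and the function name is illustrative.

#include <cstddef>

// E.g., ioDimension = 39, modelDimension = 429 -> 11 frames -> extent 5 per side.
std::size_t AugmentationExtent(std::size_t ioDimension, std::size_t modelDimension)
{
    std::size_t frames = modelDimension / ioDimension; // assumed odd multiple
    return (frames - 1) / 2;                           // neighbors on each side
}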
@@ -170,7 +174,9 @@ ChunkDescriptions HTKDataDeserializer::GetChunkDescriptions()
    auto cd = make_shared<ChunkDescription>();
    cd->m_id = i;
    cd->m_numberOfSamples = m_chunks[i].GetTotalFrames();
    cd->m_numberOfSequences = m_chunks[i].GetTotalFrames();
    // In frame mode, each frame is represented as a sequence.
    // The augmentation is still done for frames in the same sequence only; please see the GetSequenceById method.
    cd->m_numberOfSequences = m_frameMode ? m_chunks[i].GetTotalFrames() : m_chunks[i].GetNumberOfUtterances();
    chunks.push_back(cd);
}
return chunks;
@@ -187,16 +193,32 @@ void HTKDataDeserializer::GetSequencesForChunk(size_t chunkId, vector<SequenceDe
{
    auto utterance = chunk.GetUtterance(i);
    size_t major = utterance->GetId();
    // Because it is a frame mode, creating sequences for each frame.
    for (size_t k = 0; k < utterance->GetNumberOfFrames(); ++k)

    if (m_frameMode)
    {
        // Because it is frame mode, create a sequence for each frame.
        for (size_t k = 0; k < utterance->GetNumberOfFrames(); ++k)
        {
            SequenceDescription f;
            f.m_chunkId = chunkId;
            f.m_key.m_major = major;
            f.m_key.m_minor = k;
            f.m_id = offsetInChunk++;
            f.m_isValid = true;
            f.m_numberOfSamples = 1;
            result.push_back(f);
        }
    }
    else
    {
        // Create a sequence description per utterance.
        SequenceDescription f;
        f.m_chunkId = chunkId;
        f.m_key.m_major = major;
        f.m_key.m_minor = k;
        f.m_key.m_minor = 0;
        f.m_id = offsetInChunk++;
        f.m_isValid = true;
        f.m_numberOfSamples = 1;
        f.m_numberOfSamples = utterance->GetNumberOfFrames();
        result.push_back(f);
    }
}
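The branch above is the heart of the change: in frame mode every frame becomes a one-sample sequence keyed by (utterance, frame), while in sequence mode a single description covers the whole utterance. A standalone sketch of that rule with an illustrative descriptor type (not CNTK's SequenceDescription):

#include <cstddef>
#include <vector>

// Illustrative stand-in for the reader's sequence descriptor.
struct SeqDesc
{
    std::size_t chunkId, major, minor, id, numberOfSamples;
};

// Frame mode: one single-sample description per frame.
// Sequence mode: one description per utterance, spanning all of its frames.
void DescribeUtterance(bool frameMode, std::size_t chunkId, std::size_t major,
                       std::size_t numFrames, std::size_t& offsetInChunk,
                       std::vector<SeqDesc>& out)
{
    if (frameMode)
    {
        for (std::size_t k = 0; k < numFrames; ++k)
            out.push_back({chunkId, major, k, offsetInChunk++, 1});
    }
    else
    {
        out.push_back({chunkId, major, 0, offsetInChunk++, numFrames});
    }
}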
@@ -204,7 +226,7 @@ void HTKDataDeserializer::GetSequencesForChunk(size_t chunkId, vector<SequenceDe

// A wrapper around a matrix that views it as a vector of column vectors.
// Does not have any memory associated.
class MatrixAsVectorOfVectors
{
public:
    MatrixAsVectorOfVectors(msra::dbn::matrixbase& m)

@@ -245,7 +267,7 @@ public:
    });
}

// Gets data for the sequnce.
// Gets data for the sequence.
virtual void GetSequence(size_t sequenceId, vector<SequenceDataPtr>& result) override
{
    m_parent->GetSequenceById(m_chunkId, sequenceId, result);
@@ -277,73 +299,117 @@ ChunkPtr HTKDataDeserializer::GetChunk(size_t chunkId)
    return chunk;
};

// This class stores sequence data for HTK,
// - for floats: a simple pointer to the chunk data
// - for doubles: an allocated array of doubles which is freed when the sequence is no longer used.
struct HTKSequenceData : DenseSequenceData
// A matrix that stores all samples of a sequence without padding (differently from ssematrix).
// The number of columns equals the number of samples in the sequence.
// The number of rows equals the size of the feature vector of a sample (= dimensions).
class FeatureMatrix
{
    msra::dbn::matrix m_buffer;

    ~HTKSequenceData()
public:
    FeatureMatrix(size_t numRows, size_t numColumns) : m_numRows(numRows), m_numColumns(numColumns)
    {
        msra::dbn::matrixstripe frame(m_buffer, 0, m_buffer.cols());

        // Checking if m_data is just a pointer into the buffer.
        if (m_data != &frame(0, 0))
        {
            delete[] reinterpret_cast<double*>(m_data);
            m_data = nullptr;
        }
        m_data.resize(m_numRows * m_numColumns);
    }

    // Returns a reference to the column.
    inline array_ref<float> col(size_t column)
    {
        return array_ref<float>(m_data.data() + m_numRows * column, m_numRows);
    }

    // Gets a pointer to the data.
    inline float* GetData()
    {
        return m_data.data();
    }

    // Gets the number of columns. It equals the number of samples in the sequence/utterance.
    inline size_t GetNumberOfColumns() const
    {
        return m_numColumns;
    }

    // Gets the total size in elements of the stored features.
    inline size_t GetTotalSize() const
    {
        return m_data.size();
    }

private:
    // Features
    std::vector<float> m_data;
    // Number of rows = dimension of the feature
    size_t m_numRows;
    // Number of columns = number of samples in the utterance.
    size_t m_numColumns;
};

typedef shared_ptr<HTKSequenceData> HTKSequenceDataPtr;
// This class stores sequence data for HTK for floats.
struct HTKFloatSequenceData : DenseSequenceData
{
    HTKFloatSequenceData(FeatureMatrix&& data) : m_buffer(data)
    {
        m_numberOfSamples = data.GetNumberOfColumns();
        m_data = m_buffer.GetData();
    }

    // Get a sequence by its chunk id and id.
private:
    FeatureMatrix m_buffer;
};

// This class stores sequence data for HTK for doubles.
struct HTKDoubleSequenceData : DenseSequenceData
{
    HTKDoubleSequenceData(FeatureMatrix& data) : m_buffer(data.GetData(), data.GetData() + data.GetTotalSize())
    {
        m_numberOfSamples = data.GetNumberOfColumns();
        m_data = m_buffer.data();
    }

private:
    std::vector<double> m_buffer;
};

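The two holder structs above differ in ownership: the float variant moves the FeatureMatrix in and exposes its storage directly, while the double variant copies the buffer and widens every element. A minimal sketch of that split with a simplified buffer type; all names here are illustrative, not the CNTK classes:

#include <utility>
#include <vector>

struct Buffer { std::vector<float> data; };

// Float path: take ownership of the original float storage, no copy.
struct FloatHolder
{
    explicit FloatHolder(Buffer&& b) : buffer(std::move(b)) {}
    Buffer buffer;
};

// Double path: allocate fresh storage and widen each float element.
struct DoubleHolder
{
    explicit DoubleHolder(const Buffer& b)
        : buffer(b.data.begin(), b.data.end()) {}
    std::vector<double> buffer;
};

The asymmetry is deliberate: floats are the native element type of the feature files, so they can be served zero-copy, whereas doubles always require a conversion pass anyway.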
// Get a sequence by its chunk id and sequence id.
// Sequence ids are guaranteed to be unique inside a chunk.
void HTKDataDeserializer::GetSequenceById(size_t chunkId, size_t id, vector<SequenceDataPtr>& r)
{
    const auto& chunkDescription = m_chunks[chunkId];
    size_t utteranceIndex = chunkDescription.GetUtteranceForChunkFrameIndex(id);
    size_t utteranceIndex = m_frameMode ? chunkDescription.GetUtteranceForChunkFrameIndex(id) : id;
    const UtteranceDescription* utterance = chunkDescription.GetUtterance(utteranceIndex);
    auto utteranceFrames = chunkDescription.GetUtteranceFrames(utteranceIndex);
    size_t frameIndex = id - utterance->GetStartFrameIndexInsideChunk();

    // A wrapper that allows m[j].size() and m[j][i] as required by augmentneighbors().
    MatrixAsVectorOfVectors utteranceFramesWrapper(utteranceFrames);
    FeatureMatrix features(m_dimension, m_frameMode ? 1 : utterance->GetNumberOfFrames());

    size_t leftExtent = m_augmentationWindow.first;
    size_t rightExtent = m_augmentationWindow.second;

    // page in the needed range of frames
    if (leftExtent == 0 && rightExtent == 0)
    if (m_frameMode)
    {
        leftExtent = rightExtent = msra::dbn::augmentationextent(utteranceFramesWrapper[0].size(), m_dimension);
    }

    HTKSequenceDataPtr result = make_shared<HTKSequenceData>();
    result->m_buffer.resize(m_dimension, 1);
    const vector<char> noBoundaryFlags; // TODO: dummy, currently no boundaries supported.
    msra::dbn::augmentneighbors(utteranceFramesWrapper, noBoundaryFlags, frameIndex, leftExtent, rightExtent, result->m_buffer, 0);

    result->m_numberOfSamples = 1;
    msra::dbn::matrixstripe stripe(result->m_buffer, 0, result->m_buffer.cols());
    if (m_elementType == ElementType::tfloat)
    {
        result->m_data = &stripe(0, 0);
        // For frame mode, augment a single frame.
        size_t frameIndex = id - utterance->GetStartFrameIndexInsideChunk();
        msra::dbn::augmentneighbors(utteranceFramesWrapper, vector<char>(), frameIndex, m_augmentationWindow.first, m_augmentationWindow.second, features, 0);
    }
    else
    {
        assert(m_elementType == ElementType::tdouble);
        const size_t dimensions = stripe.rows();
        double *doubleBuffer = new double[dimensions];
        const float *floatBuffer = &stripe(0, 0);

        for (size_t i = 0; i < dimensions; i++)
        // Augment the complete utterance.
        for (size_t frameIndex = 0; frameIndex < utterance->GetNumberOfFrames(); ++frameIndex)
        {
            doubleBuffer[i] = floatBuffer[i];
            msra::dbn::augmentneighbors(utteranceFramesWrapper, vector<char>(), frameIndex, m_augmentationWindow.first, m_augmentationWindow.second, features, frameIndex);
        }
    }

    result->m_data = doubleBuffer;
    // Copy features to the sequence depending on the type.
    DenseSequenceDataPtr result;
    if (m_elementType == ElementType::tdouble)
    {
        result = make_shared<HTKDoubleSequenceData>(features);
    }
    else if (m_elementType == ElementType::tfloat)
    {
        result = make_shared<HTKFloatSequenceData>(std::move(features));
    }
    else
    {
        LogicError("Currently, HTK Deserializer supports only double and float types.");
    }

    r.push_back(result);

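GetSequenceById now dispatches on m_frameMode: a single frame is augmented into column 0, or every frame of the utterance is augmented into its own output column. The sketch below just records the (input frame, output column) pairs that the real code hands to msra::dbn::augmentneighbors; the planning function is hypothetical:

#include <cstddef>
#include <utility>
#include <vector>

// frameMode: 'id' addresses one frame; map it back into the utterance and
// write it to column 0. Otherwise: frame f of the utterance goes to column f.
std::vector<std::pair<std::size_t, std::size_t>> PlanAugmentation(
    bool frameMode, std::size_t id, std::size_t startFrameInChunk, std::size_t numFrames)
{
    std::vector<std::pair<std::size_t, std::size_t>> plan;
    if (frameMode)
        plan.emplace_back(id - startFrameInChunk, 0);
    else
        for (std::size_t f = 0; f < numFrames; ++f)
            plan.emplace_back(f, f);
    return plan;
}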
@@ -66,6 +66,9 @@ private:
    // Total number of frames.
    size_t m_totalNumberOfFrames;

    // Flag that indicates whether single speech frames should be exposed as sequences.
    bool m_frameMode;

    // Auxiliary data for checking against the data in the feature file.
    unsigned int m_samplePeriod;
    size_t m_ioFeatureDimension;
@@ -11,6 +11,11 @@
#include "ConfigHelper.h"
#include "Bundler.h"
#include "StringUtil.h"
#include "SequencePacker.h"
#include "SampleModePacker.h"
#include "BpttPacker.h"
#include "BlockRandomizer.h"
#include "NoRandomizer.h"

namespace Microsoft { namespace MSR { namespace CNTK {
@@ -61,23 +66,58 @@ HTKMLFReader::HTKMLFReader(MemoryProviderPtr provider,
    // TODO: deserializers and transformers will be dynamically loaded
    // from external libraries based on the configuration/brain script.

    assert(readerConfig(L"frameMode", true));
    ConfigHelper config(readerConfig);
    bool frameMode = readerConfig(L"frameMode", true);
    bool truncated = readerConfig(L"truncated", false);
    if (frameMode && truncated)
    {
        LogicError("frameMode and truncated BPTT are mutually exclusive.");
    }

    if (frameMode)
    {
        m_packingMode = PackingMode::sample;
    }
    else if (truncated)
    {
        m_packingMode = PackingMode::truncated;
    }
    else
    {
        m_packingMode = PackingMode::sequence;
    }

    // nbruttsineachrecurrentiter is the old reader configuration; truncationLength is the new one.
    // If the truncation length is specified, we estimate
    // the number of parallel sequences we have to pack as max(1, (mbsize/truncationLength)).
    // If nbruttsineachrecurrentiter is specified, we assume that the truncation size is mbSize
    // and the real minibatch size is mbSize * nbruttsineachrecurrentiter[epochIndex].
    m_truncationLength = readerConfig(L"truncationLength", 0);
    m_numParallelSequencesForAllEpochs =
        readerConfig(L"nbruttsineachrecurrentiter", ConfigParameters::Array(intargvector(vector<int> { 1 })));

    ConfigHelper config(readerConfig);
    size_t window = config.GetRandomizationWindow();
    auto deserializers = CreateDeserializers(readerConfig);
    assert(deserializers.size() == 2);

    auto bundler = std::make_shared<Bundler>(readerConfig, deserializers[0], deserializers, false);

    int verbosity = readerConfig(L"verbosity", 2);
    std::wstring readMethod = config.GetRandomizer();
    if (!AreEqualIgnoreCase(readMethod, std::wstring(L"blockRandomize")))

    // TODO: this should be bool. Change when config per deserializer is allowed.
    if (AreEqualIgnoreCase(readMethod, std::wstring(L"blockRandomize")))
    {
        RuntimeError("readMethod must be 'blockRandomize'");
        m_randomizer = std::make_shared<BlockRandomizer>(verbosity, window, bundler, BlockRandomizer::DecimationMode::chunk, true /* useLegacyRandomization */);
    }
    else if (AreEqualIgnoreCase(readMethod, std::wstring(L"none")))
    {
        m_randomizer = std::make_shared<NoRandomizer>(bundler);
    }
    else
    {
        RuntimeError("readMethod must be 'blockRandomize' or 'none'.");
    }

    int verbosity = readerConfig(L"verbosity", 2);
    m_randomizer = std::make_shared<BlockRandomizer>(verbosity, window, bundler, BlockRandomizer::DecimationMode::chunk, true /* useLegacyRandomization */);
    m_randomizer->Initialize(nullptr, readerConfig);

    // Create output stream descriptions (all dense)
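The constructor above maps two booleans onto three packing modes and rejects the contradictory combination up front. A standalone restatement of that rule; PackingMode mirrors the enum the reader uses, while the function name is illustrative:

#include <stdexcept>

enum class PackingMode { sample, sequence, truncated };

PackingMode SelectPackingMode(bool frameMode, bool truncated)
{
    if (frameMode && truncated)
        throw std::logic_error("frameMode and truncated BPTT are mutually exclusive.");
    if (frameMode)
        return PackingMode::sample;     // each frame is an independent sample
    if (truncated)
        return PackingMode::truncated;  // truncated BPTT over parallel sequences
    return PackingMode::sequence;       // whole utterances as sequences
}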
@@ -107,11 +147,57 @@ void HTKMLFReader::StartEpoch(const EpochConfiguration& config)
    }

    m_randomizer->StartEpoch(config);
    m_packer = std::make_shared<SampleModePacker>(
        m_provider,
        m_randomizer,
        config.m_minibatchSizeInSamples,
        m_streams);

    // TODO: Should we unify sample and sequence mode packers into a single one?
    // TODO: Functionally they are the same; the only difference is how we handle
    // TODO: the MBLayout and what the perf hit is for iterating/copying sequences.
    // TODO: Should do more perf tests before unifying these two.

    // TODO: As the next step the packers will be moved out of the readers into the
    // TODO: core CNTK. They are format agnostic and can be used with any type of
    // TODO: deserializers.
    switch (m_packingMode)
    {
    case PackingMode::sample:
        m_packer = std::make_shared<SampleModePacker>(
            m_provider,
            m_randomizer,
            config.m_minibatchSizeInSamples,
            m_streams);
        break;
    case PackingMode::sequence:
        m_packer = std::make_shared<SequencePacker>(
            m_provider,
            m_randomizer,
            config.m_minibatchSizeInSamples,
            m_streams);
        break;
    case PackingMode::truncated:
    {
        size_t minibatchSize = config.m_minibatchSizeInSamples;
        size_t truncationLength = m_truncationLength;
        if (truncationLength == 0)
        {
            // Old config: the truncation length is specified as the minibatch size.
            // In this case the truncation size is mbSize
            // and the real minibatch size is truncation size * nbruttsineachrecurrentiter.
            fprintf(stderr, "Legacy configuration is used for truncated BPTT mode, please adapt the config to explicitly specify truncationLength.");
            truncationLength = minibatchSize;
            size_t numParallelSequences = m_numParallelSequencesForAllEpochs[config.m_epochIndex];
            minibatchSize = numParallelSequences * truncationLength;
        }

        m_packer = std::make_shared<BpttPacker>(
            m_provider,
            m_randomizer,
            minibatchSize,
            truncationLength,
            m_streams);
        break;
    }
    default:
        LogicError("Unsupported type of packer '%d'.", (int)m_packingMode);
    }
}

Minibatch HTKMLFReader::ReadMinibatch()
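For truncated BPTT the hunk above keeps a legacy fallback: when truncationLength is absent, the configured minibatch size is reinterpreted as the truncation length, and the effective minibatch becomes truncationLength times the number of parallel sequences for the epoch. A small sketch of that arithmetic; the struct and function names are illustrative:

#include <cstddef>

struct BpttSizes { std::size_t minibatchSize; std::size_t truncationLength; };

BpttSizes ResolveBpttSizes(std::size_t configuredMinibatch, std::size_t truncationLength,
                           std::size_t numParallelSequences)
{
    if (truncationLength == 0) // legacy configuration
    {
        truncationLength = configuredMinibatch;
        configuredMinibatch = numParallelSequences * truncationLength;
    }
    return {configuredMinibatch, truncationLength};
}

For example, with a configured minibatch of 20 samples and 4 parallel sequences, the legacy path yields a truncation length of 20 and an effective minibatch of 80 samples.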
(Some files were not shown because too many files changed in this diff.)