This commit is contained in:
Frank Seide 2016-04-03 00:59:10 -07:00
Parent 531f5538b6 916497bf1e
Commit 7784350f29
385 changed files with 102504 additions and 4934969 deletions

View file

@ -8333,9 +8333,9 @@ SquareError
\begin_layout Standard
\begin_inset Formula
\begin{eqnarray}
v\left(\mathbf{X},\mathbf{\mathbf{Y}}\right) & \leftarrow & \frac{1}{2}\mathrm{Tr}\left(\left(\mathbf{X}-\mathbf{Y}\right)\left(\mathbf{X}-\mathbf{Y}\right)^{T}\right)\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}}\left(\mathbf{X}-\mathbf{Y}\right)\\
\nabla_{\mathbf{\mathbf{Y}}}^{J} & \leftarrow & \nabla_{\mathbf{\mathbf{Y}}}^{J}-\mathbf{\nabla_{n}^{\mathit{J}}}\left(\mathbf{X}-\mathbf{Y}\right).
v\left(\mathbf{X},\mathbf{Y}\right) & \leftarrow & \mathrm{Tr}\left(\left(\mathbf{X}-\mathbf{Y}\right)\left(\mathbf{X}-\mathbf{Y}\right)^{T}\right)\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+2\mathbf{\nabla_{n}^{\mathit{J}}}\left(\mathbf{X}-\mathbf{Y}\right)\\
\nabla_{\mathbf{Y}}^{J} & \leftarrow & \nabla_{\mathbf{Y}}^{J}-2\mathbf{\nabla_{n}^{\mathit{J}}}\left(\mathbf{X}-\mathbf{Y}\right).
\end{eqnarray}
\end_inset
@ -8367,8 +8367,8 @@ Note that
\color none
\begin_inset Formula
\begin{eqnarray}
\frac{\partial v}{\partial\mathbf{X}} & = & \mathbf{X}-\mathbf{Y}\\
\frac{\partial v}{\partial\mathbf{Y}} & = & \mathbf{-\left(X-\mathbf{Y}\right)}.
\frac{\partial v}{\partial\mathbf{X}} & = & +2\left(\mathbf{X}-\mathbf{Y}\right)\\
\frac{\partial v}{\partial\mathbf{Y}} & = & -2\left(\mathbf{X}-\mathbf{Y}\right).
\end{eqnarray}
\end_inset
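
A short derivation sketch of where the factor 2 comes from: writing the criterion elementwise,

\begin{eqnarray*}
v\left(\mathbf{X},\mathbf{Y}\right) & = & \mathrm{Tr}\left(\left(\mathbf{X}-\mathbf{Y}\right)\left(\mathbf{X}-\mathbf{Y}\right)^{T}\right)=\sum_{ij}\left(X_{ij}-Y_{ij}\right)^{2},\\
\frac{\partial v}{\partial X_{ij}} & = & 2\left(X_{ij}-Y_{ij}\right),
\end{eqnarray*}

so $\nabla_{\mathbf{X}}v=2\left(\mathbf{X}-\mathbf{Y}\right)$ and $\nabla_{\mathbf{Y}}v=-2\left(\mathbf{X}-\mathbf{Y}\right)$, which the update rules chain with the incoming gradient $\nabla_{n}^{J}$.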

View file

@ -1,4 +1,4 @@
import urllib
import urllib.request
import gzip
import os
import struct

View file

@ -13,7 +13,6 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"
command = MNISTtrain:MNISTtest

View file

@ -25,6 +25,7 @@ DNN = [
err = ErrorPrediction(labels, ol)
# Special Nodes
errTop5 = ErrorPrediction(labels, ol, Const(1), tag="eval")
FeatureNodes = (features)
LabelNodes = (labels)
CriterionNodes = (ce)

View file

@ -13,7 +13,6 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"
command = train:test
@ -42,7 +41,7 @@ train = [
SGD = [
epochSize = 60000
minibatchSize = 32
learningRatesPerMB = 0.5
learningRatesPerMB = 0.1*5:0.3
momentumPerMB = 0*10:0.7
maxEpochs = 15
]

View file

@ -23,16 +23,17 @@ DNN=[
hStride1 = 1
vStride1 = 1
# weight[cMap1, kW1 * kH1 * inputChannels]
# ConvReLULayer is defined in Macros.ndl
conv1_act = ConvReLULayer(featScaled, cMap1, 25, kW1, kH1, hStride1, vStride1, 10, 1)
# Conv2DReLULayer is defined in Macros.ndl
conv1 = Conv2DReLULayer(featScaled, cMap1, 25, kW1, kH1, hStride1, vStride1, 10, 1)
# pool1
pool1W = 2
pool1H = 2
pool1hStride = 2
pool1vStride = 2
pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout=$imageLayout$)
# MaxPooling is a standard NDL node.
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout=$imageLayout$)
# conv2
kW2 = 5
kH2 = 5
@ -40,19 +41,20 @@ DNN=[
hStride2 = 1
vStride2 = 1
# weight[cMap2, kW2 * kH2 * cMap1]
# ConvReLULayer is defined in Macros.ndl
conv2_act = ConvReLULayer(pool1, cMap2, 400, kW2, kH2, hStride2, vStride2, 10, 1)
# ConvNDReLULayer is defined in Macros.ndl
conv2 = ConvNDReLULayer(pool1, kW2, kH2, cMap1, 400, cMap2, hStride2, vStride2, 10, 1)
# pool2
pool2W = 2
pool2H = 2
pool2hStride = 2
pool2vStride = 2
pool2 = MaxPooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout=$imageLayout$)
# MaxNDPooling is defined in Macros.ndl
pool2 = MaxNDPooling(conv2, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout=$imageLayout$)
h1Dim = 128
# DNNImageSigmoidLayer and DNNLayer are defined in Macros.ndl
h1 = DNNImageSigmoidLayer(4, 4, cMap2, h1Dim, pool2, 1)
h1 = DNNImageSigmoidLayer(7, 7, cMap2, h1Dim, pool2, 1)
ol = DNNLayer(h1Dim, labelDim, h1, 1)
ce = CrossEntropyWithSoftmax(labels, ol)

View file

@ -13,9 +13,8 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"
command = train:CreateEvalModel:test
command = train:test
precision = "float"
modelPath = "$ModelDir$/03_ConvBatchNorm"
@ -38,9 +37,11 @@ train = [
SGD = [
epochSize = 60000
minibatchSize = 32
learningRatesPerMB = 0.5
momentumPerMB = 0*10:0.7
learningRatesPerMB = 0.5:0.1
momentumPerMB = 0.9
maxEpochs = 2
#batchNormalizationTimeConstant=0 # Set through NDL
batchNormalizationBlendTimeConstant=0:1#INF
]
reader = [
@ -63,17 +64,6 @@ train = [
]
]
#######################################
# Edit model #
#######################################
CreateEvalModel=[
action=edit
CurModel=$ModelDir$/03_ConvBatchNorm
NewModel=$ModelDir$/03_ConvBatchNorm.Eval
editPath=$ConfigDir$/03_ConvBatchNorm.mel
]
#######################################
# TEST CONFIG #
#######################################
@ -82,7 +72,7 @@ test = [
action = "test"
minibatchSize = 32
modelPath=$ModelDir$/03_ConvBatchNorm.Eval
modelPath=$ModelDir$/03_ConvBatchNorm
NDLNetworkBuilder = [
networkDescription = "$ConfigDir$/03_ConvBatchNorm.ndl"

View file

@ -1,6 +0,0 @@
m=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m)
SetPropertyForSubTree(CE, batchNormEvalMode, true)
SaveModel(m, $NewModel$, format=cntk)

View file

@ -15,7 +15,7 @@ ndlMnistMacros = [
labels = InputValue(labelDim)
scValue = 1
# Batch normalization time constant.
# Batch normalization time constant (normalizationTimeConstant). blendTimeConstant is set through the .cntk file.
bnTimeConst = 1024
convWScale = 10

View file

@ -1,28 +1,28 @@
DNNSigmoidLayer(inDim, outDim, x, parmScale) = [
W = LearnableParameter(outDim, inDim, init="uniform", initValueScale=parmScale)
b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale)
W = LearnableParameter(outDim, inDim, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
t = Times(W, x)
z = Plus(t, b)
y = Sigmoid(z)
]
DNNImageSigmoidLayer(inW, inH, inC, outDim, x, parmScale) = [
W = ImageParameter(outDim, inW, inH, inC, init="uniform", initValueScale=parmScale, imageLayout=$imageLayout$)
b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale)
W = ImageParameter(outDim, inW, inH, inC, init="uniform", initValueScale=parmScale, initOnCPUOnly=true, imageLayout=$imageLayout$)
b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
t = Times(W, x)
z = Plus(t, b)
y = Sigmoid(z)
]
DNNLayer(inDim, outDim, x, parmScale) = [
W = LearnableParameter(outDim, inDim, init="uniform", initValueScale=parmScale)
b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale)
W = LearnableParameter(outDim, inDim, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
t = Times(W, x)
z = Plus(t, b)
]
DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [
W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale, initOnCPUOnly=true)
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
@ -32,12 +32,36 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [
y = RectifiedLinear(bn)
]
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) = [
convW = LearnableParameter(outMap, inWCount, init="uniform", initValueScale=wScale)
convB = ImageParameter(1, 1, outMap, init="fixedValue", value=bValue, imageLayout=$imageLayout$)
conv = Convolution(convW, inp, kW, kH, outMap, hStride, vStride, zeroPadding=false, imageLayout=$imageLayout$)
convPlusB = Plus(conv, convB);
act = RectifiedLinear(convPlusB);
ConvW(outMap, inWCount, wScale) = [
W = LearnableParameter(outMap, inWCount, init="uniform", initValueScale=wScale, initOnCPUOnly=true)
]
ConvB(outMap, bValue) = [
b = ImageParameter(1, 1, outMap, init="fixedValue", value=bValue, imageLayout=$imageLayout$)
]
Conv2D(w, inp, kW, kH, outMap, hStride, vStride) = [
c = Convolution(w, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true, imageLayout=$imageLayout$)
]
ConvND(w, inp, kW, kH, inMap, outMap, hStride, vStride) = [
c = Convolution(w, inp, {kW, kH, inMap}, mapCount=outMap, stride={hStride, vStride, inMap}, sharing={true, true, true}, autoPadding={true, true, false}, lowerPad=0, upperPad=0, imageLayout=$imageLayout$)
]
Conv2DReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) = [
w = ConvW(outMap, inWCount, wScale)
b = ConvB(outMap, bValue)
c = Conv2D(w, inp, kW, kH, outMap, hStride, vStride)
cpb = Plus(c, b);
out = RectifiedLinear(cpb);
]
ConvNDReLULayer(inp, kW, kH, inMap, inWCount, outMap, hStride, vStride, wScale, bValue) = [
w = ConvW(outMap, inWCount, wScale)
b = ConvB(outMap, bValue)
c = ConvND(w, inp, kW, kH, inMap, outMap, hStride, vStride)
cpb = Plus(c, b);
out = RectifiedLinear(cpb);
]
ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst) = [
@ -51,7 +75,7 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
]
ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [
W = LearnableParameter(outMap, inWCount, init=Gaussian, initValueScale=wScale)
W = LearnableParameter(outMap, inWCount, init=Gaussian, initValueScale=wScale, initOnCPUOnly=true)
c = ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
]
@ -59,3 +83,7 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
c = ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
y = RectifiedLinear(c)
]
MaxNDPooling(inp, kW, kH, hStride, vStride) = [
p = Pooling(inp, "max", {kW, kH, 1}, stride={hStride, vStride, 1}, autoPadding={true, true, false}, lowerPad=0, upperPad=0, imageLayout=$imageLayout$)
]

View file

@ -70,7 +70,7 @@ To run the sample, navigate to the Data folder and run the following command:
3. 03_ConvBatchNorm.ndl is almost identical to 02_Convolution.ndl
except that it uses batch normalization for the convolutional and fully connected layers.
As a result, it achieves around 0.92% error after training for just 2 epochs (and less than 30 seconds).
As a result, it achieves around 0.8% error after training for just 2 epochs (and less than 30 seconds).
To run the sample, navigate to the Data folder and run the following command:
`cntk configFile=../Config/03_ConvBatchNorm.cntk`

View file

@ -12,7 +12,6 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"
prefetch = "true"
@ -45,6 +44,7 @@ Train = [
readerType = "UCIFastReader"
file = "$DataDir$/Train.txt"
randomize = "auto"
minibatchMode="full"
features = [
dim = 3072
start = 1

View file

@ -12,11 +12,10 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"
prefetch = "true"
command = Train:AddBNEval:Test
command = Train:Test
stderr = "$OutputDir$/02_BatchNormConv"
traceLevel = 1
@ -44,6 +43,7 @@ Train = [
readerType = "UCIFastReader"
file = "$DataDir$/Train.txt"
randomize = "auto"
minibatchMode="full"
features = [
dim = 3072
start = 1
@ -57,16 +57,9 @@ Train = [
]
]
AddBNEval = [
action = "edit"
CurModel = "$ModelDir$/02_BatchNormConv"
NewModel = "$ModelDir$/02_BatchNormConv.Eval"
editPath = "$ConfigDir$/02_BatchNormConv.mel"
]
Test = [
action = "test"
modelPath = "$ModelDir$/02_BatchNormConv.Eval"
modelPath = "$ModelDir$/02_BatchNormConv"
# Set minibatch size for testing.
minibatchSize = 16

View file

@ -1,6 +0,0 @@
m=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m)
SetPropertyForSubTree(CE, batchNormEvalMode, true)
SaveModel(m, $NewModel$, format=cntk)

View file

@ -12,12 +12,11 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"
prefetch = "true"
parallelTrain = "false"
command = Train:AddBNEval:Test
command = Train:Test
stderr = "$OutputDir$/03_ResNet"
traceLevel = 1
@ -75,16 +74,9 @@ Train = [
]
]
AddBNEval = [
action = "edit"
CurModel = "$ModelDir$/03_ResNet"
NewModel = "$ModelDir$/03_ResNet.Eval"
editPath = "$ConfigDir$/03_ResNet.mel"
]
Test = [
action = "test"
modelPath = "$ModelDir$/03_ResNet.Eval"
modelPath = "$ModelDir$/03_ResNet"
# Set minibatch size for testing.
minibatchSize = 512

View file

@ -1,6 +0,0 @@
m=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m)
SetPropertyForSubTree(CE, batchNormEvalMode, true)
SaveModel(m, $NewModel$, format=cntk)

View file

@ -38,14 +38,14 @@ DNN=[
rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap2 = 32
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", learningRateMultiplier = 0)
rn2_1 = ResNetNode2Inc(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn2_1_Wproj)
#rn2_1 = ResNetNode2Inc2(rn1_3, cMap1, cMap2, 144, 288, kW, kH, convWScale, 3.5, convBValue, scValue, bnTimeConst)
rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap3 = 64
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", learningRateMultiplier = 0)
rn3_1 = ResNetNode2Inc(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn3_1_Wproj)
#rn3_1 = ResNetNode2Inc2(rn2_3, cMap2, cMap3, 288, 576, kW, kH, convWScale, 3.5, convBValue, scValue, bnTimeConst)
rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

View file

@ -13,12 +13,11 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"
prefetch="true"
parallelTrain="false"
command=Train:AddBNEval:Test
command=Train:Test
stderr="$OutputDir$/04_ResNet_56"
traceLevel=1
@ -76,16 +75,9 @@ Train=[
]
]
AddBNEval=[
action="edit"
CurModel="$ModelDir$/04_ResNet_56"
NewModel="$ModelDir$/04_ResNet_56.Eval"
editPath="$ConfigDir$/03_ResNet.mel"
]
Test=[
action="test"
modelPath="$ModelDir$/04_ResNet_56.Eval"
modelPath="$ModelDir$/04_ResNet_56"
# Set minibatch size for testing.
minibatchSize=512

View file

@ -53,7 +53,7 @@ DNN=[
rn1_18= ResNetNode2(rn1_17, cMap1, 144, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap2 = 32
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", learningRateMultiplier = 0)
rn2_1 = ResNetNode2Inc(rn1_18, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn2_1_Wproj)
#rn2_1 = ResNetNode2Inc2(rn1_18, cMap1, cMap2, 144, 288, kW, kH, convWScale, 3.5, convBValue, scValue, bnTimeConst)
rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
@ -75,7 +75,7 @@ DNN=[
rn2_18= ResNetNode2(rn2_17, cMap2, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap3 = 64
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", learningRateMultiplier = 0)
rn3_1 = ResNetNode2Inc(rn2_18, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn3_1_Wproj)
#rn3_1 = ResNetNode2Inc2(rn2_18, cMap2, cMap3, 288, 576, kW, kH, convWScale, 3.5, convBValue, scValue, bnTimeConst)
rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

View file

@ -0,0 +1,80 @@
RootDir = "."
ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
ndlMacros = "$ConfigDir$/Macros.ndl"
precision = "float"
deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
prefetch = "true"
command = Train:Test
modelPath = "$ModelDir$/05_ConvLocal"
stderr = "$OutputDir$/05_ConvLocal"
traceLevel = 1
numMBsToShowResult = 50
Train = [
action = "train"
NDLNetworkBuilder = [
networkDescription = "$ConfigDir$/05_ConvLocal.ndl"
]
SGD = [
epochSize = 49984
minibatchSize = 64
learningRatesPerMB = 0.01*10:0.003*10:0.001
momentumPerMB = 0.9*20:0.99
maxEpochs = 30
L2RegWeight = 0.03
]
reader = [
readerType = "UCIFastReader"
file = "$DataDir$/Train.txt"
randomize = "auto"
minibatchMode="full"
features = [
dim = 3072
start = 1
]
labels = [
dim = 1
start = 0
labelDim = 10
labelMappingFile = "$DataDir$/labelsmap.txt"
]
]
]
Test = [
action = "test"
# Set minibatch size for testing.
minibatchSize = 16
reader = [
readerType = "UCIFastReader"
file = "$DataDir$/Test.txt"
randomize = "none"
features = [
dim = 3072
start = 1
]
labels = [
dim = 1
start = 0
labelDim = 10
labelMappingFile = "$DataDir$/labelsmap.txt"
]
]
]

View file

@ -0,0 +1,84 @@
load=ndlMnistMacros
run=DNN
ndlMnistMacros = [
ImageW = 32
ImageH = 32
ImageC = 3
LabelDim = 10
features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = $imageLayout$)
featOffs = Const(128)
featScaled = Minus(features, featOffs)
labels = Input(LabelDim, tag = label)
conv1WScale = 0.0043
conv1BValue = 0
conv2WScale = 1.414
conv2BValue = 0
conv3WScale = 1.414
conv3BValue = 0
conv4WScale = 1.414
conv4BValue = 0
fc1WScale = 1.5
fc1BValue = 0
]
DNN=[
# conv1
kW1 = 5
kH1 = 5
cMap1 = 64
hStride1 = 1
vStride1 = 1
# weight[cMap1, kW1 * kH1 * ImageC]
conv1 = ConvReLULayer(featScaled, cMap1, 75, kW1, kH1, hStride1, vStride1, conv1WScale, conv1BValue)
# pool1
pool1W = 3
pool1H = 3
pool1hStride = 2
pool1vStride = 2
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout = $imageLayout$)
# conv2
kW2 = 5
kH2 = 5
cMap2 = 64
hStride2 = 1
vStride2 = 1
# weight[cMap2, kW2 * kH2 * cMap1]
conv2 = ConvReLULayer(pool1, cMap2, 1600, kW2, kH2, hStride2, vStride2, conv2WScale, conv2BValue)
# pool2
pool2W = 3
pool2H = 3
pool2hStride = 2
pool2vStride = 2
pool2 = MaxPooling(conv2, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout = $imageLayout$)
# conv_local3
kW3 = 3
kH3 = 3
cMap3 = 64
hStride3 = 1
vStride3 = 1
# weight[cMap3 * pool2OutW * poolOutH, kW3 * kH3 * cMap2]
conv3 = ConvLocalReLULayer(pool2, cMap3, 3136, cMap2, 576, kW3, kH3, hStride3, vStride3, conv3WScale, conv3BValue)
# conv_local4
kW4 = 3
kH4 = 3
cMap4 = 32
hStride4 = 1
vStride4 = 1
# weight[cMap4 * conv3OutW * conv3OutH, kW4 * kH4 * cMap3]
conv4 = ConvLocalReLULayer(conv3, cMap4, 1568, cMap3, 576, kW4, kH4, hStride4, vStride4, conv4WScale, conv4BValue)
ol = DnnImageLastLayer(7, 7, cMap4, labelDim, conv4, fc1WScale, fc1BValue)
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
Err = ErrorPrediction(labels, ol, tag = Eval)
OutputNodes = ol
]

View file

@ -7,6 +7,15 @@ ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
y = RectifiedLinear(p)
]
ConvLocalReLULayer(inp, outMap, outWCount, inMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
[
W = LearnableParameter(outWCount, inWCount, init = Gaussian, initValueScale = wScale)
b = ImageParameter(1, 1, outMap, init = fixedValue, value = bValue, imageLayout = $imageLayout$)
c = Convolution(W, inp, {kW, kH, inMap}, mapCount = outMap, stride = {hStride, vStride, inMap}, sharing = {false, false, false}, imageLayout = $imageLayout$)
p = Plus(c, b)
y = RectifiedLinear(p)
]
ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
[
b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
@ -15,7 +24,7 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
isd = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, isd, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
]
ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
@ -30,6 +39,17 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
y = RectifiedLinear(c)
]
ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst)
[
b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, isd, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
]
ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
[
# First convolution layer.
@ -48,7 +68,7 @@ ResNetNode2Inc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, b
c2 = ConvBNLayer(c1, outMap, wCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst)
# Projection convolution layer.
c_proj = ConvBNLayerW(Wproj, inp, outMap, 1, 1, 2, 2, bValue, scValue, bnTimeConst)
c_proj = ProjLayer(Wproj, inp, outMap, 2, 2, bValue, scValue, bnTimeConst)
#c_proj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = $imageLayout$)
p = Plus(c2, c_proj)
@ -95,7 +115,7 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
bn = BatchNormalization(t, sc, b, m, isd, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
]
@ -107,7 +127,7 @@ DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeCon
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
bn = BatchNormalization(t, sc, b, m, isd, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
]
@ -118,3 +138,11 @@ DnnLastLayer(hiddenDim, labelDim, x, wScale, bValue)
t = Times(W, x)
z = Plus(t, b)
]
DnnImageLastLayer(inW, inH, inC, labelDim, x, wScale, bValue)
[
W = ImageParameter(labelDim, inW, inH, inC, init = Gaussian, initValueScale = wScale, imageLayout=$imageLayout$)
b = LearnableParameter(labelDim, init = fixedValue, value = bValue)
t = Times(W, x)
z = Plus(t, b)
]

View file

@ -6411,4 +6411,4 @@ evalNodeNames are not specified, using all the default evalnodes and training cr
Allocating matrices for forward and/or backward propagation.
Minibatch[1-20]: Samples Seen = 10000 Err: ErrorPrediction/Sample = 0.0819 CE: CrossEntropyWithSoftmax/Sample = 0.35141698
Final Results: Minibatch[1-20]: Samples Seen = 10000 Err: ErrorPrediction/Sample = 0.0819 CE: CrossEntropyWithSoftmax/Sample = 0.35141698 Perplexity = 1.4210798
COMPLETED
__COMPLETED__

View file

@ -9899,4 +9899,4 @@ evalNodeNames are not specified, using all the default evalnodes and training cr
Allocating matrices for forward and/or backward propagation.
Minibatch[1-20]: Samples Seen = 10000 Err: ErrorPrediction/Sample = 0.0644 CE: CrossEntropyWithSoftmax/Sample = 0.3034767
Final Results: Minibatch[1-20]: Samples Seen = 10000 Err: ErrorPrediction/Sample = 0.0644 CE: CrossEntropyWithSoftmax/Sample = 0.3034767 Perplexity = 1.35456
COMPLETED
__COMPLETED__

View file

@ -25,7 +25,7 @@ Then install numpy package by following instruction from: http://www.scipy.org/i
2. Alternatively, install the Python Anaconda distribution, which contains most of the popular Python packages, including numpy:
http://continuum.io/downloads
`-f` parameter is optional and specifies output format of the datasets. `cudnn` option (default) saves dataset in a spatial-major format used by cuDNN, while `legacy` - in CNTK legacy format. Use `cudnn` if CNTK is compiled with cuDNN **and** running on GPU and `legacy` otherwise.
The `-f` parameter is optional and specifies the output format of the datasets. The `cudnn` option (default) saves the dataset in a spatial-major format used by cuDNN, while `legacy` saves it in the CNTK legacy format. Use `cudnn` if CNTK is compiled with cuDNN and `legacy` otherwise.
ResNet samples require converting the CIFAR-10 dataset to actual images. This can be performed by running the following command:
```
@ -54,5 +54,7 @@ cntk configFile=02_BatchNormConv.cntk
3. 03_ResNet.ndl and 04_ResNet_56.ndl are very deep convolutional networks that use the ResNet architecture and have 20 and 56 layers, respectively (http://arxiv.org/abs/1512.03385).
With 03_ResNet.cntk you should get around 8.2% error after training for about 50 minutes. 04_ResNet_56.cntk should produce around 6.4% error after training for about 3 hours (see log files in the Output directory).
4. 05_ConvLocal.cntk uses locally-connected convolution layers (see `conv_local3` and `conv_local4` in `05_ConvLocal.ndl`) and resembles a network described here: https://code.google.com/p/cuda-convnet/source/browse/trunk/example-layers/layers-conv-local-11pct.cfg
For more details, refer to the .ndl and corresponding .cntk files.

View file

@ -66,7 +66,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.875
# Crop scale ratio jitter type.

View file

@ -1,9 +1,6 @@
m1=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m1)
# Switch batch normalization to eval mode.
SetPropertyForSubTree(CE, batchNormEvalMode, true)
# Add top-5 error prediction node.
ErrTop5 = ErrorPrediction(labels, OutputNodes.z, Const(5), tag = Eval)

View file

@ -1,18 +1,29 @@
Conv(W, inp, outMap, kW, kH, hStride, vStride)
[
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
]
BN(inp, mapCount, bValue, scValue, bnTimeConst)
[
b = Parameter(mapCount, 1, init = fixedValue, value = bValue)
sc = Parameter(mapCount, 1, init = fixedValue, value = scValue)
m = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
y = BatchNormalization(inp, sc, b, m, isd, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn")
]
ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
[
b = Parameter(outMap, 1, init = fixedValue, value = bValue)
sc = Parameter(outMap, 1, init = fixedValue, value = scValue)
m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
y = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn")
c = Conv(W, inp, outMap, kW, kH, hStride, vStride)
y = BN(c, outMap, bValue, scValue, bnTimeConst)
]
ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
[
W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
c = ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
c = Conv(W, inp, outMap, kW, kH, hStride, vStride)
y = BN(c, outMap, bValue, scValue, bnTimeConst)
]
ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
@ -21,6 +32,19 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
y = RectifiedLinear(c)
]
Conv1x1(inp, outMap, inMap, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
[
W = Parameter(outMap, inMap, init = Gaussian, initValueScale = wScale)
c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = "cudnn")
y = BN(c, outMap, bValue, scValue, bnTimeConst)
]
Conv1x1ReLU(inp, outMap, inMap, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
[
c = Conv1x1(inp, outMap, inMap, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
y = RectifiedLinear(c)
]
# Standard building block for ResNet with identity shortcut (option A).
ResNetNode2A(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue)
[
@ -48,15 +72,30 @@ ResNetNode2AInc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue,
y2 = RectifiedLinear(p)
]
# Standard building block for ResNet with padding (option B).
ResNetNode2BInc(inp, outMap, inMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
[
# First convolution layer.
c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 2, 2, wScale, bValue, scValue, bnTimeConst)
# Second convolution layer, no ReLU.
c2 = ConvBNLayer(c1, outMap, wCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst)
# Projection convolution layer.
c_proj = Conv1x1(inp, outMap, inMap, 2, 2, wScale, bValue, scValue, bnTimeConst)
p = Plus(c2, c_proj)
y2 = RectifiedLinear(p)
]
# Bottleneck building block for ResNet.
ResNetNode3A(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, bnTimeConst)
[
# 1x1 reducing convolution.
c1 = ConvBNReLULayer(inp, convMap, inMap, 1, 1, 1, 1, wScale, bValue, scValue, bnTimeConst)
c1 = Conv1x1ReLU(inp, convMap, inMap, 1, 1, wScale, bValue, scValue, bnTimeConst)
# 3x3 convolution.
c2 = ConvBNReLULayer(c1, convMap, convWCount, 3, 3, 1, 1, wScale, bValue, scValue, bnTimeConst)
# 1x1 expanding convolution, no ReLU.
c3 = ConvBNLayer(c2, outMap, convMap, 1, 1, 1, 1, wScale, bValue, scValue, bnTimeConst)
c3 = Conv1x1(c2, outMap, convMap, 1, 1, wScale, bValue, scValue, bnTimeConst)
p = Plus(c3, inp)
y = RectifiedLinear(p)
@ -65,11 +104,11 @@ ResNetNode3A(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, b
ResNetNode3AInc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, bnTimeConst, wProj, projStride)
[
# 1x1 reducing convolution.
c1 = ConvBNReLULayer(inp, convMap, inMap, 1, 1, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
c1 = Conv1x1ReLU(inp, convMap, inMap, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
# 3x3 convolution.
c2 = ConvBNReLULayer(c1, convMap, convWCount, 3, 3, 1, 1, wScale, bValue, scValue, bnTimeConst)
# 1x1 expanding convolution, no ReLU.
c3 = ConvBNLayer(c2, outMap, convMap, 1, 1, 1, 1, wScale, bValue, scValue, bnTimeConst)
c3 = Conv1x1(c2, outMap, convMap, 1, 1, wScale, bValue, scValue, bnTimeConst)
# Input-to-output mapping convolution.
c_proj = ConvBNLayerW(wProj, inp, outMap, 1, 1, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
@ -80,13 +119,13 @@ ResNetNode3AInc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue
ResNetNode3BInc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, bnTimeConst, projStride)
[
# 1x1 reducing convolution.
c1 = ConvBNReLULayer(inp, convMap, inMap, 1, 1, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
c1 = Conv1x1ReLU(inp, convMap, inMap, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
# 3x3 convolution.
c2 = ConvBNReLULayer(c1, convMap, convWCount, 3, 3, 1, 1, wScale, bValue, scValue, bnTimeConst)
# 1x1 expanding convolution, no ReLU.
c3 = ConvBNLayer(c2, outMap, convMap, 1, 1, 1, 1, wScale, bValue, scValue, bnTimeConst)
c3 = Conv1x1(c2, outMap, convMap, 1, 1, wScale, bValue, scValue, bnTimeConst)
# Input-to-output mapping convolution.
c_proj = ConvBNLayer(inp, outMap, inMap, 1, 1, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
c_proj = Conv1x1(inp, outMap, inMap, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
p = Plus(c3, c_proj)
y = RectifiedLinear(p)
@ -99,3 +138,8 @@ DnnLayer(hiddenDim, labelDim, x, wScale, bValue)
t = Times(W, x)
z = Plus(t, b)
]
MaxNDPooling(inp, kW, kH, hStride, vStride)
[
p = Pooling(inp, "max", {kW, kH, 1}, stride = {hStride, vStride, 1}, autoPadding = {true, true, false}, imageLayout = "cudnn")
]

View file

@ -71,7 +71,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.46666:0.875
# Crop scale ratio jitter type.

View file

@ -41,8 +41,8 @@ DNN=[
conv1WScale = 0.6
conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, conv1WScale, convBValue, scValue, bnTimeConst)
# Max pooling
pool1W = 2
pool1H = 2
pool1W = 3
pool1H = 3
pool1hs = 2
pool1vs = 2
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn")

View file

@ -0,0 +1,115 @@
RootDir = "."
ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
ndlMacros="$ConfigDir$/Macros.ndl"
precision="float"
deviceId="Auto"
command=Train:CreateEval:Test
parallelTrain="false"
stderr="$OutputDir$/ResNet_18"
traceLevel=1
numMBsToShowResult=500
Train=[
action="train"
modelPath="$ModelDir$/ResNet_18"
NDLNetworkBuilder=[
networkDescription="$ConfigDir$/ResNet_18.ndl"
]
SGD=[
epochSize=0
minibatchSize=256
# Note that learning rates are 10x more than in the paper due to a different
# momentum update rule in CNTK: v{t + 1} = lr*(1 - momentum)*g{t + 1} + momentum*v{t}
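# For example: the classical rule v{t + 1} = lr*g{t + 1} + momentum*v{t} applies an effective
# per-gradient step of lr, while CNTK scales it by (1 - momentum) = 0.1 at momentum 0.9,
# so the learning rates below are 10x the paper's values (e.g. 1.0 instead of 0.1).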
learningRatesPerMB=1.0*35:0.1*35:0.01
momentumPerMB=0.9
maxEpochs=125
gradUpdateType="None"
L2RegWeight=0.0001
dropoutRate=0
ParallelTrain=[
parallelizationMethod="DataParallelSGD"
distributedMBReading="true"
parallelizationStartEpoch=1
DataParallelSGD=[
gradientBits=32
]
]
]
reader=[
readerType="ImageReader"
# Map file which maps images to labels using the following format:
# <full path to image><tab><numerical label (0-based class id)>
# Example:
# C:\Data\ImageNet\2012\train\n01440764\n01440764_10026.JPEG<tab>0
file="$DataDir$/train_map.txt"
# Randomize images before every epoch. Possible values: None, Auto. Default: Auto.
randomize="Auto"
features=[
# Below are the required parameters.
width=224
height=224
channels=3
# Below are the optional parameters.
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.46666:0.875
# Crop scale ratio jitter type.
# Possible values: None, UniRatio, UniLength, UniArea. Default: UniRatio
jitterType="UniRatio"
# Interpolation to use when scaling image to width x height size.
# Possible values: nearest, linear, cubic, lanczos. Default: linear.
interpolations="Linear"
# Stores mean values for each pixel in OpenCV matrix XML format.
meanFile="$ConfigDir$/ImageNet1K_mean.xml"
]
labels=[
labelDim=1000
]
]
]
CreateEval=[
action="edit"
CurModel="$ModelDir$/ResNet_18"
NewModel="$ModelDir$/ResNet_18.Eval"
editPath="$ConfigDir$/CreateEvalModel.mel"
]
Test=[
action="test"
modelPath="$ModelDir$/ResNet_18.Eval"
# Set minibatch size for testing.
minibatchSize=64
reader=[
readerType="ImageReader"
file="$DataDir$/val_map.txt"
randomize="None"
features=[
width=224
height=224
channels=3
cropType="Center"
meanFile="$ConfigDir$/ImageNet1K_mean.xml"
]
labels=[
labelDim=1000
]
]
]

View file

@ -0,0 +1,72 @@
load=ndlMacros
run=DNN
ndlMacros = [
ImageW = 224
ImageH = 224
ImageC = 3
LabelDim = 1000
features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn")
labels = Input(LabelDim, tag = label)
# Kernels width and height.
kW = 3
kH = 3
# Kernel stride.
hs = 1
vs = 1
# Initial parameter values.
convWScale = 7.07
convBValue = 0
fcWScale = 1.13
fcBValue = 0
scValue = 1
# Batch normalization time constant.
bnTimeConst = 32768
]
DNN=[
conv1WScale = 0.6
cMap1 = 64
conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, conv1WScale, convBValue, scValue, bnTimeConst)
# Max pooling
pool1W = 3
pool1H = 3
pool1hs = 2
pool1vs = 2
pool1 = MaxNDPooling(conv1, pool1W, pool1H, pool1hs, pool1vs)
rn1_1 = ResNetNode2A(pool1, cMap1, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn1_2 = ResNetNode2A(rn1_1, cMap1, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap2 = 128
rn2_1 = ResNetNode2BInc(rn1_2, cMap2, cMap1, 576, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_2 = ResNetNode2A(rn2_1, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap3 = 256
rn3_1 = ResNetNode2BInc(rn2_2, cMap3, cMap2, 1152, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn3_2 = ResNetNode2A(rn3_1, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap4 = 512
rn4_1 = ResNetNode2BInc(rn3_2, cMap4, cMap3, 2304, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn4_2 = ResNetNode2A(rn4_1, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn4_3 = ResNetNode2A(rn4_2, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
# Global average pooling
pool2W = 7
pool2H = 7
pool2hs = 1
pool2vs = 1
pool5 = AveragePooling(rn4_3, pool2W, pool2H, pool2hs, pool2vs, imageLayout = "cudnn")
ol = DnnLayer(cMap4, labelDim, pool5, fcWScale, fcBValue)
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
Err = ErrorPrediction(labels, ol, tag = Eval)
OutputNodes = ol
]

View file

@ -70,7 +70,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.46666:0.875
# Crop scale ratio jitter type.

View file

@ -35,26 +35,24 @@ DNN=[
cMap1 = 64
conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, conv1WScale, convBValue, scValue, bnTimeConst)
# Max pooling
pool1W = 2
pool1H = 2
pool1W = 3
pool1H = 3
pool1hs = 2
pool1vs = 2
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn")
pool1 = MaxNDPooling(conv1, pool1W, pool1H, pool1hs, pool1vs)
rn1_1 = ResNetNode2A(pool1, cMap1, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn1_2 = ResNetNode2A(rn1_1, cMap1, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn1_3 = ResNetNode2A(rn1_2, cMap1, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap2 = 128
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj64to128Filename$", needGradient = false)
rn2_1 = ResNetNode2AInc(rn1_3, cMap2, 576, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn2_1_Wproj)
rn2_1 = ResNetNode2BInc(rn1_3, cMap2, cMap1, 576, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_2 = ResNetNode2A(rn2_1, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_3 = ResNetNode2A(rn2_2, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_4 = ResNetNode2A(rn2_3, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap3 = 256
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj128to256Filename$", needGradient = false)
rn3_1 = ResNetNode2AInc(rn2_4, cMap3, 1152, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn3_1_Wproj)
rn3_1 = ResNetNode2BInc(rn2_4, cMap3, cMap2, 1152, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn3_2 = ResNetNode2A(rn3_1, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn3_3 = ResNetNode2A(rn3_2, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn3_4 = ResNetNode2A(rn3_3, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
@ -62,8 +60,7 @@ DNN=[
rn3_6 = ResNetNode2A(rn3_5, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap4 = 512
rn4_1_Wproj = Parameter(cMap4, cMap3, init = fromFile, initFromFilePath = "$Proj256to512Filename$", needGradient = false)
rn4_1 = ResNetNode2AInc(rn3_6, cMap4, 2304, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn4_1_Wproj)
rn4_1 = ResNetNode2BInc(rn3_6, cMap4, cMap3, 2304, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn4_2 = ResNetNode2A(rn4_1, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn4_3 = ResNetNode2A(rn4_2, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

View file

@ -71,7 +71,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.46666:0.875
# Crop scale ratio jitter type.

View file

@ -41,11 +41,11 @@ DNN=[
conv1WScale = 0.6
conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, conv1WScale, convBValue, scValue, bnTimeConst)
# Max pooling
pool1W = 2
pool1H = 2
pool1W = 3
pool1H = 3
pool1hs = 2
pool1vs = 2
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn")
pool1 = MaxNDPooling(conv1, pool1W, pool1H, pool1hs, pool1vs)
rn1_1 = ResNetNode3BInc(pool1, cMap1, cMap1, cMap3, 576, convWScale, convBValue, scValue, bnTimeConst, 1)
rn1_2 = ResNetNode3A(rn1_1, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue, bnTimeConst)

View file

@ -1,9 +1,6 @@
m1=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m1)
# Switch batch normalization to eval mode.
SetPropertyForSubTree(CE, batchNormEvalMode, true)
# Add top-5 error prediction node.
ErrTop5 = ErrorPrediction(labels, OutputNodes.z, Const(5), tag = "eval")

View file

@ -17,7 +17,7 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue)
m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false)
bn = BatchNormalization(t, sc, b, m, isd, spatial = false)
y = RectifiedLinear(bn)
]
@ -50,6 +50,6 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
isd = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, imageLayout = "cudnn")
bn = BatchNormalization(c, sc, b, m, isd, spatial = true, imageLayout = "cudnn")
y = RectifiedLinear(bn);
]

View file

@ -56,7 +56,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.875
# Crop scale ratio jitter type.

View file

@ -65,7 +65,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.875
# Crop scale ratio jitter type.

View file

@ -65,7 +65,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.875
# Crop scale ratio jitter type.

View file

@ -31,6 +31,8 @@
# defaults to /usr/local/
# These can be overridden on the command line, e.g. make BUILDTYPE=debug
ARCH=$(shell uname)
ifndef BUILD_TOP
BUILD_TOP=.
endif
@ -211,9 +213,11 @@ CNTKMATH:=cntkmath
BUILDINFO:= $(SOURCEDIR)/CNTK/buildinfo.h
GENBUILD:=Tools/generate_build_info
$(BUILDINFO): $(GENBUILD)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@$(GENBUILD) $(BUILD_TOP)/Config.make
BUILDINFO_OUTPUT := $(shell $(GENBUILD) $(BUILD_TOP)/Config.make && echo Success)
ifneq ("$(BUILDINFO_OUTPUT)","Success")
$(error Could not generate $(BUILDINFO))
endif
########################################
@ -228,6 +232,9 @@ READER_SRC =\
$(SOURCEDIR)/Readers/ReaderLib/ReaderShim.cpp \
$(SOURCEDIR)/Readers/ReaderLib/ChunkRandomizer.cpp \
$(SOURCEDIR)/Readers/ReaderLib/SequenceRandomizer.cpp \
$(SOURCEDIR)/Readers/ReaderLib/SequencePacker.cpp \
$(SOURCEDIR)/Readers/ReaderLib/BpttPacker.cpp \
$(SOURCEDIR)/Readers/ReaderLib/PackerBase.cpp \
$(SOURCEDIR)/Readers/ReaderLib/SampleModePacker.cpp \
COMMON_SRC =\
@ -250,6 +257,7 @@ MATH_SRC =\
$(SOURCEDIR)/Math/TensorView.cpp \
$(SOURCEDIR)/Math/CUDAPageLockedMemAllocator.cpp \
$(SOURCEDIR)/Math/ConvolutionEngine.cpp \
$(SOURCEDIR)/Math/BatchNormalizationEngine.cpp \
ifdef CUDA_PATH
MATH_SRC +=\
@ -258,7 +266,9 @@ MATH_SRC +=\
$(SOURCEDIR)/Math/GPUSparseMatrix.cu \
$(SOURCEDIR)/Math/GPUWatcher.cu \
$(SOURCEDIR)/Math/MatrixQuantizerGPU.cu \
$(SOURCEDIR)/Math/CuDnnCommon.cu \
$(SOURCEDIR)/Math/CuDnnConvolutionEngine.cu \
$(SOURCEDIR)/Math/CuDnnBatchNormalization.cu \
$(SOURCEDIR)/Math/GPUDataTransferer.cpp \
else
@ -376,6 +386,7 @@ LUSEQUENCEREADER_SRC =\
$(SOURCEDIR)/Readers/LUSequenceReader/DataWriterLocal.cpp \
$(SOURCEDIR)/Readers/LUSequenceReader/LUSequenceParser.cpp \
$(SOURCEDIR)/Readers/LUSequenceReader/LUSequenceReader.cpp \
$(SOURCEDIR)/Readers/LUSequenceReader/LUSequenceWriter.cpp \
LUSEQUENCEREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(LUSEQUENCEREADER_SRC))
@ -595,8 +606,9 @@ CNTK_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(C
CNTK:=$(BINDIR)/cntk
ALL+=$(CNTK)
SRC+=$(CNTK_SRC)
$(CNTK): $(BUILDINFO) $(CNTK_OBJ) | $(CNTKMATH_LIB)
$(CNTK): $(CNTK_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building output for $(ARCH) with build type $(BUILDTYPE)
@ -638,10 +650,7 @@ $(OBJDIR)/%.o : %.cpp Makefile
@mkdir -p $(dir $@)
$(CXX) -c $< -o $@ $(COMMON_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(INCLUDEPATH:%=-I%) -MD -MP -MF ${@:.o=.d}
.PHONY: force clean buildall all
force: $(BUILDINFO)
.PHONY: clean buildall all
clean:
@echo $(SEPARATOR)

View file

@ -14,6 +14,7 @@
#include "ConvolutionalNodes.h"
#include "NonlinearityNodes.h"
#include "ReshapingNodes.h"
#include "InputAndParamNodes.h"
#include "TensorShape.h"
namespace Microsoft { namespace MSR { namespace CNTK {
@ -288,36 +289,135 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
nodePtr = builder.FutureValue(NULL, defaultHiddenActivity, rows, timeStep, name);
}
}
else if (cnNodeType == OperationNameOf(ConvolutionNode))
else if (cnNodeType == OperationNameOf(ConvolutionNode) || cnNodeType == OperationNameOf(PoolingNode))
{
if (parameter.size() != 7)
RuntimeError("%ls should have 7 fixed parameters[weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels,horizontalSubsample, verticalSubsample] and two optional parameters [zeroPadding = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"HWC\"|\"cudnn\"].", cnNodeType.c_str());
if (parameter.size() != 3 && parameter.size() != 7)
{
if (cnNodeType == OperationNameOf(ConvolutionNode))
{
RuntimeError("%ls: unexpected parameter count. %ls supports 2 modes: \n"
"1. 2D convolution which takes 7 fixed parameters [weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels,horizontalSubsample, verticalSubsample] \n"
"and two optional parameters [zeroPadding = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"HWC\"|\"cudnn\"]. \n"
"2. ND convolution which takes 3 fixed parameters [weightNodeName, inputValueNodeName, kernelShape] and \n"
"9 optional parameters [mapCount = [1|yourvalue], stride = [1|yourvalue], sharing = [true|yourvalue], autoPadding = [true|yourvalue], lowerPad = [0|yourvalue], upperPad = [0|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"cudnn\"|\"HWC\"]. \n"
"For ND convolution, parameters kernelShape, mapCount, stride, sharing, autoPadding, lowerPad, upperPad can be arrays, e.g. kernelShape={5, 5, 3}",
cnNodeType.c_str(), cnNodeType.c_str());
}
else
{
RuntimeError("%ls: unexpected parameter count. %ls 3 fixed parameters [inputValueNodeName, poolKind, kernelShape] and \n"
"5 optional parameters stride = [1|yourvalue], autoPadding = [true|yourvalue], lowerPad = [0|yourvalue], upperPad = [0|yourvalue], imageLayout = \"cudnn\"|\"HWC\"]. \n"
"Parameters kernelShape, stride, autoPadding, lowerPad, upperPad can be arrays, e.g. kernelShape={5, 5, 3}",
cnNodeType.c_str(), cnNodeType.c_str());
}
}
// setup the parameter position of children so we can hook them up later
nodeParamCount = 2;
nodeParamStart = 0;
nodeParamCount = cnNodeType == OperationNameOf(ConvolutionNode) ? 2 : 1;
if (pass == ndlPassInitial)
{
int id = 2; // skip weightNode and inputValueNode
if (parameter.size() == 3)
{
auto reqParams = node->GetParameters(false);
auto optParams = node->GetParameters(true);
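// paramGetter converts a required parameter (scalar or array) into a TensorShape;
// paramResolver and boolParamResolver do the same for optional named parameters,
// falling back to the supplied default when the parameter is absent
// (boolParamResolver yields a vector<bool> of per-dimension flags).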
auto paramGetter = [reqParams, node](size_t index) -> TensorShape
{
assert(index < reqParams.size());
auto parm = reqParams[index];
if (parm->GetType() != ndlTypeArray)
return TensorShape((size_t)parm->GetScalar());
auto parms = node->GetParentScript()->ParseVariable(parm->GetValue(), false)->GetParameters();
vector<size_t> dims(parms.size());
for (size_t i = 0; i < dims.size(); i++)
dims[i] = parms[i]->GetValue();
return TensorShape(dims);
};
auto paramResolver = [optParams, node](const char* name, size_t defaultVal) -> TensorShape
{
auto res = std::find_if(begin(optParams), end(optParams), [name](const NDLNode<ElemType>* n) { return EqualCI(n->GetName(), name); });
if (res == end(optParams))
return TensorShape(defaultVal);
auto parm = node->GetParentScript()->ParseVariable((*res)->GetValue(), false);
if (parm->GetType() == ndlTypeConstant)
return TensorShape((size_t)parm->GetValue());
auto parms = parm->GetParameters();
vector<size_t> dims(parms.size());
for (size_t i = 0; i < dims.size(); i++)
dims[i] = parms[i]->GetValue();
return TensorShape(dims);
};
auto boolParamResolver = [&optParams, node](const char* name, bool defaultVal) -> vector<bool>
{
auto res = std::find_if(begin(optParams), end(optParams), [name](const NDLNode<ElemType>* n) { return EqualCI(n->GetName(), name); });
if (res == end(optParams))
return vector<bool>{defaultVal};
auto parm = node->GetParentScript()->ParseVariable((*res)->GetValue(), false);
if (parm == nullptr)
return vector<bool>{(*res)->GetValue()};
if (parm->GetType() != ndlTypeArray)
return vector<bool>{parm->GetValue()};
auto parms = parm->GetParameters();
vector<bool> dims(parms.size());
for (size_t i = 0; i < dims.size(); i++)
dims[i] = parms[i]->GetValue();
return dims;
};
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass);
id = 0; // reset counter because the params array starts at zero
size_t kernelWidth = ((NDLNode<ElemType>*) params[id++])->GetScalar();
size_t kernelHeight = ((NDLNode<ElemType>*) params[id++])->GetScalar();
size_t outputChannels = ((NDLNode<ElemType>*) params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*) params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*) params[id++])->GetScalar();
assert(id == 5);
auto kernelShape = paramGetter(reqParams.size() - 1);
auto mapCount = paramResolver("mapCount", 1);
auto stride = paramResolver("stride", 1);
auto sharing = boolParamResolver("sharing", true);
auto autoPad = boolParamResolver("autoPadding", true);
auto lowerPad = paramResolver("lowerPad", 0);
auto upperPad = paramResolver("upperPad", 0);
ImageLayoutKind imageLayout = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "CHW"));
size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0");
// optional
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));
bool zeroPadding = node->GetOptionalParameter("zeroPadding", "false");
size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0");
auto pool = PoolKind::None;
if (cnNodeType == OperationNameOf(PoolingNode))
{
auto parm = node->GetParentScript()->ParseVariable(reqParams[1]->GetValue(), false);
pool = PoolKindFrom(wstring(parm->GetValue()));
}
nodePtr = builder.Convolution(NULL, NULL, kernelWidth, kernelHeight, outputChannels,
horizontalSubsample, verticalSubsample, imageLayoutKind, zeroPadding, maxTempMemSizeInSamples, name);
if (pool == PoolKind::None)
{
nodePtr = builder.Convolution(NULL, NULL, kernelShape, mapCount, stride, sharing,
autoPad, lowerPad, upperPad, imageLayout, maxTempMemSizeInSamples, name);
}
else
{
nodePtr = builder.Pooling(NULL, pool, kernelShape, stride, autoPad, lowerPad, upperPad, imageLayout, name);
}
}
else if (parameter.size() == 7)
{
int id = 2; // skip weightNode and inputValueNode
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass);
id = 0; // reset counter because the params array starts at zero
size_t kernelWidth = ((NDLNode<ElemType>*) params[id++])->GetScalar();
size_t kernelHeight = ((NDLNode<ElemType>*) params[id++])->GetScalar();
size_t outputChannels = ((NDLNode<ElemType>*) params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*) params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*) params[id++])->GetScalar();
assert(id == 5);
// optional
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));
bool zeroPadding = node->GetOptionalParameter("zeroPadding", "false");
size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0");
nodePtr = builder.Convolution(NULL, NULL, kernelWidth, kernelHeight, outputChannels,
horizontalSubsample, verticalSubsample, imageLayoutKind, zeroPadding,
maxTempMemSizeInSamples, name);
}
else
assert(false);
}
}
else if (cnNodeType == OperationNameOf(MaxPoolingNode))
@ -392,9 +492,9 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
vector<void*> params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass);
// Optional parameters
bool eval = node->GetOptionalParameter("eval", "false");
bool spatial = node->GetOptionalParameter("spatial", "false");
double normTimeConst = node->GetOptionalParameter("normalizationTimeConstant", "0");
double blendTimeConst = node->GetOptionalParameter("blendTimeConstant", "0");
double epsilon = node->GetOptionalParameter("epsilon", "0.00001");
std::wstring bnEngineS = node->GetOptionalParameter("engine", "cntk");
bool useCntkEngine;
@ -406,7 +506,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
InvalidArgument("Unsupported batch normalization engine, choose either \"cntk\"(default) or \"cudnn\".");
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "CHW"));
nodePtr = builder.BatchNormalization(nullptr, nullptr, nullptr, nullptr, nullptr, eval, spatial, normTimeConst, epsilon, useCntkEngine, imageLayoutKind, name);
nodePtr = builder.BatchNormalization(nullptr, nullptr, nullptr, nullptr, nullptr, spatial, normTimeConst, blendTimeConst, epsilon, useCntkEngine, imageLayoutKind, name);
}
}
else

View file

@ -157,6 +157,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
#endif
else if (EqualInsensitive(nodeType, OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode), L"CBCEWithSM")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ConvolutionNode), L"Convolve")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(PoolingNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceNode), L"CosDist")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceWithNegativeSamplesNode), L"CosWithNegSamples")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(CosineNode), L"Cos")) ret = true;

View file

@ -79,14 +79,15 @@ Logistic(label, probability, tag='') = new ComputationNode [ operation = 'Logist
WeightedLogistic(label, probability, instanceWeight, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability : instanceWeight) /*plus the function args*/ ]
ReconcileMBLayout(dataInput, layoutInput, tag='') = new ComputationNode [ operation = 'ReconcileMBLayout' ; inputs = (dataInput : layoutInput) /*plus the function args*/ ]
CastAs (type, data) = ReconcileMBLayout (data, type) # read as CastAs<type>(data) where the cast may consist of rearranging the data w.r.t. MBLayout or broadcasting across sequence items
Convolution(weightNode, inputValueNode, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode) /*plus the function args*/ ]
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
Pooling(input, poolKind/*'max'|'average'*/, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Pooling' ; inputs = (input); pool = poolKind ; kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
MaxPooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxPooling' ; inputs = input /*plus the function args*/ ]
AveragePooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'AveragePooling' ; inputs = input /*plus the function args*/ ]
ColumnwiseCrossProduct = KhatriRaoProduct // deprecated
ClassificationError = ErrorPrediction
Delay = PastValue
BatchNormalization(input, scale, bias, runMean, runInvStdDev, eval, spatial, normalizationTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runInvStdDev) /*plus the function args*/ ]
BatchNormalization(input, scale, bias, runMean, runInvStdDev, spatial, normalizationTimeConstant = 0, blendTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runInvStdDev) /*plus the function args*/ ]
Abs(x, tag='') = new ComputationNode [ operation = 'Abs' ; inputs = x /*plus the function args*/ ]
ClassBasedCrossEntropyWithSoftmax(labelClassDescriptorVectorSequence, mainInputInfo, mainWeight, classLogProbsBeforeSoftmax, tag='') = new ComputationNode [ operation = 'ClassBasedCrossEntropyWithSoftmax' ; inputs = (labelClassDescriptorVectorSequence : mainInputInfo : mainWeight : classLogProbsBeforeSoftmax) /*plus the function args*/ ]
ColumnElementTimes(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'ColumnElementTimes' ; inputs = (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ]

View file

@ -70,7 +70,7 @@ void TestCn(const ConfigParameters& config);
void RedirectStdErr(wstring logpath)
{
fprintf(stderr, "Redirecting stderr to file %S\n", logpath.c_str());
LOGPRINTF(stderr, "Redirecting stderr to file %S\n", logpath.c_str());
auto f = make_shared<File>(logpath.c_str(), fileOptionsWrite | fileOptionsText);
if (dup2(fileno(*f), 2) == -1)
{
@ -165,7 +165,7 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
if (numCPUThreads > 0)
{
std::cerr << "Using " << numCPUThreads << " CPU threads." << endl;
LOGPRINTF(stderr, "Using %d CPU threads.\n", numCPUThreads);
}
bool progressTracing = config(L"progressTracing", false);
@ -187,14 +187,14 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
if (action[j] == "train" || action[j] == "trainRNN")
{
wstring modelPath = commandParams("modelPath");
std::wcerr << "CNTKModelPath: " << modelPath << endl;
LOGPRINTF(stderr, "CNTKModelPath: %ls\n", modelPath.c_str());
size_t maxEpochs = GetMaxEpochs(commandParams);
std::cerr << "CNTKCommandTrainInfo: " + command[i] << " : " << maxEpochs << endl;
LOGPRINTF(stderr, "CNTKCommandTrainInfo: %s : %d\n", command[i].c_str(), (int) maxEpochs);
fullTotalMaxEpochs += maxEpochs;
}
}
}
std::cerr << "CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : " << fullTotalMaxEpochs << endl;
LOGPRINTF(stderr, "CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : %d\n", (int) fullTotalMaxEpochs);
// set up progress tracing for compute cluster management
if (progressTracing && (!mpi || mpi->IsMainNode()))
@ -225,19 +225,20 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
// print a banner to visually separate each action in the log
const char* delim = "##############################################################################";
const char* prefix = "Action ";
fprintf(stderr, "\n%s\n", delim);
fprintf(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
fprintf(stderr, "# %s\"%s\"%*s #\n", prefix, thisAction.c_str(), (int)(strlen(delim) - strlen(prefix) - thisAction.size() - 6), "");
fprintf(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
fprintf(stderr, "%s\n\n", delim);
fprintf(stderr, "\n");
LOGPRINTF(stderr, "%s\n", delim);
LOGPRINTF(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
LOGPRINTF(stderr, "# %s\"%s\"%*s #\n", prefix, thisAction.c_str(), (int)(strlen(delim) - strlen(prefix) - thisAction.size() - 6), "");
LOGPRINTF(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
LOGPRINTF(stderr, "%s\n\n", delim);
if ((mpi == nullptr) || (commandstoRunOnAllRanks.find(thisAction) != commandstoRunOnAllRanks.end()) || mpi->IsMainNode())
{
if (thisAction == "train" || thisAction == "trainRNN")
{
std::cerr << "CNTKCommandTrainBegin: " + command[i] << endl;
LOGPRINTF(stderr, "CNTKCommandTrainBegin: %s\n", command[i].c_str());
DoTrain<ConfigParameters, ElemType>(commandParams);
std::cerr << "CNTKCommandTrainEnd: " + command[i] << endl;
LOGPRINTF(stderr, "CNTKCommandTrainEnd: %s\n", command[i].c_str());
fullEpochsOffset += GetMaxEpochs(commandParams);
}
else if (thisAction == "adapt")
@ -298,7 +299,8 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
}
}
fprintf(stderr, "\nAction \"%s\" complete.\n\n", thisAction.c_str());
fprintf(stderr, "\n");
LOGPRINTF(stderr, "Action \"%s\" complete.\n\n", thisAction.c_str());
NDLScript<ElemType> ndlScript;
ndlScript.ClearGlobal(); // clear global macros between commands
@ -321,51 +323,51 @@ std::string TimeDateStamp()
void PrintBuiltInfo()
{
fprintf(stderr, "-------------------------------------------------------------------\n");
fprintf(stderr, "Build info: \n\n");
fprintf(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
fprintf(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "Build info: \n\n");
LOGPRINTF(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
LOGPRINTF(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
#ifdef _BUILDTYPE_
fprintf(stderr, "\t\tBuild type: %s\n", _BUILDTYPE_);
LOGPRINTF(stderr, "\t\tBuild type: %s\n", _BUILDTYPE_);
#endif
#ifdef _BUILDTARGET_
fprintf(stderr, "\t\tBuild target: %s\n", _BUILDTARGET_);
LOGPRINTF(stderr, "\t\tBuild target: %s\n", _BUILDTARGET_);
#endif
#ifdef _WITH_1BITSGD_
fprintf(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
LOGPRINTF(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
#endif
#ifdef _MATHLIB_
fprintf(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
LOGPRINTF(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
#endif
#ifdef _CUDA_PATH_
fprintf(stderr, "\t\tCUDA_PATH: %s\n", _CUDA_PATH_);
LOGPRINTF(stderr, "\t\tCUDA_PATH: %s\n", _CUDA_PATH_);
#endif
#ifdef _CUB_PATH_
fprintf(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
LOGPRINTF(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
#endif
#ifdef _CUDNN_PATH_
fprintf(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
LOGPRINTF(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
#endif
#ifdef _GIT_EXIST
fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
LOGPRINTF(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
LOGPRINTF(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
#endif
#ifdef _BUILDER_
fprintf(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
LOGPRINTF(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
#endif
#ifdef _BUILDPATH_
fprintf(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
LOGPRINTF(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
#endif
fprintf(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
}
void PrintUsageInfo()
{
fprintf(stderr, "-------------------------------------------------------------------\n");
fprintf(stderr, "Usage: cntk configFile=yourConfigFile\n");
fprintf(stderr, "For detailed information please consult the CNTK book\n");
fprintf(stderr, "\"An Introduction to Computational Networks and the Computational Network Toolkit\"\n");
fprintf(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "Usage: cntk configFile=yourConfigFile\n");
LOGPRINTF(stderr, "For detailed information please consult the CNTK book\n");
LOGPRINTF(stderr, "\"An Introduction to Computational Networks and the Computational Network Toolkit\"\n");
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
}
// ---------------------------------------------------------------------------
@ -414,7 +416,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
for (const auto& arg : args)
startupMessage += L" " + arg;
fprintf(stderr, "%ls\n", startupMessage.c_str());
LOGPRINTF(stderr, "%ls\n", startupMessage.c_str());
// parse command-line options
vector<wstring> sourceFiles;
@ -443,6 +445,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
// compile the BrainScript
wstring bs = L"[\n";
bs += L"include \'cntk.core.bs'"; // start with including the standard macros
// Note: Using lowercase ^^ here to match the Linux name of the CNTK exe.
//bs += standardFunctions + computationNodes + commonMacros + L"\n";
for (const auto& sourceFile : sourceFiles)
@ -451,7 +454,8 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
for (const auto& over : overrides)
bs += L"with [ " + over + L" ]\n";
fprintf(stderr, "\n\nBrainScript -->\n\n%ls\n\n", bs.c_str());
fprintf(stderr, "\n\n");
LOGPRINTF(stderr, "BrainScript -->\n\n%ls\n\n", bs.c_str());
let expr = BS::ParseConfigExpression(bs, move(includePaths)); // parse
let valp = BS::Evaluate(expr); // evaluate parse into a dictionary
@ -460,8 +464,10 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
// legacy parameters that have changed spelling
if (config.Find(L"DoneFile")) // variables follow camel case (start with lower-case letters)
InvalidArgument("Legacy spelling of 'DoneFile' no longer allowed. Use 'doneFile'.");
if (config.Find(L"command")) // spelling error, should be plural. Using 'actions' instead to match the data type.
InvalidArgument("Legacy spelling of 'command' no longer allowed. Use 'actions'.");
if (config.Find(L"type"))
InvalidArgument("Legacy name 'type' no longer allowed. Use 'precision'.");
@ -486,7 +492,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
logpath += msra::strfun::wstrprintf(L"rank%d", (int) mpi->CurrentNodeRank());
RedirectStdErr(logpath);
fprintf(stderr, "%ls\n", startupMessage.c_str());
LOGPRINTF(stderr, "%ls\n", startupMessage.c_str());
}
// echo config info to log
@ -497,16 +503,18 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
int numCPUThreads = config(L"numCPUThreads", 0);
numCPUThreads = CPUMatrix<float /*any will do*/>::SetNumThreads(numCPUThreads);
if (numCPUThreads > 0)
fprintf(stderr, "Using %d CPU threads.\n", numCPUThreads);
LOGPRINTF(stderr, "Using %d CPU threads.\n", numCPUThreads);
bool progressTracing = config(L"progressTracing", false);
size_t fullTotalMaxEpochs = 1; // BUGBUG: BS does not allow me to read out the max epochs parameters, as that would instantiate and thus execute the objects
// set up progress tracing for compute cluster management
if (progressTracing && ((mpi == nullptr) || mpi->IsMainNode()))
ProgressTracing::TraceTotalNumberOfSteps(fullTotalMaxEpochs); // enable tracing, using this as the total number of epochs
// MAIN LOOP that executes the actions
auto actionsVal = config[L"actions"];
// Note: weird behavior. If 'actions' is a scalar value (rather than an array) then it will have been resolved already after the above call. That means, it has already completed its action!
// Not pretty, but a direct consequence of the lazy evaluation. The only good solution would be to have a syntax for arrays including length 0 and 1.
// Since this in the end behaves indistinguishable from the array loop below, we will keep it for now.
@ -532,7 +540,9 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
fprintf(fp, "successfully finished at %s on %s\n", TimeDateStamp().c_str(), GetHostName().c_str());
fcloseOrDie(fp);
}
fprintf(stderr, "COMPLETED\n"), fflush(stderr);
// TODO: change this back to COMPLETED, double underscores don't look good in output
LOGPRINTF(stderr, "__COMPLETED__\n");
fflush(stderr);
MPIWrapper::DeleteInstance();
return EXIT_SUCCESS;
@ -541,11 +551,16 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
// ---------------------------------------------------------------------------
// main() for old CNTK config language
// ---------------------------------------------------------------------------
int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & reports Win32 exceptions
// called from wmain which is a wrapper that catches & reports Win32 exceptions
int wmainOldCNTKConfig(int argc, wchar_t* argv[])
{
ConfigParameters config;
std::string rawConfigString = ConfigParameters::ParseCommandLine(argc, argv, config);
std::string rawConfigString = ConfigParameters::ParseCommandLine(argc, argv, config); // get the command param set they want
bool timestamping = config(L"timestamping", false);
if (timestamping)
{
ProgressTracing::SetTimestampingFlag();
}
// get the command param set they want
wstring logpath = config(L"stderr", L"");
@ -586,8 +601,9 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is
std::string timestamp = TimeDateStamp();
// dump config info
fprintf(stderr, "\nRunning on %s at %s\n", GetHostName().c_str(), timestamp.c_str());
fprintf(stderr, "Command line: \n");
fprintf(stderr, "\n");
LOGPRINTF(stderr, "Running on %s at %s\n", GetHostName().c_str(), timestamp.c_str());
LOGPRINTF(stderr, "Command line: \n");
for (int i = 0; i < argc; i++)
fprintf(stderr, "%*s%ls", i > 0 ? 2 : 0, "", argv[i]); // use 2 spaces for better visual separability
fprintf(stderr, "\n\n");
@ -595,24 +611,27 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is
#if 1 //def _DEBUG
// This simply merges all the different config parameters specified (eg, via config files or via command line directly),
// and prints it.
fprintf(stderr, "\n\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>\n");
fprintf(stderr, "%s\n", rawConfigString.c_str());
fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<\n");
fprintf(stderr, "\n\n");
LOGPRINTF(stderr, ">>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>\n");
LOGPRINTF(stderr, "%s\n", rawConfigString.c_str());
LOGPRINTF(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<\n");
// Same as above, but all variables are resolved. If a parameter is set multiple times (eg, set in config, overriden at command line),
// Same as above, but all variables are resolved. If a parameter is set multiple times (eg, set in config, overridden at command line),
// All of these assignments will appear, even though only the last assignment matters.
fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
fprintf(stderr, "%s\n", config.ResolveVariables(rawConfigString).c_str());
fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
fprintf(stderr, "\n");
LOGPRINTF(stderr, ">>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
LOGPRINTF(stderr, "%s\n", config.ResolveVariables(rawConfigString).c_str());
LOGPRINTF(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
// This outputs the final value each variable/parameter is assigned to in config (so if a parameter is set multiple times, only the last
// value it is set to will appear).
fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
fprintf(stderr, "\n");
LOGPRINTF(stderr, ">>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
config.dumpWithResolvedVariables();
fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
LOGPRINTF(stderr, "<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
#endif
fprintf(stderr, "Commands:");
LOGPRINTF(stderr, "Commands:");
for (int i = 0; i < command.size(); i++)
fprintf(stderr, " %s", command[i].c_str());
fprintf(stderr, "\n");
@ -623,7 +642,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is
if (config.Exists("type"))
InvalidArgument("CNTK: Use of 'type' parameter is deprecated, it is called 'precision' now.");
fprintf(stderr, "Precision = \"%s\"\n", type.c_str());
LOGPRINTF(stderr, "Precision = \"%s\"\n", type.c_str());
if (type == "float")
DoCommands<float>(config, mpi);
else if (type == "double")
@ -638,7 +658,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is
fprintf(fp, "successfully finished at %s on %s\n", TimeDateStamp().c_str(), GetHostName().c_str());
fcloseOrDie(fp);
}
fprintf(stderr, "COMPLETED\n"), fflush(stderr);
// TODO: Change back to COMPLETED (no underscores)
LOGPRINTF(stderr, "__COMPLETED__\n"), fflush(stderr);
MPIWrapper::DeleteInstance();
return EXIT_SUCCESS;
@ -659,43 +680,52 @@ void AllocationFailureHandler()
int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & reports Win32 exceptions
{
std::set_new_handler(AllocationFailureHandler);
try
{
{
PrintBuiltInfo(); // print build info directly in case that user provides zero argument (convenient for checking build type)
if (argc <= 1)
{
fprintf(stderr, "No command-line argument given.\n");
LOGPRINTF(stderr, "No command-line argument given.\n");
PrintUsageInfo();
return EXIT_FAILURE;
}
// detect legacy CNTK configuration
bool isOldCNTKConfig = false;
for (int i = 0; i < argc && !isOldCNTKConfig; i++)
isOldCNTKConfig |= !_wcsnicmp(L"configFile=", argv[i], 11);
if (isOldCNTKConfig)
return wmainOldCNTKConfig(argc, argv);
// run from BrainScript
return wmainWithBS(argc, argv);
}
catch (const ScriptableObjects::ScriptingException& err)
{
fprintf(stderr, "\nEXCEPTION occurred: %s\n", err.what());
fprintf(stderr, "\n");
LOGPRINTF(stderr, "EXCEPTION occurred: %s\n", err.what());
err.PrintError();
return EXIT_FAILURE;
}
catch (const IExceptionWithCallStackBase& err)
{
fprintf(stderr, "\nEXCEPTION occurred: %s\n%s", dynamic_cast<const std::exception&>(err).what(), err.CallStack());
fprintf(stderr, "\n");
LOGPRINTF(stderr, "EXCEPTION occurred: %s\n%s", dynamic_cast<const std::exception&>(err).what(), err.CallStack());
return EXIT_FAILURE;
}
catch (const std::exception& err)
{
fprintf(stderr, "\nEXCEPTION occurred: %s\n", err.what());
fprintf(stderr, "\n");
LOGPRINTF(stderr, "EXCEPTION occurred: %s\n", err.what());
return EXIT_FAILURE;
}
catch (...)
{
fprintf(stderr, "\nUnknown ERROR occurred\n");
fprintf(stderr, "\n");
LOGPRINTF(stderr, "Unknown ERROR occurred\n");
return EXIT_FAILURE;
}
}
@ -703,7 +733,8 @@ int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper th
#ifdef __WINDOWS__
void TerminateThis()
{
fprintf(stderr, "terminate_this: aborting\n"), fflush(stderr);
LOGPRINTF(stderr, "terminate_this: aborting\n");
fflush(stderr);
exit(EXIT_FAILURE);
}
@ -714,7 +745,7 @@ static void LogDelayLoadError(PEXCEPTION_POINTERS pExcPointers)
if (pExcPointers->ExceptionRecord->ExceptionCode == EXCEPTION_DLL_NOT_FOUND)
{
const auto & pDelayLoadInfo = *PDelayLoadInfo(pExcPointers->ExceptionRecord->ExceptionInformation[0]);
fprintf(stderr, "CNTK: Failed to load DLL '%s'.\n", pDelayLoadInfo.szDll);
LOGPRINTF(stderr, "CNTK: Failed to load DLL '%s'.\n", pDelayLoadInfo.szDll);
}
}
@ -736,7 +767,7 @@ int wmain(int argc, wchar_t* argv[]) // wmain wrapper that reports Win32 excepti
else if (code == EXCEPTION_INT_DIVIDE_BY_ZERO) msg = ": Integer division by zero";
else if (code == EXCEPTION_STACK_OVERFLOW) msg = ": Stack overflow";
else if (code == EXCEPTION_DLL_NOT_FOUND) msg = ": Module not found";
fprintf(stderr, "CNTK: Caught Win32 exception 0x%08x%s.\n", (unsigned int)code, msg);
LOGPRINTF(stderr, "CNTK: Caught Win32 exception 0x%08x%s.\n", (unsigned int)code, msg);
fflush(stderr);
exit(EXIT_FAILURE);
}

View file

@ -9,6 +9,7 @@
#include "ModelEditLanguage.h"
#include "ConvolutionalNodes.h"
#include "InputAndParamNodes.h"
#include <map>
namespace Microsoft { namespace MSR { namespace CNTK {
@ -58,8 +59,7 @@ enum MELProperty
melPropFinalCriterion,
melPropEvaluation,
melPropOutput,
melPropRecurrent,
melPropBatchNormMode
melPropRecurrent
};
// SetGroupTag - Set the group tag on a node
@ -73,7 +73,7 @@ void MELScript<ElemType>::SetGroupTag(ComputationNodeBasePtr nodeProp, Computati
cn->AddToNodeGroup(groupTag, nodeProp);
else
cn->RemoveFromNodeGroup(groupTag, nodeProp);
}
}
// ProcessNDLScript - Process the NDL script
// netNdl - netNDL structure
@ -384,18 +384,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
inputNodes[i - 1] = nodeFrom[0];
}
#if 1
nodeTo[0]->AttachInputs(inputNodes);
#else // TODO: delete this
if (inputNodes.size() == 1)
nodeTo[0]->AttachInputs(inputNodes[0]);
else if (inputNodes.size() == 2)
nodeTo[0]->AttachInputs(inputNodes[0], inputNodes[1]);
else if (inputNodes.size() == 3)
nodeTo[0]->AttachInputs(inputNodes[0], inputNodes[1], inputNodes[2]);
else
RuntimeError("SetNodeInputs(): You specified more than 3 input nodes.");
#endif
}
else if (EqualInsensitive(name, "SetProperty"))
{
@ -416,8 +405,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
// map property name to property enum
// Please keep this table sorted.
if (EqualInsensitive(propName, "batchNormEvalMode")) prop = melPropBatchNormMode;
else if (EqualInsensitive(propName, "criterion")) prop = melPropFinalCriterion;
if (EqualInsensitive(propName, "criterion")) prop = melPropFinalCriterion;
else if (EqualInsensitive(propName, "evaluation")) prop = melPropEvaluation;
else if (EqualInsensitive(propName, "feature")) prop = melPropFeature;
else if (EqualInsensitive(propName, "label")) prop = melPropLabel;
@ -483,32 +471,6 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
// what to do here?
break;
}
case melPropBatchNormMode:
{
if (node->OperationName() != OperationNameOf(BatchNormalizationNode))
{
RuntimeError("Invalid node type: node %ls (type:%ls) is not a %ls node; therefore cannot apply batchNormEvalMode on it.",
node->NodeName().c_str(),
node->OperationName().c_str(),
OperationNameOf(BatchNormalizationNode).c_str());
}
bool property = params[2];
auto pnode = dynamic_pointer_cast<BatchNormalizationNode<float>>(node);
if (pnode)
pnode->SetEvalMode(property);
else
{
auto pnode2 = dynamic_pointer_cast<BatchNormalizationNode<double>>(node);
if (pnode2)
pnode2->SetEvalMode(property);
else
{
RuntimeError("Invalid node type: node name=%ls. We assume either BatchNormalizationNode<float> or BatchNormalizationNode<double>\n",
node->NodeName().c_str());
}
}
break;
}
default:
{
RuntimeError("Invalid property, %s, is not supported", propName.c_str());
@ -534,10 +496,6 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
{
prop = melPropLearningRateMultiplier;
}
else if (EqualInsensitive(propName, "batchNormEvalMode"))
{
prop = melPropBatchNormMode;
}
else
{
RuntimeError("Invalid property, %s, is not supported", propName.c_str());
@ -566,12 +524,6 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
netNdl->cn->SetLearnableNodesBelowLearningRateMultiplier(learningRateMultiplier, node);
break;
}
case melPropBatchNormMode:
{
bool evalMode = params[2];
netNdl->cn->SetBatchNormalizationNodesBelowEvalMode(evalMode, node);
break;
}
default:
{
RuntimeError("Invalid property, %s, is not supported", propName.c_str());

View file

@ -4,10 +4,33 @@
//
#pragma once
#include <chrono>
#include "TimerUtility.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: make these proper C++ functions with variadic templates and names that reflect their difference from fprintf(stderr), which already implies printing to the log
// If the timestamping flag is set, print out a timestamp with no newline at the end
#define PREPENDTS(stream) \
do \
{ \
if (ProgressTracing::GetTimestampingFlag()) \
{ \
std::time_t tt = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); \
char mbstr[30]; \
if (std::strftime(mbstr, sizeof(mbstr), "%m/%d/%Y %H:%M:%S", std::localtime(&tt))) \
fprintf(stream, "%s: ", mbstr); \
} \
} while(0)
// Print out a log message. If the timestamping flag is set, prepend it with a timestamp
#define LOGPRINTF(stream, ...) \
do \
{ \
PREPENDTS(stream); \
fprintf(stream, __VA_ARGS__); \
} while(0)
// ---------------------------------------------------------------------------
// ProgressTracing -- static helper class for logging a progress indicator
//
@ -29,12 +52,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
bool m_enabled;
bool m_tracingFlag;
bool m_timestampFlag; // TODO: What does this do? TODO: camelCase
size_t m_totalNumberOfSteps; // total number of epochs in entire training run
size_t m_currentStepOffset; // current offset
Timer m_progressTracingTimer;
ProgressTracing()
: m_enabled(false), m_tracingFlag(false), m_totalNumberOfSteps(0), m_currentStepOffset(0)
: m_enabled(false), m_tracingFlag(false), m_timestampFlag(false), m_totalNumberOfSteps(0), m_currentStepOffset(0)
{
}
@ -50,12 +74,24 @@ public:
return GetStaticInstance().m_tracingFlag;
}
static bool GetTimestampingFlag()
{
return GetStaticInstance().m_timestampFlag;
// TODO: timestampFlag or timestampingFlag? (Or timeStampFlag?)
}
static void SetTracingFlag()
{
auto& us = GetStaticInstance();
us.m_tracingFlag = true;
}
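// Set when the top-level config option 'timestamping' is true (see wmainOldCNTKConfig); LOGPRINTF then prepends a timestamp to each log line.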
static void SetTimestampingFlag()
{
auto& us = GetStaticInstance();
us.m_timestampFlag = true;
}
// call TraceTotalNumberOfSteps() to set the total number of steps
// Calling this with totalNumberOfSteps>0 will enable progress tracing.
static void TraceTotalNumberOfSteps(size_t totalNumberOfSteps)

View file

@ -780,6 +780,11 @@ static inline ImageLayoutKind ImageLayoutKindFrom(const wstring& s)
struct ImageDimensions
{
size_t m_width, m_height, m_numChannels;
// convenience accessors. TODO: use only one name. Rename the members themselves?
size_t w() const { return m_width; }
size_t h() const { return m_height; }
size_t c() const { return m_numChannels; }
// interpret TensorShape as image
ImageDimensions(const TensorShape& shape, ImageLayoutKind imageLayoutKind)
{
@ -787,14 +792,14 @@ struct ImageDimensions
InvalidArgument("Convolution operation currently only supports 1D or 2D convolution on 3D tensors.");
if (imageLayoutKind == ImageLayoutKind::CHW)
{
m_width = shape[0];
m_height = shape[1];
m_width = shape[0];
m_height = shape[1];
m_numChannels = shape[2];
}
else if (imageLayoutKind == ImageLayoutKind::HWC)
{
m_width = shape[1];
m_height = shape[2];
m_width = shape[1];
m_height = shape[2];
m_numChannels = shape[0];
}
else

View file

@ -609,11 +609,6 @@ void renameOrDie(const std::string& from, const std::string& to)
// WORKAROUND: "rename" should do this but this is a workaround
// to the HDFS FUSE implementation's bug of failing to do so
// workaround for FUSE rename when running on Philly
if (ProgressTracing::GetTracingFlag())
{
fprintf(stderr, "rename %s to %s\n", from.c_str(), to.c_str());
}
unlinkOrDie(to);
if (rename(from.c_str(), to.c_str()) != 0)
{

View file

@ -514,25 +514,32 @@ template <class ElemType>
}
template <class ElemType>
/*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstant(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant)
/*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstants(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode,
double normalizationTimeConstant, double& prevNormalizationTimeConstant,
double blendTimeConstant, double& prevBlendTimeConstant)
{
if (normalizationTimeConstant != prevNormalizationTimeConstant && normalizationTimeConstant != numeric_limits<double>::infinity())
if (normalizationTimeConstant != prevNormalizationTimeConstant || blendTimeConstant != prevBlendTimeConstant)
{
fprintf(stderr, "Setting batch normalization time constant to %.8g.\n", normalizationTimeConstant);
if (normalizationTimeConstant != prevNormalizationTimeConstant)
fprintf(stderr, "Setting batch normalization time constant to %.8g.\n", normalizationTimeConstant);
if (blendTimeConstant != prevBlendTimeConstant)
fprintf(stderr, "Setting batch normalization blend time constant to %.8g.\n", blendTimeConstant);
// TODO: Change this to use an interface that is independent of <ElemType>.
list<ComputationNodeBasePtr> batchNormalizationNodes = net->GetNodesWithType(OperationNameOf(BatchNormalizationNode), criterionNode);
if (batchNormalizationNodes.size() == 0 && normalizationTimeConstant != numeric_limits<double>::infinity())
auto batchNormalizationNodes = net->GetNodesWithType(OperationNameOf(BatchNormalizationNode), criterionNode);
if (batchNormalizationNodes.size() == 0)
fprintf(stderr, "WARNING: there is no batch normalization node.\n");
else
{
for (auto& nodeIter : batchNormalizationNodes)
{
auto node = dynamic_pointer_cast<BatchNormalizationNode<ElemType>>(nodeIter);
node->SetNormalizationTimeConstant(normalizationTimeConstant);
node->SetNormalizationTimeConstants(normalizationTimeConstant, prevNormalizationTimeConstant,
blendTimeConstant, prevBlendTimeConstant);
}
}
prevNormalizationTimeConstant = normalizationTimeConstant;
prevBlendTimeConstant = blendTimeConstant;
}
}
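// Usage sketch (variable names assumed), e.g. called from SGD before starting an epoch:
//     ComputationNetwork::SetBatchNormalizationTimeConstants<ElemType>(net, criterionNode,
//         /*normalizationTimeConstant=*/ 4096, prevNormalizationTimeConstant,
//         /*blendTimeConstant=*/ 0, prevBlendTimeConstant);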
@ -1434,7 +1441,7 @@ template void ComputationNetwork::Read<float>(const wstring& fileName);
template void ComputationNetwork::ReadPersistableParameters<float>(File& fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<float>(const map<wstring, float>& SVDConfig, size_t alignedsize);
template /*static*/ void ComputationNetwork::SetDropoutRate<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, unsigned long& dropOutSeed);
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstant<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant);
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstants<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant, double blendTimeConstant, double& prevBlendTimeConstant);
template void ComputationNetwork::SetSeqParam<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign,
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
template void ComputationNetwork::SaveToDbnFile<float>(ComputationNetworkPtr net, const std::wstring& fileName) const;
@ -1444,7 +1451,7 @@ template void ComputationNetwork::Read<double>(const wstring& fileName);
template void ComputationNetwork::ReadPersistableParameters<double>(File& fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<double>(const map<wstring, float>& SVDConfig, size_t alignedsize);
template /*static*/ void ComputationNetwork::SetDropoutRate<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, unsigned long& dropOutSeed);
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstant<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant);
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstants<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant, double blendTimeConstant, double& prevBlendTimeConstant);
template void ComputationNetwork::SetSeqParam<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign,
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
template void ComputationNetwork::SaveToDbnFile<double>(ComputationNetworkPtr net, const std::wstring& fileName) const;

View file

@ -103,8 +103,6 @@ public:
Read<ElemType>(fileName);
// perform all further post-processing, caching, etc.
CompileNetwork();
// To ensure that all the BN nodes changed to eval mode unless it's in Training mode.
SetBatchNormalizationNodesBelowEvalMode(true);
}
// static helper to instantiate a network from a file
@ -363,7 +361,6 @@ public:
void AddFeatureNode(ComputationNodeBasePtr featureNode);
//ComputationNodeBasePtr RemoveFeatureNode(ComputationNodeBasePtr featureNode);
void SetLearnableNodesBelowLearningRateMultiplier(const float learningRateMultiplier, const ComputationNodeBasePtr& rootNode = nullptr);
void SetBatchNormalizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode = nullptr);
// -----------------------------------------------------------------------
// node access
@ -429,7 +426,9 @@ public:
static void SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, unsigned long& dropOutSeed);
template <class ElemType>
static void SetBatchNormalizationTimeConstant(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant);
static void SetBatchNormalizationTimeConstants(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode,
double normalizationTimeConstant, double& prevNormalizationTimeConstant,
double blendTimeConstant, double& prevBlendTimeConstant);
template <class ElemType>
static void SetSeqParam(ComputationNetworkPtr net,

View file

@ -106,13 +106,13 @@ void ComputationNetwork::FormRecurrentLoops(const ComputationNodeBasePtr& rootNo
assert(node->m_numNonDelayedParentsInLoop == 0); // (in PurgeStateForFormingRecurrentLoops())
}
for (let& node : nestedNodes)
{
for (auto& input : node->GetInputs())
{
for (auto& input : node->GetInputs())
{
if (input->m_loopId == node->m_loopId && GetRecurrenceSteppingDirection(node) == 0/*not a Delay node*/)
input->m_numNonDelayedParentsInLoop++; // count #parents of 'input' that are not delay nodes
}
}
}
// re-traverse the graph for all nestedNodes, starting with the first
// Then update m_nestedNodes with the re-traversed order.
@ -301,19 +301,19 @@ void ComputationNetwork::DetermineSCCsR(ComputationNodeBasePtr cur,
for (let& iter : m_allSEQNodes)
{
for (let& iter2 : iter->m_nestedNodes)
{
{
if (iter2 == cur)
{
bFound = true;
{
bFound = true;
// validate that the loop is really the same, by a set comparison
unordered_set<ComputationNodeBasePtr> newLoop ( nestedNodes.begin(), nestedNodes.end());
unordered_set<ComputationNodeBasePtr> existingLoop(iter->m_nestedNodes.begin(), iter->m_nestedNodes.end());
if (newLoop != existingLoop)
LogicError("DetermineSCCsR: %ls %ls operation rediscovered in a loop, but that loop is not the same as last time.", cur->NodeName().c_str(), cur->OperationName().c_str());
break;
}
break;
}
}
}
if (bFound)
fprintf(stderr, "\nDetermineSCCsR: %ls %ls operation was discovered multiple times as as loop participant", cur->NodeName().c_str(), cur->OperationName().c_str());
// TODO: Once we forbid FormRecurrentLoops() from non-NULL, can we ever re-hit a loop here? If not, then turn bFound into a LogicError().

View file

@ -128,6 +128,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateNode(const std::wstring& node
if (nodeType == OperationNameOf(AveragePoolingNode)) return New<AveragePoolingNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(BatchNormalizationNode)) return New<BatchNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ConvolutionNode)) return New<ConvolutionNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(PoolingNode)) return New<PoolingNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(SparseInputValue)) return New<SparseInputValue<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(InputValue)) return New<InputValue<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LearnableParameter)) return New<LearnableParameter<ElemType>>(forward<_Types>(_Args)...);
@ -229,6 +230,27 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Creat
maxTempMemSizeInSamples));
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateConvolutionNode(const std::wstring& nodeName, const TensorShape& kernelShape, const TensorShape& mapCount,
const TensorShape& strideShape, const std::vector<bool>& sharing,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
{
return net.AddNodeToNetWithElemType(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,
kernelShape, mapCount, strideShape,
sharing, autoPadding, lowerPad, upperPad,
imageLayout, maxTempMemSizeInSamples));
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreatePoolingNode(const std::wstring& nodeName, PoolKind poolKind, const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout)
{
return net.AddNodeToNetWithElemType(New<PoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
poolKind, kernelShape, strideShape, autoPadding, lowerPad, upperPad, imageLayout));
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateMaxPoolingNode(const std::wstring& nodeName,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind)
@ -261,7 +283,9 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Creat
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Convolution(const ComputationNodePtr weight,
const ComputationNodePtr inputValues,
const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind, const bool zeroPadding, const size_t maxTempMemSizeInSamples,
const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
const size_t horizontalSubsample, const size_t verticalSubsample,
ImageLayoutKind imageLayoutKind, const bool zeroPadding, const size_t maxTempMemSizeInSamples,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,
@ -269,6 +293,34 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Convo
maxTempMemSizeInSamples), { weight, inputValues });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Convolution(const ComputationNodePtr weight,
const ComputationNodePtr inputValues,
const TensorShape& kernelShape, const TensorShape& mapCount,
const TensorShape& strideShape, const std::vector<bool>& sharing,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,
kernelShape, mapCount, strideShape,
sharing, autoPadding, lowerPad, upperPad,
imageLayout, maxTempMemSizeInSamples),
weight, inputValues);
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Pooling(const ComputationNodePtr inputValues,
PoolKind poolKind, const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<PoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
poolKind, kernelShape, strideShape, autoPadding, lowerPad, upperPad, imageLayout),
inputValues);
}
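// Usage sketch of the two tensor-shape overloads (the names 'builder', 'w', 'x' and the exact shape/sharing conventions are assumptions):
//     auto conv = builder.Convolution(w, x,
//         TensorShape(std::vector<size_t>{ 5, 5, 3 }),   // 5x5 kernel over 3 input channels
//         TensorShape(16),                               // 16 output feature maps
//         TensorShape(std::vector<size_t>{ 1, 1, 3 }),   // stride
//         std::vector<bool>{ true },                     // share the kernel across all dimensions
//         std::vector<bool>{ true, true, false },        // auto-pad width/height but not channels
//         TensorShape(0), TensorShape(0),                // no explicit lower/upper padding
//         ImageLayoutKind::CHW, /*maxTempMemSizeInSamples=*/ 0, L"conv1");
//     auto pool = builder.Pooling(conv, PoolKind::Max,
//         TensorShape(std::vector<size_t>{ 2, 2, 1 }),   // 2x2 pooling window
//         TensorShape(std::vector<size_t>{ 2, 2, 1 }),   // stride 2
//         std::vector<bool>{ false }, TensorShape(0), TensorShape(0),
//         ImageLayoutKind::CHW, L"pool1");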
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::MaxPooling(const ComputationNodePtr inputValues,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
@ -636,10 +688,11 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Looku
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::BatchNormalization(const ComputationNodePtr input,
const ComputationNodePtr scale, const ComputationNodePtr bias, const ComputationNodePtr runMean, const ComputationNodePtr runInvStdDev,
bool eval, bool spatial, double normalizationTimeConstant, double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind,
bool spatial, double normalizationTimeConstant, double blendTimeConstant, double epsilon, bool useCntkEngine,
ImageLayoutKind imageLayoutKind,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<BatchNormalizationNode<ElemType>>(net.GetDeviceId(), nodeName, eval, spatial, normalizationTimeConstant, epsilon, useCntkEngine, imageLayoutKind), { input, scale, bias, runMean, runInvStdDev });
return net.AddNodeToNetAndAttachInputs(New<BatchNormalizationNode<ElemType>>(net.GetDeviceId(), nodeName, spatial, normalizationTimeConstant, blendTimeConstant, epsilon, useCntkEngine, imageLayoutKind), { input, scale, bias, runMean, runInvStdDev });
}
template class ComputationNetworkBuilder<float>;

View file

@ -7,7 +7,8 @@
#include "Basics.h"
#include "ComputationNode.h"
#include "ComputationNetwork.h"
#include "TrainingNodes.h" // for NCEEvalMode
#include "TrainingNodes.h" // for NCEEvalMode
#include "ConvolutionalNodes.h" // for PoolKind
#include "ScriptableObjects.h"
#include "TensorShape.h"
#include <string>
@ -51,7 +52,15 @@ public:
ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const size_t rows);
ComputationNodePtr CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout);
ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout);
ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind, const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0);
ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples);
ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
const size_t horizontalSubsample, const size_t verticalSubsample,
ImageLayoutKind imageLayoutKind, const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0);
ComputationNodePtr CreatePoolingNode(const std::wstring& nodeName, PoolKind poolKind, const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout);
ComputationNodePtr CreateMaxPoolingNode(const std::wstring& nodeName, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind);
ComputationNodePtr CreateAveragePoolingNode(const std::wstring& nodeName, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind);
// this is the catch-all for all cases not covered as special cases above
@ -60,7 +69,7 @@ public:
// The following functions create nodes and link them to the network and their inputs.
// TODO: Do we need both this set and the one above that does not add inputs? Can they share more code?
ComputationNodePtr BatchNormalization(const ComputationNodePtr input, const ComputationNodePtr scale, const ComputationNodePtr bias,
const ComputationNodePtr runMean, const ComputationNodePtr runInvStdDev, bool eval = false, bool spatial = false, double normalizationTimeConstant = 0, double epsilon = 1e-5, bool useCntkEngine = true,
const ComputationNodePtr runMean, const ComputationNodePtr runInvStdDev, bool spatial = false, double normalizationTimeConstant = 0, double blendTimeConstant = 0, double epsilon = 1e-5, bool useCntkEngine = true,
ImageLayoutKind imageLayoutKind = ImageLayoutKind::CHW, const std::wstring nodeName = L"");
ComputationNodePtr Convolution(const ComputationNodePtr weight,
const ComputationNodePtr inputValues,
@ -68,6 +77,17 @@ public:
const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0,
const std::wstring nodeName = L"");
ComputationNodePtr Convolution(const ComputationNodePtr weight,
const ComputationNodePtr inputValues,
const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
const std::wstring nodeName = L"");
ComputationNodePtr Pooling(const ComputationNodePtr inputValues,
PoolKind poolKind, const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout,
const std::wstring nodeName = L"");
ComputationNodePtr MaxPooling(const ComputationNodePtr inputValues,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const std::wstring nodeName = L"");

View file

@ -332,42 +332,4 @@ void ComputationNetwork::SetLearnableNodesBelowLearningRateMultiplier(const floa
}
}
void ComputationNetwork::SetBatchNormalizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode /* = nullptr */)
{
vector<ComputationNodeBasePtr> nodes;
if (rootNode == nullptr)
{
for (auto pair : m_nameToNodeMap)
{
nodes.push_back(pair.second);
}
}
else
{
auto allnodes = rootNode->EnumerateNodes();
for (auto node : allnodes)
nodes.push_back(node);
}
for (auto& node : nodes)
{
if (node->OperationName() == OperationNameOf(BatchNormalizationNode))
{
auto pNode = dynamic_pointer_cast<BatchNormalizationNode<float>>(node);
if (!pNode)
{
auto pNode2 = dynamic_pointer_cast<BatchNormalizationNode<double>>(node);
if (!pNode2)
{
RuntimeError("Invalid node type: node name=%ls. We assume either BatchNormalizationNode<float> or BatchNormalizationNode<double>\n", node->NodeName().c_str());
}
}
else
{
pNode->SetEvalMode(evalMode);
}
}
}
}
}}}

View file

@ -114,9 +114,11 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
{
// instead of the node itself, include the sentinel SEQTraversalFlowControlNode in our list
m_nestedNodes.push_back(recInfo);
// and verify that we only encountered the loop once (all nodes should have been consecutive)
if (!loopsSeen.insert(recInfo).second)
LogicError("PARTraversalFlowControlNode: members of loop %ls are not consecutive in node list.", recInfo->NodeName().c_str());
// consume all nodes that are part of the same loop (they are all consecutive)
while (nodeIter != allNodes.end() && (*nodeIter)->IsPartOfLoop() && FindInRecurrentLoops(recurrentInfo, *nodeIter) == recInfo)
nodeIter++;
@ -303,8 +305,10 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
// look in all recurrent loops of the network
// TODO: Check for IsPartOfLoop(). Also why not store the loop id in the node for direct lookup?
for (auto& iter : recurrentInfo)
{
if (std::find(iter->m_nestedNodes.begin(), iter->m_nestedNodes.end(), node) != iter->m_nestedNodes.end()) // TODO: should this loop be a method of SEQTraversalFlowControlNode?
return iter;
}
return nullptr; // not part of a recurrent loop
}
@ -357,8 +361,10 @@ void ComputationNetwork::PrintComputationTree(const ComputationNodeBasePtr& root
if (nodes.size() == 0)
fprintf(stderr, "\n(empty)\n");
else
{
for (const auto& node : nodes)
node->PrintSelf(printMatrices);
}
}
// -----------------------------------------------------------------------
@ -399,7 +405,7 @@ void ComputationNetwork::CompileNetwork()
// all steps below have to be repeated for all root nodes (=nodes without parents and PreComputeNodes)
DetermineSetOfAllRoots();
fprintf(stderr, "\n%d roots:\n", (int) m_allRoots.size());
fprintf(stderr, "\n%d roots:\n", (int)m_allRoots.size());
for (const auto& root : m_allRoots)
fprintf(stderr, "\t%ls = %ls()\n", root->NodeName().c_str(), root->OperationName().c_str());
@ -469,7 +475,7 @@ void ComputationNetwork::DetermineSetOfAllRoots()
auto input = node->Input(i);
if (!input) // this may be the result of an incorrect MEL operation
{
InvalidArgument("DetermineSetOfAllRoots: Input %d of %ls %ls operation if not connected, network is malformed.",
InvalidArgument("DetermineSetOfAllRoots: Input %d of %ls %ls operation is not connected, network is malformed.",
(int) i, node->NodeName().c_str(), node->OperationName().c_str());
}
referencedNodes.insert(input);
@ -592,7 +598,7 @@ void ComputationNetwork::ValidateNetwork()
}
if (!nonDefaultNodes.empty())
{
fprintf(stderr, "%d out of %d nodes do not share the minibatch layout with the input data.\n", (int) nonDefaultNodes.size(), (int) nodes.size());
fprintf(stderr, "%d out of %d nodes do not share the minibatch layout with the input data.\n", (int)nonDefaultNodes.size(), (int)nodes.size());
// for (auto node : nonDefaultNodes)
// fprintf(stderr, " %ls\n", node->NodeName().c_str());
// fprintf(stderr, "\n\n");
@ -652,6 +658,7 @@ size_t ComputationNetwork::ValidateNodes(list<ComputationNodeBasePtr> nodes, boo
hasVisitedChild |= child->m_visited; // if not a single visited child then no point in validating
allChildrenVisited &= child->m_visited;
}
// if there is not at least one visited child
bool valid = false;
if (hasVisitedChild || isLeaf) // got at least one child: it makes sense to call Validate()
@ -850,7 +857,7 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
else
{
nodeIter->RequestMatricesBeforeForwardProp(m_matrixPool);
// we only release matrices for the children since the root node's informatioin will be used and should not be shared
// we only release matrices for the children since the root node's information will be used and should not be shared
// with others
ReleaseMatricesAfterEvalForChildren(nodeIter, parentCount);
}

View file

@ -13,7 +13,6 @@
#include "RecurrentNodes.h"
#include "NonlinearityNodes.h"
#include "LinearAlgebraNodes.h"
#include "ConvolutionalNodes.h"
#include "ReshapingNodes.h"
#include "ComputationNetwork.h"

View file

@ -402,6 +402,19 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
}
let& sequences = pMBLayout->GetAllSequences();
let width = pMBLayout->GetNumTimeSteps();
TensorShape tensorShape = GetSampleLayout();
stringstream str;
let dims = tensorShape.GetDims();
for (auto dim : dims)
str << dim << ' ';
let shape = str.str(); // BUGBUG: change to string(tensorShape) to make sure we always use the same format
bool sequencePrologueHasShape = sequencePrologue.find("%x") != sequencePrologue.npos;
bool sampleSeparatorHasShape = sampleSeparator.find("%x") != sampleSeparator.npos;
bool sequencePrologueHasSeqId = sequencePrologue.find("%d") != sequencePrologue.npos;
bool sampleSeparatorHasSeqId = sampleSeparator.find("%d") != sampleSeparator.npos;
for (size_t s = 0; s < sequences.size(); s++)
{
const auto& seqInfo = sequences[s];
@ -429,9 +442,30 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
let seqCols = t1 - t0;
let seqStride = pMBLayout->GetNumParallelSequences() * matStride;
auto seqProl = sequencePrologue;
auto sampleSep = sampleSeparator;
if (sequencePrologueHasShape || sampleSeparatorHasShape)
{
auto sh = msra::strfun::_strprintf<char>("%s%ld", shape.c_str(), (unsigned long long)seqInfo.GetNumTimeSteps());
if (sequencePrologueHasShape)
seqProl = msra::strfun::ReplaceAll<std::string>(seqProl, "%x", sh);
if (sampleSeparatorHasShape)
sampleSep = msra::strfun::ReplaceAll<std::string>(sampleSep, "%x", sh);
}
if (sequencePrologueHasSeqId || sampleSeparatorHasSeqId)
{
auto sh = msra::strfun::_strprintf<char>("%ld", (unsigned long long)seqInfo.seqId);
if (sequencePrologueHasSeqId)
seqProl = msra::strfun::ReplaceAll<std::string>(seqProl, "%d", sh);
if (sampleSeparatorHasSeqId)
sampleSep = msra::strfun::ReplaceAll<std::string>(sampleSep, "%d", sh);
}
if (s > 0)
fprintfOrDie(f, "%s", sequenceSeparator.c_str());
fprintfOrDie(f, "%s", sequencePrologue.c_str());
fprintfOrDie(f, "%s", seqProl.c_str());
// output it according to our format specification
auto formatChar = valueFormatString.back();
@ -530,14 +564,14 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
else
{
for (size_t j = 0; j < jend; j++) // loop over output rows --BUGBUG: row index is 'i'!! Rename these!!
{
if (j > 0)
fprintfOrDie(f, "%s", sampleSeparator.c_str());
{
if (j > 0)
fprintfOrDie(f, "%s", sampleSep.c_str());
if (j == jstop && jstop < jend - 1) // if jstop == jend-1 we may as well just print the value instead of '...'
{
{
fprintfOrDie(f, "...+%d", (int)(jend - jstop)); // 'nuff said
break;
}
break;
}
// inject sample tensor index if we are printing row-wise and it's a tensor
if (!transpose && sampleLayout.size() > 1 && !isCategoryLabel) // each row is a different sample dimension
{
@ -547,15 +581,15 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
}
// print a row of values
for (size_t i = 0; i < iend; i++) // loop over elements
{
if (i > 0)
fprintfOrDie(f, "%s", elementSeparator.c_str());
{
if (i > 0)
fprintfOrDie(f, "%s", elementSeparator.c_str());
if (i == istop && istop < iend - 1)
{
{
fprintfOrDie(f, "...+%d", (int)(iend - istop));
break;
}
double dval = seqData[i * istride + j * jstride];
break;
}
double dval = seqData[i * istride + j * jstride];
print(dval);
}
}
@ -566,7 +600,7 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
}
/*static*/ string WriteFormattingOptions::Processed(const wstring& nodeName, string fragment, size_t minibatchId)
{
{
fragment = msra::strfun::ReplaceAll<string>(fragment, "\\n", "\n");
fragment = msra::strfun::ReplaceAll<string>(fragment, "\\r", "\r");
fragment = msra::strfun::ReplaceAll<string>(fragment, "\\t", "\t");
@ -577,7 +611,7 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
fragment = msra::strfun::ReplaceAll<string>(fragment, "%n", msra::strfun::_strprintf<char>("%ld", minibatchId).c_str());
// %d: sequenceId
return fragment;
}
}
template <class ConfigRecordType>
WriteFormattingOptions::WriteFormattingOptions(const ConfigRecordType& config) :
@ -588,14 +622,14 @@ WriteFormattingOptions::WriteFormattingOptions(const ConfigRecordType& config) :
{
const ConfigRecordType& formatConfig(config(L"format", ConfigRecordType::Record()));
if (formatConfig.ExistsCurrent(L"type")) // do not inherit 'type' from outer block
{
{
wstring type = formatConfig(L"type");
if (type == L"real") ; // default
else if (type == L"category") isCategoryLabel = true;
else if (type == L"sparse") isSparse = true;
else InvalidArgument("write: type must be 'real', 'category', or 'sparse'");
labelMappingFile = (wstring)formatConfig(L"labelMappingFile", L"");
}
}
transpose = formatConfig(L"transpose", transpose);
prologue = formatConfig(L"prologue", prologue);
epilogue = formatConfig(L"epilogue", epilogue);
@ -606,8 +640,8 @@ WriteFormattingOptions::WriteFormattingOptions(const ConfigRecordType& config) :
sampleSeparator = msra::strfun::utf8(formatConfig(L"sampleSeparator", (wstring)msra::strfun::utf16(sampleSeparator)));
precisionFormat = msra::strfun::utf8(formatConfig(L"precisionFormat", (wstring)msra::strfun::utf16(precisionFormat)));
// TODO: change those strings into wstrings to avoid this conversion mess
}
}
}
}
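// A sketch only (the parameter names are taken from the config reads above; the surrounding
// block syntax is an assumption about the user's config, not a verified sample):
//     format = [ type = "category" ; transpose = false ]
// 'type' must be real, category, or sparse; values not given keep the defaults set by the caller.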
void WriteFormattingOptions::Save(File& fstream) const
{
@ -623,7 +657,7 @@ void WriteFormattingOptions::Save(File& fstream) const
fstream << elementSeparator;
fstream << sampleSeparator;
fstream << precisionFormat;
}
}
void WriteFormattingOptions::Load(File& fstream, size_t modelVersion)
{
@ -710,5 +744,6 @@ public:
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<BoxedTensorShape> registerTensorShape(L"TensorShape");
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<BoxedVector<int>> registerIntVector (L"IntVector");
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<BoxedVector<size_t>> registerSizeVector (L"SizeVector");
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<BoxedVector<bool>> registerBoolVector (L"BoolVector");
}}}

Просмотреть файл

@ -31,17 +31,15 @@
// version number to control how to read and write
#define CNTK_MODEL_VERSION_1 1
#define CNTK_MODEL_VERSION_2 2
#define CNTK_MODEL_VERSION_3 3 // (Row)Slice: axis; LearnableParameter: tensor shape; Times: outputRank; TransposeDimensions: axes
#define CNTK_MODEL_VERSION_4 4 // PastValue: tensor shape
#define CNTK_MODEL_VERSION_5 5 // ElemType tag in model file
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_5
#define CNTK_MODEL_VERSION_3 3
#define CNTK_MODEL_VERSION_4 4 // PastValue
#define CNTK_MODEL_VERSION_5 5 // ND convolution and pooling
#define CNTK_MODEL_VERSION_6 6 // Batch norm blending
#define CNTK_MODEL_VERSION_7 7 // ElemType tag in model file
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_7
extern bool g_shareNodeValueMatrices;
#ifndef UNREFERENCED_PARAMETER // TODO: unify with UNUSED()
#define UNREFERENCED_PARAMETER(P) (P)
#endif
// helper mode for debugging
// If TRACK_GAP_NANS is defined then initialize layout gaps to NaN and do NaN checks. Also do detailed logging of node computations.
// #define TRACK_GAP_NANS
@ -902,7 +900,7 @@ public:
if (m_value)
{
node->CreateValueMatrixIfNull();
node->m_value->SetValue(*m_value);
node->m_value->SetValue(*m_value);
}
else
node->m_value = nullptr;
@ -1112,6 +1110,9 @@ public:
const Matrix<ElemType>& Gradient() const { return *m_gradient; }
Matrix<ElemType>& Gradient() { return *m_gradient; }
MatrixBasePtr GradientPtr() const { return m_gradient; }
// TODO: This is only used for testing whether a gradient has been allocated. Maybe reduce to bool HasGradient()?
private:
template<class E>
@ -1268,8 +1269,8 @@ protected:
DetermineDataSize(rows, cols);
try
{
m.VerifySize(rows, cols);
}
m.VerifySize(rows, cols);
}
catch (const std::exception& e)
{
Rethrow(e);
@ -1499,8 +1500,8 @@ public:
"%13.10f"/*valueFormatString*/);
if (m_traceNodeValueSparse)
WriteMinibatchWithFormatting(stderr, FrameRange(), SIZE_MAX, SIZE_MAX, false/*transpose*/, /*isCategoryLabel=*/false, /*isSparse=*/true, std::vector<std::string>(),
""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/,
"%13.10f"/*valueFormatString*/);
""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/,
"%13.10f"/*valueFormatString*/);
}
}

Просмотреть файл

@ -7,31 +7,19 @@
#include "Basics.h"
#include "Matrix.h"
#include "ComputationNode.h"
#include "InputAndParamNodes.h"
#include "ConvolutionEngine.h"
#include <unordered_set>
#include <map>
#include <string>
#include <vector>
#include <stdexcept>
#include <list>
#include <memory>
#include <algorithm>
#include <assert.h>
#include <atomic>
#include <sstream>
#include <iostream>
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// ConvolutionNode (convolutionWeights, inputFeature)
// ConvolutionNodeBase
// -----------------------------------------------------------------------
// Convolutions (incl. pooling) support two different storage formats:
// ConvolutionNodeBase is a base class for ND-convolution (ConvolutionNode) and ND-pooling (PoolingNode).
//
// 2D convolutions (incl. pooling) support two different storage formats:
//
// * legacy ("HWC") mode (CPU and GPU without cudnn): Channels are tuples of scalars
// * legacy ("HWC") mode: Channels are tuples of scalars
//
// This follows "high performance convolutional neural networks for document processing" by Kumar Chellapilla, Sidde Puri, and Patrice Simard.
// Each sample is stored as a column-major matrix (height, width) of float[numChannels] (r00, g00, b00, r10, g10, b10, r01, g01, b01, r11, g11, b11).
@ -40,7 +28,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// - output : [C' x W' x H' x T] or ARRAY[1..T] OF ARRAY[1..H'] OF ARRAY[1..W'] OF ARRAY[1..C']
// - filter : [C' x W" x H" x C ] or ARRAY[1..C] OF ARRAY[1..H"] OF ARRAY[1..W"] OF ARRAY[1..C']
//
// * cudnn ("CHW") mode (GPU only): Channels are planes
// * cudnn ("CHW") mode (works on both GPU and CPU): Channels are planes
//
// - input : [W x H x C x T] or ARRAY[1..T] OF ARRAY[1..C] OF ARRAY[1..H] OF ARRAY[1..W]
// - output : [W' x H' x C' x T] or ARRAY[1..T] OF ARRAY[1..C'] OF ARRAY[1..H'] OF ARRAY[1..W']
@ -54,71 +42,269 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// - 3 for color images, 1 for B&W images
// - for hidden layer: dimension of activation vector for each pixel
// - C' = output channels = dimension of activation vector for each pixel (also called N by NVidia, inconsistently)
//
// For ND-convolution/pooling, only the second ('cudnn') format is supported.
//
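// For intuition only (w, h, c and the size names below are illustrative, not members of this class):
// the linear offset of element (w, h, c) within one sample differs between the two layouts roughly as
//     // HWC ("legacy"): channels vary fastest
//     offsetHWC = c + numChannels * (w + imgWidth * h);
//     // CHW ("cudnn"): width varies fastest, each channel is a contiguous plane
//     offsetCHW = w + imgWidth * (h + imgHeight * c);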
template <class ElemType>
class ConvolutionNode : public ComputationNode<ElemType>, public NumInputs<2>
class ConvolutionNodeBase : public ComputationNode<ElemType>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"Convolution"; }
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembers;
public:
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name),
m_kernelWidth(SIZE_MAX),
m_kernelHeight(SIZE_MAX),
// initialize to dummy values so we catch missing initialization
m_horizontalSubsample(SIZE_MAX),
m_verticalSubsample(SIZE_MAX),
m_zeroPadding(false),
m_maxTempMemSizeInSamples(SIZE_MAX),
m_imageLayoutKind(ImageLayoutKind::HWC)
ConvolutionNodeBase(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name), m_poolKind(PoolKind::None), m_maxTempMemSizeInSamples(0)
{
SetDims(ImageDimensions::AsTensorShape(1, 1, 0, m_imageLayoutKind), 0);
}
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0)
: Base(deviceId, name),
m_outputChannels(outputChannels),
m_kernelWidth(kernelWidth),
m_kernelHeight(kernelHeight),
m_horizontalSubsample(horizontalSubsample),
m_verticalSubsample(verticalSubsample),
m_zeroPadding(zeroPadding),
m_maxTempMemSizeInSamples(maxTempMemSizeInSamples),
m_imageLayoutKind(imageLayoutKind)
ConvolutionNodeBase(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
PoolKind poolKind, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
: Base(deviceId, name), m_kernelShape(kernelShape), m_mapCount(mapCount), m_stride(strideShape), m_sharing(sharing),
m_autoPad(autoPadding), m_lowerPad(lowerPad), m_upperPad(upperPad), m_poolKind(poolKind),
m_imageLayout(imageLayout), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples)
{
SetDims(ImageDimensions::AsTensorShape(1, 1, m_outputChannels, m_imageLayoutKind), 0); // TODO: necessary?
m_factory = ConvolutionEngineFactory<ElemType>::Create(deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
ConvolutionNode(const ScriptableObjects::IConfigRecordPtr configp)
: ConvolutionNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"kernelWidth"), configp->Get(L"kernelHeight"), configp->Get(L"outputChannels"),
configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"), ImageLayoutKindFrom(configp->Get(L"imageLayout")),
configp->Get(L"zeroPadding"), configp->Get(L"maxTempMemSizeInSamples"))
{
// weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
}
public:
void Save(File& fstream) const override
{
Base::Save(fstream);
fstream << m_kernelWidth << m_kernelHeight << m_horizontalSubsample << m_verticalSubsample;
uint32_t imageLayoutKind = (uint32_t) m_imageLayoutKind;
uint32_t outputChannels = (uint32_t) m_outputChannels;
fstream << outputChannels << imageLayoutKind;
fstream << m_zeroPadding << m_maxTempMemSizeInSamples;
m_kernelShape.Save(fstream);
m_mapCount.Save(fstream);
m_stride.Save(fstream);
fstream << m_sharing;
fstream << m_autoPad;
m_lowerPad.Save(fstream);
m_upperPad.Save(fstream);
fstream << (int32_t)m_poolKind;
fstream << (int32_t)m_imageLayout;
fstream << m_maxTempMemSizeInSamples;
}
void Load(File& fstream, size_t modelVersion) override
{
Base::Load(fstream, modelVersion);
fstream >> m_kernelWidth >> m_kernelHeight >> m_horizontalSubsample >> m_verticalSubsample;
uint32_t imageLayoutKind, outputChannels;
fstream >> outputChannels >> imageLayoutKind;
m_imageLayoutKind = (ImageLayoutKind) imageLayoutKind;
m_outputChannels = outputChannels;
SetDims(ImageDimensions::AsTensorShape(1, 1, m_outputChannels, m_imageLayoutKind), HasMBLayout()); // TODO: needed?
fstream >> m_zeroPadding >> m_maxTempMemSizeInSamples;
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
// Let ConvolutionNode handle older models.
if (modelVersion >= CNTK_MODEL_VERSION_5)
{
m_kernelShape.Load(fstream);
m_mapCount.Load(fstream);
m_stride.Load(fstream);
fstream >> m_sharing;
fstream >> m_autoPad;
m_lowerPad.Load(fstream);
m_upperPad.Load(fstream);
int32_t k;
fstream >> k;
m_poolKind = (PoolKind)k;
int32_t layout;
fstream >> layout;
m_imageLayout = (ImageLayoutKind)layout;
fstream >> m_maxTempMemSizeInSamples;
}
}
void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
{
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ConvolutionNodeBase<ElemType>>(nodeP);
node->m_kernelShape = m_kernelShape;
node->m_mapCount = m_mapCount;
node->m_stride = m_stride;
node->m_sharing = m_sharing;
node->m_autoPad = m_autoPad;
node->m_lowerPad = m_lowerPad;
node->m_upperPad = m_upperPad;
node->m_poolKind = m_poolKind;
node->m_imageLayout = m_imageLayout;
node->m_maxTempMemSizeInSamples = m_maxTempMemSizeInSamples;
}
}
void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
auto sliceOutputGrad = GradientFor(fr);
if (m_poolKind == PoolKind::None)
{
if (inputIndex == 0) // derivative with respect to the weight matrix
{
auto& grad = Input(0)->GradientAsMatrix();
auto sliceInput1Value = Input(1)->ValueFor(fr);
m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, fr.IsAllFrames(), *m_tempMatrix);
}
else if (inputIndex == 1) // derivative with respect to the input feature
{
auto& input0 = Input(0)->ValueAsMatrix();
auto sliceInput1Grad = Input(1)->GradientFor(fr);
m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, *m_tempMatrix);
}
}
else
{
Matrix<ElemType> sliceInput0Grad = Input(0)->GradientFor(fr);
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
m_convEng->BackwardPooling(sliceOutputValue, sliceOutputGrad, sliceInput0Value, sliceInput0Grad);
}
}
bool OutputUsedInComputingInputNodesGradients() const override
{
// The ConvolutionNode requires output values only for max pooling.
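// (Max pooling is the only case here where the backward pass needs the forward output:
// BackwardPooling uses the stored output, i.e. the pooled maxima, to route the incoming
// gradient back to the winning input positions.)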
return m_poolKind == PoolKind::Max;
}
void ForwardProp(const FrameRange& fr) override
{
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
if (m_poolKind == PoolKind::None)
{
const Matrix<ElemType>& input0 = Input(0)->ValueAsMatrix();
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
m_convEng->Forward(sliceInput1Value, input0, sliceOutputValue, *m_tempMatrix);
}
else
{
const Matrix<ElemType>& input0 = Input(0)->ValueFor(fr);
m_convEng->ForwardPooling(input0, sliceOutputValue);
}
}
void DumpNodeInfo(const bool printValues, const bool printMetadata, File& fstream) const override
{
Base::DumpNodeInfo(printValues, printMetadata, fstream);
if (m_convEng != nullptr)
fstream << "Geometry: " << string(*m_convEng->Geometry()) << "\n";
fstream << "PoolKind: " << (int)m_poolKind << "\n";
}
protected:
TensorShape m_kernelShape;
TensorShape m_mapCount;
TensorShape m_stride;
std::vector<bool> m_sharing;
std::vector<bool> m_autoPad;
TensorShape m_lowerPad;
TensorShape m_upperPad;
PoolKind m_poolKind;
ImageLayoutKind m_imageLayout;
size_t m_maxTempMemSizeInSamples;
shared_ptr<Matrix<ElemType>> m_tempMatrix;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
};
#define UsingConvolutionNodeBaseMembers \
UsingComputationNodeMembersBoilerplate; \
protected: \
using Base::m_kernelShape; \
using Base::m_mapCount; \
using Base::m_stride; \
using Base::m_sharing; \
using Base::m_autoPad; \
using Base::m_lowerPad; \
using Base::m_upperPad; \
using Base::m_poolKind; \
using Base::m_imageLayout; \
using Base::m_maxTempMemSizeInSamples; \
using Base::m_tempMatrix; \
using Base::m_convEng; \
public:
// -----------------------------------------------------------------------
// ConvolutionNode (convolutionWeights, inputFeature)
// -----------------------------------------------------------------------
template <class ElemType>
class ConvolutionNode : public ConvolutionNodeBase<ElemType>, public NumInputs<2>
{
typedef ConvolutionNodeBase<ElemType> Base;
UsingConvolutionNodeBaseMembers;
static const std::wstring TypeName()
{
return L"Convolution";
}
public:
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
: Base(deviceId, name, kernelShape, mapCount, strideShape, sharing, autoPadding, lowerPad, upperPad, PoolKind::None, imageLayout, maxTempMemSizeInSamples),
m_convolution2D(false)
{
}
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayout,
bool zeroPadding, size_t maxTempMemSizeInSamples)
: ConvolutionNode(deviceId, name, TensorShape(kernelWidth, kernelHeight, 1), TensorShape(1, 1, outputChannels),
TensorShape(horizontalSubsample, verticalSubsample, 1), vector<bool>{true},
vector<bool>{zeroPadding}, TensorShape(0), TensorShape(0),
imageLayout, maxTempMemSizeInSamples)
{
m_convolution2D = true;
}
ConvolutionNode(const ScriptableObjects::IConfigRecordPtr configp)
: ConvolutionNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"kernelShape"), configp->Get(L"mapCount"), configp->Get(L"strideShape"),
configp->Get(L"dimSharing"), configp->Get(L"dimPadding"), configp->Get(L"dimPadLower"), configp->Get(L"dimPadUpper"),
ImageLayoutKindFrom(configp->Get(L"imageLayout")), configp->Get(L"maxTempMemSizeInSamples"))
{
AttachInputs(configp, GetExpectedNumInputs());
}
public:
void Save(File& fstream) const override
{
Base::Save(fstream);
fstream << m_convolution2D;
}
void Load(File& fstream, size_t modelVersion) override
{
Base::Load(fstream, modelVersion);
// Back compat: load pre-ND convolution models.
if (modelVersion < CNTK_MODEL_VERSION_5)
{
size_t kW, kH, sW, sH;
fstream >> kW;
fstream >> kH;
fstream >> sW;
fstream >> sH;
uint32_t imageLayout, mapCount;
fstream >> mapCount;
fstream >> imageLayout;
m_imageLayout = (ImageLayoutKind)imageLayout;
bool pad;
fstream >> pad;
fstream >> m_maxTempMemSizeInSamples;
m_poolKind = PoolKind::None;
m_convolution2D = true;
m_kernelShape = TensorShape(kW, kH, 1);
m_mapCount = TensorShape(mapCount);
m_stride = TensorShape(sW, sH, 1);
m_sharing = vector<bool>{true};
m_autoPad = vector<bool>{pad};
m_lowerPad = TensorShape(0);
m_upperPad = TensorShape(0);
}
else
{
fstream >> m_convolution2D;
}
}
void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
@ -127,144 +313,92 @@ public:
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ConvolutionNode<ElemType>>(nodeP);
node->m_kernelWidth = m_kernelWidth;
node->m_kernelHeight = m_kernelHeight;
node->m_horizontalSubsample = m_horizontalSubsample;
node->m_verticalSubsample = m_verticalSubsample;
node->m_zeroPadding = m_zeroPadding;
node->m_maxTempMemSizeInSamples = m_maxTempMemSizeInSamples;
node->m_imageLayoutKind = m_imageLayoutKind;
node->m_tempMatrix->SetValue(*m_tempMatrix);
node->m_convolution2D = m_convolution2D;
}
}
void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
auto sliceOutputGrad = GradientFor(fr);
auto sliceInput1Value = Input(1)->ValueFor(fr);
size_t batchSize = sliceInput1Value.GetNumCols();
m_inT->setN(batchSize);
m_outT->setN(batchSize);
assert(m_convEng != nullptr);
if (inputIndex == 0) // derivative with respect to the weight matrix
{
auto& grad = Input(0)->GradientAsMatrix();
m_convEng->BackwardFilter(*m_outT, sliceOutputGrad, *m_inT, sliceInput1Value, *m_convDesc, *m_filterT, grad, fr.IsAllFrames(), *m_tempMatrix);
}
else if (inputIndex == 1) // derivative with respect to the input feature
{
auto& input0 = Input(0)->ValueAsMatrix();
auto sliceInput1Grad = Input(1)->GradientFor(fr);
m_convEng->BackwardData(*m_outT, sliceOutputGrad, *m_filterT, input0, *m_convDesc, *m_inT, sliceInput1Grad, *m_tempMatrix);
}
}
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ConvolutionNode does not require its output value for computing
// the gradients of its input nodes
return false;
}
void ForwardProp(const FrameRange& fr) override
{
const Matrix<ElemType>& input0 = Input(0)->ValueAsMatrix();
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
// update the tensor dimension w.r.t. number of samples
size_t batchSize = sliceInput1Value.GetNumCols();
m_inT->setN(batchSize);
m_outT->setN(batchSize);
assert(m_convEng != nullptr);
#if NANCHECK
input0.HasNan("Convolution-input0");
sliceInput1Value.HasNan("Convolution-input1");
#endif
m_convEng->Forward(*m_inT, sliceInput1Value, *m_filterT, input0, *m_convDesc, *m_outT, sliceOutputValue, *m_tempMatrix);
#if NANCHECK
sliceOutputValue.HasNan("Convolution");
#endif
}
void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
void Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase(isFinalValidationPass);
// get input and output tensor shape and interpret as image dimensions
auto inDims = ImageDimensions(GetInputSampleLayout(1), m_imageLayoutKind);
size_t inputIdx = GetExpectedNumInputs() - 1;
TensorShape inputShape;
if (m_convolution2D)
{
// Need to update some tensors with correct input dims.
auto inDims = ImageDimensions(GetInputSampleLayout(inputIdx), m_imageLayout);
// inputShape is used in ConvolveGeometry which supports only CHW layout.
inputShape = inDims.AsTensorShape(ImageLayoutKind::CHW);
size_t kW = m_kernelShape[0];
size_t kH = m_kernelShape[1];
size_t sW = m_stride[0];
size_t sH = m_stride[1];
m_kernelShape = TensorShape(kW, kH, inDims.m_numChannels);
m_stride = TensorShape(sW, sH, inDims.m_numChannels);
if (isFinalValidationPass && (inDims.m_width < m_kernelWidth || inDims.m_height < m_kernelHeight))
InvalidArgument("%ls %ls operation requires that input width be >= kernelWidth and input height >= kernelHeight.", NodeName().c_str(), OperationName().c_str());
// determine output tensor shape
const int kernelWidthCenter = m_zeroPadding ? m_kernelWidth % 2 : m_kernelWidth;
const int kernelHeightCenter = m_zeroPadding ? m_kernelHeight % 2 : m_kernelHeight;
auto outDims = ImageDimensions(
(inDims.m_width - kernelWidthCenter) / m_horizontalSubsample + 1,
(inDims.m_height - kernelHeightCenter) / m_verticalSubsample + 1,
m_outputChannels);
size_t weightCols = m_kernelWidth * m_kernelHeight * inDims.m_numChannels;
size_t mapCount = m_mapCount.GetNumElements();
size_t weightCols = kW * kH * inDims.m_numChannels;
// check/infer input [0] (weights)
// BUGBUG: For now, we treat the weights as a 2D matrix. They should be a tensor proper.
Input(0)->ValidateInferInputDimsFrom(TensorShape(m_outputChannels, weightCols));
Input(0)->ValidateInferInputDimsFrom(TensorShape(mapCount, weightCols));
if (isFinalValidationPass && (Input(0)->GetAsMatrixNumCols() != weightCols || Input(0)->GetAsMatrixNumRows() != m_outputChannels))
LogicError("convolutionWeight matrix %ls should have dimension [%d, %d] which is [outputChannels, kernelWidth * kernelHeight * inputChannels]", Input(0)->NodeName().c_str(), (int) m_outputChannels, (int) weightCols);
if (isFinalValidationPass && (Input(0)->GetAsMatrixNumCols() != weightCols || Input(0)->GetAsMatrixNumRows() != mapCount))
{
LogicError("Convolution weight matrix %ls should have dimension [%d, %d] which is [outputChannels, kernelWidth * kernelHeight * inputChannels]",
Input(0)->NodeName().c_str(), (int)mapCount, (int)weightCols);
}
// that's our dimension
SetDims(outDims.AsTensorShape(m_imageLayoutKind), true);
auto outDims = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
// ConvolveGeometry always uses CHW.
SetDims(ImageDimensions(outDims, ImageLayoutKind::CHW).AsTensorShape(m_imageLayout), HasMBLayout());
}
else
{
if (m_imageLayout != ImageLayoutKind::CHW)
{
InvalidArgument(
"%ls %ls supports only cuDNN (CHW) data layout. "
"Please specify imageLayout=\"cudnn\" in %ls node in your script "
"and make sure input data layout is CHW", NodeName().c_str(), OperationName().c_str(), NodeName().c_str());
}
inputShape = GetInputSampleLayout(inputIdx);
auto outDims = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
SetDims(outDims, HasMBLayout());
}
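// For intuition only (numbers are illustrative): with a 28 x 28 x 1 input, a 5 x 5 x 1 kernel,
// stride 1 and no padding, ComputeOutputShape yields (28 - 5) / 1 + 1 = 24 along each spatial
// axis, i.e. a 24 x 24 x mapCount output.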
if (isFinalValidationPass)
{
// set up the various engines and descriptor objects
// REVIEW alexeyk: is there a better place to create engines?
assert(m_factory);
// if (m_factory == nullptr)
// m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
// TODO: This seems to expose too much internal knowlegde of the engine to the ConvolutionNode().
// Why not just pass everything to the engine creator, and get one object that holds everything.
if (m_convEng == nullptr)
m_convEng = m_factory->CreateConvEngine(m_deviceId, m_imageLayoutKind, m_maxTempMemSizeInSamples, BatchNormImpl::Cntk);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(inDims.m_width, inDims.m_height, inDims.m_numChannels, 1);
if (m_filterT == nullptr)
m_filterT = m_factory->CreateFilter(m_kernelWidth, m_kernelHeight, inDims.m_numChannels, m_outputChannels);
if (m_outT == nullptr)
m_outT = m_factory->CreateTensor(outDims.m_width, outDims.m_height, outDims.m_numChannels, 1);
if (m_convDesc == nullptr)
m_convDesc = m_factory->CreateConvDescriptor(*m_inT, *m_filterT, m_horizontalSubsample, m_verticalSubsample, m_zeroPadding);
// REVIEW alexeyk: create per-channel bias (shared across all pixels). Consider adding other types of biases.
if (m_biasT == nullptr)
m_biasT = m_factory->CreateTensor(1, 1, outDims.m_numChannels, 1);
{
auto geometry = std::make_shared<ConvolveGeometry>(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
m_convEng = ConvolutionEngine<ElemType>::Create(geometry, m_deviceId, m_imageLayout,
m_maxTempMemSizeInSamples, m_poolKind);
}
if (Input(0)->GetAsMatrixNumCols() != m_kernelShape.GetNumElements() ||
Input(0)->GetAsMatrixNumRows() != m_convEng->Geometry()->KernelCount())
{
LogicError("Convolution weight matrix %ls should have dimension [%d, %d] which is [kernelCount, kernelWidth * kernelHeight * inputChannels]",
Input(0)->NodeName().c_str(), (int)m_convEng->Geometry()->KernelCount(), (int)m_kernelShape.GetNumElements());
}
}
}
void DumpNodeInfo(const bool printValues, const bool printMetadata, File& fstream) const override
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
Base::DumpNodeInfo(printValues, printMetadata, fstream);
Base::RequestMatricesBeforeForwardProp(matrixPool);
RequestMatrixFromPool(m_tempMatrix, matrixPool);
}
auto inDims = ImageDimensions(GetInputSampleLayout(1), m_imageLayoutKind);
auto outDims = ImageDimensions(m_sampleLayout, m_imageLayoutKind);
char str[4096];
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inDims.m_width, inDims.m_height, inDims.m_numChannels);
fstream << string(str);
sprintf(str, "Kernel[Width:%lu, Height:%lu] SubSample[Horizontal:%lu, Vertical:%lu]\n", m_kernelWidth, m_kernelHeight, m_horizontalSubsample, m_verticalSubsample);
fstream << string(str);
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", outDims.m_width, outDims.m_height, outDims.m_numChannels);
fstream << string(str);
sprintf(str, "zeroPadding=%ls maxTempMemSizeInSamples=%lu\n", m_zeroPadding ? L"true" : L"false", m_maxTempMemSizeInSamples);
fstream << string(str);
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_tempMatrix, matrixPool);
}
void SetmMaxTempMemSizeInSamples(const size_t maxTempMemSizeInSamples)
@ -272,47 +406,78 @@ public:
m_maxTempMemSizeInSamples = maxTempMemSizeInSamples;
}
// request matrices needed to do node function value evaluation
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeForwardProp(matrixPool);
RequestMatrixFromPool(m_tempMatrix, matrixPool);
}
// release gradient and temp matrices that no longer needed after all the children's gradients are computed.
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_tempMatrix, matrixPool);
}
private:
size_t m_outputChannels;
size_t m_kernelWidth, m_kernelHeight;
size_t m_horizontalSubsample, m_verticalSubsample;
bool m_zeroPadding;
bool m_1DConvolutionOnGPUSparse;
shared_ptr<Matrix<ElemType>> m_tempMatrix;
size_t m_maxTempMemSizeInSamples; // can change during runtime
ImageLayoutKind m_imageLayoutKind; // how to interpret the tensor (which dimensions are X/Y and C)
std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
std::unique_ptr<ConvolutionTensor4D> m_inT;
std::unique_ptr<ConvolutionFilter> m_filterT;
std::unique_ptr<ConvolutionTensor4D> m_outT;
std::unique_ptr<ConvolutionDescriptor> m_convDesc;
std::unique_ptr<ConvolutionTensor4D> m_biasT;
protected:
bool m_convolution2D;
};
template class ConvolutionNode<float>;
template class ConvolutionNode<double>;
// -----------------------------------------------------------------------
// PoolingNode (inputFeature)
// -----------------------------------------------------------------------
template <class ElemType>
class PoolingNode : public ConvolutionNodeBase<ElemType>, public NumInputs<1>
{
typedef ConvolutionNodeBase<ElemType> Base;
UsingConvolutionNodeBaseMembers;
static const std::wstring TypeName()
{
return L"Pooling";
}
public:
PoolingNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
PoolingNode(DEVICEID_TYPE deviceId, const wstring& name, PoolKind pool, const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout)
: Base(deviceId, name, kernelShape, TensorShape(1), strideShape, vector<bool>{true}, autoPadding, lowerPad, upperPad, pool, imageLayout, 0)
{
}
PoolingNode(const ScriptableObjects::IConfigRecordPtr configp)
: PoolingNode(configp->Get(L"deviceId"), L"<placeholder>", PoolKindFrom(configp->Get(L"pool")), configp->Get(L"kernelShape"),
configp->Get(L"strideShape"),
configp->Get(L"dimPadding"), configp->Get(L"dimPadLower"), configp->Get(L"dimPadUpper"),
ImageLayoutKindFrom(configp->Get(L"imageLayout")))
{
AttachInputs(configp, GetExpectedNumInputs());
}
public:
void Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();
if (m_imageLayout != ImageLayoutKind::CHW)
{
InvalidArgument(
"%ls %ls supports only cuDNN (CHW) data layout. "
"Please specify imageLayout=\"cudnn\" in %ls node in your script "
"and make sure input data layout is CHW", NodeName().c_str(), OperationName().c_str(), NodeName().c_str());
}
auto inputShape = GetInputSampleLayout(0);
auto outDims = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
SetDims(outDims, HasMBLayout());
if (isFinalValidationPass)
{
if (m_convEng == nullptr)
{
auto geometry = std::make_shared<ConvolveGeometry>(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
m_convEng = ConvolutionEngine<ElemType>::Create(geometry, m_deviceId, m_imageLayout,
m_maxTempMemSizeInSamples, m_poolKind);
}
}
}
};
// -----------------------------------------------------------------------
// PoolingNodeBase (input)
// Legacy PoolingNodeBase (input)
// -----------------------------------------------------------------------
template <class ElemType>
@ -339,7 +504,6 @@ public:
m_verticalSubsample(verticalSubsample),
m_imageLayoutKind(imageLayoutKind)
{
m_factory = ConvolutionEngineFactory<ElemType>::Create(deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
PoolingNodeBase(const ScriptableObjects::IConfigRecordPtr configp)
: PoolingNodeBase(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"windowWidth"), configp->Get(L"windowHeight"), configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"), ImageLayoutKindFrom(configp->Get(L"imageLayout")))
@ -362,8 +526,7 @@ public:
uint32_t imageLayoutKind, windowWidth;
fstream >> windowWidth >> imageLayoutKind >> m_windowHeight >> m_horizontalSubsample >> m_verticalSubsample;
m_windowWidth = windowWidth;
m_imageLayoutKind = (ImageLayoutKind) imageLayoutKind;
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
m_imageLayoutKind = (ImageLayoutKind)imageLayoutKind;
}
void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
@ -394,12 +557,7 @@ public:
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
size_t batchSize = sliceInput0Value.GetNumCols();
m_inT->setN(batchSize);
m_outT->setN(batchSize);
assert(m_poolEng != nullptr);
assert(m_poolDesc != nullptr);
m_poolEng->Backward(*m_outT, sliceOutputValue, sliceOutputGrad, *m_poolDesc, *m_inT, sliceInput0Value, sliceInput0Grad);
m_convEng->BackwardPooling(sliceOutputValue, sliceOutputGrad, sliceInput0Value, sliceInput0Grad);
}
void ForwardProp(const FrameRange& fr) override
@ -407,12 +565,7 @@ public:
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
size_t batchSize = sliceInput0Value.GetNumCols();
m_inT->setN(batchSize);
m_outT->setN(batchSize);
assert(m_poolEng != nullptr);
assert(m_poolDesc != nullptr);
m_poolEng->Forward(*m_inT, sliceInput0Value, *m_poolDesc, *m_outT, sliceOutputValue);
m_convEng->ForwardPooling(sliceInput0Value, sliceOutputValue);
}
void Validate(bool isFinalValidationPass) override
@ -439,16 +592,14 @@ public:
if (isFinalValidationPass)
{
// set up various engines and descriptor objects
// REVIEW alexeyk: is there a better place to create engines?
assert(m_factory);
// if (m_factory == nullptr)
// m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
if (m_poolEng == nullptr)
m_poolEng = m_factory->CreatePoolEngine(m_deviceId, m_imageLayoutKind);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(inDims.m_width, inDims.m_height, inDims.m_numChannels, 1);
if (m_outT == nullptr)
m_outT = m_factory->CreateTensor(outDims.m_width, outDims.m_height, outDims.m_numChannels, 1);
m_geometry = std::make_shared<ConvolveGeometry>(inDims.AsTensorShape(m_imageLayoutKind),
ImageDimensions(m_windowWidth, m_windowHeight, 1).AsTensorShape(m_imageLayoutKind),
TensorShape(1),
ImageDimensions(m_horizontalSubsample, m_verticalSubsample, 1).AsTensorShape(m_imageLayoutKind),
ConvolveGeometry::BoolVec{true},
ConvolveGeometry::BoolVec{false},
TensorShape(0),
TensorShape(0));
}
}
@ -479,12 +630,8 @@ protected:
ImageLayoutKind m_imageLayoutKind; // how to interpret the tensor (which dimensions are X/Y and C)
std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<PoolingEngine<ElemType>> m_poolEng;
std::unique_ptr<ConvolutionTensor4D> m_inT;
std::unique_ptr<ConvolutionTensor4D> m_outT;
std::unique_ptr<PoolingDescriptor> m_poolDesc;
ConvolveGeometryPtr m_geometry;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
};
// add this at the start of each derived class, to get access to the members of ComputationNode
@ -493,19 +640,20 @@ protected:
UsingComputationNodeMembersBoilerplate; \
\
protected: \
using Base::m_factory; \
using Base::m_poolDesc; \
using Base::m_geometry; \
using Base::m_convEng; \
using Base::m_windowWidth; \
using Base::m_windowHeight; \
using Base::m_horizontalSubsample; \
using Base::m_verticalSubsample; \
using Base::m_inputSizePerSample; \
using Base::m_outputSizePerSample; \
using Base::m_imageLayoutKind; \
\
public:
// -----------------------------------------------------------------------
// MaxPoolingNode
// Legacy MaxPoolingNode
// -----------------------------------------------------------------------
template <class ElemType>
@ -535,16 +683,13 @@ public:
void Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
if (isFinalValidationPass && m_poolDesc == nullptr)
m_poolDesc = m_factory->CreatePoolDescriptor(PoolingDescriptor::PoolKind::Max, m_windowWidth, m_windowHeight, m_horizontalSubsample, m_verticalSubsample, 0, 0);
if (isFinalValidationPass && m_convEng == nullptr)
m_convEng = ConvolutionEngine<ElemType>::Create(m_geometry, m_deviceId, m_imageLayoutKind, 0, PoolKind::Max);
}
};
template class MaxPoolingNode<float>;
template class MaxPoolingNode<double>;
// -----------------------------------------------------------------------
// AveragePoolingNode
// Legacy AveragePoolingNode
// -----------------------------------------------------------------------
template <class ElemType>
@ -574,12 +719,9 @@ public:
void Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
if (isFinalValidationPass && m_poolDesc == nullptr)
m_poolDesc = m_factory->CreatePoolDescriptor(PoolingDescriptor::PoolKind::Average, m_windowWidth, m_windowHeight, m_horizontalSubsample, m_verticalSubsample, 0, 0);
if (isFinalValidationPass && m_convEng == nullptr)
m_convEng = ConvolutionEngine<ElemType>::Create(m_geometry, m_deviceId, m_imageLayoutKind, 0, PoolKind::Average);
}
};
template class AveragePoolingNode<float>;
template class AveragePoolingNode<double>;
} } }

Просмотреть файл

@ -6,7 +6,6 @@
#include "Basics.h"
#include "ComputationNode.h"
#include "ConvolutionalNodes.h"
#include "Matrix.h"
#include "TensorView.h"

Просмотреть файл

@ -6,7 +6,7 @@
#include "Basics.h"
#include "ComputationNode.h"
#include "ConvolutionEngine.h"
#include "BatchNormalizationEngine.h"
#include <map>
#include <string>
@ -20,8 +20,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// SquareErrorNode (left, right)
// = SumElements ((left - right) .* (left - right))
// Note: to save computation the gradient may be scaled by an constant.
// TODO: ^^ Dig out what that constant is and document it here. "may be scaled"??
// -----------------------------------------------------------------------
template <class ElemType>
@ -47,9 +45,9 @@ public:
FrameRange fr(Input(0)->GetMBLayout());
m_leftMinusRight->AssignDifferenceOf(Input(0)->ValueFor(fr), Input(1)->ValueFor(fr));
MaskMissingColumnsToZero(*m_leftMinusRight, Input(0)->GetMBLayout(), fr); // we are fine since it will only be called with full minibatch.
ElemType v = m_leftMinusRight->FrobeniusNorm();
ElemType v = m_leftMinusRight->FrobeniusNorm(); // v = sqrt( sum{ (I0[i] - I1[i])^2 } )
Value().VerifySize(1, 1);
Value().SetValue(v * v / 2);
Value().SetValue(v * v); // Value = sum{ (I0[i] - I1[i])^2 }
#if NANCHECK
Value().HasNan("SquareError");
#endif
@ -59,7 +57,7 @@ public:
{
FrameRange fr(Input(0)->GetMBLayout());
auto gradient = Input(inputIndex)->GradientFor(fr);
Matrix<ElemType>::Multiply1x1AndWeightedAdd(inputIndex == 0 ? 1.0f : -1.0f, Gradient() /*1x1*/, *m_leftMinusRight, 1.0f, gradient);
Matrix<ElemType>::Multiply1x1AndWeightedAdd(inputIndex == 0 ? 2.0f : -2.0f, Gradient() /*1x1*/, *m_leftMinusRight, 1.0f, gradient); // O = (I0-I1)^2; dO/dI0 = 2*(I0-I1); dO/dI1 = -2*(I0-I1)
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
@ -1522,12 +1520,43 @@ template class DropoutNode<float>;
template class DropoutNode<double>;
// -----------------------------------------------------------------------
// BatchNormalizationNode (...) --TODO: document inputs
// -----------------------------------------------------------------------
// BatchNormalizationNode (input, scale, bias, runMean, runInvStdDev, spatial,
// normalizationTimeConstant = 0, blendTimeConstant = 0,
// epsilon = 0.00001,
// useCntkEngine = true, imageLayout = 'cudnn')
//
// Implements batch normalization technique as described in:
// Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift [S. Ioffe, C. Szegedy]
// http://arxiv.org/abs/1502.03167
// In short, it normalizes layer outputs for every minibatch for each output (feature) independently and applies an affine transformation to preserve the representation of the layer.
// That is, for layer input:
//
// m = mean(input)
// var = variance(input)
// input_norm = (input - mean) / sqrt(var)
// output = gamma * input_norm + beta
//
// where gamma and beta are trainable parameters (represented as LearnableParameter).
//
// * input is the input of the batch normalization node
// * scale is a LearnableParameter that stores the scale vector (gamma term in the equation above).
// * bias is a LearnableParameter that stores the bias vector (beta term). scale and bias must have the same dimensions, which must be equal
// to the input dimensions in the case of spatial = false, or to the number of output convolution feature maps in the case of spatial = true.
// * runMean is the running mean, which is used during the evaluation phase and might be used during training as well.
// It is represented as a LearnableParameter with the same dimensions as scale and bias.
// * runInvStdDev is the running inverse square root of the variance (so InvStdDev = 1 / sqrt(var + epsilon)).
// It is represented as a LearnableParameter with the same dimensions as scale and bias.
// * spatial is a flag that specifies whether to compute mean / var for each feature in a minibatch independently or, in the case of convolutional layers, per feature map.
// * normalizationTimeConstant is the time constant which is used to compute running average of mean and variance.
// Value 0 (default) means there will be no exponential smoothing and running mean / variance will always have values computed for the last seen minibatch.
// Value 1#INF (infinity) means running values are "frozen" (i.e. will not be updated).
// * blendTimeConstant is the time constant which specifies how much of the running mean / var should be "blended" into the mean / var of the current minibatch.
// Value 0 (default) means no blending will happen and only the current minibatch statistics will be used.
// Value 1#INF (infinity) means only the running mean / var will be used (this is used, for example, in the evaluation phase).
// * epsilon is a conditioner constant used in computing InvStdDev
// * useCntkEngine is a boolean flag that specifies which batch normalization implementation to use: CNTK or cuDNN-based.
// * imageLayout is the image layout. Only cudnn is supported.
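//
// A rough per-feature sketch of the training-time computation described above (illustrative
// pseudo-code only; the real work is delegated to BatchNormEngine and uses the expAvgFactor /
// blendFactor values derived from the two time constants):
//
//     mbMean  = mean(input);  mbVar = variance(input)                  // current minibatch statistics
//     runMean = (1 - expAvgFactor) * runMean + expAvgFactor * mbMean   // exponential smoothing (same for runInvStdDev)
//     mean    = blendFactor * runMean + (1 - blendFactor) * mbMean     // blending; analogously for the variance
//     output  = scale * (input - mean) / sqrt(blendedVar + epsilon) + bias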
// -----------------------------------------------------------------------
template <class ElemType>
class BatchNormalizationNode : public ComputationNode<ElemType>, public NumInputs<5>
{
@ -1540,19 +1569,20 @@ class BatchNormalizationNode : public ComputationNode<ElemType>, public NumInput
public:
BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name), m_eval(false), m_spatial(false), m_normTimeConst(0), m_epsilon(0), m_useCntkEngine(true),
: Base(deviceId, name), m_spatial(false), m_normTimeConst(0), m_blendTimeConst(0), m_epsilon(0), m_useCntkEngine(true),
m_mbCount(0), m_imageLayoutKind(ImageLayoutKind::CHW)
{
}
BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name, bool eval, bool spatial, double normalizationTimeConstant, double epsilon,
bool useCntkEngine, ImageLayoutKind imageLayoutKind)
: Base(deviceId, name), m_eval(eval), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_epsilon(epsilon),
m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_mbCount(0)
BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name, bool spatial, double normalizationTimeConstant, double blendTimeConstant,
double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind)
: Base(deviceId, name), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_blendTimeConst(blendTimeConstant),
m_epsilon(epsilon), m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_mbCount(0)
{
}
BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp)
: BatchNormalizationNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"eval"), configp->Get(L"spatial"),
configp->Get(L"normalizationTimeConstant"), configp->Get(L"epsilon"), configp->Get(L"useCntkEngine"),
: BatchNormalizationNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"spatial"),
configp->Get(L"normalizationTimeConstant"), configp->Get(L"blendTimeConstant"),
configp->Get(L"epsilon"), configp->Get(L"useCntkEngine"),
ImageLayoutKindFrom(configp->Get(L"imageLayout")))
{
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
@ -1561,11 +1591,10 @@ public:
void Save(File& fstream) const override
{
Base::Save(fstream);
fstream << m_version.VerWrittenCur() << m_version.VerReadableCur();
fstream << m_eval;
fstream << m_spatial;
fstream << m_normTimeConst;
fstream << m_blendTimeConst;
fstream << (int32_t)m_imageLayoutKind;
fstream << m_mbCount;
fstream << m_epsilon;
@ -1576,40 +1605,56 @@ public:
{
Base::Load(fstream, modelVersion);
// Read and check version.
// REVIEW alexeyk: extract version checking so it can be re-used in other places.
// BUGBUG: We must serialize m_inputLayout.
int32_t verWritten;
int32_t verReadable;
fstream >> verWritten >> verReadable;
if (verReadable > verWritten)
RuntimeError("Corrupt model file.");
if (verWritten < m_version.VerWeCanReadBack())
RuntimeError("Model is too old.");
if (verReadable > m_version.VerWrittenCur())
RuntimeError("Model is too new.");
fstream >> m_eval;
fstream >> m_spatial;
if (verWritten >= 0x00010004)
if (modelVersion >= CNTK_MODEL_VERSION_6)
{
fstream >> m_spatial;
fstream >> m_normTimeConst;
else
{
double expAvgFactor;
fstream >> expAvgFactor;
UNUSED(expAvgFactor); // Used in previous versions, replaced by m_normTimeConst.
}
if (verWritten >= 0x00010002)
{
fstream >> m_blendTimeConst;
fstream >> m_imageLayoutKind;
fstream >> m_mbCount;
}
if (verWritten >= 0x00010003)
{
fstream >> m_epsilon;
fstream >> m_useCntkEngine;
}
else
{
// Use old versioning scheme for older models.
// Read and check version.
// REVIEW alexeyk: extract version checking so it can be re-used in other places.
int32_t verWritten;
int32_t verReadable;
fstream >> verWritten >> verReadable;
if (verReadable > verWritten)
RuntimeError("Corrupt model file.");
if (verWritten < m_version.VerWeCanReadBack())
RuntimeError("Model is too old.");
if (verReadable > m_version.VerWrittenCur())
RuntimeError("Model is too new.");
bool eval;
fstream >> eval;
UNUSED(eval);
fstream >> m_spatial;
if (verWritten >= 0x00010004)
fstream >> m_normTimeConst;
else
{
double expAvgFactor;
fstream >> expAvgFactor;
UNUSED(expAvgFactor); // Used in previous versions, replaced by m_normTimeConst.
}
if (verWritten >= 0x00010002)
{
fstream >> m_imageLayoutKind;
fstream >> m_mbCount;
}
if (verWritten >= 0x00010003)
{
fstream >> m_epsilon;
fstream >> m_useCntkEngine;
}
}
}
void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
@ -1620,9 +1665,9 @@ public:
auto node = dynamic_pointer_cast<BatchNormalizationNode<ElemType>>(nodeP);
assert(node != nullptr);
node->m_eval = m_eval;
node->m_spatial = m_spatial;
node->m_normTimeConst = m_normTimeConst;
node->m_blendTimeConst = m_blendTimeConst;
node->m_imageLayoutKind = m_imageLayoutKind;
node->m_mbCount = m_mbCount;
node->m_epsilon = m_epsilon;
@ -1630,20 +1675,8 @@ public:
}
}
void SetNormalizationTimeConstant(const double normalizationTimeConstant)
{
m_normTimeConst = normalizationTimeConstant;
}
void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
static bool m_evalWarningIssued = false; //make sure we only print warning once
if (m_eval && !m_evalWarningIssued)
{
fprintf(stderr, "WARNING: You turned BatchNormalization to evaluation mode during training. Please make sure this is intended.\n");
m_evalWarningIssued = true;
}
if (inputIndex == 0) // derivative with respect to the input.
{
auto sliceOutputGrad = GradientFor(fr);
@ -1651,15 +1684,11 @@ public:
const Matrix<ElemType>& scale = Input(1)->Value();
const Matrix<ElemType>& bias = Input(2)->Value();
size_t batchSize = sliceInputValue.GetNumCols();
m_inT->setN(batchSize);
assert(m_convEng != nullptr);
auto sliceInputGrad = Input(0)->GradientFor(fr);
m_dScale->Resize(scale);
m_dBias->Resize(bias);
// Compute all derivatives in one step. Save derivatives with respect to scale and bias in temp matrices.
m_convEng->BackwardNormalizeBatch(*m_inT, sliceInputValue, sliceOutputGrad, sliceInputGrad, *m_scaleBiasT, scale, m_spatial,
m_bnEng->Backward(sliceInputValue, sliceOutputGrad, sliceInputGrad, scale,
*m_saveMean, *m_saveInvStdDev, *m_dScale, *m_dBias);
}
else if (inputIndex == 1) // derivative with respect to the scale
@ -1701,48 +1730,45 @@ public:
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
size_t batchSize = sliceInputValue.GetNumCols();
m_inT->setN(batchSize);
assert(m_convEng != nullptr);
#if NANCHECK
sliceInputValue.HasNan("BatchNormalization-input");
#endif
if (m_eval)
m_convEng->NormalizeBatchInference(*m_inT, sliceInputValue, *m_scaleBiasT, scale, bias, m_spatial, runMean, runInvStdDev, sliceOutputValue);
double expAvgFactor;
double blendFactor;
if (!Environment().IsTraining())
{
expAvgFactor = 0;
blendFactor = 1.0;
m_saveMean->Resize(0, 0);
m_saveInvStdDev->Resize(0, 0);
}
else
{
double expAvgFactor;
double numSamples = (double)GetMBLayout()->GetActualNumSamples();
if (m_normTimeConst > 0)
{
// Convert to per-minibatch factor.
expAvgFactor = 1.0 - exp(-(double)GetMBLayout()->GetActualNumSamples() / m_normTimeConst);
// Convert to per-minibatch factor. Treat positive infinity as if running mean/var parameters are "frozen"
// that is, do not require updates.
expAvgFactor = !isfinite(m_normTimeConst) ? 0 : (1.0 - exp(-numSamples / m_normTimeConst));
}
else
{
// REVIEW alexeyk: hack, m_normTimeConst < 0 is used to compute CMA.
expAvgFactor = (m_normTimeConst < 0) ? (1.0 / (1.0 + m_mbCount)) : 1;
expAvgFactor = (m_normTimeConst < 0) ? (1.0 / (1.0 + m_mbCount)) : 1.0;
}
if (m_saveMean == nullptr)
fprintf(stderr, "WARNING: m_saveMean is null\n");
if (m_saveInvStdDev == nullptr)
fprintf(stderr, "WARNING: m_saveInvStdDev is null\n");
if (!isfinite(m_blendTimeConst))
blendFactor = 1.0;
else
blendFactor = m_blendTimeConst > 0 ? (m_blendTimeConst / (m_blendTimeConst + numSamples)) : 0;
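// For example (illustrative numbers): with numSamples = 256 and m_normTimeConst = 5000,
// expAvgFactor = 1 - exp(-256/5000) ~= 0.05; with m_blendTimeConst = 0, blendFactor = 0 and
// only the current minibatch statistics are used.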
m_saveMean->Resize(runMean);
m_saveInvStdDev->Resize(runMean);
}
m_convEng->NormalizeBatch(*m_inT, sliceInputValue, *m_scaleBiasT, scale, bias, m_spatial, expAvgFactor, runMean, runInvStdDev,
m_bnEng->Forward(sliceInputValue, scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev,
sliceOutputValue, m_epsilon, *m_saveMean, *m_saveInvStdDev);
m_mbCount++;
}
#if NANCHECK
sliceOutputValue.HasNan("BatchNormalization-output");
runMean.HasNan("BatchNormalization-runMean");
runInvStdDev.HasNan("BatchNormalization-runInvStdDev");
m_saveMean->HasNan("BatchNormalization-saveMean");
m_saveInvStdDev->HasNan("BatchNormalization-saveInvStdDev");
#endif
}
void Validate(bool isFinalValidationPass) override
{
@ -1756,34 +1782,23 @@ public:
if (m_spatial && m_imageLayoutKind != CHW)
{
InvalidArgument(
"Batch normalization currently supports only cuDNN (CHW) data layout. "
"%ls %ls currently supports only cuDNN (CHW) data layout. "
"Please specify imageLayout=\"cudnn\" in BatchNormalization node in your NDL/BrainScript "
"and make sure your input data layout is CHW");
"and make sure your input data layout is CHW", NodeName().c_str(), OperationName().c_str());
}
double cudnnMinEps = 1e-5; // CUDNN_BN_MIN_EPSILON
if (!m_useCntkEngine && m_epsilon < cudnnMinEps)
fprintf(stderr, "\nWARNING: cuDNN batch normalization requires epsilon >= %e. Epsilon will be reset to that value.\n", cudnnMinEps);
if (m_blendTimeConst < 0)
InvalidArgument("%ls %ls requires blend time constant to be >= 0.", NodeName().c_str(), OperationName().c_str());
auto shape = GetSampleLayout();
if (m_factory == nullptr)
m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
if (m_convEng == nullptr)
m_convEng = m_factory->CreateConvEngine(m_deviceId, m_imageLayoutKind, 0, m_useCntkEngine ? BatchNormImpl::Cntk : BatchNormImpl::CuDnn);
if (m_spatial)
if (m_bnEng == nullptr)
{
auto dims = ImageDimensions(shape, m_imageLayoutKind);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(dims.m_width, dims.m_height, dims.m_numChannels, 1);
if (m_scaleBiasT == nullptr)
m_scaleBiasT = m_factory->CreateTensor(1, 1, dims.m_numChannels, 1);
}
else
{
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(shape.GetNumElements(), 1, 1, 1);
if (m_scaleBiasT == nullptr)
m_scaleBiasT = m_factory->CreateTensor(shape.GetNumElements(), 1, 1, 1);
m_bnEng = BatchNormEngine<ElemType>::Create(m_deviceId, shape, m_spatial, m_imageLayoutKind,
m_useCntkEngine ? BatchNormEngineKind::Cntk : BatchNormEngineKind::CuDnn);
}
}
}
@ -1791,41 +1806,39 @@ public:
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeForwardProp(matrixPool);
//if (!m_eval)
{
RequestMatrixFromPool(m_saveMean, matrixPool);
RequestMatrixFromPool(m_saveInvStdDev, matrixPool);
}
}
void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeBackprop(matrixPool);
//if (!m_eval)
{
RequestMatrixFromPool(m_dScale, matrixPool);
RequestMatrixFromPool(m_dBias, matrixPool);
}
}
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
//if (!m_eval)
{
ReleaseMatrixToPool(m_saveMean, matrixPool);
ReleaseMatrixToPool(m_saveInvStdDev, matrixPool);
ReleaseMatrixToPool(m_dScale, matrixPool);
ReleaseMatrixToPool(m_dBias, matrixPool);
}
}
void SetEvalMode(bool bnEvalMode)
void SetNormalizationTimeConstants(double normalizationTimeConstant, double prevNormalizationTimeConstant,
double blendTimeConstant, double prevBlendTimeConstant)
{
m_eval = bnEvalMode;
// As this function is called from SGD solver (global), make sure we don't
// override settings set in NDL when it's not necessary.
if (normalizationTimeConstant != prevNormalizationTimeConstant)
m_normTimeConst = normalizationTimeConstant;
if (blendTimeConstant != prevBlendTimeConstant)
m_blendTimeConst = blendTimeConstant;
}
private:
// Old versioning - do not use. Do not remove until we're sure there are no old models around.
struct VersionInfo
{
//int32_t VerWrittenCur() const { return 0x00010001; } // Initial
@ -1838,13 +1851,20 @@ private:
VersionInfo m_version;
private:
// Determines whether to use training or inference(evaluation) mode.
bool m_eval;
// Determines whether to use per-activation (used after non-convolutional layers like fully connected)
// or spatial (used after convolutional layers).
bool m_spatial;
// Time constant for running mean and variance.
double m_normTimeConst;
// Time constant for blending the running mean/var with the current minibatch mean/var.
// The main idea is to treat the current minibatch statistics as a MAP estimate: a linear interpolation
// of the smoothed (running) and minibatch statistics.
// The idea is due to Frank Seide et al.
// It should also work well in the data-parallel scenario, unlike a plain vanilla BN implementation,
// which would require aggregating statistics from all nodes.
// REVIEW alexeyk: if this works, document it properly in Wiki.
double m_blendTimeConst;
// Epsilon used to compute inverse std deviation.
double m_epsilon;
// Whether to use CNTK or cuDNN BN implementation.
@ -1863,10 +1883,7 @@ private:
// Stores bias derivatives.
shared_ptr<Matrix<ElemType>> m_dBias;
std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
std::unique_ptr<ConvolutionTensor4D> m_inT;
std::unique_ptr<ConvolutionTensor4D> m_scaleBiasT;
std::unique_ptr<BatchNormEngine<ElemType>> m_bnEng;
};
template class BatchNormalizationNode<float>;
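Aside (an illustrative sketch, not part of this commit): the blending described in the m_blendTimeConst comment boils down to a linear interpolation between running and minibatch statistics before normalization. A minimal sketch under that reading; the function name, and the choice to blend the inverse standard deviation directly rather than the variance, are assumptions not taken from this code:

// Illustrative only; blendFactor in [0, 1]: 0 = pure minibatch stats, 1 = pure running stats.
template <class ElemType>
void BlendStatistics(ElemType blendFactor,
                     const ElemType* runMean, const ElemType* runInvStdDev,
                     ElemType* batchMean, ElemType* batchInvStdDev, size_t n)
{
    for (size_t i = 0; i < n; i++)
    {
        batchMean[i]      = blendFactor * runMean[i]      + (1 - blendFactor) * batchMean[i];
        batchInvStdDev[i] = blendFactor * runInvStdDev[i] + (1 - blendFactor) * batchInvStdDev[i];
    }
}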

View file

@ -88,7 +88,7 @@
<ClCompile Include="EvalWrapper.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\..\Common\Include\Eval.h" />
<ClInclude Include="..\..\Common\Include\Eval.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">

View file

@ -13,7 +13,7 @@
</Filter>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\..\Common\Include\Eval.h">
<ClInclude Include="..\..\Common\Include\Eval.h">
<Filter>Common\Include</Filter>
</ClInclude>
</ItemGroup>

View file

@ -0,0 +1,131 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "BatchNormalizationEngine.h"
#include "CuDnnFactories.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev)
{
assert(in.GetNumRows() == m_inOutT.GetNumElements());
assert(out.GetNumRows() == m_inOutT.GetNumElements());
assert(in.GetNumCols() == out.GetNumCols());
assert(std::isfinite(expAvgFactor) && (0 <= expAvgFactor && expAvgFactor <= 1));
assert(std::isfinite(blendFactor) && (0 <= blendFactor && blendFactor <= 1));
assert(std::isfinite(epsilon) && epsilon > 0);
if (!m_spatial)
{
assert(m_inOutT.GetNumElements() == scale.GetNumRows());
assert(m_inOutT.GetNumElements() == bias.GetNumRows());
assert(m_inOutT.GetNumElements() == runMean.GetNumRows());
assert(m_inOutT.GetNumElements() == runInvStdDev.GetNumRows());
assert(saveMean.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveMean.GetNumRows());
assert(saveInvStdDev.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveInvStdDev.GetNumRows());
}
else
{
assert((m_inOutT.GetNumElements() % scale.GetNumRows()) == 0);
assert((m_inOutT.GetNumElements() % bias.GetNumRows()) == 0);
assert((m_inOutT.GetNumElements() % runMean.GetNumRows()) == 0);
assert((m_inOutT.GetNumElements() % runInvStdDev.GetNumRows()) == 0);
assert(saveMean.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveMean.GetNumRows()) == 0);
assert(saveInvStdDev.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveInvStdDev.GetNumRows()) == 0);
}
assert(scale.GetNumCols() == 1);
assert(bias.GetNumCols() == 1);
assert(runMean.GetNumCols() == 1);
assert(runInvStdDev.GetNumCols() == 1);
assert(saveMean.GetNumElements() == 0 || saveMean.GetNumCols() == 1);
assert(saveInvStdDev.GetNumElements() == 0 || saveInvStdDev.GetNumCols() == 1);
EnsureCompatible();
ForwardCore(in, scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev);
}
template <class ElemType>
void BatchNormEngine<ElemType>::Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale,
const Mat& saveMean, const Mat& saveInvStdDev, Mat& scaleGrad, Mat& biasGrad)
{
EnsureCompatible();
BackwardCore(in, srcGrad, grad, scale, saveMean, saveInvStdDev, scaleGrad, biasGrad);
}
template <class ElemType>
class CntkBatchNormEngine : public BatchNormEngine<ElemType>
{
public:
using Base = BatchNormEngine<ElemType>;
using typename Base::Mat;
public:
CntkBatchNormEngine(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout)
: Base(deviceId, inOutT, spatial, imageLayout)
{
}
protected:
using Base::m_deviceId;
using Base::m_imageLayout;
using Base::m_inOutT;
using Base::m_spatial;
void EnsureCompatible() override
{
if (m_spatial && m_imageLayout == ImageLayoutKind::HWC)
InvalidArgument("CNTK batch normalization supports only cudnn(CHW) layout.");
}
void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override
{
in.BatchNormalizationForward(scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev);
}
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) override
{
srcGrad.BatchNormalizationBackward(in, grad, scale, saveMean, saveInvStdDev, scaleGrad, biasGrad);
}
};
template class CntkBatchNormEngine<float>;
template class CntkBatchNormEngine<double>;
template <typename T>
bool HasFlag(T src, T testFlag)
{
return ((int)src & (int)testFlag) != 0;
}
template <class ElemType>
std::unique_ptr<BatchNormEngine<ElemType>> BatchNormEngine<ElemType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout,
BatchNormEngineKind enabledEngines)
{
// Use CNTK as default batch norm engine.
if (HasFlag(enabledEngines, BatchNormEngineKind::Cntk))
{
fprintf(stderr, "\nUsing CNTK batch normalization engine.\n");
return std::make_unique<CntkBatchNormEngine<ElemType>>(deviceId, inOutT, spatial, imageLayout);
}
if (HasFlag(enabledEngines, BatchNormEngineKind::CuDnn))
{
fprintf(stderr, "\nUsing cuDNN batch normalization engine.\n");
return CuDnnBatchNormEngineFactory<ElemType>::Create(deviceId, inOutT, spatial, imageLayout);
}
RuntimeError("Could not find appropriate batch normalization engine.");
}
template class BatchNormEngine<float>;
template class BatchNormEngine<double>;
} } }
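For orientation (an illustrative sketch, not part of the diff): how the Create factory and the Forward call defined above fit together. The wrapper name and the chosen factor values are placeholders; matrices are assumed to be pre-allocated with matching shapes:

// Hypothetical caller of the batch-norm engine API shown above.
template <class ElemType>
void RunBatchNormForward(DEVICEID_TYPE deviceId, const TensorShape& shape, bool spatial, ImageLayoutKind layout,
                         const Matrix<ElemType>& in, const Matrix<ElemType>& scale, const Matrix<ElemType>& bias,
                         Matrix<ElemType>& runMean, Matrix<ElemType>& runInvStdDev, Matrix<ElemType>& out,
                         Matrix<ElemType>& saveMean, Matrix<ElemType>& saveInvStdDev)
{
    // Picks the CNTK engine by default, cuDNN otherwise (see Create above).
    auto engine = BatchNormEngine<ElemType>::Create(deviceId, shape, spatial, layout);
    double expAvgFactor = 1.0; // 1 => running stats are replaced by the current minibatch stats
    double blendFactor  = 0.0; // 0 => normalize with pure minibatch statistics
    double epsilon      = 1e-5;
    engine->Forward(in, scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev);
}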

View file

@ -0,0 +1,73 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "Matrix.h"
#include "TensorShape.h" // for ImageLayoutKind
namespace Microsoft { namespace MSR { namespace CNTK {
//-------------------------------------------------------------
// Batch normalization engine interface.
//-------------------------------------------------------------
enum class BatchNormEngineKind
{
None = 0,
Cntk = 1,
CuDnn = 1 << 1,
All = Cntk | CuDnn
};
#pragma warning(push)
#pragma warning(disable : 4251)
template <class ElemType>
class MATH_API BatchNormEngine
{
public:
using Mat = Matrix<ElemType>;
public:
virtual ~BatchNormEngine() = default;
void Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev);
void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad);
static std::unique_ptr<BatchNormEngine<ElemType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout,
BatchNormEngineKind enabledEngines = BatchNormEngineKind::All);
DISABLE_COPY_AND_MOVE(BatchNormEngine);
protected:
BatchNormEngine(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout)
: m_deviceId(deviceId), m_inOutT(inOutT), m_spatial(spatial), m_imageLayout(imageLayout)
{
}
virtual void EnsureCompatible() = 0;
virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0;
virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) = 0;
protected:
DEVICEID_TYPE m_deviceId;
TensorShape m_inOutT;
bool m_spatial;
ImageLayoutKind m_imageLayout;
};
#pragma warning(pop)
} } }

View file

@ -4085,6 +4085,257 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AddAveragePoolingGradient(const CPUMat
}
#pragma endregion Other Helper Functions
template <class ElemType>
void CPUMatrix<ElemType>::ConvolutionForward(const CPUMatrix<ElemType>& kernel, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const
{
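// Geometry tables, as used by the loops below (and shared by the other convolution/pooling routines):
//   mpRowCol(row)  - base offset into the input vector for output element 'row';
//   mpRowIwht(row) - base index into the kernel weight buffer for that output element;
//   mpRowRun(row)  - start index into 'runs', which stores [skip, size, 'size' column deltas, 'size' mask flags].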
#pragma omp parallel for
for (int64_t sample = 0; sample < (int64_t)output.GetNumCols(); sample++)
{
for (size_t row = 0; row < output.GetNumRows(); row++)
{
int colBase = mpRowCol(row, 0);
int ivBase = mpRowIwht(row, 0);
assert(0 <= colBase && colBase < GetNumRows());
ElemType sum = 0;
int i0 = mpRowRun(row, 0);
int skip = runs(i0++, 0);
int size = runs(i0++, 0);
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs(imask + i, 0) == 0)
continue;
int dcol = runs(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < GetNumRows());
sum += kernel.BufferPointer()[ivBase + skip + i] * (*this)(colBase + dcol, sample);
}
output(row, sample) = sum;
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::ConvolutionBackwardData(const CPUMatrix<ElemType>& kernel, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& grad) const
{
#pragma omp parallel for
for (int64_t sample = 0; sample < (int64_t)GetNumCols(); sample++)
{
for (size_t row = 0; row < GetNumRows(); row++)
{
int colBase = mpRowCol(row, 0);
int ivBase = mpRowIwht(row, 0);
assert(0 <= colBase && colBase < grad.GetNumRows());
ElemType curGrad = (*this)(row, sample);
int i0 = mpRowRun(row, 0);
int skip = runs(i0++, 0);
int size = runs(i0++, 0);
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs(imask + i, 0) == 0)
continue;
int dcol = runs(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < grad.GetNumRows());
grad(colBase + dcol, sample) += curGrad * kernel.BufferPointer()[ivBase + skip + i];
}
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::ConvolutionBackwardKernel(const CPUMatrix<ElemType>& in, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& kernelGrad) const
{
// Do NOT parallelize these loops!
for (size_t sample = 0; sample < GetNumCols(); sample++)
{
for (size_t row = 0; row < GetNumRows(); row++)
{
int colBase = mpRowCol(row, 0);
int ivBase = mpRowIwht(row, 0);
assert(0 <= colBase && colBase < in.GetNumRows());
ElemType curGrad = (*this)(row, sample);
int i0 = mpRowRun(row, 0);
int skip = runs(i0++, 0);
int size = runs(i0++, 0);
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs(imask + i, 0) == 0)
continue;
int dcol = runs(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < in.GetNumRows());
kernelGrad.BufferPointer()[ivBase + skip + i] += curGrad * in(colBase + dcol, sample);
}
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::MaxPoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const
{
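// mpRowIndices(row) points into 'indices', which stores [size, 'size' input-offset deltas] for output element 'row'.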
#pragma omp parallel for
for (int64_t sample = 0; sample < (int64_t)output.GetNumCols(); sample++)
{
for (size_t row = 0; row < output.GetNumRows(); row++)
{
int colBase = mpRowCol(row, 0);
assert(0 <= colBase && colBase < GetNumRows());
assert(std::numeric_limits<ElemType>::has_infinity);
ElemType res = -std::numeric_limits<ElemType>::infinity();
int i0 = mpRowIndices(row, 0);
int size = indices(i0++, 0);
assert(size > 0);
for (int i = 0; i < size; i++)
{
int dcol = indices(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < GetNumRows());
res = std::max(res, (*this)(colBase + dcol, sample));
}
output(row, sample) = res;
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::MaxPoolingBackward(const CPUMatrix<ElemType>& out, const CPUMatrix<ElemType>& in,
const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
CPUMatrix<ElemType>& grad) const
{
#pragma omp parallel for
for (int64_t sample = 0; sample < (int64_t)GetNumCols(); sample++)
{
for (size_t row = 0; row < GetNumRows(); row++)
{
int colBase = mpRowCol(row, 0);
assert(0 <= colBase && colBase < grad.GetNumRows());
int i0 = mpRowIndices(row, 0);
int size = indices(i0++, 0);
assert(size > 0);
ElemType g = (*this)(row, sample);
ElemType m = out(row, sample);
for (int i = 0; i < size; i++)
{
int dcol = indices(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < grad.GetNumRows());
if (in(colBase + dcol, sample) >= m)
grad(colBase + dcol, sample) += g;
}
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::AveragePoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const
{
#pragma omp parallel for
for (int64_t sample = 0; sample < (int64_t)output.GetNumCols(); sample++)
{
for (size_t row = 0; row < output.GetNumRows(); row++)
{
int colBase = mpRowCol(row, 0);
assert(0 <= colBase && colBase < GetNumRows());
ElemType sum = 0;
int i0 = mpRowIndices(row, 0);
int size = indices(i0++, 0);
assert(size > 0);
for (int i = 0; i < size; i++)
{
int dcol = indices(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < GetNumRows());
sum += (*this)(colBase + dcol, sample);
}
// Note that we divide by size which is the number of actual elements (does not include padding).
output(row, sample) = sum / size;
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& grad) const
{
#pragma omp parallel for
for (int64_t sample = 0; sample < (int64_t)GetNumCols(); sample++)
{
for (size_t row = 0; row < GetNumRows(); row++)
{
int colBase = mpRowCol(row, 0);
assert(0 <= colBase && colBase < grad.GetNumRows());
int i0 = mpRowIndices(row, 0);
int size = indices(i0++, 0);
assert(size > 0);
ElemType g = (*this)(row, sample) / size;
for (int i = 0; i < size; i++)
{
int dcol = indices(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < grad.GetNumRows());
grad(colBase + dcol, sample) += g;
}
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runInvStdDev, CPUMatrix<ElemType>& out, double epsilon,
CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const
{
UNUSED(epsilon); UNUSED(saveMean); UNUSED(saveInvStdDev);
assert((GetNumRows() % scale.GetNumRows()) == 0);
if (expAvgFactor != 0 || blendFactor != 1)
RuntimeError("Batch normalization training on CPU is not yet implemented.");
bool spatial = GetNumRows() != scale.GetNumRows();
if (spatial)
{
size_t spatialSize = GetNumRows() / scale.GetNumRows();
#pragma omp parallel for
for (long icol = 0; icol < out.GetNumCols(); icol++)
{
for (long irow = 0; irow < out.GetNumRows(); irow++)
{
size_t imap = irow / spatialSize;
out(irow, icol) = scale(imap, 0) * ((*this)(irow, icol) - runMean(imap, 0)) * runInvStdDev(imap, 0) + bias(imap, 0);
}
}
}
else
{
#pragma omp parallel for
for (long icol = 0; icol < out.GetNumCols(); icol++)
{
for (long irow = 0; irow < out.GetNumRows(); irow++)
{
out(irow, icol) = scale(irow, 0) * ((*this)(irow, icol) - runMean(irow, 0)) * runInvStdDev(irow, 0) + bias(irow, 0);
}
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const
{
UNUSED(in); UNUSED(grad); UNUSED(scale); UNUSED(saveMean); UNUSED(saveInvStdDev); UNUSED(scaleGrad); UNUSED(biasGrad);
RuntimeError("Batch normalization training on CPU is not yet implemented.");
}
#pragma region Static BLAS Functions
/// <summary>Matrix-matrix multiply with col-major matrices (a and b may be transposed): c = alpha * op(a) * op(b) + beta*c</summary>
@ -5943,4 +6194,8 @@ template void CPUMatrix<char>::SetValue(const char);
template void CPUMatrix<char>::SetValue(const size_t numRows, const size_t numCols, char* pArray, size_t matrixFlags);
template void CPUMatrix<char>::SetValue(CPUMatrix<char> const&);
template void CPUMatrix<char>::Resize(const size_t numRows, const size_t numCols, bool growOnly);
} } }
template CPUMatrix<int>::CPUMatrix(const size_t, const size_t, int*, const size_t);
template CPUMatrix<int>::~CPUMatrix();
}}}

View file

@ -317,6 +317,27 @@ public:
const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
void ConvolutionForward(const CPUMatrix<ElemType>& kernel, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const;
void ConvolutionBackwardData(const CPUMatrix<ElemType>& kernel, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& grad) const;
void ConvolutionBackwardKernel(const CPUMatrix<ElemType>& in, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& kernelGrad) const;
void MaxPoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const;
void MaxPoolingBackward(const CPUMatrix<ElemType>& out, const CPUMatrix<ElemType>& in,
const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
CPUMatrix<ElemType>& grad) const;
void AveragePoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const;
void AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
CPUMatrix<ElemType>& grad) const;
void BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor, CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runInvStdDev,
CPUMatrix<ElemType>& out, double epsilon, CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const;
public:
static int SetNumThreads(int numThreads); // note: this does not depend on <ElemType>, i.e. you can call it on any <ElemType>
@ -457,4 +478,5 @@ private:
typedef CPUMatrix<float> CPUSingleMatrix;
typedef CPUMatrix<double> CPUDoubleMatrix;
} } }
}}}

View file

@ -1335,4 +1335,7 @@ template CPUSparseMatrix<char> CPUSparseMatrix<char>::ColumnSlice(size_t startCo
template CPUMatrix<char> CPUSparseMatrix<char>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;
template CPUSparseMatrix<char>& CPUSparseMatrix<char>::operator=(const CPUSparseMatrix<char>& deepCopyFrom);
template CPUSparseMatrix<int>::CPUSparseMatrix(const MatrixFormat, const size_t, const size_t, const size_t);
template CPUSparseMatrix<int>::~CPUSparseMatrix();
}}}

View file

@ -0,0 +1,963 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4100)
#pragma warning(disable : 4127)
#pragma warning(disable : 4201)
#pragma warning(disable : 4515)
#endif
#include <cub/cub.cuh>
#ifdef _MSC_VER
#pragma warning(pop)
#endif
namespace Microsoft { namespace MSR { namespace CNTK {
size_t RoundUpToMultiple(size_t n, size_t blockSize)
{
return (n + blockSize - 1) / blockSize;
}
cudaError_t GetLastCudaError()
{
cudaError_t prelaunchErr = cudaGetLastError();
assert(cudaSuccess == prelaunchErr);
if (prelaunchErr != cudaSuccess)
return prelaunchErr;
#ifndef NO_SYNC
cudaError_t executionErr = cudaStreamSynchronize(GetStream());
assert(cudaSuccess == executionErr);
if (executionErr != cudaSuccess)
return executionErr;
#endif
return cudaSuccess;
}
template <int U, typename T>
__device__ __forceinline__ void LoadValues(const T* src, T dst[U])
{
#pragma unroll
for (int i = 0; i < U; i++)
dst[i] = src[i];
}
template <>
__device__ __forceinline__ void LoadValues<2, float>(const float* src, float dst[2])
{
// src must be aligned at an 8-byte boundary.
assert(reinterpret_cast<uintptr_t>(src) % sizeof(float2) == 0);
auto v = *(const float2*)src;
dst[0] = v.x;
dst[1] = v.y;
}
template <>
__device__ __forceinline__ void LoadValues<4, float>(const float* src, float dst[4])
{
// src must be aligned at a 16-byte boundary.
// Note: sizeof(dst) decays to the size of a pointer here, so spell out the intended 16-byte check.
assert(reinterpret_cast<uintptr_t>(src) % sizeof(float4) == 0);
// Can do the following instead (use ld.global.nc.* on CC 3.5+):
// asm volatile("ld.global.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(v.x), "=f"(v.y), "=f"(v.z), "=f"(v.w) : "l"(src));
// Similar for shared memory (e.g. ld.shared.*)
auto v = *(const float4*)src;
dst[0] = v.x;
dst[1] = v.y;
dst[2] = v.z;
dst[3] = v.w;
}
template <int U, typename T>
__device__ __forceinline__ void StoreValues(const T src[U], T* dst)
{
#pragma unroll
for (int i = 0; i < U; i++)
dst[i] = src[i];
}
template <>
__device__ __forceinline__ void StoreValues<2, float>(const float src[2], float* dst)
{
// dst must be aligned at an 8-byte boundary.
assert(reinterpret_cast<uintptr_t>(dst) % sizeof(float2) == 0);
float2 v;
v.x = src[0];
v.y = src[1];
*(reinterpret_cast<float2*>(dst)) = v;
}
template <>
__device__ __forceinline__ void StoreValues<4, float>(const float src[4], float* dst)
{
// dst must be aligned at a 16-byte boundary.
// Note: sizeof(src) decays to the size of a pointer here, so spell out the intended 16-byte check.
assert(reinterpret_cast<uintptr_t>(dst) % sizeof(float4) == 0);
float4 v;
v.x = src[0];
v.y = src[1];
v.z = src[2];
v.w = src[3];
*(reinterpret_cast<float4*>(dst)) = v;
}
template <typename T>
__device__ __forceinline__ T Shuffle(T input, int srcLane)
{
// shfl is supported only on Kepler+. We really don't care about Fermi anymore but our build still has sm_20.
#if __CUDA_ARCH__ >= 300
return cub::ShuffleIndex(input, srcLane);
#else
// REVIEW alexeyk: make static_assert once we remove SM 2.0 support from our build.
assert(false);
return input;
#endif
}
namespace Operations
{
__device__ float RSqrt(float a)
{
// REVIEW alexeyk: rsqrtf is just one MUFU.RSQ instruction so it's faster than
// __frsqrt_rn intrinsic which performs round-to-nearest-even rounding which adds ~10 other instructions.
// __frsqrt_rn is unbiased rounding though, need to verify whether it is a better choice for BN implementation.
//return __frsqrt_rn(a);
return rsqrtf(a);
}
__device__ double RSqrt(double a)
{
return rsqrt(a);
}
}
// This function is used to select the correct unroll factor at runtime.
// REVIEW alexeyk: ask our C++ gurus (Marko/Amit) if there is a better way.
template <template <int> class Func, typename T, typename ...Targs>
void Call(size_t vectorSize, Targs... args)
{
if ((vectorSize % 4) == 0)
Func<4>::template Call<T>(args...);
else if ((vectorSize % 2) == 0)
Func<2>::template Call<T>(args...);
else
Func<1>::template Call<T>(args...);
}
//--------------------------------------------------------------------
// Mean and variance computation
//--------------------------------------------------------------------
// The kernel implements an online, parallel, and numerically stable algorithm
// for computing the batch mean and variance (here the inverse standard deviation) in one pass over the data.
// It uses the algorithms of Knuth/Welford and Chan et al (http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf).
// In short, the algorithm has 2 steps:
// 1. Each thread strides over the input and computes a running mean and
//    m2 value (used to compute the variance at the end) - Welford's algorithm.
// 2. A parallel reduction (Chan's algorithm) is performed by columns (note that the
//    thread block and grid X dimensions go along the vector, and the Y dimension - along the batch).
// As a result, each block has 2 * blockDim.x (mean and inverse stddev) values to write at the end.
//
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, const ElemType* x, double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev,
double epsilon, ElemType* xMean, ElemType* xInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert((vectorSize % U) == 0);
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
assert(::isfinite(epsilon) && epsilon > 0);
assert(::isfinite(expAvgFactor) && expAvgFactor > 0);
int irowSrcBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
if (irowSrcBase >= vectorSize)
return;
assert(irowSrcBase + U <= vectorSize);
int n = 0;
ElemType mean[U];
ElemType m2[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
mean[k] = 0;
m2[k] = 0;
}
int icolSrc = threadIdx.y;
const ElemType* psrc = x + static_cast<size_t>(icolSrc) * vectorSize + irowSrcBase;
// Stride over all vectors in the batch.
for (; icolSrc < batchSize; icolSrc += BlockDimY)
{
n++;
ElemType curVal[U];
LoadValues<U>(psrc, curVal);
// No need for separate unrolling, SASS looks good.
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType d = curVal[k] - mean[k];
// REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
// Using precise math slows down the code by about 40%.
mean[k] += d / n;
m2[k] += d * (curVal[k] - mean[k]);
}
psrc += vectorSize * BlockDimY;
}
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
const int laneId = tid & 0x1f;
// First, reduce within warp using shuffle.
if (n > 0)
{
#pragma unroll
for (int i = 1; i < CUB_PTX_WARP_THREADS / BlockDimX; i *= 2)
{
int srcLane = laneId + BlockDimX * i;
int n2 = Shuffle(n, srcLane);
int nsum = n + n2;
ElemType d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = Shuffle(mean[k], srcLane) - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += Shuffle(m2[k], srcLane) + d[k] * n * dScaled;
}
n = nsum;
}
}
// Storage for each warp in a thread block. First warp ("accumulator") holds
// final results so it does not need shared memory.
const int cwarp = BlockDimX * BlockDimY / CUB_PTX_WARP_THREADS;
__shared__ ElemType meanRes[BlockDimX * U][cwarp - 1];
__shared__ ElemType m2Res[BlockDimX * U][cwarp - 1];
__shared__ int nRes[cwarp - 1];
// Each warp (except warp0) will write accumulated results to shared memory.
const int iwarp = tid / CUB_PTX_WARP_THREADS;
if (iwarp > 0 && laneId < BlockDimX)
{
if (laneId == 0)
nRes[iwarp - 1] = n;
#pragma unroll
for (int k = 0; k < U; k++)
{
meanRes[laneId * U + k][iwarp - 1] = mean[k];
m2Res[laneId * U + k][iwarp - 1] = m2[k];
}
}
__syncthreads();
// Accumulate and write final results.
// REVIEW alexeyk: see if atomicAdd can be used instead, do perf comparison.
if (threadIdx.y == 0)
{
// Use simple loop as number of warps is small, 8 at max.
#pragma unroll
for (int i = 0; i < cwarp - 1; i++)
{
int n2 = nRes[i];
int nsum = n + n2;
ElemType d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = meanRes[threadIdx.x * U + k][i] - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += m2Res[threadIdx.x * U + k][i] + d[k] * n * dScaled;
}
n = nsum;
}
size_t idxDstBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
// Store mean and running mean.
StoreValues<U>(mean, xMean + idxDstBase);
if (expAvgFactor == 1)
StoreValues<U>(mean, runMean + idxDstBase);
else
{
ElemType run[U];
LoadValues<U>(runMean + idxDstBase, run);
#pragma unroll
for (int k = 0; k < U; k++)
run[k] = expAvgFactor * mean[k] + (1.0 - expAvgFactor) * run[k];
StoreValues<U>(run, runMean + idxDstBase);
}
// Store inv std dev and its running version.
#pragma unroll
for (int k = 0; k < U; k++)
{
m2[k] = Operations::RSqrt(static_cast<ElemType>(m2[k] / batchSize + epsilon));
}
StoreValues<U>(m2, xInvStdDev + idxDstBase);
if (expAvgFactor == 1)
StoreValues<U>(m2, runInvStdDev + idxDstBase);
else
{
ElemType run[U];
LoadValues<U>(runInvStdDev + idxDstBase, run);
#pragma unroll
for (int k = 0; k < U; k++)
run[k] = expAvgFactor * m2[k] + (1.0 - expAvgFactor) * run[k];
StoreValues<U>(run, runInvStdDev + idxDstBase);
}
}
}
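For reference, an illustrative host-side restatement (not part of this file) of the two steps named in the comment above: Welford's online update per thread, and the Chan et al. merge performed by the warp/block reduction:

// Sketch only: sequential forms of the per-thread update and the pairwise merge.
struct Moments { double mean = 0, m2 = 0; long long n = 0; };

inline void WelfordUpdate(Moments& a, double x)       // accumulate one sample
{
    a.n++;
    double d = x - a.mean;
    a.mean += d / a.n;
    a.m2 += d * (x - a.mean);
}

inline Moments ChanMerge(Moments a, const Moments& b) // combine two partial results
{
    if (b.n == 0)
        return a;
    long long n = a.n + b.n;
    double d = b.mean - a.mean;
    a.mean += d * b.n / n;
    a.m2 += b.m2 + d * (d * a.n / n * b.n);           // == d^2 * nA * nB / (nA + nB), as in the kernel
    a.n = n;
    return a;
}
// Finally, variance = m2 / N and invStdDev = 1 / sqrt(variance + epsilon), as computed above.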
// This kernel is very similar to kComputeBatchMeanAndInvStdDev except it reduces not just over N (minibatch)
// but also W and H dimensions.
// REVIEW alexeyk: is it possible to combine this and previous kernel into a single kernel without hurting performance/readability much?
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatialSize, int batchSize, const ElemType* x,
double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev,
double epsilon, ElemType* xMean, ElemType* xInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
assert((spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
assert(::isfinite(expAvgFactor) && expAvgFactor > 0);
assert(::isfinite(epsilon) && epsilon > 0);
int irowSrcBase = blockIdx.x * spatialSize + threadIdx.x * U;
if (irowSrcBase >= vectorSize)
return;
assert(irowSrcBase + U <= vectorSize);
int irowSrcLim = (blockIdx.x + 1) * spatialSize;
int n = 0;
ElemType mean[U];
ElemType m2[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
mean[k] = 0;
m2[k] = 0;
}
int icolSrc = threadIdx.y;
const ElemType* psrcBase = x + static_cast<size_t>(icolSrc) * vectorSize + irowSrcBase;
// Stride over all vectors in the batch.
for (; icolSrc < batchSize; icolSrc += BlockDimY)
{
const ElemType* psrc = psrcBase;
// Stride over all values in feature map (W and H dimensions).
for (int irowSrc = irowSrcBase; irowSrc < irowSrcLim; irowSrc += BlockDimX * U, psrc += BlockDimX * U)
{
n++;
ElemType curVal[U];
LoadValues<U>(psrc, curVal);
// No need for separate unrolling, SASS looks good.
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType d = curVal[k] - mean[k];
// REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
// Using precise math slows down the code by about 40%.
mean[k] += d / n;
m2[k] += d * (curVal[k] - mean[k]);
}
}
psrcBase += vectorSize * BlockDimY;
}
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
const int laneId = tid & 0x1f;
// First, reduce within warp using shuffle.
if (n > 0)
{
#pragma unroll
for (int i = 1; i < CUB_PTX_WARP_THREADS; i *= 2)
{
int srcLane = laneId + i;
int n2 = Shuffle(n, srcLane);
int nsum = n + n2;
ElemType d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = Shuffle(mean[k], srcLane) - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += Shuffle(m2[k], srcLane) + d[k] * n * dScaled;
}
n = nsum;
}
}
// Storage for each warp in a thread block. First warp ("accumulator") holds
// final results so it does not need shared memory.
const int cwarp = BlockDimX * BlockDimY / CUB_PTX_WARP_THREADS;
__shared__ ElemType meanRes[U][cwarp - 1];
__shared__ ElemType m2Res[U][cwarp - 1];
__shared__ int nRes[cwarp - 1];
// Each warp (except warp0) will write accumulated results to shared memory.
const int iwarp = tid / CUB_PTX_WARP_THREADS;
if (iwarp > 0 && laneId == 0)
{
nRes[iwarp - 1] = n;
#pragma unroll
for (int k = 0; k < U; k++)
{
meanRes[k][iwarp - 1] = mean[k];
m2Res[k][iwarp - 1] = m2[k];
}
}
__syncthreads();
// One thread will accumulate and write final results.
if (tid == 0)
{
// Use simple loop as number of warps is small, 8 at max.
#pragma unroll
for (int i = 0; i < cwarp - 1; i++)
{
int n2 = nRes[i];
int nsum = n + n2;
ElemType d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = meanRes[k][i] - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += m2Res[k][i] + d[k] * n * dScaled;
}
n = nsum;
}
// Final step - accumulate results in mean[0] and m2[0].
// REVIEW alexeyk: move outside of the loop, before storing values to smem.
#pragma unroll
for (int k = 1; k < U; k++)
{
ElemType d = mean[k] - mean[0];
ElemType dScaled = d * n / (n + k * n);
mean[0] += dScaled;
m2[0] += m2[k] + d * k * n * dScaled;
}
xMean[blockIdx.x] = mean[0];
runMean[blockIdx.x] = (expAvgFactor == 1) ? mean[0] : (expAvgFactor * mean[0] + (1.0 - expAvgFactor) * runMean[blockIdx.x]);
m2[0] = Operations::RSqrt(static_cast<ElemType>(m2[0] / (batchSize * spatialSize) + epsilon));
xInvStdDev[blockIdx.x] = m2[0];
runInvStdDev[blockIdx.x] = (expAvgFactor == 1) ? m2[0] : (expAvgFactor * m2[0] + (1.0 - expAvgFactor) * runInvStdDev[blockIdx.x]);
}
}
// The struct is used by the Call function to select the proper template at runtime based on the size of the vector.
// The same pattern is used in the other, similar structs below.
template <int U>
struct ComputeBatchMeanAndInvStdDev
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev,
double epsilon, ElemType* xMean, ElemType* xInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create grid with only one block in y(batch)-dimension as kernel uses striding.
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
kComputeBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(batchSize),
x, expAvgFactor, runMean, runInvStdDev, epsilon, xMean, xInvStdDev);
}
};
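// A hypothetical call site, to illustrate the Call dispatch helper defined earlier (the real call sites
// live in the CNTK batch-norm engine wrapper, which is not part of this excerpt):
//
//     // Dispatches to ComputeBatchMeanAndInvStdDev<4>, <2>, or <1> depending on the alignment of vectorSize.
//     // Note that vectorSize is passed twice: once for unroll selection, once as the kernel argument.
//     Call<ComputeBatchMeanAndInvStdDev, float>(vectorSize, vectorSize, batchSize, x, expAvgFactor,
//                                               runMean, runInvStdDev, epsilon, xMean, xInvStdDev, stream);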
template <int U>
struct ComputeSpatialBatchMeanAndInvStdDev
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x,
double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev,
double epsilon, ElemType* xMean, ElemType* xInvStdDev, cudaStream_t stream)
{
assert((vectorSize % spatialSize) == 0);
assert((spatialSize % U) == 0);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create grid with only one block in y(batch)-dimension as kernel uses striding.
// Each thread block processes a single whole feature map independently (i.e. reduces over W, H and N dimensions).
auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
kComputeSpatialBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize),
x, expAvgFactor, runMean, runInvStdDev,epsilon, xMean, xInvStdDev);
}
};
//--------------------------------------------------------------------
// Forward propagation
// All functions accept input/output tensors in column-major format where each column is one vector of a minibatch.
// In the convolutional case (i.e. spatial=true), each vector is in CHW format where the W dimension has stride = 1.
// Tensors for biases and inverse stddevs have dimensions equal to the vector dimension in the non-convolutional case (i.e. spatial=false),
// or Cx1x1 in the convolutional case.
//--------------------------------------------------------------------
template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType>
__global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int batchSize, const ElemType* x, ElemType* y,
const ElemType* bnScale, const ElemType* bnBias, const ElemType* batchMean, const ElemType* batchInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
assert((vectorSize % U) == 0);
assert(!Spatial || (spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
int irowBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
if (irowBase >= vectorSize)
return;
assert(irowBase + U <= vectorSize);
__shared__ ElemType meanS[BlockDimX * U];
__shared__ ElemType invStdDevS[BlockDimX * U];
__shared__ ElemType scaleS[BlockDimX * U];
__shared__ ElemType biasS[BlockDimX * U];
int offs = threadIdx.x * U;
// REVIEW alexeyk: optimize smem usage, reduce transaction count (is it worth it?).
if (threadIdx.y == 0)
{
if (Spatial)
{
#pragma unroll
for (int k = 0; k < U; k++)
{
int imap = (irowBase + k) / spatialSize;
meanS[offs + k] = batchMean[imap];
invStdDevS[offs + k] = batchInvStdDev[imap];
scaleS[offs + k] = bnScale[imap];
biasS[offs + k] = bnBias[imap];
}
}
else
{
LoadValues<U>(batchMean + irowBase, meanS + offs);
LoadValues<U>(batchInvStdDev + irowBase, invStdDevS + offs);
LoadValues<U>(bnScale + irowBase, scaleS + offs);
LoadValues<U>(bnBias + irowBase, biasS + offs);
}
}
__syncthreads();
ElemType mean[U];
ElemType invStdDev[U];
ElemType scale[U];
ElemType bias[U];
LoadValues<U>(meanS + offs, mean);
LoadValues<U>(invStdDevS + offs, invStdDev);
LoadValues<U>(scaleS + offs, scale);
LoadValues<U>(biasS + offs, bias);
int icol = blockIdx.y * BlockDimY + threadIdx.y;
size_t startOffs = static_cast<size_t>(icol) * vectorSize + irowBase;
const ElemType* psrc = x + startOffs;
ElemType* pdst = y + startOffs;
size_t stride = static_cast<size_t>(gridDim.y * BlockDimY) * vectorSize;
for (; icol < batchSize; icol += gridDim.y * BlockDimY, psrc += stride, pdst += stride)
{
ElemType val[U];
LoadValues<U>(psrc, val);
#pragma unroll
for (int k = 0; k < U; k++)
{
val[k] = scale[k] * (val[k] - mean[k]) * invStdDev[k] + bias[k];
}
StoreValues<U>(val, pdst);
}
}
template <int U>
struct NormalizeBatchTraining
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, ElemType* y,
const ElemType* bnScale, const ElemType* bnBias, const ElemType* batchMean, const ElemType* batchInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create a grid that uses striding in the y-dimension to cover the whole minibatch.
auto gdim = dim3((unsigned int)RoundUpToMultiple(vectorSize, BlockDimX * U));
if (spatial)
{
kNormalizeBatchTraining<BlockDimX, BlockDimY, true, U><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize, x, y, bnScale, bnBias,
batchMean, batchInvStdDev);
}
else
{
kNormalizeBatchTraining<BlockDimX, BlockDimY, false, U><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize, x, y, bnScale, bnBias,
batchMean, batchInvStdDev);
}
}
};
//--------------------------------------------------------------------
// Backpropagation
// BatchNormalizationBackward back-propagates derivatives of batch normalization function
// with respect to the inputs and scale and bias parameters.
// All tensor dimensions and assumptions are the same as in case of forward propagation.
//--------------------------------------------------------------------
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dScale, ElemType* dBias,
const ElemType* saveMean, const ElemType* saveInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
static_assert(((BlockDimY - 1) & BlockDimY) == 0, "BlockDimY must be a power of 2.");
assert((vectorSize % U) == 0);
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
// REVIEW alexeyk: first part looks very similar to kComputeBatchMeanAndInvStdDev, any chance to refactor?
int irowSrcBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
if (irowSrcBase >= vectorSize)
return;
assert(irowSrcBase + U <= vectorSize);
ElemType mean[U];
ElemType invStdDev[U];
__shared__ ElemType meanS[BlockDimX * U];
__shared__ ElemType invStdDevS[BlockDimX * U];
// Read mean and inv std dev.
if (threadIdx.y == 0)
{
LoadValues<U>(saveMean + irowSrcBase, mean);
LoadValues<U>(saveInvStdDev + irowSrcBase, invStdDev);
StoreValues<U>(mean, &meanS[threadIdx.x * U]);
StoreValues<U>(invStdDev, &invStdDevS[threadIdx.x * U]);
}
__syncthreads();
if (threadIdx.y != 0)
{
LoadValues<U>(&meanS[threadIdx.x * U], mean);
LoadValues<U>(&invStdDevS[threadIdx.x * U], invStdDev);
}
ElemType ds[U];
ElemType db[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] = 0;
db[k] = 0;
}
int icolSrc = threadIdx.y;
size_t startOffs = static_cast<size_t>(icolSrc) * vectorSize + irowSrcBase;
const ElemType* px = x + startOffs;
const ElemType* pdy = dy + startOffs;
size_t stride = static_cast<size_t>(vectorSize) * BlockDimY;
// Stride over all vectors in the batch.
for (; icolSrc < batchSize; icolSrc += BlockDimY, px += stride, pdy += stride)
{
ElemType curX[U];
ElemType curdY[U];
LoadValues<U>(px, curX);
LoadValues<U>(pdy, curdY);
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] += curdY[k] * (curX[k] - mean[k]) * invStdDev[k];
db[k] += curdY[k];
}
}
// Final reduction.
__shared__ ElemType dsS[BlockDimY][BlockDimX * U];
__shared__ ElemType dbS[BlockDimY][BlockDimX * U];
StoreValues<U>(ds, &dsS[threadIdx.y][threadIdx.x * U]);
StoreValues<U>(db, &dbS[threadIdx.y][threadIdx.x * U]);
__syncthreads();
// Very simple block reduction. As the block y dim is small (e.g. 16), the loop
// executes only a few times (e.g. 4), so the performance is good.
// Could potentially be improved by using shuffle instructions (as in kComputeBatchMeanAndInvStdDev).
#pragma unroll
for (int y = BlockDimY / 2; y > 0; y /= 2)
{
if (threadIdx.y < y)
{
#pragma unroll
for (int k = 0; k < U; k++)
{
dsS[threadIdx.y][threadIdx.x * U + k] += dsS[threadIdx.y + y][threadIdx.x * U + k];
dbS[threadIdx.y][threadIdx.x * U + k] += dbS[threadIdx.y + y][threadIdx.x * U + k];
}
}
// Note: __syncthreads() must be reached by all threads in the block, so keep it outside the divergent branch.
__syncthreads();
}
// Write results.
if (threadIdx.y == 0)
{
#pragma unroll
for (int k = 0; k < U; k++)
{
dScale[irowSrcBase + k] = dsS[0][threadIdx.x * U + k];
dBias[irowSrcBase + k] = dbS[0][threadIdx.x * U + k];
}
}
}
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
assert((spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
int irowBase = blockIdx.x * spatialSize + threadIdx.x * U;
if (irowBase >= vectorSize)
return;
assert(irowBase + U <= vectorSize);
int irowLim = (blockIdx.x + 1) * spatialSize;
ElemType mean;
ElemType invStdDev;
__shared__ ElemType meanS;
__shared__ ElemType invStdDevS;
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
// Read mean and inv std dev.
if (tid == 0)
{
meanS = saveMean[blockIdx.x];
invStdDevS = saveInvStdDev[blockIdx.x];
}
__syncthreads();
// All threads (including tid 0, whose locals would otherwise stay uninitialized) read the shared copies.
mean = meanS;
invStdDev = invStdDevS;
ElemType ds[U];
ElemType db[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] = 0;
db[k] = 0;
}
int icolSrc = threadIdx.y;
size_t startOffs = static_cast<size_t>(icolSrc) * vectorSize + irowBase;
const ElemType* pxBase = x + startOffs;
const ElemType* pdyBase = dy + startOffs;
size_t stride = static_cast<size_t>(vectorSize) * BlockDimY;
// Stride over all vectors in the batch.
for (; icolSrc < batchSize; icolSrc += BlockDimY, pxBase += stride, pdyBase += stride)
{
const ElemType* px = pxBase;
const ElemType* pdy = pdyBase;
// Stride over all values in feature map (W and H dimensions).
for (int irow = irowBase; irow < irowLim; irow += BlockDimX * U, px += BlockDimX * U, pdy += BlockDimX * U)
{
ElemType curX[U];
ElemType curdY[U];
LoadValues<U>(px, curX);
LoadValues<U>(pdy, curdY);
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] += curdY[k] * (curX[k] - mean) * invStdDev;
db[k] += curdY[k];
}
}
}
__syncthreads();
using BlockReduce = cub::BlockReduce<ElemType, BlockDimX, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BlockDimY>;
// Note: must use separate temp storages for each reduction.
__shared__ typename BlockReduce::TempStorage tmp1;
ElemType dsRes = BlockReduce(tmp1).Sum(ds);
__shared__ typename BlockReduce::TempStorage tmp2;
ElemType dbRes = BlockReduce(tmp2).Sum(db);
if (tid == 0)
{
dScale[blockIdx.x] = dsRes;
dBias[blockIdx.x] = dbRes;
}
}
template <int U>
struct ComputeScaleAndBiasGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create a grid that uses striding in the y-dimension to cover the whole minibatch.
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
kComputeScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(batchSize), x, dy, dScale, dBias, saveMean, saveInvStdDev);
}
};
template <int U>
struct ComputeSpatialScaleAndBiasGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
{
assert((spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create a grid that uses striding in the y-dimension to cover the whole minibatch.
auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
kComputeSpatialScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dScale, dBias, saveMean, saveInvStdDev);
}
};
template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType>
__global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, const ElemType* dScale, const ElemType* dBias,
const ElemType* saveMean, const ElemType* saveInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.z == 1);
assert((vectorSize % U) == 0);
assert(Spatial || spatialSize == 1);
assert(!Spatial || (spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
int irowBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
if (irowBase >= vectorSize)
return;
assert(irowBase + U <= vectorSize);
ElemType scale[U];
ElemType ds[U];
ElemType db[U];
ElemType mean[U];
ElemType invStdDev[U];
// REVIEW alexeyk: here we're wasting some bandwidth but this might be ok as it's a one-timer.
if (Spatial)
{
#pragma unroll
for (int k = 0; k < U; k++)
{
int imap = (irowBase + k) / spatialSize;
scale[k] = bnScale[imap];
ds[k] = dScale[imap];
db[k] = dBias[imap];
mean[k] = saveMean[imap];
invStdDev[k] = saveInvStdDev[imap];
}
}
else
{
LoadValues<U>(bnScale + irowBase, scale);
LoadValues<U>(dScale + irowBase, ds);
LoadValues<U>(dBias + irowBase, db);
LoadValues<U>(saveMean + irowBase, mean);
LoadValues<U>(saveInvStdDev + irowBase, invStdDev);
}
int icol = blockIdx.y * BlockDimY + threadIdx.y;
size_t startOffs = static_cast<size_t>(icol) * vectorSize + irowBase;
const ElemType* px = x + startOffs;
const ElemType* pdy = dy + startOffs;
ElemType* pdx = dx + startOffs;
size_t stride = static_cast<size_t>(gridDim.y * BlockDimY) * vectorSize;
for (; icol < batchSize; icol += gridDim.y * BlockDimY, px += stride, pdy += stride, pdx += stride)
{
ElemType xCur[U];
ElemType dyCur[U];
ElemType dxCur[U];
LoadValues<U>(px, xCur);
LoadValues<U>(pdy, dyCur);
LoadValues<U>(pdx, dxCur);
// From the BN paper, dL/dxi is a sum of three terms: dL/dxi = t1 + t2 + t3
// After simplification, they become the following:
// 1. t1 = scale * dL/dyi * invStdDev
// 2. t2 = (-scale / m) * invStdDev * xHat * dL/dScale
// 3. t3 = (-scale / m) * invStdDev * dL/dBias (for this one note that Sum(xHat) == 0)
// Simplifying this a bit more, we get the formula below.
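// Explicitly, with xHat_i = (x_i - mean) * invStdDev, dL/dScale = Sum_i(dL/dy_i * xHat_i) and dL/dBias = Sum_i(dL/dy_i):
//   dL/dx_i = scale * invStdDev * (dL/dy_i - (xHat_i * dL/dScale + dL/dBias) / m)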
ElemType val[U];
int m = Spatial ? batchSize * spatialSize : batchSize;
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType xNorm = (xCur[k] - mean[k]) * invStdDev[k];
val[k] = dxCur[k] + (scale[k] * invStdDev[k]) * (dyCur[k] - (xNorm * ds[k] + db[k]) / m);
}
StoreValues<U>(val, pdx);
}
}
template <int U>
struct BackpropagateBatchNormGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, const ElemType* dScale, const ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)),
static_cast<unsigned int>(RoundUpToMultiple(batchSize, BlockDimY)));
if (spatial)
{
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, dScale, dBias, saveMean, saveInvStdDev);
}
else
{
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, false, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, dScale, dBias, saveMean, saveInvStdDev);
}
}
};
} } }

Source/Math/Convolution.cuh (new file, 272 lines)
View file

@ -0,0 +1,272 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <math_constants.h>
namespace Microsoft { namespace MSR { namespace CNTK {
template <typename ElemType>
__global__ void kConvolutionForward(int batchSize, const ElemType* __restrict__ kernel,
const int* mpRowCol, const int* mpRowIwht,
const int* mpRowRun, const int* __restrict__ runs,
const ElemType* __restrict__ src, int srcVecSize,
ElemType* dst, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= dstVecSize)
return;
src += blockIdx.y * srcVecSize;
dst += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
int ivBase = mpRowIwht[row];
assert(0 <= colBase && colBase < srcVecSize);
ElemType sum = 0;
int i0 = mpRowRun[row];
int skip = runs[i0++];
int size = runs[i0++];
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs[imask + i] == 0)
continue;
int dcol = runs[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
sum += kernel[ivBase + skip + i] * src[colBase + dcol];
}
dst[row] = sum;
src += blockDim.y * srcVecSize;
dst += blockDim.y * dstVecSize;
}
}
template <typename ElemType>
__global__ void kConvolutionBackwardData(int batchSize, const ElemType* __restrict__ kernel,
const int* mpRowCol, const int* mpRowIwht,
const int* mpRowRun, const int* __restrict__ runs,
const ElemType* __restrict__ srcGrad, int srcVecSize,
ElemType* grad, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= srcVecSize)
return;
srcGrad += blockIdx.y * srcVecSize;
grad += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
int ivBase = mpRowIwht[row];
assert(0 <= colBase && colBase < dstVecSize);
ElemType g = srcGrad[row];
int i0 = mpRowRun[row];
int skip = runs[i0++];
int size = runs[i0++];
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs[imask + i] == 0)
continue;
int dcol = runs[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
atomicAdd(&grad[colBase + dcol], g * kernel[ivBase + skip + i]);
}
srcGrad += blockDim.y * srcVecSize;
grad += blockDim.y * dstVecSize;
}
}
template <typename ElemType>
__global__ void kConvolutionBackwardKernel(int batchSize, int inVecSize, int outVecSize,
const ElemType* __restrict__ in,
const int* mpRowCol, const int* mpRowIwht,
const int* mpRowRun, const int* __restrict__ runs,
const ElemType* __restrict__ srcGrad,
ElemType* kernelGrad)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= outVecSize)
return;
in += blockIdx.y * inVecSize;
srcGrad += blockIdx.y * outVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
int ivBase = mpRowIwht[row];
assert(0 <= colBase && colBase < inVecSize);
ElemType g = srcGrad[row];
int i0 = mpRowRun[row];
int skip = runs[i0++];
int size = runs[i0++];
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs[imask + i] == 0)
continue;
int dcol = runs[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < inVecSize);
atomicAdd(&kernelGrad[ivBase + skip + i], g * in[colBase + dcol]);
}
in += blockDim.y * inVecSize;
srcGrad += blockDim.y * outVecSize;
}
}
template <typename ElemType>
__global__ void kMaxPoolingForward(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ src, int srcVecSize,
ElemType* dst, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= dstVecSize)
return;
src += blockIdx.y * srcVecSize;
dst += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < srcVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
ElemType res = src[colBase + indices[i0]];
for (int i = 1; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
res = max(res, src[colBase + dcol]);
}
dst[row] = res;
src += blockDim.y * srcVecSize;
dst += blockDim.y * dstVecSize;
}
}
template <typename ElemType>
__global__ void kMaxPoolingBackward(int batchSize, const ElemType* out, const ElemType* in,
const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ srcGrad, int srcVecSize,
ElemType* grad, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= srcVecSize)
return;
in += blockIdx.y * dstVecSize;
out += blockIdx.y * srcVecSize;
srcGrad += blockIdx.y * srcVecSize;
grad += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < dstVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
assert(size > 0);
ElemType g = srcGrad[row];
ElemType m = out[row];
for (int i = 0; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
if (in[colBase + dcol] >= m)
atomicAdd(&grad[colBase + dcol], g);
}
in += blockDim.y * dstVecSize;
out += blockDim.y * srcVecSize;
srcGrad += blockDim.y * srcVecSize;
grad += blockDim.y * dstVecSize;
}
}
template <typename ElemType>
__global__ void kAveragePoolingForward(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ src, int srcVecSize,
ElemType* dst, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= dstVecSize)
return;
src += blockIdx.y * srcVecSize;
dst += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < srcVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
ElemType sum = 0;
for (int i = 0; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
sum += src[colBase + dcol];
}
dst[row] = sum / size;
src += blockDim.y * srcVecSize;
dst += blockDim.y * dstVecSize;
}
}
template <typename ElemType>
__global__ void kAveragePoolingBackward(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ srcGrad, int srcVecSize,
ElemType* grad, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= srcVecSize)
return;
srcGrad += blockIdx.y * srcVecSize;
grad += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < dstVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
assert(size > 0);
ElemType g = srcGrad[row] / size;
for (int i = 0; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
atomicAdd(&grad[colBase + dcol], g);
}
srcGrad += blockDim.y * srcVecSize;
grad += blockDim.y * dstVecSize;
}
}
} } }

View file

@ -5,242 +5,295 @@
#include "stdafx.h"
#include "ConvolutionEngine.h"
#include "CuDnnConvolutionEngine.h"
#include "CuDnnFactories.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
void ConvolutionEngine<ElemType>::Forward(const Tensor4D& inT, const Mat& in, const Filter& filterT, const Mat& filter,
const ConvDesc& convDesc, const Tensor4D& outT, Mat& out, Mat& workspace)
void ConvolutionEngine<ElemType>::Forward(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace)
{
assert(inT.w() * inT.h() * inT.c() == in.GetNumRows());
assert(inT.n() == in.GetNumCols());
assert(filterT.k() == filter.GetNumRows());
assert(filterT.w() * filterT.h() * filterT.c() == filter.GetNumCols());
assert(inT.c() == filterT.c());
assert(outT.c() == filterT.k());
assert(outT.w() * outT.h() * outT.c() == out.GetNumRows());
assert(outT.n() == out.GetNumCols());
EnsureCompatible();
ForwardCore(inT, in, filterT, filter, convDesc, outT, out, workspace);
}
template <class ElemType>
void ConvolutionEngine<ElemType>::BackwardData(const Tensor4D& srcGradT, const Mat& srcGrad, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
const Tensor4D& gradT, Mat& grad, Mat& workspace)
{
assert(srcGradT.w() * srcGradT.h() * srcGradT.c() == srcGrad.GetNumRows());
assert(srcGradT.n() == srcGrad.GetNumCols());
assert(filterT.k() == filter.GetNumRows());
assert(filterT.w() * filterT.h() * filterT.c() == filter.GetNumCols());
assert(srcGradT.c() == filterT.k());
assert(gradT.c() == filterT.c());
assert(gradT.w() * gradT.h() * gradT.c() == grad.GetNumRows());
assert(gradT.n() == grad.GetNumCols());
EnsureCompatible();
BackwardDataCore(srcGradT, srcGrad, filterT, filter, convDesc, gradT, grad, workspace);
}
template <class ElemType>
void ConvolutionEngine<ElemType>::BackwardFilter(const Tensor4D& srcGradT, const Mat& srcGrad, const Tensor4D& inT, const Mat& in, const ConvDesc& convDesc,
const Filter& filterT, Mat& filter, bool allowReuse, Mat& workspace)
{
assert(srcGradT.w() * srcGradT.h() * srcGradT.c() == srcGrad.GetNumRows());
assert(srcGradT.n() == srcGrad.GetNumCols());
assert(inT.w() * inT.h() * inT.c() == in.GetNumRows());
assert(inT.n() == in.GetNumCols());
assert(srcGradT.c() == filterT.k());
assert(inT.c() == filterT.c());
assert(filterT.k() == filter.GetNumRows());
assert(filterT.w() * filterT.h() * filterT.c() == filter.GetNumCols());
EnsureCompatible();
BackwardFilterCore(srcGradT, srcGrad, inT, in, convDesc, filterT, filter, allowReuse, workspace);
}
template <class ElemType>
void ConvolutionEngine<ElemType>::NormalizeBatch(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out,
double epsilon, Mat& saveMean, Mat& saveInvStdDev)
{
const size_t crowIn = inT.w() * inT.h() * inT.c();
if (spatial)
{
assert(scaleBiasT.c() == inT.c());
assert(scaleBiasT.w() == 1);
assert(scaleBiasT.h() == 1);
assert(runMean.GetNumRows() == inT.c());
assert(runInvStdDev.GetNumRows() == inT.c());
}
else
{
assert(scaleBiasT.c() == inT.c());
assert(scaleBiasT.w() == inT.w());
assert(scaleBiasT.h() == inT.h());
assert(runMean.GetNumRows() == crowIn);
assert(runInvStdDev.GetNumRows() == crowIn);
}
assert(scaleBiasT.n() == 1);
assert(crowIn == in.GetNumRows());
assert(crowIn == out.GetNumRows());
assert(inT.n() == in.GetNumCols());
assert(inT.n() == out.GetNumCols());
assert(bias.GetNumCols() == 1);
assert(scale.GetNumCols() == 1);
assert(runMean.GetNumCols() == 1);
assert(runInvStdDev.GetNumCols() == 1);
assert(runMean.GetNumCols() == saveMean.GetNumCols());
assert(runMean.GetNumRows() == saveMean.GetNumRows());
assert(runInvStdDev.GetNumCols() == saveInvStdDev.GetNumCols());
assert(runInvStdDev.GetNumRows() == saveInvStdDev.GetNumRows());
#ifndef _DEBUG
UNUSED(crowIn); // crowIn used only in asserts.
const auto& g = *m_geometry;
assert(g.InputShape().GetNumElements() == in.GetNumRows());
assert(g.OutputShape().GetNumElements() == out.GetNumRows());
size_t batchSize = in.GetNumCols();
assert(batchSize == out.GetNumCols());
// REVIEW alexeyk: add shape-aware asserts?
assert(g.KernelShape().GetNumElements() * g.KernelCount() == kernel.GetNumElements());
#ifdef NDEBUG
UNUSED(g);
UNUSED(batchSize);
#endif
EnsureCompatibleBatchNorm(spatial);
NormalizeBatchCore(inT, in, scaleBiasT, scale, bias, spatial, expAvgFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev);
EnsureCompatible();
EnsureConvolutionInitialized();
ForwardCore(in, kernel, out, workspace);
}
template <class ElemType>
void ConvolutionEngine<ElemType>::NormalizeBatchInference(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, const Mat& runMean, const Mat& runInvStdDev, Mat& out)
void ConvolutionEngine<ElemType>::BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace)
{
const size_t crowIn = inT.w() * inT.h() * inT.c();
if (spatial)
{
assert(scaleBiasT.c() == inT.c());
assert(scaleBiasT.w() == 1);
assert(scaleBiasT.h() == 1);
assert(scaleBiasT.c() == runMean.GetNumRows());
assert(scaleBiasT.c() == runInvStdDev.GetNumRows());
}
else
{
assert(scaleBiasT.c() == inT.c());
assert(scaleBiasT.w() == inT.w());
assert(scaleBiasT.h() == inT.h());
assert(crowIn == runMean.GetNumRows());
assert(crowIn == runInvStdDev.GetNumRows());
}
assert(scaleBiasT.n() == 1);
assert(crowIn == in.GetNumRows());
assert(crowIn == out.GetNumRows());
assert(inT.n() == in.GetNumCols());
assert(inT.n() == out.GetNumCols());
assert(bias.GetNumCols() == 1);
assert(scale.GetNumCols() == 1);
assert(runMean.GetNumCols() == 1);
assert(runInvStdDev.GetNumCols() == 1);
#ifndef _DEBUG
// used only in asserts.
UNUSED(crowIn);
const auto& g = *m_geometry;
assert(g.InputShape().GetNumElements() == grad.GetNumRows());
assert(g.OutputShape().GetNumElements() == srcGrad.GetNumRows());
size_t batchSize = srcGrad.GetNumCols();
assert(batchSize == grad.GetNumCols());
assert(g.KernelShape().GetNumElements() * g.KernelCount() == kernel.GetNumElements());
#ifdef NDEBUG
UNUSED(g);
UNUSED(batchSize);
#endif
EnsureCompatibleBatchNorm(spatial);
NormalizeBatchInferenceCore(inT, in, scaleBiasT, scale, bias, spatial, runMean, runInvStdDev, out);
EnsureCompatible();
EnsureConvolutionInitialized();
BackwardDataCore(srcGrad, kernel, grad, workspace);
}
template <class ElemType>
void ConvolutionEngine<ElemType>::BackwardNormalizeBatch(const Tensor4D& inT, const Mat& in, const Mat& srcGrad, Mat& grad,
const Tensor4D& scaleBiasT, const Mat& scale, bool spatial, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad)
void ConvolutionEngine<ElemType>::BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernel, bool allowReuse, Mat& workspace)
{
const size_t crowIn = inT.w() * inT.h() * inT.c();
if (spatial)
{
assert(scaleBiasT.c() == inT.c());
assert(scaleBiasT.w() == 1);
assert(scaleBiasT.h() == 1);
}
else
{
assert(scaleBiasT.c() == inT.c());
assert(scaleBiasT.w() == inT.w());
assert(scaleBiasT.h() == inT.h());
}
assert(scaleBiasT.n() == 1);
assert(crowIn == in.GetNumRows());
assert(crowIn == srcGrad.GetNumRows());
assert(crowIn == grad.GetNumRows());
assert(inT.n() == in.GetNumCols());
assert(inT.n() == srcGrad.GetNumCols());
assert(inT.n() == grad.GetNumCols());
assert(scaleGrad.GetNumRows() == scale.GetNumRows());
assert(scaleGrad.GetNumCols() == scale.GetNumCols());
assert(biasGrad.GetNumRows() == scale.GetNumRows());
assert(biasGrad.GetNumCols() == scale.GetNumCols());
#ifndef _DEBUG
UNUSED(crowIn); // crowIn used only in asserts.
const auto& g = *m_geometry;
assert(g.InputShape().GetNumElements() == in.GetNumRows());
assert(g.OutputShape().GetNumElements() == srcGrad.GetNumRows());
size_t batchSize = in.GetNumCols();
assert(batchSize == srcGrad.GetNumCols());
assert(g.KernelShape().GetNumElements() * g.KernelCount() == kernel.GetNumElements());
#ifdef NDEBUG
UNUSED(g);
UNUSED(batchSize);
#endif
EnsureCompatibleBatchNorm(spatial);
BackwardNormalizeBatchCore(inT, in, srcGrad, grad, scaleBiasT, scale, spatial, saveMean, saveInvStdDev, scaleGrad, biasGrad);
EnsureCompatible();
EnsureConvolutionInitialized();
BackwardKernelCore(srcGrad, in, kernel, allowReuse, workspace);
}
template <class ElemType>
void ConvolutionEngine<ElemType>::ForwardPooling(const Mat& in, Mat& out)
{
const auto& g = *m_geometry;
assert(g.InputShape().GetNumElements() == in.GetNumRows());
assert(g.OutputShape().GetNumElements() == out.GetNumRows());
size_t batchSize = in.GetNumCols();
assert(batchSize == out.GetNumCols());
#ifdef NDEBUG
UNUSED(g);
UNUSED(batchSize);
#endif
EnsureCompatible();
EnsurePoolingInitialized();
ForwardPoolingCore(in, out);
}
template <class ElemType>
void ConvolutionEngine<ElemType>::BackwardPooling(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad)
{
const auto& g = *m_geometry;
assert(g.InputShape().GetNumElements() == grad.GetNumRows());
assert(g.InputShape().GetNumElements() == in.GetNumRows());
assert(g.OutputShape().GetNumElements() == srcGrad.GetNumRows());
assert(g.OutputShape().GetNumElements() == out.GetNumRows());
size_t batchSize = out.GetNumCols();
assert(batchSize == srcGrad.GetNumCols());
assert(batchSize == in.GetNumCols());
assert(batchSize == grad.GetNumCols());
#ifdef NDEBUG
UNUSED(g);
UNUSED(batchSize);
#endif
EnsureCompatible();
EnsurePoolingInitialized();
BackwardPoolingCore(out, srcGrad, in, grad);
}
//------------------------------------------------------------------
// Default (legacy) convolution engine implementation.
// Reference convolution engine implementation.
// This engine supports arbitrary convolution geometry but does not provide efficient implementation.
// Its main purpose is to serve as a baseline for optimized engines (e.g. cuDNN) that
// usually implement only a subset of a general convolution geometry.
//------------------------------------------------------------------
template <class ElemType>
class DefaultConvolutionEngine : public ConvolutionEngine<ElemType>
class ReferenceConvolutionEngine : public ConvolutionEngine<ElemType>
{
public:
using Base = ConvolutionEngine<ElemType>;
using typename Base::Mat;
using typename Base::Tensor4D;
using typename Base::Filter;
using typename Base::ConvDesc;
public:
DefaultConvolutionEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl)
: Base(deviceId, imageLayout), m_ones(deviceId), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_bnImpl(bnImpl)
ReferenceConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind)
: Base(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind),
m_mpRowCol(geometry->MpRowCol().size(), 1, const_cast<int*>(geometry->MpRowCol().data()), deviceId, IsGpu(deviceId) ? matrixFlagNormal : matrixFlagDontOwnBuffer)
{
}
protected:
using Base::m_geometry;
using Base::m_deviceId;
using Base::m_imageLayout;
using Base::m_maxTempMemSizeInSamples;
using Base::m_poolKind;
void EnsureCompatible() override
{
if (m_imageLayout != ImageLayoutKind::CHW)
RuntimeError("Reference convolution engine supports only CHW/cudnn layout.");
}
void EnsureConvolutionInitialized() override
{
if (m_mpRowIwht == nullptr)
{
auto flags = IsGpu(m_deviceId) ? matrixFlagNormal : matrixFlagDontOwnBuffer;
m_mpRowIwht = std::make_unique<Matrix<int>>(m_geometry->MpRowIwht().size(), 1,
const_cast<int*>(m_geometry->MpRowIwht().data()), m_deviceId, flags);
m_mpRowRun = std::make_unique<Matrix<int>>(m_geometry->MpRowRun().size(), 1,
const_cast<int*>(m_geometry->MpRowRun().data()), m_deviceId, flags);
m_runs = std::make_unique<Matrix<int>>(m_geometry->Runs().size(), 1,
const_cast<int*>(m_geometry->Runs().data()), m_deviceId, flags);
}
}
void ForwardCore(const Mat& in, const Mat& kernel, Mat& out, Mat& /*workspace*/) override
{
in.ConvolutionForward(kernel, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, out);
}
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& /*workspace*/) override
{
srcGrad.ConvolutionBackwardData(kernel, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, grad);
}
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& /*workspace*/) override
{
srcGrad.ConvolutionBackwardKernel(in, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, kernelGrad);
}
void EnsurePoolingInitialized() override
{
if (m_indices == nullptr)
{
auto flags = IsGpu(m_deviceId) ? matrixFlagNormal : matrixFlagDontOwnBuffer;
m_mpRowIndices = std::make_unique<Matrix<int>>(m_geometry->MpRowIndices().size(), 1,
const_cast<int*>(m_geometry->MpRowIndices().data()), m_deviceId, flags);
m_indices = std::make_unique<Matrix<int>>(m_geometry->Indices().size(), 1,
const_cast<int*>(m_geometry->Indices().data()), m_deviceId, flags);
}
}
void ForwardPoolingCore(const Mat& in, Mat& out) override
{
if (m_poolKind == PoolKind::Max)
{
in.MaxPoolingForward(m_mpRowCol, *m_mpRowIndices, *m_indices, out);
}
else if (m_poolKind == PoolKind::Average)
{
in.AveragePoolingForward(m_mpRowCol, *m_mpRowIndices, *m_indices, out);
}
else
InvalidArgument("Pooling type %d is not supported.", (int)m_poolKind);
}
void BackwardPoolingCore(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad) override
{
if (m_poolKind == PoolKind::Max)
{
srcGrad.MaxPoolingBackward(out, in, m_mpRowCol, *m_mpRowIndices, *m_indices, grad);
}
else if (m_poolKind == PoolKind::Average)
{
srcGrad.AveragePoolingBackward(m_mpRowCol, *m_mpRowIndices, *m_indices, grad);
}
else
InvalidArgument("Pooling type %d is not supported.", (int)m_poolKind);
}
private:
static bool IsGpu(DEVICEID_TYPE deviceId)
{
return deviceId >= 0;
}
private:
using IntMatPtr = std::unique_ptr<Matrix<int>>;
Matrix<int> m_mpRowCol;
// Convolution-specific maps.
IntMatPtr m_mpRowIwht;
IntMatPtr m_mpRowRun;
IntMatPtr m_runs;
// Pooling-specific maps.
IntMatPtr m_mpRowIndices;
IntMatPtr m_indices;
};
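// Note (added for exposition): ReferenceConvolutionEngine only wraps the ConvolveGeometry maps as Matrix<int>
// objects and forwards to Matrix<ElemType>::ConvolutionForward/BackwardData/BackwardKernel and the *Pooling*
// counterparts; on GPU these are presumably backed by the kConvolution*/kMaxPooling*/kAveragePooling* CUDA
// kernels shown earlier in this commit. On CPU the maps are wrapped with matrixFlagDontOwnBuffer, so the engine
// reuses ConvolveGeometry's vectors without copying; on GPU matrixFlagNormal is used, presumably so the map data
// ends up on the device.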
//------------------------------------------------------------------
// Legacy convolution engine implementation.
//------------------------------------------------------------------
template <class ElemType>
class LegacyConvolutionEngine : public ConvolutionEngine<ElemType>
{
public:
using Base = ConvolutionEngine<ElemType>;
using typename Base::Mat;
public:
LegacyConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind)
: Base(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind),
m_inT(m_geometry->InputShape(), ImageLayoutKind::CHW), m_outT(m_geometry->OutputShape(), ImageLayoutKind::CHW),
m_kernelT(m_geometry->KernelShape(), ImageLayoutKind::CHW), m_strideT(m_geometry->Stride(), ImageLayoutKind::CHW)
{
m_padding = m_geometry->AutoPad()[0];
}
protected:
using Base::m_geometry;
using Base::m_deviceId;
using Base::m_imageLayout;
using Base::m_maxTempMemSizeInSamples;
using Base::m_poolKind;
void EnsureCompatible() override
{
if (m_imageLayout != ImageLayoutKind::HWC)
RuntimeError("Default convolution engine currently supports only HWC/legacy layout.");
RuntimeError("Legacy convolution engine supports only HWC/legacy layout.");
}
void ForwardCore(const Tensor4D& inT, const Mat& in, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
const Tensor4D& outT, Mat& out, Mat& workspace) override
void EnsureConvolutionInitialized() override
{
size_t packedInputRows = filterT.w() * filterT.h() * filterT.c();
size_t packedInputColsPerSample = outT.w() * outT.h();
}
void ForwardCore(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace) override
{
size_t batchSize = in.GetNumCols();
size_t packedInputRows = m_kernelT.w() * m_kernelT.h() * m_kernelT.c();
size_t packedInputColsPerSample = m_outT.w() * m_outT.h();
size_t outputSizePerChannel = packedInputColsPerSample;
// size_t packedInputDim = packedInputRows * packedInputColsPerSample; // size of each packed input sample
// size_t inputDim = inT.w() * inT.h() * inT.c(); // size of each input sample
size_t batchSize = inT.n();
size_t maxTempMemSizeInSamples = (m_maxTempMemSizeInSamples == 0 ? batchSize : m_maxTempMemSizeInSamples);
assert(filter.GetNumCols() == packedInputRows && filter.GetNumRows() == outT.c());
assert(kernel.GetNumCols() == packedInputRows && kernel.GetNumRows() == m_outT.c());
UNUSED(packedInputRows);
// GPU and 1-dimensional image
m_gpuSparseOpt = (filterT.h() == 1 &&
m_gpuSparseOpt = (m_kernelT.h() == 1 &&
in.GetCurrentMatrixLocation() == CurrentDataLocation::GPU &&
convDesc.wStride() == 1 &&
!convDesc.padding() &&
m_strideT.w() == 1 &&
!m_padding &&
in.GetMatrixType() == MatrixType::SPARSE);
m_gpuSparse1D = (m_gpuSparseOpt && inT.h() == 1);
m_gpuSparse1D = (m_gpuSparseOpt && m_inT.h() == 1);
out.SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, false);
// Reshaping is only necessary if we are going to use the unpacking trick
if (m_gpuSparseOpt)
out.Reshape(outT.c() * outT.w(), outT.h() * batchSize);
out.Reshape(m_outT.c() * m_outT.w(), m_outT.h() * batchSize);
else
out.Reshape(outT.c(), outputSizePerChannel * batchSize);
out.Reshape(m_outT.c(), outputSizePerChannel * batchSize);
size_t subBatchSize = min(batchSize, maxTempMemSizeInSamples);
size_t numSubBatches = (batchSize + subBatchSize - 1) / subBatchSize;
@ -263,53 +316,51 @@ protected:
if (m_gpuSparseOpt)
{
if (filterT.w() * inT.c() != filter.GetNumCols())
if (m_kernelT.w() * m_inT.c() != kernel.GetNumCols())
LogicError("Kernel width and weight matrix dimensions don't match.");
inputSubBatch.Reshape(inT.c() * inT.w(), inT.h() * smallBatchSize);
Mat outputSubBatch = out.ColumnSlice(startSampleId, outT.h() * smallBatchSize);
Mat::ConvolveAndWeightedAdd(1, filter, false, inputSubBatch, false, 0, outputSubBatch,
static_cast<int>(inT.c()), convDesc.wStride(), convDesc.padding(), true);
inputSubBatch.Reshape(m_inT.c() * m_inT.w(), m_inT.h() * smallBatchSize);
Mat outputSubBatch = out.ColumnSlice(startSampleId, m_outT.h() * smallBatchSize);
Mat::ConvolveAndWeightedAdd(1, kernel, false, inputSubBatch, false, 0, outputSubBatch,
static_cast<int>(m_inT.c()), m_strideT.w(), m_padding, true);
}
else
{
inputSubBatch.SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, true);
workspace.AssignPackedConvolutionInput(inputSubBatch,
inT.w(), inT.h(), inT.c(),
outT.w(), outT.h(), outT.c(),
filterT.w(), filterT.h(), convDesc.wStride(), convDesc.hStride(),
convDesc.padding());
m_inT.w(), m_inT.h(), m_inT.c(),
m_outT.w(), m_outT.h(), m_outT.c(),
m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h(),
m_padding);
Mat outputSubBatch = out.ColumnSlice(outputSizePerChannel * startSampleId, outputSizePerChannel * smallBatchSize);
// workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
// BUGBUG: This ^^ destroys the content of the matrix. Also it seems not to change the size. Does it? Should this be a Reshape()?
Mat::Multiply(filter, false, workspace, false, outputSubBatch);
Mat::Multiply(kernel, false, workspace, false, outputSubBatch);
}
}
out.Reshape(outT.c() * outputSizePerChannel, batchSize); // each sample becomes a column
out.Reshape(m_outT.c() * outputSizePerChannel, batchSize); // each sample becomes a column
assert(outT.w() * outT.h() * outT.c() == out.GetNumRows());
assert(outT.n() == out.GetNumCols());
assert(m_outT.w() * m_outT.h() * m_outT.c() == out.GetNumRows());
assert(batchSize == out.GetNumCols());
}
void BackwardDataCore(const Tensor4D& srcGradT, const Mat& srcGrad, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
const Tensor4D& gradT, Mat& grad, Mat& workspace) override
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
{
size_t packedInputRows = filterT.w() * filterT.h() * filterT.c();
size_t packedInputColsPerSample = srcGradT.w() * srcGradT.h();
size_t batchSize = srcGrad.GetNumCols();
size_t packedInputRows = m_kernelT.w() * m_kernelT.h() * m_kernelT.c();
size_t packedInputColsPerSample = m_outT.w() * m_outT.h();
size_t outputSizePerChannel = packedInputColsPerSample;
// size_t packedInputDim = packedInputRows * packedInputColsPerSample; // size of each packed input sample
// size_t inputDim = gradT.w() * gradT.h() * gradT.c(); // size of each input sample
size_t batchSize = srcGradT.n();
// size_t inputDim = m_inT.w() * m_inT.h() * m_inT.c(); // size of each input sample
size_t maxTempMemSizeInSamples = (m_maxTempMemSizeInSamples == 0 ? batchSize : m_maxTempMemSizeInSamples);
// Create slice which is the same as full matrix so we can reshape it.
Matrix<ElemType> srcGradTmp = srcGrad.ColumnSlice(0, srcGrad.GetNumCols());
srcGradTmp.Reshape(srcGradT.c(), outputSizePerChannel * batchSize); // reshape to match the internal operation
srcGradTmp.Reshape(m_outT.c(), outputSizePerChannel * batchSize); // reshape to match the internal operation
size_t subBatchSize = min(batchSize, maxTempMemSizeInSamples);
size_t numSubBatches = (batchSize + subBatchSize - 1) / subBatchSize;
@ -322,31 +373,29 @@ protected:
workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
Matrix<ElemType> outputGradientSubBatch = srcGradTmp.ColumnSlice(startSampleId * outputSizePerChannel, smallBatchSize * outputSizePerChannel);
Matrix<ElemType>::Multiply(filter, true, outputGradientSubBatch, false, workspace);
Matrix<ElemType>::Multiply(kernel, true, outputGradientSubBatch, false, workspace);
Matrix<ElemType> inputGradientSubBatch = grad.ColumnSlice(startSampleId, smallBatchSize);
workspace.UnpackConvolutionInput(inputGradientSubBatch,
gradT.w(), gradT.h(), gradT.c(),
srcGradT.w(), srcGradT.h(), srcGradT.c(),
filterT.w(), filterT.h(), convDesc.wStride(), convDesc.hStride(),
convDesc.padding());
m_inT.w(), m_inT.h(), m_inT.c(),
m_outT.w(), m_outT.h(), m_outT.c(),
m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h(),
m_padding);
}
assert(srcGradT.w() * srcGradT.h() * srcGradT.c() == srcGrad.GetNumRows());
assert(srcGradT.n() == srcGrad.GetNumCols());
assert(m_outT.w() * m_outT.h() * m_outT.c() == srcGrad.GetNumRows());
assert(batchSize == srcGrad.GetNumCols());
}
void BackwardFilterCore(const Tensor4D& srcGradT, const Mat& srcGrad, const Tensor4D& inT, const Mat& in, const ConvDesc& convDesc,
const Filter& filterT, Mat& filter, bool allowReuse, Mat& workspace) override
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace) override
{
size_t packedInputRows = filterT.w() * filterT.h() * filterT.c();
size_t packedInputColsPerSample = srcGradT.w() * srcGradT.h();
size_t batchSize = in.GetNumCols();
size_t packedInputRows = m_kernelT.w() * m_kernelT.h() * m_kernelT.c();
size_t packedInputColsPerSample = m_outT.w() * m_outT.h();
size_t outputSizePerChannel = packedInputColsPerSample;
// size_t packedInputDim = packedInputRows * packedInputColsPerSample; // size of each packed input sample
// size_t inputDim = m_inputImageLayout.width * m_inputImageLayout.height * m_inputImageLayout.channels; // size of each input sample
size_t batchSize = inT.n();
size_t maxTempMemSizeInSamples = (m_maxTempMemSizeInSamples == 0 ? batchSize : m_maxTempMemSizeInSamples);
// const Matrix<ElemType> & weightMatrix = input0;
@ -354,14 +403,14 @@ protected:
// Create slice which is the same as full matrix so we can reshape it.
Matrix<ElemType> srcGradTmp = srcGrad.ColumnSlice(0, srcGrad.GetNumCols());
srcGradTmp.Reshape(srcGradT.c(), outputSizePerChannel * batchSize); // reshape to match the internal operation
srcGradTmp.Reshape(m_outT.c(), outputSizePerChannel * batchSize); // reshape to match the internal operation
size_t subBatchSize = min(batchSize, maxTempMemSizeInSamples);
size_t numSubBatches = (batchSize + subBatchSize - 1) / subBatchSize;
if (numSubBatches == 1 && allowReuse && !m_gpuSparseOpt) // reuse packed input from evaluation step if it's not changed by either subbatch or recurrent steps.
// REVIEW alexeyk: the following makes an assumption that data in workspace was filled by Forward call and remained unchanged. Find way to enforce/verify that.
Matrix<ElemType>::MultiplyAndAdd(srcGradTmp, false, workspace, true, filter);
Matrix<ElemType>::MultiplyAndAdd(srcGradTmp, false, workspace, true, kernelGrad);
else
{
for (size_t i = 0; i < numSubBatches; i++)
@ -379,16 +428,16 @@ protected:
{
Matrix<ElemType> inputSubBatch(in.GetDeviceId());
inputSubBatch.SetValue(in.ColumnSlice(startSampleID, smallBatchSize));
inputSubBatch.Reshape(inT.c(), smallBatchSize * inT.w() * inT.h());
inputSubBatch.Reshape(m_inT.c(), smallBatchSize * m_inT.w() * m_inT.h());
Matrix<ElemType> inputSubBatchSparseReordered(inputSubBatch.GetNumCols(), inputSubBatch.GetNumRows(), inputSubBatch.GetDeviceId(), MatrixType::SPARSE, MatrixFormat::matrixFormatSparseCSC);
Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, inputSubBatch.Transpose(), 1, inT.w(), 1, smallBatchSize * inT.h(), inT.c(), 1.0f, inputSubBatchSparseReordered, inputSubBatchSparseReordered);
Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, inputSubBatch.Transpose(), 1, m_inT.w(), 1, smallBatchSize * m_inT.h(), m_inT.c(), 1.0f, inputSubBatchSparseReordered, inputSubBatchSparseReordered);
Matrix<ElemType> outputGradientSubBatchReordered = Matrix<ElemType>::Zeros(smallBatchSize * srcGradT.h() * srcGradT.w(), srcGradT.c(), outputGradientSubBatch.GetDeviceId());
Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, outputGradientSubBatch.Transpose(), 1, srcGradT.w(), 1, smallBatchSize * srcGradT.h(), srcGradT.c(), 1.0f, outputGradientSubBatchReordered, outputGradientSubBatchReordered);
Matrix<ElemType> outputGradientSubBatchReordered = Matrix<ElemType>::Zeros(smallBatchSize * m_outT.h() * m_outT.w(), m_outT.c(), outputGradientSubBatch.GetDeviceId());
Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, outputGradientSubBatch.Transpose(), 1, m_outT.w(), 1, smallBatchSize * m_outT.h(), m_outT.c(), 1.0f, outputGradientSubBatchReordered, outputGradientSubBatchReordered);
filter.Reshape(srcGradT.c() * filterT.w(), inT.c());
Matrix<ElemType>::ConvolveAndWeightedAdd(1, outputGradientSubBatchReordered, true, inputSubBatchSparseReordered, false, 1, filter, smallBatchSize * inT.h(), convDesc.wStride(), convDesc.padding(), false);
filter.Reshape(srcGradT.c(), inT.c() * filterT.w());
kernelGrad.Reshape(m_outT.c() * m_kernelT.w(), m_inT.c());
Matrix<ElemType>::ConvolveAndWeightedAdd(1, outputGradientSubBatchReordered, true, inputSubBatchSparseReordered, false, 1, kernelGrad, smallBatchSize * m_inT.h(), m_strideT.w(), m_padding, false);
kernelGrad.Reshape(m_outT.c(), m_inT.c() * m_kernelT.w());
}
else
{
@ -396,288 +445,107 @@ protected:
Matrix<ElemType> inputSubBatch = in.ColumnSlice(startSampleID, smallBatchSize);
inputSubBatch.SwitchToMatrixType(MatrixType::DENSE, inputSubBatch.GetFormat(), true);
workspace.AssignPackedConvolutionInput(inputSubBatch,
inT.w(), inT.h(), inT.c(),
srcGradT.w(), srcGradT.h(), srcGradT.c(),
filterT.w(), filterT.h(), convDesc.wStride(), convDesc.hStride(),
convDesc.padding());
m_inT.w(), m_inT.h(), m_inT.c(),
m_outT.w(), m_outT.h(), m_outT.c(),
m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h(),
m_padding);
Matrix<ElemType>::MultiplyAndAdd(outputGradientSubBatch, false, workspace, true, filter);
Matrix<ElemType>::MultiplyAndAdd(outputGradientSubBatch, false, workspace, true, kernelGrad);
}
}
}
assert(srcGradT.w() * srcGradT.h() * srcGradT.c() == srcGrad.GetNumRows());
assert(srcGradT.n() == srcGrad.GetNumCols());
assert(m_outT.w() * m_outT.h() * m_outT.c() == srcGrad.GetNumRows());
assert(batchSize == srcGrad.GetNumCols());
}
void EnsureCompatibleBatchNorm(bool spatial) override
void EnsurePoolingInitialized() override
{
if (m_deviceId >= 0)
InvalidArgument("This engine does not support batch normalization on GPUs.");
if (m_bnImpl != BatchNormImpl::Cntk)
InvalidArgument("Only CNTK batch normalization implementation is supported by this engine.");
if (spatial && m_imageLayout != ImageLayoutKind::CHW)
InvalidArgument("This engine batch normalization currently supports only CHW data layout for convolutional nodes.");
}
void NormalizeBatchCore(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override
void ForwardPoolingCore(const Mat& in, Mat& out) override
{
UNUSED(inT);
UNUSED(in);
UNUSED(scaleBiasT);
UNUSED(scale);
UNUSED(bias);
UNUSED(out);
UNUSED(spatial);
UNUSED(expAvgFactor);
UNUSED(runMean);
UNUSED(runInvStdDev);
UNUSED(epsilon);
UNUSED(saveMean);
UNUSED(saveInvStdDev);
RuntimeError("Not yet implemented.");
}
void NormalizeBatchInferenceCore(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, const Mat& runMean, const Mat& runInvStdDev, Mat& out) override
{
UNUSED(scaleBiasT);
if (spatial)
if (m_poolKind == PoolKind::Max)
{
size_t spatialSize = inT.w() * inT.h();
#pragma omp parallel for
for (long icol = 0; icol < out.GetNumCols(); icol++)
{
for (long irow = 0; irow < out.GetNumRows(); irow++)
{
size_t imap = irow / spatialSize;
out(irow, icol) = scale(imap, 0) * (in(irow, icol) - runMean(imap, 0)) * runInvStdDev(imap, 0) + bias(imap, 0);
}
}
out.AssignMaxPoolingResult(in, m_inT.c(), m_inT.w(), m_inT.h(), m_inT.w() * m_inT.h() * m_inT.c(),
m_outT.w(), m_outT.h(), m_outT.w() * m_outT.h() * m_outT.c(),
m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h());
}
else if (m_poolKind == PoolKind::Average)
{
out.AssignAveragePoolingResult(in, m_inT.c(), m_inT.w(), m_inT.h(), m_inT.w() * m_inT.h() * m_inT.c(),
m_outT.w(), m_outT.h(), m_outT.w() * m_outT.h() * m_outT.c(),
m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h());
}
else
{
#pragma omp parallel for
for (long icol = 0; icol < out.GetNumCols(); icol++)
{
for (long irow = 0; irow < out.GetNumRows(); irow++)
{
out(irow, icol) = scale(irow, 0) * (in(irow, icol) - runMean(irow, 0)) * runInvStdDev(irow, 0) + bias(irow, 0);
}
}
}
InvalidArgument("Pooling type %d is not supported.", (int)m_poolKind);
}
void BackwardNormalizeBatchCore(const Tensor4D& inT, const Mat& in, const Mat& srcGrad, Mat& grad,
const Tensor4D& scaleBiasT, const Mat& scale, bool spatial, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) override
void BackwardPoolingCore(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad) override
{
UNUSED(inT);
UNUSED(in);
UNUSED(srcGrad);
UNUSED(grad);
UNUSED(scaleBiasT);
UNUSED(scale);
UNUSED(scaleGrad);
UNUSED(biasGrad);
UNUSED(spatial);
UNUSED(saveMean);
UNUSED(saveInvStdDev);
RuntimeError("Not yet implemented.");
if (m_poolKind == PoolKind::Max)
{
grad.AddMaxPoolingGradient(srcGrad, in, out,
m_inT.c(), m_inT.w(), m_inT.h(), m_inT.w() * m_inT.h() * m_inT.c(),
m_outT.w(), m_outT.h(), m_outT.w() * m_outT.h() * m_outT.c(),
m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h());
}
else if (m_poolKind == PoolKind::Average)
{
grad.AddAveragePoolingGradient(srcGrad, m_inT.c(), m_inT.w(), m_inT.h(), m_inT.w() * m_inT.h() * m_inT.c(),
m_outT.w(), m_outT.h(), m_outT.w() * m_outT.h() * m_outT.c(),
m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h());
}
else
InvalidArgument("Pooling type %d is not supported.", (int)m_poolKind);
}
private:
size_t m_maxTempMemSizeInSamples;
BatchNormImpl m_bnImpl;
Mat m_ones;
ImageDimensions m_inT;
ImageDimensions m_outT;
ImageDimensions m_kernelT;
ImageDimensions m_strideT;
bool m_padding;
bool m_gpuSparseOpt;
bool m_gpuSparse1D;
};
template <class ElemType>
std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind,
ConvolutionEngineKind enabledEngines)
{
auto isEnabled = [=](ConvolutionEngineKind eng) { return ((int)enabledEngines & (int)eng) != 0; };
// Note: in some cases we do not throw an exception even if the parameters do not match, as Create
// can be called from places like MEL with default parameters and never be used.
// The check is done later in the engine's EnsureCompatible call if the engine is actually used.
auto engStr = (std::string)(*geometry);
// Only legacy engine supports HWC layout.
if (imageLayout == ImageLayoutKind::HWC)
{
if (!isEnabled(ConvolutionEngineKind::Legacy))
RuntimeError("Trying to use Legacy convolution engine when it's disabled.");
// REVIEW alexeyk: should honor m_traceLevel here.
fprintf(stderr, "\nUsing legacy convolution engine for geometry: %s.\n", engStr.c_str());
return std::make_unique<LegacyConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}
// Check if we can use the cuDNN engine. No need to validate tensors as ConvolveGeometry has already done that.
if (isEnabled(ConvolutionEngineKind::CuDnn) &&
CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId, geometry, poolKind))
{
fprintf(stderr, "\nUsing cuDNN convolution engine for geometry: %s.\n", engStr.c_str());
return CuDnnConvolutionEngineFactory<ElemType>::Create(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}
if (!isEnabled(ConvolutionEngineKind::Reference))
RuntimeError("Reference convolution is disabled and no other engine supports such configuratin (or disabled).");
fprintf(stderr, "\nUsing reference convolution engine for geometry: %s.\n", engStr.c_str());
return std::make_unique<ReferenceConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}
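// Minimal usage sketch (illustrative only, not part of this file; variable names are hypothetical):
//
//   auto geometry = std::make_shared<ConvolveGeometry>(inputShape, kernelShape, mapCount, stride,
//                                                      sharing, autoPad, lowerPad, upperPad);
//   auto engine = ConvolutionEngine<float>::Create(geometry, deviceId, ImageLayoutKind::CHW,
//                                                  /*maxTempMemSizeInSamples=*/0, PoolKind::None);
//   engine->Forward(in, kernel, out, workspace);             // in/kernel/out/workspace are Matrix<float>
//   engine->BackwardData(srcGrad, kernel, grad, workspace);
//   engine->BackwardKernel(srcGrad, in, kernelGrad, /*allowReuse=*/false, workspace);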
template class ConvolutionEngine<float>;
template class ConvolutionEngine<double>;
//------------------------------------------------------------------
// Pooling engine.
//------------------------------------------------------------------
template <class ElemType>
void PoolingEngine<ElemType>::Forward(const Tensor4D& inT, const Mat& in, const PoolDesc& poolDesc, const Tensor4D& outT, Mat& out)
{
assert(inT.w() * inT.h() * inT.c() == in.GetNumRows());
assert(inT.n() == in.GetNumCols());
assert(outT.w() * outT.h() * outT.c() == out.GetNumRows());
assert(outT.n() == out.GetNumCols());
EnsureCompatible();
ForwardCore(inT, in, poolDesc, outT, out);
}
template <class ElemType>
void PoolingEngine<ElemType>::Backward(const Tensor4D& outT, const Mat& out, const Mat& srcGrad, const PoolDesc& poolDesc, const Tensor4D& inT, const Mat& in, Mat& grad)
{
assert(outT.w() * outT.h() * outT.c() == out.GetNumRows());
assert(outT.n() == out.GetNumCols());
assert(out.GetNumRows() == srcGrad.GetNumRows());
assert(out.GetNumCols() == srcGrad.GetNumCols());
assert(inT.w() * inT.h() * inT.c() == in.GetNumRows());
assert(inT.n() == in.GetNumCols());
assert(in.GetNumRows() == grad.GetNumRows());
assert(in.GetNumCols() == grad.GetNumCols());
EnsureCompatible();
BackwardCore(outT, out, srcGrad, poolDesc, inT, in, grad);
}
//------------------------------------------------------------------
// Default (legacy) pooling engine implementation.
//------------------------------------------------------------------
template <class ElemType>
class DefaultPoolingEngine : public PoolingEngine<ElemType>
{
public:
using Base = PoolingEngine<ElemType>;
using typename Base::Tensor4D;
using typename Base::PoolDesc;
using typename Base::Mat;
public:
DefaultPoolingEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout)
: Base(deviceId, imageLayout)
{
}
protected:
using Base::m_deviceId;
using Base::m_imageLayout;
void EnsureCompatible() override
{
if (m_imageLayout != ImageLayoutKind::HWC)
RuntimeError("Default pooling engine currently supports only HWC/legacy layout.");
}
void ForwardCore(const Tensor4D& inT, const Mat& in, const PoolDesc& poolDesc, const Tensor4D& outT, Mat& out) override
{
if (poolDesc.kind() == PoolDesc::PoolKind::Max)
{
out.AssignMaxPoolingResult(in, inT.c(), inT.w(), inT.h(), inT.w() * inT.h() * inT.c(),
outT.w(), outT.h(), outT.w() * outT.h() * outT.c(),
poolDesc.w(), poolDesc.h(), poolDesc.wStride(), poolDesc.hStride());
}
else if (poolDesc.kind() == PoolDesc::PoolKind::Average)
{
out.AssignAveragePoolingResult(in, inT.c(), inT.w(), inT.h(), inT.w() * inT.h() * inT.c(),
outT.w(), outT.h(), outT.w() * outT.h() * outT.c(),
poolDesc.w(), poolDesc.h(), poolDesc.wStride(), poolDesc.hStride());
}
else
InvalidArgument("Pooling type %d is not supported.", (int)poolDesc.kind());
}
void BackwardCore(const Tensor4D& outT, const Mat& out, const Mat& srcGrad, const PoolDesc& poolDesc, const Tensor4D& inT, const Mat& in, Mat& grad) override
{
if (poolDesc.kind() == PoolDesc::PoolKind::Max)
{
grad.AddMaxPoolingGradient(srcGrad, in, out,
inT.c(), inT.w(), inT.h(), inT.w() * inT.h() * inT.c(),
outT.w(), outT.h(), outT.w() * outT.h() * outT.c(),
poolDesc.w(), poolDesc.h(), poolDesc.wStride(), poolDesc.hStride());
}
else if (poolDesc.kind() == PoolDesc::PoolKind::Average)
{
grad.AddAveragePoolingGradient(srcGrad, inT.c(), inT.w(), inT.h(), inT.w() * inT.h() * inT.c(),
outT.w(), outT.h(), outT.w() * outT.h() * outT.c(),
poolDesc.w(), poolDesc.h(), poolDesc.wStride(), poolDesc.hStride());
}
else
InvalidArgument("Pooling type %d is not supported.", (int)poolDesc.kind());
}
};
template class PoolingEngine<float>;
template class PoolingEngine<double>;
template <class ElemType>
class DefaultConvolutionEngineFactory : public ConvolutionEngineFactory<ElemType>
{
public:
using Base = ConvolutionEngineFactory<ElemType>;
using typename Base::Tensor4D;
using typename Base::Tensor4DPtr;
using typename Base::Filter;
using typename Base::FilterPtr;
using typename Base::ConvDesc;
using typename Base::ConvDescPtr;
using typename Base::PoolDesc;
using typename Base::PoolDescPtr;
using typename Base::ConvEnginePtr;
using typename Base::PoolEnginePtr;
public:
Tensor4DPtr CreateTensor(size_t w, size_t h, size_t c, size_t n) override
{
return std::make_unique<ConvolutionTensor4D>(w, h, c, n);
}
FilterPtr CreateFilter(size_t w, size_t h, size_t c, size_t k) override
{
return std::make_unique<Filter>(w, h, c, k);
}
ConvDescPtr CreateConvDescriptor(const Tensor4D& /*inT*/, const Filter& /*filterT*/,
size_t wStride, size_t hStride, bool padding) override
{
return std::make_unique<ConvDesc>(wStride, hStride, padding);
}
PoolDescPtr CreatePoolDescriptor(typename PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) override
{
return std::make_unique<PoolDesc>(kind, w, h, wStride, hStride, wPad, hPad);
}
ConvEnginePtr CreateConvEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl) override
{
return std::make_unique<DefaultConvolutionEngine<ElemType>>(deviceId, imageLayout, maxTempMemSizeInSamples, bnImpl);
}
PoolEnginePtr CreatePoolEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout) override
{
return std::make_unique<DefaultPoolingEngine<ElemType>>(deviceId, imageLayout);
}
};
template <class ElemType>
std::unique_ptr<ConvolutionEngineFactory<ElemType>> ConvolutionEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId, EngineType engType, ImageLayoutKind imageLayoutKind)
{
if (engType == EngineType::Auto)
{
// REVIEW alexeyk: make cuDNN default when running on GPU and compiled with cuDNN, add config parameter to enable runtime switch between implementations.
if (deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId) && imageLayoutKind == ImageLayoutKind::CHW)
return Create(deviceId, EngineType::CuDnn, imageLayoutKind);
else
return Create(deviceId, EngineType::Legacy, imageLayoutKind);
}
else if (engType == EngineType::CuDnn)
{
if (imageLayoutKind != ImageLayoutKind::CHW)
InvalidArgument("ConvolutionEngineFactory: ImageLayout '%s' is not compatible with the cuDNN engine.", ToString(imageLayoutKind).c_str());
if (deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId))
return std::make_unique<CuDnnConvolutionEngineFactory<ElemType>>();
RuntimeError("cuDNN convolution engine is not supported, check the device id and whether the code was compiled with cuDNN.");
}
else if (engType == EngineType::Legacy)
{
return std::make_unique<DefaultConvolutionEngineFactory<ElemType>>();
}
RuntimeError("Not supported convolution engine type: %d.", (int)engType);
}
template class ConvolutionEngineFactory<float>;
template class ConvolutionEngineFactory<double>;
}}}

View file

@ -5,370 +5,104 @@
#pragma once
// REVIEW alexeyk: this seems to be repeated all over the CNTKMathDll.
#ifdef _WIN32
#ifdef MATH_EXPORTS
#define MATH_API __declspec(dllexport)
#else
#define MATH_API __declspec(dllimport)
#endif
#else // no DLLs on Linux
#define MATH_API
#endif
#include "Matrix.h"
#include "TensorShape.h" // for ImageLayoutKind
#include "ConvolveGeometry.h"
#include "StringUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// REVIEW alexeyk: this is a temp class until we have generic tensor support in CNTK.
class ConvolutionTensor4D
//-------------------------------------------------------------
// Convolution and pooling engine interface.
//-------------------------------------------------------------
enum class ConvolutionEngineKind
{
public:
size_t w() const
{
return m_w;
}
size_t h() const
{
return m_h;
}
size_t c() const
{
return m_c;
}
size_t n() const
{
return m_n;
}
virtual void setN(size_t n)
{
m_n = n;
}
None = 0,
Reference = 1,
CuDnn = 1 << 1,
Legacy = 1 << 2,
public:
ConvolutionTensor4D(size_t w = 1, size_t h = 1, size_t c = 1, size_t n = 1)
{
m_w = w;
m_h = h;
m_c = c;
m_n = n;
}
public:
virtual ~ConvolutionTensor4D() = default;
// Deleting copy ctor/assignment as derived objects may contain non-copyable state.
ConvolutionTensor4D(const ConvolutionTensor4D&) = delete;
ConvolutionTensor4D& operator=(const ConvolutionTensor4D&) = delete;
// REVIEW alexeyk: Have to implement move ctor explicitly as VS2013 does not support default move ctors.
// ConvolutionTensor4D(ConvolutionTensor4D&&);
// ConvolutionTensor4D& operator=(ConvolutionTensor4D&&);
private:
size_t m_w;
size_t m_h;
size_t m_c;
size_t m_n;
All = Reference | CuDnn | Legacy
};
class ConvolutionFilter
enum class PoolKind
{
public:
size_t w() const
{
return m_w;
}
size_t h() const
{
return m_h;
}
size_t c() const
{
return m_c;
}
size_t k() const
{
return m_k;
}
public:
ConvolutionFilter(size_t w = 1, size_t h = 1, size_t c = 1, size_t k = 1)
{
m_w = w;
m_h = h;
m_c = c;
m_k = k;
}
public:
virtual ~ConvolutionFilter() = default;
// Deleting copy ctor/assignment as derived objects may contain non-copyable state.
ConvolutionFilter(const ConvolutionFilter&) = delete;
ConvolutionFilter& operator=(const ConvolutionFilter&) = delete;
private:
size_t m_w;
size_t m_h;
size_t m_c;
size_t m_k;
None,
Max,
Average
};
// ConvolutionDescriptor describes properties specific to convolution application.
class ConvolutionDescriptor
{
public:
// Horizontal stride (in w-dimension).
size_t wStride() const
{
return m_wStride;
}
// Vertical stride (in h-dimension).
size_t hStride() const
{
return m_hStride;
}
bool padding() const
{
return m_padding;
}
public:
ConvolutionDescriptor(size_t wStride = 1, size_t hStride = 1, bool padding = false)
{
m_wStride = wStride;
m_hStride = hStride;
m_padding = padding;
}
public:
virtual ~ConvolutionDescriptor() = default;
// Deleting copy ctor/assignment as derived objects may contain non-copyable state.
ConvolutionDescriptor(const ConvolutionDescriptor&) = delete;
ConvolutionDescriptor& operator=(const ConvolutionDescriptor&) = delete;
private:
size_t m_wStride;
size_t m_hStride;
bool m_padding;
};
// PoolingDescriptor describes properties specific to pooling application.
class PoolingDescriptor
{
public:
enum class PoolKind
{
Max,
Average
};
PoolKind kind() const
{
return m_kind;
}
// Pooling window size.
size_t w() const
{
return m_w;
}
size_t h() const
{
return m_h;
}
// Horizontal stride (in w-dimension).
size_t wStride() const
{
return m_wStride;
}
// Vertical stride (in h-dimension).
size_t hStride() const
{
return m_hStride;
}
// Horizontal pad (in w-dimension).
size_t wPad() const
{
return m_wPad;
}
// Vertical pad (in h-dimension).
size_t hPad() const
{
return m_hPad;
}
public:
PoolingDescriptor(PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad)
{
m_kind = kind;
m_w = w;
m_h = h;
m_wStride = wStride;
m_hStride = hStride;
m_wPad = wPad;
m_hPad = hPad;
}
public:
virtual ~PoolingDescriptor() = default;
// Deleting copy ctor/assignment as derived objects may contain non-copyable state.
PoolingDescriptor(const PoolingDescriptor&) = delete;
PoolingDescriptor& operator=(const PoolingDescriptor&) = delete;
private:
PoolKind m_kind;
size_t m_w;
size_t m_h;
size_t m_wStride;
size_t m_hStride;
size_t m_wPad;
size_t m_hPad;
};
#pragma warning(push)
#pragma warning(disable : 4251)
template <class ElemType>
class MATH_API ConvolutionEngine
{
public:
using Tensor4D = ConvolutionTensor4D;
using Filter = ConvolutionFilter;
using ConvDesc = ConvolutionDescriptor;
using Mat = Matrix<ElemType>;
public:
ConvolutionEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout)
: m_deviceId(deviceId), m_imageLayout(imageLayout)
{
}
virtual ~ConvolutionEngine() = default;
void Forward(const Tensor4D& inT, const Mat& in, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
const Tensor4D& outT, Mat& out, Mat& workspace);
void Forward(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace);
void BackwardData(const Tensor4D& srcGradT, const Mat& srcGrad, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
const Tensor4D& gradT, Mat& grad, Mat& workspace);
void BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace);
void BackwardFilter(const Tensor4D& srcGradT, const Mat& srcGrad, const Tensor4D& inT, const Mat& in, const ConvDesc& convDesc,
const Filter& filterT, Mat& filter, bool allowReuse, Mat& workspace);
void BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace);
void NormalizeBatch(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out,
double epsilon, Mat& saveMean, Mat& saveInvStdDev);
void ForwardPooling(const Mat& in, Mat& out);
void NormalizeBatchInference(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, const Mat& runMean, const Mat& runInvStdDev, Mat& out);
void BackwardPooling(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad);
void BackwardNormalizeBatch(const Tensor4D& inT, const Mat& in, const Mat& srcGrad, Mat& grad,
const Tensor4D& scaleBiasT, const Mat& scale, bool spatial, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad);
std::shared_ptr<const ConvolveGeometry> Geometry() const { return m_geometry; }
static std::unique_ptr<ConvolutionEngine<ElemType>> Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout,
size_t maxTempMemSizeInSamples, PoolKind poolKind = PoolKind::None, ConvolutionEngineKind enabledEngines = ConvolutionEngineKind::All);
DISABLE_COPY_AND_MOVE(ConvolutionEngine);
protected:
virtual void EnsureCompatible() = 0;
virtual void ForwardCore(const Tensor4D& inT, const Mat& in, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
const Tensor4D& outT, Mat& out, Mat& workspace) = 0;
virtual void BackwardDataCore(const Tensor4D& srcGradT, const Mat& srcGrad, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
const Tensor4D& gradT, Mat& grad, Mat& workspace) = 0;
virtual void BackwardFilterCore(const Tensor4D& srcGradT, const Mat& srcGrad, const Tensor4D& inT, const Mat& in, const ConvDesc& convDesc,
const Filter& filterT, Mat& filter, bool allowReuse, Mat& workspace) = 0;
virtual void EnsureCompatibleBatchNorm(bool spatial) = 0;
virtual void NormalizeBatchCore(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out,
double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0;
// REVIEW alexeyk: roll into NormalizeBatchCore.
virtual void NormalizeBatchInferenceCore(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, const Mat& runMean, const Mat& runInvStdDev, Mat& out) = 0;
virtual void BackwardNormalizeBatchCore(const Tensor4D& inT, const Mat& in, const Mat& srcGrad, Mat& grad,
const Tensor4D& scaleBiasT, const Mat& scale, bool spatial, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) = 0;
protected:
DEVICEID_TYPE m_deviceId;
ImageLayoutKind m_imageLayout;
};
template <class ElemType>
class MATH_API PoolingEngine
{
public:
using Tensor4D = ConvolutionTensor4D;
using PoolDesc = PoolingDescriptor;
using Mat = Matrix<ElemType>;
public:
PoolingEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout)
: m_deviceId(deviceId), m_imageLayout(imageLayout)
ConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind)
: m_geometry(geometry), m_deviceId(deviceId), m_imageLayout(imageLayout), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_poolKind(poolKind)
{
assert(m_geometry != nullptr);
}
virtual ~PoolingEngine() = default;
void Forward(const Tensor4D& inT, const Mat& in, const PoolDesc& poolDesc, const Tensor4D& outT, Mat& out);
void Backward(const Tensor4D& outT, const Mat& out, const Mat& srcGrad, const PoolDesc& poolDesc, const Tensor4D& inT, const Mat& in, Mat& grad);
DISABLE_COPY_AND_MOVE(PoolingEngine);
protected:
virtual void EnsureCompatible() = 0;
virtual void ForwardCore(const Tensor4D& inT, const Mat& in, const PoolDesc& poolDesc, const Tensor4D& outT, Mat& out) = 0;
virtual void BackwardCore(const Tensor4D& outT, const Mat& out, const Mat& srcGrad, const PoolDesc& poolDesc, const Tensor4D& inT, const Mat& in, Mat& grad) = 0;
virtual void EnsureConvolutionInitialized() = 0;
virtual void ForwardCore(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace) = 0;
virtual void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) = 0;
virtual void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace) = 0;
virtual void EnsurePoolingInitialized() = 0;
virtual void ForwardPoolingCore(const Mat& in, Mat& out) = 0;
virtual void BackwardPoolingCore(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad) = 0;
protected:
ConvolveGeometryPtr m_geometry;
DEVICEID_TYPE m_deviceId;
ImageLayoutKind m_imageLayout;
size_t m_maxTempMemSizeInSamples;
PoolKind m_poolKind;
};
// REVIEW alexeyk: this is a temporary hack until we find a better place for the BatchNorm engine(s).
enum class BatchNormImpl
#pragma warning(pop)
static inline PoolKind PoolKindFrom(const wstring& s)
{
CuDnn,
Cntk
};
if (s.empty() || AreEqualIgnoreCase(s, L"none"))
return PoolKind::None;
if (AreEqualIgnoreCase(s, L"max"))
return PoolKind::Max;
if (AreEqualIgnoreCase(s, L"average"))
return PoolKind::Average;
InvalidArgument("Unknown pooling kind: '%ls'. Supported values: 'none', 'max', 'average'.", s.c_str());
}
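// For example, PoolKindFrom(L"max") yields PoolKind::Max, while an empty string or L"none" yields PoolKind::None.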
template <class ElemType>
class MATH_API ConvolutionEngineFactory
{
public:
using Tensor4D = ConvolutionTensor4D;
using Tensor4DPtr = std::unique_ptr<Tensor4D>;
using Filter = ConvolutionFilter;
using FilterPtr = std::unique_ptr<ConvolutionFilter>;
using ConvDesc = ConvolutionDescriptor;
using ConvDescPtr = std::unique_ptr<ConvolutionDescriptor>;
using PoolDesc = PoolingDescriptor;
using PoolDescPtr = std::unique_ptr<PoolingDescriptor>;
using ConvEnginePtr = std::unique_ptr<ConvolutionEngine<ElemType>>;
using PoolEnginePtr = std::unique_ptr<PoolingEngine<ElemType>>;
public:
ConvolutionEngineFactory() = default;
virtual ~ConvolutionEngineFactory() = default;
virtual Tensor4DPtr CreateTensor(size_t w, size_t h, size_t c, size_t n) = 0;
virtual FilterPtr CreateFilter(size_t w, size_t h, size_t c, size_t k) = 0;
virtual ConvDescPtr CreateConvDescriptor(const Tensor4D& inT, const Filter& filterT,
size_t wStride, size_t hStride, bool padding) = 0;
virtual PoolDescPtr CreatePoolDescriptor(PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) = 0;
// virtual Tensor4DPtr CreateLrnDescriptor() = 0;
virtual ConvEnginePtr CreateConvEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl) = 0;
virtual PoolEnginePtr CreatePoolEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout) = 0;
enum class EngineType
{
Auto,
CuDnn,
Legacy
};
static std::unique_ptr<ConvolutionEngineFactory<ElemType>> Create(DEVICEID_TYPE deviceId, EngineType engType, ImageLayoutKind imageLayoutKind);
DISABLE_COPY_AND_MOVE(ConvolutionEngineFactory);
};
} } }

View file

@ -0,0 +1,552 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "Basics.h"
#include "TensorShape.h"
#include <iterator>
namespace Microsoft { namespace MSR { namespace CNTK {
// Notes:
// * ConvolveGeometry represents the application of one or more rectangular "kernels" (all of the same size)
// to a rectangular input to produce a rectangular output.
// * A "cell" in the rectangular input is identified by a single coordinate called a "col" (for column).
// * A "cell" in the rectangular output is identified by a single coordinate called a "row".
// * The kernels may involve weights, in which case MpRowIwht indicates the starting index of the weights
// used for a given output cell.
// The overall idea of ConvolveGeometry is to precompute maps that can be used to apply convolutions of
// arbitrary configurations and dimensions. In that case the generic implementation becomes very simple and invariant
// with respect to convolution configuration and dimensionality. For specific cases like 2D/3D convolutions and full sharing,
// highly optimized implementations (e.g. cuDNN) are used.
class ConvolveGeometry final
{
public:
using IntVec = std::vector<int>;
using BoolVec = std::vector<bool>;
const TensorShape& InputShape() const { return m_inputShape; }
const TensorShape& OutputShape() const { return m_outputShape; }
const TensorShape& KernelShape() const { return m_kernelShape; }
const TensorShape& MapCount() const { return m_mapCount; }
const TensorShape& Stride() const { return m_stride; }
const BoolVec& Sharing() const { return m_sharing; }
const BoolVec& AutoPad() const { return m_autoPad; }
const TensorShape& LowerPad() const { return m_lowerPad; }
const TensorShape& UpperPad() const { return m_upperPad; }
// Maps from a "row" (index of output cell) to its base "col" (index of input cell). For a given row,
// the cols that contribute to it are { MpRowCol[row] + Indices[i0 + 1 + i] | 0 <= i < Indices[i0] },
// where i0 = MpRowIndices[row].
const IntVec& MpRowCol() const { return m_mpRowCol; }
// Maps from a "row" (index of output cell) to where to start in the weights array. Each run of weights
// consists of KernelSize weights.
const IntVec& MpRowIwht() const { return m_mpRowIwht; }
// Maps from a "row" (index of output cell) to its starting index in Runs. A run consists of:
// * skip count (to skip that many weights)
// * item count
// * relative indices into source (item count of these)
// * masks (all 1's or all 0's) (item count of these)
// For items that are masked out (0 mask), the index stored is the next valid index.
// This ensures that accessing the corresponding neuron value doesn't fault and that
// backprop operations write the correct value last (any previous writes won't change
// the value).
// NOTE: The first (zeroth) run is always the "full" kernel run. Also, MpRowRun can be empty,
// indicating that all values are zero (all outputs use the "full" kernel run).
const IntVec& MpRowRun() const { return m_mpRowRun; }
const IntVec& Runs() const { return m_runs; }
// Maps from a "row" (index of output cell) to its starting index in Indices. Note that "Runs" is intended
// for kernels that have weights, while "Indices" is intended for kernels that don't need to access weights.
// As a result, the encoding in Indices is simpler and more direct.
// A run in Indices consists of:
// * item count
// * relative indices into source (item count of these)
// NOTE: The first run of indices is always the "full" kernel run. Also, MpRowIndices can be empty,
// indicating that all values are zero (all outputs use the "full" kernel run).
// In addition, all items in Indices are valid source indices so no masking is required in subsequent computation.
const IntVec& MpRowIndices() const { return m_mpRowIndices; }
const IntVec& Indices() const { return m_indices; }
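// A minimal sketch of how a kernel-agnostic engine could consume MpRowCol/MpRowIndices/Indices for one
// sample, following the formula { MpRowCol[row] + Indices[i0 + 1 + i] | 0 <= i < Indices[i0] } above.
// MaxPoolSample and its flat "input"/"output" arrays are hypothetical; they only assume the layouts of
// InputShape()/OutputShape().
//
//   void MaxPoolSample(const ConvolveGeometry& g, const float* input, float* output)
//   {
//       const auto& mpRowCol = g.MpRowCol();
//       const auto& mpRowIndices = g.MpRowIndices();
//       const auto& indices = g.Indices();
//       for (size_t row = 0; row < g.OutputShape().GetNumElements(); row++)
//       {
//           int colBase = mpRowCol[row];
//           // An empty MpRowIndices means every output uses the "full" kernel run starting at 0.
//           int i0 = mpRowIndices.empty() ? 0 : mpRowIndices[row];
//           int count = indices[i0];
//           float res = input[colBase + indices[i0 + 1]];
//           for (int i = 1; i < count; i++)
//               res = std::max(res, input[colBase + indices[i0 + 1 + i]]);
//           output[row] = res;
//       }
//   }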
// Number of kernels (equal to MapCount if sharing is all true values).
size_t KernelCount() const { return m_kernelCount; }
ConvolveGeometry(const TensorShape& inputShape, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& stride,
const BoolVec& sharing, const BoolVec& autoPad, const TensorShape& lowerPad, const TensorShape& upperPad)
: m_inputShape(inputShape), m_kernelShape(kernelShape), m_mapCount(mapCount), m_stride(stride), m_sharing(sharing),
m_autoPad(autoPad), m_lowerPad(lowerPad), m_upperPad(upperPad)
{
// Note: this ctor is a bit long so sit back and relax.
assert(m_inputShape.GetRank() == m_kernelShape.GetRank());
assert(m_mapCount.GetRank() == 1 || m_mapCount.GetRank() == m_inputShape.GetRank());
assert(m_stride.GetRank() == 1 || m_stride.GetRank() == m_inputShape.GetRank());
assert(m_sharing.size() == 1 || m_sharing.size() == m_inputShape.GetRank());
assert(m_autoPad.size() == 1 || m_autoPad.size() == m_inputShape.GetRank());
assert(m_lowerPad.GetRank() == 1 || m_lowerPad.GetRank() == m_inputShape.GetRank());
assert(m_upperPad.GetRank() == 1 || m_upperPad.GetRank() == m_inputShape.GetRank());
m_outputShape = ComputeOutputShape(m_inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
assert(m_inputShape.GetRank() == m_outputShape.GetRank());
size_t dimCount = inputShape.GetRank();
size_t kernelSize = kernelShape.GetNumElements();
// Compute the total number of kernels.
m_kernelCount = 1;
for (size_t i = 0; i < dimCount; i++)
m_kernelCount *= !GetSharing(i) ? m_outputShape[i] : GetMapCount(i);
// Compute the "Start" indices.
m_start.resize(dimCount);
m_startIndex = 0;
m_originIndex = 0;
for (int i = (int)dimCount - 1; i >= 0; i--)
{
assert((m_outputShape[i] % GetMapCount(i)) == 0);
int outPerMap = (int)(m_outputShape[i] / GetMapCount(i));
// Number of cells between first and last "centers", inclusive.
int cells = (int)((outPerMap - 1) * GetStride(i) + 1);
assert(m_inputShape[i] >= cells);
// Extra cells, to the left and right of "cells".
int extra = (int)m_inputShape[i] - cells;
assert(extra >= 0);
// When LowerPad and/or UpperPad are specified, the Start[i] value is determined by those values.
int lo = GetAutoPad(i) ? 0 : (int)m_lowerPad[m_lowerPad.size() == 1 ? 0 : i];
int hi = GetAutoPad(i) ? 0 : (int)m_upperPad[m_upperPad.size() == 1 ? 0 : i];
if (lo != 0 || hi != 0)
{
assert(extra + lo + hi + 1 == m_kernelShape[i]);
// Compute the number of cells on the left and right parts of the kernel,
// not counting the "kernel-center" cell. If m_kernelShape[i] is even, the extra cell is
// placed on the right (the center is shifted to the left).
int right = (int)m_kernelShape[i] - 1;
int left = right / 2;
right -= left;
assert(left <= right);
assert(right <= left + 1);
assert(lo <= left);
assert(hi <= right);
m_start[i] = left - lo;
assert(m_start[i] + cells + right == m_inputShape[i] + hi);
}
else
{
m_start[i] = extra / 2;
#ifdef _DEBUG
// If we're padding then extra should be covered.
bool padded = GetAutoPad(i);
assert(!padded || extra + 1 <= m_kernelShape[i]);
// If we're not padding, then we should stay within the input dimension.
assert(padded || extra + 1 >= m_kernelShape[i]);
// Compute the number of cells on the left and right parts of the kernel,
// not counting the "kernel-center" cell. If m_kernelShape[i] is even, the extra cell is
// placed on the right (the center is shifted to the left).
int right = (int)m_kernelShape[i] - 1;
int left = right / 2;
right -= left;
assert(0 <= left);
assert(left <= right);
assert(right <= left + 1);
int min = m_start[i] - left;
int max = m_start[i] + (int)cells + right;
assert(!padded || min <= 0 && max >= m_inputShape[i]);
assert(padded || min >= 0 && max <= m_inputShape[i]);
int diff = min - ((int)m_inputShape[i] - max);
assert(std::abs(diff) <= 1);
UNUSED(padded);
UNUSED(diff);
#endif
}
m_startIndex = m_startIndex * (int)m_inputShape[i] + m_start[i];
m_originIndex = m_originIndex * (int)m_inputShape[i] + ((int)m_kernelShape[i] - 1) / 2;
}
// Compute support, mapping from the index into the kernel to offset into source.
// Support consists of the column deltas of the kernels, as offsets from MpRowCol[row].
IntVec support(kernelSize);
std::vector<IntVec> kernelCoords(kernelSize);
for (int idx = 0; idx < kernelSize; idx++)
{
kernelCoords[idx].resize(dimCount);
int ivSrc = 0;
int factor = 1;
int cur = idx;
for (size_t i = 0; i < dimCount; i++)
{
assert(cur >= 0);
int d = (int)m_kernelShape[i];
assert(d > 0);
int coord = cur % d;
cur /= d;
kernelCoords[idx][i] = coord;
ivSrc += factor * coord;
factor *= (int)m_inputShape[i];
}
assert(cur == 0);
assert(ivSrc < m_inputShape.GetNumElements());
support[idx] = ivSrc - m_originIndex;
}
size_t outputSize = m_outputShape.GetNumElements();
// Compute the mappings (where row = output node index, col = source node index):
// * from row to the index of the first weight to use for that row.
// * from row to the first input col. The rest are col + _support[i].
m_mpRowIwht.resize(outputSize);
m_mpRowCol.resize(outputSize);
m_mpRowRun.resize(outputSize);
m_mpRowIndices.resize(outputSize);
// A "key" is an equivalence class of run/masks.
// Calculate the key for an interior cell (for using all of support - when all masks are 1's).
int keyInterior = 0;
for (size_t i = 0; i < dimCount; i++)
{
int width = (int)m_kernelShape[i];
keyInterior = keyInterior * width + (width - 1) / 2;
}
m_runs.resize(2 * kernelSize + 2, -1);
m_indices.resize(kernelSize + 1);
m_runs[0] = 0; // Skip count
m_runs[1] = (int)kernelSize; // Count of entries
m_indices[0] = (int)kernelSize;
for (size_t i = 0; i < kernelSize; i++)
{
m_runs[2 + i] = support[i];
m_indices[1 + i] = support[i];
}
// Working buffer for masks.
IntVec masks(kernelSize);
// Map from key to pair of starting locations in Runs and Indices.
std::map<int, std::pair<int, int>> mpkeystarts;
mpkeystarts[keyInterior] = std::make_pair(0, 0);
IntVec dkey(dimCount);
for (size_t row = 0; row < outputSize; row++)
{
// Compute the kernel number, column, and key.
// REVIEW alexeyk: Seems like there should be a simpler and faster way, without starting
// from scratch for each output (row)....
int kern = 0;
int col = 0;
int factorKern = 1;
int factorCol = 1;
int key = 0;
int cur = (int)row;
for (size_t i = 0; i < dimCount; i++)
{
int dim = (int)(m_outputShape[i] / GetMapCount(i));
int coord = cur % dim;
cur /= dim;
// Kernel
if (!GetSharing(i))
{
kern += factorKern * coord;
factorKern *= dim;
}
int maps = (int)GetMapCount(i);
if (maps > 1)
{
kern += factorKern * (cur % maps);
cur /= maps;
factorKern *= maps;
}
// Transform coord to input index space.
coord *= (int)GetStride(i);
coord += m_start[i];
col += factorCol * coord;
factorCol *= (int)m_inputShape[i];
int width = (int)m_kernelShape[i];
int half = (width - 1) / 2;
int min = coord - half;
int lim = min + width;
if (min < 0)
dkey[i] = min;
else if (lim > m_inputShape[i])
dkey[i] = lim - (int)m_inputShape[i];
else
dkey[i] = 0;
int dk = dkey[i] + half;
assert(0 <= dk);
assert(dk < width);
key = key * width + dk;
}
assert(cur == 0);
assert(0 <= kern);
assert(kern < m_kernelCount);
assert(0 <= col);
assert(col < m_inputShape.GetNumElements());
auto startsIter = mpkeystarts.find(key);
if (startsIter == mpkeystarts.end())
{
auto starts = std::make_pair((int)m_runs.size(), (int)m_indices.size());
mpkeystarts[key] = starts;
int indexCount = 0;
for (int idx = 0; idx < kernelSize; idx++)
{
const auto& coords = kernelCoords[idx];
int mask = 0;
for (int i = (int)dimCount; ; )
{
if (--i < 0)
{
// All OK.
mask = -1;
break;
}
int k = dkey[i] + coords[i];
if (k < 0)
break;
if (k >= m_kernelShape[i])
break;
}
assert(mask == 0 || mask == -1);
indexCount -= mask;
masks[idx] = mask;
}
int skip = 0;
while (masks[skip] == 0)
skip++;
int count = (int)kernelSize;
while (masks[count - 1] == 0)
count--;
count -= skip;
m_runs.push_back(skip); // Skip count
m_runs.push_back(count); // Count of entries
m_indices.push_back(indexCount);
for (int i = 0, iMin = 0; i < count; i++)
{
int index = support[skip + i];
int mask = masks[skip + i];
if (mask != 0)
{
// Add "index" to runs for this slot and any immediately preceeding
// slots that have mask == 0.
assert(iMin <= i);
assert(m_runs.size() == starts.first + 2 + iMin);
for (; iMin <= i; iMin++)
m_runs.push_back(index);
assert(iMin == i + 1);
assert(m_runs.size() == starts.first + 2 + iMin);
m_indices.push_back(index);
}
}
for (int i = 0; i < count; i++)
m_runs.push_back(masks[skip + i]);
assert(m_runs.size() == std::get<0>(starts) + 2 + 2 * count);
assert(m_indices.size() == std::get<1>(starts) + 1 + indexCount);
m_mpRowRun[row] = starts.first;
m_mpRowIndices[row] = starts.second;
}
else
{
m_mpRowRun[row] = (*startsIter).second.first;
m_mpRowIndices[row] = (*startsIter).second.second;
}
assert(0 <= kern);
assert(kern < m_kernelCount);
m_mpRowCol[row] = col;
m_mpRowIwht[row] = kern * (int)kernelSize;
}
}
size_t GetStride(size_t dim) const
{
assert(m_stride.size() == 1 || dim < m_stride.size());
return m_stride[m_stride.size() == 1 ? 0 : dim];
}
size_t GetMapCount(size_t dim) const
{
assert(m_mapCount.size() == 1 || dim < m_mapCount.size());
// If the whole map count tensor was specified explicitly, return the requested component.
if (m_mapCount.size() > 1)
return m_mapCount[dim];
// If the map count tensor rank == 1, assume it represents the number of feature maps for the rightmost dimension.
if (dim == m_inputShape.size() - 1)
return m_mapCount[0];
return 1;
}
bool GetSharing(size_t dim) const
{
assert(m_sharing.size() == 1 || dim < m_sharing.size());
return m_sharing[m_sharing.size() == 1 ? 0 : dim];
}
bool GetAutoPad(size_t dim) const
{
assert(m_autoPad.size() == 1 || dim < m_autoPad.size());
return m_autoPad[m_autoPad.size() == 1 ? 0 : dim];
}
int GetLowerPad(size_t dim) const
{
if (!GetAutoPad(dim))
return (int)m_lowerPad[m_lowerPad.size() == 1 ? 0 : dim];
int kernSize = (int)m_kernelShape[dim];
int inpSize = (int)m_inputShape[dim];
int outSize = (int)m_outputShape[dim];
int stride = (int)GetStride(dim);
// Taken from computation in ConvolveGeometry ctor.
// Number of cells between first and last "centers", inclusive.
int cells = (outSize - 1) * stride + 1;
// Extra cells, to the left and right of "cells".
int extra = inpSize - cells;
int center = extra / 2;
return -(center - (kernSize - 1) / 2);
}
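// Worked example of the auto-pad computation above, assuming a typical "same" configuration:
// kernSize = 5, inpSize = outSize = 28, stride = 1 gives cells = (28 - 1) * 1 + 1 = 28, extra = 0,
// center = 0, so the returned lower pad is -(0 - (5 - 1) / 2) = 2.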
static TensorShape ComputeOutputShape(const TensorShape& inputShape, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& stride,
const BoolVec& sharing, const BoolVec& autoPad, const TensorShape& lowerPad, const TensorShape& upperPad)
{
if (inputShape.GetRank() != kernelShape.GetRank())
InvalidArgument("Convolution input and kernel tensors must have the same rank.");
if (mapCount.GetRank() != 1 && inputShape.GetRank() != mapCount.GetRank())
InvalidArgument("Convolution map tensor must have rank 1 or the same as the input tensor.");
if (stride.GetRank() != 1 && inputShape.GetRank() != stride.GetRank())
InvalidArgument("Convolution stride tensor must have rank 1 or the same as the input tensor.");
if (sharing.size() != 1 && inputShape.GetRank() != sharing.size())
InvalidArgument("Convolution sharing tensor must have rank 1 or the same as the input tensor.");
if (autoPad.size() != 1 && inputShape.GetRank() != autoPad.size())
InvalidArgument("Convolution padding tensor must have rank 1 or the same as the input tensor.");
if (lowerPad.GetRank() != 1 && inputShape.GetRank() != lowerPad.GetRank())
InvalidArgument("Convolution lower pad tensor must have rank 1 or the same as the input tensor.");
if (upperPad.GetRank() != 1 && inputShape.GetRank() != upperPad.GetRank())
InvalidArgument("Convolution upper pad tensor must have rank 1 or the same as the input tensor.");
SmallVector<size_t> dimsOutput(inputShape.GetRank());
for (size_t i = 0; i < inputShape.GetRank(); i++)
{
assert(inputShape[i] >= 1);
if (kernelShape[i] > inputShape[i])
InvalidArgument("Convolution operation requires that kernel dim %d <= input dim %d.", (int)kernelShape[i], (int)inputShape[i]);
size_t delta = stride[stride.GetRank() == 1 ? 0 : i];
size_t dim = inputShape[i];
bool autoPadCur = autoPad[autoPad.size() == 1 ? 0 : i];
size_t lo = lowerPad[lowerPad.size() == 1 ? 0 : i];
size_t hi = upperPad[upperPad.size() == 1 ? 0 : i];
if (autoPadCur)
{
dim += kernelShape[i] - 1;
}
else
{
dim += lo + hi;
}
size_t dimOut = (dim - kernelShape[i]) / delta + 1;
// When LowerPad and/or UpperPad are specified (i.e. > 0), we insist that the kernel applications
// fill the entire space.
if (!autoPadCur && (lo > 0 || hi > 0))
{
size_t size = (dimOut - 1) * delta + kernelShape[i];
if (size != dim)
InvalidArgument("Convolution requires that kernel fills the entire space if auto-padding is disabled.");
}
if (mapCount.size() > 1)
dimOut *= mapCount[i];
else if (i == inputShape.GetRank() - 1)
dimOut *= mapCount[0];
dimsOutput[i] = dimOut;
}
auto dimsOut = TensorShape(dimsOutput);
// Check the output dimensions.
size_t mapCountTotal = mapCount.GetNumElements();
size_t sizeOut = dimsOut.GetNumElements();
assert((sizeOut % mapCountTotal) == 0);
UNUSED(mapCountTotal);
UNUSED(sizeOut);
return dimsOut;
}
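// Worked example of the formula above, assuming a 28 x 28 x 1 input, a 5 x 5 x 1 kernel, stride 1,
// auto-padding in every dimension and a map count of 16 on the last dimension:
//   dims 0, 1: dim = 28 + (5 - 1) = 32, dimOut = (32 - 5) / 1 + 1 = 28
//   dim 2:     dim = 1 + (1 - 1) = 1,  dimOut = (1 - 1) / 1 + 1 = 1, then dimOut *= mapCount[0] = 16
// giving an output shape of 28 x 28 x 16.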
// Used in unit tests and during debugging.
operator std::string() const
{
std::ostringstream res;
res << "Input: " << (string)InputShape();
res << ", Output: " << (string)OutputShape();
res << ", Kernel: " << (string)KernelShape();
res << ", Map: " << (string)MapCount();
res << ", Stride: " << (string)Stride();
res << ", Sharing: (";
std::copy(begin(Sharing()), end(Sharing()) - 1, std::ostream_iterator<bool>(res, ", "));
res << Sharing().back() << ")";
res << ", AutoPad: (";
std::copy(begin(AutoPad()), end(AutoPad()) - 1, std::ostream_iterator<bool>(res, ", "));
res << AutoPad().back() << ")";
res << ", LowerPad: " << (string)LowerPad();
res << ", UpperPad: " << (string)UpperPad();
return res.str();
}
DISABLE_COPY_AND_MOVE(ConvolveGeometry);
private:
TensorShape m_inputShape;
TensorShape m_outputShape;
TensorShape m_kernelShape;
TensorShape m_mapCount;
TensorShape m_stride;
BoolVec m_sharing;
BoolVec m_autoPad;
TensorShape m_lowerPad;
TensorShape m_upperPad;
// There are several reasons why int type is used here rather than size_t:
// 1. Many of these vectors contain offsets which can be negative.
// 2. Most of these vectors will be copied into device memory (GPU) so the smaller the size - the better.
// Also, 64-bit operations are slower on GPU.
// 3. If you are still not convinced, we don't expect convolutions to be more than 2B in size anyway.
// See description to corresponding getter functions to understand what these are.
IntVec m_mpRowCol;
IntVec m_mpRowIwht;
IntVec m_mpRowRun;
IntVec m_runs;
IntVec m_mpRowIndices;
IntVec m_indices;
// The indices of the first ("top-left-most") "kernel-center" cell in the source.
IntVec m_start;
int m_startIndex;
// When the first kernel cell is aligned with the first source cell, this is the index of the input cell that
// is aligned with the "kernel-center" cell. Indices in "Runs" and "Indices" are relative to OriginIndex.
int m_originIndex;
size_t m_kernelCount;
};
using ConvolveGeometryPtr = std::shared_ptr<ConvolveGeometry>;
} } }

View file

@ -0,0 +1,173 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "CuDnnFactories.h"
#include "BatchNormalizationEngine.h"
#include "CuDnnCommon.h"
#include "GPUMatrix.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
class CuDnnBatchNormEngine : public BatchNormEngine<ElemType>
{
public:
using Base = BatchNormEngine<ElemType>;
using typename Base::Mat;
public:
CuDnnBatchNormEngine(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout)
: Base(deviceId, inOutT, spatial, imageLayout),
m_cudnn(CuDnn::Instance()),
m_inOutCuDnnT(GetInOutTensor(inOutT), CuDnnTensor::GetDataType<ElemType>()),
m_scaleBiasCuDnnT(GetScaleBiasTensor(inOutT, spatial), CuDnnTensor::GetDataType<ElemType>())
{
}
protected:
using Base::m_deviceId;
using Base::m_imageLayout;
using Base::m_inOutT;
using Base::m_spatial;
void EnsureCompatible() override
{
if (m_spatial && m_imageLayout == ImageLayoutKind::HWC)
InvalidArgument("cuDNN batch normalization supports only cudnn(CHW) layout.");
if (m_inOutT.GetRank() > 4)
InvalidArgument("cuDNN batch normalization supports tensors of max 4 dimensions.");
}
void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override
{
// REVIEW alexeyk: there might be a way to do this in cuDNN.
if (blendFactor != 0 && (blendFactor != 1 || expAvgFactor > 0))
InvalidArgument("cuDNN batch normalization engine currently supports blendTimeConstant of 0 or 1 only.");
m_inOutCuDnnT.UpdateBatchSize(in.GetNumCols());
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
// cuDNN will fail with BAD_PARAM if epsilon < CUDNN_BN_MIN_EPSILON.
epsilon = max(epsilon, CUDNN_BN_MIN_EPSILON);
// expAvgFactor == 0 && blendFactor == 1 means we are in eval mode.
if (expAvgFactor == 0 && blendFactor == 1)
{
CUDNN_CALL(cudnnBatchNormalizationForwardInference(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(out),
m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runInvStdDev), epsilon));
}
else
{
CUDNN_CALL(cudnnBatchNormalizationForwardTraining(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in),
m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev),
epsilon, ptr(saveMean), ptr(saveInvStdDev)));
}
}
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) override
{
m_inOutCuDnnT.UpdateBatchSize(srcGrad.GetNumCols());
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
// REVIEW alexeyk: remove once Philly is upgraded to prod version. Also change betaParamDiff to 1 and update CNTK BN engine.
#if CUDNN_PATCHLEVEL >= 7
CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
#else
CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
#endif
}
private:
static ElemType* ptr(Mat& src)
{
return src.BufferPointer();
}
static const ElemType* ptr(const Mat& src)
{
return src.BufferPointer();
}
static TensorShape GetInOutTensor(const TensorShape& inOutT)
{
// cuDNN supports only 3D and 4D tensors (in cuDNN docs it's 4D and 5D due to the N dimension)
// even for non-spatial inputs, so expand the tensor if needed.
if (inOutT.GetRank() > 2)
return inOutT;
SmallVector<size_t> v(std::max(inOutT.GetRank(), (size_t)3), 1);
for (size_t i = 0; i < inOutT.GetRank(); i++)
v[i] = inOutT[i];
return TensorShape(v);
}
static TensorShape GetScaleBiasTensor(const TensorShape& inOutT, bool spatial)
{
if (!spatial)
return GetInOutTensor(inOutT);
const auto& t = GetInOutTensor(inOutT);
SmallVector<size_t> v(t.GetRank(), 1);
v[v.size() - 1] = t[t.GetRank() - 1];
return TensorShape(v);
}
private:
using C = Consts<ElemType>;
CuDnn::ptr_t m_cudnn;
CuDnnTensor m_inOutCuDnnT;
CuDnnTensor m_scaleBiasCuDnnT;
};
template class CuDnnBatchNormEngine<float>;
template class CuDnnBatchNormEngine<double>;
template <typename ElemType>
std::unique_ptr<BatchNormEngine<ElemType>> CuDnnBatchNormEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout)
{
return std::make_unique<CuDnnBatchNormEngine<ElemType>>(deviceId, inOutT, spatial, imageLayout);
}
template class CuDnnBatchNormEngineFactory<float>;
template class CuDnnBatchNormEngineFactory<double>;
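// Minimal usage sketch, assuming spatial (per-channel) batch normalization over a 28 x 28 x 32 CHW
// tensor; the TensorShape initializer below is hypothetical shorthand for that shape:
//   auto bnEngine = CuDnnBatchNormEngineFactory<float>::Create(/*deviceId=*/0, TensorShape({28, 28, 32}),
//                                                              /*spatial=*/true, ImageLayoutKind::CHW);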
CudaTimer::~CudaTimer()
{
// TODO: Should not throw if std::uncaught_exception()
if (m_start != nullptr)
CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_start)));
if (m_stop != nullptr)
CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_stop)));
}
void CudaTimer::Start()
{
cudaEvent_t start;
cudaEvent_t stop;
if (m_start != nullptr)
CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_start)));
if (m_stop != nullptr)
CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_stop)));
CUDA_CALL(cudaEventCreate(&start));
CUDA_CALL(cudaEventCreate(&stop));
m_start = start;
m_stop = stop;
CUDA_CALL(cudaEventRecord(start, GetStream()));
}
void CudaTimer::Stop()
{
CUDA_CALL(cudaEventRecord(reinterpret_cast<cudaEvent_t>(m_stop), GetStream()));
CUDA_CALL(cudaEventSynchronize(reinterpret_cast<cudaEvent_t>(m_stop)));
}
float CudaTimer::Elapsed()
{
float ms;
CUDA_CALL(cudaEventElapsedTime(&ms, reinterpret_cast<cudaEvent_t>(m_start), reinterpret_cast<cudaEvent_t>(m_stop)));
return ms;
}
} } }

108
Source/Math/CuDnnCommon.cpp Normal file
View file

@ -0,0 +1,108 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "GPUMatrix.h"
#include "CuDnnCommon.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <>
const float Consts<float>::One = 1;
template <>
const double Consts<double>::One = 1;
template <>
const float Consts<float>::Zero = 0;
template <>
const double Consts<double>::Zero = 0;
CuDnnTensor::CuDnnTensor(const TensorShape& src, cudnnDataType_t dataType)
: m_tensor(nullptr)
{
CUDNN_CALL(cudnnCreateTensorDescriptor(&m_tensor));
// Set cuDNN tensor dimensions. cuDNN uses row-major format while TensorShape is column-major,
// so conversion is required. The N dimension will be set to 1.
const auto& stridesSrc = src.GetStrides();
SmallVector<int> dims(src.GetRank() + 1);
SmallVector<int> strides(stridesSrc.size() + 1);
assert(dims.size() == strides.size());
for (int i = 0; i < src.GetRank(); i++)
{
dims[dims.size() - 1 - i] = (int)src[i];
strides[dims.size() - 1 - i] = (int)stridesSrc[i];
}
// Set "minibatch"(aka N) dimension.
dims[0] = 1;
strides[0] = strides[1] * dims[1];
CUDNN_CALL(cudnnSetTensorNdDescriptor(m_tensor, dataType, (int)dims.size(), dims.data(), strides.data()));
}
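// Worked example of the conversion above, assuming a column-major TensorShape of [W=7, H=5, C=3]
// with strides {1, 7, 35}: the resulting cuDNN descriptor gets dims {N=1, C=3, H=5, W=7} and
// strides {105, 35, 7, 1}, where 105 = strides[1] * dims[1] = 35 * 3.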
CuDnnTensor::~CuDnnTensor()
{
if (m_tensor != nullptr)
{
cudnnDestroyTensorDescriptor(m_tensor);
m_tensor = nullptr;
}
}
void CuDnnTensor::UpdateBatchSize(size_t batchSize)
{
// Currently cuDNN supports only 2D and 3D convolutions anyway (so max 5D tensors).
const int MaxDims = 5;
int dims[MaxDims];
int strides[MaxDims];
int nbDims = 0;
cudnnDataType_t dataType;
// According to NVIDIA, Get/Set functions are very fast so it's safe to call them in a loop.
CUDNN_CALL(cudnnGetTensorNdDescriptor(m_tensor, MaxDims, &dataType, &nbDims, dims, strides));
assert(nbDims <= MaxDims);
dims[0] = (int)batchSize;
CUDNN_CALL(cudnnSetTensorNdDescriptor(m_tensor, dataType, nbDims, dims, strides));
}
template <typename ElemType>
cudnnDataType_t CuDnnTensor::GetDataType()
{
if (typeid(ElemType) == typeid(float))
return CUDNN_DATA_FLOAT;
else if (typeid(ElemType) == typeid(double))
return CUDNN_DATA_DOUBLE;
else
InvalidArgument("cuDNN engine currently supports only single and double precision data types.");
}
template cudnnDataType_t CuDnnTensor::GetDataType<float>();
template cudnnDataType_t CuDnnTensor::GetDataType<double>();
CuDnn::ptr_t CuDnn::Instance()
{
auto createNew = []()
{
int deviceId;
CUDA_CALL(cudaGetDevice(&deviceId));
cudaDeviceProp props = {0};
if (cudaGetDeviceProperties(&props, deviceId) != cudaSuccess || props.major < 3)
RuntimeError("cuDNN requires device with compute capability 3.0 or higher.");
cudnnHandle_t* cudnn = new cudnnHandle_t;
CUDNN_CALL(cudnnCreate(cudnn));
CUDNN_CALL(cudnnSetStream(*cudnn, GetStream()));
return cudnn;
};
static std::shared_ptr<cudnnHandle_t> m_instance = std::shared_ptr<cudnnHandle_t>(createNew(), [](cudnnHandle_t* src)
{
assert(*src != nullptr);
auto err = cudnnDestroy(*src);
assert(err == CUDNN_STATUS_SUCCESS);
#ifdef NDEBUG
UNUSED(err);
#endif
delete src;
});
return m_instance;
}
} } }

49
Source/Math/CuDnnCommon.h Normal file
View file

@ -0,0 +1,49 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "Basics.h"
#include "TensorShape.h"
#include <cudnn.h>
#include <memory>
namespace Microsoft { namespace MSR { namespace CNTK {
class CuDnnTensor final
{
public:
CuDnnTensor(const TensorShape& src, cudnnDataType_t dataType);
~CuDnnTensor();
void UpdateBatchSize(size_t batchSize);
operator cudnnTensorDescriptor_t() const { return m_tensor; }
template <typename ElemType>
static cudnnDataType_t GetDataType();
DISABLE_COPY_AND_MOVE(CuDnnTensor);
private:
cudnnTensorDescriptor_t m_tensor;
};
struct CuDnn final
{
using ptr_t = std::shared_ptr<cudnnHandle_t>;
static ptr_t Instance();
DISABLE_COPY_AND_MOVE(CuDnn);
};
template <typename ElemType>
struct Consts
{
static const ElemType Zero;
static const ElemType One;
};
} } }
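// Minimal usage sketch, assuming the shape below is written as a hypothetical initializer list;
// CuDnn::Instance() returns a shared_ptr to the cudnnHandle_t and CuDnnTensor converts implicitly
// to cudnnTensorDescriptor_t:
//   auto cudnn = CuDnn::Instance();
//   CuDnnTensor inT(TensorShape({28, 28, 1}), CuDnnTensor::GetDataType<float>());
//   inT.UpdateBatchSize(64); // set the N dimension for the current minibatch
//   // *cudnn and inT can now be passed directly to cuDNN calls.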

View file

@ -4,11 +4,11 @@
//
#include "stdafx.h"
#include "CuDnnConvolutionEngine.h"
#include "CuDnnFactories.h"
#include "GPUMatrix.h"
#ifdef USE_CUDNN
#include <cudnn.h>
#include "CuDnnConvolutionEngine.cuh"
#include <typeinfo>
#include <typeindex>
#include "CuDnnCommon.h"
template <>
const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t x)
@ -16,287 +16,177 @@ const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t x)
return cudnnGetErrorString(x);
}
// A note on the formats: CNTK originally used NHWC for input/output tensors and CHWN for filters.
// A note on the formats: CNTK originally used NHWC for input/output tensors and CHWN for kernels.
// Such formats have very limited support in cuDNN and not used in other frameworks.
// CNTK with cuDNN by default uses NCHW formats for both inputs/outputs and filters.
// CNTK with cuDNN by default uses NCHW formats for both inputs/outputs and kernels.
#define TENSOR_FORMAT CUDNN_TENSOR_NCHW
#define FILTER_FORMAT CUDNN_TENSOR_NCHW
#endif
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE deviceId)
{
// REVIEW alexeyk: compile-time for now, make runtime, config-driven.
#ifdef USE_CUDNN
cudaDeviceProp props = {0};
return cudaGetDeviceProperties(&props, deviceId) == cudaSuccess && props.major >= 3;
#else
UNUSED(deviceId);
return false;
#endif
}
CudaTimer::~CudaTimer()
{
// TODO: Should not throw if std::uncaught_exception()
if (m_start != nullptr)
CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_start)));
if (m_stop != nullptr)
CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_stop)));
}
void CudaTimer::Start()
{
cudaEvent_t start;
cudaEvent_t stop;
if (m_start != nullptr)
CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_start)));
if (m_stop != nullptr)
CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_stop)));
CUDA_CALL(cudaEventCreate(&start));
CUDA_CALL(cudaEventCreate(&stop));
m_start = start;
m_stop = stop;
CUDA_CALL(cudaEventRecord(start, GetStream()));
}
void CudaTimer::Stop()
{
CUDA_CALL(cudaEventRecord(reinterpret_cast<cudaEvent_t>(m_stop), GetStream()));
CUDA_CALL(cudaEventSynchronize(reinterpret_cast<cudaEvent_t>(m_stop)));
}
float CudaTimer::Elapsed()
{
float ms;
CUDA_CALL(cudaEventElapsedTime(&ms, reinterpret_cast<cudaEvent_t>(m_start), reinterpret_cast<cudaEvent_t>(m_stop)));
return ms;
}
#ifdef USE_CUDNN
static bool IsGpu(DEVICEID_TYPE deviceId)
{
return deviceId >= 0;
}
class CuDnnTensor4D : public ConvolutionTensor4D
class CuDnnKernel
{
public:
CuDnnTensor4D(size_t w, size_t h, size_t c, size_t n, cudnnDataType_t dataType)
: ConvolutionTensor4D(w, h, c, n), m_dataType(dataType), m_tensor(nullptr)
CuDnnKernel(const ConvolveGeometry& geometry, cudnnDataType_t dataType)
: m_kernel(nullptr)
{
CUDNN_CALL(cudnnCreateTensorDescriptor(&m_tensor));
CUDNN_CALL(cudnnSetTensor4dDescriptor(m_tensor, TENSOR_FORMAT, dataType,
static_cast<int>(n), static_cast<int>(c), static_cast<int>(h), static_cast<int>(w)));
CUDNN_CALL(cudnnCreateFilterDescriptor(&m_kernel));
// Set cuDNN kernel dimensions. cuDNN uses row-major format while TensorShape is column-major,
// so conversion is required.
const auto& filt = geometry.KernelShape();
size_t mapCount = geometry.GetMapCount(geometry.InputShape().GetRank() - 1);
if (mapCount != geometry.MapCount().GetNumElements())
InvalidArgument("cuDNN does not support map tensor of this configuration.");
SmallVector<int> dims(filt.GetRank() + 1);
for (int i = 0; i < filt.GetRank(); i++)
dims[dims.size() - 1 - i] = (int)filt[i];
// Set map count(aka K) dimension.
dims[0] = (int)mapCount;
CUDNN_CALL(cudnnSetFilterNdDescriptor_v4(m_kernel, dataType, FILTER_FORMAT, (int)dims.size(), dims.data()));
}
public:
operator cudnnTensorDescriptor_t() const
~CuDnnKernel()
{
return m_tensor;
}
~CuDnnTensor4D() noexcept
{
if (m_tensor != nullptr)
if (m_kernel != nullptr)
{
// TODO: Check for error code and throw if !std::uncaught_exception()
cudnnDestroyTensorDescriptor(m_tensor);
m_tensor = nullptr;
cudnnDestroyFilterDescriptor(m_kernel);
m_kernel = nullptr;
}
}
void setN(size_t newN) override
{
ConvolutionTensor4D::setN(newN);
CUDNN_CALL(cudnnSetTensor4dDescriptor(m_tensor, TENSOR_FORMAT, m_dataType,
static_cast<int>(n()), static_cast<int>(c()), static_cast<int>(h()), static_cast<int>(w())));
}
private:
cudnnDataType_t m_dataType;
cudnnTensorDescriptor_t m_tensor;
};
class CuDnnFilter : public ConvolutionFilter
{
public:
CuDnnFilter(size_t w, size_t h, size_t c, size_t k, cudnnDataType_t dataType)
: ConvolutionFilter(w, h, c, k), m_filter(nullptr)
{
CUDNN_CALL(cudnnCreateFilterDescriptor(&m_filter));
CUDNN_CALL(cudnnSetFilter4dDescriptor_v4(m_filter, dataType, FILTER_FORMAT,
static_cast<int>(k), static_cast<int>(c), static_cast<int>(h), static_cast<int>(w)));
}
public:
operator cudnnFilterDescriptor_t() const
{
return m_filter;
return m_kernel;
}
~CuDnnFilter() noexcept
{
if (m_filter != nullptr)
{
// TODO: Check for error code and throw if !std::uncaught_exception()
cudnnDestroyFilterDescriptor(m_filter);
m_filter = nullptr;
}
}
DISABLE_COPY_AND_MOVE(CuDnnKernel);
private:
cudnnFilterDescriptor_t m_filter;
cudnnFilterDescriptor_t m_kernel;
};
class CuDnnConvolutionDescriptor : public ConvolutionDescriptor
class CuDnnConv
{
public:
CuDnnConvolutionDescriptor(size_t wStride, size_t hStride, size_t wPad, size_t hPad)
: ConvolutionDescriptor(wStride, hStride, wPad > 0 || hPad > 0), m_conv(nullptr)
CuDnnConv(const ConvolveGeometry& geometry, cudnnDataType_t dataType)
: m_conv(nullptr)
{
CUDNN_CALL(cudnnCreateConvolutionDescriptor(&m_conv));
CUDNN_CALL(cudnnSetConvolution2dDescriptor(m_conv,
static_cast<int>(hPad), static_cast<int>(wPad),
static_cast<int>(hStride), static_cast<int>(wStride),
1, 1, CUDNN_CROSS_CORRELATION));
// Set cuDNN convolution parameters. cuDNN uses row-major format while TensorShape is column-major,
// so conversion is required. Also, for 2D convolutions (which have 3D tensor shapes)
// cuDNN uses 2D descriptors and for 3D convolutions 3D descriptors, so we need to ignore the
// rightmost dimension in ConvolveGeometry tensors.
SmallVector<int> stride(geometry.InputShape().GetRank() - 1);
SmallVector<int> pad(stride.size());
for (int i = 0; i < stride.size(); i++)
{
stride[stride.size() - 1 - i] = (int)geometry.GetStride(i);
pad[stride.size() - 1 - i] = geometry.GetLowerPad(i);
}
SmallVector<int> upscale(stride.size(), 1);
CUDNN_CALL(cudnnSetConvolutionNdDescriptor(m_conv, (int)stride.size(), pad.data(),
stride.data(), upscale.data(),
CUDNN_CROSS_CORRELATION, dataType));
}
public:
operator cudnnConvolutionDescriptor_t() const
{
return m_conv;
}
~CuDnnConvolutionDescriptor() noexcept
~CuDnnConv()
{
if (m_conv != nullptr)
{
// TODO: Check for error code and throw if !std::uncaught_exception()
cudnnDestroyConvolutionDescriptor(m_conv);
m_conv = nullptr;
}
}
operator cudnnConvolutionDescriptor_t() const
{
return m_conv;
}
DISABLE_COPY_AND_MOVE(CuDnnConv);
private:
cudnnConvolutionDescriptor_t m_conv;
};
class CuDnnPoolingDescriptor : public PoolingDescriptor
class CuDnnPool
{
public:
CuDnnPoolingDescriptor(PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad)
: PoolingDescriptor(kind, w, h, wStride, hStride, wPad, hPad), m_pool(nullptr)
CuDnnPool(const ConvolveGeometry& geometry, PoolKind kind)
: m_pool(nullptr)
{
assert(kind == PoolKind::Max || kind == PoolKind::Average);
CUDNN_CALL(cudnnCreatePoolingDescriptor(&m_pool));
CUDNN_CALL(cudnnSetPooling2dDescriptor(m_pool,
// Set cuDNN pooling parameters. cuDNN uses row-major format while TensorShape is column-major,
// so conversion is required. As in the convolution descriptor, cuDNN uses 2D descriptors
// for 3D inputs.
SmallVector<int> dims(geometry.InputShape().GetRank() - 1);
SmallVector<int> stride(dims.size());
SmallVector<int> pad(stride.size());
int j = (int)dims.size() - 1;
for (int i = 0; i < stride.size(); i++, j--)
{
dims[j] = (int)geometry.KernelShape()[i];
stride[j] = (int)geometry.GetStride(i);
pad[j] = geometry.GetLowerPad(i);
}
// Must use CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING to get the same results as in reference engine.
CUDNN_CALL(cudnnSetPoolingNdDescriptor(m_pool,
kind == PoolKind::Max ? CUDNN_POOLING_MAX : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING,
static_cast<int>(h), static_cast<int>(w),
static_cast<int>(hPad), static_cast<int>(wPad),
static_cast<int>(hStride), static_cast<int>(wStride)));
(int)dims.size(), dims.data(), pad.data(), stride.data()));
}
public:
operator cudnnPoolingDescriptor_t() const
{
return m_pool;
}
~CuDnnPoolingDescriptor() noexcept
~CuDnnPool()
{
if (m_pool != nullptr)
{
// TODO: Check for error code and throw if !std::uncaught_exception()
cudnnDestroyPoolingDescriptor(m_pool);
m_pool = nullptr;
}
}
operator cudnnPoolingDescriptor_t() const
{
return m_pool;
}
DISABLE_COPY_AND_MOVE(CuDnnPool);
private:
cudnnPoolingDescriptor_t m_pool;
};
template <typename CuDnnT, typename In>
static CuDnnT& As(In& src)
{
// Do dynamic_cast only in debug builds and static_cast in release builds.
assert(dynamic_cast<CuDnnT*>(&src) != nullptr);
return static_cast<CuDnnT&>(src);
}
static const CuDnnTensor4D& t(const ConvolutionTensor4D& src)
{
return As<const CuDnnTensor4D>(src);
}
static const CuDnnFilter& f(const ConvolutionFilter& src)
{
return As<const CuDnnFilter>(src);
}
static const CuDnnConvolutionDescriptor& cd(const ConvolutionDescriptor& src)
{
return As<const CuDnnConvolutionDescriptor>(src);
}
static const CuDnnPoolingDescriptor& p(const PoolingDescriptor& src)
{
return As<const CuDnnPoolingDescriptor>(src);
}
template <typename ElemType>
static ElemType* ptr(Matrix<ElemType>& src)
{
return src.BufferPointer();
}
template <typename ElemType>
static const ElemType* ptr(const Matrix<ElemType>& src)
{
return src.BufferPointer();
}
template <typename ElemType>
struct Consts
{
static const ElemType Zero;
static const ElemType One;
};
template <>
const float Consts<float>::One = 1;
template <>
const double Consts<double>::One = 1;
template <>
const float Consts<float>::Zero = 0;
template <>
const double Consts<double>::Zero = 0;
template <typename ElemType>
template <class ElemType>
class CuDnnConvolutionEngine : public ConvolutionEngine<ElemType>
{
public:
using Base = ConvolutionEngine<ElemType>;
using typename Base::Mat;
using typename Base::Tensor4D;
using typename Base::Filter;
using typename Base::ConvDesc;
CuDnnConvolutionEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl)
: Base(deviceId, imageLayout), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_bnImpl(bnImpl), m_stream(GetStream()), m_cudnn(nullptr)
public:
CuDnnConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout,
size_t maxTempMemSizeInSamples, PoolKind poolKind)
: Base(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind),
m_cudnn(CuDnn::Instance()),
m_dataType(CuDnnTensor::GetDataType<ElemType>()),
m_inT(geometry->InputShape(), m_dataType),
m_outT(geometry->OutputShape(), m_dataType)
{
CUDNN_CALL(cudnnCreate(&m_cudnn));
CUDNN_CALL(cudnnSetStream(m_cudnn, m_stream));
}
~CuDnnConvolutionEngine()
{
if (m_cudnn != nullptr)
{
// TODO: Check for error code and throw if !std::uncaught_exception()
cudnnDestroy(m_cudnn);
m_cudnn = nullptr;
}
}
protected:
using Base::m_geometry;
using Base::m_deviceId;
using Base::m_imageLayout;
using Base::m_maxTempMemSizeInSamples;
using Base::m_poolKind;
void EnsureCompatible() override
{
@ -306,26 +196,39 @@ protected:
RuntimeError("cuDNN convolution engine supports GPU devices only.");
}
void ForwardCore(const Tensor4D& inT, const Mat& in, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
const Tensor4D& outT, Mat& out, Mat& workspace) override
void EnsureConvolutionInitialized() override
{
// Find best algo and allocate temp buffer, if needed.
auto finder = [&](int& calgo, cudnnConvolutionFwdAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
if (m_kernelT == nullptr)
{
return cudnnFindConvolutionForwardAlgorithm(m_cudnn, t(inT), f(filterT), cd(convDesc), t(outT), MaxAlgoCount, &calgo, algoPerf);
m_kernelT = std::make_unique<CuDnnKernel>(*m_geometry, m_dataType),
m_conv = std::make_unique<CuDnnConv>(*m_geometry, m_dataType);
}
}
void ForwardCore(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace) override
{
size_t batchSize = in.GetNumCols();
// Find best algo and allocate temp buffer, if needed.
auto finder = [this](int& calgo, cudnnConvolutionFwdAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
{
return cudnnFindConvolutionForwardAlgorithm(*m_cudnn, m_inT, *m_kernelT, *m_conv, m_outT, MaxAlgoCount, &calgo, algoPerf);
};
FindBestAlgo(t(inT), m_fwdAlgo, finder);
auto staticFinder = [this](cudnnConvolutionFwdAlgo_t& algo) -> cudnnStatus_t
{
return cudnnGetConvolutionForwardAlgorithm(*m_cudnn, m_inT, *m_kernelT, *m_conv, m_outT, CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, 0, &algo);
};
FindBestAlgo(batchSize, m_fwdAlgo, finder, staticFinder);
if (m_fwdAlgo.Algo.memory > 0)
workspace.Resize((m_fwdAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
// Perform forward convolution operation.
auto err = cudnnConvolutionForward(m_cudnn, &C::One, t(inT), ptr(in), f(filterT), ptr(filter), cd(convDesc),
m_fwdAlgo.Algo.algo, ptr(workspace), m_fwdAlgo.Algo.memory, &C::Zero, t(outT), ptr(out));
auto err = cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv,
m_fwdAlgo.Algo.algo, ptr(workspace), m_fwdAlgo.Algo.memory, &C::Zero, m_outT, ptr(out));
// There might be a case where cuDNN fails due to the workspace being too small; in that case, try using the no-workspace algo instead.
// REVIEW alexeyk: NVIDIA is currently reviewing this issue.
if (CUDNN_STATUS_INVALID_VALUE == err && m_fwdAlgo.Algo.memory > 0)
{
auto err2 = cudnnConvolutionForward(m_cudnn, &C::One, t(inT), ptr(in), f(filterT), ptr(filter), cd(convDesc),
m_fwdAlgo.NoWorkspaceAlgo, nullptr, 0, &C::Zero, t(outT), ptr(out));
auto err2 = cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv,
m_fwdAlgo.NoWorkspaceAlgo, nullptr, 0, &C::Zero, m_outT, ptr(out));
// Update original error in case of success.
if (CUDNN_STATUS_SUCCESS == err2)
err = CUDNN_STATUS_SUCCESS;
@ -333,128 +236,104 @@ protected:
CUDNN_CALL(err);
}
void BackwardDataCore(const Tensor4D& srcGradT, const Mat& srcGrad, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
const Tensor4D& gradT, Mat& grad, Mat& workspace) override
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
{
size_t batchSize = srcGrad.GetNumCols();
// Find best algo and allocate temp buffer, if needed.
auto finder = [&](int& calgo, cudnnConvolutionBwdDataAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
auto finder = [this](int& calgo, cudnnConvolutionBwdDataAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
{
return cudnnFindConvolutionBackwardDataAlgorithm(m_cudnn, f(filterT), t(srcGradT), cd(convDesc), t(gradT), MaxAlgoCount, &calgo, algoPerf);
return cudnnFindConvolutionBackwardDataAlgorithm(*m_cudnn, *m_kernelT, m_outT, *m_conv, m_inT, MaxAlgoCount, &calgo, algoPerf);
};
FindBestAlgo(t(srcGradT), m_backDataAlgo, finder);
auto staticFinder = [this](cudnnConvolutionBwdDataAlgo_t& algo) -> cudnnStatus_t
{
return cudnnGetConvolutionBackwardDataAlgorithm(*m_cudnn, *m_kernelT, m_outT, *m_conv, m_inT, CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE, 0, &algo);
};
FindBestAlgo(batchSize, m_backDataAlgo, finder, staticFinder);
if (m_backDataAlgo.Algo.memory > 0)
workspace.Resize((m_backDataAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
// Compute gradients with respect to the output tensor (data).
CUDNN_CALL(cudnnConvolutionBackwardData(m_cudnn, &C::One, f(filterT), ptr(filter), t(srcGradT), ptr(srcGrad), cd(convDesc), m_backDataAlgo.Algo.algo,
ptr(workspace), m_backDataAlgo.Algo.memory, &C::One, t(gradT), ptr(grad)));
CUDNN_CALL(cudnnConvolutionBackwardData(*m_cudnn, &C::One, *m_kernelT, ptr(kernel), m_outT, ptr(srcGrad), *m_conv, m_backDataAlgo.Algo.algo,
ptr(workspace), m_backDataAlgo.Algo.memory, &C::One, m_inT, ptr(grad)));
}
void BackwardFilterCore(const Tensor4D& srcGradT, const Mat& srcGrad, const Tensor4D& inT, const Mat& in, const ConvDesc& convDesc,
const Filter& filterT, Mat& filter, bool /*allowReuse*/, Mat& workspace) override
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& workspace) override
{
size_t batchSize = in.GetNumCols();
// Find best algo and allocate temp buffer, if needed.
auto finder = [&](int& calgo, cudnnConvolutionBwdFilterAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
auto finder = [this](int& calgo, cudnnConvolutionBwdFilterAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
{
return cudnnFindConvolutionBackwardFilterAlgorithm(m_cudnn, t(inT), t(srcGradT), cd(convDesc), f(filterT), MaxAlgoCount, &calgo, algoPerf);
return cudnnFindConvolutionBackwardFilterAlgorithm(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, MaxAlgoCount, &calgo, algoPerf);
};
FindBestAlgo(t(inT), m_backFiltAlgo, finder);
auto staticFinder = [this](cudnnConvolutionBwdFilterAlgo_t& algo) -> cudnnStatus_t
{
return cudnnGetConvolutionBackwardFilterAlgorithm(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE, 0, &algo);
};
FindBestAlgo(batchSize, m_backFiltAlgo, finder, staticFinder);
if (m_backFiltAlgo.Algo.memory > 0)
workspace.Resize((m_backFiltAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
// Compute gradients with respect to the output tensor (data).
CUDNN_CALL(cudnnConvolutionBackwardFilter(m_cudnn, &C::One, t(inT), ptr(in), t(srcGradT), ptr(srcGrad), cd(convDesc), m_backFiltAlgo.Algo.algo,
ptr(workspace), m_backFiltAlgo.Algo.memory, &C::One, f(filterT), ptr(filter)));
CUDNN_CALL(cudnnConvolutionBackwardFilter(*m_cudnn, &C::One, m_inT, ptr(in), m_outT, ptr(srcGrad), *m_conv, m_backFiltAlgo.Algo.algo,
ptr(workspace), m_backFiltAlgo.Algo.memory, &C::One, *m_kernelT, ptr(kernelGrad)));
}
void EnsureCompatibleBatchNorm(bool spatial) override
void EnsurePoolingInitialized() override
{
if (!IsGpu(m_deviceId))
InvalidArgument("cuDNN engine does not support batch normalization on CPUs.");
if (spatial && m_imageLayout != ImageLayoutKind::CHW)
InvalidArgument("cuDNN engine batch normalization currently supports only CHW data layout for convolutional nodes.");
if (m_pool == nullptr)
m_pool = std::make_unique<CuDnnPool>(*m_geometry, m_poolKind);
}
void NormalizeBatchCore(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out,
double epsilon, Mat& saveMean, Mat& saveInvStdDev) override
void ForwardPoolingCore(const Mat& in, Mat& out) override
{
if (m_bnImpl == BatchNormImpl::CuDnn)
{
cudnnBatchNormMode_t mode = spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
// cuDNN will fail with BAD_PARAM if epsilon < CUDNN_BN_MIN_EPSILON.
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
CUDNN_CALL(cudnnBatchNormalizationForwardTraining(m_cudnn, mode, &C::One, &C::Zero, t(inT), ptr(in), t(inT), ptr(out),
t(scaleBiasT), ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev),
epsilon, ptr(saveMean), ptr(saveInvStdDev)));
}
else if (m_bnImpl == BatchNormImpl::Cntk)
{
epsilon = std::max(epsilon, 1e-9);
CUDA_CALL(BatchNormalizationForwardTraining(inT, spatial, ptr(in), ptr(out), ptr(scale), ptr(bias),
expAvgFactor, ptr(runMean), ptr(runInvStdDev),
epsilon, ptr(saveMean), ptr(saveInvStdDev), m_stream));
}
else
RuntimeError("Provided batch norm implementation (%d) is not supported.", m_bnImpl);
size_t batchSize = in.GetNumCols();
m_inT.UpdateBatchSize(batchSize);
m_outT.UpdateBatchSize(batchSize);
CUDNN_CALL(cudnnPoolingForward(*m_cudnn, *(m_pool), &C::One, m_inT, ptr(in), &C::Zero, m_outT, ptr(out)));
}
void NormalizeBatchInferenceCore(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, const Mat& runMean, const Mat& runInvStdDev, Mat& out) override
void BackwardPoolingCore(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad) override
{
if (m_bnImpl == BatchNormImpl::CuDnn)
{
cudnnBatchNormMode_t mode = spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
CUDNN_CALL(cudnnBatchNormalizationForwardInference(m_cudnn, mode, &C::One, &C::Zero, t(inT), ptr(in), t(inT), ptr(out),
t(scaleBiasT), ptr(scale), ptr(bias), ptr(runMean), ptr(runInvStdDev), CUDNN_BN_MIN_EPSILON));
}
else if (m_bnImpl == BatchNormImpl::Cntk)
{
CUDA_CALL(BatchNormalizationForwardInference(inT, spatial, ptr(in), ptr(out), ptr(scale), ptr(bias),
ptr(runMean), ptr(runInvStdDev), m_stream));
}
else
RuntimeError("Provided batch norm implementation (%d) is not supported.", m_bnImpl);
}
void BackwardNormalizeBatchCore(const Tensor4D& inT, const Mat& in, const Mat& srcGrad, Mat& grad,
const Tensor4D& scaleBiasT, const Mat& scale, bool spatial, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) override
{
if (m_bnImpl == BatchNormImpl::CuDnn)
{
cudnnBatchNormMode_t mode = spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
// REVIEW alexeyk: remove once Philly is upgraded to prod version.
#if CUDNN_PATCHLEVEL >= 7
CUDNN_CALL(cudnnBatchNormalizationBackward(m_cudnn, mode, &C::One, &C::One, &C::One, &C::One, t(inT), ptr(in), t(inT), ptr(srcGrad), t(inT), ptr(grad),
t(scaleBiasT), ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
#else
CUDNN_CALL(cudnnBatchNormalizationBackward(m_cudnn, mode, &C::One, &C::One, t(inT), ptr(in), t(inT), ptr(srcGrad), t(inT), ptr(grad),
t(scaleBiasT), ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
#endif
}
else if (m_bnImpl == BatchNormImpl::Cntk)
{
CUDA_CALL(BatchNormalizationBackward(inT, spatial, ptr(in), ptr(srcGrad), ptr(grad), ptr(scale), ptr(scaleGrad), ptr(biasGrad),
ptr(saveMean), ptr(saveInvStdDev), m_stream));
}
else
RuntimeError("Provided batch norm implementation (%d) is not supported.", m_bnImpl);
size_t batchSize = in.GetNumCols();
m_inT.UpdateBatchSize(batchSize);
m_outT.UpdateBatchSize(batchSize);
CUDNN_CALL(cudnnPoolingBackward(*m_cudnn, *(m_pool), &C::One, m_outT, ptr(out), m_outT, ptr(srcGrad),
m_inT, ptr(in), &C::One, m_inT, ptr(grad)));
}
private:
using C = Consts<ElemType>;
static const int MaxAlgoCount = 10;
template <typename TAlgo, typename TFinder>
void FindBestAlgo(const CuDnnTensor4D& t, TAlgo& algo, TFinder finder)
template <typename TAlgo, typename TFinder, typename TStaticFinder>
void FindBestAlgo(size_t batchSize, TAlgo& algo, TFinder finder, TStaticFinder staticFinder)
{
if (!algo.NeedAutotuning(t))
if (!algo.NeedAutotuning(batchSize))
return;
m_inT.UpdateBatchSize(batchSize);
m_outT.UpdateBatchSize(batchSize);
using CuDnnAlgoT = decltype(TAlgo::Algo);
CuDnnAlgoT algoPerf[MaxAlgoCount];
int calgo = 0;
CUDNN_CALL(finder(calgo, algoPerf));
cudnnStatus_t err = finder(calgo, algoPerf);
// Alloc failed - usually means the cuDNN runtime auto-tuner could not allocate a workspace.
// In such a case, use the static auto-tuner with no workspace.
if (err == CUDNN_STATUS_ALLOC_FAILED)
{
decltype(CuDnnAlgoT::algo) noMemAlgo;
CUDNN_CALL(staticFinder(noMemAlgo));
algo.CurMBSize = batchSize;
algo.Algo = algoPerf[0];
algo.Algo.algo = noMemAlgo;
algo.Algo.memory = 0;
algo.Algo.status = CUDNN_STATUS_SUCCESS;
algo.NoWorkspaceAlgo = noMemAlgo;
return;
}
CUDNN_CALL(err);
assert(calgo > 0);
size_t maxMem = m_maxTempMemSizeInSamples == 0 ? (std::numeric_limits<size_t>::max)() : t.w() * t.h() * t.c() * m_maxTempMemSizeInSamples * sizeof(ElemType);
size_t inputSampleSize = m_geometry->InputShape().GetNumElements();
size_t maxMem = m_maxTempMemSizeInSamples == 0 ? (std::numeric_limits<size_t>::max)() : inputSampleSize * m_maxTempMemSizeInSamples * sizeof(ElemType);
// Find best (fastest) algorithm which satisfies workspace requirements.
auto res = std::find_if(algoPerf, algoPerf + calgo,
[=](const CuDnnAlgoT& cur)
{
@ -462,8 +341,9 @@ private:
});
if (res == algoPerf + calgo)
RuntimeError("cuDNN could not find suitable algorithm for the current convolution configuration.");
algo.CurMBSize = t.n();
algo.CurMBSize = batchSize;
algo.Algo = *res;
// Find fastest algorithm that does NOT require workspace. It is used as a fallback algo in Forward function.
res = std::find_if(algoPerf, algoPerf + calgo,
[](const CuDnnAlgoT& cur)
{
@ -478,6 +358,15 @@ private:
algo.NoWorkspaceAlgo = (*res).algo;
}
static ElemType* ptr(Mat& src)
{
return src.BufferPointer();
}
static const ElemType* ptr(const Mat& src)
{
return src.BufferPointer();
}
private:
template <typename T>
struct ConvAlgoInfo
@ -495,7 +384,7 @@ private:
T Algo;
CuDnnAlgoT NoWorkspaceAlgo;
bool NeedAutotuning(const CuDnnTensor4D& t)
bool NeedAutotuning(size_t batchSize)
{
// Need to re-run auto-tuner in case minibatch size is increased.
// If minibatch size is decreased we assume that previously selected algorithm requires less or the same amount of workspace.
@ -504,186 +393,57 @@ private:
// We also need to reset auto-tuning status at the beginning of each epoch but ComputationNode currently does not provide such notification.
// We assume no other dimensions of tensors can change so we don't check it.
// REVIEW alexeyk: review once we get response from NVIDIA.
return (Algo.status != CUDNN_STATUS_SUCCESS || t.n() > CurMBSize);
return (Algo.status != CUDNN_STATUS_SUCCESS || batchSize > CurMBSize);
}
};
using C = Consts<ElemType>;
CuDnn::ptr_t m_cudnn;
cudnnDataType_t m_dataType;
CuDnnTensor m_inT;
CuDnnTensor m_outT;
// Convolution specific.
std::unique_ptr<CuDnnKernel> m_kernelT;
std::unique_ptr<CuDnnConv> m_conv;
// Pooling specific.
std::unique_ptr<CuDnnPool> m_pool;
// REVIEW alexeyk: currently limit is set once in ctor though in CNTK it can be, theoretically, changed in runtime.
size_t m_maxTempMemSizeInSamples;
BatchNormImpl m_bnImpl;
cudnnHandle_t m_cudnn;
cudaStream_t m_stream;
ConvAlgoInfo<cudnnConvolutionFwdAlgoPerf_t> m_fwdAlgo;
ConvAlgoInfo<cudnnConvolutionBwdDataAlgoPerf_t> m_backDataAlgo;
ConvAlgoInfo<cudnnConvolutionBwdFilterAlgoPerf_t> m_backFiltAlgo;
};
template <class ElemType>
class CuDnnPoolingEngine : public PoolingEngine<ElemType>
std::unique_ptr<ConvolutionEngine<ElemType>> CuDnnConvolutionEngineFactory<ElemType>::Create(ConvolveGeometryPtr geometry,
DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout,
size_t maxTempMemSizeInSamples, PoolKind poolKind)
{
public:
using Base = PoolingEngine<ElemType>;
using typename Base::Tensor4D;
using typename Base::PoolDesc;
using typename Base::Mat;
public:
CuDnnPoolingEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout)
: Base(deviceId, imageLayout), m_cudnn(nullptr)
{
CUDNN_CALL(cudnnCreate(&m_cudnn));
CUDNN_CALL(cudnnSetStream(m_cudnn, GetStream()));
}
~CuDnnPoolingEngine()
{
if (m_cudnn != nullptr)
{
// TODO: Check for error code and throw if !std::uncaught_exception()
cudnnDestroy(m_cudnn);
m_cudnn = nullptr;
}
}
protected:
using Base::m_deviceId;
using Base::m_imageLayout;
void EnsureCompatible() override
{
if (m_imageLayout != ImageLayoutKind::CHW)
RuntimeError("cuDNN pooling engine supports only CHW/cudnn layout.");
if (!IsGpu(m_deviceId))
RuntimeError("cuDNN pooling engine supports GPU devices only.");
}
void ForwardCore(const Tensor4D& inT, const Mat& in, const PoolDesc& poolDesc, const Tensor4D& outT, Mat& out) override
{
CUDNN_CALL(cudnnPoolingForward(m_cudnn, p(poolDesc), &C::One, t(inT), ptr(in), &C::Zero, t(outT), ptr(out)));
}
void BackwardCore(const Tensor4D& outT, const Mat& out, const Mat& srcGrad, const PoolDesc& poolDesc, const Tensor4D& inT, const Mat& in, Mat& grad) override
{
CUDNN_CALL(cudnnPoolingBackward(m_cudnn, p(poolDesc), &C::One, t(outT), ptr(out), t(outT), ptr(srcGrad),
t(inT), ptr(in), &C::One, t(inT), ptr(grad)));
}
private:
using C = Consts<ElemType>;
cudnnHandle_t m_cudnn;
};
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::Tensor4DPtr CuDnnConvolutionEngineFactory<ElemType>::CreateTensor(size_t w, size_t h, size_t c, size_t n)
{
// REVIEW alexeyk: assert fires in GCC but not in VC++.
// static_assert(false, "cuDNN engine currently supports only single and double precision tensors.");
RuntimeError("Not implemented.");
}
template <>
typename CuDnnConvolutionEngineFactory<float>::Tensor4DPtr CuDnnConvolutionEngineFactory<float>::CreateTensor(size_t w, size_t h, size_t c, size_t n)
{
return std::make_unique<CuDnnTensor4D>(w, h, c, n, CUDNN_DATA_FLOAT);
}
template <>
typename CuDnnConvolutionEngineFactory<double>::Tensor4DPtr CuDnnConvolutionEngineFactory<double>::CreateTensor(size_t w, size_t h, size_t c, size_t n)
{
return std::make_unique<CuDnnTensor4D>(w, h, c, n, CUDNN_DATA_DOUBLE);
return std::make_unique<CuDnnConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::FilterPtr CuDnnConvolutionEngineFactory<ElemType>::CreateFilter(size_t w, size_t h, size_t c, size_t k)
bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE deviceId, ConvolveGeometryPtr geometry, PoolKind poolKind)
{
// REVIEW alexeyk: assert fires in GCC but not in VC++.
// static_assert(false, "cuDNN engine currently supports only single and double precision filters.");
RuntimeError("Not implemented.");
}
template <>
typename CuDnnConvolutionEngineFactory<float>::FilterPtr CuDnnConvolutionEngineFactory<float>::CreateFilter(size_t w, size_t h, size_t c, size_t k)
{
return std::make_unique<CuDnnFilter>(w, h, c, k, CUDNN_DATA_FLOAT);
}
template <>
typename CuDnnConvolutionEngineFactory<double>::FilterPtr CuDnnConvolutionEngineFactory<double>::CreateFilter(size_t w, size_t h, size_t c, size_t k)
{
return std::make_unique<CuDnnFilter>(w, h, c, k, CUDNN_DATA_DOUBLE);
}
// REVIEW alexeyk: IsSupported check should be performed by cuDNN itself. Is there a good way to do that?
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvDescriptor(
const Tensor4D& /*inT*/, const Filter& filterT, size_t wStride, size_t hStride, bool padding)
{
size_t wPad = padding ? filterT.w() / 2 : 0;
size_t hPad = padding ? filterT.h() / 2 : 0;
return std::make_unique<CuDnnConvolutionDescriptor>(wStride, hStride, wPad, hPad);
cudaDeviceProp props = {0};
if (cudaGetDeviceProperties(&props, deviceId) != cudaSuccess || props.major < 3)
return false;
const auto& input = geometry->InputShape();
const auto& kernel = geometry->KernelShape();
const auto& sharing = geometry->Sharing();
const auto& mapCount = geometry->MapCount();
// cuDNN supports 2D and 3D convolutions at the moment with full sharing.
// In case map count size > 1, then it should have all ones except last dimension.
// If pooling is requested, then cuDNN supports only 2D/3D inputs and 2D pooling kernels.
return (input.GetRank() <= 4 &&
std::find(begin(sharing), end(sharing), false) == sharing.end() &&
mapCount.GetNumElements() == mapCount[mapCount.GetRank() - 1] &&
(poolKind == PoolKind::None ||
input.GetRank() <= 3 && (kernel.GetRank() < 3 || kernel[2] == 1)));
}
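In words: cuDNN is only chosen when the geometry is something it can express, that is an input tensor of rank at most 4, kernel sharing in every dimension, a map count that is 1 everywhere except possibly the last dimension, and, when pooling is requested, an input of rank at most 3 with an effectively 2D pooling kernel; otherwise the caller presumably falls back to CNTK's own engine. A rough restatement of the same check on plain shape vectors (hypothetical helper, not part of the CNTK API):
#include <algorithm>
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>
bool LooksCuDnnSupported(const std::vector<std::size_t>& inputShape,  // e.g. {W, H, C}
                         const std::vector<std::size_t>& kernelShape,
                         const std::vector<bool>& sharing,
                         const std::vector<std::size_t>& mapCount,    // assumed non-empty
                         bool pooling)
{
    bool fullSharing = std::find(sharing.begin(), sharing.end(), false) == sharing.end();
    std::size_t mapElems = std::accumulate(mapCount.begin(), mapCount.end(),
                                           std::size_t(1), std::multiplies<std::size_t>());
    bool flatMapCount = mapElems == mapCount.back();  // all ones except the last dimension
    bool poolingOk = !pooling ||
                     (inputShape.size() <= 3 && (kernelShape.size() < 3 || kernelShape[2] == 1));
    return inputShape.size() <= 4 && fullSharing && flatMapCount && poolingOk;
}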
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolDescriptor(
typename PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad)
{
return std::make_unique<CuDnnPoolingDescriptor>(kind, w, h, wStride, hStride, wPad, hPad);
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvEngine(
DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl)
{
return std::make_unique<CuDnnConvolutionEngine<ElemType>>(deviceId, imageLayout, maxTempMemSizeInSamples, bnImpl);
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolEngine(
DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout)
{
return std::make_unique<CuDnnPoolingEngine<ElemType>>(deviceId, imageLayout);
}
#else
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::Tensor4DPtr CuDnnConvolutionEngineFactory<ElemType>::CreateTensor(size_t, size_t, size_t, size_t)
{
RuntimeError("The code is compiled without USE_CUDNN macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::FilterPtr CuDnnConvolutionEngineFactory<ElemType>::CreateFilter(size_t, size_t, size_t, size_t)
{
RuntimeError("The code is compiled without USE_CUDNN macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvDescriptor(
const Tensor4D&, const Filter&, size_t, size_t, bool)
{
RuntimeError("The code is compiled without USE_CUDNN macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolDescriptor(
typename PoolDesc::PoolKind, size_t, size_t, size_t, size_t, size_t, size_t)
{
RuntimeError("The code is compiled without USE_CUDNN macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvEngine(DEVICEID_TYPE, ImageLayoutKind, size_t, BatchNormImpl)
{
RuntimeError("The code is compiled without USE_CUDNN macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolEngine(DEVICEID_TYPE, ImageLayoutKind)
{
RuntimeError("The code is compiled without USE_CUDNN macro.");
}
#endif
template class CuDnnConvolutionEngineFactory<float>;
template class CuDnnConvolutionEngineFactory<double>;
} } }

Diff not shown because of its large size. Load diff

View file

@ -1,61 +0,0 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "ConvolutionEngine.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
class CuDnnConvolutionEngineFactory : public ConvolutionEngineFactory<ElemType>
{
public:
using Base = ConvolutionEngineFactory<ElemType>;
using typename Base::Tensor4D;
using typename Base::Tensor4DPtr;
using typename Base::Filter;
using typename Base::FilterPtr;
using typename Base::ConvDesc;
using typename Base::ConvDescPtr;
using typename Base::PoolDesc;
using typename Base::PoolDescPtr;
using typename Base::ConvEnginePtr;
using typename Base::PoolEnginePtr;
public:
Tensor4DPtr CreateTensor(size_t w, size_t h, size_t c, size_t n) override;
FilterPtr CreateFilter(size_t w, size_t h, size_t c, size_t k) override;
ConvDescPtr CreateConvDescriptor(const Tensor4D& inT, const Filter& filterT,
size_t wStride, size_t hStride, bool padding) override;
PoolDescPtr CreatePoolDescriptor(typename PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) override;
ConvEnginePtr CreateConvEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl) override;
PoolEnginePtr CreatePoolEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout) override;
static bool IsSupported(DEVICEID_TYPE deviceId);
};
// REVIEW alexeyk: wrong place. It is currently used only in unit tests but I can't add it there because of the build issues.
// Timer that can be used to measure CUDA calls.
// Uses CUDA event and will synchronize(!) the stream when Stop is called.
class MATH_API CudaTimer
{
public:
CudaTimer(): m_start(nullptr), m_stop(nullptr)
{
}
~CudaTimer();
void Start();
void Stop();
float Elapsed();
DISABLE_COPY_AND_MOVE(CudaTimer);
private:
void* m_start;
void* m_stop;
};
} } }

View file

@ -0,0 +1,51 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "ConvolutionEngine.h"
#include "BatchNormalizationEngine.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
class CuDnnConvolutionEngineFactory
{
public:
static std::unique_ptr<ConvolutionEngine<ElemType>> Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
PoolKind poolKind);
static bool IsSupported(DEVICEID_TYPE deviceId, ConvolveGeometryPtr geometry, PoolKind poolKind);
};
template <class ElemType>
class CuDnnBatchNormEngineFactory
{
public:
static std::unique_ptr<BatchNormEngine<ElemType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout);
};
// REVIEW alexeyk: wrong place? It is currently used only in unit tests but I can't add it there because of the build issues.
// Timer that can be used to measure CUDA calls.
// Uses CUDA event and will synchronize(!) the stream when Stop is called.
class MATH_API CudaTimer
{
public:
CudaTimer(): m_start(nullptr), m_stop(nullptr)
{
}
~CudaTimer();
void Start();
void Stop();
float Elapsed();
DISABLE_COPY_AND_MOVE(CudaTimer);
private:
void* m_start;
void* m_stop;
};
} } }

View file

@ -24,6 +24,8 @@
#include "cublas_v2.h"
#include <assert.h>
#include <memory>
#include "CntkBatchNormalization.cuh"
#include "Convolution.cuh"
#pragma comment(lib, "cudart.lib") // instruct linker to reference these libs
#pragma comment(lib, "cublas.lib")
@ -145,7 +147,7 @@ AllocatedElemType* TracingGPUMemoryAllocator::Allocate(int deviceId, size_t numE
}
AllocatedElemType* deviceBufferPtr = AllocateNoTrace<AllocatedElemType>(deviceId, numElements);
if (IsTraceEnabled())
{
fprintf(stderr, "Allocated DeviceBufferPointer = %p\n", (void*)deviceBufferPtr);
@ -3001,6 +3003,178 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddAveragePoolingGradient(const GPUMat
#pragma endregion Other helper functions
template <class ElemType>
void GPUMatrix<ElemType>::ConvolutionForward(const GPUMatrix<ElemType>& kernel, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& output) const
{
const int BlockSize = 128;
auto gdim = dim3((output.GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
PrepareDevice();
SyncGuard syncGuard;
kConvolutionForward<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), kernel.m_pArray, mpRowCol.m_pArray, mpRowIwht.m_pArray, mpRowRun.m_pArray,
runs.m_pArray, m_pArray, (int)GetNumRows(), output.m_pArray, (int)output.GetNumRows());
}
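The launch configuration is the same in all of these kernels: grid.x covers the output rows with 128-thread blocks via ceiling division, and grid.y indexes minibatch columns, capped at 65535 because that is the CUDA limit on the y dimension of a grid (for larger minibatches the kernel presumably strides over the remaining columns). A small sketch of the arithmetic:
#include <algorithm>
#include <cstddef>
// Ceiling-division grid sizing used by the launches above.
inline unsigned GridDimX(std::size_t rows, unsigned blockSize = 128)
{
    return static_cast<unsigned>((rows + blockSize - 1) / blockSize);
}
inline unsigned GridDimY(std::size_t cols)
{
    return static_cast<unsigned>(std::min<std::size_t>(cols, 65535)); // CUDA grid.y limit
}
// Example: 4096 output rows and a minibatch of 256 columns -> a (32, 256) grid of 128-thread blocks.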
template <class ElemType>
void GPUMatrix<ElemType>::ConvolutionBackwardData(const GPUMatrix<ElemType>& kernel, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& grad) const
{
const int BlockSize = 128;
auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
PrepareDevice();
SyncGuard syncGuard;
kConvolutionBackwardData<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), kernel.m_pArray, mpRowCol.m_pArray, mpRowIwht.m_pArray, mpRowRun.m_pArray,
runs.m_pArray, m_pArray, (int)GetNumRows(), grad.m_pArray, (int)grad.GetNumRows());
}
template <class ElemType>
void GPUMatrix<ElemType>::ConvolutionBackwardKernel(const GPUMatrix<ElemType>& in, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& kernelGrad) const
{
const int BlockSize = 128;
auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
PrepareDevice();
SyncGuard syncGuard;
kConvolutionBackwardKernel<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), (int)in.GetNumRows(), (int)GetNumRows(),
in.m_pArray, mpRowCol.m_pArray, mpRowIwht.m_pArray, mpRowRun.m_pArray,
runs.m_pArray, m_pArray, kernelGrad.m_pArray);
}
template <class ElemType>
void GPUMatrix<ElemType>::MaxPoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const
{
const int BlockSize = 128;
auto gdim = dim3((output.GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
PrepareDevice();
SyncGuard syncGuard;
kMaxPoolingForward<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), mpRowCol.m_pArray, mpRowIndices.m_pArray, indices.m_pArray,
m_pArray, (int)GetNumRows(), output.m_pArray, (int)output.GetNumRows());
}
template <class ElemType>
void GPUMatrix<ElemType>::MaxPoolingBackward(const GPUMatrix<ElemType>& out, const GPUMatrix<ElemType>& in,
const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices,
GPUMatrix<ElemType>& grad) const
{
const int BlockSize = 128;
auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
PrepareDevice();
SyncGuard syncGuard;
kMaxPoolingBackward<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), out.m_pArray, in.m_pArray,
mpRowCol.m_pArray, mpRowIndices.m_pArray, indices.m_pArray,
m_pArray, (int)GetNumRows(), grad.m_pArray, (int)grad.GetNumRows());
}
template <class ElemType>
void GPUMatrix<ElemType>::AveragePoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const
{
const int BlockSize = 128;
auto gdim = dim3((output.GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
PrepareDevice();
SyncGuard syncGuard;
kAveragePoolingForward<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), mpRowCol.m_pArray, mpRowIndices.m_pArray, indices.m_pArray,
m_pArray, (int)GetNumRows(), output.m_pArray, (int)output.GetNumRows());
}
template <class ElemType>
void GPUMatrix<ElemType>::AveragePoolingBackward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& grad) const
{
const int BlockSize = 128;
auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
PrepareDevice();
SyncGuard syncGuard;
kAveragePoolingBackward<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), mpRowCol.m_pArray, mpRowIndices.m_pArray, indices.m_pArray,
m_pArray, (int)GetNumRows(), grad.m_pArray, (int)grad.GetNumRows());
}
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runInvStdDev, GPUMatrix<ElemType>& out, double epsilon,
GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const
{
assert((GetNumRows() % scale.GetNumRows()) == 0);
bool spatial = GetNumRows() != scale.GetNumRows();
size_t vectorSize = GetNumRows();
size_t spatialSize = spatial ? (GetNumRows() / scale.GetNumRows()) : 1;
size_t batchSize = GetNumCols();
assert(0 < vectorSize && vectorSize <= std::numeric_limits<int>::max());
assert(0 < batchSize && batchSize <= std::numeric_limits<int>::max());
SyncGuard syncGuard;
// If expAvgFactor == 0 && blendFactor == 1 then we don't need to compute current minibatch statistics.
if (expAvgFactor > 0 || blendFactor < 1)
{
if (spatial)
{
Call<ComputeSpatialBatchMeanAndInvStdDev, ElemType>(spatialSize, vectorSize, spatialSize, batchSize, m_pArray,
expAvgFactor, runMean.m_pArray, runInvStdDev.m_pArray, epsilon,
saveMean.m_pArray, saveInvStdDev.m_pArray, GetStream());
}
else
{
Call<ComputeBatchMeanAndInvStdDev, ElemType>(vectorSize, vectorSize, batchSize, m_pArray,
expAvgFactor, runMean.m_pArray, runInvStdDev.m_pArray, epsilon,
saveMean.m_pArray, saveInvStdDev.m_pArray, GetStream());
}
}
// When:
// blendFactor == 1 - use running mean/var instead of the current minibatch mean/var.
// 0 < blendFactor < 1 - blend running mean/var with mean/var of the current minibatch: saveMean = (1 - blendFactor) * saveMean + blendFactor * runMean
// blendFactor == 0 - use mean/var of the current minibatch.
if (blendFactor < 1)
{
if (blendFactor > 0)
{
// REVIEW alexeyk: can be rolled into NormalizeBatchTraining to save bandwidth.
Scale((ElemType)(1 - blendFactor), saveMean);
ScaleAndAdd((ElemType)blendFactor, runMean, saveMean);
Scale((ElemType)(1 - blendFactor), saveInvStdDev);
ScaleAndAdd((ElemType)blendFactor, runInvStdDev, saveInvStdDev);
}
Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize,
spatial, m_pArray, out.m_pArray, scale.m_pArray, bias.m_pArray,
saveMean.m_pArray, saveInvStdDev.m_pArray, GetStream());
}
else
{
Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize,
spatial, m_pArray, out.m_pArray, scale.m_pArray, bias.m_pArray,
runMean.m_pArray, runInvStdDev.m_pArray, GetStream());
}
}
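The three blendFactor regimes spelled out in the comment boil down to a per-element convex combination of minibatch and running statistics. A scalar sketch of what the Scale/ScaleAndAdd pair computes:
// used = (1 - blendFactor) * minibatchStat + blendFactor * runningStat
//   blendFactor == 0: pure minibatch statistics (plain training)
//   blendFactor == 1: running statistics are used directly (the else branch below),
//                     and minibatch statistics are not computed at all (the check further above)
inline double BlendStatistic(double minibatchStat, double runningStat, double blendFactor)
{
    return (1.0 - blendFactor) * minibatchStat + blendFactor * runningStat;
}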
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale,
const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const
{
assert((GetNumRows() % scale.GetNumRows()) == 0);
bool spatial = GetNumRows() != scale.GetNumRows();
size_t vectorSize = GetNumRows();
size_t spatialSize = spatial ? (GetNumRows() / scale.GetNumRows()) : 1;
size_t batchSize = GetNumCols();
assert(0 < vectorSize && vectorSize <= std::numeric_limits<int>::max());
assert(0 < batchSize && batchSize <= std::numeric_limits<int>::max());
SyncGuard syncGuard;
if (spatial)
{
Call<ComputeSpatialScaleAndBiasGradients, ElemType>(spatialSize, vectorSize, spatialSize, batchSize, in.m_pArray, m_pArray, scaleGrad.m_pArray, biasGrad.m_pArray,
saveMean.m_pArray, saveInvStdDev.m_pArray, GetStream());
}
else
{
Call<ComputeScaleAndBiasGradients, ElemType>(vectorSize, vectorSize, batchSize, in.m_pArray, m_pArray, scaleGrad.m_pArray, biasGrad.m_pArray,
saveMean.m_pArray, saveInvStdDev.m_pArray, GetStream());
}
Call<BackpropagateBatchNormGradients, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial,
in.m_pArray, m_pArray, grad.m_pArray, scale.m_pArray, scaleGrad.m_pArray, biasGrad.m_pArray, saveMean.m_pArray, saveInvStdDev.m_pArray, GetStream());
}
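The spatial and non-spatial paths differ only in which elements share one set of statistics; per statistic, the gradients follow the standard batch-normalization backward formulas, assuming saveInvStdDev holds 1/sigma from the forward pass. A plain single-row reference of that math (a sketch only; the kernels above fuse and parallelize these reductions and may accumulate rather than overwrite the gradients):
#include <cstddef>
// xhat_j = (x_j - mean) * invStd
// dScale = sum_j dy_j * xhat_j,   dBias = sum_j dy_j
// dx_j   = scale * invStd * (dy_j - dBias / N - xhat_j * dScale / N)
void BatchNormBackwardRow(const float* x, const float* dy, std::size_t N,
                          float mean, float invStd, float scale,
                          float* dx, float* dScale, float* dBias)
{
    float sumDy = 0, sumDyXhat = 0;
    for (std::size_t j = 0; j < N; ++j)
    {
        float xhat = (x[j] - mean) * invStd;
        sumDy += dy[j];
        sumDyXhat += dy[j] * xhat;
    }
    *dScale = sumDyXhat;
    *dBias = sumDy;
    for (std::size_t j = 0; j < N; ++j)
    {
        float xhat = (x[j] - mean) * invStd;
        dx[j] = scale * invStd * (dy[j] - sumDy / N - xhat * sumDyXhat / N);
    }
}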
#pragma region Static BLAS Functions
// float/double overloads of cublasSgemm()/cublasDgemm()
static cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* A, int lda, const float* B, int ldb, const float* beta, float* C, int ldc)
@ -4216,6 +4390,9 @@ template void GPUMatrix<char>::SetValue(const char);
template void GPUMatrix<char>::SetValue(const size_t numRows, const size_t numCols, int deviceId, char* pArray, size_t matrixFlags);
template void GPUMatrix<char>::SetValue(GPUMatrix<char> const&);
template GPUMatrix<int>::GPUMatrix(const size_t, const size_t, int, int*, const size_t);
template GPUMatrix<int>::~GPUMatrix();
template int* TracingGPUMemoryAllocator::Allocate<int>(int, size_t);
template size_t* TracingGPUMemoryAllocator::Allocate<size_t>(int, size_t);
template long* TracingGPUMemoryAllocator::Allocate<long>(int, size_t);

View file

@ -45,6 +45,11 @@ typedef struct CUstream_st* cudaStream_t;
#define USE_TIME_BASED_SEED ULONG_MAX
#endif
// Max number of GPUs on a _single_ node.
#ifndef MAX_GPUS
#define MAX_GPUS 16
#endif
// Stream management functions
void MATH_API SetStream(cudaStream_t stream);
cudaStream_t MATH_API GetStream();
@ -100,7 +105,7 @@ class MATH_API GPUMatrix : public BaseMatrix<ElemType>
friend class GPUMatrix;
public:
static const int MaxGpus = 8; // support up to 8 GPUs
static const int MaxGpus = MAX_GPUS;
using BaseMatrix<ElemType>::m_computeDevice;
using BaseMatrix<ElemType>::m_elemSizeAllocated;
using BaseMatrix<ElemType>::m_format;
@ -402,6 +407,27 @@ public:
const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
void ConvolutionForward(const GPUMatrix<ElemType>& kernel, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& output) const;
void ConvolutionBackwardData(const GPUMatrix<ElemType>& kernel, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& grad) const;
void ConvolutionBackwardKernel(const GPUMatrix<ElemType>& in, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& kernelGrad) const;
void MaxPoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const;
void MaxPoolingBackward(const GPUMatrix<ElemType>& out, const GPUMatrix<ElemType>& in,
const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices,
GPUMatrix<ElemType>& grad) const;
void AveragePoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const;
void AveragePoolingBackward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& grad) const;
void BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runInvStdDev, GPUMatrix<ElemType>& out, double epsilon,
GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const;
public:
// static BLAS functions
static void MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& a, const bool transposeA, const GPUMatrix<ElemType>& b, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c);

View file

@ -2644,6 +2644,10 @@ template GPUSparseMatrix<char> GPUSparseMatrix<char>::ColumnSlice(size_t startCo
template GPUMatrix<char> GPUSparseMatrix<char>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;
template GPUSparseMatrix<char>& GPUSparseMatrix<char>::operator=(GPUSparseMatrix<char>&& deepCopy);
template GPUSparseMatrix<int>::GPUSparseMatrix(DEVICEID_TYPE, const MatrixFormat);
template GPUSparseMatrix<int>::~GPUSparseMatrix();
template void GPUSparseMatrix<int>::Resize(const size_t, const size_t, const size_t, const bool, bool);
template <class ElemType>
MATH_API File& operator>>(File& stream, GPUSparseMatrix<ElemType>& us)
{

View file

@ -156,8 +156,10 @@
<ClInclude Include="..\Common\Include\TensorShape.h" />
<ClInclude Include="..\Common\Include\File.h" />
<ClInclude Include="..\Common\Include\fileutil.h" />
<ClInclude Include="BatchNormalizationEngine.h" />
<ClInclude Include="CommonMatrix.h" />
<ClInclude Include="ConvolutionEngine.h" />
<ClInclude Include="ConvolveGeometry.h" />
<ClInclude Include="CPUMatrix.h" />
<ClInclude Include="MatrixQuantizerImpl.h" />
<ClInclude Include="TensorOps.h" />
@ -188,6 +190,7 @@
<ClCompile Include="..\Common\fileutil.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="BatchNormalizationEngine.cpp" />
<ClCompile Include="ConvolutionEngine.cpp" />
<ClCompile Include="CPUSparseMatrix.cpp" />
<ClCompile Include="CUDAPageLockedMemAllocator.cpp" />
@ -212,4 +215,4 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
</Project>

View file

@ -44,6 +44,9 @@
<ClCompile Include="..\Common\ExceptionWithCallStack.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="BatchNormalizationEngine.cpp">
<Filter>BatchNormalization</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="CommonMatrix.h" />
@ -97,6 +100,12 @@
<ClInclude Include="MatrixQuantizerImpl.h">
<Filter>1bitSGD</Filter>
</ClInclude>
<ClInclude Include="ConvolveGeometry.h">
<Filter>Convolution</Filter>
</ClInclude>
<ClInclude Include="BatchNormalizationEngine.h">
<Filter>BatchNormalization</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="GPUMatrix.h">
@ -143,5 +152,8 @@
<Filter Include="1bitSGD">
<UniqueIdentifier>{546cacbd-253e-485b-8c8c-8b9ee0e2f631}</UniqueIdentifier>
</Filter>
<Filter Include="BatchNormalization">
<UniqueIdentifier>{8f982dac-298d-4e48-b060-8e6cba5ff554}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>

View file

@ -143,6 +143,7 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
<ItemGroup>
<ClInclude Include="..\Common\Include\File.h" />
<ClInclude Include="..\Common\Include\fileutil.h" />
<ClInclude Include="CntkBatchNormalization.cuh" />
<ClInclude Include="ColumnQuantizer.h" />
<ClInclude Include="CommonMatrix.h" />
<ClInclude Include="cudabasetypes.h" />
@ -151,11 +152,12 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
<ClInclude Include="cudalatticeops.cu.h" />
<ClInclude Include="cudalatticeops.h" />
<ClInclude Include="cudalib.h" />
<ClInclude Include="CuDnnConvolutionEngine.cuh" />
<ClInclude Include="CuDnnConvolutionEngine.h" />
<ClInclude Include="CuDnnCommon.h" />
<ClInclude Include="CuDnnFactories.h" />
<ClInclude Include="GPUDataTransferer.h" />
<ClInclude Include="GPUTensor.h" />
<ClInclude Include="latticefunctionskernels.h" />
<ClInclude Include="Convolution.cuh" />
<ClInclude Include="TensorOps.h" />
<ClInclude Include="ValueQuantizer.h" />
<None Include="GPUWatcher.h">
@ -170,6 +172,9 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<CudaCompile Include="CuDnnBatchNormalization.cu">
<FileType>CppCode</FileType>
</CudaCompile>
<CudaCompile Include="GPUTensor.cu">
<InterleaveSourceInPTX Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</InterleaveSourceInPTX>
<Keep Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</Keep>
@ -190,6 +195,7 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
<CudaCompile Include="CuDnnConvolutionEngine.cu">
<FileType>CppCode</FileType>
</CudaCompile>
<ClCompile Include="CuDnnCommon.cpp" />
<ClCompile Include="GPUDataTransferer.cpp" />
<ClCompile Include="stdafx.cpp">
<PrecompiledHeader>Create</PrecompiledHeader>

View file

@ -28,6 +28,9 @@
<CudaCompile Include="CuDnnConvolutionEngine.cu">
<Filter>GPU\Convolution</Filter>
</CudaCompile>
<CudaCompile Include="CuDnnBatchNormalization.cu">
<Filter>GPU\BatchNormalization</Filter>
</CudaCompile>
</ItemGroup>
<ItemGroup>
<ClCompile Include="cudalattice.cpp">
@ -45,6 +48,9 @@
<ClCompile Include="..\Common\ExceptionWithCallStack.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="CuDnnCommon.cpp">
<Filter>GPU\CuDnn</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\Common\Include\File.h">
@ -98,8 +104,8 @@
<ClInclude Include="CommonMatrix.h">
<Filter>from Math</Filter>
</ClInclude>
<ClInclude Include="CuDnnConvolutionEngine.h">
<Filter>GPU\Convolution</Filter>
<ClInclude Include="CuDnnFactories.h">
<Filter>GPU\CuDnn</Filter>
</ClInclude>
<ClInclude Include="TensorOps.h">
<Filter>from Math</Filter>
@ -107,7 +113,13 @@
<ClInclude Include="GPUDataTransferer.h">
<Filter>GPU</Filter>
</ClInclude>
<ClInclude Include="CuDnnConvolutionEngine.cuh">
<ClInclude Include="CntkBatchNormalization.cuh">
<Filter>GPU\BatchNormalization</Filter>
</ClInclude>
<ClInclude Include="CuDnnCommon.h">
<Filter>GPU\CuDnn</Filter>
</ClInclude>
<ClInclude Include="Convolution.cuh">
<Filter>GPU\Convolution</Filter>
</ClInclude>
</ItemGroup>
@ -150,5 +162,11 @@
<Filter Include="GPU\Convolution">
<UniqueIdentifier>{3155488f-128f-494e-858d-459b4cc9fab7}</UniqueIdentifier>
</Filter>
<Filter Include="GPU\BatchNormalization">
<UniqueIdentifier>{639ff4b6-39b5-4a5b-8856-ee918eeea91e}</UniqueIdentifier>
</Filter>
<Filter Include="GPU\CuDnn">
<UniqueIdentifier>{05351afa-de95-40c8-830a-d70eede55dc0}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>

View file

@ -3987,6 +3987,189 @@ Matrix<ElemType>& Matrix<ElemType>::AddAveragePoolingGradient(const Matrix<ElemT
#pragma endregion Other Helper Functions
template <class ElemType>
void Matrix<ElemType>::ConvolutionForward(const Matrix<ElemType>& kernel, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowIwht.GetNumCols() == 1);
assert(mpRowRun.GetNumCols() == 1);
assert(runs.GetNumCols() == 1);
DecideAndMoveToRightDevice(*this, output);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->ConvolutionForward(*(kernel.m_CPUMatrix), *(mpRowCol.m_CPUMatrix), *(mpRowIwht.m_CPUMatrix),
*(mpRowRun.m_CPUMatrix), *(runs.m_CPUMatrix), *(output.m_CPUMatrix)),
m_GPUMatrix->ConvolutionForward(*(kernel.m_GPUMatrix), *(mpRowCol.m_GPUMatrix), *(mpRowIwht.m_GPUMatrix),
*(mpRowRun.m_GPUMatrix), *(runs.m_GPUMatrix), *(output.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
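DISPATCH_MATRIX_ON_FLAG picks the implementation matching where the matrix currently lives: the first pair is the dense CPU and dense GPU call, and the two NOT_IMPLEMENTED slots are the sparse CPU and sparse GPU cases, which these new convolution, pooling and batch-norm entry points do not support yet. A simplified picture of that dispatch (hypothetical helper, not the actual macro):
#include <stdexcept>
enum class MatrixLocation { DenseCPU, DenseGPU, SparseCPU, SparseGPU };
template <class CpuFn, class GpuFn>
void DispatchDenseOnly(MatrixLocation loc, CpuFn denseCpu, GpuFn denseGpu)
{
    switch (loc)
    {
    case MatrixLocation::DenseCPU: denseCpu(); break;   // dense CPU path
    case MatrixLocation::DenseGPU: denseGpu(); break;   // dense GPU path
    default: throw std::logic_error("not implemented for sparse matrices");
    }
}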
template <class ElemType>
void Matrix<ElemType>::ConvolutionBackwardData(const Matrix<ElemType>& kernel, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& grad) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowIwht.GetNumCols() == 1);
assert(mpRowRun.GetNumCols() == 1);
assert(runs.GetNumCols() == 1);
DecideAndMoveToRightDevice(*this, grad);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->ConvolutionBackwardData(*(kernel.m_CPUMatrix), *(mpRowCol.m_CPUMatrix), *(mpRowIwht.m_CPUMatrix),
*(mpRowRun.m_CPUMatrix), *(runs.m_CPUMatrix), *(grad.m_CPUMatrix)),
m_GPUMatrix->ConvolutionBackwardData(*(kernel.m_GPUMatrix), *(mpRowCol.m_GPUMatrix), *(mpRowIwht.m_GPUMatrix),
*(mpRowRun.m_GPUMatrix), *(runs.m_GPUMatrix), *(grad.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::ConvolutionBackwardKernel(const Matrix<ElemType>& in, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& kernelGrad) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowIwht.GetNumCols() == 1);
assert(mpRowRun.GetNumCols() == 1);
assert(runs.GetNumCols() == 1);
DecideAndMoveToRightDevice(*this, kernelGrad);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->ConvolutionBackwardKernel(*(in.m_CPUMatrix), *(mpRowCol.m_CPUMatrix), *(mpRowIwht.m_CPUMatrix),
*(mpRowRun.m_CPUMatrix), *(runs.m_CPUMatrix), *(kernelGrad.m_CPUMatrix)),
m_GPUMatrix->ConvolutionBackwardKernel(*(in.m_GPUMatrix), *(mpRowCol.m_GPUMatrix), *(mpRowIwht.m_GPUMatrix),
*(mpRowRun.m_GPUMatrix), *(runs.m_GPUMatrix), *(kernelGrad.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::MaxPoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowIndices.GetNumCols() == 1);
assert(indices.GetNumCols() == 1);
DecideAndMoveToRightDevice(*this, output);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->MaxPoolingForward(*(mpRowCol.m_CPUMatrix), *(mpRowIndices.m_CPUMatrix), *(indices.m_CPUMatrix), *(output.m_CPUMatrix)),
m_GPUMatrix->MaxPoolingForward(*(mpRowCol.m_GPUMatrix), *(mpRowIndices.m_GPUMatrix), *(indices.m_GPUMatrix), *(output.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::MaxPoolingBackward(const Matrix<ElemType>& out, const Matrix<ElemType>& in,
const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices,
Matrix<ElemType>& grad) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowIndices.GetNumCols() == 1);
assert(indices.GetNumCols() == 1);
DecideAndMoveToRightDevice(*this, grad);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->MaxPoolingBackward(*(out.m_CPUMatrix), *(in.m_CPUMatrix),
*(mpRowCol.m_CPUMatrix), *(mpRowIndices.m_CPUMatrix), *(indices.m_CPUMatrix),
*(grad.m_CPUMatrix)),
m_GPUMatrix->MaxPoolingBackward(*(out.m_GPUMatrix), *(in.m_GPUMatrix),
*(mpRowCol.m_GPUMatrix), *(mpRowIndices.m_GPUMatrix), *(indices.m_GPUMatrix),
*(grad.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::AveragePoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowIndices.GetNumCols() == 1);
assert(indices.GetNumCols() == 1);
DecideAndMoveToRightDevice(*this, output);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->AveragePoolingForward(*(mpRowCol.m_CPUMatrix), *(mpRowIndices.m_CPUMatrix), *(indices.m_CPUMatrix), *(output.m_CPUMatrix)),
m_GPUMatrix->AveragePoolingForward(*(mpRowCol.m_GPUMatrix), *(mpRowIndices.m_GPUMatrix), *(indices.m_GPUMatrix), *(output.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::AveragePoolingBackward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& grad) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowIndices.GetNumCols() == 1);
assert(indices.GetNumCols() == 1);
DecideAndMoveToRightDevice(*this, grad);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->AveragePoolingBackward(*(mpRowCol.m_CPUMatrix), *(mpRowIndices.m_CPUMatrix), *(indices.m_CPUMatrix), *(grad.m_CPUMatrix)),
m_GPUMatrix->AveragePoolingBackward(*(mpRowCol.m_GPUMatrix), *(mpRowIndices.m_GPUMatrix), *(indices.m_GPUMatrix), *(grad.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, double expAvgFactor, double blendFactor,
Matrix<ElemType>& runMean, Matrix<ElemType>& runInvStdDev, Matrix<ElemType>& out, double epsilon,
Matrix<ElemType>& saveMean, Matrix<ElemType>& saveInvStdDev) const
{
DecideAndMoveToRightDevice(*this, out);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->BatchNormalizationForward(*(scale.m_CPUMatrix), *(bias.m_CPUMatrix), expAvgFactor, blendFactor,
*(runMean.m_CPUMatrix), *(runInvStdDev.m_CPUMatrix),
*(out.m_CPUMatrix), epsilon, *(saveMean.m_CPUMatrix), *(saveInvStdDev.m_CPUMatrix)),
m_GPUMatrix->BatchNormalizationForward(*(scale.m_GPUMatrix), *(bias.m_GPUMatrix), expAvgFactor, blendFactor,
*(runMean.m_GPUMatrix), *(runInvStdDev.m_GPUMatrix),
*(out.m_GPUMatrix), epsilon, *(saveMean.m_GPUMatrix), *(saveInvStdDev.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
Matrix<ElemType>& scaleGrad, Matrix<ElemType>& biasGrad) const
{
DecideAndMoveToRightDevice(*this, grad);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->BatchNormalizationBackward(*(in.m_CPUMatrix), *(grad.m_CPUMatrix), *(scale.m_CPUMatrix),
*(saveMean.m_CPUMatrix), *(saveInvStdDev.m_CPUMatrix),
*(scaleGrad.m_CPUMatrix), *(biasGrad.m_CPUMatrix)),
m_GPUMatrix->BatchNormalizationBackward(*(in.m_GPUMatrix), *(grad.m_GPUMatrix), *(scale.m_GPUMatrix),
*(saveMean.m_GPUMatrix), *(saveInvStdDev.m_GPUMatrix),
*(scaleGrad.m_GPUMatrix), *(biasGrad.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
#pragma region Static BLAS Functions
template <class ElemType>
@ -5108,4 +5291,6 @@ template void Matrix<char>::SetValue(const Matrix<char>&, MatrixFormat);
template bool Matrix<char>::IsEmpty() const;
template void Matrix<char>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, bool growOnly);
template Matrix<int>::Matrix(const size_t, const size_t, int*, DEVICEID_TYPE, const size_t, const size_t);
}}}

View file

@ -453,6 +453,27 @@ public:
const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
void ConvolutionForward(const Matrix<ElemType>& kernel, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const;
void ConvolutionBackwardData(const Matrix<ElemType>& kernel, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& grad) const;
void ConvolutionBackwardKernel(const Matrix<ElemType>& in, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& kernelGrad) const;
void MaxPoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const;
void MaxPoolingBackward(const Matrix<ElemType>& out, const Matrix<ElemType>& in,
const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices,
Matrix<ElemType>& grad) const;
void AveragePoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const;
void AveragePoolingBackward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& grad) const;
void BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, double expAvgFactor, double blendFactor,
Matrix<ElemType>& runMean, Matrix<ElemType>& runInvStdDev, Matrix<ElemType>& out, double epsilon,
Matrix<ElemType>& saveMean, Matrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
Matrix<ElemType>& scaleGrad, Matrix<ElemType>& biasGrad) const;
public:
// TODO: why are these not static? And why are they here?
ElemType Exp10(ElemType num);

View file

@ -12,7 +12,7 @@
#include "GPUMatrix.h"
#include "GPUSparseMatrix.h"
#include "MatrixQuantizerGPU.h"
#include "CuDnnConvolutionEngine.h"
#include "CuDnnFactories.h"
#include "TensorShape.h"
#include "GPUDataTransferer.h"
@ -676,6 +676,7 @@ void GPUSparseMatrix<ElemType>::CopyBuffer(OutType* outBuffer, const InType* inB
template class MATH_API GPUSparseMatrix<char>;
template class MATH_API GPUSparseMatrix<float>;
template class MATH_API GPUSparseMatrix<double>;
template class MATH_API GPUSparseMatrix<int>;
template <typename ElemType>
MATH_API File& operator>>(File& stream, GPUSparseMatrix<ElemType>& us)
@ -1728,6 +1729,60 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddAveragePoolingGradient(const GPUMat
return *this;
}
template <class ElemType>
void GPUMatrix<ElemType>::ConvolutionForward(const GPUMatrix<ElemType>& kernel, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& output) const
{
}
template <class ElemType>
void GPUMatrix<ElemType>::ConvolutionBackwardData(const GPUMatrix<ElemType>& kernel, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& grad) const
{
}
template <class ElemType>
void GPUMatrix<ElemType>::ConvolutionBackwardKernel(const GPUMatrix<ElemType>& in, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& kernelGrad) const
{
}
template <class ElemType>
void GPUMatrix<ElemType>::MaxPoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const
{
}
template <class ElemType>
void GPUMatrix<ElemType>::MaxPoolingBackward(const GPUMatrix<ElemType>& out, const GPUMatrix<ElemType>& in,
const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices,
GPUMatrix<ElemType>& grad) const
{
}
template <class ElemType>
void GPUMatrix<ElemType>::AveragePoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const
{
}
template <class ElemType>
void GPUMatrix<ElemType>::AveragePoolingBackward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& grad) const
{
}
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runInvStdDev, GPUMatrix<ElemType>& out, double epsilon,
GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const
{
}
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale,
const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const
{
}
#pragma endregion Other helper functions
#pragma region Static BLAS Functions
@ -2096,6 +2151,7 @@ void GPUDataTransferer<ElemType>::WaitForCopyCPUToGPUAsync()
template class GPUMatrix<char>;
template class GPUMatrix<float>;
template class GPUMatrix<double>;
template class GPUMatrix<int>;
template class DeviceBoundNumber<float>;
template class DeviceBoundNumber<double>;
template MatrixQuantizerGPU<float>::~MatrixQuantizerGPU();
@ -2113,45 +2169,14 @@ template <class ElemType>
void* GPUMatrix<ElemType>::s_curandGenerator = NULL;
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::Tensor4DPtr CuDnnConvolutionEngineFactory<ElemType>::CreateTensor(size_t, size_t, size_t, size_t)
std::unique_ptr<ConvolutionEngine<ElemType>> CuDnnConvolutionEngineFactory<ElemType>::Create(ConvolveGeometryPtr, DEVICEID_TYPE,
ImageLayoutKind, size_t, PoolKind)
{
RuntimeError("The code is compiled with CPUONLY macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::FilterPtr CuDnnConvolutionEngineFactory<ElemType>::CreateFilter(size_t, size_t, size_t, size_t)
{
RuntimeError("The code is compiled with CPUONLY macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvDescriptor(
const Tensor4D&, const Filter&, size_t, size_t, bool)
{
RuntimeError("The code is compiled with CPUONLY macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolDescriptor(
typename PoolDesc::PoolKind, size_t, size_t, size_t, size_t, size_t, size_t)
{
RuntimeError("The code is compiled with CPUONLY macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvEngine(DEVICEID_TYPE, ImageLayoutKind, size_t, BatchNormImpl)
{
RuntimeError("The code is compiled with CPUONLY macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolEngine(DEVICEID_TYPE, ImageLayoutKind)
{
RuntimeError("The code is compiled with CPUONLY macro.");
}
template <class ElemType>
bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE)
bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE, ConvolveGeometryPtr, PoolKind)
{
return false;
}
@ -2159,6 +2184,16 @@ bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE)
template class CuDnnConvolutionEngineFactory<float>;
template class CuDnnConvolutionEngineFactory<double>;
template <class ElemType>
std::unique_ptr<BatchNormEngine<ElemType>> CuDnnBatchNormEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout)
{
RuntimeError("The code is compiled with CPUONLY macro.");
}
template class CuDnnBatchNormEngineFactory<float>;
template class CuDnnBatchNormEngineFactory<double>;
CudaTimer::~CudaTimer()
{
}

View file

@ -18,29 +18,39 @@ CNTKTextFormatReader::CNTKTextFormatReader(MemoryProviderPtr provider,
m_provider(provider)
{
TextConfigHelper configHelper(config);
if (configHelper.GetElementType() == ElementType::tfloat)
{
m_deserializer = shared_ptr<IDataDeserializer>(new TextParser<float>(configHelper));
}
else
{
m_deserializer = shared_ptr<IDataDeserializer>(new TextParser<double>(configHelper));
}
TransformerPtr randomizer;
if (configHelper.ShouldRandomize())
try
{
randomizer = make_shared<BlockRandomizer>(0, SIZE_MAX, m_deserializer);
if (configHelper.GetElementType() == ElementType::tfloat)
{
m_deserializer = shared_ptr<IDataDeserializer>(new TextParser<float>(configHelper));
}
else
{
m_deserializer = shared_ptr<IDataDeserializer>(new TextParser<double>(configHelper));
}
size_t window = configHelper.GetRandomizationWindow();
TransformerPtr randomizer;
if (window > 0)
{
// Verbosity is a general config parameter, not specific to the text format reader.
int verbosity = config(L"verbosity", 2);
randomizer = make_shared<BlockRandomizer>(verbosity, window, m_deserializer);
}
else
{
randomizer = std::make_shared<NoRandomizer>(m_deserializer);
}
randomizer->Initialize(nullptr, config);
m_transformer = randomizer;
}
else
catch (const std::runtime_error& e)
{
randomizer = std::make_shared<NoRandomizer>(m_deserializer);
RuntimeError("CNTKTextFormatReader: While reading '%ls': %s", configHelper.GetFilePath().c_str(), e.what());
}
randomizer->Initialize(nullptr, config);
m_transformer = randomizer;
}
std::vector<StreamDescriptionPtr> CNTKTextFormatReader::GetStreamDescriptions()

View file

@ -90,7 +90,6 @@
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\Include\basetypes.h" />
<ClInclude Include="..\..\Common\Include\DataReader.h" />
<ClInclude Include="..\..\Common\Include\File.h" />
<ClInclude Include="..\..\Common\Include\fileutil.h" />

View file

@ -27,9 +27,6 @@
<ItemGroup>
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="..\..\Common\Include\basetypes.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\DataReader.h">
<Filter>Common\Include</Filter>
</ClInclude>

View file

@ -5,6 +5,7 @@
#include "stdafx.h"
#include "TextConfigHelper.h"
#include "DataReader.h"
#include "StringUtil.h"
using std::string;
@ -105,19 +106,25 @@ TextConfigHelper::TextConfigHelper(const ConfigParameters& config)
m_filepath = msra::strfun::utf16(config(L"file"));
string rand = config(L"randomize", "auto");
if (AreEqualIgnoreCase(rand, "auto"))
if (config.Exists(L"randomize"))
{
m_randomize = true;
}
else if (AreEqualIgnoreCase(rand, "none"))
{
m_randomize = false;
}
wstring randomizeString = config.CanBeString(L"randomize") ? config(L"randomize") : wstring();
if (!_wcsicmp(randomizeString.c_str(), L"none"))
{
m_randomizationWindow = randomizeNone;
}
else if (!_wcsicmp(randomizeString.c_str(), L"auto"))
{
m_randomizationWindow = randomizeAuto;
}
else
{
m_randomizationWindow = config(L"randomize");
}
}
else
{
RuntimeError("'randomize' parameter must be set to 'auto' or 'none'");
m_randomizationWindow = randomizeAuto;
}
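After this change, randomize accepts three spellings: none (no randomization), auto (default window), or a number taken as the randomization window; omitting the key behaves like auto. A sketch of the resulting mapping (randomizeNone and randomizeAuto are assumed to be sentinel constants such as 0 and SIZE_MAX; the real constants live in the reader headers, and the real comparison is case-insensitive):
#include <cstddef>
#include <cstdint>
#include <string>
std::size_t ParseRandomizationWindow(const std::wstring& value) // empty == key not given
{
    const std::size_t randomizeNone = 0;
    const std::size_t randomizeAuto = SIZE_MAX;
    if (value.empty() || value == L"auto")
        return randomizeAuto;
    if (value == L"none")
        return randomizeNone;
    return static_cast<std::size_t>(std::stoull(value)); // explicit randomization window
}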
m_skipSequenceIds = config(L"skipSequenceIds", false);

View file

@ -25,7 +25,7 @@ public:
// Get full path to the input file.
const wstring& GetFilePath() const { return m_filepath; }
bool ShouldRandomize() const { return m_randomize; }
size_t GetRandomizationWindow() const { return m_randomizationWindow; }
bool ShouldSkipSequenceIds() const { return m_skipSequenceIds; }
@ -44,7 +44,7 @@ public:
private:
std::wstring m_filepath;
std::vector<StreamDescriptor> m_streams;
bool m_randomize;
size_t m_randomizationWindow;
ElementType m_elementType;
bool m_skipSequenceIds;
unsigned int m_maxErrors;

View file

@ -32,13 +32,10 @@ HTKDataDeserializer::HTKDataDeserializer(
m_corpus(corpus),
m_totalNumberOfFrames(0)
{
// Currently we only support frame mode.
// TODO: Support of full sequences.
bool frameMode = feature.Find("frameMode", "true");
if (!frameMode)
{
LogicError("Currently only reader only supports frame mode. Please check your configuration.");
}
// The frame mode is currently specified once per configuration,
// not in the configuration of a particular deserializer, but on a higher level in the configuration.
// Because of that we are using find method below.
m_frameMode = feature.Find("frameMode", "true");
ConfigHelper config(feature);
config.CheckFeatureType();
@ -49,11 +46,18 @@ HTKDataDeserializer::HTKDataDeserializer(
m_dimension = config.GetFeatureDimension();
m_dimension = m_dimension * (1 + context.first + context.second);
m_augmentationWindow = config.GetContextWindow();
InitializeChunkDescriptions(config);
InitializeStreams(featureName);
InitializeFeatureInformation();
m_augmentationWindow = config.GetContextWindow();
// If not given explicitly, we need to identify the required augmentation range from the expected dimension
// and the number of dimensions in the file.
if (m_augmentationWindow.first == 0 && m_augmentationWindow.second == 0)
{
m_augmentationWindow.first = m_augmentationWindow.second = msra::dbn::augmentationextent(m_ioFeatureDimension, m_dimension);
}
}
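When no context window is configured, the extent is recovered from the ratio between the dimension the network expects and the per-frame dimension stored in the feature archive. Assuming msra::dbn::augmentationextent computes (modelDim / frameDim - 1) / 2, a 39-dimensional archive feeding a network that expects 429-dimensional inputs yields an 11-frame stack, i.e. 5 frames of context on each side:
#include <cstddef>
#include <stdexcept>
// Assumed formula (the real helper is msra::dbn::augmentationextent).
std::size_t AugmentationExtent(std::size_t frameDim, std::size_t modelDim)
{
    std::size_t frames = modelDim / frameDim; // size of the stacked window, e.g. 429 / 39 == 11
    if (frames * frameDim != modelDim || frames % 2 == 0)
        throw std::runtime_error("model dimension must be an odd multiple of the frame dimension");
    return (frames - 1) / 2; // frames of context on each side, e.g. 5
}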
// Initializes chunks based on the configuration and utterance descriptions.
@ -170,7 +174,9 @@ ChunkDescriptions HTKDataDeserializer::GetChunkDescriptions()
auto cd = make_shared<ChunkDescription>();
cd->m_id = i;
cd->m_numberOfSamples = m_chunks[i].GetTotalFrames();
cd->m_numberOfSequences = m_chunks[i].GetTotalFrames();
// In frame mode, each frame is represented as a sequence.
// The augmentation is still done only over frames of the same utterance; see the GetSequenceById method.
cd->m_numberOfSequences = m_frameMode ? m_chunks[i].GetTotalFrames() : m_chunks[i].GetNumberOfUtterances();
chunks.push_back(cd);
}
return chunks;
@ -187,16 +193,32 @@ void HTKDataDeserializer::GetSequencesForChunk(size_t chunkId, vector<SequenceDe
{
auto utterance = chunk.GetUtterance(i);
size_t major = utterance->GetId();
// Because it is a frame mode, creating sequences for each frame.
for (size_t k = 0; k < utterance->GetNumberOfFrames(); ++k)
if (m_frameMode)
{
// Because it is a frame mode, creating a sequence for each frame.
for (size_t k = 0; k < utterance->GetNumberOfFrames(); ++k)
{
SequenceDescription f;
f.m_chunkId = chunkId;
f.m_key.m_major = major;
f.m_key.m_minor = k;
f.m_id = offsetInChunk++;
f.m_isValid = true;
f.m_numberOfSamples = 1;
result.push_back(f);
}
}
else
{
// Creating sequence description per utterance.
SequenceDescription f;
f.m_chunkId = chunkId;
f.m_key.m_major = major;
f.m_key.m_minor = k;
f.m_key.m_minor = 0;
f.m_id = offsetInChunk++;
f.m_isValid = true;
f.m_numberOfSamples = 1;
f.m_numberOfSamples = utterance->GetNumberOfFrames();
result.push_back(f);
}
}
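The effect on the timeline is simple to state: in frame mode an utterance of N frames contributes N single-sample sequences whose minor key is the frame index, while in sequence mode it contributes exactly one N-sample sequence with minor key 0. A counting sketch:
#include <cstddef>
#include <utility>
// Returns {number of sequence descriptions, samples per sequence} for one utterance.
std::pair<std::size_t, std::size_t> SequencesPerUtterance(std::size_t numFrames, bool frameMode)
{
    return frameMode ? std::make_pair(numFrames, std::size_t(1))
                     : std::make_pair(std::size_t(1), numFrames);
}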
@ -204,7 +226,7 @@ void HTKDataDeserializer::GetSequencesForChunk(size_t chunkId, vector<SequenceDe
// A wrapper around a matrix that views it as a vector of column vectors.
// Does not have any memory associated.
class MatrixAsVectorOfVectors
class MatrixAsVectorOfVectors
{
public:
MatrixAsVectorOfVectors(msra::dbn::matrixbase& m)
@ -245,7 +267,7 @@ public:
});
}
// Gets data for the sequnce.
// Gets data for the sequence.
virtual void GetSequence(size_t sequenceId, vector<SequenceDataPtr>& result) override
{
m_parent->GetSequenceById(m_chunkId, sequenceId, result);
@ -277,73 +299,117 @@ ChunkPtr HTKDataDeserializer::GetChunk(size_t chunkId)
return chunk;
};
// This class stores sequence data for HTK,
// - for floats: a simple pointer to the chunk data
// - for doubles: allocated array of doubles which is freed when the sequence is no longer used.
struct HTKSequenceData : DenseSequenceData
// A matrix that stores all samples of a sequence without padding (differently from ssematrix).
// The number of columns equals the number of samples in the sequence.
// The number of rows equals the size of the feature vector of a sample (= dimensions).
class FeatureMatrix
{
msra::dbn::matrix m_buffer;
~HTKSequenceData()
public:
FeatureMatrix(size_t numRows, size_t numColumns) : m_numRows(numRows), m_numColumns(numColumns)
{
msra::dbn::matrixstripe frame(m_buffer, 0, m_buffer.cols());
// Checking if m_data just a pointer in to the
if (m_data != &frame(0, 0))
{
delete[] reinterpret_cast<double*>(m_data);
m_data = nullptr;
}
m_data.resize(m_numRows * m_numColumns);
}
// Returns a reference to the column.
inline array_ref<float> col(size_t column)
{
return array_ref<float>(m_data.data() + m_numRows * column, m_numRows);
}
// Gets pointer to the data.
inline float* GetData()
{
return m_data.data();
}
// Gets the number of columns. It equals the number of samples in the sequence/utterance.
inline size_t GetNumberOfColumns() const
{
return m_numColumns;
}
// Gets total size in elements of stored features.
inline size_t GetTotalSize() const
{
return m_data.size();
}
private:
// Features
std::vector<float> m_data;
// Number of rows = dimension of the feature
size_t m_numRows;
// Number of columns = number of samples in utterance.
size_t m_numColumns;
};
typedef shared_ptr<HTKSequenceData> HTKSequenceDataPtr;
// This class stores sequence data for HTK for floats.
struct HTKFloatSequenceData : DenseSequenceData
{
HTKFloatSequenceData(FeatureMatrix&& data) : m_buffer(data)
{
m_numberOfSamples = data.GetNumberOfColumns();
m_data = m_buffer.GetData();
}
// Get a sequence by its chunk id and id.
private:
FeatureMatrix m_buffer;
};
// This class stores sequence data for HTK for doubles.
struct HTKDoubleSequenceData : DenseSequenceData
{
HTKDoubleSequenceData(FeatureMatrix& data) : m_buffer(data.GetData(), data.GetData() + data.GetTotalSize())
{
m_numberOfSamples = data.GetNumberOfColumns();
m_data = m_buffer.data();
}
private:
std::vector<double> m_buffer;
};
// Get a sequence by its chunk id and sequence id.
// Sequence ids are guaranteed to be unique inside a chunk.
void HTKDataDeserializer::GetSequenceById(size_t chunkId, size_t id, vector<SequenceDataPtr>& r)
{
const auto& chunkDescription = m_chunks[chunkId];
    size_t utteranceIndex = m_frameMode ? chunkDescription.GetUtteranceForChunkFrameIndex(id) : id;
    const UtteranceDescription* utterance = chunkDescription.GetUtterance(utteranceIndex);
    auto utteranceFrames = chunkDescription.GetUtteranceFrames(utteranceIndex);

    // wrapper that allows m[j].size() and m[j][i] as required by augmentneighbors()
    MatrixAsVectorOfVectors utteranceFramesWrapper(utteranceFrames);
    FeatureMatrix features(m_dimension, m_frameMode ? 1 : utterance->GetNumberOfFrames());

    if (m_frameMode)
    {
        // For frame mode augment a single frame.
        size_t frameIndex = id - utterance->GetStartFrameIndexInsideChunk();
        msra::dbn::augmentneighbors(utteranceFramesWrapper, vector<char>(), frameIndex, m_augmentationWindow.first, m_augmentationWindow.second, features, 0);
    }
    else
    {
        // Augment complete utterance.
        for (size_t frameIndex = 0; frameIndex < utterance->GetNumberOfFrames(); ++frameIndex)
        {
            msra::dbn::augmentneighbors(utteranceFramesWrapper, vector<char>(), frameIndex, m_augmentationWindow.first, m_augmentationWindow.second, features, frameIndex);
        }
    }

    // Copy features to the sequence depending on the type.
    DenseSequenceDataPtr result;
    if (m_elementType == ElementType::tdouble)
    {
        result = make_shared<HTKDoubleSequenceData>(features);
    }
    else if (m_elementType == ElementType::tfloat)
    {
        result = make_shared<HTKFloatSequenceData>(std::move(features));
    }
    else
    {
        LogicError("Currently, HTK Deserializer supports only double and float types.");
    }

    r.push_back(result);
}
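The context expansion itself is delegated to the legacy msra::dbn::augmentneighbors helper, whose exact signature is not reproduced here. As a hedged illustration of what such context-window augmentation does, the sketch below stacks each frame with its left/right neighbors and clamps indices at the utterance boundaries; AugmentFrame and its parameters are invented for this example and are not the helper's real interface.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

// Illustrative context-window augmentation: for each output frame, stack
// 'left' previous frames, the frame itself, and 'right' following frames,
// clamping indices at the utterance boundaries (edge frames are repeated).
static std::vector<float> AugmentFrame(const std::vector<std::vector<float>>& frames,
                                       size_t center, size_t left, size_t right)
{
    const size_t dim = frames[0].size();
    std::vector<float> augmented;
    augmented.reserve((left + 1 + right) * dim);
    for (ptrdiff_t offset = -(ptrdiff_t)left; offset <= (ptrdiff_t)right; ++offset)
    {
        ptrdiff_t index = (ptrdiff_t)center + offset;
        index = std::max<ptrdiff_t>(0, std::min<ptrdiff_t>(index, (ptrdiff_t)frames.size() - 1));
        const auto& f = frames[(size_t)index];
        augmented.insert(augmented.end(), f.begin(), f.end());
    }
    return augmented;
}

int main()
{
    // A tiny "utterance" of 4 frames with 2-dimensional features.
    std::vector<std::vector<float>> utterance = {{0, 0}, {1, 1}, {2, 2}, {3, 3}};
    // Context window of 1 frame on each side: output dimension becomes 3 * 2 = 6.
    auto frame0 = AugmentFrame(utterance, 0, 1, 1); // at the boundary, frame 0 is repeated on the left
    printf("augmented dim = %zu, first value = %.0f\n", frame0.size(), frame0[0]);
    return 0;
}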

Просмотреть файл

@ -66,6 +66,9 @@ private:
// Total number of frames.
size_t m_totalNumberOfFrames;
    // Flag that indicates whether single speech frames should be exposed as individual sequences.
bool m_frameMode;
// Auxiliary data for checking against the data in the feature file.
unsigned int m_samplePeriod;
size_t m_ioFeatureDimension;

Просмотреть файл

@ -11,6 +11,11 @@
#include "ConfigHelper.h"
#include "Bundler.h"
#include "StringUtil.h"
#include "SequencePacker.h"
#include "SampleModePacker.h"
#include "BpttPacker.h"
#include "BlockRandomizer.h"
#include "NoRandomizer.h"
namespace Microsoft { namespace MSR { namespace CNTK {
@ -61,23 +66,58 @@ HTKMLFReader::HTKMLFReader(MemoryProviderPtr provider,
// TODO: deserializers and transformers will be dynamically loaded
// from external libraries based on the configuration/brain script.
    bool frameMode = readerConfig(L"frameMode", true);
bool truncated = readerConfig(L"truncated", false);
if (frameMode && truncated)
{
LogicError("frameMode and truncated BPTT are mutually exclusive.");
}
if (frameMode)
{
m_packingMode = PackingMode::sample;
}
else if (truncated)
{
m_packingMode = PackingMode::truncated;
}
else
{
m_packingMode = PackingMode::sequence;
}
    // nbruttsineachrecurrentiter is the old reader configuration; truncationLength is the new one.
    // If truncationLength is specified, we estimate the number of parallel sequences
    // to pack as max(1, mbSize / truncationLength).
    // If nbruttsineachrecurrentiter is specified, we assume that the truncation length is mbSize
    // and that the real minibatch size is mbSize * nbruttsineachrecurrentiter[epochIndex].
m_truncationLength = readerConfig(L"truncationLength", 0);
m_numParallelSequencesForAllEpochs =
readerConfig(L"nbruttsineachrecurrentiter", ConfigParameters::Array(intargvector(vector<int> { 1 })));
ConfigHelper config(readerConfig);
size_t window = config.GetRandomizationWindow();
auto deserializers = CreateDeserializers(readerConfig);
assert(deserializers.size() == 2);
auto bundler = std::make_shared<Bundler>(readerConfig, deserializers[0], deserializers, false);
int verbosity = readerConfig(L"verbosity", 2);
std::wstring readMethod = config.GetRandomizer();
    // TODO: this should be bool. Change when config per deserializer is allowed.
    if (AreEqualIgnoreCase(readMethod, std::wstring(L"blockRandomize")))
    {
        m_randomizer = std::make_shared<BlockRandomizer>(verbosity, window, bundler, BlockRandomizer::DecimationMode::chunk, true /* useLegacyRandomization */);
}
else if (AreEqualIgnoreCase(readMethod, std::wstring(L"none")))
{
m_randomizer = std::make_shared<NoRandomizer>(bundler);
}
else
{
RuntimeError("readMethod must be 'blockRandomize' or 'none'.");
}
    m_randomizer->Initialize(nullptr, readerConfig);
// Create output stream descriptions (all dense)
@ -107,11 +147,57 @@ void HTKMLFReader::StartEpoch(const EpochConfiguration& config)
}
m_randomizer->StartEpoch(config);
    // TODO: should we unify sample and sequence mode packers into a single one?
    // TODO: Functionally they are the same; the only difference is how we handle
    // TODO: the MBLayout and what the perf hit is for iterating/copying sequences.
    // TODO: Should do more perf tests before unifying these two.
    // TODO: As the next step the packers will be moved out of the readers into the
    // TODO: core CNTK. They are format agnostic and can be used with any type of
    // TODO: deserializer.
switch (m_packingMode)
{
case PackingMode::sample:
m_packer = std::make_shared<SampleModePacker>(
m_provider,
m_randomizer,
config.m_minibatchSizeInSamples,
m_streams);
break;
case PackingMode::sequence:
m_packer = std::make_shared<SequencePacker>(
m_provider,
m_randomizer,
config.m_minibatchSizeInSamples,
m_streams);
break;
case PackingMode::truncated:
{
size_t minibatchSize = config.m_minibatchSizeInSamples;
size_t truncationLength = m_truncationLength;
if (truncationLength == 0)
{
// Old config, the truncation length is specified as the minibatch size.
// In this case the truncation size is mbSize
// and the real minibatch size is truncation size * nbruttsineachrecurrentiter
            fprintf(stderr, "Legacy configuration is used for truncated BPTT mode, please adapt the config to explicitly specify truncationLength.\n");
truncationLength = minibatchSize;
size_t numParallelSequences = m_numParallelSequencesForAllEpochs[config.m_epochIndex];
minibatchSize = numParallelSequences * truncationLength;
}
m_packer = std::make_shared<BpttPacker>(
m_provider,
m_randomizer,
minibatchSize,
truncationLength,
m_streams);
break;
}
default:
LogicError("Unsupported type of packer '%d'.", (int)m_packingMode);
}
}
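The truncated-BPTT arithmetic handled above (explicit truncationLength versus the legacy nbruttsineachrecurrentiter path) can be illustrated with a short, self-contained sketch. BpttGeometry and ComputeGeometry are hypothetical names introduced only for this example; they merely reproduce the two configuration paths described in the constructor comments, not the reader's actual classes or config parsing.

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Minibatch geometry for truncated BPTT under the two configuration styles.
struct BpttGeometry
{
    size_t truncationLength;       // number of time steps per truncated segment
    size_t minibatchSizeInSamples; // total samples packed per minibatch
    size_t parallelSequences;      // how many sequences are laid out side by side
};

static BpttGeometry ComputeGeometry(size_t minibatchSize, size_t truncationLength,
                                    size_t nbruttsineachrecurrentiter)
{
    BpttGeometry g;
    if (truncationLength != 0)
    {
        // New-style config: truncationLength given explicitly; the number of
        // parallel sequences follows as max(1, minibatchSize / truncationLength).
        g.truncationLength = truncationLength;
        g.minibatchSizeInSamples = minibatchSize;
        g.parallelSequences = std::max<size_t>(1, minibatchSize / truncationLength);
    }
    else
    {
        // Legacy config: the minibatch size acts as the truncation length and the
        // real minibatch size is truncationLength * nbruttsineachrecurrentiter.
        g.truncationLength = minibatchSize;
        g.parallelSequences = nbruttsineachrecurrentiter;
        g.minibatchSizeInSamples = g.truncationLength * g.parallelSequences;
    }
    return g;
}

int main()
{
    // New config: minibatchSize=256, truncationLength=20 -> 12 parallel sequences.
    BpttGeometry a = ComputeGeometry(256, 20, 1);
    // Legacy config: minibatchSize=20, nbruttsineachrecurrentiter=32 -> 640 samples per minibatch.
    BpttGeometry b = ComputeGeometry(20, 0, 32);
    printf("new: %zu x %zu = %zu samples\n", a.parallelSequences, a.truncationLength, a.minibatchSizeInSamples);
    printf("legacy: %zu x %zu = %zu samples\n", b.parallelSequences, b.truncationLength, b.minibatchSizeInSamples);
    return 0;
}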
Minibatch HTKMLFReader::ReadMinibatch()
