Merge branch 'qiwye/asgd-dev' into qiwye/asgd-exp

Qiwei Ye 2016-04-17 17:09:34 +08:00
Parent 147d1178db 9968ebd25f
Commit 1a0b88be0c
80 changed files with 1747 additions and 1345 deletions

View file

@ -458,13 +458,21 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SLU", "SLU", "{BFBC6BE1-C33
Tests\EndToEndTests\SLU\atis.dev.IOB.simple = Tests\EndToEndTests\SLU\atis.dev.IOB.simple
Tests\EndToEndTests\SLU\atis.test.apos.pred.pos.head.IOB.simple = Tests\EndToEndTests\SLU\atis.test.apos.pred.pos.head.IOB.simple
Tests\EndToEndTests\SLU\atis.train.apos.pred.pos.head.IOB.simple = Tests\EndToEndTests\SLU\atis.train.apos.pred.pos.head.IOB.simple
Tests\EndToEndTests\SLU\baseline.linux.cpu.txt = Tests\EndToEndTests\SLU\baseline.linux.cpu.txt
Tests\EndToEndTests\SLU\baseline.linux.gpu.txt = Tests\EndToEndTests\SLU\baseline.linux.gpu.txt
Tests\EndToEndTests\SLU\baseline.windows.cpu.txt = Tests\EndToEndTests\SLU\baseline.windows.cpu.txt
Tests\EndToEndTests\SLU\baseline.windows.gpu.txt = Tests\EndToEndTests\SLU\baseline.windows.gpu.txt
Tests\EndToEndTests\SLU\globals.cntk = Tests\EndToEndTests\SLU\globals.cntk
Tests\EndToEndTests\SLU\input.txt = Tests\EndToEndTests\SLU\input.txt
Tests\EndToEndTests\SLU\inputmap.txt = Tests\EndToEndTests\SLU\inputmap.txt
Tests\EndToEndTests\SLU\lstm.ndl = Tests\EndToEndTests\SLU\lstm.ndl
Tests\EndToEndTests\SLU\lstmNDL.txt = Tests\EndToEndTests\SLU\lstmNDL.txt
Tests\EndToEndTests\SLU\output.txt = Tests\EndToEndTests\SLU\output.txt
Tests\EndToEndTests\SLU\README.txt = Tests\EndToEndTests\SLU\README.txt
Tests\EndToEndTests\SLU\rnnlu.cntk = Tests\EndToEndTests\SLU\rnnlu.cntk
Tests\EndToEndTests\SLU\rnnlu.ndl.cntk = Tests\EndToEndTests\SLU\rnnlu.ndl.cntk
Tests\EndToEndTests\SLU\run-test = Tests\EndToEndTests\SLU\run-test
Tests\EndToEndTests\SLU\testcases.yml = Tests\EndToEndTests\SLU\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "MNIST", "MNIST", "{FA33A61E-95C7-4049-8111-22058CE361A3}"
@ -509,7 +517,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "CIFAR-10", "CIFAR-10", "{77
Examples\Image\Miscellaneous\CIFAR-10\CifarConverter.py = Examples\Image\Miscellaneous\CIFAR-10\CifarConverter.py
Examples\Image\Miscellaneous\CIFAR-10\labelsmap.txt = Examples\Image\Miscellaneous\CIFAR-10\labelsmap.txt
Examples\Image\Miscellaneous\CIFAR-10\Macros.ndl = Examples\Image\Miscellaneous\CIFAR-10\Macros.ndl
Examples\Image\Miscellaneous\CIFAR-10\README.md = Examples\Image\Miscellaneous\CIFAR-10\README.md
Examples\Image\Miscellaneous\CIFAR-10\readme.txt = Examples\Image\Miscellaneous\CIFAR-10\readme.txt
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ImageNet", "ImageNet", "{EF710C5A-E616-442A-889D-C997D39AF2E1}"
@ -666,6 +674,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Config", "Config", "{850008
ProjectSection(SolutionItems) = preProject
Examples\Text\PennTreebank\Config\rnn.cntk = Examples\Text\PennTreebank\Config\rnn.cntk
Examples\Text\PennTreebank\Config\S2SAutoEncoder.cntk = Examples\Text\PennTreebank\Config\S2SAutoEncoder.cntk
Examples\Text\PennTreebank\Config\S2SLib.bs = Examples\Text\PennTreebank\Config\S2SLib.bs
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SLU", "SLU", "{E6DC3B7D-303D-4A54-B040-D8DCF8C56E17}"

View file

@ -1,4 +1,5 @@
# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunDir=$(SolutionDir)Examples/Text/PennTreebank/_run RootDir=$(SolutionDir)Examples/Text/PennTreebank/_run DataDir=$(SolutionDir)Examples/Text/PennTreebank/Data ConfigDir=$(SolutionDir)Examples/Text/PennTreebank/Config stderr=$(SolutionDir)Examples/Text/PennTreebank/_run/S2SAutoEncoder.log DeviceId=-1 makeMode=false
# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunRootDir=$(SolutionDir)Examples/Text/PennTreebank DeviceId=-1 makeMode=false
# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunRootDir=$(SolutionDir)g2p makeMode=false
####################
# WORK IN PROGRESS #
# WORK IN PROGRESS #
@ -6,7 +7,28 @@
####################
# Command line to run in debugger:
# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunDir=$(SolutionDir)Examples/Text/PennTreebank/_run RootDir=$(SolutionDir)Examples/Text/PennTreebank/_run DataDir=$(SolutionDir)Examples/Text/PennTreebank/Data ConfigDir=$(SolutionDir)Examples/Text/PennTreebank/Config stderr=$(SolutionDir)Examples/Text/PennTreebank/_run/S2SAutoEncoder.log train=[SGD=[maxEpochs=1]] confVocabSize=1000 DeviceId=-1 makeMode=false
# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunRootDir=$(SolutionDir)Examples/Text/PennTreebank train=[SGD=[maxEpochs=1]] confVocabSize=1000 DeviceId=-1 makeMode=false
# directory defaults (if not overridden)
RunRootDir = "../.." # default if not overridden
DataDir = "$RunRootDir$/Data"
CacheDir = "$DataDir$/cache" # (not used currently)
ExpRootDir = "$RunRootDir$"
# experiment id
#ExpId = _run
deviceId = 1
#ExpId = 68-$deviceId$-s2sae-bigmodel
ExpId = 06-$deviceId$-g2p
#ExpId = 05-3-g2p # for decoding a different model
# directories
ExpDir = "$ExpRootDir$/$ExpId$"
ModelDir = "$ExpDir$/Models"
stderr = $ExpDir$/S2SAutoEncoder.log7
# Append this for small set:
# train=[epochSize=2048]] trainFile=ptb.small.train.txt validFile=ptb.small.valid.txt testFile=ptb.small.test.txt
@ -14,44 +36,37 @@
# It encodes an entire sentence into a flat vector, and tries to regenerate it.
# Meant to be useful mainly for understanding how to do sequence-to-sequence in CNTK.
# Parameters can be overwritten on the command line
# for example: cntk configFile=myConfigFile RootDir=../..
# For running from Visual Studio add
# currentDirectory=$(SolutionDir)/<path to corresponding data folder>
RootDir = ".."
ConfigDir = "$RootDir$/Config"
DataDir = "$RootDir$/Data"
CacheDir = "$RootDir$/Data/cache"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
# deviceId=-1 for CPU, >=0 for GPU devices, "auto" chooses the best GPU, or CPU if no usable GPU is available
deviceId = "auto"
command = writeWordAndClassInfo:train:test:write
#command = write
precision = "float"
traceLevel = 1
modelPath = "$ModelDir$/S2SAutoEncoder.dnn"
decodeModelPath = "$modelPath$.13" # epoch to decode. Has best CV WER
# uncomment the following line to write logs to a file
#stderr=$OutputDir$/rnnOutput
decodeModelPath = "$modelPath$.13" # epoch to decode can be appended here
beamDepth = 1 # 0=predict; 1=greedy; >1=beam
decodeOutputPath = "$decodeModelPath$.b$beamDepth$"
#numCPUThreads = 1
#confVocabSize = 10000
#confClassSize = 50
confVocabSize = 10000
confClassSize = 50
useStabilizer = true
#trainFile = "ptb.train.txt"
##trainFile = "ptb.small.train.txt"
#validFile = "ptb.valid.txt"
##validFile = "ptb.small.valid.txt"
#testFile = "ptb.test.txt"
##testFile = "ptb.test.txt-econ1"
##testFile = "ptb.small.train.txt" # test on train, to see whether model makes sense at all
#startSymbol = "</s>"
trainFile = "ptb.train.txt"
#trainFile = "ptb.small.train.txt"
validFile = "ptb.valid.txt"
#validFile = "ptb.small.valid.txt"
testFile = "ptb.test.txt"
#testFile = "ptb.test.txt-econ1"
confVocabSize = 69 #10000
confClassSize = 0 #50
trainFile = "g014b2b.train-dev-20-21.bsf.joint"
#trainFile = "g014b2b.train-dev-1-21.bsf.joint" # small one for debugging
validFile = "g014b2b.train-dev-1-21.bsf.joint"
testFile = "g014b2b.test.bsf.joint"
startSymbol = "<s>"
#######################################
# network definition #
@ -59,12 +74,22 @@ testFile = "ptb.test.txt"
BrainScriptNetworkBuilder = (new ComputationNetwork [
# TODO: move this somewhere shared
enableTracing = true
traceFrequency = 1000
tracingLabelMappingFile = "$ModelDir$/vocab.wl"
include "S2SLib.bs"
beamDepth=3 // for above Trace macros only
# import general config options from outside config values
vocabDim = $confVocabSize$
nbrClass = $confClassSize$
useStabilizer = $useStabilizer$
useEncoder = true // if false, this becomes a regular RNN
isAutoencoder = false # input is only one sequence, meant to reproduce itself
useStabilizer = true
useEncoder = true # if false, this becomes a regular RNN
useNYUStyle = false # if true use thought vector for all inputs, NYU-style
attentionSpan = 20 # we only support fixed-size attention windows for now. 0 means no attention; exactly 20 is needed for the g2p CMUDict task
# import some namespaces
Parameters = BS.Parameters
@ -74,125 +99,176 @@ BrainScriptNetworkBuilder = (new ComputationNetwork [
Boolean = BS.Boolean
RecurrentLSTMP = BS.RNNs.RecurrentLSTMP
embeddingDim = 300
hiddenDim = 200
embeddingDim = $confVocabSize$ # 300
hiddenDim = 750 # 512 # 1024 # 200 --TODO: Kaisheng used 500
maxLayer = 2 # 1 # 0
encoderDims[i:0..0] = hiddenDim # this defines the number of hidden layers in each
decoderDims[i:0..0] = hiddenDim # both are one LSTM layer only for now
encoderDims[i:0..maxLayer] = hiddenDim # this defines the number of hidden layers in each
decoderDims[i:0..maxLayer] = hiddenDim # both are one LSTM layer only for now
# inputs
#input = SparseInput(vocabDim, tag='feature'); # BUGBUG: Slice() not working for sparse, need to extend TensorView
input = Input(vocabDim, tag='feature');
# for an auto-encoder, both are the same
labels = input
streams = [
rawInput = input
out = if isAutoencoder
then [
# for an auto-encoder, both are the same
input = rawInput
labels = rawInput
]
else [
# we encode input and label as a single input; this splits it into two
separatorRow = 2 # row index of separator symbol
isSeparator = RowSlice (separatorRow, 1, rawInput) # cut out the separator as a flag
inInput = Boolean.Or (FutureValue (1, inInput , defaultHiddenActivation=0), isSeparator) # flag sequence: word is input...
inLabels = Boolean.Or (PastValue (1, inLabels, defaultHiddenActivation=0), isSeparator) # ...or labels
input = Sequences.Gather (inInput, rawInput) # use flags to split raw input into input and labels
labels = Sequences.Gather (inLabels, rawInput) # (both have different lengths)
]
].out
# helpers
First (x) = Slice (0, 1, x, axis=-1)
Last (x) = Slice (-1, 0, x, axis=-1)
# strip separators
CastAs (type, data) = Sequences.Scatter (Constants.OnesLike (type), data)
# TODO: find out which one is the correct one
#inputSequence = Slice (0, -1, streams.input, axis=-1) # e.g. <s> A B C # TODO: process </s> as well, to trigger the thought vector
inputSequence = streams.input # e.g. <s> A B C </s>
labelSequence = Slice (1, 0, streams.labels, axis=-1) # e.g. A B C </s>
inputSequence = Slice (0, -1, input, axis=-1) # e.g. <s> A B C
labelSequence = CastAs (inputSequence, Slice (1, 0, labels, axis=-1)) # e.g. A B C </s>
# embeddings
# embeddings --as long as we cannot read multiple sequences, we share one embedding
# Note: Embeddings are linear, so better stabilize. We really should use BatchNorm.
Einput = Parameters.Stabilize (Parameters.WeightParam (vocabDim, embeddingDim), enabled=useStabilizer) # note: this is assumed to be applied transposed, hence the swapped dimensions
Elabel = Einput
E = Parameters.Stabilize (Parameters.WeightParam (vocabDim, embeddingDim), enabled=useStabilizer) # note: this is assumed to be applied transposed, hence the swapped dimensions
EmbedInput (x) = if vocabDim != embeddingDim then TransposeTimes (E, x) else x
EmbedLabels (x) = if vocabDim != embeddingDim then TransposeTimes (E, x) else x
Embed (E, x) = TransposeTimes (E, x)
inputEmbedded = EmbedInput (inputSequence)
labelsEmbedded = EmbedLabels (labelSequence)
labelSentenceStart = First (streams.labels)
labelSentenceStartEmbedded = EmbedLabels (labelSentenceStart)
inputEmbedded = Embed (Einput, inputSequence)
labelsEmbedded = Embed (Elabel, labelSequence)
RecurrentLSTMPWithAttentionWindow2 (inputDim/*x.dim*/, outputDim/*h.dim*/, cellDim/*c.dim*/, x, projectedAttentionWindowBroadcast, attentionDim, attentionSpan, enableSelfStabilization=false) =
[
prevState =
[
h = Loop.Previous (lstmState.h) # hidden state(t-1)
c = Loop.Previous (lstmState.c) # cell(t-1)
]
# compute additional hidden state from attention
W(x) = Parameters.WeightParam (attentionDim, outputDim) * Parameters.Stabilize (x, enabled=useStabilizer)
projectedH = W (prevState.h) # [cellDim]
tanHOut = Tanh (projectedAttentionWindowBroadcast.value + projectedH) # [attentionDim x attentionSpan]
v(x) = Parameters.WeightParam (1, attentionDim) * Parameters.Stabilize (x, enabled=useStabilizer) # [1 x attentionDim]
u = v (tanHOut) # [1 x attentionSpan]
uValid = u + Log (projectedAttentionWindowBroadcast.valid) # [1 x attentionSpan]
attentionWeights = Softmax (uValid) # [1 x attentionSpan]
weightedAttentionWindow = projectedAttentionWindowBroadcast.value .* attentionWeights # [attentionDim x attentionSpan]
weightedAttentionAverage = weightedAttentionWindow * BS.Constants.OnesTensor (attentionSpan) # [attentionDim]
# feed both to LSTM as a single augmented input, so that we can reuse the existing LSTM component (a plain-C++ sketch of this attention step follows the network definition below)
augmentedX = RowStack (weightedAttentionAverage : x)
enableSelfStabilization1 = enableSelfStabilization // TODO: BS syntax needs to allow to say ^.enableSelfStabilization
lstmState = BS.RNNs.LSTMP (attentionDim + inputDim, outputDim, cellDim, augmentedX, prevState, enableSelfStabilization=enableSelfStabilization1)
].lstmState // that's the value we return
RecurrentLSTMP2WithInitialState (inputDim, outputDim, cellDim, x, initialState, enableSelfStabilization=false) =
[
prevState =
[
isFirst = Loop.IsFirst (initialState.h)
h = Boolean.If (isFirst, initialState.h, Loop.Previous (lstmState.h)) // hidden state(t-1)
c = Boolean.If (isFirst, initialState.c, Loop.Previous (lstmState.c)) // cell(t-1)
]
enableSelfStabilization1 = enableSelfStabilization // TODO: BS syntax needs to allow to say ^.enableSelfStabilization
lstmState = BS.RNNs.LSTMP (inputDim, outputDim, cellDim, x, prevState, enableSelfStabilization=enableSelfStabilization1)
].lstmState // that's the value we return
# encoder (processes inputEmbedded)
encoder = BS.RNNs.RecurrentLSTMP2Stack (inputEmbedded, embeddingDim, encoderDims, encoderDims, enableSelfStabilization=useStabilizer)
# encoder (processes user input)
encoderOutputLayer = Length (encoderDims)-1
encoder[i:0..encoderOutputLayer] =
RecurrentLSTMP(if i == 0 then embeddingDim else encoderDims[i-1],
encoderDims[i], encoderDims[i],
if i == 0 then inputEmbedded else encoder[i-1],
enableSelfStabilization=useStabilizer)
encoderOutput = encoder[encoderOutputLayer]
# that last frame should be fed as an additional input to every decoder step
# (This is the NYU model, not the Google model where the thought vector is only the initial state.)
# Three ways of passing encoder state:
# 1. as initial state for decoder (Google style)
# 2. as side information for every decoder step (NYU style)
# 3. attention
thoughtVector =
[
x = encoderOutput
result = Boolean.If (Loop.IsLast (x), // if last entry
/*then*/ x, // then copy that
/*else*/ FutureValue (0, result)) // else just propagate to the front --TODO: Use Scatter() once input and labels are no longer the same.
].result
thoughtVector = [
h = Last (encoderOutput.h)
c = Last (encoderOutput.c)
]
thoughtVectorDim = encoderDims[encoderOutputLayer]
thoughtVectorPadded = [ # padded with zeroes until end of target sequence
h = Sequences.BroadcastSequenceAs (labelsEmbedded, thoughtVector.h)
c = Sequences.BroadcastSequenceAs (labelsEmbedded, thoughtVector.c)
]
# attention (fixed rolling window)
attentionWindow = Sequences.PastValueWindow (attentionSpan, encoderOutput.h)
attentionDim = thoughtVectorDim
projectedAttentionWindowBroadcast = [
W(x) = Parameters.WeightParam (attentionDim, thoughtVectorDim) * Parameters.Stabilize (x, enabled=useStabilizer)
#B = Parameters.BiasParam (vocabDim) # no bias in attention
value = Sequences.BroadcastSequenceAs (labelsEmbedded, W (attentionWindow.value)) # apply the projection columnwise to the attentionWindow tensor
valid = Sequences.BroadcastSequenceAs (labelsEmbedded, attentionWindow.valid)
]
# NYU style: expand h to all, drop c
# TODO: just use thoughtVectorPadded.h (do this when we next test this branch again)
thoughtVectorEverywhere = Boolean.If (Loop.IsFirst (thoughtVectorPadded.h), # if first entry
/*then*/ thoughtVectorPadded.h, # then copy that
/*else*/ Loop.Previous (thoughtVectorEverywhere)) # else just propagate to the front
# TODO: create an indexSequence that contains all zeroes, basically broadcast a single-frame sequence across another sequence length
# decoder
# NYU style:
# The decoder starts with hidden state 0
# and takes as input [thoughtVector; previous word].
# and takes as input [thoughtVectorEverywhere; previous word].
isTraining = EnvironmentInput ('isTraining', tag='evaluation')
#decoderFeedback = Boolean.If (isTraining, labelsEmbedded, decoderOutputEmbedded) # not working
decoderFeedback = labelsEmbedded
sentenceStartEmbedded = inputEmbedded # first token is sentence start
# ^^ inputEmbedded is used to get </s>. Must make this a constant once we separate input and output.
delayedDecoderFeedback = Loop.PreviousOrDefault (defaultValue=labelSentenceStartEmbedded, labelsEmbedded)
delayedDecoderFeedback = Boolean.If (Loop.IsFirst (decoderFeedback), sentenceStartEmbedded, Loop.Previous (decoderFeedback))
decoderInputDim = if useEncoder then thoughtVectorDim + embeddingDim else embeddingDim
decoderInput = if useEncoder then RowStack (thoughtVector : delayedDecoderFeedback) else delayedDecoderFeedback
decoderInputDim = labelsEmbedded.dim #embeddingDim
decoderInput = Pass (delayedDecoderFeedback)
decoderOutputLayer = Length (decoderDims)-1
decoder[i:0..decoderOutputLayer] =
if i == 0
then RecurrentLSTMP (decoderInputDim, decoderDims[i], decoderDims[i],
decoderInput,
enableSelfStabilization=useStabilizer)
else RecurrentLSTMP (decoderDims[i-1], decoderDims[i], decoderDims[i],
decoder[i-1],
enableSelfStabilization=useStabilizer)
decoderDim = decoderDims[decoderOutputLayer]
decoderOutput = decoder[decoderOutputLayer]
then if useEncoder && useNYUStyle then BS.RNNs.RecurrentLSTMP2 (thoughtVectorDim + decoderInputDim, decoderDims[i], decoderDims[i],
RowStack (thoughtVectorEverywhere : decoderInput),
enableSelfStabilization=useStabilizer)
else if useEncoder && attentionSpan > 0 then RecurrentLSTMPWithAttentionWindow2 (thoughtVectorDim + decoderInputDim, decoderDims[i], decoderDims[i],
RowStack (thoughtVectorEverywhere : decoderInput),
projectedAttentionWindowBroadcast, attentionDim, attentionSpan,
enableSelfStabilization=useStabilizer)
else RecurrentLSTMP2WithInitialState (decoderInputDim, decoderDims[i], decoderDims[i],
decoderInput,
thoughtVectorPadded, # BUGBUG: Should be thoughtVector, but Scatter() can't expand from inside a loop
enableSelfStabilization=useStabilizer)
else BS.RNNs.RecurrentLSTMP2 (decoderDims[i-1], decoderDims[i], decoderDims[i],
decoder[i-1].h,
enableSelfStabilization=useStabilizer)
#decoderDim = decoderDims[decoderOutputLayer]
decoderOutput = decoder[decoderOutputLayer].h
decoderDim = decoderOutput.dim
# and add a softmax layer on top
W(x) = Parameters.WeightParam (vocabDim, decoderDim) * Parameters.Stabilize (x, enabled=useStabilizer)
B = Parameters.BiasParam (vocabDim)
z = W(decoderOutput) + B; // top-level input to Softmax
decoderOutputEmbedded = Embed (Elabel, Hardmax (z))
z = W (decoderOutput) + B; // top-level input to Softmax
# training criteria
ce = CrossEntropyWithSoftmax(labelSequence, z, tag='criterion') // this is the training objective
wer = ErrorPrediction (labelSequence, z, tag='evaluation') // this also gets tracked
#indexTestVals = Plus (decoderOutput, BS.Constants.Zero, tag='evaluation')
#indexTest = Slice (0, 1, indexTestVals)
#index = Where (RectifiedLinear (indexTest), tag='evaluation'); // for testing: this thresholds all negative numbers to 0=false, keeping positive as !=0=true
#packedIndex = PackedIndex (indexTest, index, tag='evaluation')
#filtered = GatherPacked (packedIndex, indexTestVals, tag='evaluation')
#unfiltered = ScatterPacked (indexTest, packedIndex, filtered, tag='evaluation')
//# define an LSTM with a per-sequence initialization value
//# TODO: Not currently used. Move to BS library once tested.
//RecurrentLSTMPWithInitValue (inputDim, outputDim, cellDim, x, initValue, enableSelfStabilization=false) =
//[
// prevState = // Loop.Previous (lstmState). BS can't apply Previous() to dictionaries, so expand it manually
// [
// h = Loop.Previous (lstmState.h); // hidden state(t-1)
// c = Loop.Previous (lstmState.c); // cell(t-1)
// ]
// # resettable LSTM function
// lstmState =
// [
// // apply the LSTM function to the input state; for first frame, we will ignore the output
// enableSelfStabilization1 = enableSelfStabilization // TODO: BS syntax needs to allow to say ^.enableSelfStabilization
// lstmState1 = LSTMP (inputDim, outputDim, cellDim, x, prevState, enableSelfStabilization=enableSelfStabilization1)
//
// // the actual LSTM state (incl. its output) gets overwritten in the first frame by the initValue
// isFirst = Loop.IsFirst (x)
// h = Boolean.If (isFirst, initValue, lstmState1.h); // hidden state(t-1)
// c = Boolean.If (isFirst, initValue, lstmState1.c); // cell(t-1)
// ]
//].lstmState.h // that's the value we return
ce = CrossEntropyWithSoftmax (labelSequence, z, tag='criterion') // this is the training objective
wer = ErrorPrediction (labelSequence, z, tag='evaluation') // this also gets tracked
])
#######################################
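The fixed-window attention in RecurrentLSTMPWithAttentionWindow2 above is easier to follow outside of tensor notation. Below is a minimal plain-C++ sketch of that one step, assuming illustrative names (MatVec, AttentionContext) that are not part of CNTK: project the previous decoder state, score it against each projected encoder frame in the window, softmax over the valid frames, and return the weighted average that gets row-stacked with the regular LSTM input.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

using Vec = std::vector<double>;
using Mat = std::vector<Vec>; // Mat[i] = row i

static Vec MatVec(const Mat& W, const Vec& x) // y = W * x
{
    Vec y(W.size(), 0.0);
    for (size_t i = 0; i < W.size(); i++)
        for (size_t j = 0; j < x.size(); j++)
            y[i] += W[i][j] * x[j];
    return y;
}

// window[k] : projected encoder frame k (attentionDim values), k = 0..attentionSpan-1
// valid[k]  : 1 if frame k exists, 0 if it lies before the start of the sequence
// W         : [attentionDim x outputDim], v : [attentionDim]  (learned parameters)
// prevH     : previous decoder hidden state (outputDim values)
Vec AttentionContext(const std::vector<Vec>& window, const Vec& valid,
                     const Mat& W, const Vec& v, const Vec& prevH)
{
    Vec projectedH = MatVec(W, prevH);                            // projectedH = W(prevState.h)
    Vec u(window.size());
    for (size_t k = 0; k < window.size(); k++)
    {
        double s = 0;
        for (size_t i = 0; i < v.size(); i++)
            s += v[i] * std::tanh(window[k][i] + projectedH[i]);  // u = v' * tanh(windowProj + projectedH)
        u[k] = s + std::log(valid[k] + 1e-30);                    // mask out invalid frames (uValid)
    }
    double maxU = *std::max_element(u.begin(), u.end());          // softmax over the window
    double Z = 0;
    for (double& s : u) { s = std::exp(s - maxU); Z += s; }
    Vec context(window[0].size(), 0.0);                           // weightedAttentionAverage
    for (size_t k = 0; k < window.size(); k++)
        for (size_t i = 0; i < context.size(); i++)
            context[i] += (u[k] / Z) * window[k][i];
    return context; // row-stack with x and feed into the LSTM (augmentedX)
}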
@ -241,7 +317,7 @@ reader = [
labelType = "category"
labelDim = "$confVocabSize$"
labelMappingFile = "$ModelDir$/vocab.wl"
beginSequence = "</s>"
beginSequence = "$startSymbol$" # "</s>"
endSequence = "</s>"
#### Write definition ####
@ -341,7 +417,7 @@ cvReader = [
writeWordAndClassInfo = [
action = "writeWordAndClass"
inputFile = "$DataDir$/$trainFile$"
beginSequence = "</s>"
beginSequence = "$startSymbol$" # "</s>"
endSequence = "</s>"
outputMappingFile = "$ModelDir$/vocab.wl"
outputVocabFile = "$ModelDir$/vocab.txt"
@ -362,23 +438,24 @@ train = [
traceLevel = 1
epochSize = 0 # (for quick tests, this can be overridden with something small)
#BrainScriptNetworkBuilder is defined in outer scope
# BrainScriptNetworkBuilder is defined in outer scope
SGD = [
minibatchSize = 128*2:256:512
learningRatesPerSample = 0.01
minibatchSize = 128:128:256:512
learningRatesPerSample = 0.007*2:0.0035 #0.01 #0.005 # 0.01
momentumAsTimeConstant = 2500
gradientClippingWithTruncation = true # TODO: clip and truncate? What is the difference?
clippingThresholdPerSample = 15.0
maxEpochs = 16
maxEpochs = 50
numMBsToShowResult = 100
firstMBsToShowResult = 10
gradUpdateType = "none" # FSAdaGrad?
loadBestModel = true
# tracing (enable these for debugging)
#traceNodeNamesReal = labelsEmbedded:decoderInput:"decoder[0].lstmState._privateInnards.ht":z.Plus_left.Times_right.result:z:ce
#traceNodeNamesReal = labelsEmbedded:decoderInput:z:ce
#traceNodeNamesReal = thoughtVector.result:zMask:z:ce:wer:indexTestVals:index:packedIndex:filtered:unfiltered:isTraining
#traceNodeNamesReal = thoughtVectorEverywhere.result:zMask:z:ce:wer:indexTestVals:index:packedIndex:filtered:unfiltered:isTraining
#traceNodeNamesCategory = inputSequence.out:labelSequence
dropoutRate = 0.0
@ -454,7 +531,7 @@ test = [
labelType = "category"
labelDim = "$confVocabSize$"
labelMappingFile = "$ModelDir$/vocab.wl"
beginSequence = "</s>"
beginSequence = "$startSymbol$" # "</s>"
endSequence = "</s>"
#### Write definition ####
@ -504,51 +581,21 @@ write = [
# We need to make a change:
BrainScriptNetworkBuilder = ([
beamDepth = 3 // 0=predict; 1=greedy; >1=beam
enableTracing = true
traceFrequency = 1000
tracingLabelMappingFile = "$ModelDir$/vocab.wl"
include "S2SLib.bs"
beamDepth = $beamDepth$ // 0=predict; 1=greedy; >1=beam
# import some names
Constants = BS.Constants
Boolean = BS.Boolean
Loop = BS.Loop
Previous = Loop.Previous
IsFirst = Loop.IsFirst
Trace (node, say='', logFrequency=traceFrequency, logFirst=10, logGradientToo=false, onlyUpToRow=100000000, onlyUpToT=100000000, format=[], tag='') = new ComputationNode [
operation = 'Trace' ; inputs = node
]
formatDense = [
type = "real"
transpose = false
precisionFormat = ".4"
]
formatOneHot = [
type = "category"
transpose = false
labelMappingFile = "$ModelDir$/vocab.wl"
]
formatSparse = [
type = "sparse"
transpose = false
labelMappingFile = "$ModelDir$/vocab.wl"
]
enableTracing = true
traceFrequency = 1
TraceState (h, what) =
if enableTracing
then Transpose (Trace (Transpose (h), say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, onlyUpToRow=beamDepth*beamDepth, onlyUpToT=3, format=formatDense))
else h
TraceDense (h, what) =
if enableTracing
then Trace (h, say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, onlyUpToRow=beamDepth*beamDepth, onlyUpToT=3, format=formatDense)
else h
TraceOneHot (h, what) =
if enableTracing
then Trace (h, say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, /*onlyUpToRow=beamDepth*beamDepth, onlyUpToT=15,*/ format=formatOneHot)
else h
TraceSparse (h, what) =
if enableTracing
then Trace (h, say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, /*onlyUpToRow=beamDepth*beamDepth, onlyUpToT=3,*/ format=formatSparse)
else h
If = Boolean.If
OnesTensor = Constants.OnesTensor
# macro that extracts top D hypotheses from a 2D tensor
# input: scores[w,n] w = word index, d = hyp index in beam (d=0 is the best one)
@ -575,31 +622,49 @@ write = [
modelAsTrained = BS.Network.Load ("$decodeModelPath$")
useNYUStyle = false # TODO: we should be able to infer this from some dimensions
hasEmbeddings = false # TODO: infer this
top1DecodingModel(model) = new ComputationNetwork [
# compute top-N from output
logP = LogSoftmax (model.z)
offset = Constant (10000)
top1a = Hardmax (logP) .* (logP + offset)/*for tracing*/
top1b = top1a
top1b = Hardmax (logP) .* (logP + offset)/*for tracing*/
top1 = TraceSparse (top1b, 'logP') # TODO: get the accumulated logP out, it's a little more involved
topN = 10
tokenSet = GetTopNTensor (topN, logP) # [V x 1] -> [V x 1 x topN]
tokenSetScores = tokenSet .* logP # [V x 1 x topN]
topPaths = GetTopNTensor (topN, logP) # [V x 1] -> [V x 1 x topN]
topPathScores = topPaths .* logP # [V x 1 x topN]
# reduce back to a single column
topHyps = TraceSparse (tokenSetScores * ConstantTensor (1, (1 : topN)), 'topHyps')
topHyps = TraceSparse (topPathScores * OnesTensor (1 : topN), 'topHyps')
inputsOut = Pass (model.inputSequence)
labelsOut = Pass (TraceOneHot (model.labelSequence, 'labels'))
decodeOut = Pass (TraceOneHot (top1, 'out'))
topNOut = Pass (topHyps)
]
# replace old decoderFeedback node by newDecoderFeedback
EmbedLabels (x) = if hasEmbeddings then TransposeTimes (modelAsTrained.labelsEmbedded.TransposeTimesArgs[0], x) else x
decoderFeedback = EmbedLabels (Hardmax (modelAsTrained.z)) # in training, this is decoderFeedback = labelsEmbedded
decoderFeedback = modelAsTrained.decoderOutputEmbedded # in training, this is decoderFeedback = labelsEmbedded
sentenceStartEmbedded = Boolean.If (Loop.IsFirst (decoderFeedback), modelAsTrained.inputEmbedded, Previous (sentenceStartEmbedded)) # enforces no leaking of labels
delayedDecoderFeedback = Boolean.If (Loop.IsFirst (decoderFeedback), sentenceStartEmbedded, Loop.Previous (decoderFeedback)) # same expression as in training
# TODO: fold this in
PreviousOrDefault1 (x, defaultValue=Constant (0)) = # a delay node with initial value --TODO: merge the two, then do in C++
[
flags = IsFirst (defaultValue/*x*/)
out = If (flags,
/*then*/ defaultValue,
/*else*/ Previous (x))
].out
labelSentenceStart = modelAsTrained.labelSentenceStart_out # _ is a hack
labelsToUse = if hasEmbeddings then modelAsTrained.labelsEmbedded else modelAsTrained.labelSequence
labelSentenceStartToUse = if hasEmbeddings then modelAsTrained.labelSentenceStartEmbedded else labelSentenceStart
labelSentenceStartEmbeddedScattered = TraceDense (BS.Sequences.Scatter (IsFirst (labelsToUse), labelSentenceStartToUse), 'sest')
delayedDecoderFeedback = TraceDense (/*Loop.*/PreviousOrDefault1 (defaultValue=labelSentenceStartEmbeddedScattered, TraceDense (decoderFeedback, 'lemb')) , 'prev lemb')
greedyDecodingModel = BS.Network.Edit (modelAsTrained,
BS.Network.Editing.ReplaceLinksToNode (modelAsTrained.delayedDecoderFeedback, delayedDecoderFeedback),
@ -611,6 +676,8 @@ write = [
# decoder[0].prevState.h = PastValue (decoder[0].lstmState._privateInnards.ht) : [200 x 1 {1,200} x *] -> [200 x 1 {1,200} x *]
# decoder[0].prevState.c = PastValue (decoder[0].lstmState._privateInnards.ct) : [200 x 1 {1,200} x *] -> [200 x 1 {1,200} x *]
# decoderInput.inputs[1] = PastValue (labelsEmbedded) : [300 x 1 {1,300} x *] -> [300 x 1 {1,300} x *]
# decoder[0].prevState.h.elseVal = PastValue (decoder[0].lstmState._privateInnards.ht) : [512 x 1 x labelSequence.h.out.h.indexSequence.h.indexSequence.h] -> [512 x 1 x labelSequence.h.out.h.indexSequence.h.indexSequence.h]
# decoder[0].prevState.c.elseVal = PastValue (decoder[0].lstmState._privateInnards.ct) : [512 x 1 x labelSequence.h.out.h.indexSequence.h.indexSequence.h] -> [512 x 1 x labelSequence.h.out.h.indexSequence.h.indexSequence.h]
hiddenDim = modelAsTrained.delayedDecoderFeedback.dim
embeddingDim = modelAsTrained.decoderOutputEmbedded.dim
@ -635,21 +702,66 @@ write = [
# - traceback is a right-to-left recurrence
# - output best hypo conditioned on the path (it is already known)
propagationEdits[i:0..2] = // TODO: implement and use { } syntax
if i == 0 then (node => if node.name == 'decoder[0].prevState.h' then TraceState (Previous (PropagateTopN (node.PastValueArgs[0])), 'propagated') else node) # inject reshuffling of hypotheses
else if i == 1 then (node => if node.name == 'decoder[0].prevState.c' then TraceState (Previous (PropagateTopN (node.PastValueArgs[0])), 'propagated') else node)
propagationEdits[i:0..8] = // TODO: implement and use { } syntax TODO: VV elseVal only for non-NYU?
# non-NYU:
if i == 0 then (node => if node.name == 'decoder[0].prevState.h.elseVal' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node) # inject reshuffling of hypotheses
else if i == 1 then (node => if node.name == 'decoder[0].prevState.c.elseVal' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node)
# NYU:
else if i == 2 then (node => if node.name == 'decoder[0].prevState.h' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node) # inject reshuffling of hypotheses
else if i == 3 then (node => if node.name == 'decoder[0].prevState.c' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node)
# all:
else if i == 4 then (node => if node.name == 'decoder[1].prevState.h' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node) # inject reshuffling of hypotheses
else if i == 5 then (node => if node.name == 'decoder[1].prevState.c' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node)
else if i == 6 then (node => if node.name == 'decoder[2].prevState.h' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node) # inject reshuffling of hypotheses
else if i == 7 then (node => if node.name == 'decoder[2].prevState.c' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node)
else BS.Network.Editing.ReplaceLinksToNode (modelAsTrained.delayedDecoderFeedback, delayedDecoderFeedback)
# decoderFeedback must be updated to take actual decoder output
Elabel = modelAsTrained.decoderOutputEmbedded.TransposeTimesArgs[0]
decoderFeedback = TraceState (TransposeTimes (Elabel, TraceSparse (topWords, 'topWords')), 'feedback')
delayedDecoderFeedback = Boolean.If (Loop.IsFirst (decoderFeedback), sentenceStartEmbedded, Loop.Previous (decoderFeedback))
m2 = BS.Network.Edit (modelAsTrained,
propagationEdits,
(labelsOut : decodeOut)) # additional roots
(inputsOut : labelsOut : decodeOut)) # additional roots
ReduceAxis (axisDim, x, axis=1) = # unfortunately, we must feed in the dimension of the axis, it can't be inferred
if axis == 1 then Times (OnesTensor (axisDim), x, outputRank = 0)
else if axis == 2 then ReduceAxis (axisDim, TransposeDimensions (x, 1, 2), axis=1)
else Fail("ReduceAxis: Only supports axes 1 and 2.")
# === BEGIN DECODER ===
# constants for initial score and final traceback
initialPathScores = FirstAndOther (0, LOGZERO, beamDepth, axis = 2) # [1 x D]: [ 0, -INF, -INF, -INF, ... ]
finalHyp = FirstAndOther (1, 0, beamDepth, axis = 1) # [D] the final token is the top-scoring hypothesis, that is, hyp[0]
# path expansion of the D hypotheses that were best in previous time step (ordered as in previous time step)
logLLs = Columnwise (LogSoftmax, beamDepth, modelAsTrained.z) # [V x Dprev] log P(w|hist)
expandedPathScores = logLLs + If (IsFirst (logLLs), initialPathScores, Previous (tokens.score)) # [V x Dprev] log (P(w|hist) * P(hist)) for all top D hypotheses
# determine top D of expanded paths
topPaths = TraceSparse (GetTopNTensor (beamDepth, expandedPathScores), 'topPaths') # [V x Dprev] -> [V x Dprev x Dnew]
topPathScores = TraceSparse (topPaths .* expandedPathScores, 'topPathScores') # [V x Dprev x Dnew]
# form new decoding token, by reducing topPaths(Scores) along relevant dimensions
tokens = [ # [. x Dnew]
from = ReduceAxis (axis=1, vocabSize, topPaths) # [Dprev x Dnew], reduced over V
word = ReduceAxis (axis=2, beamDepth, topPaths) # [V x Dnew], reduced over Dprev
score = TraceDense (OnesTensor (1/*output dim*/ : /*reduction dims: */vocabSize : beamDepth/*Dprev*/) * topPathScores, 'tokens.score') # [1 x Dnew], reduced over [V x Dprev] and inserted a '1'
]
# network feedback for next time step
decoderFeedback = TraceState (EmbedLabels (TraceSparse (tokens.word, 'tokens.word')), 'feedback') # [embeddingDim x Dnew]
delayedDecoderFeedback = If (IsFirst (labelSentenceStartEmbeddedScattered), labelSentenceStartEmbeddedScattered, Loop.Previous (decoderFeedback))
# network state for next step. We must reorder the network state for use in next time step: Apply this lambda to all decoder LSTMs' h and c.
ReorderTopN (past_h_or_c) = Times (TraceState (past_h_or_c, 'past'), TraceDense (tokens.from, 'backp'))
# final traceback
traceback = TraceDense (If (Loop.IsLast (labelSentenceStartEmbeddedScattered/*tokens.from*/), finalHyp, Loop.Next (tokens.from * traceback)), 'traceback') # [D] one-hot, multiplying tokens.from from the left will select another one-hot row of tokens.from
decodeHyp = Times (topPaths, traceback, outputRank = 2) # [V x Dprev] 2D one-hot, selected the best hyp according to traceback
decode = TraceOneHot (decodeHyp * OnesTensor (beamDepth), 'out') # [V] reduces over Dprev -> 1D one-hot
# TODO: Can this be done in one ^^ go?
# === END DECODER ===
# propagate LSTM state to the right top-N rank given where that rank came from in the previous time step
@ -658,21 +770,19 @@ write = [
0 0 0
0 0 0")
PropagateTopN (past_h_or_c) = Times (TraceState (past_h_or_c, 'past'), TraceDense (backPointers, 'backp'))
# backPointers: [Dprev, Dnew]
# PropagateTopN:
# tokens.from: [Dprev, Dnew]
# v--------- best came from input hyp[1]
# v------- second best came from input hyp[0]
# v----- third best came from input hyp[2]
# 0 1 0
# 1 0 0
# 0 0 1
# backPointers[:,n] one-hot encodes the best predecessor at top-N rank n
# tokens.from[:,n] one-hot encodes the best predecessor at top-N rank n
# each column is a one-hot vector
# multiplying with such a column from the right will select the column represented by the one-hot value
# get decoder log likelihoods
# EvalActions: EnableNodeTracing {L"decoder[0].lstmState._privateInnards.it", L"z"}, //
logLLs = Columnwise (LogSoftmax, beamDepth, modelAsTrained.z) # [V x D] un-normalized log P(w|hist) + const
# logLLs: get decoder log likelihoods
Columnwise (f, beamDepth, z) = # TODO: Takes LogSoftmax over axis=1. it is more tricky to do this over arbitrary axes
[
@ -680,14 +790,12 @@ write = [
out = Splice (cols, axis=2)
].out
# decoder start token: 0 for first hyp, -INF for the others
# initialPathScores: decoder start token: 0 for first hyp, -INF for the others
LOGZERO = -1e30
initialPathScores = FirstAndOther (0, LOGZERO, beamDepth, axis = 2) # row vector: [ 0, -INF, -INF, -INF, ... ]
expandedPathScores = logLLs + PreviousOrDefault (PropagateTopN (pathScores), initialPathScores) # [V x Dprev] un-normalized log (P(w|hist) * P(hist)) for all top D hypotheses
# ^^ path expansion, [V x 1] + [1 x D] -> [V x D]
# expandedPathScores: path expansion, [V x 1] + [1 x D] -> [V x D]
tokenSet = TraceSparse (GetTopNTensor (beamDepth, expandedPathScores), 'tokenSet') # [V x Dprev] -> [V x Dprev x Dnew]
# topPaths:
# +-----+
# |0 0 0|
# |0 0 0|-+
@ -699,11 +807,8 @@ write = [
# |0 0 0|
# +-----+
#topWords = ReduceSum (axis=2, tokenSet) # TODO: add an axis parameter to SumColumnElements()
topWords = [
v1 = TransposeDimensions (tokenSet, 1, 2) # reduction axis is now the first
out = Times (ConstantTensor (1, (beamDepth)), v1, outputRank = 0) # reduce over the first axis and drop it
].out
# tokens.word:
#tokens.word = ReduceSum (axis=2, topPaths) # TODO: add an axis parameter to SumColumnElements()
# +-+
# |0|
# |0|-+
@ -715,7 +820,7 @@ write = [
# |0|
# +-+
backPointers = Times (ConstantTensor (1, (vocabSize)), tokenSet, outputRank = 0) # this is a tensor Times operation that reduces over the first dimension
# tokens.from:
# before dropping the first dimension: [V x Dprev x Dnew]
# +-----+
# |0 1 0| means input hyp[1] gave rise to the best
@ -724,16 +829,16 @@ write = [
# +-----+-+
# |0 0 1| means input hyp[2] gave rise to third best
# +-----+
# after: [Dprev,Dnew] e.g. "0 1 0" goes into first column, vertically
# after: [Dprev x Dnew] e.g. "0 1 0" goes into first column, vertically
# v--------- best came from input hyp[1]
# v------- second best came from input hyp[0]
# v----- third best came from input hyp[2]
# 0 1 0
# 1 0 0
# 0 0 1
# backPointers[:,n] one-hot encodes the best predecessor at top-N rank n
# tokens.from[:,n] one-hot encodes the best predecessor at top-N rank n
tokenSetScores = TraceSparse (tokenSet .* expandedPathScores, 'tokenSetScores') # [V x Dprev x Dnew]
# topPathScores:
# +-----+
# |0 0 0|
# |0 0 0|-+
@ -744,29 +849,24 @@ write = [
# +-----+z| z denotes the accumulated path score max_w P(w|hyp[2])
# |0 0 0|
# +-----+
pathScores = TraceDense (ConstantTensor (1, (1/*output dim*/ : /*reduction dims: */vocabSize : beamDepth/*Dprev*/)) * tokenSetScores, 'pathScores') # [1 x Dnew]
# traceback
# last state: take Hardmax over pathScores
# previous states: multiply with respective backPointers matrix
# traceback:
# last state: take Hardmax over tokens.score
# previous states: multiply with respective tokens.from matrix
# -> hyp index for every time step
# then finally use that to select the actual output. TODO: That's a sample-wise matrix product between two sequences!!!
traceback = TraceDense (NextOrDefault (backPointers * traceback, finalHyp), 'traceback') # [D] one-hot, multiplying backPointers from the left will select another one-hot row of backPointers
# TODO: condition must be 1-dim, not 2-dim tensor, so we use labelSentenceStartEmbeddedScattered instead of tokens.from
# +-+
# |0|
# |1| means at this time step, hyp[1] was the best globally
# |0|
# +-+
finalHyp = FirstAndOther (1, 0, beamDepth, axis = 1) # the final token is the top-scoring hypothesis, that is, hyp[0]
# and the actual decoding output
# decode: and the actual decoding output
# This is the one to output (top sentence-level hypothesis after traceback).
decode = [
hyp = Times (tokenSet, traceback, outputRank = 2) # [V x Dprev] 2D one-hot
out = TraceOneHot (hyp * ConstantTensor (1, beamDepth), 'out') # reduces over Dprev -> 1D one-hot
].out
# traceback : [Dnew]
# tokenSet : [V x Dprev x Dnew]
# topPaths : [V x Dprev x Dnew]
# +-----+
# |0 0 0|
# |0 0 0|-+
@ -787,25 +887,10 @@ write = [
else Splice (Constant (firstVal) : ConstantTensor (otherVals, (1 : N -1)), axis = axis1 /*, axis*/) # row vector: [ 0, -INF, -INF, -INF, ... ]
].out
inputsOut = Pass (modelAsTrained.inputSequence)
labelsOut = Pass (modelAsTrained.labelSequence)
decodeOut = Pass (decode)
#topNOut = Pass (topHyps)
PreviousOrDefault (x, initialValue) = # a delay node with initial value
BS.Boolean.If (BS.Loop.IsFirst (x),
/*then*/ initialValue,
/*else*/ BS.Loop.Previous (x))
#if BS.Loop.IsFirst (x)
#then initialValue
#else BS.Loop.Previous (x)
NextOrDefault (x, initialValue) = # a delay node with initial value
BS.Boolean.If (BS.Loop.IsLast (x),
/*then*/ initialValue,
/*else*/ BS.Loop.Next (x))
#if BS.Loop.IsLast (x)
#then initialValue
#else BS.Loop.Next (x)
].m2
model = if beamDepth == 0 then top1DecodingModel (modelAsTrained)
@ -814,8 +899,8 @@ write = [
].model)
#outputPath = "$OutputDir$/Write"
outputPath = "-" # "-" will write to stdout; useful for debugging
outputPath = $decodeOutputPath$
#outputPath = "-" # "-" will write to stdout; useful for debugging
#outputNodeNames = z1.out:labels1 # when processing one sentence per minibatch, this is the sentence posterior
#outputNodeNames = network.beamDecodingModel.z1.out:labels1 # when processing one sentence per minibatch, this is the sentence posterior
@ -825,13 +910,13 @@ write = [
#outputNodeNames = network.beamDecodingModel.labelsOut:network.beamDecodingModel.decodeOut #:topNOut
# joint:
outputNodeNames = labelsOut:decodeOut:network.beamDecodingModel.labelsOut:network.beamDecodingModel.decodeOut
outputNodeNames = inputsOut:labelsOut:decodeOut:network.beamDecodingModel.inputsOut:network.beamDecodingModel.labelsOut:network.beamDecodingModel.decodeOut
#outputNodeNames = labels1:network.beamDecodingModel.decode.out
#outputNodeNames = labels1:network.beamDecodingModel.expandedPathScores
#outputNodeNames = network.beamDecodingModel.pathScores:network.beamDecodingModel.traceback
# network.beamDecodingModel.tokenSetScores
# network.beamDecodingModel.pathScores
#outputNodeNames = network.beamDecodingModel.tokens.score:network.beamDecodingModel.traceback
# network.beamDecodingModel.topPathScores
# network.beamDecodingModel.tokens.score
# network.beamDecodingModel.traceback
# network.beamDecodingModel.expandedPathScores
@ -840,12 +925,12 @@ write = [
transpose = false
labelMappingFile = "$ModelDir$/vocab.wl"
#precisionFormat = "10"
sequenceEpilogue = "\t// %s\n"
#sequenceEpilogue = "\t// %s\n"
]
#traceNodeNamesReal = network.beamDecodingModel.pathScores:network.beamDecodingModel.tokenSetScores:network.beamDecodingModel.expandedPathScores:network.beamDecodingModel.backPointers
#traceNodeNamesCategory = network.beamDecodingModel.tokenSetScores
#traceNodeNamesSparse = network.beamDecodingModel.tokenSetScores:network.beamDecodingModel.backPointers:decoderOutputEmbedded.x
#traceNodeNamesReal = network.beamDecodingModel.tokens.score:network.beamDecodingModel.topPathScores:network.beamDecodingModel.expandedPathScores:network.beamDecodingModel.tokens.from
#traceNodeNamesCategory = network.beamDecodingModel.topPathScores
#traceNodeNamesSparse = network.beamDecodingModel.topPathScores:network.beamDecodingModel.tokens.from:decoderOutputEmbedded.x
minibatchSize = 8192 # choose this to be big enough for the longest sentence
# need to be small since models are updated for each minibatch
@ -895,7 +980,7 @@ write = [
labelType = "category"
labelDim = "$confVocabSize$"
labelMappingFile = "$ModelDir$/vocab.wl"
beginSequence = "</s>"
beginSequence = "$startSymbol$" # "</s>"
endSequence = "</s>"
#### Write definition ####
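The beam-search bookkeeping above (tokens.from as one-hot back-pointer columns, traceback as a right-to-left recurrence ending in the top-scoring hypothesis) reduces to the following plain-C++ sketch once the one-hot columns are replaced by integer indices. BeamStep, ExpandAndPrune and Traceback are illustrative names, not CNTK code.

#include <algorithm>
#include <cstddef>
#include <tuple>
#include <vector>

struct BeamStep
{
    std::vector<size_t> word;  // word[d]  = word chosen by hypothesis d at this step
    std::vector<size_t> from;  // from[d]  = predecessor hypothesis (rank in the previous step)
    std::vector<double> score; // score[d] = accumulated path score of hypothesis d
};

// Path expansion and pruning: add each word's log-likelihood to each surviving
// hypothesis' score and keep the beamDepth best (word, predecessor) pairs.
BeamStep ExpandAndPrune(const std::vector<std::vector<double>>& logLL, // logLL[d][v] = log P(v | history of hyp d)
                        const std::vector<double>& prevScore,          // accumulated score of hyp d so far
                        size_t beamDepth)
{
    std::vector<std::tuple<double, size_t, size_t>> expanded; // (score, word, from)
    for (size_t d = 0; d < logLL.size(); d++)
        for (size_t v = 0; v < logLL[d].size(); v++)
            expanded.emplace_back(prevScore[d] + logLL[d][v], v, d);
    std::partial_sort(expanded.begin(),
                      expanded.begin() + std::min(beamDepth, expanded.size()),
                      expanded.end(),
                      [](const auto& a, const auto& b) { return std::get<0>(a) > std::get<0>(b); });
    BeamStep step;
    for (size_t n = 0; n < beamDepth && n < expanded.size(); n++)
    {
        step.score.push_back(std::get<0>(expanded[n]));
        step.word.push_back(std::get<1>(expanded[n]));
        step.from.push_back(std::get<2>(expanded[n]));
    }
    return step;
}

// Traceback: the final token is the top-scoring hypothesis (rank 0); walking the
// back-pointers right to left recovers the best word sequence.
std::vector<size_t> Traceback(const std::vector<BeamStep>& steps)
{
    std::vector<size_t> output(steps.size());
    size_t d = 0;
    for (size_t t = steps.size(); t-- > 0; )
    {
        output[t] = steps[t].word[d]; // emit the word on the selected path
        d = steps[t].from[d];         // follow the back-pointer to the previous rank
    }
    return output;
}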

View file

@ -0,0 +1,48 @@
# TODO: must sort this out. For now, this is just shared stuff between training and decoding.
# these depend on the beamDepth parameter for now; fix this
TraceState (h, what) =
if enableTracing
then Transpose (Trace (Transpose (h), say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, onlyUpToRow=beamDepth*beamDepth, onlyUpToT=3, format=formatDense))
else h
TraceDense (h, what) =
if enableTracing
then Trace (h, say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, onlyUpToRow=21/*beamDepth*beamDepth*/, onlyUpToT=25, format=formatDense)
else h
TraceDenseTransposed (h, what) =
if enableTracing
then Trace (h, say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, onlyUpToRow=beamDepth*beamDepth, onlyUpToT=25, format=formatDenseTransposed)
else h
TraceOneHot (h, what) =
if enableTracing
then Trace (h, say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, /*onlyUpToRow=beamDepth*beamDepth, onlyUpToT=15,*/ format=formatOneHot)
else h
TraceSparse (h, what) =
if enableTracing
then Trace (h, say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, /*onlyUpToRow=beamDepth*beamDepth, onlyUpToT=3,*/ format=formatSparse)
else h
Trace (node, say='', logFrequency=traceFrequency, logFirst=10, logGradientToo=false, onlyUpToRow=100000000, onlyUpToT=100000000, format=[], tag='') = new ComputationNode [
operation = 'Trace' ; inputs = node
]
formatDense = [
type = "real"
transpose = false
precisionFormat = ".4"
]
formatDenseTransposed = [
type = "real"
transpose = true
precisionFormat = ".4"
]
formatOneHot = [
type = "category"
transpose = false
labelMappingFile = tracingLabelMappingFile
]
formatSparse = [
type = "sparse"
transpose = false
labelMappingFile = tracingLabelMappingFile
]

@ -1 +1 @@
Subproject commit f785679a6bd5cc089b138b3c6bcb68e4b1f345ae
Subproject commit f57be8b8caeddf385a44a14acc587f4e5168152d

View file

@ -17,6 +17,7 @@
#include "Config.h"
#include "SimpleEvaluator.h"
#include "SimpleOutputWriter.h"
#include "Criterion.h"
#include "BestGpu.h"
#include "ScriptableObjects.h"
#include "BrainScriptEvaluator.h"
@ -121,8 +122,8 @@ void DoCrossValidate(const ConfigParameters& config)
int traceLevel = config(L"traceLevel", "0");
size_t numMBsToShowResult = config(L"numMBsToShowResult", "100");
size_t maxSamplesInRAM = config(L"maxSamplesInRAM", (size_t)SIZE_MAX);
size_t numSubminiBatches = config(L"numSubminibatches", (size_t)1);
size_t maxSamplesInRAM = config(L"maxSamplesInRAM", (size_t)SIZE_MAX);
size_t numSubminiBatches = config(L"numSubminibatches", (size_t)1);
ConfigArray evalNodeNames = config(L"evalNodeNames", "");
vector<wstring> evalNodeNamesVector;
@ -131,7 +132,7 @@ void DoCrossValidate(const ConfigParameters& config)
evalNodeNamesVector.push_back(evalNodeNames[i]);
}
std::vector<std::vector<double>> cvErrorResults;
std::vector<std::vector<EpochCriterion>> cvErrorResults;
std::vector<std::wstring> cvModels;
DataReader cvDataReader(readerConfig);
@ -143,7 +144,7 @@ void DoCrossValidate(const ConfigParameters& config)
if (!fexists(cvModelPath))
{
fprintf(stderr, "model %ls does not exist.\n", cvModelPath.c_str());
fprintf(stderr, "Model %ls does not exist.\n", cvModelPath.c_str());
if (finalModelEvaluated || !fexists(modelPath))
continue; // file missing
else
@ -158,7 +159,7 @@ void DoCrossValidate(const ConfigParameters& config)
SimpleEvaluator<ElemType> eval(net, MPIWrapper::GetInstance(), numMBsToShowResult, traceLevel, maxSamplesInRAM, numSubminiBatches);
fprintf(stderr, "model %ls --> \n", cvModelPath.c_str());
fprintf(stderr, "Model %ls --> \n", cvModelPath.c_str());
auto evalErrors = eval.Evaluate(&cvDataReader, evalNodeNamesVector, mbSize[0], epochSize);
cvErrorResults.push_back(evalErrors);
@ -167,16 +168,14 @@ void DoCrossValidate(const ConfigParameters& config)
// find best model
if (cvErrorResults.size() == 0)
{
LogicError("No model is evaluated.");
}
std::vector<double> minErrors;
std::vector<int> minErrIds;
std::vector<double> evalErrors = cvErrorResults[0];
vector<double> minErrors;
vector<int> minErrIds;
vector<EpochCriterion> evalErrors = cvErrorResults[0];
for (int i = 0; i < evalErrors.size(); ++i)
{
minErrors.push_back(evalErrors[i]);
minErrors.push_back(evalErrors[i].Average());
minErrIds.push_back(0);
}
@ -185,9 +184,9 @@ void DoCrossValidate(const ConfigParameters& config)
evalErrors = cvErrorResults[i];
for (int j = 0; j < evalErrors.size(); j++)
{
if (evalErrors[j] < minErrors[j])
if (evalErrors[j].Average() < minErrors[j])
{
minErrors[j] = evalErrors[j];
minErrors[j] = evalErrors[j].Average();
minErrIds[j] = i;
}
}
@ -196,9 +195,7 @@ void DoCrossValidate(const ConfigParameters& config)
fprintf(stderr, "Best models:\n");
fprintf(stderr, "------------\n");
for (int i = 0; i < minErrors.size(); ++i)
{
fprintf(stderr, "Based on Err[%d]: Best model = %ls with min err %.8g\n", i, cvModels[minErrIds[i]].c_str(), minErrors[i]);
}
}
template void DoCrossValidate<float>(const ConfigParameters& config);
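For readers of the change above: EpochCriterion pairs an aggregated criterion value with the number of samples it was computed over, so cross-validation results can be compared by their per-sample average. A rough sketch of that idea (illustrative only, not the actual CNTK class definition):

#include <cstddef>
#include <limits>

struct EpochCriterionSketch
{
    double sumOfCriteria = 0; // criterion summed over all samples seen in the epoch
    size_t numSamples = 0;    // number of samples that sum covers

    double Average() const    // what the best-model loop above compares
    {
        return numSamples > 0 ? sumOfCriteria / numSamples
                              : std::numeric_limits<double>::infinity();
    }
};

With this, the selection loop simply tracks the minimum of evalErrors[j].Average() per evaluation node, as shown in the diff.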

View file

@ -74,6 +74,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
auto tensorShape = ProcessTensorShapeParameters(node, params, i, /*isImage=*/false, cnNodeType);
wstring dynamicAxis = node->GetOptionalParameter("dynamicAxis", "");
// TODO: Map dynamicAxis from name to node at this point, where that node is memoized inside NDL.
// first look for this node already existing in the network
// BUGBUG: How does this set the dimensions then?
if (m_net->NodeNameExists(name))

View file

@ -263,8 +263,8 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config)
InvalidArgument("Please specify parameters 'beginSequence' and 'endSequence'.");
if (!outputMappingFile.empty())
cerr << "Mapping file --> " << outputVocabFile << endl;
cerr << "Vocabulary file --> " << outputVocabFile << endl;
cerr << "Mapping file --> " << outputMappingFile << endl;
cerr << "Vocabulary file --> " << outputVocabFile << endl;
if (nbrCls > 0)
{
cerr << "Word-to-class map --> " << outputWord2Cls << endl;
@ -321,7 +321,10 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config)
str = str + endSequencePattern;
vstr = msra::strfun::split(str, "\t ");
for (int i = 1; i < vstr.size(); i++)
// This loop used to start at 1, assuming the begin and end symbols are the same.
// If they are not, I am now counting them both. No idea whether that is correct w.r.t. the class algorithm.
bool startWith1 = !beginSequence.empty() && beginSequence == endSequence;
for (size_t i = startWith1 ? 1 : 0; i < vstr.size(); i++)
v_count[vstr[i]]++;
}
fp.close();
@ -355,93 +358,108 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config)
vocabSize = wordCountLessCutoff;
}
// form classes
// Implements an algorithm by Mikolov --TODO: get the reference
wrd2cls.Resize(vocabSize, 1);
typedef pair<string, double> stringdouble;
unordered_map<string, double> removed; // note: std::map is supposedly faster
double unkCount = 0; // TODO: why double?
size_t size = 0;
size_t actual_vocab_size = vocabSize - 1;
priority_queue<stringdouble, vector<stringdouble>, compare_second<stringdouble>>
q(compare_second<stringdouble>(), vector<stringdouble>(v_count.begin(), v_count.end()));
while (size < actual_vocab_size && !q.empty()) // ==for (q=...; cond; q.pop())
{
size++;
string word = q.top().first;
double freq = q.top().second; // TODO: why double?
if (word == unkWord)
{
unkCount += freq;
actual_vocab_size++;
}
removed[q.top().first] = q.top().second;
q.pop();
}
while (!q.empty())
{
unkCount += q.top().second;
q.pop();
}
removed[unkWord] = unkCount;
m_count.resize(removed.size());
double total = 0;
double dd = 0;
if (nbrCls > 0)
{
for (const auto& iter : removed)
total += iter.second;
// form classes
// Implements an algorithm by Mikolov --TODO: get the reference
wrd2cls.Resize(vocabSize, 1);
for (const auto& iter : removed)
dd += sqrt(iter.second / total);
}
double df = 0;
size_t class_id = 0;
m_class.resize(removed.size());
priority_queue<stringdouble, vector<stringdouble>, compare_second<stringdouble>>
p(compare_second<stringdouble>(), vector<stringdouble>(removed.begin(), removed.end()));
while (!p.empty())
{
string word = p.top().first;
double freq = p.top().second;
typedef pair<string, double> stringdouble;
unordered_map<string, double> removed; // note: std::map is supposedly faster
double unkCount = 0; // TODO: why double?
size_t size = 0;
size_t actual_vocab_size = vocabSize - 1;
priority_queue<stringdouble, vector<stringdouble>, compare_second<stringdouble>>
q(compare_second<stringdouble>(), vector<stringdouble>(v_count.begin(), v_count.end()));
while (size < actual_vocab_size && !q.empty()) // ==for (q=...; cond; q.pop())
{
size++;
string word = q.top().first;
double freq = q.top().second; // TODO: why double?
if (word == unkWord)
{
unkCount += freq;
actual_vocab_size++;
}
removed[q.top().first] = q.top().second;
q.pop();
}
while (!q.empty())
{
unkCount += q.top().second;
q.pop();
}
removed[unkWord] = unkCount;
m_count.resize(removed.size());
double total = 0;
double dd = 0;
if (nbrCls > 0)
{
df += sqrt(freq / total) / dd;
if (df > 1)
df = 1;
for (const auto& iter : removed)
total += iter.second;
if (df > 1.0 * (class_id + 1) / nbrCls && class_id < nbrCls)
class_id++;
for (const auto& iter : removed)
dd += sqrt(iter.second / total);
}
size_t wid = m_words.size();
bool inserted = m_index.insert(make_pair(word, wid)).second;
if (inserted)
m_words.push_back(word);
double df = 0;
size_t class_id = 0;
m_class.resize(removed.size());
m_count[wid] = freq;
if (nbrCls > 0)
m_class[wid] = class_id;
p.pop();
priority_queue<stringdouble, vector<stringdouble>, compare_second<stringdouble>>
p(compare_second<stringdouble>(), vector<stringdouble>(removed.begin(), removed.end()));
while (!p.empty())
{
string word = p.top().first;
double freq = p.top().second;
if (nbrCls > 0)
{
df += sqrt(freq / total) / dd;
if (df > 1)
df = 1;
if (df > 1.0 * (class_id + 1) / nbrCls && class_id < nbrCls)
class_id++;
}
size_t wid = m_words.size();
bool inserted = m_index.insert(make_pair(word, wid)).second;
if (inserted)
m_words.push_back(word);
m_count[wid] = freq;
if (nbrCls > 0)
m_class[wid] = class_id;
p.pop();
}
assert(m_words.size() == m_index.size() && m_words.size() == m_class.size());
}
else // no classes
{
for (let& iter : v_count)
m_words.push_back(iter.first);
sort(m_words.begin(), m_words.end());
m_count.resize(m_words.size());
for (size_t i = 0; i < m_words.size(); i++)
m_count[i] = v_count.find(m_words[i])->second;
}
assert(m_words.size() == m_count.size());
// write the files
if (!outputMappingFile.empty())
{
msra::files::make_intermediate_dirs(s2ws(outputMappingFile));
ofstream ofmapping(outputMappingFile.c_str());
for (size_t i = 0; i < m_index.size(); i++)
ofmapping << m_words[i] << endl;
for (let& word : m_words)
ofmapping << word << endl;
ofmapping.close();
cerr << "Created label-mapping file with " << v_count.size() << " entries.\n";
}
msra::files::make_intermediate_dirs(s2ws(outputVocabFile));
ofstream ofvocab(outputVocabFile.c_str());
for (size_t i = 0; i < m_index.size(); i++)
for (size_t i = 0; i < m_words.size(); i++)
{
if (nbrCls > 0)
wrd2cls(i, 0) = (ElemType) m_class[i];
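As a side note on the class-formation block above (the frequency-based partitioning attributed to Mikolov): words sorted by descending count are assigned to nbrCls classes by accumulating normalized sqrt-frequency and advancing the class id at every 1/nbrCls quantile. A self-contained sketch of just that rule, with illustrative names (AssignWordClasses is not a CNTK function):

#include <cmath>
#include <cstddef>
#include <string>
#include <utility>
#include <vector>

// wordCounts must be sorted by descending count, as produced by the priority queue above.
std::vector<size_t> AssignWordClasses(const std::vector<std::pair<std::string, double>>& wordCounts,
                                      size_t nbrCls)
{
    double total = 0, dd = 0;
    for (const auto& wc : wordCounts) total += wc.second;
    for (const auto& wc : wordCounts) dd += std::sqrt(wc.second / total);

    std::vector<size_t> classes(wordCounts.size());
    double df = 0;
    size_t classId = 0;
    for (size_t i = 0; i < wordCounts.size(); i++)
    {
        df += std::sqrt(wordCounts[i].second / total) / dd; // cumulative normalized sqrt-frequency in [0,1]
        if (df > 1)
            df = 1;
        if (df > 1.0 * (classId + 1) / nbrCls && classId < nbrCls)
            classId++;                                      // advance class at each 1/nbrCls quantile
        classes[i] = classId;
    }
    return classes;
}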

View file

@ -36,6 +36,7 @@ ParameterTensor(dims, learningRateMultiplier = 1.0, init = 'uniform'/*|fixedValu
ConstantFromString(literal, tag='') = ParameterTensor((0)/*dim, will be inferred*/, init = 'fromLiteral', initFromLiteral = literal, learningRateMultiplier = 0.0)
DynamicAxis(tag='') = new ComputationNode [ operation = 'DynamicAxis' ; /*plus the function args*/ ]
Input(dims, dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'InputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]
# TODO: change from dynamicAxis by name to dynamicAxis being an actual object
SparseInput(dims, dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]
ImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'InputValue' ; isImage = true /*plus the function args*/ ]
SparseImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = true /*plus the function args*/ ]
@ -81,6 +82,7 @@ Times(A, B, outputRank=1, tag='') = new ComputationNode [ operation = 'Times' ;
Logistic(label, probability, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability) /*plus the function args*/ ]
WeightedLogistic(label, probability, instanceWeight, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability : instanceWeight) /*plus the function args*/ ]
ReconcileDynamicAxis(dataInput, layoutInput, tag='') = new ComputationNode [ operation = 'ReconcileDynamicAxis' ; inputs = (dataInput : layoutInput) /*plus the function args*/ ]
ReconcileMBLayout = ReconcileDynamicAxis # back compat
CastAs (type, data) = ReconcileDynamicAxis (data, type) # read as CastAs<type>(data) where the cast may consist of rearranging the data w.r.t. MBLayout or broadcasting across sequence items
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
Pooling(input, poolKind/*'max'|'average'*/, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Pooling' ; inputs = (input); pool = poolKind ; kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
@ -173,11 +175,14 @@ BS = [
Constants = [
Zero = ConstantTensor (0, (1))
One = ConstantTensor (1, (1))
OnesTensor (dims) = ConstantTensor (1, dims)
# BUGBUG: ZeroesLike() would recreate the full dimension of x. Well, no need if it considers broadcasting. But still wrong if we want to broadcast a vector of different tensor dim.
#ZeroesLike (x) = CastAs (x, Zero) // read: Cast<x>(Zero)
#OnesLike (x) = CastAs (x, One)
# CastAs() does not implement broadcasting
ZeroesLike (x) = RowSlice (0, 1, x) .* Zero // hack: get one row of input and multiply with zero
ZeroesLike (x) = SumColumnElements (RowSlice (0, 1, x) .* Zero) // hack: get one row of input and multiply with zero; double-hack: reduce extra tensor dims by SumCol
ZeroSequenceLike = ZeroesLike # TODO: this should yield a scalar sequence, while ZeroesLike should be a tensor
ZeroesLike1 (x) = x .* Zero # get a tensor of zeroes of same dim as x TODO: Do this as a C++ node (will be simple)
OnesLike (x) = ZeroesLike (x) + One
# is this like Sequences.Repeat?
True = 1
@ -216,6 +221,32 @@ Boolean = [
##############################################################################
Sequences = [
# broadcast a single-step sequence to a multi-step sequence
BroadcastSequenceAs (type, data1) = [ # type=example sequence with desired length (outside of a loop), data1=1 time step
ZeroSequenceLike (x) = RowSlice (0, 1, x) .* Constants.Zero # BUGBUG: SumColumnElements() has a CPU/GPU problem
index = /*Constants.*/ZeroSequenceLike (type) # create an index sequence [ 0 0 0 ... ] of target length
packedIndex = PackedIndex (data1, index) # convert into internal packed index w.r.t. 'data1'
out = GatherPacked (packedIndex, data1) # copy data1[0] to all elements, total length like 'type'
].out
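For intuition, a minimal C++ sketch of the broadcast-by-gather idea used above (hypothetical helper with plain vectors standing in for sequences; the real PackedIndex/GatherPacked nodes operate on packed minibatch layouts):

    #include <cstddef>
    #include <vector>

    // Broadcast a one-step sequence 'data1' to the length of 'type' by gathering
    // through an all-zero index sequence (the idea behind PackedIndex + GatherPacked).
    std::vector<float> BroadcastSequenceAs(const std::vector<float>& type,
                                           const std::vector<float>& data1 /*one step*/)
    {
        std::vector<size_t> index(type.size(), 0); // index sequence [ 0 0 0 ... ]
        std::vector<float> out(type.size());
        for (size_t t = 0; t < out.size(); t++)
            out[t] = data1[index[t]];              // copies data1[0] to every step
        return out;
    }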
# rolling window over past N samples
# returns a record [ value=..., valid=... ]
# This implementation is suboptimal in that it creates copies for the intermediate steps.
PastValueWindow (N, in) = [
delayLine[t:0..N-1] = [ # shift register for encoder, last N inputs
value = if t == 0
then in # delay 0: current value
else Loop.PastValue (0, in, timeStep=t)
valid = if t == 0
then Constants.One
else Constants.One - PastValue (1, Constants.ZeroesLike (in), timeStep=t, defaultHiddenActivation=1)
]
# delayLine[t].value = value of t steps in the past
# delayLine[t].valid = true if we had a value t steps in the past
value = Slice (-1, 0, axis=-1, SplitDimension (RowStack (array[0..N-1](t=>delayLine[t].value)), 1, N)) # [i, delay]
valid = Slice (-1, 0, axis=-1, SplitDimension (RowStack (array[0..N-1](t=>delayLine[t].valid)), 1, N)) # [i, delay]
]
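As a plain-loop illustration of the rolling window described above (a sketch only, assuming a single scalar-valued sequence; the BrainScript version builds it from PastValue nodes):

    #include <cstddef>
    #include <vector>

    // Rolling window over the past N samples of a sequence x:
    // value[t][d] = x[t-d] (0 if out of range), valid[t][d] = 1 if step t-d exists, else 0.
    void PastValueWindow(size_t N, const std::vector<float>& x,
                         std::vector<std::vector<float>>& value,
                         std::vector<std::vector<float>>& valid)
    {
        value.assign(x.size(), std::vector<float>(N, 0.0f));
        valid.assign(x.size(), std::vector<float>(N, 0.0f));
        for (size_t t = 0; t < x.size(); t++)
            for (size_t d = 0; d < N; d++)
                if (t >= d)
                {
                    value[t][d] = x[t - d]; // delay d: the value d steps in the past
                    valid[t][d] = 1.0f;     // we did have a value d steps back
                }
    }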
# fold left/right: Reduce entire sequence by applying binaryOp, e.g. FoldL (Plus, 0, input)
# LINQ calls this Aggregate; and may or may not specify the seed value; and allows a predicate
FoldL (binaryOp, x0, x) = _Fold (PastValue, binaryOp, x0, x)
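A minimal C++ sketch of the fold-left semantics (generic helper assumed for illustration, not part of this change):

    #include <functional>
    #include <vector>

    // FoldL reduces an entire sequence with a binary op and a seed, like LINQ's Aggregate:
    // FoldL(plus, 0, {1,2,3}) == ((0+1)+2)+3.
    template <typename T, typename BinaryOp>
    T FoldL(BinaryOp op, T x0, const std::vector<T>& x)
    {
        T acc = x0;
        for (const T& xi : x)
            acc = op(acc, xi); // left fold: apply op from the front
        return acc;
    }
    // e.g. FoldL(std::plus<float>(), 0.0f, input) sums the sequence.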
@ -312,8 +343,24 @@ Loop = [
_IsWithin (DelayFn/*PastValue or FutureValue*/, N, x) = DelayFn (0, Constants.ZeroesLike (x)/*false*/, timeStep=N, defaultHiddenActivation=Constants.True)
# opposite of Id's "next"
Previous (x) = PastValue (0, x, timeStep=1)
Next (x) = FutureValue (0, x, timeStep=1)
Previous (x) = PastValue (0, x, timeStep=1)
Next (x) = FutureValue (0, x, timeStep=1)
PreviousOrDefault (x, defaultValue=Constant (0)) = # a delay node with initial value --TODO: merge the two, then do in C++
[
flags = BS.Loop.IsFirst (x)
out = BS.Boolean.If (flags,
/*then*/ BS.Sequences.Scatter (flags, defaultValue),
/*else*/ Previous (x))
].out
NextOrDefault (x, defaultValue=Constant (0)) = # a delay node with initial value
[
flags = BS.Loop.IsLast (x)
out = BS.Boolean.If (flags,
/*then*/ BS.Sequences.Scatter (flags, defaultValue),
/*else*/ Next (x))
].out
] # see the PreviousOrDefault sketch below
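For reference, PreviousOrDefault reduced to plain sequence code (a sketch; NextOrDefault is the mirror image, substituting the default at the last step instead of the first):

    #include <cstddef>
    #include <vector>

    // Shift a sequence by one step and substitute a default value at the boundary
    // (where IsFirst() would be true).
    std::vector<float> PreviousOrDefault(const std::vector<float>& x, float defaultValue = 0.0f)
    {
        std::vector<float> out(x.size());
        for (size_t t = 0; t < x.size(); t++)
            out[t] = (t == 0) ? defaultValue  // first step: no past value, use the default
                              : x[t - 1];     // otherwise the previous step's value
        return out;
    }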
##############################################################################
@ -323,8 +370,9 @@ Loop = [
Parameters =
[
WeightParam (outputDim, inputDim) = Parameter (outputDim, inputDim, init='uniform', initValueScale=1, initOnCPUOnly=true, randomSeed=1)
BiasParam (dim) = ParameterTensor (dim, init='fixedValue', value=0.0)
ScalarParam() = Parameter (1, 1, init='fixedValue', value=0.0)
DiagWeightParam (outputDim) = ParameterTensor ((outputDim), init='uniform', initValueScale=1, initOnCPUOnly=true, randomSeed=1) # meant to be applied elementwise
BiasParam (dim) = ParameterTensor ((dim), init='fixedValue', value=0.0)
ScalarParam() = BiasParam (1)
# route input through an extra scalar weight, for stabilization
Stabilize (x, enabled=true) =
@ -350,16 +398,17 @@ RNNs =
// If we change this, we'd need to fix the LSTM end-to-end test.
LSTMP (inputDim, outputDim, cellDim, x, prevState, enableSelfStabilization=false) =
[
#inputDim = x.dim # get dimension from 'x' (if this works, we can remove the inputDim1 parameter)
_privateInnards = [ // encapsulate the privateInnards workings
dh = prevState.h // previous values
dc = prevState.c
// parameter macros--these carry their own weight matrices
B() = Parameters.BiasParam(cellDim)
B() = Parameters.BiasParam (cellDim)
W(v) = Parameters.WeightParam (cellDim, inputDim) * Parameters.Stabilize (v, enabled=enableSelfStabilization) // input-to-hidden
H(h) = Parameters.WeightParam (cellDim, outputDim) * Parameters.Stabilize (h, enabled=enableSelfStabilization) // hidden-to-hidden
C(c) = Parameters.WeightParam (cellDim, 1) .* Parameters.Stabilize (c, enabled=enableSelfStabilization) // cell-to-hidden (note: applied elementwise)
C(c) = Parameters.DiagWeightParam (cellDim) .* Parameters.Stabilize (c, enabled=enableSelfStabilization) // cell-to-hidden (note: applied elementwise)
// note: the W(x) here are all different, they all come with their own set of weights; same for H(dh), C(dc), and B()
it = Sigmoid (W(x) + B() + H(dh) + C(dc)) // input gate(t)
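For reference, the gate line above written out as a plain C++ sketch (hypothetical dense helper; W is [cellDim x inputDim], H is [cellDim x outputDim], the peephole weight and bias are [cellDim]):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // One LSTM gate as in the line above: it = sigmoid(W*x + b + H*dh + cPeep .* dc).
    std::vector<float> InputGate(const std::vector<std::vector<float>>& W,
                                 const std::vector<std::vector<float>>& H,
                                 const std::vector<float>& b,
                                 const std::vector<float>& cPeep,
                                 const std::vector<float>& x,
                                 const std::vector<float>& dh,
                                 const std::vector<float>& dc)
    {
        std::vector<float> it(b.size());
        for (size_t i = 0; i < it.size(); i++)
        {
            float z = b[i] + cPeep[i] * dc[i];                            // B() + elementwise C(dc)
            for (size_t j = 0; j < x.size(); j++)  z += W[i][j] * x[j];   // W(x)
            for (size_t j = 0; j < dh.size(); j++) z += H[i][j] * dh[j];  // H(dh)
            it[i] = 1.0f / (1.0f + std::exp(-z));                         // Sigmoid
        }
        return it;
    }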
@ -401,6 +450,28 @@ RNNs =
enableSelfStabilization1 = enableSelfStabilization // TODO: BS syntax needs to allow saying ^.enableSelfStabilization
lstmState = LSTMP (inputDim, outputDim, cellDim, x, prevState, enableSelfStabilization=enableSelfStabilization1)
].lstmState.h // that's the value we return
# same as RecurrentLSTMP but returns both h and c
RecurrentLSTMP2 (inputDim, outputDim, cellDim, x, enableSelfStabilization=false) =
[
prevState =
[
h = Loop.Previous (lstmState.h) # hidden state(t-1)
c = Loop.Previous (lstmState.c) # cell(t-1)
]
enableSelfStabilization1 = enableSelfStabilization // TODO: BS syntax needs to allow saying ^.enableSelfStabilization
lstmState = BS.RNNs.LSTMP (inputDim, outputDim, cellDim, x, prevState, enableSelfStabilization=enableSelfStabilization1)
].lstmState // that's the value we return
# a stack of recurrent LSTMs (unidirectional)
RecurrentLSTMP2Stack (input, inputDim, hiddenDims, cellDims, enableSelfStabilization=false) = [
useStabilizer = enableSelfStabilization
layer[i:0..Length (hiddenDims)-1] =
RecurrentLSTMP2 (if i == 0 then inputDim else hiddenDims[i-1],
hiddenDims[i], cellDims[i],
if i == 0 then input else layer[i-1].h,
enableSelfStabilization=useStabilizer)
].layer
]
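The stacking pattern of RecurrentLSTMP2Stack, shown as a small C++ sketch (hypothetical StepFn standing in for one recurrent LSTM layer):

    #include <cstddef>
    #include <vector>

    // Layer i consumes the input sequence for i == 0 and the previous layer's hidden
    // sequence otherwise, chaining the h outputs upward through the stack.
    template <typename Seq, typename StepFn>
    std::vector<Seq> StackLayers(const Seq& input, size_t numLayers, StepFn runLayer)
    {
        std::vector<Seq> layer(numLayers);
        for (size_t i = 0; i < numLayers; i++)
            layer[i] = runLayer(i, i == 0 ? input : layer[i - 1]); // chain h outputs
        return layer;
    }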
##############################################################################

View file

@ -184,7 +184,7 @@ void TestSequenceReader(const ConfigParameters& configBase)
MBLayoutPtr pMBLayout = make_shared<MBLayout>();
StreamMinibatchInputs matrices;
matrices.AddInput(featureNames[0], featuresMatrix, pMBLayout, TensorShape());
matrices.AddInput(labelNames[0], labelsMatrix , pMBLayout, TensorShape());
matrices.AddInput(labelNames[1] , labelsMatrix , pMBLayout, TensorShape());
auto start = std::chrono::system_clock::now();
int epochs = config("maxEpochs");

View file

@ -40,6 +40,7 @@ void DataReaderBase::SetMinibatchLayout(StreamMinibatchInputs& minibatch)
for (const auto& iter : minibatch)
{
assert(iter.second.pMBLayout == pMBLayout);
// TODO: This should be a runtime check, not an assert() that only runs in Debug.
UNUSED(iter);
}

View file

@ -148,47 +148,33 @@ void File::Init(const wchar_t* filename, int fileOptions)
// (wstring only for now; feel free to make this a template if needed)
/*static*/ wstring File::DirectoryPathOf(wstring path)
{
#ifdef WIN32
if (IsWindows8OrGreater())
#ifdef _WIN32
HRESULT hr;
path = msra::strfun::ReplaceAll<wstring>(path, L"/", L"\\"); // Win32 accepts forward slashes, but it seems that PathRemoveFileSpec() does not
if (IsWindows8OrGreater()) // PathCchRemoveFileSpec() only available on Windows 8+
{
typedef HRESULT(*PathCchRemoveFileSpecProc)(_Inout_updates_(_Inexpressible_(cchPath)) PWSTR, _In_ size_t);
HINSTANCE hinstLib = LoadLibrary(TEXT("api-ms-win-core-path-l1-1-0.dll"));
if (hinstLib == nullptr)
RuntimeError("DirectoryPathOf: LoadLibrary() unexpectedly failed.");
PathCchRemoveFileSpecProc PathCchRemoveFileSpec = reinterpret_cast<PathCchRemoveFileSpecProc>(GetProcAddress(hinstLib, "PathCchRemoveFileSpec"));
if (!PathCchRemoveFileSpec)
RuntimeError("DirectoryPathOf: GetProcAddress() unexpectedly failed.");
HINSTANCE hinstLib;
PathCchRemoveFileSpecProc ProcAdd;
BOOL fFreeResult = FALSE;
// this is the actual function call we care about
hr = PathCchRemoveFileSpec(&path[0], path.size());
FreeLibrary(hinstLib);
}
else // on Windows 7-, use older PathRemoveFileSpec() instead
hr = PathRemoveFileSpec(&path[0]);
hinstLib = LoadLibrary(TEXT("api-ms-win-core-path-l1-1-0.dll"));
if (hinstLib != nullptr)
{
ProcAdd = reinterpret_cast<PathCchRemoveFileSpecProc>(GetProcAddress(hinstLib, "PathCchRemoveFileSpec"));
if (NULL != ProcAdd)
{
auto hr = (ProcAdd)(&path[0], path.size());
if (hr == S_OK) // done
path.resize(wcslen(&path[0]));
else if (hr == S_FALSE) // nothing to remove: use .
path = L".";
}
else
{
LogicError("DirectoryPathOf: GetProcAddress() unexpectedly failed.");
}
fFreeResult = FreeLibrary(hinstLib);
}
else
{
LogicError("DirectoryPathOf: LoadLibrary() unexpectedly failed.");
}
}
else
{
auto hr = PathRemoveFileSpec(&path[0]);
if (hr != 0) // done
path.resize(wcslen(&path[0]));
else
path = L".";
}
RuntimeError("DirectoryPathOf: Path(Cch)RemoveFileSpec() unexpectedly failed with 0x%08x.", (unsigned int)hr);
#else
auto pos = path.find_last_of(L"/");
if (pos != path.npos)
@ -264,7 +250,7 @@ File::~File(void)
{
if (m_pcloseNeeded)
{
// TODO: Check for error code and throw if !std::uncaught_exception()
// TODO: Check for error code and throw if !std::uncaught_exception()
_pclose(m_file);
}
else if (m_file != stdin && m_file != stdout && m_file != stderr)

View file

@ -384,8 +384,8 @@ public:
{
// look for closing brace and also for another opening brace
// Inside strings we only accept the closing quote, and ignore any braces inside.
current = str.find_first_of(braceStack.back() == '"' ? "\"" : charsToLookFor, current + 1); //
if (current == string::npos) // none found: done or error
current = str.find_first_of(braceStack.back() == '"' ? "\"" : charsToLookFor, current + 1);
if (current == string::npos) // none found: error
break;
char brace = str[current];
// found the expected closing brace?
@ -406,7 +406,7 @@ public:
}
}
// hit end before everything was closed: error
RuntimeError("no closing bracket found in parameters");
RuntimeError("no closing %c found in parameters", braceStack.back());
//RuntimeError("no closing bracket found in parameters (opening bracket at offset %d)\n%s", (int)tokenStart, str.substr(tokenStart).c_str());
}
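A self-contained C++ sketch of the brace-scanning logic above (an approximation: it does not verify that the closing brace type matches the opening one):

    #include <stdexcept>
    #include <string>
    #include <vector>

    // Keep a stack of open braces; while inside a quoted string accept only the closing
    // quote and ignore braces. Returns the index of the brace closing the one at openPos.
    size_t FindMatchingClose(const std::string& str, size_t openPos)
    {
        std::vector<char> braceStack{str[openPos]};
        for (size_t current = openPos + 1; current < str.size(); current++)
        {
            char ch = str[current];
            if (braceStack.back() == '"')              // inside a string literal
            {
                if (ch == '"')
                    braceStack.pop_back();             // closing quote ends the string
            }
            else if (ch == '(' || ch == '[' || ch == '{' || ch == '"')
                braceStack.push_back(ch);              // nested opening brace or string
            else if (ch == ')' || ch == ']' || ch == '}')
                braceStack.pop_back();                 // closing brace (type match not checked)
            if (braceStack.empty())
                return current;                        // the outermost brace was closed
        }
        throw std::runtime_error("no closing brace found in parameters");
    }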

View file

@ -67,21 +67,21 @@ public:
Input() {} // some STL classes need this for general happiness
// helper for typecasting the matrix pointer
template<class ElemType>
template<class ElemType>
Matrix<ElemType>& GetMatrix(const wchar_t* name/*for debugging only*/ = L"(unknown)") const
{
{
assert(matrix);
auto* matrixp = dynamic_cast<Matrix<ElemType>*>(matrix.get());
if (!matrixp)
{
// print a rather rich error to track down a regression failure
auto isFloat = !!dynamic_cast<Matrix<float>*> (matrix.get());
if (!matrixp)
{
// print a rather rich error to track down a regression failure
auto isFloat = !!dynamic_cast<Matrix<float>*> (matrix.get());
auto isDouble = !!dynamic_cast<Matrix<double>*>(matrix.get());
LogicError("GetMatrix<%s>: Attempted to access input stream '%ls' with wrong precision, got %s {%d,%d} instead of %s.",
typeid(ElemType).name(), name, typeid(matrix.get()).name(), (int)isFloat, (int)isDouble, typeid(Matrix<ElemType>*).name());
}
return *matrixp;
}
return *matrixp;
}
};
private:

View file

@ -166,6 +166,7 @@ struct MBLayout
m_columnsValidityMask = std::move(other->m_columnsValidityMask);
m_writable = other->m_writable;
m_axisName = std::move(other->m_axisName);
}
@ -254,9 +255,11 @@ public:
size_t GetNumTimeSteps() const { return m_numTimeSteps; }
size_t GetNumParallelSequences() const { return m_numParallelSequences; }
const std::wstring GetAxisName() const { return m_axisName; }
void SetAxisName(const std::wstring& axisName) { m_axisName = axisName; }
// axis names are for now only a debugging aid
// In the future, there will be a mechanism to denote that axes are meant to be the same.
const wchar_t* GetAxisName() const { return m_axisName.c_str(); }
void SetAxisName(const std::wstring& name) { m_axisName = name; }
void SetUniqueAxisName(std::wstring name) // helper for constructing
{
static std::map<std::wstring, size_t> nameIndices;
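A sketch of what such a unique-axis-name helper can do with the static name map (an assumption about the intent, not the actual implementation):

    #include <map>
    #include <string>

    // Append a running index per base name so every MBLayout gets a distinct axis name.
    std::wstring MakeUniqueAxisName(std::wstring name)
    {
        static std::map<std::wstring, size_t> nameIndices;
        size_t index = nameIndices[name]++;        // 0 on first use, then 1, 2, ...
        if (index > 0)
            name += std::to_wstring(index);        // e.g. "features", "features1", ...
        return name;
    }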
@ -554,7 +557,9 @@ private:
mutable bool m_writable;
// The axis this MBLayout represents.
// For now only a string meant for debugging.
std::wstring m_axisName;
public:
// special accessor for sequence training --TODO: must be replaced by a different mechanism

View file

@ -776,8 +776,8 @@ void ComputationNetwork::DescribeNetworkUsingDot(list<ComputationArc>& arcs,
for (const auto& x : allnodes)
{
line.clear();
line = msra::strfun::wstrprintf(L" \"%ls\" [ label = \"%ls [%s%s]\\n%ls\" ] ;\n",
x->GetName().c_str(), x->GetName().c_str(), string(x->GetSampleLayout()).c_str(), x->HasMBLayout() ? " x *" : "",
line = msra::strfun::wstrprintf(L" \"%ls\" [ label = \"%ls [%s%ls]\\n%ls\" ] ;\n",
x->GetName().c_str(), x->GetName().c_str(), string(x->GetSampleLayout()).c_str(), x->GetMBLayoutAxisString().c_str(),
x->OperationName().c_str());
fstream << line;
}

View file

@ -52,9 +52,10 @@ public:
m_randomSeedOffset(0),
m_isCompiled(false),
m_areMatricesAllocated(false),
m_pMBLayoutOfNetwork(make_shared<MBLayout>(1,0, L"*")),
m_pMBLayoutOfNetwork(make_shared<MBLayout>(1, 0, L"*")),
m_environment(make_shared<ComputationEnvironment>())
{
//m_pMBLayoutOfNetwork->SetAxisName(L"T");
}
ComputationNetwork(DEVICEID_TYPE deviceId)
@ -706,10 +707,9 @@ public:
// evaluation
// -----------------------------------------------------------------------
// zeroes out all gradients except the root itself
// TODO: why not the root?
// zeroes out all gradients except the root itself (since its gradient is set from outside rather than propagated down)
// (Note that inside the nodes this only really sets a flag to do it later when needed, but that's not our concern.)
void ZeroGradients(const ComputationNodeBasePtr& rootNode)
void ZeroInputGradients(const ComputationNodeBasePtr& rootNode)
{
for (auto& node : GetAllNodesForRoot(rootNode))
node->ZeroGradientsOfInputs();

View file

@ -111,6 +111,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
// TODO: DiagTimes is also an alias of ElementTimes; current separate implementation is unnecessary.
else if (nodeType == L"PerDimMeanVarNormalizationNode") return New<PerDimMeanVarNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"PerDimMeanVarDeNormalizationNode") return New<PerDimMeanVarDeNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"ReconcileMBLayout") return New<ReconcileDynamicAxisNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"RowElementTimes") return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"RowSlice") return New<SliceNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"Scale") return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
@ -194,6 +195,7 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Creat
return net.AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(net.GetDeviceId(), paramName, tensorShape));
}
// TODO: change these to take an actual object instead of a name for dynamicAxis
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName)
{

View file

@ -43,18 +43,17 @@ void ComputationNetwork::ForwardProp(const ComputationNodeBasePtr rootNode)
GetNestedNetwork(rootNode)->ForwardProp(FrameRange(nullptr));
}
// set the gradient matrix of a node to a 1x1 matrix containing 1.0
// Returns false if the node is not a ComputationNode<ElemType>.
// set the gradient matrix of a (root) node to 1.0
// Returns false if the node is not a ComputationNode<ElemType>; see Backprop() below for intended use.
template <class ElemType>
static bool SetGradientToScalarOne(ComputationNodeBasePtr nodep)
static bool SetRootGradientToScalarOne(ComputationNodeBasePtr nodep)
{
auto node = dynamic_pointer_cast<ComputationNode<ElemType>>(nodep);
bool hasMatchingType = (node != nullptr);
if (hasMatchingType)
{
Matrix<ElemType>& grad = node->Gradient();
grad.Resize(node->Value());
grad.SetValue((ElemType) 1.0);
// reset the root gradient to 1
node->ResetGradient(1);
}
return hasMatchingType;
}
@ -69,13 +68,13 @@ void ComputationNetwork::Backprop(const ComputationNodeBasePtr rootNode) // trai
if (!Environment().IsTraining())
LogicError("Backprop: Requires the network to be in training mode.");
// reset all gradients to zero (actually, internally, this is lazy, but we don't care here)
ZeroGradients(rootNode);
// initialize root gradient with a scalar value of 1.0
if (!SetGradientToScalarOne<float>(rootNode) && !SetGradientToScalarOne<double>(rootNode))
if (!SetRootGradientToScalarOne<float>(rootNode) && !SetRootGradientToScalarOne<double>(rootNode))
LogicError("Backprop: Training criterion is neither ComputationNode<float> nor ComputationNode<double>.");
// reset all gradients below rootNode to zero (actually, internally, this is lazy, but we don't care here)
ZeroInputGradients(rootNode);
// backpropagate through the network
GetNestedNetwork(rootNode)->Backprop(FrameRange(nullptr), true, true);
}
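The backprop setup above in miniature (a sketch over a hypothetical flat tape; the real code works on the nested network and resets gradients lazily):

    #include <cstddef>
    #include <vector>

    // The root's gradient is seeded with 1.0 (d root / d root == 1), all other gradients
    // start at zero, and gradients are then accumulated from consumers to their inputs.
    // Nodes are assumed topologically sorted (inputs appear before their consumers).
    struct TapeNode { std::vector<size_t> inputs; std::vector<float> localGrads; };

    std::vector<float> Backprop(const std::vector<TapeNode>& tape, size_t rootIndex)
    {
        std::vector<float> grad(tape.size(), 0.0f); // the ZeroInputGradients step
        grad[rootIndex] = 1.0f;                     // the SetRootGradientToScalarOne step
        for (size_t n = tape.size(); n-- > 0;)      // walk the tape in reverse
            for (size_t k = 0; k < tape[n].inputs.size(); k++)
                grad[tape[n].inputs[k]] += grad[n] * tape[n].localGrads[k]; // chain rule
        return grad;
    }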
@ -134,6 +133,10 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
{
for (auto& node : m_nestedNodes)
{
#if 0
if (dynamic_pointer_cast<LearnableParameter<float>>(node))
dynamic_pointer_cast<ComputationNode<float>>(node)->DebugLogMinibatch();
#endif
if (node->IsOutOfDateWrtInputs())
{
node->BeginForwardProp();
@ -189,8 +192,9 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
for (auto& node : m_nestedNodes)
{
if (node->GetMBLayout() != GetMBLayout())
LogicError("Evaluate: all nodes inside a recurrent loop must have a layout that is identical; mismatch found for nodes '%ls' vs. '%ls'",
node->NodeName().c_str(), m_nestedNodes[0]->NodeName().c_str());
LogicError("Evaluate: All nodes inside a recurrent loop must have a layout that is identical; mismatch found for nodes '%ls' (%ls) vs. '%ls' (%ls)",
node ->NodeName().c_str(), node ->GetMBLayoutAxisString().c_str(),
m_nestedNodes[0]->NodeName().c_str(), m_nestedNodes[0]->GetMBLayoutAxisString().c_str());
}
// tell all that loop is about to commence
@ -525,7 +529,7 @@ void ComputationNetwork::ResetMBLayouts()
for (const auto& node : GetAllNodesForRoot(nullptr))
node->LinkToMBLayout(nullptr);
// DynamicAxis nodes are (apart from the network-wide MBLayout) the main holders of MBLayouts. Initialize them.
// DynamicAxis nodes are (apart from the soon-to-be-deprecated network-wide MBLayout) the main holders of MBLayouts. Initialize them.
// The only other instances are nodes that change the MBLayout, like WhereNode.
for (auto node : GetNodesWithType(L"DynamicAxis"))
node->LinkToMBLayout(make_shared<MBLayout>(1, 0, node->GetName()));
@ -533,6 +537,7 @@ void ComputationNetwork::ResetMBLayouts()
// This is now initialized inside of the Input nodes, with the proper connections.
for (auto node : InputNodes(nullptr))
{
// TODO: use if (!Is<ITakesDynamicAxis>(node))...
auto n = dynamic_pointer_cast<ITakesDynamicAxis>(node);
if (!n)
LogicError("Expected %ls to implement ITakesDynamicAxis, but it doesn't.", node->NodeDescription().c_str());
@ -704,7 +709,7 @@ size_t ComputationNetwork::ValidateNodes(list<ComputationNodeBasePtr> nodes, boo
{
unchanged = !ValidateNode(node, isFinalValidationPass);
string updatedPrototype = node->FormatOperationPrototype("");
#if 1 // print prototype in final validation pass
#if 0 // print prototype in final validation pass. Problematic for tracking down validation errors in loops.
unchanged;
if (isFinalValidationPass)
#else // print prototype upon every change (useful for debugging)

View file

@ -156,9 +156,16 @@ void ComputationNetwork::ConstructFromRoots(DEVICEID_TYPE deviceId, deque<Comput
// not in the cache yet: create it (or not if no such member)
void /*CustomConfigRecord::*/ ComputationNetwork::LazyCreateConfigMember(const wstring& id) const /*override*/
{
let iter = m_nameToNodeMap.find(id);
auto iter = m_nameToNodeMap.find(id);
if (iter == m_nameToNodeMap.end())
return; // no such node
{
// workaround to allow accessing members with '.' inside: change '.' to '_'
for (iter = m_nameToNodeMap.begin(); iter != m_nameToNodeMap.end(); iter++)
if (msra::strfun::ReplaceAll<wstring>(iter->first, L".", L"_") == id)
break;
if (iter == m_nameToNodeMap.end())
return; // no such node
}
const ComputationNodeBasePtr& node = iter->second;
// TODO: What is the expressionPath?
let& nodeName = node->NodeName(); // failFn lambda below holds a copy of the name for the error message. Let's not hold an unnecessary shared_ptr to the node, risking cycles & stuff.
@ -168,16 +175,20 @@ void /*CustomConfigRecord::*/ ComputationNetwork::LazyCreateConfigMember(const w
vector<wstring> /*IConfigRecord::*/ ComputationNetwork::GetMemberIds() const
{
vector<wstring> nodeNames;
set<wstring> nodeNames;
for (let& iter : m_nameToNodeMap)
{
const ComputationNodeBasePtr& node = iter.second;
const wstring& nodeName = node->NodeName();
if (nodeName.find_first_of(L".[$")) // only expose the top-level names
wstring nodeName = node->NodeName();
if (nodeName.find_first_of(L"$") != nodeName.npos) // skip non-top-level names
continue;
nodeNames.push_back(nodeName);
// temp solution for composites: use _ instead of .
nodeName = msra::strfun::ReplaceAll<wstring>(nodeName, L".", L"_");
if (nodeName.find_first_of(L".[") != nodeName.npos) // skip composite names
continue;
nodeNames.insert(nodeName);
}
return nodeNames;
return vector<wstring>(nodeNames.begin(), nodeNames.end());
}
// ===================================================================

View file

@ -31,8 +31,15 @@ void ComputationNode<ElemType>::Backprop(const FrameRange& fr, bool childrenInTh
// after nodes that propagate outside of the loop, and thus, in the last
// time step of the sequence, have not yet received a gradient from a parent
// and thus may not have had their gradient matrices allocated.
//if (m_needsGradient)
// LazyZeroGradient(); // set gradient to 0 if this is the first time
#if 1 // keep enabled once this works
#if 1 // log the cases where this is needed
if (m_needsGradient && !m_gradientInitialized)
//LogicError("%ls %ls operation: Backprop called with uninitialized gradient.", NodeName().c_str(), OperationName().c_str());
fprintf(stderr, "%ls %ls operation: Initializing gradient out of line.\n", NodeName().c_str(), OperationName().c_str());
#endif
if (m_needsGradient)
LazyZeroGradient(); // set gradient to 0 if this is the first time
#endif
if (fr.IsAllFrames() && IsPartOfLoop() && childrenInThisLoop)
LogicError("%ls %ls operation: Backprop called with whole-batch FrameRange on node that participates in a loop", NodeName().c_str(), OperationName().c_str());
@ -139,11 +146,11 @@ void ComputationNodeBase::ValidateBinaryZip(bool isFinalValidationPass, bool all
{
size_t dim1 = shape1[k];
// BUGBUG: We must consider the allowBroadcast flag here.
if (dims[k] == 1) // is [0] broadcasting?
if (dims[k] <= 1 && dim1 != 0) // is [0] broadcasting (1) or unspecified (0)?
dims[k] = dim1; // then use dimension we broadcast to
else if (dim1 == 1) // if [1] is broadcasting
; // dims is already correct
else if (isFinalValidationPass && dim1 != dims[k]) // no broadcasting: they must match
else if (dim1 <= 1 && dims[k] != 0) // if [1] is broadcasting or unspecified
; // then dims is already correct
else if (isFinalValidationPass && dim1 != dims[k]) // no broadcasting or unspecified: they must match
InvalidArgument("%ls: Input dimensions [%s] and [%s] are not compatible.",
NodeDescription().c_str(), string(shape0).c_str(), string(shape1).c_str());
}
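The per-axis dimension reconciliation above, isolated into a small sketch (0 meaning "unspecified", 1 meaning "broadcasts"):

    #include <cstddef>
    #include <stdexcept>

    // Reconcile one axis of two input shapes: a 0 or 1 dimension adopts the other side's
    // dimension; otherwise, in the final validation pass, the dimensions must agree.
    size_t ReconcileDim(size_t dim0, size_t dim1, bool isFinalValidationPass)
    {
        if (dim0 <= 1 && dim1 != 0)   // [0] broadcasts (1) or is unspecified (0): take [1]
            return dim1;
        if (dim1 <= 1 && dim0 != 0)   // [1] broadcasts or is unspecified: keep [0]
            return dim0;
        if (isFinalValidationPass && dim0 != dim1)
            throw std::invalid_argument("input dimensions are not compatible");
        return dim0;
    }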
@ -348,7 +355,7 @@ const std::string ComputationNodeBase::ShapeDescription() const
return msra::strfun::strprintf("[%s%s%ls]",
string(m_sampleLayout).c_str(),
HasMBLayout() ? " x " : "",
HasMBLayout() ? GetMBLayout()->GetAxisName().c_str() : L"");
HasMBLayout() ? GetMBLayout()->GetAxisName() : L"");
}
template <class ElemType>
@ -507,6 +514,7 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
{
if (formatChar == 'f') // print as real number
{
if (dval == 0) dval = fabs(dval); // clear the sign of a negative 0, which is produced inconsistently between CPU and GPU
fprintfOrDie(f, valueFormatString.c_str(), dval);
}
else if (formatChar == 'u') // print category as integer index
@ -707,7 +715,11 @@ using namespace Microsoft::MSR::CNTK;
template <>
shared_ptr<Object> MakeRuntimeObject<ComputationNodeBase>(const IConfigRecordPtr configp)
{
return NewComputationNodeFromConfig(configp);
let node = NewComputationNodeFromConfig(configp);
// temporarily disabling this, as it caused a test to fail:
//if (!node->Is<IRecurrentNode>())
// node->Validate(/*isFinalValidationPass*/false); // do an initial validation, so that we have access to dimensions
return node;
}
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<ComputationNodeBase> registerComputationNode(L"ComputationNode");

View file

@ -482,6 +482,18 @@ public:
const MBLayoutPtr& GetMBLayout() const { return m_pMBLayout; }
bool HasMBLayout() const { return !!m_pMBLayout; }
// for logging: get the string fragment for displaying the dimension
std::wstring GetMBLayoutAxisString() const
{
if (!HasMBLayout())
return L"";
const wstring& axisName = GetMBLayout()->GetAxisName();
if (axisName.empty())
return L" x *";
else
return L" x " + axisName;
}
protected: public: // ...the following should be protected, but nodes inquire about their children, requiring public access
size_t GetNumParallelSequences() const
@ -685,6 +697,14 @@ public:
return false;
}
// reset gradients of a node's inputs
// This really only clears the lazy-init flags (LazyZeroGradient() actually clears the values lazily).
void /*ComputationNodeBase::*/ ZeroGradientsOfInputs()
{
for (size_t i = 0; i < m_inputs.size(); i++)
Input(i)->m_gradientInitialized = false;
}
// -----------------------------------------------------------------------
// masking
// -----------------------------------------------------------------------
@ -695,8 +715,6 @@ public:
virtual void InvalidateMissingValueColumns(const FrameRange&) = 0;
virtual void InvalidateMissingGradientColumns(const FrameRange&) = 0;
virtual void ZeroGradientsOfInputs() = 0;
// -----------------------------------------------------------------------
// memory sharing
// -----------------------------------------------------------------------
@ -1218,7 +1236,7 @@ public:
return GradientFor(fr);
}
// tensor version of the above functions
TensorView<ElemType> DataTensorFor(Matrix<ElemType>& data, size_t rank, const FrameRange& fr)
TensorView<ElemType> DataTensorFor(const MatrixBasePtr& data, size_t rank, const FrameRange& fr)
{
try
{
@ -1231,11 +1249,11 @@ public:
}
TensorView<ElemType> ValueTensorFor(size_t rank, const FrameRange& fr)
{
return DataTensorFor(Value(), rank, fr);
return DataTensorFor(ValuePtr(), rank, fr);
}
TensorView<ElemType> GradientTensorFor(size_t rank, const FrameRange& fr)
{
return DataTensorFor(Gradient(), rank, fr);
return DataTensorFor(GradientPtr(), rank, fr);
}
// TODO: Are all these meant to read out a scalar? Then rename and verify dimensions.
@ -1300,6 +1318,7 @@ public:
void UpdateFunctionValuesSize()
{
UpdateDataSize(Value());
Value().CollapseDataLocationAfterWriting(); // actually before writing, should change the name
}
// -----------------------------------------------------------------------
@ -1375,14 +1394,8 @@ public:
// TODO: move to -Base (or -Network?)
void Backprop(const FrameRange& fr, bool childrenInThisLoop, bool childrenInOuterLoop) override;
// TODO: why of the inputs, and not the node itself?
void /*ComputationNodeBase::*/ ZeroGradientsOfInputs() override // clears the lazy-init flags (LazyZeroGradient() actually clears the values lazily)
{
for (size_t i = 0; i < m_inputs.size(); i++)
Input(i)->m_gradientInitialized = false;
}
// lazy resetting of gradient
// This performs the actual zeroing out.
void LazyZeroGradient()
{
if (!m_needsGradient)
@ -1391,8 +1404,14 @@ public:
if (m_gradientInitialized)
return;
ResetGradient(0);
}
// resize and reset this node's gradient to a given value (normally 0, 1 for root)
void ResetGradient(ElemType val)
{
UpdateDataSize(Gradient());
Gradient().SetValue(0);
Gradient().SetValue(val);
m_gradientInitialized = true;
}
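The lazy-reset pattern above, reduced to its core (a sketch; a bool flag plus a scalar stands in for the gradient matrix):

    // Clearing a gradient only resets a flag; the actual resize-and-fill happens the
    // first time the gradient is touched.
    struct LazyGradient
    {
        bool initialized = false;
        float value = 0.0f;                     // stands in for the gradient matrix

        void MarkDirty() { initialized = false; }   // the ZeroGradientsOfInputs step
        void Reset(float val)                       // the ResetGradient step
        {
            value = val;                        // resize + SetValue in the real code
            initialized = true;
        }
        void EnsureZeroed()                         // the LazyZeroGradient step
        {
            if (!initialized)
                Reset(0.0f);
        }
    };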
@ -1503,8 +1522,45 @@ public:
const std::string& sampleSeparator, std::string valueFormatString,
bool outputGradient = false) const;
// simple helper to log the content of a minibatch
void DebugLogMinibatch(bool outputGradient = false) const
{
fprintf(stderr, "<<<<<<\n"); // some prologue and epilogue so that we can use diff -c1 to see the node name
fprintf(stderr, "<<<<<<\n");
fprintf(stderr, "DebugLogMinibatch: <<<<< %ls%s >>>>>\n", NodeName().c_str(), outputGradient ? " (gradient)" : "");
WriteMinibatchWithFormatting(stderr, FrameRange(), 8, 10, false/*transpose*/, /*isCategoryLabel=*/false, /*isSparse=*/false, std::vector<std::string>(),
""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/,
"%.8f"/*valueFormatString*/, outputGradient);
fprintf(stderr, ">>>>>>\n");
fprintf(stderr, ">>>>>>\n");
}
void Trace()
{
#if 0
static const std::set<std::wstring> toLog{
L"labelSentenceStartEmbedded",
L"delayedDecoderFeedback.h.x",
L"delayedDecoderFeedback.h.flags",
L"delayedDecoderFeedback.h.out.thenVal.h.indexSequence.h.indexSequence.h",
L"delayedDecoderFeedback.h.out.thenVal.h.indexSequence.h",
L"delayedDecoderFeedback.h.out.thenVal.h",
L"delayedDecoderFeedback.h.out.PlusArgs[0]",
L"delayedDecoderFeedback.h.out.PlusArgs[1].ElementTimesArgs[0]",
L"delayedDecoderFeedback.h.out.elseVal",
L"delayedDecoderFeedback.h.out.PlusArgs[1]",
L"delayedDecoderFeedback.h.out",
L"delayedDecoderFeedback"
};
if (toLog.find(NodeName()) != toLog.end())
DebugLogMinibatch();
if (NodeName() == L"delayedDecoderFeedback.h.out")
{
static int i = 0;
if (++i == 2)
exit(1);
}
#endif
if (m_traceNodeValueReal || m_traceNodeValueAsCategoryLabel || m_traceNodeValueSparse)
{
fprintf(stderr, "Trace --> %s\n", FormatOperationPrototype("").c_str());
@ -1556,8 +1612,8 @@ public:
/*HasToString::*/ wstring ToString() const override
{
// we format it like "name : type rows x cols ( args )"
wstring result = /*TidyName*/ (NodeName()) + L" : " + OperationName();
result.append(msra::strfun::wstrprintf(L" [%s%s]", string(GetSampleLayout()).c_str(), HasMBLayout() ? " x *" : ""));
wstring result = NodeName() + L" : " + OperationName();
result.append(msra::strfun::wstrprintf(L" [%s%ls]", string(GetSampleLayout()).c_str(), GetMBLayoutAxisString().c_str()));
if (m_inputs.empty())
result.append(L" ()");
else
@ -1580,7 +1636,7 @@ public:
// for debugging purposes
void /*ComputationNodeBase::*/ PrintSelf(bool printMatrices = false) const
{
fprintf(stderr, "\n%ls[%s%s] = %ls", NodeName().c_str(), string(GetSampleLayout()).c_str(), HasMBLayout() ? " x *" : "", OperationName().c_str());
fprintf(stderr, "\n%ls[%s%ls] = %ls", NodeName().c_str(), string(GetSampleLayout()).c_str(), GetMBLayoutAxisString().c_str(), OperationName().c_str());
if (!IsLeaf())
{
@ -1589,7 +1645,7 @@ public:
{
if (i > 0)
fprintf(stderr, ", ");
fprintf(stderr, "%ls[%s%s] = %ls", m_inputs[i] ? m_inputs[i]->NodeName().c_str() : L"NULL", string(m_inputs[i]->GetSampleLayout()).c_str(), m_inputs[i]->HasMBLayout() ? " x *" : "", OperationName().c_str());
fprintf(stderr, "%ls[%s%ls] = %ls", m_inputs[i] ? m_inputs[i]->NodeName().c_str() : L"NULL", string(m_inputs[i]->GetSampleLayout()).c_str(), m_inputs[i]->GetMBLayoutAxisString().c_str(), OperationName().c_str());
}
fprintf(stderr, ")");
}
@ -1749,7 +1805,6 @@ public:
virtual void PrintSelf(bool) const override { NOT_IMPLEMENTED; }
virtual void ValidateInferInputDimsFrom(const TensorShape&) override { NOT_IMPLEMENTED; }
virtual void SetInput(const size_t, const Microsoft::MSR::CNTK::ComputationNodeBase::ComputationNodeBasePtr&) override { NOT_IMPLEMENTED; }
virtual void ZeroGradientsOfInputs(void) override { NOT_IMPLEMENTED; }
virtual void MaskMissingValueColumnsToZero(const Microsoft::MSR::CNTK::FrameRange&) override { NOT_IMPLEMENTED; }
virtual void MaskMissingGradientColumnsToZero(const Microsoft::MSR::CNTK::FrameRange&) override { NOT_IMPLEMENTED; }
virtual void InvalidateMissingValueColumns(const Microsoft::MSR::CNTK::FrameRange&) override { NOT_IMPLEMENTED; }
@ -1854,6 +1909,7 @@ protected:
using Base::GetInputSampleLayout; \
using Base::GetInputsFromConfig; \
using Base::GetMBLayout; \
using Base::GetMBLayoutAxisString; \
using Base::GetNumInputs; \
using Base::GetNumParallelSequences; \
using Base::GetNumTimeSteps; \
@ -1865,6 +1921,7 @@ protected:
using Base::Gradient; \
using Base::GradientAsMatrix; \
using Base::GradientFor; \
using Base::GradientPtr; \
using Base::GradientTensorFor; \
using Base::HasMBLayout; \
using Base::InferMBLayoutFromInputsForStandardCase; \
@ -1909,6 +1966,7 @@ protected:
using Base::ValidateUnaryMap; \
using Base::ValidateUnaryReduce; \
using Base::ValueFor; \
using Base::ValuePtr; \
using Base::ValueTensorFor; \
using Base::VerifyDataSize; \
using Base::VerifyDims; \

View file

@ -340,8 +340,8 @@ public:
size_t mapCount = m_mapCount.GetNumElements();
size_t weightCols = kW * kH * inDims.m_numChannels;
// check/infer input [0] (weights)
// BUGBUG: For now, we treat the weights as a 2D matrix. They should be a tensor proper.
// check/infer input [0] (weights)
// BUGBUG: For now, we treat the weights as a 2D matrix. They should be a tensor proper.
Input(0)->ValidateInferInputDimsFrom(TensorShape(mapCount, weightCols));
if (isFinalValidationPass && (Input(0)->GetAsMatrixNumCols() != weightCols || Input(0)->GetAsMatrixNumRows() != mapCount))
@ -358,31 +358,31 @@ public:
else
{
if (m_imageLayout != ImageLayoutKind::CHW)
{
{
InvalidArgument(
"%ls %ls supports only cuDNN (CHW) data layout. "
"Please specify imageLayout=\"cudnn\" in %ls node in your script "
"and make sure input data layout is CHW", NodeName().c_str(), OperationName().c_str(), NodeName().c_str());
}
}
inputShape = GetInputSampleLayout(inputIdx);
auto outDims = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
SetDims(outDims, HasMBLayout());
}
}
if (isFinalValidationPass)
{
if (m_convEng == nullptr)
{
{
auto geometry = std::make_shared<ConvolveGeometry>(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
m_convEng = ConvolutionEngine<ElemType>::Create(geometry, m_deviceId, m_imageLayout,
m_maxTempMemSizeInSamples, m_poolKind);
}
}
if (Input(0)->GetAsMatrixNumCols() != m_kernelShape.GetNumElements() ||
Input(0)->GetAsMatrixNumRows() != m_convEng->Geometry()->KernelCount())
{
{
LogicError("Convolution weight matrix %ls should have dimension [%d, %d] which is [kernelCount, kernelWidth * kernelHeight * inputChannels]",
Input(0)->NodeName().c_str(), (int)m_convEng->Geometry()->KernelCount(), (int)m_kernelShape.GetNumElements());
}
@ -587,7 +587,7 @@ public:
m_inputSizePerSample = inDims.m_width * inDims.m_height * inDims.m_numChannels;
SetDims(outDims.AsTensorShape(m_imageLayoutKind), true);
SetDims(outDims.AsTensorShape(m_imageLayoutKind), HasMBLayout());
if (isFinalValidationPass)
{

View file

@ -260,7 +260,7 @@ private:
TensorView<ElemType> OneSampleTensorFor(int inputIndex/*-1 for output*/, bool gradient/*instead of value*/, const FrameRange& fr)
{
auto input = inputIndex < 0 ? this : Input(inputIndex).get();
auto& data = gradient ? input->Gradient() : input->Value();
auto data = gradient ? input->GradientPtr() : input->ValuePtr();
size_t rank = input->GetSampleLayout().GetRank();
if (!Input(0)->HasMBLayout()) // left input is no MB data: run normally
return input->DataTensorFor(data, rank, fr);
@ -287,9 +287,9 @@ public:
// TensorView::DoMatrixProductOf() will reduce each tensor object into a 2D tensor (or fail if it cannot)
// and recreate actual Matrix objects (in case of sparse, they must be identical to the original tensor storage object).
// Transposition is applied after flattening into 2D, but only allowed if the input sample is 2D anyway.
auto input0 = OneSampleTensorFor(0, /*gradient=*/false, fr.AllowBroadcast());
auto input1 = OneSampleTensorFor(1, /*gradient=*/false, fr.AllowBroadcast());
auto output = OneSampleTensorFor(-1, /*gradient=*/false, fr);
auto input0 = OneSampleTensorFor(0, /*gradient=*/false, fr.AllowBroadcast());
auto input1 = OneSampleTensorFor(1, /*gradient=*/false, fr.AllowBroadcast());
auto output = OneSampleTensorFor(-1, /*gradient=*/false, fr);
output.AssignMatrixProductOf(false/*transC*/, input0, m_transpose/*transA*/, input1, false/*transB*/);
}
@ -318,16 +318,16 @@ public:
// If input data is sparse, then gradient is block sparse.
if (Input(1)->Value().GetMatrixType() == SPARSE && Input(0)->Gradient().GetMatrixType() == DENSE && Gradient().GetMatrixType() == DENSE)
Input(0)->Gradient().SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false);
auto input0Gradient = OneSampleTensorFor(0, /*gradient=*/true, fr.AllowBroadcast());
auto input1 = OneSampleTensorFor(1, /*gradient=*/false, fr.AllowBroadcast());
auto outputGradient = OneSampleTensorFor(-1, /*gradient=*/true, fr);
auto input0Gradient = OneSampleTensorFor(0, /*gradient=*/true, fr.AllowBroadcast());
auto input1 = OneSampleTensorFor(1, /*gradient=*/false, fr.AllowBroadcast());
auto outputGradient = OneSampleTensorFor(-1, /*gradient=*/true, fr);
input0Gradient.AddMatrixProductOf(m_transpose/*transC*/, outputGradient, false/*transA*/, input1, true/*transB*/);
}
else if (inputIndex == 1) // right derivative
{
auto input0 = OneSampleTensorFor(0, /*gradient=*/false, fr.AllowBroadcast());
auto input1Gradient = OneSampleTensorFor(1, /*gradient=*/true, fr.AllowBroadcast());
auto outputGradient = OneSampleTensorFor(-1, /*gradient=*/true, fr);
auto input0 = OneSampleTensorFor(0, /*gradient=*/false, fr.AllowBroadcast());
auto input1Gradient = OneSampleTensorFor(1, /*gradient=*/true, fr.AllowBroadcast());
auto outputGradient = OneSampleTensorFor(-1, /*gradient=*/true, fr);
input1Gradient.AddMatrixProductOf(false/*transC*/, input0, !m_transpose/*transA*/, outputGradient, false/*transB*/);
}
}
@ -819,16 +819,16 @@ public:
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto output = ValueTensorFor(rank, fr);
auto input = TensorView<ElemType>(Input(0)->Value(), GetTransposedTensorSliceFor(rank, fr));
auto output = ValueTensorFor( rank, fr);
auto input = TensorView<ElemType>(Input(0)->ValuePtr(), GetTransposedTensorSliceFor(rank, fr));
output.AssignCopyOf(input);
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto outputGradient = GradientTensorFor(rank, fr);
auto inputGradient = TensorView<ElemType>(Input(0)->Gradient(), GetTransposedTensorSliceFor(rank, fr));
auto outputGradient = GradientTensorFor( rank, fr);
auto inputGradient = TensorView<ElemType>(Input(0)->GradientPtr(), GetTransposedTensorSliceFor(rank, fr));
inputGradient.AddCopyOf(outputGradient);
}

View file

@ -51,7 +51,7 @@ public:
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
result.DoUnaryOpOf(0, input, 1, opForward);
result.DoUnaryOpOf(0, input, 1, opForward, opSum);
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
@ -61,8 +61,8 @@ public:
// get the args
size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one
auto sliceOutputGrad = GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one
// we expect a constant conditional expression here -- suppress the warning that leads to an error
// TODO: alternative: assign to a non-const variable and test that.
@ -70,7 +70,7 @@ public:
#pragma warning( disable : 4127 )
if (opType == UnaryGradient)
{
sliceInputGrad.DoUnaryOpOf(1, sliceOutputGrad, 1, opBackward);
sliceInputGrad.DoUnaryOpOf(1, sliceOutputGrad, 1, opBackward, opSum);
}
else
{
@ -78,7 +78,7 @@ public:
// Not possible for Cos().
auto sliceValue = (opType == BinaryWithOutputGradient) ? ValueTensorFor(rank, fr) : // using input or output value
Input(0)->ValueTensorFor(rank, fr);
sliceInputGrad.DoBinaryOpOf(1, sliceOutputGrad, sliceValue, 1, opBackward);
sliceInputGrad.DoBinaryOpOf(1, sliceOutputGrad, sliceValue, 1, opBackward, opSum);
}
#pragma warning( pop )
}
@ -194,6 +194,10 @@ public:
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
// move the target matrix to the target device, since below it is accessed as slices which cannot move
// TODO: once this gets reimplemented using TensorView, this will no longer be needed.
Input(0)->Value().TransferToDeviceIfNotThere(Value().GetDeviceId(), /*isBeingMoved=*/ false);
auto values = ValueFor(fr);
ForwardPropV(values, Input(0)->ValueFor(fr));
}

View file

@ -281,9 +281,9 @@ public:
DeclareConstructorFromConfigWithNumInputs(InvStdDevNode);
InvStdDevNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name),
m_mean(deviceId),
m_var(deviceId),
m_temp(deviceId)
m_mean(make_shared<Matrix<ElemType>>(deviceId)),
m_var (make_shared<Matrix<ElemType>>(deviceId)),
m_temp(make_shared<Matrix<ElemType>>(deviceId))
{
}
@ -295,21 +295,21 @@ public:
{
// reset accumulators
UpdateFunctionValuesSize();
m_mean.Resize(Value()); // mean accumulator normalized by #samples in it
m_var .Resize(Value()); // likewise the variance
m_temp.Resize(Value()); // and a temp
m_mean.SetValue(0); // reset the mean and var accumulators
m_var .SetValue(0);
m_mean->Resize(Value()); // mean accumulator normalized by #samples in it
m_var ->Resize(Value()); // likewise the variance
m_temp->Resize(Value()); // and a temp
m_mean->SetValue(0); // reset the mean and var accumulators
m_var ->SetValue(0);
Value().SetValue(0); // and clear m_value as well: We must do this here already to avoid a NaN check to flag while this is being estimated.
}
else // finalize
{
// m_value <- 1/stddev
ElemType sqrtFloor = 1e-10f;
m_var.InplaceTruncateBottom(sqrtFloor); // prevent too small variance (and negative square roots due to numeric inaccuracy)
m_var.InplaceSqrt();
m_var.ElementInverse();
Value().SetValue(m_var);
m_var->InplaceTruncateBottom(sqrtFloor); // prevent too small variance (and negative square roots due to numeric inaccuracy)
m_var->InplaceSqrt();
m_var->ElementInverse();
Value().SetValue(*m_var);
}
}
@ -361,16 +361,16 @@ public:
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<InvStdDevNode<ElemType>>(nodeP);
node->m_mean.SetValue(m_mean);
node->m_var.SetValue(m_var);
node->m_temp.SetValue(m_temp);
node->m_mean->SetValue(*m_mean);
node->m_var ->SetValue(*m_var);
node->m_temp->SetValue(*m_temp);
}
}
private:
Matrix<ElemType> m_mean;
Matrix<ElemType> m_var;
Matrix<ElemType> m_temp;
shared_ptr<Matrix<ElemType>> m_mean;
shared_ptr<Matrix<ElemType>> m_var;
shared_ptr<Matrix<ElemType>> m_temp;
};
template class InvStdDevNode<float>;

View file

@ -183,6 +183,10 @@ public:
virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
// move the target matrix to the target device, since below it is accessed as slices which cannot move
// TODO: change the accesses below to TensorView; then this will no longer be needed.
Input(0)->Gradient().TransferToDeviceIfNotThere(m_deviceId, /*isBeingMoved=*/ true);
assert(inputIndex == 0);
inputIndex;

View file

@ -74,30 +74,27 @@ template <class ElemType>
indexSequence.push_back(t);
// Note: The above accesses m_value directly on the CPU, putting it into BOTH state, possibly for other consumers as well.
}
input.CollapseDataLocationAfterWriting(); // BUGBUG: Move back, since BOTH state is broken at present.
// create a new MBLayout
let& outMBLayout = GetMBLayout();
outMBLayout->InitAsPackedSequences(SequenceLengthVector(sequences, indexSequences), /*temp*/m_placementBuffer, /*temp*/m_rowAllocationsBuffer);
// copy to output
vector<ElemType> buf(outMBLayout->GetNumCols(), numeric_limits<ElemType>::quiet_NaN()); // STL cannot easily avoid initializing, so we might as well init with NaN for gaps
for (size_t i = 0, j = 0; i < sequences.size();)
let size = min(sequences.size(), outMBLayout->GetAllSequences().size()); // no non-gap sequence has an index beyond this
for (size_t i = 0; i < size; i++)
{
if (sequences[i].seqId == GAP_SEQUENCE_ID) // gaps will keep the NaN
{
++i;
let& seq = outMBLayout->GetAllSequences()[i];
if (seq.seqId == GAP_SEQUENCE_ID) // gaps will keep the NaN
continue;
}
let& seq = outMBLayout->GetAllSequences()[j];
if (seq.seqId == GAP_SEQUENCE_ID) // When would we see this?
{
++j;
continue;
}
let& indexSequence = indexSequences[i];
for (size_t t = 0; t < seq.GetNumTimeSteps(); t++)
buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType)indexSequence[t];
++i;
++j;
}
// there may be dangling gaps at the end. Take the opportunity to verify this.
for (size_t i = size; i < sequences.size(); i++)
assert(sequences[i].seqId == GAP_SEQUENCE_ID);
for (size_t i = size; i < outMBLayout->GetAllSequences().size(); i++)
assert(outMBLayout->GetAllSequences()[i].seqId == GAP_SEQUENCE_ID);
// the result will be kept in CPUDEVICE, since most likely we will access it again in PackedIndexNode
Value().TransferToDeviceIfNotThere(CPUDEVICE, /*isBeingMoved=*/ true, /*emptyTransfer=*/ true, /*updatePreferredDevice=*/ true);
Value().SetValue(1, outMBLayout->GetNumCols(), CPUDEVICE, buf.data(), MatrixFormat::matrixFormatColMajor);
@ -107,7 +104,6 @@ template <class ElemType>
/*virtual*/ void WhereNode<ElemType>::BackpropToNonLooping(size_t /*inputIndex*/) /*override*/
{
// we cannot backprop through a condition
// Can we?
return;
}
@ -161,6 +157,8 @@ template <class ElemType>
result(0, jIndex) = (ElemType)jSource;
}
}
// Note: maybe this is no longer needed, now that we do the same inside UpdateFunctionValueSize() for all nodes.
result.CollapseDataLocationAfterWriting(); // BUGBUG: Move back, since BOTH state is broken at present.
}
template <class ElemType>

View file

@ -303,16 +303,16 @@ public:
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto output = ValueTensorFor(rank, fr);
let input = TensorView<ElemType>(Input(0)->Value(), GetInputSlice(rank, fr.AllowBroadcast()));
auto output = ValueTensorFor( rank, fr);
let input = TensorView<ElemType>(Input(0)->ValuePtr(), GetInputSlice(rank, fr.AllowBroadcast()));
output.AssignCopyOf(input);
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange& fr) override
{
size_t rank = DetermineElementwiseTensorRank();
let outputGrad = GradientTensorFor(rank, fr);
auto inputGrad = TensorView<ElemType>(Input(0)->Gradient(), GetInputSlice(rank, fr));
let outputGrad = GradientTensorFor( rank, fr);
auto inputGrad = TensorView<ElemType>(Input(0)->GradientPtr(), GetInputSlice(rank, fr.AllowBroadcast()));
inputGrad.AddCopyOf(outputGrad);
}
@ -413,7 +413,7 @@ public:
{
let input = Input(inputIndex)->ValueTensorFor(rank, fr.AllowBroadcast());
let outputSubSlice = NarrowToStripe(outputSlice, inputIndex);
auto output = TensorView<ElemType>(Value(), outputSubSlice);
auto output = TensorView<ElemType>(ValuePtr(), outputSubSlice);
output.AssignCopyOf(input);
}
}
@ -425,7 +425,7 @@ public:
auto inputGrad = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast());
let outputSubSlice = NarrowToStripe(outputSlice, inputIndex);
let outputGrad = TensorView<ElemType>(Gradient(), outputSubSlice);
let outputGrad = TensorView<ElemType>(GradientPtr(), outputSubSlice);
inputGrad.AddCopyOf(outputGrad);
}
@ -1074,7 +1074,10 @@ public:
else if (Input(0)->HasMBLayout())
{
if (!m_pMBLayout)
{
m_pMBLayout = make_shared<MBLayout>(); // mini-batch data: this generates a new layout
m_pMBLayout->SetUniqueAxisName(NodeName());
}
}
else
assert(!m_pMBLayout); // reshaping non-mini-batch data

View file

@ -692,7 +692,7 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::DoScatterColumnsOf(ElemType beta, cons
foreach_column(jIn, a)
{
auto jOutF = idx(0, jIn); // this is the column we copy/add into
if (jOutF < 0) // negative index means gap
if (jOutF < 0) // negative index means gap
continue;
size_t jOut = (size_t)jOutF;
if (jOut >= GetNumCols())
@ -4856,15 +4856,17 @@ void CPUMatrix<ElemType>::AssignScaledDifference(const ElemType alpha, const CPU
}
}
//c[ci,cj] += a[ai,aj]
// c[ci,cj] += a[ai,aj]
template <class ElemType>
void CPUMatrix<ElemType>::AddElementToElement(const CPUMatrix<ElemType>& a, const size_t ai, const size_t aj, CPUMatrix<ElemType>& c, const size_t ci, const size_t cj)
void CPUMatrix<ElemType>::AddElementToElement(ElemType beta, const CPUMatrix<ElemType>& a, const size_t ai, const size_t aj, CPUMatrix<ElemType>& c, const size_t ci, const size_t cj)
{
if (ai >= a.GetNumRows() || aj >= a.GetNumCols() ||
ci >= c.GetNumRows() || cj >= c.GetNumCols())
InvalidArgument("AddElementToElement: index out of range.");
c(ci, cj) += a(ai, aj);
ElemType us = beta ? beta * c(ci, cj) : 0; // do not multiply if beta is 0; c(ci, cj) could be a NaN
us += a(ai, aj);
c(ci, cj) = us;
}
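Why the beta == 0 case above avoids the multiplication: under IEEE 754, 0 * NaN is NaN, so "beta * c + a" would propagate a NaN sitting in an uninitialized target element even when beta is 0. A minimal demonstration:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        float c = std::nanf("");                   // uninitialized/garbage target element
        float a = 3.0f, beta = 0.0f;
        float wrong = beta * c + a;                // NaN: 0 * NaN is NaN
        float right = beta ? beta * c + a : a;     // 3.0: skip the multiply when beta == 0
        std::printf("%f %f\n", wrong, right);
        return 0;
    }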
////c[ci,cj] += a[ai,aj]
@ -4879,7 +4881,8 @@ void CPUMatrix<ElemType>::AddElementToElement(const CPUMatrix<ElemType>& a, cons
// c(ci, cj) += ((v < EPS_IN_LOG) ? LOG_OF_EPS_IN_LOG : log(v));
//}
//c[ci,cj] = a[ai,aj]
#if 0 // now done as AddElementToElement (beta=0)
// c[ci,cj] = a[ai,aj]
template <class ElemType>
void CPUMatrix<ElemType>::AssignElementToElement(const CPUMatrix<ElemType>& a, const size_t ai, const size_t aj, CPUMatrix<ElemType>& c, const size_t ci, const size_t cj)
{
@ -4889,6 +4892,7 @@ void CPUMatrix<ElemType>::AssignElementToElement(const CPUMatrix<ElemType>& a, c
c(ci, cj) = a(ai, aj);
}
#endif
/// <summary>c += alpha * (a-b)</summary>
/// a, b, and c must have the same dimensions
@ -6079,11 +6083,14 @@ static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType
// perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template <class ElemType>
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum) // TODO: enable the reduction ops
InvalidArgument("TensorOp: Unary reduction operations other than opSum not yet implemented.");
// TODO: Change the lambda to take a pointer and a number of elements, so that we can pass it 1 or 4 elements, in order for it to SSE-vectorize.
#define CaseUnaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
@ -6098,18 +6105,21 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
{
ForAllUnaryOps(CaseUnaryTensorOp);
default:
LogicError("TensorUnaryOp: Unknown op code %d.", (int) op);
LogicError("TensorOp: Unknown unary op code %d.", (int) op);
}
}
// perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template <class ElemType>
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum)
InvalidArgument("TensorOp (binary): The only permitted binary reduction operation is opSum.");
#define CaseBinaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 3>& pp) \
@ -6123,18 +6133,21 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
{
ForAllBinaryOps(CaseBinaryTensorOp);
default:
LogicError("TensorBinaryOp: Unknown op code %d.", (int) op);
LogicError("TensorOp: Unknown binary op code %d.", (int) op);
}
}
// perform ternary operation 'op' on a, b, and c giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template <class ElemType>
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum)
InvalidArgument("TensorOp: The only permitted ternary reduction operation is opSum.");
#define CaseTernaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 4>& pp) \
@ -6148,7 +6161,7 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
{
ForAllTernaryOps(CaseTernaryTensorOp);
default:
LogicError("TensorTernaryOp: Unknown op code %d.", (int) op);
LogicError("TensorOp: Unknown ternary op code %d.", (int) op);
}
}
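The contract these TensorOp overloads implement, for the unary case with opSum as the only supported reduction, shown as a sketch without strided addressing:

    #include <cstddef>
    #include <functional>
    #include <vector>

    // out[i] = beta * out[i] + alpha * sum over the reducing dimension of op(a[i][r]).
    // The full op table and the dims/strides machinery of the real code are omitted.
    void TensorUnaryOpSum(float beta, const std::vector<std::vector<float>>& a, float alpha,
                          const std::function<float(float)>& op, std::vector<float>& out)
    {
        for (size_t i = 0; i < out.size(); i++)
        {
            float sum = 0;
            for (size_t r = 0; r < a[i].size(); r++) // reducing dimension
                sum += op(a[i][r]);                  // elementwise op, then opSum reduction
            out[i] = beta * out[i] + alpha * sum;
        }
    }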

View file

@ -380,9 +380,7 @@ public:
static void AddScaledDifference(const CPUMatrix<ElemType>& alpha, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c); // alpha must be 1X1
static void AssignScaledDifference(const CPUMatrix<ElemType>& alpha, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c); // alpha must be 1X1
static void AddElementToElement(const CPUMatrix<ElemType>& a, const size_t ai, const size_t aj, CPUMatrix<ElemType>& c, const size_t ci, const size_t cj);
// static void AddLogElementToElement(const CPUMatrix<ElemType>& a, const size_t ai, const size_t aj, CPUMatrix<ElemType>& c, const size_t ci, const size_t cj);
static void AssignElementToElement(const CPUMatrix<ElemType>& a, const size_t ai, const size_t aj, CPUMatrix<ElemType>& c, const size_t ci, const size_t cj);
static void AddElementToElement(ElemType beta, const CPUMatrix<ElemType>& a, const size_t ai, const size_t aj, CPUMatrix<ElemType>& c, const size_t ci, const size_t cj);
static void MinusOneAt(CPUMatrix<ElemType>& c, const size_t position);
@ -397,15 +395,15 @@ public:
static void TensorShuffleScaleAndAdd(ElemType keepWeight, const CPUMatrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c);
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 4>& reducingStrides);

View file

@ -413,10 +413,20 @@ public:
{
if (!m_sob.unique())
LogicError("%s: Cannot resize the matrix because it is a view.", function);
if (m_sob->HasExternalBuffer())
else if (m_sob->HasExternalBuffer())
LogicError("%s: Cannot resize the matrix because it is externally owned.", function);
}
// This is needed for Sparse Matrices to ensure they can write to the matrix. Note: writing to slices is not currently supported
// same as VerifyResizable() except for the error message. Could be folded into one.
void VerifyMigratable(const char* function) const
{
if (!m_sob.unique())
LogicError("%s: Cannot migrate the matrix between devices because it is a view.", function);
else if (m_sob->HasExternalBuffer())
LogicError("%s: Cannot migrate the matrix between devices because it is externally owned.", function);
}
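The uniqueness and external-buffer checks above protect storage that other views may still reference. A hedged, self-contained sketch of the same invariant; Storage, View, and the exact error text are illustrative stand-ins, not the CNTK classes:

#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

struct Storage { std::vector<float> data; };

struct View
{
    std::shared_ptr<Storage> sob; // shared storage object, analogous to m_sob above

    void VerifyMigratable(const char* function) const
    {
        // If the storage is shared, another view aliases this buffer; moving or
        // reallocating it here would silently invalidate that other view.
        if (sob.use_count() != 1)
            throw std::logic_error(std::string(function) + ": cannot migrate, storage is shared with another view");
    }
};

int main()
{
    View owner{ std::make_shared<Storage>() };
    owner.VerifyMigratable("step 1");         // fine: sole owner of the storage
    View alias = owner;                       // a second view onto the same storage
    try { owner.VerifyMigratable("step 2"); } // now throws
    catch (const std::logic_error&) { }
    (void) alias;
}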
// This is needed for Sparse Matrices to ensure they can write to the matrix. Note: writing to slices is not currently supported
void VerifyWritable(const char* function) const
{
if (!(m_sob->GetNumStorageRows() == m_numRows && m_sob->GetNumStorageCols() == m_numCols))

View file

@ -880,6 +880,7 @@ __global__ void _doGatherColumnsOf(ElemType* us, size_t usStride, const ElemType
return;
// id = i + jOut * usStride;
// Each thread processes one element of the output matrix.
CUDA_LONG i = id % usStride; // row index into 'us' and 'a'
CUDA_LONG jOut = id / usStride; // col index into 'us' and 'idx'
@ -892,7 +893,7 @@ __global__ void _doGatherColumnsOf(ElemType* us, size_t usStride, const ElemType
const ElemType& ra = a[ i + jIn * aStride ];
ElemType& rus = us[id/*i + jOut * usStride*/];
ElemType res = ra * alpha;
if (beta != 0)
res += rus * beta;
@ -909,7 +910,7 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::DoGatherColumnsOf(ElemType beta, const
if (beta == 0)
RequireSize(a.GetNumRows(), idx.GetNumCols()); // output has same column format as a, but number of columns comes from idx
else
this->VerifySize(a.GetNumRows(), idx.GetNumCols());
VerifySize(a.GetNumRows(), idx.GetNumCols());
if (idx.GetComputeDeviceId() != a.GetComputeDeviceId() || GetComputeDeviceId() != a.GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");
@ -935,6 +936,7 @@ __global__ void _doScatterColumnsOf(ElemType* us, size_t usStride, size_t usCols
return;
// id = i + jIn * aStride
// Each thread processes one element of a
CUDA_LONG i = id % aStride; // row index into 'a' and 'us'
CUDA_LONG jIn = id / aStride; // col index into 'a' and 'idx'
@ -943,7 +945,7 @@ __global__ void _doScatterColumnsOf(ElemType* us, size_t usStride, size_t usCols
return;
size_t jOut = (size_t)jOutF;
if (jOut >= usCols)
return; // actually a failure
return; // actually a failure --TODO: This should not be necessary. Why is it?
const ElemType& ra = a[id/*i + jIn * aStride*/];
ElemType& rus = us[ i + jOut * usStride ];
@ -3345,7 +3347,7 @@ template <class ElemType>
return;
a.PrepareDevice();
if (a.IsEmpty() || b.IsEmpty())
LogicError("ScaleAndAdd: one of the input matrices is empty.");
LogicError("ScaleAndAdd: One of the input matrices is empty.");
c.RequireSize(b.GetNumRows(), b.GetNumCols());
// if (a.GetNumRows() != 1 && a.GetNumCols() != 1) // a is not a col or row vector
if (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()) // dimensions match
@ -3396,7 +3398,7 @@ template <class ElemType>
_matrixVectorRowWiseAddWithThreadPerElem<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(a.Data(), b.Data(), c.Data(), alpha, m, n);
}
else
InvalidArgument("dimension of matrix c does not match dimension of matrix a.");
InvalidArgument("Dimension of matrix c does not match dimension of matrix a.");
}
}
@ -3423,11 +3425,11 @@ void GPUMatrix<ElemType>::AddScaledDifference(const ElemType alpha, const GPUMat
if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() &&
a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols()))
{
InvalidArgument("AddScaledDifference: a, b, and c must have same dimension.");
InvalidArgument("AddScaledDifference: a, b, and c must have same dimension.");
}
if (a.IsEmpty())
LogicError("AddScaledDifference: Input matrix a is empty.");
LogicError("AddScaledDifference: Input matrix a is empty.");
CUDA_LONG n = (CUDA_LONG) a.GetNumElements();
int blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock);
@ -3456,12 +3458,10 @@ void GPUMatrix<ElemType>::AssignScaledDifference(const ElemType alpha, const GPU
assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
{
InvalidArgument("AssignScaledDifference: a, b must have same dimension.");
}
InvalidArgument("AssignScaledDifference: a, b must have same dimension.");
if (a.IsEmpty())
LogicError("AssignScaledDifference: Input matrix a is empty.");
LogicError("AssignScaledDifference: Input matrix a is empty.");
if (&c != &a && &c != &b)
c.RequireSize(a.GetNumRows(), a.GetNumCols());
@ -3484,7 +3484,7 @@ void GPUMatrix<ElemType>::AddScaledDifference(const GPUMatrix<ElemType>& alpha,
{
assert(alpha.GetNumElements() == 1);
if (!(alpha.GetNumElements() == 1))
InvalidArgument("AddScaledDifference: alpha must be a 1X1 matrix.");
InvalidArgument("AddScaledDifference: alpha must be a 1X1 matrix.");
if (a.GetComputeDeviceId() != c.GetComputeDeviceId())
{
@ -3500,11 +3500,11 @@ void GPUMatrix<ElemType>::AddScaledDifference(const GPUMatrix<ElemType>& alpha,
if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() &&
a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols()))
{
InvalidArgument("AddScaledDifference: a, b, and c must have same dimension.");
InvalidArgument("AddScaledDifference: a, b, and c must have same dimension.");
}
if (a.IsEmpty())
LogicError("AddScaledDifference: Input matrix a is empty.");
LogicError("AddScaledDifference: Input matrix a is empty.");
CUDA_LONG n = (CUDA_LONG) a.GetNumElements();
int blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock);
@ -3524,7 +3524,7 @@ void GPUMatrix<ElemType>::AssignScaledDifference(const GPUMatrix<ElemType>& alph
{
assert(alpha.GetNumElements() == 1);
if (!(alpha.GetNumElements() == 1))
InvalidArgument("AddScaledDifference: alpha must be a 1X1 matrix.");
InvalidArgument("AddScaledDifference: alpha must be a 1X1 matrix.");
if (a.GetComputeDeviceId() != c.GetComputeDeviceId())
{
@ -3538,11 +3538,11 @@ void GPUMatrix<ElemType>::AssignScaledDifference(const GPUMatrix<ElemType>& alph
if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
{
InvalidArgument("AssignScaledDifference: a, b must have same dimension.");
InvalidArgument("AssignScaledDifference: a, b must have same dimension.");
}
if (a.IsEmpty())
LogicError("AssignScaledDifference: Input matrix a is empty.");
LogicError("AssignScaledDifference: Input matrix a is empty.");
c.RequireSize(a.GetNumRows(), a.GetNumCols());
@ -3555,16 +3555,15 @@ void GPUMatrix<ElemType>::AssignScaledDifference(const GPUMatrix<ElemType>& alph
//c[ci,cj] += a[ai,aj]
template <class ElemType>
void GPUMatrix<ElemType>::AddElementToElement(const GPUMatrix<ElemType>& a, const size_t ai, const size_t aj, GPUMatrix<ElemType>& c, const size_t ci, const size_t cj)
void GPUMatrix<ElemType>::AddElementToElement(ElemType beta, const GPUMatrix<ElemType>& a, const size_t ai, const size_t aj, GPUMatrix<ElemType>& c, const size_t ci, const size_t cj)
{
if (ai >= a.GetNumRows() || aj >= a.GetNumCols() ||
ci >= c.GetNumRows() || cj >= c.GetNumCols())
InvalidArgument("AddElementToElement: index out of range.");
InvalidArgument("AddElementToElement: Index out of range.");
a.PrepareDevice();
int blocksPerGrid = 1; // only one element --BUGBUG: then why not launch only 1 thread per block?
SyncGuard syncGuard;
_addElementToElement<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock /*BUGBUG: should be 1?*/, 0, t_stream>>>(a.Data(), (CUDA_LONG) a.LocateElement(ai, aj), c.Data(), (CUDA_LONG) c.LocateElement(ci, cj));
_addElementToElement<ElemType><<<1, 1, 0, t_stream>>>(beta, a.Data(), (CUDA_LONG) a.LocateElement(ai, aj), c.Data(), (CUDA_LONG) c.LocateElement(ci, cj));
}
template <class ElemType>
@ -4238,11 +4237,14 @@ static shared_ptr<GPUMatrix<ElemType>> GetOnesVector(size_t N, DEVICEID_TYPE dev
// perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This binds the N-ariness to a template parameter N, and gets the data pointers out from the matrix objects.
template <class ElemType>
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum) // TODO: enable the reduction ops
InvalidArgument("TensorOp: Unary reduction operations other than opSum not yet implemented.");
a.PrepareDevice();
if (a.GetComputeDeviceId() != GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");
@ -4293,11 +4295,14 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
// perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
template <class ElemType>
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum)
InvalidArgument("TensorOp: The only permitted binary reduction operation is opSum.");
a.PrepareDevice();
if (a.GetComputeDeviceId() != GetComputeDeviceId() || b.GetComputeDeviceId() != GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");
@ -4307,11 +4312,14 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
// perform ternary operation 'op' on a, b, and c giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
template <class ElemType>
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, const GPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, const GPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum)
InvalidArgument("TensorOp: The only permitted ternary reduction operation is opSum.");
a.PrepareDevice();
if (a.GetComputeDeviceId() != GetComputeDeviceId() || b.GetComputeDeviceId() != GetComputeDeviceId() || c.GetComputeDeviceId() != GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");

View file

@ -125,6 +125,7 @@ public:
using Base::SetFormat;
using Base::IsEmpty;
using Base::VerifyResizable;
using Base::VerifySize;
public:
using Base::VerifyWritable;
@ -461,7 +462,7 @@ public:
static void AddScaledDifference(const GPUMatrix<ElemType>& alpha, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c);
static void AssignScaledDifference(const GPUMatrix<ElemType>& alpha, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c);
static void AddElementToElement(const GPUMatrix<ElemType>& a, const size_t ai, const size_t aj, GPUMatrix<ElemType>& c, const size_t ci, const size_t cj);
static void AddElementToElement(ElemType beta, const GPUMatrix<ElemType>& a, const size_t ai, const size_t aj, GPUMatrix<ElemType>& c, const size_t ci, const size_t cj);
// minus one at a specific position
static void MinusOneAt(GPUMatrix<ElemType>& c, const size_t position);
@ -477,15 +478,15 @@ public:
static void TensorShuffleScaleAndAdd(ElemType keepWeight, const GPUMatrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c);
void TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
void TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
void TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
void TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
void TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, const GPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
void TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, const GPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 4>& reducingStrides);

View file

@ -2567,13 +2567,16 @@ __global__ void _assignScaledDifference(
template <class ElemType>
__global__ void _addElementToElement(
ElemType beta,
const ElemType* a, CUDA_LONG indexA,
ElemType* c, CUDA_LONG indexC)
{
CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
if (id > 0)
return;
c[indexC] += a[indexA];
//CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; // only one thread launched
//if (id > 0)
// return;
ElemType us = beta ? beta * c[indexC] : 0; // do not multiply if beta is 0, could be a NaN
us += a[indexA];
c[indexC] = us;
}
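The beta ? beta * c[indexC] : 0 guard above matters because IEEE arithmetic yields 0 * NaN = NaN: if the destination holds uninitialized or NaN data (as the debug Invalidate() fill produces) and beta is 0, multiplying would poison the result. A small self-contained illustration:

#include <cmath>
#include <cstdio>

int main()
{
    float c = NAN;     // e.g. a destination that was filled with NaNs for debugging
    float a = 3.0f, beta = 0.0f;

    float wrong = beta * c + a;               // NaN: 0 * NaN is NaN
    float right = (beta ? beta * c : 0) + a;  // 3: skip the multiply when beta == 0

    printf("wrong=%f right=%f\n", wrong, right);
}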
template <class ElemType>

View file

@ -110,61 +110,37 @@
} \
}
// version of helper macro that executes both CPU and GPU macros if 'MatrixPointerToCheck' location is BOTH
#define DISPATCH_MATRIX_ON_FLAG_USEBOTH_4BOTH(MatrixPointerToCheck, MatrixPointerToSetFlag, CPUDense, GPUDense, CPUSparse, GPUSparse) \
{ \
CurrentDataLocation curLocation = (MatrixPointerToCheck)->GetCurrentMatrixLocation(); \
if (curLocation == CurrentDataLocation::BOTH) \
{ \
if ((MatrixPointerToCheck)->GetMatrixType() != MatrixType::SPARSE) \
{ \
CPUDense; \
GPUDense; \
if (MatrixPointerToSetFlag != nullptr) \
((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::BOTH, MatrixType::DENSE); \
} \
else \
{ \
CPUSparse; \
GPUSparse; \
if (MatrixPointerToSetFlag != nullptr) \
((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::BOTH, MatrixType::SPARSE); \
} \
} \
else if (curLocation == CurrentDataLocation::GPU) \
{ \
if ((MatrixPointerToCheck)->GetMatrixType() != MatrixType::SPARSE) \
{ \
GPUDense; \
if (MatrixPointerToSetFlag != nullptr) \
((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::GPU, MatrixType::DENSE); \
} \
else \
{ \
GPUSparse; \
if (MatrixPointerToSetFlag != nullptr) \
((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::GPU, MatrixType::SPARSE); \
} \
} \
else if (curLocation == CurrentDataLocation::CPU) \
{ \
if ((MatrixPointerToCheck)->GetMatrixType() != MatrixType::SPARSE) \
{ \
CPUDense; \
if (MatrixPointerToSetFlag != nullptr) \
((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::CPU, MatrixType::DENSE); \
} \
else \
{ \
CPUSparse; \
if (MatrixPointerToSetFlag != nullptr) \
((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::CPU, MatrixType::SPARSE); \
} \
} \
else \
{ \
RuntimeError("Matrices do not exist in either CPU or GPU."); \
} \
// version of helper macro that executes both CPU and GPU macros if 'matrixPointer' location is BOTH
#define DISPATCH_MATRIX_ON_FLAG_USEBOTH_4BOTH(matrixPointer, CPUDense, GPUDense, CPUSparse, GPUSparse) \
{ \
auto curLocation = (matrixPointer)->GetCurrentMatrixLocation(); \
auto curMatrixType = (matrixPointer)->GetMatrixType(); \
if (curLocation == CurrentDataLocation::NONE) \
LogicError("Matrices do not exist in either CPU or GPU."); \
if (curMatrixType == MatrixType::UNDETERMINED) \
LogicError("Matrices must be SPARSE or DENSE."); \
if (curLocation != CurrentDataLocation::CPU) /*GPU or BOTH*/ \
{ \
if (curMatrixType == MatrixType::DENSE) \
{ \
GPUDense; \
} \
else \
{ \
GPUSparse; \
} \
} \
if (curLocation != CurrentDataLocation::GPU) /*CPU or BOTH*/ \
{ \
if (curMatrixType == MatrixType::DENSE) \
{ \
CPUDense; \
} \
else \
{ \
CPUSparse; \
} \
} \
}
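Written out as a plain function instead of a macro, the dispatch rule above reads roughly as follows. This is a sketch with simplified types; DispatchBoth, Location, and Kind are illustrative names, not the real Matrix interface:

#include <functional>
#include <stdexcept>

enum class Location { NONE, CPU, GPU, BOTH };
enum class Kind { UNDETERMINED, DENSE, SPARSE };

// Run the CPU and/or GPU action depending on where the data currently lives;
// BOTH runs both branches so the two copies stay in sync.
static void DispatchBoth(Location loc, Kind kind,
                         const std::function<void()>& cpuDense, const std::function<void()>& gpuDense,
                         const std::function<void()>& cpuSparse, const std::function<void()>& gpuSparse)
{
    if (loc == Location::NONE)
        throw std::logic_error("Matrices do not exist in either CPU or GPU.");
    if (kind == Kind::UNDETERMINED)
        throw std::logic_error("Matrices must be SPARSE or DENSE.");
    if (loc != Location::CPU) // GPU or BOTH
        (kind == Kind::DENSE ? gpuDense : gpuSparse)();
    if (loc != Location::GPU) // CPU or BOTH
        (kind == Kind::DENSE ? cpuDense : cpuSparse)();
}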
namespace Microsoft { namespace MSR { namespace CNTK {
@ -224,46 +200,85 @@ void Matrix<ElemType>::ShallowCopyFrom(const Matrix<ElemType>& other)
}
// Call this function after an update operation has created/set/updated the respective pointers.
// - location: BOTH|CPU|GPU
// - pass BOTH only if object will be read from; it is not allowed to write to both and then call this function.
// - if CPU/GPU and current is BOTH, then object was written to
// What gets updated:
// - m_currentDataLocation: from function argument
// - m_matrixType: from function argument unless UNDETERMINED in which case m_matrixType remains unmodified
// - m_baseMatrix: to one of current values of m_[GC]PU{Sparse,}Matrix
// This function is heavily overloaded in its responsibility.
// - first-time initialization, e.g. of a ColumnSlice (NONE->!NONE)
// - after creating a temp copy for reading
// - collapse temp copies after writing to one of them
// - setting matrixType if not set yet
template <class ElemType>
void Matrix<ElemType>::SetDataLocation(CurrentDataLocation location, MatrixType type) const
{
assert(location == CurrentDataLocation::CPU || location == CurrentDataLocation::GPU || location == CurrentDataLocation::BOTH);
// if the object used to live on BOTH, this will collapse it to 'location' (unless we actually wrote into BOTH)
// In that case, we do a sanity check here that the object is an owning Matrix,
// since otherwise the collapsing would go unnoticed by the original owner.
// In that case, we do a sanity check here that the object is a singleton view,
// since otherwise the collapsing would go unnoticed by the other views.
// The cases to cover:
// - original owner is BOTH, and this is called on the original owner
// -> The result was written to 'location' so we should collapse it to there.
// - original owning matrix is in BOTH state
// and a view inherits this
// -> FORBIDDEN to write into CPU or GPU since we cannot ensure we wrote into the one that will be read next
// - original owning matrix is CPU or GPU
// and a view onto it is put into BOTH state
// -> inefficient to read, since this is likely happening over again; so put the owner into BOTH state
// -> FORBIDDEN to write into CPU or GPU since we don't know the owner's true location and hence cannot ensure we wrote to the correct place
if (m_currentDataLocation == CurrentDataLocation::BOTH && location != CurrentDataLocation::BOTH)
// - everything is allowed on a singleton view
// - if the current state is BOTH:
// -> The result was written to 'location' so we should collapse it to there.
// - multiple views: much is forbidden since we cannot notify the other views on which one was written to
// - CPU <-> GPU: FORBIDDEN
// - BOTH -> CPU or GPU: current state is BOTH: location says which side was written to
// -> FORBIDDEN to write into
// - CPU or GPU -> BOTH: current state is CPU or GPU
// and a view onto it is put into BOTH state
// -> OK but inefficient to read, since this is likely happening over again; but we cannot put all views into BOTH state
// - BOTH -> BOTH:
// - read case: OK
// - write case: forbidden to call this function in this way
// - NONE -> !NONE: FORBIDDEN
if (m_currentDataLocation != location && // it is attempted to change location
m_currentDataLocation != CurrentDataLocation::NONE && // from a valid object (NONE means we are a fresh object from ColumnSlice())
location != CurrentDataLocation::BOTH) // and we are changing it not into a temporary copy for reading
{
// we get here if we wrote into this object that was BOTH but is no longer
if (!OwnBuffer()) // this means we should not have written into it in the first place, so fail now (better late than never)
// we get here if we wrote into this object that was BOTH but is no longer, or if we move between CPU and GPU
// Both is forbidden on shared views since we cannot inform other views of this change.
// Any *valid* pointer will now be checked for uniqueness. There may be mismatching left-over pointers kept around in case they should be revived.
if (m_matrixType == MatrixType::DENSE) // note: this checks the current type, not the new one passed in. Assumption: this tells us which pointers are valid.
{
assert(m_currentDataLocation == CurrentDataLocation::GPU || m_CPUMatrix);
assert(m_currentDataLocation == CurrentDataLocation::CPU || m_GPUMatrix);
if (m_currentDataLocation != CurrentDataLocation::GPU) ((BaseMatrix<ElemType>*)m_CPUMatrix.get())->VerifyMigratable("SetDataLocation [CPUMatrix]");
if (m_currentDataLocation != CurrentDataLocation::CPU) ((BaseMatrix<ElemType>*)m_GPUMatrix.get())->VerifyMigratable("SetDataLocation [GPUMatrix]");
}
else if (m_matrixType == MatrixType::SPARSE)
{
assert(m_currentDataLocation == CurrentDataLocation::GPU || m_CPUSparseMatrix);
assert(m_currentDataLocation == CurrentDataLocation::CPU || m_GPUSparseMatrix);
if (m_currentDataLocation != CurrentDataLocation::GPU) ((BaseMatrix<ElemType>*)m_CPUSparseMatrix.get())->VerifyMigratable("SetDataLocation [CPUSparseMatrix]");
if (m_currentDataLocation != CurrentDataLocation::CPU) ((BaseMatrix<ElemType>*)m_GPUSparseMatrix.get())->VerifyMigratable("SetDataLocation [GPUSparseMatrix]");
}
// TODO: Why do we need these typecasts? (without it will fail with "cannot access private member declared in class 'Microsoft::MSR::CNTK::CPUMatrix<float>'")
if (m_baseMatrix && !OwnBuffer()) // same arguments for externally owned matrices: Can read a temp but not write.
LogicError("SetDataLocation: A non-owning object cannot be written to in BOTH state.");
}
// passed validation: we can now update the state
m_currentDataLocation = location;
// set the matrix type if passed in
// update the matrix type if passed in
if (type != MatrixType::UNDETERMINED)
m_matrixType = type;
// set m_baseMatrix (if location is unchanged, this will not change the pointer)
// Note: m_currentDataLocation may also be CurrentDataLocation::BOTH, in which case the base matrix will be GPU.
if (m_matrixType == MatrixType::DENSE)
m_baseMatrix = ((m_currentDataLocation == CurrentDataLocation::CPU) ? dynamic_pointer_cast<BaseMatrix<ElemType>>(m_CPUMatrix) : dynamic_pointer_cast<BaseMatrix<ElemType>>(m_GPUMatrix));
else if (m_matrixType == MatrixType::SPARSE)
m_baseMatrix = ((m_currentDataLocation == CurrentDataLocation::CPU) ? dynamic_pointer_cast<BaseMatrix<ElemType>>(m_CPUSparseMatrix) : dynamic_pointer_cast<BaseMatrix<ElemType>>(m_GPUSparseMatrix));
// Note: Typecasts are necessary since C++ cannot figure out the common base type (probably due to shared_ptr).
// sanity check
if (!m_baseMatrix && m_matrixType != MatrixType::UNDETERMINED)
LogicError("SetDataLocation: new m_baseMatrix must not be NULL.");
LogicError("SetDataLocation: New m_baseMatrix must not be NULL.");
}
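The long comment and guard above boil down to a small rule: the extra migration checks are needed only when the location really changes, the object is not freshly created, and the change is not just adding a temporary copy for reading. A hedged restatement as a standalone predicate (NeedsMigrationCheck and Loc are illustrative names):

enum class Loc { NONE, CPU, GPU, BOTH };

// Mirrors the guard above: a location change triggers the extra uniqueness /
// ownership checks unless the object is fresh (NONE, e.g. straight out of
// ColumnSlice()) or we are only adding a temporary copy for reading (-> BOTH).
static bool NeedsMigrationCheck(Loc current, Loc next)
{
    return current != next        // the location actually changes
        && current != Loc::NONE   // fresh object: first-time initialization is fine
        && next != Loc::BOTH;     // creating a temp copy for reading is harmless
}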
//this is a private constructor only used internally to initialize a blank matrix
@ -908,9 +923,8 @@ void Matrix<ElemType>::SwitchToMatrixType(MatrixType newMatrixType, MatrixFormat
if (keepValues)
CopyElementsFromDenseToSparse(*m_CPUMatrix, *m_CPUSparseMatrix);
m_CPUMatrix = nullptr;
SetDataLocation(CPU, SPARSE);
m_CPUMatrix = nullptr;
}
else if (newMatrixType == MatrixType::DENSE)
{
@ -922,9 +936,8 @@ void Matrix<ElemType>::SwitchToMatrixType(MatrixType newMatrixType, MatrixFormat
if (keepValues)
m_CPUMatrix->SetValue(m_CPUSparseMatrix->CopyColumnSliceToDense(0, GetNumCols()));
m_CPUSparseMatrix = nullptr;
SetDataLocation(CPU, DENSE);
m_CPUSparseMatrix = nullptr;
}
else
LogicError("SwitchToMatrixType: Unexpected/invalid new matrix type");
@ -941,9 +954,8 @@ void Matrix<ElemType>::SwitchToMatrixType(MatrixType newMatrixType, MatrixFormat
if (keepValues)
m_GPUSparseMatrix->SetValue(*m_GPUMatrix);
m_GPUMatrix = nullptr;
SetDataLocation(GPU, SPARSE);
m_GPUMatrix = nullptr;
}
else if (newMatrixType == MatrixType::DENSE)
{
@ -955,9 +967,8 @@ void Matrix<ElemType>::SwitchToMatrixType(MatrixType newMatrixType, MatrixFormat
if (keepValues)
m_GPUSparseMatrix->CopyToDenseMatrix(*m_GPUMatrix);
m_GPUSparseMatrix = nullptr;
SetDataLocation(GPU, DENSE);
m_GPUSparseMatrix = nullptr;
}
else
LogicError("SwitchToMatrixType: Unexpected/invalid new matrix type");
@ -977,25 +988,25 @@ void Matrix<ElemType>::CopyElementsFromDenseToSparse(CPUMatrix<ElemType>& from,
template <class ElemType>
ElemType Matrix<ElemType>::Get00Element() const
{
DISPATCH_MATRIX_ON_FLAG(this,
nullptr,
return m_CPUMatrix->Get00Element(),
return m_GPUMatrix->Get00Element(),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
DISPATCH_MATRIX_ON_FLAG(this, nullptr,
{ return m_CPUMatrix->Get00Element(); },
{ return m_GPUMatrix->Get00Element(); },
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; });
}
// const operator(,)
template <class ElemType>
const ElemType Matrix<ElemType>::operator()(const size_t row, const size_t col) const
{
DISPATCH_MATRIX_ON_FLAG_USECPU_4BOTH(this,
nullptr,
return m_CPUMatrix->operator()(row, col),
_transferFromDeviceToDevice(GetDeviceId(), CPUDEVICE, false); return m_CPUMatrix->operator()(row, col),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
DISPATCH_MATRIX_ON_FLAG_USECPU_4BOTH(this, nullptr,
{ return m_CPUMatrix->operator()(row, col); },
{ _transferFromDeviceToDevice(GetDeviceId(), CPUDEVICE, false); return m_CPUMatrix->operator()(row, col); },
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; });
}
// non-const operator(,)
//WARNING: This function is very slow for GPUs since it requires copying values between CPUs and GPUs.
//In addition, if ColumnSlice is used after this function but before the values are copied back to GPU
//the operation will fail since the memory is not managed by the slice.
@ -1427,22 +1438,18 @@ void Matrix<ElemType>::NormalGrad(Matrix<ElemType>& gradients,
}
}
//both this and gradients will be changed
// both 'this' and gradients will be changed
template <class ElemType>
ElemType Matrix<ElemType>::Adagrad(Matrix<ElemType>& gradients, const bool needAveMultiplier)
{
DecideAndMoveToRightDevice(*this, gradients);
DISPATCH_MATRIX_ON_FLAG(&gradients,
&gradients,
return m_CPUMatrix->Adagrad(*gradients.m_CPUMatrix, needAveMultiplier);
SetDataLocation(CPU),
return m_GPUMatrix->Adagrad(*gradients.m_GPUMatrix, needAveMultiplier);
SetDataLocation(GPU),
return gradients.m_CPUSparseMatrix->Adagrad(*m_CPUMatrix, needAveMultiplier);
SetDataLocation(CPU),
return gradients.m_GPUSparseMatrix->Adagrad(*m_GPUMatrix, needAveMultiplier);
SetDataLocation(GPU));
DISPATCH_MATRIX_ON_FLAG(&gradients, &gradients,
{ ElemType rc = m_CPUMatrix->Adagrad(*gradients.m_CPUMatrix, needAveMultiplier); SetDataLocation(CPU); return rc; },
{ ElemType rc = m_GPUMatrix->Adagrad(*gradients.m_GPUMatrix, needAveMultiplier); SetDataLocation(GPU); return rc; },
{ ElemType rc = gradients.m_CPUSparseMatrix->Adagrad(*m_CPUMatrix, needAveMultiplier); SetDataLocation(CPU); return rc; },
{ ElemType rc = gradients.m_GPUSparseMatrix->Adagrad(*m_GPUMatrix, needAveMultiplier); SetDataLocation(GPU); return rc; });
// Note: Since both 'this' and gradients are changed, we must call SetDataLocation() on 'this' as well.
}
template <class ElemType>
@ -1458,14 +1465,12 @@ void Matrix<ElemType>::FSAdagrad(size_t mbSize, Matrix<ElemType>& gradients, Mat
aggadagradsqrframes = adagradkeepweight * aggadagradsqrframes + (1.0f - adagradkeepweight) * mbSize;
const ElemType targetadagradavdenom_x_sqrtadagradsqrframes = static_cast<ElemType>(targetadagradavdenom * sqrt(aggadagradsqrframes));
DISPATCH_MATRIX_ON_FLAG(&gradients,
&gradients,
m_CPUMatrix->FSAdagrad(*gradients.m_CPUMatrix, *functionValues.m_CPUMatrix, learnRatePerSample, momentum, adagradkeepweight, targetadagradavdenom_x_sqrtadagradsqrframes);
SetDataLocation(CPU),
m_GPUMatrix->FSAdagrad(*gradients.m_GPUMatrix, *functionValues.m_GPUMatrix, learnRatePerSample, momentum, adagradkeepweight, targetadagradavdenom_x_sqrtadagradsqrframes);
SetDataLocation(GPU),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
DISPATCH_MATRIX_ON_FLAG(&gradients, &gradients,
{ m_CPUMatrix->FSAdagrad(*gradients.m_CPUMatrix, *functionValues.m_CPUMatrix, learnRatePerSample, momentum, adagradkeepweight, targetadagradavdenom_x_sqrtadagradsqrframes); SetDataLocation(CPU); },
{ m_GPUMatrix->FSAdagrad(*gradients.m_GPUMatrix, *functionValues.m_GPUMatrix, learnRatePerSample, momentum, adagradkeepweight, targetadagradavdenom_x_sqrtadagradsqrframes); SetDataLocation(GPU); },
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; });
// Note: Since both 'this' and gradients are changed, we must call SetDataLocation() on 'this' as well.
}
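The running-average lines near the top of FSAdagrad above keep an exponential moving average of the minibatch size and scale the AdaGrad denominator by its square root. A hedged numeric sketch of just that arithmetic, with made-up values for keepWeight, the minibatch sizes, and targetAvDenom:

#include <cmath>
#include <cstdio>

int main()
{
    double keepWeight = 0.9, aggFrames = 0.0, targetAvDenom = 1.0; // hypothetical values
    for (int mbSize : { 64, 64, 128 })                             // hypothetical minibatch sizes
    {
        aggFrames = keepWeight * aggFrames + (1.0 - keepWeight) * mbSize; // running average of frames per minibatch
        double denom = targetAvDenom * std::sqrt(aggFrames);              // the targetadagradavdenom_x_sqrtadagradsqrframes factor
        printf("aggFrames=%.2f denom=%.3f\n", aggFrames, denom);
    }
}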
template <class ElemType>
@ -1479,14 +1484,12 @@ ElemType Matrix<ElemType>::RmsProp(Matrix<ElemType>& gradients,
{
DecideAndMoveToRightDevice(*this, gradients);
DISPATCH_MATRIX_ON_FLAG(this,
&gradients,
return m_CPUMatrix->RmsProp(*gradients.m_CPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier);
SetDataLocation(CPU),
return m_GPUMatrix->RmsProp(*gradients.m_GPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier);
SetDataLocation(GPU),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
DISPATCH_MATRIX_ON_FLAG(this, &gradients,
{ ElemType rc = m_CPUMatrix->RmsProp(*gradients.m_CPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier); SetDataLocation(CPU); return rc; },
{ ElemType rc = m_GPUMatrix->RmsProp(*gradients.m_GPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier); SetDataLocation(GPU); return rc; },
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; });
// Note: Since both 'this' and gradients are changed, we must call SetDataLocation() on 'this' as well.
}
template <class ElemType>
@ -1494,12 +1497,11 @@ void Matrix<ElemType>::Reshape(const size_t numRows, const size_t numCols)
{
if (numRows != GetNumRows() || numCols != GetNumCols())
{
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->Reshape(numRows, numCols),
m_GPUMatrix->Reshape(numRows, numCols),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->Reshape(numRows, numCols));
DISPATCH_MATRIX_ON_FLAG(this, this,
{ m_CPUMatrix->Reshape(numRows, numCols); },
{ m_GPUMatrix->Reshape(numRows, numCols); },
{ NOT_IMPLEMENTED; },
{ m_GPUSparseMatrix->Reshape(numRows, numCols); });
}
}
@ -1510,11 +1512,10 @@ void Matrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const
{
// TODO: should this function test whether the size is changing, and skip if it isn't? We have at least one explicit test for this code calling this (recurrent node)
DISPATCH_MATRIX_ON_FLAG_USEBOTH_4BOTH(this,
this,
m_CPUMatrix->Resize(numRows, numCols, growOnly),
m_GPUMatrix->Resize(numRows, numCols, growOnly),
m_CPUSparseMatrix->RequireSizeAndAllocate(numRows, numCols, numNZElemToReserve, growOnly, false),
m_GPUSparseMatrix->RequireSizeAndAllocate(numRows, numCols, numNZElemToReserve, growOnly, false));
{ m_CPUMatrix->Resize(numRows, numCols, growOnly); },
{ m_GPUMatrix->Resize(numRows, numCols, growOnly); },
{ m_CPUSparseMatrix->RequireSizeAndAllocate(numRows, numCols, numNZElemToReserve, growOnly, false); },
{ m_GPUSparseMatrix->RequireSizeAndAllocate(numRows, numCols, numNZElemToReserve, growOnly, false); });
#ifdef _DEBUG
if (GetMatrixType() != MatrixType::SPARSE)
Invalidate(); // Fill the matrix with NaNs to detect using the content which is undefined. Unfortunately this won't work for sparse matrices.
@ -1551,11 +1552,10 @@ template <class ElemType>
void Matrix<ElemType>::Reset()
{
DISPATCH_MATRIX_ON_FLAG_USEBOTH_4BOTH(this,
this,
NOT_IMPLEMENTED,
NOT_IMPLEMENTED,
m_CPUSparseMatrix->Reset(),
m_GPUSparseMatrix->Reset());
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; },
{ m_CPUSparseMatrix->Reset(); },
{ m_GPUSparseMatrix->Reset(); });
}
template <class ElemType>
@ -3027,12 +3027,11 @@ ElemType Matrix<ElemType>::SumOfAbsElements() const
if (IsEmpty())
LogicError("SumOfAbsElements: Matrix is empty.");
DISPATCH_MATRIX_ON_FLAG(this,
nullptr,
return m_CPUMatrix->SumOfAbsElements(),
return m_GPUMatrix->SumOfAbsElements(),
NOT_IMPLEMENTED,
return m_GPUSparseMatrix->SumOfAbsElements());
DISPATCH_MATRIX_ON_FLAG(this, nullptr,
{ return m_CPUMatrix->SumOfAbsElements(); },
{ return m_GPUMatrix->SumOfAbsElements(); },
{ NOT_IMPLEMENTED; },
{ return m_GPUSparseMatrix->SumOfAbsElements(); });
}
//sum of all elements
@ -3042,11 +3041,10 @@ ElemType Matrix<ElemType>::LogSumOfElements() const
if (IsEmpty())
LogicError("LogSumOfElements: Matrix is empty.");
DISPATCH_MATRIX_ON_FLAG(this,
nullptr,
DISPATCH_MATRIX_ON_FLAG(this, nullptr,
{ return m_CPUMatrix->LogSumOfElements(); },
{ return m_GPUMatrix->LogSumOfElements(); },
{NOT_IMPLEMENTED},
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; });
}
@ -3354,65 +3352,57 @@ Matrix<ElemType>& Matrix<ElemType>::AddSignOf(const Matrix<ElemType>& a)
return *this;
}
//I decided to use Matrix<ElemType>& maxIndexes instead of integer vector because the result may be used to do additional calculation
// I decided to use Matrix<ElemType>& maxIndices instead of integer vector because the result may be used to do additional calculation
template <class ElemType>
void Matrix<ElemType>::VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise) const
void Matrix<ElemType>::VectorMax(Matrix<ElemType>& maxIndices, Matrix<ElemType>& maxValues, const bool isColWise) const
{
if (IsEmpty())
LogicError("VectorMax: Matrix is empty.");
DecideAndMoveToRightDevice(*this, maxIndexes, maxValues);
maxIndexes.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
DecideAndMoveToRightDevice(*this, maxIndices, maxValues);
maxIndices.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
maxValues.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
&maxValues,
m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise);
maxIndexes.SetDataLocation(CPU, DENSE),
m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise);
maxIndexes.SetDataLocation(GPU, DENSE),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
DISPATCH_MATRIX_ON_FLAG(this, &maxValues,
{ m_CPUMatrix->VectorMax(*maxIndices.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise); maxIndices.SetDataLocation(CPU, DENSE); },
{ m_GPUMatrix->VectorMax(*maxIndices.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise); maxIndices.SetDataLocation(GPU, DENSE); },
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; });
// Note: must SetDataLocation() also on maxIndices, since both maxValues and maxIndices are written.
}
template <class ElemType>
void Matrix<ElemType>::VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise, int topK) const
void Matrix<ElemType>::VectorMax(Matrix<ElemType>& maxIndices, Matrix<ElemType>& maxValues, const bool isColWise, int topK) const
{
if (IsEmpty())
LogicError("VectorMax: Matrix is empty.");
DecideAndMoveToRightDevice(*this, maxIndexes, maxValues);
maxIndexes.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
DecideAndMoveToRightDevice(*this, maxIndices, maxValues);
maxIndices.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
maxValues.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
&maxValues,
m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise, topK);
maxIndexes.SetDataLocation(CPU, DENSE),
m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise, topK);
maxIndexes.SetDataLocation(GPU, DENSE),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
DISPATCH_MATRIX_ON_FLAG(this, &maxValues,
{ m_CPUMatrix->VectorMax(*maxIndices.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise, topK); maxIndices.SetDataLocation(CPU, DENSE); },
{ m_GPUMatrix->VectorMax(*maxIndices.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise, topK); maxIndices.SetDataLocation(GPU, DENSE); },
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; });
}
template <class ElemType>
void Matrix<ElemType>::VectorMin(Matrix<ElemType>& minIndexes, Matrix<ElemType>& minValues, const bool isColWise) const
void Matrix<ElemType>::VectorMin(Matrix<ElemType>& minIndices, Matrix<ElemType>& minValues, const bool isColWise) const
{
if (IsEmpty())
LogicError("VectorMin: Matrix is empty.");
DecideAndMoveToRightDevice(*this, minIndexes, minValues);
minIndexes.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
DecideAndMoveToRightDevice(*this, minIndices, minValues);
minIndices.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
minValues.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
&minValues,
m_CPUMatrix->VectorMin(*minIndexes.m_CPUMatrix, *minValues.m_CPUMatrix, isColWise);
minIndexes.SetDataLocation(CPU, DENSE),
m_GPUMatrix->VectorMin(*minIndexes.m_GPUMatrix, *minValues.m_GPUMatrix, isColWise);
minIndexes.SetDataLocation(GPU, DENSE),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
DISPATCH_MATRIX_ON_FLAG(this, &minValues,
{ m_CPUMatrix->VectorMin(*minIndices.m_CPUMatrix, *minValues.m_CPUMatrix, isColWise); minIndices.SetDataLocation(CPU, DENSE); },
{ m_GPUMatrix->VectorMin(*minIndices.m_GPUMatrix, *minValues.m_GPUMatrix, isColWise); minIndices.SetDataLocation(GPU, DENSE); },
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; });
}
#pragma endregion Member BLAS Functions
@ -3425,12 +3415,11 @@ int Matrix<ElemType>::GetDeviceId() const
if (m_currentDataLocation == CurrentDataLocation::NONE)
return m_preferredDeviceId;
DISPATCH_MATRIX_ON_FLAG(this,
nullptr,
return CPUDEVICE,
return m_GPUMatrix->GetComputeDeviceId(),
return CPUDEVICE,
return m_GPUSparseMatrix->GetComputeDeviceId());
DISPATCH_MATRIX_ON_FLAG(this, nullptr,
{ return CPUDEVICE; },
{ return m_GPUMatrix->GetComputeDeviceId(); },
{ return CPUDEVICE; },
{ return m_GPUSparseMatrix->GetComputeDeviceId(); });
}
// TODO: Comment why we need a second ElemType.
@ -3544,25 +3533,21 @@ void Matrix<ElemType>::_transferFromDeviceToDevice(int from_id, int to_id, bool
return;
}
// warn about device change
#define NUM_DEVICE_CHANGED_WARN 20
if (m_numTimesDeviceChanged <= NUM_DEVICE_CHANGED_WARN &&
(!emptyTransfer || (from_id >= 0 && to_id >= 0)))
{
m_numTimesDeviceChanged++;
if (m_devicesTransferedTo[0] < CPUDEVICE)
{
m_devicesTransferedTo[0] = to_id;
}
else if (m_devicesTransferedTo[0] != to_id)
{
m_devicesTransferedTo[1] = to_id;
}
}
if (m_numTimesDeviceChanged == NUM_DEVICE_CHANGED_WARN && m_devicesTransferedTo[1] >= CPUDEVICE)
{
fprintf(stderr, "WARNING: The same matrix with dim [%lu, %lu] has been transferred between different devices for %d times.\n", (unsigned long) GetNumRows(), (unsigned long) GetNumCols(), NUM_DEVICE_CHANGED_WARN);
}
// do the transfer
if (m_matrixType == MatrixType::SPARSE)
{
if (from_id == CPUDEVICE) // from CPU to GPU
@ -3582,8 +3567,8 @@ void Matrix<ElemType>::_transferFromDeviceToDevice(int from_id, int to_id, bool
if (isBeingMoved)
{
m_CPUSparseMatrix = nullptr;
SetDataLocation(GPU, SPARSE);
m_CPUSparseMatrix = nullptr;
}
else
{
@ -3607,8 +3592,8 @@ void Matrix<ElemType>::_transferFromDeviceToDevice(int from_id, int to_id, bool
if (isBeingMoved)
{
m_GPUSparseMatrix = nullptr;
SetDataLocation(CPU, SPARSE);
m_GPUSparseMatrix = nullptr;
}
else
{
@ -3638,8 +3623,8 @@ void Matrix<ElemType>::_transferFromDeviceToDevice(int from_id, int to_id, bool
}
if (isBeingMoved)
{
m_CPUMatrix = nullptr;
SetDataLocation(GPU, DENSE);
m_CPUMatrix = nullptr;
}
else
{
@ -3666,8 +3651,8 @@ void Matrix<ElemType>::_transferFromDeviceToDevice(int from_id, int to_id, bool
if (isBeingMoved)
{
m_GPUMatrix = nullptr;
SetDataLocation(CPU, DENSE);
m_GPUMatrix = nullptr;
}
else
{
@ -4180,17 +4165,19 @@ void Matrix<ElemType>::SVD(const Matrix<ElemType>& A, Matrix<ElemType>& SIGMA, M
VT.SwitchToMatrixType(A.GetMatrixType(), A.GetFormat(), false);
W.SwitchToMatrixType(A.GetMatrixType(), A.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&A,
nullptr,
Matrix<ElemType> tA = A.DeepClone();
CPUMatrix<ElemType>::SVD(*tA.m_CPUMatrix, *SIGMA.m_CPUMatrix, *U.m_CPUMatrix, *VT.m_CPUMatrix, *W.m_CPUMatrix);
SIGMA.SetDataLocation(CPU);
U.SetDataLocation(CPU);
VT.SetDataLocation(CPU);
W.SetDataLocation(CPU),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
DISPATCH_MATRIX_ON_FLAG(&A, nullptr,
{
Matrix<ElemType> tA = A.DeepClone();
CPUMatrix<ElemType>::SVD(*tA.m_CPUMatrix, *SIGMA.m_CPUMatrix, *U.m_CPUMatrix, *VT.m_CPUMatrix, *W.m_CPUMatrix);
SIGMA.SetDataLocation(CPU);
U.SetDataLocation(CPU);
VT.SetDataLocation(CPU);
W.SetDataLocation(CPU);
// need to SetDataLocation() on all matrices we write to
},
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; });
}
/// <summary>Matrix-matrix multiply with col-major matrices (a and b may be transposed): c = alpha * op(a) * op(b) + beta*c</summary>
@ -4400,34 +4387,33 @@ template <class ElemType>
if (a.GetMatrixType() == c.GetMatrixType())
{
DISPATCH_MATRIX_ON_FLAG(&c,
&c,
CPUMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_CPUMatrix, *c.m_CPUMatrix),
GPUMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUMatrix, *c.m_GPUMatrix),
NOT_IMPLEMENTED,
GPUSparseMatrix<ElemType> b = move(*c.m_GPUSparseMatrix);
GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUSparseMatrix, 1, b, *c.m_GPUSparseMatrix));
DISPATCH_MATRIX_ON_FLAG(&c, &c,
{ CPUMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_CPUMatrix, *c.m_CPUMatrix); },
{ GPUMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUMatrix, *c.m_GPUMatrix); },
{ NOT_IMPLEMENTED; },
{ GPUSparseMatrix<ElemType> b = move(*c.m_GPUSparseMatrix); GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUSparseMatrix, 1, b, *c.m_GPUSparseMatrix); });
}
else
{
DISPATCH_MATRIX_ON_FLAG(&c,
nullptr,
CPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_CPUSparseMatrix, *c.m_CPUMatrix);
c.SetDataLocation(CPU),
if (a.m_GPUSparseMatrix->GetFormat() == MatrixFormat::matrixFormatSparseCSC)
{
GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUSparseMatrix, 1, *c.m_GPUMatrix, *c.m_GPUMatrix);
} else // new GPU sparse matrix code
{
GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUSparseMatrix, *c.m_GPUMatrix);
} c.SetDataLocation(GPU),
NOT_IMPLEMENTED,
{
c.m_GPUMatrix = make_shared<GPUMatrix<ElemType>>(c.m_GPUSparseMatrix->CopyToDenseMatrix());
GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUMatrix, 1, *c.m_GPUSparseMatrix, *c.m_GPUMatrix);
c.m_GPUSparseMatrix = nullptr;
c.SetDataLocation(GPU, DENSE);
});
DISPATCH_MATRIX_ON_FLAG(&c, nullptr,
{
CPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_CPUSparseMatrix, *c.m_CPUMatrix);
c.SetDataLocation(CPU);
},
{
if (a.m_GPUSparseMatrix->GetFormat() == MatrixFormat::matrixFormatSparseCSC)
GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUSparseMatrix, 1, *c.m_GPUMatrix, *c.m_GPUMatrix);
else // new GPU sparse matrix code
GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUSparseMatrix, *c.m_GPUMatrix);
c.SetDataLocation(GPU);
},
{ NOT_IMPLEMENTED; },
{
c.m_GPUMatrix = make_shared<GPUMatrix<ElemType>>(c.m_GPUSparseMatrix->CopyToDenseMatrix());
GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUMatrix, 1, *c.m_GPUSparseMatrix, *c.m_GPUMatrix);
c.SetDataLocation(GPU, DENSE);
c.m_GPUSparseMatrix = nullptr;
});
}
}
@ -4444,9 +4430,7 @@ template <class ElemType>
if (beta == 1)
ScaleAndAdd(alpha, a, c);
else if (beta == 0)
{
Scale(alpha, a, c);
}
else
{
ScaleAndAdd(alpha / beta, a, c); // c1=alpha/beta * a + c
@ -4598,8 +4582,8 @@ void Matrix<ElemType>::AddElementToElement(const Matrix<ElemType>& a, const size
DISPATCH_MATRIX_ON_FLAG(&c,
&c,
CPUMatrix<ElemType>::AddElementToElement(*a.m_CPUMatrix, ai, aj, *c.m_CPUMatrix, ci, cj),
GPUMatrix<ElemType>::AddElementToElement(*a.m_GPUMatrix, ai, aj, *c.m_GPUMatrix, ci, cj),
CPUMatrix<ElemType>::AddElementToElement(1, *a.m_CPUMatrix, ai, aj, *c.m_CPUMatrix, ci, cj),
GPUMatrix<ElemType>::AddElementToElement(1, *a.m_GPUMatrix, ai, aj, *c.m_GPUMatrix, ci, cj),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
@ -4615,8 +4599,8 @@ void Matrix<ElemType>::AssignElementToElement(const Matrix<ElemType>& a, const s
DISPATCH_MATRIX_ON_FLAG(&c,
&c,
CPUMatrix<ElemType>::AssignElementToElement(*a.m_CPUMatrix, ai, aj, *c.m_CPUMatrix, ci, cj),
NOT_IMPLEMENTED,
CPUMatrix<ElemType>::AddElementToElement(0, *a.m_CPUMatrix, ai, aj, *c.m_CPUMatrix, ci, cj),
GPUMatrix<ElemType>::AddElementToElement(0, *a.m_GPUMatrix, ai, aj, *c.m_GPUMatrix, ci, cj),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
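With this change, assignment and accumulation share one code path: AddElementToElement with beta = 0 behaves like the old AssignElementToElement, and beta = 1 like the additive version. A scalar sketch of the convention (AddElement is a made-up helper):

#include <cstdio>

// c_new = beta * c_old + a : beta == 0 gives assignment, beta == 1 gives accumulation
static float AddElement(float beta, float a, float c)
{
    return (beta ? beta * c : 0) + a;
}

int main()
{
    printf("assign (beta=0): %g\n", AddElement(0, 3, 7)); // 3
    printf("add    (beta=1): %g\n", AddElement(1, 3, 7)); // 10
}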
@ -5205,7 +5189,7 @@ static bool VerifyIsDense(const Matrix<ElemType>& a)
}
template <class ElemType>
void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
@ -5216,14 +5200,14 @@ void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, ElemTy
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_GPUMatrix->TensorOp(beta, *a.m_GPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_GPUMatrix->TensorOp(beta, *a.m_GPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
@ -5234,14 +5218,14 @@ void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, *b.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_GPUMatrix->TensorOp(beta, *a.m_GPUMatrix, *b.m_GPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, *b.m_CPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_GPUMatrix->TensorOp(beta, *a.m_GPUMatrix, *b.m_GPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides)
@ -5252,8 +5236,8 @@ void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_GPUMatrix->TensorOp(beta, *a.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_GPUMatrix->TensorOp(beta, *a.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}

View file

@ -115,11 +115,17 @@ public:
static Matrix<ElemType> RandomUniform(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId, const ElemType low, const ElemType high, unsigned long seed = USE_TIME_BASED_SEED);
static Matrix<ElemType> RandomGaussian(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId, const ElemType mean, const ElemType sigma, unsigned long seed = USE_TIME_BASED_SEED);
static void SetDevice(DEVICEID_TYPE deviceId);
static void SetDevice(DEVICEID_TYPE deviceId); // TODO: unify with PrepareDevice()
void ReleaseMemory();
~Matrix();
// workaround for bugs in the BOTH-state implementation: force a collapse to the home location
void CollapseDataLocationAfterWriting() const
{
SetDataLocation(GetDeviceId() < 0 ? CurrentDataLocation::CPU : CurrentDataLocation::GPU, GetMatrixType());
}
private:
Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, const MatrixFormat matrixFormat, DEVICEID_TYPE deviceID); // only used internally to initialize a blank matrix
Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, DEVICEID_TYPE deviceID); // only used internally to initialize a blank matrix
@ -530,15 +536,15 @@ public:
static void TensorShuffleScaleAndAdd(ElemType keepWeight, const Matrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const Matrix<ElemType>& b, Matrix<ElemType>& c);
void TensorOp(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
void TensorOp(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 4>& reducingStrides);

View file

@ -1894,7 +1894,7 @@ void GPUMatrix<ElemType>::AssignScaledDifference(const GPUMatrix<ElemType>& /*al
//c[ci,cj] += a[ai,aj]
template <class ElemType>
void GPUMatrix<ElemType>::AddElementToElement(const GPUMatrix<ElemType>& /*a*/, const size_t ai, const size_t aj, GPUMatrix<ElemType>& c, const size_t ci, const size_t cj)
void GPUMatrix<ElemType>::AddElementToElement(ElemType beta, const GPUMatrix<ElemType>& /*a*/, const size_t ai, const size_t aj, GPUMatrix<ElemType>& c, const size_t ci, const size_t cj)
{
}
@ -1953,21 +1953,21 @@ void GPUMatrix<ElemType>::TensorShuffleScaleAndAdd(ElemType keepWeight, const GP
}
template <class ElemType>
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
}
template <class ElemType>
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
{
}
template <class ElemType>
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, const GPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, const GPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides)

View File

@ -38,14 +38,16 @@ using namespace std;
// main constructor (all constructors except the default one route through this)
template <class ElemType>
TensorView<ElemType>::TensorView(const Matrix<ElemType>& sob, const TensorShape& shape)
: m_sob(sob.AsReference()), m_shape(shape)
TensorView<ElemType>::TensorView(const MatrixBasePtr& sob, const TensorShape& shape)
: m_sob(dynamic_pointer_cast<Matrix<ElemType>>(sob)), m_shape(shape)
{
if (!m_sob)
LogicError("TensorView: Attempted to create a TensorView<ElemType> on a storage object of a different ElemType.");
#ifdef _DEBUG
// check bounds of TensorShape against underlying storage object
// This is useful to detect errors like passing a matrix from the wrong input.
const auto r = shape.GetLocationRange();
const auto n = m_sob.GetNumElements();
const auto n = m_sob->GetNumElements();
if (r.first < 0 || (size_t)r.second > n)
LogicError("TensorView: Shape bounds [%d,%d) exceed bounds of underlying storage object [0,%d).", (int) r.first, (int) r.second, (int) n);
#endif
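// Editor's note (not part of the commit): the constructor now takes a type-erased
// MatrixBasePtr and recovers the typed matrix with dynamic_pointer_cast; a null result means
// the storage object was created with a different ElemType, which is what the LogicError above
// guards against. A standalone sketch of that pattern with hypothetical stand-in types:
#include <memory>
#include <stdexcept>

struct MatrixBase { virtual ~MatrixBase() = default; };
template <class ElemType> struct TypedMatrix : MatrixBase { };

template <class ElemType>
std::shared_ptr<TypedMatrix<ElemType>> CheckedCast(const std::shared_ptr<MatrixBase>& sob)
{
    auto typed = std::dynamic_pointer_cast<TypedMatrix<ElemType>>(sob);
    if (!typed)
        throw std::logic_error("storage object has a different ElemType than the TensorView");
    return typed;
}

int main()
{
    std::shared_ptr<MatrixBase> sob = std::make_shared<TypedMatrix<float>>();
    CheckedCast<float>(sob);                                              // ok: types match
    try { CheckedCast<double>(sob); } catch (const std::logic_error&) { } // mismatch is caught
}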
@ -228,7 +230,7 @@ static bool CheckDifferentObject(const TensorView<ElemType>& a, const TensorView
}
template <class ElemType>
void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op)
void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp)
{
// static int cc = 0; if (cc++ == 0)
// fprintf(stderr, "Tensor Op: Op %d: %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(GetShape()).c_str());
@ -244,11 +246,11 @@ void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView& a, ElemT
CheckDifferentObject(a, *this);
// now perform the operation
GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
template <class ElemType>
void TensorView<ElemType>::DoBinaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op)
void TensorView<ElemType>::DoBinaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp)
{
// static int cc = 0; if (cc++ == 0)
// fprintf(stderr, "Tensor Op: Op %d: %s op %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(GetShape()).c_str());
@ -262,11 +264,11 @@ void TensorView<ElemType>::DoBinaryOpOf(ElemType beta, const TensorView& a, cons
if (reducingOpDims.size() > 0)
CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this);
GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
template <class ElemType>
void TensorView<ElemType>::DoTernaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha, ElementWiseOperator op)
void TensorView<ElemType>::DoTernaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp)
{
// static int cc = 0; if (cc++ == 0)
// fprintf(stderr, "Tensor Op: Op %d: %s, %s, %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(c.GetShape()).c_str(), string(GetShape()).c_str());
@ -280,79 +282,7 @@ void TensorView<ElemType>::DoTernaryOpOf(ElemType beta, const TensorView& a, con
if (reducingOpDims.size() > 0)
CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this) && CheckDifferentObject(c, *this);
GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), c.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
// simple test function for testing stuff
// Call as: Microsoft::MSR::CNTK::TensorView<float>::Test();
template <class ElemType>
/*static*/ void TensorView<ElemType>::Test()
{
const DEVICEID_TYPE deviceId = 0; // -1
Matrix<ElemType> m1(deviceId);
Matrix<ElemType> m2(deviceId);
Matrix<ElemType> m3(deviceId);
{
m1.SetValue(5, 3, {1, 2, 3,
14, 15, 6,
4, 5, 16,
41, 5, 1,
1.8, 4.5, 7});
m2.SetValue(5, 1, {42,
13,
1968,
3.1415f,
7});
m3.Resize(m1);
// regular zip (just add m1 to itself)
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m1), 1);
m3.Print();
// unary op
TensorView(m3).DoSqrtOf(0, TensorView(m1), 1);
m3.Print();
// broadcasting of an input
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
TensorView(m3).DoMaxOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
TensorView(m3).DoGTOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
// reduction over columns
m3.Resize(5, 1);
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
// reduction over rows
m3.Resize(1, 3);
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
TensorView(m3).DoLogSumOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
}
{
m1.Resize(1, 42);
m2.Resize(13, 1);
m3.Resize(13, 21);
TensorShape s1(1, 2, 21);
TensorShape s2(13, 1);
TensorShape s3(13, 1, 21);
let t1 = TensorView<ElemType>(m1, s1);
t1;
let t2 = TensorView<ElemType>(m2, s2);
t2;
auto t3 = TensorView<ElemType>(m3, s3);
t3;
t3.DoSumOf(0, t1, t2, 1);
m3.Print();
}
GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), c.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
// -------------------------------------------------------------------
@ -409,19 +339,20 @@ static void FlattenToMatrix(TensorShape& shape, bool trans, size_t splitPoint)
// convert tensor into a Matrix object
template <class ElemType>
Matrix/*ref*/<ElemType> TensorView<ElemType>::AsMatrix() const
shared_ptr<Matrix<ElemType>> TensorView<ElemType>::AsMatrix() const
{
assert(m_shape.GetRank() == 2);
if (m_shape.GetStrides()[0] != 1 && m_shape[0] != 1)
InvalidArgument("AsMatrix: Flattened [%s] matrix is not dense (it has a stride).", string(m_shape).c_str());
// create a Matrix view into the TensorView (which in turn is a view over a Matrix...)
// The way to do this is to use a ColumnSlice.
// express the TensorView's storage in m_sob's coordinates
let firstColumn = m_shape.GetOffset() / m_sob.GetNumRows();
let numColumns = m_shape.GetNumElements() / m_sob.GetNumRows();
if (firstColumn * m_sob.GetNumRows() != m_shape.GetOffset() || numColumns * m_sob.GetNumRows() != m_shape.GetNumElements())
let firstColumn = m_shape.GetOffset() / m_sob->GetNumRows();
let numColumns = m_shape.GetNumElements() / m_sob->GetNumRows();
if (firstColumn * m_sob->GetNumRows() != m_shape.GetOffset() || numColumns * m_sob->GetNumRows() != m_shape.GetNumElements())
InvalidArgument("AsMatrix: Flattened [%s] matrix has an offset or width that is not a multiple of the storage object's row dimension.", string(m_shape).c_str());
auto sob = m_sob.ColumnSlice(firstColumn, numColumns);
// now reinterpret this slice according to the new tensor shape
// Example:
// - each sob column contains a set of vectors stored as a 2D tensor [I x J], and [S x T] samples
@ -431,12 +362,20 @@ Matrix/*ref*/<ElemType> TensorView<ElemType>::AsMatrix() const
// - which in turn yields a [K x (J * S x*T)] matrix
// which gets reinterpreted back as a [K x J x S x T] tensor
// In the special case of sparse matrices, this split cannot be done. E.g. in the above example, we could only multiply with a [K x I x J] tensor.
if (sob.GetMatrixType() == MatrixType::DENSE)
return sob.Reshaped(m_shape[0], m_shape[1]);
else if (m_shape[0] == sob.GetNumRows()) // SPARSE matrices cannot be reshaped, so we only support 1D and 2D tensors
return sob;
else
let needsSlicing = firstColumn != 0 || numColumns != m_sob->GetNumCols();
let needsReshaping = m_shape[0] != m_sob->GetNumRows() || m_shape[1] != m_sob->GetNumCols();
// Note: If an output matrix is a view and needs to move to a different device, we will fail later, since the current structure cannot support that.
// As a consequence, some configurations will simply not work currently.
// We minimize the chance of this by using the original storage object whenever possible.
if (!needsSlicing && !needsReshaping) // no need to mess with the storage object: pass it on as it is. Full support for moving devices.
return m_sob;
else if (needsSlicing && !needsReshaping) // slicing is supported for sparse as well
return make_shared<Matrix<ElemType>>(m_sob->ColumnSlice(firstColumn, numColumns));
else if (m_sob->GetMatrixType() != MatrixType::DENSE) // needsReshaping: not allowed for sparse matrices
RuntimeError("AsMatrix: Sparse tensors are not supported unless they are 1D or 2D matrices.");
else // dense can slice and reshape neutrally, but will also fail if output matrix needs to move devices
return make_shared<Matrix<ElemType>>(m_sob->ColumnSlice(firstColumn, numColumns).Reshaped(m_shape[0], m_shape[1]));
}
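// Editor's note (not part of the commit): a standalone sketch of the case analysis the
// rewritten AsMatrix() performs, with hypothetical helper names (not CNTK code): express the
// view's offset and size in whole columns of the storage object, then either pass the storage
// object through unchanged, column-slice it, or slice and reshape it (dense only).
#include <cstdio>
#include <stdexcept>

void DescribeAsMatrix(size_t offset, size_t numElements, size_t sobRows, size_t sobCols,
                      size_t shape0, size_t shape1, bool isDense)
{
    size_t firstColumn = offset / sobRows;
    size_t numColumns  = numElements / sobRows;
    if (firstColumn * sobRows != offset || numColumns * sobRows != numElements)
        throw std::invalid_argument("offset/width is not a multiple of the storage object's row dimension");
    bool needsSlicing   = firstColumn != 0 || numColumns != sobCols;
    bool needsReshaping = shape0 != sobRows || shape1 != sobCols;
    if (!needsSlicing && !needsReshaping)
        std::printf("pass the storage object through unchanged\n");
    else if (!needsReshaping)
        std::printf("ColumnSlice(%zu, %zu)\n", firstColumn, numColumns);
    else if (!isDense)
        std::printf("error: sparse tensors cannot be reshaped\n");
    else
        std::printf("ColumnSlice(%zu, %zu).Reshaped(%zu, %zu)\n", firstColumn, numColumns, shape0, shape1);
}

int main()
{
    DescribeAsMatrix(/*offset=*/0,  /*numElements=*/20, /*sobRows=*/5, /*sobCols=*/4, 5, 4, true); // -> pass through
    DescribeAsMatrix(/*offset=*/10, /*numElements=*/10, /*sobRows=*/5, /*sobCols=*/4, 5, 2, true); // -> ColumnSlice(2, 2).Reshaped(5, 2)
}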
template <class ElemType>
@ -471,9 +410,9 @@ void TensorView<ElemType>::DoMatrixProductOf(ElemType beta, bool transC, const T
auto C = Reshaped(shapeC).AsMatrix();
// and go
if (!transC)
Matrix<ElemType>::MultiplyAndWeightedAdd(alpha, A, transA, B, transB, beta, C);
Matrix<ElemType>::MultiplyAndWeightedAdd(alpha, *A, transA, *B, transB, beta, *C);
else // C' = A * B <==> C = (A * B)' = B' * A'
Matrix<ElemType>::MultiplyAndWeightedAdd(alpha, B, !transB, A, !transA, beta, C);
Matrix<ElemType>::MultiplyAndWeightedAdd(alpha, *B, !transB, *A, !transA, beta, *C);
}
template class TensorView<float>;

View File

@ -26,20 +26,22 @@ public:
// -------------------------------------------------------------------
// reinterpret a matrix storage object (SOB) as a TensorView with a given TensorShape --this is the main constructor
TensorView(const Matrix<ElemType>& sob, const TensorShape& shape);
TensorView(const MatrixBasePtr& sob, const TensorShape& shape);
#if 0
// cast a Matrix as a 2D TensorView (without shape change)
TensorView(const Matrix<ElemType>& sob)
: m_sob(sob.AsReference()), m_shape(TensorShape(array<size_t, 2>{sob.GetNumRows(), sob.GetNumCols()}))
TensorView(const MatrixBasePtr& sob)
: m_sob(sob), m_shape(TensorShape(array<size_t, 2>{sob->GetNumRows(), sob->GetNumCols()}))
{
}
#endif
// reshape a TensorView
TensorView(const TensorView<ElemType>& other, const TensorShape& shape)
: m_sob(other.m_sob.AsReference()), m_shape(shape)
: m_sob(other.m_sob), m_shape(shape)
{
}
// copy constructor
TensorView(const TensorView<ElemType>& other)
: m_sob(other.m_sob.AsReference()), m_shape(other.m_shape)
: m_sob(other.m_sob), m_shape(other.m_shape)
{
}
@ -66,36 +68,36 @@ public:
// -------------------------------------------------------------------
#pragma push_macro("DeclareUnaryTensorOp")
#define DeclareUnaryTensorOp(oper) \
void Do##oper##Of(ElemType beta, const TensorView& a, ElemType alpha) \
{ \
DoUnaryOpOf(beta, a, alpha, ElementWiseOperator::op##oper); \
} \
void Assign##oper##Of(const TensorView& a, ElemType alpha = 1.0f) \
{ \
DoUnaryOpOf(0, a, alpha, ElementWiseOperator::op##oper); \
} \
void Add##oper##Of(const TensorView& a, ElemType alpha = 1.0f) \
{ \
DoUnaryOpOf(1.0f, a, alpha, ElementWiseOperator::op##oper); \
#define DeclareUnaryTensorOp(oper) \
void Do##oper##Of(ElemType beta, const TensorView& a, ElemType alpha) \
{ \
DoUnaryOpOf(beta, a, alpha, ElementWiseOperator::op##oper, ElementWiseOperator::opSum); \
} \
void Assign##oper##Of(const TensorView& a, ElemType alpha = 1.0f) \
{ \
DoUnaryOpOf(0, a, alpha, ElementWiseOperator::op##oper, ElementWiseOperator::opSum); \
} \
void Add##oper##Of(const TensorView& a, ElemType alpha = 1.0f) \
{ \
DoUnaryOpOf(1.0f, a, alpha, ElementWiseOperator::op##oper, ElementWiseOperator::opSum); \
}
ForAllUnaryOps(DeclareUnaryTensorOp);
#pragma pop_macro("DeclareUnaryTensorOp")
#pragma push_macro("DeclareBinaryTensorOp")
#define DeclareBinaryTensorOp(oper) \
void Do##oper##Of(ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha) \
{ \
DoBinaryOpOf(beta, a, b, alpha, ElementWiseOperator::op##oper); \
} \
void Assign##oper##Of(const TensorView& a, const TensorView& b, ElemType alpha = 1.0f) \
{ \
DoBinaryOpOf(0, a, b, alpha, ElementWiseOperator::op##oper); \
} \
void Add##oper##Of(const TensorView& a, const TensorView& b, ElemType alpha = 1.0f) \
{ \
DoBinaryOpOf(1.0f, a, b, alpha, ElementWiseOperator::op##oper); \
#define DeclareBinaryTensorOp(oper) \
void Do##oper##Of(ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha) \
{ \
DoBinaryOpOf(beta, a, b, alpha, ElementWiseOperator::op##oper, ElementWiseOperator::opSum); \
} \
void Assign##oper##Of(const TensorView& a, const TensorView& b, ElemType alpha = 1.0f) \
{ \
DoBinaryOpOf(0, a, b, alpha, ElementWiseOperator::op##oper, ElementWiseOperator::opSum); \
} \
void Add##oper##Of(const TensorView& a, const TensorView& b, ElemType alpha = 1.0f) \
{ \
DoBinaryOpOf(1.0f, a, b, alpha, ElementWiseOperator::op##oper, ElementWiseOperator::opSum); \
}
ForAllBinaryOps(DeclareBinaryTensorOp);
@ -105,25 +107,23 @@ public:
#define DeclareTernaryTensorOp(oper) \
void Do##oper##Of(ElemType beta, const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha) \
{ \
DoTernaryOpOf(beta, a, b, c, alpha, ElementWiseOperator::op##oper); \
DoTernaryOpOf(beta, a, b, c, alpha, ElementWiseOperator::op##oper, ElementWiseOperator::opSum); \
} \
void Assign##oper##Of(const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha = 1.0f) \
{ \
DoTernaryOpOf(0, a, b, c, alpha, ElementWiseOperator::op##oper); \
DoTernaryOpOf(0, a, b, c, alpha, ElementWiseOperator::op##oper, ElementWiseOperator::opSum); \
} \
void Add##oper##Of(const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha = 1.0f) \
{ \
DoTernaryOpOf(1.0f, a, b, c, alpha, ElementWiseOperator::op##oper); \
DoTernaryOpOf(1.0f, a, b, c, alpha, ElementWiseOperator::op##oper, ElementWiseOperator::opSum); \
}
ForAllTernaryOps(DeclareTernaryTensorOp);
#pragma pop_macro("DeclareTernaryTensorOp")
static void Test();
void DoUnaryOpOf (ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op);
void DoBinaryOpOf (ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op);
void DoTernaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha, ElementWiseOperator op);
void DoUnaryOpOf (ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp);
void DoBinaryOpOf (ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp);
void DoTernaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp);
// -------------------------------------------------------------------
// matrix product -- GEMM for flattened tensors
@ -139,23 +139,23 @@ public:
void AssignMatrixProductOf( bool transC, const TensorView& a, bool transA, const TensorView& b, bool transB, ElemType alpha = 1.0f) { DoMatrixProductOf(0, transC, a, transA, b, transB, alpha); }
void AddMatrixProductOf ( bool transC, const TensorView& a, bool transA, const TensorView& b, bool transB, ElemType alpha = 1.0f) { DoMatrixProductOf(1.0f, transC, a, transA, b, transB, alpha); }
Matrix/*ref*/<ElemType> AsMatrix() const;
shared_ptr<Matrix<ElemType>> AsMatrix() const;
private:
// -------------------------------------------------------------------
// accessors
// -------------------------------------------------------------------
const Matrix<ElemType>& GetSOB() const { return m_sob; }
Matrix<ElemType>& GetSOB() { return m_sob; }
const Matrix<ElemType>& GetSOB() const { return *m_sob; }
Matrix<ElemType>& GetSOB() { return *m_sob; }
const TensorShape& GetShape() const { return m_shape; }
// -------------------------------------------------------------------
// sob members
// -------------------------------------------------------------------
Matrix<ElemType> m_sob; // Storage OBject that holds the data that is being viewed with this TensorView. This is really a reference (not owning the buffer).
TensorShape m_shape; // the meta-data that describes the data's shape and/or access pattern
shared_ptr<Matrix<ElemType>> m_sob; // Storage OBject that holds the data that is being viewed with this TensorView. This is really a reference (not owning the buffer).
TensorShape m_shape; // the meta-data that describes the data's shape and/or access pattern
};
}}}

View File

@ -583,6 +583,7 @@ public:
BinaryReader()
: m_pMBLayout(make_shared<MBLayout>())
{
m_pMBLayout->SetUniqueAxisName(L"BinaryReader");
}
virtual ~BinaryReader();
virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize);

View File

@ -152,6 +152,7 @@ public:
DSSMReader()
: m_pMBLayout(make_shared<MBLayout>())
{
m_pMBLayout->SetUniqueAxisName(L"DSSMReader");
m_qfeaturesBuffer = NULL;
m_dfeaturesBuffer = NULL;
m_labelsBuffer = NULL;

View File

@ -152,6 +152,7 @@ public:
HTKMLFReader()
: m_pMBLayout(make_shared<MBLayout>())
{
m_pMBLayout->SetUniqueAxisName(L"HTKMLFReader");
}
template <class ConfigRecordType>
void InitFromConfig(const ConfigRecordType&);

View File

@ -2055,4 +2055,5 @@ void HTKMLFReader<ElemType>::GetDataNamesFromConfig(const ConfigRecordType& read
template class HTKMLFReader<float>;
template class HTKMLFReader<double>;
} } }
}}}

View File

@ -38,9 +38,10 @@ private:
MBLayoutPtr pMBLayout;
std::vector<std::vector<std::pair<wstring, size_t>>> minibatchUttInfo;
size_t currentMBSize;
MinibatchBufferUnit()
: pMBLayout(make_shared<MBLayout>()), currentMBSize(0)
MinibatchBufferUnit() :
pMBLayout(make_shared<MBLayout>()), currentMBSize(0)
{
pMBLayout->SetUniqueAxisName(L"HTKMLFReader");
}
};
bool m_doMinibatchBuffering;
@ -163,9 +164,10 @@ public:
// set to true so that a current minibatch can uses state activities from the previous minibatch.
// default will have truncated BPTT, which only does BPTT inside a minibatch
bool mIgnoreSentenceBeginTag;
HTKMLFReader()
: m_pMBLayout(make_shared<MBLayout>())
HTKMLFReader() :
m_pMBLayout(make_shared<MBLayout>())
{
m_pMBLayout->SetUniqueAxisName(L"HTKMLFReader");
}
template <class ConfigRecordType>

View File

@ -660,7 +660,7 @@ void SequenceReader<ElemType>::ReadClassInfo(const wstring& vocfile, int& classS
// check if unk is the same used in vocabulary file
if (word4idx.find(mUnk.c_str()) == word4idx.end())
RuntimeError("ReadClassInfo unknown symbol '%s' is not in vocabulary file.", mUnk.c_str());
fprintf(stderr, "ReadClassInfo: 'unknown' symbol unk='%s' is not in vocabulary file. Unknown words will error out if encountered.\n", mUnk.c_str());
}
// InitCache - Initialize the caching reader if cache files exist, otherwise the writer

View File

@ -381,6 +381,7 @@ public:
BatchSequenceReader()
: m_pMBLayout(make_shared<MBLayout>())
{
m_pMBLayout->SetUniqueAxisName(L"LMSequenceReader");
mLastProcessedSentenceId = 0;
mRequestedNumParallelSequences = 1;
mLastPosInSentence = 0;

View File

@ -270,6 +270,7 @@ public:
BatchLUSequenceReader()
: m_pMBLayout(make_shared<MBLayout>())
{
m_pMBLayout->SetUniqueAxisName(L"LUSequenceReader");
mLastProcessedSentenceId = 0;
mRequestedNumParallelSequences = 1;
mLastPosInSentence = 0;

View File

@ -247,6 +247,7 @@ public:
: DSSMLabels(nullptr), DSSMCols(0)
{
m_pMBLayout = make_shared<MBLayout>();
m_pMBLayout->SetUniqueAxisName(L"LibSVMReader");
};
virtual ~LibSVMBinaryReader();

View File

@ -130,7 +130,9 @@ BpttPacker::BpttPacker(
auto& buffer = m_streamBuffers[i];
buffer.Resize(m_numParallelSequences * m_truncationSize * GetSampleSize(stream));
m_sequenceBufferPerStream.push_back(make_shared<SequenceBuffer>(m_numParallelSequences));
m_currentLayouts.push_back(make_shared<MBLayout>());
auto pMBLayout = make_shared<MBLayout>();
pMBLayout->SetUniqueAxisName(L"BpttPacker");
m_currentLayouts.push_back(pMBLayout);
}
// Filling in the initial set of sequences

View File

@ -116,9 +116,7 @@ bool ReaderShim<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
// If not we should inject the IMemoryProvider per stream.
int deviceId = matrices.begin()->second.matrix->GetDeviceId();
for (auto mx : matrices)
{
assert(mx.second.matrix->GetDeviceId() == deviceId), UNUSED(deviceId);
}
assert(m_prefetchTask.valid());
@ -133,6 +131,7 @@ bool ReaderShim<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
}
// Reset stale mb layouts.
// BUGBUG: This seems incorrect. (1) layouts should all be updated below, and (2) some of these layouts are the same, we are resetting them twice.
for (const auto& iter : matrices)
{
iter.second.pMBLayout->Init(1, 0);
@ -149,12 +148,12 @@ bool ReaderShim<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
if (m_nameToStreamId.find(mx.first) == m_nameToStreamId.end())
{
string inputNames = EnumerateInputs(m_nameToStreamId);
RuntimeError("Could not map input '%ls' to the reader. Reader outputs only [%s].",
RuntimeError("Could not map input '%ls' to the reader. Reader outputs only [%s].",
mx.first.c_str(), inputNames.c_str());
}
size_t streamId = m_nameToStreamId[mx.first];
const auto& stream = minibatch.m_data[streamId];
m_numParallelSequences = stream->m_layout->GetNumParallelSequences();
@ -176,7 +175,7 @@ bool ReaderShim<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
RuntimeError("Dynamic axis layout '%ls' is shared between inputs '%ls' and '%ls', but layouts generated "
"from the input data are incompatible on this axis. Are you using different sequence lengths? "
"Did you consider adding a DynamicAxis() to the Input nodes?",
layout->GetAxisName().c_str(), layoutToInputMap[layout->GetAxisName()].c_str(), mx.first.c_str());
layout->GetAxisName(), layoutToInputMap[layout->GetAxisName()].c_str(), mx.first.c_str());
}
size_t sampleSize = m_streams[streamId]->m_sampleLayout->GetNumElements();
@ -217,7 +216,7 @@ void ReaderShim<ElemType>::FillMatrixFromStream(StorageType type, Matrix<ElemTyp
IndexType* columns = reinterpret_cast<IndexType*>(rows + nnzCount);
matrix->SetMatrixFromCSCFormat(columns, rows, values, nnzCount, numRows, numCols);
}
else
else
{
RuntimeError("Storage type %d is not supported.", (int)type);
}

View File

@ -114,7 +114,7 @@ MBLayoutPtr SequencePacker::PackDenseStream(const StreamBatch& batch, size_t str
assert(sampleOffset == sampleIndex * sampleSize);
PackDenseSample(destination, sequence, sampleOffset, sampleSize);
sampleOffset += sampleSize;
}
}
else if (stream->m_storageType == StorageType::sparse_csc)
{
// TODO: make type casts members of the SparseSequenceData

View File

@ -59,7 +59,10 @@ class SparsePCReader : public DataReaderBase
public:
SparsePCReader()
: m_pMBLayout(make_shared<MBLayout>()){};
: m_pMBLayout(make_shared<MBLayout>())
{
m_pMBLayout->SetUniqueAxisName(L"SparsePCReader");
};
virtual ~SparsePCReader();
virtual void Destroy();
template <class ConfigRecordType>

View File

@ -135,6 +135,7 @@ public:
UCIFastReader()
{
m_pMBLayout = make_shared<MBLayout>();
m_pMBLayout->SetUniqueAxisName(L"UCIFastReader");
}
virtual ~UCIFastReader();

109  Source/SGDLib/Criterion.h (new file)
View File

@ -0,0 +1,109 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// Criterion.h -- helper classes for accumulating criteria
#pragma once
#include "Basics.h"
#include "Matrix.h"
#include <memory> // for pair
#include <limits> // for isnan() and numeric_limits --TODO: is that the right header?
namespace Microsoft { namespace MSR { namespace CNTK {
// helper class for passing accumulated epoch-level criteria around while retaining their sample counts
// Criteria are represented as a tuple (aggregate criterion, sample count). The average criterion value is their ratio.
struct EpochCriterion : public std::pair<double, size_t>
{
// construction
explicit EpochCriterion(double aggregateCriterionValue = 0.0, size_t aggregateSampleCount = 0) : std::pair<double, size_t>(aggregateCriterionValue, aggregateSampleCount) { }
EpochCriterion(const std::pair<double, size_t>& other) : std::pair<double, size_t>(other) { }
// main way of reading this out: compute the actual average criterion value from the aggregate and sample count
double Average() const { return second > 0 ? first / second : 0.0; } // compute the epoch-average
// a few more handy operations that occurred multiple times
bool IsNan() const { return std::isnan(first); }
EpochCriterion operator-(const EpochCriterion& other) const { return EpochCriterion(first - other.first, second - other.second); }
void operator+=(const EpochCriterion& other) { first += other.first; second += other.second; }
static EpochCriterion Infinity() { return EpochCriterion(std::numeric_limits<double>::infinity()); }
bool IsInfinity() const { return first == std::numeric_limits<double>::infinity(); }
};
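// Editor's note (not part of the commit): a small usage sketch of EpochCriterion, assuming the
// definitions in this file are in scope; it shows how per-minibatch (aggregate, count) pairs
// combine into an epoch average, and how the difference of two values gives the criterion
// accumulated between two logging points.
#if 0 // illustration only
EpochCriterion epochCE;                            // starts as (0, 0)
epochCE += EpochCriterion(123.4, 256);             // minibatch 1: summed criterion over 256 samples
epochCE += EpochCriterion( 98.7, 192);             // minibatch 2
double avgPerSample = epochCE.Average();           // (123.4 + 98.7) / (256 + 192)
EpochCriterion sinceLastLog = epochCE - EpochCriterion(123.4, 256); // progress since the first log point
#endif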
// We accumulate criteria in this struct.
// Criteria are accumulated together with their counts (counts depend on sequence lengths, and different criteria may have different sequence lengths).
template <class ElemType>
struct CriterionAccumulator
{
// constructor
CriterionAccumulator(size_t numCriteria, DEVICEID_TYPE deviceId) :
m_aggregateCriterionValues(1, numCriteria, deviceId)
{
m_aggregateCriterionValues.SetValue(0);
m_aggregateSampleCounts.assign(numCriteria, 0);
}
// 'i' is the index of the element we add into (multiple eval criteria share the same matrix object)
// Use 'reset=true' to not accumulate but overwrite.
const CriterionAccumulator& Add(const std::vector<ComputationNodeBasePtr>& nodes, size_t i, size_t legacyNumSamples)
{
return Accumulate</*reset=*/false>(nodes, i, legacyNumSamples);
}
const CriterionAccumulator& Assign(const std::vector<ComputationNodeBasePtr>& nodes, size_t i, size_t legacyNumSamples)
{
return Accumulate</*reset=*/true>(nodes, i, legacyNumSamples);
}
// retrieve an accumulated result as a pair (numerator, denominator)
EpochCriterion GetCriterion(size_t i) const
{
// BUGBUG: For unknown reasons, this (or the other below) check makes a difference for MPI configs.
// If it is left out, then training and test configs end up being scaled by the same factor close to 1.
if (m_aggregateSampleCounts[i] == 0)
return EpochCriterion(0, 0); // avoid unnecessary GPU access
else
return EpochCriterion(m_aggregateCriterionValues(0, i), m_aggregateSampleCounts[i]);
}
private:
// shared part of Add() and Assign()
// This code assumes that if number of samples is 0, the criterion value is also 0 and does not need to be fetched from the GPU.
template<bool reset>
const CriterionAccumulator& Accumulate(const std::vector<ComputationNodeBasePtr>& nodes, size_t i, size_t legacyNumSamples)
{
const auto& node = nodes[i]; // multiple nodes are managed by this struct
float beta = reset ? 0 : 1;
// Note: A future change will be that criterion nodes emit criteria per frame.
// In that case, we will do masking and an implicit reduction right here using TensorView.
size_t numSamples = GetNumSamples(nodes[i], legacyNumSamples);
// temp solution until we add TensorView reduction
if (beta == 0)
{
Matrix<ElemType>::AssignElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value(),
0, 0, m_aggregateCriterionValues, 0, i);
m_aggregateSampleCounts[i] = numSamples;
}
else if (numSamples > 0) // avoid unnecessary GPU access
{
Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value(),
0, 0, m_aggregateCriterionValues, 0, i);
m_aggregateSampleCounts[i] += numSamples;
}
return *this;
}
// get the number of samples
static size_t GetNumSamples(const ComputationNodeBasePtr& node, size_t legacyNumSamples)
{
if (node->HasMBLayout())
return node->GetMBLayout()->GetActualNumSamples();
else
return legacyNumSamples;
}
private:
Matrix<ElemType> m_aggregateCriterionValues; // [1 x N]
vector<size_t> m_aggregateSampleCounts; // [N]
};
}}}
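// Editor's note (not part of the commit): a standalone sketch of the Assign()/Add() semantics
// of CriterionAccumulator above, using plain doubles in place of the GPU-side matrix
// (hypothetical toy type, not CNTK code): Assign overwrites the (value, count) slot for
// criterion i, Add accumulates into it, and a zero sample count skips the update entirely.
#include <cstdio>
#include <utility>
#include <vector>

struct ToyCriterionAccumulator
{
    std::vector<std::pair<double, size_t>> slots; // (aggregate criterion value, sample count) per criterion
    explicit ToyCriterionAccumulator(size_t numCriteria) : slots(numCriteria, { 0.0, 0 }) {}
    void Assign(size_t i, double value, size_t numSamples) { slots[i] = { value, numSamples }; }
    void Add(size_t i, double value, size_t numSamples)
    {
        if (numSamples == 0)
            return; // nothing to accumulate; mirrors the "avoid unnecessary GPU access" check
        slots[i].first += value;
        slots[i].second += numSamples;
    }
    double Average(size_t i) const { return slots[i].second ? slots[i].first / slots[i].second : 0.0; }
};

int main()
{
    ToyCriterionAccumulator acc(/*numCriteria=*/2);
    acc.Assign(0, 12.5, 100); // like Accumulate</*reset=*/true>
    acc.Add(0, 11.0, 100);    // like Accumulate</*reset=*/false>
    std::printf("criterion 0: %.4f per sample\n", acc.Average(0)); // (12.5 + 11.0) / 200
}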

View File

@ -70,7 +70,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Verify that there's indeed a single layout
for (const auto& iter : inputMatrices)
{
assert(iter.second.pMBLayout == pMBLayout);
assert(iter.second.pMBLayout == pMBLayout);
// TODO: This must be a runtime check, not an assert().
UNUSED(iter);
}
@ -105,8 +106,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
static pair<size_t, size_t> DecimateMinibatch(const StreamMinibatchInputs& MB, // input matrices
StreamMinibatchInputs& decimatedMB, // output decimated matrices.
MBLayoutPtr pMBLayout, // input MBLayout
MBLayoutPtr& pDecimateMBLayout, // output decimated MBLayout (note: cannot work in-place)
MBLayoutPtr pMBLayout, // input MBLayout
MBLayoutPtr& pDecimateMBLayout, // output decimated MBLayout (note: cannot work in-place)
size_t numProcs, size_t rank)
{
size_t numParallelSequences = pMBLayout->GetNumParallelSequences();
@ -148,6 +149,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
// decimate MBLayout as well
pDecimateMBLayout = make_shared<MBLayout>(numNewParallelSequence, nT, L"");
pDecimateMBLayout->SetAxisName(pMBLayout->GetAxisName());
#if 1
// now copy over all sequence info records that are inside the range, with adjusted 's'
const auto& sequences = pMBLayout->GetAllSequences();
@ -181,17 +183,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// no need to do inplace decimation if numproc == 1
// allocate space for non-inplace decimation
MBLayoutPtr pDecimatedMB = make_shared<MBLayout>();
MBLayoutPtr pDecimatedMBLayout = make_shared<MBLayout>();
pDecimatedMBLayout->SetAxisName(pMBLayout->GetAxisName());
StreamMinibatchInputs decimatedMB;
// call in-place decimation
pair<size_t, size_t> selected = DecimateMinibatch<ElemType>(mb, decimatedMB, pMBLayout, pDecimatedMB, numprocs, rank);
pair<size_t, size_t> selected = DecimateMinibatch<ElemType>(mb, decimatedMB, pMBLayout, pDecimatedMBLayout, numprocs, rank);
// move the data
for (auto k : mb)
{
const auto& name = k.first;
mb.GetInputMatrix<ElemType>(name).SetValue(decimatedMB.GetInputMatrix<ElemType>(name)); // deep-copy our local one to the output location
}
pMBLayout->MoveFrom(pDecimatedMB);
pMBLayout->MoveFrom(pDecimatedMBLayout);
return selected;
}
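// Editor's note (not part of the commit): data-parallel decimation above keeps, on each MPI
// rank, a subset of the parallel sequences of the minibatch. A standalone sketch of that kind
// of arithmetic (hypothetical helper; the exact rounding CNTK uses may differ):
#include <algorithm>
#include <cstdio>
#include <utility>

std::pair<size_t, size_t> SequenceRangeForRank(size_t numParallelSequences, size_t numProcs, size_t rank)
{
    size_t perRank = (numParallelSequences + numProcs - 1) / numProcs; // ceiling division
    size_t begin = std::min(rank * perRank, numParallelSequences);
    size_t end   = std::min(begin + perRank, numParallelSequences);
    return { begin, end }; // this rank keeps parallel sequences [begin, end)
}

int main()
{
    for (size_t rank = 0; rank < 3; rank++)
    {
        auto range = SequenceRangeForRank(/*numParallelSequences=*/8, /*numProcs=*/3, rank);
        std::printf("rank %zu keeps parallel sequences [%zu, %zu)\n", rank, range.first, range.second);
    }
}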
@ -353,7 +356,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
// for sequence training
if (criterionNodes[0]->OperationName() == L"SequenceWithSoftmax")
if (!criterionNodes.empty() && criterionNodes[0]->OperationName() == L"SequenceWithSoftmax")
{
auto node = dynamic_pointer_cast<SequenceWithSoftmaxNode<ElemType>>(criterionNodes[0]);
assert(node);
@ -379,7 +382,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t requestedSubminibatches)
{
// first, remember interface to the net
// BUGBUG: This will no longer be correct once we have multiple input layouts.
// BUGBUG (Issue #95): This will no longer be correct once we have multiple input layouts.
m_netMBLayoutPtr = net.GetMBLayoutPtrOfNetwork();
m_netInputMatrixPtr = inputMatrices;
@ -539,18 +542,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
shared_ptr<ComputationNode<ElemType>> pNode = m_LearnableNodePtr[nodename];
m_cachedGradient.GetInputMatrix<ElemType>(nodename) += pNode->Gradient();
pNode->Gradient().SetValue((ElemType) 0);
pNode->Gradient().SetValue(0);
}
// accumulate criterion value
Matrix<ElemType>::AddElementToElement(m_netCriterionNodes[0]->Value(), 0, 0,
*m_netCriterionAccumulator, 0, 0);
m_netCriterionNodes[0]->Value().SetValue((ElemType) 0);
if (!m_netCriterionNodes.empty())
{
Matrix<ElemType>::AddElementToElement(m_netCriterionNodes[0]->Value(), 0, 0,
*m_netCriterionAccumulator, 0, 0);
m_netCriterionNodes[0]->Value().SetValue(0);
}
// accumulate evaluation value
for (size_t i = 0; i < m_netEvaluationNodes.size(); i++)
{
Matrix<ElemType>::AddElementToElement(m_netEvaluationNodes[i]->Value(), 0, 0,
*m_netEvaluationAccumulator, 0, i);
m_netEvaluationNodes[i]->Value().SetValue((ElemType) 0);
m_netEvaluationNodes[i]->Value().SetValue(0);
}
// Export node state
@ -576,10 +582,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// also revert net.m_MBLayoutPtr
m_netMBLayoutPtr->CopyFrom(m_MBLayoutCache);
// m_netCriterionNodes[0]->Value().SetValue((ElemType)0);
Matrix<ElemType>::AddElementToElement(*m_netCriterionAccumulator, 0, 0,
m_netCriterionNodes[0]->Value(), 0, 0);
m_netCriterionAccumulator->SetValue((ElemType) 0);
if (!m_netCriterionNodes.empty())
{
// m_netCriterionNodes[0]->Value().SetValue((ElemType)0);
Matrix<ElemType>::AddElementToElement(*m_netCriterionAccumulator, 0, 0,
m_netCriterionNodes[0]->Value(), 0, 0);
}
m_netCriterionAccumulator->SetValue(0);
for (size_t i = 0; i < m_netEvaluationNodes.size(); i++)
{
@ -587,7 +596,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType>::AddElementToElement(*m_netEvaluationAccumulator, 0, i,
m_netEvaluationNodes[i]->Value(), 0, 0);
}
m_netEvaluationAccumulator->SetValue((ElemType) 0);
m_netEvaluationAccumulator->SetValue(0);
}
};
};

View File

@ -6,12 +6,12 @@ struct DistGradHeader
{
public:
size_t numSamples;
size_t numSamplesWithLabel;
size_t numSamplesWithLabel; // this is the denominator for 'criterion'
double criterion;
// variable-size array
int numEvalNode;
double evalErrors[1];
pair<double,size_t> evalErrors[1];
static DistGradHeader* Create(int numEvalNode)
{
@ -41,7 +41,8 @@ public:
criterion += other->criterion;
for (int i = 0; i < numEvalNode; i++)
{
evalErrors[i] += other->evalErrors[i];
evalErrors[i].first += other->evalErrors[i].first; // numer
evalErrors[i].second += other->evalErrors[i].second; // denom
}
}
}
@ -58,7 +59,8 @@ public:
criterion = 0;
for (int i = 0; i < numEvalNode; i++)
{
evalErrors[i] = 0;
evalErrors[i].first = 0;
evalErrors[i].second = 0;
}
}
@ -77,17 +79,19 @@ public:
}
private:
static size_t DistGradHeaderSize(size_t nEvalNode)
static size_t DistGradHeaderSize(size_t nEvalNodes)
{
return sizeof(DistGradHeader) + (sizeof(double) * (nEvalNode - 1));
// BUGBUG: Should be sizeof(evalErrors[0]), but the compiler won't let me. This is only correct because evalErrors has 1 element.
return sizeof(DistGradHeader) + (sizeof(decltype(evalErrors)) * (nEvalNodes - 1));
}
// Disallow construction and destruction since this type contains a variable sized array member
// and hence must be constructed through the create and destroy functions
DistGradHeader() = delete;
DistGradHeader() = delete;
~DistGradHeader() = delete;
// Disallow copy and move construction/assignment
DISABLE_COPY_AND_MOVE(DistGradHeader);
};
} } }
}}}
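// Editor's note (not part of the commit): evalErrors[] entries are now (numerator, denominator)
// pairs instead of plain doubles, so each evaluation criterion carries its own sample count
// through aggregation (different criteria can have different counts, e.g. per-word vs.
// per-sequence, as noted in Criterion.h). A standalone sketch of the component-wise
// aggregation (not the CNTK code):
#include <cstdio>
#include <utility>
#include <vector>

int main()
{
    // headers from two workers, each with two eval criteria: (summed value, sample count)
    using Criterion = std::pair<double, size_t>;
    std::vector<std::vector<Criterion>> workers = {
        { { 230.0, 100 }, { 3.0, 10 } }, // worker 0: CE over 100 words, errors over 10 sequences
        { { 115.0,  50 }, { 1.0,  5 } }, // worker 1
    };
    std::vector<Criterion> total(2, { 0.0, 0 });
    for (const auto& w : workers)
        for (size_t i = 0; i < total.size(); i++)
        {
            total[i].first  += w[i].first;  // numer
            total[i].second += w[i].second; // denom
        }
    for (size_t i = 0; i < total.size(); i++)
        std::printf("criterion %zu: %.4f per sample\n", i, total[i].first / total[i].second);
}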

Diff not shown because of its large size.

View File

@ -9,6 +9,7 @@
#include "SimpleEvaluator.h"
#include "DataReader.h"
#include "ScriptableObjects.h"
#include "Criterion.h"
#include <vector>
#include <string>
#include <stdexcept>
@ -230,7 +231,8 @@ protected:
GradientUpdateInfo m_gradType;
RMSPropInfo m_rpi;
int m_numMBsToShowResult;
size_t m_numMBsToShowResult = 0;
size_t m_firstMBsToShowResult = 0;
int m_numMBsToCUDAProfile;
bool m_doGradientCheck;
@ -398,9 +400,8 @@ protected:
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients,
/*out*/ double& epochCriterion,
/*out*/ std::vector<double>& epochEvalErrors,
/*out*/ size_t& totalSamplesSeen,
/*out*/ EpochCriterion& epochCriterion,
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
std::string prefixMsg = "");
size_t AdaptiveMinibatchSizing(ComputationNetworkPtr net,
@ -463,10 +464,9 @@ protected:
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients,
/*out*/ double& epochCriterion,
/*out*/ std::vector<double>& epochEvalErrors,
/*out*/ size_t& totalSamplesSeen,
std::string prefixMsg = "");
/*out*/ EpochCriterion& epochCriterion,
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
const std::string& prefixMsg = "");
void InitDistGradAgg(int numEvalNodes, int traceLevel);
void InitModelAggregationHandler(int traceLevel);
@ -496,13 +496,19 @@ protected:
void ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const;
void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen,
void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen, // TODO: combine totalSamplesSeen and prevCriterion into a EpochCriterion type
const double learnRatePerSample,
const std::list<Matrix<ElemType>>& smoothedGradients,
const double prevCriterion,
const size_t minibatchSize);
bool LoadCheckPointInfo(const size_t epochNumber,
bool TryLoadCheckPointInfo(const size_t epochNumber,
/*out*/ size_t& totalSamplesSeen,
/*out*/ double& learnRatePerSample,
std::list<Matrix<ElemType>>& smoothedGradients,
/*out*/ double& prevCriterion,
/*out*/ size_t& minibatchSize);
void LoadCheckPointInfo(const size_t epochNumber,
/*out*/ size_t& totalSamplesSeen,
/*out*/ double& learnRatePerSample,
std::list<Matrix<ElemType>>& smoothedGradients,
@ -533,17 +539,17 @@ public:
int npos);
protected:
wstring m_modelPath;
std::wstring m_modelPath;
bool m_keepCheckPointFiles;
// bool m_validateAfterModelReloading; // TODO: remove this. Why would one not validate a model?
wstring m_trainCriterionNodeName;
wstring m_evalCriterionNodeName;
std::wstring m_trainCriterionNodeName;
std::wstring m_evalCriterionNodeName;
// enable tracing. Nodes listed here get their m_traceNodeValueXXX flags set
vector<wstring> m_traceNodeNamesReal;
vector<wstring> m_traceNodeNamesCategory;
vector<wstring> m_traceNodeNamesSparse;
std::vector<std::wstring> m_traceNodeNamesReal;
std::vector<std::wstring> m_traceNodeNamesCategory;
std::vector<std::wstring> m_traceNodeNamesSparse;
size_t m_prevChosenMinibatchSize;
double m_lastFinishedEpochTrainLoss;

View File

@ -164,6 +164,7 @@
<ClInclude Include="..\ComputationNetworkLib\ComputationNetwork.h" />
<ClInclude Include="..\ComputationNetworkLib\ComputationNode.h" />
<ClInclude Include="..\ComputationNetworkLib\ConvolutionalNodes.h" />
<ClInclude Include="Criterion.h" />
<ClInclude Include="DataReaderHelpers.h" />
<ClInclude Include="DistGradHeader.h" />
<ClInclude Include="IDistGradAggregator.h" />

View File

@ -147,6 +147,9 @@
<ClInclude Include="MASGD.h">
<Filter>Parallelization</Filter>
</ClInclude>
<ClInclude Include="Criterion.h">
<Filter>SGD</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Common">

View File

@ -224,7 +224,7 @@ private:
assert(headerCPU->criterion == 0);
for (int i = 0; i < headerCPU->numEvalNode; ++i)
{
assert(headerCPU->evalErrors[i] == 0);
assert(headerCPU->evalErrors[i].first == 0);
}
// If the current node did not process any samples, the gradients should be zero'd

View File

@ -14,6 +14,7 @@
#include "DistGradHeader.h"
#include "IDistGradAggregator.h"
#include "SimpleDistGradAggregator.h"
#include "Criterion.h"
#include <vector>
#include <string>
@ -31,10 +32,11 @@ template <class ElemType>
class SimpleEvaluator
{
public:
SimpleEvaluator(ComputationNetworkPtr net, const MPIWrapperPtr& mpi, const size_t numMBsToShowResult = 100, const int traceLevel = 0, const size_t maxSamplesInRAM = SIZE_MAX,
SimpleEvaluator(ComputationNetworkPtr net, const MPIWrapperPtr& mpi, const size_t numMBsToShowResult = 100, const size_t firstMBsToShowResult = 0, const int traceLevel = 0, const size_t maxSamplesInRAM = SIZE_MAX,
const size_t numSubminiBatches = 1)
: m_net(net),
m_numMBsToShowResult(numMBsToShowResult),
m_firstMBsToShowResult(firstMBsToShowResult),
m_traceLevel(traceLevel),
m_maxSamplesInRAM(maxSamplesInRAM),
m_numSubminiBatches(numSubminiBatches),
@ -45,7 +47,7 @@ public:
}
// returns evaluation node values per sample determined by evalNodeNames (which can include both training and eval criterion nodes)
vector<double> Evaluate(IDataReader* dataReader, const vector<wstring>& evalNodeNames, const size_t mbSize, const size_t testSize = requestDataSize)
vector<EpochCriterion> Evaluate(IDataReader* dataReader, const vector<wstring>& evalNodeNames, const size_t mbSize, const size_t testSize = requestDataSize)
{
ScopedNetworkOperationMode modeGuard(m_net, NetworkOperationMode::inferring);
@ -81,9 +83,7 @@ public:
}
// initialize eval results
std::vector<double> evalResults;
for (int i = 0; i < evalNodes.size(); i++)
evalResults.push_back((double) 0);
std::vector<EpochCriterion> evalResults(evalNodes.size(), EpochCriterion(0));
// allocate memory for forward computation
m_net->AllocateAllMatrices(evalNodes, {}, nullptr);
@ -102,12 +102,10 @@ public:
size_t totalEpochSamples = 0;
size_t numMBsRun = 0;
size_t actualMBSize = 0;
size_t numSamplesLastMBs = 0;
size_t lastMBsRun = 0; // MBs run before this display
size_t numSamplesLastLogged = 0;
size_t numMBsRunLastLogged = 0; // MBs run before this display
std::vector<double> evalResultsLastMBs;
for (int i = 0; i < evalResults.size(); i++)
evalResultsLastMBs.push_back((ElemType) 0);
std::vector<EpochCriterion> evalResultsLastLogged(evalResults.size(), EpochCriterion(0));
//TODO: we should add support for distributed reading
dataReader->StartMinibatchLoop(mbSize, 0, testSize);
@ -123,6 +121,8 @@ public:
if (numSubminibatchesNeeded > 1)
smbDispatcher.Init(m_net, learnableNodes, criterionNodes, evalNodes);
CriterionAccumulator<ElemType> localEpochEvalErrors(evalNodes.size(), m_net->GetDeviceId());
const size_t numIterationsBeforePrintingProgress = 100;
size_t numItersSinceLastPrintOfProgress = 0;
while (DataReaderHelpers::GetMinibatchIntoNetwork<ElemType>(*dataReader, m_net, nullptr, dataReader->SupportsDistributedMBRead(), m_mpi != nullptr, inputMatrices, actualMBSize, m_mpi))
@ -162,9 +162,9 @@ public:
m_gradHeader->numEvalNode = evalNodes.size();
m_gradHeader->numSamples = actualMBSize;
m_gradHeader->numSamplesWithLabel = numSamplesWithLabel;
m_gradHeader->criterion = 0.0;
m_gradHeader->criterion = 0.0; // (not used here)
for (size_t i = 0; i < evalNodes.size(); i++)
m_gradHeader->evalErrors[i] = evalNodes[i]->Get00Element();
m_gradHeader->evalErrors[i] = localEpochEvalErrors.Assign(evalNodes, i, numSamplesWithLabel).GetCriterion(i);
// TODO: We are reusing the aggregation logic inside SimpleDistGradAggregator, which has a heavy dependency
// on the gradient matrix. At some point we should refactor the aggregator class to be able to only calculating
@ -185,9 +185,7 @@ public:
else
{
for (int i = 0; i < evalNodes.size(); i++)
{
evalResults[i] += (double)evalNodes[i]->Get00Element(); // criterionNode should be a scalar
}
evalResults[i] += localEpochEvalErrors.Assign(evalNodes, i, numSamplesWithLabel).GetCriterion(i);
}
totalEpochSamples += aggregateNumSamplesWithLabel;
@ -195,22 +193,19 @@ public:
if (m_traceLevel > 0)
{
numSamplesLastMBs += aggregateNumSamplesWithLabel;
numSamplesLastLogged += aggregateNumSamplesWithLabel;
if (numMBsRun % m_numMBsToShowResult == 0)
if (numMBsRun <= m_firstMBsToShowResult || (m_numMBsToShowResult && (numMBsRun % m_numMBsToShowResult == 0)))
{
DisplayEvalStatistics(lastMBsRun + 1, numMBsRun, numSamplesLastMBs, evalNodes, evalResults, evalResultsLastMBs);
DisplayEvalStatistics(numMBsRunLastLogged + 1, numMBsRun, numSamplesLastLogged, evalNodes, evalResults, evalResultsLastLogged);
for (int i = 0; i < evalResults.size(); i++)
{
evalResultsLastMBs[i] = evalResults[i];
}
numSamplesLastMBs = 0;
lastMBsRun = numMBsRun;
evalResultsLastLogged[i] = evalResults[i];
numSamplesLastLogged = 0;
numMBsRunLastLogged = numMBsRun;
}
}
numItersSinceLastPrintOfProgress = ProgressTracing::TraceFakeProgress(numIterationsBeforePrintingProgress, numItersSinceLastPrintOfProgress);
// call DataEnd to check if end of sentence is reached
@ -219,47 +214,37 @@ public:
}
// show last batch of results
if (m_traceLevel > 0 && numSamplesLastMBs > 0)
if (m_traceLevel > 0 && numSamplesLastLogged > 0)
{
DisplayEvalStatistics(lastMBsRun + 1, numMBsRun, numSamplesLastMBs, evalNodes, evalResults, evalResultsLastMBs);
DisplayEvalStatistics(numMBsRunLastLogged + 1, numMBsRun, numSamplesLastLogged, evalNodes, evalResults, evalResultsLastLogged);
}
// final statistics
for (int i = 0; i < evalResultsLastMBs.size(); i++)
evalResultsLastMBs[i] = 0; // clear this since statistics display will subtract the previous value
for (int i = 0; i < evalResultsLastLogged.size(); i++)
evalResultsLastLogged[i] = EpochCriterion(0); // clear this since statistics display will subtract the previous value
fprintf(stderr, "Final Results: ");
DisplayEvalStatistics(1, numMBsRun, totalEpochSamples, evalNodes, evalResults, evalResultsLastMBs, true);
for (int i = 0; i < evalResults.size(); i++)
{
evalResults[i] /= totalEpochSamples;
}
DisplayEvalStatistics(1, numMBsRun, totalEpochSamples, evalNodes, evalResults, evalResultsLastLogged, true);
return evalResults;
}
protected:
void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastMBs,
void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastLogged,
const vector<ComputationNodeBasePtr>& evalNodes,
const double evalResults, const double evalResultsLastMBs, bool displayConvertedValue = false)
const EpochCriterion evalResults, const EpochCriterion evalResultsLastLogged, bool displayConvertedValue = false)
{
vector<double> evaR;
evaR.push_back(evalResults);
vector<double> evaLast;
evaLast.push_back(evalResultsLastMBs);
DisplayEvalStatistics(startMBNum, endMBNum, numSamplesLastMBs, evalNodes, evaR, evaLast, displayConvertedValue);
DisplayEvalStatistics(startMBNum, endMBNum, numSamplesLastLogged, evalNodes, { evalResults }, { evalResultsLastLogged }, displayConvertedValue);
}
void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastMBs, const vector<ComputationNodeBasePtr>& evalNodes,
const vector<double>& evalResults, const vector<double>& evalResultsLastMBs, bool displayConvertedValue = false)
void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastLogged, const vector<ComputationNodeBasePtr>& evalNodes,
const vector<EpochCriterion>& evalResults, const vector<EpochCriterion>& evalResultsLastLogged, bool displayConvertedValue = false)
{
fprintf(stderr, "Minibatch[%lu-%lu]: SamplesSeen = %lu ", startMBNum, endMBNum, numSamplesLastMBs);
fprintf(stderr, "Minibatch[%lu-%lu]: SamplesSeen = %lu ", startMBNum, endMBNum, numSamplesLastLogged);
for (size_t i = 0; i < evalResults.size(); i++)
{
double eresult = (evalResults[i] - evalResultsLastMBs[i]) / numSamplesLastMBs;
double eresult = (evalResults[i] - evalResultsLastLogged[i]).Average(); // / numSamplesLastLogged;
fprintf(stderr, "%ls: %ls/Sample = %.8g ", evalNodes[i]->NodeName().c_str(), evalNodes[i]->OperationName().c_str(), eresult);
if (displayConvertedValue)
@ -279,6 +264,7 @@ protected:
protected:
ComputationNetworkPtr m_net;
size_t m_numMBsToShowResult;
size_t m_firstMBsToShowResult;
size_t m_maxSamplesInRAM;
size_t m_numSubminiBatches;
MPIWrapperPtr m_mpi;
@ -288,4 +274,5 @@ protected:
int m_traceLevel;
void operator=(const SimpleEvaluator&); // (not assignable)
};
} } }
}}}
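// Editor's note (not part of the commit): a standalone sketch of the logging cadence introduced
// above (hypothetical names, not the CNTK code): the first firstMBsToShowResult minibatches are
// each logged, after that one line per numMBsToShowResult minibatches, and the value printed is
// the average of what accumulated since the previous log line (difference of two running
// (value, count) pairs, as EpochCriterion::Average() does).
#include <cstdio>
#include <utility>

bool ShouldLog(size_t numMBsRun, size_t firstMBsToShowResult, size_t numMBsToShowResult)
{
    return numMBsRun <= firstMBsToShowResult ||
           (numMBsToShowResult != 0 && numMBsRun % numMBsToShowResult == 0);
}

int main()
{
    std::pair<double, size_t> running = { 0.0, 0 }, lastLogged = { 0.0, 0 };
    size_t mbOfLastLog = 0;
    for (size_t mb = 1; mb <= 12; mb++)
    {
        running.first += 0.5;  // pretend per-minibatch criterion sum
        running.second += 32;  // pretend samples per minibatch
        if (ShouldLog(mb, /*firstMBsToShowResult=*/2, /*numMBsToShowResult=*/5))
        {
            double avg = (running.first - lastLogged.first) / (running.second - lastLogged.second);
            std::printf("Minibatch[%zu-%zu]: criterion/sample = %.4f\n", mbOfLastLog + 1, mb, avg);
            lastLogged = running;
            mbOfLastLog = mb;
        }
    }
}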

0  Tests/EndToEndTests/SLU/run-test (mode changed: executable file → normal file)
View File