Merge branch 'qiwye/asgd-dev' into qiwye/asgd-exp

Qiwei Ye 2016-04-17 17:09:34 +08:00
Parent 147d1178db 9968ebd25f
Commit 1a0b88be0c
80 changed files with 1747 additions and 1345 deletions

View file

@ -458,13 +458,21 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SLU", "SLU", "{BFBC6BE1-C33
Tests\EndToEndTests\SLU\atis.dev.IOB.simple = Tests\EndToEndTests\SLU\atis.dev.IOB.simple
Tests\EndToEndTests\SLU\atis.test.apos.pred.pos.head.IOB.simple = Tests\EndToEndTests\SLU\atis.test.apos.pred.pos.head.IOB.simple
Tests\EndToEndTests\SLU\atis.train.apos.pred.pos.head.IOB.simple = Tests\EndToEndTests\SLU\atis.train.apos.pred.pos.head.IOB.simple
Tests\EndToEndTests\SLU\baseline.linux.cpu.txt = Tests\EndToEndTests\SLU\baseline.linux.cpu.txt
Tests\EndToEndTests\SLU\baseline.linux.gpu.txt = Tests\EndToEndTests\SLU\baseline.linux.gpu.txt
Tests\EndToEndTests\SLU\baseline.windows.cpu.txt = Tests\EndToEndTests\SLU\baseline.windows.cpu.txt
Tests\EndToEndTests\SLU\baseline.windows.gpu.txt = Tests\EndToEndTests\SLU\baseline.windows.gpu.txt
Tests\EndToEndTests\SLU\globals.cntk = Tests\EndToEndTests\SLU\globals.cntk
Tests\EndToEndTests\SLU\input.txt = Tests\EndToEndTests\SLU\input.txt
Tests\EndToEndTests\SLU\inputmap.txt = Tests\EndToEndTests\SLU\inputmap.txt
Tests\EndToEndTests\SLU\lstm.ndl = Tests\EndToEndTests\SLU\lstm.ndl
Tests\EndToEndTests\SLU\lstmNDL.txt = Tests\EndToEndTests\SLU\lstmNDL.txt
Tests\EndToEndTests\SLU\output.txt = Tests\EndToEndTests\SLU\output.txt
Tests\EndToEndTests\SLU\README.txt = Tests\EndToEndTests\SLU\README.txt
Tests\EndToEndTests\SLU\rnnlu.cntk = Tests\EndToEndTests\SLU\rnnlu.cntk
Tests\EndToEndTests\SLU\rnnlu.ndl.cntk = Tests\EndToEndTests\SLU\rnnlu.ndl.cntk
Tests\EndToEndTests\SLU\run-test = Tests\EndToEndTests\SLU\run-test
Tests\EndToEndTests\SLU\testcases.yml = Tests\EndToEndTests\SLU\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "MNIST", "MNIST", "{FA33A61E-95C7-4049-8111-22058CE361A3}"
@ -509,7 +517,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "CIFAR-10", "CIFAR-10", "{77
Examples\Image\Miscellaneous\CIFAR-10\CifarConverter.py = Examples\Image\Miscellaneous\CIFAR-10\CifarConverter.py
Examples\Image\Miscellaneous\CIFAR-10\labelsmap.txt = Examples\Image\Miscellaneous\CIFAR-10\labelsmap.txt
Examples\Image\Miscellaneous\CIFAR-10\Macros.ndl = Examples\Image\Miscellaneous\CIFAR-10\Macros.ndl
Examples\Image\Miscellaneous\CIFAR-10\README.md = Examples\Image\Miscellaneous\CIFAR-10\README.md
Examples\Image\Miscellaneous\CIFAR-10\readme.txt = Examples\Image\Miscellaneous\CIFAR-10\readme.txt
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ImageNet", "ImageNet", "{EF710C5A-E616-442A-889D-C997D39AF2E1}"
@ -666,6 +674,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Config", "Config", "{850008
ProjectSection(SolutionItems) = preProject
Examples\Text\PennTreebank\Config\rnn.cntk = Examples\Text\PennTreebank\Config\rnn.cntk
Examples\Text\PennTreebank\Config\S2SAutoEncoder.cntk = Examples\Text\PennTreebank\Config\S2SAutoEncoder.cntk
Examples\Text\PennTreebank\Config\S2SLib.bs = Examples\Text\PennTreebank\Config\S2SLib.bs
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SLU", "SLU", "{E6DC3B7D-303D-4A54-B040-D8DCF8C56E17}"

View file

@ -1,4 +1,5 @@
# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunDir=$(SolutionDir)Examples/Text/PennTreebank/_run RootDir=$(SolutionDir)Examples/Text/PennTreebank/_run DataDir=$(SolutionDir)Examples/Text/PennTreebank/Data ConfigDir=$(SolutionDir)Examples/Text/PennTreebank/Config stderr=$(SolutionDir)Examples/Text/PennTreebank/_run/S2SAutoEncoder.log DeviceId=-1 makeMode=false
# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunRootDir=$(SolutionDir)Examples/Text/PennTreebank DeviceId=-1 makeMode=false
# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunRootDir=$(SolutionDir)g2p makeMode=false
####################
# WORK IN PROGRESS #
# WORK IN PROGRESS #
@ -6,7 +7,28 @@
####################
# Command line to run in debugger:
# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunDir=$(SolutionDir)Examples/Text/PennTreebank/_run RootDir=$(SolutionDir)Examples/Text/PennTreebank/_run DataDir=$(SolutionDir)Examples/Text/PennTreebank/Data ConfigDir=$(SolutionDir)Examples/Text/PennTreebank/Config stderr=$(SolutionDir)Examples/Text/PennTreebank/_run/S2SAutoEncoder.log train=[SGD=[maxEpochs=1]] confVocabSize=1000 DeviceId=-1 makeMode=false
# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunRootDir=$(SolutionDir)Examples/Text/PennTreebank train=[SGD=[maxEpochs=1]] confVocabSize=1000 DeviceId=-1 makeMode=false
# directory defaults (if not overridden)
RunRootDir = "../.." # default if not overridden
DataDir = "$RunRootDir$/Data"
CacheDir = "$DataDir$/cache" # (not used currently)
ExpRootDir = "$RunRootDir$"
# experiment id
#ExpId = _run
deviceId = 1
#ExpId = 68-$deviceId$-s2sae-bigmodel
ExpId = 06-$deviceId$-g2p
#ExpId = 05-3-g2p # for decoding a different model
# directories
ExpDir = "$ExpRootDir$/$ExpId$"
ModelDir = "$ExpDir$/Models"
stderr = $ExpDir$/S2SAutoEncoder.log7
# Append this for small set:
# train=[epochSize=2048]] trainFile=ptb.small.train.txt validFile=ptb.small.valid.txt testFile=ptb.small.test.txt
@ -14,44 +36,37 @@
# It encodes an entire sentence into a flat vector, and tries to regenerate it.
# Meant to be useful mainly for understanding how to do sequence-to-sequence in CNTK.
# Parameters can be overwritten on the command line
# for example: cntk configFile=myConfigFile RootDir=../..
# For running from Visual Studio add
# currentDirectory=$(SolutionDir)/<path to corresponding data folder>
RootDir = ".."
ConfigDir = "$RootDir$/Config"
DataDir = "$RootDir$/Data"
CacheDir = "$RootDir$/Data/cache"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
# deviceId=-1 for CPU, >=0 for GPU devices, "auto" chooses the best GPU, or CPU if no usable GPU is available
deviceId = "auto"
command = writeWordAndClassInfo:train:test:write
#command = write
precision = "float"
traceLevel = 1
modelPath = "$ModelDir$/S2SAutoEncoder.dnn"
decodeModelPath = "$modelPath$.13" # epoch to decode. Has best CV WER
# uncomment the following line to write logs to a file
#stderr=$OutputDir$/rnnOutput
decodeModelPath = "$modelPath$.13" # epoch to decode can be appended here
beamDepth = 1 # 0=predict; 1=greedy; >1=beam
decodeOutputPath = "$decodeModelPath$.b$beamDepth$"
#numCPUThreads = 1
#confVocabSize = 10000
#confClassSize = 50
confVocabSize = 10000
confClassSize = 50
useStabilizer = true
#trainFile = "ptb.train.txt"
##trainFile = "ptb.small.train.txt"
#validFile = "ptb.valid.txt"
##validFile = "ptb.small.valid.txt"
#testFile = "ptb.test.txt"
##testFile = "ptb.test.txt-econ1"
##testFile = "ptb.small.train.txt" # test on train, to see whether model makes sense at all
#startSymbol = "</s>"
trainFile = "ptb.train.txt"
#trainFile = "ptb.small.train.txt"
validFile = "ptb.valid.txt"
#validFile = "ptb.small.valid.txt"
testFile = "ptb.test.txt"
#testFile = "ptb.test.txt-econ1"
confVocabSize = 69 #10000
confClassSize = 0 #50
trainFile = "g014b2b.train-dev-20-21.bsf.joint"
#trainFile = "g014b2b.train-dev-1-21.bsf.joint" # small one for debugging
validFile = "g014b2b.train-dev-1-21.bsf.joint"
testFile = "g014b2b.test.bsf.joint"
startSymbol = "<s>"
#######################################
# network definition #
@ -59,12 +74,22 @@ testFile = "ptb.test.txt"
BrainScriptNetworkBuilder = (new ComputationNetwork [
# TODO: move this somewhere shared
enableTracing = true
traceFrequency = 1000
tracingLabelMappingFile = "$ModelDir$/vocab.wl"
include "S2SLib.bs"
beamDepth=3 // for above Trace macros only
# import general config options from outside config values
vocabDim = $confVocabSize$
nbrClass = $confClassSize$
useStabilizer = $useStabilizer$
useEncoder = true // if false, this becomes a regular RNN
isAutoencoder = false # input is only one sequence, meant to reproduce itself
useStabilizer = true
useEncoder = true # if false, this becomes a regular RNN
useNYUStyle = false # if true use thought vector for all inputs, NYU-style
attentionSpan = 20 # we only support fixed-size attention windows for now. 0 means no attention; exactly 20 is needed for the g2p CMUDict task
# import some namespaces
Parameters = BS.Parameters
@ -74,125 +99,176 @@ BrainScriptNetworkBuilder = (new ComputationNetwork [
Boolean = BS.Boolean
RecurrentLSTMP = BS.RNNs.RecurrentLSTMP
embeddingDim = 300
hiddenDim = 200
embeddingDim = $confVocabSize$ # 300
hiddenDim = 750 # 512 # 1024 # 200 --TODO: Kaisheng used 500
maxLayer = 2 # 1 # 0
encoderDims[i:0..0] = hiddenDim # this defines the number of hidden layers in each
decoderDims[i:0..0] = hiddenDim # both are one LSTM layer only for now
encoderDims[i:0..maxLayer] = hiddenDim # this defines the number of hidden layers in each
decoderDims[i:0..maxLayer] = hiddenDim # both are one LSTM layer only for now
# inputs
#input = SparseInput(vocabDim, tag='feature'); # BUGBUG: Slice() not working for sparse, need to extend TensorView
input = Input(vocabDim, tag='feature');
# for an auto-encoder, both are the same
labels = input
streams = [
rawInput = input
out = if isAutoencoder
then [
# for an auto-encoder, both are the same
input = rawInput
labels = rawInput
]
else [
# we encode input and label as a single input; this splits it into two
separatorRow = 2 # row index of separator symbol
isSeparator = RowSlice (separatorRow, 1, rawInput) # cut out the separator as a flag
inInput = Boolean.Or (FutureValue (1, inInput , defaultHiddenActivation=0), isSeparator) # flag sequence: word is input...
inLabels = Boolean.Or (PastValue (1, inLabels, defaultHiddenActivation=0), isSeparator) # ...or labels
input = Sequences.Gather (inInput, rawInput) # use flags to split raw input into input and labels
labels = Sequences.Gather (inLabels, rawInput) # (both have different lengths)
]
].out
# helpers
First (x) = Slice (0, 1, x, axis=-1)
Last (x) = Slice (-1, 0, x, axis=-1)
# strip separators
CastAs (type, data) = Sequences.Scatter (Constants.OnesLike (type), data)
# TODO: find out which one is the correct one
#inputSequence = Slice (0, -1, streams.input, axis=-1) # e.g. <s> A B C # TODO: process </s> as well, to trigger the thought vector
inputSequence = streams.input # e.g. <s> A B C </s>
labelSequence = Slice (1, 0, streams.labels, axis=-1) # e.g. A B C </s>
inputSequence = Slice (0, -1, input, axis=-1) # e.g. <s> A B C
labelSequence = CastAs (inputSequence, Slice (1, 0, labels, axis=-1)) # e.g. A B C </s>
# embeddings
# embeddings --as long as we cannot read multiple sequences, we share one embedding
# Note: Embeddings are linear, so better stabilize. We really should use BatchNorm.
Einput = Parameters.Stabilize (Parameters.WeightParam (vocabDim, embeddingDim), enabled=useStabilizer) # note: this is assumed to be applied transposed, hence the swapped dimensions
Elabel = Einput
E = Parameters.Stabilize (Parameters.WeightParam (vocabDim, embeddingDim), enabled=useStabilizer) # note: this is assumed to be applied transposed, hence the swapped dimensions
EmbedInput (x) = if vocabDim != embeddingDim then TransposeTimes (E, x) else x
EmbedLabels (x) = if vocabDim != embeddingDim then TransposeTimes (E, x) else x
Embed (E, x) = TransposeTimes (E, x)
inputEmbedded = EmbedInput (inputSequence)
labelsEmbedded = EmbedLabels (labelSequence)
labelSentenceStart = First (streams.labels)
labelSentenceStartEmbedded = EmbedLabels (labelSentenceStart)
inputEmbedded = Embed (Einput, inputSequence)
labelsEmbedded = Embed (Elabel, labelSequence)
RecurrentLSTMPWithAttentionWindow2 (inputDim/*x.dim*/, outputDim/*h.dim*/, cellDim/*c.dim*/, x, projectedAttentionWindowBroadcast, attentionDim, attentionSpan, enableSelfStabilization=false) =
[
prevState =
[
h = Loop.Previous (lstmState.h) # hidden state(t-1)
c = Loop.Previous (lstmState.c) # cell(t-1)
]
# compute additional hidden state from attention
W(x) = Parameters.WeightParam (attentionDim, outputDim) * Parameters.Stabilize (x, enabled=useStabilizer)
projectedH = W (prevState.h) # [cellDim]
tanHOut = Tanh (projectedAttentionWindowBroadcast.value + projectedH) # [attentionDim x attentionSpan]
v(x) = Parameters.WeightParam (1, attentionDim) * Parameters.Stabilize (x, enabled=useStabilizer) # [1 x attentionDim]
u = v (tanHOut) # [1 x attentionSpan]
uValid = u + Log (projectedAttentionWindowBroadcast.valid) # [1 x attentionSpan]
attentionWeights = Softmax (uValid) # [1 x attentionSpan]
weightedAttentionWindow = projectedAttentionWindowBroadcast.value .* attentionWeights # [attentionDim x attentionSpan]
weightedAttentionAverage = weightedAttentionWindow * BS.Constants.OnesTensor (attentionSpan) # [attentionDim]
# feed both to LSTM as a single augmented input, so that we can reuse the existing LSTM component (a plain-C++ sketch of this attention step follows the network definition below)
augmentedX = RowStack (weightedAttentionAverage : x)
enableSelfStabilization1 = enableSelfStabilization // TODO: BS syntax needs to allow to say ^.enableSelfStabilization
lstmState = BS.RNNs.LSTMP (attentionDim + inputDim, outputDim, cellDim, augmentedX, prevState, enableSelfStabilization=enableSelfStabilization1)
].lstmState // that's the value we return
RecurrentLSTMP2WithInitialState (inputDim, outputDim, cellDim, x, initialState, enableSelfStabilization=false) =
[
prevState =
[
isFirst = Loop.IsFirst (initialState.h)
h = Boolean.If (isFirst, initialState.h, Loop.Previous (lstmState.h)) // hidden state(t-1)
c = Boolean.If (isFirst, initialState.c, Loop.Previous (lstmState.c)) // cell(t-1)
]
enableSelfStabilization1 = enableSelfStabilization // TODO: BS syntax needs to allow to say ^.enableSelfStabilization
lstmState = BS.RNNs.LSTMP (inputDim, outputDim, cellDim, x, prevState, enableSelfStabilization=enableSelfStabilization1)
].lstmState // that's the value we return
# encoder (processes inputEmbedded)
encoder = BS.RNNs.RecurrentLSTMP2Stack (inputEmbedded, embeddingDim, encoderDims, encoderDims, enableSelfStabilization=useStabilizer)
# encoder (processes user input)
encoderOutputLayer = Length (encoderDims)-1
encoder[i:0..encoderOutputLayer] =
RecurrentLSTMP(if i == 0 then embeddingDim else encoderDims[i-1],
encoderDims[i], encoderDims[i],
if i == 0 then inputEmbedded else encoder[i-1],
enableSelfStabilization=useStabilizer)
encoderOutput = encoder[encoderOutputLayer]
# that last frame should be fed as an additional input to every decoder step
# (This is the NYU model, not the Google model where the thought vector is only the initial state.)
# Three ways of passing encoder state:
# 1. as initial state for decoder (Google style)
# 2. as side information for every decoder step (NYU style)
# 3. attention
thoughtVector =
[
x = encoderOutput
result = Boolean.If (Loop.IsLast (x), // if last entry
/*then*/ x, // then copy that
/*else*/ FutureValue (0, result)) // else just propagate to the front --TODO: Use Scatter() once input and labels are no longer the same.
].result
thoughtVector = [
h = Last (encoderOutput.h)
c = Last (encoderOutput.c)
]
thoughtVectorDim = encoderDims[encoderOutputLayer]
thoughtVectorPadded = [ # padded with zeroes until end of target sequence
h = Sequences.BroadcastSequenceAs (labelsEmbedded, thoughtVector.h)
c = Sequences.BroadcastSequenceAs (labelsEmbedded, thoughtVector.c)
]
# attention (fixed rolling window)
attentionWindow = Sequences.PastValueWindow (attentionSpan, encoderOutput.h)
attentionDim = thoughtVectorDim
projectedAttentionWindowBroadcast = [
W(x) = Parameters.WeightParam (attentionDim, thoughtVectorDim) * Parameters.Stabilize (x, enabled=useStabilizer)
#B = Parameters.BiasParam (vocabDim) # no bias in attention
value = Sequences.BroadcastSequenceAs (labelsEmbedded, W (attentionWindow.value)) # apply the projection columnwise to the attentionWindow tensor
valid = Sequences.BroadcastSequenceAs (labelsEmbedded, attentionWindow.valid)
]
# NYU style: expand h to all, drop c
# TODO: just use thoughtVectorPadded.h (do this when we next test this branch again)
thoughtVectorEverywhere = Boolean.If (Loop.IsFirst (thoughtVectorPadded.h), # if first entry
/*then*/ thoughtVectorPadded.h, # then copy that
/*else*/ Loop.Previous (thoughtVectorEverywhere)) # else just propagate to the front
# TODO: create an indexSequence that contains all zeroes, basically broadcast a single-frame sequence across another sequence length
# decoder
# NYU style:
# The decoder starts with hidden state 0
# and takes as input [thoughtVector; previous word].
# and takes as input [thoughtVectorEverywhere; previous word].
isTraining = EnvironmentInput ('isTraining', tag='evaluation')
#decoderFeedback = Boolean.If (isTraining, labelsEmbedded, decoderOutputEmbedded) # not working
decoderFeedback = labelsEmbedded
sentenceStartEmbedded = inputEmbedded # first token is sentence start
# ^^ inputEmbedded is used to get </s>. Must make this a constant once we separate input and output.
delayedDecoderFeedback = Loop.PreviousOrDefault (defaultValue=labelSentenceStartEmbedded, labelsEmbedded)
delayedDecoderFeedback = Boolean.If (Loop.IsFirst (decoderFeedback), sentenceStartEmbedded, Loop.Previous (decoderFeedback))
decoderInputDim = if useEncoder then thoughtVectorDim + embeddingDim else embeddingDim
decoderInput = if useEncoder then RowStack (thoughtVector : delayedDecoderFeedback) else delayedDecoderFeedback
decoderInputDim = labelsEmbedded.dim #embeddingDim
decoderInput = Pass (delayedDecoderFeedback)
decoderOutputLayer = Length (decoderDims)-1
decoder[i:0..decoderOutputLayer] =
if i == 0
then RecurrentLSTMP (decoderInputDim, decoderDims[i], decoderDims[i],
decoderInput,
enableSelfStabilization=useStabilizer)
else RecurrentLSTMP (decoderDims[i-1], decoderDims[i], decoderDims[i],
decoder[i-1],
enableSelfStabilization=useStabilizer)
decoderDim = decoderDims[decoderOutputLayer]
decoderOutput = decoder[decoderOutputLayer]
then if useEncoder && useNYUStyle then BS.RNNs.RecurrentLSTMP2 (thoughtVectorDim + decoderInputDim, decoderDims[i], decoderDims[i],
RowStack (thoughtVectorEverywhere : decoderInput),
enableSelfStabilization=useStabilizer)
else if useEncoder && attentionSpan > 0 then RecurrentLSTMPWithAttentionWindow2 (thoughtVectorDim + decoderInputDim, decoderDims[i], decoderDims[i],
RowStack (thoughtVectorEverywhere : decoderInput),
projectedAttentionWindowBroadcast, attentionDim, attentionSpan,
enableSelfStabilization=useStabilizer)
else RecurrentLSTMP2WithInitialState (decoderInputDim, decoderDims[i], decoderDims[i],
decoderInput,
thoughtVectorPadded, # BUGBUG: Should be thoughtVector, but Scatter() can't expand from inside a loop
enableSelfStabilization=useStabilizer)
else BS.RNNs.RecurrentLSTMP2 (decoderDims[i-1], decoderDims[i], decoderDims[i],
decoder[i-1].h,
enableSelfStabilization=useStabilizer)
#decoderDim = decoderDims[decoderOutputLayer]
decoderOutput = decoder[decoderOutputLayer].h
decoderDim = decoderOutput.dim
# and add a softmax layer on top
W(x) = Parameters.WeightParam (vocabDim, decoderDim) * Parameters.Stabilize (x, enabled=useStabilizer)
B = Parameters.BiasParam (vocabDim)
z = W(decoderOutput) + B; // top-level input to Softmax
decoderOutputEmbedded = Embed (Elabel, Hardmax (z))
z = W (decoderOutput) + B; // top-level input to Softmax
# training criteria
ce = CrossEntropyWithSoftmax(labelSequence, z, tag='criterion') // this is the training objective
wer = ErrorPrediction (labelSequence, z, tag='evaluation') // this also gets tracked
#indexTestVals = Plus (decoderOutput, BS.Constants.Zero, tag='evaluation')
#indexTest = Slice (0, 1, indexTestVals)
#index = Where (RectifiedLinear (indexTest), tag='evaluation'); // for testing: this thresholds all negative numbers to 0=false, keeping positive as !=0=true
#packedIndex = PackedIndex (indexTest, index, tag='evaluation')
#filtered = GatherPacked (packedIndex, indexTestVals, tag='evaluation')
#unfiltered = ScatterPacked (indexTest, packedIndex, filtered, tag='evaluation')
//# define an LSTM with a per-sequence initialization value
//# TODO: Not currently used. Move to BS library once tested.
//RecurrentLSTMPWithInitValue (inputDim, outputDim, cellDim, x, initValue, enableSelfStabilization=false) =
//[
// prevState = // Loop.Previous (lstmState). BS can't apply Previous() to dictionaries, so expand it manually
// [
// h = Loop.Previous (lstmState.h); // hidden state(t-1)
// c = Loop.Previous (lstmState.c); // cell(t-1)
// ]
// # resettable LSTM function
// lstmState =
// [
// // apply the LSTM function to the input state; for first frame, we will ignore the output
// enableSelfStabilization1 = enableSelfStabilization // TODO: BS syntax needs to allow to say ^.enableSelfStabilization
// lstmState1 = LSTMP (inputDim, outputDim, cellDim, x, prevState, enableSelfStabilization=enableSelfStabilization1)
//
// // the actual LSTM state (incl. its output) gets overwritten in the first frame by the initValue
// isFirst = Loop.IsFirst (x)
// h = Boolean.If (isFirst, initValue, lstmState1.h); // hidden state(t-1)
// c = Boolean.If (isFirst, initValue, lstmState1.c); // cell(t-1)
// ]
//].lstmState.h // that's the value we return
ce = CrossEntropyWithSoftmax (labelSequence, z, tag='criterion') // this is the training objective
wer = ErrorPrediction (labelSequence, z, tag='evaluation') // this also gets tracked
])
#######################################
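The fixed-window attention in RecurrentLSTMPWithAttentionWindow2 above is easier to follow outside of tensor notation. Below is a minimal plain-C++ sketch of that one step, assuming illustrative names (MatVec, AttentionContext) that are not part of CNTK: project the previous decoder state, score it against each projected encoder frame in the window, softmax over the valid frames, and return the weighted average that gets row-stacked with the regular LSTM input.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

using Vec = std::vector<double>;
using Mat = std::vector<Vec>; // Mat[i] = row i

static Vec MatVec(const Mat& W, const Vec& x) // y = W * x
{
    Vec y(W.size(), 0.0);
    for (size_t i = 0; i < W.size(); i++)
        for (size_t j = 0; j < x.size(); j++)
            y[i] += W[i][j] * x[j];
    return y;
}

// window[k] : projected encoder frame k (attentionDim values), k = 0..attentionSpan-1
// valid[k]  : 1 if frame k exists, 0 if it lies before the start of the sequence
// W         : [attentionDim x outputDim], v : [attentionDim]  (learned parameters)
// prevH     : previous decoder hidden state (outputDim values)
Vec AttentionContext(const std::vector<Vec>& window, const Vec& valid,
                     const Mat& W, const Vec& v, const Vec& prevH)
{
    Vec projectedH = MatVec(W, prevH);                            // projectedH = W(prevState.h)
    Vec u(window.size());
    for (size_t k = 0; k < window.size(); k++)
    {
        double s = 0;
        for (size_t i = 0; i < v.size(); i++)
            s += v[i] * std::tanh(window[k][i] + projectedH[i]);  // u = v' * tanh(windowProj + projectedH)
        u[k] = s + std::log(valid[k] + 1e-30);                    // mask out invalid frames (uValid)
    }
    double maxU = *std::max_element(u.begin(), u.end());          // softmax over the window
    double Z = 0;
    for (double& s : u) { s = std::exp(s - maxU); Z += s; }
    Vec context(window[0].size(), 0.0);                           // weightedAttentionAverage
    for (size_t k = 0; k < window.size(); k++)
        for (size_t i = 0; i < context.size(); i++)
            context[i] += (u[k] / Z) * window[k][i];
    return context; // row-stack with x and feed into the LSTM (augmentedX)
}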
@ -241,7 +317,7 @@ reader = [
labelType = "category"
labelDim = "$confVocabSize$"
labelMappingFile = "$ModelDir$/vocab.wl"
beginSequence = "</s>"
beginSequence = "$startSymbol$" # "</s>"
endSequence = "</s>"
#### Write definition ####
@ -341,7 +417,7 @@ cvReader = [
writeWordAndClassInfo = [
action = "writeWordAndClass"
inputFile = "$DataDir$/$trainFile$"
beginSequence = "</s>"
beginSequence = "$startSymbol$" # "</s>"
endSequence = "</s>"
outputMappingFile = "$ModelDir$/vocab.wl"
outputVocabFile = "$ModelDir$/vocab.txt"
@ -362,23 +438,24 @@ train = [
traceLevel = 1
epochSize = 0 # (for quick tests, this can be overridden with something small)
#BrainScriptNetworkBuilder is defined in outer scope
# BrainScriptNetworkBuilder is defined in outer scope
SGD = [
minibatchSize = 128*2:256:512
learningRatesPerSample = 0.01
minibatchSize = 128:128:256:512
learningRatesPerSample = 0.007*2:0.0035 #0.01 #0.005 # 0.01
momentumAsTimeConstant = 2500
gradientClippingWithTruncation = true # TODO: clip and truncate? What is the difference?
clippingThresholdPerSample = 15.0
maxEpochs = 16
maxEpochs = 50
numMBsToShowResult = 100
firstMBsToShowResult = 10
gradUpdateType = "none" # FSAdaGrad?
loadBestModel = true
# tracing (enable these for debugging)
#traceNodeNamesReal = labelsEmbedded:decoderInput:"decoder[0].lstmState._privateInnards.ht":z.Plus_left.Times_right.result:z:ce
#traceNodeNamesReal = labelsEmbedded:decoderInput:z:ce
#traceNodeNamesReal = thoughtVector.result:zMask:z:ce:wer:indexTestVals:index:packedIndex:filtered:unfiltered:isTraining
#traceNodeNamesReal = thoughtVectorEverywhere.result:zMask:z:ce:wer:indexTestVals:index:packedIndex:filtered:unfiltered:isTraining
#traceNodeNamesCategory = inputSequence.out:labelSequence
dropoutRate = 0.0
@ -454,7 +531,7 @@ test = [
labelType = "category"
labelDim = "$confVocabSize$"
labelMappingFile = "$ModelDir$/vocab.wl"
beginSequence = "</s>"
beginSequence = "$startSymbol$" # "</s>"
endSequence = "</s>"
#### Write definition ####
@ -504,51 +581,21 @@ write = [
# We need to make a change:
BrainScriptNetworkBuilder = ([
beamDepth = 3 // 0=predict; 1=greedy; >1=beam
enableTracing = true
traceFrequency = 1000
tracingLabelMappingFile = "$ModelDir$/vocab.wl"
include "S2SLib.bs"
beamDepth = $beamDepth$ // 0=predict; 1=greedy; >1=beam
# import some names
Constants = BS.Constants
Boolean = BS.Boolean
Loop = BS.Loop
Previous = Loop.Previous
IsFirst = Loop.IsFirst
Trace (node, say='', logFrequency=traceFrequency, logFirst=10, logGradientToo=false, onlyUpToRow=100000000, onlyUpToT=100000000, format=[], tag='') = new ComputationNode [
operation = 'Trace' ; inputs = node
]
formatDense = [
type = "real"
transpose = false
precisionFormat = ".4"
]
formatOneHot = [
type = "category"
transpose = false
labelMappingFile = "$ModelDir$/vocab.wl"
]
formatSparse = [
type = "sparse"
transpose = false
labelMappingFile = "$ModelDir$/vocab.wl"
]
enableTracing = true
traceFrequency = 1
TraceState (h, what) =
if enableTracing
then Transpose (Trace (Transpose (h), say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, onlyUpToRow=beamDepth*beamDepth, onlyUpToT=3, format=formatDense))
else h
TraceDense (h, what) =
if enableTracing
then Trace (h, say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, onlyUpToRow=beamDepth*beamDepth, onlyUpToT=3, format=formatDense)
else h
TraceOneHot (h, what) =
if enableTracing
then Trace (h, say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, /*onlyUpToRow=beamDepth*beamDepth, onlyUpToT=15,*/ format=formatOneHot)
else h
TraceSparse (h, what) =
if enableTracing
then Trace (h, say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, /*onlyUpToRow=beamDepth*beamDepth, onlyUpToT=3,*/ format=formatSparse)
else h
If = Boolean.If
OnesTensor = Constants.OnesTensor
# macro that extracts top D hypotheses from a 2D tensor
# input: scores[w,n] w = word index, d = hyp index in beam (d=0 is the best one)
@ -575,31 +622,49 @@ write = [
modelAsTrained = BS.Network.Load ("$decodeModelPath$")
useNYUStyle = false # TODO: we should be able to infer this from some dimensions
hasEmbeddings = false # TODO: infer this
top1DecodingModel(model) = new ComputationNetwork [
# compute top-N from output
logP = LogSoftmax (model.z)
offset = Constant (10000)
top1a = Hardmax (logP) .* (logP + offset)/*for tracing*/
top1b = top1a
top1b = Hardmax (logP) .* (logP + offset)/*for tracing*/
top1 = TraceSparse (top1b, 'logP') # TODO: get the accumulated logP out, it's a little more involved
topN = 10
tokenSet = GetTopNTensor (topN, logP) # [V x 1] -> [V x 1 x topN]
tokenSetScores = tokenSet .* logP # [V x 1 x topN]
topPaths = GetTopNTensor (topN, logP) # [V x 1] -> [V x 1 x topN]
topPathScores = topPaths .* logP # [V x 1 x topN]
# reduce back to a single column
topHyps = TraceSparse (tokenSetScores * ConstantTensor (1, (1 : topN)), 'topHyps')
topHyps = TraceSparse (topPathScores * OnesTensor (1 : topN), 'topHyps')
inputsOut = Pass (model.inputSequence)
labelsOut = Pass (TraceOneHot (model.labelSequence, 'labels'))
decodeOut = Pass (TraceOneHot (top1, 'out'))
topNOut = Pass (topHyps)
]
# replace old decoderFeedback node by newDecoderFeedback
EmbedLabels (x) = if hasEmbeddings then TransposeTimes (modelAsTrained.labelsEmbedded.TransposeTimesArgs[0], x) else x
decoderFeedback = EmbedLabels (Hardmax (modelAsTrained.z)) # in training, this is decoderFeedback = labelsEmbedded
decoderFeedback = modelAsTrained.decoderOutputEmbedded # in training, this is decoderFeedback = labelsEmbedded
sentenceStartEmbedded = Boolean.If (Loop.IsFirst (decoderFeedback), modelAsTrained.inputEmbedded, Previous (sentenceStartEmbedded)) # enforces no leaking of labels
delayedDecoderFeedback = Boolean.If (Loop.IsFirst (decoderFeedback), sentenceStartEmbedded, Loop.Previous (decoderFeedback)) # same expression as in training
# TODO: fold this in
PreviousOrDefault1 (x, defaultValue=Constant (0)) = # a delay node with initial value --TODO: merge the two, then do in C++
[
flags = IsFirst (defaultValue/*x*/)
out = If (flags,
/*then*/ defaultValue,
/*else*/ Previous (x))
].out
labelSentenceStart = modelAsTrained.labelSentenceStart_out # _ is a hack
labelsToUse = if hasEmbeddings then modelAsTrained.labelsEmbedded else modelAsTrained.labelSequence
labelSentenceStartToUse = if hasEmbeddings then modelAsTrained.labelSentenceStartEmbedded else labelSentenceStart
labelSentenceStartEmbeddedScattered = TraceDense (BS.Sequences.Scatter (IsFirst (labelsToUse), labelSentenceStartToUse), 'sest')
delayedDecoderFeedback = TraceDense (/*Loop.*/PreviousOrDefault1 (defaultValue=labelSentenceStartEmbeddedScattered, TraceDense (decoderFeedback, 'lemb')) , 'prev lemb')
greedyDecodingModel = BS.Network.Edit (modelAsTrained,
BS.Network.Editing.ReplaceLinksToNode (modelAsTrained.delayedDecoderFeedback, delayedDecoderFeedback),
@ -611,6 +676,8 @@ write = [
# decoder[0].prevState.h = PastValue (decoder[0].lstmState._privateInnards.ht) : [200 x 1 {1,200} x *] -> [200 x 1 {1,200} x *]
# decoder[0].prevState.c = PastValue (decoder[0].lstmState._privateInnards.ct) : [200 x 1 {1,200} x *] -> [200 x 1 {1,200} x *]
# decoderInput.inputs[1] = PastValue (labelsEmbedded) : [300 x 1 {1,300} x *] -> [300 x 1 {1,300} x *]
# decoder[0].prevState.h.elseVal = PastValue (decoder[0].lstmState._privateInnards.ht) : [512 x 1 x labelSequence.h.out.h.indexSequence.h.indexSequence.h] -> [512 x 1 x labelSequence.h.out.h.indexSequence.h.indexSequence.h]
# decoder[0].prevState.c.elseVal = PastValue (decoder[0].lstmState._privateInnards.ct) : [512 x 1 x labelSequence.h.out.h.indexSequence.h.indexSequence.h] -> [512 x 1 x labelSequence.h.out.h.indexSequence.h.indexSequence.h]
hiddenDim = modelAsTrained.delayedDecoderFeedback.dim
embeddingDim = modelAsTrained.decoderOutputEmbedded.dim
@ -635,21 +702,66 @@ write = [
# - traceback is a right-to-left recurrence
# - output best hypo conditioned on the path (it is already known)
propagationEdits[i:0..2] = // TODO: implement and use { } syntax
if i == 0 then (node => if node.name == 'decoder[0].prevState.h' then TraceState (Previous (PropagateTopN (node.PastValueArgs[0])), 'propagated') else node) # inject reshuffling of hypotheses
else if i == 1 then (node => if node.name == 'decoder[0].prevState.c' then TraceState (Previous (PropagateTopN (node.PastValueArgs[0])), 'propagated') else node)
propagationEdits[i:0..8] = // TODO: implement and use { } syntax TODO: VV elseVal only for non-NYU?
# non-NYU:
if i == 0 then (node => if node.name == 'decoder[0].prevState.h.elseVal' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node) # inject reshuffling of hypotheses
else if i == 1 then (node => if node.name == 'decoder[0].prevState.c.elseVal' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node)
# NYU:
else if i == 2 then (node => if node.name == 'decoder[0].prevState.h' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node) # inject reshuffling of hypotheses
else if i == 3 then (node => if node.name == 'decoder[0].prevState.c' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node)
# all:
else if i == 4 then (node => if node.name == 'decoder[1].prevState.h' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node) # inject reshuffling of hypotheses
else if i == 5 then (node => if node.name == 'decoder[1].prevState.c' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node)
else if i == 6 then (node => if node.name == 'decoder[2].prevState.h' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node) # inject reshuffling of hypotheses
else if i == 7 then (node => if node.name == 'decoder[2].prevState.c' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node)
else BS.Network.Editing.ReplaceLinksToNode (modelAsTrained.delayedDecoderFeedback, delayedDecoderFeedback)
# decoderFeedback must be updated to take actual decoder output
Elabel = modelAsTrained.decoderOutputEmbedded.TransposeTimesArgs[0]
decoderFeedback = TraceState (TransposeTimes (Elabel, TraceSparse (topWords, 'topWords')), 'feedback')
delayedDecoderFeedback = Boolean.If (Loop.IsFirst (decoderFeedback), sentenceStartEmbedded, Loop.Previous (decoderFeedback))
m2 = BS.Network.Edit (modelAsTrained,
propagationEdits,
(labelsOut : decodeOut)) # additional roots
(inputsOut : labelsOut : decodeOut)) # additional roots
ReduceAxis (axisDim, x, axis=1) = # unfortunately, we must feed in the dimension of the axis, it can't be inferred
if axis == 1 then Times (OnesTensor (axisDim), x, outputRank = 0)
else if axis == 2 then ReduceAxis (axisDim, TransposeDimensions (x, 1, 2), axis=1)
else Fail("ReduceAxis: Only supports axes 1 and 2.")
# === BEGIN DECODER ===
# constants for initial score and final traceback
initialPathScores = FirstAndOther (0, LOGZERO, beamDepth, axis = 2) # [1 x D]: [ 0, -INF, -INF, -INF, ... ]
finalHyp = FirstAndOther (1, 0, beamDepth, axis = 1) # [D] the final token is the top-scoring hypothesis, that is, hyp[0]
# path expansion of the D hypotheses that were best in previous time step (ordered as in previous time step)
logLLs = Columnwise (LogSoftmax, beamDepth, modelAsTrained.z) # [V x Dprev] log P(w|hist)
expandedPathScores = logLLs + If (IsFirst (logLLs), initialPathScores, Previous (tokens.score)) # [V x Dprev] log (P(w|hist) * P(hist)) for all top D hypotheses
# determine top D of expanded paths
topPaths = TraceSparse (GetTopNTensor (beamDepth, expandedPathScores), 'topPaths') # [V x Dprev] -> [V x Dprev x Dnew]
topPathScores = TraceSparse (topPaths .* expandedPathScores, 'topPathScores') # [V x Dprev x Dnew]
# form new decoding token, by reducing topPaths(Scores) along relevant dimensions
tokens = [ # [. x Dnew]
from = ReduceAxis (axis=1, vocabSize, topPaths) # [Dprev x Dnew], reduced over V
word = ReduceAxis (axis=2, beamDepth, topPaths) # [V x Dnew], reduced over Dprev
score = TraceDense (OnesTensor (1/*output dim*/ : /*reduction dims: */vocabSize : beamDepth/*Dprev*/) * topPathScores, 'tokens.score') # [1 x Dnew], reduced over [V x Dprev] and inserted a '1'
]
# network feedback for next time step
decoderFeedback = TraceState (EmbedLabels (TraceSparse (tokens.word, 'tokens.word')), 'feedback') # [embeddingDim x Dnew]
delayedDecoderFeedback = If (IsFirst (labelSentenceStartEmbeddedScattered), labelSentenceStartEmbeddedScattered, Loop.Previous (decoderFeedback))
# network state for next step. We must reorder the network state for use in next time step: Apply this lambda to all decoder LSTMs' h and c.
ReorderTopN (past_h_or_c) = Times (TraceState (past_h_or_c, 'past'), TraceDense (tokens.from, 'backp'))
# final traceback
traceback = TraceDense (If (Loop.IsLast (labelSentenceStartEmbeddedScattered/*tokens.from*/), finalHyp, Loop.Next (tokens.from * traceback)), 'traceback') # [D] one-hot, multiplying tokens.from from the left will select another one-hot row of tokens.from
decodeHyp = Times (topPaths, traceback, outputRank = 2) # [V x Dprev] 2D one-hot, selected the best hyp according to traceback
decode = TraceOneHot (decodeHyp * OnesTensor (beamDepth), 'out') # [V] reduces over Dprev -> 1D one-hot
# TODO: Can this be done in one ^^ go?
# === END DECODER ===
# propagate LSTM state to the right top-N rank given where that rank came from in the previous time step
@ -658,21 +770,19 @@ write = [
0 0 0
0 0 0")
PropagateTopN (past_h_or_c) = Times (TraceState (past_h_or_c, 'past'), TraceDense (backPointers, 'backp'))
# backPointers: [Dprev, Dnew]
# PropagateTopN:
# tokens.from: [Dprev, Dnew]
# v--------- best came from input hyp[1]
# v------- second best came from input hyp[0]
# v----- third best came from input hyp[2]
# 0 1 0
# 1 0 0
# 0 0 1
# backPointers[:,n] one-hot encodes the best predecessor at top-N rank n
# tokens.from[:,n] one-hot encodes the best predecessor at top-N rank n
# each column is a one-hot vector
# multiplying with such a column from the right will select the column represented by the one-hot value
# get decoder log likelihoods
# EvalActions: EnableNodeTracing {L"decoder[0].lstmState._privateInnards.it", L"z"}, //
logLLs = Columnwise (LogSoftmax, beamDepth, modelAsTrained.z) # [V x D] un-normalized log P(w|hist) + const
# logLLs: get decoder log likelihoods
Columnwise (f, beamDepth, z) = # TODO: Takes LogSoftmax over axis=1. it is more tricky to do this over arbitrary axes
[
@ -680,14 +790,12 @@ write = [
out = Splice (cols, axis=2)
].out
# decoder start token: 0 for first hyp, -INF for the others
# initialPathScores: decoder start token: 0 for first hyp, -INF for the others
LOGZERO = -1e30
initialPathScores = FirstAndOther (0, LOGZERO, beamDepth, axis = 2) # row vector: [ 0, -INF, -INF, -INF, ... ]
expandedPathScores = logLLs + PreviousOrDefault (PropagateTopN (pathScores), initialPathScores) # [V x Dprev] un-normalized log (P(w|hist) * P(hist)) for all top D hypotheses
# ^^ path expansion, [V x 1] + [1 x D] -> [V x D]
# expandedPathScores: path expansion, [V x 1] + [1 x D] -> [V x D]
tokenSet = TraceSparse (GetTopNTensor (beamDepth, expandedPathScores), 'tokenSet') # [V x Dprev] -> [V x Dprev x Dnew]
# topPaths:
# +-----+
# |0 0 0|
# |0 0 0|-+
@ -699,11 +807,8 @@ write = [
# |0 0 0|
# +-----+
#topWords = ReduceSum (axis=2, tokenSet) # TODO: add an axis parameter to SumColumnElements()
topWords = [
v1 = TransposeDimensions (tokenSet, 1, 2) # reduction axis is now the first
out = Times (ConstantTensor (1, (beamDepth)), v1, outputRank = 0) # reduce over the first axis and drop it
].out
# tokens.word:
#tokens.word = ReduceSum (axis=2, topPaths) # TODO: add an axis parameter to SumColumnElements()
# +-+
# |0|
# |0|-+
@ -715,7 +820,7 @@ write = [
# |0|
# +-+
backPointers = Times (ConstantTensor (1, (vocabSize)), tokenSet, outputRank = 0) # this is a tensor Times operation that reduces over the first dimension
# tokens.from:
# before dropping the first dimension: [V x Dprev x Dnew]
# +-----+
# |0 1 0| means input hyp[1] gave rise to the best
@ -724,16 +829,16 @@ write = [
# +-----+-+
# |0 0 1| means input hyp[2] gave rise to third best
# +-----+
# after: [Dprev,Dnew] e.g. "0 1 0" goes into first column, vertically
# after: [Dprev x Dnew] e.g. "0 1 0" goes into first column, vertically
# v--------- best came from input hyp[1]
# v------- second best came from input hyp[0]
# v----- third best came from input hyp[2]
# 0 1 0
# 1 0 0
# 0 0 1
# backPointers[:,n] one-hot encodes the best predecessor at top-N rank n
# tokens.from[:,n] one-hot encodes the best predecessor at top-N rank n
tokenSetScores = TraceSparse (tokenSet .* expandedPathScores, 'tokenSetScores') # [V x Dprev x Dnew]
# topPathScores:
# +-----+
# |0 0 0|
# |0 0 0|-+
@ -744,29 +849,24 @@ write = [
# +-----+z| z denotes the accumulated path score max_w P(w|hyp[2])
# |0 0 0|
# +-----+
pathScores = TraceDense (ConstantTensor (1, (1/*output dim*/ : /*reduction dims: */vocabSize : beamDepth/*Dprev*/)) * tokenSetScores, 'pathScores') # [1 x Dnew]
# traceback
# last state: take Hardmax over pathScores
# previous states: multiply with respective backPointers matrix
# traceback:
# last state: take Hardmax over tokens.score
# previous states: multiply with respective tokens.from matrix
# -> hyp index for every time step
# then finally use that to select the actual output. TODO: That's a sample-wise matrix product between two sequences!!!
traceback = TraceDense (NextOrDefault (backPointers * traceback, finalHyp), 'traceback') # [D] one-hot, multiplying backPointers from the left will select another one-hot row of backPointers
# TODO: condition must be 1-dim, not 2-dim tensor, so we use labelSentenceStartEmbeddedScattered instead of tokens.from
# +-+
# |0|
# |1| means at this time step, hyp[1] was the best globally
# |0|
# +-+
finalHyp = FirstAndOther (1, 0, beamDepth, axis = 1) # the final token is the top-scoring hypothesis, that is, hyp[0]
# and the actual decoding output
# decode: and the actual decoding output
# This is the one to output (top sentence-level hypothesis after traceback).
decode = [
hyp = Times (tokenSet, traceback, outputRank = 2) # [V x Dprev] 2D one-hot
out = TraceOneHot (hyp * ConstantTensor (1, beamDepth), 'out') # reduces over Dprev -> 1D one-hot
].out
# traceback : [Dnew]
# tokenSet : [V x Dprev x Dnew]
# topPaths : [V x Dprev x Dnew]
# +-----+
# |0 0 0|
# |0 0 0|-+
@ -787,25 +887,10 @@ write = [
else Splice (Constant (firstVal) : ConstantTensor (otherVals, (1 : N -1)), axis = axis1 /*, axis*/) # row vector: [ 0, -INF, -INF, -INF, ... ]
].out
inputsOut = Pass (modelAsTrained.inputSequence)
labelsOut = Pass (modelAsTrained.labelSequence)
decodeOut = Pass (decode)
#topNOut = Pass (topHyps)
PreviousOrDefault (x, initialValue) = # a delay node with initial value
BS.Boolean.If (BS.Loop.IsFirst (x),
/*then*/ initialValue,
/*else*/ BS.Loop.Previous (x))
#if BS.Loop.IsFirst (x)
#then initialValue
#else BS.Loop.Previous (x)
NextOrDefault (x, initialValue) = # a delay node with initial value
BS.Boolean.If (BS.Loop.IsLast (x),
/*then*/ initialValue,
/*else*/ BS.Loop.Next (x))
#if BS.Loop.IsLast (x)
#then initialValue
#else BS.Loop.Next (x)
].m2
model = if beamDepth == 0 then top1DecodingModel (modelAsTrained)
@ -814,8 +899,8 @@ write = [
].model)
#outputPath = "$OutputDir$/Write"
outputPath = "-" # "-" will write to stdout; useful for debugging
outputPath = $decodeOutputPath$
#outputPath = "-" # "-" will write to stdout; useful for debugging
#outputNodeNames = z1.out:labels1 # when processing one sentence per minibatch, this is the sentence posterior
#outputNodeNames = network.beamDecodingModel.z1.out:labels1 # when processing one sentence per minibatch, this is the sentence posterior
@ -825,13 +910,13 @@ write = [
#outputNodeNames = network.beamDecodingModel.labelsOut:network.beamDecodingModel.decodeOut #:topNOut
# joint:
outputNodeNames = labelsOut:decodeOut:network.beamDecodingModel.labelsOut:network.beamDecodingModel.decodeOut
outputNodeNames = inputsOut:labelsOut:decodeOut:network.beamDecodingModel.inputsOut:network.beamDecodingModel.labelsOut:network.beamDecodingModel.decodeOut
#outputNodeNames = labels1:network.beamDecodingModel.decode.out
#outputNodeNames = labels1:network.beamDecodingModel.expandedPathScores
#outputNodeNames = network.beamDecodingModel.pathScores:network.beamDecodingModel.traceback
# network.beamDecodingModel.tokenSetScores
# network.beamDecodingModel.pathScores
#outputNodeNames = network.beamDecodingModel.tokens.score:network.beamDecodingModel.traceback
# network.beamDecodingModel.topPathScores
# network.beamDecodingModel.tokens.score
# network.beamDecodingModel.traceback
# network.beamDecodingModel.expandedPathScores
@ -840,12 +925,12 @@ write = [
transpose = false
labelMappingFile = "$ModelDir$/vocab.wl"
#precisionFormat = "10"
sequenceEpilogue = "\t// %s\n"
#sequenceEpilogue = "\t// %s\n"
]
#traceNodeNamesReal = network.beamDecodingModel.pathScores:network.beamDecodingModel.tokenSetScores:network.beamDecodingModel.expandedPathScores:network.beamDecodingModel.backPointers
#traceNodeNamesCategory = network.beamDecodingModel.tokenSetScores
#traceNodeNamesSparse = network.beamDecodingModel.tokenSetScores:network.beamDecodingModel.backPointers:decoderOutputEmbedded.x
#traceNodeNamesReal = network.beamDecodingModel.tokens.score:network.beamDecodingModel.topPathScores:network.beamDecodingModel.expandedPathScores:network.beamDecodingModel.tokens.from
#traceNodeNamesCategory = network.beamDecodingModel.topPathScores
#traceNodeNamesSparse = network.beamDecodingModel.topPathScores:network.beamDecodingModel.tokens.from:decoderOutputEmbedded.x
minibatchSize = 8192 # choose this to be big enough for the longest sentence
# need to be small since models are updated for each minibatch
@ -895,7 +980,7 @@ write = [
labelType = "category"
labelDim = "$confVocabSize$"
labelMappingFile = "$ModelDir$/vocab.wl"
beginSequence = "</s>"
beginSequence = "$startSymbol$" # "</s>"
endSequence = "</s>"
#### Write definition ####
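The beam-search bookkeeping above (tokens.from as one-hot back-pointer columns, traceback as a right-to-left recurrence ending in the top-scoring hypothesis) reduces to the following plain-C++ sketch once the one-hot columns are replaced by integer indices. BeamStep, ExpandAndPrune and Traceback are illustrative names, not CNTK code.

#include <algorithm>
#include <cstddef>
#include <tuple>
#include <vector>

struct BeamStep
{
    std::vector<size_t> word;  // word[d]  = word chosen by hypothesis d at this step
    std::vector<size_t> from;  // from[d]  = predecessor hypothesis (rank in the previous step)
    std::vector<double> score; // score[d] = accumulated path score of hypothesis d
};

// Path expansion and pruning: add each word's log-likelihood to each surviving
// hypothesis' score and keep the beamDepth best (word, predecessor) pairs.
BeamStep ExpandAndPrune(const std::vector<std::vector<double>>& logLL, // logLL[d][v] = log P(v | history of hyp d)
                        const std::vector<double>& prevScore,          // accumulated score of hyp d so far
                        size_t beamDepth)
{
    std::vector<std::tuple<double, size_t, size_t>> expanded; // (score, word, from)
    for (size_t d = 0; d < logLL.size(); d++)
        for (size_t v = 0; v < logLL[d].size(); v++)
            expanded.emplace_back(prevScore[d] + logLL[d][v], v, d);
    std::partial_sort(expanded.begin(),
                      expanded.begin() + std::min(beamDepth, expanded.size()),
                      expanded.end(),
                      [](const auto& a, const auto& b) { return std::get<0>(a) > std::get<0>(b); });
    BeamStep step;
    for (size_t n = 0; n < beamDepth && n < expanded.size(); n++)
    {
        step.score.push_back(std::get<0>(expanded[n]));
        step.word.push_back(std::get<1>(expanded[n]));
        step.from.push_back(std::get<2>(expanded[n]));
    }
    return step;
}

// Traceback: the final token is the top-scoring hypothesis (rank 0); walking the
// back-pointers right to left recovers the best word sequence.
std::vector<size_t> Traceback(const std::vector<BeamStep>& steps)
{
    std::vector<size_t> output(steps.size());
    size_t d = 0;
    for (size_t t = steps.size(); t-- > 0; )
    {
        output[t] = steps[t].word[d]; // emit the word on the selected path
        d = steps[t].from[d];         // follow the back-pointer to the previous rank
    }
    return output;
}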

View file

@ -0,0 +1,48 @@
# TODO: must sort this out. For now, this is just shared stuff between training and decoding.
# these depend on the beamDepth parameter for now; fix this
TraceState (h, what) =
if enableTracing
then Transpose (Trace (Transpose (h), say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, onlyUpToRow=beamDepth*beamDepth, onlyUpToT=3, format=formatDense))
else h
TraceDense (h, what) =
if enableTracing
then Trace (h, say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, onlyUpToRow=21/*beamDepth*beamDepth*/, onlyUpToT=25, format=formatDense)
else h
TraceDenseTransposed (h, what) =
if enableTracing
then Trace (h, say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, onlyUpToRow=beamDepth*beamDepth, onlyUpToT=25, format=formatDenseTransposed)
else h
TraceOneHot (h, what) =
if enableTracing
then Trace (h, say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, /*onlyUpToRow=beamDepth*beamDepth, onlyUpToT=15,*/ format=formatOneHot)
else h
TraceSparse (h, what) =
if enableTracing
then Trace (h, say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, /*onlyUpToRow=beamDepth*beamDepth, onlyUpToT=3,*/ format=formatSparse)
else h
Trace (node, say='', logFrequency=traceFrequency, logFirst=10, logGradientToo=false, onlyUpToRow=100000000, onlyUpToT=100000000, format=[], tag='') = new ComputationNode [
operation = 'Trace' ; inputs = node
]
formatDense = [
type = "real"
transpose = false
precisionFormat = ".4"
]
formatDenseTransposed = [
type = "real"
transpose = true
precisionFormat = ".4"
]
formatOneHot = [
type = "category"
transpose = false
labelMappingFile = tracingLabelMappingFile
]
formatSparse = [
type = "sparse"
transpose = false
labelMappingFile = tracingLabelMappingFile
]

@ -1 +1 @@
Subproject commit f785679a6bd5cc089b138b3c6bcb68e4b1f345ae
Subproject commit f57be8b8caeddf385a44a14acc587f4e5168152d

View file

@ -17,6 +17,7 @@
#include "Config.h"
#include "SimpleEvaluator.h"
#include "SimpleOutputWriter.h"
#include "Criterion.h"
#include "BestGpu.h"
#include "ScriptableObjects.h"
#include "BrainScriptEvaluator.h"
@ -121,8 +122,8 @@ void DoCrossValidate(const ConfigParameters& config)
int traceLevel = config(L"traceLevel", "0");
size_t numMBsToShowResult = config(L"numMBsToShowResult", "100");
size_t maxSamplesInRAM = config(L"maxSamplesInRAM", (size_t)SIZE_MAX);
size_t numSubminiBatches = config(L"numSubminibatches", (size_t)1);
size_t maxSamplesInRAM = config(L"maxSamplesInRAM", (size_t)SIZE_MAX);
size_t numSubminiBatches = config(L"numSubminibatches", (size_t)1);
ConfigArray evalNodeNames = config(L"evalNodeNames", "");
vector<wstring> evalNodeNamesVector;
@ -131,7 +132,7 @@ void DoCrossValidate(const ConfigParameters& config)
evalNodeNamesVector.push_back(evalNodeNames[i]);
}
std::vector<std::vector<double>> cvErrorResults;
std::vector<std::vector<EpochCriterion>> cvErrorResults;
std::vector<std::wstring> cvModels;
DataReader cvDataReader(readerConfig);
@ -143,7 +144,7 @@ void DoCrossValidate(const ConfigParameters& config)
if (!fexists(cvModelPath))
{
fprintf(stderr, "model %ls does not exist.\n", cvModelPath.c_str());
fprintf(stderr, "Model %ls does not exist.\n", cvModelPath.c_str());
if (finalModelEvaluated || !fexists(modelPath))
continue; // file missing
else
@ -158,7 +159,7 @@ void DoCrossValidate(const ConfigParameters& config)
SimpleEvaluator<ElemType> eval(net, MPIWrapper::GetInstance(), numMBsToShowResult, traceLevel, maxSamplesInRAM, numSubminiBatches);
fprintf(stderr, "model %ls --> \n", cvModelPath.c_str());
fprintf(stderr, "Model %ls --> \n", cvModelPath.c_str());
auto evalErrors = eval.Evaluate(&cvDataReader, evalNodeNamesVector, mbSize[0], epochSize);
cvErrorResults.push_back(evalErrors);
@ -167,16 +168,14 @@ void DoCrossValidate(const ConfigParameters& config)
// find best model
if (cvErrorResults.size() == 0)
{
LogicError("No model is evaluated.");
}
std::vector<double> minErrors;
std::vector<int> minErrIds;
std::vector<double> evalErrors = cvErrorResults[0];
vector<double> minErrors;
vector<int> minErrIds;
vector<EpochCriterion> evalErrors = cvErrorResults[0];
for (int i = 0; i < evalErrors.size(); ++i)
{
minErrors.push_back(evalErrors[i]);
minErrors.push_back(evalErrors[i].Average());
minErrIds.push_back(0);
}
@ -185,9 +184,9 @@ void DoCrossValidate(const ConfigParameters& config)
evalErrors = cvErrorResults[i];
for (int j = 0; j < evalErrors.size(); j++)
{
if (evalErrors[j] < minErrors[j])
if (evalErrors[j].Average() < minErrors[j])
{
minErrors[j] = evalErrors[j];
minErrors[j] = evalErrors[j].Average();
minErrIds[j] = i;
}
}
@ -196,9 +195,7 @@ void DoCrossValidate(const ConfigParameters& config)
fprintf(stderr, "Best models:\n");
fprintf(stderr, "------------\n");
for (int i = 0; i < minErrors.size(); ++i)
{
fprintf(stderr, "Based on Err[%d]: Best model = %ls with min err %.8g\n", i, cvModels[minErrIds[i]].c_str(), minErrors[i]);
}
}
template void DoCrossValidate<float>(const ConfigParameters& config);
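For readers of the change above: EpochCriterion pairs an aggregated criterion value with the number of samples it was computed over, so cross-validation results can be compared by their per-sample average. A rough sketch of that idea (illustrative only, not the actual CNTK class definition):

#include <cstddef>
#include <limits>

struct EpochCriterionSketch
{
    double sumOfCriteria = 0; // criterion summed over all samples seen in the epoch
    size_t numSamples = 0;    // number of samples that sum covers

    double Average() const    // what the best-model loop above compares
    {
        return numSamples > 0 ? sumOfCriteria / numSamples
                              : std::numeric_limits<double>::infinity();
    }
};

With this, the selection loop simply tracks the minimum of evalErrors[j].Average() per evaluation node, as shown in the diff.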

View file

@ -74,6 +74,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
auto tensorShape = ProcessTensorShapeParameters(node, params, i, /*isImage=*/false, cnNodeType);
wstring dynamicAxis = node->GetOptionalParameter("dynamicAxis", "");
// TODO: Map dynamicAxis from name to node at this point, where that node is memoized inside NDL.
// first look for this node already existing in the network
// BUGBUG: How does this set the dimensions then?
if (m_net->NodeNameExists(name))

View file

@ -263,8 +263,8 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config)
InvalidArgument("Please specify parameters 'beginSequence' and 'endSequence'.");
if (!outputMappingFile.empty())
cerr << "Mapping file --> " << outputVocabFile << endl;
cerr << "Vocabulary file --> " << outputVocabFile << endl;
cerr << "Mapping file --> " << outputMappingFile << endl;
cerr << "Vocabulary file --> " << outputVocabFile << endl;
if (nbrCls > 0)
{
cerr << "Word-to-class map --> " << outputWord2Cls << endl;
@ -321,7 +321,10 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config)
str = str + endSequencePattern;
vstr = msra::strfun::split(str, "\t ");
for (int i = 1; i < vstr.size(); i++)
// This loop used to start at 1, assuming the begin and end symbols are the same.
// If they are not, I am now counting them both. No idea whether that is correct w.r.t. the class algorithm.
bool startWith1 = !beginSequence.empty() && beginSequence == endSequence;
for (size_t i = startWith1 ? 1 : 0; i < vstr.size(); i++)
v_count[vstr[i]]++;
}
fp.close();
@ -355,93 +358,108 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config)
vocabSize = wordCountLessCutoff;
}
// form classes
// Implements an algorithm by Mikolov --TODO: get the reference
wrd2cls.Resize(vocabSize, 1);
typedef pair<string, double> stringdouble;
unordered_map<string, double> removed; // note: std::map is supposedly faster
double unkCount = 0; // TODO: why double?
size_t size = 0;
size_t actual_vocab_size = vocabSize - 1;
priority_queue<stringdouble, vector<stringdouble>, compare_second<stringdouble>>
q(compare_second<stringdouble>(), vector<stringdouble>(v_count.begin(), v_count.end()));
while (size < actual_vocab_size && !q.empty()) // ==for (q=...; cond; q.pop())
{
size++;
string word = q.top().first;
double freq = q.top().second; // TODO: why double?
if (word == unkWord)
{
unkCount += freq;
actual_vocab_size++;
}
removed[q.top().first] = q.top().second;
q.pop();
}
while (!q.empty())
{
unkCount += q.top().second;
q.pop();
}
removed[unkWord] = unkCount;
m_count.resize(removed.size());
double total = 0;
double dd = 0;
if (nbrCls > 0)
{
for (const auto& iter : removed)
total += iter.second;
// form classes
// Implements an algorithm by Mikolov --TODO: get the reference
wrd2cls.Resize(vocabSize, 1);
for (const auto& iter : removed)
dd += sqrt(iter.second / total);
}
double df = 0;
size_t class_id = 0;
m_class.resize(removed.size());
priority_queue<stringdouble, vector<stringdouble>, compare_second<stringdouble>>
p(compare_second<stringdouble>(), vector<stringdouble>(removed.begin(), removed.end()));
while (!p.empty())
{
string word = p.top().first;
double freq = p.top().second;
typedef pair<string, double> stringdouble;
unordered_map<string, double> removed; // note: std::map is supposedly faster
double unkCount = 0; // TODO: why double?
size_t size = 0;
size_t actual_vocab_size = vocabSize - 1;
priority_queue<stringdouble, vector<stringdouble>, compare_second<stringdouble>>
q(compare_second<stringdouble>(), vector<stringdouble>(v_count.begin(), v_count.end()));
while (size < actual_vocab_size && !q.empty()) // ==for (q=...; cond; q.pop())
{
size++;
string word = q.top().first;
double freq = q.top().second; // TODO: why double?
if (word == unkWord)
{
unkCount += freq;
actual_vocab_size++;
}
removed[q.top().first] = q.top().second;
q.pop();
}
while (!q.empty())
{
unkCount += q.top().second;
q.pop();
}
removed[unkWord] = unkCount;
m_count.resize(removed.size());
double total = 0;
double dd = 0;
if (nbrCls > 0)
{
df += sqrt(freq / total) / dd;
if (df > 1)
df = 1;
for (const auto& iter : removed)
total += iter.second;
if (df > 1.0 * (class_id + 1) / nbrCls && class_id < nbrCls)
class_id++;
for (const auto& iter : removed)
dd += sqrt(iter.second / total);
}
size_t wid = m_words.size();
bool inserted = m_index.insert(make_pair(word, wid)).second;
if (inserted)
m_words.push_back(word);
double df = 0;
size_t class_id = 0;
m_class.resize(removed.size());
m_count[wid] = freq;
if (nbrCls > 0)
m_class[wid] = class_id;
p.pop();
priority_queue<stringdouble, vector<stringdouble>, compare_second<stringdouble>>
p(compare_second<stringdouble>(), vector<stringdouble>(removed.begin(), removed.end()));
while (!p.empty())
{
string word = p.top().first;
double freq = p.top().second;
if (nbrCls > 0)
{
df += sqrt(freq / total) / dd;
if (df > 1)
df = 1;
if (df > 1.0 * (class_id + 1) / nbrCls && class_id < nbrCls)
class_id++;
}
size_t wid = m_words.size();
bool inserted = m_index.insert(make_pair(word, wid)).second;
if (inserted)
m_words.push_back(word);
m_count[wid] = freq;
if (nbrCls > 0)
m_class[wid] = class_id;
p.pop();
}
assert(m_words.size() == m_index.size() && m_words.size() == m_class.size());
}
else // no classes
{
for (let& iter : v_count)
m_words.push_back(iter.first);
sort(m_words.begin(), m_words.end());
m_count.resize(m_words.size());
for (size_t i = 0; i < m_words.size(); i++)
m_count[i] = v_count.find(m_words[i])->second;
}
assert(m_words.size() == m_count.size());
// write the files
if (!outputMappingFile.empty())
{
msra::files::make_intermediate_dirs(s2ws(outputMappingFile));
ofstream ofmapping(outputMappingFile.c_str());
for (size_t i = 0; i < m_index.size(); i++)
ofmapping << m_words[i] << endl;
for (let& word : m_words)
ofmapping << word << endl;
ofmapping.close();
cerr << "Created label-mapping file with " << v_count.size() << " entries.\n";
}
msra::files::make_intermediate_dirs(s2ws(outputVocabFile));
ofstream ofvocab(outputVocabFile.c_str());
for (size_t i = 0; i < m_index.size(); i++)
for (size_t i = 0; i < m_words.size(); i++)
{
if (nbrCls > 0)
wrd2cls(i, 0) = (ElemType) m_class[i];
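As a side note on the class-formation block above (the frequency-based partitioning attributed to Mikolov): words sorted by descending count are assigned to nbrCls classes by accumulating normalized sqrt-frequency and advancing the class id at every 1/nbrCls quantile. A self-contained sketch of just that rule, with illustrative names (AssignWordClasses is not a CNTK function):

#include <cmath>
#include <cstddef>
#include <string>
#include <utility>
#include <vector>

// wordCounts must be sorted by descending count, as produced by the priority queue above.
std::vector<size_t> AssignWordClasses(const std::vector<std::pair<std::string, double>>& wordCounts,
                                      size_t nbrCls)
{
    double total = 0, dd = 0;
    for (const auto& wc : wordCounts) total += wc.second;
    for (const auto& wc : wordCounts) dd += std::sqrt(wc.second / total);

    std::vector<size_t> classes(wordCounts.size());
    double df = 0;
    size_t classId = 0;
    for (size_t i = 0; i < wordCounts.size(); i++)
    {
        df += std::sqrt(wordCounts[i].second / total) / dd; // cumulative normalized sqrt-frequency in [0,1]
        if (df > 1)
            df = 1;
        if (df > 1.0 * (classId + 1) / nbrCls && classId < nbrCls)
            classId++;                                      // advance class at each 1/nbrCls quantile
        classes[i] = classId;
    }
    return classes;
}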

View file

@ -36,6 +36,7 @@ ParameterTensor(dims, learningRateMultiplier = 1.0, init = 'uniform'/*|fixedValu
ConstantFromString(literal, tag='') = ParameterTensor((0)/*dim, will be inferred*/, init = 'fromLiteral', initFromLiteral = literal, learningRateMultiplier = 0.0)
DynamicAxis(tag='') = new ComputationNode [ operation = 'DynamicAxis' ; /*plus the function args*/ ]
Input(dims, dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'InputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]
# TODO: change from dynamicAxis by name to dynamicAxis being an actual object
SparseInput(dims, dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]
ImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'InputValue' ; isImage = true /*plus the function args*/ ]
SparseImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = true /*plus the function args*/ ]
@ -81,6 +82,7 @@ Times(A, B, outputRank=1, tag='') = new ComputationNode [ operation = 'Times' ;
Logistic(label, probability, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability) /*plus the function args*/ ]
WeightedLogistic(label, probability, instanceWeight, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability : instanceWeight) /*plus the function args*/ ]
ReconcileDynamicAxis(dataInput, layoutInput, tag='') = new ComputationNode [ operation = 'ReconcileDynamicAxis' ; inputs = (dataInput : layoutInput) /*plus the function args*/ ]
ReconcileMBLayout = ReconcileDynamicAxis # back compat
CastAs (type, data) = ReconcileDynamicAxis (data, type) # read as CastAs<type>(data) where the cast may consist of rearranging the data w.r.t. MBLayout or broadcasting across sequence items
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
Pooling(input, poolKind/*'max'|'average'*/, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Pooling' ; inputs = (input); pool = poolKind ; kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
@ -173,11 +175,14 @@ BS = [
Constants = [
Zero = ConstantTensor (0, (1))
One = ConstantTensor (1, (1))
OnesTensor (dims) = ConstantTensor (1, dims)
# BUGBUG: ZeroesLike() would recreate the full dimension of x. Well, no need if it considers broadcasting. But still wrong if we want to broadcast a vector of different tensor dim.
#ZeroesLike (x) = CastAs (x, Zero) // read: Cast<x>(Zero)
#OnesLike (x) = CastAs (x, One)
# CastAs() does not implement broadcasting
ZeroesLike (x) = RowSlice (0, 1, x) .* Zero // hack: get one row of input and multiply with zero
ZeroesLike (x) = SumColumnElements (RowSlice (0, 1, x) .* Zero) // hack: get one row of input and multiply with zero; double-hack: reduce extra tensor dims by SumCol
ZeroSequenceLike = ZeroesLike # TODO: this should yield a scalar sequence, while ZeroesLike should be a tensor
ZeroesLike1 (x) = x .* Zero # get a tensor of zeroes of same dim as x TODO: Do this as a C++ node (will be simple)
OnesLike (x) = ZeroesLike (x) + One
# is this like Sequences.Repeat?
True = 1
@ -216,6 +221,32 @@ Boolean = [
##############################################################################
Sequences = [
# broadcast a single-step sequence to a multi-step sequence
BroadcastSequenceAs (type, data1) = [ # type=example sequence with desired length (outside of a loop), data1=1 time step
ZeroSequenceLike (x) = RowSlice (0, 1, x) .* Constants.Zero # BUGBUG: SumColumnElements() has a CPU/GPU problem
index = /*Constants.*/ZeroSequenceLike (type) # create an index sequence [ 0 0 0 ... ] of target length
packedIndex = PackedIndex (data1, index) # convert into internal packed index w.r.t. 'data1'
out = GatherPacked (packedIndex, data1) # copy data1[0] to all elements, total length like 'type'
].out
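For intuition, a minimal C++ sketch of the broadcast-by-gather idea used above (hypothetical helper with plain vectors standing in for sequences; the real PackedIndex/GatherPacked nodes operate on packed minibatch layouts):

    #include <cstddef>
    #include <vector>

    // Broadcast a one-step sequence 'data1' to the length of 'type' by gathering
    // through an all-zero index sequence (the idea behind PackedIndex + GatherPacked).
    std::vector<float> BroadcastSequenceAs(const std::vector<float>& type,
                                           const std::vector<float>& data1 /*one step*/)
    {
        std::vector<size_t> index(type.size(), 0); // index sequence [ 0 0 0 ... ]
        std::vector<float> out(type.size());
        for (size_t t = 0; t < out.size(); t++)
            out[t] = data1[index[t]];              // copies data1[0] to every step
        return out;
    }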
# rolling window over past N samples
# returns a record [ value=..., valid=... ]
# This implementation is suboptimal in that it creates copies for the intermediate steps.
PastValueWindow (N, in) = [
delayLine[t:0..N-1] = [ # shift register for encoder, last N inputs
value = if t == 0
then in # delay 0: current value
else Loop.PastValue (0, in, timeStep=t)
valid = if t == 0
then Constants.One
else Constants.One - PastValue (1, Constants.ZeroesLike (in), timeStep=t, defaultHiddenActivation=1)
]
# delayLine[t].value = value of t steps in the past
# delayLine[t].valid = true if we had a value t steps in the past
value = Slice (-1, 0, axis=-1, SplitDimension (RowStack (array[0..N-1](t=>delayLine[t].value)), 1, N)) # [i, delay]
valid = Slice (-1, 0, axis=-1, SplitDimension (RowStack (array[0..N-1](t=>delayLine[t].valid)), 1, N)) # [i, delay]
]
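As a plain-loop illustration of the rolling window described above (a sketch only, assuming a single scalar-valued sequence; the BrainScript version builds it from PastValue nodes):

    #include <cstddef>
    #include <vector>

    // Rolling window over the past N samples of a sequence x:
    // value[t][d] = x[t-d] (0 if out of range), valid[t][d] = 1 if step t-d exists, else 0.
    void PastValueWindow(size_t N, const std::vector<float>& x,
                         std::vector<std::vector<float>>& value,
                         std::vector<std::vector<float>>& valid)
    {
        value.assign(x.size(), std::vector<float>(N, 0.0f));
        valid.assign(x.size(), std::vector<float>(N, 0.0f));
        for (size_t t = 0; t < x.size(); t++)
            for (size_t d = 0; d < N; d++)
                if (t >= d)
                {
                    value[t][d] = x[t - d]; // delay d: the value d steps in the past
                    valid[t][d] = 1.0f;     // we did have a value d steps back
                }
    }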
# fold left/right: Reduce entire sequence by applying binaryOp, e.g. FoldL (Plus, 0, input)
# LINQ calls this Aggregate; and may or may not specify the seed value; and allows a predicate
FoldL (binaryOp, x0, x) = _Fold (PastValue, binaryOp, x0, x)
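A minimal C++ sketch of the fold-left semantics (generic helper assumed for illustration, not part of this change):

    #include <functional>
    #include <vector>

    // FoldL reduces an entire sequence with a binary op and a seed, like LINQ's Aggregate:
    // FoldL(plus, 0, {1,2,3}) == ((0+1)+2)+3.
    template <typename T, typename BinaryOp>
    T FoldL(BinaryOp op, T x0, const std::vector<T>& x)
    {
        T acc = x0;
        for (const T& xi : x)
            acc = op(acc, xi); // left fold: apply op from the front
        return acc;
    }
    // e.g. FoldL(std::plus<float>(), 0.0f, input) sums the sequence.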
@ -312,8 +343,24 @@ Loop = [
_IsWithin (DelayFn/*PastValue or FutureValue*/, N, x) = DelayFn (0, Constants.ZeroesLike (x)/*false*/, timeStep=N, defaultHiddenActivation=Constants.True)
# opposite of Id's "next"
Previous (x) = PastValue (0, x, timeStep=1)
Next (x) = FutureValue (0, x, timeStep=1)
Previous (x) = PastValue (0, x, timeStep=1)
Next (x) = FutureValue (0, x, timeStep=1)
PreviousOrDefault (x, defaultValue=Constant (0)) = # a delay node with initial value --TODO: merge the two, then do in C++
[
flags = BS.Loop.IsFirst (x)
out = BS.Boolean.If (flags,
/*then*/ BS.Sequences.Scatter (flags, defaultValue),
/*else*/ Previous (x))
].out
NextOrDefault (x, defaultValue=Constant (0)) = # a delay node with initial value
[
flags = BS.Loop.IsLast (x)
out = BS.Boolean.If (flags,
/*then*/ BS.Sequences.Scatter (flags, defaultValue),
/*else*/ Next (x))
].out
] # see the PreviousOrDefault sketch below
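For reference, PreviousOrDefault reduced to plain sequence code (a sketch; NextOrDefault is the mirror image, substituting the default at the last step instead of the first):

    #include <cstddef>
    #include <vector>

    // Shift a sequence by one step and substitute a default value at the boundary
    // (where IsFirst() would be true).
    std::vector<float> PreviousOrDefault(const std::vector<float>& x, float defaultValue = 0.0f)
    {
        std::vector<float> out(x.size());
        for (size_t t = 0; t < x.size(); t++)
            out[t] = (t == 0) ? defaultValue  // first step: no past value, use the default
                              : x[t - 1];     // otherwise the previous step's value
        return out;
    }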
##############################################################################
@ -323,8 +370,9 @@ Loop = [
Parameters =
[
WeightParam (outputDim, inputDim) = Parameter (outputDim, inputDim, init='uniform', initValueScale=1, initOnCPUOnly=true, randomSeed=1)
BiasParam (dim) = ParameterTensor (dim, init='fixedValue', value=0.0)
ScalarParam() = Parameter (1, 1, init='fixedValue', value=0.0)
DiagWeightParam (outputDim) = ParameterTensor ((outputDim), init='uniform', initValueScale=1, initOnCPUOnly=true, randomSeed=1) # meant to be applied elementwise
BiasParam (dim) = ParameterTensor ((dim), init='fixedValue', value=0.0)
ScalarParam() = BiasParam (1)
# route input through an extra scalar weight, for stabilization
Stabilize (x, enabled=true) =
@ -350,16 +398,17 @@ RNNs =
// If we change this, we'd need to fix the LSTM end-to-end test.
LSTMP (inputDim, outputDim, cellDim, x, prevState, enableSelfStabilization=false) =
[
#inputDim = x.dim # get dimension from 'x' (if this works, we can remove the inputDim1 parameter)
_privateInnards = [ // encapsulate the privateInnards workings
dh = prevState.h // previous values
dc = prevState.c
// parameter macros--these carry their own weight matrices
B() = Parameters.BiasParam(cellDim)
B() = Parameters.BiasParam (cellDim)
W(v) = Parameters.WeightParam (cellDim, inputDim) * Parameters.Stabilize (v, enabled=enableSelfStabilization) // input-to-hidden
H(h) = Parameters.WeightParam (cellDim, outputDim) * Parameters.Stabilize (h, enabled=enableSelfStabilization) // hidden-to-hidden
C(c) = Parameters.WeightParam (cellDim, 1) .* Parameters.Stabilize (c, enabled=enableSelfStabilization) // cell-to-hidden (note: applied elementwise)
C(c) = Parameters.DiagWeightParam (cellDim) .* Parameters.Stabilize (c, enabled=enableSelfStabilization) // cell-to-hidden (note: applied elementwise)
// note: the W(x) here are all different, they all come with their own set of weights; same for H(dh), C(dc), and B()
it = Sigmoid (W(x) + B() + H(dh) + C(dc)) // input gate(t)
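For reference, the gate line above written out as a plain C++ sketch (hypothetical dense helper; W is [cellDim x inputDim], H is [cellDim x outputDim], the peephole weight and bias are [cellDim]):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // One LSTM gate as in the line above: it = sigmoid(W*x + b + H*dh + cPeep .* dc).
    std::vector<float> InputGate(const std::vector<std::vector<float>>& W,
                                 const std::vector<std::vector<float>>& H,
                                 const std::vector<float>& b,
                                 const std::vector<float>& cPeep,
                                 const std::vector<float>& x,
                                 const std::vector<float>& dh,
                                 const std::vector<float>& dc)
    {
        std::vector<float> it(b.size());
        for (size_t i = 0; i < it.size(); i++)
        {
            float z = b[i] + cPeep[i] * dc[i];                            // B() + elementwise C(dc)
            for (size_t j = 0; j < x.size(); j++)  z += W[i][j] * x[j];   // W(x)
            for (size_t j = 0; j < dh.size(); j++) z += H[i][j] * dh[j];  // H(dh)
            it[i] = 1.0f / (1.0f + std::exp(-z));                         // Sigmoid
        }
        return it;
    }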
@ -401,6 +450,28 @@ RNNs =
enableSelfStabilization1 = enableSelfStabilization // TODO: BS syntax needs to allow saying ^.enableSelfStabilization
lstmState = LSTMP (inputDim, outputDim, cellDim, x, prevState, enableSelfStabilization=enableSelfStabilization1)
].lstmState.h // that's the value we return
# same as RecurrentLSTMP but returns both h and c
RecurrentLSTMP2 (inputDim, outputDim, cellDim, x, enableSelfStabilization=false) =
[
prevState =
[
h = Loop.Previous (lstmState.h) # hidden state(t-1)
c = Loop.Previous (lstmState.c) # cell(t-1)
]
enableSelfStabilization1 = enableSelfStabilization // TODO: BS syntax needs to allow saying ^.enableSelfStabilization
lstmState = BS.RNNs.LSTMP (inputDim, outputDim, cellDim, x, prevState, enableSelfStabilization=enableSelfStabilization1)
].lstmState // that's the value we return
# a stack of recurrent LSTMs (unidirectional)
RecurrentLSTMP2Stack (input, inputDim, hiddenDims, cellDims, enableSelfStabilization=false) = [
useStabilizer = enableSelfStabilization
layer[i:0..Length (hiddenDims)-1] =
RecurrentLSTMP2 (if i == 0 then inputDim else hiddenDims[i-1],
hiddenDims[i], cellDims[i],
if i == 0 then input else layer[i-1].h,
enableSelfStabilization=useStabilizer)
].layer
]
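The stacking pattern of RecurrentLSTMP2Stack, shown as a small C++ sketch (hypothetical StepFn standing in for one recurrent LSTM layer):

    #include <cstddef>
    #include <vector>

    // Layer i consumes the input sequence for i == 0 and the previous layer's hidden
    // sequence otherwise, chaining the h outputs upward through the stack.
    template <typename Seq, typename StepFn>
    std::vector<Seq> StackLayers(const Seq& input, size_t numLayers, StepFn runLayer)
    {
        std::vector<Seq> layer(numLayers);
        for (size_t i = 0; i < numLayers; i++)
            layer[i] = runLayer(i, i == 0 ? input : layer[i - 1]); // chain h outputs
        return layer;
    }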
##############################################################################

View file

@ -184,7 +184,7 @@ void TestSequenceReader(const ConfigParameters& configBase)
MBLayoutPtr pMBLayout = make_shared<MBLayout>();
StreamMinibatchInputs matrices;
matrices.AddInput(featureNames[0], featuresMatrix, pMBLayout, TensorShape());
matrices.AddInput(labelNames[0], labelsMatrix , pMBLayout, TensorShape());
matrices.AddInput(labelNames[1] , labelsMatrix , pMBLayout, TensorShape());
auto start = std::chrono::system_clock::now();
int epochs = config("maxEpochs");

View file

@ -40,6 +40,7 @@ void DataReaderBase::SetMinibatchLayout(StreamMinibatchInputs& minibatch)
for (const auto& iter : minibatch)
{
assert(iter.second.pMBLayout == pMBLayout);
// TODO: This should be a runtime check, not an assert() that only runs in Debug.
UNUSED(iter);
}

View file

@ -148,47 +148,33 @@ void File::Init(const wchar_t* filename, int fileOptions)
// (wstring only for now; feel free to make this a template if needed)
/*static*/ wstring File::DirectoryPathOf(wstring path)
{
#ifdef WIN32
if (IsWindows8OrGreater())
#ifdef _WIN32
HRESULT hr;
path = msra::strfun::ReplaceAll<wstring>(path, L"/", L"\\"); // Win32 accepts forward slashes, but it seems that PathRemoveFileSpec() does not
if (IsWindows8OrGreater()) // PathCchRemoveFileSpec() only available on Windows 8+
{
typedef HRESULT(*PathCchRemoveFileSpecProc)(_Inout_updates_(_Inexpressible_(cchPath)) PWSTR, _In_ size_t);
HINSTANCE hinstLib = LoadLibrary(TEXT("api-ms-win-core-path-l1-1-0.dll"));
if (hinstLib == nullptr)
RuntimeError("DirectoryPathOf: LoadLibrary() unexpectedly failed.");
PathCchRemoveFileSpecProc PathCchRemoveFileSpec = reinterpret_cast<PathCchRemoveFileSpecProc>(GetProcAddress(hinstLib, "PathCchRemoveFileSpec"));
if (!PathCchRemoveFileSpec)
RuntimeError("DirectoryPathOf: GetProcAddress() unexpectedly failed.");
HINSTANCE hinstLib;
PathCchRemoveFileSpecProc ProcAdd;
BOOL fFreeResult = FALSE;
// this is the actual function call we care about
hr = PathCchRemoveFileSpec(&path[0], path.size());
FreeLibrary(hinstLib);
}
else // on Windows 7-, use older PathRemoveFileSpec() instead
hr = PathRemoveFileSpec(&path[0]);
hinstLib = LoadLibrary(TEXT("api-ms-win-core-path-l1-1-0.dll"));
if (hinstLib != nullptr)
{
ProcAdd = reinterpret_cast<PathCchRemoveFileSpecProc>(GetProcAddress(hinstLib, "PathCchRemoveFileSpec"));
if (NULL != ProcAdd)
{
auto hr = (ProcAdd)(&path[0], path.size());
if (hr == S_OK) // done
path.resize(wcslen(&path[0]));
else if (hr == S_FALSE) // nothing to remove: use .
path = L".";
}
else
{
LogicError("DirectoryPathOf: GetProcAddress() unexpectedly failed.");
}
fFreeResult = FreeLibrary(hinstLib);
}
else
{
LogicError("DirectoryPathOf: LoadLibrary() unexpectedly failed.");
}
}
else
{
auto hr = PathRemoveFileSpec(&path[0]);
if (hr != 0) // done
path.resize(wcslen(&path[0]));
else
path = L".";
}
RuntimeError("DirectoryPathOf: Path(Cch)RemoveFileSpec() unexpectedly failed with 0x%08x.", (unsigned int)hr);
#else
auto pos = path.find_last_of(L"/");
if (pos != path.npos)
@ -264,7 +250,7 @@ File::~File(void)
{
if (m_pcloseNeeded)
{
// TODO: Check for error code and throw if !std::uncaught_exception()
// TODO: Check for error code and throw if !std::uncaught_exception()
_pclose(m_file);
}
else if (m_file != stdin && m_file != stdout && m_file != stderr)

View file

@ -384,8 +384,8 @@ public:
{
// look for closing brace and also for another opening brace
// Inside strings we only accept the closing quote, and ignore any braces inside.
current = str.find_first_of(braceStack.back() == '"' ? "\"" : charsToLookFor, current + 1); //
if (current == string::npos) // none found: done or error
current = str.find_first_of(braceStack.back() == '"' ? "\"" : charsToLookFor, current + 1);
if (current == string::npos) // none found: error
break;
char brace = str[current];
// found the expected closing brace?
@ -406,7 +406,7 @@ public:
}
}
// hit end before everything was closed: error
RuntimeError("no closing bracket found in parameters");
RuntimeError("no closing %c found in parameters", braceStack.back());
//RuntimeError("no closing bracket found in parameters (opening bracket at offset %d)\n%s", (int)tokenStart, str.substr(tokenStart).c_str());
}
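A self-contained C++ sketch of the brace-scanning logic above (an approximation: it does not verify that the closing brace type matches the opening one):

    #include <stdexcept>
    #include <string>
    #include <vector>

    // Keep a stack of open braces; while inside a quoted string accept only the closing
    // quote and ignore braces. Returns the index of the brace closing the one at openPos.
    size_t FindMatchingClose(const std::string& str, size_t openPos)
    {
        std::vector<char> braceStack{str[openPos]};
        for (size_t current = openPos + 1; current < str.size(); current++)
        {
            char ch = str[current];
            if (braceStack.back() == '"')              // inside a string literal
            {
                if (ch == '"')
                    braceStack.pop_back();             // closing quote ends the string
            }
            else if (ch == '(' || ch == '[' || ch == '{' || ch == '"')
                braceStack.push_back(ch);              // nested opening brace or string
            else if (ch == ')' || ch == ']' || ch == '}')
                braceStack.pop_back();                 // closing brace (type match not checked)
            if (braceStack.empty())
                return current;                        // the outermost brace was closed
        }
        throw std::runtime_error("no closing brace found in parameters");
    }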

View file

@ -67,21 +67,21 @@ public:
Input() {} // some STL classes need this for general happiness
// helper for typecasting the matrix pointer
template<class ElemType>
template<class ElemType>
Matrix<ElemType>& GetMatrix(const wchar_t* name/*for debugging only*/ = L"(unknown)") const
{
{
assert(matrix);
auto* matrixp = dynamic_cast<Matrix<ElemType>*>(matrix.get());
if (!matrixp)
{
// print a rather rich error to track down a regression failure
auto isFloat = !!dynamic_cast<Matrix<float>*> (matrix.get());
if (!matrixp)
{
// print a rather rich error to track down a regression failure
auto isFloat = !!dynamic_cast<Matrix<float>*> (matrix.get());
auto isDouble = !!dynamic_cast<Matrix<double>*>(matrix.get());
LogicError("GetMatrix<%s>: Attempted to access input stream '%ls' with wrong precision, got %s {%d,%d} instead of %s.",
typeid(ElemType).name(), name, typeid(matrix.get()).name(), (int)isFloat, (int)isDouble, typeid(Matrix<ElemType>*).name());
}
return *matrixp;
}
return *matrixp;
}
};
private:

View file

@ -166,6 +166,7 @@ struct MBLayout
m_columnsValidityMask = std::move(other->m_columnsValidityMask);
m_writable = other->m_writable;
m_axisName = std::move(other->m_axisName);
}
@ -254,9 +255,11 @@ public:
size_t GetNumTimeSteps() const { return m_numTimeSteps; }
size_t GetNumParallelSequences() const { return m_numParallelSequences; }
const std::wstring GetAxisName() const { return m_axisName; }
void SetAxisName(const std::wstring& axisName) { m_axisName = axisName; }
// axis names are for now only a debugging aid
// In the future, there will be a mechanism to denote that axes are meant to be the same.
const wchar_t* GetAxisName() const { return m_axisName.c_str(); }
void SetAxisName(const std::wstring& name) { m_axisName = name; }
void SetUniqueAxisName(std::wstring name) // helper for constructing
{
static std::map<std::wstring, size_t> nameIndices;
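A sketch of what such a unique-axis-name helper can do with the static name map (an assumption about the intent, not the actual implementation):

    #include <map>
    #include <string>

    // Append a running index per base name so every MBLayout gets a distinct axis name.
    std::wstring MakeUniqueAxisName(std::wstring name)
    {
        static std::map<std::wstring, size_t> nameIndices;
        size_t index = nameIndices[name]++;        // 0 on first use, then 1, 2, ...
        if (index > 0)
            name += std::to_wstring(index);        // e.g. "features", "features1", ...
        return name;
    }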
@ -554,7 +557,9 @@ private:
mutable bool m_writable;
// The axis this MBLayout represents.
// For now only a string meant for debugging.
std::wstring m_axisName;
public:
// special accessor for sequence training --TODO: must be replaced by a different mechanism

View file

@ -776,8 +776,8 @@ void ComputationNetwork::DescribeNetworkUsingDot(list<ComputationArc>& arcs,
for (const auto& x : allnodes)
{
line.clear();
line = msra::strfun::wstrprintf(L" \"%ls\" [ label = \"%ls [%s%s]\\n%ls\" ] ;\n",
x->GetName().c_str(), x->GetName().c_str(), string(x->GetSampleLayout()).c_str(), x->HasMBLayout() ? " x *" : "",
line = msra::strfun::wstrprintf(L" \"%ls\" [ label = \"%ls [%s%ls]\\n%ls\" ] ;\n",
x->GetName().c_str(), x->GetName().c_str(), string(x->GetSampleLayout()).c_str(), x->GetMBLayoutAxisString().c_str(),
x->OperationName().c_str());
fstream << line;
}

View file

@ -52,9 +52,10 @@ public:
m_randomSeedOffset(0),
m_isCompiled(false),
m_areMatricesAllocated(false),
m_pMBLayoutOfNetwork(make_shared<MBLayout>(1,0, L"*")),
m_pMBLayoutOfNetwork(make_shared<MBLayout>(1, 0, L"*")),
m_environment(make_shared<ComputationEnvironment>())
{
//m_pMBLayoutOfNetwork->SetAxisName(L"T");
}
ComputationNetwork(DEVICEID_TYPE deviceId)
@ -706,10 +707,9 @@ public:
// evaluation
// -----------------------------------------------------------------------
// zeroes out all gradients except the root itself
// TODO: why not the root?
// zeroes out all gradients except the root itself (since its gradient is set from outside rather than propagated down)
// (Note that inside the nodes this only really sets a flag to do it later when needed, but that's not our concern.)
void ZeroGradients(const ComputationNodeBasePtr& rootNode)
void ZeroInputGradients(const ComputationNodeBasePtr& rootNode)
{
for (auto& node : GetAllNodesForRoot(rootNode))
node->ZeroGradientsOfInputs();

View file

@ -111,6 +111,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
// TODO: DiagTimes is also an alias of ElementTimes; current separate implementation is unnecessary.
else if (nodeType == L"PerDimMeanVarNormalizationNode") return New<PerDimMeanVarNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"PerDimMeanVarDeNormalizationNode") return New<PerDimMeanVarDeNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"ReconcileMBLayout") return New<ReconcileDynamicAxisNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"RowElementTimes") return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"RowSlice") return New<SliceNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"Scale") return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
@ -194,6 +195,7 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Creat
return net.AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(net.GetDeviceId(), paramName, tensorShape));
}
// TODO: change these to take an actual object instead of a name for dynamicAxis
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName)
{

View file

@ -43,18 +43,17 @@ void ComputationNetwork::ForwardProp(const ComputationNodeBasePtr rootNode)
GetNestedNetwork(rootNode)->ForwardProp(FrameRange(nullptr));
}
// set the gradient matrix of a node to a 1x1 matrix containing 1.0
// Returns false if the node is not a ComputationNode<ElemType>.
// set the gradient matrix of a (root) node to 1.0
// Returns false if the node is not a ComputationNode<ElemType>; see Backprop() below for intended use.
template <class ElemType>
static bool SetGradientToScalarOne(ComputationNodeBasePtr nodep)
static bool SetRootGradientToScalarOne(ComputationNodeBasePtr nodep)
{
auto node = dynamic_pointer_cast<ComputationNode<ElemType>>(nodep);
bool hasMatchingType = (node != nullptr);
if (hasMatchingType)
{
Matrix<ElemType>& grad = node->Gradient();
grad.Resize(node->Value());
grad.SetValue((ElemType) 1.0);
// reset the root gradient to 1
node->ResetGradient(1);
}
return hasMatchingType;
}
@ -69,13 +68,13 @@ void ComputationNetwork::Backprop(const ComputationNodeBasePtr rootNode) // trai
if (!Environment().IsTraining())
LogicError("Backprop: Requires the network to be in training mode.");
// reset all gradients to zero (actually, internally, this is lazy, but we don't care here)
ZeroGradients(rootNode);
// initialize root gradient with a scalar value of 1.0
if (!SetGradientToScalarOne<float>(rootNode) && !SetGradientToScalarOne<double>(rootNode))
if (!SetRootGradientToScalarOne<float>(rootNode) && !SetRootGradientToScalarOne<double>(rootNode))
LogicError("Backprop: Training criterion is neither ComputationNode<float> nor ComputationNode<double>.");
// reset all gradients below rootNode to zero (actually, internally, this is lazy, but we don't care here)
ZeroInputGradients(rootNode);
// backpropagate through the network
GetNestedNetwork(rootNode)->Backprop(FrameRange(nullptr), true, true);
}
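The backprop setup above in miniature (a sketch over a hypothetical flat tape; the real code works on the nested network and resets gradients lazily):

    #include <cstddef>
    #include <vector>

    // The root's gradient is seeded with 1.0 (d root / d root == 1), all other gradients
    // start at zero, and gradients are then accumulated from consumers to their inputs.
    // Nodes are assumed topologically sorted (inputs appear before their consumers).
    struct TapeNode { std::vector<size_t> inputs; std::vector<float> localGrads; };

    std::vector<float> Backprop(const std::vector<TapeNode>& tape, size_t rootIndex)
    {
        std::vector<float> grad(tape.size(), 0.0f); // the ZeroInputGradients step
        grad[rootIndex] = 1.0f;                     // the SetRootGradientToScalarOne step
        for (size_t n = tape.size(); n-- > 0;)      // walk the tape in reverse
            for (size_t k = 0; k < tape[n].inputs.size(); k++)
                grad[tape[n].inputs[k]] += grad[n] * tape[n].localGrads[k]; // chain rule
        return grad;
    }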
@ -134,6 +133,10 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
{
for (auto& node : m_nestedNodes)
{
#if 0
if (dynamic_pointer_cast<LearnableParameter<float>>(node))
dynamic_pointer_cast<ComputationNode<float>>(node)->DebugLogMinibatch();
#endif
if (node->IsOutOfDateWrtInputs())
{
node->BeginForwardProp();
@ -189,8 +192,9 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
for (auto& node : m_nestedNodes)
{
if (node->GetMBLayout() != GetMBLayout())
LogicError("Evaluate: all nodes inside a recurrent loop must have a layout that is identical; mismatch found for nodes '%ls' vs. '%ls'",
node->NodeName().c_str(), m_nestedNodes[0]->NodeName().c_str());
LogicError("Evaluate: All nodes inside a recurrent loop must have a layout that is identical; mismatch found for nodes '%ls' (%ls) vs. '%ls' (%ls)",
node ->NodeName().c_str(), node ->GetMBLayoutAxisString().c_str(),
m_nestedNodes[0]->NodeName().c_str(), m_nestedNodes[0]->GetMBLayoutAxisString().c_str());
}
// tell all that loop is about to commence
@ -525,7 +529,7 @@ void ComputationNetwork::ResetMBLayouts()
for (const auto& node : GetAllNodesForRoot(nullptr))
node->LinkToMBLayout(nullptr);
// DynamicAxis nodes are (apart from the network-wide MBLayout) the main holders of MBLayouts. Initialize them.
// DynamicAxis nodes are (apart from the soon-to-be-deprecated network-wide MBLayout) the main holders of MBLayouts. Initialize them.
// The only other instances are nodes that change the MBLayout, like WhereNode.
for (auto node : GetNodesWithType(L"DynamicAxis"))
node->LinkToMBLayout(make_shared<MBLayout>(1, 0, node->GetName()));
@ -533,6 +537,7 @@ void ComputationNetwork::ResetMBLayouts()
// This is now initialized inside of the Input nodes, with the proper connections.
for (auto node : InputNodes(nullptr))
{
// TODO: use if (!Is<ITakesDynamicAxis>(node))...
auto n = dynamic_pointer_cast<ITakesDynamicAxis>(node);
if (!n)
LogicError("Expected %ls to implement ITakesDynamicAxis, but it doesn't.", node->NodeDescription().c_str());
@ -704,7 +709,7 @@ size_t ComputationNetwork::ValidateNodes(list<ComputationNodeBasePtr> nodes, boo
{
unchanged = !ValidateNode(node, isFinalValidationPass);
string updatedPrototype = node->FormatOperationPrototype("");
#if 1 // print prototype in final validation pass
#if 0 // print prototype in final validation pass. Problematic for tracking down validation errors in loops.
unchanged;
if (isFinalValidationPass)
#else // print prototype upon every change (useful for debugging)

View file

@ -156,9 +156,16 @@ void ComputationNetwork::ConstructFromRoots(DEVICEID_TYPE deviceId, deque<Comput
// not in the cache yet: create it (or not if no such member)
void /*CustomConfigRecord::*/ ComputationNetwork::LazyCreateConfigMember(const wstring& id) const /*override*/
{
let iter = m_nameToNodeMap.find(id);
auto iter = m_nameToNodeMap.find(id);
if (iter == m_nameToNodeMap.end())
return; // no such node
{
// workaround to allow accessing members with '.' inside: change '.' to '_'
for (iter = m_nameToNodeMap.begin(); iter != m_nameToNodeMap.end(); iter++)
if (msra::strfun::ReplaceAll<wstring>(iter->first, L".", L"_") == id)
break;
if (iter == m_nameToNodeMap.end())
return; // no such node
}
const ComputationNodeBasePtr& node = iter->second;
// TODO: What is the expressionPath?
let& nodeName = node->NodeName(); // failFn lambda below holds a copy of the name for the error message. Let's not hold an unnecessary shared_ptr to the node, risking cycles & stuff.
@ -168,16 +175,20 @@ void /*CustomConfigRecord::*/ ComputationNetwork::LazyCreateConfigMember(const w
vector<wstring> /*IConfigRecord::*/ ComputationNetwork::GetMemberIds() const
{
vector<wstring> nodeNames;
set<wstring> nodeNames;
for (let& iter : m_nameToNodeMap)
{
const ComputationNodeBasePtr& node = iter.second;
const wstring& nodeName = node->NodeName();
if (nodeName.find_first_of(L".[$")) // only expose the top-level names
wstring nodeName = node->NodeName();
if (nodeName.find_first_of(L"$") != nodeName.npos) // skip non-top-level names
continue;
nodeNames.push_back(nodeName);
// temp solution for composites: use _ instead of .
nodeName = msra::strfun::ReplaceAll<wstring>(nodeName, L".", L"_");
if (nodeName.find_first_of(L".[") != nodeName.npos) // skip composite names
continue;
nodeNames.insert(nodeName);
}
return nodeNames;
return vector<wstring>(nodeNames.begin(), nodeNames.end());
}
// ===================================================================

View file

@ -31,8 +31,15 @@ void ComputationNode<ElemType>::Backprop(const FrameRange& fr, bool childrenInTh
// after nodes that propagate outside of the loop, and thus, in the last
// time step of the sequence, have not yet received a gradient from a parent
// and thus may not have had their gradient matrices allocated.
//if (m_needsGradient)
// LazyZeroGradient(); // set gradient to 0 if this is the first time
#if 1 // keep enabled once this works
#if 1 // log the cases where this is needed
if (m_needsGradient && !m_gradientInitialized)
//LogicError("%ls %ls operation: Backprop called with uninitialized gradient.", NodeName().c_str(), OperationName().c_str());
fprintf(stderr, "%ls %ls operation: Initializing gradient out of line.\n", NodeName().c_str(), OperationName().c_str());
#endif
if (m_needsGradient)
LazyZeroGradient(); // set gradient to 0 if this is the first time
#endif
if (fr.IsAllFrames() && IsPartOfLoop() && childrenInThisLoop)
LogicError("%ls %ls operation: Backprop called with whole-batch FrameRange on node that participates in a loop", NodeName().c_str(), OperationName().c_str());
@ -139,11 +146,11 @@ void ComputationNodeBase::ValidateBinaryZip(bool isFinalValidationPass, bool all
{
size_t dim1 = shape1[k];
// BUGBUG: We must consider the allowBroadcast flag here.
if (dims[k] == 1) // is [0] broadcasting?
if (dims[k] <= 1 && dim1 != 0) // is [0] broadcasting (1) or unspecified (0)?
dims[k] = dim1; // then use dimension we broadcast to
else if (dim1 == 1) // if [1] is broadcasting
; // dims is already correct
else if (isFinalValidationPass && dim1 != dims[k]) // no broadcasting: they must match
else if (dim1 <= 1 && dims[k] != 0) // if [1] is broadcasting or unspecified
; // then dims is already correct
else if (isFinalValidationPass && dim1 != dims[k]) // no broadcasting or unspecified: they must match
InvalidArgument("%ls: Input dimensions [%s] and [%s] are not compatible.",
NodeDescription().c_str(), string(shape0).c_str(), string(shape1).c_str());
}
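The per-axis dimension reconciliation above, isolated into a small sketch (0 meaning "unspecified", 1 meaning "broadcasts"):

    #include <cstddef>
    #include <stdexcept>

    // Reconcile one axis of two input shapes: a 0 or 1 dimension adopts the other side's
    // dimension; otherwise, in the final validation pass, the dimensions must agree.
    size_t ReconcileDim(size_t dim0, size_t dim1, bool isFinalValidationPass)
    {
        if (dim0 <= 1 && dim1 != 0)   // [0] broadcasts (1) or is unspecified (0): take [1]
            return dim1;
        if (dim1 <= 1 && dim0 != 0)   // [1] broadcasts or is unspecified: keep [0]
            return dim0;
        if (isFinalValidationPass && dim0 != dim1)
            throw std::invalid_argument("input dimensions are not compatible");
        return dim0;
    }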
@ -348,7 +355,7 @@ const std::string ComputationNodeBase::ShapeDescription() const
return msra::strfun::strprintf("[%s%s%ls]",
string(m_sampleLayout).c_str(),
HasMBLayout() ? " x " : "",
HasMBLayout() ? GetMBLayout()->GetAxisName().c_str() : L"");
HasMBLayout() ? GetMBLayout()->GetAxisName() : L"");
}
template <class ElemType>
@ -507,6 +514,7 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
{
if (formatChar == 'f') // print as real number
{
if (dval == 0) dval = fabs(dval); // clear the sign of a negative 0, which is produced inconsistently between CPU and GPU
fprintfOrDie(f, valueFormatString.c_str(), dval);
}
else if (formatChar == 'u') // print category as integer index
@ -707,7 +715,11 @@ using namespace Microsoft::MSR::CNTK;
template <>
shared_ptr<Object> MakeRuntimeObject<ComputationNodeBase>(const IConfigRecordPtr configp)
{
return NewComputationNodeFromConfig(configp);
let node = NewComputationNodeFromConfig(configp);
// temporarily disabling this, as it caused a test to fail:
//if (!node->Is<IRecurrentNode>())
// node->Validate(/*isFinalValidationPass*/false); // do an initial validation, so that we have access to dimensions
return node;
}
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<ComputationNodeBase> registerComputationNode(L"ComputationNode");

View file

@ -482,6 +482,18 @@ public:
const MBLayoutPtr& GetMBLayout() const { return m_pMBLayout; }
bool HasMBLayout() const { return !!m_pMBLayout; }
// for logging: get the string fragment for displaying the dimension
std::wstring GetMBLayoutAxisString() const
{
if (!HasMBLayout())
return L"";
const wstring& axisName = GetMBLayout()->GetAxisName();
if (axisName.empty())
return L" x *";
else
return L" x " + axisName;
}
protected: public: // ...the following should be protected, but nodes inquire about their children, requiring public access
size_t GetNumParallelSequences() const
@ -685,6 +697,14 @@ public:
return false;
}
// reset gradients of a node's inputs
// This really only clears the lazy-init flags (LazyZeroGradient() actually clears the values lazily).
void /*ComputationNodeBase::*/ ZeroGradientsOfInputs()
{
for (size_t i = 0; i < m_inputs.size(); i++)
Input(i)->m_gradientInitialized = false;
}
// -----------------------------------------------------------------------
// masking
// -----------------------------------------------------------------------
@ -695,8 +715,6 @@ public:
virtual void InvalidateMissingValueColumns(const FrameRange&) = 0;
virtual void InvalidateMissingGradientColumns(const FrameRange&) = 0;
virtual void ZeroGradientsOfInputs() = 0;
// -----------------------------------------------------------------------
// memory sharing
// -----------------------------------------------------------------------
@ -1218,7 +1236,7 @@ public:
return GradientFor(fr);
}
// tensor version of the above functions
TensorView<ElemType> DataTensorFor(Matrix<ElemType>& data, size_t rank, const FrameRange& fr)
TensorView<ElemType> DataTensorFor(const MatrixBasePtr& data, size_t rank, const FrameRange& fr)
{
try
{
@ -1231,11 +1249,11 @@ public:
}
TensorView<ElemType> ValueTensorFor(size_t rank, const FrameRange& fr)
{
return DataTensorFor(Value(), rank, fr);
return DataTensorFor(ValuePtr(), rank, fr);
}
TensorView<ElemType> GradientTensorFor(size_t rank, const FrameRange& fr)
{
return DataTensorFor(Gradient(), rank, fr);
return DataTensorFor(GradientPtr(), rank, fr);
}
// TODO: Are all these meant to read out a scalar? Then rename and verify dimensions.
@ -1300,6 +1318,7 @@ public:
void UpdateFunctionValuesSize()
{
UpdateDataSize(Value());
Value().CollapseDataLocationAfterWriting(); // actually before writing, should change the name
}
// -----------------------------------------------------------------------
@ -1375,14 +1394,8 @@ public:
// TODO: move to -Base (or -Network?)
void Backprop(const FrameRange& fr, bool childrenInThisLoop, bool childrenInOuterLoop) override;
// TODO: why of the inputs, and not the node itself?
void /*ComputationNodeBase::*/ ZeroGradientsOfInputs() override // clears the lazy-init flags (LazyZeroGradient() actually clears the values lazily)
{
for (size_t i = 0; i < m_inputs.size(); i++)
Input(i)->m_gradientInitialized = false;
}
// lazy resetting of gradient
// This performs the actual zeroing out.
void LazyZeroGradient()
{
if (!m_needsGradient)
@ -1391,8 +1404,14 @@ public:
if (m_gradientInitialized)
return;
ResetGradient(0);
}
// resize and reset this node's gradient to a given value (normally 0, 1 for root)
void ResetGradient(ElemType val)
{
UpdateDataSize(Gradient());
Gradient().SetValue(0);
Gradient().SetValue(val);
m_gradientInitialized = true;
}
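The lazy-reset pattern above, reduced to its core (a sketch; a bool flag plus a scalar stands in for the gradient matrix):

    // Clearing a gradient only resets a flag; the actual resize-and-fill happens the
    // first time the gradient is touched.
    struct LazyGradient
    {
        bool initialized = false;
        float value = 0.0f;                     // stands in for the gradient matrix

        void MarkDirty() { initialized = false; }   // the ZeroGradientsOfInputs step
        void Reset(float val)                       // the ResetGradient step
        {
            value = val;                        // resize + SetValue in the real code
            initialized = true;
        }
        void EnsureZeroed()                         // the LazyZeroGradient step
        {
            if (!initialized)
                Reset(0.0f);
        }
    };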
@ -1503,8 +1522,45 @@ public:
const std::string& sampleSeparator, std::string valueFormatString,
bool outputGradient = false) const;
// simple helper to log the content of a minibatch
void DebugLogMinibatch(bool outputGradient = false) const
{
fprintf(stderr, "<<<<<<\n"); // some prologue and epilogue so that we can use diff -c1 to see the node name
fprintf(stderr, "<<<<<<\n");
fprintf(stderr, "DebugLogMinibatch: <<<<< %ls%s >>>>>\n", NodeName().c_str(), outputGradient ? " (gradient)" : "");
WriteMinibatchWithFormatting(stderr, FrameRange(), 8, 10, false/*transpose*/, /*isCategoryLabel=*/false, /*isSparse=*/false, std::vector<std::string>(),
""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/,
"%.8f"/*valueFormatString*/, outputGradient);
fprintf(stderr, ">>>>>>\n");
fprintf(stderr, ">>>>>>\n");
}
void Trace()
{
#if 0
static const std::set<std::wstring> toLog{
L"labelSentenceStartEmbedded",
L"delayedDecoderFeedback.h.x",
L"delayedDecoderFeedback.h.flags",
L"delayedDecoderFeedback.h.out.thenVal.h.indexSequence.h.indexSequence.h",
L"delayedDecoderFeedback.h.out.thenVal.h.indexSequence.h",
L"delayedDecoderFeedback.h.out.thenVal.h",
L"delayedDecoderFeedback.h.out.PlusArgs[0]",
L"delayedDecoderFeedback.h.out.PlusArgs[1].ElementTimesArgs[0]",
L"delayedDecoderFeedback.h.out.elseVal",
L"delayedDecoderFeedback.h.out.PlusArgs[1]",
L"delayedDecoderFeedback.h.out",
L"delayedDecoderFeedback"
};
if (toLog.find(NodeName()) != toLog.end())
DebugLogMinibatch();
if (NodeName() == L"delayedDecoderFeedback.h.out")
{
static int i = 0;
if (++i == 2)
exit(1);
}
#endif
if (m_traceNodeValueReal || m_traceNodeValueAsCategoryLabel || m_traceNodeValueSparse)
{
fprintf(stderr, "Trace --> %s\n", FormatOperationPrototype("").c_str());
@ -1556,8 +1612,8 @@ public:
/*HasToString::*/ wstring ToString() const override
{
// we format it like "name : type rows x cols ( args )"
wstring result = /*TidyName*/ (NodeName()) + L" : " + OperationName();
result.append(msra::strfun::wstrprintf(L" [%s%s]", string(GetSampleLayout()).c_str(), HasMBLayout() ? " x *" : ""));
wstring result = NodeName() + L" : " + OperationName();
result.append(msra::strfun::wstrprintf(L" [%s%ls]", string(GetSampleLayout()).c_str(), GetMBLayoutAxisString().c_str()));
if (m_inputs.empty())
result.append(L" ()");
else
@ -1580,7 +1636,7 @@ public:
// for debugging purposes
void /*ComputationNodeBase::*/ PrintSelf(bool printMatrices = false) const
{
fprintf(stderr, "\n%ls[%s%s] = %ls", NodeName().c_str(), string(GetSampleLayout()).c_str(), HasMBLayout() ? " x *" : "", OperationName().c_str());
fprintf(stderr, "\n%ls[%s%ls] = %ls", NodeName().c_str(), string(GetSampleLayout()).c_str(), GetMBLayoutAxisString().c_str(), OperationName().c_str());
if (!IsLeaf())
{
@ -1589,7 +1645,7 @@ public:
{
if (i > 0)
fprintf(stderr, ", ");
fprintf(stderr, "%ls[%s%s] = %ls", m_inputs[i] ? m_inputs[i]->NodeName().c_str() : L"NULL", string(m_inputs[i]->GetSampleLayout()).c_str(), m_inputs[i]->HasMBLayout() ? " x *" : "", OperationName().c_str());
fprintf(stderr, "%ls[%s%ls] = %ls", m_inputs[i] ? m_inputs[i]->NodeName().c_str() : L"NULL", string(m_inputs[i]->GetSampleLayout()).c_str(), m_inputs[i]->GetMBLayoutAxisString().c_str(), OperationName().c_str());
}
fprintf(stderr, ")");
}
@ -1749,7 +1805,6 @@ public:
virtual void PrintSelf(bool) const override { NOT_IMPLEMENTED; }
virtual void ValidateInferInputDimsFrom(const TensorShape&) override { NOT_IMPLEMENTED; }
virtual void SetInput(const size_t, const Microsoft::MSR::CNTK::ComputationNodeBase::ComputationNodeBasePtr&) override { NOT_IMPLEMENTED; }
virtual void ZeroGradientsOfInputs(void) override { NOT_IMPLEMENTED; }
virtual void MaskMissingValueColumnsToZero(const Microsoft::MSR::CNTK::FrameRange&) override { NOT_IMPLEMENTED; }
virtual void MaskMissingGradientColumnsToZero(const Microsoft::MSR::CNTK::FrameRange&) override { NOT_IMPLEMENTED; }
virtual void InvalidateMissingValueColumns(const Microsoft::MSR::CNTK::FrameRange&) override { NOT_IMPLEMENTED; }
@ -1854,6 +1909,7 @@ protected:
using Base::GetInputSampleLayout; \
using Base::GetInputsFromConfig; \
using Base::GetMBLayout; \
using Base::GetMBLayoutAxisString; \
using Base::GetNumInputs; \
using Base::GetNumParallelSequences; \
using Base::GetNumTimeSteps; \
@ -1865,6 +1921,7 @@ protected:
using Base::Gradient; \
using Base::GradientAsMatrix; \
using Base::GradientFor; \
using Base::GradientPtr; \
using Base::GradientTensorFor; \
using Base::HasMBLayout; \
using Base::InferMBLayoutFromInputsForStandardCase; \
@ -1909,6 +1966,7 @@ protected:
using Base::ValidateUnaryMap; \
using Base::ValidateUnaryReduce; \
using Base::ValueFor; \
using Base::ValuePtr; \
using Base::ValueTensorFor; \
using Base::VerifyDataSize; \
using Base::VerifyDims; \

View file

@ -340,8 +340,8 @@ public:
size_t mapCount = m_mapCount.GetNumElements();
size_t weightCols = kW * kH * inDims.m_numChannels;
// check/infer input [0] (weights)
// BUGBUG: For now, we treat the weights as a 2D matrix. They should be a tensor proper.
// check/infer input [0] (weights)
// BUGBUG: For now, we treat the weights as a 2D matrix. They should be a tensor proper.
Input(0)->ValidateInferInputDimsFrom(TensorShape(mapCount, weightCols));
if (isFinalValidationPass && (Input(0)->GetAsMatrixNumCols() != weightCols || Input(0)->GetAsMatrixNumRows() != mapCount))
@ -358,31 +358,31 @@ public:
else
{
if (m_imageLayout != ImageLayoutKind::CHW)
{
{
InvalidArgument(
"%ls %ls supports only cuDNN (CHW) data layout. "
"Please specify imageLayout=\"cudnn\" in %ls node in your script "
"and make sure input data layout is CHW", NodeName().c_str(), OperationName().c_str(), NodeName().c_str());
}
}
inputShape = GetInputSampleLayout(inputIdx);
auto outDims = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
SetDims(outDims, HasMBLayout());
}
}
if (isFinalValidationPass)
{
if (m_convEng == nullptr)
{
{
auto geometry = std::make_shared<ConvolveGeometry>(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
m_convEng = ConvolutionEngine<ElemType>::Create(geometry, m_deviceId, m_imageLayout,
m_maxTempMemSizeInSamples, m_poolKind);
}
}
if (Input(0)->GetAsMatrixNumCols() != m_kernelShape.GetNumElements() ||
Input(0)->GetAsMatrixNumRows() != m_convEng->Geometry()->KernelCount())
{
{
LogicError("Convolution weight matrix %ls should have dimension [%d, %d] which is [kernelCount, kernelWidth * kernelHeight * inputChannels]",
Input(0)->NodeName().c_str(), (int)m_convEng->Geometry()->KernelCount(), (int)m_kernelShape.GetNumElements());
}
@ -587,7 +587,7 @@ public:
m_inputSizePerSample = inDims.m_width * inDims.m_height * inDims.m_numChannels;
SetDims(outDims.AsTensorShape(m_imageLayoutKind), true);
SetDims(outDims.AsTensorShape(m_imageLayoutKind), HasMBLayout());
if (isFinalValidationPass)
{

View file

@ -260,7 +260,7 @@ private:
TensorView<ElemType> OneSampleTensorFor(int inputIndex/*-1 for output*/, bool gradient/*instead of value*/, const FrameRange& fr)
{
auto input = inputIndex < 0 ? this : Input(inputIndex).get();
auto& data = gradient ? input->Gradient() : input->Value();
auto data = gradient ? input->GradientPtr() : input->ValuePtr();
size_t rank = input->GetSampleLayout().GetRank();
if (!Input(0)->HasMBLayout()) // left input is no MB data: run normally
return input->DataTensorFor(data, rank, fr);
@ -287,9 +287,9 @@ public:
// TensorView::DoMatrixProductOf() will reduce each tensor object into a 2D tensor (or fail if it cannot)
// and recreate actual Matrix objects (in case of sparse, they must be identical to the original tensor storage object).
// Transposition is applied after flattening into 2D, but only allowed if the input sample is 2D anyway.
auto input0 = OneSampleTensorFor(0, /*gradient=*/false, fr.AllowBroadcast());
auto input1 = OneSampleTensorFor(1, /*gradient=*/false, fr.AllowBroadcast());
auto output = OneSampleTensorFor(-1, /*gradient=*/false, fr);
auto input0 = OneSampleTensorFor(0, /*gradient=*/false, fr.AllowBroadcast());
auto input1 = OneSampleTensorFor(1, /*gradient=*/false, fr.AllowBroadcast());
auto output = OneSampleTensorFor(-1, /*gradient=*/false, fr);
output.AssignMatrixProductOf(false/*transC*/, input0, m_transpose/*transA*/, input1, false/*transB*/);
}
@ -318,16 +318,16 @@ public:
// If input data is sparse, then gradient is block sparse.
if (Input(1)->Value().GetMatrixType() == SPARSE && Input(0)->Gradient().GetMatrixType() == DENSE && Gradient().GetMatrixType() == DENSE)
Input(0)->Gradient().SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false);
auto input0Gradient = OneSampleTensorFor(0, /*gradient=*/true, fr.AllowBroadcast());
auto input1 = OneSampleTensorFor(1, /*gradient=*/false, fr.AllowBroadcast());
auto outputGradient = OneSampleTensorFor(-1, /*gradient=*/true, fr);
auto input0Gradient = OneSampleTensorFor(0, /*gradient=*/true, fr.AllowBroadcast());
auto input1 = OneSampleTensorFor(1, /*gradient=*/false, fr.AllowBroadcast());
auto outputGradient = OneSampleTensorFor(-1, /*gradient=*/true, fr);
input0Gradient.AddMatrixProductOf(m_transpose/*transC*/, outputGradient, false/*transA*/, input1, true/*transB*/);
}
else if (inputIndex == 1) // right derivative
{
auto input0 = OneSampleTensorFor(0, /*gradient=*/false, fr.AllowBroadcast());
auto input1Gradient = OneSampleTensorFor(1, /*gradient=*/true, fr.AllowBroadcast());
auto outputGradient = OneSampleTensorFor(-1, /*gradient=*/true, fr);
auto input0 = OneSampleTensorFor(0, /*gradient=*/false, fr.AllowBroadcast());
auto input1Gradient = OneSampleTensorFor(1, /*gradient=*/true, fr.AllowBroadcast());
auto outputGradient = OneSampleTensorFor(-1, /*gradient=*/true, fr);
input1Gradient.AddMatrixProductOf(false/*transC*/, input0, !m_transpose/*transA*/, outputGradient, false/*transB*/);
}
}
@ -819,16 +819,16 @@ public:
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto output = ValueTensorFor(rank, fr);
auto input = TensorView<ElemType>(Input(0)->Value(), GetTransposedTensorSliceFor(rank, fr));
auto output = ValueTensorFor( rank, fr);
auto input = TensorView<ElemType>(Input(0)->ValuePtr(), GetTransposedTensorSliceFor(rank, fr));
output.AssignCopyOf(input);
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto outputGradient = GradientTensorFor(rank, fr);
auto inputGradient = TensorView<ElemType>(Input(0)->Gradient(), GetTransposedTensorSliceFor(rank, fr));
auto outputGradient = GradientTensorFor( rank, fr);
auto inputGradient = TensorView<ElemType>(Input(0)->GradientPtr(), GetTransposedTensorSliceFor(rank, fr));
inputGradient.AddCopyOf(outputGradient);
}

View file

@ -51,7 +51,7 @@ public:
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
result.DoUnaryOpOf(0, input, 1, opForward);
result.DoUnaryOpOf(0, input, 1, opForward, opSum);
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
@ -61,8 +61,8 @@ public:
// get the args
size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one
auto sliceOutputGrad = GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one
// we expect a constant conditional expression here -- suppress the warning that leads to an error
// TODO: alternative: assign to a non-const variable and test that.
@ -70,7 +70,7 @@ public:
#pragma warning( disable : 4127 )
if (opType == UnaryGradient)
{
sliceInputGrad.DoUnaryOpOf(1, sliceOutputGrad, 1, opBackward);
sliceInputGrad.DoUnaryOpOf(1, sliceOutputGrad, 1, opBackward, opSum);
}
else
{
@ -78,7 +78,7 @@ public:
// Not possible for Cos().
auto sliceValue = (opType == BinaryWithOutputGradient) ? ValueTensorFor(rank, fr) : // using input or output value
Input(0)->ValueTensorFor(rank, fr);
sliceInputGrad.DoBinaryOpOf(1, sliceOutputGrad, sliceValue, 1, opBackward);
sliceInputGrad.DoBinaryOpOf(1, sliceOutputGrad, sliceValue, 1, opBackward, opSum);
}
#pragma warning( pop )
}
@ -194,6 +194,10 @@ public:
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
// move the target matrix to the target device, since below it is accessed as slices which cannot move
// TODO: once this gets reimplemented using TensorView, this will no longer be needed.
Input(0)->Value().TransferToDeviceIfNotThere(Value().GetDeviceId(), /*isBeingMoved=*/ false);
auto values = ValueFor(fr);
ForwardPropV(values, Input(0)->ValueFor(fr));
}

View file

@ -281,9 +281,9 @@ public:
DeclareConstructorFromConfigWithNumInputs(InvStdDevNode);
InvStdDevNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name),
m_mean(deviceId),
m_var(deviceId),
m_temp(deviceId)
m_mean(make_shared<Matrix<ElemType>>(deviceId)),
m_var (make_shared<Matrix<ElemType>>(deviceId)),
m_temp(make_shared<Matrix<ElemType>>(deviceId))
{
}
@ -295,21 +295,21 @@ public:
{
// reset accumulators
UpdateFunctionValuesSize();
m_mean.Resize(Value()); // mean accumulator normalized by #samples in it
m_var .Resize(Value()); // likewise the variance
m_temp.Resize(Value()); // and a temp
m_mean.SetValue(0); // reset the mean and var accumulators
m_var .SetValue(0);
m_mean->Resize(Value()); // mean accumulator normalized by #samples in it
m_var ->Resize(Value()); // likewise the variance
m_temp->Resize(Value()); // and a temp
m_mean->SetValue(0); // reset the mean and var accumulators
m_var ->SetValue(0);
Value().SetValue(0); // and clear m_value as well: We must do this here already to avoid a NaN check to flag while this is being estimated.
}
else // finalize
{
// m_value <- 1/stddev
ElemType sqrtFloor = 1e-10f;
m_var.InplaceTruncateBottom(sqrtFloor); // prevent too small variance (and negative square roots due to numeric inaccuracy)
m_var.InplaceSqrt();
m_var.ElementInverse();
Value().SetValue(m_var);
m_var->InplaceTruncateBottom(sqrtFloor); // prevent too small variance (and negative square roots due to numeric inaccuracy)
m_var->InplaceSqrt();
m_var->ElementInverse();
Value().SetValue(*m_var);
}
}
@ -361,16 +361,16 @@ public:
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<InvStdDevNode<ElemType>>(nodeP);
node->m_mean.SetValue(m_mean);
node->m_var.SetValue(m_var);
node->m_temp.SetValue(m_temp);
node->m_mean->SetValue(*m_mean);
node->m_var ->SetValue(*m_var);
node->m_temp->SetValue(*m_temp);
}
}
private:
Matrix<ElemType> m_mean;
Matrix<ElemType> m_var;
Matrix<ElemType> m_temp;
shared_ptr<Matrix<ElemType>> m_mean;
shared_ptr<Matrix<ElemType>> m_var;
shared_ptr<Matrix<ElemType>> m_temp;
};
template class InvStdDevNode<float>;

View file

@ -183,6 +183,10 @@ public:
virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
// move the target matrix to the target device, since below it is accessed as slices which cannot move
// TODO: change the accesses below to TensorView; then this will no longer be needed.
Input(0)->Gradient().TransferToDeviceIfNotThere(m_deviceId, /*isBeingMoved=*/ true);
assert(inputIndex == 0);
inputIndex;

View file

@ -74,30 +74,27 @@ template <class ElemType>
indexSequence.push_back(t);
// Note: The above accesses m_value directly on the CPU, putting it into BOTH state, possibly for other consumers as well.
}
input.CollapseDataLocationAfterWriting(); // BUGBUG: Move back, since BOTH state is broken at present.
// create a new MBLayout
let& outMBLayout = GetMBLayout();
outMBLayout->InitAsPackedSequences(SequenceLengthVector(sequences, indexSequences), /*temp*/m_placementBuffer, /*temp*/m_rowAllocationsBuffer);
// copy to output
vector<ElemType> buf(outMBLayout->GetNumCols(), numeric_limits<ElemType>::quiet_NaN()); // STL cannot easily avoid initializing, so we might as well init with NaN for gaps
for (size_t i = 0, j = 0; i < sequences.size();)
let size = min(sequences.size(), outMBLayout->GetAllSequences().size()); // no non-gap sequence has an index beyond this
for (size_t i = 0; i < size; i++)
{
if (sequences[i].seqId == GAP_SEQUENCE_ID) // gaps will keep the NaN
{
++i;
let& seq = outMBLayout->GetAllSequences()[i];
if (seq.seqId == GAP_SEQUENCE_ID) // gaps will keep the NaN
continue;
}
let& seq = outMBLayout->GetAllSequences()[j];
if (seq.seqId == GAP_SEQUENCE_ID) // When would we see this?
{
++j;
continue;
}
let& indexSequence = indexSequences[i];
for (size_t t = 0; t < seq.GetNumTimeSteps(); t++)
buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType)indexSequence[t];
++i;
++j;
}
// there may be dangling gaps at the end. Take the opportunity to verify this.
for (size_t i = size; i < sequences.size(); i++)
assert(sequences[i].seqId == GAP_SEQUENCE_ID);
for (size_t i = size; i < outMBLayout->GetAllSequences().size(); i++)
assert(outMBLayout->GetAllSequences()[i].seqId == GAP_SEQUENCE_ID);
// the result will be kept in CPUDEVICE, since most likely we will access it again in PackedIndexNode
Value().TransferToDeviceIfNotThere(CPUDEVICE, /*isBeingMoved=*/ true, /*emptyTransfer=*/ true, /*updatePreferredDevice=*/ true);
Value().SetValue(1, outMBLayout->GetNumCols(), CPUDEVICE, buf.data(), MatrixFormat::matrixFormatColMajor);
@ -107,7 +104,6 @@ template <class ElemType>
/*virtual*/ void WhereNode<ElemType>::BackpropToNonLooping(size_t /*inputIndex*/) /*override*/
{
// we cannot backprop through a condition
// Can we?
return;
}
@ -161,6 +157,8 @@ template <class ElemType>
result(0, jIndex) = (ElemType)jSource;
}
}
// Note: maybe this is no longer needed, now that we do the same inside UpdateFunctionValueSize() for all nodes.
result.CollapseDataLocationAfterWriting(); // BUGBUG: Move back, since BOTH state is broken at present.
}
template <class ElemType>

View file

@ -303,16 +303,16 @@ public:
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto output = ValueTensorFor(rank, fr);
let input = TensorView<ElemType>(Input(0)->Value(), GetInputSlice(rank, fr.AllowBroadcast()));
auto output = ValueTensorFor( rank, fr);
let input = TensorView<ElemType>(Input(0)->ValuePtr(), GetInputSlice(rank, fr.AllowBroadcast()));
output.AssignCopyOf(input);
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange& fr) override
{
size_t rank = DetermineElementwiseTensorRank();
let outputGrad = GradientTensorFor(rank, fr);
auto inputGrad = TensorView<ElemType>(Input(0)->Gradient(), GetInputSlice(rank, fr));
let outputGrad = GradientTensorFor( rank, fr);
auto inputGrad = TensorView<ElemType>(Input(0)->GradientPtr(), GetInputSlice(rank, fr.AllowBroadcast()));
inputGrad.AddCopyOf(outputGrad);
}
@ -413,7 +413,7 @@ public:
{
let input = Input(inputIndex)->ValueTensorFor(rank, fr.AllowBroadcast());
let outputSubSlice = NarrowToStripe(outputSlice, inputIndex);
auto output = TensorView<ElemType>(Value(), outputSubSlice);
auto output = TensorView<ElemType>(ValuePtr(), outputSubSlice);
output.AssignCopyOf(input);
}
}
@ -425,7 +425,7 @@ public:
auto inputGrad = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast());
let outputSubSlice = NarrowToStripe(outputSlice, inputIndex);
let outputGrad = TensorView<ElemType>(Gradient(), outputSubSlice);
let outputGrad = TensorView<ElemType>(GradientPtr(), outputSubSlice);
inputGrad.AddCopyOf(outputGrad);
}
@ -1074,7 +1074,10 @@ public:
else if (Input(0)->HasMBLayout())
{
if (!m_pMBLayout)
{
m_pMBLayout = make_shared<MBLayout>(); // mini-batch data: this generates a new layout
m_pMBLayout->SetUniqueAxisName(NodeName());
}
}
else
assert(!m_pMBLayout); // reshaping non-mini-batch data

View file

@ -692,7 +692,7 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::DoScatterColumnsOf(ElemType beta, cons
foreach_column(jIn, a)
{
auto jOutF = idx(0, jIn); // this is the column we copy/add into
if (jOutF < 0) // negative index means gap
if (jOutF < 0) // negative index means gap
continue;
size_t jOut = (size_t)jOutF;
if (jOut >= GetNumCols())
@ -4856,15 +4856,17 @@ void CPUMatrix<ElemType>::AssignScaledDifference(const ElemType alpha, const CPU
}
}
//c[ci,cj] += a[ai,aj]
// c[ci,cj] += a[ai,aj]
template <class ElemType>
void CPUMatrix<ElemType>::AddElementToElement(const CPUMatrix<ElemType>& a, const size_t ai, const size_t aj, CPUMatrix<ElemType>& c, const size_t ci, const size_t cj)
void CPUMatrix<ElemType>::AddElementToElement(ElemType beta, const CPUMatrix<ElemType>& a, const size_t ai, const size_t aj, CPUMatrix<ElemType>& c, const size_t ci, const size_t cj)
{
if (ai >= a.GetNumRows() || aj >= a.GetNumCols() ||
ci >= c.GetNumRows() || cj >= c.GetNumCols())
InvalidArgument("AddElementToElement: index out of range.");
c(ci, cj) += a(ai, aj);
ElemType us = beta ? beta * c(ci, cj) : 0; // do not multiply if beta is 0; c(ci, cj) could be a NaN
us += a(ai, aj);
c(ci, cj) = us;
}
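Why the beta == 0 case above avoids the multiplication: under IEEE 754, 0 * NaN is NaN, so "beta * c + a" would propagate a NaN sitting in an uninitialized target element even when beta is 0. A minimal demonstration:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        float c = std::nanf("");                   // uninitialized/garbage target element
        float a = 3.0f, beta = 0.0f;
        float wrong = beta * c + a;                // NaN: 0 * NaN is NaN
        float right = beta ? beta * c + a : a;     // 3.0: skip the multiply when beta == 0
        std::printf("%f %f\n", wrong, right);
        return 0;
    }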
////c[ci,cj] += a[ai,aj]
@ -4879,7 +4881,8 @@ void CPUMatrix<ElemType>::AddElementToElement(const CPUMatrix<ElemType>& a, cons
// c(ci, cj) += ((v < EPS_IN_LOG) ? LOG_OF_EPS_IN_LOG : log(v));
//}
//c[ci,cj] = a[ai,aj]
#if 0 // now done as AddElementToElement (beta=0)
// c[ci,cj] = a[ai,aj]
template <class ElemType>
void CPUMatrix<ElemType>::AssignElementToElement(const CPUMatrix<ElemType>& a, const size_t ai, const size_t aj, CPUMatrix<ElemType>& c, const size_t ci, const size_t cj)
{
@ -4889,6 +4892,7 @@ void CPUMatrix<ElemType>::AssignElementToElement(const CPUMatrix<ElemType>& a, c
c(ci, cj) = a(ai, aj);
}
#endif
/// <summary>c += alpha * (a-b)</summary>
/// a, b, and c must have the same dimensions
@ -6079,11 +6083,14 @@ static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType
// perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template <class ElemType>
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum) // TODO: enable the reduction ops
InvalidArgument("TensorOp: Unary reduction operations other than opSum not yet implemented.");
// TODO: Change the lambda to take a pointer and a number of elements, so that we can pass it 1 or 4 elements, in order for it to SSE-vectorize.
#define CaseUnaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
@ -6098,18 +6105,21 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
{
ForAllUnaryOps(CaseUnaryTensorOp);
default:
LogicError("TensorUnaryOp: Unknown op code %d.", (int) op);
LogicError("TensorOp: Unknown unary op code %d.", (int) op);
}
}
// perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template <class ElemType>
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum)
InvalidArgument("TensorOp (binary): The only permitted binary reduction operation is opSum.");
#define CaseBinaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 3>& pp) \
@ -6123,18 +6133,21 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
{
ForAllBinaryOps(CaseBinaryTensorOp);
default:
LogicError("TensorBinaryOp: Unknown op code %d.", (int) op);
LogicError("TensorOp: Unknown binary op code %d.", (int) op);
}
}
// perform ternary operation 'op' on a, b, and c giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template <class ElemType>
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum)
InvalidArgument("TensorOp: The only permitted ternary reduction operation is opSum.");
#define CaseTernaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 4>& pp) \
@ -6148,7 +6161,7 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
{
ForAllTernaryOps(CaseTernaryTensorOp);
default:
LogicError("TensorTernaryOp: Unknown op code %d.", (int) op);
LogicError("TensorOp: Unknown ternary op code %d.", (int) op);
}
}
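The contract these TensorOp overloads implement, for the unary case with opSum as the only supported reduction, shown as a sketch without strided addressing:

    #include <cstddef>
    #include <functional>
    #include <vector>

    // out[i] = beta * out[i] + alpha * sum over the reducing dimension of op(a[i][r]).
    // The full op table and the dims/strides machinery of the real code are omitted.
    void TensorUnaryOpSum(float beta, const std::vector<std::vector<float>>& a, float alpha,
                          const std::function<float(float)>& op, std::vector<float>& out)
    {
        for (size_t i = 0; i < out.size(); i++)
        {
            float sum = 0;
            for (size_t r = 0; r < a[i].size(); r++) // reducing dimension
                sum += op(a[i][r]);                  // elementwise op, then opSum reduction
            out[i] = beta * out[i] + alpha * sum;
        }
    }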

View file

@ -380,9 +380,7 @@ public:
static void AddScaledDifference(const CPUMatrix<ElemType>& alpha, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c); // alpha must be 1X1
static void AssignScaledDifference(const CPUMatrix<ElemType>& alpha, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c); // alpha must be 1X1
static void AddElementToElement(const CPUMatrix<ElemType>& a, const size_t ai, const size_t aj, CPUMatrix<ElemType>& c, const size_t ci, const size_t cj);
// static void AddLogElementToElement(const CPUMatrix<ElemType>& a, const size_t ai, const size_t aj, CPUMatrix<ElemType>& c, const size_t ci, const size_t cj);
static void AssignElementToElement(const CPUMatrix<ElemType>& a, const size_t ai, const size_t aj, CPUMatrix<ElemType>& c, const size_t ci, const size_t cj);
static void AddElementToElement(ElemType beta, const CPUMatrix<ElemType>& a, const size_t ai, const size_t aj, CPUMatrix<ElemType>& c, const size_t ci, const size_t cj);
static void MinusOneAt(CPUMatrix<ElemType>& c, const size_t position);
@ -397,15 +395,15 @@ public:
static void TensorShuffleScaleAndAdd(ElemType keepWeight, const CPUMatrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c);
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
void TensorOp(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 4>& reducingStrides);

View file

@ -413,10 +413,20 @@ public:
{
if (!m_sob.unique())
LogicError("%s: Cannot resize the matrix because it is a view.", function);
if (m_sob->HasExternalBuffer())
else if (m_sob->HasExternalBuffer())
LogicError("%s: Cannot resize the matrix because it is externally owned.", function);
}
// This is needed for Sparse Matrices to ensure they can write to the matrix. Note: writing to slices is not currently supported
// same as VerifyResizable() except for the error message. Could be folded into one.
void VerifyMigratable(const char* function) const
{
if (!m_sob.unique())
LogicError("%s: Cannot migrate the matrix between devices because it is a view.", function);
else if (m_sob->HasExternalBuffer())
LogicError("%s: Cannot migrate the matrix between devices because it is externally owned.", function);
}
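The uniqueness and external-buffer checks above protect storage that other views may still reference. A hedged, self-contained sketch of the same invariant; Storage, View, and the exact error text are illustrative stand-ins, not the CNTK classes:

#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

struct Storage { std::vector<float> data; };

struct View
{
    std::shared_ptr<Storage> sob; // shared storage object, analogous to m_sob above

    void VerifyMigratable(const char* function) const
    {
        // If the storage is shared, another view aliases this buffer; moving or
        // reallocating it here would silently invalidate that other view.
        if (sob.use_count() != 1)
            throw std::logic_error(std::string(function) + ": cannot migrate, storage is shared with another view");
    }
};

int main()
{
    View owner{ std::make_shared<Storage>() };
    owner.VerifyMigratable("step 1");         // fine: sole owner of the storage
    View alias = owner;                       // a second view onto the same storage
    try { owner.VerifyMigratable("step 2"); } // now throws
    catch (const std::logic_error&) { }
    (void) alias;
}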
// This is needed for Sparse Matrices to ensure they can write to the matrix. Note: writing to slices is not currently supported
void VerifyWritable(const char* function) const
{
if (!(m_sob->GetNumStorageRows() == m_numRows && m_sob->GetNumStorageCols() == m_numCols))

View file

@ -880,6 +880,7 @@ __global__ void _doGatherColumnsOf(ElemType* us, size_t usStride, const ElemType
return;
// id = i + jOut * usStride;
// Each thread processes one element of the output matrix.
CUDA_LONG i = id % usStride; // row index into 'us' and 'a'
CUDA_LONG jOut = id / usStride; // col index into 'us' and 'idx'
@ -892,7 +893,7 @@ __global__ void _doGatherColumnsOf(ElemType* us, size_t usStride, const ElemType
const ElemType& ra = a[ i + jIn * aStride ];
ElemType& rus = us[id/*i + jOut * usStride*/];
ElemType res = ra * alpha;
if (beta != 0)
res += rus * beta;
@ -909,7 +910,7 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::DoGatherColumnsOf(ElemType beta, const
if (beta == 0)
RequireSize(a.GetNumRows(), idx.GetNumCols()); // output has same column format as a, but number of columns comes from idx
else
this->VerifySize(a.GetNumRows(), idx.GetNumCols());
VerifySize(a.GetNumRows(), idx.GetNumCols());
if (idx.GetComputeDeviceId() != a.GetComputeDeviceId() || GetComputeDeviceId() != a.GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");
@ -935,6 +936,7 @@ __global__ void _doScatterColumnsOf(ElemType* us, size_t usStride, size_t usCols
return;
// id = i + jIn * aStride
// Each thread processes one element of a
CUDA_LONG i = id % aStride; // row index into 'a' and 'us'
CUDA_LONG jIn = id / aStride; // col index into 'a' and 'idx'
@ -943,7 +945,7 @@ __global__ void _doScatterColumnsOf(ElemType* us, size_t usStride, size_t usCols
return;
size_t jOut = (size_t)jOutF;
if (jOut >= usCols)
return; // actually a failure
return; // actually a failure --TODO: This should not be necessary. Why is it?
const ElemType& ra = a[id/*i + jIn * aStride*/];
ElemType& rus = us[ i + jOut * usStride ];
@ -3345,7 +3347,7 @@ template <class ElemType>
return;
a.PrepareDevice();
if (a.IsEmpty() || b.IsEmpty())
LogicError("ScaleAndAdd: one of the input matrices is empty.");
LogicError("ScaleAndAdd: One of the input matrices is empty.");
c.RequireSize(b.GetNumRows(), b.GetNumCols());
// if (a.GetNumRows() != 1 && a.GetNumCols() != 1) // a is not a col or row vector
if (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()) // dimensions match
@ -3396,7 +3398,7 @@ template <class ElemType>
_matrixVectorRowWiseAddWithThreadPerElem<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(a.Data(), b.Data(), c.Data(), alpha, m, n);
}
else
InvalidArgument("dimension of matrix c does not match dimension of matrix a.");
InvalidArgument("Dimension of matrix c does not match dimension of matrix a.");
}
}
@ -3423,11 +3425,11 @@ void GPUMatrix<ElemType>::AddScaledDifference(const ElemType alpha, const GPUMat
if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() &&
a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols()))
{
InvalidArgument("AddScaledDifference: a, b, and c must have same dimension.");
InvalidArgument("AddScaledDifference: a, b, and c must have same dimension.");
}
if (a.IsEmpty())
LogicError("AddScaledDifference: Input matrix a is empty.");
LogicError("AddScaledDifference: Input matrix a is empty.");
CUDA_LONG n = (CUDA_LONG) a.GetNumElements();
int blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock);
@ -3456,12 +3458,10 @@ void GPUMatrix<ElemType>::AssignScaledDifference(const ElemType alpha, const GPU
assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
{
InvalidArgument("AssignScaledDifference: a, b must have same dimension.");
}
InvalidArgument("AssignScaledDifference: a, b must have same dimension.");
if (a.IsEmpty())
LogicError("AssignScaledDifference: Input matrix a is empty.");
LogicError("AssignScaledDifference: Input matrix a is empty.");
if (&c != &a && &c != &b)
c.RequireSize(a.GetNumRows(), a.GetNumCols());
@ -3484,7 +3484,7 @@ void GPUMatrix<ElemType>::AddScaledDifference(const GPUMatrix<ElemType>& alpha,
{
assert(alpha.GetNumElements() == 1);
if (!(alpha.GetNumElements() == 1))
InvalidArgument("AddScaledDifference: alpha must be a 1X1 matrix.");
InvalidArgument("AddScaledDifference: alpha must be a 1X1 matrix.");
if (a.GetComputeDeviceId() != c.GetComputeDeviceId())
{
@ -3500,11 +3500,11 @@ void GPUMatrix<ElemType>::AddScaledDifference(const GPUMatrix<ElemType>& alpha,
if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() &&
a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols()))
{
InvalidArgument("AddScaledDifference: a, b, and c must have same dimension.");
InvalidArgument("AddScaledDifference: a, b, and c must have same dimension.");
}
if (a.IsEmpty())
LogicError("AddScaledDifference: Input matrix a is empty.");
LogicError("AddScaledDifference: Input matrix a is empty.");
CUDA_LONG n = (CUDA_LONG) a.GetNumElements();
int blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock);
@ -3524,7 +3524,7 @@ void GPUMatrix<ElemType>::AssignScaledDifference(const GPUMatrix<ElemType>& alph
{
assert(alpha.GetNumElements() == 1);
if (!(alpha.GetNumElements() == 1))
InvalidArgument("AddScaledDifference: alpha must be a 1X1 matrix.");
InvalidArgument("AddScaledDifference: alpha must be a 1X1 matrix.");
if (a.GetComputeDeviceId() != c.GetComputeDeviceId())
{
@ -3538,11 +3538,11 @@ void GPUMatrix<ElemType>::AssignScaledDifference(const GPUMatrix<ElemType>& alph
if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
{
InvalidArgument("AssignScaledDifference: a, b must have same dimension.");
InvalidArgument("AssignScaledDifference: a, b must have same dimension.");
}
if (a.IsEmpty())
LogicError("AssignScaledDifference: Input matrix a is empty.");
LogicError("AssignScaledDifference: Input matrix a is empty.");
c.RequireSize(a.GetNumRows(), a.GetNumCols());
@ -3555,16 +3555,15 @@ void GPUMatrix<ElemType>::AssignScaledDifference(const GPUMatrix<ElemType>& alph
//c[ci,cj] += a[ai,aj]
template <class ElemType>
void GPUMatrix<ElemType>::AddElementToElement(const GPUMatrix<ElemType>& a, const size_t ai, const size_t aj, GPUMatrix<ElemType>& c, const size_t ci, const size_t cj)
void GPUMatrix<ElemType>::AddElementToElement(ElemType beta, const GPUMatrix<ElemType>& a, const size_t ai, const size_t aj, GPUMatrix<ElemType>& c, const size_t ci, const size_t cj)
{
if (ai >= a.GetNumRows() || aj >= a.GetNumCols() ||
ci >= c.GetNumRows() || cj >= c.GetNumCols())
InvalidArgument("AddElementToElement: index out of range.");
InvalidArgument("AddElementToElement: Index out of range.");
a.PrepareDevice();
int blocksPerGrid = 1; // only one element --BUGBUG: then why not launch only 1 thread per block?
SyncGuard syncGuard;
_addElementToElement<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock /*BUGBUG: should be 1?*/, 0, t_stream>>>(a.Data(), (CUDA_LONG) a.LocateElement(ai, aj), c.Data(), (CUDA_LONG) c.LocateElement(ci, cj));
_addElementToElement<ElemType><<<1, 1, 0, t_stream>>>(beta, a.Data(), (CUDA_LONG) a.LocateElement(ai, aj), c.Data(), (CUDA_LONG) c.LocateElement(ci, cj));
}
template <class ElemType>
@ -4238,11 +4237,14 @@ static shared_ptr<GPUMatrix<ElemType>> GetOnesVector(size_t N, DEVICEID_TYPE dev
// perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This binds the N-ariness to a template parameter N, and gets the data pointers out from the matrix objects.
template <class ElemType>
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum) // TODO: enable the reduction ops
InvalidArgument("TensorOp: Unary reduction operations other than opSum not yet implemented.");
a.PrepareDevice();
if (a.GetComputeDeviceId() != GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");
@ -4293,11 +4295,14 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
// perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
template <class ElemType>
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum)
InvalidArgument("TensorOp: The only permitted binary reduction operation is opSum.");
a.PrepareDevice();
if (a.GetComputeDeviceId() != GetComputeDeviceId() || b.GetComputeDeviceId() != GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");
@ -4307,11 +4312,14 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
// perform ternary operation 'op' on a, b, and c giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
template <class ElemType>
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, const GPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, const GPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum)
InvalidArgument("TensorOp: The only permitted ternary reduction operation is opSum.");
a.PrepareDevice();
if (a.GetComputeDeviceId() != GetComputeDeviceId() || b.GetComputeDeviceId() != GetComputeDeviceId() || c.GetComputeDeviceId() != GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");

View file

@ -125,6 +125,7 @@ public:
using Base::SetFormat;
using Base::IsEmpty;
using Base::VerifyResizable;
using Base::VerifySize;
public:
using Base::VerifyWritable;
@ -461,7 +462,7 @@ public:
static void AddScaledDifference(const GPUMatrix<ElemType>& alpha, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c);
static void AssignScaledDifference(const GPUMatrix<ElemType>& alpha, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c);
static void AddElementToElement(const GPUMatrix<ElemType>& a, const size_t ai, const size_t aj, GPUMatrix<ElemType>& c, const size_t ci, const size_t cj);
static void AddElementToElement(ElemType beta, const GPUMatrix<ElemType>& a, const size_t ai, const size_t aj, GPUMatrix<ElemType>& c, const size_t ci, const size_t cj);
// minus one at a specific position
static void MinusOneAt(GPUMatrix<ElemType>& c, const size_t position);
@ -477,15 +478,15 @@ public:
static void TensorShuffleScaleAndAdd(ElemType keepWeight, const GPUMatrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c);
void TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
void TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
void TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
void TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
void TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, const GPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
void TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, const GPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 4>& reducingStrides);

View file

@ -2567,13 +2567,16 @@ __global__ void _assignScaledDifference(
template <class ElemType>
__global__ void _addElementToElement(
ElemType beta,
const ElemType* a, CUDA_LONG indexA,
ElemType* c, CUDA_LONG indexC)
{
CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
if (id > 0)
return;
c[indexC] += a[indexA];
//CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; // only one thread launched
//if (id > 0)
// return;
ElemType us = beta ? beta * c[indexC] : 0; // do not multiply if beta is 0, could be a NaN
us += a[indexA];
c[indexC] = us;
}
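The beta ? beta * c[indexC] : 0 guard above matters because IEEE arithmetic yields 0 * NaN = NaN: if the destination holds uninitialized or NaN data (as the debug Invalidate() fill produces) and beta is 0, multiplying would poison the result. A small self-contained illustration:

#include <cmath>
#include <cstdio>

int main()
{
    float c = NAN;     // e.g. a destination that was filled with NaNs for debugging
    float a = 3.0f, beta = 0.0f;

    float wrong = beta * c + a;               // NaN: 0 * NaN is NaN
    float right = (beta ? beta * c : 0) + a;  // 3: skip the multiply when beta == 0

    printf("wrong=%f right=%f\n", wrong, right);
}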
template <class ElemType>

View file

@ -110,61 +110,37 @@
} \
}
// version of helper macro that executes both CPU and GPU macros if 'MatrixPointerToCheck' location is BOTH
#define DISPATCH_MATRIX_ON_FLAG_USEBOTH_4BOTH(MatrixPointerToCheck, MatrixPointerToSetFlag, CPUDense, GPUDense, CPUSparse, GPUSparse) \
{ \
CurrentDataLocation curLocation = (MatrixPointerToCheck)->GetCurrentMatrixLocation(); \
if (curLocation == CurrentDataLocation::BOTH) \
{ \
if ((MatrixPointerToCheck)->GetMatrixType() != MatrixType::SPARSE) \
{ \
CPUDense; \
GPUDense; \
if (MatrixPointerToSetFlag != nullptr) \
((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::BOTH, MatrixType::DENSE); \
} \
else \
{ \
CPUSparse; \
GPUSparse; \
if (MatrixPointerToSetFlag != nullptr) \
((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::BOTH, MatrixType::SPARSE); \
} \
} \
else if (curLocation == CurrentDataLocation::GPU) \
{ \
if ((MatrixPointerToCheck)->GetMatrixType() != MatrixType::SPARSE) \
{ \
GPUDense; \
if (MatrixPointerToSetFlag != nullptr) \
((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::GPU, MatrixType::DENSE); \
} \
else \
{ \
GPUSparse; \
if (MatrixPointerToSetFlag != nullptr) \
((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::GPU, MatrixType::SPARSE); \
} \
} \
else if (curLocation == CurrentDataLocation::CPU) \
{ \
if ((MatrixPointerToCheck)->GetMatrixType() != MatrixType::SPARSE) \
{ \
CPUDense; \
if (MatrixPointerToSetFlag != nullptr) \
((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::CPU, MatrixType::DENSE); \
} \
else \
{ \
CPUSparse; \
if (MatrixPointerToSetFlag != nullptr) \
((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::CPU, MatrixType::SPARSE); \
} \
} \
else \
{ \
RuntimeError("Matrices do not exist in either CPU or GPU."); \
} \
// version of helper macro that executes both CPU and GPU macros if 'matrixPointer' location is BOTH
#define DISPATCH_MATRIX_ON_FLAG_USEBOTH_4BOTH(matrixPointer, CPUDense, GPUDense, CPUSparse, GPUSparse) \
{ \
auto curLocation = (matrixPointer)->GetCurrentMatrixLocation(); \
auto curMatrixType = (matrixPointer)->GetMatrixType(); \
if (curLocation == CurrentDataLocation::NONE) \
LogicError("Matrices do not exist in either CPU or GPU."); \
if (curMatrixType == MatrixType::UNDETERMINED) \
LogicError("Matrices must be SPARSE or DENSE."); \
if (curLocation != CurrentDataLocation::CPU) /*GPU or BOTH*/ \
{ \
if (curMatrixType == MatrixType::DENSE) \
{ \
GPUDense; \
} \
else \
{ \
GPUSparse; \
} \
} \
if (curLocation != CurrentDataLocation::GPU) /*CPU or BOTH*/ \
{ \
if (curMatrixType == MatrixType::DENSE) \
{ \
CPUDense; \
} \
else \
{ \
CPUSparse; \
} \
} \
}
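Written out as a plain function instead of a macro, the dispatch rule above reads roughly as follows. This is a sketch with simplified types; DispatchBoth, Location, and Kind are illustrative names, not the real Matrix interface:

#include <functional>
#include <stdexcept>

enum class Location { NONE, CPU, GPU, BOTH };
enum class Kind { UNDETERMINED, DENSE, SPARSE };

// Run the CPU and/or GPU action depending on where the data currently lives;
// BOTH runs both branches so the two copies stay in sync.
static void DispatchBoth(Location loc, Kind kind,
                         const std::function<void()>& cpuDense, const std::function<void()>& gpuDense,
                         const std::function<void()>& cpuSparse, const std::function<void()>& gpuSparse)
{
    if (loc == Location::NONE)
        throw std::logic_error("Matrices do not exist in either CPU or GPU.");
    if (kind == Kind::UNDETERMINED)
        throw std::logic_error("Matrices must be SPARSE or DENSE.");
    if (loc != Location::CPU) // GPU or BOTH
        (kind == Kind::DENSE ? gpuDense : gpuSparse)();
    if (loc != Location::GPU) // CPU or BOTH
        (kind == Kind::DENSE ? cpuDense : cpuSparse)();
}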
namespace Microsoft { namespace MSR { namespace CNTK {
@ -224,46 +200,85 @@ void Matrix<ElemType>::ShallowCopyFrom(const Matrix<ElemType>& other)
}
// Call this function after an update operation has created/set/updated the respective pointers.
// - location: BOTH|CPU|GPU
// - pass BOTH only if object will be read from; it is not allowed to write to both and then call this function.
// - if CPU/GPU and current is BOTH, then object was written to
// What gets updated:
// - m_currentDataLocation: from function argument
// - m_matrixType: from function argument unless UNDETERMINED in which case m_matrixType remains unmodified
// - m_baseMatrix: to one of current values of m_[GC]PU{Sparse,}Matrix
// This function is heavily overloaded in its responsibility.
// - first-time initialization, e.g. of a ColumnSlice (NONE->!NONE)
// - after creating a temp copy for reading
// - collapse temp copies after writing to one of them
// - setting matrixType if not set yet
template <class ElemType>
void Matrix<ElemType>::SetDataLocation(CurrentDataLocation location, MatrixType type) const
{
assert(location == CurrentDataLocation::CPU || location == CurrentDataLocation::GPU || location == CurrentDataLocation::BOTH);
// if the object used to live on BOTH, this will collapse it to 'location' (unless we actually wrote into BOTH)
// In that case, we do a sanity check here that the object is an owning Matrix,
// since otherwise the collapsing would go unnoticed by the original owner.
// In that case, we do a sanity check here that the object is a singleton view,
// since otherwise the collapsing would go unnoticed by the other views.
// The cases to cover:
// - original owner is BOTH, and this is called on the original owner
// -> The result was written to 'location' so we should collapse it to there.
// - original owning matrix is in BOTH state
// and a view inherits this
// -> FORBIDDEN to write into CPU or GPU since we cannot ensure we wrote into the one that will be read next
// - original owning matrix is CPU or GPU
// and a view onto it is put into BOTH state
// -> inefficient to read, since this is likely happening over again; so put the owner into BOTH state
// -> FORBIDDEN to write into CPU or GPU since we don't know the owner's true location and hence cannot ensure we wrote to the correct place
if (m_currentDataLocation == CurrentDataLocation::BOTH && location != CurrentDataLocation::BOTH)
// - everything is allowed on a singleton view
// - if the current state is BOTH:
// -> The result was written to 'location' so we should collapse it to there.
// - multiple views: much is forbidden since we cannot notify the other views on which one was written to
// - CPU <-> GPU: FORBIDDEN
// - BOTH -> CPU or GPU: current state is BOTH: location says which side was written to
// -> FORBIDDEN to write into
// - CPU or GPU -> BOTH: current state is CPU or GPU
// and a view onto it is put into BOTH state
// -> OK but inefficient to read, since this is likely happening over again; but we cannot put all views into BOTH state
// - BOTH -> BOTH:
// - read case: OK
// - write case: forbidden to call this function in this way
// - NONE -> !NONE: FORBIDDEN
if (m_currentDataLocation != location && // it is attempted to change location
m_currentDataLocation != CurrentDataLocation::NONE && // from a valid object (NONE means we are a fresh object from ColumnSlice())
location != CurrentDataLocation::BOTH) // and we are changing it not into a temporary copy for reading
{
// we get here if we wrote into this object that was BOTH but is no longer
if (!OwnBuffer()) // this means we should not have written into it in the first place, so fail now (better late than never)
// we get here if we wrote into this object that was BOTH but is no longer, or if we move between CPU and GPU
// Both is forbidden on shared views since we cannot inform other views of this change.
// Any *valid* pointer will now be checked for uniqueness. There may be mismatching left-over pointers kept around in case they should be revived.
if (m_matrixType == MatrixType::DENSE) // note: this checks the current type, not the new one passed in. Assumption: this tells us which pointers are valid.
{
assert(m_currentDataLocation == CurrentDataLocation::GPU || m_CPUMatrix);
assert(m_currentDataLocation == CurrentDataLocation::CPU || m_GPUMatrix);
if (m_currentDataLocation != CurrentDataLocation::GPU) ((BaseMatrix<ElemType>*)m_CPUMatrix.get())->VerifyMigratable("SetDataLocation [CPUMatrix]");
if (m_currentDataLocation != CurrentDataLocation::CPU) ((BaseMatrix<ElemType>*)m_GPUMatrix.get())->VerifyMigratable("SetDataLocation [GPUMatrix]");
}
else if (m_matrixType == MatrixType::SPARSE)
{
assert(m_currentDataLocation == CurrentDataLocation::GPU || m_CPUSparseMatrix);
assert(m_currentDataLocation == CurrentDataLocation::CPU || m_GPUSparseMatrix);
if (m_currentDataLocation != CurrentDataLocation::GPU) ((BaseMatrix<ElemType>*)m_CPUSparseMatrix.get())->VerifyMigratable("SetDataLocation [CPUSparseMatrix]");
if (m_currentDataLocation != CurrentDataLocation::CPU) ((BaseMatrix<ElemType>*)m_GPUSparseMatrix.get())->VerifyMigratable("SetDataLocation [GPUSparseMatrix]");
}
// TODO: Why do we need these typecasts? (without it will fail with "cannot access private member declared in class 'Microsoft::MSR::CNTK::CPUMatrix<float>'")
if (m_baseMatrix && !OwnBuffer()) // same arguments for externally owned matrices: Can read a temp but not write.
LogicError("SetDataLocation: A non-owning object cannot be written to in BOTH state.");
}
// passed validation: we can now update the state
m_currentDataLocation = location;
// set the matrix type if passed in
// update the matrix type if passed in
if (type != MatrixType::UNDETERMINED)
m_matrixType = type;
// set m_baseMatrix (if location is unchanged, this will not change the pointer)
// Note: m_currentDataLocation may also be CurrentDataLocation::BOTH, in which case the base matrix will be GPU.
if (m_matrixType == MatrixType::DENSE)
m_baseMatrix = ((m_currentDataLocation == CurrentDataLocation::CPU) ? dynamic_pointer_cast<BaseMatrix<ElemType>>(m_CPUMatrix) : dynamic_pointer_cast<BaseMatrix<ElemType>>(m_GPUMatrix));
else if (m_matrixType == MatrixType::SPARSE)
m_baseMatrix = ((m_currentDataLocation == CurrentDataLocation::CPU) ? dynamic_pointer_cast<BaseMatrix<ElemType>>(m_CPUSparseMatrix) : dynamic_pointer_cast<BaseMatrix<ElemType>>(m_GPUSparseMatrix));
// Note: Typecasts are necessary since C++ cannot figure out the common base type (probably due to shared_ptr).
// sanity check
if (!m_baseMatrix && m_matrixType != MatrixType::UNDETERMINED)
LogicError("SetDataLocation: new m_baseMatrix must not be NULL.");
LogicError("SetDataLocation: New m_baseMatrix must not be NULL.");
}
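The long comment and guard above boil down to a small rule: the extra migration checks are needed only when the location really changes, the object is not freshly created, and the change is not just adding a temporary copy for reading. A hedged restatement as a standalone predicate (NeedsMigrationCheck and Loc are illustrative names):

enum class Loc { NONE, CPU, GPU, BOTH };

// Mirrors the guard above: a location change triggers the extra uniqueness /
// ownership checks unless the object is fresh (NONE, e.g. straight out of
// ColumnSlice()) or we are only adding a temporary copy for reading (-> BOTH).
static bool NeedsMigrationCheck(Loc current, Loc next)
{
    return current != next        // the location actually changes
        && current != Loc::NONE   // fresh object: first-time initialization is fine
        && next != Loc::BOTH;     // creating a temp copy for reading is harmless
}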
//this is a private constructor only used internally to initialize a blank matrix
@ -908,9 +923,8 @@ void Matrix<ElemType>::SwitchToMatrixType(MatrixType newMatrixType, MatrixFormat
if (keepValues)
CopyElementsFromDenseToSparse(*m_CPUMatrix, *m_CPUSparseMatrix);
m_CPUMatrix = nullptr;
SetDataLocation(CPU, SPARSE);
m_CPUMatrix = nullptr;
}
else if (newMatrixType == MatrixType::DENSE)
{
@ -922,9 +936,8 @@ void Matrix<ElemType>::SwitchToMatrixType(MatrixType newMatrixType, MatrixFormat
if (keepValues)
m_CPUMatrix->SetValue(m_CPUSparseMatrix->CopyColumnSliceToDense(0, GetNumCols()));
m_CPUSparseMatrix = nullptr;
SetDataLocation(CPU, DENSE);
m_CPUSparseMatrix = nullptr;
}
else
LogicError("SwitchToMatrixType: Unexpected/invalid new matrix type");
@ -941,9 +954,8 @@ void Matrix<ElemType>::SwitchToMatrixType(MatrixType newMatrixType, MatrixFormat
if (keepValues)
m_GPUSparseMatrix->SetValue(*m_GPUMatrix);
m_GPUMatrix = nullptr;
SetDataLocation(GPU, SPARSE);
m_GPUMatrix = nullptr;
}
else if (newMatrixType == MatrixType::DENSE)
{
@ -955,9 +967,8 @@ void Matrix<ElemType>::SwitchToMatrixType(MatrixType newMatrixType, MatrixFormat
if (keepValues)
m_GPUSparseMatrix->CopyToDenseMatrix(*m_GPUMatrix);
m_GPUSparseMatrix = nullptr;
SetDataLocation(GPU, DENSE);
m_GPUSparseMatrix = nullptr;
}
else
LogicError("SwitchToMatrixType: Unexpected/invalid new matrix type");
@ -977,25 +988,25 @@ void Matrix<ElemType>::CopyElementsFromDenseToSparse(CPUMatrix<ElemType>& from,
template <class ElemType>
ElemType Matrix<ElemType>::Get00Element() const
{
DISPATCH_MATRIX_ON_FLAG(this,
nullptr,
return m_CPUMatrix->Get00Element(),
return m_GPUMatrix->Get00Element(),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
DISPATCH_MATRIX_ON_FLAG(this, nullptr,
{ return m_CPUMatrix->Get00Element(); },
{ return m_GPUMatrix->Get00Element(); },
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; });
}
// const operator(,)
template <class ElemType>
const ElemType Matrix<ElemType>::operator()(const size_t row, const size_t col) const
{
DISPATCH_MATRIX_ON_FLAG_USECPU_4BOTH(this,
nullptr,
return m_CPUMatrix->operator()(row, col),
_transferFromDeviceToDevice(GetDeviceId(), CPUDEVICE, false); return m_CPUMatrix->operator()(row, col),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
DISPATCH_MATRIX_ON_FLAG_USECPU_4BOTH(this, nullptr,
{ return m_CPUMatrix->operator()(row, col); },
{ _transferFromDeviceToDevice(GetDeviceId(), CPUDEVICE, false); return m_CPUMatrix->operator()(row, col); },
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; });
}
// non-const operator(,)
//WARNING: This function is very slow for GPUs since it requires copying values between CPUs and GPUs.
//In addition, if ColumnSlice is used after this function but before the values are copied back to GPU
//the operation will fail since the memory is not managed by the slice.
@ -1427,22 +1438,18 @@ void Matrix<ElemType>::NormalGrad(Matrix<ElemType>& gradients,
}
}
//both this and gradients will be changed
// both 'this' and gradients will be changed
template <class ElemType>
ElemType Matrix<ElemType>::Adagrad(Matrix<ElemType>& gradients, const bool needAveMultiplier)
{
DecideAndMoveToRightDevice(*this, gradients);
DISPATCH_MATRIX_ON_FLAG(&gradients,
&gradients,
return m_CPUMatrix->Adagrad(*gradients.m_CPUMatrix, needAveMultiplier);
SetDataLocation(CPU),
return m_GPUMatrix->Adagrad(*gradients.m_GPUMatrix, needAveMultiplier);
SetDataLocation(GPU),
return gradients.m_CPUSparseMatrix->Adagrad(*m_CPUMatrix, needAveMultiplier);
SetDataLocation(CPU),
return gradients.m_GPUSparseMatrix->Adagrad(*m_GPUMatrix, needAveMultiplier);
SetDataLocation(GPU));
DISPATCH_MATRIX_ON_FLAG(&gradients, &gradients,
{ ElemType rc = m_CPUMatrix->Adagrad(*gradients.m_CPUMatrix, needAveMultiplier); SetDataLocation(CPU); return rc; },
{ ElemType rc = m_GPUMatrix->Adagrad(*gradients.m_GPUMatrix, needAveMultiplier); SetDataLocation(GPU); return rc; },
{ ElemType rc = gradients.m_CPUSparseMatrix->Adagrad(*m_CPUMatrix, needAveMultiplier); SetDataLocation(CPU); return rc; },
{ ElemType rc = gradients.m_GPUSparseMatrix->Adagrad(*m_GPUMatrix, needAveMultiplier); SetDataLocation(GPU); return rc; });
// Note: Since both 'this' and gradients are changed, we must call SetDataLocation() on 'this' as well.
}
template <class ElemType>
@ -1458,14 +1465,12 @@ void Matrix<ElemType>::FSAdagrad(size_t mbSize, Matrix<ElemType>& gradients, Mat
aggadagradsqrframes = adagradkeepweight * aggadagradsqrframes + (1.0f - adagradkeepweight) * mbSize;
const ElemType targetadagradavdenom_x_sqrtadagradsqrframes = static_cast<ElemType>(targetadagradavdenom * sqrt(aggadagradsqrframes));
DISPATCH_MATRIX_ON_FLAG(&gradients,
&gradients,
m_CPUMatrix->FSAdagrad(*gradients.m_CPUMatrix, *functionValues.m_CPUMatrix, learnRatePerSample, momentum, adagradkeepweight, targetadagradavdenom_x_sqrtadagradsqrframes);
SetDataLocation(CPU),
m_GPUMatrix->FSAdagrad(*gradients.m_GPUMatrix, *functionValues.m_GPUMatrix, learnRatePerSample, momentum, adagradkeepweight, targetadagradavdenom_x_sqrtadagradsqrframes);
SetDataLocation(GPU),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
DISPATCH_MATRIX_ON_FLAG(&gradients, &gradients,
{ m_CPUMatrix->FSAdagrad(*gradients.m_CPUMatrix, *functionValues.m_CPUMatrix, learnRatePerSample, momentum, adagradkeepweight, targetadagradavdenom_x_sqrtadagradsqrframes); SetDataLocation(CPU); },
{ m_GPUMatrix->FSAdagrad(*gradients.m_GPUMatrix, *functionValues.m_GPUMatrix, learnRatePerSample, momentum, adagradkeepweight, targetadagradavdenom_x_sqrtadagradsqrframes); SetDataLocation(GPU); },
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; });
// Note: Since both 'this' and gradients are changed, we must call SetDataLocation() on 'this' as well.
}
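The running-average lines near the top of FSAdagrad above keep an exponential moving average of the minibatch size and scale the AdaGrad denominator by its square root. A hedged numeric sketch of just that arithmetic, with made-up values for keepWeight, the minibatch sizes, and targetAvDenom:

#include <cmath>
#include <cstdio>

int main()
{
    double keepWeight = 0.9, aggFrames = 0.0, targetAvDenom = 1.0; // hypothetical values
    for (int mbSize : { 64, 64, 128 })                             // hypothetical minibatch sizes
    {
        aggFrames = keepWeight * aggFrames + (1.0 - keepWeight) * mbSize; // running average of frames per minibatch
        double denom = targetAvDenom * std::sqrt(aggFrames);              // the targetadagradavdenom_x_sqrtadagradsqrframes factor
        printf("aggFrames=%.2f denom=%.3f\n", aggFrames, denom);
    }
}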
template <class ElemType>
@ -1479,14 +1484,12 @@ ElemType Matrix<ElemType>::RmsProp(Matrix<ElemType>& gradients,
{
DecideAndMoveToRightDevice(*this, gradients);
DISPATCH_MATRIX_ON_FLAG(this,
&gradients,
return m_CPUMatrix->RmsProp(*gradients.m_CPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier);
SetDataLocation(CPU),
return m_GPUMatrix->RmsProp(*gradients.m_GPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier);
SetDataLocation(GPU),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
DISPATCH_MATRIX_ON_FLAG(this, &gradients,
{ ElemType rc = m_CPUMatrix->RmsProp(*gradients.m_CPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier); SetDataLocation(CPU); return rc; },
{ ElemType rc = m_GPUMatrix->RmsProp(*gradients.m_GPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier); SetDataLocation(GPU); return rc; },
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; });
// Note: Since both 'this' and gradients are changed, we must call SetDataLocation() on 'this' as well.
}
template <class ElemType>
@ -1494,12 +1497,11 @@ void Matrix<ElemType>::Reshape(const size_t numRows, const size_t numCols)
{
if (numRows != GetNumRows() || numCols != GetNumCols())
{
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->Reshape(numRows, numCols),
m_GPUMatrix->Reshape(numRows, numCols),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->Reshape(numRows, numCols));
DISPATCH_MATRIX_ON_FLAG(this, this,
{ m_CPUMatrix->Reshape(numRows, numCols); },
{ m_GPUMatrix->Reshape(numRows, numCols); },
{ NOT_IMPLEMENTED; },
{ m_GPUSparseMatrix->Reshape(numRows, numCols); });
}
}
@ -1510,11 +1512,10 @@ void Matrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const
{
// TODO: should this function test whether the size is changing, and skip if it isn't? We have at least one explicit test for this code calling this (recurrent node)
DISPATCH_MATRIX_ON_FLAG_USEBOTH_4BOTH(this,
this,
m_CPUMatrix->Resize(numRows, numCols, growOnly),
m_GPUMatrix->Resize(numRows, numCols, growOnly),
m_CPUSparseMatrix->RequireSizeAndAllocate(numRows, numCols, numNZElemToReserve, growOnly, false),
m_GPUSparseMatrix->RequireSizeAndAllocate(numRows, numCols, numNZElemToReserve, growOnly, false));
{ m_CPUMatrix->Resize(numRows, numCols, growOnly); },
{ m_GPUMatrix->Resize(numRows, numCols, growOnly); },
{ m_CPUSparseMatrix->RequireSizeAndAllocate(numRows, numCols, numNZElemToReserve, growOnly, false); },
{ m_GPUSparseMatrix->RequireSizeAndAllocate(numRows, numCols, numNZElemToReserve, growOnly, false); });
#ifdef _DEBUG
if (GetMatrixType() != MatrixType::SPARSE)
Invalidate(); // Fill the matrix with NaNs to detect using the content which is undefined. Unfortunately this won't work for sparse matrices.
@ -1551,11 +1552,10 @@ template <class ElemType>
void Matrix<ElemType>::Reset()
{
DISPATCH_MATRIX_ON_FLAG_USEBOTH_4BOTH(this,
this,
NOT_IMPLEMENTED,
NOT_IMPLEMENTED,
m_CPUSparseMatrix->Reset(),
m_GPUSparseMatrix->Reset());
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; },
{ m_CPUSparseMatrix->Reset(); },
{ m_GPUSparseMatrix->Reset(); });
}
template <class ElemType>
@ -3027,12 +3027,11 @@ ElemType Matrix<ElemType>::SumOfAbsElements() const
if (IsEmpty())
LogicError("SumOfAbsElements: Matrix is empty.");
DISPATCH_MATRIX_ON_FLAG(this,
nullptr,
return m_CPUMatrix->SumOfAbsElements(),
return m_GPUMatrix->SumOfAbsElements(),
NOT_IMPLEMENTED,
return m_GPUSparseMatrix->SumOfAbsElements());
DISPATCH_MATRIX_ON_FLAG(this, nullptr,
{ return m_CPUMatrix->SumOfAbsElements(); },
{ return m_GPUMatrix->SumOfAbsElements(); },
{ NOT_IMPLEMENTED; },
{ return m_GPUSparseMatrix->SumOfAbsElements(); });
}
//sum of all elements
@ -3042,11 +3041,10 @@ ElemType Matrix<ElemType>::LogSumOfElements() const
if (IsEmpty())
LogicError("LogSumOfElements: Matrix is empty.");
DISPATCH_MATRIX_ON_FLAG(this,
nullptr,
DISPATCH_MATRIX_ON_FLAG(this, nullptr,
{ return m_CPUMatrix->LogSumOfElements(); },
{ return m_GPUMatrix->LogSumOfElements(); },
{NOT_IMPLEMENTED},
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; });
}
@ -3354,65 +3352,57 @@ Matrix<ElemType>& Matrix<ElemType>::AddSignOf(const Matrix<ElemType>& a)
return *this;
}
//I decided to use Matrix<ElemType>& maxIndexes instead of integer vector because the result may be used to do additional calculation
// I decided to use Matrix<ElemType>& maxIndices instead of integer vector because the result may be used to do additional calculation
template <class ElemType>
void Matrix<ElemType>::VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise) const
void Matrix<ElemType>::VectorMax(Matrix<ElemType>& maxIndices, Matrix<ElemType>& maxValues, const bool isColWise) const
{
if (IsEmpty())
LogicError("VectorMax: Matrix is empty.");
DecideAndMoveToRightDevice(*this, maxIndexes, maxValues);
maxIndexes.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
DecideAndMoveToRightDevice(*this, maxIndices, maxValues);
maxIndices.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
maxValues.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
&maxValues,
m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise);
maxIndexes.SetDataLocation(CPU, DENSE),
m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise);
maxIndexes.SetDataLocation(GPU, DENSE),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
DISPATCH_MATRIX_ON_FLAG(this, &maxValues,
{ m_CPUMatrix->VectorMax(*maxIndices.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise); maxIndices.SetDataLocation(CPU, DENSE); },
{ m_GPUMatrix->VectorMax(*maxIndices.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise); maxIndices.SetDataLocation(GPU, DENSE); },
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; });
// Note: must SetDataLocation() also on maxIndices, since both maxValues and maxIndices are written.
}
template <class ElemType>
void Matrix<ElemType>::VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise, int topK) const
void Matrix<ElemType>::VectorMax(Matrix<ElemType>& maxIndices, Matrix<ElemType>& maxValues, const bool isColWise, int topK) const
{
if (IsEmpty())
LogicError("VectorMax: Matrix is empty.");
DecideAndMoveToRightDevice(*this, maxIndexes, maxValues);
maxIndexes.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
DecideAndMoveToRightDevice(*this, maxIndices, maxValues);
maxIndices.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
maxValues.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
&maxValues,
m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise, topK);
maxIndexes.SetDataLocation(CPU, DENSE),
m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise, topK);
maxIndexes.SetDataLocation(GPU, DENSE),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
DISPATCH_MATRIX_ON_FLAG(this, &maxValues,
{ m_CPUMatrix->VectorMax(*maxIndices.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise, topK); maxIndices.SetDataLocation(CPU, DENSE); },
{ m_GPUMatrix->VectorMax(*maxIndices.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise, topK); maxIndices.SetDataLocation(GPU, DENSE); },
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; });
}
template <class ElemType>
void Matrix<ElemType>::VectorMin(Matrix<ElemType>& minIndexes, Matrix<ElemType>& minValues, const bool isColWise) const
void Matrix<ElemType>::VectorMin(Matrix<ElemType>& minIndices, Matrix<ElemType>& minValues, const bool isColWise) const
{
if (IsEmpty())
LogicError("VectorMin: Matrix is empty.");
DecideAndMoveToRightDevice(*this, minIndexes, minValues);
minIndexes.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
DecideAndMoveToRightDevice(*this, minIndices, minValues);
minIndices.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
minValues.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
&minValues,
m_CPUMatrix->VectorMin(*minIndexes.m_CPUMatrix, *minValues.m_CPUMatrix, isColWise);
minIndexes.SetDataLocation(CPU, DENSE),
m_GPUMatrix->VectorMin(*minIndexes.m_GPUMatrix, *minValues.m_GPUMatrix, isColWise);
minIndexes.SetDataLocation(GPU, DENSE),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
DISPATCH_MATRIX_ON_FLAG(this, &minValues,
{ m_CPUMatrix->VectorMin(*minIndices.m_CPUMatrix, *minValues.m_CPUMatrix, isColWise); minIndices.SetDataLocation(CPU, DENSE); },
{ m_GPUMatrix->VectorMin(*minIndices.m_GPUMatrix, *minValues.m_GPUMatrix, isColWise); minIndices.SetDataLocation(GPU, DENSE); },
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; });
}
#pragma endregion Member BLAS Functions
@ -3425,12 +3415,11 @@ int Matrix<ElemType>::GetDeviceId() const
if (m_currentDataLocation == CurrentDataLocation::NONE)
return m_preferredDeviceId;
DISPATCH_MATRIX_ON_FLAG(this,
nullptr,
return CPUDEVICE,
return m_GPUMatrix->GetComputeDeviceId(),
return CPUDEVICE,
return m_GPUSparseMatrix->GetComputeDeviceId());
DISPATCH_MATRIX_ON_FLAG(this, nullptr,
{ return CPUDEVICE; },
{ return m_GPUMatrix->GetComputeDeviceId(); },
{ return CPUDEVICE; },
{ return m_GPUSparseMatrix->GetComputeDeviceId(); });
}
// TODO: Comment why we need a second ElemType.
@ -3544,25 +3533,21 @@ void Matrix<ElemType>::_transferFromDeviceToDevice(int from_id, int to_id, bool
return;
}
// warn about device change
#define NUM_DEVICE_CHANGED_WARN 20
if (m_numTimesDeviceChanged <= NUM_DEVICE_CHANGED_WARN &&
(!emptyTransfer || (from_id >= 0 && to_id >= 0)))
{
m_numTimesDeviceChanged++;
if (m_devicesTransferedTo[0] < CPUDEVICE)
{
m_devicesTransferedTo[0] = to_id;
}
else if (m_devicesTransferedTo[0] != to_id)
{
m_devicesTransferedTo[1] = to_id;
}
}
if (m_numTimesDeviceChanged == NUM_DEVICE_CHANGED_WARN && m_devicesTransferedTo[1] >= CPUDEVICE)
{
fprintf(stderr, "WARNING: The same matrix with dim [%lu, %lu] has been transferred between different devices for %d times.\n", (unsigned long) GetNumRows(), (unsigned long) GetNumCols(), NUM_DEVICE_CHANGED_WARN);
}
// do the transfer
if (m_matrixType == MatrixType::SPARSE)
{
if (from_id == CPUDEVICE) // from CPU to GPU
@ -3582,8 +3567,8 @@ void Matrix<ElemType>::_transferFromDeviceToDevice(int from_id, int to_id, bool
if (isBeingMoved)
{
m_CPUSparseMatrix = nullptr;
SetDataLocation(GPU, SPARSE);
m_CPUSparseMatrix = nullptr;
}
else
{
@ -3607,8 +3592,8 @@ void Matrix<ElemType>::_transferFromDeviceToDevice(int from_id, int to_id, bool
if (isBeingMoved)
{
m_GPUSparseMatrix = nullptr;
SetDataLocation(CPU, SPARSE);
m_GPUSparseMatrix = nullptr;
}
else
{
@ -3638,8 +3623,8 @@ void Matrix<ElemType>::_transferFromDeviceToDevice(int from_id, int to_id, bool
}
if (isBeingMoved)
{
m_CPUMatrix = nullptr;
SetDataLocation(GPU, DENSE);
m_CPUMatrix = nullptr;
}
else
{
@ -3666,8 +3651,8 @@ void Matrix<ElemType>::_transferFromDeviceToDevice(int from_id, int to_id, bool
if (isBeingMoved)
{
m_GPUMatrix = nullptr;
SetDataLocation(CPU, DENSE);
m_GPUMatrix = nullptr;
}
else
{
@ -4180,17 +4165,19 @@ void Matrix<ElemType>::SVD(const Matrix<ElemType>& A, Matrix<ElemType>& SIGMA, M
VT.SwitchToMatrixType(A.GetMatrixType(), A.GetFormat(), false);
W.SwitchToMatrixType(A.GetMatrixType(), A.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&A,
nullptr,
Matrix<ElemType> tA = A.DeepClone();
CPUMatrix<ElemType>::SVD(*tA.m_CPUMatrix, *SIGMA.m_CPUMatrix, *U.m_CPUMatrix, *VT.m_CPUMatrix, *W.m_CPUMatrix);
SIGMA.SetDataLocation(CPU);
U.SetDataLocation(CPU);
VT.SetDataLocation(CPU);
W.SetDataLocation(CPU),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
DISPATCH_MATRIX_ON_FLAG(&A, nullptr,
{
Matrix<ElemType> tA = A.DeepClone();
CPUMatrix<ElemType>::SVD(*tA.m_CPUMatrix, *SIGMA.m_CPUMatrix, *U.m_CPUMatrix, *VT.m_CPUMatrix, *W.m_CPUMatrix);
SIGMA.SetDataLocation(CPU);
U.SetDataLocation(CPU);
VT.SetDataLocation(CPU);
W.SetDataLocation(CPU);
// need to SetDataLocation() on all matrices we write to
},
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; },
{ NOT_IMPLEMENTED; });
}
/// <summary>Matrix-matrix multiply with col-major matrices (a and b may be transposed): c = alpha * op(a) * op(b) + beta*c</summary>
@ -4400,34 +4387,33 @@ template <class ElemType>
if (a.GetMatrixType() == c.GetMatrixType())
{
DISPATCH_MATRIX_ON_FLAG(&c,
&c,
CPUMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_CPUMatrix, *c.m_CPUMatrix),
GPUMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUMatrix, *c.m_GPUMatrix),
NOT_IMPLEMENTED,
GPUSparseMatrix<ElemType> b = move(*c.m_GPUSparseMatrix);
GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUSparseMatrix, 1, b, *c.m_GPUSparseMatrix));
DISPATCH_MATRIX_ON_FLAG(&c, &c,
{ CPUMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_CPUMatrix, *c.m_CPUMatrix); },
{ GPUMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUMatrix, *c.m_GPUMatrix); },
{ NOT_IMPLEMENTED; },
{ GPUSparseMatrix<ElemType> b = move(*c.m_GPUSparseMatrix); GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUSparseMatrix, 1, b, *c.m_GPUSparseMatrix); });
}
else
{
DISPATCH_MATRIX_ON_FLAG(&c,
nullptr,
CPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_CPUSparseMatrix, *c.m_CPUMatrix);
c.SetDataLocation(CPU),
if (a.m_GPUSparseMatrix->GetFormat() == MatrixFormat::matrixFormatSparseCSC)
{
GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUSparseMatrix, 1, *c.m_GPUMatrix, *c.m_GPUMatrix);
} else // new GPU sparse matrix code
{
GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUSparseMatrix, *c.m_GPUMatrix);
} c.SetDataLocation(GPU),
NOT_IMPLEMENTED,
{
c.m_GPUMatrix = make_shared<GPUMatrix<ElemType>>(c.m_GPUSparseMatrix->CopyToDenseMatrix());
GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUMatrix, 1, *c.m_GPUSparseMatrix, *c.m_GPUMatrix);
c.m_GPUSparseMatrix = nullptr;
c.SetDataLocation(GPU, DENSE);
});
DISPATCH_MATRIX_ON_FLAG(&c, nullptr,
{
CPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_CPUSparseMatrix, *c.m_CPUMatrix);
c.SetDataLocation(CPU);
},
{
if (a.m_GPUSparseMatrix->GetFormat() == MatrixFormat::matrixFormatSparseCSC)
GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUSparseMatrix, 1, *c.m_GPUMatrix, *c.m_GPUMatrix);
else // new GPU sparse matrix code
GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUSparseMatrix, *c.m_GPUMatrix);
c.SetDataLocation(GPU);
},
{ NOT_IMPLEMENTED; },
{
c.m_GPUMatrix = make_shared<GPUMatrix<ElemType>>(c.m_GPUSparseMatrix->CopyToDenseMatrix());
GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUMatrix, 1, *c.m_GPUSparseMatrix, *c.m_GPUMatrix);
c.SetDataLocation(GPU, DENSE);
c.m_GPUSparseMatrix = nullptr;
});
}
}
@ -4444,9 +4430,7 @@ template <class ElemType>
if (beta == 1)
ScaleAndAdd(alpha, a, c);
else if (beta == 0)
{
Scale(alpha, a, c);
}
else
{
ScaleAndAdd(alpha / beta, a, c); // c1=alpha/beta * a + c
@ -4598,8 +4582,8 @@ void Matrix<ElemType>::AddElementToElement(const Matrix<ElemType>& a, const size
DISPATCH_MATRIX_ON_FLAG(&c,
&c,
CPUMatrix<ElemType>::AddElementToElement(*a.m_CPUMatrix, ai, aj, *c.m_CPUMatrix, ci, cj),
GPUMatrix<ElemType>::AddElementToElement(*a.m_GPUMatrix, ai, aj, *c.m_GPUMatrix, ci, cj),
CPUMatrix<ElemType>::AddElementToElement(1, *a.m_CPUMatrix, ai, aj, *c.m_CPUMatrix, ci, cj),
GPUMatrix<ElemType>::AddElementToElement(1, *a.m_GPUMatrix, ai, aj, *c.m_GPUMatrix, ci, cj),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
@ -4615,8 +4599,8 @@ void Matrix<ElemType>::AssignElementToElement(const Matrix<ElemType>& a, const s
DISPATCH_MATRIX_ON_FLAG(&c,
&c,
CPUMatrix<ElemType>::AssignElementToElement(*a.m_CPUMatrix, ai, aj, *c.m_CPUMatrix, ci, cj),
NOT_IMPLEMENTED,
CPUMatrix<ElemType>::AddElementToElement(0, *a.m_CPUMatrix, ai, aj, *c.m_CPUMatrix, ci, cj),
GPUMatrix<ElemType>::AddElementToElement(0, *a.m_GPUMatrix, ai, aj, *c.m_GPUMatrix, ci, cj),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
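With this change, assignment and accumulation share one code path: AddElementToElement with beta = 0 behaves like the old AssignElementToElement, and beta = 1 like the additive version. A scalar sketch of the convention (AddElement is a made-up helper):

#include <cstdio>

// c_new = beta * c_old + a : beta == 0 gives assignment, beta == 1 gives accumulation
static float AddElement(float beta, float a, float c)
{
    return (beta ? beta * c : 0) + a;
}

int main()
{
    printf("assign (beta=0): %g\n", AddElement(0, 3, 7)); // 3
    printf("add    (beta=1): %g\n", AddElement(1, 3, 7)); // 10
}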
@ -5205,7 +5189,7 @@ static bool VerifyIsDense(const Matrix<ElemType>& a)
}
template <class ElemType>
void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
@ -5216,14 +5200,14 @@ void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, ElemTy
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_GPUMatrix->TensorOp(beta, *a.m_GPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_GPUMatrix->TensorOp(beta, *a.m_GPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
@ -5234,14 +5218,14 @@ void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, *b.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_GPUMatrix->TensorOp(beta, *a.m_GPUMatrix, *b.m_GPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, *b.m_CPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_GPUMatrix->TensorOp(beta, *a.m_GPUMatrix, *b.m_GPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides)
@ -5252,8 +5236,8 @@ void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_GPUMatrix->TensorOp(beta, *a.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_GPUMatrix->TensorOp(beta, *a.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}

View file

@ -115,11 +115,17 @@ public:
static Matrix<ElemType> RandomUniform(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId, const ElemType low, const ElemType high, unsigned long seed = USE_TIME_BASED_SEED);
static Matrix<ElemType> RandomGaussian(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId, const ElemType mean, const ElemType sigma, unsigned long seed = USE_TIME_BASED_SEED);
static void SetDevice(DEVICEID_TYPE deviceId);
static void SetDevice(DEVICEID_TYPE deviceId); // TODO: unify with PrepareDevice()
void ReleaseMemory();
~Matrix();
// workaround for bugs in the BOTH-state implementation: force a collapse to the home location
void CollapseDataLocationAfterWriting() const
{
SetDataLocation(GetDeviceId() < 0 ? CurrentDataLocation::CPU : CurrentDataLocation::GPU, GetMatrixType());
}
private:
Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, const MatrixFormat matrixFormat, DEVICEID_TYPE deviceID); // only used internally to initialize a blank matrix
Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, DEVICEID_TYPE deviceID); // only used internally to initialize a blank matrix
@ -530,15 +536,15 @@ public:
static void TensorShuffleScaleAndAdd(ElemType keepWeight, const Matrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const Matrix<ElemType>& b, Matrix<ElemType>& c);
void TensorOp(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
void TensorOp(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 4>& reducingStrides);

View file

@ -1894,7 +1894,7 @@ void GPUMatrix<ElemType>::AssignScaledDifference(const GPUMatrix<ElemType>& /*al
//c[ci,cj] += a[ai,aj]
template <class ElemType>
void GPUMatrix<ElemType>::AddElementToElement(const GPUMatrix<ElemType>& /*a*/, const size_t ai, const size_t aj, GPUMatrix<ElemType>& c, const size_t ci, const size_t cj)
void GPUMatrix<ElemType>::AddElementToElement(ElemType beta, const GPUMatrix<ElemType>& /*a*/, const size_t ai, const size_t aj, GPUMatrix<ElemType>& c, const size_t ci, const size_t cj)
{
}
@ -1953,21 +1953,21 @@ void GPUMatrix<ElemType>::TensorShuffleScaleAndAdd(ElemType keepWeight, const GP
}
template <class ElemType>
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
}
template <class ElemType>
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
{
}
template <class ElemType>
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, const GPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, const GPUMatrix<ElemType>& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides)

View File

@ -38,14 +38,16 @@ using namespace std;
// main constructor (all constructors except the default one route through this)
template <class ElemType>
TensorView<ElemType>::TensorView(const Matrix<ElemType>& sob, const TensorShape& shape)
: m_sob(sob.AsReference()), m_shape(shape)
TensorView<ElemType>::TensorView(const MatrixBasePtr& sob, const TensorShape& shape)
: m_sob(dynamic_pointer_cast<Matrix<ElemType>>(sob)), m_shape(shape)
{
if (!m_sob)
LogicError("TensorView: Attempted to create a TensorView<ElemType> on a storage object of a different ElemType.");
#ifdef _DEBUG
// check bounds of TensorShape against underlying storage object
// This is useful to detect errors like passing a matrix from the wrong input.
const auto r = shape.GetLocationRange();
const auto n = m_sob.GetNumElements();
const auto n = m_sob->GetNumElements();
if (r.first < 0 || (size_t)r.second > n)
LogicError("TensorView: Shape bounds [%d,%d) exceed bounds of underlying storage object [0,%d).", (int) r.first, (int) r.second, (int) n);
#endif
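// Editor's note (not part of the commit): the constructor now takes a type-erased
// MatrixBasePtr and recovers the typed matrix with dynamic_pointer_cast; a null result means
// the storage object was created with a different ElemType, which is what the LogicError above
// guards against. A standalone sketch of that pattern with hypothetical stand-in types:
#include <memory>
#include <stdexcept>

struct MatrixBase { virtual ~MatrixBase() = default; };
template <class ElemType> struct TypedMatrix : MatrixBase { };

template <class ElemType>
std::shared_ptr<TypedMatrix<ElemType>> CheckedCast(const std::shared_ptr<MatrixBase>& sob)
{
    auto typed = std::dynamic_pointer_cast<TypedMatrix<ElemType>>(sob);
    if (!typed)
        throw std::logic_error("storage object has a different ElemType than the TensorView");
    return typed;
}

int main()
{
    std::shared_ptr<MatrixBase> sob = std::make_shared<TypedMatrix<float>>();
    CheckedCast<float>(sob);                                              // ok: types match
    try { CheckedCast<double>(sob); } catch (const std::logic_error&) { } // mismatch is caught
}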
@ -228,7 +230,7 @@ static bool CheckDifferentObject(const TensorView<ElemType>& a, const TensorView
}
template <class ElemType>
void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op)
void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp)
{
// static int cc = 0; if (cc++ == 0)
// fprintf(stderr, "Tensor Op: Op %d: %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(GetShape()).c_str());
@ -244,11 +246,11 @@ void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView& a, ElemT
CheckDifferentObject(a, *this);
// now perform the operation
GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
template <class ElemType>
void TensorView<ElemType>::DoBinaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op)
void TensorView<ElemType>::DoBinaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp)
{
// static int cc = 0; if (cc++ == 0)
// fprintf(stderr, "Tensor Op: Op %d: %s op %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(GetShape()).c_str());
@ -262,11 +264,11 @@ void TensorView<ElemType>::DoBinaryOpOf(ElemType beta, const TensorView& a, cons
if (reducingOpDims.size() > 0)
CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this);
GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
template <class ElemType>
void TensorView<ElemType>::DoTernaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha, ElementWiseOperator op)
void TensorView<ElemType>::DoTernaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp)
{
// static int cc = 0; if (cc++ == 0)
// fprintf(stderr, "Tensor Op: Op %d: %s, %s, %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(c.GetShape()).c_str(), string(GetShape()).c_str());
@ -280,79 +282,7 @@ void TensorView<ElemType>::DoTernaryOpOf(ElemType beta, const TensorView& a, con
if (reducingOpDims.size() > 0)
CheckDifferentObject(a, *this) && CheckDifferentObject(b, *this) && CheckDifferentObject(c, *this);
GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), c.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
// simple test function for testing stuff
// Call as: Microsoft::MSR::CNTK::TensorView<float>::Test();
template <class ElemType>
/*static*/ void TensorView<ElemType>::Test()
{
const DEVICEID_TYPE deviceId = 0; // -1
Matrix<ElemType> m1(deviceId);
Matrix<ElemType> m2(deviceId);
Matrix<ElemType> m3(deviceId);
{
m1.SetValue(5, 3, {1, 2, 3,
14, 15, 6,
4, 5, 16,
41, 5, 1,
1.8, 4.5, 7});
m2.SetValue(5, 1, {42,
13,
1968,
3.1415f,
7});
m3.Resize(m1);
// regular zip (just add m1 to itself)
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m1), 1);
m3.Print();
// unary op
TensorView(m3).DoSqrtOf(0, TensorView(m1), 1);
m3.Print();
// broadcasting of an input
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
TensorView(m3).DoMaxOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
TensorView(m3).DoGTOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
// reduction over columns
m3.Resize(5, 1);
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
// reduction over rows
m3.Resize(1, 3);
TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
TensorView(m3).DoLogSumOf(0, TensorView(m1), TensorView(m2), 1);
m3.Print();
}
{
m1.Resize(1, 42);
m2.Resize(13, 1);
m3.Resize(13, 21);
TensorShape s1(1, 2, 21);
TensorShape s2(13, 1);
TensorShape s3(13, 1, 21);
let t1 = TensorView<ElemType>(m1, s1);
t1;
let t2 = TensorView<ElemType>(m2, s2);
t2;
auto t3 = TensorView<ElemType>(m3, s3);
t3;
t3.DoSumOf(0, t1, t2, 1);
m3.Print();
}
GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), c.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
// -------------------------------------------------------------------
@ -409,19 +339,20 @@ static void FlattenToMatrix(TensorShape& shape, bool trans, size_t splitPoint)
// convert tensor into a Matrix object
template <class ElemType>
Matrix/*ref*/<ElemType> TensorView<ElemType>::AsMatrix() const
shared_ptr<Matrix<ElemType>> TensorView<ElemType>::AsMatrix() const
{
assert(m_shape.GetRank() == 2);
if (m_shape.GetStrides()[0] != 1 && m_shape[0] != 1)
InvalidArgument("AsMatrix: Flattened [%s] matrix is not dense (it has a stride).", string(m_shape).c_str());
// create a Matrix view into the TensorView (which in turn is a view over a Matrix...)
// The way to do this is to use a ColumnSlice.
// express the TensorView's storage in m_sob's coordinates
let firstColumn = m_shape.GetOffset() / m_sob.GetNumRows();
let numColumns = m_shape.GetNumElements() / m_sob.GetNumRows();
if (firstColumn * m_sob.GetNumRows() != m_shape.GetOffset() || numColumns * m_sob.GetNumRows() != m_shape.GetNumElements())
let firstColumn = m_shape.GetOffset() / m_sob->GetNumRows();
let numColumns = m_shape.GetNumElements() / m_sob->GetNumRows();
if (firstColumn * m_sob->GetNumRows() != m_shape.GetOffset() || numColumns * m_sob->GetNumRows() != m_shape.GetNumElements())
InvalidArgument("AsMatrix: Flattened [%s] matrix has an offset or width that is not a multiple of the storage object's row dimension.", string(m_shape).c_str());
auto sob = m_sob.ColumnSlice(firstColumn, numColumns);
// now reinterpret this slice according to the new tensor shape
// Example:
// - each sob column contains a set of vectors stored as a 2D tensor [I x J], and [S x T] samples
@ -431,12 +362,20 @@ Matrix/*ref*/<ElemType> TensorView<ElemType>::AsMatrix() const
// - which in turn yields a [K x (J * S x*T)] matrix
// which gets reinterpreted back as a [K x J x S x T] tensor
// In the special case of sparse matrices, this split cannot be done. E.g. in the above example, we could only multiply with a [K x I x J] tensor.
if (sob.GetMatrixType() == MatrixType::DENSE)
return sob.Reshaped(m_shape[0], m_shape[1]);
else if (m_shape[0] == sob.GetNumRows()) // SPARSE matrices cannot be reshaped, so we only support 1D and 2D tensors
return sob;
else
let needsSlicing = firstColumn != 0 || numColumns != m_sob->GetNumCols();
let needsReshaping = m_shape[0] != m_sob->GetNumRows() || m_shape[1] != m_sob->GetNumCols();
// Note: If an output matrix is a view and needs to move to a different device, we will fail later, since the current structure cannot support that.
// As a consequence, some configurations will simply not work currently.
// We minimize the chance of this by using the original storage object whenever possible.
if (!needsSlicing && !needsReshaping) // no need to mess with the storage object: pass it on as it is. Full support for moving devices.
return m_sob;
else if (needsSlicing && !needsReshaping) // slicing is supported for sparse as well
return make_shared<Matrix<ElemType>>(m_sob->ColumnSlice(firstColumn, numColumns));
else if (m_sob->GetMatrixType() != MatrixType::DENSE) // needsReshaping: not allowed for sparse matrices
RuntimeError("AsMatrix: Sparse tensors are not supported unless they are 1D or 2D matrices.");
else // dense can slice and reshape neutrally, but will also fail if output matrix needs to move devices
return make_shared<Matrix<ElemType>>(m_sob->ColumnSlice(firstColumn, numColumns).Reshaped(m_shape[0], m_shape[1]));
}
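// Editor's note (not part of the commit): a standalone sketch of the case analysis the
// rewritten AsMatrix() performs, with hypothetical helper names (not CNTK code): express the
// view's offset and size in whole columns of the storage object, then either pass the storage
// object through unchanged, column-slice it, or slice and reshape it (dense only).
#include <cstdio>
#include <stdexcept>

void DescribeAsMatrix(size_t offset, size_t numElements, size_t sobRows, size_t sobCols,
                      size_t shape0, size_t shape1, bool isDense)
{
    size_t firstColumn = offset / sobRows;
    size_t numColumns  = numElements / sobRows;
    if (firstColumn * sobRows != offset || numColumns * sobRows != numElements)
        throw std::invalid_argument("offset/width is not a multiple of the storage object's row dimension");
    bool needsSlicing   = firstColumn != 0 || numColumns != sobCols;
    bool needsReshaping = shape0 != sobRows || shape1 != sobCols;
    if (!needsSlicing && !needsReshaping)
        std::printf("pass the storage object through unchanged\n");
    else if (!needsReshaping)
        std::printf("ColumnSlice(%zu, %zu)\n", firstColumn, numColumns);
    else if (!isDense)
        std::printf("error: sparse tensors cannot be reshaped\n");
    else
        std::printf("ColumnSlice(%zu, %zu).Reshaped(%zu, %zu)\n", firstColumn, numColumns, shape0, shape1);
}

int main()
{
    DescribeAsMatrix(/*offset=*/0,  /*numElements=*/20, /*sobRows=*/5, /*sobCols=*/4, 5, 4, true); // -> pass through
    DescribeAsMatrix(/*offset=*/10, /*numElements=*/10, /*sobRows=*/5, /*sobCols=*/4, 5, 2, true); // -> ColumnSlice(2, 2).Reshaped(5, 2)
}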
template <class ElemType>
@ -471,9 +410,9 @@ void TensorView<ElemType>::DoMatrixProductOf(ElemType beta, bool transC, const T
auto C = Reshaped(shapeC).AsMatrix();
// and go
if (!transC)
Matrix<ElemType>::MultiplyAndWeightedAdd(alpha, A, transA, B, transB, beta, C);
Matrix<ElemType>::MultiplyAndWeightedAdd(alpha, *A, transA, *B, transB, beta, *C);
else // C' = A * B <==> C = (A * B)' = B' * A'
Matrix<ElemType>::MultiplyAndWeightedAdd(alpha, B, !transB, A, !transA, beta, C);
Matrix<ElemType>::MultiplyAndWeightedAdd(alpha, *B, !transB, *A, !transA, beta, *C);
}
template class TensorView<float>;

View File

@ -26,20 +26,22 @@ public:
// -------------------------------------------------------------------
// reinterpret a matrix storage object (SOB) as a TensorView with a given TensorShape --this is the main constructor
TensorView(const Matrix<ElemType>& sob, const TensorShape& shape);
TensorView(const MatrixBasePtr& sob, const TensorShape& shape);
#if 0
// cast a Matrix as a 2D TensorView (without shape change)
TensorView(const Matrix<ElemType>& sob)
: m_sob(sob.AsReference()), m_shape(TensorShape(array<size_t, 2>{sob.GetNumRows(), sob.GetNumCols()}))
TensorView(const MatrixBasePtr& sob)
: m_sob(sob), m_shape(TensorShape(array<size_t, 2>{sob->GetNumRows(), sob->GetNumCols()}))
{
}
#endif
// reshape a TensorView
TensorView(const TensorView<ElemType>& other, const TensorShape& shape)
: m_sob(other.m_sob.AsReference()), m_shape(shape)
: m_sob(other.m_sob), m_shape(shape)
{
}
// copy constructor
TensorView(const TensorView<ElemType>& other)
: m_sob(other.m_sob.AsReference()), m_shape(other.m_shape)
: m_sob(other.m_sob), m_shape(other.m_shape)
{
}
@ -66,36 +68,36 @@ public:
// -------------------------------------------------------------------
#pragma push_macro("DeclareUnaryTensorOp")
#define DeclareUnaryTensorOp(oper) \
void Do##oper##Of(ElemType beta, const TensorView& a, ElemType alpha) \
{ \
DoUnaryOpOf(beta, a, alpha, ElementWiseOperator::op##oper); \
} \
void Assign##oper##Of(const TensorView& a, ElemType alpha = 1.0f) \
{ \
DoUnaryOpOf(0, a, alpha, ElementWiseOperator::op##oper); \
} \
void Add##oper##Of(const TensorView& a, ElemType alpha = 1.0f) \
{ \
DoUnaryOpOf(1.0f, a, alpha, ElementWiseOperator::op##oper); \
#define DeclareUnaryTensorOp(oper) \
void Do##oper##Of(ElemType beta, const TensorView& a, ElemType alpha) \
{ \
DoUnaryOpOf(beta, a, alpha, ElementWiseOperator::op##oper, ElementWiseOperator::opSum); \
} \
void Assign##oper##Of(const TensorView& a, ElemType alpha = 1.0f) \
{ \
DoUnaryOpOf(0, a, alpha, ElementWiseOperator::op##oper, ElementWiseOperator::opSum); \
} \
void Add##oper##Of(const TensorView& a, ElemType alpha = 1.0f) \
{ \
DoUnaryOpOf(1.0f, a, alpha, ElementWiseOperator::op##oper, ElementWiseOperator::opSum); \
}
ForAllUnaryOps(DeclareUnaryTensorOp);
#pragma pop_macro("DeclareUnaryTensorOp")
#pragma push_macro("DeclareBinaryTensorOp")
#define DeclareBinaryTensorOp(oper) \
void Do##oper##Of(ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha) \
{ \
DoBinaryOpOf(beta, a, b, alpha, ElementWiseOperator::op##oper); \
} \
void Assign##oper##Of(const TensorView& a, const TensorView& b, ElemType alpha = 1.0f) \
{ \
DoBinaryOpOf(0, a, b, alpha, ElementWiseOperator::op##oper); \
} \
void Add##oper##Of(const TensorView& a, const TensorView& b, ElemType alpha = 1.0f) \
{ \
DoBinaryOpOf(1.0f, a, b, alpha, ElementWiseOperator::op##oper); \
#define DeclareBinaryTensorOp(oper) \
void Do##oper##Of(ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha) \
{ \
DoBinaryOpOf(beta, a, b, alpha, ElementWiseOperator::op##oper, ElementWiseOperator::opSum); \
} \
void Assign##oper##Of(const TensorView& a, const TensorView& b, ElemType alpha = 1.0f) \
{ \
DoBinaryOpOf(0, a, b, alpha, ElementWiseOperator::op##oper, ElementWiseOperator::opSum); \
} \
void Add##oper##Of(const TensorView& a, const TensorView& b, ElemType alpha = 1.0f) \
{ \
DoBinaryOpOf(1.0f, a, b, alpha, ElementWiseOperator::op##oper, ElementWiseOperator::opSum); \
}
ForAllBinaryOps(DeclareBinaryTensorOp);
@ -105,25 +107,23 @@ public:
#define DeclareTernaryTensorOp(oper) \
void Do##oper##Of(ElemType beta, const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha) \
{ \
DoTernaryOpOf(beta, a, b, c, alpha, ElementWiseOperator::op##oper); \
DoTernaryOpOf(beta, a, b, c, alpha, ElementWiseOperator::op##oper, ElementWiseOperator::opSum); \
} \
void Assign##oper##Of(const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha = 1.0f) \
{ \
DoTernaryOpOf(0, a, b, c, alpha, ElementWiseOperator::op##oper); \
DoTernaryOpOf(0, a, b, c, alpha, ElementWiseOperator::op##oper, ElementWiseOperator::opSum); \
} \
void Add##oper##Of(const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha = 1.0f) \
{ \
DoTernaryOpOf(1.0f, a, b, c, alpha, ElementWiseOperator::op##oper); \
DoTernaryOpOf(1.0f, a, b, c, alpha, ElementWiseOperator::op##oper, ElementWiseOperator::opSum); \
}
ForAllTernaryOps(DeclareTernaryTensorOp);
#pragma pop_macro("DeclareTernaryTensorOp")
static void Test();
void DoUnaryOpOf (ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op);
void DoBinaryOpOf (ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op);
void DoTernaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha, ElementWiseOperator op);
void DoUnaryOpOf (ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp);
void DoBinaryOpOf (ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp);
void DoTernaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp);
// -------------------------------------------------------------------
// matrix product -- GEMM for flattened tensors
@ -139,23 +139,23 @@ public:
void AssignMatrixProductOf( bool transC, const TensorView& a, bool transA, const TensorView& b, bool transB, ElemType alpha = 1.0f) { DoMatrixProductOf(0, transC, a, transA, b, transB, alpha); }
void AddMatrixProductOf ( bool transC, const TensorView& a, bool transA, const TensorView& b, bool transB, ElemType alpha = 1.0f) { DoMatrixProductOf(1.0f, transC, a, transA, b, transB, alpha); }
Matrix/*ref*/<ElemType> AsMatrix() const;
shared_ptr<Matrix<ElemType>> AsMatrix() const;
private:
// -------------------------------------------------------------------
// accessors
// -------------------------------------------------------------------
const Matrix<ElemType>& GetSOB() const { return m_sob; }
Matrix<ElemType>& GetSOB() { return m_sob; }
const Matrix<ElemType>& GetSOB() const { return *m_sob; }
Matrix<ElemType>& GetSOB() { return *m_sob; }
const TensorShape& GetShape() const { return m_shape; }
// -------------------------------------------------------------------
// sob members
// -------------------------------------------------------------------
Matrix<ElemType> m_sob; // Storage OBject that holds the data that is being viewed with this TensorView. This is really a reference (not owning the buffer).
TensorShape m_shape; // the meta-data that describes the data's shape and/or access pattern
shared_ptr<Matrix<ElemType>> m_sob; // Storage OBject that holds the data that is being viewed with this TensorView. This is really a reference (not owning the buffer).
TensorShape m_shape; // the meta-data that describes the data's shape and/or access pattern
};
}}}

View File

@ -583,6 +583,7 @@ public:
BinaryReader()
: m_pMBLayout(make_shared<MBLayout>())
{
m_pMBLayout->SetUniqueAxisName(L"BinaryReader");
}
virtual ~BinaryReader();
virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize);

View File

@ -152,6 +152,7 @@ public:
DSSMReader()
: m_pMBLayout(make_shared<MBLayout>())
{
m_pMBLayout->SetUniqueAxisName(L"DSSMReader");
m_qfeaturesBuffer = NULL;
m_dfeaturesBuffer = NULL;
m_labelsBuffer = NULL;

View File

@ -152,6 +152,7 @@ public:
HTKMLFReader()
: m_pMBLayout(make_shared<MBLayout>())
{
m_pMBLayout->SetUniqueAxisName(L"HTKMLFReader");
}
template <class ConfigRecordType>
void InitFromConfig(const ConfigRecordType&);

View File

@ -2055,4 +2055,5 @@ void HTKMLFReader<ElemType>::GetDataNamesFromConfig(const ConfigRecordType& read
template class HTKMLFReader<float>;
template class HTKMLFReader<double>;
} } }
}}}

View File

@ -38,9 +38,10 @@ private:
MBLayoutPtr pMBLayout;
std::vector<std::vector<std::pair<wstring, size_t>>> minibatchUttInfo;
size_t currentMBSize;
MinibatchBufferUnit()
: pMBLayout(make_shared<MBLayout>()), currentMBSize(0)
MinibatchBufferUnit() :
pMBLayout(make_shared<MBLayout>()), currentMBSize(0)
{
pMBLayout->SetUniqueAxisName(L"HTKMLFReader");
}
};
bool m_doMinibatchBuffering;
@ -163,9 +164,10 @@ public:
// set to true so that a current minibatch can uses state activities from the previous minibatch.
// default will have truncated BPTT, which only does BPTT inside a minibatch
bool mIgnoreSentenceBeginTag;
HTKMLFReader()
: m_pMBLayout(make_shared<MBLayout>())
HTKMLFReader() :
m_pMBLayout(make_shared<MBLayout>())
{
m_pMBLayout->SetUniqueAxisName(L"HTKMLFReader");
}
template <class ConfigRecordType>

View File

@ -660,7 +660,7 @@ void SequenceReader<ElemType>::ReadClassInfo(const wstring& vocfile, int& classS
// check if unk is the same used in vocabulary file
if (word4idx.find(mUnk.c_str()) == word4idx.end())
RuntimeError("ReadClassInfo unknown symbol '%s' is not in vocabulary file.", mUnk.c_str());
fprintf(stderr, "ReadClassInfo: 'unknown' symbol unk='%s' is not in vocabulary file. Unknown words will error out if encountered.\n", mUnk.c_str());
}
// InitCache - Initialize the caching reader if cache files exist, otherwise the writer

View File

@ -381,6 +381,7 @@ public:
BatchSequenceReader()
: m_pMBLayout(make_shared<MBLayout>())
{
m_pMBLayout->SetUniqueAxisName(L"LMSequenceReader");
mLastProcessedSentenceId = 0;
mRequestedNumParallelSequences = 1;
mLastPosInSentence = 0;

View File

@ -270,6 +270,7 @@ public:
BatchLUSequenceReader()
: m_pMBLayout(make_shared<MBLayout>())
{
m_pMBLayout->SetUniqueAxisName(L"LUSequenceReader");
mLastProcessedSentenceId = 0;
mRequestedNumParallelSequences = 1;
mLastPosInSentence = 0;

View File

@ -247,6 +247,7 @@ public:
: DSSMLabels(nullptr), DSSMCols(0)
{
m_pMBLayout = make_shared<MBLayout>();
m_pMBLayout->SetUniqueAxisName(L"LibSVMReader");
};
virtual ~LibSVMBinaryReader();

View File

@ -130,7 +130,9 @@ BpttPacker::BpttPacker(
auto& buffer = m_streamBuffers[i];
buffer.Resize(m_numParallelSequences * m_truncationSize * GetSampleSize(stream));
m_sequenceBufferPerStream.push_back(make_shared<SequenceBuffer>(m_numParallelSequences));
m_currentLayouts.push_back(make_shared<MBLayout>());
auto pMBLayout = make_shared<MBLayout>();
pMBLayout->SetUniqueAxisName(L"BpttPacker");
m_currentLayouts.push_back(pMBLayout);
}
// Filling in the initial set of sequences

View File

@ -116,9 +116,7 @@ bool ReaderShim<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
// If not we should inject the IMemoryProvider per stream.
int deviceId = matrices.begin()->second.matrix->GetDeviceId();
for (auto mx : matrices)
{
assert(mx.second.matrix->GetDeviceId() == deviceId), UNUSED(deviceId);
}
assert(m_prefetchTask.valid());
@ -133,6 +131,7 @@ bool ReaderShim<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
}
// Reset stale mb layouts.
// BUGBUG: This seems incorrect. (1) layouts should all be updated below, and (2) some of these layouts are the same, we are resetting them twice.
for (const auto& iter : matrices)
{
iter.second.pMBLayout->Init(1, 0);
@ -149,12 +148,12 @@ bool ReaderShim<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
if (m_nameToStreamId.find(mx.first) == m_nameToStreamId.end())
{
string inputNames = EnumerateInputs(m_nameToStreamId);
RuntimeError("Could not map input '%ls' to the reader. Reader outputs only [%s].",
RuntimeError("Could not map input '%ls' to the reader. Reader outputs only [%s].",
mx.first.c_str(), inputNames.c_str());
}
size_t streamId = m_nameToStreamId[mx.first];
const auto& stream = minibatch.m_data[streamId];
m_numParallelSequences = stream->m_layout->GetNumParallelSequences();
@ -176,7 +175,7 @@ bool ReaderShim<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
RuntimeError("Dynamic axis layout '%ls' is shared between inputs '%ls' and '%ls', but layouts generated "
"from the input data are incompatible on this axis. Are you using different sequence lengths? "
"Did you consider adding a DynamicAxis() to the Input nodes?",
layout->GetAxisName().c_str(), layoutToInputMap[layout->GetAxisName()].c_str(), mx.first.c_str());
layout->GetAxisName(), layoutToInputMap[layout->GetAxisName()].c_str(), mx.first.c_str());
}
size_t sampleSize = m_streams[streamId]->m_sampleLayout->GetNumElements();
@ -217,7 +216,7 @@ void ReaderShim<ElemType>::FillMatrixFromStream(StorageType type, Matrix<ElemTyp
IndexType* columns = reinterpret_cast<IndexType*>(rows + nnzCount);
matrix->SetMatrixFromCSCFormat(columns, rows, values, nnzCount, numRows, numCols);
}
else
else
{
RuntimeError("Storage type %d is not supported.", (int)type);
}

View File

@ -114,7 +114,7 @@ MBLayoutPtr SequencePacker::PackDenseStream(const StreamBatch& batch, size_t str
assert(sampleOffset == sampleIndex * sampleSize);
PackDenseSample(destination, sequence, sampleOffset, sampleSize);
sampleOffset += sampleSize;
}
}
else if (stream->m_storageType == StorageType::sparse_csc)
{
// TODO: make type casts members of the SparseSequenceData

View File

@ -59,7 +59,10 @@ class SparsePCReader : public DataReaderBase
public:
SparsePCReader()
: m_pMBLayout(make_shared<MBLayout>()){};
: m_pMBLayout(make_shared<MBLayout>())
{
m_pMBLayout->SetUniqueAxisName(L"SparsePCReader");
};
virtual ~SparsePCReader();
virtual void Destroy();
template <class ConfigRecordType>

View File

@ -135,6 +135,7 @@ public:
UCIFastReader()
{
m_pMBLayout = make_shared<MBLayout>();
m_pMBLayout->SetUniqueAxisName(L"UCIFastReader");
}
virtual ~UCIFastReader();

109  Source/SGDLib/Criterion.h (new file)
View File

@ -0,0 +1,109 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// Criterion.h -- helper classes for accumulating criteria
#pragma once
#include "Basics.h"
#include "Matrix.h"
#include <memory> // for pair
#include <limits> // for isnan() and numeric_limits --TODO: is that the right header?
namespace Microsoft { namespace MSR { namespace CNTK {
// helper class for passing accumulated epoch-level criteria around while retaining their sample counts
// Criteria are represented as a tuple (aggregate criterion, sample count). The average criterion value is their ratio.
struct EpochCriterion : public std::pair<double, size_t>
{
// construction
explicit EpochCriterion(double aggregateCriterionValue = 0.0, size_t aggregateSampleCount = 0) : std::pair<double, size_t>(aggregateCriterionValue, aggregateSampleCount) { }
EpochCriterion(const std::pair<double, size_t>& other) : std::pair<double, size_t>(other) { }
// main way of reading this out: compute the actual average criterion value from the aggregate and sample count
double Average() const { return second > 0 ? first / second : 0.0; } // compute the epoch-average
// a few more handy operations that occurred multiple times
bool IsNan() const { return std::isnan(first); }
EpochCriterion operator-(const EpochCriterion& other) const { return EpochCriterion(first - other.first, second - other.second); }
void operator+=(const EpochCriterion& other) { first += other.first; second += other.second; }
static EpochCriterion Infinity() { return EpochCriterion(std::numeric_limits<double>::infinity()); }
bool IsInfinity() const { return first == std::numeric_limits<double>::infinity(); }
};
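// Editor's note (not part of the commit): a small usage sketch of EpochCriterion, assuming the
// definitions in this file are in scope; it shows how per-minibatch (aggregate, count) pairs
// combine into an epoch average, and how the difference of two values gives the criterion
// accumulated between two logging points.
#if 0 // illustration only
EpochCriterion epochCE;                            // starts as (0, 0)
epochCE += EpochCriterion(123.4, 256);             // minibatch 1: summed criterion over 256 samples
epochCE += EpochCriterion( 98.7, 192);             // minibatch 2
double avgPerSample = epochCE.Average();           // (123.4 + 98.7) / (256 + 192)
EpochCriterion sinceLastLog = epochCE - EpochCriterion(123.4, 256); // progress since the first log point
#endif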
// We accumulate criteria in this struct.
// Criteria are accumulated together with their counts (counts depend on sequence lengths, and different criteria may have different sequence lengths).
template <class ElemType>
struct CriterionAccumulator
{
// constructor
CriterionAccumulator(size_t numCriteria, DEVICEID_TYPE deviceId) :
m_aggregateCriterionValues(1, numCriteria, deviceId)
{
m_aggregateCriterionValues.SetValue(0);
m_aggregateSampleCounts.assign(numCriteria, 0);
}
// 'i' is the index of the element we add into (multiple eval criteria share the same matrix object)
// Use 'reset=true' to not accumulate but overwrite.
const CriterionAccumulator& Add(const std::vector<ComputationNodeBasePtr>& nodes, size_t i, size_t legacyNumSamples)
{
return Accumulate</*reset=*/false>(nodes, i, legacyNumSamples);
}
const CriterionAccumulator& Assign(const std::vector<ComputationNodeBasePtr>& nodes, size_t i, size_t legacyNumSamples)
{
return Accumulate</*reset=*/true>(nodes, i, legacyNumSamples);
}
// retrieve an accumulated result as a pair (numerator, denominator)
EpochCriterion GetCriterion(size_t i) const
{
// BUGBUG: For unknown reasons, this (or the other below) check makes a difference for MPI configs.
// If it is left out, then training and test configs end up being scaled by the same factor close to 1.
if (m_aggregateSampleCounts[i] == 0)
return EpochCriterion(0, 0); // avoid unnecessary GPU access
else
return EpochCriterion(m_aggregateCriterionValues(0, i), m_aggregateSampleCounts[i]);
}
private:
// shared part of Add() and Assign()
// This code assumes that if number of samples is 0, the criterion value is also 0 and does not need to be fetched from the GPU.
template<bool reset>
const CriterionAccumulator& Accumulate(const std::vector<ComputationNodeBasePtr>& nodes, size_t i, size_t legacyNumSamples)
{
const auto& node = nodes[i]; // multiple nodes are managed by this struct
float beta = reset ? 0 : 1;
// Note: A future change will be that criterion nodes emit criteria per frame.
// In that case, we will do masking and an implicit reduction right here using TensorView.
size_t numSamples = GetNumSamples(nodes[i], legacyNumSamples);
// temp solution until we add TensorView reduction
if (beta == 0)
{
Matrix<ElemType>::AssignElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value(),
0, 0, m_aggregateCriterionValues, 0, i);
m_aggregateSampleCounts[i] = numSamples;
}
else if (numSamples > 0) // avoid unnecessary GPU access
{
Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value(),
0, 0, m_aggregateCriterionValues, 0, i);
m_aggregateSampleCounts[i] += numSamples;
}
return *this;
}
// get the number of samples
static size_t GetNumSamples(const ComputationNodeBasePtr& node, size_t legacyNumSamples)
{
if (node->HasMBLayout())
return node->GetMBLayout()->GetActualNumSamples();
else
return legacyNumSamples;
}
private:
Matrix<ElemType> m_aggregateCriterionValues; // [1 x N]
vector<size_t> m_aggregateSampleCounts; // [N]
};
}}}
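// Editor's note (not part of the commit): a standalone sketch of the Assign()/Add() semantics
// of CriterionAccumulator above, using plain doubles in place of the GPU-side matrix
// (hypothetical toy type, not CNTK code): Assign overwrites the (value, count) slot for
// criterion i, Add accumulates into it, and a zero sample count skips the update entirely.
#include <cstdio>
#include <utility>
#include <vector>

struct ToyCriterionAccumulator
{
    std::vector<std::pair<double, size_t>> slots; // (aggregate criterion value, sample count) per criterion
    explicit ToyCriterionAccumulator(size_t numCriteria) : slots(numCriteria, { 0.0, 0 }) {}
    void Assign(size_t i, double value, size_t numSamples) { slots[i] = { value, numSamples }; }
    void Add(size_t i, double value, size_t numSamples)
    {
        if (numSamples == 0)
            return; // nothing to accumulate; mirrors the "avoid unnecessary GPU access" check
        slots[i].first += value;
        slots[i].second += numSamples;
    }
    double Average(size_t i) const { return slots[i].second ? slots[i].first / slots[i].second : 0.0; }
};

int main()
{
    ToyCriterionAccumulator acc(/*numCriteria=*/2);
    acc.Assign(0, 12.5, 100); // like Accumulate</*reset=*/true>
    acc.Add(0, 11.0, 100);    // like Accumulate</*reset=*/false>
    std::printf("criterion 0: %.4f per sample\n", acc.Average(0)); // (12.5 + 11.0) / 200
}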

View File

@ -70,7 +70,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Verify that there's indeed a single layout
for (const auto& iter : inputMatrices)
{
assert(iter.second.pMBLayout == pMBLayout);
assert(iter.second.pMBLayout == pMBLayout);
// TODO: This must be a runtime check, not an assert().
UNUSED(iter);
}
@ -105,8 +106,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
static pair<size_t, size_t> DecimateMinibatch(const StreamMinibatchInputs& MB, // input matrices
StreamMinibatchInputs& decimatedMB, // output decimated matrices.
MBLayoutPtr pMBLayout, // input MBLayout
MBLayoutPtr& pDecimateMBLayout, // output decimated MBLayout (note: cannot work in-place)
MBLayoutPtr pMBLayout, // input MBLayout
MBLayoutPtr& pDecimateMBLayout, // output decimated MBLayout (note: cannot work in-place)
size_t numProcs, size_t rank)
{
size_t numParallelSequences = pMBLayout->GetNumParallelSequences();
@ -148,6 +149,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
// decimate MBLayout as well
pDecimateMBLayout = make_shared<MBLayout>(numNewParallelSequence, nT, L"");
pDecimateMBLayout->SetAxisName(pMBLayout->GetAxisName());
#if 1
// now copy over all sequence info records that are inside the range, with adjusted 's'
const auto& sequences = pMBLayout->GetAllSequences();
@ -181,17 +183,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// no need to do inplace decimation if numproc == 1
// allocate space for non-inplace decimation
MBLayoutPtr pDecimatedMB = make_shared<MBLayout>();
MBLayoutPtr pDecimatedMBLayout = make_shared<MBLayout>();
pDecimatedMBLayout->SetAxisName(pMBLayout->GetAxisName());
StreamMinibatchInputs decimatedMB;
// call in-place decimation
pair<size_t, size_t> selected = DecimateMinibatch<ElemType>(mb, decimatedMB, pMBLayout, pDecimatedMB, numprocs, rank);
pair<size_t, size_t> selected = DecimateMinibatch<ElemType>(mb, decimatedMB, pMBLayout, pDecimatedMBLayout, numprocs, rank);
// move the data
for (auto k : mb)
{
const auto& name = k.first;
mb.GetInputMatrix<ElemType>(name).SetValue(decimatedMB.GetInputMatrix<ElemType>(name)); // deep-copy our local one to the output location
}
pMBLayout->MoveFrom(pDecimatedMB);
pMBLayout->MoveFrom(pDecimatedMBLayout);
return selected;
}
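// Editor's note (not part of the commit): data-parallel decimation above keeps, on each MPI
// rank, a subset of the parallel sequences of the minibatch. A standalone sketch of that kind
// of arithmetic (hypothetical helper; the exact rounding CNTK uses may differ):
#include <algorithm>
#include <cstdio>
#include <utility>

std::pair<size_t, size_t> SequenceRangeForRank(size_t numParallelSequences, size_t numProcs, size_t rank)
{
    size_t perRank = (numParallelSequences + numProcs - 1) / numProcs; // ceiling division
    size_t begin = std::min(rank * perRank, numParallelSequences);
    size_t end   = std::min(begin + perRank, numParallelSequences);
    return { begin, end }; // this rank keeps parallel sequences [begin, end)
}

int main()
{
    for (size_t rank = 0; rank < 3; rank++)
    {
        auto range = SequenceRangeForRank(/*numParallelSequences=*/8, /*numProcs=*/3, rank);
        std::printf("rank %zu keeps parallel sequences [%zu, %zu)\n", rank, range.first, range.second);
    }
}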
@ -353,7 +356,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
// for sequence training
if (criterionNodes[0]->OperationName() == L"SequenceWithSoftmax")
if (!criterionNodes.empty() && criterionNodes[0]->OperationName() == L"SequenceWithSoftmax")
{
auto node = dynamic_pointer_cast<SequenceWithSoftmaxNode<ElemType>>(criterionNodes[0]);
assert(node);
@ -379,7 +382,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t requestedSubminibatches)
{
// first, remember interface to the net
// BUGBUG: This will no longer be correct once we have multiple input layouts.
// BUGBUG (Issue #95): This will no longer be correct once we have multiple input layouts.
m_netMBLayoutPtr = net.GetMBLayoutPtrOfNetwork();
m_netInputMatrixPtr = inputMatrices;
@ -539,18 +542,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
shared_ptr<ComputationNode<ElemType>> pNode = m_LearnableNodePtr[nodename];
m_cachedGradient.GetInputMatrix<ElemType>(nodename) += pNode->Gradient();
pNode->Gradient().SetValue((ElemType) 0);
pNode->Gradient().SetValue(0);
}
// accumulate criterion value
Matrix<ElemType>::AddElementToElement(m_netCriterionNodes[0]->Value(), 0, 0,
*m_netCriterionAccumulator, 0, 0);
m_netCriterionNodes[0]->Value().SetValue((ElemType) 0);
if (!m_netCriterionNodes.empty())
{
Matrix<ElemType>::AddElementToElement(m_netCriterionNodes[0]->Value(), 0, 0,
*m_netCriterionAccumulator, 0, 0);
m_netCriterionNodes[0]->Value().SetValue(0);
}
// accumulate evaluation value
for (size_t i = 0; i < m_netEvaluationNodes.size(); i++)
{
Matrix<ElemType>::AddElementToElement(m_netEvaluationNodes[i]->Value(), 0, 0,
*m_netEvaluationAccumulator, 0, i);
m_netEvaluationNodes[i]->Value().SetValue((ElemType) 0);
m_netEvaluationNodes[i]->Value().SetValue(0);
}
// Export node state
@ -576,10 +582,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// also revert net.m_MBLayoutPtr
m_netMBLayoutPtr->CopyFrom(m_MBLayoutCache);
// m_netCriterionNodes[0]->Value().SetValue((ElemType)0);
Matrix<ElemType>::AddElementToElement(*m_netCriterionAccumulator, 0, 0,
m_netCriterionNodes[0]->Value(), 0, 0);
m_netCriterionAccumulator->SetValue((ElemType) 0);
if (!m_netCriterionNodes.empty())
{
// m_netCriterionNodes[0]->Value().SetValue((ElemType)0);
Matrix<ElemType>::AddElementToElement(*m_netCriterionAccumulator, 0, 0,
m_netCriterionNodes[0]->Value(), 0, 0);
}
m_netCriterionAccumulator->SetValue(0);
for (size_t i = 0; i < m_netEvaluationNodes.size(); i++)
{
@ -587,7 +596,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType>::AddElementToElement(*m_netEvaluationAccumulator, 0, i,
m_netEvaluationNodes[i]->Value(), 0, 0);
}
m_netEvaluationAccumulator->SetValue((ElemType) 0);
m_netEvaluationAccumulator->SetValue(0);
}
};
};

View File

@ -6,12 +6,12 @@ struct DistGradHeader
{
public:
size_t numSamples;
size_t numSamplesWithLabel;
size_t numSamplesWithLabel; // this is the denominator for 'criterion'
double criterion;
// variable-size array
int numEvalNode;
double evalErrors[1];
pair<double,size_t> evalErrors[1];
static DistGradHeader* Create(int numEvalNode)
{
@ -41,7 +41,8 @@ public:
criterion += other->criterion;
for (int i = 0; i < numEvalNode; i++)
{
evalErrors[i] += other->evalErrors[i];
evalErrors[i].first += other->evalErrors[i].first; // numer
evalErrors[i].second += other->evalErrors[i].second; // denom
}
}
}
@ -58,7 +59,8 @@ public:
criterion = 0;
for (int i = 0; i < numEvalNode; i++)
{
evalErrors[i] = 0;
evalErrors[i].first = 0;
evalErrors[i].second = 0;
}
}
@ -77,17 +79,19 @@ public:
}
private:
static size_t DistGradHeaderSize(size_t nEvalNode)
static size_t DistGradHeaderSize(size_t nEvalNodes)
{
return sizeof(DistGradHeader) + (sizeof(double) * (nEvalNode - 1));
// BUGBUG: Should be sizeof(evalErrors[0]), but the compiler won't let me. This is only correct because evalErrors has 1 element.
return sizeof(DistGradHeader) + (sizeof(decltype(evalErrors)) * (nEvalNodes - 1));
}
// Disallow construction and destruction since this type contains a variable sized array member
// and hence must be constructed through the create and destroy functions
DistGradHeader() = delete;
DistGradHeader() = delete;
~DistGradHeader() = delete;
// Disallow copy and move construction/assignment
DISABLE_COPY_AND_MOVE(DistGradHeader);
};
} } }
}}}
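// Editor's note (not part of the commit): evalErrors[] entries are now (numerator, denominator)
// pairs instead of plain doubles, so each evaluation criterion carries its own sample count
// through aggregation (different criteria can have different counts, e.g. per-word vs.
// per-sequence, as noted in Criterion.h). A standalone sketch of the component-wise
// aggregation (not the CNTK code):
#include <cstdio>
#include <utility>
#include <vector>

int main()
{
    // headers from two workers, each with two eval criteria: (summed value, sample count)
    using Criterion = std::pair<double, size_t>;
    std::vector<std::vector<Criterion>> workers = {
        { { 230.0, 100 }, { 3.0, 10 } }, // worker 0: CE over 100 words, errors over 10 sequences
        { { 115.0,  50 }, { 1.0,  5 } }, // worker 1
    };
    std::vector<Criterion> total(2, { 0.0, 0 });
    for (const auto& w : workers)
        for (size_t i = 0; i < total.size(); i++)
        {
            total[i].first  += w[i].first;  // numer
            total[i].second += w[i].second; // denom
        }
    for (size_t i = 0; i < total.size(); i++)
        std::printf("criterion %zu: %.4f per sample\n", i, total[i].first / total[i].second);
}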

Diff not shown because of its large size.

View File

@ -9,6 +9,7 @@
#include "SimpleEvaluator.h"
#include "DataReader.h"
#include "ScriptableObjects.h"
#include "Criterion.h"
#include <vector>
#include <string>
#include <stdexcept>
@ -230,7 +231,8 @@ protected:
GradientUpdateInfo m_gradType;
RMSPropInfo m_rpi;
int m_numMBsToShowResult;
size_t m_numMBsToShowResult = 0;
size_t m_firstMBsToShowResult = 0;
int m_numMBsToCUDAProfile;
bool m_doGradientCheck;
@ -398,9 +400,8 @@ protected:
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients,
/*out*/ double& epochCriterion,
/*out*/ std::vector<double>& epochEvalErrors,
/*out*/ size_t& totalSamplesSeen,
/*out*/ EpochCriterion& epochCriterion,
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
std::string prefixMsg = "");
size_t AdaptiveMinibatchSizing(ComputationNetworkPtr net,
@ -463,10 +464,9 @@ protected:
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients,
/*out*/ double& epochCriterion,
/*out*/ std::vector<double>& epochEvalErrors,
/*out*/ size_t& totalSamplesSeen,
std::string prefixMsg = "");
/*out*/ EpochCriterion& epochCriterion,
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
const std::string& prefixMsg = "");
void InitDistGradAgg(int numEvalNodes, int traceLevel);
void InitModelAggregationHandler(int traceLevel);
@ -496,13 +496,19 @@ protected:
void ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const;
void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen,
void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen, // TODO: combine totalSamplesSeen and prevCriterion into a EpochCriterion type
const double learnRatePerSample,
const std::list<Matrix<ElemType>>& smoothedGradients,
const double prevCriterion,
const size_t minibatchSize);
bool LoadCheckPointInfo(const size_t epochNumber,
bool TryLoadCheckPointInfo(const size_t epochNumber,
/*out*/ size_t& totalSamplesSeen,
/*out*/ double& learnRatePerSample,
std::list<Matrix<ElemType>>& smoothedGradients,
/*out*/ double& prevCriterion,
/*out*/ size_t& minibatchSize);
void LoadCheckPointInfo(const size_t epochNumber,
/*out*/ size_t& totalSamplesSeen,
/*out*/ double& learnRatePerSample,
std::list<Matrix<ElemType>>& smoothedGradients,
@ -533,17 +539,17 @@ public:
int npos);
protected:
wstring m_modelPath;
std::wstring m_modelPath;
bool m_keepCheckPointFiles;
// bool m_validateAfterModelReloading; // TODO: remove this. Why would one not validate a model?
wstring m_trainCriterionNodeName;
wstring m_evalCriterionNodeName;
std::wstring m_trainCriterionNodeName;
std::wstring m_evalCriterionNodeName;
// enable tracing. Nodes listed here get their m_traceNodeValueXXX flags set
vector<wstring> m_traceNodeNamesReal;
vector<wstring> m_traceNodeNamesCategory;
vector<wstring> m_traceNodeNamesSparse;
std::vector<std::wstring> m_traceNodeNamesReal;
std::vector<std::wstring> m_traceNodeNamesCategory;
std::vector<std::wstring> m_traceNodeNamesSparse;
size_t m_prevChosenMinibatchSize;
double m_lastFinishedEpochTrainLoss;

View File

@ -164,6 +164,7 @@
<ClInclude Include="..\ComputationNetworkLib\ComputationNetwork.h" />
<ClInclude Include="..\ComputationNetworkLib\ComputationNode.h" />
<ClInclude Include="..\ComputationNetworkLib\ConvolutionalNodes.h" />
<ClInclude Include="Criterion.h" />
<ClInclude Include="DataReaderHelpers.h" />
<ClInclude Include="DistGradHeader.h" />
<ClInclude Include="IDistGradAggregator.h" />

View File

@ -147,6 +147,9 @@
<ClInclude Include="MASGD.h">
<Filter>Parallelization</Filter>
</ClInclude>
<ClInclude Include="Criterion.h">
<Filter>SGD</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Common">

View File

@ -224,7 +224,7 @@ private:
assert(headerCPU->criterion == 0);
for (int i = 0; i < headerCPU->numEvalNode; ++i)
{
assert(headerCPU->evalErrors[i] == 0);
assert(headerCPU->evalErrors[i].first == 0);
}
// If the current node did not process any samples, the gradients should be zero'd

View File

@ -14,6 +14,7 @@
#include "DistGradHeader.h"
#include "IDistGradAggregator.h"
#include "SimpleDistGradAggregator.h"
#include "Criterion.h"
#include <vector>
#include <string>
@ -31,10 +32,11 @@ template <class ElemType>
class SimpleEvaluator
{
public:
SimpleEvaluator(ComputationNetworkPtr net, const MPIWrapperPtr& mpi, const size_t numMBsToShowResult = 100, const int traceLevel = 0, const size_t maxSamplesInRAM = SIZE_MAX,
SimpleEvaluator(ComputationNetworkPtr net, const MPIWrapperPtr& mpi, const size_t numMBsToShowResult = 100, const size_t firstMBsToShowResult = 0, const int traceLevel = 0, const size_t maxSamplesInRAM = SIZE_MAX,
const size_t numSubminiBatches = 1)
: m_net(net),
m_numMBsToShowResult(numMBsToShowResult),
m_firstMBsToShowResult(firstMBsToShowResult),
m_traceLevel(traceLevel),
m_maxSamplesInRAM(maxSamplesInRAM),
m_numSubminiBatches(numSubminiBatches),
@ -45,7 +47,7 @@ public:
}
// returns evaluation node values per sample determined by evalNodeNames (which can include both training and eval criterion nodes)
vector<double> Evaluate(IDataReader* dataReader, const vector<wstring>& evalNodeNames, const size_t mbSize, const size_t testSize = requestDataSize)
vector<EpochCriterion> Evaluate(IDataReader* dataReader, const vector<wstring>& evalNodeNames, const size_t mbSize, const size_t testSize = requestDataSize)
{
ScopedNetworkOperationMode modeGuard(m_net, NetworkOperationMode::inferring);
@ -81,9 +83,7 @@ public:
}
// initialize eval results
std::vector<double> evalResults;
for (int i = 0; i < evalNodes.size(); i++)
evalResults.push_back((double) 0);
std::vector<EpochCriterion> evalResults(evalNodes.size(), EpochCriterion(0));
// allocate memory for forward computation
m_net->AllocateAllMatrices(evalNodes, {}, nullptr);
@ -102,12 +102,10 @@ public:
size_t totalEpochSamples = 0;
size_t numMBsRun = 0;
size_t actualMBSize = 0;
size_t numSamplesLastMBs = 0;
size_t lastMBsRun = 0; // MBs run before this display
size_t numSamplesLastLogged = 0;
size_t numMBsRunLastLogged = 0; // MBs run before this display
std::vector<double> evalResultsLastMBs;
for (int i = 0; i < evalResults.size(); i++)
evalResultsLastMBs.push_back((ElemType) 0);
std::vector<EpochCriterion> evalResultsLastLogged(evalResults.size(), EpochCriterion(0));
//TODO: we should add support for distributed reading
dataReader->StartMinibatchLoop(mbSize, 0, testSize);
@ -123,6 +121,8 @@ public:
if (numSubminibatchesNeeded > 1)
smbDispatcher.Init(m_net, learnableNodes, criterionNodes, evalNodes);
CriterionAccumulator<ElemType> localEpochEvalErrors(evalNodes.size(), m_net->GetDeviceId());
const size_t numIterationsBeforePrintingProgress = 100;
size_t numItersSinceLastPrintOfProgress = 0;
while (DataReaderHelpers::GetMinibatchIntoNetwork<ElemType>(*dataReader, m_net, nullptr, dataReader->SupportsDistributedMBRead(), m_mpi != nullptr, inputMatrices, actualMBSize, m_mpi))
@ -162,9 +162,9 @@ public:
m_gradHeader->numEvalNode = evalNodes.size();
m_gradHeader->numSamples = actualMBSize;
m_gradHeader->numSamplesWithLabel = numSamplesWithLabel;
m_gradHeader->criterion = 0.0;
m_gradHeader->criterion = 0.0; // (not used here)
for (size_t i = 0; i < evalNodes.size(); i++)
m_gradHeader->evalErrors[i] = evalNodes[i]->Get00Element();
m_gradHeader->evalErrors[i] = localEpochEvalErrors.Assign(evalNodes, i, numSamplesWithLabel).GetCriterion(i);
// TODO: We are reusing the aggregation logic inside SimpleDistGradAggregator, which has a heavy dependency
// on the gradient matrix. At some point we should refactor the aggregator class to be able to only calculating
@ -185,9 +185,7 @@ public:
else
{
for (int i = 0; i < evalNodes.size(); i++)
{
evalResults[i] += (double)evalNodes[i]->Get00Element(); // criterionNode should be a scalar
}
evalResults[i] += localEpochEvalErrors.Assign(evalNodes, i, numSamplesWithLabel).GetCriterion(i);
}
totalEpochSamples += aggregateNumSamplesWithLabel;
@ -195,22 +193,19 @@ public:
if (m_traceLevel > 0)
{
numSamplesLastMBs += aggregateNumSamplesWithLabel;
numSamplesLastLogged += aggregateNumSamplesWithLabel;
if (numMBsRun % m_numMBsToShowResult == 0)
if (numMBsRun <= m_firstMBsToShowResult || (m_numMBsToShowResult && (numMBsRun % m_numMBsToShowResult == 0)))
{
DisplayEvalStatistics(lastMBsRun + 1, numMBsRun, numSamplesLastMBs, evalNodes, evalResults, evalResultsLastMBs);
DisplayEvalStatistics(numMBsRunLastLogged + 1, numMBsRun, numSamplesLastLogged, evalNodes, evalResults, evalResultsLastLogged);
for (int i = 0; i < evalResults.size(); i++)
{
evalResultsLastMBs[i] = evalResults[i];
}
numSamplesLastMBs = 0;
lastMBsRun = numMBsRun;
evalResultsLastLogged[i] = evalResults[i];
numSamplesLastLogged = 0;
numMBsRunLastLogged = numMBsRun;
}
}
numItersSinceLastPrintOfProgress = ProgressTracing::TraceFakeProgress(numIterationsBeforePrintingProgress, numItersSinceLastPrintOfProgress);
// call DataEnd to check if end of sentence is reached
@ -219,47 +214,37 @@ public:
}
// show last batch of results
if (m_traceLevel > 0 && numSamplesLastMBs > 0)
if (m_traceLevel > 0 && numSamplesLastLogged > 0)
{
DisplayEvalStatistics(lastMBsRun + 1, numMBsRun, numSamplesLastMBs, evalNodes, evalResults, evalResultsLastMBs);
DisplayEvalStatistics(numMBsRunLastLogged + 1, numMBsRun, numSamplesLastLogged, evalNodes, evalResults, evalResultsLastLogged);
}
// final statistics
for (int i = 0; i < evalResultsLastMBs.size(); i++)
evalResultsLastMBs[i] = 0; // clear this since statistics display will subtract the previous value
for (int i = 0; i < evalResultsLastLogged.size(); i++)
evalResultsLastLogged[i] = EpochCriterion(0); // clear this since statistics display will subtract the previous value
fprintf(stderr, "Final Results: ");
DisplayEvalStatistics(1, numMBsRun, totalEpochSamples, evalNodes, evalResults, evalResultsLastMBs, true);
for (int i = 0; i < evalResults.size(); i++)
{
evalResults[i] /= totalEpochSamples;
}
DisplayEvalStatistics(1, numMBsRun, totalEpochSamples, evalNodes, evalResults, evalResultsLastLogged, true);
return evalResults;
}
protected:
void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastMBs,
void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastLogged,
const vector<ComputationNodeBasePtr>& evalNodes,
const double evalResults, const double evalResultsLastMBs, bool displayConvertedValue = false)
const EpochCriterion evalResults, const EpochCriterion evalResultsLastLogged, bool displayConvertedValue = false)
{
vector<double> evaR;
evaR.push_back(evalResults);
vector<double> evaLast;
evaLast.push_back(evalResultsLastMBs);
DisplayEvalStatistics(startMBNum, endMBNum, numSamplesLastMBs, evalNodes, evaR, evaLast, displayConvertedValue);
DisplayEvalStatistics(startMBNum, endMBNum, numSamplesLastLogged, evalNodes, { evalResults }, { evalResultsLastLogged }, displayConvertedValue);
}
void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastMBs, const vector<ComputationNodeBasePtr>& evalNodes,
const vector<double>& evalResults, const vector<double>& evalResultsLastMBs, bool displayConvertedValue = false)
void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastLogged, const vector<ComputationNodeBasePtr>& evalNodes,
const vector<EpochCriterion>& evalResults, const vector<EpochCriterion>& evalResultsLastLogged, bool displayConvertedValue = false)
{
fprintf(stderr, "Minibatch[%lu-%lu]: SamplesSeen = %lu ", startMBNum, endMBNum, numSamplesLastMBs);
fprintf(stderr, "Minibatch[%lu-%lu]: SamplesSeen = %lu ", startMBNum, endMBNum, numSamplesLastLogged);
for (size_t i = 0; i < evalResults.size(); i++)
{
double eresult = (evalResults[i] - evalResultsLastMBs[i]) / numSamplesLastMBs;
double eresult = (evalResults[i] - evalResultsLastLogged[i]).Average(); // / numSamplesLastLogged;
fprintf(stderr, "%ls: %ls/Sample = %.8g ", evalNodes[i]->NodeName().c_str(), evalNodes[i]->OperationName().c_str(), eresult);
if (displayConvertedValue)
@ -279,6 +264,7 @@ protected:
protected:
ComputationNetworkPtr m_net;
size_t m_numMBsToShowResult;
size_t m_firstMBsToShowResult;
size_t m_maxSamplesInRAM;
size_t m_numSubminiBatches;
MPIWrapperPtr m_mpi;
@ -288,4 +274,5 @@ protected:
int m_traceLevel;
void operator=(const SimpleEvaluator&); // (not assignable)
};
} } }
}}}
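// Editor's note (not part of the commit): a standalone sketch of the logging cadence introduced
// above (hypothetical names, not the CNTK code): the first firstMBsToShowResult minibatches are
// each logged, after that one line per numMBsToShowResult minibatches, and the value printed is
// the average of what accumulated since the previous log line (difference of two running
// (value, count) pairs, as EpochCriterion::Average() does).
#include <cstdio>
#include <utility>

bool ShouldLog(size_t numMBsRun, size_t firstMBsToShowResult, size_t numMBsToShowResult)
{
    return numMBsRun <= firstMBsToShowResult ||
           (numMBsToShowResult != 0 && numMBsRun % numMBsToShowResult == 0);
}

int main()
{
    std::pair<double, size_t> running = { 0.0, 0 }, lastLogged = { 0.0, 0 };
    size_t mbOfLastLog = 0;
    for (size_t mb = 1; mb <= 12; mb++)
    {
        running.first += 0.5;  // pretend per-minibatch criterion sum
        running.second += 32;  // pretend samples per minibatch
        if (ShouldLog(mb, /*firstMBsToShowResult=*/2, /*numMBsToShowResult=*/5))
        {
            double avg = (running.first - lastLogged.first) / (running.second - lastLogged.second);
            std::printf("Minibatch[%zu-%zu]: criterion/sample = %.4f\n", mbOfLastLog + 1, mb, avg);
            lastLogged = running;
            mbOfLastLog = mb;
        }
    }
}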

0  Tests/EndToEndTests/SLU/run-test (mode changed: executable file → normal file)
View File