From e8d2498c3e39cf7d7cea16852e469d06839e3ac1 Mon Sep 17 00:00:00 2001 From: Alexey Orlov Date: Tue, 22 Nov 2016 17:25:52 +0100 Subject: [PATCH 01/25] Main ReadMe, November 21, 2016 --- README.md | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/README.md b/README.md index 335cea48d..b3bb041e3 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Highlights of this Release: * New ASGD/Hogwild! training using Microsoft’s Parameter Server ([Project Multiverso](https://github.com/Microsoft/multiverso)) * Distributed Scenarios now supported in CNTK Python API -* New Memory compression -- ability to trade off memory usage with compute. +* Introducing of [Memory compression](https://github.com/Microsoft/CNTK/wiki/Top-level-configurations#hypercompressmemory) optimizing memory usage, especially for GPU computation * CNTK Docker image with 1bit-SGD support * Stability Improvements and bug fixes @@ -55,18 +55,6 @@ If you ARE using Model Evaluation Library we **strongly recommend** installing v See [Release Notes](https://github.com/Microsoft/CNTk/wiki/CNTK_1_7_2_Release_Notes) for details. -*2016-09-28.* V 1.7.1 Binary release -Highlights of this Release: -* Two Breaking Changes related to Layers library default initialization and ```fsAdagrad``` gradient-normalization scheme -* Improvements in BrainScript -* Enabling of Deterministic Algorithm enforcement -* Improvements in Model Evaluation including the support of Evaluation for Azure Applications -* Different Performance improvements -* Multiple bug fixes - -See more in the [Release Notes](https://github.com/Microsoft/CNTK/wiki/CNTK_1_7_1_Release_Notes) (including the full list of bugs fixed) -Get the Release from the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases) - See [all news](https://github.com/Microsoft/CNTK/wiki/News). # What is CNTK From 3ec7230bb73dac11598b2664e7d472f8a6e78dc0 Mon Sep 17 00:00:00 2001 From: Cha Zhang Date: Wed, 16 Nov 2016 15:55:30 -0800 Subject: [PATCH 02/25] Add BrainScript implementation of LRN ConvNet and AlexNet. --- .../AlexNet/BrainScript/AlexNet_ImageNet.cntk | 181 ++++++++++++++++++ .../ConvNetLRN_CIFAR10_DataAug.cntk | 130 +++++++++++++ .../BrainScript/ConvNet_CIFAR10_DataAug.cntk | 4 +- .../ConvNet/BrainScript/README.md | 10 +- 4 files changed, 322 insertions(+), 3 deletions(-) create mode 100644 Examples/Image/Classification/AlexNet/BrainScript/AlexNet_ImageNet.cntk create mode 100644 Examples/Image/Classification/ConvNet/BrainScript/ConvNetLRN_CIFAR10_DataAug.cntk diff --git a/Examples/Image/Classification/AlexNet/BrainScript/AlexNet_ImageNet.cntk b/Examples/Image/Classification/AlexNet/BrainScript/AlexNet_ImageNet.cntk new file mode 100644 index 000000000..c4e734fa3 --- /dev/null +++ b/Examples/Image/Classification/AlexNet/BrainScript/AlexNet_ImageNet.cntk @@ -0,0 +1,181 @@ +# Note: reader configuration comes from AlexNet.cntk or AlexNetComposite.cntk, depending on the test +RootDir = "." 
+ +ConfigDir = "$RootDir$" +DataDir = "$RootDir$" +OutputDir = "$RootDir$/Output" +ModelDir = "$OutputDir$/Models" + +ndlMacros="$ConfigDir$/Macros.ndl" + +precision = "float" +deviceId = "Auto" + +command = Test + +parallelTrain = "true" +traceLevel = 1 +numMBsToShowResult = 500 + +modelPath = "$ModelDir$/AlexNet" +stderr = "$OutputDir$/AlexNet" + +################################ +Train = { + action = "train" + + BrainScriptNetworkBuilder = { + imageShape = 227:227:3 + labelDim = 1000 + + LRN {k, n, alpha, beta} = { + apply (x) = { + x2 = x .* x + x2s = SplitDimension(x2, 3, 1) + W = ParameterTensor{(1:1:2*n+1:1), learningRateMultiplier = 0, initValue = alpha/(2*n+1)} + y = Convolution (W, x2s, (1:1:2*n+1), mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose = false, maxTempMemSizeInSamples = 0) + b = FlattenDimensions(y, 3, 2) + den = Exp (beta .* Log(k + b)) + r = x .* Reciprocal(den) + }.r + }.apply + + model = Sequential ( + ConvolutionalLayer {96, (11:11), stride=(4:4), pad = false} : ReLU : + LRN {1.0, 2, 0.0001, 0.75} + MaxPoolingLayer {(3:3), stride=(2:2)} : + ConvolutionalLayer {256, (5:5), pad = true, initBias=0.1} : ReLU : + LRN {1.0, 2, 0.0001, 0.75} + MaxPoolingLayer {(3:3), stride=(2:2)} : + ConvolutionalLayer {384, (3:3), pad = true} : ReLU : + ConvolutionalLayer {384, (3:3), pad = true, initBias=0.1} : ReLU : + ConvolutionalLayer {256, (3:3), pad = true, initBias=0.1} : ReLU : + MaxPoolingLayer {(3:3), stride=(2:2)} : + DenseLayer {4096, activation=ReLU, initBias=0.1} : Dropout : + DenseLayer {4096, activation=ReLU, initBias=0.1} : Dropout : + LinearLayer {labelDim} + ) + + # inputs + features = Input {imageShape} + featNorm = features - Constant(114) + labels = Input {labelDim} + + # apply model to features + z = model (featNorm) + + # loss and error computation + ce = CrossEntropyWithSoftmax (labels, z) + errs = ClassificationError (labels, z) + top5Errs = ClassificationError (labels, z, topN=5) # only used in Eval action + + # declare special nodes + featureNodes = (features) + labelNodes = (labels) + criterionNodes = (ce) + evaluationNodes = (errs) + outputNodes = (z) + } + + SGD = { + epochSize = 0 + minibatchSize = 128 + learningRatesPerMB = 0.01*20:0.005*12:0.001*28:0.0003 + momentumPerMB = 0.9 + maxEpochs = 90 + gradUpdateType = None + L2RegWeight = 0.0005 + dropoutRate = 0*5:0.5 + + # TODO: try less bits? 
+ ParallelTrain = { + parallelizationMethod = "DataParallelSGD" + distributedMBReading = "true" + parallelizationStartEpoch = 3 + DataParallelSGD = { + gradientBits = 32 + } + } + + numMBsToShowResult = 100 + } + + # Reader + reader = { + verbosity = 0 + randomize = true + randomizationWindow = 1 + + deserializers = ( + { + type = "ImageDeserializer" ; module = "ImageReader" + file = "$DataDir$/train_map.txt" + input = { + features = { + transforms = ( + { + type = "Crop" + cropType = "random" + cropRatio = 0.88671875 + jitterType = "uniRatio" + }:{ + type = "Scale" + width = 227 + height = 227 + channels = 3 + interpolations = "linear" + #}:{ + # type = "Mean" + # meanFile = "$ConfigDir$/ImageNet1K_mean.xml" + }:{ + type = "Transpose" + } + ) + } + labels = { + labelDim = 1000 + } + } + }) + } +} + +################################ +Test = { + action=test + minibatchSize=128 + evalNodeNames = errs:top5Errs # also test top-5 error rate + + # Reader + reader = { + verbosity = 0 + randomize = false + + deserializers = ( + { + type = "ImageDeserializer" ; module = "ImageReader" + file="$DataDir$/val_map.txt" + input = { + features = { + transforms = ( + { + type = "Crop" + cropType = "center" + cropRatio = 0.88671875 + }:{ + type = "Scale" + width = 227 + height = 227 + channels = 3 + }:{ + type = "Transpose" + } + ) + } + labels = { + labelDim = 1000 + } + } + }) + } +} \ No newline at end of file diff --git a/Examples/Image/Classification/ConvNet/BrainScript/ConvNetLRN_CIFAR10_DataAug.cntk b/Examples/Image/Classification/ConvNet/BrainScript/ConvNetLRN_CIFAR10_DataAug.cntk new file mode 100644 index 000000000..9beffba0b --- /dev/null +++ b/Examples/Image/Classification/ConvNet/BrainScript/ConvNetLRN_CIFAR10_DataAug.cntk @@ -0,0 +1,130 @@ +# ConvNet applied on CIFAR-10 dataset, with data augmentation (translation and flipping). + +command = TrainConvNet:Eval + +precision = "float"; traceLevel = 1 ; deviceId = "auto" + +rootDir = "../../.." 
; dataDir = "$rootDir$/DataSets/CIFAR-10" ; +outputDir = "./Output" ; + +modelPath = "$outputDir$/Models/ConvNetLRN_CIFAR10_DataAug" +#stderr = "$outputDir$/ConvNetLRN_CIFAR10_DataAug_bs_out" + +TrainConvNet = { + action = "train" + + BrainScriptNetworkBuilder = { + imageShape = 32:32:3 + labelDim = 10 + + featScale = 1/256 + Normalize{f} = x => f .* x + + # Local Response Normalization + # k : bias + # n : half radius + # alpha: scale factor + # beta: exponent + LRN {k, n, alpha, beta} = { + apply (x) = { + x2 = x .* x + # reshape + x2s = SplitDimension(x2, 3, 1) + # 3D convolution + W = ParameterTensor{(1:1:2*n+1:1), learningRateMultiplier = 0, initValue = alpha/(2*n+1)} + y = Convolution (W, x2s, (1:1:2*n+1), mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose = false, maxTempMemSizeInSamples = 0) + # reshape back + b = FlattenDimensions(y, 3, 2) + den = Exp (beta .* Log(k + b)) + r = x .* Reciprocal(den) + }.r + }.apply + + model = Sequential ( + Normalize {featScale} : + ConvolutionalLayer {64, (3:3), pad = true} : ReLU : + ConvolutionalLayer {64, (3:3), pad = true} : ReLU : + LRN {1.0, 4, 0.001, 0.75} : + MaxPoolingLayer {(3:3), stride = (2:2)} : + ConvolutionalLayer {64, (3:3), pad = true} : ReLU : + ConvolutionalLayer {64, (3:3), pad = true} : ReLU : + LRN {1.0, 4, 0.001, 0.75} : + MaxPoolingLayer {(3:3), stride = (2:2)} : + DenseLayer {256} : ReLU : Dropout : + DenseLayer {128} : ReLU : Dropout : + LinearLayer {labelDim} + ) + + # inputs + features = Input {imageShape} + labels = Input {labelDim} + + # apply model to features + z = model (features) + + # connect to system + ce = CrossEntropyWithSoftmax (labels, z) + errs = ClassificationError (labels, z) + top5Errs = ClassificationError (labels, z, topN=5) # only used in Eval action + + featureNodes = (features) + labelNodes = (labels) + criterionNodes = (ce) + evaluationNodes = (errs) # top5Errs only used in Eval + outputNodes = (z) + } + + SGD = { + epochSize = 0 + minibatchSize = 64 + + learningRatesPerSample = 0.0015625*20:0.00046875*20:0.00015625*20:0.000046875*10:0.000015625 + momentumAsTimeConstant = 0*20:600*20:1200 + maxEpochs = 80 + L2RegWeight = 0.002 + dropoutRate = 0.5 + + numMBsToShowResult = 100 + } + + reader = { + verbosity = 0 ; randomize = true + deserializers = ({ + type = "ImageDeserializer" ; module = "ImageReader" + file = "$dataDir$/train_map.txt" + input = { + features = { transforms = ( + { type = "Crop" ; cropType = "random" ; cropRatio = 0.8 ; jitterType = "uniRatio" } : + { type = "Scale" ; width = 32 ; height = 32 ; channels = 3 ; interpolations = "linear" } : + { type = "Mean" ; meanFile = "$dataDir$/CIFAR-10_mean.xml" } : + { type = "Transpose" } + )} + labels = { labelDim = 10 } + } + }) + } +} + +# Eval action +Eval = { + action = "eval" + evalNodeNames = errs:top5Errs # also test top-5 error rate + # Set minibatch size for testing. 
+ minibatchSize = 512 + + reader = { + verbosity = 0 ; randomize = false + deserializers = ({ + type = "ImageDeserializer" ; module = "ImageReader" + file = "$dataDir$/test_map.txt" + input = { + features = { transforms = ( + { type = "Scale" ; width = 32 ; height = 32 ; channels = 3 ; interpolations = "linear" } : + { type = "Mean"; meanFile = "$dataDir$/CIFAR-10_mean.xml" } : + { type = "Transpose" } + )} + labels = { labelDim = 10 } + } + }) + } +} diff --git a/Examples/Image/Classification/ConvNet/BrainScript/ConvNet_CIFAR10_DataAug.cntk b/Examples/Image/Classification/ConvNet/BrainScript/ConvNet_CIFAR10_DataAug.cntk index a95f52488..46d962b1b 100644 --- a/Examples/Image/Classification/ConvNet/BrainScript/ConvNet_CIFAR10_DataAug.cntk +++ b/Examples/Image/Classification/ConvNet/BrainScript/ConvNet_CIFAR10_DataAug.cntk @@ -28,8 +28,8 @@ TrainConvNet = { ConvolutionalLayer {64, (3:3), pad = true} : ReLU : ConvolutionalLayer {64, (3:3), pad = true} : ReLU : MaxPoolingLayer {(3:3), stride = (2:2)} : - DenseLayer {256} : Dropout : ReLU : - DenseLayer {128} : Dropout : ReLU : + DenseLayer {256} : ReLU : Dropout : + DenseLayer {128} : ReLU : Dropout : LinearLayer {labelDim} ) diff --git a/Examples/Image/Classification/ConvNet/BrainScript/README.md b/Examples/Image/Classification/ConvNet/BrainScript/README.md index 8339b312f..8f10904a2 100644 --- a/Examples/Image/Classification/ConvNet/BrainScript/README.md +++ b/Examples/Image/Classification/ConvNet/BrainScript/README.md @@ -31,4 +31,12 @@ Run the example from the current folder using: `cntk configFile=ConvNet_CIFAR10_DataAug.cntk` -As seen in the CNTK configuration file [ConvNet_CIFAR10_DataAug.cntk](./ConvNet_CIFAR10_DataAug.cntk), we use a fixed crop ratio of `0.8` and scale the image to `32x32` pixels for training. Since all training images are pre-padded to `40x40` pixels, effectively we only perform translation transform without scaling. The accuracy of the network on test data is around `14%`, which is a lot better than the previous model. +As seen in the CNTK configuration file [ConvNet_CIFAR10_DataAug.cntk](./ConvNet_CIFAR10_DataAug.cntk), we use a fixed crop ratio of `0.8` and scale the image to `32x32` pixels for training. Since all training images are pre-padded to `40x40` pixels, effectively we only perform translation transform without scaling. The accuracy of the network on test data is around `14.2%`, which is a lot better than the previous model. + +### ConvNetLRN_CIFAR10_DataAug.cntk + +The fourth example added local response normalization (LRN) to the previous example. LRN is implemented as a BrainScript function using 3D convolution with a constant kernel. You may run the example from the current folder using: + +`cntk configFile=ConvNetLRN_CIFAR10_DataAug.cntk` + +This model achieves slightly better accuracy of `13.8%`, which demonstrates the effectiveness of LRN. Nevertheless, as mentioned earlier, LRN is now rarely used by state-of-the-art deep networks. From 2097a8e9a109c0e6a7427837e34320e1f448eb37 Mon Sep 17 00:00:00 2001 From: Cha Zhang Date: Thu, 17 Nov 2016 15:01:30 -0800 Subject: [PATCH 03/25] Bug fix on BS code, and add comments to LRN. 
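For reference, the LRN{} block in these configs computes cross-channel local response
normalization. Below is a minimal NumPy sketch of the same computation, purely for
illustration; the function name and the channels-last layout are assumptions made for
this note, not CNTK API:

    import numpy as np

    def lrn(x, k=1.0, n=2, alpha=1e-4, beta=0.75):
        # x: (H, W, C). Each channel is divided by
        # (k + alpha/(2n+1) * sum of x^2 over a window of 2n+1 channels)^beta,
        # zero-padded at the channel edges (matching autoPadding=true above).
        x = np.asarray(x, dtype=np.float64)
        x2 = x ** 2
        out = np.empty_like(x)
        C = x.shape[-1]
        for c in range(C):
            lo, hi = max(0, c - n), min(C, c + n + 1)
            win = x2[..., lo:hi].sum(axis=-1)
            out[..., c] = x[..., c] / (k + alpha / (2 * n + 1) * win) ** beta
        return out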
--- CNTK.sln | 1 + .../AlexNet/BrainScript/AlexNet_ImageNet.cntk | 18 +++++++++++++----- .../ConvNetLRN_CIFAR10_DataAug.cntk | 6 +++--- .../ComputationNetworkLib/InputAndParamNodes.h | 4 ++-- 4 files changed, 19 insertions(+), 10 deletions(-) diff --git a/CNTK.sln b/CNTK.sln index ef780c65b..d48922fb6 100644 --- a/CNTK.sln +++ b/CNTK.sln @@ -551,6 +551,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Tools", "Tools", "{83BFF5BF Tools\generate_build_info = Tools\generate_build_info Tools\msvc_collect_coverage.py = Tools\msvc_collect_coverage.py Tools\run_boost_unit_tests.py = Tools\run_boost_unit_tests.py + Scripts\uci2ctf.py = Scripts\uci2ctf.py EndProjectSection EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Extensibility", "Extensibility", "{60F87E25-BC87-4782-8E20-1621AAEBB113}" diff --git a/Examples/Image/Classification/AlexNet/BrainScript/AlexNet_ImageNet.cntk b/Examples/Image/Classification/AlexNet/BrainScript/AlexNet_ImageNet.cntk index c4e734fa3..d7d511b71 100644 --- a/Examples/Image/Classification/AlexNet/BrainScript/AlexNet_ImageNet.cntk +++ b/Examples/Image/Classification/AlexNet/BrainScript/AlexNet_ImageNet.cntk @@ -11,7 +11,7 @@ ndlMacros="$ConfigDir$/Macros.ndl" precision = "float" deviceId = "Auto" -command = Test +command = Train:Test parallelTrain = "true" traceLevel = 1 @@ -28,12 +28,20 @@ Train = { imageShape = 227:227:3 labelDim = 1000 + # Local Response Normalization + # k : bias + # n : half radius + # alpha: scale factor + # beta: exponent LRN {k, n, alpha, beta} = { apply (x) = { x2 = x .* x + # reshape to insert a fake singleton reduction dimension after the 3rd axis x2s = SplitDimension(x2, 3, 1) + # 3D convolution with a filter that has a non 1-size only in the 3rd axis, and does not reduce since the reduction dimension is fake and 1 W = ParameterTensor{(1:1:2*n+1:1), learningRateMultiplier = 0, initValue = alpha/(2*n+1)} y = Convolution (W, x2s, (1:1:2*n+1), mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose = false, maxTempMemSizeInSamples = 0) + # reshape back to remove the fake singleton reduction dimension b = FlattenDimensions(y, 3, 2) den = Exp (beta .* Log(k + b)) r = x .* Reciprocal(den) @@ -42,10 +50,10 @@ Train = { model = Sequential ( ConvolutionalLayer {96, (11:11), stride=(4:4), pad = false} : ReLU : - LRN {1.0, 2, 0.0001, 0.75} + LRN {1.0, 2, 0.0001, 0.75} : MaxPoolingLayer {(3:3), stride=(2:2)} : ConvolutionalLayer {256, (5:5), pad = true, initBias=0.1} : ReLU : - LRN {1.0, 2, 0.0001, 0.75} + LRN {1.0, 2, 0.0001, 0.75} : MaxPoolingLayer {(3:3), stride=(2:2)} : ConvolutionalLayer {384, (3:3), pad = true} : ReLU : ConvolutionalLayer {384, (3:3), pad = true, initBias=0.1} : ReLU : @@ -58,7 +66,7 @@ Train = { # inputs features = Input {imageShape} - featNorm = features - Constant(114) + featNorm = features - 114 labels = Input {labelDim} # apply model to features @@ -79,7 +87,7 @@ Train = { SGD = { epochSize = 0 - minibatchSize = 128 + minibatchSize = 256 learningRatesPerMB = 0.01*20:0.005*12:0.001*28:0.0003 momentumPerMB = 0.9 maxEpochs = 90 diff --git a/Examples/Image/Classification/ConvNet/BrainScript/ConvNetLRN_CIFAR10_DataAug.cntk b/Examples/Image/Classification/ConvNet/BrainScript/ConvNetLRN_CIFAR10_DataAug.cntk index 9beffba0b..4c0cd2471 100644 --- a/Examples/Image/Classification/ConvNet/BrainScript/ConvNetLRN_CIFAR10_DataAug.cntk +++ b/Examples/Image/Classification/ConvNet/BrainScript/ConvNetLRN_CIFAR10_DataAug.cntk @@ -28,12 +28,12 @@ TrainConvNet = { LRN {k, n, alpha, 
beta} = { apply (x) = { x2 = x .* x - # reshape + # reshape to insert a fake singleton reduction dimension after the 3rd axis x2s = SplitDimension(x2, 3, 1) - # 3D convolution + # 3D convolution with a filter that has a non 1-size only in the 3rd axis, and does not reduce since the reduction dimension is fake and 1 W = ParameterTensor{(1:1:2*n+1:1), learningRateMultiplier = 0, initValue = alpha/(2*n+1)} y = Convolution (W, x2s, (1:1:2*n+1), mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose = false, maxTempMemSizeInSamples = 0) - # reshape back + # reshape back to remove the fake singleton reduction dimension b = FlattenDimensions(y, 3, 2) den = Exp (beta .* Log(k + b)) r = x .* Reciprocal(den) diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index ebf67ac93..eead0e2a7 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -103,9 +103,9 @@ private: bool log = GetEnvironmentPtr() && Environment().traceLevel > 0; // note: this will not log before node is part of network if (log) { - fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, init dims=[%d x %d], range=%f*%f, onCPU=%s.\n)", + fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, init dims=[%d x %d], range=%f(%f*%f), onCPU=%s.\n)", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str(), - (int)randomSeed, (int)fanOut, (int)fanIn, range, initValueScale, initOnCPUOnly ? "true" : "false"); + (int)randomSeed, (int)fanOut, (int)fanIn, range, range/initValueScale, initValueScale, initOnCPUOnly ? "true" : "false"); } } From bfc1806297496909aeb290222b302334dc99b0bf Mon Sep 17 00:00:00 2001 From: Cha Zhang Date: Fri, 18 Nov 2016 09:50:21 -0800 Subject: [PATCH 04/25] Revise Python scripts for beta 3.0 compatibility. Default Gaussian initializer to standard deviation 1.0, and uniform initializer to [-1.0,1.0]. Change AlexNet initialization accordingly. 
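To make the change concrete: with init="gaussian" the parameter standard deviation goes
from 0.2*initValueScale/sqrt(fanIn) to 1.0*initValueScale, and with init="uniform" the
range goes from initValueScale*[-0.05, 0.05] to initValueScale*[-1.0, 1.0]. A tiny
illustrative helper (names made up for this note, not CNTK API):

    import math

    def gaussian_std(init_value_scale, fan_in, new_defaults=True):
        # new default: 1.0 * scale; old default: 0.2 * scale / sqrt(fanIn)
        return init_value_scale * (1.0 if new_defaults else 0.2 / math.sqrt(fan_in))

    def uniform_half_range(init_value_scale, new_defaults=True):
        # new default: scale * [-1, 1]; old default: scale * [-0.05, 0.05]
        return init_value_scale * (1.0 if new_defaults else 0.05)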
--- .../lyx/CNTKBook_CNTK_Adv_Chapter.lyx | 4 +-- .../lyx/CNTKBook_CNTK_Chapter.lyx | 4 +-- .../AlexNet/BrainScript/AlexNet_ImageNet.cntk | 25 ++++++++++--------- .../ConvNet/Python/ConvNet_CIFAR10.py | 7 +++--- .../ConvNet/Python/ConvNet_CIFAR10_DataAug.py | 1 + .../ConvNet/Python/ConvNet_MNIST.py | 8 +++--- .../CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs | 2 +- .../InputAndParamNodes.cpp | 8 +++--- bindings/python/cntk/initializer.py | 8 ++++++ 9 files changed, 39 insertions(+), 28 deletions(-) diff --git a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Adv_Chapter.lyx b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Adv_Chapter.lyx index 59993d724..71b523bdf 100644 --- a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Adv_Chapter.lyx +++ b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Adv_Chapter.lyx @@ -1704,7 +1704,7 @@ fromFile - No initialization is required, should only be used if the network \begin_layout Itemize uniform - Initializes the parameter matrix with uniform random numbers in the range of -\begin_inset Formula $\left[-0.05\times initValueScale,0.05\times initValueScale\right]$ +\begin_inset Formula $\left[-1.0\times initValueScale,1.0\times initValueScale\right]$ \end_inset @@ -1713,7 +1713,7 @@ uniform - Initializes the parameter matrix with uniform random numbers in \begin_layout Itemize gaussian - Initializes the parameter matrix with Gaussian random numbers with zero mean and standard deviation of -\begin_inset Formula $0.2\times initValueScale/\sqrt{cols}$ +\begin_inset Formula $1.0\times initValueScale$ \end_inset diff --git a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx index 8eec5c884..2c9a4c557 100644 --- a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx +++ b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx @@ -623,8 +623,8 @@ initValueScale . If the model parameters are initialized using the Gaussian distribution, - the standard deviation will be adjusted to -\begin_inset Formula $0.2\times initValueScale/\sqrt{fanout}$ + the standard deviation will be adjusted to +\begin_inset Formula $1.0\times initValueScale$ \end_inset . 
diff --git a/Examples/Image/Classification/AlexNet/BrainScript/AlexNet_ImageNet.cntk b/Examples/Image/Classification/AlexNet/BrainScript/AlexNet_ImageNet.cntk index d7d511b71..b54f878e9 100644 --- a/Examples/Image/Classification/AlexNet/BrainScript/AlexNet_ImageNet.cntk +++ b/Examples/Image/Classification/AlexNet/BrainScript/AlexNet_ImageNet.cntk @@ -48,25 +48,26 @@ Train = { }.r }.apply + # initValueScale are determined to generate Gaussians with variance of 0.01 and 0.005 (for the two DenseLayers) model = Sequential ( - ConvolutionalLayer {96, (11:11), stride=(4:4), pad = false} : ReLU : + ConvolutionalLayer {96, (11:11), stride=(4:4), pad=false, init='gaussian', initValueScale=0.01} : ReLU : LRN {1.0, 2, 0.0001, 0.75} : MaxPoolingLayer {(3:3), stride=(2:2)} : - ConvolutionalLayer {256, (5:5), pad = true, initBias=0.1} : ReLU : + ConvolutionalLayer {192, (5:5), pad = true, init='gaussian', initValueScale=0.01, initBias=0.1} : ReLU : LRN {1.0, 2, 0.0001, 0.75} : MaxPoolingLayer {(3:3), stride=(2:2)} : - ConvolutionalLayer {384, (3:3), pad = true} : ReLU : - ConvolutionalLayer {384, (3:3), pad = true, initBias=0.1} : ReLU : - ConvolutionalLayer {256, (3:3), pad = true, initBias=0.1} : ReLU : + ConvolutionalLayer {384, (3:3), pad = true, init='gaussian', initValueScale=0.01} : ReLU : + ConvolutionalLayer {384, (3:3), pad = true, init='gaussian', initValueScale=0.01, initBias=0.1} : ReLU : + ConvolutionalLayer {256, (3:3), pad = true, init='gaussian', initValueScale=0.01, initBias=0.1} : ReLU : MaxPoolingLayer {(3:3), stride=(2:2)} : - DenseLayer {4096, activation=ReLU, initBias=0.1} : Dropout : - DenseLayer {4096, activation=ReLU, initBias=0.1} : Dropout : - LinearLayer {labelDim} + DenseLayer {4096, activation=ReLU, init='gaussian', initValueScale=0.005, initBias=0.1} : Dropout : + DenseLayer {4096, activation=ReLU, init='gaussian', initValueScale=0.005, initBias=0.1} : Dropout : + LinearLayer {labelDim, init='gaussian', initValueScale=0.01} ) # inputs features = Input {imageShape} - featNorm = features - 114 + featNorm = features - Constant(114) labels = Input {labelDim} # apply model to features @@ -88,12 +89,12 @@ Train = { SGD = { epochSize = 0 minibatchSize = 256 - learningRatesPerMB = 0.01*20:0.005*12:0.001*28:0.0003 + learningRatesPerMB = 0.01*25:0.001*25:0.0001*25:0.00001*25:0.000001 momentumPerMB = 0.9 - maxEpochs = 90 + maxEpochs = 110 gradUpdateType = None L2RegWeight = 0.0005 - dropoutRate = 0*5:0.5 + dropoutRate = 0.5 # TODO: try less bits? 
ParallelTrain = { diff --git a/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10.py b/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10.py index 84b9ca27d..5d3ee6d43 100644 --- a/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10.py +++ b/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10.py @@ -13,6 +13,7 @@ from cntk.layers import * from cntk.models import Sequential, LayerStack from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT, FULL_DATA_SWEEP from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_schedule, momentum_as_time_constant_schedule, UnitType +from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_as_time_constant_schedule, UnitType from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, relu, minus, element_times, constant from _cntk_py import set_computation_network_trace_level @@ -71,9 +72,9 @@ def convnet_cifar10(debug_output=False): # Set learning parameters lr_per_sample = [0.0015625]*10+[0.00046875]*10+[0.00015625] - lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample) - momentum_time_constant = [0]*20+[-minibatch_size/np.log(0.9)] - mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant, epoch_size=epoch_size) + lr_schedule = learning_rate_schedule(lr_per_sample, UnitType.sample, epoch_size) + mm_time_constant = [0]*20+[-minibatch_size/np.log(0.9)] + mm_schedule = momentum_as_time_constant_schedule(mm_time_constant, epoch_size) l2_reg_weight = 0.002 # Instantiate the trainer object to drive the model training diff --git a/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug.py b/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug.py index e76baf43a..b4ee2116c 100644 --- a/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug.py +++ b/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug.py @@ -14,6 +14,7 @@ from cntk.ops import input_variable, cross_entropy_with_softmax, classification_ from cntk.io import MinibatchSource, ImageDeserializer, StreamDef, StreamDefs from cntk import Trainer, persist, cntk_py from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_schedule, momentum_as_time_constant_schedule, UnitType +from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_as_time_constant_schedule, UnitType from _cntk_py import set_computation_network_trace_level # Paths relative to current python file. diff --git a/Examples/Image/Classification/ConvNet/Python/ConvNet_MNIST.py b/Examples/Image/Classification/ConvNet/Python/ConvNet_MNIST.py index a1eb28f75..7183634c2 100644 --- a/Examples/Image/Classification/ConvNet/Python/ConvNet_MNIST.py +++ b/Examples/Image/Classification/ConvNet/Python/ConvNet_MNIST.py @@ -11,7 +11,7 @@ from cntk import Trainer, persist from cntk.utils import * from cntk.layers import * from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT, FULL_DATA_SWEEP -from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_schedule, UnitType +from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_as_time_constant_schedule, UnitType from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, relu, element_times, constant # Paths relative to current python file. 
@@ -63,11 +63,11 @@ def convnet_mnist(debug_output=False): # Set learning parameters lr_per_sample = [0.001]*10+[0.0005]*10+[0.0001] lr_schedule = learning_rate_schedule(lr_per_sample, UnitType.sample, epoch_size) - momentum_time_constant = [0]*5+[1024] - mn_schedule = momentum_schedule(momentum_time_constant, epoch_size) + mm_time_constant = [0]*5+[1024] + mm_schedule = momentum_as_time_constant_schedule(mm_time_constant, epoch_size) # Instantiate the trainer object to drive the model training - learner = momentum_sgd(z.parameters, lr_schedule, mn_schedule) + learner = momentum_sgd(z.parameters, lr_schedule, mm_schedule) trainer = Trainer(z, ce, pe, learner) # define mapping from reader streams to network inputs diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs index a7b9e0d7b..084dc6256 100644 --- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs +++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs @@ -512,7 +512,7 @@ CNTK2 = [ # Parameter{} can do several forms of initialization. # - initValue=scalar, value=array --> initialize from this value --array form not implemented yet # - initFromFilePath="..." --> read from a data file -# - init="uniform|gaussian" (random init scaled by initValueScale). Warning: This has magic scaling factors. TODO: document them here +# - init="uniform|gaussian" (random init scaled by initValueScale). # - init="zero" # deprecated: # - initFromLiteral="..." (deprecated) --> parse a string literal (obsolete with value=array form) diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.cpp b/Source/ComputationNetworkLib/InputAndParamNodes.cpp index b916f3ea5..b973b0a20 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.cpp +++ b/Source/ComputationNetworkLib/InputAndParamNodes.cpp @@ -204,8 +204,8 @@ void LearnableParameter::PostInitParameters(const wstring& initString, } // understood options: -// uniform: 1/20 -// gaussian: sqrt(0.04 / fanin) +// uniform: 1.0 +// gaussian: 1.0 // xavier: sqrt(3 / fanin) // glorotNormal: sqrt(2 / (fanin+fanout)) // glorotUniform: sqrt(6 / (fanin+fanout)) @@ -214,8 +214,8 @@ void LearnableParameter::PostInitParameters(const wstring& initString, // returns (*,0) for unrecognized string static pair ParseRandomizationType(const wstring& type, size_t fanOut /* = 1*/, size_t fanIn /*= 1*/) { - if (type == UniformInitializerTypeName) return make_pair( true, 0.05f); - else if (type == GaussianInitializerTypeName) return make_pair(false, 0.2 / sqrt(fanIn)); + if (type == UniformInitializerTypeName) return make_pair(true, 1.0f); + else if (type == GaussianInitializerTypeName) return make_pair(false, 1.0f); else if (type == XavierInitializerTypeName) return make_pair(true, sqrt(3.0 / fanIn)); else if (type == GlorotUniformInitializerTypeName) return make_pair(true, sqrt(6.0 / (fanIn + fanOut))); else if (type == GlorotNormalInitializerTypeName) return make_pair(false, sqrt(2.0 / (fanIn + fanOut))); diff --git a/bindings/python/cntk/initializer.py b/bindings/python/cntk/initializer.py index fbd73d4be..cf1b6277e 100644 --- a/bindings/python/cntk/initializer.py +++ b/bindings/python/cntk/initializer.py @@ -19,6 +19,7 @@ def uniform(scale=DefaultParamInitScale, seed=None): Returns: initializer for :class:`cntk.variables.Parameter` + initialized to uniform distribution between `scale*[-1, 1]` ''' if seed is None: seed = SentinelValueForAutoSelectRandomSeed @@ -37,6 +38,7 @@ def gaussian(output_rank=SentinelValueForInferParamInitRank, filter_rank=Sentine Returns: 
initializer for :class:`cntk.variables.Parameter` + initialized to Gaussian distribution with mean `0` and standard deviation `scale`. ''' if seed is None: seed = SentinelValueForAutoSelectRandomSeed @@ -55,6 +57,7 @@ def xavier(output_rank=SentinelValueForInferParamInitRank, filter_rank=SentinelV Returns: initializer for :class:`cntk.variables.Parameter` + initialized to Gaussian distribution with mean `0` and standard deviation `scale*sqrt(3.0/fanIn)` ''' if seed is None: seed = SentinelValueForAutoSelectRandomSeed @@ -73,6 +76,7 @@ def glorot_uniform(output_rank=SentinelValueForInferParamInitRank, filter_rank=S Returns: initializer for :class:`cntk.variables.Parameter` + initialized to uniform distribution between `scale*sqrt(6.0/(fanIn+fanOut))*[-1,1]` ''' if seed is None: seed = SentinelValueForAutoSelectRandomSeed @@ -91,6 +95,7 @@ def glorot_normal(output_rank=SentinelValueForInferParamInitRank, filter_rank=Se Returns: initializer for :class:`cntk.variables.Parameter` + initialized to Gaussian distribution with mean `0` and standard deviation `scale*sqrt(2.0/(fanIn+fanOut))` ''' if seed is None: seed = SentinelValueForAutoSelectRandomSeed @@ -109,6 +114,7 @@ def he_uniform(output_rank=SentinelValueForInferParamInitRank, filter_rank=Senti Returns: initializer for :class:`cntk.variables.Parameter` + initialized to uniform distribution between `scale*sqrt(6.0/fanIn)*[-1,1]` ''' if seed is None: seed = SentinelValueForAutoSelectRandomSeed @@ -127,6 +133,7 @@ def he_normal(output_rank=SentinelValueForInferParamInitRank, filter_rank=Sentin Returns: initializer for :class:`cntk.variables.Parameter` + initialized to Gaussian distribution with mean `0` and standard deviation `scale*sqrt(2.0/fanIn)` ''' if seed is None: seed = SentinelValueForAutoSelectRandomSeed @@ -143,6 +150,7 @@ def bilinear(kernel_width, kernel_height): Returns: initializer for :class:`cntk.variables.Parameter` + useful for deconvolution layer ''' return cntk_py.bilinear_initializer(kernel_width, kernel_height) From f15687d998206c841791215f9abecd0f270db5f5 Mon Sep 17 00:00:00 2001 From: Cha Zhang Date: Fri, 18 Nov 2016 10:37:31 -0800 Subject: [PATCH 05/25] Parameter name change. 
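Note on the renamed values: the *_time_constant schedules in these scripts are
momentum-as-time-constant values. The conversion already used in ConvNet_CIFAR10.py is
-minibatch_size/np.log(momentum); a small illustrative helper (not a CNTK API):

    import numpy as np

    def momentum_as_time_constant(momentum_per_mb, minibatch_size):
        # e.g. momentum 0.9 at minibatch size 64 -> ~607 samples,
        # close to the 600 used in the CIFAR-10 data-augmentation schedules
        return -minibatch_size / np.log(momentum_per_mb)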
--- .../Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug.py | 4 ++-- .../ConvNet/Python/ConvNet_CIFAR10_DataAug_Distributed.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug.py b/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug.py index b4ee2116c..4b68752b8 100644 --- a/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug.py +++ b/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug.py @@ -84,8 +84,8 @@ def convnet_cifar10_dataaug(reader_train, reader_test, max_epochs = 80): # Set learning parameters lr_per_sample = [0.0015625]*20+[0.00046875]*20+[0.00015625]*20+[0.000046875]*10+[0.000015625] lr_schedule = learning_rate_schedule(lr_per_sample, unit=UnitType.sample, epoch_size=epoch_size) - momentum_time_constant = [0]*20+[600]*20+[1200] - mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant, epoch_size=epoch_size) + mm_time_constant = [0]*20+[600]*20+[1200] + mm_schedule = momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size) l2_reg_weight = 0.002 # trainer object diff --git a/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug_Distributed.py b/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug_Distributed.py index 23c5bcbae..36c547672 100644 --- a/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug_Distributed.py +++ b/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug_Distributed.py @@ -87,8 +87,8 @@ def convnet_cifar10_dataaug(reader_train, reader_test, distributed_trainer, max_ # Set learning parameters lr_per_sample = [0.0015625]*20+[0.00046875]*20+[0.00015625]*20+[0.000046875]*10+[0.000015625] lr_schedule = learning_rate_schedule(lr_per_sample, unit=UnitType.sample, epoch_size=epoch_size) - momentum_time_constant = [0]*20+[600]*20+[1200] - mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant, epoch_size=epoch_size) + mm_time_constant = [0]*20+[600]*20+[1200] + mm_schedule = momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size) l2_reg_weight = 0.002 # trainer object From 6804156bc20078832e6af65e5ff0dd18f7b3dc0b Mon Sep 17 00:00:00 2001 From: Cha Zhang Date: Mon, 21 Nov 2016 22:58:55 -0800 Subject: [PATCH 06/25] Max pooling and convolution boundary fix. --- Source/Math/ConvolveGeometry.h | 76 ++++++++-------------------------- Source/Math/Math.vcxproj | 7 ---- 2 files changed, 17 insertions(+), 66 deletions(-) diff --git a/Source/Math/ConvolveGeometry.h b/Source/Math/ConvolveGeometry.h index 0d828ad26..057f72f06 100644 --- a/Source/Math/ConvolveGeometry.h +++ b/Source/Math/ConvolveGeometry.h @@ -109,67 +109,25 @@ public: m_originIndex = 0; for (int i = (int)dimCount - 1; i >= 0; i--) { - assert((m_outputShape[i] % GetMapCount(i)) == 0); - int outPerMap = (int)(m_outputShape[i] / GetMapCount(i)); - // Number of cells between first and last "centers", inclusive. - int cells = (int)((outPerMap - 1) * GetStride(i) + 1); - assert(m_inputShape[i] >= cells); - - // Extra cells, to the left and right of "cells". - int extra = (int)m_inputShape[i] - cells; - assert(extra >= 0); - - // When LowerPad and/or UpperPad are specified, the Start[i] value is determined by those values. - int lo = GetAutoPad(i) ? 0 : (int)m_lowerPad[m_lowerPad.size() == 1 ? 0 : i]; - int hi = GetAutoPad(i) ? 0 : (int)m_upperPad[m_upperPad.size() == 1 ? 
0 : i]; - if (lo != 0 || hi != 0) - { - assert(extra + lo + hi + 1 == m_kernelShape[i]); - // Compute the number of cells on the left and right parts of the kernel, - // not counting the "kernel-center" cell. If m_kernelShape[i] is even, the extra cell is - // placed on the right (the center is shifted to the left). - int right = (int)m_kernelShape[i] - 1; - int left = right / 2; - right -= left; - assert(left <= right); - assert(right <= left + 1); - - assert(lo <= left); - assert(hi <= right); - m_start[i] = left - lo; - assert(m_start[i] + cells + right == m_inputShape[i] + hi); - } + bool padded = GetAutoPad(i); + if (padded) + m_start[i] = 0; else { - m_start[i] = extra / 2; -#ifdef _DEBUG - // If we're padding then extra should be covered. - bool padded = GetAutoPad(i); - assert(!padded || extra + 1 <= m_kernelShape[i]); - // If we're not padding then, we should stay within the input dimension. - assert(padded || extra + 1 >= m_kernelShape[i]); - - // Compute the number of cells on the left and right parts of the kernel, - // not counting the "kernel-center" cell. If m_kernelShape[i] is even, the extra cell is - // placed on the right (the center is shifted to the left). - int right = (int)m_kernelShape[i] - 1; - int left = right / 2; - right -= left; - assert(0 <= left); - assert(left <= right); - assert(right <= left + 1); - - int min = m_start[i] - left; - int max = m_start[i] + (int)cells + right; - assert(!padded || min <= 0 && max >= m_inputShape[i]); - assert(padded || min >= 0 && max <= m_inputShape[i]); - - int diff = min - ((int)m_inputShape[i] - max); - assert(std::abs(diff) <= 1); - - UNUSED(padded); - UNUSED(diff); -#endif + m_start[i] = ((int)m_kernelShape[i] - 1) / 2; + int lo = (int)m_lowerPad[m_lowerPad.size() == 1 ? 0 : i]; + int hi = (int)m_upperPad[m_upperPad.size() == 1 ? 0 : i]; + if (lo != 0 || hi != 0) + { + m_start[i] -= lo; + int outPerMap = (int)(m_outputShape[i] / GetMapCount(i)); + int cells = (int)((outPerMap - 1) * GetStride(i) + 1); + if (cells > 0) // dummy if, just to get rid of warning + { + assert(m_inputShape[i] >= cells); + assert(m_start[i] + cells + (int)m_kernelShape[i] - 1 == m_inputShape[i] + hi); + } + } } m_startIndex = m_startIndex * (int)m_inputShape[i] + m_start[i]; diff --git a/Source/Math/Math.vcxproj b/Source/Math/Math.vcxproj index 0d223472f..31a920b81 100644 --- a/Source/Math/Math.vcxproj +++ b/Source/Math/Math.vcxproj @@ -40,7 +40,6 @@ DynamicLibrary false v120 - true Unicode @@ -51,7 +50,6 @@ - false Math @@ -102,9 +100,6 @@ Level4 Use - MaxSpeed - true - true $(MathDefine); NO_SYNC; WIN32; NDEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions) true true @@ -119,8 +114,6 @@ Console true - true - true $(MathLinkLibrary);Common.lib;%(AdditionalDependencies) $(MathDelayLoad); $(CudaDlls); %(DelayLoadDLLs) true From 8625d9e0197a8ed7a941ba8df657f903e2af4687 Mon Sep 17 00:00:00 2001 From: Cha Zhang Date: Tue, 22 Nov 2016 10:50:05 -0800 Subject: [PATCH 07/25] Revert some changes back (related to Gaussian and uniform initializer). Will create separate branch for that. 
--- CNTK.sln | 1 - .../CNTK-TechReport/lyx/CNTKBook_CNTK_Adv_Chapter.lyx | 4 ++-- .../CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx | 2 +- Source/ComputationNetworkLib/InputAndParamNodes.cpp | 8 ++++---- bindings/python/cntk/initializer.py | 4 ++-- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/CNTK.sln b/CNTK.sln index d48922fb6..ef780c65b 100644 --- a/CNTK.sln +++ b/CNTK.sln @@ -551,7 +551,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Tools", "Tools", "{83BFF5BF Tools\generate_build_info = Tools\generate_build_info Tools\msvc_collect_coverage.py = Tools\msvc_collect_coverage.py Tools\run_boost_unit_tests.py = Tools\run_boost_unit_tests.py - Scripts\uci2ctf.py = Scripts\uci2ctf.py EndProjectSection EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Extensibility", "Extensibility", "{60F87E25-BC87-4782-8E20-1621AAEBB113}" diff --git a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Adv_Chapter.lyx b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Adv_Chapter.lyx index 71b523bdf..59993d724 100644 --- a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Adv_Chapter.lyx +++ b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Adv_Chapter.lyx @@ -1704,7 +1704,7 @@ fromFile - No initialization is required, should only be used if the network \begin_layout Itemize uniform - Initializes the parameter matrix with uniform random numbers in the range of -\begin_inset Formula $\left[-1.0\times initValueScale,1.0\times initValueScale\right]$ +\begin_inset Formula $\left[-0.05\times initValueScale,0.05\times initValueScale\right]$ \end_inset @@ -1713,7 +1713,7 @@ uniform - Initializes the parameter matrix with uniform random numbers in \begin_layout Itemize gaussian - Initializes the parameter matrix with Gaussian random numbers with zero mean and standard deviation of -\begin_inset Formula $1.0\times initValueScale$ +\begin_inset Formula $0.2\times initValueScale/\sqrt{cols}$ \end_inset diff --git a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx index 2c9a4c557..f619b19d1 100644 --- a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx +++ b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx @@ -624,7 +624,7 @@ initValueScale . If the model parameters are initialized using the Gaussian distribution, the standard deviation will be adjusted to -\begin_inset Formula $1.0\times initValueScale$ +\begin_inset Formula $0.2\times initValueScale/\sqrt{fanout}$ \end_inset . 
diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.cpp b/Source/ComputationNetworkLib/InputAndParamNodes.cpp index b973b0a20..b916f3ea5 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.cpp +++ b/Source/ComputationNetworkLib/InputAndParamNodes.cpp @@ -204,8 +204,8 @@ void LearnableParameter::PostInitParameters(const wstring& initString, } // understood options: -// uniform: 1.0 -// gaussian: 1.0 +// uniform: 1/20 +// gaussian: sqrt(0.04 / fanin) // xavier: sqrt(3 / fanin) // glorotNormal: sqrt(2 / (fanin+fanout)) // glorotUniform: sqrt(6 / (fanin+fanout)) @@ -214,8 +214,8 @@ void LearnableParameter::PostInitParameters(const wstring& initString, // returns (*,0) for unrecognized string static pair ParseRandomizationType(const wstring& type, size_t fanOut /* = 1*/, size_t fanIn /*= 1*/) { - if (type == UniformInitializerTypeName) return make_pair(true, 1.0f); - else if (type == GaussianInitializerTypeName) return make_pair(false, 1.0f); + if (type == UniformInitializerTypeName) return make_pair( true, 0.05f); + else if (type == GaussianInitializerTypeName) return make_pair(false, 0.2 / sqrt(fanIn)); else if (type == XavierInitializerTypeName) return make_pair(true, sqrt(3.0 / fanIn)); else if (type == GlorotUniformInitializerTypeName) return make_pair(true, sqrt(6.0 / (fanIn + fanOut))); else if (type == GlorotNormalInitializerTypeName) return make_pair(false, sqrt(2.0 / (fanIn + fanOut))); diff --git a/bindings/python/cntk/initializer.py b/bindings/python/cntk/initializer.py index cf1b6277e..d23926958 100644 --- a/bindings/python/cntk/initializer.py +++ b/bindings/python/cntk/initializer.py @@ -19,7 +19,7 @@ def uniform(scale=DefaultParamInitScale, seed=None): Returns: initializer for :class:`cntk.variables.Parameter` - initialized to uniform distribution between `scale*[-1, 1]` + initialized to uniform distribution between `scale*[-0.05, 0.05]` ''' if seed is None: seed = SentinelValueForAutoSelectRandomSeed @@ -38,7 +38,7 @@ def gaussian(output_rank=SentinelValueForInferParamInitRank, filter_rank=Sentine Returns: initializer for :class:`cntk.variables.Parameter` - initialized to Gaussian distribution with mean `0` and standard deviation `scale`. + initialized to Gaussian distribution with mean `0` and standard deviation `scale*0.2/sqrt(fanIn))`. ''' if seed is None: seed = SentinelValueForAutoSelectRandomSeed From c585d08887a8bfc0817a0276cbc070bee0f14cc9 Mon Sep 17 00:00:00 2001 From: Cha Zhang Date: Tue, 22 Nov 2016 10:53:10 -0800 Subject: [PATCH 08/25] AlexNet model update. 
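The initValueScale values below appear to be derived from the default Gaussian scale
0.2*initValueScale/sqrt(fanIn) (see ParseRandomizationType above): to hit a target
standard deviation, initValueScale = target_std*sqrt(fanIn)/0.2. A quick illustrative
check (helper name and fanIn figures are assumptions for this note):

    import math

    def init_value_scale(target_std, fan_in):
        return target_std * math.sqrt(fan_in) / 0.2

    init_value_scale(0.01, 11 * 11 * 3)   # ~0.953 -> 0.95265, first conv layer
    init_value_scale(0.01, 3 * 3 * 192)   # ~2.079 -> 2.07857, 3x3 conv layer
    init_value_scale(0.005, 4096)         # 1.600  -> second 4096-wide DenseLayer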
--- .../AlexNet/BrainScript/AlexNet_ImageNet.cntk | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Examples/Image/Classification/AlexNet/BrainScript/AlexNet_ImageNet.cntk b/Examples/Image/Classification/AlexNet/BrainScript/AlexNet_ImageNet.cntk index b54f878e9..8d15b64f2 100644 --- a/Examples/Image/Classification/AlexNet/BrainScript/AlexNet_ImageNet.cntk +++ b/Examples/Image/Classification/AlexNet/BrainScript/AlexNet_ImageNet.cntk @@ -50,19 +50,19 @@ Train = { # initValueScale are determined to generate Gaussians with variance of 0.01 and 0.005 (for the two DenseLayers) model = Sequential ( - ConvolutionalLayer {96, (11:11), stride=(4:4), pad=false, init='gaussian', initValueScale=0.01} : ReLU : + ConvolutionalLayer {96, (11:11), stride=(4:4), pad=false, init='gaussian', initValueScale=0.95265} : ReLU : LRN {1.0, 2, 0.0001, 0.75} : MaxPoolingLayer {(3:3), stride=(2:2)} : - ConvolutionalLayer {192, (5:5), pad = true, init='gaussian', initValueScale=0.01, initBias=0.1} : ReLU : + ConvolutionalLayer {192, (5:5), pad = true, init='gaussian', initValueScale=2.44978, initBias=0.1} : ReLU : LRN {1.0, 2, 0.0001, 0.75} : MaxPoolingLayer {(3:3), stride=(2:2)} : - ConvolutionalLayer {384, (3:3), pad = true, init='gaussian', initValueScale=0.01} : ReLU : - ConvolutionalLayer {384, (3:3), pad = true, init='gaussian', initValueScale=0.01, initBias=0.1} : ReLU : - ConvolutionalLayer {256, (3:3), pad = true, init='gaussian', initValueScale=0.01, initBias=0.1} : ReLU : + ConvolutionalLayer {384, (3:3), pad = true, init='gaussian', initValueScale=2.07857} : ReLU : + ConvolutionalLayer {384, (3:3), pad = true, init='gaussian', initValueScale=2.93945, initBias=0.1} : ReLU : + ConvolutionalLayer {256, (3:3), pad = true, init='gaussian', initValueScale=2.93945, initBias=0.1} : ReLU : MaxPoolingLayer {(3:3), stride=(2:2)} : - DenseLayer {4096, activation=ReLU, init='gaussian', initValueScale=0.005, initBias=0.1} : Dropout : - DenseLayer {4096, activation=ReLU, init='gaussian', initValueScale=0.005, initBias=0.1} : Dropout : - LinearLayer {labelDim, init='gaussian', initValueScale=0.01} + DenseLayer {4096, activation=ReLU, init='gaussian', initValueScale=2.40038, initBias=0.1} : Dropout : + DenseLayer {4096, activation=ReLU, init='gaussian', initValueScale=1.6, initBias=0.1} : Dropout : + LinearLayer {labelDim, init='gaussian', initValueScale=3.2} ) # inputs From 3b2df9d37013d1840610b836423e157c0be1649e Mon Sep 17 00:00:00 2001 From: Cha Zhang Date: Tue, 22 Nov 2016 11:24:37 -0800 Subject: [PATCH 09/25] Add assert based on code review. --- Source/Math/ConvolveGeometry.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Source/Math/ConvolveGeometry.h b/Source/Math/ConvolveGeometry.h index 057f72f06..240dd8853 100644 --- a/Source/Math/ConvolveGeometry.h +++ b/Source/Math/ConvolveGeometry.h @@ -120,6 +120,7 @@ public: if (lo != 0 || hi != 0) { m_start[i] -= lo; + assert(m_start[i] >= 0); int outPerMap = (int)(m_outputShape[i] / GetMapCount(i)); int cells = (int)((outPerMap - 1) * GetStride(i) + 1); if (cells > 0) // dummy if, just to get rid of warning From b5b54adbc928f1e12ac8df3585bdee473d12e8c8 Mon Sep 17 00:00:00 2001 From: Chris Basoglu Date: Tue, 22 Nov 2016 12:07:36 -0800 Subject: [PATCH 10/25] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b3bb041e3..6feca91f4 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Highlights of this Release: * New ASGD/Hogwild! 
training using Microsoft’s Parameter Server ([Project Multiverso](https://github.com/Microsoft/multiverso)) * Distributed Scenarios now supported in CNTK Python API -* Introducing of [Memory compression](https://github.com/Microsoft/CNTK/wiki/Top-level-configurations#hypercompressmemory) optimizing memory usage, especially for GPU computation +* New [Memory Compression](https://github.com/Microsoft/CNTK/wiki/Top-level-configurations#hypercompressmemory) mode to reduce memory usage on GPU * CNTK Docker image with 1bit-SGD support * Stability Improvements and bug fixes From 3c2e1120455f0ae3f963e0363bc26cf5a1bcde14 Mon Sep 17 00:00:00 2001 From: Cha Zhang Date: Tue, 22 Nov 2016 14:06:49 -0800 Subject: [PATCH 11/25] add /bigobj to enable compile. --- Source/Math/Math.vcxproj | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Source/Math/Math.vcxproj b/Source/Math/Math.vcxproj index 31a920b81..f1f7ba951 100644 --- a/Source/Math/Math.vcxproj +++ b/Source/Math/Math.vcxproj @@ -108,7 +108,7 @@ true false true - /d2Zi+ %(AdditionalOptions) + /d2Zi+ /bigobj %(AdditionalOptions) MultiThreadedDLL @@ -140,6 +140,7 @@ %(AdditionalIncludeDirectories);$(CudaInclude) + /d2Zi+ /bigobj %(AdditionalOptions) %(AdditionalLibraryDirectories);$(CudaLibPath) @@ -148,6 +149,7 @@ CPUONLY;%(PreprocessorDefinitions) + /d2Zi+ /bigobj %(AdditionalOptions) $(MathDelayLoad) From 1e73f4fccdbb8c72ba9e4033fbc241393d0d9124 Mon Sep 17 00:00:00 2001 From: Willi Richert Date: Tue, 22 Nov 2016 23:41:38 +0100 Subject: [PATCH 12/25] Don't ignore seq_starts --- bindings/python/cntk/utils/__init__.py | 106 ++++++++++-------- .../python/cntk/utils/tests/utils_test.py | 18 ++- 2 files changed, 77 insertions(+), 47 deletions(-) diff --git a/bindings/python/cntk/utils/__init__.py b/bindings/python/cntk/utils/__init__.py index aa3012846..1519b07c1 100644 --- a/bindings/python/cntk/utils/__init__.py +++ b/bindings/python/cntk/utils/__init__.py @@ -329,8 +329,7 @@ def get_data_type(*args): inputs. Placeholders are ignored in the type determination. Args: - args (number, list, NumPy array, :class:`cntk.ops.variables.Variable`, - or :class:`cntk.ops.functions.Function`): input + args (number, list, NumPy array, :class:`cntk.ops.variables.Variable`, or :class:`cntk.ops.functions.Function`): input Returns: np.float32, np.float64, or None """ @@ -410,9 +409,10 @@ def _pad_dense_to_max_len(var, batch, max_seq_len): Z = np.zeros((len(batch), max_seq_len) + (data_point.shape), dtype=data_point.dtype) for idx, seq in enumerate(batch): - if seq[0].shape != data_point.shape: + elem_shape = seq[0].shape if hasattr(seq, 'shape') else () + if elem_shape != data_point.shape: raise ValueError('shape mismatch: expected %s but got %s' - % (str(data_point.shape), str(seq[0].shape))) + % (str(data_point.shape), str(elem_shape))) Z[idx, :len(seq)] += seq return Z @@ -443,6 +443,11 @@ def _pad_sparse_seq_to_max_len(batch, max_seq_len): return Z def _is_dense(batch): + if isinstance(batch, np.ndarray): + return True + elif sparse.issparse(batch): + return False + is_dense = True b = batch while isinstance(b, list): @@ -452,6 +457,7 @@ def _is_dense(batch): return True +@typemap def sanitize_batch(var, batch, seq_starts=None, dtype=None, device=None): ''' Convert to :class:`Value` with ``dtype``. 
If the samples in @@ -476,37 +482,31 @@ def sanitize_batch(var, batch, seq_starts=None, dtype=None, device=None): if isinstance(batch, cntk_py.Value): return batch + if isinstance(batch, list): + if len(batch) == 0: + raise ValueError('batch is empty') + # We need to figure out whether the data has a sequence axis. Note that # it is not enough to check whether the variable's dynamic axes include the # sequence axis, because the sequence axis might be omitted in the data if # it is not needed (CNTK core would then take care of this). batch_has_seq = has_seq_dim(var, batch) - if isinstance(batch, list): - is_dense = _is_dense(batch) + is_dense = _is_dense(batch) - if is_dense: + if batch_has_seq or seq_starts: + if isinstance(batch[0], list): seq_lens = [len(seq) for seq in batch] - - # If the input is a list of lists of dense values, all of the same - # length, then we convert it into a NumPy array without requiring a - # mask. - if len(set(seq_lens)) == 1: - batch = np.asarray(batch) else: - if isinstance(batch[0], list): - seq_lens = [len(seq) for seq in batch] - else: - seq_lens = [seq.shape[0] for seq in batch] + seq_lens = [seq.shape[0] for seq in batch] - if batch_has_seq: - max_seq_len = max(seq_lens) - else: - is_dense = isinstance(batch, np.ndarray) - # It is a sparse or dense NumPy array having all sequences being the - # same length, so we just calculate the sequence lengths - if batch_has_seq: - max_seq_len = batch.shape[1] + max_seq_len = max(seq_lens) + + # If the input is a list of lists of dense values, all of the same + # length, we convert it into a NumPy array. + if is_dense and len(set(seq_lens)) == 1: + batch_has_seq = False + batch = np.asarray(batch, dtype=var.dtype) if dtype is None: dtype = get_data_type(var) @@ -514,25 +514,8 @@ def sanitize_batch(var, batch, seq_starts=None, dtype=None, device=None): if device is None: device = use_default_device() - if isinstance(batch, np.ndarray): - if np.issubdtype(batch.dtype, int): - batch = batch.astype(var.dtype) - elif batch.dtype not in (np.float32, np.float64): - raise ValueError('only float32 and float64 are supported') - - ndav = create_NDArrayView_from_NumPy(batch, device) - return Value(data=ndav) - - if isinstance(batch, list): - if len(batch) == 0: - raise ValueError('batch is empty') - - if not batch_has_seq and seq_starts is not None: - raise ValueError('specification of individual sequence begins does not' - ' make sense when not using the sequence axis') - # batch is now either a dense input that requires a mask, or it is sparse - if batch_has_seq: + if batch_has_seq or seq_starts: mask = cntk_py.NDMask((len(batch), max_seq_len), device or use_default_device()) for idx, seq_len in enumerate(seq_lens): @@ -550,7 +533,19 @@ def sanitize_batch(var, batch, seq_starts=None, dtype=None, device=None): mask = None if is_dense: - batch = _pad_dense_to_max_len(var, batch, max_seq_len) + if batch_has_seq: + batch = _pad_dense_to_max_len(var, batch, max_seq_len) + if not isinstance(batch, np.ndarray): + batch = np.asarray(batch) + ndav = create_NDArrayView_from_NumPy(batch.astype(dtype), device) + return Value(data=ndav, mask=mask) + + if isinstance(batch, np.ndarray): + if np.issubdtype(batch.dtype, int): + batch = batch.astype(var.dtype) + elif batch.dtype not in (np.float32, np.float64): + raise ValueError('only float32 and float64 are supported') + ndav = create_NDArrayView_from_NumPy(batch.astype(dtype), device) return Value(data=ndav, mask=mask) @@ -841,6 +836,27 @@ class Value(cntk_py.Value): ''' return 
super(Value, self).shape().dimensions() + @property + def mask(self): + ''' + The mask matrix of this value. Each row denotes a sequence with its + elements describing the mask of the element: + * 2: beginning of sequence (e.g. an LSTM would be reset) + * 1: valid element + # 0: invalid element + + Example: + A mask of + ```[[2, 1, 1], [1, 1, 0]] + ``` + describes a batch of two sequences. The first has three elements, of + which the first element signals the beginning of a sequence. The second + sequence has two elements, which are both continuations of the first + sequence. + ''' + return np.asarray(super(Value, self).mask()) + + def __len__(self): ''' Number of samples in this value object. @@ -953,7 +969,7 @@ def value_to_seq(value): entries removed. Args: - value (`Value`): Value as it is returned by Swig + value (:class:`Value`): Value as it is returned by Swig Returns: a list of NumPy arrays diff --git a/bindings/python/cntk/utils/tests/utils_test.py b/bindings/python/cntk/utils/tests/utils_test.py index 9f66b6f94..2108b0a65 100644 --- a/bindings/python/cntk/utils/tests/utils_test.py +++ b/bindings/python/cntk/utils/tests/utils_test.py @@ -123,8 +123,6 @@ def test_get_data_type(): # exception handling ((2,2), AA([[1,1],[2,2]]), ValueError), (1, [[[1,2]]], ValueError), - #(1, [AA([[40], [50]])], ValueError), - ((1,), [[[40], [50]]], ValueError), ]) def test_has_seq_dim_dense(shape, batch, expected): i1 = input_variable(shape) @@ -160,4 +158,20 @@ def test_sanitize_batch_sparse(): # 2 sequences, with max seq len of 2 and dimension 3 assert b.shape == (2,2,3) +@pytest.mark.parametrize("batch, seq_starts, expected_mask", [ + ([[5, 6, 7], + [8]], + [True, False], + [[2, 1, 1], [1, 0, 0]]), + + ([[5], + [8]], + [True, False], + [[2], [1]]), +]) +def test_mask(batch, seq_starts, expected_mask): + shape = (1,) + var = input_variable(shape) + s = sanitize_batch(var, batch, seq_starts) + assert np.allclose(s.mask, expected_mask) From 6bcc3a342628654a744268783c5adf0c716e5111 Mon Sep 17 00:00:00 2001 From: Willi Richert Date: Wed, 23 Nov 2016 00:27:51 +0100 Subject: [PATCH 13/25] Do not expose internal util functions --- bindings/python/cntk/learner.py | 4 +- bindings/python/cntk/ops/tests/linear_test.py | 6 +-- bindings/python/cntk/utils/__init__.py | 47 ++++++++----------- .../python/cntk/utils/tests/utils_test.py | 11 +++-- 4 files changed, 31 insertions(+), 37 deletions(-) diff --git a/bindings/python/cntk/learner.py b/bindings/python/cntk/learner.py index 948510175..c3e11e2ad 100644 --- a/bindings/python/cntk/learner.py +++ b/bindings/python/cntk/learner.py @@ -98,8 +98,8 @@ class Learner(cntk_py.Learner): Returns: `False` to indicate that learning has stopped for all of the parameters associated with this learner ''' - from .utils import create_NDArrayView_from_NumPy - var_nd_map = { var:create_NDArrayView_from_NumPy(val) for var, val in + from .utils import _create_NDArrayView_from_NumPy + var_nd_map = { var: _create_NDArrayView_from_NumPy(val) for var, val in gradient_values.items() } return super(Learner, self).update(var_nd_map, training_sample_count) diff --git a/bindings/python/cntk/ops/tests/linear_test.py b/bindings/python/cntk/ops/tests/linear_test.py index 078c279cb..059693cf5 100644 --- a/bindings/python/cntk/ops/tests/linear_test.py +++ b/bindings/python/cntk/ops/tests/linear_test.py @@ -13,7 +13,7 @@ from __future__ import division import numpy as np import pytest from .ops_test_utils import unittest_helper, _test_unary_op, _test_binary_op, AA, I, precision, 
PRECISION_TO_TYPE, batch_dense_to_sparse, left_matrix_type, right_matrix_type -from ...utils import sanitize_dtype_cntk, ones_like, eval +from ...utils import sanitize_dtype_cntk, _ones_like, eval TENSOR_PAIRS = [ ([30.], [10.]), @@ -74,8 +74,8 @@ def test_op_plus_var_sequences_input_input(left_batch, right_batch, device_id, p for i in range(len(left_batch))] expected_backward = { - 'left': ones_like(left_batch, PRECISION_TO_TYPE[precision]), - 'right': ones_like(right_batch, PRECISION_TO_TYPE[precision]) + 'left': _ones_like(left_batch, PRECISION_TO_TYPE[precision]), + 'right': _ones_like(right_batch, PRECISION_TO_TYPE[precision]) } left_value = [AA(sample, dtype=PRECISION_TO_TYPE[precision]) diff --git a/bindings/python/cntk/utils/__init__.py b/bindings/python/cntk/utils/__init__.py index 1519b07c1..ce0681a5c 100644 --- a/bindings/python/cntk/utils/__init__.py +++ b/bindings/python/cntk/utils/__init__.py @@ -53,18 +53,11 @@ def cntk_device(device_id): return gpu(device_id) -def is_string(value): - if sys.version_info.major < 3: - return isinstance(value, basestring) - - return isinstance(value, str) - - -def dense_to_str(data): +def _dense_to_str(data): return ' '.join(data.ravel(order='C').astype(np.str)) -def sparse_to_str(data): +def _sparse_to_str(data): return ' '.join('%s:%s' % (k, v) for k, v in sorted(data.items())) @@ -96,12 +89,12 @@ def tensors_to_text_format(sample_idx, alias_tensor_map): # for this alias there no more sequence elements continue - if is_tensor(tensor): + if _is_tensor(tensor): if not isinstance(tensor, np.ndarray): tensor = np.asarray(tensor) - to_str = dense_to_str + to_str = _dense_to_str elif isinstance(tensor, list) and isinstance(tensor[0], dict): - to_str = sparse_to_str + to_str = _sparse_to_str else: raise ValueError( 'expected a tensor (dense) or list of dicts (sparse), but got "%s"' % type(tensor)) @@ -113,7 +106,7 @@ def tensors_to_text_format(sample_idx, alias_tensor_map): return '\n'.join(lines) -def is_tensor(data): +def _is_tensor(data): ''' Checks whether the data is a tensor, i.e. whether it is a NumPy array or a list of NumPy arrays. @@ -179,7 +172,7 @@ def one_hot(batch, num_classes, dtype=None, device=None): value = cntk_py.Value.create_one_hot_double(num_classes, batch, device, False) return value -def has_seq_dim(var, data): +def _has_seq_dim(var, data): ''' Checks whether the data has a sequence dimensions or not. @@ -274,7 +267,7 @@ def sanitize_shape(shape): def sanitize_input(arg, fallback_dtype=np.float32, reshape=None): """ - Convert to :class:`cntk.ops.variables.Variable` so that it can be passed as Variable to the + Convert to :class:`~cntk.ops.variables.Variable` so that it can be passed as Variable to the CNTK operators. * If ``arg`` is a NumPy array and its type is neither `np.float32` nor `np.float64`, it sets it to `np.float32`. @@ -490,7 +483,7 @@ def sanitize_batch(var, batch, seq_starts=None, dtype=None, device=None): # it is not enough to check whether the variable's dynamic axes include the # sequence axis, because the sequence axis might be omitted in the data if # it is not needed (CNTK core would then take care of this). 
- batch_has_seq = has_seq_dim(var, batch) + batch_has_seq = _has_seq_dim(var, batch) is_dense = _is_dense(batch) @@ -537,7 +530,7 @@ def sanitize_batch(var, batch, seq_starts=None, dtype=None, device=None): batch = _pad_dense_to_max_len(var, batch, max_seq_len) if not isinstance(batch, np.ndarray): batch = np.asarray(batch) - ndav = create_NDArrayView_from_NumPy(batch.astype(dtype), device) + ndav = _create_NDArrayView_from_NumPy(batch.astype(dtype), device) return Value(data=ndav, mask=mask) if isinstance(batch, np.ndarray): @@ -546,7 +539,7 @@ def sanitize_batch(var, batch, seq_starts=None, dtype=None, device=None): elif batch.dtype not in (np.float32, np.float64): raise ValueError('only float32 and float64 are supported') - ndav = create_NDArrayView_from_NumPy(batch.astype(dtype), device) + ndav = _create_NDArrayView_from_NumPy(batch.astype(dtype), device) return Value(data=ndav, mask=mask) # There are three possibilities of providing sparse batches: @@ -633,7 +626,7 @@ def sanitize_value(shape, value, dtype, device): if shape is None: raise ValueError('you need to specify at least shape or value') cntk_dtype = sanitize_dtype_cntk(dtype) - ndav = create_NDArrayView(shape, cntk_dtype, device) + ndav = _create_NDArrayView(shape, cntk_dtype, device) else: np_dtype = sanitize_dtype_numpy(dtype) if not isinstance(value, np.ndarray) or value.dtype != np_dtype: @@ -642,7 +635,7 @@ def sanitize_value(shape, value, dtype, device): else: value = np.asarray(value, dtype=np_dtype) - ndav = create_NDArrayView_from_NumPy(value, device) + ndav = _create_NDArrayView_from_NumPy(value, device) return ndav @@ -767,7 +760,7 @@ def sanitize_var_map(op_arguments, arguments, precision=None, return var_map -def ones_like(batch, precision): +def _ones_like(batch, precision): ''' Returns a new batch, which has the same format as ``batch`` but all values set to 1. 
@@ -778,7 +771,7 @@ def ones_like(batch, precision): return [np.ones_like(sample, dtype=sanitize_precision(precision)) for sample in batch] -def create_NDArrayView(shape, data_type=cntk_py.DataType_Float, device=None): +def _create_NDArrayView(shape, data_type=cntk_py.DataType_Float, device=None): shape = sanitize_shape(shape) if device is None: device = use_default_device() @@ -788,7 +781,7 @@ def create_NDArrayView(shape, data_type=cntk_py.DataType_Float, device=None): return view -def create_NDArrayView_from_NumPy(nd, device=None): +def _create_NDArrayView_from_NumPy(nd, device=None): if device is None: device = use_default_device() @@ -814,11 +807,11 @@ class Value(cntk_py.Value): device = use_default_device() if shape and dtype: - ndav = create_NDArrayView(shape, dtype, device) + ndav = _create_NDArrayView(shape, dtype, device) elif data: if isinstance(data, np.ndarray): - ndav = create_NDArrayView_from_NumPy(data, device) + ndav = _create_NDArrayView_from_NumPy(data, device) else: ndav = data @@ -955,7 +948,7 @@ def ensure_dev(ndav, dev): if ndav.device() != dev: - ndav_on_target = create_NDArrayView( + ndav_on_target = _create_NDArrayView( ndav.shape().dimensions(), data_type=ndav.get_data_type(), dev=dev) ndav_on_target.copy_from(ndav) ndav = ndav_on_target @@ -1029,7 +1022,7 @@ def eval(op, arguments=None, precision=None, device=None, backward_pass=False, e if backward_pass: if expected_backward is None: expected_backward = arguments - root_gradients = {v: ones_like(o, precision) for v, o in + root_gradients = {v: _ones_like(o, precision) for v, o in forward_output.items()} backward_output = op.backward(state, root_gradients, expected_backward) diff --git a/bindings/python/cntk/utils/tests/utils_test.py b/bindings/python/cntk/utils/tests/utils_test.py index 2108b0a65..340899f27 100644 --- a/bindings/python/cntk/utils/tests/utils_test.py +++ b/bindings/python/cntk/utils/tests/utils_test.py @@ -13,6 +13,7 @@ from cntk.device import default from cntk.tests.test_utils import precision, PRECISION_TO_TYPE from cntk.ops import * from cntk.utils import * +from cntk.utils import _has_seq_dim, _is_tensor # Keeping things short AA = np.asarray @@ -58,7 +59,7 @@ def test_tensor_conversion_dense(idx, alias_tensor_map, expected): ([AA([1, 2]), AA([])], False), ]) def test_is_tensor(data, expected): - assert is_tensor(data) == expected + assert _is_tensor(data) == expected def test_sanitize_dtype_numpy(): @@ -127,10 +128,10 @@ def test_get_data_type(): def test_has_seq_dim_dense(shape, batch, expected): i1 = input_variable(shape) if expected in [False, True]: - assert has_seq_dim(i1, batch) == expected + assert _has_seq_dim(i1, batch) == expected else: with pytest.raises(expected): - has_seq_dim(i1, batch) + _has_seq_dim(i1, batch) @pytest.mark.parametrize("shape, batch, expected", [ ((1,2), [csr([1,0]), csr([2,3]), csr([5,6])], False), @@ -139,10 +140,10 @@ def test_has_seq_dim_dense(shape, batch, expected): def test_has_seq_dim_sparse(shape, batch, expected): i1 = input_variable(shape, is_sparse=True) if expected in [False, True]: - assert has_seq_dim(i1, batch) == expected + assert _has_seq_dim(i1, batch) == expected else: with pytest.raises(expected): - has_seq_dim(i1, batch) + _has_seq_dim(i1, batch) def test_sanitize_batch_sparse(): batch = [[csr([1,0,2]), csr([2,3,0])], From a855bdfb9e7395a4a046aa164932a74b64fa0056 Mon Sep 17 00:00:00 2001 From: Cha Zhang Date: Tue, 22 Nov 2016 16:44:53 -0800 Subject: [PATCH 14/25] Update test baseline. 
--- .../EvalClientTests/CSEvalClientTest/baseline.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Tests/EndToEndTests/EvalClientTests/CSEvalClientTest/baseline.txt b/Tests/EndToEndTests/EvalClientTests/CSEvalClientTest/baseline.txt index 4448a313a..bbd2b3fa0 100644 --- a/Tests/EndToEndTests/EvalClientTests/CSEvalClientTest/baseline.txt +++ b/Tests/EndToEndTests/EvalClientTests/CSEvalClientTest/baseline.txt @@ -763,7 +763,7 @@ INFO: rn4_1.c_proj.y.y: loading pre-CuDNNv5 model: approximated mini-batch count INFO: rn4_2.c1.c.y.y: loading pre-CuDNNv5 model: approximated mini-batch count of 625625 as 10010000 trained samples. Statistics in further training may be biased; consider re-training instead. INFO: rn4_2.c2.y.y: loading pre-CuDNNv5 model: approximated mini-batch count of 625625 as 10010000 trained samples. - Statistics in further EvaluateImageInputUsingFeatureVector: Outcome = 340 + Statistics in further EvaluateImageInputUsingFeatureVector: Outcome = 118 ====== EvaluateImageInputUsingImageApi ======== training may be biased; consider re-training instead. @@ -847,7 +847,7 @@ WARNING: rn2_1.c2.y.y: loading pre-CuDNNv5 model: approximately converting varia WARNING: rn2_1.c_proj.y.y: loading pre-CuDNNv5 model: approximately converting variance statistics format WARNING: rn2_2.c1.c.y.y: loading pre-CuDNNv5 model: approximately converting variance statistics format WARNING: rn2_2.c2.y.y: loading pre-CuDNNv5 model: approximately converting variance statistics format -WARNING: rn3_1.c1.c.y.y: loading pre-CuDNNv5 model:EvaluateImageInputUsingImageApi: Outcome = 340 +WARNING: rn3_1.c1.c.y.y: loading pre-CuDNNv5 model:EvaluateImageInputUsingImageApi: Outcome = 118 ====== CompareImageApiResults ======== Both image API calls returned the same output vector. From 4e00f7dcce1ff2ca55227cc35b49ef5ed065fd1a Mon Sep 17 00:00:00 2001 From: Cha Zhang Date: Tue, 22 Nov 2016 18:01:03 -0800 Subject: [PATCH 15/25] Undo previous change. --- .../EvalClientTests/CSEvalClientTest/baseline.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Tests/EndToEndTests/EvalClientTests/CSEvalClientTest/baseline.txt b/Tests/EndToEndTests/EvalClientTests/CSEvalClientTest/baseline.txt index bbd2b3fa0..4448a313a 100644 --- a/Tests/EndToEndTests/EvalClientTests/CSEvalClientTest/baseline.txt +++ b/Tests/EndToEndTests/EvalClientTests/CSEvalClientTest/baseline.txt @@ -763,7 +763,7 @@ INFO: rn4_1.c_proj.y.y: loading pre-CuDNNv5 model: approximated mini-batch count INFO: rn4_2.c1.c.y.y: loading pre-CuDNNv5 model: approximated mini-batch count of 625625 as 10010000 trained samples. Statistics in further training may be biased; consider re-training instead. INFO: rn4_2.c2.y.y: loading pre-CuDNNv5 model: approximated mini-batch count of 625625 as 10010000 trained samples. - Statistics in further EvaluateImageInputUsingFeatureVector: Outcome = 118 + Statistics in further EvaluateImageInputUsingFeatureVector: Outcome = 340 ====== EvaluateImageInputUsingImageApi ======== training may be biased; consider re-training instead. 
@@ -847,7 +847,7 @@ WARNING: rn2_1.c2.y.y: loading pre-CuDNNv5 model: approximately converting varia WARNING: rn2_1.c_proj.y.y: loading pre-CuDNNv5 model: approximately converting variance statistics format WARNING: rn2_2.c1.c.y.y: loading pre-CuDNNv5 model: approximately converting variance statistics format WARNING: rn2_2.c2.y.y: loading pre-CuDNNv5 model: approximately converting variance statistics format -WARNING: rn3_1.c1.c.y.y: loading pre-CuDNNv5 model:EvaluateImageInputUsingImageApi: Outcome = 118 +WARNING: rn3_1.c1.c.y.y: loading pre-CuDNNv5 model:EvaluateImageInputUsingImageApi: Outcome = 340 ====== CompareImageApiResults ======== Both image API calls returned the same output vector. From efe4cc0ee955a12940eca451612422aa0642fac1 Mon Sep 17 00:00:00 2001 From: Cha Zhang Date: Tue, 22 Nov 2016 18:02:16 -0800 Subject: [PATCH 16/25] Change baseline. --- .../EvalClientTests/CSEvalClientTest/baseline.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Tests/EndToEndTests/EvalClientTests/CSEvalClientTest/baseline.txt b/Tests/EndToEndTests/EvalClientTests/CSEvalClientTest/baseline.txt index 4448a313a..bbd2b3fa0 100644 --- a/Tests/EndToEndTests/EvalClientTests/CSEvalClientTest/baseline.txt +++ b/Tests/EndToEndTests/EvalClientTests/CSEvalClientTest/baseline.txt @@ -763,7 +763,7 @@ INFO: rn4_1.c_proj.y.y: loading pre-CuDNNv5 model: approximated mini-batch count INFO: rn4_2.c1.c.y.y: loading pre-CuDNNv5 model: approximated mini-batch count of 625625 as 10010000 trained samples. Statistics in further training may be biased; consider re-training instead. INFO: rn4_2.c2.y.y: loading pre-CuDNNv5 model: approximated mini-batch count of 625625 as 10010000 trained samples. - Statistics in further EvaluateImageInputUsingFeatureVector: Outcome = 340 + Statistics in further EvaluateImageInputUsingFeatureVector: Outcome = 118 ====== EvaluateImageInputUsingImageApi ======== training may be biased; consider re-training instead. @@ -847,7 +847,7 @@ WARNING: rn2_1.c2.y.y: loading pre-CuDNNv5 model: approximately converting varia WARNING: rn2_1.c_proj.y.y: loading pre-CuDNNv5 model: approximately converting variance statistics format WARNING: rn2_2.c1.c.y.y: loading pre-CuDNNv5 model: approximately converting variance statistics format WARNING: rn2_2.c2.y.y: loading pre-CuDNNv5 model: approximately converting variance statistics format -WARNING: rn3_1.c1.c.y.y: loading pre-CuDNNv5 model:EvaluateImageInputUsingImageApi: Outcome = 340 +WARNING: rn3_1.c1.c.y.y: loading pre-CuDNNv5 model:EvaluateImageInputUsingImageApi: Outcome = 118 ====== CompareImageApiResults ======== Both image API calls returned the same output vector. From 02406df0df338f440bbec48f38127df96160bc96 Mon Sep 17 00:00:00 2001 From: "REDMOND\\sayanpa" Date: Tue, 22 Nov 2016 18:26:39 -0800 Subject: [PATCH 17/25] Added Sequence to sequence tutorial and test --- .../Examples/CNTK_204_Sequence_To_Sequence.py | 24 + Tutorials/CNTK_204_Sequence_To_Sequence.ipynb | 1124 +++++++++++++++++ 2 files changed, 1148 insertions(+) create mode 100644 Tests/EndToEndTests/CNTKv2Python/Examples/CNTK_204_Sequence_To_Sequence.py create mode 100644 Tutorials/CNTK_204_Sequence_To_Sequence.ipynb diff --git a/Tests/EndToEndTests/CNTKv2Python/Examples/CNTK_204_Sequence_To_Sequence.py b/Tests/EndToEndTests/CNTKv2Python/Examples/CNTK_204_Sequence_To_Sequence.py new file mode 100644 index 000000000..52ed55d4d --- /dev/null +++ b/Tests/EndToEndTests/CNTKv2Python/Examples/CNTK_204_Sequence_To_Sequence.py @@ -0,0 +1,24 @@ +# Copyright (c) Microsoft. 
All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import os +import re + +abs_path = os.path.dirname(os.path.abspath(__file__)) +notebook = os.path.join(abs_path, "..", "..", "..", "..", "Tutorials", "CNTK_204_Sequence_To_Sequence.ipynb") + +def test_cntk_204_sequence_to_sequence_noErrors(nb): + errors = [output for cell in nb.cells if 'outputs' in cell + for output in cell['outputs'] if output.output_type == "error"] + print(errors) + assert errors == [] + +expectedEvalError = 90 + +def test_cntk_204_sequence_to_sequence_trainerror(nb): + testCell = [cell for cell in nb.cells + if cell.cell_type == 'code' and re.search('#Print the training error', cell.source)] + assert float((testCell[0].outputs[0])['text']) < expectedEvalError \ No newline at end of file diff --git a/Tutorials/CNTK_204_Sequence_To_Sequence.ipynb b/Tutorials/CNTK_204_Sequence_To_Sequence.ipynb new file mode 100644 index 000000000..cf136cf93 --- /dev/null +++ b/Tutorials/CNTK_204_Sequence_To_Sequence.ipynb @@ -0,0 +1,1124 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CNTK 204: Sequence to Sequence Networks with Text Data\n", + "\n", + "\n", + "## Introduction and Background\n", + "\n", + "This hands-on tutorial will take you through both the basics of sequence-to-sequence networks, and how to implement them in the Microsoft Cognitive Toolkit. In particular, we will implement a sequence-to-sequence model to perform grapheme to phoneme translation. We will start with some basic theory and then explain the data in more detail, and how you can download it.\n", + "\n", + "Andrej Karpathy has a [nice visualization](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) of the five paradigms of neural network architectures:\n", + "\n", + "\n", + "\n", + "In this tutorial, we are going to be talking about the fourth paradigm: many-to-many, also known as sequence-to-sequence networks. The input is a sequence with a dynamic length, and the output is also a sequence with some dynamic length. It is the logical extension of the many-to-one paradigm in that previously we were predicting some category (which could easily be one of `V` words where `V` is an entire vocabulary) and now we want to predict a whole sequence of those categories.\n", + "\n", + "The applications of sequence-to-sequence networks are nearly limitless. It is a natural fit for machine translation (e.g. English input sequences, French output sequences); automatic text summarization (e.g. full document input sequence, summary output sequence); word to pronunciation models (e.g. character [grapheme] input sequence, pronunciation [phoneme] output sequence); and even parse tree generation (e.g. regular text input, flat parse tree output).\n", + "\n", + "## Basic theory\n", + "\n", + "A sequence-to-sequence model consists of two main pieces: (1) an encoder; and (2) a decoder. Both the encoder and the decoder are recurrent neural network (RNN) layers that can be implemented using a vanilla RNN, an LSTM, or GRU cells (here we will use LSTM). In the basic sequence-to-sequence model, the encoder processes the input sequence into a fixed representation that is fed into the decoder as a context. The decoder then uses some mechanism (discussed below) to decode the processed information into an output sequence. 
The decoder is a language model that is augmented with some \"strong context\" by the encoder, and so each symbol that it generates is fed back into the decoder for additional context (like a traditional LM). For an English to German translation task, the most basic setup might look something like this: \n", + "\n", + "\n", + "\n", + "The basic sequence-to-sequence network passes the information from the encoder to the decoder by initializing the decoder RNN with the final hidden state of the encoder as its initial hidden state. The input is then a \"sequence start\" tag (`` in the diagram above) which primes the decoder to start generating an output sequence. Then, whatever word (or note or image, etc.) it generates at that step is fed in as the input for the next step. The decoder keeps generating outputs until it hits the special \"end sequence\" tag (`` above).\n", + "\n", + "A more complex and powerful version of the basic sequence-to-sequence network uses an attention model. While the above setup works well, it can start to break down when the input sequences get long. At each step, the hidden state `h` is getting updated with the most recent information, and therefore `h` might be getting \"diluted\" in information as it processes each token. Further, even with a relatively short sequence, the last token will always get the last say and therefore the thought vector will be somewhat biased/weighted towards that last word. To deal with this problem, we use an \"attention\" mechanism that allows the decoder to look not only at all of the hidden states from the input, but it also learns which hidden states, for each step in decoding, to put the most weight on. We will discuss an attention implementation in a later version of this tutorial." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Problem: Grapheme-to-Phoneme Conversion\n", + "\n", + "The [grapheme](https://en.wikipedia.org/wiki/Grapheme) to [phoneme](https://en.wikipedia.org/wiki/Phoneme) problem is a translation task that takes the letters of a word as the input sequence (the graphemes are the smallest units of a writing system) and outputs the corresponding phonemes; that is, the units of sound that make up a language. In other words, the system aims to generate an unambigious representation of how to pronounce a given input word.\n", + "\n", + "### Example\n", + "\n", + "| Letters | T | A | N | G | E | R |\n", + "| --- | --- |\n", + "| Phonemes | ~T | ~AE | ~NG | ~ER | null | null |\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Task and Model Structure\n", + "\n", + "As discussed above, the task we are interested in solving is creating a model that takes some sequence as an input, and generates an output sequence based on the contents of the input. The model's job is to learn the mapping from the input sequence to the output sequence that it will generate. The job of the encoder is to come up with a good representation of the input that the decoder can use to generate a good output. For both the encoder and the decoder, the LSTM does a good job at this.\n", + "\n", + "We will use the LSTM implementation from the CNTK Blocks library. This implements the \"smarts\" of the LSTM and we can more or less think of it as a black box. 
What is important to understand, however, is that there are two pieces to think of when implementing an RNN: the recurrence, which is the unrolled network over a sequence, and the block, which is the piece of the network run for each element of the sequence. We only need to implement the recurrence.\n", + "\n", + "It helps to think of the recurrence as a function that keeps calling `step(x)` on the block (in our case, LSTM). At a high level, it looks like this:\n", + "\n", + "```\n", + "class LSTM {\n", + " float hidden_state\n", + " \n", + " init(initial_value):\n", + " hidden_state = initial_value\n", + " \n", + " step(x):\n", + " hidden_state = LSTM_function(x, hidden_state)\n", + " return hidden_state\n", + "}\n", + "```\n", + "\n", + "So, each call to the `step(x)` function takes some input `x`, modifies the internal `hidden_state`, and returns it. Therefore, with every input `x`, the value of the `hidden_state` evolves. Below we will import some required functionality, and then implement the recurrence that makes use of this mechanism." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Importing CNTK and other useful libraries\n", + "\n", + "CNTK is a python module that contains several submodules like `io`, `learner`, `graph`, etc. We make extensive use of numpy as well." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os\n", + "from cntk import Trainer, Axis, save_model\n", + "from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT\n", + "from cntk.learner import momentum_sgd, momentum_as_time_constant_schedule, learning_rate_schedule, UnitType\n", + "from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, sequence, past_value, future_value, element_select, \\\n", + " alias, hardmax, placeholder_variable, combine, parameter, plus, times\n", + "from cntk.ops.functions import CloneMethod\n", + "from cntk.graph import find_nodes_by_name\n", + "from cntk.blocks import LSTM, Stabilizer\n", + "from cntk.initializer import glorot_uniform\n", + "from cntk.utils import get_train_eval_criterion, get_train_loss" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Downloading the data\n", + "\n", + "In this tutorial we will use a lightly pre-processed version of the CMUDict (version 0.7b) dataset from http://www.speech.cs.cmu.edu/cgi-bin/cmudict. The CMUDict data is the Carnegie Mellon University Pronouncing Dictionary is an open-source machine-readable pronunciation dictionary for North American English. The data is in the CNTKTextFormatReader format. Here is an example sequence pair from the data, where the input sequence (S0) is in the left column, and the output sequence (S1) is on the right:\n", + "\n", + "```\n", + "0\t|S0 3:1 |# \t|S1 3:1 |# \n", + "0\t|S0 4:1 |# A\t|S1 32:1 |# ~AH\n", + "0\t|S0 5:1 |# B\t|S1 36:1 |# ~B\n", + "0\t|S0 4:1 |# A\t|S1 31:1 |# ~AE\n", + "0\t|S0 7:1 |# D\t|S1 38:1 |# ~D\n", + "0\t|S0 12:1 |# I\t|S1 47:1 |# ~IY\n", + "0\t|S0 1:1 |# \t|S1 1:1 |# \n", + "```\n", + "\n", + "The code below will download the required files (training, the single sequence above for validation, and a small vocab file) and put them in your local folder (the training file is ~34 MB, testing is ~4MB, and the validation file and vocab file are both less than 1KB)." 
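To make the CNTKTextFormat sample above concrete, here is a minimal, purely illustrative sketch in plain Python that pulls apart one row of that layout (tab-separated fields, each carrying a sparse `index:value` pair plus a human-readable `|#` comment). The `parse_ctf_line` helper is hypothetical and only for inspection; the actual reading is done by the `CTFDeserializer`-based reader created below.

```python
# Minimal sketch: pull apart one line of the CNTKTextFormat sample shown above.
# Assumes the layout "<seq_id>\t|S0 idx:val |# token\t|S1 idx:val |# token";
# illustration only -- the CTFDeserializer below does the real work.
def parse_ctf_line(line):
    seq_id, *fields = line.rstrip('\n').split('\t')
    parsed = {}
    for field in fields:
        name, rest = field.lstrip('|').split(' ', 1)   # e.g. 'S0', '4:1 |# A'
        sparse, _, comment = rest.partition('|#')
        index = int(sparse.strip().split(':')[0])      # sparse one-hot index
        parsed[name] = (index, comment.strip())        # (vocab index, readable token)
    return seq_id, parsed

print(parse_ctf_line("0\t|S0 4:1 |# A\t|S1 32:1 |# ~AH"))
# ('0', {'S0': (4, 'A'), 'S1': (32, '~AH')})
```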
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting download: tiny.ctf\n", + "Download completed\n", + "Starting download: cmudict-0.7b.train-dev-20-21.ctf\n", + "Download completed\n", + "Starting download: cmudict-0.7b.mapping\n", + "Download completed\n" + ] + } + ], + "source": [ + "import requests\n", + "\n", + "def download(url, filename):\n", + " \"\"\" utility to download necessary data \"\"\"\n", + " response = requests.get(url, stream=True)\n", + " with open(filename, \"wb\") as handle:\n", + " for data in response.iter_content():\n", + " handle.write(data)\n", + " \n", + "valid = \"https://github.com/Microsoft/CNTK/blob/master/Examples/SequenceToSequence/CMUDict/Data/tiny.ctf?raw=true\"\n", + "train = \"https://github.com/Microsoft/CNTK/blob/master/Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf?raw=true\"\n", + "vocab = \"https://github.com/Microsoft/CNTK/blob/master/Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.mapping?raw=true\"\n", + "\n", + "urls = [valid, train, vocab]\n", + "\n", + "for t in urls:\n", + " filename = t.split('/')[-1].split('?')[0]\n", + " try:\n", + " f = open(filename)\n", + " f.close()\n", + " print(\"Reusing locally cached: \", filename)\n", + " except IOError:\n", + " print(\"Starting download: \", filename)\n", + " download(t, filename)\n", + " print(\"Download completed\")\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Select the notebook run mode\n", + "\n", + "There are two run modes:\n", + "- *Fast mode*: `isFast` is set to `True`. This is the default mode for the notebooks, which means we train for fewer iterations or train / test on limited data. This ensures functional correctness of the notebook though the models produced are far from what a completed training would produce.\n", + "\n", + "- *Slow mode*: We recommend the user to set this flag to `False` once the user has gained familiarity with the notebook content and wants to gain insight from running the notebooks for a longer period with different parameters for training. " + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "isFast = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reader\n", + "\n", + "To efficiently collect our data, randomize it for training, and pass it to the network, we use the CNTKTextFormat reader. We will create a small function that will be called when training (or testing) that defines the names of the streams in our data, and how they are referred to in the raw training data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Hepler function to load the model vocabulary file\n", + "def get_vocab(path):\n", + " # get the vocab for printing output sequences in plaintext\n", + " vocab = [w.strip() for w in open(path).readlines()]\n", + " i2w = { i:ch for i,ch in enumerate(vocab) }\n", + " \n", + " return (vocab, i2w)\n", + "\n", + "# Read vocabulary data and generate their corresponding indices\n", + "vocab, i2w = get_vocab(\"cmudict-0.7b.mapping\")\n", + "\n", + "input_vocab_size = len(vocab)\n", + "label_vocab_size = len(vocab)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Vocabulary size is 69\n", + "First 15 letters are :\n", + "[\"'\", '', '', '', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K']\n", + "\n", + "Print dictionary with the vocabulary mapping: \n", + "{0: \"'\", 1: '', 2: '', 3: '', 4: 'A', 5: 'B', 6: 'C', 7: 'D', 8: 'E', 9: 'F', 10: 'G', 11: 'H', 12: 'I', 13: 'J', 14: 'K', 15: 'L', 16: 'M', 17: 'N', 18: 'O', 19: 'P', 20: 'Q', 21: 'R', 22: 'S', 23: 'T', 24: 'U', 25: 'V', 26: 'W', 27: 'X', 28: 'Y', 29: 'Z', 30: '~AA', 31: '~AE', 32: '~AH', 33: '~AO', 34: '~AW', 35: '~AY', 36: '~B', 37: '~CH', 38: '~D', 39: '~DH', 40: '~EH', 41: '~ER', 42: '~EY', 43: '~F', 44: '~G', 45: '~HH', 46: '~IH', 47: '~IY', 48: '~JH', 49: '~K', 50: '~L', 51: '~M', 52: '~N', 53: '~NG', 54: '~OW', 55: '~OY', 56: '~P', 57: '~R', 58: '~S', 59: '~SH', 60: '~T', 61: '~TH', 62: '~UH', 63: '~UW', 64: '~V', 65: '~W', 66: '~Y', 67: '~Z', 68: '~ZH'}\n" + ] + } + ], + "source": [ + "# Print vocab and the correspoding mapping to the phonemes\n", + "print(\"Vocabulary size is \", len(vocab))\n", + "print(\"First 15 letters are :\")\n", + "print(vocab[:15])\n", + "print()\n", + "print(\"Print dictionary with the vocabulary mapping: \")\n", + "print(i2w)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will use the above to create a reader for our training data. Let's create it now:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def create_reader(path, randomize, size=INFINITELY_REPEAT):\n", + " return MinibatchSource(CTFDeserializer(path, StreamDefs(\n", + " features = StreamDef(field='S0', shape=input_vocab_size, is_sparse=True),\n", + " labels = StreamDef(field='S1', shape=label_vocab_size, is_sparse=True)\n", + " )), randomize=randomize, epoch_size = size)\n", + "\n", + "# Train data reader\n", + "train_reader = create_reader(\"cmudict-0.7b.train-dev-20-21.ctf\", True)\n", + "\n", + "# Validation/Test data reader \n", + "valid_reader = create_reader(\"tiny.ctf\", False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Now let's set our model hyperparameters..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our input vocabulary size is 69, and those ones represent the label as well. Additionally we have 1 hidden layer with 128 nodes." 
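For a rough sense of model size, the sketch below estimates the parameter count of a single LSTM layer with these dimensions. It assumes the standard four-gate LSTM parameterization (input weights, recurrent weights, and biases for each gate) and ignores the stabilizers and the final output projection, so treat the number as an approximation only.

```python
# Rough parameter count for one LSTM layer (assumption: standard 4-gate LSTM,
# ignoring stabilizers and the output projection defined later in this tutorial).
input_dim  = 69    # vocabulary size (one-hot input)
hidden_dim = 128   # hidden/cell dimension
lstm_params = 4 * (hidden_dim * (input_dim + hidden_dim) + hidden_dim)
print(lstm_params)  # 101376 -- deliberately small, so the tutorial trains quickly
```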
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "model_dir = \".\" # we downloaded our data to the local directory above\n", + "\n", + "# model dimensions\n", + "input_vocab_dim = input_vocab_size\n", + "label_vocab_dim = label_vocab_size\n", + "hidden_dim = 128\n", + "num_layers = 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: setup the input to the network\n", + "\n", + "### Dynamic axes in CNTK (Key concept)\n", + "\n", + "One of the important concepts in understanding CNTK is the idea of two types of axes: \n", + "- **static axes**, which are the traditional axes of a variable's shape, and \n", + "- **dynamic axes**, which have dimensions that are unknown until the variable is bound to real data at computation time. \n", + "\n", + "The dynamic axes are particularly important in the world of recurrent neural networks. Instead of having to decide a maximum sequence length ahead of time, padding your sequences to that size, and wasting computation, CNTK's dynamic axes allow for variable sequence lengths that are automatically packed in minibatches to be as efficient as possible.\n", + "\n", + "When setting up sequences, there are *two dynamic axes* that are important to consider. The first is the *batch axis*, which is the axis along which multiple sequences are batched. The second is the dynamic axis particular to that sequence. The latter is specific to a particular input because of variable sequence lengths in your data. For example, in sequence to sequence networks, we have two sequences: the **input sequence**, and the **ouptput (or 'label') sequence**. One of the things that makes this type of network so powerful is that the length of the input sequence and the output sequence do not have to correspond to each other. Therefore, both the input sequence and the output sequence require their own unique dynamic axis.\n", + "\n", + "When defining the input to a network, we set up the required dynamic axes and the shape of the input variables. Below, we define the shape (vocabulary size) of the inputs, create their dynamic axes, and finally create input variables that represent input nodes in our network." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Source and target inputs to the model\n", + "batch_axis = Axis.default_batch_axis()\n", + "input_seq_axis = Axis('inputAxis')\n", + "label_seq_axis = Axis('labelAxis')\n", + "\n", + "input_dynamic_axes = [batch_axis, input_seq_axis]\n", + "raw_input = input_variable(shape=(input_vocab_dim), dynamic_axes=input_dynamic_axes, name='raw_input')\n", + "\n", + "label_dynamic_axes = [batch_axis, label_seq_axis]\n", + "raw_labels = input_variable(shape=(label_vocab_dim), dynamic_axes=label_dynamic_axes, name='raw_labels')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Questions\n", + "\n", + "1. Why do the shapes of the input variables correspond to the size of our dictionaries in sequence to sequence networks?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: define the network\n", + "\n", + "As discussed before, the sequence-to-sequence network is, at its most basic, an RNN encoder followed by an RNN decoder, and a dense output layer. 
We could do this in a few lines with the layers library, but let's go through things in a little more detail without adding too much complexity. The first step is to perform some manipulations on the input data; let's look at the code below and then discuss what we're doing. " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Instantiate the sequence to sequence translation model\n", + "input_sequence = raw_input\n", + "\n", + "# Drop the sentence start token from the label, for decoder training\n", + "label_sequence = sequence.slice(raw_labels, \n", + " 1, 0, name='label_sequence') # A B C --> A B C \n", + "label_sentence_start = sequence.first(raw_labels) # \n", + "\n", + "is_first_label = sequence.is_first(label_sequence) # 1 0 0 0 ...\n", + "label_sentence_start_scattered = sequence.scatter( # 0 0 0 ... (up to the length of label_sequence)\n", + " label_sentence_start, is_first_label)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have two input variables, `raw_input` and `raw_labels`. Typically, the labels would not have to be part of the network definition because they would only be used in a criterion node when we compare the network's output with the ground truth. However, in sequence-to-sequence networks, the labels themselves form part of the input to the network during training as they are fed as the input into the decoder.\n", + "\n", + "To make use of these input variables, we will pass them through computation nodes. We first set `input_sequence` to `raw_input` as a convenience step. We then perform several modifications to `label_sequence` so that it will work with our network. For now you'll just have to trust that we will make good use of this stuff later.\n", + "\n", + "First, we slice the first element off of `label_sequence` so that it's missing the sentence-start token. This is because the decoder will always first be primed with that token, both during training and evaluation. When the ground truth isn't fed into the decoder, we will still feed in a sentence-start token, so we want to consistently view the input to the decoder as a sequence that starts with an actual value.\n", + "\n", + "Then, we get `label_sequence_start` by getting the `first` element from the sequence `raw_labels`. This will be used to compose a sequence that is the first input to the decoder regardless of whether we're training or decoding. Finally, the last two statements set up an actual sequence, with the correct dynamic axis, to be fed into the decoder. The function `sequence.scatter` takes the contents of `label_sentence_start` (which is ``) and turns it into a sequence with the first element containing the sequence start symbol and the rest of the elements containing 0's." 
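If the `sequence.scatter` step feels abstract, the following NumPy picture shows, for a single toy sequence, what the scattered start-token sequence looks like next to `label_sequence`. This is only an illustration of the shapes and values involved (a made-up 4-symbol vocabulary with the sentence-start symbol at index 0); it does not call the CNTK ops themselves.

```python
import numpy as np

# Toy illustration (not the CNTK ops): 4-symbol vocabulary, sentence-start at index 0.
label_sequence = np.array([[0, 0, 1, 0],    # A
                           [0, 0, 0, 1],    # B
                           [0, 1, 0, 0]])   # end-of-sequence symbol
sentence_start = np.array([1, 0, 0, 0])     # one-hot sentence-start symbol
is_first       = np.array([1, 0, 0])        # 1 only at the first step of the sequence

# scatter: put the start symbol at the first position, zeros everywhere else
scattered = is_first[:, None] * sentence_start[None, :]
print(scattered)
# [[1 0 0 0]
#  [0 0 0 0]
#  [0 0 0 0]]
```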
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let's create the LSTM recurrence" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def LSTM_layer(input, output_dim, recurrence_hook_h=past_value, recurrence_hook_c=past_value):\n", + " # we first create placeholders for the hidden state and cell state which we don't have yet\n", + " dh = placeholder_variable(shape=(output_dim), dynamic_axes=input.dynamic_axes)\n", + " dc = placeholder_variable(shape=(output_dim), dynamic_axes=input.dynamic_axes)\n", + " \n", + " # we now create an LSTM_cell function and call it with the input and placeholders\n", + " LSTM_cell = LSTM(output_dim)\n", + " f_x_h_c = LSTM_cell(input, (dh, dc))\n", + " h_c = f_x_h_c.outputs\n", + " \n", + " # we setup the recurrence by specifying the type of recurrence (by default it's `past_value` -- the previous value)\n", + " h = recurrence_hook_h(h_c[0])\n", + " c = recurrence_hook_c(h_c[1])\n", + "\n", + " replacements = { dh: h.output, dc: c.output }\n", + " f_x_h_c.replace_placeholders(replacements)\n", + "\n", + " h = f_x_h_c.outputs[0]\n", + " c = f_x_h_c.outputs[1]\n", + "\n", + " # and finally we return the hidden state and cell state as functions (by using `combine`)\n", + " return combine([h]), combine([c])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exercise 1: Create the encoder\n", + "\n", + "We will use the LSTM recurrence that we defined just above. Remember that its function signature is:\n", + "\n", + "`def LSTM_layer(input, output_dim, recurrence_hook_h=past_value, recurrence_hook_c=past_value):`\n", + "\n", + "and it returns a tuple `(hidden_state, hidden_cell)`. We will complete the following four exercises below. If possible, try them out before looking at the answers.\n", + "\n", + "1. Create the encoder (set the `output_dim` and `cell_dim` to `hidden_dim` which we defined earlier).\n", + "2. Set `num_layers` to something higher than 1 and create a stack of LSTMs to represent the encoder.\n", + "3. Get the output of the encoder and put it into the right form to be passed into the decoder [hard]\n", + "4. 
Reverse the order of the `input_sequence` (this has been shown to help especially in machine translation)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# 1.\n", + "# Create the encoder (set the output_dim to hidden_dim which we defined earlier).\n", + "\n", + "(encoder_output_h, encoder_output_c) = LSTM_layer(input_sequence, hidden_dim)\n", + "\n", + "# 2.\n", + "# Set num_layers to something higher than 1 and create a stack of LSTMs to represent the encoder.\n", + "num_layers = 2\n", + "output_h = alias(input_sequence) # get a copy of the input_sequence\n", + "for i in range(0, num_layers):\n", + " (output_h, output_c) = LSTM_layer(output_h.output, hidden_dim)\n", + "\n", + "# 3.\n", + "# Get the output of the encoder and put it into the right form to be passed into the decoder [hard]\n", + "thought_vector_h = sequence.first(output_h)\n", + "thought_vector_c = sequence.first(output_c)\n", + "\n", + "thought_vector_broadcast_h = sequence.broadcast_as(thought_vector_h, label_sequence)\n", + "thought_vector_broadcast_c = sequence.broadcast_as(thought_vector_c, label_sequence)\n", + "\n", + "# 4.\n", + "# Reverse the order of the input_sequence (this has been shown to help especially in machine translation)\n", + "(encoder_output_h, encoder_output_c) = LSTM_layer(input_sequence, hidden_dim, future_value, future_value)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exercise 2: Create the decoder\n", + "\n", + "In our basic version of the sequence-to-sequence network, the decoder generates an output sequence given the input sequence by setting the initial state of the decoder to the final hidden state of the encoder. The hidden state is represented by a tuple `(encoder_h, encoder_c)` where `h` represents the output hidden state and `c` represents the value of the LSTM cell.\n", + "\n", + "Besides setting the initial state of the decoder, we also need to give the decoder LSTM some input. The first element will always be the special sequence start tag ``. After that, there are two ways that we want to wire up the decoder's input: one during training, and the other during evaluation (i.e. generating sequences on the trained network).\n", + "\n", + "For training, the input to the decoder is the output sequence from the training data, also known as the label(s) for the input sequence. During evaluation, we will instead redirect the output from the network back into the decoder as its history. Let's first set up the input for training..." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "decoder_input = element_select(is_first_label, label_sentence_start_scattered, past_value(label_sequence))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Above, we use the function `element_select` which will return one of two options given the condition `is_first_label`. Remember that we're working with sequences so when the decoder LSTM is run its input will be unrolled along with the network. The above allows us to to have a dynamic input that will return a specific element given what time step we're currently processing.\n", + "\n", + "Therefore, the `decoder_input` will be `label_sentence_start_scattered` (which is simply ``) when we are at the first time step, and otherwise it will return the `past_value` (i.e. 
the previous element given what time step we're currently at) of `label_sequence`.\n", + "\n", + "Next, we need to setup our actual decoder. Before, for the encoder, we did the following:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "(output_h, output_c) = LSTM_layer(input_sequence, hidden_dim, \n", + " recurrence_hook_h=past_value, recurrence_hook_c=past_value)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To be able to set the first hidden state of the decoder to be equal to the final hidden state of the encoder, we can leverage the parameters `recurrence_hookH` and `recurrent_hookC`. The default `past_value` is a function that returns, for time `t`, the element in the sequence at time `t-1`. See if you can figure out how to set that up.\n", + "\n", + "1. Create the recurrence hooks for the decoder LSTM.\n", + " * Hint: you'll have to create a `lambda operand:` and you will make use of the `is_first_label` mask we used earlier and the `thought_vector_broadcast_h` and `thought_vector_broadcast_c` representations of the output of the encoder.\n", + "\n", + "2. With your recurrence hooks, create the decoder.\n", + " * Hint: again we'll use the `LSTMP_component_with_self_stabilization()` function and again use `hidden_dim` for the `output_dim` and `cell_dim`.\n", + " \n", + "3. Create a decoder with multiple layers. Note that you will have to use different recurrence hooks for the lower layers that feed back into the stack of layers." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# 1. \n", + "# Create the recurrence hooks for the decoder LSTM.\n", + "\n", + "recurrence_hook_h = lambda operand: element_select(is_first_label, thought_vector_broadcast_h, past_value(operand))\n", + "recurrence_hook_c = lambda operand: element_select(is_first_label, thought_vector_broadcast_c, past_value(operand))\n", + "\n", + "# 2.\n", + "# With your recurrence hooks, create the decoder.\n", + "\n", + "(decoder_output_h, decoder_output_c) = LSTM_layer(decoder_input, hidden_dim, recurrence_hook_h, recurrence_hook_c)\n", + "\n", + "# 3.\n", + "# Create a decoder with multiple layers. \n", + "# Note that you will have to use different recurrence hooks for the lower layers\n", + "\n", + "num_layers = 3\n", + "decoder_output_h = alias(decoder_input)\n", + "for i in range(0, num_layers):\n", + " if (i > 0):\n", + " recurrence_hook_h = past_value\n", + " recurrence_hook_c = past_value\n", + " else:\n", + " recurrence_hook_h = lambda operand: element_select(\n", + " is_first_label, thought_vector_broadcast_h, past_value(operand))\n", + " recurrence_hook_c = lambda operand: element_select(\n", + " is_first_label, thought_vector_broadcast_c, past_value(operand))\n", + "\n", + " (decoder_output_h, decoder_output_c) = LSTM_layer(decoder_output_h.output, hidden_dim, \n", + " recurrence_hook_h, recurrence_hook_c)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exercise 3: Fully connected layer (network output)\n", + "\n", + "Now we're almost at the end of defining the network. All we need to do is take the output of the decoder, and run it through a linear layer. Ultimately it will be put into a `softmax` to get a probability distribution over the possible output words. However, we will include that as part of our criterion nodes (below).\n", + "\n", + "1. 
Add the linear layer (a weight matrix, a bias parameter, a times, and a plus) to get the final output of the network" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# 1.\n", + "# Add the linear layer\n", + "\n", + "W = parameter(shape=(decoder_output_h.shape[0], label_vocab_dim), init=glorot_uniform())\n", + "B = parameter(shape=(label_vocab_dim), init=0)\n", + "z = plus(B, times(decoder_output_h, W))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Putting the model together\n", + "\n", + "With the above we have defined some of the network and asked you to define parts of it as exercises. Here let's put the whole thing into a function called `create_model()`. Remember, all this does is create a skeleton of the network that defines how data will flow through it. No data is running through it yet." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def create_model():\n", + " \n", + " # Source and target inputs to the model\n", + " batch_axis = Axis.default_batch_axis()\n", + " input_seq_axis = Axis('inputAxis')\n", + " label_seq_axis = Axis('labelAxis')\n", + "\n", + " input_dynamic_axes = [batch_axis, input_seq_axis]\n", + " raw_input = input_variable(\n", + " shape=(input_vocab_dim), dynamic_axes=input_dynamic_axes, name='raw_input')\n", + "\n", + " label_dynamic_axes = [batch_axis, label_seq_axis]\n", + " raw_labels = input_variable(\n", + " shape=(label_vocab_dim), dynamic_axes=label_dynamic_axes, name='raw_labels')\n", + "\n", + " # Instantiate the sequence to sequence translation model\n", + " input_sequence = raw_input\n", + "\n", + " # Drop the sentence start token from the label, for decoder training\n", + " label_sequence = sequence.slice(raw_labels, 1, 0, \n", + " name='label_sequence') # A B C --> A B C \n", + " label_sentence_start = sequence.first(raw_labels) # \n", + "\n", + " # Setup primer for decoder\n", + " is_first_label = sequence.is_first(label_sequence) # 1 0 0 0 ...\n", + " label_sentence_start_scattered = sequence.scatter(\n", + " label_sentence_start, is_first_label)\n", + "\n", + " # Encoder\n", + " stabilize = Stabilizer()\n", + " encoder_output_h = stabilize(input_sequence)\n", + " for i in range(0, num_layers):\n", + " (encoder_output_h, encoder_output_c) = LSTM_layer(\n", + " encoder_output_h.output, hidden_dim, future_value, future_value)\n", + "\n", + " # Prepare encoder output to be used in decoder\n", + " thought_vector_h = sequence.first(encoder_output_h)\n", + " thought_vector_c = sequence.first(encoder_output_c)\n", + "\n", + " thought_vector_broadcast_h = sequence.broadcast_as(\n", + " thought_vector_h, label_sequence)\n", + " thought_vector_broadcast_c = sequence.broadcast_as(\n", + " thought_vector_c, label_sequence)\n", + "\n", + " # Decoder\n", + " decoder_history_hook = alias(label_sequence, name='decoder_history_hook') # copy label_sequence\n", + "\n", + " decoder_input = element_select(is_first_label, label_sentence_start_scattered, past_value(\n", + " decoder_history_hook))\n", + "\n", + " decoder_output_h = stabilize(decoder_input)\n", + " for i in range(0, num_layers):\n", + " if (i > 0):\n", + " recurrence_hook_h = past_value\n", + " recurrence_hook_c = past_value\n", + " else:\n", + " recurrence_hook_h = lambda operand: element_select(\n", + " is_first_label, thought_vector_broadcast_h, past_value(operand))\n", + " recurrence_hook_c = 
lambda operand: element_select(\n", + " is_first_label, thought_vector_broadcast_c, past_value(operand))\n", + "\n", + " (decoder_output_h, decoder_output_c) = LSTM_layer(\n", + " decoder_output_h.output, hidden_dim, recurrence_hook_h, recurrence_hook_c)\n", + "\n", + " # Linear output layer\n", + " W = parameter(shape=(decoder_output_h.shape[0], label_vocab_dim), init=glorot_uniform())\n", + " B = parameter(shape=(label_vocab_dim), init=0)\n", + " z = plus(B, times(stabilize(decoder_output_h), W)) \n", + " \n", + " return z" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training\n", + "\n", + "Now that we've created the model, we are ready to train the network and learn its parameters. For sequence-to-sequence networks, the loss we use is cross-entropy. Note that we have to find the `label_sequences` node from the model because it was defined in our network and we want to compare the model's predictions specifically to the outputs of that node." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['raw_labels', 'raw_input']\n" + ] + } + ], + "source": [ + "model = create_model()\n", + "label_sequence = find_nodes_by_name(model, 'label_sequence')[0]\n", + "\n", + "# Criterion nodes\n", + "ce = cross_entropy_with_softmax(model, label_sequence)\n", + "errs = classification_error(model, label_sequence)\n", + "\n", + "# let's show the required arguments for this model\n", + "print([x.name for x in model.arguments])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we'll setup a bunch of parameters to drive our learning, we'll create the learner, and finally create our trainer:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# training parameters\n", + "lr_per_sample = learning_rate_schedule(0.007, UnitType.sample)\n", + "minibatch_size = 72\n", + "momentum_time_constant = momentum_as_time_constant_schedule(1100)\n", + "clipping_threshold_per_sample = 2.3\n", + "gradient_clipping_with_truncation = True\n", + "learner = momentum_sgd(model.parameters,\n", + " lr_per_sample, momentum_time_constant,\n", + " gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,\n", + " gradient_clipping_with_truncation=gradient_clipping_with_truncation)\n", + "trainer = Trainer(model, ce, errs, learner)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now we bind the features and labels from our `train_reader` to the inputs that we setup in our network definition. First however, we'll define a convenience function to help find an argument name when pointing the reader's features to an argument of our model." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# helper function to find variables by name\n", + "def find_arg_by_name(name, expression):\n", + " vars = [i for i in expression.arguments if i.name == name]\n", + " assert len(vars) == 1\n", + " return vars[0]\n", + "\n", + "train_bind = {\n", + " find_arg_by_name('raw_input' , model) : train_reader.streams.features,\n", + " find_arg_by_name('raw_labels', model) : train_reader.streams.labels\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we define our training loop and start training the network!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Minibatch: 0, Train Loss: 3.149, Train Evaluation Criterion: 0.870\n", + "Minibatch: 100, Train Loss: 3.029, Train Evaluation Criterion: 0.759\n", + "Minibatch: 200, Train Loss: 3.098, Train Evaluation Criterion: 0.778\n" + ] + } + ], + "source": [ + "training_progress_output_freq = 100\n", + "max_num_minibatch = 100 if isFast else 1000\n", + "\n", + "for i in range(max_num_minibatch):\n", + " # get next minibatch of training data\n", + " mb_train = train_reader.next_minibatch(minibatch_size, input_map=train_bind)\n", + " trainer.train_minibatch(mb_train)\n", + "\n", + " # collect epoch-wide stats\n", + " if i % training_progress_output_freq == 0:\n", + " print(\"Minibatch: {0}, Train Loss: {1:.3f}, Train Evaluation Criterion: {2:2.3f}\".format(i, \n", + " get_train_loss(trainer), get_train_eval_criterion(trainer)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model evaluation: greedy decoding\n", + "\n", + "Once we have a trained model, we of course then want to make use of it to generate output sequences! In this case, we will use greedy decoding. What this means is that we will run an input sequence through our trained network, and when we generate the output sequence, we will do so one element at a time by taking the `hardmax()` of the output of our network. This is obviously not optimal in general. Given the context, some word may always be the most probable at the first step, but another first word may be preferred given what is output later on. Decoding the optimal sequence is intractable in general. But we can do better doing a beam search where we keep around some small number of hypotheses at each step. However, greedy decoding can work surprisingly well for sequence-to-sequence networks because so much of the context is kept around in the RNN.\n", + "\n", + "To do greedy decoding, we need to hook in the previous output of our network as the input to the decoder. During training we passed the `label_sequences` (ground truth) in. You'll notice in our `create_model()` function above the following lines:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "decoder_history_hook = alias(label_sequence, name='decoder_history_hook') # copy label_sequence\n", + "decoder_input = element_select(is_first_label, label_sentence_start_scattered, past_value(decoder_history_hook))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This gives us a way to modify the `decoder_history_hook` after training to something else. We've already trained our network, but now we need a way to evaluate it without using a ground truth. 
We can do that like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "model = create_model()\n", + "\n", + "# get some references to the new model\n", + "label_sequence = find_nodes_by_name(model, 'label_sequence')[0] \n", + "decoder_history_hook = find_nodes_by_name(model, 'decoder_history_hook')[0] \n", + "\n", + "# and now replace the output of decoder_history_hook with the hardmax output of the network\n", + "def clone_and_hook():\n", + " # network output for decoder history\n", + " net_output = hardmax(model)\n", + "\n", + " # make a clone of the graph where the ground truth is replaced by the network output\n", + " return model.clone(CloneMethod.share, {decoder_history_hook.output : net_output.output})\n", + "\n", + "# get a new model that uses the past network output as input to the decoder\n", + "new_model = clone_and_hook()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `new_model` now contains a version of the original network that shares parameters with it but that has a different input to the decoder. Namely, instead of feeding the ground truth labels into the decoder, it will feed in the history that the network has generated!\n", + "\n", + "Finally, let's see what it looks like if we train, and keep evaluating the network's output every `100` iterations by running a word's graphemes ('A B A D I') through our network. This way we can visualize the progress of learning as the model improves. First we'll define a more complete `train()` action. It is largely the same as above, but it includes some additional training parameters, some additional logic for printing out statistics as we go along, progress reported in epochs (one epoch is one complete pass over the training data), and a reader for the single validation sequence we described above so that we can visually follow our network's progress on that sequence as it learns."
+ ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "########################\n", + "# train action #\n", + "########################\n", + "\n", + "def train(train_reader, valid_reader, vocab, i2w, model, max_epochs):\n", + " \n", + " # do some hooks that we won't need in the future\n", + " label_sequence = find_nodes_by_name(model, 'label_sequence')[0] \n", + " decoder_history_hook = find_nodes_by_name(model, 'decoder_history_hook')[0] \n", + " \n", + " # Criterion nodes\n", + " ce = cross_entropy_with_softmax(model, label_sequence)\n", + " errs = classification_error(model, label_sequence)\n", + "\n", + " def clone_and_hook():\n", + " # network output for decoder history\n", + " net_output = hardmax(model)\n", + "\n", + " # make a clone of the graph where the ground truth is replaced by the network output\n", + " return model.clone(CloneMethod.share, {decoder_history_hook.output : net_output.output})\n", + "\n", + " # get a new model that uses the past network output as input to the decoder\n", + " new_model = clone_and_hook()\n", + "\n", + " # Instantiate the trainer object to drive the model training\n", + " lr_per_sample = learning_rate_schedule(0.007, UnitType.sample)\n", + " minibatch_size = 72\n", + " momentum_time_constant = momentum_as_time_constant_schedule(1100)\n", + " clipping_threshold_per_sample = 2.3\n", + " gradient_clipping_with_truncation = True\n", + " learner = momentum_sgd(model.parameters,\n", + " lr_per_sample, momentum_time_constant,\n", + " gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,\n", + " gradient_clipping_with_truncation=gradient_clipping_with_truncation)\n", + " trainer = Trainer(model, ce, errs, learner)\n", + "\n", + " # Get minibatches of sequences to train with and perform model training\n", + " i = 0\n", + " mbs = 0\n", + " \n", + " # Set epoch size to a larger number of lower training error\n", + " epoch_size = 5000 if isFast else 908241\n", + " \n", + " training_progress_output_freq = 100\n", + "\n", + " # bind inputs to data from readers\n", + " train_bind = {\n", + " find_arg_by_name('raw_input' , model) : train_reader.streams.features,\n", + " find_arg_by_name('raw_labels', model) : train_reader.streams.labels\n", + " }\n", + " valid_bind = {\n", + " find_arg_by_name('raw_input' , new_model) : valid_reader.streams.features,\n", + " find_arg_by_name('raw_labels', new_model) : valid_reader.streams.labels\n", + " }\n", + "\n", + " for epoch in range(max_epochs):\n", + " loss_numer = 0\n", + " metric_numer = 0\n", + " denom = 0\n", + "\n", + " while i < (epoch+1) * epoch_size:\n", + " # get next minibatch of training data\n", + " mb_train = train_reader.next_minibatch(minibatch_size, input_map=train_bind)\n", + " trainer.train_minibatch(mb_train)\n", + "\n", + " # collect epoch-wide stats\n", + " samples = trainer.previous_minibatch_sample_count\n", + " loss_numer += trainer.previous_minibatch_loss_average * samples\n", + " metric_numer += trainer.previous_minibatch_evaluation_average * samples\n", + " denom += samples\n", + "\n", + " # every N MBs evaluate on a test sequence to visually show how we're doing; also print training stats\n", + " if mbs % training_progress_output_freq == 0:\n", + " \n", + " print(\"Minibatch: {0}, Train Loss: {1:2.3f}, Train Evaluation Criterion: {2:2.3f}\".format(mbs, \n", + " get_train_loss(trainer), get_train_eval_criterion(trainer)))\n", + " \n", + " mb_valid = valid_reader.next_minibatch(minibatch_size, 
input_map=valid_bind)\n", + " e = new_model.eval(mb_valid)\n", + " print_sequences(e, i2w)\n", + "\n", + " i += mb_train[find_arg_by_name('raw_labels', model)].num_samples\n", + " mbs += 1\n", + "\n", + " print(\"--- EPOCH %d DONE: loss = %f, errs = %f ---\" % (epoch, loss_numer/denom, 100.0*(metric_numer/denom)))\n", + " return 100.0*(metric_numer/denom)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have our two important functions defined -- `create_model()` and `train()` -- let's make use of them:" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Minibatch: 0, Train Loss: 4.234, Train Evaluation Criterion: 1.000\n", + "['', '', '', '', '', '~S']\n", + "--- EPOCH 0 DONE: loss = 3.817894, errs = 86.991687 ---\n" + ] + } + ], + "source": [ + "# Given a vocab and tensor, print the output\n", + "def print_sequences(sequences, i2w):\n", + " for s in sequences:\n", + " print([i2w[np.argmax(w)] for w in s], sep=\" \")\n", + "\n", + "# hook up data\n", + "train_reader = create_reader(\"cmudict-0.7b.train-dev-20-21.ctf\", True)\n", + "valid_reader = create_reader(\"tiny.ctf\", False)\n", + "vocab, i2w = get_vocab(\"cmudict-0.7b.mapping\")\n", + "\n", + "# create model\n", + "model = create_model()\n", + " \n", + "# train\n", + "error = train(train_reader, valid_reader, vocab, i2w, model, max_epochs=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "86.992\n" + ] + } + ], + "source": [ + "#Print the training error \n", + "print(error)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Task\n", + "Note that the error is very high. This is largely due to the minimal training we have done so far. Please change the `epoch_size` to be a much higher number and re-run the `train` function. This might take considerably longer, but you will see a marked reduction in the error." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next steps\n", + "\n", + "An important extension to sequence-to-sequence models, especially when dealing with long sequences, is to use an attention mechanism. The idea behind attention is to allow the decoder, first, to look at any of the hidden state outputs from the encoder (instead of using only the final hidden state), and, second, to learn how much attention to pay to each of those hidden states given the context. This allows the outputted word at each time step `t` to depend not only on the final hidden state and the word that came before it, but instead on a weighted combination of *all* of the input hidden states!\n", + "\n", + "In the next version of this tutorial, we will talk about how to include attention in your sequence-to-sequence network."
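To make the attention idea a little more concrete, here is a rough NumPy sketch of the core computation: score each encoder hidden state against the current decoder state, normalize the scores with a softmax, and form a weighted context vector. This is only an illustration of the concept (dot-product scoring is one common choice), not the attention implementation that a future version of this tutorial will use.

```python
import numpy as np

def softmax(x):
    e = np.exp(x - np.max(x))
    return e / e.sum()

def attention_context(decoder_state, encoder_states):
    """decoder_state: (hidden_dim,), encoder_states: (input_len, hidden_dim).
    Returns the attention weights over the input and the weighted context vector."""
    scores = encoder_states @ decoder_state   # one score per input position
    weights = softmax(scores)                 # how much attention to pay to each position
    context = weights @ encoder_states        # weighted combination of all hidden states
    return weights, context

# toy example: 5 input positions, hidden dimension 128 (as in this tutorial)
enc = np.random.randn(5, 128)
dec = np.random.randn(128)
w, c = attention_context(dec, enc)
print(w.sum(), c.shape)   # weights sum to 1.0, context has shape (128,)
```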
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.5" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From d9b99d1831bf600eb5f00d1928e7173e7aaa5398 Mon Sep 17 00:00:00 2001 From: Eldar Akchurin Date: Mon, 21 Nov 2016 17:10:09 +0100 Subject: [PATCH 18/25] Refactoring recursion to decrease frame size --- Source/CNTKv2LibraryDll/API/CNTKLibrary.h | 26 +--- .../API/CNTKLibraryInternals.h | 6 +- Source/CNTKv2LibraryDll/BackCompat.cpp | 132 ++++++++++-------- .../V2LibraryTests/V2LibraryTests.vcxproj | 3 +- 4 files changed, 81 insertions(+), 86 deletions(-) diff --git a/Source/CNTKv2LibraryDll/API/CNTKLibrary.h b/Source/CNTKv2LibraryDll/API/CNTKLibrary.h index 0cadf6d97..d5011703e 100644 --- a/Source/CNTKv2LibraryDll/API/CNTKLibrary.h +++ b/Source/CNTKv2LibraryDll/API/CNTKLibrary.h @@ -393,16 +393,11 @@ namespace CNTK friend class PackedValue; friend class MPICommunicatorImpl; friend class BlockMomentumDistributedTrainer; + friend class Internal::VariableResolver; template friend inline std::shared_ptr MakeSharedObject(CtorArgTypes&& ...ctorArgs); - template - friend Variable Internal::GetVariable(const Microsoft::MSR::CNTK::ComputationNodeBasePtr& node, - std::unordered_map& nodeToVariableMap, - std::unordered_map& placeholderReplacements, - std::unordered_set& allPrimitiveFunctions); - public: /// /// Construct a NDArrayView with the specified 'dataBuffer' as the backing storage. 
@@ -635,6 +630,7 @@ namespace CNTK static const size_t AutoSelectRowColSplitPoint = SIZE_MAX; private: + CNTK_API NDArrayView(::CNTK::DataType dataType, const DeviceDescriptor& device, ::CNTK::StorageFormat storageType, const NDShape& viewShape, bool readOnly, void* tensorView); @@ -1599,11 +1595,7 @@ namespace CNTK template friend struct std::hash; - template - friend Variable Internal::GetVariable(const Microsoft::MSR::CNTK::ComputationNodeBasePtr& node, - std::unordered_map& nodeToVariableMap, - std::unordered_map& placeholderReplacements, - std::unordered_set& allPrimitiveFunctions); + friend class Internal::VariableResolver; #ifndef SWIG private: @@ -1952,11 +1944,7 @@ private: template friend struct std::hash; - template - friend Variable Internal::GetVariable(const Microsoft::MSR::CNTK::ComputationNodeBasePtr& node, - std::unordered_map& nodeToVariableMap, - std::unordered_map& placeholderReplacements, - std::unordered_set& allPrimitiveFunctions); + friend class Internal::VariableResolver; public: /// @@ -2037,11 +2025,7 @@ private: template friend struct std::hash; - template - friend Variable Internal::GetVariable(const Microsoft::MSR::CNTK::ComputationNodeBasePtr& node, - std::unordered_map& nodeToVariableMap, - std::unordered_map& placeholderReplacements, - std::unordered_set& allPrimitiveFunctions); + friend class Internal::VariableResolver; public: /// diff --git a/Source/CNTKv2LibraryDll/API/CNTKLibraryInternals.h b/Source/CNTKv2LibraryDll/API/CNTKLibraryInternals.h index 2ca45ffd8..b2f28ae5b 100644 --- a/Source/CNTKv2LibraryDll/API/CNTKLibraryInternals.h +++ b/Source/CNTKv2LibraryDll/API/CNTKLibraryInternals.h @@ -247,10 +247,6 @@ namespace CNTK CNTK_API bool AreEqual(const ::CNTK::NDArrayView& view1, const ::CNTK::NDArrayView& view2, double relativeTolerance = 0.0, double absoluteTolerance = 0.0); - template - Variable GetVariable(const Microsoft::MSR::CNTK::ComputationNodeBasePtr& node, - std::unordered_map& nodeToVariableMap, - std::unordered_map<::CNTK::Variable, ::CNTK::Variable>& placeholderReplacements, - std::unordered_set<::CNTK::FunctionPtr>& allPrimitiveFunctions); + class VariableResolver; } } \ No newline at end of file diff --git a/Source/CNTKv2LibraryDll/BackCompat.cpp b/Source/CNTKv2LibraryDll/BackCompat.cpp index 8bceb6ff2..adf19d029 100644 --- a/Source/CNTKv2LibraryDll/BackCompat.cpp +++ b/Source/CNTKv2LibraryDll/BackCompat.cpp @@ -20,28 +20,66 @@ #include "DeprecatedNodes.h" #include "RNNNodes.h" - using namespace Microsoft::MSR::CNTK; namespace CNTK { namespace Internal { - template - Variable GetVariable(const ComputationNodeBasePtr& node, - std::unordered_map& nodeToVariableMap, - std::unordered_map& placeholderReplacements, - std::unordered_set& allPrimitiveFunctions) + // Helper class to resolve variables in the model. 
+ class VariableResolver final { - auto iter = nodeToVariableMap.find(node); - if (iter != nodeToVariableMap.end()) - return iter->second; + std::unordered_map m_placeholderReplacements; + std::unordered_map m_nodeToVariableMap; - Variable var; - NDShape varShape = AsNDShape(node->GetSampleLayout()); - - if (node->IsLeaf()) + public: + const std::unordered_map& GetPlaceHolders() const { + return m_placeholderReplacements; + } + + template + Variable GetVariable(const ComputationNodeBasePtr& node) + { + auto iter = m_nodeToVariableMap.find(node); + if (iter != m_nodeToVariableMap.end()) + return iter->second; + + Variable var; + if (node->IsLeaf()) + { + var = ResolveLeaf(node); + } + else + { + // This is a non-leaf node and maps to a primitive Function + NDShape varShape = AsNDShape(node->GetSampleLayout()); + auto placeholderVar = PlaceholderVariable(varShape); + m_nodeToVariableMap[node] = placeholderVar; + + std::vector inputVars(node->GetNumInputs()); + for (size_t i = 0; i < inputVars.size(); ++i) + { + inputVars[i] = GetVariable(node->Input(i)); + if (inputVars[i].IsPlaceholder()) + m_placeholderReplacements[inputVars[i]] = Variable(); + } + + var = ResolveFunction(node, inputVars); + + if (m_placeholderReplacements.find(placeholderVar) != m_placeholderReplacements.end()) + m_placeholderReplacements[placeholderVar] = var; + } + + m_nodeToVariableMap[node] = var; + return var; + } + + private: + template + Variable ResolveLeaf(const ComputationNodeBasePtr& node) + { + NDShape variableShape = AsNDShape(node->GetSampleLayout()); std::wstring varUid, varName; if (node->Is>()) { @@ -54,48 +92,32 @@ namespace CNTK auto inputNodeInternalDynamicAxisName = node->As>()->GetRequestedDynamicAxis(); std::vector inputVarDynamicAxes = DynamicAxesFromInternalDynamicAxisName(inputNodeInternalDynamicAxisName); - var = Variable(varShape, isSparse, AsDataType(), node->GetLearningRateMultiplier() != 0, varName, inputVarDynamicAxes, varUid); - } - else - { - // TODO: Allow creating inputs without a dynamic axis - LogicError("Found InputNode with no dynamic axes which is currently unsupported"); + return Variable(variableShape, isSparse, AsDataType(), node->GetLearningRateMultiplier() != 0, varName, inputVarDynamicAxes, varUid); } + + // TODO: Allow creating inputs without a dynamic axis + LogicError("Found InputNode with no dynamic axes which is currently unsupported"); } - else if (node->Is>()) + + if (node->Is>()) { bool isConstant = (node->GetLearningRateMultiplier() == 0); auto& matrix = node->As>()->Value(); auto tensorView = new TensorView(std::make_shared>(matrix.AsReference()), AsTensorViewShape(node->GetSampleLayout())); - NDArrayViewPtr value = MakeSharedObject(AsDataType(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), varShape, false, tensorView); - if (isConstant) - { - std::tie(varUid, varName) = UidAndNameFromCNTKInternalNodeName(node->NodeName(), VariableKind::Constant); - var = Constant(value, varName, varUid); - } - else - { - std::tie(varUid, varName) = UidAndNameFromCNTKInternalNodeName(node->NodeName(), VariableKind::Parameter); - var = Parameter(value, varName, varUid); - } + NDArrayViewPtr value = MakeSharedObject(AsDataType(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), variableShape, false, tensorView); + + auto kind = isConstant ? VariableKind::Constant : VariableKind::Parameter; + std::tie(varUid, varName) = UidAndNameFromCNTKInternalNodeName(node->NodeName(), kind); + return isConstant ? 
(Variable)Constant(value, varName, varUid) : Parameter(value, varName, varUid); } - else - LogicError("CNTK::LoadLegacyModel: Unsupported legacy CNTK node named '%S'", node->NodeName().c_str()); + + LogicError("CNTK::LoadLegacyModel: Unsupported legacy CNTK node named '%S'", node->NodeName().c_str()); + return Variable();// make compiler happy. } - else + + template + Variable ResolveFunction(const ComputationNodeBasePtr& node, std::vector& inputVars) { - // This is a non-leaf node and maps to a primitive Function - auto placeholderVar = PlaceholderVariable(varShape); - nodeToVariableMap[node] = placeholderVar; - - std::vector inputVars(node->GetNumInputs()); - for (size_t i = 0; i < inputVars.size(); ++i) - { - inputVars[i] = GetVariable(node->Input(i), nodeToVariableMap, placeholderReplacements, allPrimitiveFunctions); - if (inputVars[i].IsPlaceholder()) - placeholderReplacements[inputVars[i]] = Variable(); - } - PrimitiveOpType opType; Dictionary primitiveFunctionConfigParameters; if (node->OperationName() == OperationNameOf(NegateNode)) @@ -376,15 +398,9 @@ namespace CNTK std::tie(functionUid, functionName) = UidAndNameFromCNTKInternalNodeName(node->NodeName(), opType); FunctionPtr primitiveFunction = MakeSharedObject(opType, inputVars, std::move(primitiveFunctionConfigParameters), functionName, functionUid); - allPrimitiveFunctions.insert(primitiveFunction); - var = primitiveFunction->Output(); - if (placeholderReplacements.find(placeholderVar) != placeholderReplacements.end()) - placeholderReplacements[placeholderVar] = var; + return primitiveFunction->Output(); } - - nodeToVariableMap[node] = var; - return var; - } + }; FunctionPtr LoadLegacyModel(const std::wstring& modelFile, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::UseDefaultDevice()*/) { @@ -410,8 +426,8 @@ namespace CNTK // Now traverse the model and construct the Function graph std::unordered_map nodeToVariableMap; std::unordered_map placeholderReplacements; - std::unordered_set allPrimitiveFunctions; std::vector rootVariables; + VariableResolver resolver; auto& networkRoots = net->RootNodes(); for (auto& rootNode : networkRoots) { @@ -420,11 +436,11 @@ namespace CNTK if (ComputationNetwork::IsNodePtr>(rootNode)) { - rootVariables.push_back(Internal::GetVariable(rootNode, nodeToVariableMap, placeholderReplacements, allPrimitiveFunctions).Owner()); + rootVariables.push_back(resolver.GetVariable(rootNode).Owner()); } else if (ComputationNetwork::IsNodePtr>(rootNode)) { - rootVariables.push_back(Internal::GetVariable(rootNode, nodeToVariableMap, placeholderReplacements, allPrimitiveFunctions).Owner()); + rootVariables.push_back(resolver.GetVariable(rootNode).Owner()); } else { @@ -433,7 +449,7 @@ namespace CNTK } auto rootComposite = Combine(rootVariables); - rootComposite->ReplacePlaceholders(placeholderReplacements); + rootComposite->ReplacePlaceholders(resolver.GetPlaceHolders()); return rootComposite; } diff --git a/Tests/UnitTests/V2LibraryTests/V2LibraryTests.vcxproj b/Tests/UnitTests/V2LibraryTests/V2LibraryTests.vcxproj index e0d8ea1c6..4806c9401 100644 --- a/Tests/UnitTests/V2LibraryTests/V2LibraryTests.vcxproj +++ b/Tests/UnitTests/V2LibraryTests/V2LibraryTests.vcxproj @@ -80,7 +80,6 @@ Console true CNTKLibrary-2.0.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) - 100000000 @@ -145,4 +144,4 @@ - \ No newline at end of file + From 907be82d1bb5770a7fa7e71c088259fcf607f2b8 Mon Sep 17 
00:00:00 2001 From: Eldar Akchurin Date: Wed, 23 Nov 2016 13:46:23 +0100 Subject: [PATCH 19/25] Moving primitive functions back --- Source/CNTKv2LibraryDll/BackCompat.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Source/CNTKv2LibraryDll/BackCompat.cpp b/Source/CNTKv2LibraryDll/BackCompat.cpp index adf19d029..5e679af13 100644 --- a/Source/CNTKv2LibraryDll/BackCompat.cpp +++ b/Source/CNTKv2LibraryDll/BackCompat.cpp @@ -31,6 +31,7 @@ namespace CNTK { std::unordered_map m_placeholderReplacements; std::unordered_map m_nodeToVariableMap; + std::unordered_set m_allPrimitiveFunctions; public: const std::unordered_map& GetPlaceHolders() const @@ -398,6 +399,7 @@ namespace CNTK std::tie(functionUid, functionName) = UidAndNameFromCNTKInternalNodeName(node->NodeName(), opType); FunctionPtr primitiveFunction = MakeSharedObject(opType, inputVars, std::move(primitiveFunctionConfigParameters), functionName, functionUid); + m_allPrimitiveFunctions.insert(primitiveFunction); return primitiveFunction->Output(); } }; @@ -450,7 +452,6 @@ namespace CNTK auto rootComposite = Combine(rootVariables); rootComposite->ReplacePlaceholders(resolver.GetPlaceHolders()); - return rootComposite; } From edb061193f36377008a0038853b8b53e14eb4bc9 Mon Sep 17 00:00:00 2001 From: Nikos Karampatziakis Date: Wed, 23 Nov 2016 11:41:04 -0800 Subject: [PATCH 20/25] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 6feca91f4..636663b52 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +**The [CNTK Wiki](https://github.com/Microsoft/CNTK/wiki) has all information on CNTK including [setup](https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-your-machine ), [examples](https://github.com/Microsoft/CNTK/wiki/Examples ), etc.** + # Latest news *2016-11-21.* V 2.0 Beta 4 Release Highlights of this Release: From 974462108707266cd14166a942fa0cbbeb989fc8 Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Wed, 23 Nov 2016 14:00:57 -0800 Subject: [PATCH 21/25] Update to python distributed examples: 1. Use randomized minibatch source in both ConvNet and ResNet distributed examples 2. Add options in ResNet distributed example to scale up minibatch size with #workers 3. Update README.md for both examples with link to the python file 4. Add option to disable multi threaded deserializer in MinibatchSource 5. 
Update convnet example tests to run with mpiexec --- .../ConvNet_CIFAR10_DataAug_Distributed.py | 3 +- .../Classification/ConvNet/Python/README.md | 4 +- .../Classification/ResNet/Python/README.md | 6 +- .../Python/TrainResNet_CIFAR10_Distributed.py | 22 ++++--- .../cifar_convnet_distributed_test.py | 61 +++++++------------ .../CNTKv2Python/Examples/pytest.ini | 2 +- .../Examples/run_cifar_convnet_distributed.py | 52 ++++++++++++++++ bindings/python/cntk/io/__init__.py | 11 +++- 8 files changed, 105 insertions(+), 56 deletions(-) create mode 100644 Tests/EndToEndTests/CNTKv2Python/Examples/run_cifar_convnet_distributed.py diff --git a/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug_Distributed.py b/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug_Distributed.py index 36c547672..90ae7148c 100644 --- a/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug_Distributed.py +++ b/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug_Distributed.py @@ -48,10 +48,9 @@ def create_reader(map_file, mean_file, train, distributed_after=INFINITE_SAMPLES ImageDeserializer(map_file, StreamDefs( features = StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image' labels = StreamDef(field='label', shape=num_classes))), # and second as 'label' - randomize = False, + multithreaded_deserializer = False, # turn off omp as CIFAR-10 is not heavy for deserializer distributed_after = distributed_after) - # Train and evaluate the network. def convnet_cifar10_dataaug(reader_train, reader_test, distributed_trainer, max_epochs = 80): set_computation_network_trace_level(0) diff --git a/Examples/Image/Classification/ConvNet/Python/README.md b/Examples/Image/Classification/ConvNet/Python/README.md index 87a5747a4..a33052051 100644 --- a/Examples/Image/Classification/ConvNet/Python/README.md +++ b/Examples/Image/Classification/ConvNet/Python/README.md @@ -35,7 +35,7 @@ We use a fixed crop ratio of `0.8` and scale the image to `32x32` pixels for tra ### ConvNet_CIFAR10_DataAug_Distributed.py -The fourth example uses the same CNN as ConvNet_CIFAR10_DataAug.py, but it adds support for distributed training with simple aggregation. For a reference on distributed training, please check [here](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines) -Note that this example supports CPU-only build. +The fourth example uses the same CNN as ConvNet_CIFAR10_DataAug.py, but it adds support for distributed training with simple aggregation. For a reference on distributed training, please check [here](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines). +Note that [this example](./ConvNet_CIFAR10_DataAug_Distributed.py) supports CPU-only build. `mpiexec -n <#workers> python ConvNet_CIFAR10_DataAug_Distributed.py` diff --git a/Examples/Image/Classification/ResNet/Python/README.md b/Examples/Image/Classification/ResNet/Python/README.md index ec5f1f3af..5228cd4b0 100644 --- a/Examples/Image/Classification/ResNet/Python/README.md +++ b/Examples/Image/Classification/ResNet/Python/README.md @@ -15,7 +15,7 @@ for ResNet20 and ResNet110, respectively. The ResNet20 network achieves an error ### TrainResNet_CIFAR10_Distributed.py -This example code is similar to TrainResNet_CIFAR10.py, but it adds support for distributed training via [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface). 
Details can be found in [here](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines) +[This example](./TrainResNet_CIFAR10_Distributed.py) is similar to TrainResNet_CIFAR10.py, but it adds support for distributed training via [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface). Details can be found in [here](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines). Note this example requires a multi-GPU machine or mpi hosts file to distribute to multiple machines. Simple aggregation, ResNet20, with a 2-GPU machine: @@ -25,3 +25,7 @@ Simple aggregation, ResNet20, with a 2-GPU machine: Quantized 1-bit aggregation with 50000 samples before distributed, ResNet20, with a 2-GPU machine: `mpiexec -n 2 python TrainResNet_CIFAR10_Distributed.py -n resnet20 -q 1 -a 50000` + +To run with maximum parallelization with minibatch size scaled according to #workers for 3 epochs: + +`mpiexec -n 2 python TrainResNet_CIFAR10_Distributed.py -s True -e 3` \ No newline at end of file diff --git a/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10_Distributed.py b/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10_Distributed.py index a317ec08e..e0c263c0d 100644 --- a/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10_Distributed.py +++ b/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10_Distributed.py @@ -50,12 +50,12 @@ def create_reader(map_file, mean_file, train, distributed_after=INFINITE_SAMPLES ImageDeserializer(map_file, StreamDefs( features = StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image' labels = StreamDef(field='label', shape=num_classes))), # and second as 'label' - randomize = False, + multithreaded_deserializer = False, # turn off omp as CIFAR-10 is not heavy for deserializer distributed_after = distributed_after) # Train and evaluate the network. -def train_and_evaluate(reader_train, reader_test, network_name, max_epochs, distributed_trainer): +def train_and_evaluate(reader_train, reader_test, network_name, max_epochs, distributed_trainer, scale_up=False): set_computation_network_trace_level(0) @@ -79,7 +79,13 @@ def train_and_evaluate(reader_train, reader_test, network_name, max_epochs, dist # shared training parameters epoch_size = 50000 # for now we manually specify epoch size - minibatch_size = 128 + + # NOTE: scaling up minibatch_size increases sample throughput. In 8-GPU machine, + # ResNet110 samples-per-second is ~7x of single GPU, comparing to ~3x without scaling + # up. However, bigger minimatch size on the same number of samples means less updates, + # thus leads to higher training error. 
This is a trade-off of speed and accuracy + minibatch_size = 128 * (len(distributed_trainer.communicator().workers()) if scale_up else 1) + momentum_time_constant = -minibatch_size/np.log(0.9) l2_reg_weight = 0.0001 @@ -145,15 +151,17 @@ def train_and_evaluate(reader_train, reader_test, network_name, max_epochs, dist if __name__=='__main__': parser = argparse.ArgumentParser() parser.add_argument('-n', '--network', help='network type, resnet20 or resnet110', required=False, default='resnet20') - parser.add_argument('-e', '--epochs', help='total epochs', required=False, default='160') - parser.add_argument('-q', '--quantize_bit', help='quantized bit', required=False, default='32') - parser.add_argument('-a', '--distributed_after', help='number of samples to train with before running distributed', required=False, default='0') + parser.add_argument('-e', '--epochs', help='total epochs', type=int, required=False, default='160') + parser.add_argument('-q', '--quantize_bit', help='quantized bit', type=int, required=False, default='32') + parser.add_argument('-s', '--scale_up', help='scale up minibatch size with #workers for better parallelism', type=bool, required=False, default='False') + parser.add_argument('-a', '--distributed_after', help='number of samples to train with before running distributed', type=int, required=False, default='0') args = vars(parser.parse_args()) num_quantization_bits = int(args['quantize_bit']) epochs = int(args['epochs']) distributed_after_samples = int(args['distributed_after']) network_name = args['network'] + scale_up = bool(args['scale_up']) # Create distributed trainer print("Start training: quantize_bit = {}, epochs = {}, distributed_after = {}".format(num_quantization_bits, epochs, distributed_after_samples)) @@ -164,7 +172,7 @@ if __name__=='__main__': reader_train = create_reader(os.path.join(data_path, 'train_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), True, distributed_after_samples) reader_test = create_reader(os.path.join(data_path, 'test_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), False) - train_and_evaluate(reader_train, reader_test, network_name, epochs, distributed_trainer) + train_and_evaluate(reader_train, reader_test, network_name, epochs, distributed_trainer, scale_up) # Must call MPI finalize when process exit distributed.Communicator.finalize() diff --git a/Tests/EndToEndTests/CNTKv2Python/Examples/cifar_convnet_distributed_test.py b/Tests/EndToEndTests/CNTKv2Python/Examples/cifar_convnet_distributed_test.py index 80d60655c..d87ad6119 100644 --- a/Tests/EndToEndTests/CNTKv2Python/Examples/cifar_convnet_distributed_test.py +++ b/Tests/EndToEndTests/CNTKv2Python/Examples/cifar_convnet_distributed_test.py @@ -7,55 +7,36 @@ import numpy as np import os import sys +import signal +import subprocess +import re +import pytest from cntk.utils import cntk_device from cntk.cntk_py import DeviceKind_GPU from cntk.device import set_default_device -from cntk.io import ReaderConfig, ImageDeserializer -from cntk import distributed -import pytest -import platform abs_path = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Image", "Classification", "ConvNet", "Python")) -from ConvNet_CIFAR10_DataAug_Distributed import convnet_cifar10_dataaug, create_reader +sys.path.append(abs_path) +from run_cifar_convnet_distributed import run_cifar_convnet_distributed TOLERANCE_ABSOLUTE = 2E-1 +TIMEOUT_SECONDS = 300 -def test_cifar_convnet_error(device_id): - if platform.system() 
== 'Windows': - pytest.skip('test skipped on Windows') - - set_default_device(cntk_device(device_id)) +def test_cifar_convnet_distributed_mpiexec(device_id): + if cntk_device(device_id).type() != DeviceKind_GPU: + pytest.skip('test only runs on GPU') + cmd = ["mpiexec", "-n", "2", "python", os.path.join(abs_path, "run_cifar_convnet_distributed.py")] + p = subprocess.Popen(cmd, stdout=subprocess.PIPE) try: - base_path = os.path.join(os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'], - *"Image/CIFAR/v0/cifar-10-batches-py".split("/")) - # N.B. CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY has {train,test}_map.txt - # and CIFAR-10_mean.xml in the base_path. - except KeyError: - base_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - *"../../../../Examples/Image/DataSets/CIFAR-10".split("/")) - - base_path = os.path.normpath(base_path) - os.chdir(os.path.join(base_path, '..')) - - from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms - set_computation_network_trace_level(1) - set_fixed_random_seed(1) # BUGBUG: has no effect at present # TODO: remove debugging facilities once this all works - #force_deterministic_algorithms() - # TODO: do the above; they lead to slightly different results, so not doing it for now - - reader_train = create_reader(os.path.join(base_path, 'train_map.txt'), os.path.join(base_path, 'CIFAR-10_mean.xml'), True, 0) - reader_test = create_reader(os.path.join(base_path, 'test_map.txt'), os.path.join(base_path, 'CIFAR-10_mean.xml'), False) - - distributed_after_samples = 0 - num_quantization_bits = 32 - distributed_trainer = distributed.data_parallel_distributed_trainer( - num_quantization_bits=num_quantization_bits, - distributed_after=distributed_after_samples) - - test_error = convnet_cifar10_dataaug(reader_train, reader_test, distributed_trainer, max_epochs=1) + out = p.communicate(timeout=TIMEOUT_SECONDS)[0] # in case we have a hang + except subprocess.TimeoutExpired: + os.kill(p.pid, signal.CTRL_C_EVENT) + raise RuntimeError('Timeout in mpiexec, possibly hang') + str_out = out.decode(sys.getdefaultencoding()) + results = re.findall("Final Results: Minibatch\[.+?\]: errs = (.+?)%", str_out) + assert len(results) == 2 + assert results[0] == results[1] expected_test_error = 0.617 - - assert np.allclose(test_error, expected_test_error, + assert np.allclose(float(results[0])/100, expected_test_error, atol=TOLERANCE_ABSOLUTE) diff --git a/Tests/EndToEndTests/CNTKv2Python/Examples/pytest.ini b/Tests/EndToEndTests/CNTKv2Python/Examples/pytest.ini index 7c4795540..3be93cb91 100644 --- a/Tests/EndToEndTests/CNTKv2Python/Examples/pytest.ini +++ b/Tests/EndToEndTests/CNTKv2Python/Examples/pytest.ini @@ -1,2 +1,2 @@ [pytest] -python_files = *.py +python_files = *_test.py diff --git a/Tests/EndToEndTests/CNTKv2Python/Examples/run_cifar_convnet_distributed.py b/Tests/EndToEndTests/CNTKv2Python/Examples/run_cifar_convnet_distributed.py new file mode 100644 index 000000000..be807aa4f --- /dev/null +++ b/Tests/EndToEndTests/CNTKv2Python/Examples/run_cifar_convnet_distributed.py @@ -0,0 +1,52 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. 
+# ============================================================================== + +import numpy as np +import os +import sys +import platform +from cntk.io import ReaderConfig, ImageDeserializer +from cntk import distributed +from cntk.device import set_default_device, gpu + +abs_path = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Image", "Classification", "ConvNet", "Python")) +from ConvNet_CIFAR10_DataAug_Distributed import convnet_cifar10_dataaug, create_reader + +def run_cifar_convnet_distributed(): + try: + base_path = os.path.join(os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'], + *"Image/CIFAR/v0/cifar-10-batches-py".split("/")) + # N.B. CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY has {train,test}_map.txt + # and CIFAR-10_mean.xml in the base_path. + except KeyError: + base_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + *"../../../../Examples/Image/DataSets/CIFAR-10".split("/")) + + base_path = os.path.normpath(base_path) + os.chdir(os.path.join(base_path, '..')) + + from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms + set_computation_network_trace_level(1) + set_fixed_random_seed(1) # BUGBUG: has no effect at present # TODO: remove debugging facilities once this all works + #force_deterministic_algorithms() + # TODO: do the above; they lead to slightly different results, so not doing it for now + + reader_train = create_reader(os.path.join(base_path, 'train_map.txt'), os.path.join(base_path, 'CIFAR-10_mean.xml'), True, 0) + reader_test = create_reader(os.path.join(base_path, 'test_map.txt'), os.path.join(base_path, 'CIFAR-10_mean.xml'), False) + + distributed_after_samples = 0 + num_quantization_bits = 32 + distributed_trainer = distributed.data_parallel_distributed_trainer( + num_quantization_bits=num_quantization_bits, + distributed_after=distributed_after_samples) + + return convnet_cifar10_dataaug(reader_train, reader_test, distributed_trainer, max_epochs=1) + +if __name__=='__main__': + set_default_device(gpu(0)) # force using GPU-0 in test for speed + run_cifar_convnet_distributed() + distributed.Communicator.finalize() \ No newline at end of file diff --git a/bindings/python/cntk/io/__init__.py b/bindings/python/cntk/io/__init__.py index be4098833..3a147caa4 100644 --- a/bindings/python/cntk/io/__init__.py +++ b/bindings/python/cntk/io/__init__.py @@ -81,15 +81,17 @@ class MinibatchSource(cntk_py.MinibatchSource): randomize (bool, default True): randomize images before every epoch epoch_size (int): epoch size distributed_after (int): sample count after which minibatch source becomes distributed + multithreaded_deserializer (bool): using multi threaded deserializer ''' - def __init__(self, deserializers=None, randomize=True, epoch_size=INFINITELY_REPEAT, distributed_after=INFINITE_SAMPLES): + def __init__(self, deserializers=None, randomize=True, epoch_size=INFINITELY_REPEAT, distributed_after=INFINITE_SAMPLES, multithreaded_deserializer=None): if not isinstance(deserializers, (list,tuple)): deserializers = [deserializers] # allow passing a single item or a list reader_config = ReaderConfig( deserializers=deserializers, randomize=randomize, epoch_size=epoch_size, - distributed_after=distributed_after) + distributed_after=distributed_after, + multithreaded_deserializer=multithreaded_deserializer) source = minibatch_source(reader_config) # transplant into this class instance self.__dict__ = source.__dict__ @@ -256,8 
+258,9 @@ class ReaderConfig(dict): randomize (bool, default True): randomize images before every epoch epoch_size (int): epoch size distributed_after (int): sample count after which reader becomes distributed + multithreaded_deserializer (bool): using multi threaded deserializer ''' - def __init__(self, deserializers=None, randomize=True, epoch_size=INFINITELY_REPEAT, distributed_after=INFINITE_SAMPLES): + def __init__(self, deserializers=None, randomize=True, epoch_size=INFINITELY_REPEAT, distributed_after=INFINITE_SAMPLES, multithreaded_deserializer=None): self['epochSize'] = cntk_py.SizeTWrapper(epoch_size) # force to store in size_t if not isinstance(deserializers, (list, tuple)): @@ -265,6 +268,8 @@ class ReaderConfig(dict): self['deserializers'] = self.deserializers = deserializers or [] self['randomize'] = randomize self['distributedAfterSampleCount'] = cntk_py.SizeTWrapper(distributed_after) + if multithreaded_deserializer != None: + self['multiThreadedDeserialization'] = multithreaded_deserializer @typemap def minibatch_source(self): From 04e56bf2bedbddc9bf71511bc0076cc2fc740ad9 Mon Sep 17 00:00:00 2001 From: "REDMOND\\sayanpa" Date: Wed, 23 Nov 2016 14:08:23 -0800 Subject: [PATCH 22/25] Increased the cell time out to 300 sec from 60 sec --- Tests/EndToEndTests/CNTKv2Python/Examples/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tests/EndToEndTests/CNTKv2Python/Examples/conftest.py b/Tests/EndToEndTests/CNTKv2Python/Examples/conftest.py index 2637de1ce..8a9060fdc 100644 --- a/Tests/EndToEndTests/CNTKv2Python/Examples/conftest.py +++ b/Tests/EndToEndTests/CNTKv2Python/Examples/conftest.py @@ -65,7 +65,7 @@ def nb(tmpdir_factory, request, device_id): outPath = str(tmpdir_factory.mktemp('notebook').join('out.ipynb')) assert os.path.isfile(inPath) args = ["jupyter", "nbconvert", "--to", "notebook", "--execute", - "--ExecutePreprocessor.timeout=60", "--output", outPath, inPath] + "--ExecutePreprocessor.timeout=300", "--output", outPath, inPath] subprocess.check_call(args) nb = nbformat.read(outPath, nbformat.current_nbformat) return nb From 494e39f76a94d0b5c3121a59a8564595a655e14c Mon Sep 17 00:00:00 2001 From: "REDMOND\\sayanpa" Date: Tue, 22 Nov 2016 18:26:39 -0800 Subject: [PATCH 23/25] Added Sequence to sequence tutorial and test --- .../Examples/CNTK_204_Sequence_To_Sequence.py | 24 + Tutorials/CNTK_204_Sequence_To_Sequence.ipynb | 1124 +++++++++++++++++ 2 files changed, 1148 insertions(+) create mode 100644 Tests/EndToEndTests/CNTKv2Python/Examples/CNTK_204_Sequence_To_Sequence.py create mode 100644 Tutorials/CNTK_204_Sequence_To_Sequence.ipynb diff --git a/Tests/EndToEndTests/CNTKv2Python/Examples/CNTK_204_Sequence_To_Sequence.py b/Tests/EndToEndTests/CNTKv2Python/Examples/CNTK_204_Sequence_To_Sequence.py new file mode 100644 index 000000000..52ed55d4d --- /dev/null +++ b/Tests/EndToEndTests/CNTKv2Python/Examples/CNTK_204_Sequence_To_Sequence.py @@ -0,0 +1,24 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. 
+# ============================================================================== + +import os +import re + +abs_path = os.path.dirname(os.path.abspath(__file__)) +notebook = os.path.join(abs_path, "..", "..", "..", "..", "Tutorials", "CNTK_204_Sequence_To_Sequence.ipynb") + +def test_cntk_204_sequence_to_sequence_noErrors(nb): + errors = [output for cell in nb.cells if 'outputs' in cell + for output in cell['outputs'] if output.output_type == "error"] + print(errors) + assert errors == [] + +expectedEvalError = 90 + +def test_cntk_204_sequence_to_sequence_trainerror(nb): + testCell = [cell for cell in nb.cells + if cell.cell_type == 'code' and re.search('#Print the training error', cell.source)] + assert float((testCell[0].outputs[0])['text']) < expectedEvalError \ No newline at end of file diff --git a/Tutorials/CNTK_204_Sequence_To_Sequence.ipynb b/Tutorials/CNTK_204_Sequence_To_Sequence.ipynb new file mode 100644 index 000000000..cf136cf93 --- /dev/null +++ b/Tutorials/CNTK_204_Sequence_To_Sequence.ipynb @@ -0,0 +1,1124 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CNTK 204: Sequence to Sequence Networks with Text Data\n", + "\n", + "\n", + "## Introduction and Background\n", + "\n", + "This hands-on tutorial will take you through both the basics of sequence-to-sequence networks, and how to implement them in the Microsoft Cognitive Toolkit. In particular, we will implement a sequence-to-sequence model to perform grapheme to phoneme translation. We will start with some basic theory and then explain the data in more detail, and how you can download it.\n", + "\n", + "Andrej Karpathy has a [nice visualization](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) of the five paradigms of neural network architectures:\n", + "\n", + "\n", + "\n", + "In this tutorial, we are going to be talking about the fourth paradigm: many-to-many, also known as sequence-to-sequence networks. The input is a sequence with a dynamic length, and the output is also a sequence with some dynamic length. It is the logical extension of the many-to-one paradigm in that previously we were predicting some category (which could easily be one of `V` words where `V` is an entire vocabulary) and now we want to predict a whole sequence of those categories.\n", + "\n", + "The applications of sequence-to-sequence networks are nearly limitless. It is a natural fit for machine translation (e.g. English input sequences, French output sequences); automatic text summarization (e.g. full document input sequence, summary output sequence); word to pronunciation models (e.g. character [grapheme] input sequence, pronunciation [phoneme] output sequence); and even parse tree generation (e.g. regular text input, flat parse tree output).\n", + "\n", + "## Basic theory\n", + "\n", + "A sequence-to-sequence model consists of two main pieces: (1) an encoder; and (2) a decoder. Both the encoder and the decoder are recurrent neural network (RNN) layers that can be implemented using a vanilla RNN, an LSTM, or GRU cells (here we will use LSTM). In the basic sequence-to-sequence model, the encoder processes the input sequence into a fixed representation that is fed into the decoder as a context. The decoder then uses some mechanism (discussed below) to decode the processed information into an output sequence. 
The decoder is a language model that is augmented with some \"strong context\" by the encoder, and so each symbol that it generates is fed back into the decoder for additional context (like a traditional LM). For an English to German translation task, the most basic setup might look something like this: \n", + "\n", + "\n", + "\n", + "The basic sequence-to-sequence network passes the information from the encoder to the decoder by initializing the decoder RNN with the final hidden state of the encoder as its initial hidden state. The input is then a \"sequence start\" tag (`` in the diagram above) which primes the decoder to start generating an output sequence. Then, whatever word (or note or image, etc.) it generates at that step is fed in as the input for the next step. The decoder keeps generating outputs until it hits the special \"end sequence\" tag (`` above).\n", + "\n", + "A more complex and powerful version of the basic sequence-to-sequence network uses an attention model. While the above setup works well, it can start to break down when the input sequences get long. At each step, the hidden state `h` is getting updated with the most recent information, and therefore `h` might be getting \"diluted\" in information as it processes each token. Further, even with a relatively short sequence, the last token will always get the last say and therefore the thought vector will be somewhat biased/weighted towards that last word. To deal with this problem, we use an \"attention\" mechanism that allows the decoder to look not only at all of the hidden states from the input, but it also learns which hidden states, for each step in decoding, to put the most weight on. We will discuss an attention implementation in a later version of this tutorial." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Problem: Grapheme-to-Phoneme Conversion\n", + "\n", + "The [grapheme](https://en.wikipedia.org/wiki/Grapheme) to [phoneme](https://en.wikipedia.org/wiki/Phoneme) problem is a translation task that takes the letters of a word as the input sequence (the graphemes are the smallest units of a writing system) and outputs the corresponding phonemes; that is, the units of sound that make up a language. In other words, the system aims to generate an unambigious representation of how to pronounce a given input word.\n", + "\n", + "### Example\n", + "\n", + "| Letters | T | A | N | G | E | R |\n", + "| --- | --- |\n", + "| Phonemes | ~T | ~AE | ~NG | ~ER | null | null |\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Task and Model Structure\n", + "\n", + "As discussed above, the task we are interested in solving is creating a model that takes some sequence as an input, and generates an output sequence based on the contents of the input. The model's job is to learn the mapping from the input sequence to the output sequence that it will generate. The job of the encoder is to come up with a good representation of the input that the decoder can use to generate a good output. For both the encoder and the decoder, the LSTM does a good job at this.\n", + "\n", + "We will use the LSTM implementation from the CNTK Blocks library. This implements the \"smarts\" of the LSTM and we can more or less think of it as a black box. 
What is important to understand, however, is that there are two pieces to think of when implementing an RNN: the recurrence, which is the unrolled network over a sequence, and the block, which is the piece of the network run for each element of the sequence. We only need to implement the recurrence.\n", + "\n", + "It helps to think of the recurrence as a function that keeps calling `step(x)` on the block (in our case, LSTM). At a high level, it looks like this:\n", + "\n", + "```\n", + "class LSTM {\n", + " float hidden_state\n", + " \n", + " init(initial_value):\n", + " hidden_state = initial_value\n", + " \n", + " step(x):\n", + " hidden_state = LSTM_function(x, hidden_state)\n", + " return hidden_state\n", + "}\n", + "```\n", + "\n", + "So, each call to the `step(x)` function takes some input `x`, modifies the internal `hidden_state`, and returns it. Therefore, with every input `x`, the value of the `hidden_state` evolves. Below we will import some required functionality, and then implement the recurrence that makes use of this mechanism." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Importing CNTK and other useful libraries\n", + "\n", + "CNTK is a python module that contains several submodules like `io`, `learner`, `graph`, etc. We make extensive use of numpy as well." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os\n", + "from cntk import Trainer, Axis, save_model\n", + "from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT\n", + "from cntk.learner import momentum_sgd, momentum_as_time_constant_schedule, learning_rate_schedule, UnitType\n", + "from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, sequence, past_value, future_value, element_select, \\\n", + " alias, hardmax, placeholder_variable, combine, parameter, plus, times\n", + "from cntk.ops.functions import CloneMethod\n", + "from cntk.graph import find_nodes_by_name\n", + "from cntk.blocks import LSTM, Stabilizer\n", + "from cntk.initializer import glorot_uniform\n", + "from cntk.utils import get_train_eval_criterion, get_train_loss" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Downloading the data\n", + "\n", + "In this tutorial we will use a lightly pre-processed version of the CMUDict (version 0.7b) dataset from http://www.speech.cs.cmu.edu/cgi-bin/cmudict. The CMUDict data is the Carnegie Mellon University Pronouncing Dictionary is an open-source machine-readable pronunciation dictionary for North American English. The data is in the CNTKTextFormatReader format. Here is an example sequence pair from the data, where the input sequence (S0) is in the left column, and the output sequence (S1) is on the right:\n", + "\n", + "```\n", + "0\t|S0 3:1 |# \t|S1 3:1 |# \n", + "0\t|S0 4:1 |# A\t|S1 32:1 |# ~AH\n", + "0\t|S0 5:1 |# B\t|S1 36:1 |# ~B\n", + "0\t|S0 4:1 |# A\t|S1 31:1 |# ~AE\n", + "0\t|S0 7:1 |# D\t|S1 38:1 |# ~D\n", + "0\t|S0 12:1 |# I\t|S1 47:1 |# ~IY\n", + "0\t|S0 1:1 |# \t|S1 1:1 |# \n", + "```\n", + "\n", + "The code below will download the required files (training, the single sequence above for validation, and a small vocab file) and put them in your local folder (the training file is ~34 MB, testing is ~4MB, and the validation file and vocab file are both less than 1KB)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting download: tiny.ctf\n", + "Download completed\n", + "Starting download: cmudict-0.7b.train-dev-20-21.ctf\n", + "Download completed\n", + "Starting download: cmudict-0.7b.mapping\n", + "Download completed\n" + ] + } + ], + "source": [ + "import requests\n", + "\n", + "def download(url, filename):\n", + " \"\"\" utility to download necessary data \"\"\"\n", + " response = requests.get(url, stream=True)\n", + " with open(filename, \"wb\") as handle:\n", + " for data in response.iter_content():\n", + " handle.write(data)\n", + " \n", + "valid = \"https://github.com/Microsoft/CNTK/blob/master/Examples/SequenceToSequence/CMUDict/Data/tiny.ctf?raw=true\"\n", + "train = \"https://github.com/Microsoft/CNTK/blob/master/Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf?raw=true\"\n", + "vocab = \"https://github.com/Microsoft/CNTK/blob/master/Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.mapping?raw=true\"\n", + "\n", + "urls = [valid, train, vocab]\n", + "\n", + "for t in urls:\n", + " filename = t.split('/')[-1].split('?')[0]\n", + " try:\n", + " f = open(filename)\n", + " f.close()\n", + " print(\"Reusing locally cached: \", filename)\n", + " except IOError:\n", + " print(\"Starting download: \", filename)\n", + " download(t, filename)\n", + " print(\"Download completed\")\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Select the notebook run mode\n", + "\n", + "There are two run modes:\n", + "- *Fast mode*: `isFast` is set to `True`. This is the default mode for the notebooks, which means we train for fewer iterations or train / test on limited data. This ensures functional correctness of the notebook though the models produced are far from what a completed training would produce.\n", + "\n", + "- *Slow mode*: We recommend the user to set this flag to `False` once the user has gained familiarity with the notebook content and wants to gain insight from running the notebooks for a longer period with different parameters for training. " + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "isFast = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reader\n", + "\n", + "To efficiently collect our data, randomize it for training, and pass it to the network, we use the CNTKTextFormat reader. We will create a small function that will be called when training (or testing) that defines the names of the streams in our data, and how they are referred to in the raw training data." 
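As a quick intuition for the raw CNTKTextFormat lines shown earlier (e.g. `0 |S0 3:1 |# <s> |S1 3:1 |# <s>`): each line starts with a sequence id, followed by one group of sparse `index:value` entries per stream, with `|#` fields being human-readable comments. The tiny parser below is purely illustrative, a hypothetical helper rather than anything the tutorial needs, since the `CTFDeserializer` used next handles all of this for us.

```python
# parse one CNTKTextFormat line into (sequence_id, {stream_name: [(index, value), ...]})
def parse_ctf_line(line):
    seq_id, *fields = line.strip().split('|')
    entries = {}
    for field in fields:
        parts = field.split()
        if not parts or parts[0] == '#':   # '|#' fields are just comments for humans
            continue
        name, *pairs = parts
        entries.setdefault(name, []).extend(
            (int(p.split(':')[0]), float(p.split(':')[1])) for p in pairs)
    return int(seq_id), entries

print(parse_ctf_line("0\t|S0 4:1 |# A\t|S1 32:1 |# ~AH"))
# (0, {'S0': [(4, 1.0)], 'S1': [(32, 1.0)]})
```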
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Hepler function to load the model vocabulary file\n", + "def get_vocab(path):\n", + " # get the vocab for printing output sequences in plaintext\n", + " vocab = [w.strip() for w in open(path).readlines()]\n", + " i2w = { i:ch for i,ch in enumerate(vocab) }\n", + " \n", + " return (vocab, i2w)\n", + "\n", + "# Read vocabulary data and generate their corresponding indices\n", + "vocab, i2w = get_vocab(\"cmudict-0.7b.mapping\")\n", + "\n", + "input_vocab_size = len(vocab)\n", + "label_vocab_size = len(vocab)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Vocabulary size is 69\n", + "First 15 letters are :\n", + "[\"'\", '', '', '', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K']\n", + "\n", + "Print dictionary with the vocabulary mapping: \n", + "{0: \"'\", 1: '', 2: '', 3: '', 4: 'A', 5: 'B', 6: 'C', 7: 'D', 8: 'E', 9: 'F', 10: 'G', 11: 'H', 12: 'I', 13: 'J', 14: 'K', 15: 'L', 16: 'M', 17: 'N', 18: 'O', 19: 'P', 20: 'Q', 21: 'R', 22: 'S', 23: 'T', 24: 'U', 25: 'V', 26: 'W', 27: 'X', 28: 'Y', 29: 'Z', 30: '~AA', 31: '~AE', 32: '~AH', 33: '~AO', 34: '~AW', 35: '~AY', 36: '~B', 37: '~CH', 38: '~D', 39: '~DH', 40: '~EH', 41: '~ER', 42: '~EY', 43: '~F', 44: '~G', 45: '~HH', 46: '~IH', 47: '~IY', 48: '~JH', 49: '~K', 50: '~L', 51: '~M', 52: '~N', 53: '~NG', 54: '~OW', 55: '~OY', 56: '~P', 57: '~R', 58: '~S', 59: '~SH', 60: '~T', 61: '~TH', 62: '~UH', 63: '~UW', 64: '~V', 65: '~W', 66: '~Y', 67: '~Z', 68: '~ZH'}\n" + ] + } + ], + "source": [ + "# Print vocab and the correspoding mapping to the phonemes\n", + "print(\"Vocabulary size is \", len(vocab))\n", + "print(\"First 15 letters are :\")\n", + "print(vocab[:15])\n", + "print()\n", + "print(\"Print dictionary with the vocabulary mapping: \")\n", + "print(i2w)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will use the above to create a reader for our training data. Let's create it now:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def create_reader(path, randomize, size=INFINITELY_REPEAT):\n", + " return MinibatchSource(CTFDeserializer(path, StreamDefs(\n", + " features = StreamDef(field='S0', shape=input_vocab_size, is_sparse=True),\n", + " labels = StreamDef(field='S1', shape=label_vocab_size, is_sparse=True)\n", + " )), randomize=randomize, epoch_size = size)\n", + "\n", + "# Train data reader\n", + "train_reader = create_reader(\"cmudict-0.7b.train-dev-20-21.ctf\", True)\n", + "\n", + "# Validation/Test data reader \n", + "valid_reader = create_reader(\"tiny.ctf\", False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Now let's set our model hyperparameters..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our input vocabulary size is 69, and those ones represent the label as well. Additionally we have 1 hidden layer with 128 nodes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "model_dir = \".\" # we downloaded our data to the local directory above\n", + "\n", + "# model dimensions\n", + "input_vocab_dim = input_vocab_size\n", + "label_vocab_dim = label_vocab_size\n", + "hidden_dim = 128\n", + "num_layers = 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: setup the input to the network\n", + "\n", + "### Dynamic axes in CNTK (Key concept)\n", + "\n", + "One of the important concepts in understanding CNTK is the idea of two types of axes: \n", + "- **static axes**, which are the traditional axes of a variable's shape, and \n", + "- **dynamic axes**, which have dimensions that are unknown until the variable is bound to real data at computation time. \n", + "\n", + "The dynamic axes are particularly important in the world of recurrent neural networks. Instead of having to decide a maximum sequence length ahead of time, padding your sequences to that size, and wasting computation, CNTK's dynamic axes allow for variable sequence lengths that are automatically packed in minibatches to be as efficient as possible.\n", + "\n", + "When setting up sequences, there are *two dynamic axes* that are important to consider. The first is the *batch axis*, which is the axis along which multiple sequences are batched. The second is the dynamic axis particular to that sequence. The latter is specific to a particular input because of variable sequence lengths in your data. For example, in sequence to sequence networks, we have two sequences: the **input sequence**, and the **ouptput (or 'label') sequence**. One of the things that makes this type of network so powerful is that the length of the input sequence and the output sequence do not have to correspond to each other. Therefore, both the input sequence and the output sequence require their own unique dynamic axis.\n", + "\n", + "When defining the input to a network, we set up the required dynamic axes and the shape of the input variables. Below, we define the shape (vocabulary size) of the inputs, create their dynamic axes, and finally create input variables that represent input nodes in our network." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Source and target inputs to the model\n", + "batch_axis = Axis.default_batch_axis()\n", + "input_seq_axis = Axis('inputAxis')\n", + "label_seq_axis = Axis('labelAxis')\n", + "\n", + "input_dynamic_axes = [batch_axis, input_seq_axis]\n", + "raw_input = input_variable(shape=(input_vocab_dim), dynamic_axes=input_dynamic_axes, name='raw_input')\n", + "\n", + "label_dynamic_axes = [batch_axis, label_seq_axis]\n", + "raw_labels = input_variable(shape=(label_vocab_dim), dynamic_axes=label_dynamic_axes, name='raw_labels')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Questions\n", + "\n", + "1. Why do the shapes of the input variables correspond to the size of our dictionaries in sequence to sequence networks?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: define the network\n", + "\n", + "As discussed before, the sequence-to-sequence network is, at its most basic, an RNN encoder followed by an RNN decoder, and a dense output layer. 
We could do this in a few lines with the layers library, but let's go through things in a little more detail without adding too much complexity. The first step is to perform some manipulations on the input data; let's look at the code below and then discuss what we're doing. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "# Instantiate the sequence to sequence translation model\n",
+ "input_sequence = raw_input\n",
+ "\n",
+ "# Drop the sentence start token from the label, for decoder training\n",
+ "label_sequence = sequence.slice(raw_labels, \n",
+ "                                1, 0, name='label_sequence') # A B C --> A B C \n",
+ "label_sentence_start = sequence.first(raw_labels)   # \n",
+ "\n",
+ "is_first_label = sequence.is_first(label_sequence)  # 1 0 0 0 ...\n",
+ "label_sentence_start_scattered = sequence.scatter(  # 0 0 0 ... (up to the length of label_sequence)\n",
+ "    label_sentence_start, is_first_label)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We have two input variables, `raw_input` and `raw_labels`. Typically, the labels would not have to be part of the network definition because they would only be used in a criterion node when we compare the network's output with the ground truth. However, in sequence-to-sequence networks, the labels themselves form part of the input to the network during training as they are fed as the input into the decoder.\n",
+ "\n",
+ "To make use of these input variables, we will pass them through computation nodes. We first set `input_sequence` to `raw_input` as a convenience step. We then perform several modifications to `label_sequence` so that it will work with our network. For now you'll just have to trust that we will make good use of these pieces later.\n",
+ "\n",
+ "First, we slice the sentence-start token off of `raw_labels` to get `label_sequence`. This is because the decoder will always first be primed with that token, both during training and evaluation. When the ground truth isn't fed into the decoder, we will still feed in a sentence-start token, so we want to consistently view the input to the decoder as a sequence that starts with an actual value.\n",
+ "\n",
+ "Then, we get `label_sentence_start` by taking the `first` element from the sequence `raw_labels`. This will be used to compose a sequence that is the first input to the decoder regardless of whether we're training or decoding. Finally, the last two statements set up an actual sequence, with the correct dynamic axis, to be fed into the decoder. The function `sequence.scatter` takes the contents of `label_sentence_start` (which is ``) and turns it into a sequence with the first element containing the sequence start symbol and the rest of the elements containing 0's.",
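+ "\n",
+ "If the combination of `slice`, `first`, `is_first` and `scatter` feels abstract, the plain-Python sketch below mimics what they compute for one toy label sequence. It is only an illustration: the real CNTK ops act on the symbolic sequence axis and on one-hot vectors, and the `BOS`/`EOS` strings are just stand-ins for the actual sentence-start/end tokens in the vocabulary.\n",
+ "\n",
+ "```python\n",
+ "raw_labels_toy = ['BOS', 'A', 'B', 'C', 'EOS']             # one label sequence\n",
+ "label_sequence_toy = raw_labels_toy[1:]                    # ~ sequence.slice(raw_labels, 1, 0)\n",
+ "label_sentence_start_toy = raw_labels_toy[0]               # ~ sequence.first(raw_labels)\n",
+ "is_first_toy = [1] + [0] * (len(label_sequence_toy) - 1)   # ~ sequence.is_first(label_sequence)\n",
+ "scattered_toy = [label_sentence_start_toy if f else 0\n",
+ "                 for f in is_first_toy]                    # ~ sequence.scatter(...)\n",
+ "print(label_sequence_toy)   # ['A', 'B', 'C', 'EOS']\n",
+ "print(scattered_toy)        # ['BOS', 0, 0, 0]\n",
+ "```"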
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let's create the LSTM recurrence" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def LSTM_layer(input, output_dim, recurrence_hook_h=past_value, recurrence_hook_c=past_value):\n", + " # we first create placeholders for the hidden state and cell state which we don't have yet\n", + " dh = placeholder_variable(shape=(output_dim), dynamic_axes=input.dynamic_axes)\n", + " dc = placeholder_variable(shape=(output_dim), dynamic_axes=input.dynamic_axes)\n", + " \n", + " # we now create an LSTM_cell function and call it with the input and placeholders\n", + " LSTM_cell = LSTM(output_dim)\n", + " f_x_h_c = LSTM_cell(input, (dh, dc))\n", + " h_c = f_x_h_c.outputs\n", + " \n", + " # we setup the recurrence by specifying the type of recurrence (by default it's `past_value` -- the previous value)\n", + " h = recurrence_hook_h(h_c[0])\n", + " c = recurrence_hook_c(h_c[1])\n", + "\n", + " replacements = { dh: h.output, dc: c.output }\n", + " f_x_h_c.replace_placeholders(replacements)\n", + "\n", + " h = f_x_h_c.outputs[0]\n", + " c = f_x_h_c.outputs[1]\n", + "\n", + " # and finally we return the hidden state and cell state as functions (by using `combine`)\n", + " return combine([h]), combine([c])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exercise 1: Create the encoder\n", + "\n", + "We will use the LSTM recurrence that we defined just above. Remember that its function signature is:\n", + "\n", + "`def LSTM_layer(input, output_dim, recurrence_hook_h=past_value, recurrence_hook_c=past_value):`\n", + "\n", + "and it returns a tuple `(hidden_state, hidden_cell)`. We will complete the following four exercises below. If possible, try them out before looking at the answers.\n", + "\n", + "1. Create the encoder (set the `output_dim` and `cell_dim` to `hidden_dim` which we defined earlier).\n", + "2. Set `num_layers` to something higher than 1 and create a stack of LSTMs to represent the encoder.\n", + "3. Get the output of the encoder and put it into the right form to be passed into the decoder [hard]\n", + "4. 
Reverse the order of the `input_sequence` (this has been shown to help especially in machine translation)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# 1.\n", + "# Create the encoder (set the output_dim to hidden_dim which we defined earlier).\n", + "\n", + "(encoder_output_h, encoder_output_c) = LSTM_layer(input_sequence, hidden_dim)\n", + "\n", + "# 2.\n", + "# Set num_layers to something higher than 1 and create a stack of LSTMs to represent the encoder.\n", + "num_layers = 2\n", + "output_h = alias(input_sequence) # get a copy of the input_sequence\n", + "for i in range(0, num_layers):\n", + " (output_h, output_c) = LSTM_layer(output_h.output, hidden_dim)\n", + "\n", + "# 3.\n", + "# Get the output of the encoder and put it into the right form to be passed into the decoder [hard]\n", + "thought_vector_h = sequence.first(output_h)\n", + "thought_vector_c = sequence.first(output_c)\n", + "\n", + "thought_vector_broadcast_h = sequence.broadcast_as(thought_vector_h, label_sequence)\n", + "thought_vector_broadcast_c = sequence.broadcast_as(thought_vector_c, label_sequence)\n", + "\n", + "# 4.\n", + "# Reverse the order of the input_sequence (this has been shown to help especially in machine translation)\n", + "(encoder_output_h, encoder_output_c) = LSTM_layer(input_sequence, hidden_dim, future_value, future_value)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exercise 2: Create the decoder\n", + "\n", + "In our basic version of the sequence-to-sequence network, the decoder generates an output sequence given the input sequence by setting the initial state of the decoder to the final hidden state of the encoder. The hidden state is represented by a tuple `(encoder_h, encoder_c)` where `h` represents the output hidden state and `c` represents the value of the LSTM cell.\n", + "\n", + "Besides setting the initial state of the decoder, we also need to give the decoder LSTM some input. The first element will always be the special sequence start tag ``. After that, there are two ways that we want to wire up the decoder's input: one during training, and the other during evaluation (i.e. generating sequences on the trained network).\n", + "\n", + "For training, the input to the decoder is the output sequence from the training data, also known as the label(s) for the input sequence. During evaluation, we will instead redirect the output from the network back into the decoder as its history. Let's first set up the input for training..." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "decoder_input = element_select(is_first_label, label_sentence_start_scattered, past_value(label_sequence))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Above, we use the function `element_select` which will return one of two options given the condition `is_first_label`. Remember that we're working with sequences so when the decoder LSTM is run its input will be unrolled along with the network. The above allows us to to have a dynamic input that will return a specific element given what time step we're currently processing.\n", + "\n", + "Therefore, the `decoder_input` will be `label_sentence_start_scattered` (which is simply ``) when we are at the first time step, and otherwise it will return the `past_value` (i.e. 
the previous element given what time step we're currently at) of `label_sequence`.\n",
+ "\n",
+ "Next, we need to set up our actual decoder. Before, for the encoder, we did the following:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "(output_h, output_c) = LSTM_layer(input_sequence, hidden_dim, \n",
+ "                                  recurrence_hook_h=past_value, recurrence_hook_c=past_value)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To be able to set the first hidden state of the decoder to be equal to the final hidden state of the encoder, we can leverage the parameters `recurrence_hook_h` and `recurrence_hook_c`. The default `past_value` is a function that returns, for time `t`, the element in the sequence at time `t-1`. See if you can figure out how to set that up.\n",
+ "\n",
+ "1. Create the recurrence hooks for the decoder LSTM.\n",
+ "    * Hint: you'll have to create a `lambda operand:` and you will make use of the `is_first_label` mask we used earlier and the `thought_vector_broadcast_h` and `thought_vector_broadcast_c` representations of the output of the encoder.\n",
+ "\n",
+ "2. With your recurrence hooks, create the decoder.\n",
+ "    * Hint: again we'll use the `LSTM_layer()` function defined above and again use `hidden_dim` for the `output_dim`.\n",
+ "    \n",
+ "3. Create a decoder with multiple layers. Note that you will have to use different recurrence hooks for the lower layers that feed back into the stack of layers."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# 1. \n",
+ "# Create the recurrence hooks for the decoder LSTM.\n",
+ "\n",
+ "recurrence_hook_h = lambda operand: element_select(is_first_label, thought_vector_broadcast_h, past_value(operand))\n",
+ "recurrence_hook_c = lambda operand: element_select(is_first_label, thought_vector_broadcast_c, past_value(operand))\n",
+ "\n",
+ "# 2.\n",
+ "# With your recurrence hooks, create the decoder.\n",
+ "\n",
+ "(decoder_output_h, decoder_output_c) = LSTM_layer(decoder_input, hidden_dim, recurrence_hook_h, recurrence_hook_c)\n",
+ "\n",
+ "# 3.\n",
+ "# Create a decoder with multiple layers. \n",
+ "# Note that you will have to use different recurrence hooks for the lower layers\n",
+ "\n",
+ "num_layers = 3\n",
+ "decoder_output_h = alias(decoder_input)\n",
+ "for i in range(0, num_layers):\n",
+ "    if (i > 0):\n",
+ "        recurrence_hook_h = past_value\n",
+ "        recurrence_hook_c = past_value\n",
+ "    else:\n",
+ "        recurrence_hook_h = lambda operand: element_select(\n",
+ "            is_first_label, thought_vector_broadcast_h, past_value(operand))\n",
+ "        recurrence_hook_c = lambda operand: element_select(\n",
+ "            is_first_label, thought_vector_broadcast_c, past_value(operand))\n",
+ "\n",
+ "    (decoder_output_h, decoder_output_c) = LSTM_layer(decoder_output_h.output, hidden_dim, \n",
+ "                                                      recurrence_hook_h, recurrence_hook_c)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Exercise 3: Fully connected layer (network output)\n",
+ "\n",
+ "Now we're almost at the end of defining the network. All we need to do is take the output of the decoder, and run it through a linear layer. Ultimately it will be put into a `softmax` to get a probability distribution over the possible output words. However, we will include that as part of our criterion nodes (below).\n",
+ "\n",
+ "1. 
Add the linear layer (a weight matrix, a bias parameter, a times, and a plus) to get the final output of the network" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# 1.\n", + "# Add the linear layer\n", + "\n", + "W = parameter(shape=(decoder_output_h.shape[0], label_vocab_dim), init=glorot_uniform())\n", + "B = parameter(shape=(label_vocab_dim), init=0)\n", + "z = plus(B, times(decoder_output_h, W))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Putting the model together\n", + "\n", + "With the above we have defined some of the network and asked you to define parts of it as exercises. Here let's put the whole thing into a function called `create_model()`. Remember, all this does is create a skeleton of the network that defines how data will flow through it. No data is running through it yet." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def create_model():\n", + " \n", + " # Source and target inputs to the model\n", + " batch_axis = Axis.default_batch_axis()\n", + " input_seq_axis = Axis('inputAxis')\n", + " label_seq_axis = Axis('labelAxis')\n", + "\n", + " input_dynamic_axes = [batch_axis, input_seq_axis]\n", + " raw_input = input_variable(\n", + " shape=(input_vocab_dim), dynamic_axes=input_dynamic_axes, name='raw_input')\n", + "\n", + " label_dynamic_axes = [batch_axis, label_seq_axis]\n", + " raw_labels = input_variable(\n", + " shape=(label_vocab_dim), dynamic_axes=label_dynamic_axes, name='raw_labels')\n", + "\n", + " # Instantiate the sequence to sequence translation model\n", + " input_sequence = raw_input\n", + "\n", + " # Drop the sentence start token from the label, for decoder training\n", + " label_sequence = sequence.slice(raw_labels, 1, 0, \n", + " name='label_sequence') # A B C --> A B C \n", + " label_sentence_start = sequence.first(raw_labels) # \n", + "\n", + " # Setup primer for decoder\n", + " is_first_label = sequence.is_first(label_sequence) # 1 0 0 0 ...\n", + " label_sentence_start_scattered = sequence.scatter(\n", + " label_sentence_start, is_first_label)\n", + "\n", + " # Encoder\n", + " stabilize = Stabilizer()\n", + " encoder_output_h = stabilize(input_sequence)\n", + " for i in range(0, num_layers):\n", + " (encoder_output_h, encoder_output_c) = LSTM_layer(\n", + " encoder_output_h.output, hidden_dim, future_value, future_value)\n", + "\n", + " # Prepare encoder output to be used in decoder\n", + " thought_vector_h = sequence.first(encoder_output_h)\n", + " thought_vector_c = sequence.first(encoder_output_c)\n", + "\n", + " thought_vector_broadcast_h = sequence.broadcast_as(\n", + " thought_vector_h, label_sequence)\n", + " thought_vector_broadcast_c = sequence.broadcast_as(\n", + " thought_vector_c, label_sequence)\n", + "\n", + " # Decoder\n", + " decoder_history_hook = alias(label_sequence, name='decoder_history_hook') # copy label_sequence\n", + "\n", + " decoder_input = element_select(is_first_label, label_sentence_start_scattered, past_value(\n", + " decoder_history_hook))\n", + "\n", + " decoder_output_h = stabilize(decoder_input)\n", + " for i in range(0, num_layers):\n", + " if (i > 0):\n", + " recurrence_hook_h = past_value\n", + " recurrence_hook_c = past_value\n", + " else:\n", + " recurrence_hook_h = lambda operand: element_select(\n", + " is_first_label, thought_vector_broadcast_h, past_value(operand))\n", + " recurrence_hook_c = 
lambda operand: element_select(\n", + " is_first_label, thought_vector_broadcast_c, past_value(operand))\n", + "\n", + " (decoder_output_h, decoder_output_c) = LSTM_layer(\n", + " decoder_output_h.output, hidden_dim, recurrence_hook_h, recurrence_hook_c)\n", + "\n", + " # Linear output layer\n", + " W = parameter(shape=(decoder_output_h.shape[0], label_vocab_dim), init=glorot_uniform())\n", + " B = parameter(shape=(label_vocab_dim), init=0)\n", + " z = plus(B, times(stabilize(decoder_output_h), W)) \n", + " \n", + " return z" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training\n", + "\n", + "Now that we've created the model, we are ready to train the network and learn its parameters. For sequence-to-sequence networks, the loss we use is cross-entropy. Note that we have to find the `label_sequences` node from the model because it was defined in our network and we want to compare the model's predictions specifically to the outputs of that node." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['raw_labels', 'raw_input']\n" + ] + } + ], + "source": [ + "model = create_model()\n", + "label_sequence = find_nodes_by_name(model, 'label_sequence')[0]\n", + "\n", + "# Criterion nodes\n", + "ce = cross_entropy_with_softmax(model, label_sequence)\n", + "errs = classification_error(model, label_sequence)\n", + "\n", + "# let's show the required arguments for this model\n", + "print([x.name for x in model.arguments])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we'll setup a bunch of parameters to drive our learning, we'll create the learner, and finally create our trainer:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# training parameters\n", + "lr_per_sample = learning_rate_schedule(0.007, UnitType.sample)\n", + "minibatch_size = 72\n", + "momentum_time_constant = momentum_as_time_constant_schedule(1100)\n", + "clipping_threshold_per_sample = 2.3\n", + "gradient_clipping_with_truncation = True\n", + "learner = momentum_sgd(model.parameters,\n", + " lr_per_sample, momentum_time_constant,\n", + " gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,\n", + " gradient_clipping_with_truncation=gradient_clipping_with_truncation)\n", + "trainer = Trainer(model, ce, errs, learner)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now we bind the features and labels from our `train_reader` to the inputs that we setup in our network definition. First however, we'll define a convenience function to help find an argument name when pointing the reader's features to an argument of our model." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# helper function to find variables by name\n", + "def find_arg_by_name(name, expression):\n", + " vars = [i for i in expression.arguments if i.name == name]\n", + " assert len(vars) == 1\n", + " return vars[0]\n", + "\n", + "train_bind = {\n", + " find_arg_by_name('raw_input' , model) : train_reader.streams.features,\n", + " find_arg_by_name('raw_labels', model) : train_reader.streams.labels\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we define our training loop and start training the network!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Minibatch: 0, Train Loss: 3.149, Train Evaluation Criterion: 0.870\n", + "Minibatch: 100, Train Loss: 3.029, Train Evaluation Criterion: 0.759\n", + "Minibatch: 200, Train Loss: 3.098, Train Evaluation Criterion: 0.778\n" + ] + } + ], + "source": [ + "training_progress_output_freq = 100\n", + "max_num_minibatch = 100 if isFast else 1000\n", + "\n", + "for i in range(max_num_minibatch):\n", + " # get next minibatch of training data\n", + " mb_train = train_reader.next_minibatch(minibatch_size, input_map=train_bind)\n", + " trainer.train_minibatch(mb_train)\n", + "\n", + " # collect epoch-wide stats\n", + " if i % training_progress_output_freq == 0:\n", + " print(\"Minibatch: {0}, Train Loss: {1:.3f}, Train Evaluation Criterion: {2:2.3f}\".format(i, \n", + " get_train_loss(trainer), get_train_eval_criterion(trainer)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model evaluation: greedy decoding\n", + "\n", + "Once we have a trained model, we of course then want to make use of it to generate output sequences! In this case, we will use greedy decoding. What this means is that we will run an input sequence through our trained network, and when we generate the output sequence, we will do so one element at a time by taking the `hardmax()` of the output of our network. This is obviously not optimal in general. Given the context, some word may always be the most probable at the first step, but another first word may be preferred given what is output later on. Decoding the optimal sequence is intractable in general. But we can do better doing a beam search where we keep around some small number of hypotheses at each step. However, greedy decoding can work surprisingly well for sequence-to-sequence networks because so much of the context is kept around in the RNN.\n", + "\n", + "To do greedy decoding, we need to hook in the previous output of our network as the input to the decoder. During training we passed the `label_sequences` (ground truth) in. You'll notice in our `create_model()` function above the following lines:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "decoder_history_hook = alias(label_sequence, name='decoder_history_hook') # copy label_sequence\n", + "decoder_input = element_select(is_first_label, label_sentence_start_scattered, past_value(decoder_history_hook))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This gives us a way to modify the `decoder_history_hook` after training to something else. We've already trained our network, but now we need a way to evaluate it without using a ground truth. 
We can do that like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "model = create_model()\n", + "\n", + "# get some references to the new model\n", + "label_sequence = find_nodes_by_name(model, 'label_sequence')[0] \n", + "decoder_history_hook = find_nodes_by_name(model, 'decoder_history_hook')[0] \n", + "\n", + "# and now replace the output of decoder_history_hook with the hardmax output of the network\n", + "def clone_and_hook():\n", + " # network output for decoder history\n", + " net_output = hardmax(model)\n", + "\n", + " # make a clone of the graph where the ground truth is replaced by the network output\n", + " return model.clone(CloneMethod.share, {decoder_history_hook.output : net_output.output})\n", + "\n", + "# get a new model that uses the past network output as input to the decoder\n", + "new_model = clone_and_hook()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `new_model` now contains a version of the original network that shares parameters with it but that has a different input to the decoder. Namely, instead of feeding the ground truth labels into the decoder, it will feed in the history that the network has generated!\n", + "\n", + "Finally, let's see what it looks like if we train, and keep evaluating the network's output every `100` iterations by running a word's graphemes ('A B A D I') through our network. This way we can visualize the progress learning the best model... First we'll define a more complete `train()` action. It is largely the same as above but has some additional training parameters included; some additional smarts for printing out statistics as we go along; we now see progress over our data as epochs (one epoch is one complete pass over the training data); and we setup a reader for the single validation sequence we described above so that we can visually see our network's progress on that sequence as it learns." 
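+ "\n",
+ "In isolation, the evaluation step that `train()` performs every `100` minibatches is roughly the sketch below (an illustrative sketch only: it reuses `find_arg_by_name`, `minibatch_size`, `valid_reader`, `i2w` and the freshly created `new_model` from above, and since that clone is still untrained its output will look essentially random):\n",
+ "\n",
+ "```python\n",
+ "import numpy as np\n",
+ "\n",
+ "# Bind the cloned model's inputs to the validation reader's streams\n",
+ "valid_bind = {\n",
+ "    find_arg_by_name('raw_input' , new_model) : valid_reader.streams.features,\n",
+ "    find_arg_by_name('raw_labels', new_model) : valid_reader.streams.labels\n",
+ "}\n",
+ "\n",
+ "# Run one validation minibatch through the greedy-decoding clone and print it\n",
+ "mb_valid = valid_reader.next_minibatch(minibatch_size, input_map=valid_bind)\n",
+ "e = new_model.eval(mb_valid)\n",
+ "for s in e:\n",
+ "    print([i2w[np.argmax(w)] for w in s])\n",
+ "```"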
+ ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "########################\n", + "# train action #\n", + "########################\n", + "\n", + "def train(train_reader, valid_reader, vocab, i2w, model, max_epochs):\n", + " \n", + " # do some hooks that we won't need in the future\n", + " label_sequence = find_nodes_by_name(model, 'label_sequence')[0] \n", + " decoder_history_hook = find_nodes_by_name(model, 'decoder_history_hook')[0] \n", + " \n", + " # Criterion nodes\n", + " ce = cross_entropy_with_softmax(model, label_sequence)\n", + " errs = classification_error(model, label_sequence)\n", + "\n", + " def clone_and_hook():\n", + " # network output for decoder history\n", + " net_output = hardmax(model)\n", + "\n", + " # make a clone of the graph where the ground truth is replaced by the network output\n", + " return model.clone(CloneMethod.share, {decoder_history_hook.output : net_output.output})\n", + "\n", + " # get a new model that uses the past network output as input to the decoder\n", + " new_model = clone_and_hook()\n", + "\n", + " # Instantiate the trainer object to drive the model training\n", + " lr_per_sample = learning_rate_schedule(0.007, UnitType.sample)\n", + " minibatch_size = 72\n", + " momentum_time_constant = momentum_as_time_constant_schedule(1100)\n", + " clipping_threshold_per_sample = 2.3\n", + " gradient_clipping_with_truncation = True\n", + " learner = momentum_sgd(model.parameters,\n", + " lr_per_sample, momentum_time_constant,\n", + " gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,\n", + " gradient_clipping_with_truncation=gradient_clipping_with_truncation)\n", + " trainer = Trainer(model, ce, errs, learner)\n", + "\n", + " # Get minibatches of sequences to train with and perform model training\n", + " i = 0\n", + " mbs = 0\n", + " \n", + " # Set epoch size to a larger number of lower training error\n", + " epoch_size = 5000 if isFast else 908241\n", + " \n", + " training_progress_output_freq = 100\n", + "\n", + " # bind inputs to data from readers\n", + " train_bind = {\n", + " find_arg_by_name('raw_input' , model) : train_reader.streams.features,\n", + " find_arg_by_name('raw_labels', model) : train_reader.streams.labels\n", + " }\n", + " valid_bind = {\n", + " find_arg_by_name('raw_input' , new_model) : valid_reader.streams.features,\n", + " find_arg_by_name('raw_labels', new_model) : valid_reader.streams.labels\n", + " }\n", + "\n", + " for epoch in range(max_epochs):\n", + " loss_numer = 0\n", + " metric_numer = 0\n", + " denom = 0\n", + "\n", + " while i < (epoch+1) * epoch_size:\n", + " # get next minibatch of training data\n", + " mb_train = train_reader.next_minibatch(minibatch_size, input_map=train_bind)\n", + " trainer.train_minibatch(mb_train)\n", + "\n", + " # collect epoch-wide stats\n", + " samples = trainer.previous_minibatch_sample_count\n", + " loss_numer += trainer.previous_minibatch_loss_average * samples\n", + " metric_numer += trainer.previous_minibatch_evaluation_average * samples\n", + " denom += samples\n", + "\n", + " # every N MBs evaluate on a test sequence to visually show how we're doing; also print training stats\n", + " if mbs % training_progress_output_freq == 0:\n", + " \n", + " print(\"Minibatch: {0}, Train Loss: {1:2.3f}, Train Evaluation Criterion: {2:2.3f}\".format(mbs, \n", + " get_train_loss(trainer), get_train_eval_criterion(trainer)))\n", + " \n", + " mb_valid = valid_reader.next_minibatch(minibatch_size, 
input_map=valid_bind)\n",
+ "                e = new_model.eval(mb_valid)\n",
+ "                print_sequences(e, i2w)\n",
+ "\n",
+ "            i += mb_train[find_arg_by_name('raw_labels', model)].num_samples\n",
+ "            mbs += 1\n",
+ "\n",
+ "        print(\"--- EPOCH %d DONE: loss = %f, errs = %f ---\" % (epoch, loss_numer/denom, 100.0*(metric_numer/denom)))\n",
+ "    return 100.0*(metric_numer/denom)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now that we have our important functions defined -- `create_model()` and `train()` -- let's make use of them:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Minibatch: 0, Train Loss: 4.234, Train Evaluation Criterion: 1.000\n",
+ "['', '', '', '', '', '~S']\n",
+ "--- EPOCH 0 DONE: loss = 3.817894, errs = 86.991687 ---\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Given a vocab and tensor, print the output\n",
+ "def print_sequences(sequences, i2w):\n",
+ "    for s in sequences:\n",
+ "        print([i2w[np.argmax(w)] for w in s], sep=\" \")\n",
+ "\n",
+ "# hook up data\n",
+ "train_reader = create_reader(\"cmudict-0.7b.train-dev-20-21.ctf\", True)\n",
+ "valid_reader = create_reader(\"tiny.ctf\", False)\n",
+ "vocab, i2w = get_vocab(\"cmudict-0.7b.mapping\")\n",
+ "\n",
+ "# create model\n",
+ "model = create_model()\n",
+ " \n",
+ "# train\n",
+ "error = train(train_reader, valid_reader, vocab, i2w, model, max_epochs=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "86.992\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the training error \n",
+ "print(error)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Task\n",
+ "Note that the error is very high. This is largely due to the minimal training we have done so far. Please change the `epoch_size` to a much higher number and re-run the `train` function; a short sketch of this longer run follows the Next steps section below. This might take considerably longer, but you will see a marked reduction in the error."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Next steps\n",
+ "\n",
+ "An important extension to sequence-to-sequence models, especially when dealing with long sequences, is to use an attention mechanism. The idea behind attention is to allow the decoder, first, to look at any of the hidden state outputs from the encoder (instead of using only the final hidden state), and, second, to learn how much attention to pay to each of those hidden states given the context. This allows the word produced at each time step `t` to depend not only on the final hidden state and the word that came before it, but instead on a weighted combination of *all* of the input hidden states!\n",
+ "\n",
+ "In the next version of this tutorial, we will talk about how to include attention in your sequence to sequence network.",
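+ "\n",
+ "For reference, the longer run suggested in the Task section can be sketched as below. It simply flips the `isFast` switch from the top of the notebook, which makes `train()` use its full `epoch_size` of 908241 samples instead of 5000 (you can equally well edit `epoch_size` directly); expect it to run for a long time, and treat the exact numbers it prints as machine-dependent:\n",
+ "\n",
+ "```python\n",
+ "isFast = False                 # train() now picks the large epoch_size branch\n",
+ "model = create_model()         # start again from fresh parameters\n",
+ "error = train(train_reader, valid_reader, vocab, i2w, model, max_epochs=1)\n",
+ "print(error)                   # should be markedly lower than the fast-mode error\n",
+ "```"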
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.5" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From f306ffb6d79d803b334eaf488071c4783d7afa9e Mon Sep 17 00:00:00 2001 From: "REDMOND\\sayanpa" Date: Wed, 23 Nov 2016 14:08:23 -0800 Subject: [PATCH 24/25] Increased the cell time out to 300 sec from 60 sec --- Tests/EndToEndTests/CNTKv2Python/Examples/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tests/EndToEndTests/CNTKv2Python/Examples/conftest.py b/Tests/EndToEndTests/CNTKv2Python/Examples/conftest.py index 2637de1ce..8a9060fdc 100644 --- a/Tests/EndToEndTests/CNTKv2Python/Examples/conftest.py +++ b/Tests/EndToEndTests/CNTKv2Python/Examples/conftest.py @@ -65,7 +65,7 @@ def nb(tmpdir_factory, request, device_id): outPath = str(tmpdir_factory.mktemp('notebook').join('out.ipynb')) assert os.path.isfile(inPath) args = ["jupyter", "nbconvert", "--to", "notebook", "--execute", - "--ExecutePreprocessor.timeout=60", "--output", outPath, inPath] + "--ExecutePreprocessor.timeout=300", "--output", outPath, inPath] subprocess.check_call(args) nb = nbformat.read(outPath, nbformat.current_nbformat) return nb From f5428acb45893e90be6c883d5f36127b192144af Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Wed, 23 Nov 2016 03:22:29 -0800 Subject: [PATCH 25/25] CNTK v2 library: Fix device placement for backprop root gradients in python and enable python module tests on GPU for Windows --- Source/CNTKv2LibraryDll/API/CNTKLibrary.h | 6 +++++- Source/CNTKv2LibraryDll/Function.cpp | 2 +- Source/CNTKv2LibraryDll/Function.h | 4 ++-- .../CNTKv2Python/ModuleTests/run-test | 16 ++++++++++++++++ .../CNTKv2Python/ModuleTests/testcases.yml | 4 ++-- bindings/python/cntk/ops/__init__.py | 2 +- bindings/python/cntk/ops/functions.py | 4 +++- 7 files changed, 30 insertions(+), 8 deletions(-) diff --git a/Source/CNTKv2LibraryDll/API/CNTKLibrary.h b/Source/CNTKv2LibraryDll/API/CNTKLibrary.h index d5011703e..769a829ff 100644 --- a/Source/CNTKv2LibraryDll/API/CNTKLibrary.h +++ b/Source/CNTKv2LibraryDll/API/CNTKLibrary.h @@ -2159,13 +2159,17 @@ namespace CNTK /// Returns the Function that 'this' BackPropState belongs to /// FunctionPtr Function() const { return m_function; } + DeviceDescriptor Device() const { return m_forwardComputeDevice; } virtual ~BackPropState() {} protected: - BackPropState(const FunctionPtr& function) : m_function(function) {} + BackPropState(const FunctionPtr& function, const DeviceDescriptor& computeDevice) + : m_function(function), m_forwardComputeDevice(computeDevice) + {} protected: FunctionPtr m_function; + DeviceDescriptor m_forwardComputeDevice; }; typedef std::shared_ptr BackPropStatePtr; diff --git a/Source/CNTKv2LibraryDll/Function.cpp b/Source/CNTKv2LibraryDll/Function.cpp index 8ac07a729..093fc3215 100644 --- a/Source/CNTKv2LibraryDll/Function.cpp +++ b/Source/CNTKv2LibraryDll/Function.cpp @@ -2625,7 +2625,7 @@ namespace CNTK else evalTimeStampVariable = arguments.begin()->first; - return (outputsToRetainBackwardStateFor.size() > 0) ? 
MakeSharedObject(this->shared_from_this(), std::make_pair(evalTimeStampVariable, m_variableToNodeMap[evalTimeStampVariable]->GetEvalTimeStamp())) : nullptr;
+        return (outputsToRetainBackwardStateFor.size() > 0) ? MakeSharedObject(this->shared_from_this(), computeDevice, std::make_pair(evalTimeStampVariable, m_variableToNodeMap[evalTimeStampVariable]->GetEvalTimeStamp())) : nullptr;
     }
 
     /*virtual*/ void CompositeFunction::Backward(const BackPropStatePtr& state,
diff --git a/Source/CNTKv2LibraryDll/Function.h b/Source/CNTKv2LibraryDll/Function.h
index f87d7cf8e..4c1f0fe4c 100644
--- a/Source/CNTKv2LibraryDll/Function.h
+++ b/Source/CNTKv2LibraryDll/Function.h
@@ -652,8 +652,8 @@ namespace CNTK
     class CNTKBackPropState final : public BackPropState
     {
     public:
-        CNTKBackPropState(const FunctionPtr& function, const std::pair& evalTimeStamp)
-            : BackPropState(function), m_evalTimeStamp(evalTimeStamp)
+        CNTKBackPropState(const FunctionPtr& function, const DeviceDescriptor& computeDevice, const std::pair& evalTimeStamp)
+            : BackPropState(function, computeDevice), m_evalTimeStamp(evalTimeStamp)
         {}
 
         std::pair EvalTimeStamp() const
diff --git a/Tests/EndToEndTests/CNTKv2Python/ModuleTests/run-test b/Tests/EndToEndTests/CNTKv2Python/ModuleTests/run-test
index 25b9cd6a2..0facc4681 100644
--- a/Tests/EndToEndTests/CNTKv2Python/ModuleTests/run-test
+++ b/Tests/EndToEndTests/CNTKv2Python/ModuleTests/run-test
@@ -2,6 +2,22 @@
 . $TEST_ROOT_DIR/run-test-common
 
+# Temporary workaround to force the default device to always be GPU 0 when
+# running the Python unit tests: data placement is currently broken, which
+# causes some of the test data to end up on the default device instead of the
+# explicitly selected GPU device 0 and makes the tests fail.
+# This should be removed once the test bugs have been addressed.
+if [ "$TEST_DEVICE" == "gpu" ]; then
+  if [ -z "$CUDA_VISIBLE_DEVICES" ]; then
+    export CUDA_VISIBLE_DEVICES=0
+  else
+    IFS=','
+    visibleDevicesArray=($CUDA_VISIBLE_DEVICES)
+    unset IFS
+    export CUDA_VISIBLE_DEVICES=${visibleDevicesArray[0]}
+  fi
+fi
+
 python -c "import sys; print('Python: %s'%sys.version)"
 python -c "import numpy; print('NumPy: %s'%numpy.version.full_version)"
 python -c "import scipy; print('SciPy: %s'%scipy.version.full_version)"
diff --git a/Tests/EndToEndTests/CNTKv2Python/ModuleTests/testcases.yml b/Tests/EndToEndTests/CNTKv2Python/ModuleTests/testcases.yml
index f4e203e5a..fae002bec 100644
--- a/Tests/EndToEndTests/CNTKv2Python/ModuleTests/testcases.yml
+++ b/Tests/EndToEndTests/CNTKv2Python/ModuleTests/testcases.yml
@@ -1,8 +1,8 @@
 dataDir: .
tags: - - bvt-l (build_sku == 'gpu') and (flavor == 'release') and ((os == 'linux') or (device=='cpu')) - - nightly-l (build_sku == 'gpu') and (flavor == 'release') and ((os == 'linux') or (device=='cpu')) + - bvt-l (build_sku == 'gpu') and (flavor == 'release') + - nightly-l (build_sku == 'gpu') and (flavor == 'release') testCases: Run must finish with error code 0 (outputs __COMPLETED__ in that case): diff --git a/bindings/python/cntk/ops/__init__.py b/bindings/python/cntk/ops/__init__.py index 8b9be3822..44e258e04 100644 --- a/bindings/python/cntk/ops/__init__.py +++ b/bindings/python/cntk/ops/__init__.py @@ -250,7 +250,7 @@ def convolution(convolution_map, operand, strides=(1,), sharing=[True], >>> x = C.input_variable(img.shape) >>> filter = np.reshape(np.array([2, -1, -1, 2], dtype = np.float32), (1, 2, 2)) >>> kernel = C.constant(value = filter) - >>> C.convolution(kernel, x, auto_padding = [False]).eval({x: [img]}) + >>> C.convolution(kernel, x, auto_padding = [False]).eval({x: [img]}) # doctest: +SKIP array([[[[[ 6., 8., 10., 12.], [ 16., 18., 20., 22.], [ 26., 28., 30., 32.], diff --git a/bindings/python/cntk/ops/functions.py b/bindings/python/cntk/ops/functions.py index fdfa40cac..97394a6f4 100644 --- a/bindings/python/cntk/ops/functions.py +++ b/bindings/python/cntk/ops/functions.py @@ -273,7 +273,9 @@ class Function(cntk_py.Function): Returns: dict: mapping of ``variables`` to NumPy arrays ''' - root_gradients = sanitize_var_map(self.outputs, root_gradients) + device = state.device() + root_gradients = sanitize_var_map(self.outputs, root_gradients, + None, device) var_gradients = dict((var, None) for var in variables)