Merge branch 'master' into qiwye/asgd-dev

2016-11-24 11:10:30 +08:00 · 2016-11-24 11:10:30 +08:00 · 2b8b3047df
--- a/Examples/Image/Classification/AlexNet/BrainScript/AlexNet_ImageNet.cntk
+++ b/Examples/Image/Classification/AlexNet/BrainScript/AlexNet_ImageNet.cntk
@ -0,0 +1,190 @@
+# Note: reader configuration comes from AlexNet.cntk or AlexNetComposite.cntk, depending on the test
+RootDir = "."
+
+ConfigDir = "$RootDir$"
+DataDir = "$RootDir$"
+OutputDir = "$RootDir$/Output"
+ModelDir = "$OutputDir$/Models"
+
+ndlMacros="$ConfigDir$/Macros.ndl"
+
+precision = "float"
+deviceId = "Auto"
+
+command = Train:Test
+
+parallelTrain = "true"
+traceLevel = 1
+numMBsToShowResult = 500
+
+modelPath = "$ModelDir$/AlexNet"
+stderr = "$OutputDir$/AlexNet"
+
+################################
+Train = {
+    action = "train"
+
+    BrainScriptNetworkBuilder = {
+        imageShape = 227:227:3
+        labelDim = 1000
+                
+        # Local Response Normalization 
+        # k : bias 
+        # n : half radius 
+        # alpha: scale factor 
+        # beta: exponent 
+        LRN {k, n, alpha, beta} = {
+            apply (x) = {
+                x2 = x .* x
+                # reshape to insert a fake singleton reduction dimension after the 3rd axis
+                x2s = SplitDimension(x2, 3, 1) 
+                # 3D convolution with a filter that has a non 1-size only in the 3rd axis, and does not reduce since the reduction dimension is fake and 1
+                W = ParameterTensor{(1:1:2*n+1:1), learningRateMultiplier = 0, initValue = alpha/(2*n+1)}
+                y = Convolution (W, x2s, (1:1:2*n+1), mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose = false, maxTempMemSizeInSamples = 0)
+                # reshape back to remove the fake singleton reduction dimension
+                b = FlattenDimensions(y, 3, 2)
+                den = Exp (beta .* Log(k + b)) 
+                r = x .* Reciprocal(den)
+            }.r
+        }.apply
+
+        # initValueScale are determined to generate Gaussians with variance of 0.01 and 0.005 (for the two DenseLayers)
+        model = Sequential (
+            ConvolutionalLayer {96, (11:11), stride=(4:4), pad=false, init='gaussian', initValueScale=0.95265} : ReLU : 
+            LRN {1.0, 2, 0.0001, 0.75} : 
+            MaxPoolingLayer    {(3:3), stride=(2:2)} :
+            ConvolutionalLayer {192, (5:5), pad = true, init='gaussian', initValueScale=2.44978, initBias=0.1} : ReLU : 
+            LRN {1.0, 2, 0.0001, 0.75} : 
+            MaxPoolingLayer    {(3:3), stride=(2:2)} :
+            ConvolutionalLayer {384, (3:3), pad = true, init='gaussian', initValueScale=2.07857} : ReLU : 
+            ConvolutionalLayer {384, (3:3), pad = true, init='gaussian', initValueScale=2.93945, initBias=0.1} : ReLU : 
+            ConvolutionalLayer {256, (3:3), pad = true, init='gaussian', initValueScale=2.93945, initBias=0.1} : ReLU : 
+            MaxPoolingLayer    {(3:3), stride=(2:2)} :
+            DenseLayer         {4096, activation=ReLU, init='gaussian', initValueScale=2.40038, initBias=0.1} : Dropout :
+            DenseLayer         {4096, activation=ReLU, init='gaussian', initValueScale=1.6, initBias=0.1} : Dropout :
+            LinearLayer        {labelDim, init='gaussian', initValueScale=3.2}
+        )
+
+        # inputs
+        features = Input {imageShape}
+        featNorm = features - Constant(114)
+        labels = Input {labelDim}
+
+        # apply model to features
+        z = model (featNorm)
+
+        # loss and error computation
+        ce       = CrossEntropyWithSoftmax  (labels, z)
+        errs     = ClassificationError      (labels, z)
+        top5Errs = ClassificationError      (labels, z, topN=5)  # only used in Eval action
+
+        # declare special nodes
+        featureNodes    = (features)
+        labelNodes      = (labels)
+        criterionNodes  = (ce)
+        evaluationNodes = (errs)
+        outputNodes     = (z)
+    }
+    
+    SGD = {
+        epochSize = 0
+        minibatchSize = 256
+        learningRatesPerMB = 0.01*25:0.001*25:0.0001*25:0.00001*25:0.000001
+        momentumPerMB = 0.9
+        maxEpochs = 110
+        gradUpdateType = None
+        L2RegWeight = 0.0005
+        dropoutRate = 0.5
+        
+        # TODO: try less bits?
+        ParallelTrain = {
+            parallelizationMethod = "DataParallelSGD"
+            distributedMBReading = "true"
+            parallelizationStartEpoch = 3
+            DataParallelSGD = {
+                gradientBits = 32
+            }
+        }
+        
+        numMBsToShowResult = 100
+    }
+    
+    # Reader
+    reader = {
+        verbosity = 0
+        randomize = true
+        randomizationWindow = 1
+
+        deserializers = (
+        {   
+            type = "ImageDeserializer" ; module = "ImageReader"
+            file = "$DataDir$/train_map.txt"
+            input = {
+                features = {
+                    transforms = (
+                        {
+                            type = "Crop"
+                            cropType = "random"
+                            cropRatio = 0.88671875
+                            jitterType = "uniRatio"
+                        }:{
+                            type = "Scale"
+                            width = 227
+                            height = 227
+                            channels = 3
+                            interpolations = "linear"
+                        #}:{
+                        #    type = "Mean"
+                        #    meanFile = "$ConfigDir$/ImageNet1K_mean.xml"
+                        }:{
+                            type = "Transpose"
+                        }
+                    )
+                }
+                labels = {
+                    labelDim = 1000
+                }
+            }
+        })
+    }    
+}
+
+################################
+Test = {
+    action=test
+    minibatchSize=128
+    evalNodeNames = errs:top5Errs  # also test top-5 error rate
+    
+    # Reader
+    reader = {
+        verbosity = 0
+        randomize = false
+
+        deserializers = (
+        {
+            type = "ImageDeserializer" ; module = "ImageReader"
+            file="$DataDir$/val_map.txt"
+            input = {
+                features = {
+                    transforms = (
+                        {
+                            type = "Crop"
+                            cropType = "center"
+                            cropRatio = 0.88671875
+                        }:{
+                            type = "Scale"
+                            width = 227
+                            height = 227
+                            channels = 3
+                        }:{
+                            type = "Transpose"
+                        }
+                    )
+                }
+                labels = {
+                    labelDim = 1000
+                }
+            }
+        })
+    }        
+}
--- a/Examples/Image/Classification/ConvNet/BrainScript/ConvNetLRN_CIFAR10_DataAug.cntk
+++ b/Examples/Image/Classification/ConvNet/BrainScript/ConvNetLRN_CIFAR10_DataAug.cntk
@ -0,0 +1,130 @@
+# ConvNet applied on CIFAR-10 dataset, with data augmentation (translation and flipping).
+
+command = TrainConvNet:Eval
+
+precision = "float"; traceLevel = 1 ; deviceId = "auto"
+
+rootDir = "../../.." ; dataDir = "$rootDir$/DataSets/CIFAR-10" ;
+outputDir = "./Output" ;
+
+modelPath = "$outputDir$/Models/ConvNetLRN_CIFAR10_DataAug"
+#stderr = "$outputDir$/ConvNetLRN_CIFAR10_DataAug_bs_out"
+
+TrainConvNet = {
+    action = "train"
+
+    BrainScriptNetworkBuilder = {
+        imageShape = 32:32:3
+        labelDim = 10
+
+        featScale = 1/256
+        Normalize{f} = x => f .* x
+
+        # Local Response Normalization 
+        # k : bias 
+        # n : half radius 
+        # alpha: scale factor 
+        # beta: exponent 
+        LRN {k, n, alpha, beta} = {
+            apply (x) = {
+                x2 = x .* x
+                # reshape to insert a fake singleton reduction dimension after the 3rd axis
+                x2s = SplitDimension(x2, 3, 1) 
+                # 3D convolution with a filter that has a non 1-size only in the 3rd axis, and does not reduce since the reduction dimension is fake and 1
+                W = ParameterTensor{(1:1:2*n+1:1), learningRateMultiplier = 0, initValue = alpha/(2*n+1)}
+                y = Convolution (W, x2s, (1:1:2*n+1), mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose = false, maxTempMemSizeInSamples = 0)
+                # reshape back to remove the fake singleton reduction dimension
+                b = FlattenDimensions(y, 3, 2)
+                den = Exp (beta .* Log(k + b)) 
+                r = x .* Reciprocal(den)
+            }.r
+        }.apply
+
+        model = Sequential (
+            Normalize {featScale} :
+            ConvolutionalLayer {64, (3:3), pad = true} : ReLU : 
+            ConvolutionalLayer {64, (3:3), pad = true} : ReLU : 
+            LRN {1.0, 4, 0.001, 0.75} : 
+            MaxPoolingLayer {(3:3), stride = (2:2)} :
+            ConvolutionalLayer {64, (3:3), pad = true} : ReLU : 
+            ConvolutionalLayer {64, (3:3), pad = true} : ReLU : 
+            LRN {1.0, 4, 0.001, 0.75} : 
+            MaxPoolingLayer {(3:3), stride = (2:2)} :
+            DenseLayer {256} : ReLU : Dropout : 
+            DenseLayer {128} : ReLU : Dropout : 
+            LinearLayer {labelDim}
+        )
+
+        # inputs
+        features = Input {imageShape}
+        labels   = Input {labelDim}
+
+        # apply model to features
+        z = model (features)
+
+        # connect to system
+        ce       = CrossEntropyWithSoftmax     (labels, z)
+        errs     = ClassificationError         (labels, z)
+        top5Errs = ClassificationError         (labels, z, topN=5)  # only used in Eval action
+
+        featureNodes    = (features)
+        labelNodes      = (labels)
+        criterionNodes  = (ce)
+        evaluationNodes = (errs)  # top5Errs only used in Eval
+        outputNodes     = (z)
+    }
+
+    SGD = {
+        epochSize = 0
+        minibatchSize = 64
+
+        learningRatesPerSample = 0.0015625*20:0.00046875*20:0.00015625*20:0.000046875*10:0.000015625
+        momentumAsTimeConstant = 0*20:600*20:1200
+        maxEpochs = 80
+        L2RegWeight = 0.002
+        dropoutRate = 0.5
+
+        numMBsToShowResult = 100
+    }
+
+    reader = {
+        verbosity = 0 ; randomize = true
+        deserializers = ({
+            type = "ImageDeserializer" ; module = "ImageReader"
+            file = "$dataDir$/train_map.txt"
+            input = {
+                features = { transforms = (
+                    { type = "Crop" ; cropType = "random" ; cropRatio = 0.8 ; jitterType = "uniRatio" } :
+                    { type = "Scale" ; width = 32 ; height = 32 ; channels = 3 ; interpolations = "linear" } :
+                    { type = "Mean" ; meanFile = "$dataDir$/CIFAR-10_mean.xml" } : 
+                    { type = "Transpose" }
+                )}
+                labels = { labelDim = 10 }
+            }
+        })
+    }
+}
+
+# Eval action
+Eval = {
+    action = "eval"
+    evalNodeNames = errs:top5Errs  # also test top-5 error rate
+    # Set minibatch size for testing.
+    minibatchSize = 512
+
+    reader = {
+        verbosity = 0 ; randomize = false
+        deserializers = ({
+            type = "ImageDeserializer" ; module = "ImageReader"
+            file = "$dataDir$/test_map.txt"
+            input = {
+                features = { transforms = (
+                   { type = "Scale" ; width = 32 ; height = 32 ; channels = 3 ; interpolations = "linear" } :
+                   { type = "Mean"; meanFile = "$dataDir$/CIFAR-10_mean.xml" } : 
+                   { type = "Transpose" }
+                )}
+                labels = { labelDim = 10 }
+            }
+        })
+    }
+}
--- a/Examples/Image/Classification/ConvNet/BrainScript/ConvNet_CIFAR10_DataAug.cntk
+++ b/Examples/Image/Classification/ConvNet/BrainScript/ConvNet_CIFAR10_DataAug.cntk
@ -28,8 +28,8 @@ TrainConvNet = {
            ConvolutionalLayer {64, (3:3), pad = true} : ReLU : 
            ConvolutionalLayer {64, (3:3), pad = true} : ReLU : 
              MaxPoolingLayer {(3:3), stride = (2:2)} :
-            DenseLayer {256} : Dropout : ReLU : 
-            DenseLayer {128} : Dropout : ReLU : 
+            DenseLayer {256} : ReLU : Dropout : 
+            DenseLayer {128} : ReLU : Dropout : 
            LinearLayer {labelDim}
        )

--- a/Examples/Image/Classification/ConvNet/BrainScript/README.md
+++ b/Examples/Image/Classification/ConvNet/BrainScript/README.md
@ -31,4 +31,12 @@ Run the example from the current folder using:

 `cntk configFile=ConvNet_CIFAR10_DataAug.cntk`

-As seen in the CNTK configuration file [ConvNet_CIFAR10_DataAug.cntk](./ConvNet_CIFAR10_DataAug.cntk), we use a fixed crop ratio of `0.8` and scale the image to `32x32` pixels for training. Since all training images are pre-padded to `40x40` pixels, effectively we only perform translation transform without scaling. The accuracy of the network on test data is around `14%`, which is a lot better than the previous model.
+As seen in the CNTK configuration file [ConvNet_CIFAR10_DataAug.cntk](./ConvNet_CIFAR10_DataAug.cntk), we use a fixed crop ratio of `0.8` and scale the image to `32x32` pixels for training. Since all training images are pre-padded to `40x40` pixels, effectively we only perform translation transform without scaling. The accuracy of the network on test data is around `14.2%`, which is a lot better than the previous model.
+
+### ConvNetLRN_CIFAR10_DataAug.cntk
+
+The fourth example added local response normalization (LRN) to the previous example. LRN is implemented as a BrainScript function using 3D convolution with a constant kernel. You may run the example from the current folder using:
+
+`cntk configFile=ConvNetLRN_CIFAR10_DataAug.cntk`
+
+This model achieves slightly better accuracy of `13.8%`, which demonstrates the effectiveness of LRN. Nevertheless, as mentioned earlier, LRN is now rarely used by state-of-the-art deep networks.
--- a/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10.py
+++ b/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10.py
@ -13,6 +13,7 @@ from cntk.layers import *
 from cntk.models import Sequential, LayerStack
 from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT, FULL_DATA_SWEEP
 from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_schedule, momentum_as_time_constant_schedule, UnitType
+from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_as_time_constant_schedule, UnitType
 from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, relu, minus, element_times, constant
 from _cntk_py import set_computation_network_trace_level

@ -71,9 +72,9 @@ def convnet_cifar10(debug_output=False):

    # Set learning parameters
    lr_per_sample          = [0.0015625]*10+[0.00046875]*10+[0.00015625]
-    lr_schedule            = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample)
-    momentum_time_constant = [0]*20+[-minibatch_size/np.log(0.9)]
-    mm_schedule            = momentum_as_time_constant_schedule(momentum_time_constant, epoch_size=epoch_size)
+    lr_schedule            = learning_rate_schedule(lr_per_sample, UnitType.sample, epoch_size)
+    mm_time_constant       = [0]*20+[-minibatch_size/np.log(0.9)]
+    mm_schedule            = momentum_as_time_constant_schedule(mm_time_constant, epoch_size)
    l2_reg_weight          = 0.002

    # Instantiate the trainer object to drive the model training
--- a/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug.py
+++ b/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug.py
@ -14,6 +14,7 @@ from cntk.ops import input_variable, cross_entropy_with_softmax, classification_
 from cntk.io import MinibatchSource, ImageDeserializer, StreamDef, StreamDefs
 from cntk import Trainer, persist, cntk_py
 from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_schedule, momentum_as_time_constant_schedule, UnitType
+from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_as_time_constant_schedule, UnitType
 from _cntk_py import set_computation_network_trace_level

 # Paths relative to current python file.
@ -83,8 +84,8 @@ def convnet_cifar10_dataaug(reader_train, reader_test, max_epochs = 80):
    # Set learning parameters
    lr_per_sample          = [0.0015625]*20+[0.00046875]*20+[0.00015625]*20+[0.000046875]*10+[0.000015625]
    lr_schedule            = learning_rate_schedule(lr_per_sample, unit=UnitType.sample, epoch_size=epoch_size)
-    momentum_time_constant = [0]*20+[600]*20+[1200]
-    mm_schedule            = momentum_as_time_constant_schedule(momentum_time_constant, epoch_size=epoch_size)
+    mm_time_constant       = [0]*20+[600]*20+[1200]
+    mm_schedule            = momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size)
    l2_reg_weight          = 0.002
    
    # trainer object
--- a/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug_Distributed.py
+++ b/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug_Distributed.py
@ -48,10 +48,9 @@ def create_reader(map_file, mean_file, train, distributed_after=INFINITE_SAMPLES
        ImageDeserializer(map_file, StreamDefs(
            features = StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
            labels   = StreamDef(field='label', shape=num_classes))),   # and second as 'label'
-        randomize = False,
+        multithreaded_deserializer = False,  # turn off omp as CIFAR-10 is not heavy for deserializer
        distributed_after = distributed_after)

-
 # Train and evaluate the network.
 def convnet_cifar10_dataaug(reader_train, reader_test, distributed_trainer, max_epochs = 80):
    set_computation_network_trace_level(0)
@ -87,8 +86,8 @@ def convnet_cifar10_dataaug(reader_train, reader_test, distributed_trainer, max_
    # Set learning parameters
    lr_per_sample          = [0.0015625]*20+[0.00046875]*20+[0.00015625]*20+[0.000046875]*10+[0.000015625]
    lr_schedule            = learning_rate_schedule(lr_per_sample, unit=UnitType.sample, epoch_size=epoch_size)
-    momentum_time_constant = [0]*20+[600]*20+[1200]
-    mm_schedule            = momentum_as_time_constant_schedule(momentum_time_constant, epoch_size=epoch_size)
+    mm_time_constant       = [0]*20+[600]*20+[1200]
+    mm_schedule            = momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size)
    l2_reg_weight          = 0.002
    
    # trainer object
--- a/Examples/Image/Classification/ConvNet/Python/ConvNet_MNIST.py
+++ b/Examples/Image/Classification/ConvNet/Python/ConvNet_MNIST.py
@ -11,7 +11,7 @@ from cntk import Trainer, persist
 from cntk.utils import *
 from cntk.layers import *
 from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT, FULL_DATA_SWEEP
-from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_schedule, UnitType
+from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_as_time_constant_schedule, UnitType
 from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, relu, element_times, constant

 # Paths relative to current python file.
@ -63,11 +63,11 @@ def convnet_mnist(debug_output=False):
    # Set learning parameters
    lr_per_sample          = [0.001]*10+[0.0005]*10+[0.0001]
    lr_schedule            = learning_rate_schedule(lr_per_sample, UnitType.sample, epoch_size)
-    momentum_time_constant = [0]*5+[1024]
-    mn_schedule = momentum_schedule(momentum_time_constant, epoch_size)
+    mm_time_constant       = [0]*5+[1024]
+    mm_schedule            = momentum_as_time_constant_schedule(mm_time_constant, epoch_size)

    # Instantiate the trainer object to drive the model training
-    learner     = momentum_sgd(z.parameters, lr_schedule, mn_schedule)
+    learner     = momentum_sgd(z.parameters, lr_schedule, mm_schedule)
    trainer     = Trainer(z, ce, pe, learner)

    # define mapping from reader streams to network inputs
--- a/Examples/Image/Classification/ConvNet/Python/README.md
+++ b/Examples/Image/Classification/ConvNet/Python/README.md
@ -35,7 +35,7 @@ We use a fixed crop ratio of `0.8` and scale the image to `32x32` pixels for tra

 ### ConvNet_CIFAR10_DataAug_Distributed.py

-The fourth example uses the same CNN as ConvNet_CIFAR10_DataAug.py, but it adds support for distributed training with simple aggregation. For a reference on distributed training, please check [here](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines)
-Note that this example supports CPU-only build.
+The fourth example uses the same CNN as ConvNet_CIFAR10_DataAug.py, but it adds support for distributed training with simple aggregation. For a reference on distributed training, please check [here](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines).
+Note that [this example](./ConvNet_CIFAR10_DataAug_Distributed.py) supports CPU-only build.

 `mpiexec -n <#workers> python ConvNet_CIFAR10_DataAug_Distributed.py`
--- a/Examples/Image/Classification/ResNet/Python/README.md
+++ b/Examples/Image/Classification/ResNet/Python/README.md
@ -15,7 +15,7 @@ for ResNet20 and ResNet110, respectively. The ResNet20 network achieves an error

 ### TrainResNet_CIFAR10_Distributed.py

-This example code is similar to TrainResNet_CIFAR10.py, but it adds support for distributed training via [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface). Details can be found in [here](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines)
+[This example](./TrainResNet_CIFAR10_Distributed.py) is similar to TrainResNet_CIFAR10.py, but it adds support for distributed training via [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface). Details can be found in [here](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines).
 Note this example requires a multi-GPU machine or mpi hosts file to distribute to multiple machines.

 Simple aggregation, ResNet20, with a 2-GPU machine:
@ -25,3 +25,7 @@ Simple aggregation, ResNet20, with a 2-GPU machine:
 Quantized 1-bit aggregation with 50000 samples before distributed, ResNet20, with a 2-GPU machine:

 `mpiexec -n 2 python TrainResNet_CIFAR10_Distributed.py -n resnet20 -q 1 -a 50000`
+
+To run with maximum parallelization with minibatch size scaled according to #workers for 3 epochs:
+
+`mpiexec -n 2 python TrainResNet_CIFAR10_Distributed.py -s True -e 3`
--- a/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10_Distributed.py
+++ b/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10_Distributed.py
@ -50,12 +50,12 @@ def create_reader(map_file, mean_file, train, distributed_after=INFINITE_SAMPLES
        ImageDeserializer(map_file, StreamDefs(
            features = StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
            labels   = StreamDef(field='label', shape=num_classes))),   # and second as 'label'
-        randomize = False,
+        multithreaded_deserializer = False,  # turn off omp as CIFAR-10 is not heavy for deserializer
        distributed_after = distributed_after)


 # Train and evaluate the network.
-def train_and_evaluate(reader_train, reader_test, network_name, max_epochs, distributed_trainer):
+def train_and_evaluate(reader_train, reader_test, network_name, max_epochs, distributed_trainer, scale_up=False):

    set_computation_network_trace_level(0)

@ -79,7 +79,13 @@ def train_and_evaluate(reader_train, reader_test, network_name, max_epochs, dist

    # shared training parameters 
    epoch_size = 50000                    # for now we manually specify epoch size
-    minibatch_size = 128
+    
+    # NOTE: scaling up minibatch_size increases sample throughput. In 8-GPU machine,
+    # ResNet110 samples-per-second is ~7x of single GPU, comparing to ~3x without scaling
+    # up. However, bigger minimatch size on the same number of samples means less updates, 
+    # thus leads to higher training error. This is a trade-off of speed and accuracy
+    minibatch_size = 128 * (len(distributed_trainer.communicator().workers()) if scale_up else 1)
+    
    momentum_time_constant = -minibatch_size/np.log(0.9)
    l2_reg_weight = 0.0001

@ -145,15 +151,17 @@ def train_and_evaluate(reader_train, reader_test, network_name, max_epochs, dist
 if __name__=='__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--network', help='network type, resnet20 or resnet110', required=False, default='resnet20')
-    parser.add_argument('-e', '--epochs', help='total epochs', required=False, default='160')
-    parser.add_argument('-q', '--quantize_bit', help='quantized bit', required=False, default='32')
-    parser.add_argument('-a', '--distributed_after', help='number of samples to train with before running distributed', required=False, default='0')
+    parser.add_argument('-e', '--epochs', help='total epochs', type=int, required=False, default='160')
+    parser.add_argument('-q', '--quantize_bit', help='quantized bit', type=int, required=False, default='32')
+    parser.add_argument('-s', '--scale_up', help='scale up minibatch size with #workers for better parallelism', type=bool, required=False, default='False')
+    parser.add_argument('-a', '--distributed_after', help='number of samples to train with before running distributed', type=int, required=False, default='0')

    args = vars(parser.parse_args())
    num_quantization_bits = int(args['quantize_bit'])
    epochs = int(args['epochs'])
    distributed_after_samples = int(args['distributed_after'])
    network_name = args['network']
+    scale_up = bool(args['scale_up'])

    # Create distributed trainer
    print("Start training: quantize_bit = {}, epochs = {}, distributed_after = {}".format(num_quantization_bits, epochs, distributed_after_samples))
@ -164,7 +172,7 @@ if __name__=='__main__':
    reader_train = create_reader(os.path.join(data_path, 'train_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), True, distributed_after_samples)
    reader_test  = create_reader(os.path.join(data_path, 'test_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), False)
    
-    train_and_evaluate(reader_train, reader_test, network_name, epochs, distributed_trainer)
+    train_and_evaluate(reader_train, reader_test, network_name, epochs, distributed_trainer, scale_up)
    
    # Must call MPI finalize when process exit
    distributed.Communicator.finalize()
--- a/README.md
+++ b/README.md
@ -1,9 +1,11 @@
+**The [CNTK Wiki](https://github.com/Microsoft/CNTK/wiki) has all information on CNTK including [setup](https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-your-machine ), [examples](https://github.com/Microsoft/CNTK/wiki/Examples ), etc.**
+
 # Latest news
 *2016-11-21.* V 2.0 Beta 4 Release  
 Highlights of this Release:
 * New ASGD/Hogwild! training using Microsoft’s Parameter Server ([Project Multiverso](https://github.com/Microsoft/multiverso))
 * Distributed Scenarios now supported in CNTK Python API
-* New Memory compression -- ability to trade off memory usage with compute. 
+* New [Memory Compression](https://github.com/Microsoft/CNTK/wiki/Top-level-configurations#hypercompressmemory) mode to reduce memory usage on GPU
 * CNTK Docker image with 1bit-SGD support
 * Stability Improvements and bug fixes

@ -55,18 +57,6 @@ If you ARE using Model Evaluation Library we **strongly recommend** installing v

 See [Release Notes](https://github.com/Microsoft/CNTk/wiki/CNTK_1_7_2_Release_Notes) for details.

-*2016-09-28.* V 1.7.1 Binary release  
-Highlights of this Release:
-* Two Breaking Changes related to Layers library default initialization and ```fsAdagrad``` gradient-normalization scheme
-* Improvements in BrainScript
-* Enabling of Deterministic Algorithm enforcement
-* Improvements in Model Evaluation including the support of Evaluation for Azure Applications
-* Different Performance improvements
-* Multiple bug fixes
-
-See more in the [Release Notes](https://github.com/Microsoft/CNTK/wiki/CNTK_1_7_1_Release_Notes) (including the full list of bugs fixed)  
-Get the Release from the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases)
-
 See [all news](https://github.com/Microsoft/CNTK/wiki/News).

 # What is CNTK
--- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
+++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
@ -512,7 +512,7 @@ CNTK2 = [
 # Parameter{} can do several forms of initialization.
 #  - initValue=scalar, value=array --> initialize from this value  --array form not implemented yet
 #  - initFromFilePath="..." --> read from a data file
-#  - init="uniform|gaussian" (random init scaled by initValueScale). Warning: This has magic scaling factors. TODO: document them here
+#  - init="uniform|gaussian" (random init scaled by initValueScale). 
 #  - init="zero"
 # deprecated:
 #  - initFromLiteral="..." (deprecated) --> parse a string literal (obsolete with value=array form)
--- a/Source/CNTKv2LibraryDll/API/CNTKLibrary.h
+++ b/Source/CNTKv2LibraryDll/API/CNTKLibrary.h
@ -393,16 +393,11 @@ namespace CNTK
        friend class PackedValue;
        friend class MPICommunicatorImpl;
        friend class BlockMomentumDistributedTrainer;
+        friend class Internal::VariableResolver;

        template <typename T, typename ...CtorArgTypes>
        friend inline std::shared_ptr<T> MakeSharedObject(CtorArgTypes&& ...ctorArgs);

-        template <typename ElementType>
-        friend Variable Internal::GetVariable(const Microsoft::MSR::CNTK::ComputationNodeBasePtr& node,
-                                              std::unordered_map<Microsoft::MSR::CNTK::ComputationNodeBasePtr, Variable>& nodeToVariableMap,
-                                              std::unordered_map<Variable, Variable>& placeholderReplacements,
-                                              std::unordered_set<FunctionPtr>& allPrimitiveFunctions);
-
    public:
        ///
        /// Construct a NDArrayView with the specified 'dataBuffer' as the backing storage.
@ -635,6 +630,7 @@ namespace CNTK
        static const size_t AutoSelectRowColSplitPoint = SIZE_MAX;

    private:
+
        CNTK_API NDArrayView(::CNTK::DataType dataType, const DeviceDescriptor& device, ::CNTK::StorageFormat storageType, const NDShape& viewShape, bool readOnly, void* tensorView);


@ -1599,11 +1595,7 @@ namespace CNTK
        template <typename T>
        friend struct std::hash;

-        template <typename ElementType>
-        friend Variable Internal::GetVariable(const Microsoft::MSR::CNTK::ComputationNodeBasePtr& node,
-                                              std::unordered_map<Microsoft::MSR::CNTK::ComputationNodeBasePtr, Variable>& nodeToVariableMap,
-                                              std::unordered_map<Variable, Variable>& placeholderReplacements,
-                                              std::unordered_set<FunctionPtr>& allPrimitiveFunctions);
+        friend class Internal::VariableResolver;

 #ifndef SWIG
    private:
@ -1952,11 +1944,7 @@ private:
        template <typename T>
        friend struct std::hash;

-        template <typename ElementType>
-        friend Variable Internal::GetVariable(const Microsoft::MSR::CNTK::ComputationNodeBasePtr& node,
-                                              std::unordered_map<Microsoft::MSR::CNTK::ComputationNodeBasePtr, Variable>& nodeToVariableMap,
-                                              std::unordered_map<Variable, Variable>& placeholderReplacements,
-                                              std::unordered_set<FunctionPtr>& allPrimitiveFunctions);
+        friend class Internal::VariableResolver;

    public:
        ///
@ -2037,11 +2025,7 @@ private:
        template <typename T>
        friend struct std::hash;

-        template <typename ElementType>
-        friend Variable Internal::GetVariable(const Microsoft::MSR::CNTK::ComputationNodeBasePtr& node,
-                                              std::unordered_map<Microsoft::MSR::CNTK::ComputationNodeBasePtr, Variable>& nodeToVariableMap,
-                                              std::unordered_map<Variable, Variable>& placeholderReplacements,
-                                              std::unordered_set<FunctionPtr>& allPrimitiveFunctions);
+        friend class Internal::VariableResolver;

    public:
        ///
@ -2175,13 +2159,17 @@ namespace CNTK
        /// Returns the Function that 'this' BackPropState belongs to
        ///
        FunctionPtr Function() const { return m_function; }
+        DeviceDescriptor Device() const { return m_forwardComputeDevice; }
        virtual ~BackPropState() {}

    protected:
-        BackPropState(const FunctionPtr& function) : m_function(function) {}
+        BackPropState(const FunctionPtr& function, const DeviceDescriptor& computeDevice) 
+            : m_function(function), m_forwardComputeDevice(computeDevice)
+        {}

    protected:
        FunctionPtr m_function;
+        DeviceDescriptor m_forwardComputeDevice;
    };
    typedef std::shared_ptr<BackPropState> BackPropStatePtr;

--- a/Source/CNTKv2LibraryDll/API/CNTKLibraryInternals.h
+++ b/Source/CNTKv2LibraryDll/API/CNTKLibraryInternals.h
@ -247,10 +247,6 @@ namespace CNTK

        CNTK_API bool AreEqual(const ::CNTK::NDArrayView& view1, const ::CNTK::NDArrayView& view2, double relativeTolerance = 0.0, double absoluteTolerance = 0.0);

-        template <typename ElementType>
-        Variable GetVariable(const  Microsoft::MSR::CNTK::ComputationNodeBasePtr& node,
-                             std::unordered_map<Microsoft::MSR::CNTK::ComputationNodeBasePtr, ::CNTK::Variable>& nodeToVariableMap,
-                             std::unordered_map<::CNTK::Variable, ::CNTK::Variable>& placeholderReplacements,
-                             std::unordered_set<::CNTK::FunctionPtr>& allPrimitiveFunctions);
+        class VariableResolver;
    }
 }
--- a/Source/CNTKv2LibraryDll/BackCompat.cpp
+++ b/Source/CNTKv2LibraryDll/BackCompat.cpp
@ -20,28 +20,67 @@
 #include "DeprecatedNodes.h"
 #include "RNNNodes.h"

-
 using namespace Microsoft::MSR::CNTK;

 namespace CNTK
 {
    namespace Internal
    {
-        template <typename ElementType>
-        Variable GetVariable(const ComputationNodeBasePtr& node,
-                             std::unordered_map<ComputationNodeBasePtr, Variable>& nodeToVariableMap,
-                             std::unordered_map<Variable, Variable>& placeholderReplacements,
-                             std::unordered_set<FunctionPtr>& allPrimitiveFunctions)
+        // Helper class to resolve variables in the model.
+        class VariableResolver final
        {
-            auto iter = nodeToVariableMap.find(node);
-            if (iter != nodeToVariableMap.end())
+            std::unordered_map<Variable, Variable> m_placeholderReplacements;
+            std::unordered_map<ComputationNodeBasePtr, Variable> m_nodeToVariableMap;
+            std::unordered_set<FunctionPtr> m_allPrimitiveFunctions;
+
+        public:
+            const std::unordered_map<Variable, Variable>& GetPlaceHolders() const
+            {
+                return m_placeholderReplacements;
+            }
+
+            template<class ElementType>
+            Variable GetVariable(const ComputationNodeBasePtr& node)
+            {
+                auto iter = m_nodeToVariableMap.find(node);
+                if (iter != m_nodeToVariableMap.end())
                    return iter->second;

                Variable var;
-            NDShape varShape = AsNDShape(node->GetSampleLayout());
-
                if (node->IsLeaf())
                {
+                    var = ResolveLeaf<ElementType>(node);
+                }
+                else
+                {
+                    // This is a non-leaf node and maps to a primitive Function
+                    NDShape varShape = AsNDShape(node->GetSampleLayout());
+                    auto placeholderVar = PlaceholderVariable(varShape);
+                    m_nodeToVariableMap[node] = placeholderVar;
+
+                    std::vector<Variable> inputVars(node->GetNumInputs());
+                    for (size_t i = 0; i < inputVars.size(); ++i)
+                    {
+                        inputVars[i] = GetVariable<ElementType>(node->Input(i));
+                        if (inputVars[i].IsPlaceholder())
+                            m_placeholderReplacements[inputVars[i]] = Variable();
+                    }
+
+                    var = ResolveFunction<ElementType>(node, inputVars);
+
+                    if (m_placeholderReplacements.find(placeholderVar) != m_placeholderReplacements.end())
+                        m_placeholderReplacements[placeholderVar] = var;
+                }
+
+                m_nodeToVariableMap[node] = var;
+                return var;
+            }
+
+        private:
+            template<class ElementType>
+            Variable ResolveLeaf(const ComputationNodeBasePtr& node)
+            {
+                NDShape variableShape = AsNDShape(node->GetSampleLayout());
                std::wstring varUid, varName;
                if (node->Is<InputValueBase<ElementType>>())
                {
@ -54,48 +93,32 @@ namespace CNTK
                        auto inputNodeInternalDynamicAxisName = node->As<InputValueBase<ElementType>>()->GetRequestedDynamicAxis();
                        std::vector<Axis> inputVarDynamicAxes = DynamicAxesFromInternalDynamicAxisName(inputNodeInternalDynamicAxisName);

-                        var = Variable(varShape, isSparse, AsDataType<ElementType>(), node->GetLearningRateMultiplier() != 0, varName, inputVarDynamicAxes, varUid);
+                        return Variable(variableShape, isSparse, AsDataType<ElementType>(), node->GetLearningRateMultiplier() != 0, varName, inputVarDynamicAxes, varUid);
                    }
-                    else
-                    {
+
                    // TODO: Allow creating inputs without a dynamic axis
                    LogicError("Found InputNode with no dynamic axes which is currently unsupported");
                }
-                }
-                else if (node->Is<LearnableParameter<ElementType>>())
+
+                if (node->Is<LearnableParameter<ElementType>>())
                {
                    bool isConstant = (node->GetLearningRateMultiplier() == 0);
                    auto& matrix = node->As<ComputationNode<ElementType>>()->Value();
                    auto tensorView = new TensorView<ElementType>(std::make_shared<Matrix<ElementType>>(matrix.AsReference()), AsTensorViewShape(node->GetSampleLayout()));
-                    NDArrayViewPtr value = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), varShape, false, tensorView);
-                    if (isConstant)
-                    {
-                        std::tie(varUid, varName) = UidAndNameFromCNTKInternalNodeName(node->NodeName(), VariableKind::Constant);
-                        var = Constant(value, varName, varUid);
+                    NDArrayViewPtr value = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), variableShape, false, tensorView);
+
+                    auto kind = isConstant ? VariableKind::Constant : VariableKind::Parameter;
+                    std::tie(varUid, varName) = UidAndNameFromCNTKInternalNodeName(node->NodeName(), kind);
+                    return isConstant ? (Variable)Constant(value, varName, varUid) : Parameter(value, varName, varUid);
                }
-                    else
-                    {
-                        std::tie(varUid, varName) = UidAndNameFromCNTKInternalNodeName(node->NodeName(), VariableKind::Parameter);
-                        var = Parameter(value, varName, varUid);
-                    }
-                }
-                else
+
                LogicError("CNTK::LoadLegacyModel: Unsupported legacy CNTK node named '%S'", node->NodeName().c_str());
-            }
-            else
-            {
-                // This is a non-leaf node and maps to a primitive Function
-                auto placeholderVar = PlaceholderVariable(varShape);
-                nodeToVariableMap[node] = placeholderVar;
-
-                std::vector<Variable> inputVars(node->GetNumInputs());
-                for (size_t i = 0; i < inputVars.size(); ++i)
-                {
-                    inputVars[i] = GetVariable<ElementType>(node->Input(i), nodeToVariableMap, placeholderReplacements, allPrimitiveFunctions);
-                    if (inputVars[i].IsPlaceholder())
-                        placeholderReplacements[inputVars[i]] = Variable();
+                return Variable();// make compiler happy.
            }

+            template<class ElementType>
+            Variable ResolveFunction(const ComputationNodeBasePtr& node, std::vector<Variable>& inputVars)
+            {
                PrimitiveOpType opType;
                Dictionary primitiveFunctionConfigParameters;
                if (node->OperationName() == OperationNameOf(NegateNode))
@ -376,15 +399,10 @@ namespace CNTK
                std::tie(functionUid, functionName) = UidAndNameFromCNTKInternalNodeName(node->NodeName(), opType);

                FunctionPtr primitiveFunction = MakeSharedObject<PrimitiveFunction>(opType, inputVars, std::move(primitiveFunctionConfigParameters), functionName, functionUid);
-                allPrimitiveFunctions.insert(primitiveFunction);
-                var = primitiveFunction->Output();
-                if (placeholderReplacements.find(placeholderVar) != placeholderReplacements.end())
-                    placeholderReplacements[placeholderVar] = var;
-            }
-
-            nodeToVariableMap[node] = var;
-            return var;
+                m_allPrimitiveFunctions.insert(primitiveFunction);
+                return primitiveFunction->Output();
            }
+        };

        FunctionPtr LoadLegacyModel(const std::wstring& modelFile, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::UseDefaultDevice()*/)
        {
@ -410,8 +428,8 @@ namespace CNTK
            // Now traverse the model and construct the Function graph
            std::unordered_map<ComputationNodeBasePtr, Variable> nodeToVariableMap;
            std::unordered_map<Variable, Variable> placeholderReplacements;
-            std::unordered_set<FunctionPtr> allPrimitiveFunctions;
            std::vector<Variable> rootVariables;
+            VariableResolver resolver;
            auto& networkRoots = net->RootNodes();
            for (auto& rootNode : networkRoots)
            {
@ -420,11 +438,11 @@ namespace CNTK

                if (ComputationNetwork::IsNodePtr<ComputationNode<float>>(rootNode))
                {
-                    rootVariables.push_back(Internal::GetVariable<float>(rootNode, nodeToVariableMap, placeholderReplacements, allPrimitiveFunctions).Owner());
+                    rootVariables.push_back(resolver.GetVariable<float>(rootNode).Owner());
                }
                else if (ComputationNetwork::IsNodePtr<ComputationNode<double>>(rootNode))
                {
-                    rootVariables.push_back(Internal::GetVariable<double>(rootNode, nodeToVariableMap, placeholderReplacements, allPrimitiveFunctions).Owner());
+                    rootVariables.push_back(resolver.GetVariable<double>(rootNode).Owner());
                }
                else
                {
@ -433,8 +451,7 @@ namespace CNTK
            }

            auto rootComposite = Combine(rootVariables);
-            rootComposite->ReplacePlaceholders(placeholderReplacements);
-
+            rootComposite->ReplacePlaceholders(resolver.GetPlaceHolders());
            return rootComposite;
        }

--- a/Source/CNTKv2LibraryDll/Function.cpp
+++ b/Source/CNTKv2LibraryDll/Function.cpp
@ -2625,7 +2625,7 @@ namespace CNTK
        else
            evalTimeStampVariable = arguments.begin()->first;

-        return (outputsToRetainBackwardStateFor.size() > 0) ? MakeSharedObject<CNTKBackPropState>(this->shared_from_this(), std::make_pair(evalTimeStampVariable, m_variableToNodeMap[evalTimeStampVariable]->GetEvalTimeStamp())) : nullptr;
+        return (outputsToRetainBackwardStateFor.size() > 0) ? MakeSharedObject<CNTKBackPropState>(this->shared_from_this(), computeDevice, std::make_pair(evalTimeStampVariable, m_variableToNodeMap[evalTimeStampVariable]->GetEvalTimeStamp())) : nullptr;
    }

    /*virtual*/ void CompositeFunction::Backward(const BackPropStatePtr& state,
--- a/Source/CNTKv2LibraryDll/Function.h
+++ b/Source/CNTKv2LibraryDll/Function.h
@ -652,8 +652,8 @@ namespace CNTK
    class CNTKBackPropState final : public BackPropState
    {
    public:
-        CNTKBackPropState(const FunctionPtr& function, const std::pair<Variable, int64_t>& evalTimeStamp)
-            : BackPropState(function), m_evalTimeStamp(evalTimeStamp)
+        CNTKBackPropState(const FunctionPtr& function, const DeviceDescriptor& computeDevice, const std::pair<Variable, int64_t>& evalTimeStamp)
+            : BackPropState(function, computeDevice), m_evalTimeStamp(evalTimeStamp)
        {}

        std::pair<Variable, int64_t> EvalTimeStamp() const
--- a/Source/ComputationNetworkLib/InputAndParamNodes.h
+++ b/Source/ComputationNetworkLib/InputAndParamNodes.h
@ -103,9 +103,9 @@ private:
        bool log = GetEnvironmentPtr() && Environment().traceLevel > 0; // note: this will not log before node is part of network
        if (log)
        {
-            fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, init dims=[%d x %d], range=%f*%f, onCPU=%s.\n)",
+            fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, init dims=[%d x %d], range=%f(%f*%f), onCPU=%s.\n)",
                    NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str(),
-                    (int)randomSeed, (int)fanOut, (int)fanIn, range, initValueScale, initOnCPUOnly ? "true" : "false");
+                    (int)randomSeed, (int)fanOut, (int)fanIn, range, range/initValueScale, initValueScale, initOnCPUOnly ? "true" : "false");
        }
    }

--- a/Source/Math/ConvolveGeometry.h
+++ b/Source/Math/ConvolveGeometry.h
@ -109,67 +109,26 @@ public:
        m_originIndex = 0;
        for (int i = (int)dimCount - 1; i >= 0; i--)
        {
-            assert((m_outputShape[i] % GetMapCount(i)) == 0);
-            int outPerMap = (int)(m_outputShape[i] / GetMapCount(i));
-            // Number of cells between first and last "centers", inclusive.
-            int cells = (int)((outPerMap - 1) * GetStride(i) + 1);
-            assert(m_inputShape[i] >= cells);
-
-            // Extra cells, to the left and right of "cells".
-            int extra = (int)m_inputShape[i] - cells;
-            assert(extra >= 0);
-
-            // When LowerPad and/or UpperPad are specified, the Start[i] value is determined by those values.
-            int lo = GetAutoPad(i) ? 0 : (int)m_lowerPad[m_lowerPad.size() == 1 ? 0 : i];
-            int hi = GetAutoPad(i) ? 0 : (int)m_upperPad[m_upperPad.size() == 1 ? 0 : i];
-            if (lo != 0 || hi != 0)
-            {
-                assert(extra + lo + hi + 1 == m_kernelShape[i]);
-                // Compute the number of cells on the left and right parts of the kernel,
-                // not counting the "kernel-center" cell. If m_kernelShape[i] is even, the extra cell is
-                // placed on the right (the center is shifted to the left).
-                int right = (int)m_kernelShape[i] - 1;
-                int left = right / 2;
-                right -= left;
-                assert(left <= right);
-                assert(right <= left + 1);
-
-                assert(lo <= left);
-                assert(hi <= right);
-                m_start[i] = left - lo;
-                assert(m_start[i] + cells + right == m_inputShape[i] + hi);
-            }
+            bool padded = GetAutoPad(i); 
+            if (padded)
+                m_start[i] = 0; 
            else
            {
-                m_start[i] = extra / 2;
-#ifdef _DEBUG
-                // If we're padding then extra should be covered.
-                bool padded = GetAutoPad(i);
-                assert(!padded || extra + 1 <= m_kernelShape[i]);
-                // If we're not padding then, we should stay within the input dimension.
-                assert(padded || extra + 1 >= m_kernelShape[i]);
-
-                // Compute the number of cells on the left and right parts of the kernel,
-                // not counting the "kernel-center" cell. If m_kernelShape[i] is even, the extra cell is
-                // placed on the right (the center is shifted to the left).
-                int right = (int)m_kernelShape[i] - 1;
-                int left = right / 2;
-                right -= left;
-                assert(0 <= left);
-                assert(left <= right);
-                assert(right <= left + 1);
-
-                int min = m_start[i] - left;
-                int max = m_start[i] + (int)cells + right;
-                assert(!padded || min <= 0 && max >= m_inputShape[i]);
-                assert(padded || min >= 0 && max <= m_inputShape[i]);
-
-                int diff = min - ((int)m_inputShape[i] - max);
-                assert(std::abs(diff) <= 1);
-
-                UNUSED(padded);
-                UNUSED(diff);
-#endif
+                m_start[i] = ((int)m_kernelShape[i] - 1) / 2;
+                int lo = (int)m_lowerPad[m_lowerPad.size() == 1 ? 0 : i];
+                int hi = (int)m_upperPad[m_upperPad.size() == 1 ? 0 : i];
+                if (lo != 0 || hi != 0)
+                {
+                    m_start[i] -= lo;
+                    assert(m_start[i] >= 0); 
+                    int outPerMap = (int)(m_outputShape[i] / GetMapCount(i));
+                    int cells = (int)((outPerMap - 1) * GetStride(i) + 1);
+                    if (cells > 0)  // dummy if, just to get rid of warning 
+                    {
+                        assert(m_inputShape[i] >= cells);
+                        assert(m_start[i] + cells + (int)m_kernelShape[i] - 1 == m_inputShape[i] + hi);
+                    }
+                }
            }

            m_startIndex = m_startIndex * (int)m_inputShape[i] + m_start[i];
--- a/Source/Math/Math.vcxproj
+++ b/Source/Math/Math.vcxproj
@ -40,7 +40,6 @@
    <ConfigurationType>DynamicLibrary</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v120</PlatformToolset>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
@ -51,7 +50,6 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup>
    <!-- TODO intentional for all? -->
-    <LinkIncremental>false</LinkIncremental>
    <TargetName>Math</TargetName>
  </PropertyGroup>
  <ItemDefinitionGroup>
@ -102,9 +100,6 @@
    <ClCompile>
      <WarningLevel>Level4</WarningLevel>
      <PrecompiledHeader>Use</PrecompiledHeader>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>$(MathDefine); NO_SYNC; WIN32; NDEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
      <SDLCheck>true</SDLCheck>
      <MultiProcessorCompilation>true</MultiProcessorCompilation>
@ -113,14 +108,12 @@
      <EnableParallelCodeGeneration>true</EnableParallelCodeGeneration>
      <FloatingPointExceptions>false</FloatingPointExceptions>
      <TreatWarningAsError>true</TreatWarningAsError>
-      <AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions>/d2Zi+ /bigobj %(AdditionalOptions)</AdditionalOptions>
      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
      <AdditionalDependencies>$(MathLinkLibrary);Common.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <DelayLoadDLLs>$(MathDelayLoad); $(CudaDlls); %(DelayLoadDLLs)</DelayLoadDLLs>
      <Profile>true</Profile>
@ -147,6 +140,7 @@
  <ItemDefinitionGroup Condition="$(GpuBuild)">
    <ClCompile>
      <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories);$(CudaInclude)</AdditionalIncludeDirectories>
+      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">/d2Zi+ /bigobj %(AdditionalOptions)</AdditionalOptions>
    </ClCompile>
    <Link>
      <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories);$(CudaLibPath)</AdditionalLibraryDirectories>
@ -155,6 +149,7 @@
  <ItemDefinitionGroup Condition="$(CpuOnlyBuild)">
    <ClCompile>
      <PreprocessorDefinitions>CPUONLY;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug_CpuOnly|x64'">/d2Zi+ /bigobj %(AdditionalOptions)</AdditionalOptions>
    </ClCompile>
    <Link>
      <DelayLoadDLLs>$(MathDelayLoad)</DelayLoadDLLs>
--- a/Tests/EndToEndTests/CNTKv2Python/Examples/CNTK_204_Sequence_To_Sequence.py
+++ b/Tests/EndToEndTests/CNTKv2Python/Examples/CNTK_204_Sequence_To_Sequence.py
@ -0,0 +1,24 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+# Licensed under the MIT license. See LICENSE.md file in the project root
+# for full license information.
+# ==============================================================================
+
+import os
+import re
+
+abs_path = os.path.dirname(os.path.abspath(__file__))
+notebook = os.path.join(abs_path, "..", "..", "..", "..", "Tutorials", "CNTK_204_Sequence_To_Sequence.ipynb")
+
+def test_cntk_204_sequence_to_sequence_noErrors(nb):
+    errors = [output for cell in nb.cells if 'outputs' in cell
+              for output in cell['outputs'] if output.output_type == "error"]
+    print(errors)
+    assert errors == []
+
+expectedEvalError = 90
+
+def test_cntk_204_sequence_to_sequence_trainerror(nb):
+    testCell = [cell for cell in nb.cells
+                if cell.cell_type == 'code' and re.search('#Print the training error', cell.source)]
+    assert float((testCell[0].outputs[0])['text']) < expectedEvalError
--- a/Tests/EndToEndTests/CNTKv2Python/Examples/cifar_convnet_distributed_test.py
+++ b/Tests/EndToEndTests/CNTKv2Python/Examples/cifar_convnet_distributed_test.py
@ -7,55 +7,36 @@
 import numpy as np
 import os
 import sys
+import signal
+import subprocess
+import re
+import pytest
 from cntk.utils import cntk_device
 from cntk.cntk_py import DeviceKind_GPU
 from cntk.device import set_default_device
-from cntk.io import ReaderConfig, ImageDeserializer
-from cntk import distributed
-import pytest
-import platform

 abs_path = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Image", "Classification", "ConvNet", "Python"))
-from ConvNet_CIFAR10_DataAug_Distributed import convnet_cifar10_dataaug, create_reader
+sys.path.append(abs_path)
+from run_cifar_convnet_distributed import run_cifar_convnet_distributed

 TOLERANCE_ABSOLUTE = 2E-1
+TIMEOUT_SECONDS = 300

-def test_cifar_convnet_error(device_id):
-    if platform.system() == 'Windows':
-        pytest.skip('test skipped on Windows')
-
-    set_default_device(cntk_device(device_id))
+def test_cifar_convnet_distributed_mpiexec(device_id):
+    if cntk_device(device_id).type() != DeviceKind_GPU:
+        pytest.skip('test only runs on GPU')

+    cmd = ["mpiexec", "-n", "2", "python", os.path.join(abs_path, "run_cifar_convnet_distributed.py")]
+    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    try:
-        base_path = os.path.join(os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'],
-                                *"Image/CIFAR/v0/cifar-10-batches-py".split("/"))
-        # N.B. CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY has {train,test}_map.txt
-        #      and CIFAR-10_mean.xml in the base_path.
-    except KeyError:
-        base_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
-                                *"../../../../Examples/Image/DataSets/CIFAR-10".split("/"))
-
-    base_path = os.path.normpath(base_path)
-    os.chdir(os.path.join(base_path, '..'))
-
-    from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms
-    set_computation_network_trace_level(1) 
-    set_fixed_random_seed(1)  # BUGBUG: has no effect at present  # TODO: remove debugging facilities once this all works
-    #force_deterministic_algorithms()
-    # TODO: do the above; they lead to slightly different results, so not doing it for now
-
-    reader_train = create_reader(os.path.join(base_path, 'train_map.txt'), os.path.join(base_path, 'CIFAR-10_mean.xml'), True, 0)
-    reader_test  = create_reader(os.path.join(base_path, 'test_map.txt'), os.path.join(base_path, 'CIFAR-10_mean.xml'), False)
-
-    distributed_after_samples = 0
-    num_quantization_bits = 32
-    distributed_trainer = distributed.data_parallel_distributed_trainer(
-        num_quantization_bits=num_quantization_bits,
-        distributed_after=distributed_after_samples)
-
-    test_error = convnet_cifar10_dataaug(reader_train, reader_test, distributed_trainer, max_epochs=1)
+        out = p.communicate(timeout=TIMEOUT_SECONDS)[0]  # in case we have a hang
+    except subprocess.TimeoutExpired:
+        os.kill(p.pid, signal.CTRL_C_EVENT)
+        raise RuntimeError('Timeout in mpiexec, possibly hang')
+    str_out = out.decode(sys.getdefaultencoding())
+    results = re.findall("Final Results: Minibatch\[.+?\]: errs = (.+?)%", str_out)
+    assert len(results) == 2
+    assert results[0] == results[1]
    expected_test_error = 0.617
-
-    assert np.allclose(test_error, expected_test_error,
+    assert np.allclose(float(results[0])/100, expected_test_error,
                       atol=TOLERANCE_ABSOLUTE)
--- a/Tests/EndToEndTests/CNTKv2Python/Examples/conftest.py
+++ b/Tests/EndToEndTests/CNTKv2Python/Examples/conftest.py
@ -65,7 +65,7 @@ def nb(tmpdir_factory, request, device_id):
    outPath = str(tmpdir_factory.mktemp('notebook').join('out.ipynb'))
    assert os.path.isfile(inPath)
    args = ["jupyter", "nbconvert", "--to", "notebook", "--execute",
-            "--ExecutePreprocessor.timeout=60", "--output", outPath, inPath]
+            "--ExecutePreprocessor.timeout=300", "--output", outPath, inPath]
    subprocess.check_call(args)
    nb = nbformat.read(outPath, nbformat.current_nbformat)
    return nb
--- a/Tests/EndToEndTests/CNTKv2Python/Examples/pytest.ini
+++ b/Tests/EndToEndTests/CNTKv2Python/Examples/pytest.ini
@ -1,2 +1,2 @@
 [pytest]
-python_files = *.py
+python_files = *_test.py
--- a/Tests/EndToEndTests/CNTKv2Python/Examples/run_cifar_convnet_distributed.py
+++ b/Tests/EndToEndTests/CNTKv2Python/Examples/run_cifar_convnet_distributed.py
@ -0,0 +1,52 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+# Licensed under the MIT license. See LICENSE.md file in the project root
+# for full license information.
+# ==============================================================================
+
+import numpy as np
+import os
+import sys
+import platform
+from cntk.io import ReaderConfig, ImageDeserializer
+from cntk import distributed
+from cntk.device import set_default_device, gpu
+
+abs_path = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Image", "Classification", "ConvNet", "Python"))
+from ConvNet_CIFAR10_DataAug_Distributed import convnet_cifar10_dataaug, create_reader
+
+def run_cifar_convnet_distributed():
+    try:
+        base_path = os.path.join(os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'],
+                                *"Image/CIFAR/v0/cifar-10-batches-py".split("/"))
+        # N.B. CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY has {train,test}_map.txt
+        #      and CIFAR-10_mean.xml in the base_path.
+    except KeyError:
+        base_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                                *"../../../../Examples/Image/DataSets/CIFAR-10".split("/"))
+
+    base_path = os.path.normpath(base_path)
+    os.chdir(os.path.join(base_path, '..'))
+
+    from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms
+    set_computation_network_trace_level(1) 
+    set_fixed_random_seed(1)  # BUGBUG: has no effect at present  # TODO: remove debugging facilities once this all works
+    #force_deterministic_algorithms()
+    # TODO: do the above; they lead to slightly different results, so not doing it for now
+
+    reader_train = create_reader(os.path.join(base_path, 'train_map.txt'), os.path.join(base_path, 'CIFAR-10_mean.xml'), True, 0)
+    reader_test  = create_reader(os.path.join(base_path, 'test_map.txt'), os.path.join(base_path, 'CIFAR-10_mean.xml'), False)
+
+    distributed_after_samples = 0
+    num_quantization_bits = 32
+    distributed_trainer = distributed.data_parallel_distributed_trainer(
+        num_quantization_bits=num_quantization_bits,
+        distributed_after=distributed_after_samples)
+
+    return convnet_cifar10_dataaug(reader_train, reader_test, distributed_trainer, max_epochs=1)
+
+if __name__=='__main__':
+    set_default_device(gpu(0)) # force using GPU-0 in test for speed
+    run_cifar_convnet_distributed()
+    distributed.Communicator.finalize()
--- a/Tests/EndToEndTests/CNTKv2Python/ModuleTests/run-test
+++ b/Tests/EndToEndTests/CNTKv2Python/ModuleTests/run-test
@ -2,6 +2,22 @@

 . $TEST_ROOT_DIR/run-test-common

+# Temporary workaround to force the default device to be always GPU 0 when
+# running the python unit tests since data placement is currently broken which
+# causes some of the test data to end on the default device instead of the
+# explicitly selected GPU device 0 which results in the tests to fail
+# This whould be removed when the test bugs have been addressed
+if [ "$TEST_DEVICE" == "gpu" ]; then
+  if [ -z "$CUDA_VISIBLE_DEVICES" ]; then
+    export CUDA_VISIBLE_DEVICES=0
+  else
+    IFS=','
+    visibleDevicesArray=($CUDA_VISIBLE_DEVICES)
+    unset IFS
+    export CUDA_VISIBLE_DEVICES=${visibleDevicesArray[0]}
+  fi
+fi
+
 python -c "import sys; print('Python: %s'%sys.version)"
 python -c "import numpy; print('NumPy: %s'%numpy.version.full_version)"
 python -c "import scipy; print('SciPy: %s'%scipy.version.full_version)"
--- a/Tests/EndToEndTests/CNTKv2Python/ModuleTests/testcases.yml
+++ b/Tests/EndToEndTests/CNTKv2Python/ModuleTests/testcases.yml
@ -1,8 +1,8 @@
 dataDir: .

 tags:
-    - bvt-l (build_sku == 'gpu') and (flavor == 'release')  and ((os == 'linux') or (device=='cpu'))
-    - nightly-l (build_sku == 'gpu') and (flavor == 'release')  and ((os == 'linux') or (device=='cpu'))
+    - bvt-l (build_sku == 'gpu') and (flavor == 'release')
+    - nightly-l (build_sku == 'gpu') and (flavor == 'release')

 testCases:
  Run must finish with error code 0 (outputs __COMPLETED__ in that case):
--- a/Tests/EndToEndTests/EvalClientTests/CSEvalClientTest/baseline.txt
+++ b/Tests/EndToEndTests/EvalClientTests/CSEvalClientTest/baseline.txt
@ -763,7 +763,7 @@ INFO: rn4_1.c_proj.y.y: loading pre-CuDNNv5 model: approximated mini-batch count
 INFO: rn4_2.c1.c.y.y: loading pre-CuDNNv5 model: approximated mini-batch count of 625625 as 10010000 trained samples.
      Statistics in further training may be biased; consider re-training instead.
 INFO: rn4_2.c2.y.y: loading pre-CuDNNv5 model: approximated mini-batch count of 625625 as 10010000 trained samples.
-      Statistics in further EvaluateImageInputUsingFeatureVector: Outcome = 340
+      Statistics in further EvaluateImageInputUsingFeatureVector: Outcome = 118

 ====== EvaluateImageInputUsingImageApi ========
 training may be biased; consider re-training instead.
@ -847,7 +847,7 @@ WARNING: rn2_1.c2.y.y: loading pre-CuDNNv5 model: approximately converting varia
 WARNING: rn2_1.c_proj.y.y: loading pre-CuDNNv5 model: approximately converting variance statistics format
 WARNING: rn2_2.c1.c.y.y: loading pre-CuDNNv5 model: approximately converting variance statistics format
 WARNING: rn2_2.c2.y.y: loading pre-CuDNNv5 model: approximately converting variance statistics format
-WARNING: rn3_1.c1.c.y.y: loading pre-CuDNNv5 model:EvaluateImageInputUsingImageApi: Outcome = 340
+WARNING: rn3_1.c1.c.y.y: loading pre-CuDNNv5 model:EvaluateImageInputUsingImageApi: Outcome = 118

 ====== CompareImageApiResults ========
 Both image API calls returned the same output vector.
--- a/Tests/UnitTests/V2LibraryTests/V2LibraryTests.vcxproj
+++ b/Tests/UnitTests/V2LibraryTests/V2LibraryTests.vcxproj
@ -80,7 +80,6 @@
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <AdditionalDependencies>CNTKLibrary-2.0.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <StackReserveSize Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">100000000</StackReserveSize>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="$(ReleaseBuild)">
--- a/Tutorials/CNTK_204_Sequence_To_Sequence.ipynb
+++ b/Tutorials/CNTK_204_Sequence_To_Sequence.ipynb
--- a/bindings/python/cntk/initializer.py
+++ b/bindings/python/cntk/initializer.py
@ -19,6 +19,7 @@ def uniform(scale=DefaultParamInitScale, seed=None):

    Returns:
        initializer for :class:`cntk.variables.Parameter`
+        initialized to uniform distribution between `scale*[-0.05, 0.05]`
    '''
    if seed is None:
        seed = SentinelValueForAutoSelectRandomSeed
@ -37,6 +38,7 @@ def gaussian(output_rank=SentinelValueForInferParamInitRank, filter_rank=Sentine

    Returns:
        initializer for :class:`cntk.variables.Parameter`
+        initialized to Gaussian distribution with mean `0` and standard deviation `scale*0.2/sqrt(fanIn))`. 
    '''
    if seed is None:
        seed = SentinelValueForAutoSelectRandomSeed
@ -55,6 +57,7 @@ def xavier(output_rank=SentinelValueForInferParamInitRank, filter_rank=SentinelV

    Returns:
        initializer for :class:`cntk.variables.Parameter`
+        initialized to Gaussian distribution with mean `0` and standard deviation `scale*sqrt(3.0/fanIn)`
    '''
    if seed is None:
        seed = SentinelValueForAutoSelectRandomSeed
@ -73,6 +76,7 @@ def glorot_uniform(output_rank=SentinelValueForInferParamInitRank, filter_rank=S

    Returns:
        initializer for :class:`cntk.variables.Parameter`
+        initialized to uniform distribution between `scale*sqrt(6.0/(fanIn+fanOut))*[-1,1]`
    '''
    if seed is None:
        seed = SentinelValueForAutoSelectRandomSeed
@ -91,6 +95,7 @@ def glorot_normal(output_rank=SentinelValueForInferParamInitRank, filter_rank=Se

    Returns:
        initializer for :class:`cntk.variables.Parameter`
+        initialized to Gaussian distribution with mean `0` and standard deviation `scale*sqrt(2.0/(fanIn+fanOut))`
    '''
    if seed is None:
        seed = SentinelValueForAutoSelectRandomSeed
@ -109,6 +114,7 @@ def he_uniform(output_rank=SentinelValueForInferParamInitRank, filter_rank=Senti

    Returns:
        initializer for :class:`cntk.variables.Parameter`
+        initialized to uniform distribution between `scale*sqrt(6.0/fanIn)*[-1,1]`
    '''
    if seed is None:
        seed = SentinelValueForAutoSelectRandomSeed
@ -127,6 +133,7 @@ def he_normal(output_rank=SentinelValueForInferParamInitRank, filter_rank=Sentin

    Returns:
        initializer for :class:`cntk.variables.Parameter`
+        initialized to Gaussian distribution with mean `0` and standard deviation `scale*sqrt(2.0/fanIn)`
    '''
    if seed is None:
        seed = SentinelValueForAutoSelectRandomSeed
@ -143,6 +150,7 @@ def bilinear(kernel_width, kernel_height):

    Returns:
        initializer for :class:`cntk.variables.Parameter`
+        useful for deconvolution layer
    '''
    return cntk_py.bilinear_initializer(kernel_width, kernel_height)

--- a/bindings/python/cntk/io/init.py
+++ b/bindings/python/cntk/io/init.py
@ -81,15 +81,17 @@ class MinibatchSource(cntk_py.MinibatchSource):
        randomize (bool, default True): randomize images before every epoch
        epoch_size (int): epoch size
        distributed_after (int): sample count after which minibatch source becomes distributed
+        multithreaded_deserializer (bool): using multi threaded deserializer
    '''
-    def __init__(self, deserializers=None, randomize=True, epoch_size=INFINITELY_REPEAT, distributed_after=INFINITE_SAMPLES):
+    def __init__(self, deserializers=None, randomize=True, epoch_size=INFINITELY_REPEAT, distributed_after=INFINITE_SAMPLES, multithreaded_deserializer=None):
        if not isinstance(deserializers, (list,tuple)):
            deserializers = [deserializers] # allow passing a single item or a list
        reader_config = ReaderConfig(
            deserializers=deserializers,
            randomize=randomize,
            epoch_size=epoch_size,
-            distributed_after=distributed_after)
+            distributed_after=distributed_after,
+            multithreaded_deserializer=multithreaded_deserializer)
        source = minibatch_source(reader_config)
        # transplant into this class instance
        self.__dict__ = source.__dict__
@ -256,8 +258,9 @@ class ReaderConfig(dict):
        randomize (bool, default True): randomize images before every epoch
        epoch_size (int): epoch size
        distributed_after (int): sample count after which reader becomes distributed
+        multithreaded_deserializer (bool): using multi threaded deserializer
    '''
-    def __init__(self, deserializers=None, randomize=True, epoch_size=INFINITELY_REPEAT, distributed_after=INFINITE_SAMPLES):
+    def __init__(self, deserializers=None, randomize=True, epoch_size=INFINITELY_REPEAT, distributed_after=INFINITE_SAMPLES, multithreaded_deserializer=None):

        self['epochSize'] = cntk_py.SizeTWrapper(epoch_size) # force to store in size_t
        if not isinstance(deserializers, (list, tuple)):
@ -265,6 +268,8 @@ class ReaderConfig(dict):
        self['deserializers'] = self.deserializers = deserializers or []
        self['randomize'] = randomize
        self['distributedAfterSampleCount'] = cntk_py.SizeTWrapper(distributed_after)
+        if multithreaded_deserializer != None:
+            self['multiThreadedDeserialization'] = multithreaded_deserializer

    @typemap
    def minibatch_source(self):
--- a/bindings/python/cntk/learner.py
+++ b/bindings/python/cntk/learner.py
@ -98,8 +98,8 @@ class Learner(cntk_py.Learner):
        Returns:
            `False` to indicate that learning has stopped for all of the parameters associated with this learner
        '''
-        from .utils import create_NDArrayView_from_NumPy
-        var_nd_map = { var:create_NDArrayView_from_NumPy(val) for var, val in
+        from .utils import _create_NDArrayView_from_NumPy
+        var_nd_map = { var: _create_NDArrayView_from_NumPy(val) for var, val in
                gradient_values.items() }

        return super(Learner, self).update(var_nd_map, training_sample_count)
--- a/bindings/python/cntk/ops/init.py
+++ b/bindings/python/cntk/ops/init.py
@ -250,7 +250,7 @@ def convolution(convolution_map, operand, strides=(1,), sharing=[True],
    >>> x = C.input_variable(img.shape)
    >>> filter = np.reshape(np.array([2, -1, -1, 2], dtype = np.float32), (1, 2, 2))
    >>> kernel = C.constant(value = filter)
-    >>> C.convolution(kernel, x, auto_padding = [False]).eval({x: [img]})
+    >>> C.convolution(kernel, x, auto_padding = [False]).eval({x: [img]}) # doctest: +SKIP
    array([[[[[  6.,   8.,  10.,  12.],
              [ 16.,  18.,  20.,  22.],
              [ 26.,  28.,  30.,  32.],
--- a/bindings/python/cntk/ops/functions.py
+++ b/bindings/python/cntk/ops/functions.py
@ -273,7 +273,9 @@ class Function(cntk_py.Function):
        Returns:
            dict: mapping of ``variables`` to NumPy arrays
        '''
-        root_gradients = sanitize_var_map(self.outputs, root_gradients)
+        device = state.device()
+        root_gradients = sanitize_var_map(self.outputs, root_gradients,
+                                          None, device)

        var_gradients = dict((var, None) for var in variables)

--- a/bindings/python/cntk/ops/tests/linear_test.py
+++ b/bindings/python/cntk/ops/tests/linear_test.py
@ -13,7 +13,7 @@ from __future__ import division
 import numpy as np
 import pytest
 from .ops_test_utils import unittest_helper, _test_unary_op, _test_binary_op, AA, I, precision, PRECISION_TO_TYPE, batch_dense_to_sparse, left_matrix_type, right_matrix_type
-from ...utils import sanitize_dtype_cntk, ones_like, eval
+from ...utils import sanitize_dtype_cntk, _ones_like, eval

 TENSOR_PAIRS = [
    ([30.], [10.]),
@ -74,8 +74,8 @@ def test_op_plus_var_sequences_input_input(left_batch, right_batch, device_id, p
                        for i in range(len(left_batch))]

    expected_backward = {
-        'left': ones_like(left_batch, PRECISION_TO_TYPE[precision]),
-        'right': ones_like(right_batch, PRECISION_TO_TYPE[precision])
+        'left': _ones_like(left_batch, PRECISION_TO_TYPE[precision]),
+        'right': _ones_like(right_batch, PRECISION_TO_TYPE[precision])
    }

    left_value = [AA(sample, dtype=PRECISION_TO_TYPE[precision])
--- a/bindings/python/cntk/utils/init.py
+++ b/bindings/python/cntk/utils/init.py
@ -53,18 +53,11 @@ def cntk_device(device_id):
        return gpu(device_id)


-def is_string(value):
-    if sys.version_info.major < 3:
-        return isinstance(value, basestring)
-
-    return isinstance(value, str)
-
-
-def dense_to_str(data):
+def _dense_to_str(data):
    return ' '.join(data.ravel(order='C').astype(np.str))


-def sparse_to_str(data):
+def _sparse_to_str(data):
    return ' '.join('%s:%s' % (k, v) for k, v in sorted(data.items()))


@ -96,12 +89,12 @@ def tensors_to_text_format(sample_idx, alias_tensor_map):
                # for this alias there no more sequence elements
                continue

-            if is_tensor(tensor):
+            if _is_tensor(tensor):
                if not isinstance(tensor, np.ndarray):
                    tensor = np.asarray(tensor)
-                to_str = dense_to_str
+                to_str = _dense_to_str
            elif isinstance(tensor, list) and isinstance(tensor[0], dict):
-                to_str = sparse_to_str
+                to_str = _sparse_to_str
            else:
                raise ValueError(
                    'expected a tensor (dense) or list of dicts (sparse), but got "%s"' % type(tensor))
@ -113,7 +106,7 @@ def tensors_to_text_format(sample_idx, alias_tensor_map):
    return '\n'.join(lines)


-def is_tensor(data):
+def _is_tensor(data):
    '''
    Checks whether the data is a tensor, i.e. whether it is a NumPy array or a
    list of NumPy arrays.
@ -179,7 +172,7 @@ def one_hot(batch, num_classes, dtype=None, device=None):
        value = cntk_py.Value.create_one_hot_double(num_classes, batch, device, False) 
    return value

-def has_seq_dim(var, data):
+def _has_seq_dim(var, data):
    '''
    Checks whether the data has a sequence dimensions or not. 

@ -274,7 +267,7 @@ def sanitize_shape(shape):

 def sanitize_input(arg, fallback_dtype=np.float32, reshape=None):
    """
-    Convert to :class:`cntk.ops.variables.Variable` so that it can be passed as Variable to the
+    Convert to :class:`~cntk.ops.variables.Variable` so that it can be passed as Variable to the
    CNTK operators.

      * If ``arg`` is a NumPy array and its type is neither `np.float32` nor `np.float64`, it sets it to `np.float32`.
@ -329,8 +322,7 @@ def get_data_type(*args):
    inputs. Placeholders are ignored in the type determination.

    Args:
-        args (number, list, NumPy array, :class:`cntk.ops.variables.Variable`, 
-         or :class:`cntk.ops.functions.Function`): input
+        args (number, list, NumPy array, :class:`cntk.ops.variables.Variable`, or :class:`cntk.ops.functions.Function`): input
    Returns:
        np.float32, np.float64, or None
    """
@ -410,9 +402,10 @@ def _pad_dense_to_max_len(var, batch, max_seq_len):
    Z = np.zeros((len(batch), max_seq_len) +
                 (data_point.shape), dtype=data_point.dtype)
    for idx, seq in enumerate(batch):
-        if seq[0].shape != data_point.shape:
+        elem_shape = seq[0].shape if hasattr(seq, 'shape') else ()
+        if elem_shape != data_point.shape:
            raise ValueError('shape mismatch: expected %s but got %s'
-                             % (str(data_point.shape), str(seq[0].shape)))
+                             % (str(data_point.shape), str(elem_shape)))
        Z[idx, :len(seq)] += seq
    return Z

@ -443,6 +436,11 @@ def _pad_sparse_seq_to_max_len(batch, max_seq_len):
    return Z

 def _is_dense(batch):
+    if isinstance(batch, np.ndarray):
+        return True
+    elif sparse.issparse(batch):
+        return False
+
    is_dense = True
    b = batch
    while isinstance(b, list):
@ -452,6 +450,7 @@ def _is_dense(batch):

    return True

+@typemap
 def sanitize_batch(var, batch, seq_starts=None, dtype=None, device=None):
    '''
    Convert to :class:`Value` with ``dtype``. If the samples in
@ -476,37 +475,31 @@ def sanitize_batch(var, batch, seq_starts=None, dtype=None, device=None):
    if isinstance(batch, cntk_py.Value):
        return batch

+    if isinstance(batch, list):
+        if len(batch) == 0:
+            raise ValueError('batch is empty')
+
    # We need to figure out whether the data has a sequence axis. Note that
    # it is not enough to check whether the variable's dynamic axes include the
    # sequence axis, because the sequence axis might be omitted in the data if
    # it is not needed (CNTK core would then take care of this).
-    batch_has_seq = has_seq_dim(var, batch)
+    batch_has_seq = _has_seq_dim(var, batch)

-    if isinstance(batch, list):
    is_dense = _is_dense(batch)

-        if is_dense:
-            seq_lens = [len(seq) for seq in batch]
-
-            # If the input is a list of lists of dense values, all of the same
-            # length, then we convert it into a NumPy array without requiring a
-            # mask.
-            if len(set(seq_lens)) == 1:
-                batch = np.asarray(batch)
-        else:
+    if batch_has_seq or seq_starts:
        if isinstance(batch[0], list):
            seq_lens = [len(seq) for seq in batch]
        else:
            seq_lens = [seq.shape[0] for seq in batch]

-        if batch_has_seq:
        max_seq_len = max(seq_lens)
-    else:
-        is_dense = isinstance(batch, np.ndarray)
-        # It is a sparse or dense NumPy array having all sequences being the
-        # same length, so we just calculate the sequence lengths
-        if batch_has_seq:
-            max_seq_len = batch.shape[1]
+
+        # If the input is a list of lists of dense values, all of the same
+        # length, we convert it into a NumPy array. 
+        if is_dense and len(set(seq_lens)) == 1:
+            batch_has_seq = False
+            batch = np.asarray(batch, dtype=var.dtype)

    if dtype is None:
        dtype = get_data_type(var)
@ -514,25 +507,8 @@ def sanitize_batch(var, batch, seq_starts=None, dtype=None, device=None):
    if device is None:
        device = use_default_device()

-    if isinstance(batch, np.ndarray):
-        if np.issubdtype(batch.dtype, int):
-            batch = batch.astype(var.dtype)
-        elif batch.dtype not in (np.float32, np.float64):
-            raise ValueError('only float32 and float64 are supported')
-
-        ndav = create_NDArrayView_from_NumPy(batch, device)
-        return Value(data=ndav)
-
-    if isinstance(batch, list):
-        if len(batch) == 0:
-            raise ValueError('batch is empty')
-
-        if not batch_has_seq and seq_starts is not None:
-            raise ValueError('specification of individual sequence begins does not'
-                    ' make sense when not using the sequence axis')
-
    # batch is now either a dense input that requires a mask, or it is sparse
-    if batch_has_seq:
+    if batch_has_seq or seq_starts:
        mask = cntk_py.NDMask((len(batch), max_seq_len), 
                device or use_default_device())
        for idx, seq_len in enumerate(seq_lens):
@ -550,8 +526,20 @@ def sanitize_batch(var, batch, seq_starts=None, dtype=None, device=None):
        mask = None

    if is_dense:
+        if batch_has_seq:
            batch = _pad_dense_to_max_len(var, batch, max_seq_len)
-        ndav = create_NDArrayView_from_NumPy(batch.astype(dtype), device)
+        if not isinstance(batch, np.ndarray):
+            batch = np.asarray(batch)
+        ndav = _create_NDArrayView_from_NumPy(batch.astype(dtype), device)
+        return Value(data=ndav, mask=mask)
+
+    if isinstance(batch, np.ndarray):
+        if np.issubdtype(batch.dtype, int):
+            batch = batch.astype(var.dtype)
+        elif batch.dtype not in (np.float32, np.float64):
+            raise ValueError('only float32 and float64 are supported')
+
+        ndav = _create_NDArrayView_from_NumPy(batch.astype(dtype), device)
        return Value(data=ndav, mask=mask)

    # There are three possibilities of providing sparse batches:
@ -638,7 +626,7 @@ def sanitize_value(shape, value, dtype, device):
        if shape is None:
            raise ValueError('you need to specify at least shape or value')
        cntk_dtype = sanitize_dtype_cntk(dtype)
-        ndav = create_NDArrayView(shape, cntk_dtype, device)
+        ndav = _create_NDArrayView(shape, cntk_dtype, device)
    else:
        np_dtype = sanitize_dtype_numpy(dtype)
        if not isinstance(value, np.ndarray) or value.dtype != np_dtype:
@ -647,7 +635,7 @@ def sanitize_value(shape, value, dtype, device):
            else:
                value = np.asarray(value, dtype=np_dtype)

-        ndav = create_NDArrayView_from_NumPy(value, device)
+        ndav = _create_NDArrayView_from_NumPy(value, device)

    return ndav

@ -772,7 +760,7 @@ def sanitize_var_map(op_arguments, arguments, precision=None,
    return var_map


-def ones_like(batch, precision):
+def _ones_like(batch, precision):
    '''
    Returns a new batch, which has the same format as ``batch`` but all values
    set to 1.
@ -783,7 +771,7 @@ def ones_like(batch, precision):
    return [np.ones_like(sample, dtype=sanitize_precision(precision)) for sample in batch]


-def create_NDArrayView(shape, data_type=cntk_py.DataType_Float, device=None):
+def _create_NDArrayView(shape, data_type=cntk_py.DataType_Float, device=None):
    shape = sanitize_shape(shape)
    if device is None:
        device = use_default_device()
@ -793,7 +781,7 @@ def create_NDArrayView(shape, data_type=cntk_py.DataType_Float, device=None):
    return view


-def create_NDArrayView_from_NumPy(nd, device=None):
+def _create_NDArrayView_from_NumPy(nd, device=None):
    if device is None:
        device = use_default_device()

@ -819,11 +807,11 @@ class Value(cntk_py.Value):
            device = use_default_device()

        if shape and dtype:
-            ndav = create_NDArrayView(shape, dtype, device)
+            ndav = _create_NDArrayView(shape, dtype, device)

        elif data:
            if isinstance(data, np.ndarray):
-                ndav = create_NDArrayView_from_NumPy(data, device)
+                ndav = _create_NDArrayView_from_NumPy(data, device)
            else:
                ndav = data

@ -841,6 +829,27 @@ class Value(cntk_py.Value):
        '''
        return super(Value, self).shape().dimensions()

+    @property
+    def mask(self):
+        '''
+        The mask matrix of this value. Each row denotes a sequence with its
+        elements describing the mask of the element:
+         * 2: beginning of sequence (e.g. an LSTM would be reset)
+         * 1: valid element
+         # 0: invalid element
+
+        Example:
+          A mask of 
+           ```[[2, 1, 1], [1, 1, 0]]
+           ```
+           describes a batch of two sequences. The first has three elements, of
+           which the first element signals the beginning of a sequence. The second
+           sequence has two elements, which are both continuations of the first
+           sequence.
+        '''
+        return np.asarray(super(Value, self).mask())
+    
+
    def __len__(self):
        '''
        Number of samples in this value object.
@ -939,7 +948,7 @@ def ensure_dev(ndav, dev):

    if ndav.device() != dev:

-        ndav_on_target = create_NDArrayView(
+        ndav_on_target = _create_NDArrayView(
            ndav.shape().dimensions(), data_type=ndav.get_data_type(), dev=dev)
        ndav_on_target.copy_from(ndav)
        ndav = ndav_on_target
@ -953,7 +962,7 @@ def value_to_seq(value):
    entries removed.

    Args:
-        value (`Value`): Value as it is returned by Swig
+        value (:class:`Value`): Value as it is returned by Swig

    Returns:
        a list of NumPy arrays
@ -1013,7 +1022,7 @@ def eval(op, arguments=None, precision=None, device=None, backward_pass=False, e
    if backward_pass:
        if expected_backward is None:
            expected_backward = arguments
-        root_gradients = {v: ones_like(o, precision) for v, o in
+        root_gradients = {v: _ones_like(o, precision) for v, o in
                          forward_output.items()}

        backward_output = op.backward(state, root_gradients, expected_backward)
--- a/bindings/python/cntk/utils/tests/utils_test.py
+++ b/bindings/python/cntk/utils/tests/utils_test.py
@ -13,6 +13,7 @@ from cntk.device import default
 from cntk.tests.test_utils import precision, PRECISION_TO_TYPE
 from cntk.ops import *
 from cntk.utils import *
+from cntk.utils import _has_seq_dim, _is_tensor

 # Keeping things short
 AA = np.asarray
@ -58,7 +59,7 @@ def test_tensor_conversion_dense(idx, alias_tensor_map, expected):
    ([AA([1, 2]), AA([])], False),
 ])
 def test_is_tensor(data, expected):
-    assert is_tensor(data) == expected
+    assert _is_tensor(data) == expected


 def test_sanitize_dtype_numpy():
@ -123,16 +124,14 @@ def test_get_data_type():
    # exception handling
    ((2,2), AA([[1,1],[2,2]]), ValueError),
    (1, [[[1,2]]], ValueError),
-    #(1, [AA([[40], [50]])], ValueError),
-    ((1,), [[[40], [50]]], ValueError),
 ])
 def test_has_seq_dim_dense(shape, batch, expected):
    i1 = input_variable(shape)
    if expected in [False, True]:
-        assert has_seq_dim(i1, batch) == expected
+        assert _has_seq_dim(i1, batch) == expected
    else:
        with pytest.raises(expected):
-            has_seq_dim(i1, batch)
+            _has_seq_dim(i1, batch)

@pytest.mark.parametrize("shape, batch, expected", [
    ((1,2), [csr([1,0]), csr([2,3]), csr([5,6])], False),
@ -141,10 +140,10 @@ def test_has_seq_dim_dense(shape, batch, expected):
 def test_has_seq_dim_sparse(shape, batch, expected):
    i1 = input_variable(shape, is_sparse=True)
    if expected in [False, True]:
-        assert has_seq_dim(i1, batch) == expected
+        assert _has_seq_dim(i1, batch) == expected
    else:
        with pytest.raises(expected):
-            has_seq_dim(i1, batch)
+            _has_seq_dim(i1, batch)

 def test_sanitize_batch_sparse():
    batch = [[csr([1,0,2]), csr([2,3,0])],
@ -160,4 +159,20 @@ def test_sanitize_batch_sparse():
    # 2 sequences, with max seq len of 2 and dimension 3
    assert b.shape == (2,2,3)

+@pytest.mark.parametrize("batch, seq_starts, expected_mask", [
+    ([[5, 6, 7],
+       [8]],
+       [True, False],
+       [[2, 1, 1], [1, 0, 0]]),
+
+    ([[5],
+       [8]],
+       [True, False],
+       [[2], [1]]),
+])
+def test_mask(batch, seq_starts, expected_mask):
+    shape = (1,)
+    var = input_variable(shape)
+    s = sanitize_batch(var, batch, seq_starts)
+    assert np.allclose(s.mask, expected_mask)